Diffstat (limited to 'kernel')
76 files changed, 5726 insertions, 2601 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index cb41b9547c9..6c07f30fa9b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_SMP) += smpboot.o
 ifneq ($(CONFIG_SMP),y)
 obj-y += up.o
 endif
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index af1de0f34ea..4b96415527b 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -67,6 +67,7 @@
 #include <linux/syscalls.h>
 #include <linux/capability.h>
 #include <linux/fs_struct.h>
+#include <linux/compat.h>
 
 #include "audit.h"
 
@@ -2710,13 +2711,16 @@ void audit_core_dumps(long signr)
         audit_log_end(ab);
 }
 
-void __audit_seccomp(unsigned long syscall)
+void __audit_seccomp(unsigned long syscall, long signr, int code)
 {
         struct audit_buffer *ab;
 
         ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
-        audit_log_abend(ab, "seccomp", SIGKILL);
+        audit_log_abend(ab, "seccomp", signr);
         audit_log_format(ab, " syscall=%ld", syscall);
+        audit_log_format(ab, " compat=%d", is_compat_task());
+        audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
+        audit_log_format(ab, " code=0x%x", code);
         audit_log_end(ab);
 }
 
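As a rough illustration of the expanded record, a caller on the seccomp enforcement path would now pass the delivered signal and the filter's return code along with the syscall number. This is a sketch only; SIGSYS, SECCOMP_RET_KILL and the helper name are assumptions about the seccomp-filter work this hooks into, only the __audit_seccomp() signature comes from the hunk above:

/* hypothetical enforcement path, sketched against the new signature */
static void seccomp_kill_and_audit(int this_syscall, u32 action)
{
        /* record syscall nr, signal and filter return code, then die */
        __audit_seccomp(this_syscall, SIGSYS, action);
        do_exit(SIGSYS);
}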
diff --git a/kernel/capability.c b/kernel/capability.c
index 3f1adb6c647..493d9725948 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -419,3 +419,24 @@ bool nsown_capable(int cap)
 {
         return ns_capable(current_user_ns(), cap);
 }
+
+/**
+ * inode_capable - Check superior capability over inode
+ * @inode: The inode in question
+ * @cap: The capability in question
+ *
+ * Return true if the current task has the given superior capability
+ * targeted at it's own user namespace and that the given inode is owned
+ * by the current user namespace or a child namespace.
+ *
+ * Currently we check to see if an inode is owned by the current
+ * user namespace by seeing if the inode's owner maps into the
+ * current user namespace.
+ *
+ */
+bool inode_capable(const struct inode *inode, int cap)
+{
+        struct user_namespace *ns = current_user_ns();
+
+        return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid);
+}
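A minimal sketch of how the new helper might sit in a permission check. The wrapper below is hypothetical; inode_capable(), uid_eq() and the kuid_t-typed inode->i_uid/current_fsuid() are what this series provides:

#include <linux/capability.h>
#include <linux/cred.h>
#include <linux/fs.h>

/* hypothetical helper: ownership-or-capability test against an inode */
static bool may_act_as_owner(const struct inode *inode)
{
        return uid_eq(current_fsuid(), inode->i_uid) ||
               inode_capable(inode, CAP_FOWNER);
}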
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ed64ccac67c..a0c6af34d50 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,9 +60,13 @@
 #include <linux/eventfd.h>
 #include <linux/poll.h>
 #include <linux/flex_array.h> /* used in cgroup_attach_proc */
+#include <linux/kthread.h>
 
 #include <linux/atomic.h>
 
+/* css deactivation bias, makes css->refcnt negative to deny new trygets */
+#define CSS_DEACT_BIAS INT_MIN
+
 /*
  * cgroup_mutex is the master lock. Any modification to cgroup or its
  * hierarchy must be performed while holding it.
@@ -127,6 +131,9 @@ struct cgroupfs_root {
         /* A list running through the active hierarchies */
         struct list_head root_list;
 
+        /* All cgroups on this root, cgroup_mutex protected */
+        struct list_head allcg_list;
+
         /* Hierarchy-specific flags */
         unsigned long flags;
 
@@ -145,6 +152,15 @@ struct cgroupfs_root {
 static struct cgroupfs_root rootnode;
 
 /*
+ * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
+ */
+struct cfent {
+        struct list_head node;
+        struct dentry *dentry;
+        struct cftype *type;
+};
+
+/*
  * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
  * cgroup_subsys->use_id != 0.
  */
@@ -239,6 +255,14 @@ int cgroup_lock_is_held(void)
 
 EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
 
+/* the current nr of refs, always >= 0 whether @css is deactivated or not */
+static int css_refcnt(struct cgroup_subsys_state *css)
+{
+        int v = atomic_read(&css->refcnt);
+
+        return v >= 0 ? v : v - CSS_DEACT_BIAS;
+}
+
 /* convenient tests for these bits */
 inline int cgroup_is_removed(const struct cgroup *cgrp)
 {
@@ -279,6 +303,21 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)
 #define for_each_active_root(_root) \
 list_for_each_entry(_root, &roots, root_list)
 
+static inline struct cgroup *__d_cgrp(struct dentry *dentry)
+{
+        return dentry->d_fsdata;
+}
+
+static inline struct cfent *__d_cfe(struct dentry *dentry)
+{
+        return dentry->d_fsdata;
+}
+
+static inline struct cftype *__d_cft(struct dentry *dentry)
+{
+        return __d_cfe(dentry)->type;
+}
+
 /* the list of cgroups eligible for automatic release. Protected by
  * release_list_lock */
 static LIST_HEAD(release_list);
@@ -816,12 +855,17 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
         struct cgroup_subsys *ss;
         int ret = 0;
 
-        for_each_subsys(cgrp->root, ss)
-                if (ss->pre_destroy) {
-                        ret = ss->pre_destroy(cgrp);
-                        if (ret)
-                                break;
+        for_each_subsys(cgrp->root, ss) {
+                if (!ss->pre_destroy)
+                        continue;
+
+                ret = ss->pre_destroy(cgrp);
+                if (ret) {
+                        /* ->pre_destroy() failure is being deprecated */
+                        WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
+                        break;
                 }
+        }
 
         return ret;
 }
@@ -864,6 +908,14 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
                 BUG_ON(!list_empty(&cgrp->pidlists));
 
                 kfree_rcu(cgrp, rcu_head);
+        } else {
+                struct cfent *cfe = __d_cfe(dentry);
+                struct cgroup *cgrp = dentry->d_parent->d_fsdata;
+
+                WARN_ONCE(!list_empty(&cfe->node) &&
+                          cgrp != &cgrp->root->top_cgroup,
+                          "cfe still linked for %s\n", cfe->type->name);
+                kfree(cfe);
         }
         iput(inode);
 }
@@ -882,34 +934,36 @@ static void remove_dir(struct dentry *d)
         dput(parent);
 }
 
-static void cgroup_clear_directory(struct dentry *dentry)
+static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 {
-        struct list_head *node;
+        struct cfent *cfe;
 
-        BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
-        spin_lock(&dentry->d_lock);
-        node = dentry->d_subdirs.next;
-        while (node != &dentry->d_subdirs) {
-                struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
+        lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
+        lockdep_assert_held(&cgroup_mutex);
+
+        list_for_each_entry(cfe, &cgrp->files, node) {
+                struct dentry *d = cfe->dentry;
 
-                spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
-                list_del_init(node);
-                if (d->d_inode) {
-                        /* This should never be called on a cgroup
-                         * directory with child cgroups */
-                        BUG_ON(d->d_inode->i_mode & S_IFDIR);
-                        dget_dlock(d);
-                        spin_unlock(&d->d_lock);
-                        spin_unlock(&dentry->d_lock);
-                        d_delete(d);
-                        simple_unlink(dentry->d_inode, d);
-                        dput(d);
-                        spin_lock(&dentry->d_lock);
-                } else
-                        spin_unlock(&d->d_lock);
-                node = dentry->d_subdirs.next;
+                if (cft && cfe->type != cft)
+                        continue;
+
+                dget(d);
+                d_delete(d);
+                simple_unlink(d->d_inode, d);
+                list_del_init(&cfe->node);
+                dput(d);
+
+                return 0;
         }
-        spin_unlock(&dentry->d_lock);
+        return -ENOENT;
+}
+
+static void cgroup_clear_directory(struct dentry *dir)
+{
+        struct cgroup *cgrp = __d_cgrp(dir);
+
+        while (!list_empty(&cgrp->files))
+                cgroup_rm_file(cgrp, NULL);
 }
 
 /*
@@ -1294,6 +1348,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
         if (ret)
                 goto out_unlock;
 
+        /* See feature-removal-schedule.txt */
+        if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent)
+                pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
+                           task_tgid_nr(current), current->comm);
+
         /* Don't allow flags or name to change at remount */
         if (opts.flags != root->flags ||
             (opts.name && strcmp(opts.name, root->name))) {
@@ -1308,7 +1367,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
                 goto out_unlock;
         }
 
-        /* (re)populate subsystem files */
+        /* clear out any existing files and repopulate subsystem files */
+        cgroup_clear_directory(cgrp->dentry);
         cgroup_populate_dir(cgrp);
 
         if (opts.release_agent)
@@ -1333,6 +1393,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 {
         INIT_LIST_HEAD(&cgrp->sibling);
         INIT_LIST_HEAD(&cgrp->children);
+        INIT_LIST_HEAD(&cgrp->files);
         INIT_LIST_HEAD(&cgrp->css_sets);
         INIT_LIST_HEAD(&cgrp->release_list);
         INIT_LIST_HEAD(&cgrp->pidlists);
@@ -1344,11 +1405,14 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 static void init_cgroup_root(struct cgroupfs_root *root)
 {
         struct cgroup *cgrp = &root->top_cgroup;
+
         INIT_LIST_HEAD(&root->subsys_list);
         INIT_LIST_HEAD(&root->root_list);
+        INIT_LIST_HEAD(&root->allcg_list);
         root->number_of_cgroups = 1;
         cgrp->root = root;
         cgrp->top_cgroup = cgrp;
+        list_add_tail(&cgrp->allcg_node, &root->allcg_list);
         init_cgroup_housekeeping(cgrp);
 }
 
@@ -1692,16 +1756,6 @@ static struct file_system_type cgroup_fs_type = {
 
 static struct kobject *cgroup_kobj;
 
-static inline struct cgroup *__d_cgrp(struct dentry *dentry)
-{
-        return dentry->d_fsdata;
-}
-
-static inline struct cftype *__d_cft(struct dentry *dentry)
-{
-        return dentry->d_fsdata;
-}
-
 /**
  * cgroup_path - generate the path of a cgroup
  * @cgrp: the cgroup in question
@@ -2160,9 +2214,9 @@ retry_find_task:
          * only need to check permissions on one of them.
          */
         tcred = __task_cred(tsk);
-        if (cred->euid &&
-            cred->euid != tcred->uid &&
-            cred->euid != tcred->suid) {
+        if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+            !uid_eq(cred->euid, tcred->uid) &&
+            !uid_eq(cred->euid, tcred->suid)) {
                 rcu_read_unlock();
                 ret = -EACCES;
                 goto out_unlock_cgroup;
@@ -2172,6 +2226,18 @@ retry_find_task:
 
         if (threadgroup)
                 tsk = tsk->group_leader;
+
+        /*
+         * Workqueue threads may acquire PF_THREAD_BOUND and become
+         * trapped in a cpuset, or RT worker may be born in a cgroup
+         * with no rt_runtime allocated. Just say no.
+         */
+        if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) {
+                ret = -EINVAL;
+                rcu_read_unlock();
+                goto out_unlock_cgroup;
+        }
+
         get_task_struct(tsk);
         rcu_read_unlock();
 
@@ -2603,50 +2669,191 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
         return mode;
 }
 
-int cgroup_add_file(struct cgroup *cgrp,
-                    struct cgroup_subsys *subsys,
-                    const struct cftype *cft)
+static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+                           const struct cftype *cft)
 {
         struct dentry *dir = cgrp->dentry;
+        struct cgroup *parent = __d_cgrp(dir);
         struct dentry *dentry;
+        struct cfent *cfe;
         int error;
         umode_t mode;
-
         char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
+
+        /* does @cft->flags tell us to skip creation on @cgrp? */
+        if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
+                return 0;
+        if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
+                return 0;
+
         if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
                 strcpy(name, subsys->name);
                 strcat(name, ".");
         }
         strcat(name, cft->name);
+
         BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
+
+        cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
+        if (!cfe)
+                return -ENOMEM;
+
         dentry = lookup_one_len(name, dir, strlen(name));
-        if (!IS_ERR(dentry)) {
-                mode = cgroup_file_mode(cft);
-                error = cgroup_create_file(dentry, mode | S_IFREG,
-                                           cgrp->root->sb);
-                if (!error)
-                        dentry->d_fsdata = (void *)cft;
-                dput(dentry);
-        } else
+        if (IS_ERR(dentry)) {
                 error = PTR_ERR(dentry);
+                goto out;
+        }
+
+        mode = cgroup_file_mode(cft);
+        error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
+        if (!error) {
+                cfe->type = (void *)cft;
+                cfe->dentry = dentry;
+                dentry->d_fsdata = cfe;
+                list_add_tail(&cfe->node, &parent->files);
+                cfe = NULL;
+        }
+        dput(dentry);
+out:
+        kfree(cfe);
         return error;
 }
-EXPORT_SYMBOL_GPL(cgroup_add_file);
 
-int cgroup_add_files(struct cgroup *cgrp,
-                     struct cgroup_subsys *subsys,
-                     const struct cftype cft[],
-                     int count)
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+                              const struct cftype cfts[], bool is_add)
 {
-        int i, err;
-        for (i = 0; i < count; i++) {
-                err = cgroup_add_file(cgrp, subsys, &cft[i]);
-                if (err)
-                        return err;
+        const struct cftype *cft;
+        int err, ret = 0;
+
+        for (cft = cfts; cft->name[0] != '\0'; cft++) {
+                if (is_add)
+                        err = cgroup_add_file(cgrp, subsys, cft);
+                else
+                        err = cgroup_rm_file(cgrp, cft);
+                if (err) {
+                        pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
+                                   is_add ? "add" : "remove", cft->name, err);
+                        ret = err;
+                }
+        }
+        return ret;
+}
+
+static DEFINE_MUTEX(cgroup_cft_mutex);
+
+static void cgroup_cfts_prepare(void)
+        __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex)
+{
+        /*
+         * Thanks to the entanglement with vfs inode locking, we can't walk
+         * the existing cgroups under cgroup_mutex and create files.
+         * Instead, we increment reference on all cgroups and build list of
+         * them using @cgrp->cft_q_node.  Grab cgroup_cft_mutex to ensure
+         * exclusive access to the field.
+         */
+        mutex_lock(&cgroup_cft_mutex);
+        mutex_lock(&cgroup_mutex);
+}
+
+static void cgroup_cfts_commit(struct cgroup_subsys *ss,
+                               const struct cftype *cfts, bool is_add)
+        __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
+{
+        LIST_HEAD(pending);
+        struct cgroup *cgrp, *n;
+
+        /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
+        if (cfts && ss->root != &rootnode) {
+                list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) {
+                        dget(cgrp->dentry);
+                        list_add_tail(&cgrp->cft_q_node, &pending);
+                }
+        }
+
+        mutex_unlock(&cgroup_mutex);
+
+        /*
+         * All new cgroups will see @cfts update on @ss->cftsets.  Add/rm
+         * files for all cgroups which were created before.
+         */
+        list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) {
+                struct inode *inode = cgrp->dentry->d_inode;
+
+                mutex_lock(&inode->i_mutex);
+                mutex_lock(&cgroup_mutex);
+                if (!cgroup_is_removed(cgrp))
+                        cgroup_addrm_files(cgrp, ss, cfts, is_add);
+                mutex_unlock(&cgroup_mutex);
+                mutex_unlock(&inode->i_mutex);
+
+                list_del_init(&cgrp->cft_q_node);
+                dput(cgrp->dentry);
         }
+
+        mutex_unlock(&cgroup_cft_mutex);
+}
+
+/**
+ * cgroup_add_cftypes - add an array of cftypes to a subsystem
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Register @cfts to @ss.  Files described by @cfts are created for all
+ * existing cgroups to which @ss is attached and all future cgroups will
+ * have them too.  This function can be called anytime whether @ss is
+ * attached or not.
+ *
+ * Returns 0 on successful registration, -errno on failure.  Note that this
+ * function currently returns 0 as long as @cfts registration is successful
+ * even if some file creation attempts on existing cgroups fail.
+ */
+int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
+{
+        struct cftype_set *set;
+
+        set = kzalloc(sizeof(*set), GFP_KERNEL);
+        if (!set)
+                return -ENOMEM;
+
+        cgroup_cfts_prepare();
+        set->cfts = cfts;
+        list_add_tail(&set->node, &ss->cftsets);
+        cgroup_cfts_commit(ss, cfts, true);
+
         return 0;
 }
-EXPORT_SYMBOL_GPL(cgroup_add_files);
+EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
+
+/**
+ * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Unregister @cfts from @ss.  Files described by @cfts are removed from
+ * all existing cgroups to which @ss is attached and all future cgroups
+ * won't have them either.  This function can be called anytime whether @ss
+ * is attached or not.
+ *
+ * Returns 0 on successful unregistration, -ENOENT if @cfts is not
+ * registered with @ss.
+ */
+int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
+{
+        struct cftype_set *set;
+
+        cgroup_cfts_prepare();
+
+        list_for_each_entry(set, &ss->cftsets, node) {
+                if (set->cfts == cfts) {
+                        list_del_init(&set->node);
+                        cgroup_cfts_commit(ss, cfts, false);
+                        return 0;
+                }
+        }
+
+        cgroup_cfts_commit(ss, NULL, false);
+        return -ENOENT;
+}
 
 /**
  * cgroup_task_count - count the number of tasks in a cgroup.
@@ -3625,13 +3832,14 @@ static struct cftype files[] = {
                 .read_u64 = cgroup_clone_children_read,
                 .write_u64 = cgroup_clone_children_write,
         },
-};
-
-static struct cftype cft_release_agent = {
-        .name = "release_agent",
-        .read_seq_string = cgroup_release_agent_show,
-        .write_string = cgroup_release_agent_write,
-        .max_write_len = PATH_MAX,
+        {
+                .name = "release_agent",
+                .flags = CFTYPE_ONLY_ON_ROOT,
+                .read_seq_string = cgroup_release_agent_show,
+                .write_string = cgroup_release_agent_write,
+                .max_write_len = PATH_MAX,
+        },
+        { } /* terminate */
 };
 
 static int cgroup_populate_dir(struct cgroup *cgrp)
@@ -3639,22 +3847,18 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
         int err;
         struct cgroup_subsys *ss;
 
-        /* First clear out any existing files */
-        cgroup_clear_directory(cgrp->dentry);
-
-        err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
+        err = cgroup_addrm_files(cgrp, NULL, files, true);
         if (err < 0)
                 return err;
 
-        if (cgrp == cgrp->top_cgroup) {
-                if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
-                        return err;
-        }
-
+        /* process cftsets of each subsystem */
         for_each_subsys(cgrp->root, ss) {
-                if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
-                        return err;
+                struct cftype_set *set;
+
+                list_for_each_entry(set, &ss->cftsets, node)
+                        cgroup_addrm_files(cgrp, ss, set->cfts, true);
         }
+
         /* This cgroup is ready now */
         for_each_subsys(cgrp->root, ss) {
                 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
@@ -3670,6 +3874,14 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
         return 0;
 }
 
+static void css_dput_fn(struct work_struct *work)
+{
+        struct cgroup_subsys_state *css =
+                container_of(work, struct cgroup_subsys_state, dput_work);
+
+        dput(css->cgroup->dentry);
+}
+
 static void init_cgroup_css(struct cgroup_subsys_state *css,
                             struct cgroup_subsys *ss,
                             struct cgroup *cgrp)
@@ -3682,6 +3894,16 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
                 set_bit(CSS_ROOT, &css->flags);
         BUG_ON(cgrp->subsys[ss->subsys_id]);
         cgrp->subsys[ss->subsys_id] = css;
+
+        /*
+         * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
+         * which is put on the last css_put().  dput() requires process
+         * context, which css_put() may be called without.  @css->dput_work
+         * will be used to invoke dput() asynchronously from css_put().
+         */
+        INIT_WORK(&css->dput_work, css_dput_fn);
+        if (ss->__DEPRECATED_clear_css_refs)
+                set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
 }
 
 static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
@@ -3784,9 +4006,16 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
         if (err < 0)
                 goto err_remove;
 
+        /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
+        for_each_subsys(root, ss)
+                if (!ss->__DEPRECATED_clear_css_refs)
+                        dget(dentry);
+
         /* The cgroup directory was pre-locked for us */
         BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
 
+        list_add_tail(&cgrp->allcg_node, &root->allcg_list);
+
         err = cgroup_populate_dir(cgrp);
         /* If err < 0, we have a half-filled directory - oh well ;) */
 
@@ -3826,18 +4055,19 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
         return cgroup_create(c_parent, dentry, mode | S_IFDIR);
 }
 
+/*
+ * Check the reference count on each subsystem. Since we already
+ * established that there are no tasks in the cgroup, if the css refcount
+ * is also 1, then there should be no outstanding references, so the
+ * subsystem is safe to destroy. We scan across all subsystems rather than
+ * using the per-hierarchy linked list of mounted subsystems since we can
+ * be called via check_for_release() with no synchronization other than
+ * RCU, and the subsystem linked list isn't RCU-safe.
+ */
 static int cgroup_has_css_refs(struct cgroup *cgrp)
 {
-        /* Check the reference count on each subsystem. Since we
-         * already established that there are no tasks in the
-         * cgroup, if the css refcount is also 1, then there should
-         * be no outstanding references, so the subsystem is safe to
-         * destroy. We scan across all subsystems rather than using
-         * the per-hierarchy linked list of mounted subsystems since
-         * we can be called via check_for_release() with no
-         * synchronization other than RCU, and the subsystem linked
-         * list isn't RCU-safe */
         int i;
+
         /*
          * We won't need to lock the subsys array, because the subsystems
          * we're concerned about aren't going anywhere since our cgroup root
@@ -3846,17 +4076,21 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
         for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                 struct cgroup_subsys *ss = subsys[i];
                 struct cgroup_subsys_state *css;
+
                 /* Skip subsystems not present or not in this hierarchy */
                 if (ss == NULL || ss->root != cgrp->root)
                         continue;
+
                 css = cgrp->subsys[ss->subsys_id];
-                /* When called from check_for_release() it's possible
+                /*
+                 * When called from check_for_release() it's possible
                  * that by this point the cgroup has been removed
                  * and the css deleted. But a false-positive doesn't
                  * matter, since it can only happen if the cgroup
                  * has been deleted and hence no longer needs the
-                 * release agent to be called anyway. */
-                if (css && (atomic_read(&css->refcnt) > 1))
+                 * release agent to be called anyway.
+                 */
+                if (css && css_refcnt(css) > 1)
                         return 1;
         }
         return 0;
@@ -3866,51 +4100,63 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
  * Atomically mark all (or else none) of the cgroup's CSS objects as
  * CSS_REMOVED. Return true on success, or false if the cgroup has
  * busy subsystems. Call with cgroup_mutex held
+ *
+ * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
+ * not, cgroup removal behaves differently.
+ *
+ * If clear is set, css refcnt for the subsystem should be zero before
+ * cgroup removal can be committed.  This is implemented by
+ * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
+ * called multiple times until all css refcnts reach zero and is allowed to
+ * veto removal on any invocation.  This behavior is deprecated and will be
+ * removed as soon as the existing user (memcg) is updated.
+ *
+ * If clear is not set, each css holds an extra reference to the cgroup's
+ * dentry and cgroup removal proceeds regardless of css refs.
+ * ->pre_destroy() will be called at least once and is not allowed to fail.
+ * On the last put of each css, whenever that may be, the extra dentry ref
+ * is put so that dentry destruction happens only after all css's are
+ * released.
  */
-
 static int cgroup_clear_css_refs(struct cgroup *cgrp)
 {
         struct cgroup_subsys *ss;
         unsigned long flags;
         bool failed = false;
+
         local_irq_save(flags);
+
+        /*
+         * Block new css_tryget() by deactivating refcnt.  If all refcnts
+         * for subsystems w/ clear_css_refs set were 1 at the moment of
+         * deactivation, we succeeded.
+         */
         for_each_subsys(cgrp->root, ss) {
                 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-                int refcnt;
-                while (1) {
-                        /* We can only remove a CSS with a refcnt==1 */
-                        refcnt = atomic_read(&css->refcnt);
-                        if (refcnt > 1) {
-                                failed = true;
-                                goto done;
-                        }
-                        BUG_ON(!refcnt);
-                        /*
-                         * Drop the refcnt to 0 while we check other
-                         * subsystems. This will cause any racing
-                         * css_tryget() to spin until we set the
-                         * CSS_REMOVED bits or abort
-                         */
-                        if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
-                                break;
-                        cpu_relax();
-                }
+
+                WARN_ON(atomic_read(&css->refcnt) < 0);
+                atomic_add(CSS_DEACT_BIAS, &css->refcnt);
+
+                if (ss->__DEPRECATED_clear_css_refs)
+                        failed |= css_refcnt(css) != 1;
         }
-done:
+
+        /*
+         * If succeeded, set REMOVED and put all the base refs; otherwise,
+         * restore refcnts to positive values.  Either way, all in-progress
+         * css_tryget() will be released.
+         */
         for_each_subsys(cgrp->root, ss) {
                 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-                if (failed) {
-                        /*
-                         * Restore old refcnt if we previously managed
-                         * to clear it from 1 to 0
-                         */
-                        if (!atomic_read(&css->refcnt))
-                                atomic_set(&css->refcnt, 1);
-                } else {
-                        /* Commit the fact that the CSS is removed */
+
+                if (!failed) {
                         set_bit(CSS_REMOVED, &css->flags);
+                        css_put(css);
+                } else {
+                        atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
                 }
         }
+
         local_irq_restore(flags);
         return !failed;
 }
@@ -3995,6 +4241,8 @@ again:
         list_del_init(&cgrp->sibling);
         cgroup_unlock_hierarchy(cgrp->root);
 
+        list_del_init(&cgrp->allcg_node);
+
         d = dget(cgrp->dentry);
 
         cgroup_d_remove_dir(d);
@@ -4021,12 +4269,29 @@ again:
         return 0;
 }
 
+static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
+{
+        INIT_LIST_HEAD(&ss->cftsets);
+
+        /*
+         * base_cftset is embedded in subsys itself, no need to worry about
+         * deregistration.
+         */
+        if (ss->base_cftypes) {
+                ss->base_cftset.cfts = ss->base_cftypes;
+                list_add_tail(&ss->base_cftset.node, &ss->cftsets);
+        }
+}
+
 static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 {
         struct cgroup_subsys_state *css;
 
         printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
 
+        /* init base cftset */
+        cgroup_init_cftsets(ss);
+
         /* Create the top cgroup state for this subsystem */
         list_add(&ss->sibling, &rootnode.subsys_list);
         ss->root = &rootnode;
@@ -4096,6 +4361,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
                 return 0;
         }
 
+        /* init base cftset */
+        cgroup_init_cftsets(ss);
+
         /*
          * need to register a subsys id before anything else - for example,
          * init_cgroup_css needs it.
@@ -4685,21 +4953,41 @@ static void check_for_release(struct cgroup *cgrp)
 }
 
 /* Caller must verify that the css is not for root cgroup */
-void __css_put(struct cgroup_subsys_state *css, int count)
+bool __css_tryget(struct cgroup_subsys_state *css)
+{
+        do {
+                int v = css_refcnt(css);
+
+                if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
+                        return true;
+                cpu_relax();
+        } while (!test_bit(CSS_REMOVED, &css->flags));
+
+        return false;
+}
+EXPORT_SYMBOL_GPL(__css_tryget);
+
+/* Caller must verify that the css is not for root cgroup */
+void __css_put(struct cgroup_subsys_state *css)
 {
         struct cgroup *cgrp = css->cgroup;
-        int val;
+
         rcu_read_lock();
-        val = atomic_sub_return(count, &css->refcnt);
-        if (val == 1) {
+        atomic_dec(&css->refcnt);
+        switch (css_refcnt(css)) {
+        case 1:
                 if (notify_on_release(cgrp)) {
                         set_bit(CGRP_RELEASABLE, &cgrp->flags);
                         check_for_release(cgrp);
                 }
                 cgroup_wakeup_rmdir_waiter(cgrp);
+                break;
+        case 0:
+                if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
+                        schedule_work(&css->dput_work);
+                break;
         }
         rcu_read_unlock();
-        WARN_ON_ONCE(val < 1);
 }
 EXPORT_SYMBOL_GPL(__css_put);
 
@@ -4818,7 +5106,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
          * on this or this is under rcu_read_lock(). Once css->id is allocated,
          * it's unchanged until freed.
          */
-        cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
+        cssid = rcu_dereference_check(css->id, css_refcnt(css));
 
         if (cssid)
                 return cssid->id;
@@ -4830,7 +5118,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
 {
         struct css_id *cssid;
 
-        cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
+        cssid = rcu_dereference_check(css->id, css_refcnt(css));
 
         if (cssid)
                 return cssid->depth;
@@ -5211,19 +5499,15 @@ static struct cftype debug_files[] = {
                 .name = "releasable",
                 .read_u64 = releasable_read,
         },
-};
 
-static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
-        return cgroup_add_files(cont, ss, debug_files,
-                                ARRAY_SIZE(debug_files));
-}
+        { } /* terminate */
+};
 
 struct cgroup_subsys debug_subsys = {
         .name = "debug",
         .create = debug_create,
         .destroy = debug_destroy,
-        .populate = debug_populate,
         .subsys_id = debug_subsys_id,
+        .base_cftypes = debug_files,
 };
 #endif /* CONFIG_CGROUP_DEBUG */
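To make the new interface concrete, here is a minimal sketch of a controller using it, modelled on the debug/freezer conversions below. The "demo" controller, its subsys_id and the read callback are invented for illustration; the zero-named terminator, the CFTYPE_* flags, .base_cftypes and cgroup_add_cftypes()/cgroup_rm_cftypes() are the pieces introduced above:

/* sketch only: a hypothetical "demo" controller on the new cftype API */
static u64 demo_count_read(struct cgroup *cgrp, struct cftype *cft)
{
        return 0;
}

static struct cftype demo_files[] = {
        {
                .name = "count",
                .flags = CFTYPE_NOT_ON_ROOT,    /* skip creation in the root cgroup */
                .read_u64 = demo_count_read,
        },
        { } /* terminate: empty name ends the array */
};

struct cgroup_subsys demo_subsys = {
        .name = "demo",
        .subsys_id = demo_subsys_id,            /* hypothetical id */
        .base_cftypes = demo_files,             /* created for every cgroup */
};

Files listed in .base_cftypes are created automatically for every cgroup of the hierarchy; extra cftype arrays can also be registered or unregistered at any time with cgroup_add_cftypes() and cgroup_rm_cftypes(), replacing the old ->populate() callback.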
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index f86e93920b6..3649fc6b3ea 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -358,24 +358,19 @@ static int freezer_write(struct cgroup *cgroup,
 static struct cftype files[] = {
         {
                 .name = "state",
+                .flags = CFTYPE_NOT_ON_ROOT,
                 .read_seq_string = freezer_read,
                 .write_string = freezer_write,
         },
+        { } /* terminate */
 };
 
-static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
-{
-        if (!cgroup->parent)
-                return 0;
-        return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
-}
-
 struct cgroup_subsys freezer_subsys = {
         .name = "freezer",
         .create = freezer_create,
         .destroy = freezer_destroy,
-        .populate = freezer_populate,
         .subsys_id = freezer_subsys_id,
         .can_attach = freezer_can_attach,
         .fork = freezer_fork,
+        .base_cftypes = files,
 };
diff --git a/kernel/compat.c b/kernel/compat.c
index 74ff8498809..c28a306ae05 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -372,25 +372,54 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
 
 #ifdef __ARCH_WANT_SYS_SIGPROCMASK
 
-asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
-                compat_old_sigset_t __user *oset)
+/*
+ * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the
+ * blocked set of signals to the supplied signal set
+ */
+static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
 {
-        old_sigset_t s;
-        long ret;
-        mm_segment_t old_fs;
+        memcpy(blocked->sig, &set, sizeof(set));
+}
 
-        if (set && get_user(s, set))
-                return -EFAULT;
-        old_fs = get_fs();
-        set_fs(KERNEL_DS);
-        ret = sys_sigprocmask(how,
-                              set ? (old_sigset_t __user *) &s : NULL,
-                              oset ? (old_sigset_t __user *) &s : NULL);
-        set_fs(old_fs);
-        if (ret == 0)
-                if (oset)
-                        ret = put_user(s, oset);
-        return ret;
+asmlinkage long compat_sys_sigprocmask(int how,
+                                       compat_old_sigset_t __user *nset,
+                                       compat_old_sigset_t __user *oset)
+{
+        old_sigset_t old_set, new_set;
+        sigset_t new_blocked;
+
+        old_set = current->blocked.sig[0];
+
+        if (nset) {
+                if (get_user(new_set, nset))
+                        return -EFAULT;
+                new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
+
+                new_blocked = current->blocked;
+
+                switch (how) {
+                case SIG_BLOCK:
+                        sigaddsetmask(&new_blocked, new_set);
+                        break;
+                case SIG_UNBLOCK:
+                        sigdelsetmask(&new_blocked, new_set);
+                        break;
+                case SIG_SETMASK:
+                        compat_sig_setmask(&new_blocked, new_set);
+                        break;
+                default:
+                        return -EINVAL;
+                }
+
+                set_current_blocked(&new_blocked);
+        }
+
+        if (oset) {
+                if (put_user(old_set, oset))
+                        return -EFAULT;
+        }
+
+        return 0;
 }
 
 #endif
@@ -1044,15 +1073,7 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
         if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
                 return -EFAULT;
         sigset_from_compat(&newset, &newset32);
-        sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
-
-        current->saved_sigmask = current->blocked;
-        set_current_blocked(&newset);
-
-        current->state = TASK_INTERRUPTIBLE;
-        schedule();
-        set_restore_sigmask();
-        return -ERESTARTNOHAND;
+        return sigsuspend(&newset);
 }
 #endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
 
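The rewritten compat path implements sigprocmask() semantics directly instead of bouncing through set_fs(): SIG_BLOCK ORs the supplied word into the blocked set, SIG_UNBLOCK clears it, and SIG_SETMASK replaces only the first (compat) word, per the comment above. For reference, the same three cases as seen from userspace, using only standard POSIX calls (not part of the patch):

#include <signal.h>
#include <stdio.h>

int main(void)
{
        sigset_t set, old;

        sigemptyset(&set);
        sigaddset(&set, SIGUSR1);

        sigprocmask(SIG_BLOCK, &set, &old);     /* add SIGUSR1 to the blocked set */
        sigprocmask(SIG_UNBLOCK, &set, NULL);   /* remove it again */
        sigprocmask(SIG_SETMASK, &old, NULL);   /* restore the original mask */

        printf("SIGUSR1 was blocked before? %d\n", sigismember(&old, SIGUSR1));
        return 0;
}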
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2060c6e5702..0e6353cf147 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -17,6 +17,8 @@
 #include <linux/gfp.h>
 #include <linux/suspend.h>
 
+#include "smpboot.h"
+
 #ifdef CONFIG_SMP
 /* Serializes the updates to cpu_online_mask, cpu_present_mask */
 static DEFINE_MUTEX(cpu_add_remove_lock);
@@ -295,11 +297,19 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
         int ret, nr_calls = 0;
         void *hcpu = (void *)(long)cpu;
         unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
+        struct task_struct *idle;
 
         if (cpu_online(cpu) || !cpu_present(cpu))
                 return -EINVAL;
 
         cpu_hotplug_begin();
+
+        idle = idle_thread_get(cpu);
+        if (IS_ERR(idle)) {
+                ret = PTR_ERR(idle);
+                goto out;
+        }
+
         ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
         if (ret) {
                 nr_calls--;
@@ -309,7 +319,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
         }
 
         /* Arch-specific enabling code. */
-        ret = __cpu_up(cpu);
+        ret = __cpu_up(cpu, idle);
         if (ret != 0)
                 goto out_notify;
         BUG_ON(!cpu_online(cpu));
@@ -320,6 +330,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 out_notify:
         if (ret != 0)
                 __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+out:
         cpu_hotplug_done();
 
         return ret;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 14f7070b4ba..8c8bd652dd1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1765,28 +1765,17 @@ static struct cftype files[] = {
                 .write_u64 = cpuset_write_u64,
                 .private = FILE_SPREAD_SLAB,
         },
-};
-
-static struct cftype cft_memory_pressure_enabled = {
-        .name = "memory_pressure_enabled",
-        .read_u64 = cpuset_read_u64,
-        .write_u64 = cpuset_write_u64,
-        .private = FILE_MEMORY_PRESSURE_ENABLED,
-};
 
-static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
-        int err;
+        {
+                .name = "memory_pressure_enabled",
+                .flags = CFTYPE_ONLY_ON_ROOT,
+                .read_u64 = cpuset_read_u64,
+                .write_u64 = cpuset_write_u64,
+                .private = FILE_MEMORY_PRESSURE_ENABLED,
+        },
 
-        err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
-        if (err)
-                return err;
-        /* memory_pressure_enabled is in root cpuset only */
-        if (!cont->parent)
-                err = cgroup_add_file(cont, ss,
-                                       &cft_memory_pressure_enabled);
-        return err;
-}
+        { } /* terminate */
+};
 
 /*
  * post_clone() is called during cgroup_create() when the
@@ -1887,9 +1876,9 @@ struct cgroup_subsys cpuset_subsys = {
         .destroy = cpuset_destroy,
         .can_attach = cpuset_can_attach,
         .attach = cpuset_attach,
-        .populate = cpuset_populate,
         .post_clone = cpuset_post_clone,
         .subsys_id = cpuset_subsys_id,
+        .base_cftypes = files,
         .early_init = 1,
 };
 
diff --git a/kernel/cred.c b/kernel/cred.c index e70683d9ec3..430557ea488 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
@@ -49,6 +49,14 @@ struct cred init_cred = { | |||
49 | .subscribers = ATOMIC_INIT(2), | 49 | .subscribers = ATOMIC_INIT(2), |
50 | .magic = CRED_MAGIC, | 50 | .magic = CRED_MAGIC, |
51 | #endif | 51 | #endif |
52 | .uid = GLOBAL_ROOT_UID, | ||
53 | .gid = GLOBAL_ROOT_GID, | ||
54 | .suid = GLOBAL_ROOT_UID, | ||
55 | .sgid = GLOBAL_ROOT_GID, | ||
56 | .euid = GLOBAL_ROOT_UID, | ||
57 | .egid = GLOBAL_ROOT_GID, | ||
58 | .fsuid = GLOBAL_ROOT_UID, | ||
59 | .fsgid = GLOBAL_ROOT_GID, | ||
52 | .securebits = SECUREBITS_DEFAULT, | 60 | .securebits = SECUREBITS_DEFAULT, |
53 | .cap_inheritable = CAP_EMPTY_SET, | 61 | .cap_inheritable = CAP_EMPTY_SET, |
54 | .cap_permitted = CAP_FULL_SET, | 62 | .cap_permitted = CAP_FULL_SET, |
@@ -148,6 +156,7 @@ static void put_cred_rcu(struct rcu_head *rcu) | |||
148 | if (cred->group_info) | 156 | if (cred->group_info) |
149 | put_group_info(cred->group_info); | 157 | put_group_info(cred->group_info); |
150 | free_uid(cred->user); | 158 | free_uid(cred->user); |
159 | put_user_ns(cred->user_ns); | ||
151 | kmem_cache_free(cred_jar, cred); | 160 | kmem_cache_free(cred_jar, cred); |
152 | } | 161 | } |
153 | 162 | ||
@@ -303,6 +312,7 @@ struct cred *prepare_creds(void) | |||
303 | set_cred_subscribers(new, 0); | 312 | set_cred_subscribers(new, 0); |
304 | get_group_info(new->group_info); | 313 | get_group_info(new->group_info); |
305 | get_uid(new->user); | 314 | get_uid(new->user); |
315 | get_user_ns(new->user_ns); | ||
306 | 316 | ||
307 | #ifdef CONFIG_KEYS | 317 | #ifdef CONFIG_KEYS |
308 | key_get(new->thread_keyring); | 318 | key_get(new->thread_keyring); |
@@ -414,11 +424,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) | |||
414 | goto error_put; | 424 | goto error_put; |
415 | } | 425 | } |
416 | 426 | ||
417 | /* cache user_ns in cred. Doesn't need a refcount because it will | ||
418 | * stay pinned by cred->user | ||
419 | */ | ||
420 | new->user_ns = new->user->user_ns; | ||
421 | |||
422 | #ifdef CONFIG_KEYS | 427 | #ifdef CONFIG_KEYS |
423 | /* new threads get their own thread keyrings if their parent already | 428 | /* new threads get their own thread keyrings if their parent already |
424 | * had one */ | 429 | * had one */ |
@@ -493,10 +498,10 @@ int commit_creds(struct cred *new) | |||
493 | get_cred(new); /* we will require a ref for the subj creds too */ | 498 | get_cred(new); /* we will require a ref for the subj creds too */ |
494 | 499 | ||
495 | /* dumpability changes */ | 500 | /* dumpability changes */ |
496 | if (old->euid != new->euid || | 501 | if (!uid_eq(old->euid, new->euid) || |
497 | old->egid != new->egid || | 502 | !gid_eq(old->egid, new->egid) || |
498 | old->fsuid != new->fsuid || | 503 | !uid_eq(old->fsuid, new->fsuid) || |
499 | old->fsgid != new->fsgid || | 504 | !gid_eq(old->fsgid, new->fsgid) || |
500 | !cap_issubset(new->cap_permitted, old->cap_permitted)) { | 505 | !cap_issubset(new->cap_permitted, old->cap_permitted)) { |
501 | if (task->mm) | 506 | if (task->mm) |
502 | set_dumpable(task->mm, suid_dumpable); | 507 | set_dumpable(task->mm, suid_dumpable); |
@@ -505,9 +510,9 @@ int commit_creds(struct cred *new) | |||
505 | } | 510 | } |
506 | 511 | ||
507 | /* alter the thread keyring */ | 512 | /* alter the thread keyring */ |
508 | if (new->fsuid != old->fsuid) | 513 | if (!uid_eq(new->fsuid, old->fsuid)) |
509 | key_fsuid_changed(task); | 514 | key_fsuid_changed(task); |
510 | if (new->fsgid != old->fsgid) | 515 | if (!gid_eq(new->fsgid, old->fsgid)) |
511 | key_fsgid_changed(task); | 516 | key_fsgid_changed(task); |
512 | 517 | ||
513 | /* do it | 518 | /* do it |
@@ -524,16 +529,16 @@ int commit_creds(struct cred *new) | |||
524 | alter_cred_subscribers(old, -2); | 529 | alter_cred_subscribers(old, -2); |
525 | 530 | ||
526 | /* send notifications */ | 531 | /* send notifications */ |
527 | if (new->uid != old->uid || | 532 | if (!uid_eq(new->uid, old->uid) || |
528 | new->euid != old->euid || | 533 | !uid_eq(new->euid, old->euid) || |
529 | new->suid != old->suid || | 534 | !uid_eq(new->suid, old->suid) || |
530 | new->fsuid != old->fsuid) | 535 | !uid_eq(new->fsuid, old->fsuid)) |
531 | proc_id_connector(task, PROC_EVENT_UID); | 536 | proc_id_connector(task, PROC_EVENT_UID); |
532 | 537 | ||
533 | if (new->gid != old->gid || | 538 | if (!gid_eq(new->gid, old->gid) || |
534 | new->egid != old->egid || | 539 | !gid_eq(new->egid, old->egid) || |
535 | new->sgid != old->sgid || | 540 | !gid_eq(new->sgid, old->sgid) || |
536 | new->fsgid != old->fsgid) | 541 | !gid_eq(new->fsgid, old->fsgid)) |
537 | proc_id_connector(task, PROC_EVENT_GID); | 542 | proc_id_connector(task, PROC_EVENT_GID); |
538 | 543 | ||
539 | /* release the old obj and subj refs both */ | 544 | /* release the old obj and subj refs both */ |
@@ -678,6 +683,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) | |||
678 | atomic_set(&new->usage, 1); | 683 | atomic_set(&new->usage, 1); |
679 | set_cred_subscribers(new, 0); | 684 | set_cred_subscribers(new, 0); |
680 | get_uid(new->user); | 685 | get_uid(new->user); |
686 | get_user_ns(new->user_ns); | ||
681 | get_group_info(new->group_info); | 687 | get_group_info(new->group_info); |
682 | 688 | ||
683 | #ifdef CONFIG_KEYS | 689 | #ifdef CONFIG_KEYS |
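
The cred.c hunks above replace raw ==/!= comparisons on uids and gids with the uid_eq()/gid_eq() helpers, because the credential fields become wrapper types (kuid_t/kgid_t) that cannot be compared with plain operators. A minimal userspace sketch of that wrapper-plus-helper pattern; the xuid_t names are illustrative, not the kernel's definitions:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for kuid_t: wrapping the raw value in a struct
 * makes an accidental "a == b" or "a > b" a compile error, forcing all
 * callers through the comparison helpers. */
typedef struct { unsigned int val; } xuid_t;

static inline bool xuid_eq(xuid_t a, xuid_t b) { return a.val == b.val; }
static inline bool xuid_gt(xuid_t a, xuid_t b) { return a.val > b.val; }

int main(void)
{
        xuid_t old_euid = { 1000 }, new_euid = { 0 };

        if (!xuid_eq(old_euid, new_euid))
                printf("euid changed: %u -> %u\n", old_euid.val, new_euid.val);
        return 0;
}
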
diff --git a/kernel/events/core.c b/kernel/events/core.c index e82c7a1face..5b06cbbf693 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -2039,8 +2039,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, | |||
2039 | * accessing the event control register. If a NMI hits, then it will | 2039 | * accessing the event control register. If a NMI hits, then it will |
2040 | * not restart the event. | 2040 | * not restart the event. |
2041 | */ | 2041 | */ |
2042 | static void __perf_event_task_sched_out(struct task_struct *task, | 2042 | void __perf_event_task_sched_out(struct task_struct *task, |
2043 | struct task_struct *next) | 2043 | struct task_struct *next) |
2044 | { | 2044 | { |
2045 | int ctxn; | 2045 | int ctxn; |
2046 | 2046 | ||
@@ -2279,8 +2279,8 @@ static void perf_branch_stack_sched_in(struct task_struct *prev, | |||
2279 | * accessing the event control register. If a NMI hits, then it will | 2279 | * accessing the event control register. If a NMI hits, then it will |
2280 | * keep the event running. | 2280 | * keep the event running. |
2281 | */ | 2281 | */ |
2282 | static void __perf_event_task_sched_in(struct task_struct *prev, | 2282 | void __perf_event_task_sched_in(struct task_struct *prev, |
2283 | struct task_struct *task) | 2283 | struct task_struct *task) |
2284 | { | 2284 | { |
2285 | struct perf_event_context *ctx; | 2285 | struct perf_event_context *ctx; |
2286 | int ctxn; | 2286 | int ctxn; |
@@ -2305,12 +2305,6 @@ static void __perf_event_task_sched_in(struct task_struct *prev, | |||
2305 | perf_branch_stack_sched_in(prev, task); | 2305 | perf_branch_stack_sched_in(prev, task); |
2306 | } | 2306 | } |
2307 | 2307 | ||
2308 | void __perf_event_task_sched(struct task_struct *prev, struct task_struct *next) | ||
2309 | { | ||
2310 | __perf_event_task_sched_out(prev, next); | ||
2311 | __perf_event_task_sched_in(prev, next); | ||
2312 | } | ||
2313 | |||
2314 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) | 2308 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) |
2315 | { | 2309 | { |
2316 | u64 frequency = event->attr.sample_freq; | 2310 | u64 frequency = event->attr.sample_freq; |
@@ -3189,7 +3183,7 @@ static void perf_event_for_each(struct perf_event *event, | |||
3189 | perf_event_for_each_child(event, func); | 3183 | perf_event_for_each_child(event, func); |
3190 | func(event); | 3184 | func(event); |
3191 | list_for_each_entry(sibling, &event->sibling_list, group_entry) | 3185 | list_for_each_entry(sibling, &event->sibling_list, group_entry) |
3192 | perf_event_for_each_child(event, func); | 3186 | perf_event_for_each_child(sibling, func); |
3193 | mutex_unlock(&ctx->mutex); | 3187 | mutex_unlock(&ctx->mutex); |
3194 | } | 3188 | } |
3195 | 3189 | ||
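
The perf_event_for_each() change is an iterator slip: the loop walks event->sibling_list but previously passed the group leader to perf_event_for_each_child() instead of the current sibling, so the siblings were never visited. A reduced illustration of the corrected shape (generic list code, not the perf implementation):

#include <stdio.h>

struct node { int id; struct node *next; };

static void visit(struct node *n) { printf("visiting %d\n", n->id); }

int main(void)
{
        struct node c = { 3, NULL }, b = { 2, &c }, leader = { 1, &b };
        struct node *sibling;

        visit(&leader);
        for (sibling = leader.next; sibling; sibling = sibling->next)
                visit(sibling);   /* the fix: pass 'sibling', not '&leader' */
        return 0;
}
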
diff --git a/kernel/exit.c b/kernel/exit.c index d8bd3b425fa..910a0716e17 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -1214,7 +1214,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1214 | unsigned long state; | 1214 | unsigned long state; |
1215 | int retval, status, traced; | 1215 | int retval, status, traced; |
1216 | pid_t pid = task_pid_vnr(p); | 1216 | pid_t pid = task_pid_vnr(p); |
1217 | uid_t uid = __task_cred(p)->uid; | 1217 | uid_t uid = from_kuid_munged(current_user_ns(), __task_cred(p)->uid); |
1218 | struct siginfo __user *infop; | 1218 | struct siginfo __user *infop; |
1219 | 1219 | ||
1220 | if (!likely(wo->wo_flags & WEXITED)) | 1220 | if (!likely(wo->wo_flags & WEXITED)) |
@@ -1427,7 +1427,7 @@ static int wait_task_stopped(struct wait_opts *wo, | |||
1427 | if (!unlikely(wo->wo_flags & WNOWAIT)) | 1427 | if (!unlikely(wo->wo_flags & WNOWAIT)) |
1428 | *p_code = 0; | 1428 | *p_code = 0; |
1429 | 1429 | ||
1430 | uid = task_uid(p); | 1430 | uid = from_kuid_munged(current_user_ns(), task_uid(p)); |
1431 | unlock_sig: | 1431 | unlock_sig: |
1432 | spin_unlock_irq(&p->sighand->siglock); | 1432 | spin_unlock_irq(&p->sighand->siglock); |
1433 | if (!exit_code) | 1433 | if (!exit_code) |
@@ -1500,7 +1500,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) | |||
1500 | } | 1500 | } |
1501 | if (!unlikely(wo->wo_flags & WNOWAIT)) | 1501 | if (!unlikely(wo->wo_flags & WNOWAIT)) |
1502 | p->signal->flags &= ~SIGNAL_STOP_CONTINUED; | 1502 | p->signal->flags &= ~SIGNAL_STOP_CONTINUED; |
1503 | uid = task_uid(p); | 1503 | uid = from_kuid_munged(current_user_ns(), task_uid(p)); |
1504 | spin_unlock_irq(&p->sighand->siglock); | 1504 | spin_unlock_irq(&p->sighand->siglock); |
1505 | 1505 | ||
1506 | pid = task_pid_vnr(p); | 1506 | pid = task_pid_vnr(p); |
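
The exit.c hunks stop copying the raw kernel-internal uid to the waiter and instead translate it with from_kuid_munged() into the calling process's user namespace, so a namespaced waiter sees either a mapped uid or the overflow uid. A short kernel-style sketch of the same pattern, kernel context assumed and using only helpers that appear in this diff:

/* Sketch (kernel context assumed): convert a task's kuid_t into a uid_t
 * that is meaningful to the current caller before reporting it. */
static uid_t uid_for_waiter(struct task_struct *p)
{
        return from_kuid_munged(current_user_ns(), task_uid(p));
}
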
diff --git a/kernel/extable.c b/kernel/extable.c index 5339705b824..fe35a634bf7 100644 --- a/kernel/extable.c +++ b/kernel/extable.c | |||
@@ -35,10 +35,16 @@ DEFINE_MUTEX(text_mutex); | |||
35 | extern struct exception_table_entry __start___ex_table[]; | 35 | extern struct exception_table_entry __start___ex_table[]; |
36 | extern struct exception_table_entry __stop___ex_table[]; | 36 | extern struct exception_table_entry __stop___ex_table[]; |
37 | 37 | ||
38 | /* Cleared by build time tools if the table is already sorted. */ | ||
39 | u32 __initdata main_extable_sort_needed = 1; | ||
40 | |||
38 | /* Sort the kernel's built-in exception table */ | 41 | /* Sort the kernel's built-in exception table */ |
39 | void __init sort_main_extable(void) | 42 | void __init sort_main_extable(void) |
40 | { | 43 | { |
41 | sort_extable(__start___ex_table, __stop___ex_table); | 44 | if (main_extable_sort_needed) |
45 | sort_extable(__start___ex_table, __stop___ex_table); | ||
46 | else | ||
47 | pr_notice("__ex_table already sorted, skipping sort\n"); | ||
42 | } | 48 | } |
43 | 49 | ||
44 | /* Given an address, look for it in the exception tables. */ | 50 | /* Given an address, look for it in the exception tables. */ |
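
The extable.c change lets a build-time tool pre-sort the exception table and clear main_extable_sort_needed, so the boot-time sort becomes a no-op. The same sort-once-then-flag shape in plain C; this is an illustrative stand-in, not the kernel's sort_extable():

#include <stdio.h>
#include <stdlib.h>

static int cmp_ulong(const void *a, const void *b)
{
        unsigned long x = *(const unsigned long *)a;
        unsigned long y = *(const unsigned long *)b;

        return (x > y) - (x < y);
}

/* Cleared ahead of time if the table is known to be sorted already. */
static int table_sort_needed = 1;

static void sort_table(unsigned long *tbl, size_t n)
{
        if (table_sort_needed)
                qsort(tbl, n, sizeof(*tbl), cmp_ulong);
        else
                puts("table already sorted, skipping sort");
}

int main(void)
{
        unsigned long tbl[] = { 30, 10, 20 };

        sort_table(tbl, 3);
        printf("%lu %lu %lu\n", tbl[0], tbl[1], tbl[2]);
        return 0;
}
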
diff --git a/kernel/fork.c b/kernel/fork.c index ca9a3845ef3..47b4e4f379f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/cgroup.h> | 34 | #include <linux/cgroup.h> |
35 | #include <linux/security.h> | 35 | #include <linux/security.h> |
36 | #include <linux/hugetlb.h> | 36 | #include <linux/hugetlb.h> |
37 | #include <linux/seccomp.h> | ||
37 | #include <linux/swap.h> | 38 | #include <linux/swap.h> |
38 | #include <linux/syscalls.h> | 39 | #include <linux/syscalls.h> |
39 | #include <linux/jiffies.h> | 40 | #include <linux/jiffies.h> |
@@ -47,6 +48,7 @@ | |||
47 | #include <linux/audit.h> | 48 | #include <linux/audit.h> |
48 | #include <linux/memcontrol.h> | 49 | #include <linux/memcontrol.h> |
49 | #include <linux/ftrace.h> | 50 | #include <linux/ftrace.h> |
51 | #include <linux/proc_fs.h> | ||
50 | #include <linux/profile.h> | 52 | #include <linux/profile.h> |
51 | #include <linux/rmap.h> | 53 | #include <linux/rmap.h> |
52 | #include <linux/ksm.h> | 54 | #include <linux/ksm.h> |
@@ -112,32 +114,67 @@ int nr_processes(void) | |||
112 | return total; | 114 | return total; |
113 | } | 115 | } |
114 | 116 | ||
115 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR | 117 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR |
116 | # define alloc_task_struct_node(node) \ | ||
117 | kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node) | ||
118 | # define free_task_struct(tsk) \ | ||
119 | kmem_cache_free(task_struct_cachep, (tsk)) | ||
120 | static struct kmem_cache *task_struct_cachep; | 118 | static struct kmem_cache *task_struct_cachep; |
119 | |||
120 | static inline struct task_struct *alloc_task_struct_node(int node) | ||
121 | { | ||
122 | return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); | ||
123 | } | ||
124 | |||
125 | void __weak arch_release_task_struct(struct task_struct *tsk) { } | ||
126 | |||
127 | static inline void free_task_struct(struct task_struct *tsk) | ||
128 | { | ||
129 | arch_release_task_struct(tsk); | ||
130 | kmem_cache_free(task_struct_cachep, tsk); | ||
131 | } | ||
121 | #endif | 132 | #endif |
122 | 133 | ||
123 | #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR | 134 | #ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR |
135 | void __weak arch_release_thread_info(struct thread_info *ti) { } | ||
136 | |||
137 | /* | ||
138 | * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a | ||
139 | * kmemcache based allocator. | ||
140 | */ | ||
141 | # if THREAD_SIZE >= PAGE_SIZE | ||
124 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, | 142 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, |
125 | int node) | 143 | int node) |
126 | { | 144 | { |
127 | #ifdef CONFIG_DEBUG_STACK_USAGE | 145 | struct page *page = alloc_pages_node(node, THREADINFO_GFP, |
128 | gfp_t mask = GFP_KERNEL | __GFP_ZERO; | 146 | THREAD_SIZE_ORDER); |
129 | #else | ||
130 | gfp_t mask = GFP_KERNEL; | ||
131 | #endif | ||
132 | struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER); | ||
133 | 147 | ||
134 | return page ? page_address(page) : NULL; | 148 | return page ? page_address(page) : NULL; |
135 | } | 149 | } |
136 | 150 | ||
137 | static inline void free_thread_info(struct thread_info *ti) | 151 | static inline void free_thread_info(struct thread_info *ti) |
138 | { | 152 | { |
153 | arch_release_thread_info(ti); | ||
139 | free_pages((unsigned long)ti, THREAD_SIZE_ORDER); | 154 | free_pages((unsigned long)ti, THREAD_SIZE_ORDER); |
140 | } | 155 | } |
156 | # else | ||
157 | static struct kmem_cache *thread_info_cache; | ||
158 | |||
159 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, | ||
160 | int node) | ||
161 | { | ||
162 | return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node); | ||
163 | } | ||
164 | |||
165 | static void free_thread_info(struct thread_info *ti) | ||
166 | { | ||
167 | arch_release_thread_info(ti); | ||
168 | kmem_cache_free(thread_info_cache, ti); | ||
169 | } | ||
170 | |||
171 | void thread_info_cache_init(void) | ||
172 | { | ||
173 | thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE, | ||
174 | THREAD_SIZE, 0, NULL); | ||
175 | BUG_ON(thread_info_cache == NULL); | ||
176 | } | ||
177 | # endif | ||
141 | #endif | 178 | #endif |
142 | 179 | ||
143 | /* SLAB cache for signal_struct structures (tsk->signal) */ | 180 | /* SLAB cache for signal_struct structures (tsk->signal) */ |
@@ -171,6 +208,7 @@ void free_task(struct task_struct *tsk) | |||
171 | free_thread_info(tsk->stack); | 208 | free_thread_info(tsk->stack); |
172 | rt_mutex_debug_task_free(tsk); | 209 | rt_mutex_debug_task_free(tsk); |
173 | ftrace_graph_exit_task(tsk); | 210 | ftrace_graph_exit_task(tsk); |
211 | put_seccomp_filter(tsk); | ||
174 | free_task_struct(tsk); | 212 | free_task_struct(tsk); |
175 | } | 213 | } |
176 | EXPORT_SYMBOL(free_task); | 214 | EXPORT_SYMBOL(free_task); |
@@ -204,17 +242,11 @@ void __put_task_struct(struct task_struct *tsk) | |||
204 | } | 242 | } |
205 | EXPORT_SYMBOL_GPL(__put_task_struct); | 243 | EXPORT_SYMBOL_GPL(__put_task_struct); |
206 | 244 | ||
207 | /* | 245 | void __init __weak arch_task_cache_init(void) { } |
208 | * macro override instead of weak attribute alias, to workaround | ||
209 | * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions. | ||
210 | */ | ||
211 | #ifndef arch_task_cache_init | ||
212 | #define arch_task_cache_init() | ||
213 | #endif | ||
214 | 246 | ||
215 | void __init fork_init(unsigned long mempages) | 247 | void __init fork_init(unsigned long mempages) |
216 | { | 248 | { |
217 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR | 249 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR |
218 | #ifndef ARCH_MIN_TASKALIGN | 250 | #ifndef ARCH_MIN_TASKALIGN |
219 | #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES | 251 | #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES |
220 | #endif | 252 | #endif |
@@ -261,8 +293,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
261 | int node = tsk_fork_get_node(orig); | 293 | int node = tsk_fork_get_node(orig); |
262 | int err; | 294 | int err; |
263 | 295 | ||
264 | prepare_to_copy(orig); | ||
265 | |||
266 | tsk = alloc_task_struct_node(node); | 296 | tsk = alloc_task_struct_node(node); |
267 | if (!tsk) | 297 | if (!tsk) |
268 | return NULL; | 298 | return NULL; |
@@ -1170,6 +1200,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1170 | goto fork_out; | 1200 | goto fork_out; |
1171 | 1201 | ||
1172 | ftrace_graph_init_task(p); | 1202 | ftrace_graph_init_task(p); |
1203 | get_seccomp_filter(p); | ||
1173 | 1204 | ||
1174 | rt_mutex_init_task(p); | 1205 | rt_mutex_init_task(p); |
1175 | 1206 | ||
@@ -1473,6 +1504,8 @@ bad_fork_cleanup_io: | |||
1473 | if (p->io_context) | 1504 | if (p->io_context) |
1474 | exit_io_context(p); | 1505 | exit_io_context(p); |
1475 | bad_fork_cleanup_namespaces: | 1506 | bad_fork_cleanup_namespaces: |
1507 | if (unlikely(clone_flags & CLONE_NEWPID)) | ||
1508 | pid_ns_release_proc(p->nsproxy->pid_ns); | ||
1476 | exit_task_namespaces(p); | 1509 | exit_task_namespaces(p); |
1477 | bad_fork_cleanup_mm: | 1510 | bad_fork_cleanup_mm: |
1478 | if (p->mm) | 1511 | if (p->mm) |
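
Several fork.c hunks drop the old macro-override workaround in favour of empty __weak functions (arch_task_cache_init(), arch_release_task_struct(), arch_release_thread_info()) that an architecture overrides simply by providing a strong definition of the same symbol. A standalone illustration of that weak-symbol hook pattern (GCC/Clang attribute; the hook name is illustrative):

#include <stdio.h>

/* Default hook: does nothing unless another object file provides a
 * strong definition with the same name, which the linker then prefers. */
void __attribute__((weak)) arch_release_hook(void *obj)
{
        (void)obj;
}

int main(void)
{
        int dummy;

        arch_release_hook(&dummy);   /* calls the weak no-op here */
        puts("released");
        return 0;
}
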
diff --git a/kernel/groups.c b/kernel/groups.c index 99b53d1eb7e..6b2588dd04f 100644 --- a/kernel/groups.c +++ b/kernel/groups.c | |||
@@ -31,7 +31,7 @@ struct group_info *groups_alloc(int gidsetsize) | |||
31 | group_info->blocks[0] = group_info->small_block; | 31 | group_info->blocks[0] = group_info->small_block; |
32 | else { | 32 | else { |
33 | for (i = 0; i < nblocks; i++) { | 33 | for (i = 0; i < nblocks; i++) { |
34 | gid_t *b; | 34 | kgid_t *b; |
35 | b = (void *)__get_free_page(GFP_USER); | 35 | b = (void *)__get_free_page(GFP_USER); |
36 | if (!b) | 36 | if (!b) |
37 | goto out_undo_partial_alloc; | 37 | goto out_undo_partial_alloc; |
@@ -66,18 +66,15 @@ EXPORT_SYMBOL(groups_free); | |||
66 | static int groups_to_user(gid_t __user *grouplist, | 66 | static int groups_to_user(gid_t __user *grouplist, |
67 | const struct group_info *group_info) | 67 | const struct group_info *group_info) |
68 | { | 68 | { |
69 | struct user_namespace *user_ns = current_user_ns(); | ||
69 | int i; | 70 | int i; |
70 | unsigned int count = group_info->ngroups; | 71 | unsigned int count = group_info->ngroups; |
71 | 72 | ||
72 | for (i = 0; i < group_info->nblocks; i++) { | 73 | for (i = 0; i < count; i++) { |
73 | unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); | 74 | gid_t gid; |
74 | unsigned int len = cp_count * sizeof(*grouplist); | 75 | gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i)); |
75 | 76 | if (put_user(gid, grouplist+i)) | |
76 | if (copy_to_user(grouplist, group_info->blocks[i], len)) | ||
77 | return -EFAULT; | 77 | return -EFAULT; |
78 | |||
79 | grouplist += NGROUPS_PER_BLOCK; | ||
80 | count -= cp_count; | ||
81 | } | 78 | } |
82 | return 0; | 79 | return 0; |
83 | } | 80 | } |
@@ -86,18 +83,21 @@ static int groups_to_user(gid_t __user *grouplist, | |||
86 | static int groups_from_user(struct group_info *group_info, | 83 | static int groups_from_user(struct group_info *group_info, |
87 | gid_t __user *grouplist) | 84 | gid_t __user *grouplist) |
88 | { | 85 | { |
86 | struct user_namespace *user_ns = current_user_ns(); | ||
89 | int i; | 87 | int i; |
90 | unsigned int count = group_info->ngroups; | 88 | unsigned int count = group_info->ngroups; |
91 | 89 | ||
92 | for (i = 0; i < group_info->nblocks; i++) { | 90 | for (i = 0; i < count; i++) { |
93 | unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); | 91 | gid_t gid; |
94 | unsigned int len = cp_count * sizeof(*grouplist); | 92 | kgid_t kgid; |
95 | 93 | if (get_user(gid, grouplist+i)) | |
96 | if (copy_from_user(group_info->blocks[i], grouplist, len)) | ||
97 | return -EFAULT; | 94 | return -EFAULT; |
98 | 95 | ||
99 | grouplist += NGROUPS_PER_BLOCK; | 96 | kgid = make_kgid(user_ns, gid); |
100 | count -= cp_count; | 97 | if (!gid_valid(kgid)) |
98 | return -EINVAL; | ||
99 | |||
100 | GROUP_AT(group_info, i) = kgid; | ||
101 | } | 101 | } |
102 | return 0; | 102 | return 0; |
103 | } | 103 | } |
@@ -117,9 +117,9 @@ static void groups_sort(struct group_info *group_info) | |||
117 | for (base = 0; base < max; base++) { | 117 | for (base = 0; base < max; base++) { |
118 | int left = base; | 118 | int left = base; |
119 | int right = left + stride; | 119 | int right = left + stride; |
120 | gid_t tmp = GROUP_AT(group_info, right); | 120 | kgid_t tmp = GROUP_AT(group_info, right); |
121 | 121 | ||
122 | while (left >= 0 && GROUP_AT(group_info, left) > tmp) { | 122 | while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) { |
123 | GROUP_AT(group_info, right) = | 123 | GROUP_AT(group_info, right) = |
124 | GROUP_AT(group_info, left); | 124 | GROUP_AT(group_info, left); |
125 | right = left; | 125 | right = left; |
@@ -132,7 +132,7 @@ static void groups_sort(struct group_info *group_info) | |||
132 | } | 132 | } |
133 | 133 | ||
134 | /* a simple bsearch */ | 134 | /* a simple bsearch */ |
135 | int groups_search(const struct group_info *group_info, gid_t grp) | 135 | int groups_search(const struct group_info *group_info, kgid_t grp) |
136 | { | 136 | { |
137 | unsigned int left, right; | 137 | unsigned int left, right; |
138 | 138 | ||
@@ -143,9 +143,9 @@ int groups_search(const struct group_info *group_info, gid_t grp) | |||
143 | right = group_info->ngroups; | 143 | right = group_info->ngroups; |
144 | while (left < right) { | 144 | while (left < right) { |
145 | unsigned int mid = (left+right)/2; | 145 | unsigned int mid = (left+right)/2; |
146 | if (grp > GROUP_AT(group_info, mid)) | 146 | if (gid_gt(grp, GROUP_AT(group_info, mid))) |
147 | left = mid + 1; | 147 | left = mid + 1; |
148 | else if (grp < GROUP_AT(group_info, mid)) | 148 | else if (gid_lt(grp, GROUP_AT(group_info, mid))) |
149 | right = mid; | 149 | right = mid; |
150 | else | 150 | else |
151 | return 1; | 151 | return 1; |
@@ -256,24 +256,24 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) | |||
256 | /* | 256 | /* |
257 | * Check whether we're fsgid/egid or in the supplemental group.. | 257 | * Check whether we're fsgid/egid or in the supplemental group.. |
258 | */ | 258 | */ |
259 | int in_group_p(gid_t grp) | 259 | int in_group_p(kgid_t grp) |
260 | { | 260 | { |
261 | const struct cred *cred = current_cred(); | 261 | const struct cred *cred = current_cred(); |
262 | int retval = 1; | 262 | int retval = 1; |
263 | 263 | ||
264 | if (grp != cred->fsgid) | 264 | if (!gid_eq(grp, cred->fsgid)) |
265 | retval = groups_search(cred->group_info, grp); | 265 | retval = groups_search(cred->group_info, grp); |
266 | return retval; | 266 | return retval; |
267 | } | 267 | } |
268 | 268 | ||
269 | EXPORT_SYMBOL(in_group_p); | 269 | EXPORT_SYMBOL(in_group_p); |
270 | 270 | ||
271 | int in_egroup_p(gid_t grp) | 271 | int in_egroup_p(kgid_t grp) |
272 | { | 272 | { |
273 | const struct cred *cred = current_cred(); | 273 | const struct cred *cred = current_cred(); |
274 | int retval = 1; | 274 | int retval = 1; |
275 | 275 | ||
276 | if (grp != cred->egid) | 276 | if (!gid_eq(grp, cred->egid)) |
277 | retval = groups_search(cred->group_info, grp); | 277 | retval = groups_search(cred->group_info, grp); |
278 | return retval; | 278 | return retval; |
279 | } | 279 | } |
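
groups.c keeps its insertion sort and binary search, only changing the element type to kgid_t and the comparisons to gid_eq()/gid_lt()/gid_gt(). The search itself is an ordinary binary search over a sorted array; a plain C version for reference (illustrative, not the kernel's GROUP_AT() block storage):

#include <stdio.h>

/* Returns 1 if grp is present in the sorted array, 0 otherwise. */
static int groups_search(const unsigned int *groups, unsigned int ngroups,
                         unsigned int grp)
{
        unsigned int left = 0, right = ngroups;

        while (left < right) {
                unsigned int mid = (left + right) / 2;

                if (grp > groups[mid])
                        left = mid + 1;
                else if (grp < groups[mid])
                        right = mid;
                else
                        return 1;
        }
        return 0;
}

int main(void)
{
        unsigned int groups[] = { 4, 24, 27, 100 };

        printf("%d %d\n", groups_search(groups, 4, 27),
               groups_search(groups, 4, 5));
        return 0;
}
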
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index c21449f85a2..6df614912b9 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
@@ -108,8 +108,10 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
108 | 108 | ||
109 | touch_nmi_watchdog(); | 109 | touch_nmi_watchdog(); |
110 | 110 | ||
111 | if (sysctl_hung_task_panic) | 111 | if (sysctl_hung_task_panic) { |
112 | trigger_all_cpu_backtrace(); | ||
112 | panic("hung_task: blocked tasks"); | 113 | panic("hung_task: blocked tasks"); |
114 | } | ||
113 | } | 115 | } |
114 | 116 | ||
115 | /* | 117 | /* |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6080f6bc8c3..fc275e4f629 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -379,8 +379,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) | |||
379 | * If its disabled or no action available | 379 | * If its disabled or no action available |
380 | * keep it masked and get out of here | 380 | * keep it masked and get out of here |
381 | */ | 381 | */ |
382 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) | 382 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { |
383 | desc->istate |= IRQS_PENDING; | ||
383 | goto out_unlock; | 384 | goto out_unlock; |
385 | } | ||
384 | 386 | ||
385 | handle_irq_event(desc); | 387 | handle_irq_event(desc); |
386 | 388 | ||
@@ -518,6 +520,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
518 | out_unlock: | 520 | out_unlock: |
519 | raw_spin_unlock(&desc->lock); | 521 | raw_spin_unlock(&desc->lock); |
520 | } | 522 | } |
523 | EXPORT_SYMBOL(handle_edge_irq); | ||
521 | 524 | ||
522 | #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER | 525 | #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER |
523 | /** | 526 | /** |
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index 97a8bfadc88..e75e29e4434 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h | |||
@@ -4,10 +4,10 @@ | |||
4 | 4 | ||
5 | #include <linux/kallsyms.h> | 5 | #include <linux/kallsyms.h> |
6 | 6 | ||
7 | #define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) | 7 | #define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) |
8 | #define PS(f) if (desc->istate & f) printk("%14s set\n", #f) | 8 | #define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f) |
9 | /* FIXME */ | 9 | /* FIXME */ |
10 | #define PD(f) do { } while (0) | 10 | #define ___PD(f) do { } while (0) |
11 | 11 | ||
12 | static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) | 12 | static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) |
13 | { | 13 | { |
@@ -23,23 +23,23 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) | |||
23 | print_symbol("%s\n", (unsigned long)desc->action->handler); | 23 | print_symbol("%s\n", (unsigned long)desc->action->handler); |
24 | } | 24 | } |
25 | 25 | ||
26 | P(IRQ_LEVEL); | 26 | ___P(IRQ_LEVEL); |
27 | P(IRQ_PER_CPU); | 27 | ___P(IRQ_PER_CPU); |
28 | P(IRQ_NOPROBE); | 28 | ___P(IRQ_NOPROBE); |
29 | P(IRQ_NOREQUEST); | 29 | ___P(IRQ_NOREQUEST); |
30 | P(IRQ_NOTHREAD); | 30 | ___P(IRQ_NOTHREAD); |
31 | P(IRQ_NOAUTOEN); | 31 | ___P(IRQ_NOAUTOEN); |
32 | 32 | ||
33 | PS(IRQS_AUTODETECT); | 33 | ___PS(IRQS_AUTODETECT); |
34 | PS(IRQS_REPLAY); | 34 | ___PS(IRQS_REPLAY); |
35 | PS(IRQS_WAITING); | 35 | ___PS(IRQS_WAITING); |
36 | PS(IRQS_PENDING); | 36 | ___PS(IRQS_PENDING); |
37 | 37 | ||
38 | PD(IRQS_INPROGRESS); | 38 | ___PD(IRQS_INPROGRESS); |
39 | PD(IRQS_DISABLED); | 39 | ___PD(IRQS_DISABLED); |
40 | PD(IRQS_MASKED); | 40 | ___PD(IRQS_MASKED); |
41 | } | 41 | } |
42 | 42 | ||
43 | #undef P | 43 | #undef ___P |
44 | #undef PS | 44 | #undef ___PS |
45 | #undef PD | 45 | #undef ___PD |
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index d86e254b95e..192a302d6cf 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
@@ -112,6 +112,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) | |||
112 | { | 112 | { |
113 | return radix_tree_lookup(&irq_desc_tree, irq); | 113 | return radix_tree_lookup(&irq_desc_tree, irq); |
114 | } | 114 | } |
115 | EXPORT_SYMBOL(irq_to_desc); | ||
115 | 116 | ||
116 | static void delete_irq_desc(unsigned int irq) | 117 | static void delete_irq_desc(unsigned int irq) |
117 | { | 118 | { |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 89a3ea82569..bb32326afe8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -565,8 +565,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
565 | * IRQF_TRIGGER_* but the PIC does not support multiple | 565 | * IRQF_TRIGGER_* but the PIC does not support multiple |
566 | * flow-types? | 566 | * flow-types? |
567 | */ | 567 | */ |
568 | pr_debug("No set_type function for IRQ %d (%s)\n", irq, | 568 | pr_debug("genirq: No set_type function for IRQ %d (%s)\n", irq, |
569 | chip ? (chip->name ? : "unknown") : "unknown"); | 569 | chip ? (chip->name ? : "unknown") : "unknown"); |
570 | return 0; | 570 | return 0; |
571 | } | 571 | } |
572 | 572 | ||
@@ -600,7 +600,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
600 | ret = 0; | 600 | ret = 0; |
601 | break; | 601 | break; |
602 | default: | 602 | default: |
603 | pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", | 603 | pr_err("genirq: Setting trigger mode %lu for irq %u failed (%pF)\n", |
604 | flags, irq, chip->irq_set_type); | 604 | flags, irq, chip->irq_set_type); |
605 | } | 605 | } |
606 | if (unmask) | 606 | if (unmask) |
@@ -837,8 +837,7 @@ void exit_irq_thread(void) | |||
837 | 837 | ||
838 | action = kthread_data(tsk); | 838 | action = kthread_data(tsk); |
839 | 839 | ||
840 | printk(KERN_ERR | 840 | pr_err("genirq: exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", |
841 | "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", | ||
842 | tsk->comm ? tsk->comm : "", tsk->pid, action->irq); | 841 | tsk->comm ? tsk->comm : "", tsk->pid, action->irq); |
843 | 842 | ||
844 | desc = irq_to_desc(action->irq); | 843 | desc = irq_to_desc(action->irq); |
@@ -878,7 +877,6 @@ static int | |||
878 | __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | 877 | __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) |
879 | { | 878 | { |
880 | struct irqaction *old, **old_ptr; | 879 | struct irqaction *old, **old_ptr; |
881 | const char *old_name = NULL; | ||
882 | unsigned long flags, thread_mask = 0; | 880 | unsigned long flags, thread_mask = 0; |
883 | int ret, nested, shared = 0; | 881 | int ret, nested, shared = 0; |
884 | cpumask_var_t mask; | 882 | cpumask_var_t mask; |
@@ -972,10 +970,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
972 | */ | 970 | */ |
973 | if (!((old->flags & new->flags) & IRQF_SHARED) || | 971 | if (!((old->flags & new->flags) & IRQF_SHARED) || |
974 | ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || | 972 | ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || |
975 | ((old->flags ^ new->flags) & IRQF_ONESHOT)) { | 973 | ((old->flags ^ new->flags) & IRQF_ONESHOT)) |
976 | old_name = old->name; | ||
977 | goto mismatch; | 974 | goto mismatch; |
978 | } | ||
979 | 975 | ||
980 | /* All handlers must agree on per-cpuness */ | 976 | /* All handlers must agree on per-cpuness */ |
981 | if ((old->flags & IRQF_PERCPU) != | 977 | if ((old->flags & IRQF_PERCPU) != |
@@ -1031,6 +1027,27 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1031 | * all existing action->thread_mask bits. | 1027 | * all existing action->thread_mask bits. |
1032 | */ | 1028 | */ |
1033 | new->thread_mask = 1 << ffz(thread_mask); | 1029 | new->thread_mask = 1 << ffz(thread_mask); |
1030 | |||
1031 | } else if (new->handler == irq_default_primary_handler) { | ||
1032 | /* | ||
1033 | * The interrupt was requested with handler = NULL, so | ||
1034 | * we use the default primary handler for it. But it | ||
1035 | * does not have the oneshot flag set. In combination | ||
1036 | * with level interrupts this is deadly, because the | ||
1037 | * default primary handler just wakes the thread, then | ||
1038 | * the irq line is reenabled, but the device still | ||
1039 | * has the level irq asserted. Rinse and repeat.... | ||
1040 | * | ||
1041 | * While this works for edge type interrupts, we play | ||
1042 | * it safe and reject unconditionally because we can't | ||
1043 | * say for sure which type this interrupt really | ||
1044 | * has. The type flags are unreliable as the | ||
1045 | * underlying chip implementation can override them. | ||
1046 | */ | ||
1047 | pr_err("genirq: Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", | ||
1048 | irq); | ||
1049 | ret = -EINVAL; | ||
1050 | goto out_mask; | ||
1034 | } | 1051 | } |
1035 | 1052 | ||
1036 | if (!shared) { | 1053 | if (!shared) { |
@@ -1078,7 +1095,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1078 | 1095 | ||
1079 | if (nmsk != omsk) | 1096 | if (nmsk != omsk) |
1080 | /* hope the handler works with current trigger mode */ | 1097 | /* hope the handler works with current trigger mode */ |
1081 | pr_warning("IRQ %d uses trigger mode %u; requested %u\n", | 1098 | pr_warning("genirq: irq %d uses trigger mode %u; requested %u\n", |
1082 | irq, nmsk, omsk); | 1099 | irq, nmsk, omsk); |
1083 | } | 1100 | } |
1084 | 1101 | ||
@@ -1115,14 +1132,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1115 | return 0; | 1132 | return 0; |
1116 | 1133 | ||
1117 | mismatch: | 1134 | mismatch: |
1118 | #ifdef CONFIG_DEBUG_SHIRQ | ||
1119 | if (!(new->flags & IRQF_PROBE_SHARED)) { | 1135 | if (!(new->flags & IRQF_PROBE_SHARED)) { |
1120 | printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); | 1136 | pr_err("genirq: Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n", |
1121 | if (old_name) | 1137 | irq, new->flags, new->name, old->flags, old->name); |
1122 | printk(KERN_ERR "current handler: %s\n", old_name); | 1138 | #ifdef CONFIG_DEBUG_SHIRQ |
1123 | dump_stack(); | 1139 | dump_stack(); |
1124 | } | ||
1125 | #endif | 1140 | #endif |
1141 | } | ||
1126 | ret = -EBUSY; | 1142 | ret = -EBUSY; |
1127 | 1143 | ||
1128 | out_mask: | 1144 | out_mask: |
@@ -1204,12 +1220,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
1204 | /* Found it - now remove it from the list of entries: */ | 1220 | /* Found it - now remove it from the list of entries: */ |
1205 | *action_ptr = action->next; | 1221 | *action_ptr = action->next; |
1206 | 1222 | ||
1207 | /* Currently used only by UML, might disappear one day: */ | ||
1208 | #ifdef CONFIG_IRQ_RELEASE_METHOD | ||
1209 | if (desc->irq_data.chip->release) | ||
1210 | desc->irq_data.chip->release(irq, dev_id); | ||
1211 | #endif | ||
1212 | |||
1213 | /* If this was the last handler, shut down the IRQ line: */ | 1223 | /* If this was the last handler, shut down the IRQ line: */ |
1214 | if (!desc->action) | 1224 | if (!desc->action) |
1215 | irq_shutdown(desc); | 1225 | irq_shutdown(desc); |
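
The new branch in __setup_irq() rejects a threaded request that relies on irq_default_primary_handler (handler == NULL) but lacks IRQF_ONESHOT, because a level-triggered line would be unmasked while still asserted and fire in a tight loop. A hedged sketch of a driver request that satisfies the check; kernel context assumed and the mydev names are illustrative:

/* No primary handler, so the line must stay masked until the thread has
 * run and cleared the device condition -- hence IRQF_ONESHOT. */
static irqreturn_t mydev_irq_thread(int irq, void *dev_id)
{
        /* ... talk to the device, clear its interrupt condition ... */
        return IRQ_HANDLED;
}

static int mydev_request_irq(unsigned int irq, void *dev)
{
        return request_threaded_irq(irq, NULL, mydev_irq_thread,
                                    IRQF_ONESHOT | IRQF_TRIGGER_LOW,
                                    "mydev", dev);
}
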
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 15e53b1766a..cb228bf2176 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c | |||
@@ -103,8 +103,13 @@ int check_wakeup_irqs(void) | |||
103 | int irq; | 103 | int irq; |
104 | 104 | ||
105 | for_each_irq_desc(irq, desc) { | 105 | for_each_irq_desc(irq, desc) { |
106 | /* | ||
107 | * Only interrupts which are marked as wakeup source | ||
108 | * and have not been disabled before the suspend check | ||
109 | * can abort suspend. | ||
110 | */ | ||
106 | if (irqd_is_wakeup_set(&desc->irq_data)) { | 111 | if (irqd_is_wakeup_set(&desc->irq_data)) { |
107 | if (desc->istate & IRQS_PENDING) | 112 | if (desc->depth == 1 && desc->istate & IRQS_PENDING) |
108 | return -EBUSY; | 113 | return -EBUSY; |
109 | continue; | 114 | continue; |
110 | } | 115 | } |
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 14dd5761e8c..6454db7b6a4 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c | |||
@@ -58,10 +58,13 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) | |||
58 | /* | 58 | /* |
59 | * We do not resend level type interrupts. Level type | 59 | * We do not resend level type interrupts. Level type |
60 | * interrupts are resent by hardware when they are still | 60 | * interrupts are resent by hardware when they are still |
61 | * active. | 61 | * active. Clear the pending bit so suspend/resume does not |
62 | * get confused. | ||
62 | */ | 63 | */ |
63 | if (irq_settings_is_level(desc)) | 64 | if (irq_settings_is_level(desc)) { |
65 | desc->istate &= ~IRQS_PENDING; | ||
64 | return; | 66 | return; |
67 | } | ||
65 | if (desc->istate & IRQS_REPLAY) | 68 | if (desc->istate & IRQS_REPLAY) |
66 | return; | 69 | return; |
67 | if (desc->istate & IRQS_PENDING) { | 70 | if (desc->istate & IRQS_PENDING) { |
diff --git a/kernel/kfifo.c b/kernel/kfifo.c index c744b88c44e..59dcf5b81d2 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c | |||
@@ -402,6 +402,7 @@ unsigned int __kfifo_max_r(unsigned int len, size_t recsize) | |||
402 | return max; | 402 | return max; |
403 | return len; | 403 | return len; |
404 | } | 404 | } |
405 | EXPORT_SYMBOL(__kfifo_max_r); | ||
405 | 406 | ||
406 | #define __KFIFO_PEEK(data, out, mask) \ | 407 | #define __KFIFO_PEEK(data, out, mask) \ |
407 | ((data)[(out) & (mask)]) | 408 | ((data)[(out) & (mask)]) |
diff --git a/kernel/module.c b/kernel/module.c index 78ac6ec1e42..4edbd9c11ac 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -2429,7 +2429,8 @@ static int copy_and_check(struct load_info *info, | |||
2429 | goto free_hdr; | 2429 | goto free_hdr; |
2430 | } | 2430 | } |
2431 | 2431 | ||
2432 | if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) { | 2432 | if (hdr->e_shoff >= len || |
2433 | hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) { | ||
2433 | err = -ENOEXEC; | 2434 | err = -ENOEXEC; |
2434 | goto free_hdr; | 2435 | goto free_hdr; |
2435 | } | 2436 | } |
@@ -2953,7 +2954,7 @@ static struct module *load_module(void __user *umod, | |||
2953 | 2954 | ||
2954 | /* Module is ready to execute: parsing args may do that. */ | 2955 | /* Module is ready to execute: parsing args may do that. */ |
2955 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, | 2956 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, |
2956 | -32768, 32767, NULL); | 2957 | -32768, 32767, &ddebug_dyndbg_module_param_cb); |
2957 | if (err < 0) | 2958 | if (err < 0) |
2958 | goto unlink; | 2959 | goto unlink; |
2959 | 2960 | ||
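
The copy_and_check() fix replaces len < e_shoff + e_shnum * sizeof(Elf_Shdr) with two separate comparisons, since the sum and product can wrap around on a crafted module image and defeat the bound check. The overflow-safe idiom in isolation, with illustrative field names:

#include <stddef.h>
#include <stdio.h>

/* Validate that a table of 'count' records of 'size' bytes starting at
 * offset 'off' fits inside a buffer of 'len' bytes, without computing
 * off + count * size directly (which could overflow). */
static int table_fits(size_t len, size_t off, size_t count, size_t size)
{
        if (off >= len)
                return 0;
        if (size && count > (len - off) / size)
                return 0;
        return 1;
}

int main(void)
{
        printf("%d\n", table_fits(4096, 64, 40, 64));         /* 1: fits          */
        printf("%d\n", table_fits(4096, 5000, 1, 64));        /* 0: bad offset    */
        printf("%d\n", table_fits(4096, 64, (size_t)-1, 64)); /* 0: would wrap    */
        return 0;
}
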
diff --git a/kernel/params.c b/kernel/params.c index f37d8263134..ed35345be53 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -85,11 +85,13 @@ bool parameq(const char *a, const char *b) | |||
85 | 85 | ||
86 | static int parse_one(char *param, | 86 | static int parse_one(char *param, |
87 | char *val, | 87 | char *val, |
88 | const char *doing, | ||
88 | const struct kernel_param *params, | 89 | const struct kernel_param *params, |
89 | unsigned num_params, | 90 | unsigned num_params, |
90 | s16 min_level, | 91 | s16 min_level, |
91 | s16 max_level, | 92 | s16 max_level, |
92 | int (*handle_unknown)(char *param, char *val)) | 93 | int (*handle_unknown)(char *param, char *val, |
94 | const char *doing)) | ||
93 | { | 95 | { |
94 | unsigned int i; | 96 | unsigned int i; |
95 | int err; | 97 | int err; |
@@ -104,8 +106,8 @@ static int parse_one(char *param, | |||
104 | if (!val && params[i].ops->set != param_set_bool | 106 | if (!val && params[i].ops->set != param_set_bool |
105 | && params[i].ops->set != param_set_bint) | 107 | && params[i].ops->set != param_set_bint) |
106 | return -EINVAL; | 108 | return -EINVAL; |
107 | pr_debug("They are equal! Calling %p\n", | 109 | pr_debug("handling %s with %p\n", param, |
108 | params[i].ops->set); | 110 | params[i].ops->set); |
109 | mutex_lock(¶m_lock); | 111 | mutex_lock(¶m_lock); |
110 | err = params[i].ops->set(val, ¶ms[i]); | 112 | err = params[i].ops->set(val, ¶ms[i]); |
111 | mutex_unlock(¶m_lock); | 113 | mutex_unlock(¶m_lock); |
@@ -114,11 +116,11 @@ static int parse_one(char *param, | |||
114 | } | 116 | } |
115 | 117 | ||
116 | if (handle_unknown) { | 118 | if (handle_unknown) { |
117 | pr_debug("Unknown argument: calling %p\n", handle_unknown); | 119 | pr_debug("doing %s: %s='%s'\n", doing, param, val); |
118 | return handle_unknown(param, val); | 120 | return handle_unknown(param, val, doing); |
119 | } | 121 | } |
120 | 122 | ||
121 | pr_debug("Unknown argument `%s'\n", param); | 123 | pr_debug("Unknown argument '%s'\n", param); |
122 | return -ENOENT; | 124 | return -ENOENT; |
123 | } | 125 | } |
124 | 126 | ||
@@ -175,49 +177,47 @@ static char *next_arg(char *args, char **param, char **val) | |||
175 | } | 177 | } |
176 | 178 | ||
177 | /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ | 179 | /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ |
178 | int parse_args(const char *name, | 180 | int parse_args(const char *doing, |
179 | char *args, | 181 | char *args, |
180 | const struct kernel_param *params, | 182 | const struct kernel_param *params, |
181 | unsigned num, | 183 | unsigned num, |
182 | s16 min_level, | 184 | s16 min_level, |
183 | s16 max_level, | 185 | s16 max_level, |
184 | int (*unknown)(char *param, char *val)) | 186 | int (*unknown)(char *param, char *val, const char *doing)) |
185 | { | 187 | { |
186 | char *param, *val; | 188 | char *param, *val; |
187 | 189 | ||
188 | pr_debug("Parsing ARGS: %s\n", args); | ||
189 | |||
190 | /* Chew leading spaces */ | 190 | /* Chew leading spaces */ |
191 | args = skip_spaces(args); | 191 | args = skip_spaces(args); |
192 | 192 | ||
193 | if (*args) | ||
194 | pr_debug("doing %s, parsing ARGS: '%s'\n", doing, args); | ||
195 | |||
193 | while (*args) { | 196 | while (*args) { |
194 | int ret; | 197 | int ret; |
195 | int irq_was_disabled; | 198 | int irq_was_disabled; |
196 | 199 | ||
197 | args = next_arg(args, ¶m, &val); | 200 | args = next_arg(args, ¶m, &val); |
198 | irq_was_disabled = irqs_disabled(); | 201 | irq_was_disabled = irqs_disabled(); |
199 | ret = parse_one(param, val, params, num, | 202 | ret = parse_one(param, val, doing, params, num, |
200 | min_level, max_level, unknown); | 203 | min_level, max_level, unknown); |
201 | if (irq_was_disabled && !irqs_disabled()) { | 204 | if (irq_was_disabled && !irqs_disabled()) |
202 | printk(KERN_WARNING "parse_args(): option '%s' enabled " | 205 | pr_warn("%s: option '%s' enabled irq's!\n", |
203 | "irq's!\n", param); | 206 | doing, param); |
204 | } | 207 | |
205 | switch (ret) { | 208 | switch (ret) { |
206 | case -ENOENT: | 209 | case -ENOENT: |
207 | printk(KERN_ERR "%s: Unknown parameter `%s'\n", | 210 | pr_err("%s: Unknown parameter `%s'\n", doing, param); |
208 | name, param); | ||
209 | return ret; | 211 | return ret; |
210 | case -ENOSPC: | 212 | case -ENOSPC: |
211 | printk(KERN_ERR | 213 | pr_err("%s: `%s' too large for parameter `%s'\n", |
212 | "%s: `%s' too large for parameter `%s'\n", | 214 | doing, val ?: "", param); |
213 | name, val ?: "", param); | ||
214 | return ret; | 215 | return ret; |
215 | case 0: | 216 | case 0: |
216 | break; | 217 | break; |
217 | default: | 218 | default: |
218 | printk(KERN_ERR | 219 | pr_err("%s: `%s' invalid for parameter `%s'\n", |
219 | "%s: `%s' invalid for parameter `%s'\n", | 220 | doing, val ?: "", param); |
220 | name, val ?: "", param); | ||
221 | return ret; | 221 | return ret; |
222 | } | 222 | } |
223 | } | 223 | } |
@@ -263,8 +263,7 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); | |||
263 | int param_set_charp(const char *val, const struct kernel_param *kp) | 263 | int param_set_charp(const char *val, const struct kernel_param *kp) |
264 | { | 264 | { |
265 | if (strlen(val) > 1024) { | 265 | if (strlen(val) > 1024) { |
266 | printk(KERN_ERR "%s: string parameter too long\n", | 266 | pr_err("%s: string parameter too long\n", kp->name); |
267 | kp->name); | ||
268 | return -ENOSPC; | 267 | return -ENOSPC; |
269 | } | 268 | } |
270 | 269 | ||
@@ -400,8 +399,7 @@ static int param_array(const char *name, | |||
400 | int len; | 399 | int len; |
401 | 400 | ||
402 | if (*num == max) { | 401 | if (*num == max) { |
403 | printk(KERN_ERR "%s: can only take %i arguments\n", | 402 | pr_err("%s: can only take %i arguments\n", name, max); |
404 | name, max); | ||
405 | return -EINVAL; | 403 | return -EINVAL; |
406 | } | 404 | } |
407 | len = strcspn(val, ","); | 405 | len = strcspn(val, ","); |
@@ -420,8 +418,7 @@ static int param_array(const char *name, | |||
420 | } while (save == ','); | 418 | } while (save == ','); |
421 | 419 | ||
422 | if (*num < min) { | 420 | if (*num < min) { |
423 | printk(KERN_ERR "%s: needs at least %i arguments\n", | 421 | pr_err("%s: needs at least %i arguments\n", name, min); |
424 | name, min); | ||
425 | return -EINVAL; | 422 | return -EINVAL; |
426 | } | 423 | } |
427 | return 0; | 424 | return 0; |
@@ -480,7 +477,7 @@ int param_set_copystring(const char *val, const struct kernel_param *kp) | |||
480 | const struct kparam_string *kps = kp->str; | 477 | const struct kparam_string *kps = kp->str; |
481 | 478 | ||
482 | if (strlen(val)+1 > kps->maxlen) { | 479 | if (strlen(val)+1 > kps->maxlen) { |
483 | printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", | 480 | pr_err("%s: string doesn't fit in %u chars.\n", |
484 | kp->name, kps->maxlen-1); | 481 | kp->name, kps->maxlen-1); |
485 | return -ENOSPC; | 482 | return -ENOSPC; |
486 | } | 483 | } |
@@ -750,11 +747,8 @@ static struct module_kobject * __init locate_module_kobject(const char *name) | |||
750 | #endif | 747 | #endif |
751 | if (err) { | 748 | if (err) { |
752 | kobject_put(&mk->kobj); | 749 | kobject_put(&mk->kobj); |
753 | printk(KERN_ERR | 750 | pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n", |
754 | "Module '%s' failed add to sysfs, error number %d\n", | ||
755 | name, err); | 751 | name, err); |
756 | printk(KERN_ERR | ||
757 | "The system will be unstable now.\n"); | ||
758 | return NULL; | 752 | return NULL; |
759 | } | 753 | } |
760 | 754 | ||
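
parse_args() now threads a "doing" string (the module name or a boot-time label) through to the unknown-parameter callback, which is what lets module.c pass ddebug_dyndbg_module_param_cb above. A sketch of a callback matching the new three-argument signature; kernel context assumed and the body is illustrative:

/* Handler for parameters that no registered kernel_param matched;
 * 'doing' names what is being parsed so diagnostics can say where the
 * option came from. */
static int unknown_param_cb(char *param, char *val, const char *doing)
{
        pr_debug("%s: unhandled option '%s'='%s'\n",
                 doing, param, val ? val : "");
        return 0;   /* 0 = accepted/ignored, negative errno = reject */
}
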
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index deb5461e321..8f9b4eb974e 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -103,6 +103,33 @@ config PM_SLEEP_SMP | |||
103 | select HOTPLUG | 103 | select HOTPLUG |
104 | select HOTPLUG_CPU | 104 | select HOTPLUG_CPU |
105 | 105 | ||
106 | config PM_AUTOSLEEP | ||
107 | bool "Opportunistic sleep" | ||
108 | depends on PM_SLEEP | ||
109 | default n | ||
110 | ---help--- | ||
111 | Allow the kernel to trigger a system transition into a global sleep | ||
112 | state automatically whenever there are no active wakeup sources. | ||
113 | |||
114 | config PM_WAKELOCKS | ||
115 | bool "User space wakeup sources interface" | ||
116 | depends on PM_SLEEP | ||
117 | default n | ||
118 | ---help--- | ||
119 | Allow user space to create, activate and deactivate wakeup source | ||
120 | objects with the help of a sysfs-based interface. | ||
121 | |||
122 | config PM_WAKELOCKS_LIMIT | ||
123 | int "Maximum number of user space wakeup sources (0 = no limit)" | ||
124 | range 0 100000 | ||
125 | default 100 | ||
126 | depends on PM_WAKELOCKS | ||
127 | |||
128 | config PM_WAKELOCKS_GC | ||
129 | bool "Garbage collector for user space wakeup sources" | ||
130 | depends on PM_WAKELOCKS | ||
131 | default y | ||
132 | |||
106 | config PM_RUNTIME | 133 | config PM_RUNTIME |
107 | bool "Run-time PM core functionality" | 134 | bool "Run-time PM core functionality" |
108 | depends on !IA64_HP_SIM | 135 | depends on !IA64_HP_SIM |
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 66d808ec525..29472bff11e 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
@@ -9,5 +9,7 @@ obj-$(CONFIG_SUSPEND) += suspend.o | |||
9 | obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o | 9 | obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o |
10 | obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ | 10 | obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ |
11 | block_io.o | 11 | block_io.o |
12 | obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o | ||
13 | obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o | ||
12 | 14 | ||
13 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o | 15 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o |
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c new file mode 100644 index 00000000000..ca304046d9e --- /dev/null +++ b/kernel/power/autosleep.c | |||
@@ -0,0 +1,127 @@ | |||
1 | /* | ||
2 | * kernel/power/autosleep.c | ||
3 | * | ||
4 | * Opportunistic sleep support. | ||
5 | * | ||
6 | * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl> | ||
7 | */ | ||
8 | |||
9 | #include <linux/device.h> | ||
10 | #include <linux/mutex.h> | ||
11 | #include <linux/pm_wakeup.h> | ||
12 | |||
13 | #include "power.h" | ||
14 | |||
15 | static suspend_state_t autosleep_state; | ||
16 | static struct workqueue_struct *autosleep_wq; | ||
17 | /* | ||
18 | * Note: it is only safe to mutex_lock(&autosleep_lock) if a wakeup_source | ||
19 | * is active, otherwise a deadlock with try_to_suspend() is possible. | ||
20 | * Alternatively mutex_lock_interruptible() can be used. This will then fail | ||
21 | * if an auto_sleep cycle tries to freeze processes. | ||
22 | */ | ||
23 | static DEFINE_MUTEX(autosleep_lock); | ||
24 | static struct wakeup_source *autosleep_ws; | ||
25 | |||
26 | static void try_to_suspend(struct work_struct *work) | ||
27 | { | ||
28 | unsigned int initial_count, final_count; | ||
29 | |||
30 | if (!pm_get_wakeup_count(&initial_count, true)) | ||
31 | goto out; | ||
32 | |||
33 | mutex_lock(&autosleep_lock); | ||
34 | |||
35 | if (!pm_save_wakeup_count(initial_count)) { | ||
36 | mutex_unlock(&autosleep_lock); | ||
37 | goto out; | ||
38 | } | ||
39 | |||
40 | if (autosleep_state == PM_SUSPEND_ON) { | ||
41 | mutex_unlock(&autosleep_lock); | ||
42 | return; | ||
43 | } | ||
44 | if (autosleep_state >= PM_SUSPEND_MAX) | ||
45 | hibernate(); | ||
46 | else | ||
47 | pm_suspend(autosleep_state); | ||
48 | |||
49 | mutex_unlock(&autosleep_lock); | ||
50 | |||
51 | if (!pm_get_wakeup_count(&final_count, false)) | ||
52 | goto out; | ||
53 | |||
54 | /* | ||
55 | * If the wakeup occurred for an unknown reason, wait to prevent the | ||
56 | * system from trying to suspend and waking up in a tight loop. | ||
57 | */ | ||
58 | if (final_count == initial_count) | ||
59 | schedule_timeout_uninterruptible(HZ / 2); | ||
60 | |||
61 | out: | ||
62 | queue_up_suspend_work(); | ||
63 | } | ||
64 | |||
65 | static DECLARE_WORK(suspend_work, try_to_suspend); | ||
66 | |||
67 | void queue_up_suspend_work(void) | ||
68 | { | ||
69 | if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON) | ||
70 | queue_work(autosleep_wq, &suspend_work); | ||
71 | } | ||
72 | |||
73 | suspend_state_t pm_autosleep_state(void) | ||
74 | { | ||
75 | return autosleep_state; | ||
76 | } | ||
77 | |||
78 | int pm_autosleep_lock(void) | ||
79 | { | ||
80 | return mutex_lock_interruptible(&autosleep_lock); | ||
81 | } | ||
82 | |||
83 | void pm_autosleep_unlock(void) | ||
84 | { | ||
85 | mutex_unlock(&autosleep_lock); | ||
86 | } | ||
87 | |||
88 | int pm_autosleep_set_state(suspend_state_t state) | ||
89 | { | ||
90 | |||
91 | #ifndef CONFIG_HIBERNATION | ||
92 | if (state >= PM_SUSPEND_MAX) | ||
93 | return -EINVAL; | ||
94 | #endif | ||
95 | |||
96 | __pm_stay_awake(autosleep_ws); | ||
97 | |||
98 | mutex_lock(&autosleep_lock); | ||
99 | |||
100 | autosleep_state = state; | ||
101 | |||
102 | __pm_relax(autosleep_ws); | ||
103 | |||
104 | if (state > PM_SUSPEND_ON) { | ||
105 | pm_wakep_autosleep_enabled(true); | ||
106 | queue_up_suspend_work(); | ||
107 | } else { | ||
108 | pm_wakep_autosleep_enabled(false); | ||
109 | } | ||
110 | |||
111 | mutex_unlock(&autosleep_lock); | ||
112 | return 0; | ||
113 | } | ||
114 | |||
115 | int __init pm_autosleep_init(void) | ||
116 | { | ||
117 | autosleep_ws = wakeup_source_register("autosleep"); | ||
118 | if (!autosleep_ws) | ||
119 | return -ENOMEM; | ||
120 | |||
121 | autosleep_wq = alloc_ordered_workqueue("autosleep", 0); | ||
122 | if (autosleep_wq) | ||
123 | return 0; | ||
124 | |||
125 | wakeup_source_unregister(autosleep_ws); | ||
126 | return -ENOMEM; | ||
127 | } | ||
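
try_to_suspend() above implements a wakeup-count handshake: it samples the count with pm_get_wakeup_count(), re-checks it via pm_save_wakeup_count() under autosleep_lock, suspends, and backs off for half a second if the count did not change across the cycle. The knob itself is driven from user space by writing a state name to /sys/power/autosleep (see the main.c hunks below); a rough userspace sketch, assuming the platform supports the "mem" state and with error handling trimmed:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Enable opportunistic suspend-to-RAM; write "off" to disable again. */
static int set_autosleep(const char *state)
{
        int fd = open("/sys/power/autosleep", O_WRONLY);

        if (fd < 0)
                return -1;
        if (write(fd, state, strlen(state)) < 0) {
                close(fd);
                return -1;
        }
        return close(fd);
}

int main(void)
{
        if (set_autosleep("mem"))
                perror("autosleep");
        return 0;
}
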
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e09dfbfeece..8b53db38a27 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -25,6 +25,8 @@ | |||
25 | #include <linux/freezer.h> | 25 | #include <linux/freezer.h> |
26 | #include <linux/gfp.h> | 26 | #include <linux/gfp.h> |
27 | #include <linux/syscore_ops.h> | 27 | #include <linux/syscore_ops.h> |
28 | #include <linux/ctype.h> | ||
29 | #include <linux/genhd.h> | ||
28 | #include <scsi/scsi_scan.h> | 30 | #include <scsi/scsi_scan.h> |
29 | 31 | ||
30 | #include "power.h" | 32 | #include "power.h" |
@@ -722,6 +724,17 @@ static int software_resume(void) | |||
722 | 724 | ||
723 | /* Check if the device is there */ | 725 | /* Check if the device is there */ |
724 | swsusp_resume_device = name_to_dev_t(resume_file); | 726 | swsusp_resume_device = name_to_dev_t(resume_file); |
727 | |||
728 | /* | ||
729 | * name_to_dev_t is ineffective to verify the partition if resume_file is in | ||
730 | * integer format. (e.g. major:minor) | ||
731 | */ | ||
732 | if (isdigit(resume_file[0]) && resume_wait) { | ||
733 | int partno; | ||
734 | while (!get_gendisk(swsusp_resume_device, &partno)) | ||
735 | msleep(10); | ||
736 | } | ||
737 | |||
725 | if (!swsusp_resume_device) { | 738 | if (!swsusp_resume_device) { |
726 | /* | 739 | /* |
727 | * Some device discovery might still be in progress; we need | 740 | * Some device discovery might still be in progress; we need |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 1c12581f1c6..428f8a034e9 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -269,8 +269,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, | |||
269 | return (s - buf); | 269 | return (s - buf); |
270 | } | 270 | } |
271 | 271 | ||
272 | static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, | 272 | static suspend_state_t decode_state(const char *buf, size_t n) |
273 | const char *buf, size_t n) | ||
274 | { | 273 | { |
275 | #ifdef CONFIG_SUSPEND | 274 | #ifdef CONFIG_SUSPEND |
276 | suspend_state_t state = PM_SUSPEND_STANDBY; | 275 | suspend_state_t state = PM_SUSPEND_STANDBY; |
@@ -278,27 +277,48 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
278 | #endif | 277 | #endif |
279 | char *p; | 278 | char *p; |
280 | int len; | 279 | int len; |
281 | int error = -EINVAL; | ||
282 | 280 | ||
283 | p = memchr(buf, '\n', n); | 281 | p = memchr(buf, '\n', n); |
284 | len = p ? p - buf : n; | 282 | len = p ? p - buf : n; |
285 | 283 | ||
286 | /* First, check if we are requested to hibernate */ | 284 | /* Check hibernation first. */ |
287 | if (len == 4 && !strncmp(buf, "disk", len)) { | 285 | if (len == 4 && !strncmp(buf, "disk", len)) |
288 | error = hibernate(); | 286 | return PM_SUSPEND_MAX; |
289 | goto Exit; | ||
290 | } | ||
291 | 287 | ||
292 | #ifdef CONFIG_SUSPEND | 288 | #ifdef CONFIG_SUSPEND |
293 | for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { | 289 | for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) |
294 | if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { | 290 | if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) |
295 | error = pm_suspend(state); | 291 | return state; |
296 | break; | ||
297 | } | ||
298 | } | ||
299 | #endif | 292 | #endif |
300 | 293 | ||
301 | Exit: | 294 | return PM_SUSPEND_ON; |
295 | } | ||
296 | |||
297 | static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, | ||
298 | const char *buf, size_t n) | ||
299 | { | ||
300 | suspend_state_t state; | ||
301 | int error; | ||
302 | |||
303 | error = pm_autosleep_lock(); | ||
304 | if (error) | ||
305 | return error; | ||
306 | |||
307 | if (pm_autosleep_state() > PM_SUSPEND_ON) { | ||
308 | error = -EBUSY; | ||
309 | goto out; | ||
310 | } | ||
311 | |||
312 | state = decode_state(buf, n); | ||
313 | if (state < PM_SUSPEND_MAX) | ||
314 | error = pm_suspend(state); | ||
315 | else if (state == PM_SUSPEND_MAX) | ||
316 | error = hibernate(); | ||
317 | else | ||
318 | error = -EINVAL; | ||
319 | |||
320 | out: | ||
321 | pm_autosleep_unlock(); | ||
302 | return error ? error : n; | 322 | return error ? error : n; |
303 | } | 323 | } |
304 | 324 | ||
@@ -339,7 +359,8 @@ static ssize_t wakeup_count_show(struct kobject *kobj, | |||
339 | { | 359 | { |
340 | unsigned int val; | 360 | unsigned int val; |
341 | 361 | ||
342 | return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR; | 362 | return pm_get_wakeup_count(&val, true) ? |
363 | sprintf(buf, "%u\n", val) : -EINTR; | ||
343 | } | 364 | } |
344 | 365 | ||
345 | static ssize_t wakeup_count_store(struct kobject *kobj, | 366 | static ssize_t wakeup_count_store(struct kobject *kobj, |
@@ -347,15 +368,106 @@ static ssize_t wakeup_count_store(struct kobject *kobj, | |||
347 | const char *buf, size_t n) | 368 | const char *buf, size_t n) |
348 | { | 369 | { |
349 | unsigned int val; | 370 | unsigned int val; |
371 | int error; | ||
372 | |||
373 | error = pm_autosleep_lock(); | ||
374 | if (error) | ||
375 | return error; | ||
376 | |||
377 | if (pm_autosleep_state() > PM_SUSPEND_ON) { | ||
378 | error = -EBUSY; | ||
379 | goto out; | ||
380 | } | ||
350 | 381 | ||
382 | error = -EINVAL; | ||
351 | if (sscanf(buf, "%u", &val) == 1) { | 383 | if (sscanf(buf, "%u", &val) == 1) { |
352 | if (pm_save_wakeup_count(val)) | 384 | if (pm_save_wakeup_count(val)) |
353 | return n; | 385 | error = n; |
354 | } | 386 | } |
355 | return -EINVAL; | 387 | |
388 | out: | ||
389 | pm_autosleep_unlock(); | ||
390 | return error; | ||
356 | } | 391 | } |
357 | 392 | ||
358 | power_attr(wakeup_count); | 393 | power_attr(wakeup_count); |
394 | |||
395 | #ifdef CONFIG_PM_AUTOSLEEP | ||
396 | static ssize_t autosleep_show(struct kobject *kobj, | ||
397 | struct kobj_attribute *attr, | ||
398 | char *buf) | ||
399 | { | ||
400 | suspend_state_t state = pm_autosleep_state(); | ||
401 | |||
402 | if (state == PM_SUSPEND_ON) | ||
403 | return sprintf(buf, "off\n"); | ||
404 | |||
405 | #ifdef CONFIG_SUSPEND | ||
406 | if (state < PM_SUSPEND_MAX) | ||
407 | return sprintf(buf, "%s\n", valid_state(state) ? | ||
408 | pm_states[state] : "error"); | ||
409 | #endif | ||
410 | #ifdef CONFIG_HIBERNATION | ||
411 | return sprintf(buf, "disk\n"); | ||
412 | #else | ||
413 | return sprintf(buf, "error"); | ||
414 | #endif | ||
415 | } | ||
416 | |||
417 | static ssize_t autosleep_store(struct kobject *kobj, | ||
418 | struct kobj_attribute *attr, | ||
419 | const char *buf, size_t n) | ||
420 | { | ||
421 | suspend_state_t state = decode_state(buf, n); | ||
422 | int error; | ||
423 | |||
424 | if (state == PM_SUSPEND_ON | ||
425 | && strcmp(buf, "off") && strcmp(buf, "off\n")) | ||
426 | return -EINVAL; | ||
427 | |||
428 | error = pm_autosleep_set_state(state); | ||
429 | return error ? error : n; | ||
430 | } | ||
431 | |||
432 | power_attr(autosleep); | ||
433 | #endif /* CONFIG_PM_AUTOSLEEP */ | ||
434 | |||
435 | #ifdef CONFIG_PM_WAKELOCKS | ||
436 | static ssize_t wake_lock_show(struct kobject *kobj, | ||
437 | struct kobj_attribute *attr, | ||
438 | char *buf) | ||
439 | { | ||
440 | return pm_show_wakelocks(buf, true); | ||
441 | } | ||
442 | |||
443 | static ssize_t wake_lock_store(struct kobject *kobj, | ||
444 | struct kobj_attribute *attr, | ||
445 | const char *buf, size_t n) | ||
446 | { | ||
447 | int error = pm_wake_lock(buf); | ||
448 | return error ? error : n; | ||
449 | } | ||
450 | |||
451 | power_attr(wake_lock); | ||
452 | |||
453 | static ssize_t wake_unlock_show(struct kobject *kobj, | ||
454 | struct kobj_attribute *attr, | ||
455 | char *buf) | ||
456 | { | ||
457 | return pm_show_wakelocks(buf, false); | ||
458 | } | ||
459 | |||
460 | static ssize_t wake_unlock_store(struct kobject *kobj, | ||
461 | struct kobj_attribute *attr, | ||
462 | const char *buf, size_t n) | ||
463 | { | ||
464 | int error = pm_wake_unlock(buf); | ||
465 | return error ? error : n; | ||
466 | } | ||
467 | |||
468 | power_attr(wake_unlock); | ||
469 | |||
470 | #endif /* CONFIG_PM_WAKELOCKS */ | ||
359 | #endif /* CONFIG_PM_SLEEP */ | 471 | #endif /* CONFIG_PM_SLEEP */ |
360 | 472 | ||
361 | #ifdef CONFIG_PM_TRACE | 473 | #ifdef CONFIG_PM_TRACE |
@@ -409,6 +521,13 @@ static struct attribute * g[] = { | |||
409 | #ifdef CONFIG_PM_SLEEP | 521 | #ifdef CONFIG_PM_SLEEP |
410 | &pm_async_attr.attr, | 522 | &pm_async_attr.attr, |
411 | &wakeup_count_attr.attr, | 523 | &wakeup_count_attr.attr, |
524 | #ifdef CONFIG_PM_AUTOSLEEP | ||
525 | &autosleep_attr.attr, | ||
526 | #endif | ||
527 | #ifdef CONFIG_PM_WAKELOCKS | ||
528 | &wake_lock_attr.attr, | ||
529 | &wake_unlock_attr.attr, | ||
530 | #endif | ||
412 | #ifdef CONFIG_PM_DEBUG | 531 | #ifdef CONFIG_PM_DEBUG |
413 | &pm_test_attr.attr, | 532 | &pm_test_attr.attr, |
414 | #endif | 533 | #endif |
@@ -444,7 +563,10 @@ static int __init pm_init(void) | |||
444 | power_kobj = kobject_create_and_add("power", NULL); | 563 | power_kobj = kobject_create_and_add("power", NULL); |
445 | if (!power_kobj) | 564 | if (!power_kobj) |
446 | return -ENOMEM; | 565 | return -ENOMEM; |
447 | return sysfs_create_group(power_kobj, &attr_group); | 566 | error = sysfs_create_group(power_kobj, &attr_group); |
567 | if (error) | ||
568 | return error; | ||
569 | return pm_autosleep_init(); | ||
448 | } | 570 | } |
449 | 571 | ||
450 | core_initcall(pm_init); | 572 | core_initcall(pm_init); |
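The kernel/power/main.c changes above route writes to /sys/power/state through decode_state() and add the autosleep attribute next to it. Below is a minimal user-space sketch (not part of the patch) of how those files are driven, assuming a standard sysfs mount under /sys and root privileges; the write_sysfs() helper name is ours.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Write a short string to a sysfs attribute; helper name is ours. */
static int write_sysfs(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, val, strlen(val));	/* state_store()/autosleep_store() run here */
	close(fd);
	return n == (ssize_t)strlen(val) ? 0 : -1;
}

int main(void)
{
	/* One-shot suspend: decode_state() maps "mem" to suspend-to-RAM. */
	if (write_sysfs("/sys/power/state", "mem"))
		perror("write /sys/power/state");

	/* Opportunistic suspend: re-enter "mem" whenever no wakeup
	 * sources are active (autosleep_store() above). */
	if (write_sysfs("/sys/power/autosleep", "mem"))
		perror("write /sys/power/autosleep");
	return 0;
}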
diff --git a/kernel/power/power.h b/kernel/power/power.h index 98f3622d740..b0bd4beaebf 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -264,3 +264,30 @@ static inline void suspend_thaw_processes(void) | |||
264 | { | 264 | { |
265 | } | 265 | } |
266 | #endif | 266 | #endif |
267 | |||
268 | #ifdef CONFIG_PM_AUTOSLEEP | ||
269 | |||
270 | /* kernel/power/autosleep.c */ | ||
271 | extern int pm_autosleep_init(void); | ||
272 | extern int pm_autosleep_lock(void); | ||
273 | extern void pm_autosleep_unlock(void); | ||
274 | extern suspend_state_t pm_autosleep_state(void); | ||
275 | extern int pm_autosleep_set_state(suspend_state_t state); | ||
276 | |||
277 | #else /* !CONFIG_PM_AUTOSLEEP */ | ||
278 | |||
279 | static inline int pm_autosleep_init(void) { return 0; } | ||
280 | static inline int pm_autosleep_lock(void) { return 0; } | ||
281 | static inline void pm_autosleep_unlock(void) {} | ||
282 | static inline suspend_state_t pm_autosleep_state(void) { return PM_SUSPEND_ON; } | ||
283 | |||
284 | #endif /* !CONFIG_PM_AUTOSLEEP */ | ||
285 | |||
286 | #ifdef CONFIG_PM_WAKELOCKS | ||
287 | |||
288 | /* kernel/power/wakelock.c */ | ||
289 | extern ssize_t pm_show_wakelocks(char *buf, bool show_active); | ||
290 | extern int pm_wake_lock(const char *buf); | ||
291 | extern int pm_wake_unlock(const char *buf); | ||
292 | |||
293 | #endif /* CONFIG_PM_WAKELOCKS */ | ||
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8742fd013a9..11e22c068e8 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -6,7 +6,7 @@ | |||
6 | * | 6 | * |
7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> | 7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> |
8 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> | 8 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> |
9 | * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com> | 9 | * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com> |
10 | * | 10 | * |
11 | * This file is released under the GPLv2. | 11 | * This file is released under the GPLv2. |
12 | * | 12 | * |
@@ -51,6 +51,23 @@ | |||
51 | 51 | ||
52 | #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) | 52 | #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) |
53 | 53 | ||
54 | /* | ||
55 | * Number of free pages that are not high. | ||
56 | */ | ||
57 | static inline unsigned long low_free_pages(void) | ||
58 | { | ||
59 | return nr_free_pages() - nr_free_highpages(); | ||
60 | } | ||
61 | |||
62 | /* | ||
63 | * Number of pages required to be kept free while writing the image. Always | ||
64 | * half of all available low pages before the writing starts. | ||
65 | */ | ||
66 | static inline unsigned long reqd_free_pages(void) | ||
67 | { | ||
68 | return low_free_pages() / 2; | ||
69 | } | ||
70 | |||
54 | struct swap_map_page { | 71 | struct swap_map_page { |
55 | sector_t entries[MAP_PAGE_ENTRIES]; | 72 | sector_t entries[MAP_PAGE_ENTRIES]; |
56 | sector_t next_swap; | 73 | sector_t next_swap; |
@@ -72,7 +89,7 @@ struct swap_map_handle { | |||
72 | sector_t cur_swap; | 89 | sector_t cur_swap; |
73 | sector_t first_sector; | 90 | sector_t first_sector; |
74 | unsigned int k; | 91 | unsigned int k; |
75 | unsigned long nr_free_pages, written; | 92 | unsigned long reqd_free_pages; |
76 | u32 crc32; | 93 | u32 crc32; |
77 | }; | 94 | }; |
78 | 95 | ||
@@ -265,14 +282,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) | |||
265 | return -ENOSPC; | 282 | return -ENOSPC; |
266 | 283 | ||
267 | if (bio_chain) { | 284 | if (bio_chain) { |
268 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | 285 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | |
286 | __GFP_NORETRY); | ||
269 | if (src) { | 287 | if (src) { |
270 | copy_page(src, buf); | 288 | copy_page(src, buf); |
271 | } else { | 289 | } else { |
272 | ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ | 290 | ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ |
273 | if (ret) | 291 | if (ret) |
274 | return ret; | 292 | return ret; |
275 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | 293 | src = (void *)__get_free_page(__GFP_WAIT | |
294 | __GFP_NOWARN | | ||
295 | __GFP_NORETRY); | ||
276 | if (src) { | 296 | if (src) { |
277 | copy_page(src, buf); | 297 | copy_page(src, buf); |
278 | } else { | 298 | } else { |
@@ -316,8 +336,7 @@ static int get_swap_writer(struct swap_map_handle *handle) | |||
316 | goto err_rel; | 336 | goto err_rel; |
317 | } | 337 | } |
318 | handle->k = 0; | 338 | handle->k = 0; |
319 | handle->nr_free_pages = nr_free_pages() >> 1; | 339 | handle->reqd_free_pages = reqd_free_pages(); |
320 | handle->written = 0; | ||
321 | handle->first_sector = handle->cur_swap; | 340 | handle->first_sector = handle->cur_swap; |
322 | return 0; | 341 | return 0; |
323 | err_rel: | 342 | err_rel: |
@@ -351,12 +370,17 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, | |||
351 | clear_page(handle->cur); | 370 | clear_page(handle->cur); |
352 | handle->cur_swap = offset; | 371 | handle->cur_swap = offset; |
353 | handle->k = 0; | 372 | handle->k = 0; |
354 | } | 373 | |
355 | if (bio_chain && ++handle->written > handle->nr_free_pages) { | 374 | if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { |
356 | error = hib_wait_on_bio_chain(bio_chain); | 375 | error = hib_wait_on_bio_chain(bio_chain); |
357 | if (error) | 376 | if (error) |
358 | goto out; | 377 | goto out; |
359 | handle->written = 0; | 378 | /* |
379 | * Recalculate the number of required free pages, to | ||
380 | * make sure we never take more than half. | ||
381 | */ | ||
382 | handle->reqd_free_pages = reqd_free_pages(); | ||
383 | } | ||
360 | } | 384 | } |
361 | out: | 385 | out: |
362 | return error; | 386 | return error; |
@@ -403,8 +427,9 @@ static int swap_writer_finish(struct swap_map_handle *handle, | |||
403 | /* Maximum number of threads for compression/decompression. */ | 427 | /* Maximum number of threads for compression/decompression. */ |
404 | #define LZO_THREADS 3 | 428 | #define LZO_THREADS 3 |
405 | 429 | ||
406 | /* Maximum number of pages for read buffering. */ | 430 | /* Minimum/maximum number of pages for read buffering. */ |
407 | #define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8) | 431 | #define LZO_MIN_RD_PAGES 1024 |
432 | #define LZO_MAX_RD_PAGES 8192 | ||
408 | 433 | ||
409 | 434 | ||
410 | /** | 435 | /** |
@@ -615,12 +640,6 @@ static int save_image_lzo(struct swap_map_handle *handle, | |||
615 | } | 640 | } |
616 | 641 | ||
617 | /* | 642 | /* |
618 | * Adjust number of free pages after all allocations have been done. | ||
619 | * We don't want to run out of pages when writing. | ||
620 | */ | ||
621 | handle->nr_free_pages = nr_free_pages() >> 1; | ||
622 | |||
623 | /* | ||
624 | * Start the CRC32 thread. | 643 | * Start the CRC32 thread. |
625 | */ | 644 | */ |
626 | init_waitqueue_head(&crc->go); | 645 | init_waitqueue_head(&crc->go); |
@@ -641,6 +660,12 @@ static int save_image_lzo(struct swap_map_handle *handle, | |||
641 | goto out_clean; | 660 | goto out_clean; |
642 | } | 661 | } |
643 | 662 | ||
663 | /* | ||
664 | * Adjust the number of required free pages after all allocations have | ||
665 | * been done. We don't want to run out of pages when writing. | ||
666 | */ | ||
667 | handle->reqd_free_pages = reqd_free_pages(); | ||
668 | |||
644 | printk(KERN_INFO | 669 | printk(KERN_INFO |
645 | "PM: Using %u thread(s) for compression.\n" | 670 | "PM: Using %u thread(s) for compression.\n" |
646 | "PM: Compressing and saving image data (%u pages) ... ", | 671 | "PM: Compressing and saving image data (%u pages) ... ", |
@@ -1051,7 +1076,7 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
1051 | unsigned i, thr, run_threads, nr_threads; | 1076 | unsigned i, thr, run_threads, nr_threads; |
1052 | unsigned ring = 0, pg = 0, ring_size = 0, | 1077 | unsigned ring = 0, pg = 0, ring_size = 0, |
1053 | have = 0, want, need, asked = 0; | 1078 | have = 0, want, need, asked = 0; |
1054 | unsigned long read_pages; | 1079 | unsigned long read_pages = 0; |
1055 | unsigned char **page = NULL; | 1080 | unsigned char **page = NULL; |
1056 | struct dec_data *data = NULL; | 1081 | struct dec_data *data = NULL; |
1057 | struct crc_data *crc = NULL; | 1082 | struct crc_data *crc = NULL; |
@@ -1063,7 +1088,7 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
1063 | nr_threads = num_online_cpus() - 1; | 1088 | nr_threads = num_online_cpus() - 1; |
1064 | nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); | 1089 | nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); |
1065 | 1090 | ||
1066 | page = vmalloc(sizeof(*page) * LZO_READ_PAGES); | 1091 | page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES); |
1067 | if (!page) { | 1092 | if (!page) { |
1068 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); | 1093 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); |
1069 | ret = -ENOMEM; | 1094 | ret = -ENOMEM; |
@@ -1128,15 +1153,22 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
1128 | } | 1153 | } |
1129 | 1154 | ||
1130 | /* | 1155 | /* |
1131 | * Adjust number of pages for read buffering, in case we are short. | 1156 | * Set the number of pages for read buffering. |
1157 | * This is complete guesswork, because we'll only know the real | ||
1158 | * picture once prepare_image() is called, which is much later on | ||
1159 | * during the image load phase. We'll assume the worst case and | ||
1160 | * say that none of the image pages are from high memory. | ||
1132 | */ | 1161 | */ |
1133 | read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1; | 1162 | if (low_free_pages() > snapshot_get_image_size()) |
1134 | read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES); | 1163 | read_pages = (low_free_pages() - snapshot_get_image_size()) / 2; |
1164 | read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES); | ||
1135 | 1165 | ||
1136 | for (i = 0; i < read_pages; i++) { | 1166 | for (i = 0; i < read_pages; i++) { |
1137 | page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? | 1167 | page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? |
1138 | __GFP_WAIT | __GFP_HIGH : | 1168 | __GFP_WAIT | __GFP_HIGH : |
1139 | __GFP_WAIT); | 1169 | __GFP_WAIT | __GFP_NOWARN | |
1170 | __GFP_NORETRY); | ||
1171 | |||
1140 | if (!page[i]) { | 1172 | if (!page[i]) { |
1141 | if (i < LZO_CMP_PAGES) { | 1173 | if (i < LZO_CMP_PAGES) { |
1142 | ring_size = i; | 1174 | ring_size = i; |
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c new file mode 100644 index 00000000000..c8fba338007 --- /dev/null +++ b/kernel/power/wakelock.c | |||
@@ -0,0 +1,259 @@ | |||
1 | /* | ||
2 | * kernel/power/wakelock.c | ||
3 | * | ||
4 | * User space wakeup sources support. | ||
5 | * | ||
6 | * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl> | ||
7 | * | ||
8 | * This code is based on the analogous interface allowing user space to | ||
9 | * manipulate wakelocks on Android. | ||
10 | */ | ||
11 | |||
12 | #include <linux/ctype.h> | ||
13 | #include <linux/device.h> | ||
14 | #include <linux/err.h> | ||
15 | #include <linux/hrtimer.h> | ||
16 | #include <linux/list.h> | ||
17 | #include <linux/rbtree.h> | ||
18 | #include <linux/slab.h> | ||
19 | |||
20 | static DEFINE_MUTEX(wakelocks_lock); | ||
21 | |||
22 | struct wakelock { | ||
23 | char *name; | ||
24 | struct rb_node node; | ||
25 | struct wakeup_source ws; | ||
26 | #ifdef CONFIG_PM_WAKELOCKS_GC | ||
27 | struct list_head lru; | ||
28 | #endif | ||
29 | }; | ||
30 | |||
31 | static struct rb_root wakelocks_tree = RB_ROOT; | ||
32 | |||
33 | ssize_t pm_show_wakelocks(char *buf, bool show_active) | ||
34 | { | ||
35 | struct rb_node *node; | ||
36 | struct wakelock *wl; | ||
37 | char *str = buf; | ||
38 | char *end = buf + PAGE_SIZE; | ||
39 | |||
40 | mutex_lock(&wakelocks_lock); | ||
41 | |||
42 | for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) { | ||
43 | wl = rb_entry(node, struct wakelock, node); | ||
44 | if (wl->ws.active == show_active) | ||
45 | str += scnprintf(str, end - str, "%s ", wl->name); | ||
46 | } | ||
47 | if (str > buf) | ||
48 | str--; | ||
49 | |||
50 | str += scnprintf(str, end - str, "\n"); | ||
51 | |||
52 | mutex_unlock(&wakelocks_lock); | ||
53 | return (str - buf); | ||
54 | } | ||
55 | |||
56 | #if CONFIG_PM_WAKELOCKS_LIMIT > 0 | ||
57 | static unsigned int number_of_wakelocks; | ||
58 | |||
59 | static inline bool wakelocks_limit_exceeded(void) | ||
60 | { | ||
61 | return number_of_wakelocks > CONFIG_PM_WAKELOCKS_LIMIT; | ||
62 | } | ||
63 | |||
64 | static inline void increment_wakelocks_number(void) | ||
65 | { | ||
66 | number_of_wakelocks++; | ||
67 | } | ||
68 | |||
69 | static inline void decrement_wakelocks_number(void) | ||
70 | { | ||
71 | number_of_wakelocks--; | ||
72 | } | ||
73 | #else /* CONFIG_PM_WAKELOCKS_LIMIT = 0 */ | ||
74 | static inline bool wakelocks_limit_exceeded(void) { return false; } | ||
75 | static inline void increment_wakelocks_number(void) {} | ||
76 | static inline void decrement_wakelocks_number(void) {} | ||
77 | #endif /* CONFIG_PM_WAKELOCKS_LIMIT */ | ||
78 | |||
79 | #ifdef CONFIG_PM_WAKELOCKS_GC | ||
80 | #define WL_GC_COUNT_MAX 100 | ||
81 | #define WL_GC_TIME_SEC 300 | ||
82 | |||
83 | static LIST_HEAD(wakelocks_lru_list); | ||
84 | static unsigned int wakelocks_gc_count; | ||
85 | |||
86 | static inline void wakelocks_lru_add(struct wakelock *wl) | ||
87 | { | ||
88 | list_add(&wl->lru, &wakelocks_lru_list); | ||
89 | } | ||
90 | |||
91 | static inline void wakelocks_lru_most_recent(struct wakelock *wl) | ||
92 | { | ||
93 | list_move(&wl->lru, &wakelocks_lru_list); | ||
94 | } | ||
95 | |||
96 | static void wakelocks_gc(void) | ||
97 | { | ||
98 | struct wakelock *wl, *aux; | ||
99 | ktime_t now; | ||
100 | |||
101 | if (++wakelocks_gc_count <= WL_GC_COUNT_MAX) | ||
102 | return; | ||
103 | |||
104 | now = ktime_get(); | ||
105 | list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) { | ||
106 | u64 idle_time_ns; | ||
107 | bool active; | ||
108 | |||
109 | spin_lock_irq(&wl->ws.lock); | ||
110 | idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time)); | ||
111 | active = wl->ws.active; | ||
112 | spin_unlock_irq(&wl->ws.lock); | ||
113 | |||
114 | if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC)) | ||
115 | break; | ||
116 | |||
117 | if (!active) { | ||
118 | wakeup_source_remove(&wl->ws); | ||
119 | rb_erase(&wl->node, &wakelocks_tree); | ||
120 | list_del(&wl->lru); | ||
121 | kfree(wl->name); | ||
122 | kfree(wl); | ||
123 | decrement_wakelocks_number(); | ||
124 | } | ||
125 | } | ||
126 | wakelocks_gc_count = 0; | ||
127 | } | ||
128 | #else /* !CONFIG_PM_WAKELOCKS_GC */ | ||
129 | static inline void wakelocks_lru_add(struct wakelock *wl) {} | ||
130 | static inline void wakelocks_lru_most_recent(struct wakelock *wl) {} | ||
131 | static inline void wakelocks_gc(void) {} | ||
132 | #endif /* !CONFIG_PM_WAKELOCKS_GC */ | ||
133 | |||
134 | static struct wakelock *wakelock_lookup_add(const char *name, size_t len, | ||
135 | bool add_if_not_found) | ||
136 | { | ||
137 | struct rb_node **node = &wakelocks_tree.rb_node; | ||
138 | struct rb_node *parent = *node; | ||
139 | struct wakelock *wl; | ||
140 | |||
141 | while (*node) { | ||
142 | int diff; | ||
143 | |||
144 | parent = *node; | ||
145 | wl = rb_entry(*node, struct wakelock, node); | ||
146 | diff = strncmp(name, wl->name, len); | ||
147 | if (diff == 0) { | ||
148 | if (wl->name[len]) | ||
149 | diff = -1; | ||
150 | else | ||
151 | return wl; | ||
152 | } | ||
153 | if (diff < 0) | ||
154 | node = &(*node)->rb_left; | ||
155 | else | ||
156 | node = &(*node)->rb_right; | ||
157 | } | ||
158 | if (!add_if_not_found) | ||
159 | return ERR_PTR(-EINVAL); | ||
160 | |||
161 | if (wakelocks_limit_exceeded()) | ||
162 | return ERR_PTR(-ENOSPC); | ||
163 | |||
164 | /* Not found, we have to add a new one. */ | ||
165 | wl = kzalloc(sizeof(*wl), GFP_KERNEL); | ||
166 | if (!wl) | ||
167 | return ERR_PTR(-ENOMEM); | ||
168 | |||
169 | wl->name = kstrndup(name, len, GFP_KERNEL); | ||
170 | if (!wl->name) { | ||
171 | kfree(wl); | ||
172 | return ERR_PTR(-ENOMEM); | ||
173 | } | ||
174 | wl->ws.name = wl->name; | ||
175 | wakeup_source_add(&wl->ws); | ||
176 | rb_link_node(&wl->node, parent, node); | ||
177 | rb_insert_color(&wl->node, &wakelocks_tree); | ||
178 | wakelocks_lru_add(wl); | ||
179 | increment_wakelocks_number(); | ||
180 | return wl; | ||
181 | } | ||
182 | |||
183 | int pm_wake_lock(const char *buf) | ||
184 | { | ||
185 | const char *str = buf; | ||
186 | struct wakelock *wl; | ||
187 | u64 timeout_ns = 0; | ||
188 | size_t len; | ||
189 | int ret = 0; | ||
190 | |||
191 | while (*str && !isspace(*str)) | ||
192 | str++; | ||
193 | |||
194 | len = str - buf; | ||
195 | if (!len) | ||
196 | return -EINVAL; | ||
197 | |||
198 | if (*str && *str != '\n') { | ||
199 | /* Find out if there's a valid timeout string appended. */ | ||
200 | ret = kstrtou64(skip_spaces(str), 10, &timeout_ns); | ||
201 | if (ret) | ||
202 | return -EINVAL; | ||
203 | } | ||
204 | |||
205 | mutex_lock(&wakelocks_lock); | ||
206 | |||
207 | wl = wakelock_lookup_add(buf, len, true); | ||
208 | if (IS_ERR(wl)) { | ||
209 | ret = PTR_ERR(wl); | ||
210 | goto out; | ||
211 | } | ||
212 | if (timeout_ns) { | ||
213 | u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1; | ||
214 | |||
215 | do_div(timeout_ms, NSEC_PER_MSEC); | ||
216 | __pm_wakeup_event(&wl->ws, timeout_ms); | ||
217 | } else { | ||
218 | __pm_stay_awake(&wl->ws); | ||
219 | } | ||
220 | |||
221 | wakelocks_lru_most_recent(wl); | ||
222 | |||
223 | out: | ||
224 | mutex_unlock(&wakelocks_lock); | ||
225 | return ret; | ||
226 | } | ||
227 | |||
228 | int pm_wake_unlock(const char *buf) | ||
229 | { | ||
230 | struct wakelock *wl; | ||
231 | size_t len; | ||
232 | int ret = 0; | ||
233 | |||
234 | len = strlen(buf); | ||
235 | if (!len) | ||
236 | return -EINVAL; | ||
237 | |||
238 | if (buf[len-1] == '\n') | ||
239 | len--; | ||
240 | |||
241 | if (!len) | ||
242 | return -EINVAL; | ||
243 | |||
244 | mutex_lock(&wakelocks_lock); | ||
245 | |||
246 | wl = wakelock_lookup_add(buf, len, false); | ||
247 | if (IS_ERR(wl)) { | ||
248 | ret = PTR_ERR(wl); | ||
249 | goto out; | ||
250 | } | ||
251 | __pm_relax(&wl->ws); | ||
252 | |||
253 | wakelocks_lru_most_recent(wl); | ||
254 | wakelocks_gc(); | ||
255 | |||
256 | out: | ||
257 | mutex_unlock(&wakelocks_lock); | ||
258 | return ret; | ||
259 | } | ||
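pm_wake_lock() above accepts a wakeup-source name optionally followed by a timeout in nanoseconds, and pm_wake_unlock() accepts just the name. A user-space sketch follows, assuming a kernel built with CONFIG_PM_WAKELOCKS and the /sys/power/wake_lock and /sys/power/wake_unlock attributes registered in main.c; the lock name "mylock" is arbitrary.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, val, strlen(val));
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	/* "name timeout_ns": held via __pm_wakeup_event(), auto-released
	 * after one second. */
	if (write_str("/sys/power/wake_lock", "mylock 1000000000"))
		perror("wake_lock");

	/* Or drop it explicitly; pm_wake_unlock() calls __pm_relax(). */
	if (write_str("/sys/power/wake_unlock", "mylock"))
		perror("wake_unlock");
	return 0;
}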
diff --git a/kernel/printk.c b/kernel/printk.c index b663c2c95d3..32462d2b364 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #include <linux/cpu.h> | 41 | #include <linux/cpu.h> |
42 | #include <linux/notifier.h> | 42 | #include <linux/notifier.h> |
43 | #include <linux/rculist.h> | 43 | #include <linux/rculist.h> |
44 | #include <linux/poll.h> | ||
44 | 45 | ||
45 | #include <asm/uaccess.h> | 46 | #include <asm/uaccess.h> |
46 | 47 | ||
@@ -54,8 +55,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) | |||
54 | { | 55 | { |
55 | } | 56 | } |
56 | 57 | ||
57 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | ||
58 | |||
59 | /* printk's without a loglevel use this.. */ | 58 | /* printk's without a loglevel use this.. */ |
60 | #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL | 59 | #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL |
61 | 60 | ||
@@ -99,24 +98,6 @@ EXPORT_SYMBOL_GPL(console_drivers); | |||
99 | static int console_locked, console_suspended; | 98 | static int console_locked, console_suspended; |
100 | 99 | ||
101 | /* | 100 | /* |
102 | * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars | ||
103 | * It is also used in interesting ways to provide interlocking in | ||
104 | * console_unlock();. | ||
105 | */ | ||
106 | static DEFINE_RAW_SPINLOCK(logbuf_lock); | ||
107 | |||
108 | #define LOG_BUF_MASK (log_buf_len-1) | ||
109 | #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) | ||
110 | |||
111 | /* | ||
112 | * The indices into log_buf are not constrained to log_buf_len - they | ||
113 | * must be masked before subscripting | ||
114 | */ | ||
115 | static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */ | ||
116 | static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */ | ||
117 | static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ | ||
118 | |||
119 | /* | ||
120 | * If exclusive_console is non-NULL then only this console is to be printed to. | 101 | * If exclusive_console is non-NULL then only this console is to be printed to. |
121 | */ | 102 | */ |
122 | static struct console *exclusive_console; | 103 | static struct console *exclusive_console; |
@@ -145,13 +126,491 @@ EXPORT_SYMBOL(console_set_on_cmdline); | |||
145 | /* Flag: console code may call schedule() */ | 126 | /* Flag: console code may call schedule() */ |
146 | static int console_may_schedule; | 127 | static int console_may_schedule; |
147 | 128 | ||
129 | /* | ||
130 | * The printk log buffer consists of a chain of concatenated variable | ||
131 | * length records. Every record starts with a record header, containing | ||
132 | * the overall length of the record. | ||
133 | * | ||
134 | * The heads to the first and last entry in the buffer, as well as the | ||
135 | * sequence numbers of these both entries are maintained when messages | ||
136 | * are stored.. | ||
137 | * | ||
138 | * If the heads indicate available messages, the length in the header | ||
139 | * tells the start of the next message. A length == 0 for the next message | ||
140 | * indicates a wrap-around to the beginning of the buffer. | ||
141 | * | ||
142 | * Every record carries the monotonic timestamp in nanoseconds, as well as | ||
143 | * the standard userspace syslog level and syslog facility. The usual | ||
144 | * kernel messages use LOG_KERN; userspace-injected messages always carry | ||
145 | * a matching syslog facility, by default LOG_USER. The origin of every | ||
146 | * message can be reliably determined that way. | ||
147 | * | ||
148 | * The human readable log message directly follows the message header. The | ||
149 | * length of the message text is stored in the header, the stored message | ||
150 | * is not terminated. | ||
151 | * | ||
152 | * Optionally, a message can carry a dictionary of properties (key/value pairs), | ||
153 | * to provide userspace with a machine-readable message context. | ||
154 | * | ||
155 | * Examples for well-defined, commonly used property names are: | ||
156 | * DEVICE=b12:8 device identifier | ||
157 | * b12:8 block dev_t | ||
158 | * c127:3 char dev_t | ||
159 | * n8 netdev ifindex | ||
160 | * +sound:card0 subsystem:devname | ||
161 | * SUBSYSTEM=pci driver-core subsystem name | ||
162 | * | ||
163 | * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value | ||
164 | * follows directly after a '=' character. Every property is terminated by | ||
165 | * a '\0' character. The last property is not terminated. | ||
166 | * | ||
167 | * Example of a message structure: | ||
168 | * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec | ||
169 | * 0008 34 00 record is 52 bytes long | ||
170 | * 000a 0b 00 text is 11 bytes long | ||
171 | * 000c 16 00 dictionary is 22 bytes long | ||
172 | * 000e 03 00 LOG_KERN (facility) LOG_ERR (level) | ||
173 | * 0010 69 74 27 73 20 61 20 6c "it's a l" | ||
174 | * 69 6e 65 "ine" | ||
175 | * 001b 44 45 56 49 43 "DEVIC" | ||
176 | * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D" | ||
177 | * 52 49 56 45 52 3d 62 75 "RIVER=bu" | ||
178 | * 67 "g" | ||
179 | * 0031 00 00 00 padding to next message header | ||
180 | * | ||
181 | * The 'struct log' buffer header must never be directly exported to | ||
182 | * userspace, it is a kernel-private implementation detail that might | ||
183 | * need to be changed in the future, when the requirements change. | ||
184 | * | ||
185 | * /dev/kmsg exports the structured data in the following line format: | ||
186 | * "level,sequnum,timestamp;<message text>\n" | ||
187 | * | ||
188 | * The optional key/value pairs are attached as continuation lines starting | ||
189 | * with a space character and terminated by a newline. All possible | ||
190 | * non-prinatable characters are escaped in the "\xff" notation. | ||
191 | * | ||
192 | * Users of the export format should ignore possible additional values | ||
193 | * separated by ',', and find the message after the ';' character. | ||
194 | */ | ||
195 | |||
196 | struct log { | ||
197 | u64 ts_nsec; /* timestamp in nanoseconds */ | ||
198 | u16 len; /* length of entire record */ | ||
199 | u16 text_len; /* length of text buffer */ | ||
200 | u16 dict_len; /* length of dictionary buffer */ | ||
201 | u16 level; /* syslog level + facility */ | ||
202 | }; | ||
203 | |||
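A small sketch of the record-size arithmetic described in the comment block above; the field sizes of struct log and the LOG_ALIGN value are copied here only for illustration, and log_store() below performs the same rounding before appending a record.

#include <stdint.h>
#include <stdio.h>

#define LOG_ALIGN 4	/* 8 on 64-bit without efficient unaligned access */
#define LOG_HDR (8 + 2 + 2 + 2 + 2)	/* ts_nsec + len + text_len + dict_len + level */

/* Header + text + dictionary, rounded up to LOG_ALIGN (same trick as log_store()). */
static uint32_t record_size(uint16_t text_len, uint16_t dict_len)
{
	uint32_t size = LOG_HDR + text_len + dict_len;
	uint32_t pad = (-size) & (LOG_ALIGN - 1);

	return size + pad;
}

int main(void)
{
	/* "it's a line" (11) + "DEVICE=b8:2\0DRIVER=bug" (22) -> 52 bytes,
	 * matching the worked example in the comment above. */
	printf("%u\n", record_size(11, 22));
	return 0;
}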
204 | /* | ||
205 | * The logbuf_lock protects kmsg buffer, indices, counters. It is also | ||
206 | * used in interesting ways to provide interlocking in console_unlock(); | ||
207 | */ | ||
208 | static DEFINE_RAW_SPINLOCK(logbuf_lock); | ||
209 | |||
210 | /* the next printk record to read by syslog(READ) or /proc/kmsg */ | ||
211 | static u64 syslog_seq; | ||
212 | static u32 syslog_idx; | ||
213 | |||
214 | /* index and sequence number of the first record stored in the buffer */ | ||
215 | static u64 log_first_seq; | ||
216 | static u32 log_first_idx; | ||
217 | |||
218 | /* index and sequence number of the next record to store in the buffer */ | ||
219 | static u64 log_next_seq; | ||
148 | #ifdef CONFIG_PRINTK | 220 | #ifdef CONFIG_PRINTK |
221 | static u32 log_next_idx; | ||
222 | |||
223 | /* the next printk record to read after the last 'clear' command */ | ||
224 | static u64 clear_seq; | ||
225 | static u32 clear_idx; | ||
226 | |||
227 | #define LOG_LINE_MAX 1024 | ||
149 | 228 | ||
150 | static char __log_buf[__LOG_BUF_LEN]; | 229 | /* record buffer */ |
230 | #if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) | ||
231 | #define LOG_ALIGN 4 | ||
232 | #else | ||
233 | #define LOG_ALIGN 8 | ||
234 | #endif | ||
235 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | ||
236 | static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); | ||
151 | static char *log_buf = __log_buf; | 237 | static char *log_buf = __log_buf; |
152 | static int log_buf_len = __LOG_BUF_LEN; | 238 | static u32 log_buf_len = __LOG_BUF_LEN; |
153 | static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ | 239 | |
154 | static int saved_console_loglevel = -1; | 240 | /* cpu currently holding logbuf_lock */ |
241 | static volatile unsigned int logbuf_cpu = UINT_MAX; | ||
242 | |||
243 | /* human readable text of the record */ | ||
244 | static char *log_text(const struct log *msg) | ||
245 | { | ||
246 | return (char *)msg + sizeof(struct log); | ||
247 | } | ||
248 | |||
249 | /* optional key/value pair dictionary attached to the record */ | ||
250 | static char *log_dict(const struct log *msg) | ||
251 | { | ||
252 | return (char *)msg + sizeof(struct log) + msg->text_len; | ||
253 | } | ||
254 | |||
255 | /* get record by index; idx must point to valid msg */ | ||
256 | static struct log *log_from_idx(u32 idx) | ||
257 | { | ||
258 | struct log *msg = (struct log *)(log_buf + idx); | ||
259 | |||
260 | /* | ||
261 | * A length == 0 record is the end of buffer marker. Wrap around and | ||
262 | * read the message at the start of the buffer. | ||
263 | */ | ||
264 | if (!msg->len) | ||
265 | return (struct log *)log_buf; | ||
266 | return msg; | ||
267 | } | ||
268 | |||
269 | /* get next record; idx must point to valid msg */ | ||
270 | static u32 log_next(u32 idx) | ||
271 | { | ||
272 | struct log *msg = (struct log *)(log_buf + idx); | ||
273 | |||
274 | /* length == 0 indicates the end of the buffer; wrap */ | ||
275 | /* | ||
276 | * A length == 0 record is the end of buffer marker. Wrap around and | ||
277 | * read the message at the start of the buffer as *this* one, and | ||
278 | * return the one after that. | ||
279 | */ | ||
280 | if (!msg->len) { | ||
281 | msg = (struct log *)log_buf; | ||
282 | return msg->len; | ||
283 | } | ||
284 | return idx + msg->len; | ||
285 | } | ||
286 | |||
287 | /* insert record into the buffer, discard old ones, update heads */ | ||
288 | static void log_store(int facility, int level, | ||
289 | const char *dict, u16 dict_len, | ||
290 | const char *text, u16 text_len) | ||
291 | { | ||
292 | struct log *msg; | ||
293 | u32 size, pad_len; | ||
294 | |||
295 | /* number of '\0' padding bytes to next message */ | ||
296 | size = sizeof(struct log) + text_len + dict_len; | ||
297 | pad_len = (-size) & (LOG_ALIGN - 1); | ||
298 | size += pad_len; | ||
299 | |||
300 | while (log_first_seq < log_next_seq) { | ||
301 | u32 free; | ||
302 | |||
303 | if (log_next_idx > log_first_idx) | ||
304 | free = max(log_buf_len - log_next_idx, log_first_idx); | ||
305 | else | ||
306 | free = log_first_idx - log_next_idx; | ||
307 | |||
308 | if (free > size + sizeof(struct log)) | ||
309 | break; | ||
310 | |||
311 | /* drop old messages until we have enough contiguous space */ | ||
312 | log_first_idx = log_next(log_first_idx); | ||
313 | log_first_seq++; | ||
314 | } | ||
315 | |||
316 | if (log_next_idx + size + sizeof(struct log) >= log_buf_len) { | ||
317 | /* | ||
318 | * This message + an additional empty header does not fit | ||
319 | * at the end of the buffer. Add an empty header with len == 0 | ||
320 | * to signify a wrap around. | ||
321 | */ | ||
322 | memset(log_buf + log_next_idx, 0, sizeof(struct log)); | ||
323 | log_next_idx = 0; | ||
324 | } | ||
325 | |||
326 | /* fill message */ | ||
327 | msg = (struct log *)(log_buf + log_next_idx); | ||
328 | memcpy(log_text(msg), text, text_len); | ||
329 | msg->text_len = text_len; | ||
330 | memcpy(log_dict(msg), dict, dict_len); | ||
331 | msg->dict_len = dict_len; | ||
332 | msg->level = (facility << 3) | (level & 7); | ||
333 | msg->ts_nsec = local_clock(); | ||
334 | memset(log_dict(msg) + dict_len, 0, pad_len); | ||
335 | msg->len = sizeof(struct log) + text_len + dict_len + pad_len; | ||
336 | |||
337 | /* insert message */ | ||
338 | log_next_idx += msg->len; | ||
339 | log_next_seq++; | ||
340 | } | ||
341 | |||
342 | /* /dev/kmsg - userspace message inject/listen interface */ | ||
343 | struct devkmsg_user { | ||
344 | u64 seq; | ||
345 | u32 idx; | ||
346 | struct mutex lock; | ||
347 | char buf[8192]; | ||
348 | }; | ||
349 | |||
350 | static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, | ||
351 | unsigned long count, loff_t pos) | ||
352 | { | ||
353 | char *buf, *line; | ||
354 | int i; | ||
355 | int level = default_message_loglevel; | ||
356 | int facility = 1; /* LOG_USER */ | ||
357 | size_t len = iov_length(iv, count); | ||
358 | ssize_t ret = len; | ||
359 | |||
360 | if (len > LOG_LINE_MAX) | ||
361 | return -EINVAL; | ||
362 | buf = kmalloc(len+1, GFP_KERNEL); | ||
363 | if (buf == NULL) | ||
364 | return -ENOMEM; | ||
365 | |||
366 | line = buf; | ||
367 | for (i = 0; i < count; i++) { | ||
368 | if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) | ||
369 | goto out; | ||
370 | line += iv[i].iov_len; | ||
371 | } | ||
372 | |||
373 | /* | ||
374 | * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace | ||
375 | * the decimal value represents a 32-bit number; the lower 3 bits are | ||
376 | * the log level, the rest is the log facility. | ||
377 | * | ||
378 | * If no prefix or no userspace facility is specified, we | ||
379 | * enforce LOG_USER, to be able to reliably distinguish | ||
380 | * kernel-generated messages from userspace-injected ones. | ||
381 | */ | ||
382 | line = buf; | ||
383 | if (line[0] == '<') { | ||
384 | char *endp = NULL; | ||
385 | |||
386 | i = simple_strtoul(line+1, &endp, 10); | ||
387 | if (endp && endp[0] == '>') { | ||
388 | level = i & 7; | ||
389 | if (i >> 3) | ||
390 | facility = i >> 3; | ||
391 | endp++; | ||
392 | len -= endp - line; | ||
393 | line = endp; | ||
394 | } | ||
395 | } | ||
396 | line[len] = '\0'; | ||
397 | |||
398 | printk_emit(facility, level, NULL, 0, "%s", line); | ||
399 | out: | ||
400 | kfree(buf); | ||
401 | return ret; | ||
402 | } | ||
403 | |||
404 | static ssize_t devkmsg_read(struct file *file, char __user *buf, | ||
405 | size_t count, loff_t *ppos) | ||
406 | { | ||
407 | struct devkmsg_user *user = file->private_data; | ||
408 | struct log *msg; | ||
409 | u64 ts_usec; | ||
410 | size_t i; | ||
411 | size_t len; | ||
412 | ssize_t ret; | ||
413 | |||
414 | if (!user) | ||
415 | return -EBADF; | ||
416 | |||
417 | mutex_lock(&user->lock); | ||
418 | raw_spin_lock(&logbuf_lock); | ||
419 | while (user->seq == log_next_seq) { | ||
420 | if (file->f_flags & O_NONBLOCK) { | ||
421 | ret = -EAGAIN; | ||
422 | raw_spin_unlock(&logbuf_lock); | ||
423 | goto out; | ||
424 | } | ||
425 | |||
426 | raw_spin_unlock(&logbuf_lock); | ||
427 | ret = wait_event_interruptible(log_wait, | ||
428 | user->seq != log_next_seq); | ||
429 | if (ret) | ||
430 | goto out; | ||
431 | raw_spin_lock(&logbuf_lock); | ||
432 | } | ||
433 | |||
434 | if (user->seq < log_first_seq) { | ||
435 | /* our last seen message is gone, return error and reset */ | ||
436 | user->idx = log_first_idx; | ||
437 | user->seq = log_first_seq; | ||
438 | ret = -EPIPE; | ||
439 | raw_spin_unlock(&logbuf_lock); | ||
440 | goto out; | ||
441 | } | ||
442 | |||
443 | msg = log_from_idx(user->idx); | ||
444 | ts_usec = msg->ts_nsec; | ||
445 | do_div(ts_usec, 1000); | ||
446 | len = sprintf(user->buf, "%u,%llu,%llu;", | ||
447 | msg->level, user->seq, ts_usec); | ||
448 | |||
449 | /* escape non-printable characters */ | ||
450 | for (i = 0; i < msg->text_len; i++) { | ||
451 | unsigned char c = log_text(msg)[i]; | ||
452 | |||
453 | if (c < ' ' || c >= 128) | ||
454 | len += sprintf(user->buf + len, "\\x%02x", c); | ||
455 | else | ||
456 | user->buf[len++] = c; | ||
457 | } | ||
458 | user->buf[len++] = '\n'; | ||
459 | |||
460 | if (msg->dict_len) { | ||
461 | bool line = true; | ||
462 | |||
463 | for (i = 0; i < msg->dict_len; i++) { | ||
464 | unsigned char c = log_dict(msg)[i]; | ||
465 | |||
466 | if (line) { | ||
467 | user->buf[len++] = ' '; | ||
468 | line = false; | ||
469 | } | ||
470 | |||
471 | if (c == '\0') { | ||
472 | user->buf[len++] = '\n'; | ||
473 | line = true; | ||
474 | continue; | ||
475 | } | ||
476 | |||
477 | if (c < ' ' || c >= 128) { | ||
478 | len += sprintf(user->buf + len, "\\x%02x", c); | ||
479 | continue; | ||
480 | } | ||
481 | |||
482 | user->buf[len++] = c; | ||
483 | } | ||
484 | user->buf[len++] = '\n'; | ||
485 | } | ||
486 | |||
487 | user->idx = log_next(user->idx); | ||
488 | user->seq++; | ||
489 | raw_spin_unlock(&logbuf_lock); | ||
490 | |||
491 | if (len > count) { | ||
492 | ret = -EINVAL; | ||
493 | goto out; | ||
494 | } | ||
495 | |||
496 | if (copy_to_user(buf, user->buf, len)) { | ||
497 | ret = -EFAULT; | ||
498 | goto out; | ||
499 | } | ||
500 | ret = len; | ||
501 | out: | ||
502 | mutex_unlock(&user->lock); | ||
503 | return ret; | ||
504 | } | ||
505 | |||
506 | static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) | ||
507 | { | ||
508 | struct devkmsg_user *user = file->private_data; | ||
509 | loff_t ret = 0; | ||
510 | |||
511 | if (!user) | ||
512 | return -EBADF; | ||
513 | if (offset) | ||
514 | return -ESPIPE; | ||
515 | |||
516 | raw_spin_lock(&logbuf_lock); | ||
517 | switch (whence) { | ||
518 | case SEEK_SET: | ||
519 | /* the first record */ | ||
520 | user->idx = log_first_idx; | ||
521 | user->seq = log_first_seq; | ||
522 | break; | ||
523 | case SEEK_DATA: | ||
524 | /* | ||
525 | * The first record after the last SYSLOG_ACTION_CLEAR, | ||
526 | * like issued by 'dmesg -c'. Reading /dev/kmsg itself | ||
527 | * changes no global state, and does not clear anything. | ||
528 | */ | ||
529 | user->idx = clear_idx; | ||
530 | user->seq = clear_seq; | ||
531 | break; | ||
532 | case SEEK_END: | ||
533 | /* after the last record */ | ||
534 | user->idx = log_next_idx; | ||
535 | user->seq = log_next_seq; | ||
536 | break; | ||
537 | default: | ||
538 | ret = -EINVAL; | ||
539 | } | ||
540 | raw_spin_unlock(&logbuf_lock); | ||
541 | return ret; | ||
542 | } | ||
543 | |||
544 | static unsigned int devkmsg_poll(struct file *file, poll_table *wait) | ||
545 | { | ||
546 | struct devkmsg_user *user = file->private_data; | ||
547 | int ret = 0; | ||
548 | |||
549 | if (!user) | ||
550 | return POLLERR|POLLNVAL; | ||
551 | |||
552 | poll_wait(file, &log_wait, wait); | ||
553 | |||
554 | raw_spin_lock(&logbuf_lock); | ||
555 | if (user->seq < log_next_seq) { | ||
556 | /* return error when data has vanished underneath us */ | ||
557 | if (user->seq < log_first_seq) | ||
558 | ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; | ||
559 | ret = POLLIN|POLLRDNORM; | ||
560 | } | ||
561 | raw_spin_unlock(&logbuf_lock); | ||
562 | |||
563 | return ret; | ||
564 | } | ||
565 | |||
566 | static int devkmsg_open(struct inode *inode, struct file *file) | ||
567 | { | ||
568 | struct devkmsg_user *user; | ||
569 | int err; | ||
570 | |||
571 | /* write-only does not need any file context */ | ||
572 | if ((file->f_flags & O_ACCMODE) == O_WRONLY) | ||
573 | return 0; | ||
574 | |||
575 | err = security_syslog(SYSLOG_ACTION_READ_ALL); | ||
576 | if (err) | ||
577 | return err; | ||
578 | |||
579 | user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL); | ||
580 | if (!user) | ||
581 | return -ENOMEM; | ||
582 | |||
583 | mutex_init(&user->lock); | ||
584 | |||
585 | raw_spin_lock(&logbuf_lock); | ||
586 | user->idx = log_first_idx; | ||
587 | user->seq = log_first_seq; | ||
588 | raw_spin_unlock(&logbuf_lock); | ||
589 | |||
590 | file->private_data = user; | ||
591 | return 0; | ||
592 | } | ||
593 | |||
594 | static int devkmsg_release(struct inode *inode, struct file *file) | ||
595 | { | ||
596 | struct devkmsg_user *user = file->private_data; | ||
597 | |||
598 | if (!user) | ||
599 | return 0; | ||
600 | |||
601 | mutex_destroy(&user->lock); | ||
602 | kfree(user); | ||
603 | return 0; | ||
604 | } | ||
605 | |||
606 | const struct file_operations kmsg_fops = { | ||
607 | .open = devkmsg_open, | ||
608 | .read = devkmsg_read, | ||
609 | .aio_write = devkmsg_writev, | ||
610 | .llseek = devkmsg_llseek, | ||
611 | .poll = devkmsg_poll, | ||
612 | .release = devkmsg_release, | ||
613 | }; | ||
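A user-space sketch of a /dev/kmsg consumer follows, assuming a kernel that carries the interface above (reading usually needs CAP_SYSLOG). Each read() returns one record in the "level,seqnum,timestamp;<text>\n" form described earlier, with any dictionary lines appended in the same buffer.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char rec[8192];
	int fd = open("/dev/kmsg", O_RDONLY | O_NONBLOCK);

	if (fd < 0) {
		perror("open /dev/kmsg");
		return 1;
	}

	for (;;) {
		ssize_t n = read(fd, rec, sizeof(rec) - 1);

		if (n < 0) {
			if (errno == EAGAIN)
				break;		/* caught up with log_next_seq */
			if (errno == EPIPE)
				continue;	/* records overwritten; reader was reset */
			perror("read");
			break;
		}
		rec[n] = '\0';

		/* Everything before the first ';' is "level,seqnum,ts_usec". */
		char *text = strchr(rec, ';');
		if (text)
			printf("prefix=%.*s text=%s",
			       (int)(text - rec), rec, text + 1);
	}
	close(fd);
	return 0;
}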
155 | 614 | ||
156 | #ifdef CONFIG_KEXEC | 615 | #ifdef CONFIG_KEXEC |
157 | /* | 616 | /* |
@@ -165,9 +624,9 @@ static int saved_console_loglevel = -1; | |||
165 | void log_buf_kexec_setup(void) | 624 | void log_buf_kexec_setup(void) |
166 | { | 625 | { |
167 | VMCOREINFO_SYMBOL(log_buf); | 626 | VMCOREINFO_SYMBOL(log_buf); |
168 | VMCOREINFO_SYMBOL(log_end); | ||
169 | VMCOREINFO_SYMBOL(log_buf_len); | 627 | VMCOREINFO_SYMBOL(log_buf_len); |
170 | VMCOREINFO_SYMBOL(logged_chars); | 628 | VMCOREINFO_SYMBOL(log_first_idx); |
629 | VMCOREINFO_SYMBOL(log_next_idx); | ||
171 | } | 630 | } |
172 | #endif | 631 | #endif |
173 | 632 | ||
@@ -191,7 +650,6 @@ early_param("log_buf_len", log_buf_len_setup); | |||
191 | void __init setup_log_buf(int early) | 650 | void __init setup_log_buf(int early) |
192 | { | 651 | { |
193 | unsigned long flags; | 652 | unsigned long flags; |
194 | unsigned start, dest_idx, offset; | ||
195 | char *new_log_buf; | 653 | char *new_log_buf; |
196 | int free; | 654 | int free; |
197 | 655 | ||
@@ -219,20 +677,8 @@ void __init setup_log_buf(int early) | |||
219 | log_buf_len = new_log_buf_len; | 677 | log_buf_len = new_log_buf_len; |
220 | log_buf = new_log_buf; | 678 | log_buf = new_log_buf; |
221 | new_log_buf_len = 0; | 679 | new_log_buf_len = 0; |
222 | free = __LOG_BUF_LEN - log_end; | 680 | free = __LOG_BUF_LEN - log_next_idx; |
223 | 681 | memcpy(log_buf, __log_buf, __LOG_BUF_LEN); | |
224 | offset = start = min(con_start, log_start); | ||
225 | dest_idx = 0; | ||
226 | while (start != log_end) { | ||
227 | unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1); | ||
228 | |||
229 | log_buf[dest_idx] = __log_buf[log_idx_mask]; | ||
230 | start++; | ||
231 | dest_idx++; | ||
232 | } | ||
233 | log_start -= offset; | ||
234 | con_start -= offset; | ||
235 | log_end -= offset; | ||
236 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 682 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
237 | 683 | ||
238 | pr_info("log_buf_len: %d\n", log_buf_len); | 684 | pr_info("log_buf_len: %d\n", log_buf_len); |
@@ -332,11 +778,202 @@ static int check_syslog_permissions(int type, bool from_file) | |||
332 | return 0; | 778 | return 0; |
333 | } | 779 | } |
334 | 780 | ||
781 | #if defined(CONFIG_PRINTK_TIME) | ||
782 | static bool printk_time = 1; | ||
783 | #else | ||
784 | static bool printk_time; | ||
785 | #endif | ||
786 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); | ||
787 | |||
788 | static size_t print_prefix(const struct log *msg, bool syslog, char *buf) | ||
789 | { | ||
790 | size_t len = 0; | ||
791 | |||
792 | if (syslog) { | ||
793 | if (buf) { | ||
794 | len += sprintf(buf, "<%u>", msg->level); | ||
795 | } else { | ||
796 | len += 3; | ||
797 | if (msg->level > 9) | ||
798 | len++; | ||
799 | if (msg->level > 99) | ||
800 | len++; | ||
801 | } | ||
802 | } | ||
803 | |||
804 | if (printk_time) { | ||
805 | if (buf) { | ||
806 | unsigned long long ts = msg->ts_nsec; | ||
807 | unsigned long rem_nsec = do_div(ts, 1000000000); | ||
808 | |||
809 | len += sprintf(buf + len, "[%5lu.%06lu] ", | ||
810 | (unsigned long) ts, rem_nsec / 1000); | ||
811 | } else { | ||
812 | len += 15; | ||
813 | } | ||
814 | } | ||
815 | |||
816 | return len; | ||
817 | } | ||
818 | |||
819 | static size_t msg_print_text(const struct log *msg, bool syslog, | ||
820 | char *buf, size_t size) | ||
821 | { | ||
822 | const char *text = log_text(msg); | ||
823 | size_t text_size = msg->text_len; | ||
824 | size_t len = 0; | ||
825 | |||
826 | do { | ||
827 | const char *next = memchr(text, '\n', text_size); | ||
828 | size_t text_len; | ||
829 | |||
830 | if (next) { | ||
831 | text_len = next - text; | ||
832 | next++; | ||
833 | text_size -= next - text; | ||
834 | } else { | ||
835 | text_len = text_size; | ||
836 | } | ||
837 | |||
838 | if (buf) { | ||
839 | if (print_prefix(msg, syslog, NULL) + | ||
840 | text_len + 1 >= size - len) | ||
841 | break; | ||
842 | |||
843 | len += print_prefix(msg, syslog, buf + len); | ||
844 | memcpy(buf + len, text, text_len); | ||
845 | len += text_len; | ||
846 | buf[len++] = '\n'; | ||
847 | } else { | ||
848 | /* SYSLOG_ACTION_* buffer size-only calculation */ | ||
849 | len += print_prefix(msg, syslog, NULL); | ||
850 | len += text_len + 1; | ||
851 | } | ||
852 | |||
853 | text = next; | ||
854 | } while (text); | ||
855 | |||
856 | return len; | ||
857 | } | ||
858 | |||
859 | static int syslog_print(char __user *buf, int size) | ||
860 | { | ||
861 | char *text; | ||
862 | struct log *msg; | ||
863 | int len; | ||
864 | |||
865 | text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); | ||
866 | if (!text) | ||
867 | return -ENOMEM; | ||
868 | |||
869 | raw_spin_lock_irq(&logbuf_lock); | ||
870 | if (syslog_seq < log_first_seq) { | ||
871 | /* messages are gone, move to first one */ | ||
872 | syslog_seq = log_first_seq; | ||
873 | syslog_idx = log_first_idx; | ||
874 | } | ||
875 | msg = log_from_idx(syslog_idx); | ||
876 | len = msg_print_text(msg, true, text, LOG_LINE_MAX); | ||
877 | syslog_idx = log_next(syslog_idx); | ||
878 | syslog_seq++; | ||
879 | raw_spin_unlock_irq(&logbuf_lock); | ||
880 | |||
881 | if (len > 0 && copy_to_user(buf, text, len)) | ||
882 | len = -EFAULT; | ||
883 | |||
884 | kfree(text); | ||
885 | return len; | ||
886 | } | ||
887 | |||
888 | static int syslog_print_all(char __user *buf, int size, bool clear) | ||
889 | { | ||
890 | char *text; | ||
891 | int len = 0; | ||
892 | |||
893 | text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); | ||
894 | if (!text) | ||
895 | return -ENOMEM; | ||
896 | |||
897 | raw_spin_lock_irq(&logbuf_lock); | ||
898 | if (buf) { | ||
899 | u64 next_seq; | ||
900 | u64 seq; | ||
901 | u32 idx; | ||
902 | |||
903 | if (clear_seq < log_first_seq) { | ||
904 | /* messages are gone, move to first available one */ | ||
905 | clear_seq = log_first_seq; | ||
906 | clear_idx = log_first_idx; | ||
907 | } | ||
908 | |||
909 | /* | ||
910 | * Find first record that fits, including all following records, | ||
911 | * into the user-provided buffer for this dump. | ||
912 | */ | ||
913 | seq = clear_seq; | ||
914 | idx = clear_idx; | ||
915 | while (seq < log_next_seq) { | ||
916 | struct log *msg = log_from_idx(idx); | ||
917 | |||
918 | len += msg_print_text(msg, true, NULL, 0); | ||
919 | idx = log_next(idx); | ||
920 | seq++; | ||
921 | } | ||
922 | seq = clear_seq; | ||
923 | idx = clear_idx; | ||
924 | while (len > size && seq < log_next_seq) { | ||
925 | struct log *msg = log_from_idx(idx); | ||
926 | |||
927 | len -= msg_print_text(msg, true, NULL, 0); | ||
928 | idx = log_next(idx); | ||
929 | seq++; | ||
930 | } | ||
931 | |||
932 | /* last message in this dump */ | ||
933 | next_seq = log_next_seq; | ||
934 | |||
935 | len = 0; | ||
936 | while (len >= 0 && seq < next_seq) { | ||
937 | struct log *msg = log_from_idx(idx); | ||
938 | int textlen; | ||
939 | |||
940 | textlen = msg_print_text(msg, true, text, LOG_LINE_MAX); | ||
941 | if (textlen < 0) { | ||
942 | len = textlen; | ||
943 | break; | ||
944 | } | ||
945 | idx = log_next(idx); | ||
946 | seq++; | ||
947 | |||
948 | raw_spin_unlock_irq(&logbuf_lock); | ||
949 | if (copy_to_user(buf + len, text, textlen)) | ||
950 | len = -EFAULT; | ||
951 | else | ||
952 | len += textlen; | ||
953 | raw_spin_lock_irq(&logbuf_lock); | ||
954 | |||
955 | if (seq < log_first_seq) { | ||
956 | /* messages are gone, move to next one */ | ||
957 | seq = log_first_seq; | ||
958 | idx = log_first_idx; | ||
959 | } | ||
960 | } | ||
961 | } | ||
962 | |||
963 | if (clear) { | ||
964 | clear_seq = log_next_seq; | ||
965 | clear_idx = log_next_idx; | ||
966 | } | ||
967 | raw_spin_unlock_irq(&logbuf_lock); | ||
968 | |||
969 | kfree(text); | ||
970 | return len; | ||
971 | } | ||
972 | |||
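The rewritten syslog_print()/syslog_print_all() above are what syslog(2) callers now hit. Here is a short user-space check via glibc's klogctl(); the numeric actions used (10 = SYSLOG_ACTION_SIZE_BUFFER, 3 = SYSLOG_ACTION_READ_ALL) are the standard syslog(2) values.

#include <stdio.h>
#include <stdlib.h>
#include <sys/klog.h>

int main(void)
{
	int size = klogctl(10, NULL, 0);	/* SYSLOG_ACTION_SIZE_BUFFER: log_buf_len */
	char *buf;
	int len;

	if (size <= 0)
		return 1;
	buf = malloc(size);
	if (!buf)
		return 1;

	/* SYSLOG_ACTION_READ_ALL: lands in syslog_print_all() above. */
	len = klogctl(3, buf, size);
	if (len > 0)
		fwrite(buf, 1, len, stdout);

	free(buf);
	return 0;
}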
335 | int do_syslog(int type, char __user *buf, int len, bool from_file) | 973 | int do_syslog(int type, char __user *buf, int len, bool from_file) |
336 | { | 974 | { |
337 | unsigned i, j, limit, count; | 975 | bool clear = false; |
338 | int do_clear = 0; | 976 | static int saved_console_loglevel = -1; |
339 | char c; | ||
340 | int error; | 977 | int error; |
341 | 978 | ||
342 | error = check_syslog_permissions(type, from_file); | 979 | error = check_syslog_permissions(type, from_file); |
@@ -364,28 +1001,14 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
364 | goto out; | 1001 | goto out; |
365 | } | 1002 | } |
366 | error = wait_event_interruptible(log_wait, | 1003 | error = wait_event_interruptible(log_wait, |
367 | (log_start - log_end)); | 1004 | syslog_seq != log_next_seq); |
368 | if (error) | 1005 | if (error) |
369 | goto out; | 1006 | goto out; |
370 | i = 0; | 1007 | error = syslog_print(buf, len); |
371 | raw_spin_lock_irq(&logbuf_lock); | ||
372 | while (!error && (log_start != log_end) && i < len) { | ||
373 | c = LOG_BUF(log_start); | ||
374 | log_start++; | ||
375 | raw_spin_unlock_irq(&logbuf_lock); | ||
376 | error = __put_user(c,buf); | ||
377 | buf++; | ||
378 | i++; | ||
379 | cond_resched(); | ||
380 | raw_spin_lock_irq(&logbuf_lock); | ||
381 | } | ||
382 | raw_spin_unlock_irq(&logbuf_lock); | ||
383 | if (!error) | ||
384 | error = i; | ||
385 | break; | 1008 | break; |
386 | /* Read/clear last kernel messages */ | 1009 | /* Read/clear last kernel messages */ |
387 | case SYSLOG_ACTION_READ_CLEAR: | 1010 | case SYSLOG_ACTION_READ_CLEAR: |
388 | do_clear = 1; | 1011 | clear = true; |
389 | /* FALL THRU */ | 1012 | /* FALL THRU */ |
390 | /* Read last kernel messages */ | 1013 | /* Read last kernel messages */ |
391 | case SYSLOG_ACTION_READ_ALL: | 1014 | case SYSLOG_ACTION_READ_ALL: |
@@ -399,52 +1022,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
399 | error = -EFAULT; | 1022 | error = -EFAULT; |
400 | goto out; | 1023 | goto out; |
401 | } | 1024 | } |
402 | count = len; | 1025 | error = syslog_print_all(buf, len, clear); |
403 | if (count > log_buf_len) | ||
404 | count = log_buf_len; | ||
405 | raw_spin_lock_irq(&logbuf_lock); | ||
406 | if (count > logged_chars) | ||
407 | count = logged_chars; | ||
408 | if (do_clear) | ||
409 | logged_chars = 0; | ||
410 | limit = log_end; | ||
411 | /* | ||
412 | * __put_user() could sleep, and while we sleep | ||
413 | * printk() could overwrite the messages | ||
414 | * we try to copy to user space. Therefore | ||
415 | * the messages are copied in reverse. <manfreds> | ||
416 | */ | ||
417 | for (i = 0; i < count && !error; i++) { | ||
418 | j = limit-1-i; | ||
419 | if (j + log_buf_len < log_end) | ||
420 | break; | ||
421 | c = LOG_BUF(j); | ||
422 | raw_spin_unlock_irq(&logbuf_lock); | ||
423 | error = __put_user(c,&buf[count-1-i]); | ||
424 | cond_resched(); | ||
425 | raw_spin_lock_irq(&logbuf_lock); | ||
426 | } | ||
427 | raw_spin_unlock_irq(&logbuf_lock); | ||
428 | if (error) | ||
429 | break; | ||
430 | error = i; | ||
431 | if (i != count) { | ||
432 | int offset = count-error; | ||
433 | /* buffer overflow during copy, correct user buffer. */ | ||
434 | for (i = 0; i < error; i++) { | ||
435 | if (__get_user(c,&buf[i+offset]) || | ||
436 | __put_user(c,&buf[i])) { | ||
437 | error = -EFAULT; | ||
438 | break; | ||
439 | } | ||
440 | cond_resched(); | ||
441 | } | ||
442 | } | ||
443 | break; | 1026 | break; |
444 | /* Clear ring buffer */ | 1027 | /* Clear ring buffer */ |
445 | case SYSLOG_ACTION_CLEAR: | 1028 | case SYSLOG_ACTION_CLEAR: |
446 | logged_chars = 0; | 1029 | syslog_print_all(NULL, 0, true); |
447 | break; | ||
448 | /* Disable logging to console */ | 1030 | /* Disable logging to console */ |
449 | case SYSLOG_ACTION_CONSOLE_OFF: | 1031 | case SYSLOG_ACTION_CONSOLE_OFF: |
450 | if (saved_console_loglevel == -1) | 1032 | if (saved_console_loglevel == -1) |
@@ -472,7 +1054,35 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
472 | break; | 1054 | break; |
473 | /* Number of chars in the log buffer */ | 1055 | /* Number of chars in the log buffer */ |
474 | case SYSLOG_ACTION_SIZE_UNREAD: | 1056 | case SYSLOG_ACTION_SIZE_UNREAD: |
475 | error = log_end - log_start; | 1057 | raw_spin_lock_irq(&logbuf_lock); |
1058 | if (syslog_seq < log_first_seq) { | ||
1059 | /* messages are gone, move to first one */ | ||
1060 | syslog_seq = log_first_seq; | ||
1061 | syslog_idx = log_first_idx; | ||
1062 | } | ||
1063 | if (from_file) { | ||
1064 | /* | ||
1065 | * Short-cut for poll() on /proc/kmsg, which simply checks | ||
1066 | * for pending data, not the size; return the count of | ||
1067 | * records, not the length. | ||
1068 | */ | ||
1069 | error = log_next_idx - syslog_idx; | ||
1070 | } else { | ||
1071 | u64 seq; | ||
1072 | u32 idx; | ||
1073 | |||
1074 | error = 0; | ||
1075 | seq = syslog_seq; | ||
1076 | idx = syslog_idx; | ||
1077 | while (seq < log_next_seq) { | ||
1078 | struct log *msg = log_from_idx(idx); | ||
1079 | |||
1080 | error += msg_print_text(msg, true, NULL, 0); | ||
1081 | idx = log_next(idx); | ||
1082 | seq++; | ||
1083 | } | ||
1084 | } | ||
1085 | raw_spin_unlock_irq(&logbuf_lock); | ||
476 | break; | 1086 | break; |
477 | /* Size of the log buffer */ | 1087 | /* Size of the log buffer */ |
478 | case SYSLOG_ACTION_SIZE_BUFFER: | 1088 | case SYSLOG_ACTION_SIZE_BUFFER: |
@@ -501,29 +1111,11 @@ void kdb_syslog_data(char *syslog_data[4]) | |||
501 | { | 1111 | { |
502 | syslog_data[0] = log_buf; | 1112 | syslog_data[0] = log_buf; |
503 | syslog_data[1] = log_buf + log_buf_len; | 1113 | syslog_data[1] = log_buf + log_buf_len; |
504 | syslog_data[2] = log_buf + log_end - | 1114 | syslog_data[2] = log_buf + log_first_idx; |
505 | (logged_chars < log_buf_len ? logged_chars : log_buf_len); | 1115 | syslog_data[3] = log_buf + log_next_idx; |
506 | syslog_data[3] = log_buf + log_end; | ||
507 | } | 1116 | } |
508 | #endif /* CONFIG_KGDB_KDB */ | 1117 | #endif /* CONFIG_KGDB_KDB */ |
509 | 1118 | ||
510 | /* | ||
511 | * Call the console drivers on a range of log_buf | ||
512 | */ | ||
513 | static void __call_console_drivers(unsigned start, unsigned end) | ||
514 | { | ||
515 | struct console *con; | ||
516 | |||
517 | for_each_console(con) { | ||
518 | if (exclusive_console && con != exclusive_console) | ||
519 | continue; | ||
520 | if ((con->flags & CON_ENABLED) && con->write && | ||
521 | (cpu_online(smp_processor_id()) || | ||
522 | (con->flags & CON_ANYTIME))) | ||
523 | con->write(con, &LOG_BUF(start), end - start); | ||
524 | } | ||
525 | } | ||
526 | |||
527 | static bool __read_mostly ignore_loglevel; | 1119 | static bool __read_mostly ignore_loglevel; |
528 | 1120 | ||
529 | static int __init ignore_loglevel_setup(char *str) | 1121 | static int __init ignore_loglevel_setup(char *str) |
@@ -540,142 +1132,33 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" | |||
540 | "print all kernel messages to the console."); | 1132 | "print all kernel messages to the console."); |
541 | 1133 | ||
542 | /* | 1134 | /* |
543 | * Write out chars from start to end - 1 inclusive | ||
544 | */ | ||
545 | static void _call_console_drivers(unsigned start, | ||
546 | unsigned end, int msg_log_level) | ||
547 | { | ||
548 | trace_console(&LOG_BUF(0), start, end, log_buf_len); | ||
549 | |||
550 | if ((msg_log_level < console_loglevel || ignore_loglevel) && | ||
551 | console_drivers && start != end) { | ||
552 | if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { | ||
553 | /* wrapped write */ | ||
554 | __call_console_drivers(start & LOG_BUF_MASK, | ||
555 | log_buf_len); | ||
556 | __call_console_drivers(0, end & LOG_BUF_MASK); | ||
557 | } else { | ||
558 | __call_console_drivers(start, end); | ||
559 | } | ||
560 | } | ||
561 | } | ||
562 | |||
563 | /* | ||
564 | * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the | ||
565 | * lower 3 bit are the log level, the rest are the log facility. In case | ||
566 | * userspace passes usual userspace syslog messages to /dev/kmsg or | ||
567 | * /dev/ttyprintk, the log prefix might contain the facility. Printk needs | ||
568 | * to extract the correct log level for in-kernel processing, and not mangle | ||
569 | * the original value. | ||
570 | * | ||
571 | * If a prefix is found, the length of the prefix is returned. If 'level' is | ||
572 | * passed, it will be filled in with the log level without a possible facility | ||
573 | * value. If 'special' is passed, the special printk prefix chars are accepted | ||
574 | * and returned. If no valid header is found, 0 is returned and the passed | ||
575 | * variables are not touched. | ||
576 | */ | ||
577 | static size_t log_prefix(const char *p, unsigned int *level, char *special) | ||
578 | { | ||
579 | unsigned int lev = 0; | ||
580 | char sp = '\0'; | ||
581 | size_t len; | ||
582 | |||
583 | if (p[0] != '<' || !p[1]) | ||
584 | return 0; | ||
585 | if (p[2] == '>') { | ||
586 | /* usual single digit level number or special char */ | ||
587 | switch (p[1]) { | ||
588 | case '0' ... '7': | ||
589 | lev = p[1] - '0'; | ||
590 | break; | ||
591 | case 'c': /* KERN_CONT */ | ||
592 | case 'd': /* KERN_DEFAULT */ | ||
593 | sp = p[1]; | ||
594 | break; | ||
595 | default: | ||
596 | return 0; | ||
597 | } | ||
598 | len = 3; | ||
599 | } else { | ||
600 | /* multi digit including the level and facility number */ | ||
601 | char *endp = NULL; | ||
602 | |||
603 | lev = (simple_strtoul(&p[1], &endp, 10) & 7); | ||
604 | if (endp == NULL || endp[0] != '>') | ||
605 | return 0; | ||
606 | len = (endp + 1) - p; | ||
607 | } | ||
608 | |||
609 | /* do not accept special char if not asked for */ | ||
610 | if (sp && !special) | ||
611 | return 0; | ||
612 | |||
613 | if (special) { | ||
614 | *special = sp; | ||
615 | /* return special char, do not touch level */ | ||
616 | if (sp) | ||
617 | return len; | ||
618 | } | ||
619 | |||
620 | if (level) | ||
621 | *level = lev; | ||
622 | return len; | ||
623 | } | ||
624 | |||
625 | /* | ||
626 | * Call the console drivers, asking them to write out | 1135 | * Call the console drivers, asking them to write out |
627 | * log_buf[start] to log_buf[end - 1]. | 1136 | * log_buf[start] to log_buf[end - 1]. |
628 | * The console_lock must be held. | 1137 | * The console_lock must be held. |
629 | */ | 1138 | */ |
630 | static void call_console_drivers(unsigned start, unsigned end) | 1139 | static void call_console_drivers(int level, const char *text, size_t len) |
631 | { | 1140 | { |
632 | unsigned cur_index, start_print; | 1141 | struct console *con; |
633 | static int msg_level = -1; | ||
634 | 1142 | ||
635 | BUG_ON(((int)(start - end)) > 0); | 1143 | trace_console(text, 0, len, len); |
636 | 1144 | ||
637 | cur_index = start; | 1145 | if (level >= console_loglevel && !ignore_loglevel) |
638 | start_print = start; | 1146 | return; |
639 | while (cur_index != end) { | 1147 | if (!console_drivers) |
640 | if (msg_level < 0 && ((end - cur_index) > 2)) { | 1148 | return; |
641 | /* strip log prefix */ | ||
642 | cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL); | ||
643 | start_print = cur_index; | ||
644 | } | ||
645 | while (cur_index != end) { | ||
646 | char c = LOG_BUF(cur_index); | ||
647 | |||
648 | cur_index++; | ||
649 | if (c == '\n') { | ||
650 | if (msg_level < 0) { | ||
651 | /* | ||
652 | * printk() has already given us loglevel tags in | ||
653 | * the buffer. This code is here in case the | ||
654 | * log buffer has wrapped right round and scribbled | ||
655 | * on those tags | ||
656 | */ | ||
657 | msg_level = default_message_loglevel; | ||
658 | } | ||
659 | _call_console_drivers(start_print, cur_index, msg_level); | ||
660 | msg_level = -1; | ||
661 | start_print = cur_index; | ||
662 | break; | ||
663 | } | ||
664 | } | ||
665 | } | ||
666 | _call_console_drivers(start_print, end, msg_level); | ||
667 | } | ||
668 | 1149 | ||
669 | static void emit_log_char(char c) | 1150 | for_each_console(con) { |
670 | { | 1151 | if (exclusive_console && con != exclusive_console) |
671 | LOG_BUF(log_end) = c; | 1152 | continue; |
672 | log_end++; | 1153 | if (!(con->flags & CON_ENABLED)) |
673 | if (log_end - log_start > log_buf_len) | 1154 | continue; |
674 | log_start = log_end - log_buf_len; | 1155 | if (!con->write) |
675 | if (log_end - con_start > log_buf_len) | 1156 | continue; |
676 | con_start = log_end - log_buf_len; | 1157 | if (!cpu_online(smp_processor_id()) && |
677 | if (logged_chars < log_buf_len) | 1158 | !(con->flags & CON_ANYTIME)) |
678 | logged_chars++; | 1159 | continue; |
1160 | con->write(con, text, len); | ||
1161 | } | ||
679 | } | 1162 | } |
680 | 1163 | ||
681 | /* | 1164 | /* |
@@ -700,16 +1183,6 @@ static void zap_locks(void) | |||
700 | sema_init(&console_sem, 1); | 1183 | sema_init(&console_sem, 1); |
701 | } | 1184 | } |
702 | 1185 | ||
703 | #if defined(CONFIG_PRINTK_TIME) | ||
704 | static bool printk_time = 1; | ||
705 | #else | ||
706 | static bool printk_time = 0; | ||
707 | #endif | ||
708 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); | ||
709 | |||
710 | static bool always_kmsg_dump; | ||
711 | module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); | ||
712 | |||
713 | /* Check if we have any console registered that can be called early in boot. */ | 1186 | /* Check if we have any console registered that can be called early in boot. */ |
714 | static int have_callable_console(void) | 1187 | static int have_callable_console(void) |
715 | { | 1188 | { |
@@ -722,51 +1195,6 @@ static int have_callable_console(void) | |||
722 | return 0; | 1195 | return 0; |
723 | } | 1196 | } |
724 | 1197 | ||
725 | /** | ||
726 | * printk - print a kernel message | ||
727 | * @fmt: format string | ||
728 | * | ||
729 | * This is printk(). It can be called from any context. We want it to work. | ||
730 | * | ||
731 | * We try to grab the console_lock. If we succeed, it's easy - we log the output and | ||
732 | * call the console drivers. If we fail to get the semaphore we place the output | ||
733 | * into the log buffer and return. The current holder of the console_sem will | ||
734 | * notice the new output in console_unlock(); and will send it to the | ||
735 | * consoles before releasing the lock. | ||
736 | * | ||
737 | * One effect of this deferred printing is that code which calls printk() and | ||
738 | * then changes console_loglevel may break. This is because console_loglevel | ||
739 | * is inspected when the actual printing occurs. | ||
740 | * | ||
741 | * See also: | ||
742 | * printf(3) | ||
743 | * | ||
744 | * See the vsnprintf() documentation for format string extensions over C99. | ||
745 | */ | ||
746 | |||
747 | asmlinkage int printk(const char *fmt, ...) | ||
748 | { | ||
749 | va_list args; | ||
750 | int r; | ||
751 | |||
752 | #ifdef CONFIG_KGDB_KDB | ||
753 | if (unlikely(kdb_trap_printk)) { | ||
754 | va_start(args, fmt); | ||
755 | r = vkdb_printf(fmt, args); | ||
756 | va_end(args); | ||
757 | return r; | ||
758 | } | ||
759 | #endif | ||
760 | va_start(args, fmt); | ||
761 | r = vprintk(fmt, args); | ||
762 | va_end(args); | ||
763 | |||
764 | return r; | ||
765 | } | ||
766 | |||
767 | /* cpu currently holding logbuf_lock */ | ||
768 | static volatile unsigned int printk_cpu = UINT_MAX; | ||
769 | |||
770 | /* | 1198 | /* |
771 | * Can we actually use the console at this time on this cpu? | 1199 | * Can we actually use the console at this time on this cpu? |
772 | * | 1200 | * |
@@ -810,17 +1238,12 @@ static int console_trylock_for_printk(unsigned int cpu) | |||
810 | retval = 0; | 1238 | retval = 0; |
811 | } | 1239 | } |
812 | } | 1240 | } |
813 | printk_cpu = UINT_MAX; | 1241 | logbuf_cpu = UINT_MAX; |
814 | if (wake) | 1242 | if (wake) |
815 | up(&console_sem); | 1243 | up(&console_sem); |
816 | raw_spin_unlock(&logbuf_lock); | 1244 | raw_spin_unlock(&logbuf_lock); |
817 | return retval; | 1245 | return retval; |
818 | } | 1246 | } |
819 | static const char recursion_bug_msg [] = | ||
820 | KERN_CRIT "BUG: recent printk recursion!\n"; | ||
821 | static int recursion_bug; | ||
822 | static int new_text_line = 1; | ||
823 | static char printk_buf[1024]; | ||
824 | 1247 | ||
825 | int printk_delay_msec __read_mostly; | 1248 | int printk_delay_msec __read_mostly; |
826 | 1249 | ||
@@ -836,15 +1259,23 @@ static inline void printk_delay(void) | |||
836 | } | 1259 | } |
837 | } | 1260 | } |
838 | 1261 | ||
839 | asmlinkage int vprintk(const char *fmt, va_list args) | 1262 | asmlinkage int vprintk_emit(int facility, int level, |
1263 | const char *dict, size_t dictlen, | ||
1264 | const char *fmt, va_list args) | ||
840 | { | 1265 | { |
841 | int printed_len = 0; | 1266 | static int recursion_bug; |
842 | int current_log_level = default_message_loglevel; | 1267 | static char cont_buf[LOG_LINE_MAX]; |
1268 | static size_t cont_len; | ||
1269 | static int cont_level; | ||
1270 | static struct task_struct *cont_task; | ||
1271 | static char textbuf[LOG_LINE_MAX]; | ||
1272 | char *text = textbuf; | ||
1273 | size_t text_len; | ||
843 | unsigned long flags; | 1274 | unsigned long flags; |
844 | int this_cpu; | 1275 | int this_cpu; |
845 | char *p; | 1276 | bool newline = false; |
846 | size_t plen; | 1277 | bool prefix = false; |
847 | char special; | 1278 | int printed_len = 0; |
848 | 1279 | ||
849 | boot_delay_msec(); | 1280 | boot_delay_msec(); |
850 | printk_delay(); | 1281 | printk_delay(); |
@@ -856,7 +1287,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
856 | /* | 1287 | /* |
857 | * Ouch, printk recursed into itself! | 1288 | * Ouch, printk recursed into itself! |
858 | */ | 1289 | */ |
859 | if (unlikely(printk_cpu == this_cpu)) { | 1290 | if (unlikely(logbuf_cpu == this_cpu)) { |
860 | /* | 1291 | /* |
861 | * If a crash is occurring during printk() on this CPU, | 1292 | * If a crash is occurring during printk() on this CPU, |
862 | * then try to get the crash message out but make sure | 1293 | * then try to get the crash message out but make sure |
@@ -873,97 +1304,110 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
873 | 1304 | ||
874 | lockdep_off(); | 1305 | lockdep_off(); |
875 | raw_spin_lock(&logbuf_lock); | 1306 | raw_spin_lock(&logbuf_lock); |
876 | printk_cpu = this_cpu; | 1307 | logbuf_cpu = this_cpu; |
877 | 1308 | ||
878 | if (recursion_bug) { | 1309 | if (recursion_bug) { |
1310 | static const char recursion_msg[] = | ||
1311 | "BUG: recent printk recursion!"; | ||
1312 | |||
879 | recursion_bug = 0; | 1313 | recursion_bug = 0; |
880 | strcpy(printk_buf, recursion_bug_msg); | 1314 | printed_len += strlen(recursion_msg); |
881 | printed_len = strlen(recursion_bug_msg); | 1315 | /* emit KERN_CRIT message */ |
1316 | log_store(0, 2, NULL, 0, recursion_msg, printed_len); | ||
882 | } | 1317 | } |
883 | /* Emit the output into the temporary buffer */ | ||
884 | printed_len += vscnprintf(printk_buf + printed_len, | ||
885 | sizeof(printk_buf) - printed_len, fmt, args); | ||
886 | 1318 | ||
887 | p = printk_buf; | 1319 | /* |
1320 | * The printf needs to come first; we need the syslog | ||
1321 | * prefix which might be passed-in as a parameter. | ||
1322 | */ | ||
1323 | text_len = vscnprintf(text, sizeof(textbuf), fmt, args); | ||
888 | 1324 | ||
889 | /* Read log level and handle special printk prefix */ | 1325 | /* mark and strip a trailing newline */ |
890 | plen = log_prefix(p, ¤t_log_level, &special); | 1326 | if (text_len && text[text_len-1] == '\n') { |
891 | if (plen) { | 1327 | text_len--; |
892 | p += plen; | 1328 | newline = true; |
1329 | } | ||
893 | 1330 | ||
894 | switch (special) { | 1331 | /* strip syslog prefix and extract log level or control flags */ |
895 | case 'c': /* Strip <c> KERN_CONT, continue line */ | 1332 | if (text[0] == '<' && text[1] && text[2] == '>') { |
896 | plen = 0; | 1333 | switch (text[1]) { |
897 | break; | 1334 | case '0' ... '7': |
898 | case 'd': /* Strip <d> KERN_DEFAULT, start new line */ | 1335 | if (level == -1) |
899 | plen = 0; | 1336 | level = text[1] - '0'; |
900 | default: | 1337 | case 'd': /* KERN_DEFAULT */ |
901 | if (!new_text_line) { | 1338 | prefix = true; |
902 | emit_log_char('\n'); | 1339 | case 'c': /* KERN_CONT */ |
903 | new_text_line = 1; | 1340 | text += 3; |
904 | } | 1341 | text_len -= 3; |
905 | } | 1342 | } |
906 | } | 1343 | } |
907 | 1344 | ||
908 | /* | 1345 | if (level == -1) |
909 | * Copy the output into log_buf. If the caller didn't provide | 1346 | level = default_message_loglevel; |
910 | * the appropriate log prefix, we insert them here | ||
911 | */ | ||
912 | for (; *p; p++) { | ||
913 | if (new_text_line) { | ||
914 | new_text_line = 0; | ||
915 | |||
916 | if (plen) { | ||
917 | /* Copy original log prefix */ | ||
918 | int i; | ||
919 | |||
920 | for (i = 0; i < plen; i++) | ||
921 | emit_log_char(printk_buf[i]); | ||
922 | printed_len += plen; | ||
923 | } else { | ||
924 | /* Add log prefix */ | ||
925 | emit_log_char('<'); | ||
926 | emit_log_char(current_log_level + '0'); | ||
927 | emit_log_char('>'); | ||
928 | printed_len += 3; | ||
929 | } | ||
930 | 1347 | ||
931 | if (printk_time) { | 1348 | if (dict) { |
932 | /* Add the current time stamp */ | 1349 | prefix = true; |
933 | char tbuf[50], *tp; | 1350 | newline = true; |
934 | unsigned tlen; | 1351 | } |
935 | unsigned long long t; | ||
936 | unsigned long nanosec_rem; | ||
937 | |||
938 | t = cpu_clock(printk_cpu); | ||
939 | nanosec_rem = do_div(t, 1000000000); | ||
940 | tlen = sprintf(tbuf, "[%5lu.%06lu] ", | ||
941 | (unsigned long) t, | ||
942 | nanosec_rem / 1000); | ||
943 | |||
944 | for (tp = tbuf; tp < tbuf + tlen; tp++) | ||
945 | emit_log_char(*tp); | ||
946 | printed_len += tlen; | ||
947 | } | ||
948 | 1352 | ||
949 | if (!*p) | 1353 | if (!newline) { |
950 | break; | 1354 | if (cont_len && (prefix || cont_task != current)) { |
1355 | /* | ||
1356 | * Flush earlier buffer, which is either from a | ||
1357 | * different thread, or when we got a new prefix. | ||
1358 | */ | ||
1359 | log_store(facility, cont_level, NULL, 0, cont_buf, cont_len); | ||
1360 | cont_len = 0; | ||
951 | } | 1361 | } |
952 | 1362 | ||
953 | emit_log_char(*p); | 1363 | if (!cont_len) { |
954 | if (*p == '\n') | 1364 | cont_level = level; |
955 | new_text_line = 1; | 1365 | cont_task = current; |
1366 | } | ||
1367 | |||
1368 | /* buffer or append to earlier buffer from the same thread */ | ||
1369 | if (cont_len + text_len > sizeof(cont_buf)) | ||
1370 | text_len = sizeof(cont_buf) - cont_len; | ||
1371 | memcpy(cont_buf + cont_len, text, text_len); | ||
1372 | cont_len += text_len; | ||
1373 | } else { | ||
1374 | if (cont_len && cont_task == current) { | ||
1375 | if (prefix) { | ||
1376 | /* | ||
1377 | * New prefix from the same thread; flush. We | ||
1378 | * either got no earlier newline, or we race | ||
1379 | * with an interrupt. | ||
1380 | */ | ||
1381 | log_store(facility, cont_level, | ||
1382 | NULL, 0, cont_buf, cont_len); | ||
1383 | cont_len = 0; | ||
1384 | } | ||
1385 | |||
1386 | /* append to the earlier buffer and flush */ | ||
1387 | if (cont_len + text_len > sizeof(cont_buf)) | ||
1388 | text_len = sizeof(cont_buf) - cont_len; | ||
1389 | memcpy(cont_buf + cont_len, text, text_len); | ||
1390 | cont_len += text_len; | ||
1391 | log_store(facility, cont_level, | ||
1392 | NULL, 0, cont_buf, cont_len); | ||
1393 | cont_len = 0; | ||
1394 | cont_task = NULL; | ||
1395 | printed_len = cont_len; | ||
1396 | } else { | ||
1397 | /* ordinary single and terminated line */ | ||
1398 | log_store(facility, level, | ||
1399 | dict, dictlen, text, text_len); | ||
1400 | printed_len = text_len; | ||
1401 | } | ||
956 | } | 1402 | } |
957 | 1403 | ||
958 | /* | 1404 | /* |
959 | * Try to acquire and then immediately release the | 1405 | * Try to acquire and then immediately release the console semaphore. |
960 | * console semaphore. The release will do all the | 1406 | * The release will print out buffers and wake up /dev/kmsg and syslog() |
961 | * actual magic (print out buffers, wake up klogd, | 1407 | * users. |
962 | * etc). | ||
963 | * | 1408 | * |
964 | * The console_trylock_for_printk() function | 1409 | * The console_trylock_for_printk() function will release 'logbuf_lock' |
965 | * will release 'logbuf_lock' regardless of whether it | 1410 | * regardless of whether it actually gets the console semaphore or not. |
966 | * actually gets the semaphore or not. | ||
967 | */ | 1411 | */ |
968 | if (console_trylock_for_printk(this_cpu)) | 1412 | if (console_trylock_for_printk(this_cpu)) |
969 | console_unlock(); | 1413 | console_unlock(); |
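Editorial note: the continuation handling in vprintk_emit() above buffers newline-less fragments from one task and merges them into a single record; a new prefix or a fragment from another task flushes whatever was buffered first. The user-space sketch below models only that state machine (assumptions: no locking, no facility or dict handling, callers pass explicit task ids and prefix/newline flags):

#include <stdio.h>
#include <string.h>

#define CONT_MAX 128

static char cont_buf[CONT_MAX];
static size_t cont_len;
static int cont_owner = -1;		/* task that owns cont_buf */

static void log_store(const char *text, size_t len)
{
	printf("record: \"%.*s\"\n", (int)len, text);
}

static void emit(int task, const char *text, int has_newline, int has_prefix)
{
	size_t len = strlen(text);

	if (cont_len && (has_prefix || cont_owner != task)) {
		/* flush a buffer from another task or an older prefix */
		log_store(cont_buf, cont_len);
		cont_len = 0;
	}
	if (!has_newline) {
		if (!cont_len)
			cont_owner = task;
		if (cont_len + len > sizeof(cont_buf))
			len = sizeof(cont_buf) - cont_len;
		memcpy(cont_buf + cont_len, text, len);
		cont_len += len;
		return;
	}
	if (cont_len && cont_owner == task) {
		/* append the final fragment and emit one merged record */
		if (cont_len + len > sizeof(cont_buf))
			len = sizeof(cont_buf) - cont_len;
		memcpy(cont_buf + cont_len, text, len);
		log_store(cont_buf, cont_len + len);
		cont_len = 0;
		cont_owner = -1;
	} else {
		log_store(text, len);	/* ordinary terminated line */
	}
}

int main(void)
{
	emit(1, "device ", 0, 1);	/* fragment without newline      */
	emit(1, "probed OK", 1, 0);	/* KERN_CONT-style completion    */
	emit(2, "other line", 1, 1);	/* unrelated, emitted on its own */
	return 0;
}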
@@ -974,16 +1418,81 @@ out_restore_irqs: | |||
974 | 1418 | ||
975 | return printed_len; | 1419 | return printed_len; |
976 | } | 1420 | } |
977 | EXPORT_SYMBOL(printk); | 1421 | EXPORT_SYMBOL(vprintk_emit); |
978 | EXPORT_SYMBOL(vprintk); | ||
979 | 1422 | ||
980 | #else | 1423 | asmlinkage int vprintk(const char *fmt, va_list args) |
1424 | { | ||
1425 | return vprintk_emit(0, -1, NULL, 0, fmt, args); | ||
1426 | } | ||
1427 | EXPORT_SYMBOL(vprintk); | ||
981 | 1428 | ||
982 | static void call_console_drivers(unsigned start, unsigned end) | 1429 | asmlinkage int printk_emit(int facility, int level, |
1430 | const char *dict, size_t dictlen, | ||
1431 | const char *fmt, ...) | ||
983 | { | 1432 | { |
1433 | va_list args; | ||
1434 | int r; | ||
1435 | |||
1436 | va_start(args, fmt); | ||
1437 | r = vprintk_emit(facility, level, dict, dictlen, fmt, args); | ||
1438 | va_end(args); | ||
1439 | |||
1440 | return r; | ||
984 | } | 1441 | } |
1442 | EXPORT_SYMBOL(printk_emit); | ||
985 | 1443 | ||
1444 | /** | ||
1445 | * printk - print a kernel message | ||
1446 | * @fmt: format string | ||
1447 | * | ||
1448 | * This is printk(). It can be called from any context. We want it to work. | ||
1449 | * | ||
1450 | * We try to grab the console_lock. If we succeed, it's easy - we log the | ||
1451 | * output and call the console drivers. If we fail to get the semaphore, we | ||
1452 | * place the output into the log buffer and return. The current holder of | ||
1453 | * the console_sem will notice the new output in console_unlock(); and will | ||
1454 | * send it to the consoles before releasing the lock. | ||
1455 | * | ||
1456 | * One effect of this deferred printing is that code which calls printk() and | ||
1457 | * then changes console_loglevel may break. This is because console_loglevel | ||
1458 | * is inspected when the actual printing occurs. | ||
1459 | * | ||
1460 | * See also: | ||
1461 | * printf(3) | ||
1462 | * | ||
1463 | * See the vsnprintf() documentation for format string extensions over C99. | ||
1464 | */ | ||
1465 | asmlinkage int printk(const char *fmt, ...) | ||
1466 | { | ||
1467 | va_list args; | ||
1468 | int r; | ||
1469 | |||
1470 | #ifdef CONFIG_KGDB_KDB | ||
1471 | if (unlikely(kdb_trap_printk)) { | ||
1472 | va_start(args, fmt); | ||
1473 | r = vkdb_printf(fmt, args); | ||
1474 | va_end(args); | ||
1475 | return r; | ||
1476 | } | ||
986 | #endif | 1477 | #endif |
1478 | va_start(args, fmt); | ||
1479 | r = vprintk_emit(0, -1, NULL, 0, fmt, args); | ||
1480 | va_end(args); | ||
1481 | |||
1482 | return r; | ||
1483 | } | ||
1484 | EXPORT_SYMBOL(printk); | ||
1485 | |||
1486 | #else | ||
1487 | |||
1488 | #define LOG_LINE_MAX 0 | ||
1489 | static struct log *log_from_idx(u32 idx) { return NULL; } | ||
1490 | static u32 log_next(u32 idx) { return 0; } | ||
1491 | static void call_console_drivers(int level, const char *text, size_t len) {} | ||
1492 | static size_t msg_print_text(const struct log *msg, bool syslog, | ||
1493 | char *buf, size_t size) { return 0; } | ||
1494 | |||
1495 | #endif /* CONFIG_PRINTK */ | ||
987 | 1496 | ||
988 | static int __add_preferred_console(char *name, int idx, char *options, | 1497 | static int __add_preferred_console(char *name, int idx, char *options, |
989 | char *brl_options) | 1498 | char *brl_options) |
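Editorial note: printk_emit() exposes the facility, level and dict parameters directly. A hypothetical caller might look like the snippet below; only the function signature comes from the hunk above, while the '\0'-separated KEY=value layout of the dict is an assumption made for illustration:

#include <linux/printk.h>

static void example_report(void)
{
	/* assumed dict layout: NUL-separated KEY=value properties */
	static const char dict[] = "SUBSYSTEM=example\0DEVICE=+example:0";

	/* facility 0 (kernel), level 6 (KERN_INFO), structured dict */
	printk_emit(0, 6, dict, sizeof(dict) - 1,
		    "example device initialized\n");
}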
@@ -1217,7 +1726,7 @@ int is_console_locked(void) | |||
1217 | } | 1726 | } |
1218 | 1727 | ||
1219 | /* | 1728 | /* |
1220 | * Delayed printk facility, for scheduler-internal messages: | 1729 | * Delayed printk version, for scheduler-internal messages: |
1221 | */ | 1730 | */ |
1222 | #define PRINTK_BUF_SIZE 512 | 1731 | #define PRINTK_BUF_SIZE 512 |
1223 | 1732 | ||
@@ -1253,6 +1762,10 @@ void wake_up_klogd(void) | |||
1253 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); | 1762 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); |
1254 | } | 1763 | } |
1255 | 1764 | ||
1765 | /* the next printk record to write to the console */ | ||
1766 | static u64 console_seq; | ||
1767 | static u32 console_idx; | ||
1768 | |||
1256 | /** | 1769 | /** |
1257 | * console_unlock - unlock the console system | 1770 | * console_unlock - unlock the console system |
1258 | * | 1771 | * |
@@ -1263,15 +1776,16 @@ void wake_up_klogd(void) | |||
1263 | * by printk(). If this is the case, console_unlock(); emits | 1776 | * by printk(). If this is the case, console_unlock(); emits |
1264 | * the output prior to releasing the lock. | 1777 | * the output prior to releasing the lock. |
1265 | * | 1778 | * |
1266 | * If there is output waiting for klogd, we wake it up. | 1779 | * If there is output waiting, we wake /dev/kmsg and syslog() users. |
1267 | * | 1780 | * |
1268 | * console_unlock(); may be called from any context. | 1781 | * console_unlock(); may be called from any context. |
1269 | */ | 1782 | */ |
1270 | void console_unlock(void) | 1783 | void console_unlock(void) |
1271 | { | 1784 | { |
1785 | static u64 seen_seq; | ||
1272 | unsigned long flags; | 1786 | unsigned long flags; |
1273 | unsigned _con_start, _log_end; | 1787 | bool wake_klogd = false; |
1274 | unsigned wake_klogd = 0, retry = 0; | 1788 | bool retry; |
1275 | 1789 | ||
1276 | if (console_suspended) { | 1790 | if (console_suspended) { |
1277 | up(&console_sem); | 1791 | up(&console_sem); |
@@ -1281,17 +1795,38 @@ void console_unlock(void) | |||
1281 | console_may_schedule = 0; | 1795 | console_may_schedule = 0; |
1282 | 1796 | ||
1283 | again: | 1797 | again: |
1284 | for ( ; ; ) { | 1798 | for (;;) { |
1799 | struct log *msg; | ||
1800 | static char text[LOG_LINE_MAX]; | ||
1801 | size_t len; | ||
1802 | int level; | ||
1803 | |||
1285 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 1804 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1286 | wake_klogd |= log_start - log_end; | 1805 | if (seen_seq != log_next_seq) { |
1287 | if (con_start == log_end) | 1806 | wake_klogd = true; |
1288 | break; /* Nothing to print */ | 1807 | seen_seq = log_next_seq; |
1289 | _con_start = con_start; | 1808 | } |
1290 | _log_end = log_end; | 1809 | |
1291 | con_start = log_end; /* Flush */ | 1810 | if (console_seq < log_first_seq) { |
1811 | /* messages are gone, move to first one */ | ||
1812 | console_seq = log_first_seq; | ||
1813 | console_idx = log_first_idx; | ||
1814 | } | ||
1815 | |||
1816 | if (console_seq == log_next_seq) | ||
1817 | break; | ||
1818 | |||
1819 | msg = log_from_idx(console_idx); | ||
1820 | level = msg->level & 7; | ||
1821 | |||
1822 | len = msg_print_text(msg, false, text, sizeof(text)); | ||
1823 | |||
1824 | console_idx = log_next(console_idx); | ||
1825 | console_seq++; | ||
1292 | raw_spin_unlock(&logbuf_lock); | 1826 | raw_spin_unlock(&logbuf_lock); |
1827 | |||
1293 | stop_critical_timings(); /* don't trace print latency */ | 1828 | stop_critical_timings(); /* don't trace print latency */ |
1294 | call_console_drivers(_con_start, _log_end); | 1829 | call_console_drivers(level, text, len); |
1295 | start_critical_timings(); | 1830 | start_critical_timings(); |
1296 | local_irq_restore(flags); | 1831 | local_irq_restore(flags); |
1297 | } | 1832 | } |
@@ -1312,8 +1847,7 @@ again: | |||
1312 | * flush, no worries. | 1847 | * flush, no worries. |
1313 | */ | 1848 | */ |
1314 | raw_spin_lock(&logbuf_lock); | 1849 | raw_spin_lock(&logbuf_lock); |
1315 | if (con_start != log_end) | 1850 | retry = console_seq != log_next_seq; |
1316 | retry = 1; | ||
1317 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 1851 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
1318 | 1852 | ||
1319 | if (retry && console_trylock()) | 1853 | if (retry && console_trylock()) |
@@ -1549,7 +2083,8 @@ void register_console(struct console *newcon) | |||
1549 | * for us. | 2083 | * for us. |
1550 | */ | 2084 | */ |
1551 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 2085 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1552 | con_start = log_start; | 2086 | console_seq = syslog_seq; |
2087 | console_idx = syslog_idx; | ||
1553 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 2088 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
1554 | /* | 2089 | /* |
1555 | * We're about to replay the log buffer. Only do this to the | 2090 | * We're about to replay the log buffer. Only do this to the |
@@ -1758,6 +2293,9 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper) | |||
1758 | } | 2293 | } |
1759 | EXPORT_SYMBOL_GPL(kmsg_dump_unregister); | 2294 | EXPORT_SYMBOL_GPL(kmsg_dump_unregister); |
1760 | 2295 | ||
2296 | static bool always_kmsg_dump; | ||
2297 | module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); | ||
2298 | |||
1761 | /** | 2299 | /** |
1762 | * kmsg_dump - dump kernel log to kernel message dumpers. | 2300 | * kmsg_dump - dump kernel log to kernel message dumpers. |
1763 | * @reason: the reason (oops, panic etc) for dumping | 2301 | * @reason: the reason (oops, panic etc) for dumping |
@@ -1767,8 +2305,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister); | |||
1767 | */ | 2305 | */ |
1768 | void kmsg_dump(enum kmsg_dump_reason reason) | 2306 | void kmsg_dump(enum kmsg_dump_reason reason) |
1769 | { | 2307 | { |
1770 | unsigned long end; | 2308 | u64 idx; |
1771 | unsigned chars; | ||
1772 | struct kmsg_dumper *dumper; | 2309 | struct kmsg_dumper *dumper; |
1773 | const char *s1, *s2; | 2310 | const char *s1, *s2; |
1774 | unsigned long l1, l2; | 2311 | unsigned long l1, l2; |
@@ -1780,24 +2317,27 @@ void kmsg_dump(enum kmsg_dump_reason reason) | |||
1780 | /* Theoretically, the log could move on after we do this, but | 2317 | /* Theoretically, the log could move on after we do this, but |
1781 | there's not a lot we can do about that. The new messages | 2318 | there's not a lot we can do about that. The new messages |
1782 | will overwrite the start of what we dump. */ | 2319 | will overwrite the start of what we dump. */ |
2320 | |||
1783 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 2321 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1784 | end = log_end & LOG_BUF_MASK; | 2322 | if (syslog_seq < log_first_seq) |
1785 | chars = logged_chars; | 2323 | idx = syslog_idx; |
1786 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 2324 | else |
2325 | idx = log_first_idx; | ||
1787 | 2326 | ||
1788 | if (chars > end) { | 2327 | if (idx > log_next_idx) { |
1789 | s1 = log_buf + log_buf_len - chars + end; | 2328 | s1 = log_buf; |
1790 | l1 = chars - end; | 2329 | l1 = log_next_idx; |
1791 | 2330 | ||
1792 | s2 = log_buf; | 2331 | s2 = log_buf + idx; |
1793 | l2 = end; | 2332 | l2 = log_buf_len - idx; |
1794 | } else { | 2333 | } else { |
1795 | s1 = ""; | 2334 | s1 = ""; |
1796 | l1 = 0; | 2335 | l1 = 0; |
1797 | 2336 | ||
1798 | s2 = log_buf + end - chars; | 2337 | s2 = log_buf + idx; |
1799 | l2 = chars; | 2338 | l2 = log_next_idx - idx; |
1800 | } | 2339 | } |
2340 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
1801 | 2341 | ||
1802 | rcu_read_lock(); | 2342 | rcu_read_lock(); |
1803 | list_for_each_entry_rcu(dumper, &dump_list, list) | 2343 | list_for_each_entry_rcu(dumper, &dump_list, list) |
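Editorial note: kmsg_dump() above hands each dumper at most two linear chunks, depending on whether the wanted byte range has wrapped past the end of log_buf. A small user-space sketch of that split (illustration only), mirroring the s1/l1 and s2/l2 assignments in the hunk:

#include <stdio.h>
#include <string.h>

static void dump_range(const char *buf, size_t buf_len,
		       size_t idx, size_t next_idx)
{
	const char *s1, *s2;
	size_t l1, l2;

	if (idx > next_idx) {		/* writer wrapped past the end */
		s1 = buf;		/* start of buffer .. next_idx */
		l1 = next_idx;
		s2 = buf + idx;		/* idx .. end of buffer        */
		l2 = buf_len - idx;
	} else {			/* linear: one chunk is enough */
		s1 = "";
		l1 = 0;
		s2 = buf + idx;
		l2 = next_idx - idx;
	}
	/* a real dumper would consume s1/l1 and then s2/l2 */
	printf("chunk1=%zu bytes, chunk2=%zu bytes\n", l1, l2);
	(void)s1;
	(void)s2;
}

int main(void)
{
	char buf[64];

	memset(buf, 'x', sizeof(buf));
	dump_range(buf, sizeof(buf), 48, 16);	/* wrapped case */
	dump_range(buf, sizeof(buf), 8, 40);	/* linear case  */
	return 0;
}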
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ee8d49b9c30..a232bb59d93 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -198,15 +198,14 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) | |||
198 | return 0; | 198 | return 0; |
199 | rcu_read_lock(); | 199 | rcu_read_lock(); |
200 | tcred = __task_cred(task); | 200 | tcred = __task_cred(task); |
201 | if (cred->user->user_ns == tcred->user->user_ns && | 201 | if (uid_eq(cred->uid, tcred->euid) && |
202 | (cred->uid == tcred->euid && | 202 | uid_eq(cred->uid, tcred->suid) && |
203 | cred->uid == tcred->suid && | 203 | uid_eq(cred->uid, tcred->uid) && |
204 | cred->uid == tcred->uid && | 204 | gid_eq(cred->gid, tcred->egid) && |
205 | cred->gid == tcred->egid && | 205 | gid_eq(cred->gid, tcred->sgid) && |
206 | cred->gid == tcred->sgid && | 206 | gid_eq(cred->gid, tcred->gid)) |
207 | cred->gid == tcred->gid)) | ||
208 | goto ok; | 207 | goto ok; |
209 | if (ptrace_has_cap(tcred->user->user_ns, mode)) | 208 | if (ptrace_has_cap(tcred->user_ns, mode)) |
210 | goto ok; | 209 | goto ok; |
211 | rcu_read_unlock(); | 210 | rcu_read_unlock(); |
212 | return -EPERM; | 211 | return -EPERM; |
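Editorial note: the ptrace hunk replaces raw '==' comparisons with the typed helpers because cred uids and gids are now kuid_t/kgid_t values. A hedged illustration (the helper name is hypothetical, not kernel code):

#include <linux/cred.h>
#include <linux/uidgid.h>

static bool same_effective_identity(const struct cred *a,
				    const struct cred *b)
{
	/* kuid_t/kgid_t are opaque; compare with uid_eq()/gid_eq() */
	return uid_eq(a->uid, b->euid) && gid_eq(a->gid, b->egid);
}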
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a86f1741cc2..95cba41ce1e 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -51,6 +51,34 @@ | |||
51 | 51 | ||
52 | #include "rcu.h" | 52 | #include "rcu.h" |
53 | 53 | ||
54 | #ifdef CONFIG_PREEMPT_RCU | ||
55 | |||
56 | /* | ||
57 | * Check for a task exiting while in a preemptible-RCU read-side | ||
58 | * critical section, clean up if so. No need to issue warnings, | ||
59 | * as debug_check_no_locks_held() already does this if lockdep | ||
60 | * is enabled. | ||
61 | */ | ||
62 | void exit_rcu(void) | ||
63 | { | ||
64 | struct task_struct *t = current; | ||
65 | |||
66 | if (likely(list_empty(¤t->rcu_node_entry))) | ||
67 | return; | ||
68 | t->rcu_read_lock_nesting = 1; | ||
69 | barrier(); | ||
70 | t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; | ||
71 | __rcu_read_unlock(); | ||
72 | } | ||
73 | |||
74 | #else /* #ifdef CONFIG_PREEMPT_RCU */ | ||
75 | |||
76 | void exit_rcu(void) | ||
77 | { | ||
78 | } | ||
79 | |||
80 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | ||
81 | |||
54 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 82 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
55 | static struct lock_class_key rcu_lock_key; | 83 | static struct lock_class_key rcu_lock_key; |
56 | struct lockdep_map rcu_lock_map = | 84 | struct lockdep_map rcu_lock_map = |
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 22ecea0dfb6..fc31a2d6510 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -851,22 +851,6 @@ int rcu_preempt_needs_cpu(void) | |||
851 | return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; | 851 | return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; |
852 | } | 852 | } |
853 | 853 | ||
854 | /* | ||
855 | * Check for a task exiting while in a preemptible -RCU read-side | ||
856 | * critical section, clean up if so. No need to issue warnings, | ||
857 | * as debug_check_no_locks_held() already does this if lockdep | ||
858 | * is enabled. | ||
859 | */ | ||
860 | void exit_rcu(void) | ||
861 | { | ||
862 | struct task_struct *t = current; | ||
863 | |||
864 | if (t->rcu_read_lock_nesting == 0) | ||
865 | return; | ||
866 | t->rcu_read_lock_nesting = 1; | ||
867 | __rcu_read_unlock(); | ||
868 | } | ||
869 | |||
870 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ | 854 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ |
871 | 855 | ||
872 | #ifdef CONFIG_RCU_TRACE | 856 | #ifdef CONFIG_RCU_TRACE |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index a89b381a8c6..e66b34ab755 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -64,6 +64,7 @@ static int irqreader = 1; /* RCU readers from irq (timers). */ | |||
64 | static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ | 64 | static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ |
65 | static int fqs_holdoff; /* Hold time within burst (us). */ | 65 | static int fqs_holdoff; /* Hold time within burst (us). */ |
66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ | 66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ |
67 | static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */ | ||
67 | static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ | 68 | static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ |
68 | static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ | 69 | static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ |
69 | static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ | 70 | static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ |
@@ -96,6 +97,8 @@ module_param(fqs_holdoff, int, 0444); | |||
96 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); | 97 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); |
97 | module_param(fqs_stutter, int, 0444); | 98 | module_param(fqs_stutter, int, 0444); |
98 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); | 99 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); |
100 | module_param(n_barrier_cbs, int, 0444); | ||
101 | MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); | ||
99 | module_param(onoff_interval, int, 0444); | 102 | module_param(onoff_interval, int, 0444); |
100 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); | 103 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); |
101 | module_param(onoff_holdoff, int, 0444); | 104 | module_param(onoff_holdoff, int, 0444); |
@@ -139,6 +142,8 @@ static struct task_struct *shutdown_task; | |||
139 | static struct task_struct *onoff_task; | 142 | static struct task_struct *onoff_task; |
140 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 143 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
141 | static struct task_struct *stall_task; | 144 | static struct task_struct *stall_task; |
145 | static struct task_struct **barrier_cbs_tasks; | ||
146 | static struct task_struct *barrier_task; | ||
142 | 147 | ||
143 | #define RCU_TORTURE_PIPE_LEN 10 | 148 | #define RCU_TORTURE_PIPE_LEN 10 |
144 | 149 | ||
@@ -164,6 +169,7 @@ static atomic_t n_rcu_torture_alloc_fail; | |||
164 | static atomic_t n_rcu_torture_free; | 169 | static atomic_t n_rcu_torture_free; |
165 | static atomic_t n_rcu_torture_mberror; | 170 | static atomic_t n_rcu_torture_mberror; |
166 | static atomic_t n_rcu_torture_error; | 171 | static atomic_t n_rcu_torture_error; |
172 | static long n_rcu_torture_barrier_error; | ||
167 | static long n_rcu_torture_boost_ktrerror; | 173 | static long n_rcu_torture_boost_ktrerror; |
168 | static long n_rcu_torture_boost_rterror; | 174 | static long n_rcu_torture_boost_rterror; |
169 | static long n_rcu_torture_boost_failure; | 175 | static long n_rcu_torture_boost_failure; |
@@ -173,6 +179,8 @@ static long n_offline_attempts; | |||
173 | static long n_offline_successes; | 179 | static long n_offline_successes; |
174 | static long n_online_attempts; | 180 | static long n_online_attempts; |
175 | static long n_online_successes; | 181 | static long n_online_successes; |
182 | static long n_barrier_attempts; | ||
183 | static long n_barrier_successes; | ||
176 | static struct list_head rcu_torture_removed; | 184 | static struct list_head rcu_torture_removed; |
177 | static cpumask_var_t shuffle_tmp_mask; | 185 | static cpumask_var_t shuffle_tmp_mask; |
178 | 186 | ||
@@ -197,6 +205,10 @@ static unsigned long shutdown_time; /* jiffies to system shutdown. */ | |||
197 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ | 205 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ |
198 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | 206 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ |
199 | /* and boost task create/destroy. */ | 207 | /* and boost task create/destroy. */ |
208 | static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ | ||
209 | static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ | ||
210 | static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ | ||
211 | static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); | ||
200 | 212 | ||
201 | /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ | 213 | /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ |
202 | 214 | ||
@@ -327,6 +339,7 @@ struct rcu_torture_ops { | |||
327 | int (*completed)(void); | 339 | int (*completed)(void); |
328 | void (*deferred_free)(struct rcu_torture *p); | 340 | void (*deferred_free)(struct rcu_torture *p); |
329 | void (*sync)(void); | 341 | void (*sync)(void); |
342 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | ||
330 | void (*cb_barrier)(void); | 343 | void (*cb_barrier)(void); |
331 | void (*fqs)(void); | 344 | void (*fqs)(void); |
332 | int (*stats)(char *page); | 345 | int (*stats)(char *page); |
@@ -417,6 +430,7 @@ static struct rcu_torture_ops rcu_ops = { | |||
417 | .completed = rcu_torture_completed, | 430 | .completed = rcu_torture_completed, |
418 | .deferred_free = rcu_torture_deferred_free, | 431 | .deferred_free = rcu_torture_deferred_free, |
419 | .sync = synchronize_rcu, | 432 | .sync = synchronize_rcu, |
433 | .call = call_rcu, | ||
420 | .cb_barrier = rcu_barrier, | 434 | .cb_barrier = rcu_barrier, |
421 | .fqs = rcu_force_quiescent_state, | 435 | .fqs = rcu_force_quiescent_state, |
422 | .stats = NULL, | 436 | .stats = NULL, |
@@ -460,6 +474,7 @@ static struct rcu_torture_ops rcu_sync_ops = { | |||
460 | .completed = rcu_torture_completed, | 474 | .completed = rcu_torture_completed, |
461 | .deferred_free = rcu_sync_torture_deferred_free, | 475 | .deferred_free = rcu_sync_torture_deferred_free, |
462 | .sync = synchronize_rcu, | 476 | .sync = synchronize_rcu, |
477 | .call = NULL, | ||
463 | .cb_barrier = NULL, | 478 | .cb_barrier = NULL, |
464 | .fqs = rcu_force_quiescent_state, | 479 | .fqs = rcu_force_quiescent_state, |
465 | .stats = NULL, | 480 | .stats = NULL, |
@@ -477,6 +492,7 @@ static struct rcu_torture_ops rcu_expedited_ops = { | |||
477 | .completed = rcu_no_completed, | 492 | .completed = rcu_no_completed, |
478 | .deferred_free = rcu_sync_torture_deferred_free, | 493 | .deferred_free = rcu_sync_torture_deferred_free, |
479 | .sync = synchronize_rcu_expedited, | 494 | .sync = synchronize_rcu_expedited, |
495 | .call = NULL, | ||
480 | .cb_barrier = NULL, | 496 | .cb_barrier = NULL, |
481 | .fqs = rcu_force_quiescent_state, | 497 | .fqs = rcu_force_quiescent_state, |
482 | .stats = NULL, | 498 | .stats = NULL, |
@@ -519,6 +535,7 @@ static struct rcu_torture_ops rcu_bh_ops = { | |||
519 | .completed = rcu_bh_torture_completed, | 535 | .completed = rcu_bh_torture_completed, |
520 | .deferred_free = rcu_bh_torture_deferred_free, | 536 | .deferred_free = rcu_bh_torture_deferred_free, |
521 | .sync = synchronize_rcu_bh, | 537 | .sync = synchronize_rcu_bh, |
538 | .call = call_rcu_bh, | ||
522 | .cb_barrier = rcu_barrier_bh, | 539 | .cb_barrier = rcu_barrier_bh, |
523 | .fqs = rcu_bh_force_quiescent_state, | 540 | .fqs = rcu_bh_force_quiescent_state, |
524 | .stats = NULL, | 541 | .stats = NULL, |
@@ -535,6 +552,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { | |||
535 | .completed = rcu_bh_torture_completed, | 552 | .completed = rcu_bh_torture_completed, |
536 | .deferred_free = rcu_sync_torture_deferred_free, | 553 | .deferred_free = rcu_sync_torture_deferred_free, |
537 | .sync = synchronize_rcu_bh, | 554 | .sync = synchronize_rcu_bh, |
555 | .call = NULL, | ||
538 | .cb_barrier = NULL, | 556 | .cb_barrier = NULL, |
539 | .fqs = rcu_bh_force_quiescent_state, | 557 | .fqs = rcu_bh_force_quiescent_state, |
540 | .stats = NULL, | 558 | .stats = NULL, |
@@ -551,6 +569,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = { | |||
551 | .completed = rcu_bh_torture_completed, | 569 | .completed = rcu_bh_torture_completed, |
552 | .deferred_free = rcu_sync_torture_deferred_free, | 570 | .deferred_free = rcu_sync_torture_deferred_free, |
553 | .sync = synchronize_rcu_bh_expedited, | 571 | .sync = synchronize_rcu_bh_expedited, |
572 | .call = NULL, | ||
554 | .cb_barrier = NULL, | 573 | .cb_barrier = NULL, |
555 | .fqs = rcu_bh_force_quiescent_state, | 574 | .fqs = rcu_bh_force_quiescent_state, |
556 | .stats = NULL, | 575 | .stats = NULL, |
@@ -606,6 +625,11 @@ static int srcu_torture_completed(void) | |||
606 | return srcu_batches_completed(&srcu_ctl); | 625 | return srcu_batches_completed(&srcu_ctl); |
607 | } | 626 | } |
608 | 627 | ||
628 | static void srcu_torture_deferred_free(struct rcu_torture *rp) | ||
629 | { | ||
630 | call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb); | ||
631 | } | ||
632 | |||
609 | static void srcu_torture_synchronize(void) | 633 | static void srcu_torture_synchronize(void) |
610 | { | 634 | { |
611 | synchronize_srcu(&srcu_ctl); | 635 | synchronize_srcu(&srcu_ctl); |
@@ -620,7 +644,7 @@ static int srcu_torture_stats(char *page) | |||
620 | cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", | 644 | cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", |
621 | torture_type, TORTURE_FLAG, idx); | 645 | torture_type, TORTURE_FLAG, idx); |
622 | for_each_possible_cpu(cpu) { | 646 | for_each_possible_cpu(cpu) { |
623 | cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, | 647 | cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu, |
624 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], | 648 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], |
625 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); | 649 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); |
626 | } | 650 | } |
@@ -635,13 +659,29 @@ static struct rcu_torture_ops srcu_ops = { | |||
635 | .read_delay = srcu_read_delay, | 659 | .read_delay = srcu_read_delay, |
636 | .readunlock = srcu_torture_read_unlock, | 660 | .readunlock = srcu_torture_read_unlock, |
637 | .completed = srcu_torture_completed, | 661 | .completed = srcu_torture_completed, |
638 | .deferred_free = rcu_sync_torture_deferred_free, | 662 | .deferred_free = srcu_torture_deferred_free, |
639 | .sync = srcu_torture_synchronize, | 663 | .sync = srcu_torture_synchronize, |
664 | .call = NULL, | ||
640 | .cb_barrier = NULL, | 665 | .cb_barrier = NULL, |
641 | .stats = srcu_torture_stats, | 666 | .stats = srcu_torture_stats, |
642 | .name = "srcu" | 667 | .name = "srcu" |
643 | }; | 668 | }; |
644 | 669 | ||
670 | static struct rcu_torture_ops srcu_sync_ops = { | ||
671 | .init = srcu_torture_init, | ||
672 | .cleanup = srcu_torture_cleanup, | ||
673 | .readlock = srcu_torture_read_lock, | ||
674 | .read_delay = srcu_read_delay, | ||
675 | .readunlock = srcu_torture_read_unlock, | ||
676 | .completed = srcu_torture_completed, | ||
677 | .deferred_free = rcu_sync_torture_deferred_free, | ||
678 | .sync = srcu_torture_synchronize, | ||
679 | .call = NULL, | ||
680 | .cb_barrier = NULL, | ||
681 | .stats = srcu_torture_stats, | ||
682 | .name = "srcu_sync" | ||
683 | }; | ||
684 | |||
645 | static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) | 685 | static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) |
646 | { | 686 | { |
647 | return srcu_read_lock_raw(&srcu_ctl); | 687 | return srcu_read_lock_raw(&srcu_ctl); |
@@ -659,13 +699,29 @@ static struct rcu_torture_ops srcu_raw_ops = { | |||
659 | .read_delay = srcu_read_delay, | 699 | .read_delay = srcu_read_delay, |
660 | .readunlock = srcu_torture_read_unlock_raw, | 700 | .readunlock = srcu_torture_read_unlock_raw, |
661 | .completed = srcu_torture_completed, | 701 | .completed = srcu_torture_completed, |
662 | .deferred_free = rcu_sync_torture_deferred_free, | 702 | .deferred_free = srcu_torture_deferred_free, |
663 | .sync = srcu_torture_synchronize, | 703 | .sync = srcu_torture_synchronize, |
704 | .call = NULL, | ||
664 | .cb_barrier = NULL, | 705 | .cb_barrier = NULL, |
665 | .stats = srcu_torture_stats, | 706 | .stats = srcu_torture_stats, |
666 | .name = "srcu_raw" | 707 | .name = "srcu_raw" |
667 | }; | 708 | }; |
668 | 709 | ||
710 | static struct rcu_torture_ops srcu_raw_sync_ops = { | ||
711 | .init = srcu_torture_init, | ||
712 | .cleanup = srcu_torture_cleanup, | ||
713 | .readlock = srcu_torture_read_lock_raw, | ||
714 | .read_delay = srcu_read_delay, | ||
715 | .readunlock = srcu_torture_read_unlock_raw, | ||
716 | .completed = srcu_torture_completed, | ||
717 | .deferred_free = rcu_sync_torture_deferred_free, | ||
718 | .sync = srcu_torture_synchronize, | ||
719 | .call = NULL, | ||
720 | .cb_barrier = NULL, | ||
721 | .stats = srcu_torture_stats, | ||
722 | .name = "srcu_raw_sync" | ||
723 | }; | ||
724 | |||
669 | static void srcu_torture_synchronize_expedited(void) | 725 | static void srcu_torture_synchronize_expedited(void) |
670 | { | 726 | { |
671 | synchronize_srcu_expedited(&srcu_ctl); | 727 | synchronize_srcu_expedited(&srcu_ctl); |
@@ -680,6 +736,7 @@ static struct rcu_torture_ops srcu_expedited_ops = { | |||
680 | .completed = srcu_torture_completed, | 736 | .completed = srcu_torture_completed, |
681 | .deferred_free = rcu_sync_torture_deferred_free, | 737 | .deferred_free = rcu_sync_torture_deferred_free, |
682 | .sync = srcu_torture_synchronize_expedited, | 738 | .sync = srcu_torture_synchronize_expedited, |
739 | .call = NULL, | ||
683 | .cb_barrier = NULL, | 740 | .cb_barrier = NULL, |
684 | .stats = srcu_torture_stats, | 741 | .stats = srcu_torture_stats, |
685 | .name = "srcu_expedited" | 742 | .name = "srcu_expedited" |
@@ -1129,7 +1186,8 @@ rcu_torture_printk(char *page) | |||
1129 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " | 1186 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " |
1130 | "rtmbe: %d rtbke: %ld rtbre: %ld " | 1187 | "rtmbe: %d rtbke: %ld rtbre: %ld " |
1131 | "rtbf: %ld rtb: %ld nt: %ld " | 1188 | "rtbf: %ld rtb: %ld nt: %ld " |
1132 | "onoff: %ld/%ld:%ld/%ld", | 1189 | "onoff: %ld/%ld:%ld/%ld " |
1190 | "barrier: %ld/%ld:%ld", | ||
1133 | rcu_torture_current, | 1191 | rcu_torture_current, |
1134 | rcu_torture_current_version, | 1192 | rcu_torture_current_version, |
1135 | list_empty(&rcu_torture_freelist), | 1193 | list_empty(&rcu_torture_freelist), |
@@ -1145,14 +1203,17 @@ rcu_torture_printk(char *page) | |||
1145 | n_online_successes, | 1203 | n_online_successes, |
1146 | n_online_attempts, | 1204 | n_online_attempts, |
1147 | n_offline_successes, | 1205 | n_offline_successes, |
1148 | n_offline_attempts); | 1206 | n_offline_attempts, |
1207 | n_barrier_successes, | ||
1208 | n_barrier_attempts, | ||
1209 | n_rcu_torture_barrier_error); | ||
1210 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | ||
1149 | if (atomic_read(&n_rcu_torture_mberror) != 0 || | 1211 | if (atomic_read(&n_rcu_torture_mberror) != 0 || |
1212 | n_rcu_torture_barrier_error != 0 || | ||
1150 | n_rcu_torture_boost_ktrerror != 0 || | 1213 | n_rcu_torture_boost_ktrerror != 0 || |
1151 | n_rcu_torture_boost_rterror != 0 || | 1214 | n_rcu_torture_boost_rterror != 0 || |
1152 | n_rcu_torture_boost_failure != 0) | 1215 | n_rcu_torture_boost_failure != 0 || |
1153 | cnt += sprintf(&page[cnt], " !!!"); | 1216 | i > 1) { |
1154 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | ||
1155 | if (i > 1) { | ||
1156 | cnt += sprintf(&page[cnt], "!!! "); | 1217 | cnt += sprintf(&page[cnt], "!!! "); |
1157 | atomic_inc(&n_rcu_torture_error); | 1218 | atomic_inc(&n_rcu_torture_error); |
1158 | WARN_ON_ONCE(1); | 1219 | WARN_ON_ONCE(1); |
@@ -1337,6 +1398,7 @@ static void rcutorture_booster_cleanup(int cpu) | |||
1337 | 1398 | ||
1338 | /* This must be outside of the mutex, otherwise deadlock! */ | 1399 | /* This must be outside of the mutex, otherwise deadlock! */ |
1339 | kthread_stop(t); | 1400 | kthread_stop(t); |
1401 | boost_tasks[cpu] = NULL; | ||
1340 | } | 1402 | } |
1341 | 1403 | ||
1342 | static int rcutorture_booster_init(int cpu) | 1404 | static int rcutorture_booster_init(int cpu) |
@@ -1484,13 +1546,15 @@ static void rcu_torture_onoff_cleanup(void) | |||
1484 | return; | 1546 | return; |
1485 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); | 1547 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); |
1486 | kthread_stop(onoff_task); | 1548 | kthread_stop(onoff_task); |
1549 | onoff_task = NULL; | ||
1487 | } | 1550 | } |
1488 | 1551 | ||
1489 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 1552 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
1490 | 1553 | ||
1491 | static void | 1554 | static int |
1492 | rcu_torture_onoff_init(void) | 1555 | rcu_torture_onoff_init(void) |
1493 | { | 1556 | { |
1557 | return 0; | ||
1494 | } | 1558 | } |
1495 | 1559 | ||
1496 | static void rcu_torture_onoff_cleanup(void) | 1560 | static void rcu_torture_onoff_cleanup(void) |
@@ -1554,6 +1618,152 @@ static void rcu_torture_stall_cleanup(void) | |||
1554 | return; | 1618 | return; |
1555 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); | 1619 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); |
1556 | kthread_stop(stall_task); | 1620 | kthread_stop(stall_task); |
1621 | stall_task = NULL; | ||
1622 | } | ||
1623 | |||
1624 | /* Callback function for RCU barrier testing. */ | ||
1625 | void rcu_torture_barrier_cbf(struct rcu_head *rcu) | ||
1626 | { | ||
1627 | atomic_inc(&barrier_cbs_invoked); | ||
1628 | } | ||
1629 | |||
1630 | /* kthread function to register callbacks used to test RCU barriers. */ | ||
1631 | static int rcu_torture_barrier_cbs(void *arg) | ||
1632 | { | ||
1633 | long myid = (long)arg; | ||
1634 | struct rcu_head rcu; | ||
1635 | |||
1636 | init_rcu_head_on_stack(&rcu); | ||
1637 | VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); | ||
1638 | set_user_nice(current, 19); | ||
1639 | do { | ||
1640 | wait_event(barrier_cbs_wq[myid], | ||
1641 | atomic_read(&barrier_cbs_count) == n_barrier_cbs || | ||
1642 | kthread_should_stop() || | ||
1643 | fullstop != FULLSTOP_DONTSTOP); | ||
1644 | if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) | ||
1645 | break; | ||
1646 | cur_ops->call(&rcu, rcu_torture_barrier_cbf); | ||
1647 | if (atomic_dec_and_test(&barrier_cbs_count)) | ||
1648 | wake_up(&barrier_wq); | ||
1649 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
1650 | VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping"); | ||
1651 | rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); | ||
1652 | while (!kthread_should_stop()) | ||
1653 | schedule_timeout_interruptible(1); | ||
1654 | cur_ops->cb_barrier(); | ||
1655 | destroy_rcu_head_on_stack(&rcu); | ||
1656 | return 0; | ||
1657 | } | ||
1658 | |||
1659 | /* kthread function to drive and coordinate RCU barrier testing. */ | ||
1660 | static int rcu_torture_barrier(void *arg) | ||
1661 | { | ||
1662 | int i; | ||
1663 | |||
1664 | VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); | ||
1665 | do { | ||
1666 | atomic_set(&barrier_cbs_invoked, 0); | ||
1667 | atomic_set(&barrier_cbs_count, n_barrier_cbs); | ||
1668 | /* wake_up() path contains the required barriers. */ | ||
1669 | for (i = 0; i < n_barrier_cbs; i++) | ||
1670 | wake_up(&barrier_cbs_wq[i]); | ||
1671 | wait_event(barrier_wq, | ||
1672 | atomic_read(&barrier_cbs_count) == 0 || | ||
1673 | kthread_should_stop() || | ||
1674 | fullstop != FULLSTOP_DONTSTOP); | ||
1675 | if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) | ||
1676 | break; | ||
1677 | n_barrier_attempts++; | ||
1678 | cur_ops->cb_barrier(); | ||
1679 | if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { | ||
1680 | n_rcu_torture_barrier_error++; | ||
1681 | WARN_ON_ONCE(1); | ||
1682 | } | ||
1683 | n_barrier_successes++; | ||
1684 | schedule_timeout_interruptible(HZ / 10); | ||
1685 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
1686 | VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); | ||
1687 | rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); | ||
1688 | while (!kthread_should_stop()) | ||
1689 | schedule_timeout_interruptible(1); | ||
1690 | return 0; | ||
1691 | } | ||
1692 | |||
1693 | /* Initialize RCU barrier testing. */ | ||
1694 | static int rcu_torture_barrier_init(void) | ||
1695 | { | ||
1696 | int i; | ||
1697 | int ret; | ||
1698 | |||
1699 | if (n_barrier_cbs == 0) | ||
1700 | return 0; | ||
1701 | if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { | ||
1702 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1703 | " Call or barrier ops missing for %s,\n", | ||
1704 | torture_type, cur_ops->name); | ||
1705 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1706 | " RCU barrier testing omitted from run.\n", | ||
1707 | torture_type); | ||
1708 | return 0; | ||
1709 | } | ||
1710 | atomic_set(&barrier_cbs_count, 0); | ||
1711 | atomic_set(&barrier_cbs_invoked, 0); | ||
1712 | barrier_cbs_tasks = | ||
1713 | kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), | ||
1714 | GFP_KERNEL); | ||
1715 | barrier_cbs_wq = | ||
1716 | kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), | ||
1717 | GFP_KERNEL); | ||
1718 | if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0) | ||
1719 | return -ENOMEM; | ||
1720 | for (i = 0; i < n_barrier_cbs; i++) { | ||
1721 | init_waitqueue_head(&barrier_cbs_wq[i]); | ||
1722 | barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, | ||
1723 | (void *)(long)i, | ||
1724 | "rcu_torture_barrier_cbs"); | ||
1725 | if (IS_ERR(barrier_cbs_tasks[i])) { | ||
1726 | ret = PTR_ERR(barrier_cbs_tasks[i]); | ||
1727 | VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); | ||
1728 | barrier_cbs_tasks[i] = NULL; | ||
1729 | return ret; | ||
1730 | } | ||
1731 | } | ||
1732 | barrier_task = kthread_run(rcu_torture_barrier, NULL, | ||
1733 | "rcu_torture_barrier"); | ||
1734 | if (IS_ERR(barrier_task)) { | ||
1735 | ret = PTR_ERR(barrier_task); | ||
1736 | VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier"); | ||
1737 | barrier_task = NULL; | ||
1738 | } | ||
1739 | return 0; | ||
1740 | } | ||
1741 | |||
1742 | /* Clean up after RCU barrier testing. */ | ||
1743 | static void rcu_torture_barrier_cleanup(void) | ||
1744 | { | ||
1745 | int i; | ||
1746 | |||
1747 | if (barrier_task != NULL) { | ||
1748 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task"); | ||
1749 | kthread_stop(barrier_task); | ||
1750 | barrier_task = NULL; | ||
1751 | } | ||
1752 | if (barrier_cbs_tasks != NULL) { | ||
1753 | for (i = 0; i < n_barrier_cbs; i++) { | ||
1754 | if (barrier_cbs_tasks[i] != NULL) { | ||
1755 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); | ||
1756 | kthread_stop(barrier_cbs_tasks[i]); | ||
1757 | barrier_cbs_tasks[i] = NULL; | ||
1758 | } | ||
1759 | } | ||
1760 | kfree(barrier_cbs_tasks); | ||
1761 | barrier_cbs_tasks = NULL; | ||
1762 | } | ||
1763 | if (barrier_cbs_wq != NULL) { | ||
1764 | kfree(barrier_cbs_wq); | ||
1765 | barrier_cbs_wq = NULL; | ||
1766 | } | ||
1557 | } | 1767 | } |
1558 | 1768 | ||
1559 | static int rcutorture_cpu_notify(struct notifier_block *self, | 1769 | static int rcutorture_cpu_notify(struct notifier_block *self, |
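Editorial note: the barrier test added above coordinates n_barrier_cbs helper kthreads with one driver: each helper posts a single callback, the driver waits until all are posted, invokes the barrier, and then checks that every callback ran. The pthread sketch below models only that counting invariant in user space (no RCU involved; an illustration, not the kernel code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NCBS 4

static void (*queue[NCBS])(void);	/* posted but not yet invoked */
static atomic_int nqueued;		/* callbacks posted so far    */
static atomic_int ninvoked;		/* callbacks actually run     */

static void cb(void)
{
	atomic_fetch_add(&ninvoked, 1);
}

static void *cbs_thread(void *arg)
{
	long i = (long)arg;

	queue[i] = cb;			/* models call_rcu()          */
	atomic_fetch_add(&nqueued, 1);
	return NULL;
}

int main(void)
{
	pthread_t tid[NCBS];
	long i;

	for (i = 0; i < NCBS; i++)
		pthread_create(&tid[i], NULL, cbs_thread, (void *)i);

	/* driver: wait until every helper has posted its callback */
	while (atomic_load(&nqueued) != NCBS)
		;

	/* "barrier": nothing proceeds until all posted callbacks ran */
	for (i = 0; i < NCBS; i++)
		queue[i]();

	for (i = 0; i < NCBS; i++)
		pthread_join(tid[i], NULL);

	/* the check rcu_torture_barrier() performs after cb_barrier() */
	printf("%s\n", atomic_load(&ninvoked) == NCBS ?
	       "ok" : "BARRIER ERROR");
	return 0;
}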
@@ -1598,6 +1808,7 @@ rcu_torture_cleanup(void) | |||
1598 | fullstop = FULLSTOP_RMMOD; | 1808 | fullstop = FULLSTOP_RMMOD; |
1599 | mutex_unlock(&fullstop_mutex); | 1809 | mutex_unlock(&fullstop_mutex); |
1600 | unregister_reboot_notifier(&rcutorture_shutdown_nb); | 1810 | unregister_reboot_notifier(&rcutorture_shutdown_nb); |
1811 | rcu_torture_barrier_cleanup(); | ||
1601 | rcu_torture_stall_cleanup(); | 1812 | rcu_torture_stall_cleanup(); |
1602 | if (stutter_task) { | 1813 | if (stutter_task) { |
1603 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); | 1814 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); |
@@ -1665,6 +1876,7 @@ rcu_torture_cleanup(void) | |||
1665 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); | 1876 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); |
1666 | kthread_stop(shutdown_task); | 1877 | kthread_stop(shutdown_task); |
1667 | } | 1878 | } |
1879 | shutdown_task = NULL; | ||
1668 | rcu_torture_onoff_cleanup(); | 1880 | rcu_torture_onoff_cleanup(); |
1669 | 1881 | ||
1670 | /* Wait for all RCU callbacks to fire. */ | 1882 | /* Wait for all RCU callbacks to fire. */ |
@@ -1676,7 +1888,7 @@ rcu_torture_cleanup(void) | |||
1676 | 1888 | ||
1677 | if (cur_ops->cleanup) | 1889 | if (cur_ops->cleanup) |
1678 | cur_ops->cleanup(); | 1890 | cur_ops->cleanup(); |
1679 | if (atomic_read(&n_rcu_torture_error)) | 1891 | if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) |
1680 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); | 1892 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); |
1681 | else if (n_online_successes != n_online_attempts || | 1893 | else if (n_online_successes != n_online_attempts || |
1682 | n_offline_successes != n_offline_attempts) | 1894 | n_offline_successes != n_offline_attempts) |
@@ -1692,10 +1904,12 @@ rcu_torture_init(void) | |||
1692 | int i; | 1904 | int i; |
1693 | int cpu; | 1905 | int cpu; |
1694 | int firsterr = 0; | 1906 | int firsterr = 0; |
1907 | int retval; | ||
1695 | static struct rcu_torture_ops *torture_ops[] = | 1908 | static struct rcu_torture_ops *torture_ops[] = |
1696 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, | 1909 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, |
1697 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, | 1910 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, |
1698 | &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops, | 1911 | &srcu_ops, &srcu_sync_ops, &srcu_raw_ops, |
1912 | &srcu_raw_sync_ops, &srcu_expedited_ops, | ||
1699 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; | 1913 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; |
1700 | 1914 | ||
1701 | mutex_lock(&fullstop_mutex); | 1915 | mutex_lock(&fullstop_mutex); |
@@ -1749,6 +1963,7 @@ rcu_torture_init(void) | |||
1749 | atomic_set(&n_rcu_torture_free, 0); | 1963 | atomic_set(&n_rcu_torture_free, 0); |
1750 | atomic_set(&n_rcu_torture_mberror, 0); | 1964 | atomic_set(&n_rcu_torture_mberror, 0); |
1751 | atomic_set(&n_rcu_torture_error, 0); | 1965 | atomic_set(&n_rcu_torture_error, 0); |
1966 | n_rcu_torture_barrier_error = 0; | ||
1752 | n_rcu_torture_boost_ktrerror = 0; | 1967 | n_rcu_torture_boost_ktrerror = 0; |
1753 | n_rcu_torture_boost_rterror = 0; | 1968 | n_rcu_torture_boost_rterror = 0; |
1754 | n_rcu_torture_boost_failure = 0; | 1969 | n_rcu_torture_boost_failure = 0; |
@@ -1872,7 +2087,6 @@ rcu_torture_init(void) | |||
1872 | test_boost_duration = 2; | 2087 | test_boost_duration = 2; |
1873 | if ((test_boost == 1 && cur_ops->can_boost) || | 2088 | if ((test_boost == 1 && cur_ops->can_boost) || |
1874 | test_boost == 2) { | 2089 | test_boost == 2) { |
1875 | int retval; | ||
1876 | 2090 | ||
1877 | boost_starttime = jiffies + test_boost_interval * HZ; | 2091 | boost_starttime = jiffies + test_boost_interval * HZ; |
1878 | register_cpu_notifier(&rcutorture_cpu_nb); | 2092 | register_cpu_notifier(&rcutorture_cpu_nb); |
@@ -1897,9 +2111,22 @@ rcu_torture_init(void) | |||
1897 | goto unwind; | 2111 | goto unwind; |
1898 | } | 2112 | } |
1899 | } | 2113 | } |
1900 | rcu_torture_onoff_init(); | 2114 | i = rcu_torture_onoff_init(); |
2115 | if (i != 0) { | ||
2116 | firsterr = i; | ||
2117 | goto unwind; | ||
2118 | } | ||
1901 | register_reboot_notifier(&rcutorture_shutdown_nb); | 2119 | register_reboot_notifier(&rcutorture_shutdown_nb); |
1902 | rcu_torture_stall_init(); | 2120 | i = rcu_torture_stall_init(); |
2121 | if (i != 0) { | ||
2122 | firsterr = i; | ||
2123 | goto unwind; | ||
2124 | } | ||
2125 | retval = rcu_torture_barrier_init(); | ||
2126 | if (retval != 0) { | ||
2127 | firsterr = retval; | ||
2128 | goto unwind; | ||
2129 | } | ||
1903 | rcutorture_record_test_transition(); | 2130 | rcutorture_record_test_transition(); |
1904 | mutex_unlock(&fullstop_mutex); | 2131 | mutex_unlock(&fullstop_mutex); |
1905 | return 0; | 2132 | return 0; |
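The new rcu_torture_barrier_init()/rcu_torture_barrier_cleanup() pair above follows the usual rcutorture shape: allocate the per-thread arrays, start each kthread with kthread_run(), return the PTR_ERR() on failure, and let the module unwind path call the matching cleanup, which stops only the threads that actually started. A minimal sketch of that pattern (placeholder names such as worker_fn, workers and n_workers; not code from this patch):

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/slab.h>

static struct task_struct **workers;
static int n_workers = 4;

static int worker_fn(void *arg)
{
        /* Idle loop standing in for the real per-thread work. */
        while (!kthread_should_stop())
                schedule_timeout_interruptible(HZ);
        return 0;
}

static int workers_init(void)
{
        int i;

        workers = kcalloc(n_workers, sizeof(workers[0]), GFP_KERNEL);
        if (workers == NULL)
                return -ENOMEM;
        for (i = 0; i < n_workers; i++) {
                workers[i] = kthread_run(worker_fn, (void *)(long)i, "worker");
                if (IS_ERR(workers[i])) {
                        int ret = PTR_ERR(workers[i]);

                        workers[i] = NULL; /* So cleanup skips the slot that never started. */
                        return ret;        /* Caller is expected to run workers_cleanup(). */
                }
        }
        return 0;
}

static void workers_cleanup(void)
{
        int i;

        if (workers == NULL)
                return;
        for (i = 0; i < n_workers; i++) {
                if (workers[i] != NULL) {
                        kthread_stop(workers[i]);
                        workers[i] = NULL;
                }
        }
        kfree(workers);
        workers = NULL;
}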
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 1050d6d3922..0da7b88d92d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -75,6 +75,8 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; | |||
75 | .gpnum = -300, \ | 75 | .gpnum = -300, \ |
76 | .completed = -300, \ | 76 | .completed = -300, \ |
77 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ | 77 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ |
78 | .orphan_nxttail = &structname##_state.orphan_nxtlist, \ | ||
79 | .orphan_donetail = &structname##_state.orphan_donelist, \ | ||
78 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ | 80 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ |
79 | .n_force_qs = 0, \ | 81 | .n_force_qs = 0, \ |
80 | .n_force_qs_ngp = 0, \ | 82 | .n_force_qs_ngp = 0, \ |
@@ -145,6 +147,13 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | |||
145 | unsigned long rcutorture_testseq; | 147 | unsigned long rcutorture_testseq; |
146 | unsigned long rcutorture_vernum; | 148 | unsigned long rcutorture_vernum; |
147 | 149 | ||
150 | /* State information for rcu_barrier() and friends. */ | ||
151 | |||
152 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; | ||
153 | static atomic_t rcu_barrier_cpu_count; | ||
154 | static DEFINE_MUTEX(rcu_barrier_mutex); | ||
155 | static struct completion rcu_barrier_completion; | ||
156 | |||
148 | /* | 157 | /* |
149 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s | 158 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s |
150 | * permit this function to be invoked without holding the root rcu_node | 159 | * permit this function to be invoked without holding the root rcu_node |
@@ -192,7 +201,6 @@ void rcu_note_context_switch(int cpu) | |||
192 | { | 201 | { |
193 | trace_rcu_utilization("Start context switch"); | 202 | trace_rcu_utilization("Start context switch"); |
194 | rcu_sched_qs(cpu); | 203 | rcu_sched_qs(cpu); |
195 | rcu_preempt_note_context_switch(cpu); | ||
196 | trace_rcu_utilization("End context switch"); | 204 | trace_rcu_utilization("End context switch"); |
197 | } | 205 | } |
198 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 206 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
@@ -1311,95 +1319,133 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1311 | #ifdef CONFIG_HOTPLUG_CPU | 1319 | #ifdef CONFIG_HOTPLUG_CPU |
1312 | 1320 | ||
1313 | /* | 1321 | /* |
1314 | * Move a dying CPU's RCU callbacks to online CPU's callback list. | 1322 | * Send the specified CPU's RCU callbacks to the orphanage. The |
1315 | * Also record a quiescent state for this CPU for the current grace period. | 1323 | * specified CPU must be offline, and the caller must hold the |
1316 | * Synchronization and interrupt disabling are not required because | 1324 | * ->onofflock. |
1317 | * this function executes in stop_machine() context. Therefore, cleanup | ||
1318 | * operations that might block must be done later from the CPU_DEAD | ||
1319 | * notifier. | ||
1320 | * | ||
1321 | * Note that the outgoing CPU's bit has already been cleared in the | ||
1322 | * cpu_online_mask. This allows us to randomly pick a callback | ||
1323 | * destination from the bits set in that mask. | ||
1324 | */ | 1325 | */ |
1325 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | 1326 | static void |
1327 | rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | ||
1328 | struct rcu_node *rnp, struct rcu_data *rdp) | ||
1326 | { | 1329 | { |
1327 | int i; | 1330 | int i; |
1328 | unsigned long mask; | ||
1329 | int receive_cpu = cpumask_any(cpu_online_mask); | ||
1330 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | ||
1331 | struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); | ||
1332 | RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */ | ||
1333 | 1331 | ||
1334 | /* First, adjust the counts. */ | 1332 | /* |
1333 | * Orphan the callbacks. First adjust the counts. This is safe | ||
1334 | * because ->onofflock excludes _rcu_barrier()'s adoption of | ||
1335 | * the callbacks, thus no memory barrier is required. | ||
1336 | */ | ||
1335 | if (rdp->nxtlist != NULL) { | 1337 | if (rdp->nxtlist != NULL) { |
1336 | receive_rdp->qlen_lazy += rdp->qlen_lazy; | 1338 | rsp->qlen_lazy += rdp->qlen_lazy; |
1337 | receive_rdp->qlen += rdp->qlen; | 1339 | rsp->qlen += rdp->qlen; |
1340 | rdp->n_cbs_orphaned += rdp->qlen; | ||
1338 | rdp->qlen_lazy = 0; | 1341 | rdp->qlen_lazy = 0; |
1339 | rdp->qlen = 0; | 1342 | rdp->qlen = 0; |
1340 | } | 1343 | } |
1341 | 1344 | ||
1342 | /* | 1345 | /* |
1343 | * Next, move ready-to-invoke callbacks to be invoked on some | 1346 | * Next, move those callbacks still needing a grace period to |
1344 | * other CPU. These will not be required to pass through another | 1347 | * the orphanage, where some other CPU will pick them up. |
1345 | * grace period: They are done, regardless of CPU. | 1348 | * Some of the callbacks might have gone partway through a grace |
1349 | * period, but that is too bad. They get to start over because we | ||
1350 | * cannot assume that grace periods are synchronized across CPUs. | ||
1351 | * We don't bother updating the ->nxttail[] array yet, instead | ||
1352 | * we just reset the whole thing later on. | ||
1346 | */ | 1353 | */ |
1347 | if (rdp->nxtlist != NULL && | 1354 | if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { |
1348 | rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) { | 1355 | *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL]; |
1349 | struct rcu_head *oldhead; | 1356 | rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL]; |
1350 | struct rcu_head **oldtail; | 1357 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; |
1351 | struct rcu_head **newtail; | ||
1352 | |||
1353 | oldhead = rdp->nxtlist; | ||
1354 | oldtail = receive_rdp->nxttail[RCU_DONE_TAIL]; | ||
1355 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; | ||
1356 | *rdp->nxttail[RCU_DONE_TAIL] = *oldtail; | ||
1357 | *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead; | ||
1358 | newtail = rdp->nxttail[RCU_DONE_TAIL]; | ||
1359 | for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) { | ||
1360 | if (receive_rdp->nxttail[i] == oldtail) | ||
1361 | receive_rdp->nxttail[i] = newtail; | ||
1362 | if (rdp->nxttail[i] == newtail) | ||
1363 | rdp->nxttail[i] = &rdp->nxtlist; | ||
1364 | } | ||
1365 | } | 1358 | } |
1366 | 1359 | ||
1367 | /* | 1360 | /* |
1368 | * Finally, put the rest of the callbacks at the end of the list. | 1361 | * Then move the ready-to-invoke callbacks to the orphanage, |
1369 | * The ones that made it partway through get to start over: We | 1362 | * where some other CPU will pick them up. These will not be |
1370 | * cannot assume that grace periods are synchronized across CPUs. | 1363 | * required to pass through another grace period: They are done. |
1371 | * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but | ||
1372 | * this does not seem compelling. Not yet, anyway.) | ||
1373 | */ | 1364 | */ |
1374 | if (rdp->nxtlist != NULL) { | 1365 | if (rdp->nxtlist != NULL) { |
1375 | *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; | 1366 | *rsp->orphan_donetail = rdp->nxtlist; |
1376 | receive_rdp->nxttail[RCU_NEXT_TAIL] = | 1367 | rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; |
1377 | rdp->nxttail[RCU_NEXT_TAIL]; | ||
1378 | receive_rdp->n_cbs_adopted += rdp->qlen; | ||
1379 | rdp->n_cbs_orphaned += rdp->qlen; | ||
1380 | |||
1381 | rdp->nxtlist = NULL; | ||
1382 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
1383 | rdp->nxttail[i] = &rdp->nxtlist; | ||
1384 | } | 1368 | } |
1385 | 1369 | ||
1370 | /* Finally, initialize the rcu_data structure's list to empty. */ | ||
1371 | rdp->nxtlist = NULL; | ||
1372 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
1373 | rdp->nxttail[i] = &rdp->nxtlist; | ||
1374 | } | ||
1375 | |||
1376 | /* | ||
1377 | * Adopt the RCU callbacks from the specified rcu_state structure's | ||
1378 | * orphanage. The caller must hold the ->onofflock. | ||
1379 | */ | ||
1380 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
1381 | { | ||
1382 | int i; | ||
1383 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); | ||
1384 | |||
1386 | /* | 1385 | /* |
1387 | * Record a quiescent state for the dying CPU. This is safe | 1386 | * If there is an rcu_barrier() operation in progress, then |
1388 | * only because we have already cleared out the callbacks. | 1387 | * only the task doing that operation is permitted to adopt |
1389 | * (Otherwise, the RCU core might try to schedule the invocation | 1388 | * callbacks. To do otherwise breaks rcu_barrier() and friends |
1390 | * of callbacks on this now-offline CPU, which would be bad.) | 1389 | * by causing them to fail to wait for the callbacks in the |
1390 | * orphanage. | ||
1391 | */ | 1391 | */ |
1392 | mask = rdp->grpmask; /* rnp->grplo is constant. */ | 1392 | if (rsp->rcu_barrier_in_progress && |
1393 | rsp->rcu_barrier_in_progress != current) | ||
1394 | return; | ||
1395 | |||
1396 | /* Do the accounting first. */ | ||
1397 | rdp->qlen_lazy += rsp->qlen_lazy; | ||
1398 | rdp->qlen += rsp->qlen; | ||
1399 | rdp->n_cbs_adopted += rsp->qlen; | ||
1400 | rsp->qlen_lazy = 0; | ||
1401 | rsp->qlen = 0; | ||
1402 | |||
1403 | /* | ||
1404 | * We do not need a memory barrier here because the only way we | ||
1405 | * can get here while an rcu_barrier() is in flight is if | ||
1406 | * we are the task doing the rcu_barrier(). | ||
1407 | */ | ||
1408 | |||
1409 | /* First adopt the ready-to-invoke callbacks. */ | ||
1410 | if (rsp->orphan_donelist != NULL) { | ||
1411 | *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; | ||
1412 | *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; | ||
1413 | for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) | ||
1414 | if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) | ||
1415 | rdp->nxttail[i] = rsp->orphan_donetail; | ||
1416 | rsp->orphan_donelist = NULL; | ||
1417 | rsp->orphan_donetail = &rsp->orphan_donelist; | ||
1418 | } | ||
1419 | |||
1420 | /* And then adopt the callbacks that still need a grace period. */ | ||
1421 | if (rsp->orphan_nxtlist != NULL) { | ||
1422 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist; | ||
1423 | rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail; | ||
1424 | rsp->orphan_nxtlist = NULL; | ||
1425 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; | ||
1426 | } | ||
1427 | } | ||
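The list moves above rely entirely on the head-plus-tail-pointer representation of the callback lists: the tail field always points at the last ->next pointer (or at the head itself when the list is empty), so rcu_send_cbs_to_orphanage() and rcu_adopt_orphan_cbs() can splice whole lists in O(1) without walking them. A self-contained sketch of that idiom, with illustrative names rather than the kernel's rcu_data/rcu_state fields:

struct cb {
        struct cb *next;
};

struct cb_list {
        struct cb *head;
        struct cb **tail;       /* Points at head while the list is empty. */
};

static void cb_list_init(struct cb_list *l)
{
        l->head = NULL;
        l->tail = &l->head;
}

/* Append everything on @src to @dst in O(1), leaving @src empty. */
static void cb_list_splice_tail(struct cb_list *dst, struct cb_list *src)
{
        if (src->head == NULL)
                return;
        *dst->tail = src->head; /* Hook src's first element after dst's last. */
        dst->tail = src->tail;  /* dst now ends where src used to end. */
        cb_list_init(src);      /* src is empty again. */
}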
1428 | |||
1429 | /* | ||
1430 | * Trace the fact that this CPU is going offline. | ||
1431 | */ | ||
1432 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | ||
1433 | { | ||
1434 | RCU_TRACE(unsigned long mask); | ||
1435 | RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); | ||
1436 | RCU_TRACE(struct rcu_node *rnp = rdp->mynode); | ||
1437 | |||
1438 | RCU_TRACE(mask = rdp->grpmask); | ||
1393 | trace_rcu_grace_period(rsp->name, | 1439 | trace_rcu_grace_period(rsp->name, |
1394 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), | 1440 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), |
1395 | "cpuofl"); | 1441 | "cpuofl"); |
1396 | rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum); | ||
1397 | /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */ | ||
1398 | } | 1442 | } |
1399 | 1443 | ||
1400 | /* | 1444 | /* |
1401 | * The CPU has been completely removed, and some other CPU is reporting | 1445 | * The CPU has been completely removed, and some other CPU is reporting |
1402 | * this fact from process context. Do the remainder of the cleanup. | 1446 | * this fact from process context. Do the remainder of the cleanup, |
1447 | * including orphaning the outgoing CPU's RCU callbacks, and also | ||
1448 | * adopting them, if there is no _rcu_barrier() instance running. | ||
1403 | * There can only be one CPU hotplug operation at a time, so no other | 1449 | * There can only be one CPU hotplug operation at a time, so no other |
1404 | * CPU can be attempting to update rcu_cpu_kthread_task. | 1450 | * CPU can be attempting to update rcu_cpu_kthread_task. |
1405 | */ | 1451 | */ |
@@ -1409,17 +1455,21 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
1409 | unsigned long mask; | 1455 | unsigned long mask; |
1410 | int need_report = 0; | 1456 | int need_report = 0; |
1411 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 1457 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
1412 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */ | 1458 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ |
1413 | 1459 | ||
1414 | /* Adjust any no-longer-needed kthreads. */ | 1460 | /* Adjust any no-longer-needed kthreads. */ |
1415 | rcu_stop_cpu_kthread(cpu); | 1461 | rcu_stop_cpu_kthread(cpu); |
1416 | rcu_node_kthread_setaffinity(rnp, -1); | 1462 | rcu_node_kthread_setaffinity(rnp, -1); |
1417 | 1463 | ||
1418 | /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */ | 1464 | /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ |
1419 | 1465 | ||
1420 | /* Exclude any attempts to start a new grace period. */ | 1466 | /* Exclude any attempts to start a new grace period. */ |
1421 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 1467 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
1422 | 1468 | ||
1469 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ | ||
1470 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); | ||
1471 | rcu_adopt_orphan_cbs(rsp); | ||
1472 | |||
1423 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ | 1473 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ |
1424 | mask = rdp->grpmask; /* rnp->grplo is constant. */ | 1474 | mask = rdp->grpmask; /* rnp->grplo is constant. */ |
1425 | do { | 1475 | do { |
@@ -1456,6 +1506,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
1456 | 1506 | ||
1457 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 1507 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
1458 | 1508 | ||
1509 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
1510 | { | ||
1511 | } | ||
1512 | |||
1459 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | 1513 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) |
1460 | { | 1514 | { |
1461 | } | 1515 | } |
@@ -1524,9 +1578,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1524 | rcu_is_callbacks_kthread()); | 1578 | rcu_is_callbacks_kthread()); |
1525 | 1579 | ||
1526 | /* Update count, and requeue any remaining callbacks. */ | 1580 | /* Update count, and requeue any remaining callbacks. */ |
1527 | rdp->qlen_lazy -= count_lazy; | ||
1528 | rdp->qlen -= count; | ||
1529 | rdp->n_cbs_invoked += count; | ||
1530 | if (list != NULL) { | 1581 | if (list != NULL) { |
1531 | *tail = rdp->nxtlist; | 1582 | *tail = rdp->nxtlist; |
1532 | rdp->nxtlist = list; | 1583 | rdp->nxtlist = list; |
@@ -1536,6 +1587,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1536 | else | 1587 | else |
1537 | break; | 1588 | break; |
1538 | } | 1589 | } |
1590 | smp_mb(); /* List handling before counting for rcu_barrier(). */ | ||
1591 | rdp->qlen_lazy -= count_lazy; | ||
1592 | rdp->qlen -= count; | ||
1593 | rdp->n_cbs_invoked += count; | ||
1539 | 1594 | ||
1540 | /* Reinstate batch limit if we have worked down the excess. */ | 1595 | /* Reinstate batch limit if we have worked down the excess. */ |
1541 | if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) | 1596 | if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) |
@@ -1820,15 +1875,17 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1820 | * a quiescent state betweentimes. | 1875 | * a quiescent state betweentimes. |
1821 | */ | 1876 | */ |
1822 | local_irq_save(flags); | 1877 | local_irq_save(flags); |
1823 | WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); | ||
1824 | rdp = this_cpu_ptr(rsp->rda); | 1878 | rdp = this_cpu_ptr(rsp->rda); |
1825 | 1879 | ||
1826 | /* Add the callback to our list. */ | 1880 | /* Add the callback to our list. */ |
1827 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | ||
1828 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | ||
1829 | rdp->qlen++; | 1881 | rdp->qlen++; |
1830 | if (lazy) | 1882 | if (lazy) |
1831 | rdp->qlen_lazy++; | 1883 | rdp->qlen_lazy++; |
1884 | else | ||
1885 | rcu_idle_count_callbacks_posted(); | ||
1886 | smp_mb(); /* Count before adding callback for rcu_barrier(). */ | ||
1887 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | ||
1888 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | ||
1832 | 1889 | ||
1833 | if (__is_kfree_rcu_offset((unsigned long)func)) | 1890 | if (__is_kfree_rcu_offset((unsigned long)func)) |
1834 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, | 1891 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, |
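As I read them, the smp_mb() added to __call_rcu() just above and the one added to rcu_do_batch() earlier in this patch pair with _rcu_barrier()'s lockless ACCESS_ONCE(rdp->qlen) sampling further down: the enqueue side bumps the count before publishing the callback, and the invocation side unlinks callbacks before dropping the count, so ->qlen may overstate but never understate what is still queued. Reduced to a sketch with simplified types (not the kernel's actual structures):

#include <linux/rcupdate.h>     /* struct rcu_head */
#include <linux/smp.h>          /* smp_mb() */

struct cbq {
        struct rcu_head *list;  /* Head of queued callbacks. */
        struct rcu_head **tail; /* Last ->next pointer; initially &list. */
        long qlen;              /* Sampled locklessly by the barrier scan. */
};

/* Enqueue side, cf. __call_rcu(): count first, then publish the callback. */
static void cbq_enqueue(struct cbq *q, struct rcu_head *head)
{
        q->qlen++;
        smp_mb();               /* Count visible before the callback is. */
        *q->tail = head;
        q->tail = &head->next;
}

/* Invocation side, cf. rcu_do_batch(): leftovers requeued first, count last. */
static void cbq_invoked(struct cbq *q, long count)
{
        /* ...callbacks were unlinked and invoked before this point... */
        smp_mb();               /* List handling visible before the count drops. */
        q->qlen -= count;
}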
@@ -1894,6 +1951,38 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
1894 | } | 1951 | } |
1895 | EXPORT_SYMBOL_GPL(call_rcu_bh); | 1952 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
1896 | 1953 | ||
1954 | /* | ||
1955 | * Because a context switch is a grace period for RCU-sched and RCU-bh, | ||
1956 | * any blocking grace-period wait automatically implies a grace period | ||
1957 | * if there is only one CPU online at any point in time during execution | ||
1958 | * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to | ||
1959 | * occasionally incorrectly indicate that there are multiple CPUs online | ||
1960 | * when there was in fact only one the whole time, as this just adds | ||
1961 | * some overhead: RCU still operates correctly. | ||
1962 | * | ||
1963 | * Of course, sampling num_online_cpus() with preemption enabled can | ||
1964 | * give erroneous results if there are concurrent CPU-hotplug operations. | ||
1965 | * For example, given a demonic sequence of preemptions in num_online_cpus() | ||
1966 | * and CPU-hotplug operations, there could be two or more CPUs online at | ||
1967 | * all times, but num_online_cpus() might well return one (or even zero). | ||
1968 | * | ||
1969 | * However, all such demonic sequences require at least one CPU-offline | ||
1970 | * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer | ||
1971 | * is only a problem if there is an RCU read-side critical section executing | ||
1972 | * throughout. But RCU-sched and RCU-bh read-side critical sections | ||
1973 | * disable either preemption or bh, which prevents a CPU from going offline. | ||
1974 | * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return | ||
1975 | * that there is only one CPU when in fact there was more than one throughout | ||
1976 | * is when there were no RCU readers in the system. If there are no | ||
1977 | * RCU readers, the grace period by definition can be of zero length, | ||
1978 | * regardless of the number of online CPUs. | ||
1979 | */ | ||
1980 | static inline int rcu_blocking_is_gp(void) | ||
1981 | { | ||
1982 | might_sleep(); /* Check for RCU read-side critical section. */ | ||
1983 | return num_online_cpus() <= 1; | ||
1984 | } | ||
1985 | |||
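The comment above justifies the single-CPU fast path; for context, the blocking primitives are expected to use rcu_blocking_is_gp() roughly as follows (a sketch of the shape of synchronize_sched(), not a verbatim copy of this tree's version):

void synchronize_sched_sketch(void)
{
        if (rcu_blocking_is_gp())
                return;                 /* One CPU online: blocking here is itself a grace period. */
        wait_rcu_gp(call_rcu_sched);    /* Otherwise post a callback and block until it runs. */
}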
1897 | /** | 1986 | /** |
1898 | * synchronize_sched - wait until an rcu-sched grace period has elapsed. | 1987 | * synchronize_sched - wait until an rcu-sched grace period has elapsed. |
1899 | * | 1988 | * |
@@ -2167,11 +2256,10 @@ static int rcu_cpu_has_callbacks(int cpu) | |||
2167 | rcu_preempt_cpu_has_callbacks(cpu); | 2256 | rcu_preempt_cpu_has_callbacks(cpu); |
2168 | } | 2257 | } |
2169 | 2258 | ||
2170 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; | 2259 | /* |
2171 | static atomic_t rcu_barrier_cpu_count; | 2260 | * RCU callback function for _rcu_barrier(). If we are last, wake |
2172 | static DEFINE_MUTEX(rcu_barrier_mutex); | 2261 | * up the task executing _rcu_barrier(). |
2173 | static struct completion rcu_barrier_completion; | 2262 | */ |
2174 | |||
2175 | static void rcu_barrier_callback(struct rcu_head *notused) | 2263 | static void rcu_barrier_callback(struct rcu_head *notused) |
2176 | { | 2264 | { |
2177 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 2265 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) |
@@ -2201,27 +2289,94 @@ static void _rcu_barrier(struct rcu_state *rsp, | |||
2201 | void (*call_rcu_func)(struct rcu_head *head, | 2289 | void (*call_rcu_func)(struct rcu_head *head, |
2202 | void (*func)(struct rcu_head *head))) | 2290 | void (*func)(struct rcu_head *head))) |
2203 | { | 2291 | { |
2204 | BUG_ON(in_interrupt()); | 2292 | int cpu; |
2293 | unsigned long flags; | ||
2294 | struct rcu_data *rdp; | ||
2295 | struct rcu_head rh; | ||
2296 | |||
2297 | init_rcu_head_on_stack(&rh); | ||
2298 | |||
2205 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ | 2299 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ |
2206 | mutex_lock(&rcu_barrier_mutex); | 2300 | mutex_lock(&rcu_barrier_mutex); |
2207 | init_completion(&rcu_barrier_completion); | 2301 | |
2302 | smp_mb(); /* Prevent any prior operations from leaking in. */ | ||
2303 | |||
2208 | /* | 2304 | /* |
2209 | * Initialize rcu_barrier_cpu_count to 1, then invoke | 2305 | * Initialize the count to one rather than to zero in order to |
2210 | * rcu_barrier_func() on each CPU, so that each CPU also has | 2306 | * avoid a too-soon return to zero in case of a short grace period |
2211 | * incremented rcu_barrier_cpu_count. Only then is it safe to | 2307 | * (or preemption of this task). Also flag this task as doing |
2212 | * decrement rcu_barrier_cpu_count -- otherwise the first CPU | 2308 | * an rcu_barrier(). This will prevent anyone else from adopting |
2213 | * might complete its grace period before all of the other CPUs | 2309 | * orphaned callbacks, which could cause otherwise failure if a |
2214 | * did their increment, causing this function to return too | 2310 | * CPU went offline and quickly came back online. To see this, |
2215 | * early. Note that on_each_cpu() disables irqs, which prevents | 2311 | * consider the following sequence of events: |
2216 | * any CPUs from coming online or going offline until each online | 2312 | * |
2217 | * CPU has queued its RCU-barrier callback. | 2313 | * 1. We cause CPU 0 to post an rcu_barrier_callback() callback. |
2314 | * 2. CPU 1 goes offline, orphaning its callbacks. | ||
2315 | * 3. CPU 0 adopts CPU 1's orphaned callbacks. | ||
2316 | * 4. CPU 1 comes back online. | ||
2317 | * 5. We cause CPU 1 to post an rcu_barrier_callback() callback. | ||
2318 | * 6. Both rcu_barrier_callback() callbacks are invoked, awakening | ||
2319 | * us -- but before CPU 1's orphaned callbacks are invoked!!! | ||
2218 | */ | 2320 | */ |
2321 | init_completion(&rcu_barrier_completion); | ||
2219 | atomic_set(&rcu_barrier_cpu_count, 1); | 2322 | atomic_set(&rcu_barrier_cpu_count, 1); |
2220 | on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); | 2323 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
2324 | rsp->rcu_barrier_in_progress = current; | ||
2325 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
2326 | |||
2327 | /* | ||
2328 | * Force every CPU with callbacks to register a new callback | ||
2329 | * that will tell us when all the preceding callbacks have | ||
2330 | * been invoked. If an offline CPU has callbacks, wait for | ||
2331 | * it to either come back online or to finish orphaning those | ||
2332 | * callbacks. | ||
2333 | */ | ||
2334 | for_each_possible_cpu(cpu) { | ||
2335 | preempt_disable(); | ||
2336 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
2337 | if (cpu_is_offline(cpu)) { | ||
2338 | preempt_enable(); | ||
2339 | while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) | ||
2340 | schedule_timeout_interruptible(1); | ||
2341 | } else if (ACCESS_ONCE(rdp->qlen)) { | ||
2342 | smp_call_function_single(cpu, rcu_barrier_func, | ||
2343 | (void *)call_rcu_func, 1); | ||
2344 | preempt_enable(); | ||
2345 | } else { | ||
2346 | preempt_enable(); | ||
2347 | } | ||
2348 | } | ||
2349 | |||
2350 | /* | ||
2351 | * Now that all online CPUs have rcu_barrier_callback() callbacks | ||
2352 | * posted, we can adopt all of the orphaned callbacks and place | ||
2353 | * an rcu_barrier_callback() callback after them. When that is done, | ||
2354 | * we are guaranteed to have an rcu_barrier_callback() callback | ||
2355 | * following every callback that could possibly have been | ||
2356 | * registered before _rcu_barrier() was called. | ||
2357 | */ | ||
2358 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | ||
2359 | rcu_adopt_orphan_cbs(rsp); | ||
2360 | rsp->rcu_barrier_in_progress = NULL; | ||
2361 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
2362 | atomic_inc(&rcu_barrier_cpu_count); | ||
2363 | smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ | ||
2364 | call_rcu_func(&rh, rcu_barrier_callback); | ||
2365 | |||
2366 | /* | ||
2367 | * Now that we have an rcu_barrier_callback() callback on each | ||
2368 | * CPU, and thus each counted, remove the initial count. | ||
2369 | */ | ||
2221 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 2370 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) |
2222 | complete(&rcu_barrier_completion); | 2371 | complete(&rcu_barrier_completion); |
2372 | |||
2373 | /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ | ||
2223 | wait_for_completion(&rcu_barrier_completion); | 2374 | wait_for_completion(&rcu_barrier_completion); |
2375 | |||
2376 | /* Other rcu_barrier() invocations can now safely proceed. */ | ||
2224 | mutex_unlock(&rcu_barrier_mutex); | 2377 | mutex_unlock(&rcu_barrier_mutex); |
2378 | |||
2379 | destroy_rcu_head_on_stack(&rh); | ||
2225 | } | 2380 | } |
2226 | 2381 | ||
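The comment block above explains why the count starts at one; with the orphanage handling stripped away, the counting scheme that _rcu_barrier() relies on reduces to the sketch below (illustrative names, hotplug exclusion and the per-CPU posting step elided to comments):

#include <linux/atomic.h>
#include <linux/completion.h>
#include <linux/cpumask.h>
#include <linux/rcupdate.h>

static atomic_t barrier_count;
static struct completion barrier_done;

static void barrier_cb(struct rcu_head *unused)
{
        if (atomic_dec_and_test(&barrier_count))
                complete(&barrier_done);
}

static void barrier_wait_sketch(void)
{
        int cpu;

        init_completion(&barrier_done);
        atomic_set(&barrier_count, 1);  /* Self-reference: no premature completion. */
        for_each_online_cpu(cpu) {
                atomic_inc(&barrier_count);
                /* ...post barrier_cb() on @cpu, behind its already-queued callbacks... */
        }
        if (atomic_dec_and_test(&barrier_count))        /* Drop the self-reference. */
                complete(&barrier_done);
        wait_for_completion(&barrier_done);     /* Every earlier callback has now run. */
}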
2227 | /** | 2382 | /** |
@@ -2418,7 +2573,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
2418 | 2573 | ||
2419 | for (i = NUM_RCU_LVLS - 1; i > 0; i--) | 2574 | for (i = NUM_RCU_LVLS - 1; i > 0; i--) |
2420 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; | 2575 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; |
2421 | rsp->levelspread[0] = RCU_FANOUT_LEAF; | 2576 | rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF; |
2422 | } | 2577 | } |
2423 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ | 2578 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ |
2424 | static void __init rcu_init_levelspread(struct rcu_state *rsp) | 2579 | static void __init rcu_init_levelspread(struct rcu_state *rsp) |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index cdd1be0a407..7f5d138dedf 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -29,18 +29,14 @@ | |||
29 | #include <linux/seqlock.h> | 29 | #include <linux/seqlock.h> |
30 | 30 | ||
31 | /* | 31 | /* |
32 | * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. | 32 | * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and |
33 | * CONFIG_RCU_FANOUT_LEAF. | ||
33 | * In theory, it should be possible to add more levels straightforwardly. | 34 | * In theory, it should be possible to add more levels straightforwardly. |
34 | * In practice, this did work well going from three levels to four. | 35 | * In practice, this did work well going from three levels to four. |
35 | * Of course, your mileage may vary. | 36 | * Of course, your mileage may vary. |
36 | */ | 37 | */ |
37 | #define MAX_RCU_LVLS 4 | 38 | #define MAX_RCU_LVLS 4 |
38 | #if CONFIG_RCU_FANOUT > 16 | 39 | #define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF) |
39 | #define RCU_FANOUT_LEAF 16 | ||
40 | #else /* #if CONFIG_RCU_FANOUT > 16 */ | ||
41 | #define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT) | ||
42 | #endif /* #else #if CONFIG_RCU_FANOUT > 16 */ | ||
43 | #define RCU_FANOUT_1 (RCU_FANOUT_LEAF) | ||
44 | #define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) | 40 | #define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) |
45 | #define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) | 41 | #define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) |
46 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) | 42 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) |
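To make the resulting geometry concrete, assume the common 64-bit defaults of CONFIG_RCU_FANOUT_LEAF=16 and CONFIG_RCU_FANOUT=64 (both are tunable, so treat the numbers as an example only); the thresholds then work out to:

        RCU_FANOUT_1 = 16                     (one rcu_node level covers up to 16 CPUs)
        RCU_FANOUT_2 = 16 * 64     = 1024     (two levels cover up to 1024 CPUs)
        RCU_FANOUT_3 = 1024 * 64   = 65536    (three levels cover up to 65536 CPUs)
        RCU_FANOUT_4 = 65536 * 64  = 4194304  (four levels, the MAX_RCU_LVLS ceiling)

These are the bounds the NR_CPUS comparisons below these defines use to pick NUM_RCU_LVLS.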
@@ -371,6 +367,17 @@ struct rcu_state { | |||
371 | 367 | ||
372 | raw_spinlock_t onofflock; /* exclude on/offline and */ | 368 | raw_spinlock_t onofflock; /* exclude on/offline and */ |
373 | /* starting new GP. */ | 369 | /* starting new GP. */ |
370 | struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ | ||
371 | /* need a grace period. */ | ||
372 | struct rcu_head **orphan_nxttail; /* Tail of above. */ | ||
373 | struct rcu_head *orphan_donelist; /* Orphaned callbacks that */ | ||
374 | /* are ready to invoke. */ | ||
375 | struct rcu_head **orphan_donetail; /* Tail of above. */ | ||
376 | long qlen_lazy; /* Number of lazy callbacks. */ | ||
377 | long qlen; /* Total number of callbacks. */ | ||
378 | struct task_struct *rcu_barrier_in_progress; | ||
379 | /* Task doing rcu_barrier(), */ | ||
380 | /* or NULL if no barrier. */ | ||
374 | raw_spinlock_t fqslock; /* Only one task forcing */ | 381 | raw_spinlock_t fqslock; /* Only one task forcing */ |
375 | /* quiescent states. */ | 382 | /* quiescent states. */ |
376 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 383 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
@@ -423,7 +430,6 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); | |||
423 | /* Forward declarations for rcutree_plugin.h */ | 430 | /* Forward declarations for rcutree_plugin.h */ |
424 | static void rcu_bootup_announce(void); | 431 | static void rcu_bootup_announce(void); |
425 | long rcu_batches_completed(void); | 432 | long rcu_batches_completed(void); |
426 | static void rcu_preempt_note_context_switch(int cpu); | ||
427 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); | 433 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); |
428 | #ifdef CONFIG_HOTPLUG_CPU | 434 | #ifdef CONFIG_HOTPLUG_CPU |
429 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | 435 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, |
@@ -471,6 +477,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu); | |||
471 | static void rcu_prepare_for_idle_init(int cpu); | 477 | static void rcu_prepare_for_idle_init(int cpu); |
472 | static void rcu_cleanup_after_idle(int cpu); | 478 | static void rcu_cleanup_after_idle(int cpu); |
473 | static void rcu_prepare_for_idle(int cpu); | 479 | static void rcu_prepare_for_idle(int cpu); |
480 | static void rcu_idle_count_callbacks_posted(void); | ||
474 | static void print_cpu_stall_info_begin(void); | 481 | static void print_cpu_stall_info_begin(void); |
475 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); | 482 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); |
476 | static void print_cpu_stall_info_end(void); | 483 | static void print_cpu_stall_info_end(void); |
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c023464816b..2411000d986 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu) | |||
153 | * | 153 | * |
154 | * Caller must disable preemption. | 154 | * Caller must disable preemption. |
155 | */ | 155 | */ |
156 | static void rcu_preempt_note_context_switch(int cpu) | 156 | void rcu_preempt_note_context_switch(void) |
157 | { | 157 | { |
158 | struct task_struct *t = current; | 158 | struct task_struct *t = current; |
159 | unsigned long flags; | 159 | unsigned long flags; |
@@ -164,7 +164,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
164 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { | 164 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { |
165 | 165 | ||
166 | /* Possibly blocking in an RCU read-side critical section. */ | 166 | /* Possibly blocking in an RCU read-side critical section. */ |
167 | rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); | 167 | rdp = __this_cpu_ptr(rcu_preempt_state.rda); |
168 | rnp = rdp->mynode; | 168 | rnp = rdp->mynode; |
169 | raw_spin_lock_irqsave(&rnp->lock, flags); | 169 | raw_spin_lock_irqsave(&rnp->lock, flags); |
170 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | 170 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; |
@@ -228,7 +228,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
228 | * means that we continue to block the current grace period. | 228 | * means that we continue to block the current grace period. |
229 | */ | 229 | */ |
230 | local_irq_save(flags); | 230 | local_irq_save(flags); |
231 | rcu_preempt_qs(cpu); | 231 | rcu_preempt_qs(smp_processor_id()); |
232 | local_irq_restore(flags); | 232 | local_irq_restore(flags); |
233 | } | 233 | } |
234 | 234 | ||
@@ -969,22 +969,6 @@ static void __init __rcu_init_preempt(void) | |||
969 | rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); | 969 | rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); |
970 | } | 970 | } |
971 | 971 | ||
972 | /* | ||
973 | * Check for a task exiting while in a preemptible-RCU read-side | ||
974 | * critical section, clean up if so. No need to issue warnings, | ||
975 | * as debug_check_no_locks_held() already does this if lockdep | ||
976 | * is enabled. | ||
977 | */ | ||
978 | void exit_rcu(void) | ||
979 | { | ||
980 | struct task_struct *t = current; | ||
981 | |||
982 | if (t->rcu_read_lock_nesting == 0) | ||
983 | return; | ||
984 | t->rcu_read_lock_nesting = 1; | ||
985 | __rcu_read_unlock(); | ||
986 | } | ||
987 | |||
988 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 972 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
989 | 973 | ||
990 | static struct rcu_state *rcu_state = &rcu_sched_state; | 974 | static struct rcu_state *rcu_state = &rcu_sched_state; |
@@ -1018,14 +1002,6 @@ void rcu_force_quiescent_state(void) | |||
1018 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | 1002 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); |
1019 | 1003 | ||
1020 | /* | 1004 | /* |
1021 | * Because preemptible RCU does not exist, we never have to check for | ||
1022 | * CPUs being in quiescent states. | ||
1023 | */ | ||
1024 | static void rcu_preempt_note_context_switch(int cpu) | ||
1025 | { | ||
1026 | } | ||
1027 | |||
1028 | /* | ||
1029 | * Because preemptible RCU does not exist, there are never any preempted | 1005 | * Because preemptible RCU does not exist, there are never any preempted |
1030 | * RCU readers. | 1006 | * RCU readers. |
1031 | */ | 1007 | */ |
@@ -1938,6 +1914,14 @@ static void rcu_prepare_for_idle(int cpu) | |||
1938 | { | 1914 | { |
1939 | } | 1915 | } |
1940 | 1916 | ||
1917 | /* | ||
1918 | * Don't bother keeping a running count of the number of RCU callbacks | ||
1919 | * posted because CONFIG_RCU_FAST_NO_HZ=n. | ||
1920 | */ | ||
1921 | static void rcu_idle_count_callbacks_posted(void) | ||
1922 | { | ||
1923 | } | ||
1924 | |||
1941 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 1925 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
1942 | 1926 | ||
1943 | /* | 1927 | /* |
@@ -1978,11 +1962,20 @@ static void rcu_prepare_for_idle(int cpu) | |||
1978 | #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ | 1962 | #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ |
1979 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ | 1963 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ |
1980 | 1964 | ||
1965 | /* Loop counter for rcu_prepare_for_idle(). */ | ||
1981 | static DEFINE_PER_CPU(int, rcu_dyntick_drain); | 1966 | static DEFINE_PER_CPU(int, rcu_dyntick_drain); |
1967 | /* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */ | ||
1982 | static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); | 1968 | static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); |
1983 | static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); | 1969 | /* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */ |
1984 | static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */ | 1970 | static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer); |
1985 | static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ | 1971 | /* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */ |
1972 | static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires); | ||
1973 | /* Enable special processing on first attempt to enter dyntick-idle mode. */ | ||
1974 | static DEFINE_PER_CPU(bool, rcu_idle_first_pass); | ||
1975 | /* Running count of non-lazy callbacks posted, never decremented. */ | ||
1976 | static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted); | ||
1977 | /* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */ | ||
1978 | static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap); | ||
1986 | 1979 | ||
1987 | /* | 1980 | /* |
1988 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no | 1981 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no |
@@ -1995,6 +1988,8 @@ static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ | |||
1995 | */ | 1988 | */ |
1996 | int rcu_needs_cpu(int cpu) | 1989 | int rcu_needs_cpu(int cpu) |
1997 | { | 1990 | { |
1991 | /* Flag a new idle sojourn to the idle-entry state machine. */ | ||
1992 | per_cpu(rcu_idle_first_pass, cpu) = 1; | ||
1998 | /* If no callbacks, RCU doesn't need the CPU. */ | 1993 | /* If no callbacks, RCU doesn't need the CPU. */ |
1999 | if (!rcu_cpu_has_callbacks(cpu)) | 1994 | if (!rcu_cpu_has_callbacks(cpu)) |
2000 | return 0; | 1995 | return 0; |
@@ -2045,16 +2040,34 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) | |||
2045 | } | 2040 | } |
2046 | 2041 | ||
2047 | /* | 2042 | /* |
2043 | * Handler for smp_call_function_single(). The only point of this | ||
2044 | * handler is to wake the CPU up, so the handler does only tracing. | ||
2045 | */ | ||
2046 | void rcu_idle_demigrate(void *unused) | ||
2047 | { | ||
2048 | trace_rcu_prep_idle("Demigrate"); | ||
2049 | } | ||
2050 | |||
2051 | /* | ||
2048 | * Timer handler used to force CPU to start pushing its remaining RCU | 2052 | * Timer handler used to force CPU to start pushing its remaining RCU |
2049 | * callbacks in the case where it entered dyntick-idle mode with callbacks | 2053 | * callbacks in the case where it entered dyntick-idle mode with callbacks |
2050 | * pending. The handler doesn't really need to do anything because the | 2054 | * pending. The handler doesn't really need to do anything because the |
2051 | * real work is done upon re-entry to idle, or by the next scheduling-clock | 2055 | * real work is done upon re-entry to idle, or by the next scheduling-clock |
2052 | * interrupt should idle not be re-entered. | 2056 | * interrupt should idle not be re-entered. |
2057 | * | ||
2058 | * One special case: the timer gets migrated without awakening the CPU | ||
2059 | * on which the timer was scheduled on. In this case, we must wake up | ||
2060 | * that CPU. We do so with smp_call_function_single(). | ||
2053 | */ | 2061 | */ |
2054 | static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) | 2062 | static void rcu_idle_gp_timer_func(unsigned long cpu_in) |
2055 | { | 2063 | { |
2064 | int cpu = (int)cpu_in; | ||
2065 | |||
2056 | trace_rcu_prep_idle("Timer"); | 2066 | trace_rcu_prep_idle("Timer"); |
2057 | return HRTIMER_NORESTART; | 2067 | if (cpu != smp_processor_id()) |
2068 | smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0); | ||
2069 | else | ||
2070 | WARN_ON_ONCE(1); /* Getting here can hang the system... */ | ||
2058 | } | 2071 | } |
2059 | 2072 | ||
2060 | /* | 2073 | /* |
@@ -2062,19 +2075,11 @@ static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) | |||
2062 | */ | 2075 | */ |
2063 | static void rcu_prepare_for_idle_init(int cpu) | 2076 | static void rcu_prepare_for_idle_init(int cpu) |
2064 | { | 2077 | { |
2065 | static int firsttime = 1; | 2078 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; |
2066 | struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); | 2079 | setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), |
2067 | 2080 | rcu_idle_gp_timer_func, cpu); | |
2068 | hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 2081 | per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1; |
2069 | hrtp->function = rcu_idle_gp_timer_func; | 2082 | per_cpu(rcu_idle_first_pass, cpu) = 1; |
2070 | if (firsttime) { | ||
2071 | unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); | ||
2072 | |||
2073 | rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); | ||
2074 | upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY); | ||
2075 | rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000); | ||
2076 | firsttime = 0; | ||
2077 | } | ||
2078 | } | 2083 | } |
2079 | 2084 | ||
2080 | /* | 2085 | /* |
@@ -2084,7 +2089,8 @@ static void rcu_prepare_for_idle_init(int cpu) | |||
2084 | */ | 2089 | */ |
2085 | static void rcu_cleanup_after_idle(int cpu) | 2090 | static void rcu_cleanup_after_idle(int cpu) |
2086 | { | 2091 | { |
2087 | hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); | 2092 | del_timer(&per_cpu(rcu_idle_gp_timer, cpu)); |
2093 | trace_rcu_prep_idle("Cleanup after idle"); | ||
2088 | } | 2094 | } |
2089 | 2095 | ||
2090 | /* | 2096 | /* |
@@ -2108,6 +2114,29 @@ static void rcu_cleanup_after_idle(int cpu) | |||
2108 | */ | 2114 | */ |
2109 | static void rcu_prepare_for_idle(int cpu) | 2115 | static void rcu_prepare_for_idle(int cpu) |
2110 | { | 2116 | { |
2117 | struct timer_list *tp; | ||
2118 | |||
2119 | /* | ||
2120 | * If this is an idle re-entry, for example, due to use of | ||
2121 | * RCU_NONIDLE() or the new idle-loop tracing API within the idle | ||
2122 | * loop, then don't take any state-machine actions, unless the | ||
2123 | * momentary exit from idle queued additional non-lazy callbacks. | ||
2124 | * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks | ||
2125 | * pending. | ||
2126 | */ | ||
2127 | if (!per_cpu(rcu_idle_first_pass, cpu) && | ||
2128 | (per_cpu(rcu_nonlazy_posted, cpu) == | ||
2129 | per_cpu(rcu_nonlazy_posted_snap, cpu))) { | ||
2130 | if (rcu_cpu_has_callbacks(cpu)) { | ||
2131 | tp = &per_cpu(rcu_idle_gp_timer, cpu); | ||
2132 | mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); | ||
2133 | } | ||
2134 | return; | ||
2135 | } | ||
2136 | per_cpu(rcu_idle_first_pass, cpu) = 0; | ||
2137 | per_cpu(rcu_nonlazy_posted_snap, cpu) = | ||
2138 | per_cpu(rcu_nonlazy_posted, cpu) - 1; | ||
2139 | |||
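The early return above hinges on rcu_nonlazy_posted never being decremented: comparing it against a snapshot can therefore notice that new work arrived even when an equal amount of old work was consumed in between. The idiom, reduced to a sketch with placeholder names:

#include <linux/percpu.h>
#include <linux/types.h>

static DEFINE_PER_CPU(unsigned long, work_posted);      /* Monotonically increasing. */
static DEFINE_PER_CPU(unsigned long, work_posted_snap); /* Value last acted upon. */

/* Producer side: only ever increments. */
static void note_work_posted(void)
{
        __this_cpu_add(work_posted, 1);
}

/* Consumer side: has anything been posted since the last snapshot? */
static bool work_posted_since_snap(int cpu)
{
        return per_cpu(work_posted, cpu) != per_cpu(work_posted_snap, cpu);
}

static void take_work_snap(int cpu)
{
        per_cpu(work_posted_snap, cpu) = per_cpu(work_posted, cpu);
}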
2111 | /* | 2140 | /* |
2112 | * If there are no callbacks on this CPU, enter dyntick-idle mode. | 2141 | * If there are no callbacks on this CPU, enter dyntick-idle mode. |
2113 | * Also reset state to avoid prejudicing later attempts. | 2142 | * Also reset state to avoid prejudicing later attempts. |
@@ -2140,11 +2169,15 @@ static void rcu_prepare_for_idle(int cpu) | |||
2140 | per_cpu(rcu_dyntick_drain, cpu) = 0; | 2169 | per_cpu(rcu_dyntick_drain, cpu) = 0; |
2141 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | 2170 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; |
2142 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) | 2171 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) |
2143 | hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), | 2172 | per_cpu(rcu_idle_gp_timer_expires, cpu) = |
2144 | rcu_idle_gp_wait, HRTIMER_MODE_REL); | 2173 | jiffies + RCU_IDLE_GP_DELAY; |
2145 | else | 2174 | else |
2146 | hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), | 2175 | per_cpu(rcu_idle_gp_timer_expires, cpu) = |
2147 | rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL); | 2176 | jiffies + RCU_IDLE_LAZY_GP_DELAY; |
2177 | tp = &per_cpu(rcu_idle_gp_timer, cpu); | ||
2178 | mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); | ||
2179 | per_cpu(rcu_nonlazy_posted_snap, cpu) = | ||
2180 | per_cpu(rcu_nonlazy_posted, cpu); | ||
2148 | return; /* Nothing more to do immediately. */ | 2181 | return; /* Nothing more to do immediately. */ |
2149 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { | 2182 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { |
2150 | /* We have hit the limit, so time to give up. */ | 2183 | /* We have hit the limit, so time to give up. */ |
@@ -2184,6 +2217,19 @@ static void rcu_prepare_for_idle(int cpu) | |||
2184 | trace_rcu_prep_idle("Callbacks drained"); | 2217 | trace_rcu_prep_idle("Callbacks drained"); |
2185 | } | 2218 | } |
2186 | 2219 | ||
2220 | /* | ||
2221 | * Keep a running count of the number of non-lazy callbacks posted | ||
2222 | * on this CPU. This running counter (which is never decremented) allows | ||
2223 | * rcu_prepare_for_idle() to detect when something out of the idle loop | ||
2224 | * posts a callback, even if an equal number of callbacks are invoked. | ||
2225 | * Of course, callbacks should only be posted from within a trace event | ||
2226 | * designed to be called from idle or from within RCU_NONIDLE(). | ||
2227 | */ | ||
2228 | static void rcu_idle_count_callbacks_posted(void) | ||
2229 | { | ||
2230 | __this_cpu_add(rcu_nonlazy_posted, 1); | ||
2231 | } | ||
2232 | |||
2187 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 2233 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
2188 | 2234 | ||
2189 | #ifdef CONFIG_RCU_CPU_STALL_INFO | 2235 | #ifdef CONFIG_RCU_CPU_STALL_INFO |
@@ -2192,14 +2238,12 @@ static void rcu_prepare_for_idle(int cpu) | |||
2192 | 2238 | ||
2193 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | 2239 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) |
2194 | { | 2240 | { |
2195 | struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); | 2241 | struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu); |
2196 | 2242 | ||
2197 | sprintf(cp, "drain=%d %c timer=%lld", | 2243 | sprintf(cp, "drain=%d %c timer=%lu", |
2198 | per_cpu(rcu_dyntick_drain, cpu), | 2244 | per_cpu(rcu_dyntick_drain, cpu), |
2199 | per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', | 2245 | per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', |
2200 | hrtimer_active(hrtp) | 2246 | timer_pending(tltp) ? tltp->expires - jiffies : -1); |
2201 | ? ktime_to_us(hrtimer_get_remaining(hrtp)) | ||
2202 | : -1); | ||
2203 | } | 2247 | } |
2204 | 2248 | ||
2205 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 2249 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index ed459edeff4..d4bc16ddd1d 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -271,13 +271,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
271 | 271 | ||
272 | gpnum = rsp->gpnum; | 272 | gpnum = rsp->gpnum; |
273 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " | 273 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " |
274 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", | 274 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", |
275 | rsp->completed, gpnum, rsp->fqs_state, | 275 | rsp->completed, gpnum, rsp->fqs_state, |
276 | (long)(rsp->jiffies_force_qs - jiffies), | 276 | (long)(rsp->jiffies_force_qs - jiffies), |
277 | (int)(jiffies & 0xffff), | 277 | (int)(jiffies & 0xffff), |
278 | rsp->n_force_qs, rsp->n_force_qs_ngp, | 278 | rsp->n_force_qs, rsp->n_force_qs_ngp, |
279 | rsp->n_force_qs - rsp->n_force_qs_ngp, | 279 | rsp->n_force_qs - rsp->n_force_qs_ngp, |
280 | rsp->n_force_qs_lh); | 280 | rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); |
281 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { | 281 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { |
282 | if (rnp->level != level) { | 282 | if (rnp->level != level) { |
283 | seq_puts(m, "\n"); | 283 | seq_puts(m, "\n"); |
diff --git a/kernel/res_counter.c b/kernel/res_counter.c index d508363858b..bebe2b170d4 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c | |||
@@ -22,75 +22,70 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) | |||
22 | counter->parent = parent; | 22 | counter->parent = parent; |
23 | } | 23 | } |
24 | 24 | ||
25 | int res_counter_charge_locked(struct res_counter *counter, unsigned long val) | 25 | int res_counter_charge_locked(struct res_counter *counter, unsigned long val, |
26 | bool force) | ||
26 | { | 27 | { |
28 | int ret = 0; | ||
29 | |||
27 | if (counter->usage + val > counter->limit) { | 30 | if (counter->usage + val > counter->limit) { |
28 | counter->failcnt++; | 31 | counter->failcnt++; |
29 | return -ENOMEM; | 32 | ret = -ENOMEM; |
33 | if (!force) | ||
34 | return ret; | ||
30 | } | 35 | } |
31 | 36 | ||
32 | counter->usage += val; | 37 | counter->usage += val; |
33 | if (counter->usage > counter->max_usage) | 38 | if (counter->usage > counter->max_usage) |
34 | counter->max_usage = counter->usage; | 39 | counter->max_usage = counter->usage; |
35 | return 0; | 40 | return ret; |
36 | } | 41 | } |
37 | 42 | ||
38 | int res_counter_charge(struct res_counter *counter, unsigned long val, | 43 | static int __res_counter_charge(struct res_counter *counter, unsigned long val, |
39 | struct res_counter **limit_fail_at) | 44 | struct res_counter **limit_fail_at, bool force) |
40 | { | 45 | { |
41 | int ret; | 46 | int ret, r; |
42 | unsigned long flags; | 47 | unsigned long flags; |
43 | struct res_counter *c, *u; | 48 | struct res_counter *c, *u; |
44 | 49 | ||
50 | r = ret = 0; | ||
45 | *limit_fail_at = NULL; | 51 | *limit_fail_at = NULL; |
46 | local_irq_save(flags); | 52 | local_irq_save(flags); |
47 | for (c = counter; c != NULL; c = c->parent) { | 53 | for (c = counter; c != NULL; c = c->parent) { |
48 | spin_lock(&c->lock); | 54 | spin_lock(&c->lock); |
49 | ret = res_counter_charge_locked(c, val); | 55 | r = res_counter_charge_locked(c, val, force); |
50 | spin_unlock(&c->lock); | 56 | spin_unlock(&c->lock); |
51 | if (ret < 0) { | 57 | if (r < 0 && !ret) { |
58 | ret = r; | ||
52 | *limit_fail_at = c; | 59 | *limit_fail_at = c; |
53 | goto undo; | 60 | if (!force) |
61 | break; | ||
54 | } | 62 | } |
55 | } | 63 | } |
56 | ret = 0; | 64 | |
57 | goto done; | 65 | if (ret < 0 && !force) { |
58 | undo: | 66 | for (u = counter; u != c; u = u->parent) { |
59 | for (u = counter; u != c; u = u->parent) { | 67 | spin_lock(&u->lock); |
60 | spin_lock(&u->lock); | 68 | res_counter_uncharge_locked(u, val); |
61 | res_counter_uncharge_locked(u, val); | 69 | spin_unlock(&u->lock); |
62 | spin_unlock(&u->lock); | 70 | } |
63 | } | 71 | } |
64 | done: | ||
65 | local_irq_restore(flags); | 72 | local_irq_restore(flags); |
73 | |||
66 | return ret; | 74 | return ret; |
67 | } | 75 | } |
68 | 76 | ||
77 | int res_counter_charge(struct res_counter *counter, unsigned long val, | ||
78 | struct res_counter **limit_fail_at) | ||
79 | { | ||
80 | return __res_counter_charge(counter, val, limit_fail_at, false); | ||
81 | } | ||
82 | |||
69 | int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, | 83 | int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, |
70 | struct res_counter **limit_fail_at) | 84 | struct res_counter **limit_fail_at) |
71 | { | 85 | { |
72 | int ret, r; | 86 | return __res_counter_charge(counter, val, limit_fail_at, true); |
73 | unsigned long flags; | ||
74 | struct res_counter *c; | ||
75 | |||
76 | r = ret = 0; | ||
77 | *limit_fail_at = NULL; | ||
78 | local_irq_save(flags); | ||
79 | for (c = counter; c != NULL; c = c->parent) { | ||
80 | spin_lock(&c->lock); | ||
81 | r = res_counter_charge_locked(c, val); | ||
82 | if (r) | ||
83 | c->usage += val; | ||
84 | spin_unlock(&c->lock); | ||
85 | if (r < 0 && ret == 0) { | ||
86 | *limit_fail_at = c; | ||
87 | ret = r; | ||
88 | } | ||
89 | } | ||
90 | local_irq_restore(flags); | ||
91 | |||
92 | return ret; | ||
93 | } | 87 | } |
88 | |||
94 | void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) | 89 | void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) |
95 | { | 90 | { |
96 | if (WARN_ON(counter->usage < val)) | 91 | if (WARN_ON(counter->usage < val)) |
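With both entry points now funnelled through __res_counter_charge(), the caller-visible contract is: res_counter_charge() either applies the charge at every level of the hierarchy or at none (the rollback loop above), while res_counter_charge_nofail() always applies it and merely reports the first ancestor whose limit was exceeded. A hedged usage sketch, with my_counter and nr_bytes standing in for a real controller's fields:

#include <linux/res_counter.h>

/* Normal path: all-or-nothing across the hierarchy. */
static int try_charge(struct res_counter *my_counter, unsigned long nr_bytes)
{
        struct res_counter *failed_at;
        int ret;

        ret = res_counter_charge(my_counter, nr_bytes, &failed_at);
        if (ret == -ENOMEM) {
                /* Nothing was charged at any level; failed_at names the
                 * ancestor whose limit would have been exceeded. */
        }
        return ret;
}

/* Must-not-fail path: the charge always lands, possibly over the limit. */
static void forced_charge(struct res_counter *my_counter, unsigned long nr_bytes)
{
        struct res_counter *failed_at;

        if (res_counter_charge_nofail(my_counter, nr_bytes, &failed_at) == -ENOMEM) {
                /* Usage now exceeds failed_at's limit; the caller is
                 * expected to reconcile that later. */
        }
}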
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 9a7dd35102a..173ea52f3af 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
@@ -16,5 +16,3 @@ obj-$(CONFIG_SMP) += cpupri.o | |||
16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
17 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 17 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o | 18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o |
19 | |||
20 | |||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5c692a0a555..39eb6011bc3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -83,6 +83,7 @@ | |||
83 | 83 | ||
84 | #include "sched.h" | 84 | #include "sched.h" |
85 | #include "../workqueue_sched.h" | 85 | #include "../workqueue_sched.h" |
86 | #include "../smpboot.h" | ||
86 | 87 | ||
87 | #define CREATE_TRACE_POINTS | 88 | #define CREATE_TRACE_POINTS |
88 | #include <trace/events/sched.h> | 89 | #include <trace/events/sched.h> |
@@ -692,8 +693,6 @@ int tg_nop(struct task_group *tg, void *data) | |||
692 | } | 693 | } |
693 | #endif | 694 | #endif |
694 | 695 | ||
695 | void update_cpu_load(struct rq *this_rq); | ||
696 | |||
697 | static void set_load_weight(struct task_struct *p) | 696 | static void set_load_weight(struct task_struct *p) |
698 | { | 697 | { |
699 | int prio = p->static_prio - MAX_RT_PRIO; | 698 | int prio = p->static_prio - MAX_RT_PRIO; |
@@ -1913,7 +1912,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, | |||
1913 | struct task_struct *next) | 1912 | struct task_struct *next) |
1914 | { | 1913 | { |
1915 | sched_info_switch(prev, next); | 1914 | sched_info_switch(prev, next); |
1916 | perf_event_task_sched(prev, next); | 1915 | perf_event_task_sched_out(prev, next); |
1917 | fire_sched_out_preempt_notifiers(prev, next); | 1916 | fire_sched_out_preempt_notifiers(prev, next); |
1918 | prepare_lock_switch(rq, next); | 1917 | prepare_lock_switch(rq, next); |
1919 | prepare_arch_switch(next); | 1918 | prepare_arch_switch(next); |
@@ -1956,6 +1955,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
1956 | */ | 1955 | */ |
1957 | prev_state = prev->state; | 1956 | prev_state = prev->state; |
1958 | finish_arch_switch(prev); | 1957 | finish_arch_switch(prev); |
1958 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
1959 | local_irq_disable(); | ||
1960 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
1961 | perf_event_task_sched_in(prev, current); | ||
1962 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
1963 | local_irq_enable(); | ||
1964 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
1959 | finish_lock_switch(rq, prev); | 1965 | finish_lock_switch(rq, prev); |
1960 | finish_arch_post_lock_switch(); | 1966 | finish_arch_post_lock_switch(); |
1961 | 1967 | ||
@@ -2076,6 +2082,7 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2076 | #endif | 2082 | #endif |
2077 | 2083 | ||
2078 | /* Here we just switch the register state and the stack. */ | 2084 | /* Here we just switch the register state and the stack. */ |
2085 | rcu_switch_from(prev); | ||
2079 | switch_to(prev, next, prev); | 2086 | switch_to(prev, next, prev); |
2080 | 2087 | ||
2081 | barrier(); | 2088 | barrier(); |
@@ -2479,22 +2486,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | |||
2479 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called | 2486 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called |
2480 | * every tick. We fix it up based on jiffies. | 2487 | * every tick. We fix it up based on jiffies. |
2481 | */ | 2488 | */ |
2482 | void update_cpu_load(struct rq *this_rq) | 2489 | static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, |
2490 | unsigned long pending_updates) | ||
2483 | { | 2491 | { |
2484 | unsigned long this_load = this_rq->load.weight; | ||
2485 | unsigned long curr_jiffies = jiffies; | ||
2486 | unsigned long pending_updates; | ||
2487 | int i, scale; | 2492 | int i, scale; |
2488 | 2493 | ||
2489 | this_rq->nr_load_updates++; | 2494 | this_rq->nr_load_updates++; |
2490 | 2495 | ||
2491 | /* Avoid repeated calls on same jiffy, when moving in and out of idle */ | ||
2492 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
2493 | return; | ||
2494 | |||
2495 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2496 | this_rq->last_load_update_tick = curr_jiffies; | ||
2497 | |||
2498 | /* Update our load: */ | 2496 | /* Update our load: */ |
2499 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ | 2497 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ |
2500 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | 2498 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { |
@@ -2519,9 +2517,45 @@ void update_cpu_load(struct rq *this_rq) | |||
2519 | sched_avg_update(this_rq); | 2517 | sched_avg_update(this_rq); |
2520 | } | 2518 | } |
2521 | 2519 | ||
2520 | /* | ||
2521 | * Called from nohz_idle_balance() to update the load ratings before doing the | ||
2522 | * idle balance. | ||
2523 | */ | ||
2524 | void update_idle_cpu_load(struct rq *this_rq) | ||
2525 | { | ||
2526 | unsigned long curr_jiffies = jiffies; | ||
2527 | unsigned long load = this_rq->load.weight; | ||
2528 | unsigned long pending_updates; | ||
2529 | |||
2530 | /* | ||
2531 | * Bloody broken means of dealing with nohz, but better than nothing.. | ||
2532 | * jiffies is updated by one cpu, another cpu can drift wrt the jiffy | ||
2533 | * update and see 0 difference the one time and 2 the next, even though | ||
2534 | * we ticked at roughly the same rate. | ||
2535 | * | ||
2536 | * Hence we only use this from nohz_idle_balance() and skip this | ||
2537 | * nonsense when called from the scheduler_tick() since that's | ||
2538 | * guaranteed a stable rate. | ||
2539 | */ | ||
2540 | if (load || curr_jiffies == this_rq->last_load_update_tick) | ||
2541 | return; | ||
2542 | |||
2543 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2544 | this_rq->last_load_update_tick = curr_jiffies; | ||
2545 | |||
2546 | __update_cpu_load(this_rq, load, pending_updates); | ||
2547 | } | ||
2548 | |||
2549 | /* | ||
2550 | * Called from scheduler_tick() | ||
2551 | */ | ||
2522 | static void update_cpu_load_active(struct rq *this_rq) | 2552 | static void update_cpu_load_active(struct rq *this_rq) |
2523 | { | 2553 | { |
2524 | update_cpu_load(this_rq); | 2554 | /* |
2555 | * See the mess in update_idle_cpu_load(). | ||
2556 | */ | ||
2557 | this_rq->last_load_update_tick = jiffies; | ||
2558 | __update_cpu_load(this_rq, this_rq->load.weight, 1); | ||
2525 | 2559 | ||
2526 | calc_load_account_active(this_rq); | 2560 | calc_load_account_active(this_rq); |
2527 | } | 2561 | } |
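On the load-update split above: __update_cpu_load() now takes a pending_updates count so the nohz path can account for the jiffies it slept through, while the tick path always passes 1. A much-simplified userspace sketch of the idea, using a single exponentially decayed figure rather than the kernel's cpu_load[] array and decay_load_missed() tables:

    #include <stdio.h>

    static unsigned long cpu_load;
    #define DECAY_SHIFT 2                   /* lose 1/4 of the old value per tick */

    /* pending_updates == 1 on a normal tick; > 1 when coming out of nohz idle,
     * in which case every missed tick is treated as a tick that saw zero load. */
    static void update_load(unsigned long this_load, unsigned long pending_updates)
    {
        while (pending_updates-- > 1)
            cpu_load -= cpu_load >> DECAY_SHIFT;
        cpu_load = cpu_load - (cpu_load >> DECAY_SHIFT) + (this_load >> DECAY_SHIFT);
    }

    int main(void)
    {
        update_load(128, 1);                /* two busy ticks */
        update_load(128, 1);
        update_load(0, 5);                  /* woke after 4 missed (idle) jiffies */
        printf("decayed load = %lu\n", cpu_load);
        return 0;
    }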
@@ -3106,6 +3140,7 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
3106 | if (irqs_disabled()) | 3140 | if (irqs_disabled()) |
3107 | print_irqtrace_events(prev); | 3141 | print_irqtrace_events(prev); |
3108 | dump_stack(); | 3142 | dump_stack(); |
3143 | add_taint(TAINT_WARN); | ||
3109 | } | 3144 | } |
3110 | 3145 | ||
3111 | /* | 3146 | /* |
@@ -4035,11 +4070,8 @@ static bool check_same_owner(struct task_struct *p) | |||
4035 | 4070 | ||
4036 | rcu_read_lock(); | 4071 | rcu_read_lock(); |
4037 | pcred = __task_cred(p); | 4072 | pcred = __task_cred(p); |
4038 | if (cred->user->user_ns == pcred->user->user_ns) | 4073 | match = (uid_eq(cred->euid, pcred->euid) || |
4039 | match = (cred->euid == pcred->euid || | 4074 | uid_eq(cred->euid, pcred->uid)); |
4040 | cred->euid == pcred->uid); | ||
4041 | else | ||
4042 | match = false; | ||
4043 | rcu_read_unlock(); | 4075 | rcu_read_unlock(); |
4044 | return match; | 4076 | return match; |
4045 | } | 4077 | } |
@@ -5553,7 +5585,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
5553 | break; | 5585 | break; |
5554 | } | 5586 | } |
5555 | 5587 | ||
5556 | if (cpumask_intersects(groupmask, sched_group_cpus(group))) { | 5588 | if (!(sd->flags & SD_OVERLAP) && |
5589 | cpumask_intersects(groupmask, sched_group_cpus(group))) { | ||
5557 | printk(KERN_CONT "\n"); | 5590 | printk(KERN_CONT "\n"); |
5558 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | 5591 | printk(KERN_ERR "ERROR: repeated CPUs\n"); |
5559 | break; | 5592 | break; |
@@ -5891,99 +5924,11 @@ static int __init isolated_cpu_setup(char *str) | |||
5891 | 5924 | ||
5892 | __setup("isolcpus=", isolated_cpu_setup); | 5925 | __setup("isolcpus=", isolated_cpu_setup); |
5893 | 5926 | ||
5894 | #ifdef CONFIG_NUMA | ||
5895 | |||
5896 | /** | ||
5897 | * find_next_best_node - find the next node to include in a sched_domain | ||
5898 | * @node: node whose sched_domain we're building | ||
5899 | * @used_nodes: nodes already in the sched_domain | ||
5900 | * | ||
5901 | * Find the next node to include in a given scheduling domain. Simply | ||
5902 | * finds the closest node not already in the @used_nodes map. | ||
5903 | * | ||
5904 | * Should use nodemask_t. | ||
5905 | */ | ||
5906 | static int find_next_best_node(int node, nodemask_t *used_nodes) | ||
5907 | { | ||
5908 | int i, n, val, min_val, best_node = -1; | ||
5909 | |||
5910 | min_val = INT_MAX; | ||
5911 | |||
5912 | for (i = 0; i < nr_node_ids; i++) { | ||
5913 | /* Start at @node */ | ||
5914 | n = (node + i) % nr_node_ids; | ||
5915 | |||
5916 | if (!nr_cpus_node(n)) | ||
5917 | continue; | ||
5918 | |||
5919 | /* Skip already used nodes */ | ||
5920 | if (node_isset(n, *used_nodes)) | ||
5921 | continue; | ||
5922 | |||
5923 | /* Simple min distance search */ | ||
5924 | val = node_distance(node, n); | ||
5925 | |||
5926 | if (val < min_val) { | ||
5927 | min_val = val; | ||
5928 | best_node = n; | ||
5929 | } | ||
5930 | } | ||
5931 | |||
5932 | if (best_node != -1) | ||
5933 | node_set(best_node, *used_nodes); | ||
5934 | return best_node; | ||
5935 | } | ||
5936 | |||
5937 | /** | ||
5938 | * sched_domain_node_span - get a cpumask for a node's sched_domain | ||
5939 | * @node: node whose cpumask we're constructing | ||
5940 | * @span: resulting cpumask | ||
5941 | * | ||
5942 | * Given a node, construct a good cpumask for its sched_domain to span. It | ||
5943 | * should be one that prevents unnecessary balancing, but also spreads tasks | ||
5944 | * out optimally. | ||
5945 | */ | ||
5946 | static void sched_domain_node_span(int node, struct cpumask *span) | ||
5947 | { | ||
5948 | nodemask_t used_nodes; | ||
5949 | int i; | ||
5950 | |||
5951 | cpumask_clear(span); | ||
5952 | nodes_clear(used_nodes); | ||
5953 | |||
5954 | cpumask_or(span, span, cpumask_of_node(node)); | ||
5955 | node_set(node, used_nodes); | ||
5956 | |||
5957 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | ||
5958 | int next_node = find_next_best_node(node, &used_nodes); | ||
5959 | if (next_node < 0) | ||
5960 | break; | ||
5961 | cpumask_or(span, span, cpumask_of_node(next_node)); | ||
5962 | } | ||
5963 | } | ||
5964 | |||
5965 | static const struct cpumask *cpu_node_mask(int cpu) | ||
5966 | { | ||
5967 | lockdep_assert_held(&sched_domains_mutex); | ||
5968 | |||
5969 | sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); | ||
5970 | |||
5971 | return sched_domains_tmpmask; | ||
5972 | } | ||
5973 | |||
5974 | static const struct cpumask *cpu_allnodes_mask(int cpu) | ||
5975 | { | ||
5976 | return cpu_possible_mask; | ||
5977 | } | ||
5978 | #endif /* CONFIG_NUMA */ | ||
5979 | |||
5980 | static const struct cpumask *cpu_cpu_mask(int cpu) | 5927 | static const struct cpumask *cpu_cpu_mask(int cpu) |
5981 | { | 5928 | { |
5982 | return cpumask_of_node(cpu_to_node(cpu)); | 5929 | return cpumask_of_node(cpu_to_node(cpu)); |
5983 | } | 5930 | } |
5984 | 5931 | ||
5985 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | ||
5986 | |||
5987 | struct sd_data { | 5932 | struct sd_data { |
5988 | struct sched_domain **__percpu sd; | 5933 | struct sched_domain **__percpu sd; |
5989 | struct sched_group **__percpu sg; | 5934 | struct sched_group **__percpu sg; |
@@ -6013,6 +5958,7 @@ struct sched_domain_topology_level { | |||
6013 | sched_domain_init_f init; | 5958 | sched_domain_init_f init; |
6014 | sched_domain_mask_f mask; | 5959 | sched_domain_mask_f mask; |
6015 | int flags; | 5960 | int flags; |
5961 | int numa_level; | ||
6016 | struct sd_data data; | 5962 | struct sd_data data; |
6017 | }; | 5963 | }; |
6018 | 5964 | ||
@@ -6204,10 +6150,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ | |||
6204 | } | 6150 | } |
6205 | 6151 | ||
6206 | SD_INIT_FUNC(CPU) | 6152 | SD_INIT_FUNC(CPU) |
6207 | #ifdef CONFIG_NUMA | ||
6208 | SD_INIT_FUNC(ALLNODES) | ||
6209 | SD_INIT_FUNC(NODE) | ||
6210 | #endif | ||
6211 | #ifdef CONFIG_SCHED_SMT | 6153 | #ifdef CONFIG_SCHED_SMT |
6212 | SD_INIT_FUNC(SIBLING) | 6154 | SD_INIT_FUNC(SIBLING) |
6213 | #endif | 6155 | #endif |
@@ -6329,15 +6271,184 @@ static struct sched_domain_topology_level default_topology[] = { | |||
6329 | { sd_init_BOOK, cpu_book_mask, }, | 6271 | { sd_init_BOOK, cpu_book_mask, }, |
6330 | #endif | 6272 | #endif |
6331 | { sd_init_CPU, cpu_cpu_mask, }, | 6273 | { sd_init_CPU, cpu_cpu_mask, }, |
6332 | #ifdef CONFIG_NUMA | ||
6333 | { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, | ||
6334 | { sd_init_ALLNODES, cpu_allnodes_mask, }, | ||
6335 | #endif | ||
6336 | { NULL, }, | 6274 | { NULL, }, |
6337 | }; | 6275 | }; |
6338 | 6276 | ||
6339 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; | 6277 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; |
6340 | 6278 | ||
6279 | #ifdef CONFIG_NUMA | ||
6280 | |||
6281 | static int sched_domains_numa_levels; | ||
6282 | static int sched_domains_numa_scale; | ||
6283 | static int *sched_domains_numa_distance; | ||
6284 | static struct cpumask ***sched_domains_numa_masks; | ||
6285 | static int sched_domains_curr_level; | ||
6286 | |||
6287 | static inline int sd_local_flags(int level) | ||
6288 | { | ||
6289 | if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) | ||
6290 | return 0; | ||
6291 | |||
6292 | return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; | ||
6293 | } | ||
6294 | |||
6295 | static struct sched_domain * | ||
6296 | sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | ||
6297 | { | ||
6298 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); | ||
6299 | int level = tl->numa_level; | ||
6300 | int sd_weight = cpumask_weight( | ||
6301 | sched_domains_numa_masks[level][cpu_to_node(cpu)]); | ||
6302 | |||
6303 | *sd = (struct sched_domain){ | ||
6304 | .min_interval = sd_weight, | ||
6305 | .max_interval = 2*sd_weight, | ||
6306 | .busy_factor = 32, | ||
6307 | .imbalance_pct = 125, | ||
6308 | .cache_nice_tries = 2, | ||
6309 | .busy_idx = 3, | ||
6310 | .idle_idx = 2, | ||
6311 | .newidle_idx = 0, | ||
6312 | .wake_idx = 0, | ||
6313 | .forkexec_idx = 0, | ||
6314 | |||
6315 | .flags = 1*SD_LOAD_BALANCE | ||
6316 | | 1*SD_BALANCE_NEWIDLE | ||
6317 | | 0*SD_BALANCE_EXEC | ||
6318 | | 0*SD_BALANCE_FORK | ||
6319 | | 0*SD_BALANCE_WAKE | ||
6320 | | 0*SD_WAKE_AFFINE | ||
6321 | | 0*SD_PREFER_LOCAL | ||
6322 | | 0*SD_SHARE_CPUPOWER | ||
6323 | | 0*SD_SHARE_PKG_RESOURCES | ||
6324 | | 1*SD_SERIALIZE | ||
6325 | | 0*SD_PREFER_SIBLING | ||
6326 | | sd_local_flags(level) | ||
6327 | , | ||
6328 | .last_balance = jiffies, | ||
6329 | .balance_interval = sd_weight, | ||
6330 | }; | ||
6331 | SD_INIT_NAME(sd, NUMA); | ||
6332 | sd->private = &tl->data; | ||
6333 | |||
6334 | /* | ||
6335 | * Ugly hack to pass state to sd_numa_mask()... | ||
6336 | */ | ||
6337 | sched_domains_curr_level = tl->numa_level; | ||
6338 | |||
6339 | return sd; | ||
6340 | } | ||
6341 | |||
6342 | static const struct cpumask *sd_numa_mask(int cpu) | ||
6343 | { | ||
6344 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; | ||
6345 | } | ||
6346 | |||
6347 | static void sched_init_numa(void) | ||
6348 | { | ||
6349 | int next_distance, curr_distance = node_distance(0, 0); | ||
6350 | struct sched_domain_topology_level *tl; | ||
6351 | int level = 0; | ||
6352 | int i, j, k; | ||
6353 | |||
6354 | sched_domains_numa_scale = curr_distance; | ||
6355 | sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); | ||
6356 | if (!sched_domains_numa_distance) | ||
6357 | return; | ||
6358 | |||
6359 | /* | ||
6360 | * O(nr_nodes^2) deduplicating selection sort -- in order to find the | ||
6361 | * unique distances in the node_distance() table. | ||
6362 | * | ||
6363 | * Assumes node_distance(0,j) includes all distances in | ||
6364 | * node_distance(i,j) in order to avoid cubic time. | ||
6365 | * | ||
6366 | * XXX: could be optimized to O(n log n) by using sort() | ||
6367 | */ | ||
6368 | next_distance = curr_distance; | ||
6369 | for (i = 0; i < nr_node_ids; i++) { | ||
6370 | for (j = 0; j < nr_node_ids; j++) { | ||
6371 | int distance = node_distance(0, j); | ||
6372 | if (distance > curr_distance && | ||
6373 | (distance < next_distance || | ||
6374 | next_distance == curr_distance)) | ||
6375 | next_distance = distance; | ||
6376 | } | ||
6377 | if (next_distance != curr_distance) { | ||
6378 | sched_domains_numa_distance[level++] = next_distance; | ||
6379 | sched_domains_numa_levels = level; | ||
6380 | curr_distance = next_distance; | ||
6381 | } else break; | ||
6382 | } | ||
6383 | /* | ||
6384 | * 'level' contains the number of unique distances, excluding the | ||
6385 | * identity distance node_distance(i,i). | ||
6386 | * | ||
6387 | * The sched_domains_numa_distance[] array includes the actual distance | ||
6388 | * numbers. | ||
6389 | */ | ||
6390 | |||
6391 | sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); | ||
6392 | if (!sched_domains_numa_masks) | ||
6393 | return; | ||
6394 | |||
6395 | /* | ||
6396 | * Now for each level, construct a mask per node which contains all | ||
6397 | * cpus of nodes that are that many hops away from us. | ||
6398 | */ | ||
6399 | for (i = 0; i < level; i++) { | ||
6400 | sched_domains_numa_masks[i] = | ||
6401 | kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); | ||
6402 | if (!sched_domains_numa_masks[i]) | ||
6403 | return; | ||
6404 | |||
6405 | for (j = 0; j < nr_node_ids; j++) { | ||
6406 | struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j); | ||
6407 | if (!mask) | ||
6408 | return; | ||
6409 | |||
6410 | sched_domains_numa_masks[i][j] = mask; | ||
6411 | |||
6412 | for (k = 0; k < nr_node_ids; k++) { | ||
6413 | if (node_distance(j, k) > sched_domains_numa_distance[i]) | ||
6414 | continue; | ||
6415 | |||
6416 | cpumask_or(mask, mask, cpumask_of_node(k)); | ||
6417 | } | ||
6418 | } | ||
6419 | } | ||
6420 | |||
6421 | tl = kzalloc((ARRAY_SIZE(default_topology) + level) * | ||
6422 | sizeof(struct sched_domain_topology_level), GFP_KERNEL); | ||
6423 | if (!tl) | ||
6424 | return; | ||
6425 | |||
6426 | /* | ||
6427 | * Copy the default topology bits.. | ||
6428 | */ | ||
6429 | for (i = 0; default_topology[i].init; i++) | ||
6430 | tl[i] = default_topology[i]; | ||
6431 | |||
6432 | /* | ||
6433 | * .. and append 'j' levels of NUMA goodness. | ||
6434 | */ | ||
6435 | for (j = 0; j < level; i++, j++) { | ||
6436 | tl[i] = (struct sched_domain_topology_level){ | ||
6437 | .init = sd_numa_init, | ||
6438 | .mask = sd_numa_mask, | ||
6439 | .flags = SDTL_OVERLAP, | ||
6440 | .numa_level = j, | ||
6441 | }; | ||
6442 | } | ||
6443 | |||
6444 | sched_domain_topology = tl; | ||
6445 | } | ||
6446 | #else | ||
6447 | static inline void sched_init_numa(void) | ||
6448 | { | ||
6449 | } | ||
6450 | #endif /* CONFIG_NUMA */ | ||
6451 | |||
6341 | static int __sdt_alloc(const struct cpumask *cpu_map) | 6452 | static int __sdt_alloc(const struct cpumask *cpu_map) |
6342 | { | 6453 | { |
6343 | struct sched_domain_topology_level *tl; | 6454 | struct sched_domain_topology_level *tl; |
@@ -6375,6 +6486,8 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
6375 | if (!sg) | 6486 | if (!sg) |
6376 | return -ENOMEM; | 6487 | return -ENOMEM; |
6377 | 6488 | ||
6489 | sg->next = sg; | ||
6490 | |||
6378 | *per_cpu_ptr(sdd->sg, j) = sg; | 6491 | *per_cpu_ptr(sdd->sg, j) = sg; |
6379 | 6492 | ||
6380 | sgp = kzalloc_node(sizeof(struct sched_group_power), | 6493 | sgp = kzalloc_node(sizeof(struct sched_group_power), |
@@ -6398,16 +6511,26 @@ static void __sdt_free(const struct cpumask *cpu_map) | |||
6398 | struct sd_data *sdd = &tl->data; | 6511 | struct sd_data *sdd = &tl->data; |
6399 | 6512 | ||
6400 | for_each_cpu(j, cpu_map) { | 6513 | for_each_cpu(j, cpu_map) { |
6401 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); | 6514 | struct sched_domain *sd; |
6402 | if (sd && (sd->flags & SD_OVERLAP)) | 6515 | |
6403 | free_sched_groups(sd->groups, 0); | 6516 | if (sdd->sd) { |
6404 | kfree(*per_cpu_ptr(sdd->sd, j)); | 6517 | sd = *per_cpu_ptr(sdd->sd, j); |
6405 | kfree(*per_cpu_ptr(sdd->sg, j)); | 6518 | if (sd && (sd->flags & SD_OVERLAP)) |
6406 | kfree(*per_cpu_ptr(sdd->sgp, j)); | 6519 | free_sched_groups(sd->groups, 0); |
6520 | kfree(*per_cpu_ptr(sdd->sd, j)); | ||
6521 | } | ||
6522 | |||
6523 | if (sdd->sg) | ||
6524 | kfree(*per_cpu_ptr(sdd->sg, j)); | ||
6525 | if (sdd->sgp) | ||
6526 | kfree(*per_cpu_ptr(sdd->sgp, j)); | ||
6407 | } | 6527 | } |
6408 | free_percpu(sdd->sd); | 6528 | free_percpu(sdd->sd); |
6529 | sdd->sd = NULL; | ||
6409 | free_percpu(sdd->sg); | 6530 | free_percpu(sdd->sg); |
6531 | sdd->sg = NULL; | ||
6410 | free_percpu(sdd->sgp); | 6532 | free_percpu(sdd->sgp); |
6533 | sdd->sgp = NULL; | ||
6411 | } | 6534 | } |
6412 | } | 6535 | } |
6413 | 6536 | ||
@@ -6693,97 +6816,6 @@ match2: | |||
6693 | mutex_unlock(&sched_domains_mutex); | 6816 | mutex_unlock(&sched_domains_mutex); |
6694 | } | 6817 | } |
6695 | 6818 | ||
6696 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
6697 | static void reinit_sched_domains(void) | ||
6698 | { | ||
6699 | get_online_cpus(); | ||
6700 | |||
6701 | /* Destroy domains first to force the rebuild */ | ||
6702 | partition_sched_domains(0, NULL, NULL); | ||
6703 | |||
6704 | rebuild_sched_domains(); | ||
6705 | put_online_cpus(); | ||
6706 | } | ||
6707 | |||
6708 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | ||
6709 | { | ||
6710 | unsigned int level = 0; | ||
6711 | |||
6712 | if (sscanf(buf, "%u", &level) != 1) | ||
6713 | return -EINVAL; | ||
6714 | |||
6715 | /* | ||
6716 | * level is always be positive so don't check for | ||
6717 | * level < POWERSAVINGS_BALANCE_NONE which is 0 | ||
6718 | * What happens on 0 or 1 byte write, | ||
6719 | * need to check for count as well? | ||
6720 | */ | ||
6721 | |||
6722 | if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) | ||
6723 | return -EINVAL; | ||
6724 | |||
6725 | if (smt) | ||
6726 | sched_smt_power_savings = level; | ||
6727 | else | ||
6728 | sched_mc_power_savings = level; | ||
6729 | |||
6730 | reinit_sched_domains(); | ||
6731 | |||
6732 | return count; | ||
6733 | } | ||
6734 | |||
6735 | #ifdef CONFIG_SCHED_MC | ||
6736 | static ssize_t sched_mc_power_savings_show(struct device *dev, | ||
6737 | struct device_attribute *attr, | ||
6738 | char *buf) | ||
6739 | { | ||
6740 | return sprintf(buf, "%u\n", sched_mc_power_savings); | ||
6741 | } | ||
6742 | static ssize_t sched_mc_power_savings_store(struct device *dev, | ||
6743 | struct device_attribute *attr, | ||
6744 | const char *buf, size_t count) | ||
6745 | { | ||
6746 | return sched_power_savings_store(buf, count, 0); | ||
6747 | } | ||
6748 | static DEVICE_ATTR(sched_mc_power_savings, 0644, | ||
6749 | sched_mc_power_savings_show, | ||
6750 | sched_mc_power_savings_store); | ||
6751 | #endif | ||
6752 | |||
6753 | #ifdef CONFIG_SCHED_SMT | ||
6754 | static ssize_t sched_smt_power_savings_show(struct device *dev, | ||
6755 | struct device_attribute *attr, | ||
6756 | char *buf) | ||
6757 | { | ||
6758 | return sprintf(buf, "%u\n", sched_smt_power_savings); | ||
6759 | } | ||
6760 | static ssize_t sched_smt_power_savings_store(struct device *dev, | ||
6761 | struct device_attribute *attr, | ||
6762 | const char *buf, size_t count) | ||
6763 | { | ||
6764 | return sched_power_savings_store(buf, count, 1); | ||
6765 | } | ||
6766 | static DEVICE_ATTR(sched_smt_power_savings, 0644, | ||
6767 | sched_smt_power_savings_show, | ||
6768 | sched_smt_power_savings_store); | ||
6769 | #endif | ||
6770 | |||
6771 | int __init sched_create_sysfs_power_savings_entries(struct device *dev) | ||
6772 | { | ||
6773 | int err = 0; | ||
6774 | |||
6775 | #ifdef CONFIG_SCHED_SMT | ||
6776 | if (smt_capable()) | ||
6777 | err = device_create_file(dev, &dev_attr_sched_smt_power_savings); | ||
6778 | #endif | ||
6779 | #ifdef CONFIG_SCHED_MC | ||
6780 | if (!err && mc_capable()) | ||
6781 | err = device_create_file(dev, &dev_attr_sched_mc_power_savings); | ||
6782 | #endif | ||
6783 | return err; | ||
6784 | } | ||
6785 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
6786 | |||
6787 | /* | 6819 | /* |
6788 | * Update cpusets according to cpu_active mask. If cpusets are | 6820 | * Update cpusets according to cpu_active mask. If cpusets are |
6789 | * disabled, cpuset_update_active_cpus() becomes a simple wrapper | 6821 | * disabled, cpuset_update_active_cpus() becomes a simple wrapper |
@@ -6821,6 +6853,8 @@ void __init sched_init_smp(void) | |||
6821 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); | 6853 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); |
6822 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | 6854 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); |
6823 | 6855 | ||
6856 | sched_init_numa(); | ||
6857 | |||
6824 | get_online_cpus(); | 6858 | get_online_cpus(); |
6825 | mutex_lock(&sched_domains_mutex); | 6859 | mutex_lock(&sched_domains_mutex); |
6826 | init_sched_domains(cpu_active_mask); | 6860 | init_sched_domains(cpu_active_mask); |
@@ -7042,6 +7076,7 @@ void __init sched_init(void) | |||
7042 | /* May be allocated at isolcpus cmdline parse time */ | 7076 | /* May be allocated at isolcpus cmdline parse time */ |
7043 | if (cpu_isolated_map == NULL) | 7077 | if (cpu_isolated_map == NULL) |
7044 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 7078 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
7079 | idle_thread_set_boot_cpu(); | ||
7045 | #endif | 7080 | #endif |
7046 | init_sched_fair_class(); | 7081 | init_sched_fair_class(); |
7047 | 7082 | ||
@@ -7963,13 +7998,9 @@ static struct cftype cpu_files[] = { | |||
7963 | .write_u64 = cpu_rt_period_write_uint, | 7998 | .write_u64 = cpu_rt_period_write_uint, |
7964 | }, | 7999 | }, |
7965 | #endif | 8000 | #endif |
8001 | { } /* terminate */ | ||
7966 | }; | 8002 | }; |
7967 | 8003 | ||
7968 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) | ||
7969 | { | ||
7970 | return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); | ||
7971 | } | ||
7972 | |||
7973 | struct cgroup_subsys cpu_cgroup_subsys = { | 8004 | struct cgroup_subsys cpu_cgroup_subsys = { |
7974 | .name = "cpu", | 8005 | .name = "cpu", |
7975 | .create = cpu_cgroup_create, | 8006 | .create = cpu_cgroup_create, |
@@ -7977,8 +8008,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
7977 | .can_attach = cpu_cgroup_can_attach, | 8008 | .can_attach = cpu_cgroup_can_attach, |
7978 | .attach = cpu_cgroup_attach, | 8009 | .attach = cpu_cgroup_attach, |
7979 | .exit = cpu_cgroup_exit, | 8010 | .exit = cpu_cgroup_exit, |
7980 | .populate = cpu_cgroup_populate, | ||
7981 | .subsys_id = cpu_cgroup_subsys_id, | 8011 | .subsys_id = cpu_cgroup_subsys_id, |
8012 | .base_cftypes = cpu_files, | ||
7982 | .early_init = 1, | 8013 | .early_init = 1, |
7983 | }; | 8014 | }; |
7984 | 8015 | ||
@@ -8163,13 +8194,9 @@ static struct cftype files[] = { | |||
8163 | .name = "stat", | 8194 | .name = "stat", |
8164 | .read_map = cpuacct_stats_show, | 8195 | .read_map = cpuacct_stats_show, |
8165 | }, | 8196 | }, |
8197 | { } /* terminate */ | ||
8166 | }; | 8198 | }; |
8167 | 8199 | ||
8168 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) | ||
8169 | { | ||
8170 | return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); | ||
8171 | } | ||
8172 | |||
8173 | /* | 8200 | /* |
8174 | * charge this task's execution time to its accounting group. | 8201 | * charge this task's execution time to its accounting group. |
8175 | * | 8202 | * |
@@ -8201,7 +8228,7 @@ struct cgroup_subsys cpuacct_subsys = { | |||
8201 | .name = "cpuacct", | 8228 | .name = "cpuacct", |
8202 | .create = cpuacct_create, | 8229 | .create = cpuacct_create, |
8203 | .destroy = cpuacct_destroy, | 8230 | .destroy = cpuacct_destroy, |
8204 | .populate = cpuacct_populate, | ||
8205 | .subsys_id = cpuacct_subsys_id, | 8231 | .subsys_id = cpuacct_subsys_id, |
8232 | .base_cftypes = files, | ||
8206 | }; | 8233 | }; |
8207 | #endif /* CONFIG_CGROUP_CPUACCT */ | 8234 | #endif /* CONFIG_CGROUP_CPUACCT */ |
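The two cgroup hunks above (cpu and cpuacct) drop their .populate callbacks: the cftype arrays gain an empty terminator entry and are handed to the core via .base_cftypes, which walks them until it hits the sentinel instead of being given a length. A userspace sketch of that sentinel-terminated table pattern, with invented types rather than the real cftype/cgroup_subsys definitions:

    #include <stdio.h>

    struct file_entry {
        const char *name;                   /* NULL name terminates the table */
    };

    static const struct file_entry cpu_files[] = {
        { .name = "shares" },
        { .name = "cfs_quota_us" },
        { .name = "rt_period_us" },
        { NULL }                            /* terminator, written as { } in the kernel */
    };

    static void register_files(const struct file_entry *tbl)
    {
        for (; tbl->name; tbl++)            /* no ARRAY_SIZE needed by the caller */
            printf("registering %s\n", tbl->name);
    }

    int main(void)
    {
        register_files(cpu_files);
        return 0;
    }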
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 09acaa15161..6f79596e0ea 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -202,7 +202,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
202 | SPLIT_NS(spread0)); | 202 | SPLIT_NS(spread0)); |
203 | SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", | 203 | SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", |
204 | cfs_rq->nr_spread_over); | 204 | cfs_rq->nr_spread_over); |
205 | SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); | 205 | SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); |
206 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | 206 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); |
207 | #ifdef CONFIG_FAIR_GROUP_SCHED | 207 | #ifdef CONFIG_FAIR_GROUP_SCHED |
208 | #ifdef CONFIG_SMP | 208 | #ifdef CONFIG_SMP |
@@ -260,8 +260,14 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
260 | SEQ_printf(m, "\ncpu#%d\n", cpu); | 260 | SEQ_printf(m, "\ncpu#%d\n", cpu); |
261 | #endif | 261 | #endif |
262 | 262 | ||
263 | #define P(x) \ | 263 | #define P(x) \ |
264 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) | 264 | do { \ |
265 | if (sizeof(rq->x) == 4) \ | ||
266 | SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \ | ||
267 | else \ | ||
268 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\ | ||
269 | } while (0) | ||
270 | |||
265 | #define PN(x) \ | 271 | #define PN(x) \ |
266 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) | 272 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) |
267 | 273 | ||
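The debug.c hunk above changes P(x) to choose its printf format from sizeof(rq->x), so 32-bit fields are printed as such instead of being cast through long long. A standalone sketch of the same sizeof-dispatch trick, with a stand-in struct instead of the scheduler's struct rq:

    #include <stdio.h>

    struct rq_like {
        unsigned int nr_running;            /* 4-byte field */
        unsigned long long clock;           /* 8-byte field */
    };

    #define P(rq, x)                                                        \
    do {                                                                    \
        if (sizeof((rq)->x) == 4)                                           \
            printf("  .%-30s: %ld\n", #x, (long)(rq)->x);                   \
        else                                                                \
            printf("  .%-30s: %lld\n", #x, (long long)(rq)->x);             \
    } while (0)

    int main(void)
    {
        struct rq_like rq = { .nr_running = 3, .clock = 123456789ULL };
        P(&rq, nr_running);
        P(&rq, clock);
        return 0;
    }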
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0d97ebdc58f..940e6d17cf9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -784,7 +784,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
784 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); | 784 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); |
785 | #ifdef CONFIG_SMP | 785 | #ifdef CONFIG_SMP |
786 | if (entity_is_task(se)) | 786 | if (entity_is_task(se)) |
787 | list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); | 787 | list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); |
788 | #endif | 788 | #endif |
789 | cfs_rq->nr_running++; | 789 | cfs_rq->nr_running++; |
790 | } | 790 | } |
@@ -2721,7 +2721,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
2721 | * If power savings logic is enabled for a domain, see if we | 2721 | * If power savings logic is enabled for a domain, see if we |
2722 | * are not overloaded, if so, don't balance wider. | 2722 | * are not overloaded, if so, don't balance wider. |
2723 | */ | 2723 | */ |
2724 | if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { | 2724 | if (tmp->flags & (SD_PREFER_LOCAL)) { |
2725 | unsigned long power = 0; | 2725 | unsigned long power = 0; |
2726 | unsigned long nr_running = 0; | 2726 | unsigned long nr_running = 0; |
2727 | unsigned long capacity; | 2727 | unsigned long capacity; |
@@ -2734,9 +2734,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
2734 | 2734 | ||
2735 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); | 2735 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); |
2736 | 2736 | ||
2737 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | ||
2738 | nr_running /= 2; | ||
2739 | |||
2740 | if (nr_running < capacity) | 2737 | if (nr_running < capacity) |
2741 | want_sd = 0; | 2738 | want_sd = 0; |
2742 | } | 2739 | } |
@@ -3082,7 +3079,7 @@ struct lb_env { | |||
3082 | struct rq *dst_rq; | 3079 | struct rq *dst_rq; |
3083 | 3080 | ||
3084 | enum cpu_idle_type idle; | 3081 | enum cpu_idle_type idle; |
3085 | long load_move; | 3082 | long imbalance; |
3086 | unsigned int flags; | 3083 | unsigned int flags; |
3087 | 3084 | ||
3088 | unsigned int loop; | 3085 | unsigned int loop; |
@@ -3215,8 +3212,10 @@ static int move_one_task(struct lb_env *env) | |||
3215 | 3212 | ||
3216 | static unsigned long task_h_load(struct task_struct *p); | 3213 | static unsigned long task_h_load(struct task_struct *p); |
3217 | 3214 | ||
3215 | static const unsigned int sched_nr_migrate_break = 32; | ||
3216 | |||
3218 | /* | 3217 | /* |
3219 | * move_tasks tries to move up to load_move weighted load from busiest to | 3218 | * move_tasks tries to move up to imbalance weighted load from busiest to |
3220 | * this_rq, as part of a balancing operation within domain "sd". | 3219 | * this_rq, as part of a balancing operation within domain "sd". |
3221 | * Returns 1 if successful and 0 otherwise. | 3220 | * Returns 1 if successful and 0 otherwise. |
3222 | * | 3221 | * |
@@ -3229,7 +3228,7 @@ static int move_tasks(struct lb_env *env) | |||
3229 | unsigned long load; | 3228 | unsigned long load; |
3230 | int pulled = 0; | 3229 | int pulled = 0; |
3231 | 3230 | ||
3232 | if (env->load_move <= 0) | 3231 | if (env->imbalance <= 0) |
3233 | return 0; | 3232 | return 0; |
3234 | 3233 | ||
3235 | while (!list_empty(tasks)) { | 3234 | while (!list_empty(tasks)) { |
@@ -3242,7 +3241,7 @@ static int move_tasks(struct lb_env *env) | |||
3242 | 3241 | ||
3243 | /* take a breather every nr_migrate tasks */ | 3242 | /* take a breather every nr_migrate tasks */ |
3244 | if (env->loop > env->loop_break) { | 3243 | if (env->loop > env->loop_break) { |
3245 | env->loop_break += sysctl_sched_nr_migrate; | 3244 | env->loop_break += sched_nr_migrate_break; |
3246 | env->flags |= LBF_NEED_BREAK; | 3245 | env->flags |= LBF_NEED_BREAK; |
3247 | break; | 3246 | break; |
3248 | } | 3247 | } |
@@ -3252,10 +3251,10 @@ static int move_tasks(struct lb_env *env) | |||
3252 | 3251 | ||
3253 | load = task_h_load(p); | 3252 | load = task_h_load(p); |
3254 | 3253 | ||
3255 | if (load < 16 && !env->sd->nr_balance_failed) | 3254 | if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) |
3256 | goto next; | 3255 | goto next; |
3257 | 3256 | ||
3258 | if ((load / 2) > env->load_move) | 3257 | if ((load / 2) > env->imbalance) |
3259 | goto next; | 3258 | goto next; |
3260 | 3259 | ||
3261 | if (!can_migrate_task(p, env)) | 3260 | if (!can_migrate_task(p, env)) |
@@ -3263,7 +3262,7 @@ static int move_tasks(struct lb_env *env) | |||
3263 | 3262 | ||
3264 | move_task(p, env); | 3263 | move_task(p, env); |
3265 | pulled++; | 3264 | pulled++; |
3266 | env->load_move -= load; | 3265 | env->imbalance -= load; |
3267 | 3266 | ||
3268 | #ifdef CONFIG_PREEMPT | 3267 | #ifdef CONFIG_PREEMPT |
3269 | /* | 3268 | /* |
@@ -3279,7 +3278,7 @@ static int move_tasks(struct lb_env *env) | |||
3279 | * We only want to steal up to the prescribed amount of | 3278 | * We only want to steal up to the prescribed amount of |
3280 | * weighted load. | 3279 | * weighted load. |
3281 | */ | 3280 | */ |
3282 | if (env->load_move <= 0) | 3281 | if (env->imbalance <= 0) |
3283 | break; | 3282 | break; |
3284 | 3283 | ||
3285 | continue; | 3284 | continue; |
@@ -3433,14 +3432,6 @@ struct sd_lb_stats { | |||
3433 | unsigned int busiest_group_weight; | 3432 | unsigned int busiest_group_weight; |
3434 | 3433 | ||
3435 | int group_imb; /* Is there imbalance in this sd */ | 3434 | int group_imb; /* Is there imbalance in this sd */ |
3436 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
3437 | int power_savings_balance; /* Is powersave balance needed for this sd */ | ||
3438 | struct sched_group *group_min; /* Least loaded group in sd */ | ||
3439 | struct sched_group *group_leader; /* Group which relieves group_min */ | ||
3440 | unsigned long min_load_per_task; /* load_per_task in group_min */ | ||
3441 | unsigned long leader_nr_running; /* Nr running of group_leader */ | ||
3442 | unsigned long min_nr_running; /* Nr running of group_min */ | ||
3443 | #endif | ||
3444 | }; | 3435 | }; |
3445 | 3436 | ||
3446 | /* | 3437 | /* |
@@ -3484,148 +3475,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, | |||
3484 | return load_idx; | 3475 | return load_idx; |
3485 | } | 3476 | } |
3486 | 3477 | ||
3487 | |||
3488 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
3489 | /** | ||
3490 | * init_sd_power_savings_stats - Initialize power savings statistics for | ||
3491 | * the given sched_domain, during load balancing. | ||
3492 | * | ||
3493 | * @sd: Sched domain whose power-savings statistics are to be initialized. | ||
3494 | * @sds: Variable containing the statistics for sd. | ||
3495 | * @idle: Idle status of the CPU at which we're performing load-balancing. | ||
3496 | */ | ||
3497 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
3498 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
3499 | { | ||
3500 | /* | ||
3501 | * Busy processors will not participate in power savings | ||
3502 | * balance. | ||
3503 | */ | ||
3504 | if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
3505 | sds->power_savings_balance = 0; | ||
3506 | else { | ||
3507 | sds->power_savings_balance = 1; | ||
3508 | sds->min_nr_running = ULONG_MAX; | ||
3509 | sds->leader_nr_running = 0; | ||
3510 | } | ||
3511 | } | ||
3512 | |||
3513 | /** | ||
3514 | * update_sd_power_savings_stats - Update the power saving stats for a | ||
3515 | * sched_domain while performing load balancing. | ||
3516 | * | ||
3517 | * @group: sched_group belonging to the sched_domain under consideration. | ||
3518 | * @sds: Variable containing the statistics of the sched_domain | ||
3519 | * @local_group: Does group contain the CPU for which we're performing | ||
3520 | * load balancing ? | ||
3521 | * @sgs: Variable containing the statistics of the group. | ||
3522 | */ | ||
3523 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
3524 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
3525 | { | ||
3526 | |||
3527 | if (!sds->power_savings_balance) | ||
3528 | return; | ||
3529 | |||
3530 | /* | ||
3531 | * If the local group is idle or completely loaded | ||
3532 | * no need to do power savings balance at this domain | ||
3533 | */ | ||
3534 | if (local_group && (sds->this_nr_running >= sgs->group_capacity || | ||
3535 | !sds->this_nr_running)) | ||
3536 | sds->power_savings_balance = 0; | ||
3537 | |||
3538 | /* | ||
3539 | * If a group is already running at full capacity or idle, | ||
3540 | * don't include that group in power savings calculations | ||
3541 | */ | ||
3542 | if (!sds->power_savings_balance || | ||
3543 | sgs->sum_nr_running >= sgs->group_capacity || | ||
3544 | !sgs->sum_nr_running) | ||
3545 | return; | ||
3546 | |||
3547 | /* | ||
3548 | * Calculate the group which has the least non-idle load. | ||
3549 | * This is the group from where we need to pick up the load | ||
3550 | * for saving power | ||
3551 | */ | ||
3552 | if ((sgs->sum_nr_running < sds->min_nr_running) || | ||
3553 | (sgs->sum_nr_running == sds->min_nr_running && | ||
3554 | group_first_cpu(group) > group_first_cpu(sds->group_min))) { | ||
3555 | sds->group_min = group; | ||
3556 | sds->min_nr_running = sgs->sum_nr_running; | ||
3557 | sds->min_load_per_task = sgs->sum_weighted_load / | ||
3558 | sgs->sum_nr_running; | ||
3559 | } | ||
3560 | |||
3561 | /* | ||
3562 | * Calculate the group which is almost near its | ||
3563 | * capacity but still has some space to pick up some load | ||
3564 | * from other group and save more power | ||
3565 | */ | ||
3566 | if (sgs->sum_nr_running + 1 > sgs->group_capacity) | ||
3567 | return; | ||
3568 | |||
3569 | if (sgs->sum_nr_running > sds->leader_nr_running || | ||
3570 | (sgs->sum_nr_running == sds->leader_nr_running && | ||
3571 | group_first_cpu(group) < group_first_cpu(sds->group_leader))) { | ||
3572 | sds->group_leader = group; | ||
3573 | sds->leader_nr_running = sgs->sum_nr_running; | ||
3574 | } | ||
3575 | } | ||
3576 | |||
3577 | /** | ||
3578 | * check_power_save_busiest_group - see if there is potential for some power-savings balance | ||
3579 | * @sds: Variable containing the statistics of the sched_domain | ||
3580 | * under consideration. | ||
3581 | * @this_cpu: Cpu at which we're currently performing load-balancing. | ||
3582 | * @imbalance: Variable to store the imbalance. | ||
3583 | * | ||
3584 | * Description: | ||
3585 | * Check if we have potential to perform some power-savings balance. | ||
3586 | * If yes, set the busiest group to be the least loaded group in the | ||
3587 | * sched_domain, so that it's CPUs can be put to idle. | ||
3588 | * | ||
3589 | * Returns 1 if there is potential to perform power-savings balance. | ||
3590 | * Else returns 0. | ||
3591 | */ | ||
3592 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
3593 | int this_cpu, unsigned long *imbalance) | ||
3594 | { | ||
3595 | if (!sds->power_savings_balance) | ||
3596 | return 0; | ||
3597 | |||
3598 | if (sds->this != sds->group_leader || | ||
3599 | sds->group_leader == sds->group_min) | ||
3600 | return 0; | ||
3601 | |||
3602 | *imbalance = sds->min_load_per_task; | ||
3603 | sds->busiest = sds->group_min; | ||
3604 | |||
3605 | return 1; | ||
3606 | |||
3607 | } | ||
3608 | #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
3609 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
3610 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
3611 | { | ||
3612 | return; | ||
3613 | } | ||
3614 | |||
3615 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
3616 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
3617 | { | ||
3618 | return; | ||
3619 | } | ||
3620 | |||
3621 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
3622 | int this_cpu, unsigned long *imbalance) | ||
3623 | { | ||
3624 | return 0; | ||
3625 | } | ||
3626 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
3627 | |||
3628 | |||
3629 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | 3478 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) |
3630 | { | 3479 | { |
3631 | return SCHED_POWER_SCALE; | 3480 | return SCHED_POWER_SCALE; |
@@ -3763,24 +3612,22 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
3763 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 3612 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
3764 | * @sd: The sched_domain whose statistics are to be updated. | 3613 | * @sd: The sched_domain whose statistics are to be updated. |
3765 | * @group: sched_group whose statistics are to be updated. | 3614 | * @group: sched_group whose statistics are to be updated. |
3766 | * @this_cpu: Cpu for which load balance is currently performed. | ||
3767 | * @idle: Idle status of this_cpu | ||
3768 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 3615 | * @load_idx: Load index of sched_domain of this_cpu for load calc. |
3769 | * @local_group: Does group contain this_cpu. | 3616 | * @local_group: Does group contain this_cpu. |
3770 | * @cpus: Set of cpus considered for load balancing. | 3617 | * @cpus: Set of cpus considered for load balancing. |
3771 | * @balance: Should we balance. | 3618 | * @balance: Should we balance. |
3772 | * @sgs: variable to hold the statistics for this group. | 3619 | * @sgs: variable to hold the statistics for this group. |
3773 | */ | 3620 | */ |
3774 | static inline void update_sg_lb_stats(struct sched_domain *sd, | 3621 | static inline void update_sg_lb_stats(struct lb_env *env, |
3775 | struct sched_group *group, int this_cpu, | 3622 | struct sched_group *group, int load_idx, |
3776 | enum cpu_idle_type idle, int load_idx, | ||
3777 | int local_group, const struct cpumask *cpus, | 3623 | int local_group, const struct cpumask *cpus, |
3778 | int *balance, struct sg_lb_stats *sgs) | 3624 | int *balance, struct sg_lb_stats *sgs) |
3779 | { | 3625 | { |
3780 | unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; | 3626 | unsigned long nr_running, max_nr_running, min_nr_running; |
3781 | int i; | 3627 | unsigned long load, max_cpu_load, min_cpu_load; |
3782 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | 3628 | unsigned int balance_cpu = -1, first_idle_cpu = 0; |
3783 | unsigned long avg_load_per_task = 0; | 3629 | unsigned long avg_load_per_task = 0; |
3630 | int i; | ||
3784 | 3631 | ||
3785 | if (local_group) | 3632 | if (local_group) |
3786 | balance_cpu = group_first_cpu(group); | 3633 | balance_cpu = group_first_cpu(group); |
@@ -3789,10 +3636,13 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3789 | max_cpu_load = 0; | 3636 | max_cpu_load = 0; |
3790 | min_cpu_load = ~0UL; | 3637 | min_cpu_load = ~0UL; |
3791 | max_nr_running = 0; | 3638 | max_nr_running = 0; |
3639 | min_nr_running = ~0UL; | ||
3792 | 3640 | ||
3793 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | 3641 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { |
3794 | struct rq *rq = cpu_rq(i); | 3642 | struct rq *rq = cpu_rq(i); |
3795 | 3643 | ||
3644 | nr_running = rq->nr_running; | ||
3645 | |||
3796 | /* Bias balancing toward cpus of our domain */ | 3646 | /* Bias balancing toward cpus of our domain */ |
3797 | if (local_group) { | 3647 | if (local_group) { |
3798 | if (idle_cpu(i) && !first_idle_cpu) { | 3648 | if (idle_cpu(i) && !first_idle_cpu) { |
@@ -3803,16 +3653,19 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3803 | load = target_load(i, load_idx); | 3653 | load = target_load(i, load_idx); |
3804 | } else { | 3654 | } else { |
3805 | load = source_load(i, load_idx); | 3655 | load = source_load(i, load_idx); |
3806 | if (load > max_cpu_load) { | 3656 | if (load > max_cpu_load) |
3807 | max_cpu_load = load; | 3657 | max_cpu_load = load; |
3808 | max_nr_running = rq->nr_running; | ||
3809 | } | ||
3810 | if (min_cpu_load > load) | 3658 | if (min_cpu_load > load) |
3811 | min_cpu_load = load; | 3659 | min_cpu_load = load; |
3660 | |||
3661 | if (nr_running > max_nr_running) | ||
3662 | max_nr_running = nr_running; | ||
3663 | if (min_nr_running > nr_running) | ||
3664 | min_nr_running = nr_running; | ||
3812 | } | 3665 | } |
3813 | 3666 | ||
3814 | sgs->group_load += load; | 3667 | sgs->group_load += load; |
3815 | sgs->sum_nr_running += rq->nr_running; | 3668 | sgs->sum_nr_running += nr_running; |
3816 | sgs->sum_weighted_load += weighted_cpuload(i); | 3669 | sgs->sum_weighted_load += weighted_cpuload(i); |
3817 | if (idle_cpu(i)) | 3670 | if (idle_cpu(i)) |
3818 | sgs->idle_cpus++; | 3671 | sgs->idle_cpus++; |
@@ -3825,14 +3678,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3825 | * to do the newly idle load balance. | 3678 | * to do the newly idle load balance. |
3826 | */ | 3679 | */ |
3827 | if (local_group) { | 3680 | if (local_group) { |
3828 | if (idle != CPU_NEWLY_IDLE) { | 3681 | if (env->idle != CPU_NEWLY_IDLE) { |
3829 | if (balance_cpu != this_cpu) { | 3682 | if (balance_cpu != env->dst_cpu) { |
3830 | *balance = 0; | 3683 | *balance = 0; |
3831 | return; | 3684 | return; |
3832 | } | 3685 | } |
3833 | update_group_power(sd, this_cpu); | 3686 | update_group_power(env->sd, env->dst_cpu); |
3834 | } else if (time_after_eq(jiffies, group->sgp->next_update)) | 3687 | } else if (time_after_eq(jiffies, group->sgp->next_update)) |
3835 | update_group_power(sd, this_cpu); | 3688 | update_group_power(env->sd, env->dst_cpu); |
3836 | } | 3689 | } |
3837 | 3690 | ||
3838 | /* Adjust by relative CPU power of the group */ | 3691 | /* Adjust by relative CPU power of the group */ |
@@ -3850,13 +3703,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3850 | if (sgs->sum_nr_running) | 3703 | if (sgs->sum_nr_running) |
3851 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 3704 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
3852 | 3705 | ||
3853 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) | 3706 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && |
3707 | (max_nr_running - min_nr_running) > 1) | ||
3854 | sgs->group_imb = 1; | 3708 | sgs->group_imb = 1; |
3855 | 3709 | ||
3856 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, | 3710 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, |
3857 | SCHED_POWER_SCALE); | 3711 | SCHED_POWER_SCALE); |
3858 | if (!sgs->group_capacity) | 3712 | if (!sgs->group_capacity) |
3859 | sgs->group_capacity = fix_small_capacity(sd, group); | 3713 | sgs->group_capacity = fix_small_capacity(env->sd, group); |
3860 | sgs->group_weight = group->group_weight; | 3714 | sgs->group_weight = group->group_weight; |
3861 | 3715 | ||
3862 | if (sgs->group_capacity > sgs->sum_nr_running) | 3716 | if (sgs->group_capacity > sgs->sum_nr_running) |
@@ -3874,11 +3728,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3874 | * Determine if @sg is a busier group than the previously selected | 3728 | * Determine if @sg is a busier group than the previously selected |
3875 | * busiest group. | 3729 | * busiest group. |
3876 | */ | 3730 | */ |
3877 | static bool update_sd_pick_busiest(struct sched_domain *sd, | 3731 | static bool update_sd_pick_busiest(struct lb_env *env, |
3878 | struct sd_lb_stats *sds, | 3732 | struct sd_lb_stats *sds, |
3879 | struct sched_group *sg, | 3733 | struct sched_group *sg, |
3880 | struct sg_lb_stats *sgs, | 3734 | struct sg_lb_stats *sgs) |
3881 | int this_cpu) | ||
3882 | { | 3735 | { |
3883 | if (sgs->avg_load <= sds->max_load) | 3736 | if (sgs->avg_load <= sds->max_load) |
3884 | return false; | 3737 | return false; |
@@ -3894,8 +3747,8 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, | |||
3894 | * numbered CPUs in the group, therefore mark all groups | 3747 | * numbered CPUs in the group, therefore mark all groups |
3895 | * higher than ourself as busy. | 3748 | * higher than ourself as busy. |
3896 | */ | 3749 | */ |
3897 | if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && | 3750 | if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && |
3898 | this_cpu < group_first_cpu(sg)) { | 3751 | env->dst_cpu < group_first_cpu(sg)) { |
3899 | if (!sds->busiest) | 3752 | if (!sds->busiest) |
3900 | return true; | 3753 | return true; |
3901 | 3754 | ||
@@ -3915,28 +3768,27 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, | |||
3915 | * @balance: Should we balance. | 3768 | * @balance: Should we balance. |
3916 | * @sds: variable to hold the statistics for this sched_domain. | 3769 | * @sds: variable to hold the statistics for this sched_domain. |
3917 | */ | 3770 | */ |
3918 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | 3771 | static inline void update_sd_lb_stats(struct lb_env *env, |
3919 | enum cpu_idle_type idle, const struct cpumask *cpus, | 3772 | const struct cpumask *cpus, |
3920 | int *balance, struct sd_lb_stats *sds) | 3773 | int *balance, struct sd_lb_stats *sds) |
3921 | { | 3774 | { |
3922 | struct sched_domain *child = sd->child; | 3775 | struct sched_domain *child = env->sd->child; |
3923 | struct sched_group *sg = sd->groups; | 3776 | struct sched_group *sg = env->sd->groups; |
3924 | struct sg_lb_stats sgs; | 3777 | struct sg_lb_stats sgs; |
3925 | int load_idx, prefer_sibling = 0; | 3778 | int load_idx, prefer_sibling = 0; |
3926 | 3779 | ||
3927 | if (child && child->flags & SD_PREFER_SIBLING) | 3780 | if (child && child->flags & SD_PREFER_SIBLING) |
3928 | prefer_sibling = 1; | 3781 | prefer_sibling = 1; |
3929 | 3782 | ||
3930 | init_sd_power_savings_stats(sd, sds, idle); | 3783 | load_idx = get_sd_load_idx(env->sd, env->idle); |
3931 | load_idx = get_sd_load_idx(sd, idle); | ||
3932 | 3784 | ||
3933 | do { | 3785 | do { |
3934 | int local_group; | 3786 | int local_group; |
3935 | 3787 | ||
3936 | local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); | 3788 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); |
3937 | memset(&sgs, 0, sizeof(sgs)); | 3789 | memset(&sgs, 0, sizeof(sgs)); |
3938 | update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, | 3790 | update_sg_lb_stats(env, sg, load_idx, local_group, |
3939 | local_group, cpus, balance, &sgs); | 3791 | cpus, balance, &sgs); |
3940 | 3792 | ||
3941 | if (local_group && !(*balance)) | 3793 | if (local_group && !(*balance)) |
3942 | return; | 3794 | return; |
@@ -3964,7 +3816,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3964 | sds->this_load_per_task = sgs.sum_weighted_load; | 3816 | sds->this_load_per_task = sgs.sum_weighted_load; |
3965 | sds->this_has_capacity = sgs.group_has_capacity; | 3817 | sds->this_has_capacity = sgs.group_has_capacity; |
3966 | sds->this_idle_cpus = sgs.idle_cpus; | 3818 | sds->this_idle_cpus = sgs.idle_cpus; |
3967 | } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { | 3819 | } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) { |
3968 | sds->max_load = sgs.avg_load; | 3820 | sds->max_load = sgs.avg_load; |
3969 | sds->busiest = sg; | 3821 | sds->busiest = sg; |
3970 | sds->busiest_nr_running = sgs.sum_nr_running; | 3822 | sds->busiest_nr_running = sgs.sum_nr_running; |
@@ -3976,9 +3828,8 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3976 | sds->group_imb = sgs.group_imb; | 3828 | sds->group_imb = sgs.group_imb; |
3977 | } | 3829 | } |
3978 | 3830 | ||
3979 | update_sd_power_savings_stats(sg, sds, local_group, &sgs); | ||
3980 | sg = sg->next; | 3831 | sg = sg->next; |
3981 | } while (sg != sd->groups); | 3832 | } while (sg != env->sd->groups); |
3982 | } | 3833 | } |
3983 | 3834 | ||
3984 | /** | 3835 | /** |
@@ -4006,24 +3857,23 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
4006 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | 3857 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. |
4007 | * @imbalance: returns amount of imbalanced due to packing. | 3858 | * @imbalance: returns amount of imbalanced due to packing. |
4008 | */ | 3859 | */ |
4009 | static int check_asym_packing(struct sched_domain *sd, | 3860 | static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) |
4010 | struct sd_lb_stats *sds, | ||
4011 | int this_cpu, unsigned long *imbalance) | ||
4012 | { | 3861 | { |
4013 | int busiest_cpu; | 3862 | int busiest_cpu; |
4014 | 3863 | ||
4015 | if (!(sd->flags & SD_ASYM_PACKING)) | 3864 | if (!(env->sd->flags & SD_ASYM_PACKING)) |
4016 | return 0; | 3865 | return 0; |
4017 | 3866 | ||
4018 | if (!sds->busiest) | 3867 | if (!sds->busiest) |
4019 | return 0; | 3868 | return 0; |
4020 | 3869 | ||
4021 | busiest_cpu = group_first_cpu(sds->busiest); | 3870 | busiest_cpu = group_first_cpu(sds->busiest); |
4022 | if (this_cpu > busiest_cpu) | 3871 | if (env->dst_cpu > busiest_cpu) |
4023 | return 0; | 3872 | return 0; |
4024 | 3873 | ||
4025 | *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, | 3874 | env->imbalance = DIV_ROUND_CLOSEST( |
4026 | SCHED_POWER_SCALE); | 3875 | sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); |
3876 | |||
4027 | return 1; | 3877 | return 1; |
4028 | } | 3878 | } |
4029 | 3879 | ||
@@ -4035,8 +3885,8 @@ static int check_asym_packing(struct sched_domain *sd, | |||
4035 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | 3885 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. |
4036 | * @imbalance: Variable to store the imbalance. | 3886 | * @imbalance: Variable to store the imbalance. |
4037 | */ | 3887 | */ |
4038 | static inline void fix_small_imbalance(struct sd_lb_stats *sds, | 3888 | static inline |
4039 | int this_cpu, unsigned long *imbalance) | 3889 | void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) |
4040 | { | 3890 | { |
4041 | unsigned long tmp, pwr_now = 0, pwr_move = 0; | 3891 | unsigned long tmp, pwr_now = 0, pwr_move = 0; |
4042 | unsigned int imbn = 2; | 3892 | unsigned int imbn = 2; |
@@ -4047,9 +3897,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
4047 | if (sds->busiest_load_per_task > | 3897 | if (sds->busiest_load_per_task > |
4048 | sds->this_load_per_task) | 3898 | sds->this_load_per_task) |
4049 | imbn = 1; | 3899 | imbn = 1; |
4050 | } else | 3900 | } else { |
4051 | sds->this_load_per_task = | 3901 | sds->this_load_per_task = |
4052 | cpu_avg_load_per_task(this_cpu); | 3902 | cpu_avg_load_per_task(env->dst_cpu); |
3903 | } | ||
4053 | 3904 | ||
4054 | scaled_busy_load_per_task = sds->busiest_load_per_task | 3905 | scaled_busy_load_per_task = sds->busiest_load_per_task |
4055 | * SCHED_POWER_SCALE; | 3906 | * SCHED_POWER_SCALE; |
@@ -4057,7 +3908,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
4057 | 3908 | ||
4058 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= | 3909 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= |
4059 | (scaled_busy_load_per_task * imbn)) { | 3910 | (scaled_busy_load_per_task * imbn)) { |
4060 | *imbalance = sds->busiest_load_per_task; | 3911 | env->imbalance = sds->busiest_load_per_task; |
4061 | return; | 3912 | return; |
4062 | } | 3913 | } |
4063 | 3914 | ||
@@ -4094,18 +3945,16 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
4094 | 3945 | ||
4095 | /* Move if we gain throughput */ | 3946 | /* Move if we gain throughput */ |
4096 | if (pwr_move > pwr_now) | 3947 | if (pwr_move > pwr_now) |
4097 | *imbalance = sds->busiest_load_per_task; | 3948 | env->imbalance = sds->busiest_load_per_task; |
4098 | } | 3949 | } |
4099 | 3950 | ||
4100 | /** | 3951 | /** |
4101 | * calculate_imbalance - Calculate the amount of imbalance present within the | 3952 | * calculate_imbalance - Calculate the amount of imbalance present within the |
4102 | * groups of a given sched_domain during load balance. | 3953 | * groups of a given sched_domain during load balance. |
3954 | * @env: load balance environment | ||
4103 | * @sds: statistics of the sched_domain whose imbalance is to be calculated. | 3955 | * @sds: statistics of the sched_domain whose imbalance is to be calculated. |
4104 | * @this_cpu: Cpu for which currently load balance is being performed. | ||
4105 | * @imbalance: The variable to store the imbalance. | ||
4106 | */ | 3956 | */ |
4107 | static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | 3957 | static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) |
4108 | unsigned long *imbalance) | ||
4109 | { | 3958 | { |
4110 | unsigned long max_pull, load_above_capacity = ~0UL; | 3959 | unsigned long max_pull, load_above_capacity = ~0UL; |
4111 | 3960 | ||
@@ -4121,8 +3970,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4121 | * its cpu_power, while calculating max_load..) | 3970 | * its cpu_power, while calculating max_load..) |
4122 | */ | 3971 | */ |
4123 | if (sds->max_load < sds->avg_load) { | 3972 | if (sds->max_load < sds->avg_load) { |
4124 | *imbalance = 0; | 3973 | env->imbalance = 0; |
4125 | return fix_small_imbalance(sds, this_cpu, imbalance); | 3974 | return fix_small_imbalance(env, sds); |
4126 | } | 3975 | } |
4127 | 3976 | ||
4128 | if (!sds->group_imb) { | 3977 | if (!sds->group_imb) { |
@@ -4150,7 +3999,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4150 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); | 3999 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); |
4151 | 4000 | ||
4152 | /* How much load to actually move to equalise the imbalance */ | 4001 | /* How much load to actually move to equalise the imbalance */ |
4153 | *imbalance = min(max_pull * sds->busiest->sgp->power, | 4002 | env->imbalance = min(max_pull * sds->busiest->sgp->power, |
4154 | (sds->avg_load - sds->this_load) * sds->this->sgp->power) | 4003 | (sds->avg_load - sds->this_load) * sds->this->sgp->power) |
4155 | / SCHED_POWER_SCALE; | 4004 | / SCHED_POWER_SCALE; |
4156 | 4005 | ||
@@ -4160,8 +4009,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4160 | * a think about bumping its value to force at least one task to be | 4009 | * a think about bumping its value to force at least one task to be |
4161 | * moved | 4010 | * moved |
4162 | */ | 4011 | */ |
4163 | if (*imbalance < sds->busiest_load_per_task) | 4012 | if (env->imbalance < sds->busiest_load_per_task) |
4164 | return fix_small_imbalance(sds, this_cpu, imbalance); | 4013 | return fix_small_imbalance(env, sds); |
4165 | 4014 | ||
4166 | } | 4015 | } |
4167 | 4016 | ||
@@ -4192,9 +4041,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4192 | * put to idle by rebalancing its tasks onto our group. | 4041 | * put to idle by rebalancing its tasks onto our group. |
4193 | */ | 4042 | */ |
4194 | static struct sched_group * | 4043 | static struct sched_group * |
4195 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 4044 | find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance) |
4196 | unsigned long *imbalance, enum cpu_idle_type idle, | ||
4197 | const struct cpumask *cpus, int *balance) | ||
4198 | { | 4045 | { |
4199 | struct sd_lb_stats sds; | 4046 | struct sd_lb_stats sds; |
4200 | 4047 | ||
@@ -4204,7 +4051,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4204 | * Compute the various statistics relavent for load balancing at | 4051 | * Compute the various statistics relavent for load balancing at |
4205 | * this level. | 4052 | * this level. |
4206 | */ | 4053 | */ |
4207 | update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); | 4054 | update_sd_lb_stats(env, cpus, balance, &sds); |
4208 | 4055 | ||
4209 | /* | 4056 | /* |
4210 | * this_cpu is not the appropriate cpu to perform load balancing at | 4057 | * this_cpu is not the appropriate cpu to perform load balancing at |
@@ -4213,8 +4060,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4213 | if (!(*balance)) | 4060 | if (!(*balance)) |
4214 | goto ret; | 4061 | goto ret; |
4215 | 4062 | ||
4216 | if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && | 4063 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && |
4217 | check_asym_packing(sd, &sds, this_cpu, imbalance)) | 4064 | check_asym_packing(env, &sds)) |
4218 | return sds.busiest; | 4065 | return sds.busiest; |
4219 | 4066 | ||
4220 | /* There is no busy sibling group to pull tasks from */ | 4067 | /* There is no busy sibling group to pull tasks from */ |
@@ -4232,7 +4079,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4232 | goto force_balance; | 4079 | goto force_balance; |
4233 | 4080 | ||
4234 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 4081 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ |
4235 | if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && | 4082 | if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity && |
4236 | !sds.busiest_has_capacity) | 4083 | !sds.busiest_has_capacity) |
4237 | goto force_balance; | 4084 | goto force_balance; |
4238 | 4085 | ||
@@ -4250,7 +4097,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4250 | if (sds.this_load >= sds.avg_load) | 4097 | if (sds.this_load >= sds.avg_load) |
4251 | goto out_balanced; | 4098 | goto out_balanced; |
4252 | 4099 | ||
4253 | if (idle == CPU_IDLE) { | 4100 | if (env->idle == CPU_IDLE) { |
4254 | /* | 4101 | /* |
4255 | * This cpu is idle. If the busiest group load doesn't | 4102 | * This cpu is idle. If the busiest group load doesn't |
4256 | * have more tasks than the number of available cpu's and | 4103 | * have more tasks than the number of available cpu's and |
@@ -4265,34 +4112,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4265 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use | 4112 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use |
4266 | * imbalance_pct to be conservative. | 4113 | * imbalance_pct to be conservative. |
4267 | */ | 4114 | */ |
4268 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | 4115 | if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load) |
4269 | goto out_balanced; | 4116 | goto out_balanced; |
4270 | } | 4117 | } |
4271 | 4118 | ||
4272 | force_balance: | 4119 | force_balance: |
4273 | /* Looks like there is an imbalance. Compute it */ | 4120 | /* Looks like there is an imbalance. Compute it */ |
4274 | calculate_imbalance(&sds, this_cpu, imbalance); | 4121 | calculate_imbalance(env, &sds); |
4275 | return sds.busiest; | 4122 | return sds.busiest; |
4276 | 4123 | ||
4277 | out_balanced: | 4124 | out_balanced: |
4278 | /* | ||
4279 | * There is no obvious imbalance. But check if we can do some balancing | ||
4280 | * to save power. | ||
4281 | */ | ||
4282 | if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) | ||
4283 | return sds.busiest; | ||
4284 | ret: | 4125 | ret: |
4285 | *imbalance = 0; | 4126 | env->imbalance = 0; |
4286 | return NULL; | 4127 | return NULL; |
4287 | } | 4128 | } |
4288 | 4129 | ||
4289 | /* | 4130 | /* |
4290 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 4131 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
4291 | */ | 4132 | */ |
4292 | static struct rq * | 4133 | static struct rq *find_busiest_queue(struct lb_env *env, |
4293 | find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | 4134 | struct sched_group *group, |
4294 | enum cpu_idle_type idle, unsigned long imbalance, | 4135 | const struct cpumask *cpus) |
4295 | const struct cpumask *cpus) | ||
4296 | { | 4136 | { |
4297 | struct rq *busiest = NULL, *rq; | 4137 | struct rq *busiest = NULL, *rq; |
4298 | unsigned long max_load = 0; | 4138 | unsigned long max_load = 0; |
@@ -4305,7 +4145,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
4305 | unsigned long wl; | 4145 | unsigned long wl; |
4306 | 4146 | ||
4307 | if (!capacity) | 4147 | if (!capacity) |
4308 | capacity = fix_small_capacity(sd, group); | 4148 | capacity = fix_small_capacity(env->sd, group); |
4309 | 4149 | ||
4310 | if (!cpumask_test_cpu(i, cpus)) | 4150 | if (!cpumask_test_cpu(i, cpus)) |
4311 | continue; | 4151 | continue; |
@@ -4317,7 +4157,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
4317 | * When comparing with imbalance, use weighted_cpuload() | 4157 | * When comparing with imbalance, use weighted_cpuload() |
4318 | * which is not scaled with the cpu power. | 4158 | * which is not scaled with the cpu power. |
4319 | */ | 4159 | */ |
4320 | if (capacity && rq->nr_running == 1 && wl > imbalance) | 4160 | if (capacity && rq->nr_running == 1 && wl > env->imbalance) |
4321 | continue; | 4161 | continue; |
4322 | 4162 | ||
4323 | /* | 4163 | /* |
@@ -4346,40 +4186,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
4346 | /* Working cpumask for load_balance and load_balance_newidle. */ | 4186 | /* Working cpumask for load_balance and load_balance_newidle. */ |
4347 | DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 4187 | DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
4348 | 4188 | ||
4349 | static int need_active_balance(struct sched_domain *sd, int idle, | 4189 | static int need_active_balance(struct lb_env *env) |
4350 | int busiest_cpu, int this_cpu) | ||
4351 | { | 4190 | { |
4352 | if (idle == CPU_NEWLY_IDLE) { | 4191 | struct sched_domain *sd = env->sd; |
4192 | |||
4193 | if (env->idle == CPU_NEWLY_IDLE) { | ||
4353 | 4194 | ||
4354 | /* | 4195 | /* |
4355 | * ASYM_PACKING needs to force migrate tasks from busy but | 4196 | * ASYM_PACKING needs to force migrate tasks from busy but |
4356 | * higher numbered CPUs in order to pack all tasks in the | 4197 | * higher numbered CPUs in order to pack all tasks in the |
4357 | * lowest numbered CPUs. | 4198 | * lowest numbered CPUs. |
4358 | */ | 4199 | */ |
4359 | if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) | 4200 | if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu) |
4360 | return 1; | 4201 | return 1; |
4361 | |||
4362 | /* | ||
4363 | * The only task running in a non-idle cpu can be moved to this | ||
4364 | * cpu in an attempt to completely freeup the other CPU | ||
4365 | * package. | ||
4366 | * | ||
4367 | * The package power saving logic comes from | ||
4368 | * find_busiest_group(). If there are no imbalance, then | ||
4369 | * f_b_g() will return NULL. However when sched_mc={1,2} then | ||
4370 | * f_b_g() will select a group from which a running task may be | ||
4371 | * pulled to this cpu in order to make the other package idle. | ||
4372 | * If there is no opportunity to make a package idle and if | ||
4373 | * there are no imbalance, then f_b_g() will return NULL and no | ||
4374 | * action will be taken in load_balance_newidle(). | ||
4375 | * | ||
4376 | * Under normal task pull operation due to imbalance, there | ||
4377 | * will be more than one task in the source run queue and | ||
4378 | * move_tasks() will succeed. ld_moved will be true and this | ||
4379 | * active balance code will not be triggered. | ||
4380 | */ | ||
4381 | if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) | ||
4382 | return 0; | ||
4383 | } | 4202 | } |
4384 | 4203 | ||
4385 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); | 4204 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); |
@@ -4397,7 +4216,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4397 | { | 4216 | { |
4398 | int ld_moved, active_balance = 0; | 4217 | int ld_moved, active_balance = 0; |
4399 | struct sched_group *group; | 4218 | struct sched_group *group; |
4400 | unsigned long imbalance; | ||
4401 | struct rq *busiest; | 4219 | struct rq *busiest; |
4402 | unsigned long flags; | 4220 | unsigned long flags; |
4403 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | 4221 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); |
@@ -4407,7 +4225,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4407 | .dst_cpu = this_cpu, | 4225 | .dst_cpu = this_cpu, |
4408 | .dst_rq = this_rq, | 4226 | .dst_rq = this_rq, |
4409 | .idle = idle, | 4227 | .idle = idle, |
4410 | .loop_break = sysctl_sched_nr_migrate, | 4228 | .loop_break = sched_nr_migrate_break, |
4411 | }; | 4229 | }; |
4412 | 4230 | ||
4413 | cpumask_copy(cpus, cpu_active_mask); | 4231 | cpumask_copy(cpus, cpu_active_mask); |
@@ -4415,8 +4233,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4415 | schedstat_inc(sd, lb_count[idle]); | 4233 | schedstat_inc(sd, lb_count[idle]); |
4416 | 4234 | ||
4417 | redo: | 4235 | redo: |
4418 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, | 4236 | group = find_busiest_group(&env, cpus, balance); |
4419 | cpus, balance); | ||
4420 | 4237 | ||
4421 | if (*balance == 0) | 4238 | if (*balance == 0) |
4422 | goto out_balanced; | 4239 | goto out_balanced; |
@@ -4426,7 +4243,7 @@ redo: | |||
4426 | goto out_balanced; | 4243 | goto out_balanced; |
4427 | } | 4244 | } |
4428 | 4245 | ||
4429 | busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); | 4246 | busiest = find_busiest_queue(&env, group, cpus); |
4430 | if (!busiest) { | 4247 | if (!busiest) { |
4431 | schedstat_inc(sd, lb_nobusyq[idle]); | 4248 | schedstat_inc(sd, lb_nobusyq[idle]); |
4432 | goto out_balanced; | 4249 | goto out_balanced; |
@@ -4434,7 +4251,7 @@ redo: | |||
4434 | 4251 | ||
4435 | BUG_ON(busiest == this_rq); | 4252 | BUG_ON(busiest == this_rq); |
4436 | 4253 | ||
4437 | schedstat_add(sd, lb_imbalance[idle], imbalance); | 4254 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); |
4438 | 4255 | ||
4439 | ld_moved = 0; | 4256 | ld_moved = 0; |
4440 | if (busiest->nr_running > 1) { | 4257 | if (busiest->nr_running > 1) { |
@@ -4445,10 +4262,9 @@ redo: | |||
4445 | * correctly treated as an imbalance. | 4262 | * correctly treated as an imbalance. |
4446 | */ | 4263 | */ |
4447 | env.flags |= LBF_ALL_PINNED; | 4264 | env.flags |= LBF_ALL_PINNED; |
4448 | env.load_move = imbalance; | 4265 | env.src_cpu = busiest->cpu; |
4449 | env.src_cpu = busiest->cpu; | 4266 | env.src_rq = busiest; |
4450 | env.src_rq = busiest; | 4267 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); |
4451 | env.loop_max = busiest->nr_running; | ||
4452 | 4268 | ||
4453 | more_balance: | 4269 | more_balance: |
4454 | local_irq_save(flags); | 4270 | local_irq_save(flags); |
@@ -4490,7 +4306,7 @@ more_balance: | |||
4490 | if (idle != CPU_NEWLY_IDLE) | 4306 | if (idle != CPU_NEWLY_IDLE) |
4491 | sd->nr_balance_failed++; | 4307 | sd->nr_balance_failed++; |
4492 | 4308 | ||
4493 | if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { | 4309 | if (need_active_balance(&env)) { |
4494 | raw_spin_lock_irqsave(&busiest->lock, flags); | 4310 | raw_spin_lock_irqsave(&busiest->lock, flags); |
4495 | 4311 | ||
4496 | /* don't kick the active_load_balance_cpu_stop, | 4312 | /* don't kick the active_load_balance_cpu_stop, |
@@ -4517,10 +4333,11 @@ more_balance: | |||
4517 | } | 4333 | } |
4518 | raw_spin_unlock_irqrestore(&busiest->lock, flags); | 4334 | raw_spin_unlock_irqrestore(&busiest->lock, flags); |
4519 | 4335 | ||
4520 | if (active_balance) | 4336 | if (active_balance) { |
4521 | stop_one_cpu_nowait(cpu_of(busiest), | 4337 | stop_one_cpu_nowait(cpu_of(busiest), |
4522 | active_load_balance_cpu_stop, busiest, | 4338 | active_load_balance_cpu_stop, busiest, |
4523 | &busiest->active_balance_work); | 4339 | &busiest->active_balance_work); |
4340 | } | ||
4524 | 4341 | ||
4525 | /* | 4342 | /* |
4526 | * We've kicked active balancing, reset the failure | 4343 | * We've kicked active balancing, reset the failure |
@@ -4701,104 +4518,15 @@ static struct { | |||
4701 | unsigned long next_balance; /* in jiffy units */ | 4518 | unsigned long next_balance; /* in jiffy units */ |
4702 | } nohz ____cacheline_aligned; | 4519 | } nohz ____cacheline_aligned; |
4703 | 4520 | ||
4704 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 4521 | static inline int find_new_ilb(int call_cpu) |
4705 | /** | ||
4706 | * lowest_flag_domain - Return lowest sched_domain containing flag. | ||
4707 | * @cpu: The cpu whose lowest level of sched domain is to | ||
4708 | * be returned. | ||
4709 | * @flag: The flag to check for the lowest sched_domain | ||
4710 | * for the given cpu. | ||
4711 | * | ||
4712 | * Returns the lowest sched_domain of a cpu which contains the given flag. | ||
4713 | */ | ||
4714 | static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | ||
4715 | { | ||
4716 | struct sched_domain *sd; | ||
4717 | |||
4718 | for_each_domain(cpu, sd) | ||
4719 | if (sd->flags & flag) | ||
4720 | break; | ||
4721 | |||
4722 | return sd; | ||
4723 | } | ||
4724 | |||
4725 | /** | ||
4726 | * for_each_flag_domain - Iterates over sched_domains containing the flag. | ||
4727 | * @cpu: The cpu whose domains we're iterating over. | ||
4728 | * @sd: variable holding the value of the power_savings_sd | ||
4729 | * for cpu. | ||
4730 | * @flag: The flag to filter the sched_domains to be iterated. | ||
4731 | * | ||
4732 | * Iterates over all the scheduler domains for a given cpu that has the 'flag' | ||
4733 | * set, starting from the lowest sched_domain to the highest. | ||
4734 | */ | ||
4735 | #define for_each_flag_domain(cpu, sd, flag) \ | ||
4736 | for (sd = lowest_flag_domain(cpu, flag); \ | ||
4737 | (sd && (sd->flags & flag)); sd = sd->parent) | ||
4738 | |||
4739 | /** | ||
4740 | * find_new_ilb - Finds the optimum idle load balancer for nomination. | ||
4741 | * @cpu: The cpu which is nominating a new idle_load_balancer. | ||
4742 | * | ||
4743 | * Returns: Returns the id of the idle load balancer if it exists, | ||
4744 | * Else, returns >= nr_cpu_ids. | ||
4745 | * | ||
4746 | * This algorithm picks the idle load balancer such that it belongs to a | ||
4747 | * semi-idle powersavings sched_domain. The idea is to try and avoid | ||
4748 | * completely idle packages/cores just for the purpose of idle load balancing | ||
4749 | * when there are other idle cpu's which are better suited for that job. | ||
4750 | */ | ||
4751 | static int find_new_ilb(int cpu) | ||
4752 | { | 4522 | { |
4753 | int ilb = cpumask_first(nohz.idle_cpus_mask); | 4523 | int ilb = cpumask_first(nohz.idle_cpus_mask); |
4754 | struct sched_group *ilbg; | ||
4755 | struct sched_domain *sd; | ||
4756 | |||
4757 | /* | ||
4758 | * Have idle load balancer selection from semi-idle packages only | ||
4759 | * when power-aware load balancing is enabled | ||
4760 | */ | ||
4761 | if (!(sched_smt_power_savings || sched_mc_power_savings)) | ||
4762 | goto out_done; | ||
4763 | |||
4764 | /* | ||
4765 | * Optimize for the case when we have no idle CPUs or only one | ||
4766 | * idle CPU. Don't walk the sched_domain hierarchy in such cases | ||
4767 | */ | ||
4768 | if (cpumask_weight(nohz.idle_cpus_mask) < 2) | ||
4769 | goto out_done; | ||
4770 | |||
4771 | rcu_read_lock(); | ||
4772 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | ||
4773 | ilbg = sd->groups; | ||
4774 | |||
4775 | do { | ||
4776 | if (ilbg->group_weight != | ||
4777 | atomic_read(&ilbg->sgp->nr_busy_cpus)) { | ||
4778 | ilb = cpumask_first_and(nohz.idle_cpus_mask, | ||
4779 | sched_group_cpus(ilbg)); | ||
4780 | goto unlock; | ||
4781 | } | ||
4782 | |||
4783 | ilbg = ilbg->next; | ||
4784 | |||
4785 | } while (ilbg != sd->groups); | ||
4786 | } | ||
4787 | unlock: | ||
4788 | rcu_read_unlock(); | ||
4789 | 4524 | ||
4790 | out_done: | ||
4791 | if (ilb < nr_cpu_ids && idle_cpu(ilb)) | 4525 | if (ilb < nr_cpu_ids && idle_cpu(ilb)) |
4792 | return ilb; | 4526 | return ilb; |
4793 | 4527 | ||
4794 | return nr_cpu_ids; | 4528 | return nr_cpu_ids; |
4795 | } | 4529 | } |
4796 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | ||
4797 | static inline int find_new_ilb(int call_cpu) | ||
4798 | { | ||
4799 | return nr_cpu_ids; | ||
4800 | } | ||
4801 | #endif | ||
4802 | 4530 | ||
4803 | /* | 4531 | /* |
4804 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the | 4532 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the |
@@ -5021,7 +4749,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
5021 | 4749 | ||
5022 | raw_spin_lock_irq(&this_rq->lock); | 4750 | raw_spin_lock_irq(&this_rq->lock); |
5023 | update_rq_clock(this_rq); | 4751 | update_rq_clock(this_rq); |
5024 | update_cpu_load(this_rq); | 4752 | update_idle_cpu_load(this_rq); |
5025 | raw_spin_unlock_irq(&this_rq->lock); | 4753 | raw_spin_unlock_irq(&this_rq->lock); |
5026 | 4754 | ||
5027 | rebalance_domains(balance_cpu, CPU_IDLE); | 4755 | rebalance_domains(balance_cpu, CPU_IDLE); |
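The fair.c hunks above fold the long (sd, this_cpu, imbalance, idle, cpus, ...) parameter lists into a single struct lb_env that load_balance() fills once and threads through find_busiest_group(), find_busiest_queue(), check_asym_packing() and the imbalance helpers. A minimal sketch of the fields those call sites read or write, inferred from this diff only (the authoritative definition sits earlier in kernel/sched/fair.c and carries additional members):

	/* Sketch, not the full kernel definition. */
	struct lb_env {
		struct sched_domain	*sd;		/* domain being balanced */

		struct rq		*src_rq;	/* busiest runqueue (pull source) */
		int			src_cpu;

		int			dst_cpu;	/* "this_cpu" in the old signatures */
		struct rq		*dst_rq;

		enum cpu_idle_type	idle;		/* CPU_IDLE / CPU_NEWLY_IDLE / CPU_NOT_IDLE */
		long			imbalance;	/* replaces the *imbalance out-parameter */

		unsigned int		flags;		/* LBF_ALL_PINNED, ... */
		unsigned int		loop_break;	/* sched_nr_migrate_break */
		unsigned int		loop_max;	/* min(sysctl_sched_nr_migrate, nr_running) */
	};

Passing one pointer instead of five scalars is what lets check_asym_packing() and calculate_imbalance() drop their out-parameters and simply assign env->imbalance.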
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index e61fd73913d..de00a486c5c 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
@@ -68,3 +68,4 @@ SCHED_FEAT(TTWU_QUEUE, true) | |||
68 | 68 | ||
69 | SCHED_FEAT(FORCE_SD_OVERLAP, false) | 69 | SCHED_FEAT(FORCE_SD_OVERLAP, false) |
70 | SCHED_FEAT(RT_RUNTIME_SHARE, true) | 70 | SCHED_FEAT(RT_RUNTIME_SHARE, true) |
71 | SCHED_FEAT(LB_MIN, false) | ||
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 91b4c957f28..b44d604b35d 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
@@ -4,7 +4,7 @@ | |||
4 | * idle-task scheduling class. | 4 | * idle-task scheduling class. |
5 | * | 5 | * |
6 | * (NOTE: these are not related to SCHED_IDLE tasks which are | 6 | * (NOTE: these are not related to SCHED_IDLE tasks which are |
7 | * handled in sched_fair.c) | 7 | * handled in sched/fair.c) |
8 | */ | 8 | */ |
9 | 9 | ||
10 | #ifdef CONFIG_SMP | 10 | #ifdef CONFIG_SMP |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 44af55e6d5d..c5565c3c515 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -1803,44 +1803,40 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
1803 | static void set_cpus_allowed_rt(struct task_struct *p, | 1803 | static void set_cpus_allowed_rt(struct task_struct *p, |
1804 | const struct cpumask *new_mask) | 1804 | const struct cpumask *new_mask) |
1805 | { | 1805 | { |
1806 | int weight = cpumask_weight(new_mask); | 1806 | struct rq *rq; |
1807 | int weight; | ||
1807 | 1808 | ||
1808 | BUG_ON(!rt_task(p)); | 1809 | BUG_ON(!rt_task(p)); |
1809 | 1810 | ||
1810 | /* | 1811 | if (!p->on_rq) |
1811 | * Update the migration status of the RQ if we have an RT task | 1812 | return; |
1812 | * which is running AND changing its weight value. | ||
1813 | */ | ||
1814 | if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) { | ||
1815 | struct rq *rq = task_rq(p); | ||
1816 | |||
1817 | if (!task_current(rq, p)) { | ||
1818 | /* | ||
1819 | * Make sure we dequeue this task from the pushable list | ||
1820 | * before going further. It will either remain off of | ||
1821 | * the list because we are no longer pushable, or it | ||
1822 | * will be requeued. | ||
1823 | */ | ||
1824 | if (p->rt.nr_cpus_allowed > 1) | ||
1825 | dequeue_pushable_task(rq, p); | ||
1826 | 1813 | ||
1827 | /* | 1814 | weight = cpumask_weight(new_mask); |
1828 | * Requeue if our weight is changing and still > 1 | ||
1829 | */ | ||
1830 | if (weight > 1) | ||
1831 | enqueue_pushable_task(rq, p); | ||
1832 | 1815 | ||
1833 | } | 1816 | /* |
1817 | * Only update if the process changes its state from whether it | ||
1818 | * can migrate or not. | ||
1819 | */ | ||
1820 | if ((p->rt.nr_cpus_allowed > 1) == (weight > 1)) | ||
1821 | return; | ||
1834 | 1822 | ||
1835 | if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { | 1823 | rq = task_rq(p); |
1836 | rq->rt.rt_nr_migratory++; | ||
1837 | } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { | ||
1838 | BUG_ON(!rq->rt.rt_nr_migratory); | ||
1839 | rq->rt.rt_nr_migratory--; | ||
1840 | } | ||
1841 | 1824 | ||
1842 | update_rt_migration(&rq->rt); | 1825 | /* |
1826 | * The process used to be able to migrate OR it can now migrate | ||
1827 | */ | ||
1828 | if (weight <= 1) { | ||
1829 | if (!task_current(rq, p)) | ||
1830 | dequeue_pushable_task(rq, p); | ||
1831 | BUG_ON(!rq->rt.rt_nr_migratory); | ||
1832 | rq->rt.rt_nr_migratory--; | ||
1833 | } else { | ||
1834 | if (!task_current(rq, p)) | ||
1835 | enqueue_pushable_task(rq, p); | ||
1836 | rq->rt.rt_nr_migratory++; | ||
1843 | } | 1837 | } |
1838 | |||
1839 | update_rt_migration(&rq->rt); | ||
1844 | } | 1840 | } |
1845 | 1841 | ||
1846 | /* Assumes rq->lock is held */ | 1842 | /* Assumes rq->lock is held */ |
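The rewritten set_cpus_allowed_rt() only acts when the task crosses the migratable boundary, i.e. when it goes from being allowed on a single CPU to more than one, or back. An illustrative restatement of that early-exit test, using only what appears in the hunk above:

	/* Illustrative only: the condition the new code short-circuits on.
	 * "Migratable" means the task may run on more than one CPU. */
	static bool crosses_migratable_boundary(struct task_struct *p,
						const struct cpumask *new_mask)
	{
		bool was = p->rt.nr_cpus_allowed > 1;		/* old mask */
		bool now = cpumask_weight(new_mask) > 1;	/* new mask */

		return was != now;
	}

Only when that boundary is crossed does the task get moved on or off the pushable list (and only if it is not the CPU's current task) and rq->rt.rt_nr_migratory adjusted, before update_rt_migration() runs as before.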
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fb3acba4d52..ba9dccfd24c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -201,7 +201,7 @@ struct cfs_bandwidth { }; | |||
201 | /* CFS-related fields in a runqueue */ | 201 | /* CFS-related fields in a runqueue */ |
202 | struct cfs_rq { | 202 | struct cfs_rq { |
203 | struct load_weight load; | 203 | struct load_weight load; |
204 | unsigned long nr_running, h_nr_running; | 204 | unsigned int nr_running, h_nr_running; |
205 | 205 | ||
206 | u64 exec_clock; | 206 | u64 exec_clock; |
207 | u64 min_vruntime; | 207 | u64 min_vruntime; |
@@ -279,7 +279,7 @@ static inline int rt_bandwidth_enabled(void) | |||
279 | /* Real-Time classes' related field in a runqueue: */ | 279 | /* Real-Time classes' related field in a runqueue: */ |
280 | struct rt_rq { | 280 | struct rt_rq { |
281 | struct rt_prio_array active; | 281 | struct rt_prio_array active; |
282 | unsigned long rt_nr_running; | 282 | unsigned int rt_nr_running; |
283 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED | 283 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED |
284 | struct { | 284 | struct { |
285 | int curr; /* highest queued rt task prio */ | 285 | int curr; /* highest queued rt task prio */ |
@@ -353,7 +353,7 @@ struct rq { | |||
353 | * nr_running and cpu_load should be in the same cacheline because | 353 | * nr_running and cpu_load should be in the same cacheline because |
354 | * remote CPUs use both these fields when doing load calculation. | 354 | * remote CPUs use both these fields when doing load calculation. |
355 | */ | 355 | */ |
356 | unsigned long nr_running; | 356 | unsigned int nr_running; |
357 | #define CPU_LOAD_IDX_MAX 5 | 357 | #define CPU_LOAD_IDX_MAX 5 |
358 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 358 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
359 | unsigned long last_load_update_tick; | 359 | unsigned long last_load_update_tick; |
@@ -876,7 +876,7 @@ extern void resched_cpu(int cpu); | |||
876 | extern struct rt_bandwidth def_rt_bandwidth; | 876 | extern struct rt_bandwidth def_rt_bandwidth; |
877 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); | 877 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); |
878 | 878 | ||
879 | extern void update_cpu_load(struct rq *this_rq); | 879 | extern void update_idle_cpu_load(struct rq *this_rq); |
880 | 880 | ||
881 | #ifdef CONFIG_CGROUP_CPUACCT | 881 | #ifdef CONFIG_CGROUP_CPUACCT |
882 | #include <linux/cgroup.h> | 882 | #include <linux/cgroup.h> |
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e8d76c5895e..ee376beedaf 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
@@ -3,16 +3,357 @@ | |||
3 | * | 3 | * |
4 | * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> | 4 | * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> |
5 | * | 5 | * |
6 | * This defines a simple but solid secure-computing mode. | 6 | * Copyright (C) 2012 Google, Inc. |
7 | * Will Drewry <wad@chromium.org> | ||
8 | * | ||
9 | * This defines a simple but solid secure-computing facility. | ||
10 | * | ||
11 | * Mode 1 uses a fixed list of allowed system calls. | ||
12 | * Mode 2 allows user-defined system call filters in the form | ||
13 | * of Berkeley Packet Filters/Linux Socket Filters. | ||
7 | */ | 14 | */ |
8 | 15 | ||
16 | #include <linux/atomic.h> | ||
9 | #include <linux/audit.h> | 17 | #include <linux/audit.h> |
10 | #include <linux/seccomp.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <linux/compat.h> | 18 | #include <linux/compat.h> |
19 | #include <linux/sched.h> | ||
20 | #include <linux/seccomp.h> | ||
13 | 21 | ||
14 | /* #define SECCOMP_DEBUG 1 */ | 22 | /* #define SECCOMP_DEBUG 1 */ |
15 | #define NR_SECCOMP_MODES 1 | 23 | |
24 | #ifdef CONFIG_SECCOMP_FILTER | ||
25 | #include <asm/syscall.h> | ||
26 | #include <linux/filter.h> | ||
27 | #include <linux/ptrace.h> | ||
28 | #include <linux/security.h> | ||
29 | #include <linux/slab.h> | ||
30 | #include <linux/tracehook.h> | ||
31 | #include <linux/uaccess.h> | ||
32 | |||
33 | /** | ||
34 | * struct seccomp_filter - container for seccomp BPF programs | ||
35 | * | ||
36 | * @usage: reference count to manage the object lifetime. | ||
37 | * get/put helpers should be used when accessing an instance | ||
38 | * outside of a lifetime-guarded section. In general, this | ||
39 | * is only needed for handling filters shared across tasks. | ||
40 | * @prev: points to a previously installed, or inherited, filter | ||
41 | * @len: the number of instructions in the program | ||
42 | * @insns: the BPF program instructions to evaluate | ||
43 | * | ||
44 | * seccomp_filter objects are organized in a tree linked via the @prev | ||
45 | * pointer. For any task, it appears to be a singly-linked list starting | ||
46 | * with current->seccomp.filter, the most recently attached or inherited filter. | ||
47 | * However, multiple filters may share a @prev node, by way of fork(), which | ||
48 | * results in a unidirectional tree existing in memory. This is similar to | ||
49 | * how namespaces work. | ||
50 | * | ||
51 | * seccomp_filter objects should never be modified after being attached | ||
52 | * to a task_struct (other than @usage). | ||
53 | */ | ||
54 | struct seccomp_filter { | ||
55 | atomic_t usage; | ||
56 | struct seccomp_filter *prev; | ||
57 | unsigned short len; /* Instruction count */ | ||
58 | struct sock_filter insns[]; | ||
59 | }; | ||
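The @prev chain described above is easiest to see with a concrete, hypothetical history: a parent attaches F1 and then F2, forks, and the child attaches F3.

	/*
	 * Illustration only (not kernel code):
	 *
	 *	parent->seccomp.filter -> F2 -> F1
	 *	child->seccomp.filter  -> F3 -> F2 -> F1
	 *
	 * Attaching F3 sets F3->prev = F2 without touching F2->usage (the
	 * child's reference simply moves to the new head), while fork bumps
	 * only the head's count via get_seccomp_filter() further down.
	 * put_seccomp_filter() walks @prev, freeing nodes until it reaches
	 * one still referenced by another task or filter.
	 */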
60 | |||
61 | /* Limit any path through the tree to 256KB worth of instructions. */ | ||
62 | #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter)) | ||
63 | |||
64 | /** | ||
65 | * get_u32 - returns a u32 offset into data | ||
66 | * @data: a unsigned 64 bit value | ||
67 | * @index: 0 or 1 to return the first or second 32-bits | ||
68 | * | ||
69 | * This inline exists to hide the length of unsigned long. If a 32-bit | ||
70 | * unsigned long is passed in, it will be extended and the top 32-bits will be | ||
71 | * 0. If it is a 64-bit unsigned long, then whatever data is resident will be | ||
72 | * properly returned. | ||
73 | * | ||
74 | * Endianness is explicitly ignored and left for BPF program authors to manage | ||
75 | * as per the specific architecture. | ||
76 | */ | ||
77 | static inline u32 get_u32(u64 data, int index) | ||
78 | { | ||
79 | return ((u32 *)&data)[index]; | ||
80 | } | ||
81 | |||
82 | /* Helper for bpf_load below. */ | ||
83 | #define BPF_DATA(_name) offsetof(struct seccomp_data, _name) | ||
84 | /** | ||
85 | * bpf_load: checks and returns a pointer to the requested offset | ||
86 | * @off: offset into struct seccomp_data to load from | ||
87 | * | ||
88 | * Returns the requested 32-bits of data. | ||
89 | * seccomp_check_filter() should assure that @off is 32-bit aligned | ||
90 | * and not out of bounds. Failure to do so is a BUG. | ||
91 | */ | ||
92 | u32 seccomp_bpf_load(int off) | ||
93 | { | ||
94 | struct pt_regs *regs = task_pt_regs(current); | ||
95 | if (off == BPF_DATA(nr)) | ||
96 | return syscall_get_nr(current, regs); | ||
97 | if (off == BPF_DATA(arch)) | ||
98 | return syscall_get_arch(current, regs); | ||
99 | if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) { | ||
100 | unsigned long value; | ||
101 | int arg = (off - BPF_DATA(args[0])) / sizeof(u64); | ||
102 | int index = !!(off % sizeof(u64)); | ||
103 | syscall_get_arguments(current, regs, arg, 1, &value); | ||
104 | return get_u32(value, index); | ||
105 | } | ||
106 | if (off == BPF_DATA(instruction_pointer)) | ||
107 | return get_u32(KSTK_EIP(current), 0); | ||
108 | if (off == BPF_DATA(instruction_pointer) + sizeof(u32)) | ||
109 | return get_u32(KSTK_EIP(current), 1); | ||
110 | /* seccomp_check_filter should make this impossible. */ | ||
111 | BUG(); | ||
112 | } | ||
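seccomp_bpf_load() is what lets a filter address the syscall number, architecture, instruction pointer and arguments as plain absolute loads at offsetof(struct seccomp_data, ...) offsets. A minimal userspace filter written against that layout; the BPF_STMT/BPF_JUMP macros come from <linux/filter.h>, and struct seccomp_data plus the SECCOMP_RET_* codes from the <linux/seccomp.h> added by this series (the x86-64 arch check is an assumption of this sketch, not something the kernel code here mandates):

	#include <stddef.h>
	#include <linux/audit.h>
	#include <linux/filter.h>
	#include <linux/seccomp.h>
	#include <sys/syscall.h>

	/* Allow only read/write/exit/exit_group for a native x86-64 task;
	 * anything else (or a foreign ABI) is killed. */
	static struct sock_filter strict_filter[] = {
		BPF_STMT(BPF_LD  | BPF_W | BPF_ABS, offsetof(struct seccomp_data, arch)),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 1, 0),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
		BPF_STMT(BPF_LD  | BPF_W | BPF_ABS, offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_read,       4, 0),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_write,      3, 0),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_exit,       2, 0),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_exit_group, 1, 0),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};

seccomp_check_filter() rewrites the absolute-load instructions above into BPF_S_ANC_SECCOMP_LD_W, so sk_run_filter() ends up fetching from the current task's registers instead of an skb.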
113 | |||
114 | /** | ||
115 | * seccomp_check_filter - verify seccomp filter code | ||
116 | * @filter: filter to verify | ||
117 | * @flen: length of filter | ||
118 | * | ||
119 | * Takes a previously checked filter (by sk_chk_filter) and | ||
120 | * redirects all filter code that loads struct sk_buff data | ||
121 | * and related data through seccomp_bpf_load. It also | ||
122 | * enforces length and alignment checking of those loads. | ||
123 | * | ||
124 | * Returns 0 if the rule set is legal or -EINVAL if not. | ||
125 | */ | ||
126 | static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) | ||
127 | { | ||
128 | int pc; | ||
129 | for (pc = 0; pc < flen; pc++) { | ||
130 | struct sock_filter *ftest = &filter[pc]; | ||
131 | u16 code = ftest->code; | ||
132 | u32 k = ftest->k; | ||
133 | |||
134 | switch (code) { | ||
135 | case BPF_S_LD_W_ABS: | ||
136 | ftest->code = BPF_S_ANC_SECCOMP_LD_W; | ||
137 | /* 32-bit aligned and not out of bounds. */ | ||
138 | if (k >= sizeof(struct seccomp_data) || k & 3) | ||
139 | return -EINVAL; | ||
140 | continue; | ||
141 | case BPF_S_LD_W_LEN: | ||
142 | ftest->code = BPF_S_LD_IMM; | ||
143 | ftest->k = sizeof(struct seccomp_data); | ||
144 | continue; | ||
145 | case BPF_S_LDX_W_LEN: | ||
146 | ftest->code = BPF_S_LDX_IMM; | ||
147 | ftest->k = sizeof(struct seccomp_data); | ||
148 | continue; | ||
149 | /* Explicitly include allowed calls. */ | ||
150 | case BPF_S_RET_K: | ||
151 | case BPF_S_RET_A: | ||
152 | case BPF_S_ALU_ADD_K: | ||
153 | case BPF_S_ALU_ADD_X: | ||
154 | case BPF_S_ALU_SUB_K: | ||
155 | case BPF_S_ALU_SUB_X: | ||
156 | case BPF_S_ALU_MUL_K: | ||
157 | case BPF_S_ALU_MUL_X: | ||
158 | case BPF_S_ALU_DIV_X: | ||
159 | case BPF_S_ALU_AND_K: | ||
160 | case BPF_S_ALU_AND_X: | ||
161 | case BPF_S_ALU_OR_K: | ||
162 | case BPF_S_ALU_OR_X: | ||
163 | case BPF_S_ALU_LSH_K: | ||
164 | case BPF_S_ALU_LSH_X: | ||
165 | case BPF_S_ALU_RSH_K: | ||
166 | case BPF_S_ALU_RSH_X: | ||
167 | case BPF_S_ALU_NEG: | ||
168 | case BPF_S_LD_IMM: | ||
169 | case BPF_S_LDX_IMM: | ||
170 | case BPF_S_MISC_TAX: | ||
171 | case BPF_S_MISC_TXA: | ||
172 | case BPF_S_ALU_DIV_K: | ||
173 | case BPF_S_LD_MEM: | ||
174 | case BPF_S_LDX_MEM: | ||
175 | case BPF_S_ST: | ||
176 | case BPF_S_STX: | ||
177 | case BPF_S_JMP_JA: | ||
178 | case BPF_S_JMP_JEQ_K: | ||
179 | case BPF_S_JMP_JEQ_X: | ||
180 | case BPF_S_JMP_JGE_K: | ||
181 | case BPF_S_JMP_JGE_X: | ||
182 | case BPF_S_JMP_JGT_K: | ||
183 | case BPF_S_JMP_JGT_X: | ||
184 | case BPF_S_JMP_JSET_K: | ||
185 | case BPF_S_JMP_JSET_X: | ||
186 | continue; | ||
187 | default: | ||
188 | return -EINVAL; | ||
189 | } | ||
190 | } | ||
191 | return 0; | ||
192 | } | ||
193 | |||
194 | /** | ||
195 | * seccomp_run_filters - evaluates all seccomp filters against @syscall | ||
196 | * @syscall: number of the current system call | ||
197 | * | ||
198 | * Returns valid seccomp BPF response codes. | ||
199 | */ | ||
200 | static u32 seccomp_run_filters(int syscall) | ||
201 | { | ||
202 | struct seccomp_filter *f; | ||
203 | u32 ret = SECCOMP_RET_ALLOW; | ||
204 | |||
205 | /* Ensure unexpected behavior doesn't result in failing open. */ | ||
206 | if (WARN_ON(current->seccomp.filter == NULL)) | ||
207 | return SECCOMP_RET_KILL; | ||
208 | |||
209 | /* | ||
210 | * All filters in the list are evaluated and the lowest BPF return | ||
211 | * value always takes priority (ignoring the DATA). | ||
212 | */ | ||
213 | for (f = current->seccomp.filter; f; f = f->prev) { | ||
214 | u32 cur_ret = sk_run_filter(NULL, f->insns); | ||
215 | if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) | ||
216 | ret = cur_ret; | ||
217 | } | ||
218 | return ret; | ||
219 | } | ||
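The "lowest return value wins" rule works because the action codes are ordered numerically with the most restrictive first. For reference, the values that accompany this series (a hedged copy; the authoritative list is the new <linux/seccomp.h>):

	#define SECCOMP_RET_KILL	0x00000000U	/* kill the task immediately */
	#define SECCOMP_RET_TRAP	0x00030000U	/* deliver SIGSYS */
	#define SECCOMP_RET_ERRNO	0x00050000U	/* fail the syscall with an errno */
	#define SECCOMP_RET_TRACE	0x7ff00000U	/* notify a ptrace tracer */
	#define SECCOMP_RET_ALLOW	0x7fff0000U	/* run the syscall */

	#define SECCOMP_RET_ACTION	0x7fff0000U	/* upper bits: the action */
	#define SECCOMP_RET_DATA	0x0000ffffU	/* lower 16 bits: filter data */

A KILL anywhere in the chain therefore overrides an ALLOW elsewhere, and the DATA bits of the winning result are what __secure_computing() later hands to userspace (as an errno, SIGSYS si_errno, or ptrace event message).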
220 | |||
221 | /** | ||
222 | * seccomp_attach_filter: Attaches a seccomp filter to current. | ||
223 | * @fprog: BPF program to install | ||
224 | * | ||
225 | * Returns 0 on success or an errno on failure. | ||
226 | */ | ||
227 | static long seccomp_attach_filter(struct sock_fprog *fprog) | ||
228 | { | ||
229 | struct seccomp_filter *filter; | ||
230 | unsigned long fp_size = fprog->len * sizeof(struct sock_filter); | ||
231 | unsigned long total_insns = fprog->len; | ||
232 | long ret; | ||
233 | |||
234 | if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) | ||
235 | return -EINVAL; | ||
236 | |||
237 | for (filter = current->seccomp.filter; filter; filter = filter->prev) | ||
238 | total_insns += filter->len + 4; /* include a 4 instr penalty */ | ||
239 | if (total_insns > MAX_INSNS_PER_PATH) | ||
240 | return -ENOMEM; | ||
241 | |||
242 | /* | ||
243 | * Installing a seccomp filter requires that the task have | ||
244 | * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. | ||
245 | * This avoids scenarios where unprivileged tasks can affect the | ||
246 | * behavior of privileged children. | ||
247 | */ | ||
248 | if (!current->no_new_privs && | ||
249 | security_capable_noaudit(current_cred(), current_user_ns(), | ||
250 | CAP_SYS_ADMIN) != 0) | ||
251 | return -EACCES; | ||
252 | |||
253 | /* Allocate a new seccomp_filter */ | ||
254 | filter = kzalloc(sizeof(struct seccomp_filter) + fp_size, | ||
255 | GFP_KERNEL|__GFP_NOWARN); | ||
256 | if (!filter) | ||
257 | return -ENOMEM; | ||
258 | atomic_set(&filter->usage, 1); | ||
259 | filter->len = fprog->len; | ||
260 | |||
261 | /* Copy the instructions from fprog. */ | ||
262 | ret = -EFAULT; | ||
263 | if (copy_from_user(filter->insns, fprog->filter, fp_size)) | ||
264 | goto fail; | ||
265 | |||
266 | /* Check and rewrite the fprog via the skb checker */ | ||
267 | ret = sk_chk_filter(filter->insns, filter->len); | ||
268 | if (ret) | ||
269 | goto fail; | ||
270 | |||
271 | /* Check and rewrite the fprog for seccomp use */ | ||
272 | ret = seccomp_check_filter(filter->insns, filter->len); | ||
273 | if (ret) | ||
274 | goto fail; | ||
275 | |||
276 | /* | ||
277 | * If there is an existing filter, make it the prev and don't drop its | ||
278 | * task reference. | ||
279 | */ | ||
280 | filter->prev = current->seccomp.filter; | ||
281 | current->seccomp.filter = filter; | ||
282 | return 0; | ||
283 | fail: | ||
284 | kfree(filter); | ||
285 | return ret; | ||
286 | } | ||
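A quick sanity check of the limit enforced above, assuming the usual 8-byte struct sock_filter (u16 code, u8 jt, u8 jf, u32 k):

	/*
	 * MAX_INSNS_PER_PATH = (1 << 18) / sizeof(struct sock_filter)
	 *                    = 262144 / 8 = 32768 instructions.
	 *
	 * Attaching a filter of length L on top of N existing filters of
	 * lengths L1..LN must satisfy
	 *	L + (L1 + 4) + ... + (LN + 4) <= 32768
	 * so a chain of minimal one-instruction filters tops out at roughly
	 * 32768 / 5 (about 6,500 attached filters), and a single program is
	 * additionally capped at BPF_MAXINSNS (4096) by the check above.
	 */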
287 | |||
288 | /** | ||
289 | * seccomp_attach_user_filter - attaches a user-supplied sock_fprog | ||
290 | * @user_filter: pointer to the user data containing a sock_fprog. | ||
291 | * | ||
292 | * Returns 0 on success and non-zero otherwise. | ||
293 | */ | ||
294 | long seccomp_attach_user_filter(char __user *user_filter) | ||
295 | { | ||
296 | struct sock_fprog fprog; | ||
297 | long ret = -EFAULT; | ||
298 | |||
299 | #ifdef CONFIG_COMPAT | ||
300 | if (is_compat_task()) { | ||
301 | struct compat_sock_fprog fprog32; | ||
302 | if (copy_from_user(&fprog32, user_filter, sizeof(fprog32))) | ||
303 | goto out; | ||
304 | fprog.len = fprog32.len; | ||
305 | fprog.filter = compat_ptr(fprog32.filter); | ||
306 | } else /* falls through to the if below. */ | ||
307 | #endif | ||
308 | if (copy_from_user(&fprog, user_filter, sizeof(fprog))) | ||
309 | goto out; | ||
310 | ret = seccomp_attach_filter(&fprog); | ||
311 | out: | ||
312 | return ret; | ||
313 | } | ||
314 | |||
315 | /* get_seccomp_filter - increments the reference count of the filter on @tsk */ | ||
316 | void get_seccomp_filter(struct task_struct *tsk) | ||
317 | { | ||
318 | struct seccomp_filter *orig = tsk->seccomp.filter; | ||
319 | if (!orig) | ||
320 | return; | ||
321 | /* Reference count is bounded by the number of total processes. */ | ||
322 | atomic_inc(&orig->usage); | ||
323 | } | ||
324 | |||
325 | /* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ | ||
326 | void put_seccomp_filter(struct task_struct *tsk) | ||
327 | { | ||
328 | struct seccomp_filter *orig = tsk->seccomp.filter; | ||
329 | /* Clean up single-reference branches iteratively. */ | ||
330 | while (orig && atomic_dec_and_test(&orig->usage)) { | ||
331 | struct seccomp_filter *freeme = orig; | ||
332 | orig = orig->prev; | ||
333 | kfree(freeme); | ||
334 | } | ||
335 | } | ||
336 | |||
337 | /** | ||
338 | * seccomp_send_sigsys - signals the task to allow in-process syscall emulation | ||
339 | * @syscall: syscall number to send to userland | ||
340 | * @reason: filter-supplied reason code to send to userland (via si_errno) | ||
341 | * | ||
342 | * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info. | ||
343 | */ | ||
344 | static void seccomp_send_sigsys(int syscall, int reason) | ||
345 | { | ||
346 | struct siginfo info; | ||
347 | memset(&info, 0, sizeof(info)); | ||
348 | info.si_signo = SIGSYS; | ||
349 | info.si_code = SYS_SECCOMP; | ||
350 | info.si_call_addr = (void __user *)KSTK_EIP(current); | ||
351 | info.si_errno = reason; | ||
352 | info.si_arch = syscall_get_arch(current, task_pt_regs(current)); | ||
353 | info.si_syscall = syscall; | ||
354 | force_sig_info(SIGSYS, &info, current); | ||
355 | } | ||
356 | #endif /* CONFIG_SECCOMP_FILTER */ | ||
16 | 357 | ||
17 | /* | 358 | /* |
18 | * Secure computing mode 1 allows only read/write/exit/sigreturn. | 359 | * Secure computing mode 1 allows only read/write/exit/sigreturn. |
@@ -31,13 +372,15 @@ static int mode1_syscalls_32[] = { | |||
31 | }; | 372 | }; |
32 | #endif | 373 | #endif |
33 | 374 | ||
34 | void __secure_computing(int this_syscall) | 375 | int __secure_computing(int this_syscall) |
35 | { | 376 | { |
36 | int mode = current->seccomp.mode; | 377 | int mode = current->seccomp.mode; |
37 | int * syscall; | 378 | int exit_sig = 0; |
379 | int *syscall; | ||
380 | u32 ret; | ||
38 | 381 | ||
39 | switch (mode) { | 382 | switch (mode) { |
40 | case 1: | 383 | case SECCOMP_MODE_STRICT: |
41 | syscall = mode1_syscalls; | 384 | syscall = mode1_syscalls; |
42 | #ifdef CONFIG_COMPAT | 385 | #ifdef CONFIG_COMPAT |
43 | if (is_compat_task()) | 386 | if (is_compat_task()) |
@@ -45,9 +388,54 @@ void __secure_computing(int this_syscall) | |||
45 | #endif | 388 | #endif |
46 | do { | 389 | do { |
47 | if (*syscall == this_syscall) | 390 | if (*syscall == this_syscall) |
48 | return; | 391 | return 0; |
49 | } while (*++syscall); | 392 | } while (*++syscall); |
393 | exit_sig = SIGKILL; | ||
394 | ret = SECCOMP_RET_KILL; | ||
395 | break; | ||
396 | #ifdef CONFIG_SECCOMP_FILTER | ||
397 | case SECCOMP_MODE_FILTER: { | ||
398 | int data; | ||
399 | ret = seccomp_run_filters(this_syscall); | ||
400 | data = ret & SECCOMP_RET_DATA; | ||
401 | ret &= SECCOMP_RET_ACTION; | ||
402 | switch (ret) { | ||
403 | case SECCOMP_RET_ERRNO: | ||
404 | /* Set the low-order 16-bits as a errno. */ | ||
405 | syscall_set_return_value(current, task_pt_regs(current), | ||
406 | -data, 0); | ||
407 | goto skip; | ||
408 | case SECCOMP_RET_TRAP: | ||
409 | /* Show the handler the original registers. */ | ||
410 | syscall_rollback(current, task_pt_regs(current)); | ||
411 | /* Let the filter pass back 16 bits of data. */ | ||
412 | seccomp_send_sigsys(this_syscall, data); | ||
413 | goto skip; | ||
414 | case SECCOMP_RET_TRACE: | ||
415 | /* Skip these calls if there is no tracer. */ | ||
416 | if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) | ||
417 | goto skip; | ||
418 | /* Allow the BPF to provide the event message */ | ||
419 | ptrace_event(PTRACE_EVENT_SECCOMP, data); | ||
420 | /* | ||
421 | * The delivery of a fatal signal during event | ||
422 | * notification may silently skip tracer notification. | ||
423 | * Terminating the task now avoids executing a system | ||
424 | * call that may not be intended. | ||
425 | */ | ||
426 | if (fatal_signal_pending(current)) | ||
427 | break; | ||
428 | return 0; | ||
429 | case SECCOMP_RET_ALLOW: | ||
430 | return 0; | ||
431 | case SECCOMP_RET_KILL: | ||
432 | default: | ||
433 | break; | ||
434 | } | ||
435 | exit_sig = SIGSYS; | ||
50 | break; | 436 | break; |
437 | } | ||
438 | #endif | ||
51 | default: | 439 | default: |
52 | BUG(); | 440 | BUG(); |
53 | } | 441 | } |
@@ -55,8 +443,13 @@ void __secure_computing(int this_syscall) | |||
55 | #ifdef SECCOMP_DEBUG | 443 | #ifdef SECCOMP_DEBUG |
56 | dump_stack(); | 444 | dump_stack(); |
57 | #endif | 445 | #endif |
58 | audit_seccomp(this_syscall); | 446 | audit_seccomp(this_syscall, exit_sig, ret); |
59 | do_exit(SIGKILL); | 447 | do_exit(exit_sig); |
448 | #ifdef CONFIG_SECCOMP_FILTER | ||
449 | skip: | ||
450 | audit_seccomp(this_syscall, exit_sig, ret); | ||
451 | #endif | ||
452 | return -1; | ||
60 | } | 453 | } |
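__secure_computing() now returns an int so that actions which divert a syscall rather than kill the task (ERRNO, TRAP, TRACE without a tracer) can tell the caller to skip it. A hedged sketch of the caller-side contract, assuming the secure_computing() wrapper in <linux/seccomp.h> that invokes __secure_computing() only when TIF_SECCOMP is set; the real call sites are the architectures' syscall-entry paths:

	/* Hedged sketch; real code lives in arch syscall-entry/trace hooks. */
	static void syscall_entry_sketch(int syscall_nr)
	{
		if (secure_computing(syscall_nr))
			return;	/* -1: a filter already handled the call
				 * (errno set, SIGSYS sent, or trace skip);
				 * do not execute the syscall */

		/* 0: run the syscall normally.  SECCOMP_RET_KILL never
		 * returns here because __secure_computing() calls do_exit(). */
	}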
61 | 454 | ||
62 | long prctl_get_seccomp(void) | 455 | long prctl_get_seccomp(void) |
@@ -64,25 +457,48 @@ long prctl_get_seccomp(void) | |||
64 | return current->seccomp.mode; | 457 | return current->seccomp.mode; |
65 | } | 458 | } |
66 | 459 | ||
67 | long prctl_set_seccomp(unsigned long seccomp_mode) | 460 | /** |
461 | * prctl_set_seccomp: configures current->seccomp.mode | ||
462 | * @seccomp_mode: requested mode to use | ||
463 | * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER | ||
464 | * | ||
465 | * This function may be called repeatedly with a @seccomp_mode of | ||
466 | * SECCOMP_MODE_FILTER to install additional filters. Every filter | ||
467 | * successfully installed will be evaluated (in reverse order) for each system | ||
468 | * call the task makes. | ||
469 | * | ||
470 | * Once current->seccomp.mode is non-zero, it may not be changed. | ||
471 | * | ||
472 | * Returns 0 on success or -EINVAL on failure. | ||
473 | */ | ||
474 | long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) | ||
68 | { | 475 | { |
69 | long ret; | 476 | long ret = -EINVAL; |
70 | 477 | ||
71 | /* can set it only once to be even more secure */ | 478 | if (current->seccomp.mode && |
72 | ret = -EPERM; | 479 | current->seccomp.mode != seccomp_mode) |
73 | if (unlikely(current->seccomp.mode)) | ||
74 | goto out; | 480 | goto out; |
75 | 481 | ||
76 | ret = -EINVAL; | 482 | switch (seccomp_mode) { |
77 | if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { | 483 | case SECCOMP_MODE_STRICT: |
78 | current->seccomp.mode = seccomp_mode; | 484 | ret = 0; |
79 | set_thread_flag(TIF_SECCOMP); | ||
80 | #ifdef TIF_NOTSC | 485 | #ifdef TIF_NOTSC |
81 | disable_TSC(); | 486 | disable_TSC(); |
82 | #endif | 487 | #endif |
83 | ret = 0; | 488 | break; |
489 | #ifdef CONFIG_SECCOMP_FILTER | ||
490 | case SECCOMP_MODE_FILTER: | ||
491 | ret = seccomp_attach_user_filter(filter); | ||
492 | if (ret) | ||
493 | goto out; | ||
494 | break; | ||
495 | #endif | ||
496 | default: | ||
497 | goto out; | ||
84 | } | 498 | } |
85 | 499 | ||
86 | out: | 500 | current->seccomp.mode = seccomp_mode; |
501 | set_thread_flag(TIF_SECCOMP); | ||
502 | out: | ||
87 | return ret; | 503 | return ret; |
88 | } | 504 | } |
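From a process's point of view the pieces above combine into a three-step recipe: build a BPF program against struct seccomp_data, give up the ability to regain privileges, and install the program with prctl(). A hedged userspace sketch; PR_SET_NO_NEW_PRIVS and SECCOMP_MODE_FILTER are taken from the prctl/seccomp headers that accompany this series, and the insns/len pair would come from a sock_filter array such as the one shown earlier:

	#include <stdio.h>
	#include <linux/filter.h>
	#include <linux/seccomp.h>
	#include <sys/prctl.h>

	static int install_filter(struct sock_filter *insns, unsigned short len)
	{
		struct sock_fprog prog = {
			.len	= len,
			.filter	= insns,
		};

		/* Without CAP_SYS_ADMIN, seccomp_attach_filter() only accepts
		 * the program once the task has set no_new_privs. */
		if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
			perror("prctl(PR_SET_NO_NEW_PRIVS)");
			return -1;
		}
		if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
			perror("prctl(PR_SET_SECCOMP)");
			return -1;
		}
		return 0;	/* every later syscall now runs the filter chain */
	}

Calling install_filter() again stacks another program on top of the first; as prctl_set_seccomp()'s comment says, every installed filter is evaluated, and the most restrictive result wins.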
diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 60636a4e25c..4567fc020fe 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c | |||
@@ -118,7 +118,7 @@ EXPORT_SYMBOL(down_killable); | |||
118 | * down_trylock - try to acquire the semaphore, without waiting | 118 | * down_trylock - try to acquire the semaphore, without waiting |
119 | * @sem: the semaphore to be acquired | 119 | * @sem: the semaphore to be acquired |
120 | * | 120 | * |
121 | * Try to acquire the semaphore atomically. Returns 0 if the mutex has | 121 | * Try to acquire the semaphore atomically. Returns 0 if the semaphore has |
122 | * been acquired successfully or 1 if it it cannot be acquired. | 122 | * been acquired successfully or 1 if it it cannot be acquired. |
123 | * | 123 | * |
124 | * NOTE: This return value is inverted from both spin_trylock and | 124 | * NOTE: This return value is inverted from both spin_trylock and |
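The comment fixed above also documents down_trylock()'s inverted return convention (0 on success, 1 on failure, the opposite of spin_trylock() and mutex_trylock()). A small hedged call-site sketch; the semaphore name is hypothetical:

	#include <linux/errno.h>
	#include <linux/semaphore.h>

	static DEFINE_SEMAPHORE(example_sem);	/* hypothetical binary semaphore */

	static int try_do_work(void)
	{
		/* Non-zero means the semaphore was NOT acquired. */
		if (down_trylock(&example_sem))
			return -EBUSY;

		/* ... critical section ... */

		up(&example_sem);
		return 0;
	}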
diff --git a/kernel/signal.c b/kernel/signal.c index 60d80ab2601..f7b41821763 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -161,7 +161,7 @@ void recalc_sigpending(void) | |||
161 | 161 | ||
162 | #define SYNCHRONOUS_MASK \ | 162 | #define SYNCHRONOUS_MASK \ |
163 | (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ | 163 | (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ |
164 | sigmask(SIGTRAP) | sigmask(SIGFPE)) | 164 | sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS)) |
165 | 165 | ||
166 | int next_signal(struct sigpending *pending, sigset_t *mask) | 166 | int next_signal(struct sigpending *pending, sigset_t *mask) |
167 | { | 167 | { |
@@ -768,14 +768,13 @@ static int kill_ok_by_cred(struct task_struct *t) | |||
768 | const struct cred *cred = current_cred(); | 768 | const struct cred *cred = current_cred(); |
769 | const struct cred *tcred = __task_cred(t); | 769 | const struct cred *tcred = __task_cred(t); |
770 | 770 | ||
771 | if (cred->user->user_ns == tcred->user->user_ns && | 771 | if (uid_eq(cred->euid, tcred->suid) || |
772 | (cred->euid == tcred->suid || | 772 | uid_eq(cred->euid, tcred->uid) || |
773 | cred->euid == tcred->uid || | 773 | uid_eq(cred->uid, tcred->suid) || |
774 | cred->uid == tcred->suid || | 774 | uid_eq(cred->uid, tcred->uid)) |
775 | cred->uid == tcred->uid)) | ||
776 | return 1; | 775 | return 1; |
777 | 776 | ||
778 | if (ns_capable(tcred->user->user_ns, CAP_KILL)) | 777 | if (ns_capable(tcred->user_ns, CAP_KILL)) |
779 | return 1; | 778 | return 1; |
780 | 779 | ||
781 | return 0; | 780 | return 0; |
@@ -1021,15 +1020,6 @@ static inline int legacy_queue(struct sigpending *signals, int sig) | |||
1021 | return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); | 1020 | return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); |
1022 | } | 1021 | } |
1023 | 1022 | ||
1024 | /* | ||
1025 | * map the uid in struct cred into user namespace *ns | ||
1026 | */ | ||
1027 | static inline uid_t map_cred_ns(const struct cred *cred, | ||
1028 | struct user_namespace *ns) | ||
1029 | { | ||
1030 | return user_ns_map_uid(ns, cred, cred->uid); | ||
1031 | } | ||
1032 | |||
1033 | #ifdef CONFIG_USER_NS | 1023 | #ifdef CONFIG_USER_NS |
1034 | static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) | 1024 | static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) |
1035 | { | 1025 | { |
@@ -1039,8 +1029,10 @@ static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_str | |||
1039 | if (SI_FROMKERNEL(info)) | 1029 | if (SI_FROMKERNEL(info)) |
1040 | return; | 1030 | return; |
1041 | 1031 | ||
1042 | info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns), | 1032 | rcu_read_lock(); |
1043 | current_cred(), info->si_uid); | 1033 | info->si_uid = from_kuid_munged(task_cred_xxx(t, user_ns), |
1034 | make_kuid(current_user_ns(), info->si_uid)); | ||
1035 | rcu_read_unlock(); | ||
1044 | } | 1036 | } |
1045 | #else | 1037 | #else |
1046 | static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) | 1038 | static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) |
@@ -1107,7 +1099,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
1107 | q->info.si_code = SI_USER; | 1099 | q->info.si_code = SI_USER; |
1108 | q->info.si_pid = task_tgid_nr_ns(current, | 1100 | q->info.si_pid = task_tgid_nr_ns(current, |
1109 | task_active_pid_ns(t)); | 1101 | task_active_pid_ns(t)); |
1110 | q->info.si_uid = current_uid(); | 1102 | q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); |
1111 | break; | 1103 | break; |
1112 | case (unsigned long) SEND_SIG_PRIV: | 1104 | case (unsigned long) SEND_SIG_PRIV: |
1113 | q->info.si_signo = sig; | 1105 | q->info.si_signo = sig; |
@@ -1388,10 +1380,8 @@ static int kill_as_cred_perm(const struct cred *cred, | |||
1388 | struct task_struct *target) | 1380 | struct task_struct *target) |
1389 | { | 1381 | { |
1390 | const struct cred *pcred = __task_cred(target); | 1382 | const struct cred *pcred = __task_cred(target); |
1391 | if (cred->user_ns != pcred->user_ns) | 1383 | if (!uid_eq(cred->euid, pcred->suid) && !uid_eq(cred->euid, pcred->uid) && |
1392 | return 0; | 1384 | !uid_eq(cred->uid, pcred->suid) && !uid_eq(cred->uid, pcred->uid)) |
1393 | if (cred->euid != pcred->suid && cred->euid != pcred->uid && | ||
1394 | cred->uid != pcred->suid && cred->uid != pcred->uid) | ||
1395 | return 0; | 1385 | return 0; |
1396 | return 1; | 1386 | return 1; |
1397 | } | 1387 | } |
@@ -1679,8 +1669,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig) | |||
1679 | */ | 1669 | */ |
1680 | rcu_read_lock(); | 1670 | rcu_read_lock(); |
1681 | info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); | 1671 | info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); |
1682 | info.si_uid = map_cred_ns(__task_cred(tsk), | 1672 | info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns), |
1683 | task_cred_xxx(tsk->parent, user_ns)); | 1673 | task_uid(tsk)); |
1684 | rcu_read_unlock(); | 1674 | rcu_read_unlock(); |
1685 | 1675 | ||
1686 | info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); | 1676 | info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); |
@@ -1763,8 +1753,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, | |||
1763 | */ | 1753 | */ |
1764 | rcu_read_lock(); | 1754 | rcu_read_lock(); |
1765 | info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); | 1755 | info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); |
1766 | info.si_uid = map_cred_ns(__task_cred(tsk), | 1756 | info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); |
1767 | task_cred_xxx(parent, user_ns)); | ||
1768 | rcu_read_unlock(); | 1757 | rcu_read_unlock(); |
1769 | 1758 | ||
1770 | info.si_utime = cputime_to_clock_t(tsk->utime); | 1759 | info.si_utime = cputime_to_clock_t(tsk->utime); |
@@ -1974,7 +1963,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why) | |||
1974 | info.si_signo = signr; | 1963 | info.si_signo = signr; |
1975 | info.si_code = exit_code; | 1964 | info.si_code = exit_code; |
1976 | info.si_pid = task_pid_vnr(current); | 1965 | info.si_pid = task_pid_vnr(current); |
1977 | info.si_uid = current_uid(); | 1966 | info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); |
1978 | 1967 | ||
1979 | /* Let the debugger run. */ | 1968 | /* Let the debugger run. */ |
1980 | ptrace_stop(exit_code, why, 1, &info); | 1969 | ptrace_stop(exit_code, why, 1, &info); |
@@ -2182,8 +2171,8 @@ static int ptrace_signal(int signr, siginfo_t *info, | |||
2182 | info->si_code = SI_USER; | 2171 | info->si_code = SI_USER; |
2183 | rcu_read_lock(); | 2172 | rcu_read_lock(); |
2184 | info->si_pid = task_pid_vnr(current->parent); | 2173 | info->si_pid = task_pid_vnr(current->parent); |
2185 | info->si_uid = map_cred_ns(__task_cred(current->parent), | 2174 | info->si_uid = from_kuid_munged(current_user_ns(), |
2186 | current_user_ns()); | 2175 | task_uid(current->parent)); |
2187 | rcu_read_unlock(); | 2176 | rcu_read_unlock(); |
2188 | } | 2177 | } |
2189 | 2178 | ||
@@ -2710,6 +2699,13 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) | |||
2710 | err |= __put_user(from->si_uid, &to->si_uid); | 2699 | err |= __put_user(from->si_uid, &to->si_uid); |
2711 | err |= __put_user(from->si_ptr, &to->si_ptr); | 2700 | err |= __put_user(from->si_ptr, &to->si_ptr); |
2712 | break; | 2701 | break; |
2702 | #ifdef __ARCH_SIGSYS | ||
2703 | case __SI_SYS: | ||
2704 | err |= __put_user(from->si_call_addr, &to->si_call_addr); | ||
2705 | err |= __put_user(from->si_syscall, &to->si_syscall); | ||
2706 | err |= __put_user(from->si_arch, &to->si_arch); | ||
2707 | break; | ||
2708 | #endif | ||
2713 | default: /* this is just in case for now ... */ | 2709 | default: /* this is just in case for now ... */ |
2714 | err |= __put_user(from->si_pid, &to->si_pid); | 2710 | err |= __put_user(from->si_pid, &to->si_pid); |
2715 | err |= __put_user(from->si_uid, &to->si_uid); | 2711 | err |= __put_user(from->si_uid, &to->si_uid); |
@@ -2832,7 +2828,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) | |||
2832 | info.si_errno = 0; | 2828 | info.si_errno = 0; |
2833 | info.si_code = SI_USER; | 2829 | info.si_code = SI_USER; |
2834 | info.si_pid = task_tgid_vnr(current); | 2830 | info.si_pid = task_tgid_vnr(current); |
2835 | info.si_uid = current_uid(); | 2831 | info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); |
2836 | 2832 | ||
2837 | return kill_something_info(sig, &info, pid); | 2833 | return kill_something_info(sig, &info, pid); |
2838 | } | 2834 | } |
@@ -2875,7 +2871,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig) | |||
2875 | info.si_errno = 0; | 2871 | info.si_errno = 0; |
2876 | info.si_code = SI_TKILL; | 2872 | info.si_code = SI_TKILL; |
2877 | info.si_pid = task_tgid_vnr(current); | 2873 | info.si_pid = task_tgid_vnr(current); |
2878 | info.si_uid = current_uid(); | 2874 | info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); |
2879 | 2875 | ||
2880 | return do_send_specific(tgid, pid, sig, &info); | 2876 | return do_send_specific(tgid, pid, sig, &info); |
2881 | } | 2877 | } |
@@ -3240,6 +3236,21 @@ SYSCALL_DEFINE0(pause) | |||
3240 | 3236 | ||
3241 | #endif | 3237 | #endif |
3242 | 3238 | ||
3239 | #ifdef HAVE_SET_RESTORE_SIGMASK | ||
3240 | int sigsuspend(sigset_t *set) | ||
3241 | { | ||
3242 | sigdelsetmask(set, sigmask(SIGKILL)|sigmask(SIGSTOP)); | ||
3243 | |||
3244 | current->saved_sigmask = current->blocked; | ||
3245 | set_current_blocked(set); | ||
3246 | |||
3247 | current->state = TASK_INTERRUPTIBLE; | ||
3248 | schedule(); | ||
3249 | set_restore_sigmask(); | ||
3250 | return -ERESTARTNOHAND; | ||
3251 | } | ||
3252 | #endif | ||
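The new sigsuspend() helper factors the mask-swap-and-sleep sequence out of the individual callers. A hedged sketch of how a legacy, old-style sigsuspend entry point could be written entirely on top of it (the three-argument prototype and the unused history arguments are illustrative; real architectures differ):

#include <linux/signal.h>
#include <linux/syscalls.h>

/*
 * Illustrative only: an old-style sigsuspend that takes a bare mask can now
 * delegate all of the blocking logic to the shared sigsuspend() helper.
 */
SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask)
{
	sigset_t blocked;

	siginitset(&blocked, mask);	/* expand the legacy mask into a sigset_t */
	return sigsuspend(&blocked);	/* sleeps; -ERESTARTNOHAND once a signal arrives */
}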
3253 | |||
3243 | #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND | 3254 | #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND |
3244 | /** | 3255 | /** |
3245 | * sys_rt_sigsuspend - replace the signal mask for a value with the | 3256 | * sys_rt_sigsuspend - replace the signal mask for a value with the |
@@ -3257,15 +3268,7 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) | |||
3257 | 3268 | ||
3258 | if (copy_from_user(&newset, unewset, sizeof(newset))) | 3269 | if (copy_from_user(&newset, unewset, sizeof(newset))) |
3259 | return -EFAULT; | 3270 | return -EFAULT; |
3260 | sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); | 3271 | return sigsuspend(&newset); |
3261 | |||
3262 | current->saved_sigmask = current->blocked; | ||
3263 | set_current_blocked(&newset); | ||
3264 | |||
3265 | current->state = TASK_INTERRUPTIBLE; | ||
3266 | schedule(); | ||
3267 | set_restore_sigmask(); | ||
3268 | return -ERESTARTNOHAND; | ||
3269 | } | 3272 | } |
3270 | #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ | 3273 | #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ |
3271 | 3274 | ||
diff --git a/kernel/smp.c b/kernel/smp.c index 2f8b10ecf75..d0ae5b24875 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -13,6 +13,8 @@ | |||
13 | #include <linux/smp.h> | 13 | #include <linux/smp.h> |
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | 15 | ||
16 | #include "smpboot.h" | ||
17 | |||
16 | #ifdef CONFIG_USE_GENERIC_SMP_HELPERS | 18 | #ifdef CONFIG_USE_GENERIC_SMP_HELPERS |
17 | static struct { | 19 | static struct { |
18 | struct list_head queue; | 20 | struct list_head queue; |
@@ -669,6 +671,8 @@ void __init smp_init(void) | |||
669 | { | 671 | { |
670 | unsigned int cpu; | 672 | unsigned int cpu; |
671 | 673 | ||
674 | idle_threads_init(); | ||
675 | |||
672 | /* FIXME: This should be done in userspace --RR */ | 676 | /* FIXME: This should be done in userspace --RR */ |
673 | for_each_present_cpu(cpu) { | 677 | for_each_present_cpu(cpu) { |
674 | if (num_online_cpus() >= setup_max_cpus) | 678 | if (num_online_cpus() >= setup_max_cpus) |
@@ -791,3 +795,26 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), | |||
791 | } | 795 | } |
792 | } | 796 | } |
793 | EXPORT_SYMBOL(on_each_cpu_cond); | 797 | EXPORT_SYMBOL(on_each_cpu_cond); |
798 | |||
799 | static void do_nothing(void *unused) | ||
800 | { | ||
801 | } | ||
802 | |||
803 | /** | ||
804 | * kick_all_cpus_sync - Force all cpus out of idle | ||
805 | * | ||
806 | * Used to synchronize the update of pm_idle function pointer. It's | ||
807 | * called after the pointer is updated and returns after the dummy | ||
808 | * callback function has been executed on all cpus. The execution of | ||
809 | * the function can only happen on the remote cpus after they have | ||
810 | * left the idle function which had been called via pm_idle function | ||
811 | * pointer. So it's guaranteed that nothing uses the previous pointer | ||
812 | * anymore. | ||
813 | */ | ||
814 | void kick_all_cpus_sync(void) | ||
815 | { | ||
816 | /* Make sure the change is visible before we kick the cpus */ | ||
817 | smp_mb(); | ||
818 | smp_call_function(do_nothing, NULL, 1); | ||
819 | } | ||
820 | EXPORT_SYMBOL_GPL(kick_all_cpus_sync); | ||
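As the comment above describes, the intended pattern is publish-then-kick: update the idle-routine pointer first, then call kick_all_cpus_sync() so that no CPU can still be executing the old routine once the call returns. A minimal sketch under that assumption (set_idle_routine() is a hypothetical helper and pm_idle stands in for whatever pointer the architecture actually uses):

#include <linux/smp.h>

extern void (*pm_idle)(void);		/* arch-provided idle hook; the exact name varies */

/* hypothetical helper: publish the new idle routine, then flush out the old one */
static void set_idle_routine(void (*new_idle)(void))
{
	pm_idle = new_idle;		/* step 1: update the pointer */
	kick_all_cpus_sync();		/* step 2: returns only after every cpu has left old idle */
}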
diff --git a/kernel/smpboot.c b/kernel/smpboot.c new file mode 100644 index 00000000000..e1a797e028a --- /dev/null +++ b/kernel/smpboot.c | |||
@@ -0,0 +1,62 @@ | |||
1 | /* | ||
2 | * Common SMP CPU bringup/teardown functions | ||
3 | */ | ||
4 | #include <linux/err.h> | ||
5 | #include <linux/smp.h> | ||
6 | #include <linux/init.h> | ||
7 | #include <linux/sched.h> | ||
8 | #include <linux/percpu.h> | ||
9 | |||
10 | #include "smpboot.h" | ||
11 | |||
12 | #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD | ||
13 | /* | ||
14 | * For the hotplug case we keep the task structs around and reuse | ||
15 | * them. | ||
16 | */ | ||
17 | static DEFINE_PER_CPU(struct task_struct *, idle_threads); | ||
18 | |||
19 | struct task_struct * __cpuinit idle_thread_get(unsigned int cpu) | ||
20 | { | ||
21 | struct task_struct *tsk = per_cpu(idle_threads, cpu); | ||
22 | |||
23 | if (!tsk) | ||
24 | return ERR_PTR(-ENOMEM); | ||
25 | init_idle(tsk, cpu); | ||
26 | return tsk; | ||
27 | } | ||
28 | |||
29 | void __init idle_thread_set_boot_cpu(void) | ||
30 | { | ||
31 | per_cpu(idle_threads, smp_processor_id()) = current; | ||
32 | } | ||
33 | |||
34 | static inline void idle_init(unsigned int cpu) | ||
35 | { | ||
36 | struct task_struct *tsk = per_cpu(idle_threads, cpu); | ||
37 | |||
38 | if (!tsk) { | ||
39 | tsk = fork_idle(cpu); | ||
40 | if (IS_ERR(tsk)) | ||
41 | pr_err("SMP: fork_idle() failed for CPU %u\n", cpu); | ||
42 | else | ||
43 | per_cpu(idle_threads, cpu) = tsk; | ||
44 | } | ||
45 | } | ||
46 | |||
47 | /** | ||
48 | * idle_threads_init - Initialize the idle threads for all cpus | ||
49 | * (the boot cpu is handled separately by idle_thread_set_boot_cpu()) | ||
50 | * | ||
51 | * Creates the per-cpu idle threads if they do not already exist. | ||
52 | */ | ||
53 | void __init idle_threads_init(void) | ||
54 | { | ||
55 | unsigned int cpu; | ||
56 | |||
57 | for_each_possible_cpu(cpu) { | ||
58 | if (cpu != smp_processor_id()) | ||
59 | idle_init(cpu); | ||
60 | } | ||
61 | } | ||
62 | #endif | ||
diff --git a/kernel/smpboot.h b/kernel/smpboot.h new file mode 100644 index 00000000000..80c0acfb847 --- /dev/null +++ b/kernel/smpboot.h | |||
@@ -0,0 +1,18 @@ | |||
1 | #ifndef SMPBOOT_H | ||
2 | #define SMPBOOT_H | ||
3 | |||
4 | struct task_struct; | ||
5 | |||
6 | int smpboot_prepare(unsigned int cpu); | ||
7 | |||
8 | #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD | ||
9 | struct task_struct *idle_thread_get(unsigned int cpu); | ||
10 | void idle_thread_set_boot_cpu(void); | ||
11 | void idle_threads_init(void); | ||
12 | #else | ||
13 | static inline struct task_struct *idle_thread_get(unsigned int cpu) { return NULL; } | ||
14 | static inline void idle_thread_set_boot_cpu(void) { } | ||
15 | static inline void idle_threads_init(void) { } | ||
16 | #endif | ||
17 | |||
18 | #endif | ||
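A rough sketch of how a CPU-bringup path is expected to consume these helpers: the idle task is pre-forked once by idle_threads_init() at boot and later fetched with idle_thread_get(), instead of calling fork_idle() during hotplug. Here arch_bringup_cpu() is a hypothetical hook, not an interface defined by this series:

#include <linux/err.h>
#include <linux/sched.h>
#include "smpboot.h"

int arch_bringup_cpu(unsigned int cpu, struct task_struct *idle);	/* hypothetical hook */

static int bringup_cpu_sketch(unsigned int cpu)
{
	struct task_struct *idle = idle_thread_get(cpu);	/* cached task from idle_threads_init() */

	if (IS_ERR(idle))
		return PTR_ERR(idle);	/* fork_idle() failed at boot; nothing to hand over */

	return arch_bringup_cpu(cpu, idle);
}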
diff --git a/kernel/srcu.c b/kernel/srcu.c index ba35f3a4a1f..2095be3318d 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c | |||
@@ -34,10 +34,77 @@ | |||
34 | #include <linux/delay.h> | 34 | #include <linux/delay.h> |
35 | #include <linux/srcu.h> | 35 | #include <linux/srcu.h> |
36 | 36 | ||
37 | /* | ||
38 | * Initialize an rcu_batch structure to empty. | ||
39 | */ | ||
40 | static inline void rcu_batch_init(struct rcu_batch *b) | ||
41 | { | ||
42 | b->head = NULL; | ||
43 | b->tail = &b->head; | ||
44 | } | ||
45 | |||
46 | /* | ||
47 | * Enqueue a callback onto the tail of the specified rcu_batch structure. | ||
48 | */ | ||
49 | static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head) | ||
50 | { | ||
51 | *b->tail = head; | ||
52 | b->tail = &head->next; | ||
53 | } | ||
54 | |||
55 | /* | ||
56 | * Is the specified rcu_batch structure empty? | ||
57 | */ | ||
58 | static inline bool rcu_batch_empty(struct rcu_batch *b) | ||
59 | { | ||
60 | return b->tail == &b->head; | ||
61 | } | ||
62 | |||
63 | /* | ||
64 | * Remove the callback at the head of the specified rcu_batch structure | ||
65 | * and return a pointer to it, or return NULL if the structure is empty. | ||
66 | */ | ||
67 | static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b) | ||
68 | { | ||
69 | struct rcu_head *head; | ||
70 | |||
71 | if (rcu_batch_empty(b)) | ||
72 | return NULL; | ||
73 | |||
74 | head = b->head; | ||
75 | b->head = head->next; | ||
76 | if (b->tail == &head->next) | ||
77 | rcu_batch_init(b); | ||
78 | |||
79 | return head; | ||
80 | } | ||
81 | |||
82 | /* | ||
83 | * Move all callbacks from the rcu_batch structure specified by "from" to | ||
84 | * the structure specified by "to". | ||
85 | */ | ||
86 | static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) | ||
87 | { | ||
88 | if (!rcu_batch_empty(from)) { | ||
89 | *to->tail = from->head; | ||
90 | to->tail = from->tail; | ||
91 | rcu_batch_init(from); | ||
92 | } | ||
93 | } | ||
94 | |||
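The rcu_batch helpers above implement a singly linked queue with a tail pointer-to-pointer, which makes enqueue, dequeue and whole-list splicing O(1) without ever scanning the list. The same idea in standalone, userspace form (plain C, not kernel code, purely to illustrate the pointer manipulation):

#include <stdio.h>
#include <stddef.h>

struct node { struct node *next; int id; };
struct batch { struct node *head; struct node **tail; };

static void batch_init(struct batch *b) { b->head = NULL; b->tail = &b->head; }

static void batch_queue(struct batch *b, struct node *n)
{
	n->next = NULL;
	*b->tail = n;		/* link after the current last element (or at head) */
	b->tail = &n->next;	/* tail always points at the final ->next slot */
}

static struct node *batch_dequeue(struct batch *b)
{
	struct node *n = b->head;

	if (!n)
		return NULL;
	b->head = n->next;
	if (b->tail == &n->next)	/* removed the last element: reset to empty */
		batch_init(b);
	return n;
}

int main(void)
{
	struct batch b;
	struct node n1 = { .id = 1 }, n2 = { .id = 2 };
	struct node *n;

	batch_init(&b);
	batch_queue(&b, &n1);
	batch_queue(&b, &n2);
	while ((n = batch_dequeue(&b)) != NULL)
		printf("dequeued %d\n", n->id);
	return 0;
}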
95 | /* single-thread state-machine */ | ||
96 | static void process_srcu(struct work_struct *work); | ||
97 | |||
37 | static int init_srcu_struct_fields(struct srcu_struct *sp) | 98 | static int init_srcu_struct_fields(struct srcu_struct *sp) |
38 | { | 99 | { |
39 | sp->completed = 0; | 100 | sp->completed = 0; |
40 | mutex_init(&sp->mutex); | 101 | spin_lock_init(&sp->queue_lock); |
102 | sp->running = false; | ||
103 | rcu_batch_init(&sp->batch_queue); | ||
104 | rcu_batch_init(&sp->batch_check0); | ||
105 | rcu_batch_init(&sp->batch_check1); | ||
106 | rcu_batch_init(&sp->batch_done); | ||
107 | INIT_DELAYED_WORK(&sp->work, process_srcu); | ||
41 | sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); | 108 | sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); |
42 | return sp->per_cpu_ref ? 0 : -ENOMEM; | 109 | return sp->per_cpu_ref ? 0 : -ENOMEM; |
43 | } | 110 | } |
@@ -73,21 +140,116 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); | |||
73 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 140 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
74 | 141 | ||
75 | /* | 142 | /* |
76 | * srcu_readers_active_idx -- returns approximate number of readers | 143 | * Returns approximate total of the readers' ->seq[] values for the |
77 | * active on the specified rank of per-CPU counters. | 144 | * rank of per-CPU counters specified by idx. |
78 | */ | 145 | */ |
146 | static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx) | ||
147 | { | ||
148 | int cpu; | ||
149 | unsigned long sum = 0; | ||
150 | unsigned long t; | ||
79 | 151 | ||
80 | static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) | 152 | for_each_possible_cpu(cpu) { |
153 | t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]); | ||
154 | sum += t; | ||
155 | } | ||
156 | return sum; | ||
157 | } | ||
158 | |||
159 | /* | ||
160 | * Returns approximate number of readers active on the specified rank | ||
161 | * of the per-CPU ->c[] counters. | ||
162 | */ | ||
163 | static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx) | ||
81 | { | 164 | { |
82 | int cpu; | 165 | int cpu; |
83 | int sum; | 166 | unsigned long sum = 0; |
167 | unsigned long t; | ||
84 | 168 | ||
85 | sum = 0; | 169 | for_each_possible_cpu(cpu) { |
86 | for_each_possible_cpu(cpu) | 170 | t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); |
87 | sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]; | 171 | sum += t; |
172 | } | ||
88 | return sum; | 173 | return sum; |
89 | } | 174 | } |
90 | 175 | ||
176 | /* | ||
177 | * Return true if the number of pre-existing readers is determined to | ||
178 | * be stably zero. An example unstable zero can occur if the call | ||
179 | * to srcu_readers_active_idx() misses an __srcu_read_lock() increment, | ||
180 | * but due to task migration, sees the corresponding __srcu_read_unlock() | ||
181 | * decrement. This can happen because srcu_readers_active_idx() takes | ||
182 | * time to sum the array, and might in fact be interrupted or preempted | ||
183 | * partway through the summation. | ||
184 | */ | ||
185 | static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) | ||
186 | { | ||
187 | unsigned long seq; | ||
188 | |||
189 | seq = srcu_readers_seq_idx(sp, idx); | ||
190 | |||
191 | /* | ||
192 | * The following smp_mb() A pairs with the smp_mb() B located in | ||
193 | * __srcu_read_lock(). This pairing ensures that if an | ||
194 | * __srcu_read_lock() increments its counter after the summation | ||
195 | * in srcu_readers_active_idx(), then the corresponding SRCU read-side | ||
196 | * critical section will see any changes made prior to the start | ||
197 | * of the current SRCU grace period. | ||
198 | * | ||
199 | * Also, if the above call to srcu_readers_seq_idx() saw the | ||
200 | * increment of ->seq[], then the call to srcu_readers_active_idx() | ||
201 | * must see the increment of ->c[]. | ||
202 | */ | ||
203 | smp_mb(); /* A */ | ||
204 | |||
205 | /* | ||
206 | * Note that srcu_readers_active_idx() can incorrectly return | ||
207 | * zero even though there is a pre-existing reader throughout. | ||
208 | * To see this, suppose that task A is in a very long SRCU | ||
209 | * read-side critical section that started on CPU 0, and that | ||
210 | * no other reader exists, so that the sum of the counters | ||
211 | * is equal to one. Then suppose that task B starts executing | ||
212 | * srcu_readers_active_idx(), summing up to CPU 1, and then that | ||
213 | * task C starts reading on CPU 0, so that its increment is not | ||
214 | * summed, but finishes reading on CPU 2, so that its decrement | ||
215 | * -is- summed. Then when task B completes its sum, it will | ||
216 | * incorrectly get zero, despite the fact that task A has been | ||
217 | * in its SRCU read-side critical section the whole time. | ||
218 | * | ||
219 | * We therefore do a validation step should srcu_readers_active_idx() | ||
220 | * return zero. | ||
221 | */ | ||
222 | if (srcu_readers_active_idx(sp, idx) != 0) | ||
223 | return false; | ||
224 | |||
225 | /* | ||
226 | * The remainder of this function is the validation step. | ||
227 | * The following smp_mb() D pairs with the smp_mb() C in | ||
228 | * __srcu_read_unlock(). If the __srcu_read_unlock() was seen | ||
229 | * by srcu_readers_active_idx() above, then any destructive | ||
230 | * operation performed after the grace period will happen after | ||
231 | * the corresponding SRCU read-side critical section. | ||
232 | * | ||
233 | * Note that there can be at most NR_CPUS worth of readers using | ||
234 | * the old index, which is not enough to overflow even a 32-bit | ||
235 | * integer. (Yes, this does mean that systems having more than | ||
236 | * a billion or so CPUs need to be 64-bit systems.) Therefore, | ||
237 | * the sum of the ->seq[] counters cannot possibly overflow. | ||
238 | * Therefore, the only way that the return values of the two | ||
239 | * calls to srcu_readers_seq_idx() can be equal is if there were | ||
240 | * no increments of the corresponding rank of ->seq[] counts | ||
241 | * in the interim. But the missed-increment scenario laid out | ||
242 | * above includes an increment of the ->seq[] counter by | ||
243 | * the corresponding __srcu_read_lock(). Therefore, if this | ||
244 | * scenario occurs, the return values from the two calls to | ||
245 | * srcu_readers_seq_idx() will differ, and thus the validation | ||
246 | * step below suffices. | ||
247 | */ | ||
248 | smp_mb(); /* D */ | ||
249 | |||
250 | return srcu_readers_seq_idx(sp, idx) == seq; | ||
251 | } | ||
252 | |||
91 | /** | 253 | /** |
92 | * srcu_readers_active - returns approximate number of readers. | 254 | * srcu_readers_active - returns approximate number of readers. |
93 | * @sp: which srcu_struct to count active readers (holding srcu_read_lock). | 255 | * @sp: which srcu_struct to count active readers (holding srcu_read_lock). |
@@ -98,7 +260,14 @@ static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) | |||
98 | */ | 260 | */ |
99 | static int srcu_readers_active(struct srcu_struct *sp) | 261 | static int srcu_readers_active(struct srcu_struct *sp) |
100 | { | 262 | { |
101 | return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); | 263 | int cpu; |
264 | unsigned long sum = 0; | ||
265 | |||
266 | for_each_possible_cpu(cpu) { | ||
267 | sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]); | ||
268 | sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]); | ||
269 | } | ||
270 | return sum; | ||
102 | } | 271 | } |
103 | 272 | ||
104 | /** | 273 | /** |
@@ -131,10 +300,11 @@ int __srcu_read_lock(struct srcu_struct *sp) | |||
131 | int idx; | 300 | int idx; |
132 | 301 | ||
133 | preempt_disable(); | 302 | preempt_disable(); |
134 | idx = sp->completed & 0x1; | 303 | idx = rcu_dereference_index_check(sp->completed, |
135 | barrier(); /* ensure compiler looks -once- at sp->completed. */ | 304 | rcu_read_lock_sched_held()) & 0x1; |
136 | per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++; | 305 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; |
137 | srcu_barrier(); /* ensure compiler won't misorder critical section. */ | 306 | smp_mb(); /* B */ /* Avoid leaking the critical section. */ |
307 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; | ||
138 | preempt_enable(); | 308 | preempt_enable(); |
139 | return idx; | 309 | return idx; |
140 | } | 310 | } |
@@ -149,8 +319,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); | |||
149 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) | 319 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) |
150 | { | 320 | { |
151 | preempt_disable(); | 321 | preempt_disable(); |
152 | srcu_barrier(); /* ensure compiler won't misorder critical section. */ | 322 | smp_mb(); /* C */ /* Avoid leaking the critical section. */ |
153 | per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; | 323 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1; |
154 | preempt_enable(); | 324 | preempt_enable(); |
155 | } | 325 | } |
156 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); | 326 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); |
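For context, these primitives are normally reached through the srcu_read_lock()/srcu_read_unlock() wrappers, with synchronize_srcu() on the update side. A minimal usage sketch, assuming struct foo, global_foo and foo_srcu are made-up names and that foo_srcu has been set up with init_srcu_struct():

#include <linux/srcu.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo { int value; };

static struct srcu_struct foo_srcu;		/* init_srcu_struct(&foo_srcu) at init time */
static struct foo __rcu *global_foo;

static int foo_read(void)
{
	struct foo *p;
	int idx, v;

	idx = srcu_read_lock(&foo_srcu);		/* enter the read-side critical section */
	p = srcu_dereference(global_foo, &foo_srcu);
	v = p ? p->value : -1;
	srcu_read_unlock(&foo_srcu, idx);		/* must pass back the same idx */
	return v;
}

static void foo_update(struct foo *newp)		/* caller assumed to hold an update-side lock */
{
	struct foo *old = rcu_dereference_protected(global_foo, 1);

	rcu_assign_pointer(global_foo, newp);
	synchronize_srcu(&foo_srcu);			/* wait out all pre-existing readers */
	kfree(old);
}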
@@ -163,106 +333,119 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); | |||
163 | * we repeatedly block for 1-millisecond time periods. This approach | 333 | * we repeatedly block for 1-millisecond time periods. This approach |
164 | * has done well in testing, so there is no need for a config parameter. | 334 | * has done well in testing, so there is no need for a config parameter. |
165 | */ | 335 | */ |
166 | #define SYNCHRONIZE_SRCU_READER_DELAY 10 | 336 | #define SRCU_RETRY_CHECK_DELAY 5 |
337 | #define SYNCHRONIZE_SRCU_TRYCOUNT 2 | ||
338 | #define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12 | ||
167 | 339 | ||
168 | /* | 340 | /* |
169 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). | 341 | * @@@ Wait until all pre-existing readers complete. Such readers |
342 | * will have used the index specified by "idx". | ||
343 | * The caller should ensure that ->completed is not changed while checking | ||
344 | * and that idx = (->completed & 1) ^ 1. | ||
170 | */ | 345 | */ |
171 | static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | 346 | static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) |
172 | { | 347 | { |
173 | int idx; | 348 | for (;;) { |
174 | 349 | if (srcu_readers_active_idx_check(sp, idx)) | |
175 | rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && | 350 | return true; |
176 | !lock_is_held(&rcu_bh_lock_map) && | 351 | if (--trycount <= 0) |
177 | !lock_is_held(&rcu_lock_map) && | 352 | return false; |
178 | !lock_is_held(&rcu_sched_lock_map), | 353 | udelay(SRCU_RETRY_CHECK_DELAY); |
179 | "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); | 354 | } |
180 | 355 | } | |
181 | idx = sp->completed; | ||
182 | mutex_lock(&sp->mutex); | ||
183 | 356 | ||
184 | /* | 357 | /* |
185 | * Check to see if someone else did the work for us while we were | 358 | * Increment the ->completed counter so that future SRCU readers will |
186 | * waiting to acquire the lock. We need -two- advances of | 359 | * use the other rank of the ->c[] and ->seq[] arrays. This allows |
187 | * the counter, not just one. If there was but one, we might have | 360 | * us to wait for pre-existing readers in a starvation-free manner. |
188 | * shown up -after- our helper's first synchronize_sched(), thus | 361 | */ |
189 | * having failed to prevent CPU-reordering races with concurrent | 362 | static void srcu_flip(struct srcu_struct *sp) |
190 | * srcu_read_unlock()s on other CPUs (see comment below). So we | 363 | { |
191 | * either (1) wait for two or (2) supply the second ourselves. | 364 | sp->completed++; |
192 | */ | 365 | } |
193 | 366 | ||
194 | if ((sp->completed - idx) >= 2) { | 367 | /* |
195 | mutex_unlock(&sp->mutex); | 368 | * Enqueue an SRCU callback on the specified srcu_struct structure, |
196 | return; | 369 | * initiating grace-period processing if it is not already running. |
370 | */ | ||
371 | void call_srcu(struct srcu_struct *sp, struct rcu_head *head, | ||
372 | void (*func)(struct rcu_head *head)) | ||
373 | { | ||
374 | unsigned long flags; | ||
375 | |||
376 | head->next = NULL; | ||
377 | head->func = func; | ||
378 | spin_lock_irqsave(&sp->queue_lock, flags); | ||
379 | rcu_batch_queue(&sp->batch_queue, head); | ||
380 | if (!sp->running) { | ||
381 | sp->running = true; | ||
382 | queue_delayed_work(system_nrt_wq, &sp->work, 0); | ||
197 | } | 383 | } |
384 | spin_unlock_irqrestore(&sp->queue_lock, flags); | ||
385 | } | ||
386 | EXPORT_SYMBOL_GPL(call_srcu); | ||
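A short sketch of the intended call_srcu() usage: embed the rcu_head in the object being retired and free the object from the callback once a grace period has elapsed. struct foo and foo_srcu are illustrative names only:

#include <linux/kernel.h>
#include <linux/srcu.h>
#include <linux/slab.h>

struct foo {
	int value;
	struct rcu_head rcu;		/* embedded head handed to call_srcu() */
};

static struct srcu_struct foo_srcu;	/* assumed to be initialised elsewhere */

static void foo_free_cb(struct rcu_head *head)
{
	struct foo *p = container_of(head, struct foo, rcu);

	kfree(p);			/* runs only after a full SRCU grace period */
}

static void foo_retire(struct foo *p)
{
	call_srcu(&foo_srcu, &p->rcu, foo_free_cb);	/* queues the callback and returns immediately */
}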
198 | 387 | ||
199 | sync_func(); /* Force memory barrier on all CPUs. */ | 388 | struct rcu_synchronize { |
389 | struct rcu_head head; | ||
390 | struct completion completion; | ||
391 | }; | ||
200 | 392 | ||
201 | /* | 393 | /* |
202 | * The preceding synchronize_sched() ensures that any CPU that | 394 | * Awaken the corresponding synchronize_srcu() instance now that a |
203 | * sees the new value of sp->completed will also see any preceding | 395 | * grace period has elapsed. |
204 | * changes to data structures made by this CPU. This prevents | 396 | */ |
205 | * some other CPU from reordering the accesses in its SRCU | 397 | static void wakeme_after_rcu(struct rcu_head *head) |
206 | * read-side critical section to precede the corresponding | 398 | { |
207 | * srcu_read_lock() -- ensuring that such references will in | 399 | struct rcu_synchronize *rcu; |
208 | * fact be protected. | ||
209 | * | ||
210 | * So it is now safe to do the flip. | ||
211 | */ | ||
212 | 400 | ||
213 | idx = sp->completed & 0x1; | 401 | rcu = container_of(head, struct rcu_synchronize, head); |
214 | sp->completed++; | 402 | complete(&rcu->completion); |
403 | } | ||
215 | 404 | ||
216 | sync_func(); /* Force memory barrier on all CPUs. */ | 405 | static void srcu_advance_batches(struct srcu_struct *sp, int trycount); |
406 | static void srcu_reschedule(struct srcu_struct *sp); | ||
217 | 407 | ||
218 | /* | 408 | /* |
219 | * At this point, because of the preceding synchronize_sched(), | 409 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). |
220 | * all srcu_read_lock() calls using the old counters have completed. | 410 | */ |
221 | * Their corresponding critical sections might well be still | 411 | static void __synchronize_srcu(struct srcu_struct *sp, int trycount) |
222 | * executing, but the srcu_read_lock() primitives themselves | 412 | { |
223 | * will have finished executing. We initially give readers | 413 | struct rcu_synchronize rcu; |
224 | * an arbitrarily chosen 10 microseconds to get out of their | 414 | struct rcu_head *head = &rcu.head; |
225 | * SRCU read-side critical sections, then loop waiting 1/HZ | 415 | bool done = false; |
226 | * seconds per iteration. The 10-microsecond value has done | ||
227 | * very well in testing. | ||
228 | */ | ||
229 | |||
230 | if (srcu_readers_active_idx(sp, idx)) | ||
231 | udelay(SYNCHRONIZE_SRCU_READER_DELAY); | ||
232 | while (srcu_readers_active_idx(sp, idx)) | ||
233 | schedule_timeout_interruptible(1); | ||
234 | 416 | ||
235 | sync_func(); /* Force memory barrier on all CPUs. */ | 417 | rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && |
418 | !lock_is_held(&rcu_bh_lock_map) && | ||
419 | !lock_is_held(&rcu_lock_map) && | ||
420 | !lock_is_held(&rcu_sched_lock_map), | ||
421 | "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); | ||
236 | 422 | ||
237 | /* | 423 | init_completion(&rcu.completion); |
238 | * The preceding synchronize_sched() forces all srcu_read_unlock() | 424 | |
239 | * primitives that were executing concurrently with the preceding | 425 | head->next = NULL; |
240 | * for_each_possible_cpu() loop to have completed by this point. | 426 | head->func = wakeme_after_rcu; |
241 | * More importantly, it also forces the corresponding SRCU read-side | 427 | spin_lock_irq(&sp->queue_lock); |
242 | * critical sections to have also completed, and the corresponding | 428 | if (!sp->running) { |
243 | * references to SRCU-protected data items to be dropped. | 429 | /* steal the processing owner */ |
244 | * | 430 | sp->running = true; |
245 | * Note: | 431 | rcu_batch_queue(&sp->batch_check0, head); |
246 | * | 432 | spin_unlock_irq(&sp->queue_lock); |
247 | * Despite what you might think at first glance, the | 433 | |
248 | * preceding synchronize_sched() -must- be within the | 434 | srcu_advance_batches(sp, trycount); |
249 | * critical section ended by the following mutex_unlock(). | 435 | if (!rcu_batch_empty(&sp->batch_done)) { |
250 | * Otherwise, a task taking the early exit can race | 436 | BUG_ON(sp->batch_done.head != head); |
251 | * with a srcu_read_unlock(), which might have executed | 437 | rcu_batch_dequeue(&sp->batch_done); |
252 | * just before the preceding srcu_readers_active() check, | 438 | done = true; |
253 | * and whose CPU might have reordered the srcu_read_unlock() | 439 | } |
254 | * with the preceding critical section. In this case, there | 440 | /* give the processing owner to work_struct */ |
255 | * is nothing preventing the synchronize_sched() task that is | 441 | srcu_reschedule(sp); |
256 | * taking the early exit from freeing a data structure that | 442 | } else { |
257 | * is still being referenced (out of order) by the task | 443 | rcu_batch_queue(&sp->batch_queue, head); |
258 | * doing the srcu_read_unlock(). | 444 | spin_unlock_irq(&sp->queue_lock); |
259 | * | 445 | } |
260 | * Alternatively, the comparison with "2" on the early exit | ||
261 | * could be changed to "3", but this increases synchronize_srcu() | ||
262 | * latency for bulk loads. So the current code is preferred. | ||
263 | */ | ||
264 | 446 | ||
265 | mutex_unlock(&sp->mutex); | 447 | if (!done) |
448 | wait_for_completion(&rcu.completion); | ||
266 | } | 449 | } |
267 | 450 | ||
268 | /** | 451 | /** |
@@ -281,7 +464,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | |||
281 | */ | 464 | */ |
282 | void synchronize_srcu(struct srcu_struct *sp) | 465 | void synchronize_srcu(struct srcu_struct *sp) |
283 | { | 466 | { |
284 | __synchronize_srcu(sp, synchronize_sched); | 467 | __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT); |
285 | } | 468 | } |
286 | EXPORT_SYMBOL_GPL(synchronize_srcu); | 469 | EXPORT_SYMBOL_GPL(synchronize_srcu); |
287 | 470 | ||
@@ -289,18 +472,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); | |||
289 | * synchronize_srcu_expedited - Brute-force SRCU grace period | 472 | * synchronize_srcu_expedited - Brute-force SRCU grace period |
290 | * @sp: srcu_struct with which to synchronize. | 473 | * @sp: srcu_struct with which to synchronize. |
291 | * | 474 | * |
292 | * Wait for an SRCU grace period to elapse, but use a "big hammer" | 475 | * Wait for an SRCU grace period to elapse, but be more aggressive about |
293 | * approach to force the grace period to end quickly. This consumes | 476 | * spinning rather than blocking when waiting. |
294 | * significant time on all CPUs and is unfriendly to real-time workloads, | ||
295 | * so is thus not recommended for any sort of common-case code. In fact, | ||
296 | * if you are using synchronize_srcu_expedited() in a loop, please | ||
297 | * restructure your code to batch your updates, and then use a single | ||
298 | * synchronize_srcu() instead. | ||
299 | * | 477 | * |
300 | * Note that it is illegal to call this function while holding any lock | 478 | * Note that it is illegal to call this function while holding any lock |
301 | * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal | 479 | * that is acquired by a CPU-hotplug notifier. It is also illegal to call |
302 | * to call this function from a CPU-hotplug notifier. Failing to observe | ||
303 | * these restriction will result in deadlock. It is also illegal to call | ||
304 | * synchronize_srcu_expedited() from the corresponding SRCU read-side | 480 | * synchronize_srcu_expedited() from the corresponding SRCU read-side |
305 | * critical section; doing so will result in deadlock. However, it is | 481 | * critical section; doing so will result in deadlock. However, it is |
306 | * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct | 482 | * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct |
@@ -309,20 +485,166 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); | |||
309 | */ | 485 | */ |
310 | void synchronize_srcu_expedited(struct srcu_struct *sp) | 486 | void synchronize_srcu_expedited(struct srcu_struct *sp) |
311 | { | 487 | { |
312 | __synchronize_srcu(sp, synchronize_sched_expedited); | 488 | __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT); |
313 | } | 489 | } |
314 | EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); | 490 | EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); |
315 | 491 | ||
316 | /** | 492 | /** |
493 | * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. | ||
494 | */ | ||
495 | void srcu_barrier(struct srcu_struct *sp) | ||
496 | { | ||
497 | synchronize_srcu(sp); | ||
498 | } | ||
499 | EXPORT_SYMBOL_GPL(srcu_barrier); | ||
500 | |||
501 | /** | ||
317 | * srcu_batches_completed - return batches completed. | 502 | * srcu_batches_completed - return batches completed. |
318 | * @sp: srcu_struct on which to report batch completion. | 503 | * @sp: srcu_struct on which to report batch completion. |
319 | * | 504 | * |
320 | * Report the number of batches, correlated with, but not necessarily | 505 | * Report the number of batches, correlated with, but not necessarily |
321 | * precisely the same as, the number of grace periods that have elapsed. | 506 | * precisely the same as, the number of grace periods that have elapsed. |
322 | */ | 507 | */ |
323 | |||
324 | long srcu_batches_completed(struct srcu_struct *sp) | 508 | long srcu_batches_completed(struct srcu_struct *sp) |
325 | { | 509 | { |
326 | return sp->completed; | 510 | return sp->completed; |
327 | } | 511 | } |
328 | EXPORT_SYMBOL_GPL(srcu_batches_completed); | 512 | EXPORT_SYMBOL_GPL(srcu_batches_completed); |
513 | |||
514 | #define SRCU_CALLBACK_BATCH 10 | ||
515 | #define SRCU_INTERVAL 1 | ||
516 | |||
517 | /* | ||
518 | * Move any new SRCU callbacks to the first stage of the SRCU grace | ||
519 | * period pipeline. | ||
520 | */ | ||
521 | static void srcu_collect_new(struct srcu_struct *sp) | ||
522 | { | ||
523 | if (!rcu_batch_empty(&sp->batch_queue)) { | ||
524 | spin_lock_irq(&sp->queue_lock); | ||
525 | rcu_batch_move(&sp->batch_check0, &sp->batch_queue); | ||
526 | spin_unlock_irq(&sp->queue_lock); | ||
527 | } | ||
528 | } | ||
529 | |||
530 | /* | ||
531 | * Core SRCU state machine. Advance callbacks from ->batch_check0 to | ||
532 | * ->batch_check1 and then to ->batch_done as readers drain. | ||
533 | */ | ||
534 | static void srcu_advance_batches(struct srcu_struct *sp, int trycount) | ||
535 | { | ||
536 | int idx = 1 ^ (sp->completed & 1); | ||
537 | |||
538 | /* | ||
539 | * Because readers might be delayed for an extended period after | ||
540 | * fetching ->completed for their index, at any point in time there | ||
541 | * might well be readers using both idx=0 and idx=1. We therefore | ||
542 | * need to wait for readers to clear from both index values before | ||
543 | * invoking a callback. | ||
544 | */ | ||
545 | |||
546 | if (rcu_batch_empty(&sp->batch_check0) && | ||
547 | rcu_batch_empty(&sp->batch_check1)) | ||
548 | return; /* no callbacks need to be advanced */ | ||
549 | |||
550 | if (!try_check_zero(sp, idx, trycount)) | ||
551 | return; /* failed to advance, will try after SRCU_INTERVAL */ | ||
552 | |||
553 | /* | ||
554 | * The callbacks in ->batch_check1 already had their first zero check | ||
555 | * and flip done back when they were enqueued on | ||
556 | * ->batch_check0 in a previous invocation of srcu_advance_batches(). | ||
557 | * (Presumably try_check_zero() returned false during that | ||
558 | * invocation, leaving the callbacks stranded on ->batch_check1.) | ||
559 | * They are therefore ready to invoke, so move them to ->batch_done. | ||
560 | */ | ||
561 | rcu_batch_move(&sp->batch_done, &sp->batch_check1); | ||
562 | |||
563 | if (rcu_batch_empty(&sp->batch_check0)) | ||
564 | return; /* no callbacks need to be advanced */ | ||
565 | srcu_flip(sp); | ||
566 | |||
567 | /* | ||
568 | * The callbacks in ->batch_check0 just finished their | ||
569 | * first check zero and flip, so move them to ->batch_check1 | ||
570 | * for future checking on the other idx. | ||
571 | */ | ||
572 | rcu_batch_move(&sp->batch_check1, &sp->batch_check0); | ||
573 | |||
574 | /* | ||
575 | * SRCU read-side critical sections are normally short, so check | ||
576 | * at least twice in quick succession after a flip. | ||
577 | */ | ||
578 | trycount = trycount < 2 ? 2 : trycount; | ||
579 | if (!try_check_zero(sp, idx^1, trycount)) | ||
580 | return; /* failed to advance, will try after SRCU_INTERVAL */ | ||
581 | |||
582 | /* | ||
583 | * The callbacks in ->batch_check1 have now waited for all | ||
584 | * pre-existing readers using both idx values. They are therefore | ||
585 | * ready to invoke, so move them to ->batch_done. | ||
586 | */ | ||
587 | rcu_batch_move(&sp->batch_done, &sp->batch_check1); | ||
588 | } | ||
589 | |||
590 | /* | ||
591 | * Invoke a limited number of SRCU callbacks that have passed through | ||
592 | * their grace period. If there are more to do, SRCU will reschedule | ||
593 | * the workqueue. | ||
594 | */ | ||
595 | static void srcu_invoke_callbacks(struct srcu_struct *sp) | ||
596 | { | ||
597 | int i; | ||
598 | struct rcu_head *head; | ||
599 | |||
600 | for (i = 0; i < SRCU_CALLBACK_BATCH; i++) { | ||
601 | head = rcu_batch_dequeue(&sp->batch_done); | ||
602 | if (!head) | ||
603 | break; | ||
604 | local_bh_disable(); | ||
605 | head->func(head); | ||
606 | local_bh_enable(); | ||
607 | } | ||
608 | } | ||
609 | |||
610 | /* | ||
611 | * Finished one round of SRCU grace period. Start another if there are | ||
612 | * more SRCU callbacks queued, otherwise put SRCU into not-running state. | ||
613 | */ | ||
614 | static void srcu_reschedule(struct srcu_struct *sp) | ||
615 | { | ||
616 | bool pending = true; | ||
617 | |||
618 | if (rcu_batch_empty(&sp->batch_done) && | ||
619 | rcu_batch_empty(&sp->batch_check1) && | ||
620 | rcu_batch_empty(&sp->batch_check0) && | ||
621 | rcu_batch_empty(&sp->batch_queue)) { | ||
622 | spin_lock_irq(&sp->queue_lock); | ||
623 | if (rcu_batch_empty(&sp->batch_done) && | ||
624 | rcu_batch_empty(&sp->batch_check1) && | ||
625 | rcu_batch_empty(&sp->batch_check0) && | ||
626 | rcu_batch_empty(&sp->batch_queue)) { | ||
627 | sp->running = false; | ||
628 | pending = false; | ||
629 | } | ||
630 | spin_unlock_irq(&sp->queue_lock); | ||
631 | } | ||
632 | |||
633 | if (pending) | ||
634 | queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL); | ||
635 | } | ||
636 | |||
637 | /* | ||
638 | * This is the work-queue function that handles SRCU grace periods. | ||
639 | */ | ||
640 | static void process_srcu(struct work_struct *work) | ||
641 | { | ||
642 | struct srcu_struct *sp; | ||
643 | |||
644 | sp = container_of(work, struct srcu_struct, work.work); | ||
645 | |||
646 | srcu_collect_new(sp); | ||
647 | srcu_advance_batches(sp, 1); | ||
648 | srcu_invoke_callbacks(sp); | ||
649 | srcu_reschedule(sp); | ||
650 | } | ||
diff --git a/kernel/sys.c b/kernel/sys.c index e7006eb6c1e..6df42624e45 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -93,10 +93,8 @@ | |||
93 | int overflowuid = DEFAULT_OVERFLOWUID; | 93 | int overflowuid = DEFAULT_OVERFLOWUID; |
94 | int overflowgid = DEFAULT_OVERFLOWGID; | 94 | int overflowgid = DEFAULT_OVERFLOWGID; |
95 | 95 | ||
96 | #ifdef CONFIG_UID16 | ||
97 | EXPORT_SYMBOL(overflowuid); | 96 | EXPORT_SYMBOL(overflowuid); |
98 | EXPORT_SYMBOL(overflowgid); | 97 | EXPORT_SYMBOL(overflowgid); |
99 | #endif | ||
100 | 98 | ||
101 | /* | 99 | /* |
102 | * the same as above, but for filesystems which can only store a 16-bit | 100 | * the same as above, but for filesystems which can only store a 16-bit |
@@ -133,11 +131,10 @@ static bool set_one_prio_perm(struct task_struct *p) | |||
133 | { | 131 | { |
134 | const struct cred *cred = current_cred(), *pcred = __task_cred(p); | 132 | const struct cred *cred = current_cred(), *pcred = __task_cred(p); |
135 | 133 | ||
136 | if (pcred->user->user_ns == cred->user->user_ns && | 134 | if (uid_eq(pcred->uid, cred->euid) || |
137 | (pcred->uid == cred->euid || | 135 | uid_eq(pcred->euid, cred->euid)) |
138 | pcred->euid == cred->euid)) | ||
139 | return true; | 136 | return true; |
140 | if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) | 137 | if (ns_capable(pcred->user_ns, CAP_SYS_NICE)) |
141 | return true; | 138 | return true; |
142 | return false; | 139 | return false; |
143 | } | 140 | } |
@@ -177,6 +174,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) | |||
177 | const struct cred *cred = current_cred(); | 174 | const struct cred *cred = current_cred(); |
178 | int error = -EINVAL; | 175 | int error = -EINVAL; |
179 | struct pid *pgrp; | 176 | struct pid *pgrp; |
177 | kuid_t uid; | ||
180 | 178 | ||
181 | if (which > PRIO_USER || which < PRIO_PROCESS) | 179 | if (which > PRIO_USER || which < PRIO_PROCESS) |
182 | goto out; | 180 | goto out; |
@@ -209,18 +207,19 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) | |||
209 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); | 207 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); |
210 | break; | 208 | break; |
211 | case PRIO_USER: | 209 | case PRIO_USER: |
212 | user = (struct user_struct *) cred->user; | 210 | uid = make_kuid(cred->user_ns, who); |
211 | user = cred->user; | ||
213 | if (!who) | 212 | if (!who) |
214 | who = cred->uid; | 213 | uid = cred->uid; |
215 | else if ((who != cred->uid) && | 214 | else if (!uid_eq(uid, cred->uid) && |
216 | !(user = find_user(who))) | 215 | !(user = find_user(uid))) |
217 | goto out_unlock; /* No processes for this user */ | 216 | goto out_unlock; /* No processes for this user */ |
218 | 217 | ||
219 | do_each_thread(g, p) { | 218 | do_each_thread(g, p) { |
220 | if (__task_cred(p)->uid == who) | 219 | if (uid_eq(task_uid(p), uid)) |
221 | error = set_one_prio(p, niceval, error); | 220 | error = set_one_prio(p, niceval, error); |
222 | } while_each_thread(g, p); | 221 | } while_each_thread(g, p); |
223 | if (who != cred->uid) | 222 | if (!uid_eq(uid, cred->uid)) |
224 | free_uid(user); /* For find_user() */ | 223 | free_uid(user); /* For find_user() */ |
225 | break; | 224 | break; |
226 | } | 225 | } |
@@ -244,6 +243,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) | |||
244 | const struct cred *cred = current_cred(); | 243 | const struct cred *cred = current_cred(); |
245 | long niceval, retval = -ESRCH; | 244 | long niceval, retval = -ESRCH; |
246 | struct pid *pgrp; | 245 | struct pid *pgrp; |
246 | kuid_t uid; | ||
247 | 247 | ||
248 | if (which > PRIO_USER || which < PRIO_PROCESS) | 248 | if (which > PRIO_USER || which < PRIO_PROCESS) |
249 | return -EINVAL; | 249 | return -EINVAL; |
@@ -274,21 +274,22 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) | |||
274 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); | 274 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); |
275 | break; | 275 | break; |
276 | case PRIO_USER: | 276 | case PRIO_USER: |
277 | user = (struct user_struct *) cred->user; | 277 | uid = make_kuid(cred->user_ns, who); |
278 | user = cred->user; | ||
278 | if (!who) | 279 | if (!who) |
279 | who = cred->uid; | 280 | uid = cred->uid; |
280 | else if ((who != cred->uid) && | 281 | else if (!uid_eq(uid, cred->uid) && |
281 | !(user = find_user(who))) | 282 | !(user = find_user(uid))) |
282 | goto out_unlock; /* No processes for this user */ | 283 | goto out_unlock; /* No processes for this user */ |
283 | 284 | ||
284 | do_each_thread(g, p) { | 285 | do_each_thread(g, p) { |
285 | if (__task_cred(p)->uid == who) { | 286 | if (uid_eq(task_uid(p), uid)) { |
286 | niceval = 20 - task_nice(p); | 287 | niceval = 20 - task_nice(p); |
287 | if (niceval > retval) | 288 | if (niceval > retval) |
288 | retval = niceval; | 289 | retval = niceval; |
289 | } | 290 | } |
290 | } while_each_thread(g, p); | 291 | } while_each_thread(g, p); |
291 | if (who != cred->uid) | 292 | if (!uid_eq(uid, cred->uid)) |
292 | free_uid(user); /* for find_user() */ | 293 | free_uid(user); /* for find_user() */ |
293 | break; | 294 | break; |
294 | } | 295 | } |
@@ -553,9 +554,19 @@ void ctrl_alt_del(void) | |||
553 | */ | 554 | */ |
554 | SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | 555 | SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) |
555 | { | 556 | { |
557 | struct user_namespace *ns = current_user_ns(); | ||
556 | const struct cred *old; | 558 | const struct cred *old; |
557 | struct cred *new; | 559 | struct cred *new; |
558 | int retval; | 560 | int retval; |
561 | kgid_t krgid, kegid; | ||
562 | |||
563 | krgid = make_kgid(ns, rgid); | ||
564 | kegid = make_kgid(ns, egid); | ||
565 | |||
566 | if ((rgid != (gid_t) -1) && !gid_valid(krgid)) | ||
567 | return -EINVAL; | ||
568 | if ((egid != (gid_t) -1) && !gid_valid(kegid)) | ||
569 | return -EINVAL; | ||
559 | 570 | ||
560 | new = prepare_creds(); | 571 | new = prepare_creds(); |
561 | if (!new) | 572 | if (!new) |
@@ -564,25 +575,25 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | |||
564 | 575 | ||
565 | retval = -EPERM; | 576 | retval = -EPERM; |
566 | if (rgid != (gid_t) -1) { | 577 | if (rgid != (gid_t) -1) { |
567 | if (old->gid == rgid || | 578 | if (gid_eq(old->gid, krgid) || |
568 | old->egid == rgid || | 579 | gid_eq(old->egid, krgid) || |
569 | nsown_capable(CAP_SETGID)) | 580 | nsown_capable(CAP_SETGID)) |
570 | new->gid = rgid; | 581 | new->gid = krgid; |
571 | else | 582 | else |
572 | goto error; | 583 | goto error; |
573 | } | 584 | } |
574 | if (egid != (gid_t) -1) { | 585 | if (egid != (gid_t) -1) { |
575 | if (old->gid == egid || | 586 | if (gid_eq(old->gid, kegid) || |
576 | old->egid == egid || | 587 | gid_eq(old->egid, kegid) || |
577 | old->sgid == egid || | 588 | gid_eq(old->sgid, kegid) || |
578 | nsown_capable(CAP_SETGID)) | 589 | nsown_capable(CAP_SETGID)) |
579 | new->egid = egid; | 590 | new->egid = kegid; |
580 | else | 591 | else |
581 | goto error; | 592 | goto error; |
582 | } | 593 | } |
583 | 594 | ||
584 | if (rgid != (gid_t) -1 || | 595 | if (rgid != (gid_t) -1 || |
585 | (egid != (gid_t) -1 && egid != old->gid)) | 596 | (egid != (gid_t) -1 && !gid_eq(kegid, old->gid))) |
586 | new->sgid = new->egid; | 597 | new->sgid = new->egid; |
587 | new->fsgid = new->egid; | 598 | new->fsgid = new->egid; |
588 | 599 | ||
@@ -600,9 +611,15 @@ error: | |||
600 | */ | 611 | */ |
601 | SYSCALL_DEFINE1(setgid, gid_t, gid) | 612 | SYSCALL_DEFINE1(setgid, gid_t, gid) |
602 | { | 613 | { |
614 | struct user_namespace *ns = current_user_ns(); | ||
603 | const struct cred *old; | 615 | const struct cred *old; |
604 | struct cred *new; | 616 | struct cred *new; |
605 | int retval; | 617 | int retval; |
618 | kgid_t kgid; | ||
619 | |||
620 | kgid = make_kgid(ns, gid); | ||
621 | if (!gid_valid(kgid)) | ||
622 | return -EINVAL; | ||
606 | 623 | ||
607 | new = prepare_creds(); | 624 | new = prepare_creds(); |
608 | if (!new) | 625 | if (!new) |
@@ -611,9 +628,9 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) | |||
611 | 628 | ||
612 | retval = -EPERM; | 629 | retval = -EPERM; |
613 | if (nsown_capable(CAP_SETGID)) | 630 | if (nsown_capable(CAP_SETGID)) |
614 | new->gid = new->egid = new->sgid = new->fsgid = gid; | 631 | new->gid = new->egid = new->sgid = new->fsgid = kgid; |
615 | else if (gid == old->gid || gid == old->sgid) | 632 | else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) |
616 | new->egid = new->fsgid = gid; | 633 | new->egid = new->fsgid = kgid; |
617 | else | 634 | else |
618 | goto error; | 635 | goto error; |
619 | 636 | ||
@@ -631,7 +648,7 @@ static int set_user(struct cred *new) | |||
631 | { | 648 | { |
632 | struct user_struct *new_user; | 649 | struct user_struct *new_user; |
633 | 650 | ||
634 | new_user = alloc_uid(current_user_ns(), new->uid); | 651 | new_user = alloc_uid(new->uid); |
635 | if (!new_user) | 652 | if (!new_user) |
636 | return -EAGAIN; | 653 | return -EAGAIN; |
637 | 654 | ||
@@ -670,9 +687,19 @@ static int set_user(struct cred *new) | |||
670 | */ | 687 | */ |
671 | SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) | 688 | SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) |
672 | { | 689 | { |
690 | struct user_namespace *ns = current_user_ns(); | ||
673 | const struct cred *old; | 691 | const struct cred *old; |
674 | struct cred *new; | 692 | struct cred *new; |
675 | int retval; | 693 | int retval; |
694 | kuid_t kruid, keuid; | ||
695 | |||
696 | kruid = make_kuid(ns, ruid); | ||
697 | keuid = make_kuid(ns, euid); | ||
698 | |||
699 | if ((ruid != (uid_t) -1) && !uid_valid(kruid)) | ||
700 | return -EINVAL; | ||
701 | if ((euid != (uid_t) -1) && !uid_valid(keuid)) | ||
702 | return -EINVAL; | ||
676 | 703 | ||
677 | new = prepare_creds(); | 704 | new = prepare_creds(); |
678 | if (!new) | 705 | if (!new) |
@@ -681,29 +708,29 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) | |||
681 | 708 | ||
682 | retval = -EPERM; | 709 | retval = -EPERM; |
683 | if (ruid != (uid_t) -1) { | 710 | if (ruid != (uid_t) -1) { |
684 | new->uid = ruid; | 711 | new->uid = kruid; |
685 | if (old->uid != ruid && | 712 | if (!uid_eq(old->uid, kruid) && |
686 | old->euid != ruid && | 713 | !uid_eq(old->euid, kruid) && |
687 | !nsown_capable(CAP_SETUID)) | 714 | !nsown_capable(CAP_SETUID)) |
688 | goto error; | 715 | goto error; |
689 | } | 716 | } |
690 | 717 | ||
691 | if (euid != (uid_t) -1) { | 718 | if (euid != (uid_t) -1) { |
692 | new->euid = euid; | 719 | new->euid = keuid; |
693 | if (old->uid != euid && | 720 | if (!uid_eq(old->uid, keuid) && |
694 | old->euid != euid && | 721 | !uid_eq(old->euid, keuid) && |
695 | old->suid != euid && | 722 | !uid_eq(old->suid, keuid) && |
696 | !nsown_capable(CAP_SETUID)) | 723 | !nsown_capable(CAP_SETUID)) |
697 | goto error; | 724 | goto error; |
698 | } | 725 | } |
699 | 726 | ||
700 | if (new->uid != old->uid) { | 727 | if (!uid_eq(new->uid, old->uid)) { |
701 | retval = set_user(new); | 728 | retval = set_user(new); |
702 | if (retval < 0) | 729 | if (retval < 0) |
703 | goto error; | 730 | goto error; |
704 | } | 731 | } |
705 | if (ruid != (uid_t) -1 || | 732 | if (ruid != (uid_t) -1 || |
706 | (euid != (uid_t) -1 && euid != old->uid)) | 733 | (euid != (uid_t) -1 && !uid_eq(keuid, old->uid))) |
707 | new->suid = new->euid; | 734 | new->suid = new->euid; |
708 | new->fsuid = new->euid; | 735 | new->fsuid = new->euid; |
709 | 736 | ||
@@ -731,9 +758,15 @@ error: | |||
731 | */ | 758 | */ |
732 | SYSCALL_DEFINE1(setuid, uid_t, uid) | 759 | SYSCALL_DEFINE1(setuid, uid_t, uid) |
733 | { | 760 | { |
761 | struct user_namespace *ns = current_user_ns(); | ||
734 | const struct cred *old; | 762 | const struct cred *old; |
735 | struct cred *new; | 763 | struct cred *new; |
736 | int retval; | 764 | int retval; |
765 | kuid_t kuid; | ||
766 | |||
767 | kuid = make_kuid(ns, uid); | ||
768 | if (!uid_valid(kuid)) | ||
769 | return -EINVAL; | ||
737 | 770 | ||
738 | new = prepare_creds(); | 771 | new = prepare_creds(); |
739 | if (!new) | 772 | if (!new) |
@@ -742,17 +775,17 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) | |||
742 | 775 | ||
743 | retval = -EPERM; | 776 | retval = -EPERM; |
744 | if (nsown_capable(CAP_SETUID)) { | 777 | if (nsown_capable(CAP_SETUID)) { |
745 | new->suid = new->uid = uid; | 778 | new->suid = new->uid = kuid; |
746 | if (uid != old->uid) { | 779 | if (!uid_eq(kuid, old->uid)) { |
747 | retval = set_user(new); | 780 | retval = set_user(new); |
748 | if (retval < 0) | 781 | if (retval < 0) |
749 | goto error; | 782 | goto error; |
750 | } | 783 | } |
751 | } else if (uid != old->uid && uid != new->suid) { | 784 | } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) { |
752 | goto error; | 785 | goto error; |
753 | } | 786 | } |
754 | 787 | ||
755 | new->fsuid = new->euid = uid; | 788 | new->fsuid = new->euid = kuid; |
756 | 789 | ||
757 | retval = security_task_fix_setuid(new, old, LSM_SETID_ID); | 790 | retval = security_task_fix_setuid(new, old, LSM_SETID_ID); |
758 | if (retval < 0) | 791 | if (retval < 0) |
@@ -772,9 +805,24 @@ error: | |||
772 | */ | 805 | */ |
773 | SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) | 806 | SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) |
774 | { | 807 | { |
808 | struct user_namespace *ns = current_user_ns(); | ||
775 | const struct cred *old; | 809 | const struct cred *old; |
776 | struct cred *new; | 810 | struct cred *new; |
777 | int retval; | 811 | int retval; |
812 | kuid_t kruid, keuid, ksuid; | ||
813 | |||
814 | kruid = make_kuid(ns, ruid); | ||
815 | keuid = make_kuid(ns, euid); | ||
816 | ksuid = make_kuid(ns, suid); | ||
817 | |||
818 | if ((ruid != (uid_t) -1) && !uid_valid(kruid)) | ||
819 | return -EINVAL; | ||
820 | |||
821 | if ((euid != (uid_t) -1) && !uid_valid(keuid)) | ||
822 | return -EINVAL; | ||
823 | |||
824 | if ((suid != (uid_t) -1) && !uid_valid(ksuid)) | ||
825 | return -EINVAL; | ||
778 | 826 | ||
779 | new = prepare_creds(); | 827 | new = prepare_creds(); |
780 | if (!new) | 828 | if (!new) |
@@ -784,29 +832,29 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) | |||
784 | 832 | ||
785 | retval = -EPERM; | 833 | retval = -EPERM; |
786 | if (!nsown_capable(CAP_SETUID)) { | 834 | if (!nsown_capable(CAP_SETUID)) { |
787 | if (ruid != (uid_t) -1 && ruid != old->uid && | 835 | if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && |
788 | ruid != old->euid && ruid != old->suid) | 836 | !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) |
789 | goto error; | 837 | goto error; |
790 | if (euid != (uid_t) -1 && euid != old->uid && | 838 | if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && |
791 | euid != old->euid && euid != old->suid) | 839 | !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid)) |
792 | goto error; | 840 | goto error; |
793 | if (suid != (uid_t) -1 && suid != old->uid && | 841 | if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && |
794 | suid != old->euid && suid != old->suid) | 842 | !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid)) |
795 | goto error; | 843 | goto error; |
796 | } | 844 | } |
797 | 845 | ||
798 | if (ruid != (uid_t) -1) { | 846 | if (ruid != (uid_t) -1) { |
799 | new->uid = ruid; | 847 | new->uid = kruid; |
800 | if (ruid != old->uid) { | 848 | if (!uid_eq(kruid, old->uid)) { |
801 | retval = set_user(new); | 849 | retval = set_user(new); |
802 | if (retval < 0) | 850 | if (retval < 0) |
803 | goto error; | 851 | goto error; |
804 | } | 852 | } |
805 | } | 853 | } |
806 | if (euid != (uid_t) -1) | 854 | if (euid != (uid_t) -1) |
807 | new->euid = euid; | 855 | new->euid = keuid; |
808 | if (suid != (uid_t) -1) | 856 | if (suid != (uid_t) -1) |
809 | new->suid = suid; | 857 | new->suid = ksuid; |
810 | new->fsuid = new->euid; | 858 | new->fsuid = new->euid; |
811 | 859 | ||
812 | retval = security_task_fix_setuid(new, old, LSM_SETID_RES); | 860 | retval = security_task_fix_setuid(new, old, LSM_SETID_RES); |
@@ -820,14 +868,19 @@ error: | |||
820 | return retval; | 868 | return retval; |
821 | } | 869 | } |
822 | 870 | ||
823 | SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid) | 871 | SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) |
824 | { | 872 | { |
825 | const struct cred *cred = current_cred(); | 873 | const struct cred *cred = current_cred(); |
826 | int retval; | 874 | int retval; |
875 | uid_t ruid, euid, suid; | ||
827 | 876 | ||
828 | if (!(retval = put_user(cred->uid, ruid)) && | 877 | ruid = from_kuid_munged(cred->user_ns, cred->uid); |
829 | !(retval = put_user(cred->euid, euid))) | 878 | euid = from_kuid_munged(cred->user_ns, cred->euid); |
830 | retval = put_user(cred->suid, suid); | 879 | suid = from_kuid_munged(cred->user_ns, cred->suid); |
880 | |||
881 | if (!(retval = put_user(ruid, ruidp)) && | ||
882 | !(retval = put_user(euid, euidp))) | ||
883 | retval = put_user(suid, suidp); | ||
831 | 884 | ||
832 | return retval; | 885 | return retval; |
833 | } | 886 | } |
@@ -837,9 +890,22 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __u | |||
837 | */ | 890 | */ |
838 | SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) | 891 | SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) |
839 | { | 892 | { |
893 | struct user_namespace *ns = current_user_ns(); | ||
840 | const struct cred *old; | 894 | const struct cred *old; |
841 | struct cred *new; | 895 | struct cred *new; |
842 | int retval; | 896 | int retval; |
897 | kgid_t krgid, kegid, ksgid; | ||
898 | |||
899 | krgid = make_kgid(ns, rgid); | ||
900 | kegid = make_kgid(ns, egid); | ||
901 | ksgid = make_kgid(ns, sgid); | ||
902 | |||
903 | if ((rgid != (gid_t) -1) && !gid_valid(krgid)) | ||
904 | return -EINVAL; | ||
905 | if ((egid != (gid_t) -1) && !gid_valid(kegid)) | ||
906 | return -EINVAL; | ||
907 | if ((sgid != (gid_t) -1) && !gid_valid(ksgid)) | ||
908 | return -EINVAL; | ||
843 | 909 | ||
844 | new = prepare_creds(); | 910 | new = prepare_creds(); |
845 | if (!new) | 911 | if (!new) |
@@ -848,23 +914,23 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) | |||
848 | 914 | ||
849 | retval = -EPERM; | 915 | retval = -EPERM; |
850 | if (!nsown_capable(CAP_SETGID)) { | 916 | if (!nsown_capable(CAP_SETGID)) { |
851 | if (rgid != (gid_t) -1 && rgid != old->gid && | 917 | if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && |
852 | rgid != old->egid && rgid != old->sgid) | 918 | !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) |
853 | goto error; | 919 | goto error; |
854 | if (egid != (gid_t) -1 && egid != old->gid && | 920 | if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && |
855 | egid != old->egid && egid != old->sgid) | 921 | !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid)) |
856 | goto error; | 922 | goto error; |
857 | if (sgid != (gid_t) -1 && sgid != old->gid && | 923 | if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && |
858 | sgid != old->egid && sgid != old->sgid) | 924 | !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid)) |
859 | goto error; | 925 | goto error; |
860 | } | 926 | } |
861 | 927 | ||
862 | if (rgid != (gid_t) -1) | 928 | if (rgid != (gid_t) -1) |
863 | new->gid = rgid; | 929 | new->gid = krgid; |
864 | if (egid != (gid_t) -1) | 930 | if (egid != (gid_t) -1) |
865 | new->egid = egid; | 931 | new->egid = kegid; |
866 | if (sgid != (gid_t) -1) | 932 | if (sgid != (gid_t) -1) |
867 | new->sgid = sgid; | 933 | new->sgid = ksgid; |
868 | new->fsgid = new->egid; | 934 | new->fsgid = new->egid; |
869 | 935 | ||
870 | return commit_creds(new); | 936 | return commit_creds(new); |
@@ -874,14 +940,19 @@ error: | |||
874 | return retval; | 940 | return retval; |
875 | } | 941 | } |
876 | 942 | ||
877 | SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid) | 943 | SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) |
878 | { | 944 | { |
879 | const struct cred *cred = current_cred(); | 945 | const struct cred *cred = current_cred(); |
880 | int retval; | 946 | int retval; |
947 | gid_t rgid, egid, sgid; | ||
948 | |||
949 | rgid = from_kgid_munged(cred->user_ns, cred->gid); | ||
950 | egid = from_kgid_munged(cred->user_ns, cred->egid); | ||
951 | sgid = from_kgid_munged(cred->user_ns, cred->sgid); | ||
881 | 952 | ||
882 | if (!(retval = put_user(cred->gid, rgid)) && | 953 | if (!(retval = put_user(rgid, rgidp)) && |
883 | !(retval = put_user(cred->egid, egid))) | 954 | !(retval = put_user(egid, egidp))) |
884 | retval = put_user(cred->sgid, sgid); | 955 | retval = put_user(sgid, sgidp); |
885 | 956 | ||
886 | return retval; | 957 | return retval; |
887 | } | 958 | } |
@@ -898,18 +969,24 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) | |||
898 | const struct cred *old; | 969 | const struct cred *old; |
899 | struct cred *new; | 970 | struct cred *new; |
900 | uid_t old_fsuid; | 971 | uid_t old_fsuid; |
972 | kuid_t kuid; | ||
973 | |||
974 | old = current_cred(); | ||
975 | old_fsuid = from_kuid_munged(old->user_ns, old->fsuid); | ||
976 | |||
977 | kuid = make_kuid(old->user_ns, uid); | ||
978 | if (!uid_valid(kuid)) | ||
979 | return old_fsuid; | ||
901 | 980 | ||
902 | new = prepare_creds(); | 981 | new = prepare_creds(); |
903 | if (!new) | 982 | if (!new) |
904 | return current_fsuid(); | 983 | return old_fsuid; |
905 | old = current_cred(); | ||
906 | old_fsuid = old->fsuid; | ||
907 | 984 | ||
908 | if (uid == old->uid || uid == old->euid || | 985 | if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || |
909 | uid == old->suid || uid == old->fsuid || | 986 | uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || |
910 | nsown_capable(CAP_SETUID)) { | 987 | nsown_capable(CAP_SETUID)) { |
911 | if (uid != old_fsuid) { | 988 | if (!uid_eq(kuid, old->fsuid)) { |
912 | new->fsuid = uid; | 989 | new->fsuid = kuid; |
913 | if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) | 990 | if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) |
914 | goto change_okay; | 991 | goto change_okay; |
915 | } | 992 | } |
@@ -931,18 +1008,24 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) | |||
931 | const struct cred *old; | 1008 | const struct cred *old; |
932 | struct cred *new; | 1009 | struct cred *new; |
933 | gid_t old_fsgid; | 1010 | gid_t old_fsgid; |
1011 | kgid_t kgid; | ||
1012 | |||
1013 | old = current_cred(); | ||
1014 | old_fsgid = from_kgid_munged(old->user_ns, old->fsgid); | ||
1015 | |||
1016 | kgid = make_kgid(old->user_ns, gid); | ||
1017 | if (!gid_valid(kgid)) | ||
1018 | return old_fsgid; | ||
934 | 1019 | ||
935 | new = prepare_creds(); | 1020 | new = prepare_creds(); |
936 | if (!new) | 1021 | if (!new) |
937 | return current_fsgid(); | 1022 | return old_fsgid; |
938 | old = current_cred(); | ||
939 | old_fsgid = old->fsgid; | ||
940 | 1023 | ||
941 | if (gid == old->gid || gid == old->egid || | 1024 | if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || |
942 | gid == old->sgid || gid == old->fsgid || | 1025 | gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || |
943 | nsown_capable(CAP_SETGID)) { | 1026 | nsown_capable(CAP_SETGID)) { |
944 | if (gid != old_fsgid) { | 1027 | if (!gid_eq(kgid, old->fsgid)) { |
945 | new->fsgid = gid; | 1028 | new->fsgid = kgid; |
946 | goto change_okay; | 1029 | goto change_okay; |
947 | } | 1030 | } |
948 | } | 1031 | } |
@@ -1498,15 +1581,14 @@ static int check_prlimit_permission(struct task_struct *task) | |||
1498 | return 0; | 1581 | return 0; |
1499 | 1582 | ||
1500 | tcred = __task_cred(task); | 1583 | tcred = __task_cred(task); |
1501 | if (cred->user->user_ns == tcred->user->user_ns && | 1584 | if (uid_eq(cred->uid, tcred->euid) && |
1502 | (cred->uid == tcred->euid && | 1585 | uid_eq(cred->uid, tcred->suid) && |
1503 | cred->uid == tcred->suid && | 1586 | uid_eq(cred->uid, tcred->uid) && |
1504 | cred->uid == tcred->uid && | 1587 | gid_eq(cred->gid, tcred->egid) && |
1505 | cred->gid == tcred->egid && | 1588 | gid_eq(cred->gid, tcred->sgid) && |
1506 | cred->gid == tcred->sgid && | 1589 | gid_eq(cred->gid, tcred->gid)) |
1507 | cred->gid == tcred->gid)) | ||
1508 | return 0; | 1590 | return 0; |
1509 | if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE)) | 1591 | if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) |
1510 | return 0; | 1592 | return 0; |
1511 | 1593 | ||
1512 | return -EPERM; | 1594 | return -EPERM; |
@@ -1908,7 +1990,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
1908 | error = prctl_get_seccomp(); | 1990 | error = prctl_get_seccomp(); |
1909 | break; | 1991 | break; |
1910 | case PR_SET_SECCOMP: | 1992 | case PR_SET_SECCOMP: |
1911 | error = prctl_set_seccomp(arg2); | 1993 | error = prctl_set_seccomp(arg2, (char __user *)arg3); |
1912 | break; | 1994 | break; |
1913 | case PR_GET_TSC: | 1995 | case PR_GET_TSC: |
1914 | error = GET_TSC_CTL(arg2); | 1996 | error = GET_TSC_CTL(arg2); |
@@ -1979,6 +2061,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
1979 | error = put_user(me->signal->is_child_subreaper, | 2061 | error = put_user(me->signal->is_child_subreaper, |
1980 | (int __user *) arg2); | 2062 | (int __user *) arg2); |
1981 | break; | 2063 | break; |
2064 | case PR_SET_NO_NEW_PRIVS: | ||
2065 | if (arg2 != 1 || arg3 || arg4 || arg5) | ||
2066 | return -EINVAL; | ||
2067 | |||
2068 | current->no_new_privs = 1; | ||
2069 | break; | ||
2070 | case PR_GET_NO_NEW_PRIVS: | ||
2071 | if (arg2 || arg3 || arg4 || arg5) | ||
2072 | return -EINVAL; | ||
2073 | return current->no_new_privs ? 1 : 0; | ||
1982 | default: | 2074 | default: |
1983 | error = -EINVAL; | 2075 | error = -EINVAL; |
1984 | break; | 2076 | break; |
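
The two new prctl options are exercised from userspace as sketched below. This is a hedged example: the fallback PR_* values are assumed from the uapi header that accompanies this series, not from the hunks shown here.

    #include <stdio.h>
    #include <sys/prctl.h>
    #include <linux/prctl.h>

    #ifndef PR_SET_NO_NEW_PRIVS
    #define PR_SET_NO_NEW_PRIVS 38      /* assumed uapi values for this series */
    #define PR_GET_NO_NEW_PRIVS 39
    #endif

    int main(void)
    {
            /* All unused arguments must be zero, matching the -EINVAL checks above. */
            if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0)
                    perror("PR_SET_NO_NEW_PRIVS");

            /* Returns 1 once set; the flag is sticky and is inherited across
             * fork() and execve(). */
            printf("no_new_privs = %d\n", prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0));
            return 0;
    }
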
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 8a538c55fc7..aa27d391bfc 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
@@ -59,7 +59,7 @@ static DEFINE_SPINLOCK(rtcdev_lock); | |||
59 | * If one has not already been chosen, it checks to see if a | 59 | * If one has not already been chosen, it checks to see if a |
60 | * functional rtc device is available. | 60 | * functional rtc device is available. |
61 | */ | 61 | */ |
62 | static struct rtc_device *alarmtimer_get_rtcdev(void) | 62 | struct rtc_device *alarmtimer_get_rtcdev(void) |
63 | { | 63 | { |
64 | unsigned long flags; | 64 | unsigned long flags; |
65 | struct rtc_device *ret; | 65 | struct rtc_device *ret; |
@@ -115,7 +115,7 @@ static void alarmtimer_rtc_interface_remove(void) | |||
115 | class_interface_unregister(&alarmtimer_rtc_interface); | 115 | class_interface_unregister(&alarmtimer_rtc_interface); |
116 | } | 116 | } |
117 | #else | 117 | #else |
118 | static inline struct rtc_device *alarmtimer_get_rtcdev(void) | 118 | struct rtc_device *alarmtimer_get_rtcdev(void) |
119 | { | 119 | { |
120 | return NULL; | 120 | return NULL; |
121 | } | 121 | } |
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index bf57abdc7bd..f113755695e 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -346,7 +346,8 @@ int tick_resume_broadcast(void) | |||
346 | tick_get_broadcast_mask()); | 346 | tick_get_broadcast_mask()); |
347 | break; | 347 | break; |
348 | case TICKDEV_MODE_ONESHOT: | 348 | case TICKDEV_MODE_ONESHOT: |
349 | broadcast = tick_resume_broadcast_oneshot(bc); | 349 | if (!cpumask_empty(tick_get_broadcast_mask())) |
350 | broadcast = tick_resume_broadcast_oneshot(bc); | ||
350 | break; | 351 | break; |
351 | } | 352 | } |
352 | } | 353 | } |
@@ -373,6 +374,9 @@ static int tick_broadcast_set_event(ktime_t expires, int force) | |||
373 | { | 374 | { |
374 | struct clock_event_device *bc = tick_broadcast_device.evtdev; | 375 | struct clock_event_device *bc = tick_broadcast_device.evtdev; |
375 | 376 | ||
377 | if (bc->mode != CLOCK_EVT_MODE_ONESHOT) | ||
378 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | ||
379 | |||
376 | return clockevents_program_event(bc, expires, force); | 380 | return clockevents_program_event(bc, expires, force); |
377 | } | 381 | } |
378 | 382 | ||
@@ -531,7 +535,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | |||
531 | int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; | 535 | int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; |
532 | 536 | ||
533 | bc->event_handler = tick_handle_oneshot_broadcast; | 537 | bc->event_handler = tick_handle_oneshot_broadcast; |
534 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | ||
535 | 538 | ||
536 | /* Take the do_timer update */ | 539 | /* Take the do_timer update */ |
537 | tick_do_timer_cpu = cpu; | 540 | tick_do_timer_cpu = cpu; |
@@ -549,6 +552,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | |||
549 | to_cpumask(tmpmask)); | 552 | to_cpumask(tmpmask)); |
550 | 553 | ||
551 | if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { | 554 | if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { |
555 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | ||
552 | tick_broadcast_init_next_event(to_cpumask(tmpmask), | 556 | tick_broadcast_init_next_event(to_cpumask(tmpmask), |
553 | tick_next_period); | 557 | tick_next_period); |
554 | tick_broadcast_set_event(tick_next_period, 1); | 558 | tick_broadcast_set_event(tick_next_period, 1); |
@@ -577,15 +581,10 @@ void tick_broadcast_switch_to_oneshot(void) | |||
577 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 581 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); |
578 | 582 | ||
579 | tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; | 583 | tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; |
580 | |||
581 | if (cpumask_empty(tick_get_broadcast_mask())) | ||
582 | goto end; | ||
583 | |||
584 | bc = tick_broadcast_device.evtdev; | 584 | bc = tick_broadcast_device.evtdev; |
585 | if (bc) | 585 | if (bc) |
586 | tick_broadcast_setup_oneshot(bc); | 586 | tick_broadcast_setup_oneshot(bc); |
587 | 587 | ||
588 | end: | ||
589 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 588 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
590 | } | 589 | } |
591 | 590 | ||
diff --git a/kernel/timer.c b/kernel/timer.c index a297ffcf888..6ec7e7e0db4 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -861,7 +861,13 @@ EXPORT_SYMBOL(mod_timer); | |||
861 | * | 861 | * |
862 | * mod_timer_pinned() is a way to update the expire field of an | 862 | * mod_timer_pinned() is a way to update the expire field of an |
863 | * active timer (if the timer is inactive it will be activated) | 863 | * active timer (if the timer is inactive it will be activated) |
864 | * and not allow the timer to be migrated to a different CPU. | 864 | * and to ensure that the timer is scheduled on the current CPU. |
865 | * | ||
866 | * Note that this does not prevent the timer from being migrated | ||
867 | * when the current CPU goes offline. If this is a problem for | ||
868 | * you, use CPU-hotplug notifiers to handle it correctly, for | ||
869 | * example, cancelling the timer when the corresponding CPU goes | ||
870 | * offline. | ||
865 | * | 871 | * |
866 | * mod_timer_pinned(timer, expires) is equivalent to: | 872 | * mod_timer_pinned(timer, expires) is equivalent to: |
867 | * | 873 | * |
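
The expanded mod_timer_pinned() comment recommends CPU-hotplug notifiers for timers that must not outlive their CPU. A sketch of that pattern, assuming a hypothetical pinned timer and notifier (none of these names come from the patch):

    #include <linux/cpu.h>
    #include <linux/timer.h>

    static struct timer_list my_pinned_timer;       /* hypothetical pinned timer */

    static int my_timer_cpu_notify(struct notifier_block *nb,
                                   unsigned long action, void *hcpu)
    {
            /* Cancel the pinned timer before its CPU goes away, as the
             * mod_timer_pinned() comment above suggests. */
            if (action == CPU_DOWN_PREPARE)
                    del_timer_sync(&my_pinned_timer);
            return NOTIFY_OK;
    }

    static struct notifier_block my_timer_cpu_nb = {
            .notifier_call = my_timer_cpu_notify,
    };
    /* registered once at init time with register_cpu_notifier(&my_timer_cpu_nb) */
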
@@ -1102,7 +1108,9 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), | |||
1102 | * warnings as well as problems when looking into | 1108 | * warnings as well as problems when looking into |
1103 | * timer->lockdep_map, make a copy and use that here. | 1109 | * timer->lockdep_map, make a copy and use that here. |
1104 | */ | 1110 | */ |
1105 | struct lockdep_map lockdep_map = timer->lockdep_map; | 1111 | struct lockdep_map lockdep_map; |
1112 | |||
1113 | lockdep_copy_map(&lockdep_map, &timer->lockdep_map); | ||
1106 | #endif | 1114 | #endif |
1107 | /* | 1115 | /* |
1108 | * Couple the lock chain with the lock chain at | 1116 | * Couple the lock chain with the lock chain at |
@@ -1427,25 +1435,25 @@ SYSCALL_DEFINE0(getppid) | |||
1427 | SYSCALL_DEFINE0(getuid) | 1435 | SYSCALL_DEFINE0(getuid) |
1428 | { | 1436 | { |
1429 | /* Only we change this so SMP safe */ | 1437 | /* Only we change this so SMP safe */ |
1430 | return current_uid(); | 1438 | return from_kuid_munged(current_user_ns(), current_uid()); |
1431 | } | 1439 | } |
1432 | 1440 | ||
1433 | SYSCALL_DEFINE0(geteuid) | 1441 | SYSCALL_DEFINE0(geteuid) |
1434 | { | 1442 | { |
1435 | /* Only we change this so SMP safe */ | 1443 | /* Only we change this so SMP safe */ |
1436 | return current_euid(); | 1444 | return from_kuid_munged(current_user_ns(), current_euid()); |
1437 | } | 1445 | } |
1438 | 1446 | ||
1439 | SYSCALL_DEFINE0(getgid) | 1447 | SYSCALL_DEFINE0(getgid) |
1440 | { | 1448 | { |
1441 | /* Only we change this so SMP safe */ | 1449 | /* Only we change this so SMP safe */ |
1442 | return current_gid(); | 1450 | return from_kgid_munged(current_user_ns(), current_gid()); |
1443 | } | 1451 | } |
1444 | 1452 | ||
1445 | SYSCALL_DEFINE0(getegid) | 1453 | SYSCALL_DEFINE0(getegid) |
1446 | { | 1454 | { |
1447 | /* Only we change this so SMP safe */ | 1455 | /* Only we change this so SMP safe */ |
1448 | return current_egid(); | 1456 | return from_kgid_munged(current_user_ns(), current_egid()); |
1449 | } | 1457 | } |
1450 | 1458 | ||
1451 | #endif | 1459 | #endif |
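
The getuid()/geteuid()/getgid()/getegid() conversions above rely on the munged variant of the mapping helpers, which never fails. A small illustrative helper, not part of the patch:

    #include <linux/cred.h>
    #include <linux/uidgid.h>

    /* Illustrative only: what the id-returning syscalls above now report. */
    static uid_t uid_as_seen_by_caller(kuid_t kuid)
    {
            /* from_kuid_munged() cannot fail: a kuid with no mapping in the
             * caller's namespace is reported as the overflow uid (normally
             * 65534) rather than as an error. */
            return from_kuid_munged(current_user_ns(), kuid);
    }
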
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index ea4bff6295f..8c4c07071cc 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -141,7 +141,6 @@ if FTRACE | |||
141 | config FUNCTION_TRACER | 141 | config FUNCTION_TRACER |
142 | bool "Kernel Function Tracer" | 142 | bool "Kernel Function Tracer" |
143 | depends on HAVE_FUNCTION_TRACER | 143 | depends on HAVE_FUNCTION_TRACER |
144 | select FRAME_POINTER if !ARM_UNWIND && !PPC && !S390 && !MICROBLAZE | ||
145 | select KALLSYMS | 144 | select KALLSYMS |
146 | select GENERIC_TRACER | 145 | select GENERIC_TRACER |
147 | select CONTEXT_SWITCH_TRACER | 146 | select CONTEXT_SWITCH_TRACER |
@@ -272,7 +271,7 @@ config PROFILE_ANNOTATED_BRANCHES | |||
272 | bool "Trace likely/unlikely profiler" | 271 | bool "Trace likely/unlikely profiler" |
273 | select TRACE_BRANCH_PROFILING | 272 | select TRACE_BRANCH_PROFILING |
274 | help | 273 | help |
275 | This tracer profiles all the the likely and unlikely macros | 274 | This tracer profiles all likely and unlikely macros |
276 | in the kernel. It will display the results in: | 275 | in the kernel. It will display the results in: |
277 | 276 | ||
278 | /sys/kernel/debug/tracing/trace_stat/branch_annotated | 277 | /sys/kernel/debug/tracing/trace_stat/branch_annotated |
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 1734c03e048..b831087c820 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
@@ -41,7 +41,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o | |||
41 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o | 41 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o |
42 | obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o | 42 | obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o |
43 | obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o | 43 | obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o |
44 | obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o | ||
45 | obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o | 44 | obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o |
46 | ifeq ($(CONFIG_BLOCK),y) | 45 | ifeq ($(CONFIG_BLOCK),y) |
47 | obj-$(CONFIG_EVENT_TRACING) += blktrace.o | 46 | obj-$(CONFIG_EVENT_TRACING) += blktrace.o |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cf81f27ce6c..a008663d86c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -1383,44 +1383,73 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) | |||
1383 | 1383 | ||
1384 | static int ftrace_cmp_recs(const void *a, const void *b) | 1384 | static int ftrace_cmp_recs(const void *a, const void *b) |
1385 | { | 1385 | { |
1386 | const struct dyn_ftrace *reca = a; | 1386 | const struct dyn_ftrace *key = a; |
1387 | const struct dyn_ftrace *recb = b; | 1387 | const struct dyn_ftrace *rec = b; |
1388 | 1388 | ||
1389 | if (reca->ip > recb->ip) | 1389 | if (key->flags < rec->ip) |
1390 | return 1; | ||
1391 | if (reca->ip < recb->ip) | ||
1392 | return -1; | 1390 | return -1; |
1391 | if (key->ip >= rec->ip + MCOUNT_INSN_SIZE) | ||
1392 | return 1; | ||
1393 | return 0; | 1393 | return 0; |
1394 | } | 1394 | } |
1395 | 1395 | ||
1396 | /** | 1396 | static unsigned long ftrace_location_range(unsigned long start, unsigned long end) |
1397 | * ftrace_location - return true if the ip giving is a traced location | ||
1398 | * @ip: the instruction pointer to check | ||
1399 | * | ||
1400 | * Returns 1 if @ip given is a pointer to a ftrace location. | ||
1401 | * That is, the instruction that is either a NOP or call to | ||
1402 | * the function tracer. It checks the ftrace internal tables to | ||
1403 | * determine if the address belongs or not. | ||
1404 | */ | ||
1405 | int ftrace_location(unsigned long ip) | ||
1406 | { | 1397 | { |
1407 | struct ftrace_page *pg; | 1398 | struct ftrace_page *pg; |
1408 | struct dyn_ftrace *rec; | 1399 | struct dyn_ftrace *rec; |
1409 | struct dyn_ftrace key; | 1400 | struct dyn_ftrace key; |
1410 | 1401 | ||
1411 | key.ip = ip; | 1402 | key.ip = start; |
1403 | key.flags = end; /* overload flags, as it is unsigned long */ | ||
1412 | 1404 | ||
1413 | for (pg = ftrace_pages_start; pg; pg = pg->next) { | 1405 | for (pg = ftrace_pages_start; pg; pg = pg->next) { |
1406 | if (end < pg->records[0].ip || | ||
1407 | start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) | ||
1408 | continue; | ||
1414 | rec = bsearch(&key, pg->records, pg->index, | 1409 | rec = bsearch(&key, pg->records, pg->index, |
1415 | sizeof(struct dyn_ftrace), | 1410 | sizeof(struct dyn_ftrace), |
1416 | ftrace_cmp_recs); | 1411 | ftrace_cmp_recs); |
1417 | if (rec) | 1412 | if (rec) |
1418 | return 1; | 1413 | return rec->ip; |
1419 | } | 1414 | } |
1420 | 1415 | ||
1421 | return 0; | 1416 | return 0; |
1422 | } | 1417 | } |
1423 | 1418 | ||
1419 | /** | ||
1420 | * ftrace_location - return the address if the given ip is a traced location | ||
1421 | * @ip: the instruction pointer to check | ||
1422 | * | ||
1423 | * Returns rec->ip if the @ip given is a pointer to an ftrace location. | ||
1424 | * That is, the instruction that is either a NOP or call to | ||
1425 | * the function tracer. It checks the ftrace internal tables to | ||
1426 | * determine if the address belongs or not. | ||
1427 | */ | ||
1428 | unsigned long ftrace_location(unsigned long ip) | ||
1429 | { | ||
1430 | return ftrace_location_range(ip, ip); | ||
1431 | } | ||
1432 | |||
1433 | /** | ||
1434 | * ftrace_text_reserved - return true if range contains an ftrace location | ||
1435 | * @start: start of range to search | ||
1436 | * @end: end of range to search (inclusive). @end points to the last byte to check. | ||
1437 | * | ||
1438 | * Returns 1 if the range from @start to @end contains an ftrace location. | ||
1439 | * That is, the instruction that is either a NOP or call to | ||
1440 | * the function tracer. It checks the ftrace internal tables to | ||
1441 | * determine if the address belongs or not. | ||
1442 | */ | ||
1443 | int ftrace_text_reserved(void *start, void *end) | ||
1444 | { | ||
1445 | unsigned long ret; | ||
1446 | |||
1447 | ret = ftrace_location_range((unsigned long)start, | ||
1448 | (unsigned long)end); | ||
1449 | |||
1450 | return (int)!!ret; | ||
1451 | } | ||
1452 | |||
1424 | static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | 1453 | static void __ftrace_hash_rec_update(struct ftrace_ops *ops, |
1425 | int filter_hash, | 1454 | int filter_hash, |
1426 | bool inc) | 1455 | bool inc) |
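
The rewritten ftrace_cmp_recs() overloads key.flags to carry the end of the search range, so bsearch() now matches any record whose MCOUNT_INSN_SIZE window overlaps [start, end]. A small standalone model of that overlap test (the MCOUNT_INSN_SIZE value here is illustrative, not any particular architecture's):

    #include <stdio.h>

    #define MCOUNT_INSN_SIZE 5      /* illustrative; the real value is per-arch */

    /* Model of the test encoded in ftrace_cmp_recs(): the "key" carries start
     * in .ip and end in .flags, so a record matches when [start, end] overlaps
     * [rec_ip, rec_ip + MCOUNT_INSN_SIZE). */
    static int ranges_overlap(unsigned long start, unsigned long end,
                              unsigned long rec_ip)
    {
            if (end < rec_ip)
                    return 0;       /* key sorts entirely below the record */
            if (start >= rec_ip + MCOUNT_INSN_SIZE)
                    return 0;       /* key sorts entirely above the record */
            return 1;               /* overlap: bsearch() returns this record */
    }

    int main(void)
    {
            printf("%d\n", ranges_overlap(0x1000, 0x1000, 0x1000));     /* 1 */
            printf("%d\n", ranges_overlap(0x1005, 0x1010, 0x1000));     /* 0 */
            return 0;
    }
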
@@ -1520,35 +1549,6 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, | |||
1520 | __ftrace_hash_rec_update(ops, filter_hash, 1); | 1549 | __ftrace_hash_rec_update(ops, filter_hash, 1); |
1521 | } | 1550 | } |
1522 | 1551 | ||
1523 | static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) | ||
1524 | { | ||
1525 | if (ftrace_pages->index == ftrace_pages->size) { | ||
1526 | /* We should have allocated enough */ | ||
1527 | if (WARN_ON(!ftrace_pages->next)) | ||
1528 | return NULL; | ||
1529 | ftrace_pages = ftrace_pages->next; | ||
1530 | } | ||
1531 | |||
1532 | return &ftrace_pages->records[ftrace_pages->index++]; | ||
1533 | } | ||
1534 | |||
1535 | static struct dyn_ftrace * | ||
1536 | ftrace_record_ip(unsigned long ip) | ||
1537 | { | ||
1538 | struct dyn_ftrace *rec; | ||
1539 | |||
1540 | if (ftrace_disabled) | ||
1541 | return NULL; | ||
1542 | |||
1543 | rec = ftrace_alloc_dyn_node(ip); | ||
1544 | if (!rec) | ||
1545 | return NULL; | ||
1546 | |||
1547 | rec->ip = ip; | ||
1548 | |||
1549 | return rec; | ||
1550 | } | ||
1551 | |||
1552 | static void print_ip_ins(const char *fmt, unsigned char *p) | 1552 | static void print_ip_ins(const char *fmt, unsigned char *p) |
1553 | { | 1553 | { |
1554 | int i; | 1554 | int i; |
@@ -1598,21 +1598,6 @@ void ftrace_bug(int failed, unsigned long ip) | |||
1598 | } | 1598 | } |
1599 | } | 1599 | } |
1600 | 1600 | ||
1601 | |||
1602 | /* Return 1 if the address range is reserved for ftrace */ | ||
1603 | int ftrace_text_reserved(void *start, void *end) | ||
1604 | { | ||
1605 | struct dyn_ftrace *rec; | ||
1606 | struct ftrace_page *pg; | ||
1607 | |||
1608 | do_for_each_ftrace_rec(pg, rec) { | ||
1609 | if (rec->ip <= (unsigned long)end && | ||
1610 | rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start) | ||
1611 | return 1; | ||
1612 | } while_for_each_ftrace_rec(); | ||
1613 | return 0; | ||
1614 | } | ||
1615 | |||
1616 | static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) | 1601 | static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) |
1617 | { | 1602 | { |
1618 | unsigned long flag = 0UL; | 1603 | unsigned long flag = 0UL; |
@@ -1698,7 +1683,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | |||
1698 | return -1; /* unknown ftrace bug */ | 1683 | return -1; /* unknown ftrace bug */ |
1699 | } | 1684 | } |
1700 | 1685 | ||
1701 | static void ftrace_replace_code(int update) | 1686 | void __weak ftrace_replace_code(int enable) |
1702 | { | 1687 | { |
1703 | struct dyn_ftrace *rec; | 1688 | struct dyn_ftrace *rec; |
1704 | struct ftrace_page *pg; | 1689 | struct ftrace_page *pg; |
@@ -1708,7 +1693,7 @@ static void ftrace_replace_code(int update) | |||
1708 | return; | 1693 | return; |
1709 | 1694 | ||
1710 | do_for_each_ftrace_rec(pg, rec) { | 1695 | do_for_each_ftrace_rec(pg, rec) { |
1711 | failed = __ftrace_replace_code(rec, update); | 1696 | failed = __ftrace_replace_code(rec, enable); |
1712 | if (failed) { | 1697 | if (failed) { |
1713 | ftrace_bug(failed, rec->ip); | 1698 | ftrace_bug(failed, rec->ip); |
1714 | /* Stop processing */ | 1699 | /* Stop processing */ |
@@ -1826,22 +1811,27 @@ int __weak ftrace_arch_code_modify_post_process(void) | |||
1826 | return 0; | 1811 | return 0; |
1827 | } | 1812 | } |
1828 | 1813 | ||
1829 | static int __ftrace_modify_code(void *data) | 1814 | void ftrace_modify_all_code(int command) |
1830 | { | 1815 | { |
1831 | int *command = data; | 1816 | if (command & FTRACE_UPDATE_CALLS) |
1832 | |||
1833 | if (*command & FTRACE_UPDATE_CALLS) | ||
1834 | ftrace_replace_code(1); | 1817 | ftrace_replace_code(1); |
1835 | else if (*command & FTRACE_DISABLE_CALLS) | 1818 | else if (command & FTRACE_DISABLE_CALLS) |
1836 | ftrace_replace_code(0); | 1819 | ftrace_replace_code(0); |
1837 | 1820 | ||
1838 | if (*command & FTRACE_UPDATE_TRACE_FUNC) | 1821 | if (command & FTRACE_UPDATE_TRACE_FUNC) |
1839 | ftrace_update_ftrace_func(ftrace_trace_function); | 1822 | ftrace_update_ftrace_func(ftrace_trace_function); |
1840 | 1823 | ||
1841 | if (*command & FTRACE_START_FUNC_RET) | 1824 | if (command & FTRACE_START_FUNC_RET) |
1842 | ftrace_enable_ftrace_graph_caller(); | 1825 | ftrace_enable_ftrace_graph_caller(); |
1843 | else if (*command & FTRACE_STOP_FUNC_RET) | 1826 | else if (command & FTRACE_STOP_FUNC_RET) |
1844 | ftrace_disable_ftrace_graph_caller(); | 1827 | ftrace_disable_ftrace_graph_caller(); |
1828 | } | ||
1829 | |||
1830 | static int __ftrace_modify_code(void *data) | ||
1831 | { | ||
1832 | int *command = data; | ||
1833 | |||
1834 | ftrace_modify_all_code(*command); | ||
1845 | 1835 | ||
1846 | return 0; | 1836 | return 0; |
1847 | } | 1837 | } |
@@ -3666,22 +3656,36 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) | |||
3666 | return 0; | 3656 | return 0; |
3667 | } | 3657 | } |
3668 | 3658 | ||
3669 | static void ftrace_swap_recs(void *a, void *b, int size) | 3659 | static int ftrace_cmp_ips(const void *a, const void *b) |
3660 | { | ||
3661 | const unsigned long *ipa = a; | ||
3662 | const unsigned long *ipb = b; | ||
3663 | |||
3664 | if (*ipa > *ipb) | ||
3665 | return 1; | ||
3666 | if (*ipa < *ipb) | ||
3667 | return -1; | ||
3668 | return 0; | ||
3669 | } | ||
3670 | |||
3671 | static void ftrace_swap_ips(void *a, void *b, int size) | ||
3670 | { | 3672 | { |
3671 | struct dyn_ftrace *reca = a; | 3673 | unsigned long *ipa = a; |
3672 | struct dyn_ftrace *recb = b; | 3674 | unsigned long *ipb = b; |
3673 | struct dyn_ftrace t; | 3675 | unsigned long t; |
3674 | 3676 | ||
3675 | t = *reca; | 3677 | t = *ipa; |
3676 | *reca = *recb; | 3678 | *ipa = *ipb; |
3677 | *recb = t; | 3679 | *ipb = t; |
3678 | } | 3680 | } |
3679 | 3681 | ||
3680 | static int ftrace_process_locs(struct module *mod, | 3682 | static int ftrace_process_locs(struct module *mod, |
3681 | unsigned long *start, | 3683 | unsigned long *start, |
3682 | unsigned long *end) | 3684 | unsigned long *end) |
3683 | { | 3685 | { |
3686 | struct ftrace_page *start_pg; | ||
3684 | struct ftrace_page *pg; | 3687 | struct ftrace_page *pg; |
3688 | struct dyn_ftrace *rec; | ||
3685 | unsigned long count; | 3689 | unsigned long count; |
3686 | unsigned long *p; | 3690 | unsigned long *p; |
3687 | unsigned long addr; | 3691 | unsigned long addr; |
@@ -3693,8 +3697,11 @@ static int ftrace_process_locs(struct module *mod, | |||
3693 | if (!count) | 3697 | if (!count) |
3694 | return 0; | 3698 | return 0; |
3695 | 3699 | ||
3696 | pg = ftrace_allocate_pages(count); | 3700 | sort(start, count, sizeof(*start), |
3697 | if (!pg) | 3701 | ftrace_cmp_ips, ftrace_swap_ips); |
3702 | |||
3703 | start_pg = ftrace_allocate_pages(count); | ||
3704 | if (!start_pg) | ||
3698 | return -ENOMEM; | 3705 | return -ENOMEM; |
3699 | 3706 | ||
3700 | mutex_lock(&ftrace_lock); | 3707 | mutex_lock(&ftrace_lock); |
@@ -3707,7 +3714,7 @@ static int ftrace_process_locs(struct module *mod, | |||
3707 | if (!mod) { | 3714 | if (!mod) { |
3708 | WARN_ON(ftrace_pages || ftrace_pages_start); | 3715 | WARN_ON(ftrace_pages || ftrace_pages_start); |
3709 | /* First initialization */ | 3716 | /* First initialization */ |
3710 | ftrace_pages = ftrace_pages_start = pg; | 3717 | ftrace_pages = ftrace_pages_start = start_pg; |
3711 | } else { | 3718 | } else { |
3712 | if (!ftrace_pages) | 3719 | if (!ftrace_pages) |
3713 | goto out; | 3720 | goto out; |
@@ -3718,11 +3725,11 @@ static int ftrace_process_locs(struct module *mod, | |||
3718 | ftrace_pages = ftrace_pages->next; | 3725 | ftrace_pages = ftrace_pages->next; |
3719 | } | 3726 | } |
3720 | 3727 | ||
3721 | ftrace_pages->next = pg; | 3728 | ftrace_pages->next = start_pg; |
3722 | ftrace_pages = pg; | ||
3723 | } | 3729 | } |
3724 | 3730 | ||
3725 | p = start; | 3731 | p = start; |
3732 | pg = start_pg; | ||
3726 | while (p < end) { | 3733 | while (p < end) { |
3727 | addr = ftrace_call_adjust(*p++); | 3734 | addr = ftrace_call_adjust(*p++); |
3728 | /* | 3735 | /* |
@@ -3733,17 +3740,26 @@ static int ftrace_process_locs(struct module *mod, | |||
3733 | */ | 3740 | */ |
3734 | if (!addr) | 3741 | if (!addr) |
3735 | continue; | 3742 | continue; |
3736 | if (!ftrace_record_ip(addr)) | 3743 | |
3737 | break; | 3744 | if (pg->index == pg->size) { |
3745 | /* We should have allocated enough */ | ||
3746 | if (WARN_ON(!pg->next)) | ||
3747 | break; | ||
3748 | pg = pg->next; | ||
3749 | } | ||
3750 | |||
3751 | rec = &pg->records[pg->index++]; | ||
3752 | rec->ip = addr; | ||
3738 | } | 3753 | } |
3739 | 3754 | ||
3740 | /* These new locations need to be initialized */ | 3755 | /* We should have used all pages */ |
3741 | ftrace_new_pgs = pg; | 3756 | WARN_ON(pg->next); |
3742 | 3757 | ||
3743 | /* Make each individual set of pages sorted by ips */ | 3758 | /* Assign the last page to ftrace_pages */ |
3744 | for (; pg; pg = pg->next) | 3759 | ftrace_pages = pg; |
3745 | sort(pg->records, pg->index, sizeof(struct dyn_ftrace), | 3760 | |
3746 | ftrace_cmp_recs, ftrace_swap_recs); | 3761 | /* These new locations need to be initialized */ |
3762 | ftrace_new_pgs = start_pg; | ||
3747 | 3763 | ||
3748 | /* | 3764 | /* |
3749 | * We only need to disable interrupts on start up | 3765 | * We only need to disable interrupts on start up |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2d5eb332082..6420cda6233 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -23,6 +23,8 @@ | |||
23 | #include <asm/local.h> | 23 | #include <asm/local.h> |
24 | #include "trace.h" | 24 | #include "trace.h" |
25 | 25 | ||
26 | static void update_pages_handler(struct work_struct *work); | ||
27 | |||
26 | /* | 28 | /* |
27 | * The ring buffer header is special. We must manually up keep it. | 29 | * The ring buffer header is special. We must manually up keep it. |
28 | */ | 30 | */ |
@@ -470,12 +472,15 @@ struct ring_buffer_per_cpu { | |||
470 | /* ring buffer pages to update, > 0 to add, < 0 to remove */ | 472 | /* ring buffer pages to update, > 0 to add, < 0 to remove */ |
471 | int nr_pages_to_update; | 473 | int nr_pages_to_update; |
472 | struct list_head new_pages; /* new pages to add */ | 474 | struct list_head new_pages; /* new pages to add */ |
475 | struct work_struct update_pages_work; | ||
476 | struct completion update_done; | ||
473 | }; | 477 | }; |
474 | 478 | ||
475 | struct ring_buffer { | 479 | struct ring_buffer { |
476 | unsigned flags; | 480 | unsigned flags; |
477 | int cpus; | 481 | int cpus; |
478 | atomic_t record_disabled; | 482 | atomic_t record_disabled; |
483 | atomic_t resize_disabled; | ||
479 | cpumask_var_t cpumask; | 484 | cpumask_var_t cpumask; |
480 | 485 | ||
481 | struct lock_class_key *reader_lock_key; | 486 | struct lock_class_key *reader_lock_key; |
@@ -940,6 +945,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) | |||
940 | struct list_head *head = cpu_buffer->pages; | 945 | struct list_head *head = cpu_buffer->pages; |
941 | struct buffer_page *bpage, *tmp; | 946 | struct buffer_page *bpage, *tmp; |
942 | 947 | ||
948 | /* Reset the head page if it exists */ | ||
949 | if (cpu_buffer->head_page) | ||
950 | rb_set_head_page(cpu_buffer); | ||
951 | |||
943 | rb_head_page_deactivate(cpu_buffer); | 952 | rb_head_page_deactivate(cpu_buffer); |
944 | 953 | ||
945 | if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) | 954 | if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) |
@@ -1048,6 +1057,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) | |||
1048 | raw_spin_lock_init(&cpu_buffer->reader_lock); | 1057 | raw_spin_lock_init(&cpu_buffer->reader_lock); |
1049 | lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); | 1058 | lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); |
1050 | cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | 1059 | cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; |
1060 | INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); | ||
1061 | init_completion(&cpu_buffer->update_done); | ||
1051 | 1062 | ||
1052 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), | 1063 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), |
1053 | GFP_KERNEL, cpu_to_node(cpu)); | 1064 | GFP_KERNEL, cpu_to_node(cpu)); |
@@ -1235,70 +1246,222 @@ void ring_buffer_set_clock(struct ring_buffer *buffer, | |||
1235 | 1246 | ||
1236 | static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); | 1247 | static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); |
1237 | 1248 | ||
1238 | static void | 1249 | static inline unsigned long rb_page_entries(struct buffer_page *bpage) |
1239 | rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) | ||
1240 | { | 1250 | { |
1241 | struct buffer_page *bpage; | 1251 | return local_read(&bpage->entries) & RB_WRITE_MASK; |
1242 | struct list_head *p; | 1252 | } |
1243 | unsigned i; | 1253 | |
1254 | static inline unsigned long rb_page_write(struct buffer_page *bpage) | ||
1255 | { | ||
1256 | return local_read(&bpage->write) & RB_WRITE_MASK; | ||
1257 | } | ||
1258 | |||
1259 | static int | ||
1260 | rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) | ||
1261 | { | ||
1262 | struct list_head *tail_page, *to_remove, *next_page; | ||
1263 | struct buffer_page *to_remove_page, *tmp_iter_page; | ||
1264 | struct buffer_page *last_page, *first_page; | ||
1265 | unsigned int nr_removed; | ||
1266 | unsigned long head_bit; | ||
1267 | int page_entries; | ||
1268 | |||
1269 | head_bit = 0; | ||
1244 | 1270 | ||
1245 | raw_spin_lock_irq(&cpu_buffer->reader_lock); | 1271 | raw_spin_lock_irq(&cpu_buffer->reader_lock); |
1246 | rb_head_page_deactivate(cpu_buffer); | 1272 | atomic_inc(&cpu_buffer->record_disabled); |
1273 | /* | ||
1274 | * We don't race with the readers since we have acquired the reader | ||
1275 | * lock. We also don't race with writers after disabling recording. | ||
1276 | * This makes it easy to figure out the first and the last page to be | ||
1277 | * removed from the list. We unlink all the pages in between including | ||
1278 | * the first and last pages. This is done in a busy loop so that we | ||
1279 | * lose the least number of traces. | ||
1280 | * The pages are freed after we restart recording and unlock readers. | ||
1281 | */ | ||
1282 | tail_page = &cpu_buffer->tail_page->list; | ||
1247 | 1283 | ||
1248 | for (i = 0; i < nr_pages; i++) { | 1284 | /* |
1249 | if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) | 1285 | * tail page might be on reader page, we remove the next page |
1250 | goto out; | 1286 | * from the ring buffer |
1251 | p = cpu_buffer->pages->next; | 1287 | */ |
1252 | bpage = list_entry(p, struct buffer_page, list); | 1288 | if (cpu_buffer->tail_page == cpu_buffer->reader_page) |
1253 | list_del_init(&bpage->list); | 1289 | tail_page = rb_list_head(tail_page->next); |
1254 | free_buffer_page(bpage); | 1290 | to_remove = tail_page; |
1291 | |||
1292 | /* start of pages to remove */ | ||
1293 | first_page = list_entry(rb_list_head(to_remove->next), | ||
1294 | struct buffer_page, list); | ||
1295 | |||
1296 | for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) { | ||
1297 | to_remove = rb_list_head(to_remove)->next; | ||
1298 | head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD; | ||
1255 | } | 1299 | } |
1256 | if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) | ||
1257 | goto out; | ||
1258 | 1300 | ||
1259 | rb_reset_cpu(cpu_buffer); | 1301 | next_page = rb_list_head(to_remove)->next; |
1260 | rb_check_pages(cpu_buffer); | ||
1261 | 1302 | ||
1262 | out: | 1303 | /* |
1304 | * Now we remove all pages between tail_page and next_page. | ||
1305 | * Make sure that we have head_bit value preserved for the | ||
1306 | * next page | ||
1307 | */ | ||
1308 | tail_page->next = (struct list_head *)((unsigned long)next_page | | ||
1309 | head_bit); | ||
1310 | next_page = rb_list_head(next_page); | ||
1311 | next_page->prev = tail_page; | ||
1312 | |||
1313 | /* make sure pages points to a valid page in the ring buffer */ | ||
1314 | cpu_buffer->pages = next_page; | ||
1315 | |||
1316 | /* update head page */ | ||
1317 | if (head_bit) | ||
1318 | cpu_buffer->head_page = list_entry(next_page, | ||
1319 | struct buffer_page, list); | ||
1320 | |||
1321 | /* | ||
1322 | * change read pointer to make sure any read iterators reset | ||
1323 | * themselves | ||
1324 | */ | ||
1325 | cpu_buffer->read = 0; | ||
1326 | |||
1327 | /* pages are removed, resume tracing and then free the pages */ | ||
1328 | atomic_dec(&cpu_buffer->record_disabled); | ||
1263 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); | 1329 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); |
1330 | |||
1331 | RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)); | ||
1332 | |||
1333 | /* last buffer page to remove */ | ||
1334 | last_page = list_entry(rb_list_head(to_remove), struct buffer_page, | ||
1335 | list); | ||
1336 | tmp_iter_page = first_page; | ||
1337 | |||
1338 | do { | ||
1339 | to_remove_page = tmp_iter_page; | ||
1340 | rb_inc_page(cpu_buffer, &tmp_iter_page); | ||
1341 | |||
1342 | /* update the counters */ | ||
1343 | page_entries = rb_page_entries(to_remove_page); | ||
1344 | if (page_entries) { | ||
1345 | /* | ||
1346 | * If something was added to this page, it was full | ||
1347 | * since it is not the tail page. So we deduct the | ||
1348 | * bytes consumed in ring buffer from here. | ||
1349 | * No need to update overruns, since this page is | ||
1350 | * deleted from ring buffer and its entries are | ||
1351 | * already accounted for. | ||
1352 | */ | ||
1353 | local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); | ||
1354 | } | ||
1355 | |||
1356 | /* | ||
1357 | * We have already removed references to this list item, just | ||
1358 | * free up the buffer_page and its page | ||
1359 | */ | ||
1360 | free_buffer_page(to_remove_page); | ||
1361 | nr_removed--; | ||
1362 | |||
1363 | } while (to_remove_page != last_page); | ||
1364 | |||
1365 | RB_WARN_ON(cpu_buffer, nr_removed); | ||
1366 | |||
1367 | return nr_removed == 0; | ||
1264 | } | 1368 | } |
1265 | 1369 | ||
1266 | static void | 1370 | static int |
1267 | rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, | 1371 | rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) |
1268 | struct list_head *pages, unsigned nr_pages) | ||
1269 | { | 1372 | { |
1270 | struct buffer_page *bpage; | 1373 | struct list_head *pages = &cpu_buffer->new_pages; |
1271 | struct list_head *p; | 1374 | int retries, success; |
1272 | unsigned i; | ||
1273 | 1375 | ||
1274 | raw_spin_lock_irq(&cpu_buffer->reader_lock); | 1376 | raw_spin_lock_irq(&cpu_buffer->reader_lock); |
1275 | rb_head_page_deactivate(cpu_buffer); | 1377 | /* |
1378 | * We are holding the reader lock, so the reader page won't be swapped | ||
1379 | * in the ring buffer. Now we are racing with the writer trying to | ||
1380 | * move head page and the tail page. | ||
1381 | * We are going to adapt the reader page update process where: | ||
1382 | * 1. We first splice the start and end of list of new pages between | ||
1383 | * the head page and its previous page. | ||
1384 | * 2. We cmpxchg the prev_page->next to point from head page to the | ||
1385 | * start of new pages list. | ||
1386 | * 3. Finally, we update the head->prev to the end of new list. | ||
1387 | * | ||
1388 | * We will try this process 10 times, to make sure that we don't keep | ||
1389 | * spinning. | ||
1390 | */ | ||
1391 | retries = 10; | ||
1392 | success = 0; | ||
1393 | while (retries--) { | ||
1394 | struct list_head *head_page, *prev_page, *r; | ||
1395 | struct list_head *last_page, *first_page; | ||
1396 | struct list_head *head_page_with_bit; | ||
1276 | 1397 | ||
1277 | for (i = 0; i < nr_pages; i++) { | 1398 | head_page = &rb_set_head_page(cpu_buffer)->list; |
1278 | if (RB_WARN_ON(cpu_buffer, list_empty(pages))) | 1399 | prev_page = head_page->prev; |
1279 | goto out; | 1400 | |
1280 | p = pages->next; | 1401 | first_page = pages->next; |
1281 | bpage = list_entry(p, struct buffer_page, list); | 1402 | last_page = pages->prev; |
1282 | list_del_init(&bpage->list); | 1403 | |
1283 | list_add_tail(&bpage->list, cpu_buffer->pages); | 1404 | head_page_with_bit = (struct list_head *) |
1405 | ((unsigned long)head_page | RB_PAGE_HEAD); | ||
1406 | |||
1407 | last_page->next = head_page_with_bit; | ||
1408 | first_page->prev = prev_page; | ||
1409 | |||
1410 | r = cmpxchg(&prev_page->next, head_page_with_bit, first_page); | ||
1411 | |||
1412 | if (r == head_page_with_bit) { | ||
1413 | /* | ||
1414 | * yay, we replaced the page pointer to our new list, | ||
1415 | * now we just have to update the head page's prev | ||
1416 | * pointer to point to the end of the list | ||
1417 | */ | ||
1418 | head_page->prev = last_page; | ||
1419 | success = 1; | ||
1420 | break; | ||
1421 | } | ||
1284 | } | 1422 | } |
1285 | rb_reset_cpu(cpu_buffer); | ||
1286 | rb_check_pages(cpu_buffer); | ||
1287 | 1423 | ||
1288 | out: | 1424 | if (success) |
1425 | INIT_LIST_HEAD(pages); | ||
1426 | /* | ||
1427 | * If we weren't successful in adding the new pages, warn and stop | ||
1428 | * tracing | ||
1429 | */ | ||
1430 | RB_WARN_ON(cpu_buffer, !success); | ||
1289 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); | 1431 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); |
1432 | |||
1433 | /* free pages if they weren't inserted */ | ||
1434 | if (!success) { | ||
1435 | struct buffer_page *bpage, *tmp; | ||
1436 | list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, | ||
1437 | list) { | ||
1438 | list_del_init(&bpage->list); | ||
1439 | free_buffer_page(bpage); | ||
1440 | } | ||
1441 | } | ||
1442 | return success; | ||
1290 | } | 1443 | } |
1291 | 1444 | ||
1292 | static void update_pages_handler(struct ring_buffer_per_cpu *cpu_buffer) | 1445 | static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer) |
1293 | { | 1446 | { |
1447 | int success; | ||
1448 | |||
1294 | if (cpu_buffer->nr_pages_to_update > 0) | 1449 | if (cpu_buffer->nr_pages_to_update > 0) |
1295 | rb_insert_pages(cpu_buffer, &cpu_buffer->new_pages, | 1450 | success = rb_insert_pages(cpu_buffer); |
1296 | cpu_buffer->nr_pages_to_update); | ||
1297 | else | 1451 | else |
1298 | rb_remove_pages(cpu_buffer, -cpu_buffer->nr_pages_to_update); | 1452 | success = rb_remove_pages(cpu_buffer, |
1299 | cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update; | 1453 | -cpu_buffer->nr_pages_to_update); |
1300 | /* reset this value */ | 1454 | |
1301 | cpu_buffer->nr_pages_to_update = 0; | 1455 | if (success) |
1456 | cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update; | ||
1457 | } | ||
1458 | |||
1459 | static void update_pages_handler(struct work_struct *work) | ||
1460 | { | ||
1461 | struct ring_buffer_per_cpu *cpu_buffer = container_of(work, | ||
1462 | struct ring_buffer_per_cpu, update_pages_work); | ||
1463 | rb_update_pages(cpu_buffer); | ||
1464 | complete(&cpu_buffer->update_done); | ||
1302 | } | 1465 | } |
1303 | 1466 | ||
1304 | /** | 1467 | /** |
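
Condensed from the rb_insert_pages() hunk above, the lock-free splice it retries up to ten times boils down to three steps; the variable names below are the hunk's own:

    /* Condensed restatement of the splice step in rb_insert_pages(). */
    last_page->next  = head_page_with_bit;  /* 1. point the new list at the head */
    first_page->prev = prev_page;           /*    and back at the head's prev    */

    /* 2. atomically redirect prev_page->next, but only if it still points at
     *    the (flagged) head page, i.e. no writer moved the head meanwhile */
    r = cmpxchg(&prev_page->next, head_page_with_bit, first_page);

    if (r == head_page_with_bit)
            head_page->prev = last_page;    /* 3. close the ring at the new tail */
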
@@ -1308,14 +1471,14 @@ static void update_pages_handler(struct ring_buffer_per_cpu *cpu_buffer) | |||
1308 | * | 1471 | * |
1309 | * Minimum size is 2 * BUF_PAGE_SIZE. | 1472 | * Minimum size is 2 * BUF_PAGE_SIZE. |
1310 | * | 1473 | * |
1311 | * Returns -1 on failure. | 1474 | * Returns 0 on success and < 0 on failure. |
1312 | */ | 1475 | */ |
1313 | int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, | 1476 | int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, |
1314 | int cpu_id) | 1477 | int cpu_id) |
1315 | { | 1478 | { |
1316 | struct ring_buffer_per_cpu *cpu_buffer; | 1479 | struct ring_buffer_per_cpu *cpu_buffer; |
1317 | unsigned nr_pages; | 1480 | unsigned nr_pages; |
1318 | int cpu; | 1481 | int cpu, err = 0; |
1319 | 1482 | ||
1320 | /* | 1483 | /* |
1321 | * Always succeed at resizing a non-existent buffer: | 1484 | * Always succeed at resizing a non-existent buffer: |
@@ -1330,15 +1493,18 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, | |||
1330 | if (size < BUF_PAGE_SIZE * 2) | 1493 | if (size < BUF_PAGE_SIZE * 2) |
1331 | size = BUF_PAGE_SIZE * 2; | 1494 | size = BUF_PAGE_SIZE * 2; |
1332 | 1495 | ||
1333 | atomic_inc(&buffer->record_disabled); | 1496 | nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); |
1334 | 1497 | ||
1335 | /* Make sure all writers are done with this buffer. */ | 1498 | /* |
1336 | synchronize_sched(); | 1499 | * Don't succeed if resizing is disabled, as a reader might be |
1500 | * manipulating the ring buffer and is expecting a sane state while | ||
1501 | * this is true. | ||
1502 | */ | ||
1503 | if (atomic_read(&buffer->resize_disabled)) | ||
1504 | return -EBUSY; | ||
1337 | 1505 | ||
1506 | /* prevent another thread from changing buffer sizes */ | ||
1338 | mutex_lock(&buffer->mutex); | 1507 | mutex_lock(&buffer->mutex); |
1339 | get_online_cpus(); | ||
1340 | |||
1341 | nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); | ||
1342 | 1508 | ||
1343 | if (cpu_id == RING_BUFFER_ALL_CPUS) { | 1509 | if (cpu_id == RING_BUFFER_ALL_CPUS) { |
1344 | /* calculate the pages to update */ | 1510 | /* calculate the pages to update */ |
@@ -1347,33 +1513,57 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, | |||
1347 | 1513 | ||
1348 | cpu_buffer->nr_pages_to_update = nr_pages - | 1514 | cpu_buffer->nr_pages_to_update = nr_pages - |
1349 | cpu_buffer->nr_pages; | 1515 | cpu_buffer->nr_pages; |
1350 | |||
1351 | /* | 1516 | /* |
1352 | * nothing more to do for removing pages or no update | 1517 | * nothing more to do for removing pages or no update |
1353 | */ | 1518 | */ |
1354 | if (cpu_buffer->nr_pages_to_update <= 0) | 1519 | if (cpu_buffer->nr_pages_to_update <= 0) |
1355 | continue; | 1520 | continue; |
1356 | |||
1357 | /* | 1521 | /* |
1358 | * to add pages, make sure all new pages can be | 1522 | * to add pages, make sure all new pages can be |
1359 | * allocated without receiving ENOMEM | 1523 | * allocated without receiving ENOMEM |
1360 | */ | 1524 | */ |
1361 | INIT_LIST_HEAD(&cpu_buffer->new_pages); | 1525 | INIT_LIST_HEAD(&cpu_buffer->new_pages); |
1362 | if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update, | 1526 | if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update, |
1363 | &cpu_buffer->new_pages, cpu)) | 1527 | &cpu_buffer->new_pages, cpu)) { |
1364 | /* not enough memory for new pages */ | 1528 | /* not enough memory for new pages */ |
1365 | goto no_mem; | 1529 | err = -ENOMEM; |
1530 | goto out_err; | ||
1531 | } | ||
1532 | } | ||
1533 | |||
1534 | get_online_cpus(); | ||
1535 | /* | ||
1536 | * Fire off all the required work handlers | ||
1537 | * We can't schedule on offline CPUs, but it's not necessary | ||
1538 | * since we can change their buffer sizes without any race. | ||
1539 | */ | ||
1540 | for_each_buffer_cpu(buffer, cpu) { | ||
1541 | cpu_buffer = buffer->buffers[cpu]; | ||
1542 | if (!cpu_buffer->nr_pages_to_update) | ||
1543 | continue; | ||
1544 | |||
1545 | if (cpu_online(cpu)) | ||
1546 | schedule_work_on(cpu, | ||
1547 | &cpu_buffer->update_pages_work); | ||
1548 | else | ||
1549 | rb_update_pages(cpu_buffer); | ||
1366 | } | 1550 | } |
1367 | 1551 | ||
1368 | /* wait for all the updates to complete */ | 1552 | /* wait for all the updates to complete */ |
1369 | for_each_buffer_cpu(buffer, cpu) { | 1553 | for_each_buffer_cpu(buffer, cpu) { |
1370 | cpu_buffer = buffer->buffers[cpu]; | 1554 | cpu_buffer = buffer->buffers[cpu]; |
1371 | if (cpu_buffer->nr_pages_to_update) { | 1555 | if (!cpu_buffer->nr_pages_to_update) |
1372 | update_pages_handler(cpu_buffer); | 1556 | continue; |
1373 | } | 1557 | |
1558 | if (cpu_online(cpu)) | ||
1559 | wait_for_completion(&cpu_buffer->update_done); | ||
1560 | cpu_buffer->nr_pages_to_update = 0; | ||
1374 | } | 1561 | } |
1562 | |||
1563 | put_online_cpus(); | ||
1375 | } else { | 1564 | } else { |
1376 | cpu_buffer = buffer->buffers[cpu_id]; | 1565 | cpu_buffer = buffer->buffers[cpu_id]; |
1566 | |||
1377 | if (nr_pages == cpu_buffer->nr_pages) | 1567 | if (nr_pages == cpu_buffer->nr_pages) |
1378 | goto out; | 1568 | goto out; |
1379 | 1569 | ||
@@ -1383,38 +1573,69 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, | |||
1383 | INIT_LIST_HEAD(&cpu_buffer->new_pages); | 1573 | INIT_LIST_HEAD(&cpu_buffer->new_pages); |
1384 | if (cpu_buffer->nr_pages_to_update > 0 && | 1574 | if (cpu_buffer->nr_pages_to_update > 0 && |
1385 | __rb_allocate_pages(cpu_buffer->nr_pages_to_update, | 1575 | __rb_allocate_pages(cpu_buffer->nr_pages_to_update, |
1386 | &cpu_buffer->new_pages, cpu_id)) | 1576 | &cpu_buffer->new_pages, cpu_id)) { |
1387 | goto no_mem; | 1577 | err = -ENOMEM; |
1578 | goto out_err; | ||
1579 | } | ||
1388 | 1580 | ||
1389 | update_pages_handler(cpu_buffer); | 1581 | get_online_cpus(); |
1582 | |||
1583 | if (cpu_online(cpu_id)) { | ||
1584 | schedule_work_on(cpu_id, | ||
1585 | &cpu_buffer->update_pages_work); | ||
1586 | wait_for_completion(&cpu_buffer->update_done); | ||
1587 | } else | ||
1588 | rb_update_pages(cpu_buffer); | ||
1589 | |||
1590 | cpu_buffer->nr_pages_to_update = 0; | ||
1591 | put_online_cpus(); | ||
1390 | } | 1592 | } |
1391 | 1593 | ||
1392 | out: | 1594 | out: |
1393 | put_online_cpus(); | 1595 | /* |
1394 | mutex_unlock(&buffer->mutex); | 1596 | * The ring buffer resize can happen with the ring buffer |
1395 | 1597 | * enabled, so that the update disturbs the tracing as little | |
1396 | atomic_dec(&buffer->record_disabled); | 1598 | * as possible. But if the buffer is disabled, we do not need |
1599 | * to worry about that, and we can take the time to verify | ||
1600 | * that the buffer is not corrupt. | ||
1601 | */ | ||
1602 | if (atomic_read(&buffer->record_disabled)) { | ||
1603 | atomic_inc(&buffer->record_disabled); | ||
1604 | /* | ||
1605 | * Even though the buffer was disabled, we must make sure | ||
1606 | * that it is truly disabled before calling rb_check_pages. | ||
1607 | * There could have been a race between checking | ||
1608 | * record_disable and incrementing it. | ||
1609 | */ | ||
1610 | synchronize_sched(); | ||
1611 | for_each_buffer_cpu(buffer, cpu) { | ||
1612 | cpu_buffer = buffer->buffers[cpu]; | ||
1613 | rb_check_pages(cpu_buffer); | ||
1614 | } | ||
1615 | atomic_dec(&buffer->record_disabled); | ||
1616 | } | ||
1397 | 1617 | ||
1618 | mutex_unlock(&buffer->mutex); | ||
1398 | return size; | 1619 | return size; |
1399 | 1620 | ||
1400 | no_mem: | 1621 | out_err: |
1401 | for_each_buffer_cpu(buffer, cpu) { | 1622 | for_each_buffer_cpu(buffer, cpu) { |
1402 | struct buffer_page *bpage, *tmp; | 1623 | struct buffer_page *bpage, *tmp; |
1624 | |||
1403 | cpu_buffer = buffer->buffers[cpu]; | 1625 | cpu_buffer = buffer->buffers[cpu]; |
1404 | /* reset this number regardless */ | ||
1405 | cpu_buffer->nr_pages_to_update = 0; | 1626 | cpu_buffer->nr_pages_to_update = 0; |
1627 | |||
1406 | if (list_empty(&cpu_buffer->new_pages)) | 1628 | if (list_empty(&cpu_buffer->new_pages)) |
1407 | continue; | 1629 | continue; |
1630 | |||
1408 | list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, | 1631 | list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, |
1409 | list) { | 1632 | list) { |
1410 | list_del_init(&bpage->list); | 1633 | list_del_init(&bpage->list); |
1411 | free_buffer_page(bpage); | 1634 | free_buffer_page(bpage); |
1412 | } | 1635 | } |
1413 | } | 1636 | } |
1414 | put_online_cpus(); | ||
1415 | mutex_unlock(&buffer->mutex); | 1637 | mutex_unlock(&buffer->mutex); |
1416 | atomic_dec(&buffer->record_disabled); | 1638 | return err; |
1417 | return -ENOMEM; | ||
1418 | } | 1639 | } |
1419 | EXPORT_SYMBOL_GPL(ring_buffer_resize); | 1640 | EXPORT_SYMBOL_GPL(ring_buffer_resize); |
1420 | 1641 | ||
@@ -1453,21 +1674,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter) | |||
1453 | return __rb_page_index(iter->head_page, iter->head); | 1674 | return __rb_page_index(iter->head_page, iter->head); |
1454 | } | 1675 | } |
1455 | 1676 | ||
1456 | static inline unsigned long rb_page_write(struct buffer_page *bpage) | ||
1457 | { | ||
1458 | return local_read(&bpage->write) & RB_WRITE_MASK; | ||
1459 | } | ||
1460 | |||
1461 | static inline unsigned rb_page_commit(struct buffer_page *bpage) | 1677 | static inline unsigned rb_page_commit(struct buffer_page *bpage) |
1462 | { | 1678 | { |
1463 | return local_read(&bpage->page->commit); | 1679 | return local_read(&bpage->page->commit); |
1464 | } | 1680 | } |
1465 | 1681 | ||
1466 | static inline unsigned long rb_page_entries(struct buffer_page *bpage) | ||
1467 | { | ||
1468 | return local_read(&bpage->entries) & RB_WRITE_MASK; | ||
1469 | } | ||
1470 | |||
1471 | /* Size is determined by what has been committed */ | 1682 | /* Size is determined by what has been committed */ |
1472 | static inline unsigned rb_page_size(struct buffer_page *bpage) | 1683 | static inline unsigned rb_page_size(struct buffer_page *bpage) |
1473 | { | 1684 | { |
@@ -3492,6 +3703,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) | |||
3492 | 3703 | ||
3493 | iter->cpu_buffer = cpu_buffer; | 3704 | iter->cpu_buffer = cpu_buffer; |
3494 | 3705 | ||
3706 | atomic_inc(&buffer->resize_disabled); | ||
3495 | atomic_inc(&cpu_buffer->record_disabled); | 3707 | atomic_inc(&cpu_buffer->record_disabled); |
3496 | 3708 | ||
3497 | return iter; | 3709 | return iter; |
@@ -3554,7 +3766,14 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter) | |||
3554 | { | 3766 | { |
3555 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; | 3767 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; |
3556 | 3768 | ||
3769 | /* | ||
3770 | * Ring buffer is disabled from recording, here's a good place | ||
3771 | * to check the integrity of the ring buffer. | ||
3772 | */ | ||
3773 | rb_check_pages(cpu_buffer); | ||
3774 | |||
3557 | atomic_dec(&cpu_buffer->record_disabled); | 3775 | atomic_dec(&cpu_buffer->record_disabled); |
3776 | atomic_dec(&cpu_buffer->buffer->resize_disabled); | ||
3558 | kfree(iter); | 3777 | kfree(iter); |
3559 | } | 3778 | } |
3560 | EXPORT_SYMBOL_GPL(ring_buffer_read_finish); | 3779 | EXPORT_SYMBOL_GPL(ring_buffer_read_finish); |
@@ -3626,6 +3845,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) | |||
3626 | cpu_buffer->commit_page = cpu_buffer->head_page; | 3845 | cpu_buffer->commit_page = cpu_buffer->head_page; |
3627 | 3846 | ||
3628 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); | 3847 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); |
3848 | INIT_LIST_HEAD(&cpu_buffer->new_pages); | ||
3629 | local_set(&cpu_buffer->reader_page->write, 0); | 3849 | local_set(&cpu_buffer->reader_page->write, 0); |
3630 | local_set(&cpu_buffer->reader_page->entries, 0); | 3850 | local_set(&cpu_buffer->reader_page->entries, 0); |
3631 | local_set(&cpu_buffer->reader_page->page->commit, 0); | 3851 | local_set(&cpu_buffer->reader_page->page->commit, 0); |
@@ -3662,8 +3882,12 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) | |||
3662 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | 3882 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) |
3663 | return; | 3883 | return; |
3664 | 3884 | ||
3885 | atomic_inc(&buffer->resize_disabled); | ||
3665 | atomic_inc(&cpu_buffer->record_disabled); | 3886 | atomic_inc(&cpu_buffer->record_disabled); |
3666 | 3887 | ||
3888 | /* Make sure all commits have finished */ | ||
3889 | synchronize_sched(); | ||
3890 | |||
3667 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | 3891 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); |
3668 | 3892 | ||
3669 | if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) | 3893 | if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) |
@@ -3679,6 +3903,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) | |||
3679 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | 3903 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); |
3680 | 3904 | ||
3681 | atomic_dec(&cpu_buffer->record_disabled); | 3905 | atomic_dec(&cpu_buffer->record_disabled); |
3906 | atomic_dec(&buffer->resize_disabled); | ||
3682 | } | 3907 | } |
3683 | EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); | 3908 | EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); |
3684 | 3909 | ||
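The ring_buffer.c hunks above bracket iterator setup/teardown and the per-cpu reset path with a buffer-wide resize_disabled count, so that a concurrent resize can back off while a reader or reset is in flight. Below is a minimal user-space sketch of that counting guard; the names and the retry policy are illustrative, not the kernel's implementation.

#include <stdatomic.h>
#include <stdio.h>

/* Illustrative analogue of buffer->resize_disabled. */
static atomic_int resize_disabled;

static void reader_prepare(void) { atomic_fetch_add(&resize_disabled, 1); }
static void reader_finish(void)  { atomic_fetch_sub(&resize_disabled, 1); }

/* A resize only proceeds when no reader or reset holds the guard. */
static int try_resize(size_t new_pages)
{
        if (atomic_load(&resize_disabled) > 0)
                return -1;      /* caller would retry later */
        printf("resizing to %zu pages\n", new_pages);
        return 0;
}

int main(void)
{
        reader_prepare();
        if (try_resize(64) < 0)
                printf("resize deferred: reader active\n");
        reader_finish();
        return try_resize(64);
}
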
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 48ef4960ec9..68032c6177d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -763,8 +763,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
763 | * Register a new plugin tracer. | 763 | * Register a new plugin tracer. |
764 | */ | 764 | */ |
765 | int register_tracer(struct tracer *type) | 765 | int register_tracer(struct tracer *type) |
766 | __releases(kernel_lock) | ||
767 | __acquires(kernel_lock) | ||
768 | { | 766 | { |
769 | struct tracer *t; | 767 | struct tracer *t; |
770 | int ret = 0; | 768 | int ret = 0; |
@@ -2669,10 +2667,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, | |||
2669 | if (cpumask_test_cpu(cpu, tracing_cpumask) && | 2667 | if (cpumask_test_cpu(cpu, tracing_cpumask) && |
2670 | !cpumask_test_cpu(cpu, tracing_cpumask_new)) { | 2668 | !cpumask_test_cpu(cpu, tracing_cpumask_new)) { |
2671 | atomic_inc(&global_trace.data[cpu]->disabled); | 2669 | atomic_inc(&global_trace.data[cpu]->disabled); |
2670 | ring_buffer_record_disable_cpu(global_trace.buffer, cpu); | ||
2672 | } | 2671 | } |
2673 | if (!cpumask_test_cpu(cpu, tracing_cpumask) && | 2672 | if (!cpumask_test_cpu(cpu, tracing_cpumask) && |
2674 | cpumask_test_cpu(cpu, tracing_cpumask_new)) { | 2673 | cpumask_test_cpu(cpu, tracing_cpumask_new)) { |
2675 | atomic_dec(&global_trace.data[cpu]->disabled); | 2674 | atomic_dec(&global_trace.data[cpu]->disabled); |
2675 | ring_buffer_record_enable_cpu(global_trace.buffer, cpu); | ||
2676 | } | 2676 | } |
2677 | } | 2677 | } |
2678 | arch_spin_unlock(&ftrace_max_lock); | 2678 | arch_spin_unlock(&ftrace_max_lock); |
@@ -3076,20 +3076,10 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu) | |||
3076 | 3076 | ||
3077 | static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) | 3077 | static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) |
3078 | { | 3078 | { |
3079 | int cpu, ret = size; | 3079 | int ret = size; |
3080 | 3080 | ||
3081 | mutex_lock(&trace_types_lock); | 3081 | mutex_lock(&trace_types_lock); |
3082 | 3082 | ||
3083 | tracing_stop(); | ||
3084 | |||
3085 | /* disable all cpu buffers */ | ||
3086 | for_each_tracing_cpu(cpu) { | ||
3087 | if (global_trace.data[cpu]) | ||
3088 | atomic_inc(&global_trace.data[cpu]->disabled); | ||
3089 | if (max_tr.data[cpu]) | ||
3090 | atomic_inc(&max_tr.data[cpu]->disabled); | ||
3091 | } | ||
3092 | |||
3093 | if (cpu_id != RING_BUFFER_ALL_CPUS) { | 3083 | if (cpu_id != RING_BUFFER_ALL_CPUS) { |
3094 | /* make sure, this cpu is enabled in the mask */ | 3084 | /* make sure, this cpu is enabled in the mask */ |
3095 | if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) { | 3085 | if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) { |
@@ -3103,14 +3093,6 @@ static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) | |||
3103 | ret = -ENOMEM; | 3093 | ret = -ENOMEM; |
3104 | 3094 | ||
3105 | out: | 3095 | out: |
3106 | for_each_tracing_cpu(cpu) { | ||
3107 | if (global_trace.data[cpu]) | ||
3108 | atomic_dec(&global_trace.data[cpu]->disabled); | ||
3109 | if (max_tr.data[cpu]) | ||
3110 | atomic_dec(&max_tr.data[cpu]->disabled); | ||
3111 | } | ||
3112 | |||
3113 | tracing_start(); | ||
3114 | mutex_unlock(&trace_types_lock); | 3096 | mutex_unlock(&trace_types_lock); |
3115 | 3097 | ||
3116 | return ret; | 3098 | return ret; |
@@ -3875,14 +3857,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3875 | struct print_entry *entry; | 3857 | struct print_entry *entry; |
3876 | unsigned long irq_flags; | 3858 | unsigned long irq_flags; |
3877 | struct page *pages[2]; | 3859 | struct page *pages[2]; |
3860 | void *map_page[2]; | ||
3878 | int nr_pages = 1; | 3861 | int nr_pages = 1; |
3879 | ssize_t written; | 3862 | ssize_t written; |
3880 | void *page1; | ||
3881 | void *page2; | ||
3882 | int offset; | 3863 | int offset; |
3883 | int size; | 3864 | int size; |
3884 | int len; | 3865 | int len; |
3885 | int ret; | 3866 | int ret; |
3867 | int i; | ||
3886 | 3868 | ||
3887 | if (tracing_disabled) | 3869 | if (tracing_disabled) |
3888 | return -EINVAL; | 3870 | return -EINVAL; |
@@ -3921,9 +3903,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3921 | goto out; | 3903 | goto out; |
3922 | } | 3904 | } |
3923 | 3905 | ||
3924 | page1 = kmap_atomic(pages[0]); | 3906 | for (i = 0; i < nr_pages; i++) |
3925 | if (nr_pages == 2) | 3907 | map_page[i] = kmap_atomic(pages[i]); |
3926 | page2 = kmap_atomic(pages[1]); | ||
3927 | 3908 | ||
3928 | local_save_flags(irq_flags); | 3909 | local_save_flags(irq_flags); |
3929 | size = sizeof(*entry) + cnt + 2; /* possible \n added */ | 3910 | size = sizeof(*entry) + cnt + 2; /* possible \n added */ |
@@ -3941,10 +3922,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3941 | 3922 | ||
3942 | if (nr_pages == 2) { | 3923 | if (nr_pages == 2) { |
3943 | len = PAGE_SIZE - offset; | 3924 | len = PAGE_SIZE - offset; |
3944 | memcpy(&entry->buf, page1 + offset, len); | 3925 | memcpy(&entry->buf, map_page[0] + offset, len); |
3945 | memcpy(&entry->buf[len], page2, cnt - len); | 3926 | memcpy(&entry->buf[len], map_page[1], cnt - len); |
3946 | } else | 3927 | } else |
3947 | memcpy(&entry->buf, page1 + offset, cnt); | 3928 | memcpy(&entry->buf, map_page[0] + offset, cnt); |
3948 | 3929 | ||
3949 | if (entry->buf[cnt - 1] != '\n') { | 3930 | if (entry->buf[cnt - 1] != '\n') { |
3950 | entry->buf[cnt] = '\n'; | 3931 | entry->buf[cnt] = '\n'; |
@@ -3959,11 +3940,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3959 | *fpos += written; | 3940 | *fpos += written; |
3960 | 3941 | ||
3961 | out_unlock: | 3942 | out_unlock: |
3962 | if (nr_pages == 2) | 3943 | for (i = 0; i < nr_pages; i++){ |
3963 | kunmap_atomic(page2); | 3944 | kunmap_atomic(map_page[i]); |
3964 | kunmap_atomic(page1); | 3945 | put_page(pages[i]); |
3965 | while (nr_pages > 0) | 3946 | } |
3966 | put_page(pages[--nr_pages]); | ||
3967 | out: | 3947 | out: |
3968 | return written; | 3948 | return written; |
3969 | } | 3949 | } |
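The tracing_mark_write() rewrite above folds the page1/page2 locals into a small map_page[] array so mapping, copying and unmapping handle one or two pages uniformly. A user-space sketch of the same split copy for a record that may straddle a page boundary follows; PAGE_SIZE and the buffer contents are assumptions for illustration.

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096          /* assumed page size for the example */

/* Copy cnt bytes that start at 'offset' inside page 0 and may spill into page 1. */
static void copy_split(char *dst, char *pages[], size_t offset, size_t cnt)
{
        if (offset + cnt > PAGE_SIZE) {
                size_t len = PAGE_SIZE - offset;        /* bytes left in the first page */
                memcpy(dst, pages[0] + offset, len);
                memcpy(dst + len, pages[1], cnt - len);
        } else {
                memcpy(dst, pages[0] + offset, cnt);
        }
}

int main(void)
{
        static char p0[PAGE_SIZE], p1[PAGE_SIZE];
        char out[32], *pages[2] = { p0, p1 };

        memcpy(p0 + PAGE_SIZE - 5, "hello", 5);
        memcpy(p1, " world", 6);
        copy_split(out, pages, PAGE_SIZE - 5, 11);
        out[11] = '\0';
        puts(out);              /* prints "hello world" */
        return 0;
}
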
@@ -4494,6 +4474,9 @@ static void tracing_init_debugfs_percpu(long cpu) | |||
4494 | struct dentry *d_cpu; | 4474 | struct dentry *d_cpu; |
4495 | char cpu_dir[30]; /* 30 characters should be more than enough */ | 4475 | char cpu_dir[30]; /* 30 characters should be more than enough */ |
4496 | 4476 | ||
4477 | if (!d_percpu) | ||
4478 | return; | ||
4479 | |||
4497 | snprintf(cpu_dir, 30, "cpu%ld", cpu); | 4480 | snprintf(cpu_dir, 30, "cpu%ld", cpu); |
4498 | d_cpu = debugfs_create_dir(cpu_dir, d_percpu); | 4481 | d_cpu = debugfs_create_dir(cpu_dir, d_percpu); |
4499 | if (!d_cpu) { | 4482 | if (!d_cpu) { |
@@ -4759,7 +4742,8 @@ static ssize_t | |||
4759 | rb_simple_read(struct file *filp, char __user *ubuf, | 4742 | rb_simple_read(struct file *filp, char __user *ubuf, |
4760 | size_t cnt, loff_t *ppos) | 4743 | size_t cnt, loff_t *ppos) |
4761 | { | 4744 | { |
4762 | struct ring_buffer *buffer = filp->private_data; | 4745 | struct trace_array *tr = filp->private_data; |
4746 | struct ring_buffer *buffer = tr->buffer; | ||
4763 | char buf[64]; | 4747 | char buf[64]; |
4764 | int r; | 4748 | int r; |
4765 | 4749 | ||
@@ -4777,7 +4761,8 @@ static ssize_t | |||
4777 | rb_simple_write(struct file *filp, const char __user *ubuf, | 4761 | rb_simple_write(struct file *filp, const char __user *ubuf, |
4778 | size_t cnt, loff_t *ppos) | 4762 | size_t cnt, loff_t *ppos) |
4779 | { | 4763 | { |
4780 | struct ring_buffer *buffer = filp->private_data; | 4764 | struct trace_array *tr = filp->private_data; |
4765 | struct ring_buffer *buffer = tr->buffer; | ||
4781 | unsigned long val; | 4766 | unsigned long val; |
4782 | int ret; | 4767 | int ret; |
4783 | 4768 | ||
@@ -4864,7 +4849,7 @@ static __init int tracer_init_debugfs(void) | |||
4864 | &trace_clock_fops); | 4849 | &trace_clock_fops); |
4865 | 4850 | ||
4866 | trace_create_file("tracing_on", 0644, d_tracer, | 4851 | trace_create_file("tracing_on", 0644, d_tracer, |
4867 | global_trace.buffer, &rb_simple_fops); | 4852 | &global_trace, &rb_simple_fops); |
4868 | 4853 | ||
4869 | #ifdef CONFIG_DYNAMIC_FTRACE | 4854 | #ifdef CONFIG_DYNAMIC_FTRACE |
4870 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, | 4855 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, |
@@ -5127,7 +5112,8 @@ __init static int tracer_alloc_buffers(void) | |||
5127 | max_tr.data[i] = &per_cpu(max_tr_data, i); | 5112 | max_tr.data[i] = &per_cpu(max_tr_data, i); |
5128 | } | 5113 | } |
5129 | 5114 | ||
5130 | set_buffer_entries(&global_trace, ring_buf_size); | 5115 | set_buffer_entries(&global_trace, |
5116 | ring_buffer_size(global_trace.buffer, 0)); | ||
5131 | #ifdef CONFIG_TRACER_MAX_TRACE | 5117 | #ifdef CONFIG_TRACER_MAX_TRACE |
5132 | set_buffer_entries(&max_tr, 1); | 5118 | set_buffer_entries(&max_tr, 1); |
5133 | #endif | 5119 | #endif |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a7d28e033a9..5aec220d2de 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -843,11 +843,11 @@ void trace_printk_init_buffers(void); | |||
843 | filter) | 843 | filter) |
844 | #include "trace_entries.h" | 844 | #include "trace_entries.h" |
845 | 845 | ||
846 | #ifdef CONFIG_FUNCTION_TRACER | 846 | #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) |
847 | int perf_ftrace_event_register(struct ftrace_event_call *call, | 847 | int perf_ftrace_event_register(struct ftrace_event_call *call, |
848 | enum trace_reg type, void *data); | 848 | enum trace_reg type, void *data); |
849 | #else | 849 | #else |
850 | #define perf_ftrace_event_register NULL | 850 | #define perf_ftrace_event_register NULL |
851 | #endif /* CONFIG_FUNCTION_TRACER */ | 851 | #endif |
852 | 852 | ||
853 | #endif /* _LINUX_KERNEL_TRACE_H */ | 853 | #endif /* _LINUX_KERNEL_TRACE_H */ |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 079a93ae8a9..29111da1d10 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -294,6 +294,9 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, | |||
294 | if (!call->name || !call->class || !call->class->reg) | 294 | if (!call->name || !call->class || !call->class->reg) |
295 | continue; | 295 | continue; |
296 | 296 | ||
297 | if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) | ||
298 | continue; | ||
299 | |||
297 | if (match && | 300 | if (match && |
298 | strcmp(match, call->name) != 0 && | 301 | strcmp(match, call->name) != 0 && |
299 | strcmp(match, call->class->system) != 0) | 302 | strcmp(match, call->class->system) != 0) |
@@ -1164,7 +1167,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, | |||
1164 | return -1; | 1167 | return -1; |
1165 | } | 1168 | } |
1166 | 1169 | ||
1167 | if (call->class->reg) | 1170 | if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) |
1168 | trace_create_file("enable", 0644, call->dir, call, | 1171 | trace_create_file("enable", 0644, call->dir, call, |
1169 | enable); | 1172 | enable); |
1170 | 1173 | ||
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 3dd15e8bc85..e039906b037 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
@@ -180,6 +180,7 @@ struct ftrace_event_call __used event_##call = { \ | |||
180 | .event.type = etype, \ | 180 | .event.type = etype, \ |
181 | .class = &event_class_ftrace_##call, \ | 181 | .class = &event_class_ftrace_##call, \ |
182 | .print_fmt = print, \ | 182 | .print_fmt = print, \ |
183 | .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ | ||
183 | }; \ | 184 | }; \ |
184 | struct ftrace_event_call __used \ | 185 | struct ftrace_event_call __used \ |
185 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; | 186 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; |
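The trace_events.c and trace_export.c hunks cooperate through the new TRACE_EVENT_FL_IGNORE_ENABLE flag: trace_export.c marks the internal ftrace events with it, and trace_events.c then skips them both when matching set_event input and when creating the per-event "enable" file. A small sketch of that flag filtering; the event table and the flag bit below are made up for illustration.

#include <stdio.h>

#define FL_IGNORE_ENABLE (1 << 0)       /* illustrative flag bit */

struct event {
        const char *name;
        unsigned int flags;
};

int main(void)
{
        struct event events[] = {
                { "sched_switch", 0 },
                { "function",     FL_IGNORE_ENABLE },   /* internal, not user-toggled */
                { "irq_handler",  0 },
        };

        for (size_t i = 0; i < sizeof(events) / sizeof(events[0]); i++) {
                if (events[i].flags & FL_IGNORE_ENABLE)
                        continue;               /* no "enable" file, never matched */
                printf("would create enable file for %s\n", events[i].name);
        }
        return 0;
}
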
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 859fae6b182..df611a0e76c 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -652,6 +652,8 @@ int trace_print_lat_context(struct trace_iterator *iter) | |||
652 | { | 652 | { |
653 | u64 next_ts; | 653 | u64 next_ts; |
654 | int ret; | 654 | int ret; |
655 | /* trace_find_next_entry will reset ent_size */ | ||
656 | int ent_size = iter->ent_size; | ||
655 | struct trace_seq *s = &iter->seq; | 657 | struct trace_seq *s = &iter->seq; |
656 | struct trace_entry *entry = iter->ent, | 658 | struct trace_entry *entry = iter->ent, |
657 | *next_entry = trace_find_next_entry(iter, NULL, | 659 | *next_entry = trace_find_next_entry(iter, NULL, |
@@ -660,6 +662,9 @@ int trace_print_lat_context(struct trace_iterator *iter) | |||
660 | unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); | 662 | unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); |
661 | unsigned long rel_usecs; | 663 | unsigned long rel_usecs; |
662 | 664 | ||
665 | /* Restore the original ent_size */ | ||
666 | iter->ent_size = ent_size; | ||
667 | |||
663 | if (!next_entry) | 668 | if (!next_entry) |
664 | next_ts = iter->ts; | 669 | next_ts = iter->ts; |
665 | rel_usecs = ns2usecs(next_ts - iter->ts); | 670 | rel_usecs = ns2usecs(next_ts - iter->ts); |
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c deleted file mode 100644 index 209b379a472..00000000000 --- a/kernel/trace/trace_workqueue.c +++ /dev/null | |||
@@ -1,300 +0,0 @@ | |||
1 | /* | ||
2 | * Workqueue statistical tracer. | ||
3 | * | ||
4 | * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> | ||
5 | * | ||
6 | */ | ||
7 | |||
8 | |||
9 | #include <trace/events/workqueue.h> | ||
10 | #include <linux/list.h> | ||
11 | #include <linux/percpu.h> | ||
12 | #include <linux/slab.h> | ||
13 | #include <linux/kref.h> | ||
14 | #include "trace_stat.h" | ||
15 | #include "trace.h" | ||
16 | |||
17 | |||
18 | /* A cpu workqueue thread */ | ||
19 | struct cpu_workqueue_stats { | ||
20 | struct list_head list; | ||
21 | struct kref kref; | ||
22 | int cpu; | ||
23 | pid_t pid; | ||
24 | /* Can be inserted from interrupt or user context, need to be atomic */ | ||
25 | atomic_t inserted; | ||
26 | /* | ||
27 | * Don't need to be atomic, works are serialized in a single workqueue thread | ||
28 | * on a single CPU. | ||
29 | */ | ||
30 | unsigned int executed; | ||
31 | }; | ||
32 | |||
33 | /* List of workqueue threads on one cpu */ | ||
34 | struct workqueue_global_stats { | ||
35 | struct list_head list; | ||
36 | spinlock_t lock; | ||
37 | }; | ||
38 | |||
39 | /* Don't need a global lock because allocated before the workqueues, and | ||
40 | * never freed. | ||
41 | */ | ||
42 | static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); | ||
43 | #define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) | ||
44 | |||
45 | static void cpu_workqueue_stat_free(struct kref *kref) | ||
46 | { | ||
47 | kfree(container_of(kref, struct cpu_workqueue_stats, kref)); | ||
48 | } | ||
49 | |||
50 | /* Insertion of a work */ | ||
51 | static void | ||
52 | probe_workqueue_insertion(void *ignore, | ||
53 | struct task_struct *wq_thread, | ||
54 | struct work_struct *work) | ||
55 | { | ||
56 | int cpu = cpumask_first(&wq_thread->cpus_allowed); | ||
57 | struct cpu_workqueue_stats *node; | ||
58 | unsigned long flags; | ||
59 | |||
60 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
61 | list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { | ||
62 | if (node->pid == wq_thread->pid) { | ||
63 | atomic_inc(&node->inserted); | ||
64 | goto found; | ||
65 | } | ||
66 | } | ||
67 | pr_debug("trace_workqueue: entry not found\n"); | ||
68 | found: | ||
69 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
70 | } | ||
71 | |||
72 | /* Execution of a work */ | ||
73 | static void | ||
74 | probe_workqueue_execution(void *ignore, | ||
75 | struct task_struct *wq_thread, | ||
76 | struct work_struct *work) | ||
77 | { | ||
78 | int cpu = cpumask_first(&wq_thread->cpus_allowed); | ||
79 | struct cpu_workqueue_stats *node; | ||
80 | unsigned long flags; | ||
81 | |||
82 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
83 | list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { | ||
84 | if (node->pid == wq_thread->pid) { | ||
85 | node->executed++; | ||
86 | goto found; | ||
87 | } | ||
88 | } | ||
89 | pr_debug("trace_workqueue: entry not found\n"); | ||
90 | found: | ||
91 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
92 | } | ||
93 | |||
94 | /* Creation of a cpu workqueue thread */ | ||
95 | static void probe_workqueue_creation(void *ignore, | ||
96 | struct task_struct *wq_thread, int cpu) | ||
97 | { | ||
98 | struct cpu_workqueue_stats *cws; | ||
99 | unsigned long flags; | ||
100 | |||
101 | WARN_ON(cpu < 0); | ||
102 | |||
103 | /* Workqueues are sometimes created in atomic context */ | ||
104 | cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC); | ||
105 | if (!cws) { | ||
106 | pr_warning("trace_workqueue: not enough memory\n"); | ||
107 | return; | ||
108 | } | ||
109 | INIT_LIST_HEAD(&cws->list); | ||
110 | kref_init(&cws->kref); | ||
111 | cws->cpu = cpu; | ||
112 | cws->pid = wq_thread->pid; | ||
113 | |||
114 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
115 | list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list); | ||
116 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
117 | } | ||
118 | |||
119 | /* Destruction of a cpu workqueue thread */ | ||
120 | static void | ||
121 | probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread) | ||
122 | { | ||
123 | /* Workqueue only execute on one cpu */ | ||
124 | int cpu = cpumask_first(&wq_thread->cpus_allowed); | ||
125 | struct cpu_workqueue_stats *node, *next; | ||
126 | unsigned long flags; | ||
127 | |||
128 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
129 | list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, | ||
130 | list) { | ||
131 | if (node->pid == wq_thread->pid) { | ||
132 | list_del(&node->list); | ||
133 | kref_put(&node->kref, cpu_workqueue_stat_free); | ||
134 | goto found; | ||
135 | } | ||
136 | } | ||
137 | |||
138 | pr_debug("trace_workqueue: don't find workqueue to destroy\n"); | ||
139 | found: | ||
140 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
141 | |||
142 | } | ||
143 | |||
144 | static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) | ||
145 | { | ||
146 | unsigned long flags; | ||
147 | struct cpu_workqueue_stats *ret = NULL; | ||
148 | |||
149 | |||
150 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
151 | |||
152 | if (!list_empty(&workqueue_cpu_stat(cpu)->list)) { | ||
153 | ret = list_entry(workqueue_cpu_stat(cpu)->list.next, | ||
154 | struct cpu_workqueue_stats, list); | ||
155 | kref_get(&ret->kref); | ||
156 | } | ||
157 | |||
158 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
159 | |||
160 | return ret; | ||
161 | } | ||
162 | |||
163 | static void *workqueue_stat_start(struct tracer_stat *trace) | ||
164 | { | ||
165 | int cpu; | ||
166 | void *ret = NULL; | ||
167 | |||
168 | for_each_possible_cpu(cpu) { | ||
169 | ret = workqueue_stat_start_cpu(cpu); | ||
170 | if (ret) | ||
171 | return ret; | ||
172 | } | ||
173 | return NULL; | ||
174 | } | ||
175 | |||
176 | static void *workqueue_stat_next(void *prev, int idx) | ||
177 | { | ||
178 | struct cpu_workqueue_stats *prev_cws = prev; | ||
179 | struct cpu_workqueue_stats *ret; | ||
180 | int cpu = prev_cws->cpu; | ||
181 | unsigned long flags; | ||
182 | |||
183 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
184 | if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { | ||
185 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
186 | do { | ||
187 | cpu = cpumask_next(cpu, cpu_possible_mask); | ||
188 | if (cpu >= nr_cpu_ids) | ||
189 | return NULL; | ||
190 | } while (!(ret = workqueue_stat_start_cpu(cpu))); | ||
191 | return ret; | ||
192 | } else { | ||
193 | ret = list_entry(prev_cws->list.next, | ||
194 | struct cpu_workqueue_stats, list); | ||
195 | kref_get(&ret->kref); | ||
196 | } | ||
197 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
198 | |||
199 | return ret; | ||
200 | } | ||
201 | |||
202 | static int workqueue_stat_show(struct seq_file *s, void *p) | ||
203 | { | ||
204 | struct cpu_workqueue_stats *cws = p; | ||
205 | struct pid *pid; | ||
206 | struct task_struct *tsk; | ||
207 | |||
208 | pid = find_get_pid(cws->pid); | ||
209 | if (pid) { | ||
210 | tsk = get_pid_task(pid, PIDTYPE_PID); | ||
211 | if (tsk) { | ||
212 | seq_printf(s, "%3d %6d %6u %s\n", cws->cpu, | ||
213 | atomic_read(&cws->inserted), cws->executed, | ||
214 | tsk->comm); | ||
215 | put_task_struct(tsk); | ||
216 | } | ||
217 | put_pid(pid); | ||
218 | } | ||
219 | |||
220 | return 0; | ||
221 | } | ||
222 | |||
223 | static void workqueue_stat_release(void *stat) | ||
224 | { | ||
225 | struct cpu_workqueue_stats *node = stat; | ||
226 | |||
227 | kref_put(&node->kref, cpu_workqueue_stat_free); | ||
228 | } | ||
229 | |||
230 | static int workqueue_stat_headers(struct seq_file *s) | ||
231 | { | ||
232 | seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); | ||
233 | seq_printf(s, "# | | | |\n"); | ||
234 | return 0; | ||
235 | } | ||
236 | |||
237 | struct tracer_stat workqueue_stats __read_mostly = { | ||
238 | .name = "workqueues", | ||
239 | .stat_start = workqueue_stat_start, | ||
240 | .stat_next = workqueue_stat_next, | ||
241 | .stat_show = workqueue_stat_show, | ||
242 | .stat_release = workqueue_stat_release, | ||
243 | .stat_headers = workqueue_stat_headers | ||
244 | }; | ||
245 | |||
246 | |||
247 | int __init stat_workqueue_init(void) | ||
248 | { | ||
249 | if (register_stat_tracer(&workqueue_stats)) { | ||
250 | pr_warning("Unable to register workqueue stat tracer\n"); | ||
251 | return 1; | ||
252 | } | ||
253 | |||
254 | return 0; | ||
255 | } | ||
256 | fs_initcall(stat_workqueue_init); | ||
257 | |||
258 | /* | ||
259 | * Workqueues are created very early, just after pre-smp initcalls. | ||
260 | * So we must register our tracepoints at this stage. | ||
261 | */ | ||
262 | int __init trace_workqueue_early_init(void) | ||
263 | { | ||
264 | int ret, cpu; | ||
265 | |||
266 | for_each_possible_cpu(cpu) { | ||
267 | spin_lock_init(&workqueue_cpu_stat(cpu)->lock); | ||
268 | INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); | ||
269 | } | ||
270 | |||
271 | ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); | ||
272 | if (ret) | ||
273 | goto out; | ||
274 | |||
275 | ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL); | ||
276 | if (ret) | ||
277 | goto no_insertion; | ||
278 | |||
279 | ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL); | ||
280 | if (ret) | ||
281 | goto no_execution; | ||
282 | |||
283 | ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL); | ||
284 | if (ret) | ||
285 | goto no_creation; | ||
286 | |||
287 | return 0; | ||
288 | |||
289 | no_creation: | ||
290 | unregister_trace_workqueue_creation(probe_workqueue_creation, NULL); | ||
291 | no_execution: | ||
292 | unregister_trace_workqueue_execution(probe_workqueue_execution, NULL); | ||
293 | no_insertion: | ||
294 | unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL); | ||
295 | out: | ||
296 | pr_warning("trace_workqueue: unable to trace workqueues\n"); | ||
297 | |||
298 | return 1; | ||
299 | } | ||
300 | early_initcall(trace_workqueue_early_init); | ||
diff --git a/kernel/uid16.c b/kernel/uid16.c index 51c6e89e861..d7948eb1022 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c | |||
@@ -81,14 +81,19 @@ SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) | |||
81 | return ret; | 81 | return ret; |
82 | } | 82 | } |
83 | 83 | ||
84 | SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruid, old_uid_t __user *, euid, old_uid_t __user *, suid) | 84 | SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp) |
85 | { | 85 | { |
86 | const struct cred *cred = current_cred(); | 86 | const struct cred *cred = current_cred(); |
87 | int retval; | 87 | int retval; |
88 | old_uid_t ruid, euid, suid; | ||
88 | 89 | ||
89 | if (!(retval = put_user(high2lowuid(cred->uid), ruid)) && | 90 | ruid = high2lowuid(from_kuid_munged(cred->user_ns, cred->uid)); |
90 | !(retval = put_user(high2lowuid(cred->euid), euid))) | 91 | euid = high2lowuid(from_kuid_munged(cred->user_ns, cred->euid)); |
91 | retval = put_user(high2lowuid(cred->suid), suid); | 92 | suid = high2lowuid(from_kuid_munged(cred->user_ns, cred->suid)); |
93 | |||
94 | if (!(retval = put_user(ruid, ruidp)) && | ||
95 | !(retval = put_user(euid, euidp))) | ||
96 | retval = put_user(suid, suidp); | ||
92 | 97 | ||
93 | return retval; | 98 | return retval; |
94 | } | 99 | } |
@@ -103,14 +108,19 @@ SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) | |||
103 | } | 108 | } |
104 | 109 | ||
105 | 110 | ||
106 | SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgid, old_gid_t __user *, egid, old_gid_t __user *, sgid) | 111 | SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp) |
107 | { | 112 | { |
108 | const struct cred *cred = current_cred(); | 113 | const struct cred *cred = current_cred(); |
109 | int retval; | 114 | int retval; |
115 | old_gid_t rgid, egid, sgid; | ||
116 | |||
117 | rgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->gid)); | ||
118 | egid = high2lowgid(from_kgid_munged(cred->user_ns, cred->egid)); | ||
119 | sgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->sgid)); | ||
110 | 120 | ||
111 | if (!(retval = put_user(high2lowgid(cred->gid), rgid)) && | 121 | if (!(retval = put_user(rgid, rgidp)) && |
112 | !(retval = put_user(high2lowgid(cred->egid), egid))) | 122 | !(retval = put_user(egid, egidp))) |
113 | retval = put_user(high2lowgid(cred->sgid), sgid); | 123 | retval = put_user(sgid, sgidp); |
114 | 124 | ||
115 | return retval; | 125 | return retval; |
116 | } | 126 | } |
@@ -134,11 +144,14 @@ SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) | |||
134 | static int groups16_to_user(old_gid_t __user *grouplist, | 144 | static int groups16_to_user(old_gid_t __user *grouplist, |
135 | struct group_info *group_info) | 145 | struct group_info *group_info) |
136 | { | 146 | { |
147 | struct user_namespace *user_ns = current_user_ns(); | ||
137 | int i; | 148 | int i; |
138 | old_gid_t group; | 149 | old_gid_t group; |
150 | kgid_t kgid; | ||
139 | 151 | ||
140 | for (i = 0; i < group_info->ngroups; i++) { | 152 | for (i = 0; i < group_info->ngroups; i++) { |
141 | group = high2lowgid(GROUP_AT(group_info, i)); | 153 | kgid = GROUP_AT(group_info, i); |
154 | group = high2lowgid(from_kgid_munged(user_ns, kgid)); | ||
142 | if (put_user(group, grouplist+i)) | 155 | if (put_user(group, grouplist+i)) |
143 | return -EFAULT; | 156 | return -EFAULT; |
144 | } | 157 | } |
@@ -149,13 +162,20 @@ static int groups16_to_user(old_gid_t __user *grouplist, | |||
149 | static int groups16_from_user(struct group_info *group_info, | 162 | static int groups16_from_user(struct group_info *group_info, |
150 | old_gid_t __user *grouplist) | 163 | old_gid_t __user *grouplist) |
151 | { | 164 | { |
165 | struct user_namespace *user_ns = current_user_ns(); | ||
152 | int i; | 166 | int i; |
153 | old_gid_t group; | 167 | old_gid_t group; |
168 | kgid_t kgid; | ||
154 | 169 | ||
155 | for (i = 0; i < group_info->ngroups; i++) { | 170 | for (i = 0; i < group_info->ngroups; i++) { |
156 | if (get_user(group, grouplist+i)) | 171 | if (get_user(group, grouplist+i)) |
157 | return -EFAULT; | 172 | return -EFAULT; |
158 | GROUP_AT(group_info, i) = low2highgid(group); | 173 | |
174 | kgid = make_kgid(user_ns, low2highgid(group)); | ||
175 | if (!gid_valid(kgid)) | ||
176 | return -EINVAL; | ||
177 | |||
178 | GROUP_AT(group_info, i) = kgid; | ||
159 | } | 179 | } |
160 | 180 | ||
161 | return 0; | 181 | return 0; |
@@ -211,20 +231,20 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) | |||
211 | 231 | ||
212 | SYSCALL_DEFINE0(getuid16) | 232 | SYSCALL_DEFINE0(getuid16) |
213 | { | 233 | { |
214 | return high2lowuid(current_uid()); | 234 | return high2lowuid(from_kuid_munged(current_user_ns(), current_uid())); |
215 | } | 235 | } |
216 | 236 | ||
217 | SYSCALL_DEFINE0(geteuid16) | 237 | SYSCALL_DEFINE0(geteuid16) |
218 | { | 238 | { |
219 | return high2lowuid(current_euid()); | 239 | return high2lowuid(from_kuid_munged(current_user_ns(), current_euid())); |
220 | } | 240 | } |
221 | 241 | ||
222 | SYSCALL_DEFINE0(getgid16) | 242 | SYSCALL_DEFINE0(getgid16) |
223 | { | 243 | { |
224 | return high2lowgid(current_gid()); | 244 | return high2lowgid(from_kgid_munged(current_user_ns(), current_gid())); |
225 | } | 245 | } |
226 | 246 | ||
227 | SYSCALL_DEFINE0(getegid16) | 247 | SYSCALL_DEFINE0(getegid16) |
228 | { | 248 | { |
229 | return high2lowgid(current_egid()); | 249 | return high2lowgid(from_kgid_munged(current_user_ns(), current_egid())); |
230 | } | 250 | } |
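Each legacy 16-bit syscall above now converts in two steps: kuid/kgid to a namespace id via from_kuid_munged()/from_kgid_munged(), then a narrowing to the 16-bit type via high2lowuid()/high2lowgid(). A user-space sketch of the narrowing step, assuming the usual behaviour that ids which do not fit in 16 bits collapse to a fixed overflow id (65534 is an assumed default; the kernel reads it from the overflowuid sysctl).

#include <stdint.h>
#include <stdio.h>

typedef uint16_t old_uid_t;

static const uint32_t overflowuid = 65534;      /* assumed default */

/* Mirror of the high2lowuid() idea: ids above 16 bits become overflowuid. */
static old_uid_t high2low(uint32_t uid)
{
        return (uid & ~0xFFFFu) ? (old_uid_t)overflowuid : (old_uid_t)uid;
}

int main(void)
{
        printf("%u -> %u\n", 1000u,   (unsigned)high2low(1000));    /* 1000  */
        printf("%u -> %u\n", 100000u, (unsigned)high2low(100000));  /* 65534 */
        return 0;
}
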
diff --git a/kernel/user.c b/kernel/user.c index 71dd2363ab0..b815fefbe76 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -22,10 +22,27 @@ | |||
22 | * and 1 for... ? | 22 | * and 1 for... ? |
23 | */ | 23 | */ |
24 | struct user_namespace init_user_ns = { | 24 | struct user_namespace init_user_ns = { |
25 | .uid_map = { | ||
26 | .nr_extents = 1, | ||
27 | .extent[0] = { | ||
28 | .first = 0, | ||
29 | .lower_first = 0, | ||
30 | .count = 4294967295U, | ||
31 | }, | ||
32 | }, | ||
33 | .gid_map = { | ||
34 | .nr_extents = 1, | ||
35 | .extent[0] = { | ||
36 | .first = 0, | ||
37 | .lower_first = 0, | ||
38 | .count = 4294967295U, | ||
39 | }, | ||
40 | }, | ||
25 | .kref = { | 41 | .kref = { |
26 | .refcount = ATOMIC_INIT(3), | 42 | .refcount = ATOMIC_INIT(3), |
27 | }, | 43 | }, |
28 | .creator = &root_user, | 44 | .owner = GLOBAL_ROOT_UID, |
45 | .group = GLOBAL_ROOT_GID, | ||
29 | }; | 46 | }; |
30 | EXPORT_SYMBOL_GPL(init_user_ns); | 47 | EXPORT_SYMBOL_GPL(init_user_ns); |
31 | 48 | ||
@@ -34,11 +51,14 @@ EXPORT_SYMBOL_GPL(init_user_ns); | |||
34 | * when changing user ID's (ie setuid() and friends). | 51 | * when changing user ID's (ie setuid() and friends). |
35 | */ | 52 | */ |
36 | 53 | ||
54 | #define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 7) | ||
55 | #define UIDHASH_SZ (1 << UIDHASH_BITS) | ||
37 | #define UIDHASH_MASK (UIDHASH_SZ - 1) | 56 | #define UIDHASH_MASK (UIDHASH_SZ - 1) |
38 | #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) | 57 | #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) |
39 | #define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid))) | 58 | #define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid)))) |
40 | 59 | ||
41 | static struct kmem_cache *uid_cachep; | 60 | static struct kmem_cache *uid_cachep; |
61 | struct hlist_head uidhash_table[UIDHASH_SZ]; | ||
42 | 62 | ||
43 | /* | 63 | /* |
44 | * The uidhash_lock is mostly taken from process context, but it is | 64 | * The uidhash_lock is mostly taken from process context, but it is |
@@ -51,14 +71,14 @@ static struct kmem_cache *uid_cachep; | |||
51 | */ | 71 | */ |
52 | static DEFINE_SPINLOCK(uidhash_lock); | 72 | static DEFINE_SPINLOCK(uidhash_lock); |
53 | 73 | ||
54 | /* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */ | 74 | /* root_user.__count is 1, for init task cred */ |
55 | struct user_struct root_user = { | 75 | struct user_struct root_user = { |
56 | .__count = ATOMIC_INIT(2), | 76 | .__count = ATOMIC_INIT(1), |
57 | .processes = ATOMIC_INIT(1), | 77 | .processes = ATOMIC_INIT(1), |
58 | .files = ATOMIC_INIT(0), | 78 | .files = ATOMIC_INIT(0), |
59 | .sigpending = ATOMIC_INIT(0), | 79 | .sigpending = ATOMIC_INIT(0), |
60 | .locked_shm = 0, | 80 | .locked_shm = 0, |
61 | .user_ns = &init_user_ns, | 81 | .uid = GLOBAL_ROOT_UID, |
62 | }; | 82 | }; |
63 | 83 | ||
64 | /* | 84 | /* |
@@ -72,16 +92,15 @@ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) | |||
72 | static void uid_hash_remove(struct user_struct *up) | 92 | static void uid_hash_remove(struct user_struct *up) |
73 | { | 93 | { |
74 | hlist_del_init(&up->uidhash_node); | 94 | hlist_del_init(&up->uidhash_node); |
75 | put_user_ns(up->user_ns); | ||
76 | } | 95 | } |
77 | 96 | ||
78 | static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) | 97 | static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) |
79 | { | 98 | { |
80 | struct user_struct *user; | 99 | struct user_struct *user; |
81 | struct hlist_node *h; | 100 | struct hlist_node *h; |
82 | 101 | ||
83 | hlist_for_each_entry(user, h, hashent, uidhash_node) { | 102 | hlist_for_each_entry(user, h, hashent, uidhash_node) { |
84 | if (user->uid == uid) { | 103 | if (uid_eq(user->uid, uid)) { |
85 | atomic_inc(&user->__count); | 104 | atomic_inc(&user->__count); |
86 | return user; | 105 | return user; |
87 | } | 106 | } |
@@ -110,14 +129,13 @@ static void free_user(struct user_struct *up, unsigned long flags) | |||
110 | * | 129 | * |
111 | * If the user_struct could not be found, return NULL. | 130 | * If the user_struct could not be found, return NULL. |
112 | */ | 131 | */ |
113 | struct user_struct *find_user(uid_t uid) | 132 | struct user_struct *find_user(kuid_t uid) |
114 | { | 133 | { |
115 | struct user_struct *ret; | 134 | struct user_struct *ret; |
116 | unsigned long flags; | 135 | unsigned long flags; |
117 | struct user_namespace *ns = current_user_ns(); | ||
118 | 136 | ||
119 | spin_lock_irqsave(&uidhash_lock, flags); | 137 | spin_lock_irqsave(&uidhash_lock, flags); |
120 | ret = uid_hash_find(uid, uidhashentry(ns, uid)); | 138 | ret = uid_hash_find(uid, uidhashentry(uid)); |
121 | spin_unlock_irqrestore(&uidhash_lock, flags); | 139 | spin_unlock_irqrestore(&uidhash_lock, flags); |
122 | return ret; | 140 | return ret; |
123 | } | 141 | } |
@@ -136,9 +154,9 @@ void free_uid(struct user_struct *up) | |||
136 | local_irq_restore(flags); | 154 | local_irq_restore(flags); |
137 | } | 155 | } |
138 | 156 | ||
139 | struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | 157 | struct user_struct *alloc_uid(kuid_t uid) |
140 | { | 158 | { |
141 | struct hlist_head *hashent = uidhashentry(ns, uid); | 159 | struct hlist_head *hashent = uidhashentry(uid); |
142 | struct user_struct *up, *new; | 160 | struct user_struct *up, *new; |
143 | 161 | ||
144 | spin_lock_irq(&uidhash_lock); | 162 | spin_lock_irq(&uidhash_lock); |
@@ -153,8 +171,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | |||
153 | new->uid = uid; | 171 | new->uid = uid; |
154 | atomic_set(&new->__count, 1); | 172 | atomic_set(&new->__count, 1); |
155 | 173 | ||
156 | new->user_ns = get_user_ns(ns); | ||
157 | |||
158 | /* | 174 | /* |
159 | * Before adding this, check whether we raced | 175 | * Before adding this, check whether we raced |
160 | * on adding the same user already.. | 176 | * on adding the same user already.. |
@@ -162,7 +178,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | |||
162 | spin_lock_irq(&uidhash_lock); | 178 | spin_lock_irq(&uidhash_lock); |
163 | up = uid_hash_find(uid, hashent); | 179 | up = uid_hash_find(uid, hashent); |
164 | if (up) { | 180 | if (up) { |
165 | put_user_ns(ns); | ||
166 | key_put(new->uid_keyring); | 181 | key_put(new->uid_keyring); |
167 | key_put(new->session_keyring); | 182 | key_put(new->session_keyring); |
168 | kmem_cache_free(uid_cachep, new); | 183 | kmem_cache_free(uid_cachep, new); |
@@ -187,11 +202,11 @@ static int __init uid_cache_init(void) | |||
187 | 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); | 202 | 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); |
188 | 203 | ||
189 | for(n = 0; n < UIDHASH_SZ; ++n) | 204 | for(n = 0; n < UIDHASH_SZ; ++n) |
190 | INIT_HLIST_HEAD(init_user_ns.uidhash_table + n); | 205 | INIT_HLIST_HEAD(uidhash_table + n); |
191 | 206 | ||
192 | /* Insert the root user immediately (init already runs as root) */ | 207 | /* Insert the root user immediately (init already runs as root) */ |
193 | spin_lock_irq(&uidhash_lock); | 208 | spin_lock_irq(&uidhash_lock); |
194 | uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0)); | 209 | uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID)); |
195 | spin_unlock_irq(&uidhash_lock); | 210 | spin_unlock_irq(&uidhash_lock); |
196 | 211 | ||
197 | return 0; | 212 | return 0; |
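With the per-namespace hash removed, user_structs now live in a single global uidhash_table keyed by kuid_t and compared with uid_eq(). A runnable sketch of the bucket selection behind uidhashentry()/__uidhashfn(), assuming the 128-bucket size used when CONFIG_BASE_SMALL is not set:

#include <stdint.h>
#include <stdio.h>

#define UIDHASH_BITS 7                          /* 128 buckets, the !CONFIG_BASE_SMALL case */
#define UIDHASH_SZ   (1 << UIDHASH_BITS)
#define UIDHASH_MASK (UIDHASH_SZ - 1)

/* Same mixing as __uidhashfn(): fold the high bits into the low ones. */
static unsigned int uidhashfn(uint32_t uid)
{
        return ((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK;
}

int main(void)
{
        uint32_t uids[] = { 0, 1000, 1001, 100000 };

        for (int i = 0; i < 4; i++)
                printf("uid %6u -> bucket %u\n", (unsigned)uids[i], uidhashfn(uids[i]));
        return 0;
}
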
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 3b906e98b1d..86602316422 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -11,9 +11,20 @@ | |||
11 | #include <linux/user_namespace.h> | 11 | #include <linux/user_namespace.h> |
12 | #include <linux/highuid.h> | 12 | #include <linux/highuid.h> |
13 | #include <linux/cred.h> | 13 | #include <linux/cred.h> |
14 | #include <linux/securebits.h> | ||
15 | #include <linux/keyctl.h> | ||
16 | #include <linux/key-type.h> | ||
17 | #include <keys/user-type.h> | ||
18 | #include <linux/seq_file.h> | ||
19 | #include <linux/fs.h> | ||
20 | #include <linux/uaccess.h> | ||
21 | #include <linux/ctype.h> | ||
14 | 22 | ||
15 | static struct kmem_cache *user_ns_cachep __read_mostly; | 23 | static struct kmem_cache *user_ns_cachep __read_mostly; |
16 | 24 | ||
25 | static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, | ||
26 | struct uid_gid_map *map); | ||
27 | |||
17 | /* | 28 | /* |
18 | * Create a new user namespace, deriving the creator from the user in the | 29 | * Create a new user namespace, deriving the creator from the user in the |
19 | * passed credentials, and replacing that user with the new root user for the | 30 | * passed credentials, and replacing that user with the new root user for the |
@@ -24,109 +35,565 @@ static struct kmem_cache *user_ns_cachep __read_mostly; | |||
24 | */ | 35 | */ |
25 | int create_user_ns(struct cred *new) | 36 | int create_user_ns(struct cred *new) |
26 | { | 37 | { |
27 | struct user_namespace *ns; | 38 | struct user_namespace *ns, *parent_ns = new->user_ns; |
28 | struct user_struct *root_user; | 39 | kuid_t owner = new->euid; |
29 | int n; | 40 | kgid_t group = new->egid; |
41 | |||
42 | /* The creator needs a mapping in the parent user namespace | ||
43 | * or else we won't be able to reasonably tell userspace who | ||
44 | * created a user_namespace. | ||
45 | */ | ||
46 | if (!kuid_has_mapping(parent_ns, owner) || | ||
47 | !kgid_has_mapping(parent_ns, group)) | ||
48 | return -EPERM; | ||
30 | 49 | ||
31 | ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL); | 50 | ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); |
32 | if (!ns) | 51 | if (!ns) |
33 | return -ENOMEM; | 52 | return -ENOMEM; |
34 | 53 | ||
35 | kref_init(&ns->kref); | 54 | kref_init(&ns->kref); |
55 | ns->parent = parent_ns; | ||
56 | ns->owner = owner; | ||
57 | ns->group = group; | ||
36 | 58 | ||
37 | for (n = 0; n < UIDHASH_SZ; ++n) | 59 | /* Start with the same capabilities as init but useless for doing |
38 | INIT_HLIST_HEAD(ns->uidhash_table + n); | 60 | * anything as the capabilities are bound to the new user namespace. |
39 | 61 | */ | |
40 | /* Alloc new root user. */ | 62 | new->securebits = SECUREBITS_DEFAULT; |
41 | root_user = alloc_uid(ns, 0); | 63 | new->cap_inheritable = CAP_EMPTY_SET; |
42 | if (!root_user) { | 64 | new->cap_permitted = CAP_FULL_SET; |
43 | kmem_cache_free(user_ns_cachep, ns); | 65 | new->cap_effective = CAP_FULL_SET; |
44 | return -ENOMEM; | 66 | new->cap_bset = CAP_FULL_SET; |
45 | } | ||
46 | |||
47 | /* set the new root user in the credentials under preparation */ | ||
48 | ns->creator = new->user; | ||
49 | new->user = root_user; | ||
50 | new->uid = new->euid = new->suid = new->fsuid = 0; | ||
51 | new->gid = new->egid = new->sgid = new->fsgid = 0; | ||
52 | put_group_info(new->group_info); | ||
53 | new->group_info = get_group_info(&init_groups); | ||
54 | #ifdef CONFIG_KEYS | 67 | #ifdef CONFIG_KEYS |
55 | key_put(new->request_key_auth); | 68 | key_put(new->request_key_auth); |
56 | new->request_key_auth = NULL; | 69 | new->request_key_auth = NULL; |
57 | #endif | 70 | #endif |
58 | /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ | 71 | /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ |
59 | 72 | ||
60 | /* root_user holds a reference to ns, our reference can be dropped */ | 73 | /* Leave the new->user_ns reference with the new user namespace. */ |
61 | put_user_ns(ns); | 74 | /* Leave the reference to our user_ns with the new cred. */ |
75 | new->user_ns = ns; | ||
62 | 76 | ||
63 | return 0; | 77 | return 0; |
64 | } | 78 | } |
65 | 79 | ||
66 | /* | 80 | void free_user_ns(struct kref *kref) |
67 | * Deferred destructor for a user namespace. This is required because | ||
68 | * free_user_ns() may be called with uidhash_lock held, but we need to call | ||
69 | * back to free_uid() which will want to take the lock again. | ||
70 | */ | ||
71 | static void free_user_ns_work(struct work_struct *work) | ||
72 | { | 81 | { |
73 | struct user_namespace *ns = | 82 | struct user_namespace *parent, *ns = |
74 | container_of(work, struct user_namespace, destroyer); | 83 | container_of(kref, struct user_namespace, kref); |
75 | free_uid(ns->creator); | 84 | |
85 | parent = ns->parent; | ||
76 | kmem_cache_free(user_ns_cachep, ns); | 86 | kmem_cache_free(user_ns_cachep, ns); |
87 | put_user_ns(parent); | ||
77 | } | 88 | } |
89 | EXPORT_SYMBOL(free_user_ns); | ||
78 | 90 | ||
79 | void free_user_ns(struct kref *kref) | 91 | static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) |
80 | { | 92 | { |
81 | struct user_namespace *ns = | 93 | unsigned idx, extents; |
82 | container_of(kref, struct user_namespace, kref); | 94 | u32 first, last, id2; |
95 | |||
96 | id2 = id + count - 1; | ||
97 | |||
98 | /* Find the matching extent */ | ||
99 | extents = map->nr_extents; | ||
100 | smp_read_barrier_depends(); | ||
101 | for (idx = 0; idx < extents; idx++) { | ||
102 | first = map->extent[idx].first; | ||
103 | last = first + map->extent[idx].count - 1; | ||
104 | if (id >= first && id <= last && | ||
105 | (id2 >= first && id2 <= last)) | ||
106 | break; | ||
107 | } | ||
108 | /* Map the id or note failure */ | ||
109 | if (idx < extents) | ||
110 | id = (id - first) + map->extent[idx].lower_first; | ||
111 | else | ||
112 | id = (u32) -1; | ||
83 | 113 | ||
84 | INIT_WORK(&ns->destroyer, free_user_ns_work); | 114 | return id; |
85 | schedule_work(&ns->destroyer); | ||
86 | } | 115 | } |
87 | EXPORT_SYMBOL(free_user_ns); | ||
88 | 116 | ||
89 | uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid) | 117 | static u32 map_id_down(struct uid_gid_map *map, u32 id) |
90 | { | 118 | { |
91 | struct user_namespace *tmp; | 119 | unsigned idx, extents; |
120 | u32 first, last; | ||
92 | 121 | ||
93 | if (likely(to == cred->user->user_ns)) | 122 | /* Find the matching extent */ |
94 | return uid; | 123 | extents = map->nr_extents; |
124 | smp_read_barrier_depends(); | ||
125 | for (idx = 0; idx < extents; idx++) { | ||
126 | first = map->extent[idx].first; | ||
127 | last = first + map->extent[idx].count - 1; | ||
128 | if (id >= first && id <= last) | ||
129 | break; | ||
130 | } | ||
131 | /* Map the id or note failure */ | ||
132 | if (idx < extents) | ||
133 | id = (id - first) + map->extent[idx].lower_first; | ||
134 | else | ||
135 | id = (u32) -1; | ||
95 | 136 | ||
137 | return id; | ||
138 | } | ||
96 | 139 | ||
97 | /* Is cred->user the creator of the target user_ns | 140 | static u32 map_id_up(struct uid_gid_map *map, u32 id) |
98 | * or the creator of one of it's parents? | 141 | { |
99 | */ | 142 | unsigned idx, extents; |
100 | for ( tmp = to; tmp != &init_user_ns; | 143 | u32 first, last; |
101 | tmp = tmp->creator->user_ns ) { | 144 | |
102 | if (cred->user == tmp->creator) { | 145 | /* Find the matching extent */ |
103 | return (uid_t)0; | 146 | extents = map->nr_extents; |
104 | } | 147 | smp_read_barrier_depends(); |
148 | for (idx = 0; idx < extents; idx++) { | ||
149 | first = map->extent[idx].lower_first; | ||
150 | last = first + map->extent[idx].count - 1; | ||
151 | if (id >= first && id <= last) | ||
152 | break; | ||
105 | } | 153 | } |
154 | /* Map the id or note failure */ | ||
155 | if (idx < extents) | ||
156 | id = (id - first) + map->extent[idx].first; | ||
157 | else | ||
158 | id = (u32) -1; | ||
159 | |||
160 | return id; | ||
161 | } | ||
162 | |||
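Each uid_gid_extent maps the contiguous range [first, first + count) in a namespace onto [lower_first, lower_first + count) in its parent (or, seen from map_id_down(), onto the kernel's global id space). A runnable sketch of the down and up lookups over a one-extent table; the container-style mapping values are illustrative.

#include <stdint.h>
#include <stdio.h>

struct extent { uint32_t first, lower_first, count; };

/* namespace id -> backing (kernel/parent) id, as map_id_down() does */
static uint32_t map_down(const struct extent *e, unsigned n, uint32_t id)
{
        for (unsigned i = 0; i < n; i++)
                if (id >= e[i].first && id - e[i].first < e[i].count)
                        return id - e[i].first + e[i].lower_first;
        return (uint32_t)-1;                    /* no mapping */
}

/* backing id -> namespace id, as map_id_up() does */
static uint32_t map_up(const struct extent *e, unsigned n, uint32_t id)
{
        for (unsigned i = 0; i < n; i++)
                if (id >= e[i].lower_first && id - e[i].lower_first < e[i].count)
                        return id - e[i].lower_first + e[i].first;
        return (uint32_t)-1;
}

int main(void)
{
        /* "uids 0..65535 in the namespace are uids 100000..165535 outside" */
        struct extent map[] = { { 0, 100000, 65536 } };

        printf("down(0)     = %u\n", map_down(map, 1, 0));           /* 100000 */
        printf("up(100500)  = %u\n", map_up(map, 1, 100500));        /* 500    */
        printf("down(70000) = %d\n", (int)map_down(map, 1, 70000));  /* -1: unmapped */
        return 0;
}
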
163 | /** | ||
164 | * make_kuid - Map a user-namespace uid pair into a kuid. | ||
165 | * @ns: User namespace that the uid is in | ||
166 | * @uid: User identifier | ||
167 | * | ||
168 | * Maps a user-namespace uid pair into a kernel internal kuid, | ||
169 | * and returns that kuid. | ||
170 | * | ||
171 | * When there is no mapping defined for the user-namespace uid | ||
172 | * pair INVALID_UID is returned. Callers are expected to test | ||
173 | * for and handle INVALID_UID being returned. INVALID_UID | ||
174 | * may be tested for using uid_valid(). | ||
175 | */ | ||
176 | kuid_t make_kuid(struct user_namespace *ns, uid_t uid) | ||
177 | { | ||
178 | /* Map the uid to a global kernel uid */ | ||
179 | return KUIDT_INIT(map_id_down(&ns->uid_map, uid)); | ||
180 | } | ||
181 | EXPORT_SYMBOL(make_kuid); | ||
182 | |||
183 | /** | ||
184 | * from_kuid - Create a uid from a kuid user-namespace pair. | ||
185 | * @targ: The user namespace we want a uid in. | ||
186 | * @kuid: The kernel internal uid to start with. | ||
187 | * | ||
188 | * Map @kuid into the user-namespace specified by @targ and | ||
189 | * return the resulting uid. | ||
190 | * | ||
191 | * There is always a mapping into the initial user_namespace. | ||
192 | * | ||
193 | * If @kuid has no mapping in @targ (uid_t)-1 is returned. | ||
194 | */ | ||
195 | uid_t from_kuid(struct user_namespace *targ, kuid_t kuid) | ||
196 | { | ||
197 | /* Map the uid from a global kernel uid */ | ||
198 | return map_id_up(&targ->uid_map, __kuid_val(kuid)); | ||
199 | } | ||
200 | EXPORT_SYMBOL(from_kuid); | ||
106 | 201 | ||
107 | /* No useful relationship so no mapping */ | 202 | /** |
108 | return overflowuid; | 203 | * from_kuid_munged - Create a uid from a kuid user-namespace pair. |
204 | * @targ: The user namespace we want a uid in. | ||
205 | * @kuid: The kernel internal uid to start with. | ||
206 | * | ||
207 | * Map @kuid into the user-namespace specified by @targ and | ||
208 | * return the resulting uid. | ||
209 | * | ||
210 | * There is always a mapping into the initial user_namespace. | ||
211 | * | ||
212 | * Unlike from_kuid from_kuid_munged never fails and always | ||
213 | * returns a valid uid. This makes from_kuid_munged appropriate | ||
214 | * for use in syscalls like stat and getuid where failing the | ||
215 | * system call and failing to provide a valid uid are not | ||
216 | * options. | ||
217 | * | ||
218 | * If @kuid has no mapping in @targ overflowuid is returned. | ||
219 | */ | ||
220 | uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid) | ||
221 | { | ||
222 | uid_t uid; | ||
223 | uid = from_kuid(targ, kuid); | ||
224 | |||
225 | if (uid == (uid_t) -1) | ||
226 | uid = overflowuid; | ||
227 | return uid; | ||
109 | } | 228 | } |
229 | EXPORT_SYMBOL(from_kuid_munged); | ||
110 | 230 | ||
111 | gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid) | 231 | /** |
232 | * make_kgid - Map a user-namespace gid pair into a kgid. | ||
233 | * @ns: User namespace that the gid is in | ||
234 | * @gid: group identifier | ||
235 | * | ||
236 | * Maps a user-namespace gid pair into a kernel internal kgid, | ||
237 | * and returns that kgid. | ||
238 | * | ||
239 | * When there is no mapping defined for the user-namespace gid | ||
240 | * pair INVALID_GID is returned. Callers are expected to test | ||
241 | * for and handle INVALID_GID being returned. INVALID_GID may be | ||
242 | * tested for using gid_valid(). | ||
243 | */ | ||
244 | kgid_t make_kgid(struct user_namespace *ns, gid_t gid) | ||
112 | { | 245 | { |
113 | struct user_namespace *tmp; | 246 | /* Map the gid to a global kernel gid */ |
247 | return KGIDT_INIT(map_id_down(&ns->gid_map, gid)); | ||
248 | } | ||
249 | EXPORT_SYMBOL(make_kgid); | ||
114 | 250 | ||
115 | if (likely(to == cred->user->user_ns)) | 251 | /** |
116 | return gid; | 252 | * from_kgid - Create a gid from a kgid user-namespace pair. |
253 | * @targ: The user namespace we want a gid in. | ||
254 | * @kgid: The kernel internal gid to start with. | ||
255 | * | ||
256 | * Map @kgid into the user-namespace specified by @targ and | ||
257 | * return the resulting gid. | ||
258 | * | ||
259 | * There is always a mapping into the initial user_namespace. | ||
260 | * | ||
261 | * If @kgid has no mapping in @targ (gid_t)-1 is returned. | ||
262 | */ | ||
263 | gid_t from_kgid(struct user_namespace *targ, kgid_t kgid) | ||
264 | { | ||
265 | /* Map the gid from a global kernel gid */ | ||
266 | return map_id_up(&targ->gid_map, __kgid_val(kgid)); | ||
267 | } | ||
268 | EXPORT_SYMBOL(from_kgid); | ||
269 | |||
270 | /** | ||
271 | * from_kgid_munged - Create a gid from a kgid user-namespace pair. | ||
272 | * @targ: The user namespace we want a gid in. | ||
273 | * @kgid: The kernel internal gid to start with. | ||
274 | * | ||
275 | * Map @kgid into the user-namespace specified by @targ and | ||
276 | * return the resulting gid. | ||
277 | * | ||
278 | * There is always a mapping into the initial user_namespace. | ||
279 | * | ||
280 | * Unlike from_kgid from_kgid_munged never fails and always | ||
281 | * returns a valid gid. This makes from_kgid_munged appropriate | ||
282 | * for use in syscalls like stat and getgid where failing the | ||
283 | * system call and failing to provide a valid gid are not options. | ||
284 | * | ||
285 | * If @kgid has no mapping in @targ overflowgid is returned. | ||
286 | */ | ||
287 | gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) | ||
288 | { | ||
289 | gid_t gid; | ||
290 | gid = from_kgid(targ, kgid); | ||
117 | 291 | ||
118 | /* Is cred->user the creator of the target user_ns | 292 | if (gid == (gid_t) -1) |
119 | * or the creator of one of it's parents? | 293 | gid = overflowgid; |
294 | return gid; | ||
295 | } | ||
296 | EXPORT_SYMBOL(from_kgid_munged); | ||
297 | |||
298 | static int uid_m_show(struct seq_file *seq, void *v) | ||
299 | { | ||
300 | struct user_namespace *ns = seq->private; | ||
301 | struct uid_gid_extent *extent = v; | ||
302 | struct user_namespace *lower_ns; | ||
303 | uid_t lower; | ||
304 | |||
305 | lower_ns = current_user_ns(); | ||
306 | if ((lower_ns == ns) && lower_ns->parent) | ||
307 | lower_ns = lower_ns->parent; | ||
308 | |||
309 | lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first)); | ||
310 | |||
311 | seq_printf(seq, "%10u %10u %10u\n", | ||
312 | extent->first, | ||
313 | lower, | ||
314 | extent->count); | ||
315 | |||
316 | return 0; | ||
317 | } | ||
318 | |||
319 | static int gid_m_show(struct seq_file *seq, void *v) | ||
320 | { | ||
321 | struct user_namespace *ns = seq->private; | ||
322 | struct uid_gid_extent *extent = v; | ||
323 | struct user_namespace *lower_ns; | ||
324 | gid_t lower; | ||
325 | |||
326 | lower_ns = current_user_ns(); | ||
327 | if ((lower_ns == ns) && lower_ns->parent) | ||
328 | lower_ns = lower_ns->parent; | ||
329 | |||
330 | lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first)); | ||
331 | |||
332 | seq_printf(seq, "%10u %10u %10u\n", | ||
333 | extent->first, | ||
334 | lower, | ||
335 | extent->count); | ||
336 | |||
337 | return 0; | ||
338 | } | ||
339 | |||
340 | static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) | ||
341 | { | ||
342 | struct uid_gid_extent *extent = NULL; | ||
343 | loff_t pos = *ppos; | ||
344 | |||
345 | if (pos < map->nr_extents) | ||
346 | extent = &map->extent[pos]; | ||
347 | |||
348 | return extent; | ||
349 | } | ||
350 | |||
351 | static void *uid_m_start(struct seq_file *seq, loff_t *ppos) | ||
352 | { | ||
353 | struct user_namespace *ns = seq->private; | ||
354 | |||
355 | return m_start(seq, ppos, &ns->uid_map); | ||
356 | } | ||
357 | |||
358 | static void *gid_m_start(struct seq_file *seq, loff_t *ppos) | ||
359 | { | ||
360 | struct user_namespace *ns = seq->private; | ||
361 | |||
362 | return m_start(seq, ppos, &ns->gid_map); | ||
363 | } | ||
364 | |||
365 | static void *m_next(struct seq_file *seq, void *v, loff_t *pos) | ||
366 | { | ||
367 | (*pos)++; | ||
368 | return seq->op->start(seq, pos); | ||
369 | } | ||
370 | |||
371 | static void m_stop(struct seq_file *seq, void *v) | ||
372 | { | ||
373 | return; | ||
374 | } | ||
375 | |||
376 | struct seq_operations proc_uid_seq_operations = { | ||
377 | .start = uid_m_start, | ||
378 | .stop = m_stop, | ||
379 | .next = m_next, | ||
380 | .show = uid_m_show, | ||
381 | }; | ||
382 | |||
383 | struct seq_operations proc_gid_seq_operations = { | ||
384 | .start = gid_m_start, | ||
385 | .stop = m_stop, | ||
386 | .next = m_next, | ||
387 | .show = gid_m_show, | ||
388 | }; | ||
389 | |||
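Every line the seq_file code above emits, and that map_write() below accepts, consists of three decimal fields: the first id inside the namespace, the first id it maps to in the parent, and the length of the range. A short sketch of pulling those fields out of one such line, roughly as map_write()'s skip_spaces()/simple_strtoul() loop does; the example line is made up.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        const char *line = "         0     100000      65536";   /* example uid_map line */
        char *pos;
        unsigned long first, lower_first, count;

        first       = strtoul(line, &pos, 10);  /* strtoul skips the leading spaces */
        lower_first = strtoul(pos, &pos, 10);
        count       = strtoul(pos, &pos, 10);

        printf("first=%lu lower_first=%lu count=%lu\n", first, lower_first, count);
        return 0;
}
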
390 | static DEFINE_MUTEX(id_map_mutex); | ||
391 | |||
392 | static ssize_t map_write(struct file *file, const char __user *buf, | ||
393 | size_t count, loff_t *ppos, | ||
394 | int cap_setid, | ||
395 | struct uid_gid_map *map, | ||
396 | struct uid_gid_map *parent_map) | ||
397 | { | ||
398 | struct seq_file *seq = file->private_data; | ||
399 | struct user_namespace *ns = seq->private; | ||
400 | struct uid_gid_map new_map; | ||
401 | unsigned idx; | ||
402 | struct uid_gid_extent *extent, *last = NULL; | ||
403 | unsigned long page = 0; | ||
404 | char *kbuf, *pos, *next_line; | ||
405 | ssize_t ret = -EINVAL; | ||
406 | |||
407 | /* | ||
408 | * The id_map_mutex serializes all writes to any given map. | ||
409 | * | ||
410 | * Any map is only ever written once. | ||
411 | * | ||
412 | * An id map fits within 1 cache line on most architectures. | ||
413 | * | ||
414 | * On read nothing needs to be done unless you are on an | ||
415 | * architecture with a crazy cache coherency model like alpha. | ||
416 | * | ||
417 | * There is a one time data dependency between reading the | ||
418 | * count of the extents and the values of the extents. The | ||
419 | * desired behavior is to see the values of the extents that | ||
420 | * were written before the count of the extents. | ||
421 | * | ||
422 | * To achieve this smp_wmb() is used to guarantee the write | ||
423 | * order and smp_read_barrier_depends() ensures that we | ||
424 | * don't have crazy architectures returning stale data. | ||
425 | * | ||
120 | */ | 426 | */ |
121 | for ( tmp = to; tmp != &init_user_ns; | 427 | mutex_lock(&id_map_mutex); |
122 | tmp = tmp->creator->user_ns ) { | 428 | |
123 | if (cred->user == tmp->creator) { | 429 | ret = -EPERM; |
124 | return (gid_t)0; | 430 | /* Only allow one successful write to the map */ |
431 | if (map->nr_extents != 0) | ||
432 | goto out; | ||
433 | |||
434 | /* Require the appropriate privilege CAP_SETUID or CAP_SETGID | ||
435 | * over the user namespace in order to set the id mapping. | ||
436 | */ | ||
437 | if (!ns_capable(ns, cap_setid)) | ||
438 | goto out; | ||
439 | |||
440 | /* Get a buffer */ | ||
441 | ret = -ENOMEM; | ||
442 | page = __get_free_page(GFP_TEMPORARY); | ||
443 | kbuf = (char *) page; | ||
444 | if (!page) | ||
445 | goto out; | ||
446 | |||
447 | /* Only allow <= page size writes at the beginning of the file */ | ||
448 | ret = -EINVAL; | ||
449 | if ((*ppos != 0) || (count >= PAGE_SIZE)) | ||
450 | goto out; | ||
451 | |||
452 | /* Slurp in the user data */ | ||
453 | ret = -EFAULT; | ||
454 | if (copy_from_user(kbuf, buf, count)) | ||
455 | goto out; | ||
456 | kbuf[count] = '\0'; | ||
457 | |||
458 | /* Parse the user data */ | ||
459 | ret = -EINVAL; | ||
460 | pos = kbuf; | ||
461 | new_map.nr_extents = 0; | ||
462 | for (;pos; pos = next_line) { | ||
463 | extent = &new_map.extent[new_map.nr_extents]; | ||
464 | |||
465 | /* Find the end of line and ensure I don't look past it */ | ||
466 | next_line = strchr(pos, '\n'); | ||
467 | if (next_line) { | ||
468 | *next_line = '\0'; | ||
469 | next_line++; | ||
470 | if (*next_line == '\0') | ||
471 | next_line = NULL; | ||
125 | } | 472 | } |
473 | |||
474 | pos = skip_spaces(pos); | ||
475 | extent->first = simple_strtoul(pos, &pos, 10); | ||
476 | if (!isspace(*pos)) | ||
477 | goto out; | ||
478 | |||
479 | pos = skip_spaces(pos); | ||
480 | extent->lower_first = simple_strtoul(pos, &pos, 10); | ||
481 | if (!isspace(*pos)) | ||
482 | goto out; | ||
483 | |||
484 | pos = skip_spaces(pos); | ||
485 | extent->count = simple_strtoul(pos, &pos, 10); | ||
486 | if (*pos && !isspace(*pos)) | ||
487 | goto out; | ||
488 | |||
489 | /* Verify there is no trailing junk on the line */ | ||
490 | pos = skip_spaces(pos); | ||
491 | if (*pos != '\0') | ||
492 | goto out; | ||
493 | |||
494 | /* Verify we have been given valid starting values */ | ||
495 | if ((extent->first == (u32) -1) || | ||
496 | (extent->lower_first == (u32) -1)) | ||
497 | goto out; | ||
498 | |||
499 | /* Verify count is not zero and does not cause the extent to wrap */ | ||
500 | if ((extent->first + extent->count) <= extent->first) | ||
501 | goto out; | ||
502 | if ((extent->lower_first + extent->count) <= extent->lower_first) | ||
503 | goto out; | ||
504 | |||
505 | /* For now only accept extents that are strictly in order */ | ||
506 | if (last && | ||
507 | (((last->first + last->count) > extent->first) || | ||
508 | ((last->lower_first + last->count) > extent->lower_first))) | ||
509 | goto out; | ||
510 | |||
511 | new_map.nr_extents++; | ||
512 | last = extent; | ||
513 | |||
514 | /* Fail if the file contains too many extents */ | ||
515 | if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && | ||
516 | (next_line != NULL)) | ||
517 | goto out; | ||
518 | } | ||
519 | /* Be very certain the new map actually exists */ | ||
520 | if (new_map.nr_extents == 0) | ||
521 | goto out; | ||
522 | |||
523 | ret = -EPERM; | ||
524 | /* Validate that the user is allowed to use the ids being mapped to. */ | ||
525 | if (!new_idmap_permitted(ns, cap_setid, &new_map)) | ||
526 | goto out; | ||
527 | |||
528 | /* Map the lower ids from the parent user namespace to the | ||
529 | * kernel global id space. | ||
530 | */ | ||
531 | for (idx = 0; idx < new_map.nr_extents; idx++) { | ||
532 | u32 lower_first; | ||
533 | extent = &new_map.extent[idx]; | ||
534 | |||
535 | lower_first = map_id_range_down(parent_map, | ||
536 | extent->lower_first, | ||
537 | extent->count); | ||
538 | |||
539 | /* Fail if we cannot map the specified extent to | ||
540 | * the kernel global id space. | ||
541 | */ | ||
542 | if (lower_first == (u32) -1) | ||
543 | goto out; | ||
544 | |||
545 | extent->lower_first = lower_first; | ||
126 | } | 546 | } |
127 | 547 | ||
128 | /* No useful relationship so no mapping */ | 548 | /* Install the map */ |
129 | return overflowgid; | 549 | memcpy(map->extent, new_map.extent, |
550 | new_map.nr_extents*sizeof(new_map.extent[0])); | ||
551 | smp_wmb(); | ||
552 | map->nr_extents = new_map.nr_extents; | ||
553 | |||
554 | *ppos = count; | ||
555 | ret = count; | ||
556 | out: | ||
557 | mutex_unlock(&id_map_mutex); | ||
558 | if (page) | ||
559 | free_page(page); | ||
560 | return ret; | ||
561 | } | ||
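
The smp_wmb() just before publishing nr_extents above pairs with a dependency barrier on the lookup side, as the comment at the top of map_write() describes; map_id_range_down(), used above to translate the new extents into the kernel's global id space, follows that pattern. The sketch below is a simplified rendering of that read side under the assumption of the same uid_gid_map layout, not a copy of the real helper.

/* Sketch of the read-side pairing with the smp_wmb() in map_write().
 * Simplified for illustration; the point is the order of the two reads. */
static u32 id_lookup_sketch(struct uid_gid_map *map, u32 id)
{
	unsigned idx, extents;

	extents = map->nr_extents;	/* read the published count first */
	smp_read_barrier_depends();	/* ...then the extents it guards */

	for (idx = 0; idx < extents; idx++) {
		struct uid_gid_extent *e = &map->extent[idx];

		if (id >= e->first && id - e->first < e->count)
			return (id - e->first) + e->lower_first;
	}
	return (u32) -1;		/* no mapping found */
}
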
562 | |||
563 | ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) | ||
564 | { | ||
565 | struct seq_file *seq = file->private_data; | ||
566 | struct user_namespace *ns = seq->private; | ||
567 | |||
568 | if (!ns->parent) | ||
569 | return -EPERM; | ||
570 | |||
571 | return map_write(file, buf, size, ppos, CAP_SETUID, | ||
572 | &ns->uid_map, &ns->parent->uid_map); | ||
573 | } | ||
574 | |||
575 | ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) | ||
576 | { | ||
577 | struct seq_file *seq = file->private_data; | ||
578 | struct user_namespace *ns = seq->private; | ||
579 | |||
580 | if (!ns->parent) | ||
581 | return -EPERM; | ||
582 | |||
583 | return map_write(file, buf, size, ppos, CAP_SETGID, | ||
584 | &ns->gid_map, &ns->parent->gid_map); | ||
585 | } | ||
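
From userspace, both handlers accept one or more "first lower_first count" lines delivered as a single write of less than a page at offset 0; once a map is populated, further writes fail with -EPERM, and (per new_idmap_permitted() below plus the ns_capable() check in map_write()) the writer needs CAP_SETUID for uid_map, or CAP_SETGID for gid_map, over both the child namespace and its parent. A minimal usage sketch follows; the pid and the id values are illustrative assumptions, and creating the namespace itself (e.g. with clone(CLONE_NEWUSER)) is assumed to have happened already.

/* Userspace sketch: install a uid map for a child user namespace.
 * The "0 100000 65536" extent is an illustrative assumption. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

static int write_uid_map(pid_t child)
{
	const char *map = "0 100000 65536\n";	/* first lower_first count */
	char path[64];
	int fd;

	snprintf(path, sizeof(path), "/proc/%d/uid_map", (int)child);

	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;

	/* Must be one write at offset 0, shorter than a page; writing an
	 * already populated map returns EPERM. */
	if (write(fd, map, strlen(map)) != (ssize_t)strlen(map)) {
		close(fd);
		return -1;
	}
	return close(fd);
}
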
586 | |||
587 | static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, | ||
588 | struct uid_gid_map *new_map) | ||
589 | { | ||
590 | /* Allow the specified ids if we have the appropriate capability | ||
591 | * (CAP_SETUID or CAP_SETGID) over the parent user namespace. | ||
592 | */ | ||
593 | if (ns_capable(ns->parent, cap_setid)) | ||
594 | return true; | ||
595 | |||
596 | return false; | ||
130 | } | 597 | } |
131 | 598 | ||
132 | static __init int user_namespaces_init(void) | 599 | static __init int user_namespaces_init(void) |
diff --git a/kernel/utsname.c b/kernel/utsname.c index 405caf91aad..679d97a5d3f 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
@@ -43,7 +43,7 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, | |||
43 | 43 | ||
44 | down_read(&uts_sem); | 44 | down_read(&uts_sem); |
45 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); | 45 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); |
46 | ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns); | 46 | ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); |
47 | up_read(&uts_sem); | 47 | up_read(&uts_sem); |
48 | return ns; | 48 | return ns; |
49 | } | 49 | } |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5abf42f63c0..9a3128dc67d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -1032,7 +1032,10 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
1032 | cwq = get_cwq(gcwq->cpu, wq); | 1032 | cwq = get_cwq(gcwq->cpu, wq); |
1033 | trace_workqueue_queue_work(cpu, cwq, work); | 1033 | trace_workqueue_queue_work(cpu, cwq, work); |
1034 | 1034 | ||
1035 | BUG_ON(!list_empty(&work->entry)); | 1035 | if (WARN_ON(!list_empty(&work->entry))) { |
1036 | spin_unlock_irqrestore(&gcwq->lock, flags); | ||
1037 | return; | ||
1038 | } | ||
1036 | 1039 | ||
1037 | cwq->nr_in_flight[cwq->work_color]++; | 1040 | cwq->nr_in_flight[cwq->work_color]++; |
1038 | work_flags = work_color_to_flags(cwq->work_color); | 1041 | work_flags = work_color_to_flags(cwq->work_color); |
@@ -1210,8 +1213,13 @@ static void worker_enter_idle(struct worker *worker) | |||
1210 | } else | 1213 | } else |
1211 | wake_up_all(&gcwq->trustee_wait); | 1214 | wake_up_all(&gcwq->trustee_wait); |
1212 | 1215 | ||
1213 | /* sanity check nr_running */ | 1216 | /* |
1214 | WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && | 1217 | * Sanity check nr_running. Because trustee releases gcwq->lock |
1218 | * between setting %WORKER_ROGUE and zapping nr_running, the | ||
1219 | * warning may trigger spuriously. Check iff trustee is idle. | ||
1220 | */ | ||
1221 | WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && | ||
1222 | gcwq->nr_workers == gcwq->nr_idle && | ||
1215 | atomic_read(get_gcwq_nr_running(gcwq->cpu))); | 1223 | atomic_read(get_gcwq_nr_running(gcwq->cpu))); |
1216 | } | 1224 | } |
1217 | 1225 | ||
@@ -1810,7 +1818,9 @@ __acquires(&gcwq->lock) | |||
1810 | * lock freed" warnings as well as problems when looking into | 1818 | * lock freed" warnings as well as problems when looking into |
1811 | * work->lockdep_map, make a copy and use that here. | 1819 | * work->lockdep_map, make a copy and use that here. |
1812 | */ | 1820 | */ |
1813 | struct lockdep_map lockdep_map = work->lockdep_map; | 1821 | struct lockdep_map lockdep_map; |
1822 | |||
1823 | lockdep_copy_map(&lockdep_map, &work->lockdep_map); | ||
1814 | #endif | 1824 | #endif |
1815 | /* | 1825 | /* |
1816 | * A single work shouldn't be executed concurrently by | 1826 | * A single work shouldn't be executed concurrently by |
@@ -2506,6 +2516,9 @@ bool flush_work(struct work_struct *work) | |||
2506 | { | 2516 | { |
2507 | struct wq_barrier barr; | 2517 | struct wq_barrier barr; |
2508 | 2518 | ||
2519 | lock_map_acquire(&work->lockdep_map); | ||
2520 | lock_map_release(&work->lockdep_map); | ||
2521 | |||
2509 | if (start_flush_work(work, &barr, true)) { | 2522 | if (start_flush_work(work, &barr, true)) { |
2510 | wait_for_completion(&barr.done); | 2523 | wait_for_completion(&barr.done); |
2511 | destroy_work_on_stack(&barr.work); | 2524 | destroy_work_on_stack(&barr.work); |