Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/auditsc.c | 8
-rw-r--r--  kernel/capability.c | 21
-rw-r--r--  kernel/cgroup.c | 570
-rw-r--r--  kernel/cgroup_freezer.c | 11
-rw-r--r--  kernel/compat.c | 73
-rw-r--r--  kernel/cpu.c | 13
-rw-r--r--  kernel/cpuset.c | 31
-rw-r--r--  kernel/cred.c | 44
-rw-r--r--  kernel/events/core.c | 16
-rw-r--r--  kernel/exit.c | 6
-rw-r--r--  kernel/extable.c | 8
-rw-r--r--  kernel/fork.c | 77
-rw-r--r--  kernel/groups.c | 50
-rw-r--r--  kernel/hung_task.c | 4
-rw-r--r--  kernel/irq/chip.c | 5
-rw-r--r--  kernel/irq/debug.h | 38
-rw-r--r--  kernel/irq/irqdesc.c | 1
-rw-r--r--  kernel/irq/manage.c | 52
-rw-r--r--  kernel/irq/pm.c | 7
-rw-r--r--  kernel/irq/resend.c | 7
-rw-r--r--  kernel/kfifo.c | 1
-rw-r--r--  kernel/module.c | 5
-rw-r--r--  kernel/params.c | 62
-rw-r--r--  kernel/power/Kconfig | 27
-rw-r--r--  kernel/power/Makefile | 2
-rw-r--r--  kernel/power/autosleep.c | 127
-rw-r--r--  kernel/power/hibernate.c | 13
-rw-r--r--  kernel/power/main.c | 160
-rw-r--r--  kernel/power/power.h | 27
-rw-r--r--  kernel/power/swap.c | 84
-rw-r--r--  kernel/power/wakelock.c | 259
-rw-r--r--  kernel/printk.c | 1390
-rw-r--r--  kernel/ptrace.c | 15
-rw-r--r--  kernel/rcupdate.c | 28
-rw-r--r--  kernel/rcutiny_plugin.h | 16
-rw-r--r--  kernel/rcutorture.c | 257
-rw-r--r--  kernel/rcutree.c | 333
-rw-r--r--  kernel/rcutree.h | 23
-rw-r--r--  kernel/rcutree_plugin.h | 154
-rw-r--r--  kernel/rcutree_trace.c | 4
-rw-r--r--  kernel/res_counter.c | 71
-rw-r--r--  kernel/sched/Makefile | 2
-rw-r--r--  kernel/sched/core.c | 479
-rw-r--r--  kernel/sched/debug.c | 12
-rw-r--r--  kernel/sched/fair.c | 472
-rw-r--r--  kernel/sched/features.h | 1
-rw-r--r--  kernel/sched/idle_task.c | 2
-rw-r--r--  kernel/sched/rt.c | 56
-rw-r--r--  kernel/sched/sched.h | 8
-rw-r--r--  kernel/seccomp.c | 458
-rw-r--r--  kernel/semaphore.c | 2
-rw-r--r--  kernel/signal.c | 85
-rw-r--r--  kernel/smp.c | 27
-rw-r--r--  kernel/smpboot.c | 62
-rw-r--r--  kernel/smpboot.h | 18
-rw-r--r--  kernel/srcu.c | 548
-rw-r--r--  kernel/sys.c | 278
-rw-r--r--  kernel/time/alarmtimer.c | 4
-rw-r--r--  kernel/time/tick-broadcast.c | 13
-rw-r--r--  kernel/timer.c | 20
-rw-r--r--  kernel/trace/Kconfig | 3
-rw-r--r--  kernel/trace/Makefile | 1
-rw-r--r--  kernel/trace/ftrace.c | 198
-rw-r--r--  kernel/trace/ring_buffer.c | 383
-rw-r--r--  kernel/trace/trace.c | 62
-rw-r--r--  kernel/trace/trace.h | 4
-rw-r--r--  kernel/trace/trace_events.c | 5
-rw-r--r--  kernel/trace/trace_export.c | 1
-rw-r--r--  kernel/trace/trace_output.c | 5
-rw-r--r--  kernel/trace/trace_workqueue.c | 300
-rw-r--r--  kernel/uid16.c | 48
-rw-r--r--  kernel/user.c | 51
-rw-r--r--  kernel/user_namespace.c | 595
-rw-r--r--  kernel/utsname.c | 2
-rw-r--r--  kernel/workqueue.c | 21
76 files changed, 5726 insertions(+), 2601 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index cb41b9547c9..6c07f30fa9b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
43obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o 43obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
44obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 44obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
45obj-$(CONFIG_SMP) += smp.o 45obj-$(CONFIG_SMP) += smp.o
46obj-$(CONFIG_SMP) += smpboot.o
46ifneq ($(CONFIG_SMP),y) 47ifneq ($(CONFIG_SMP),y)
47obj-y += up.o 48obj-y += up.o
48endif 49endif
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index af1de0f34ea..4b96415527b 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -67,6 +67,7 @@
67#include <linux/syscalls.h> 67#include <linux/syscalls.h>
68#include <linux/capability.h> 68#include <linux/capability.h>
69#include <linux/fs_struct.h> 69#include <linux/fs_struct.h>
70#include <linux/compat.h>
70 71
71#include "audit.h" 72#include "audit.h"
72 73
@@ -2710,13 +2711,16 @@ void audit_core_dumps(long signr)
2710 audit_log_end(ab); 2711 audit_log_end(ab);
2711} 2712}
2712 2713
2713void __audit_seccomp(unsigned long syscall) 2714void __audit_seccomp(unsigned long syscall, long signr, int code)
2714{ 2715{
2715 struct audit_buffer *ab; 2716 struct audit_buffer *ab;
2716 2717
2717 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); 2718 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
2718 audit_log_abend(ab, "seccomp", SIGKILL); 2719 audit_log_abend(ab, "seccomp", signr);
2719 audit_log_format(ab, " syscall=%ld", syscall); 2720 audit_log_format(ab, " syscall=%ld", syscall);
2721 audit_log_format(ab, " compat=%d", is_compat_task());
2722 audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
2723 audit_log_format(ab, " code=0x%x", code);
2720 audit_log_end(ab); 2724 audit_log_end(ab);
2721} 2725}
2722 2726
diff --git a/kernel/capability.c b/kernel/capability.c
index 3f1adb6c647..493d9725948 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -419,3 +419,24 @@ bool nsown_capable(int cap)
419{ 419{
420 return ns_capable(current_user_ns(), cap); 420 return ns_capable(current_user_ns(), cap);
421} 421}
422
423/**
424 * inode_capable - Check superior capability over inode
425 * @inode: The inode in question
426 * @cap: The capability in question
427 *
428 * Return true if the current task has the given superior capability
429 * targeted at it's own user namespace and that the given inode is owned
430 * by the current user namespace or a child namespace.
431 *
432 * Currently we check to see if an inode is owned by the current
433 * user namespace by seeing if the inode's owner maps into the
434 * current user namespace.
435 *
436 */
437bool inode_capable(const struct inode *inode, int cap)
438{
439 struct user_namespace *ns = current_user_ns();
440
441 return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid);
442}
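
A hedged usage sketch of the new helper: an ownership-change check, for instance, can pair uid_eq() with inode_capable() so that CAP_CHOWN held inside a user namespace only applies to inodes whose owner that namespace actually maps. chown_ok() here is illustrative, not part of this diff:

static bool chown_ok(const struct inode *inode, kuid_t uid)
{
	/* changing the owner to itself is always allowed for the owner */
	if (uid_eq(current_fsuid(), inode->i_uid) && uid_eq(uid, inode->i_uid))
		return true;
	/* otherwise require CAP_CHOWN targeted at a namespace that maps i_uid */
	return inode_capable(inode, CAP_CHOWN);
}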
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ed64ccac67c..a0c6af34d50 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,9 +60,13 @@
60#include <linux/eventfd.h> 60#include <linux/eventfd.h>
61#include <linux/poll.h> 61#include <linux/poll.h>
62#include <linux/flex_array.h> /* used in cgroup_attach_proc */ 62#include <linux/flex_array.h> /* used in cgroup_attach_proc */
63#include <linux/kthread.h>
63 64
64#include <linux/atomic.h> 65#include <linux/atomic.h>
65 66
67/* css deactivation bias, makes css->refcnt negative to deny new trygets */
68#define CSS_DEACT_BIAS INT_MIN
69
66/* 70/*
67 * cgroup_mutex is the master lock. Any modification to cgroup or its 71 * cgroup_mutex is the master lock. Any modification to cgroup or its
68 * hierarchy must be performed while holding it. 72 * hierarchy must be performed while holding it.
@@ -127,6 +131,9 @@ struct cgroupfs_root {
127 /* A list running through the active hierarchies */ 131 /* A list running through the active hierarchies */
128 struct list_head root_list; 132 struct list_head root_list;
129 133
134 /* All cgroups on this root, cgroup_mutex protected */
135 struct list_head allcg_list;
136
130 /* Hierarchy-specific flags */ 137 /* Hierarchy-specific flags */
131 unsigned long flags; 138 unsigned long flags;
132 139
@@ -145,6 +152,15 @@ struct cgroupfs_root {
145static struct cgroupfs_root rootnode; 152static struct cgroupfs_root rootnode;
146 153
147/* 154/*
155 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
156 */
157struct cfent {
158 struct list_head node;
159 struct dentry *dentry;
160 struct cftype *type;
161};
162
163/*
148 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when 164 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
149 * cgroup_subsys->use_id != 0. 165 * cgroup_subsys->use_id != 0.
150 */ 166 */
@@ -239,6 +255,14 @@ int cgroup_lock_is_held(void)
239 255
240EXPORT_SYMBOL_GPL(cgroup_lock_is_held); 256EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
241 257
258/* the current nr of refs, always >= 0 whether @css is deactivated or not */
259static int css_refcnt(struct cgroup_subsys_state *css)
260{
261 int v = atomic_read(&css->refcnt);
262
263 return v >= 0 ? v : v - CSS_DEACT_BIAS;
264}
265
242/* convenient tests for these bits */ 266/* convenient tests for these bits */
243inline int cgroup_is_removed(const struct cgroup *cgrp) 267inline int cgroup_is_removed(const struct cgroup *cgrp)
244{ 268{
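
The bias trick above is easiest to see with concrete numbers. A stand-alone user-space sketch (not kernel code) of the same arithmetic: adding INT_MIN flips the stored counter negative, which is what lets a concurrent tryget detect deactivation, while the true count stays recoverable:

#include <limits.h>
#include <stdio.h>

int main(void)
{
	int refcnt = 3;			/* live references before deactivation */
	int v = refcnt + INT_MIN;	/* after atomic_add(CSS_DEACT_BIAS, &refcnt) */

	printf("deactivated: %d\n", v < 0);		/* prints 1 */
	printf("css_refcnt:  %d\n", v - INT_MIN);	/* prints 3 */
	return 0;
}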
@@ -279,6 +303,21 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)
279#define for_each_active_root(_root) \ 303#define for_each_active_root(_root) \
280list_for_each_entry(_root, &roots, root_list) 304list_for_each_entry(_root, &roots, root_list)
281 305
306static inline struct cgroup *__d_cgrp(struct dentry *dentry)
307{
308 return dentry->d_fsdata;
309}
310
311static inline struct cfent *__d_cfe(struct dentry *dentry)
312{
313 return dentry->d_fsdata;
314}
315
316static inline struct cftype *__d_cft(struct dentry *dentry)
317{
318 return __d_cfe(dentry)->type;
319}
320
282/* the list of cgroups eligible for automatic release. Protected by 321/* the list of cgroups eligible for automatic release. Protected by
283 * release_list_lock */ 322 * release_list_lock */
284static LIST_HEAD(release_list); 323static LIST_HEAD(release_list);
@@ -816,12 +855,17 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
816 struct cgroup_subsys *ss; 855 struct cgroup_subsys *ss;
817 int ret = 0; 856 int ret = 0;
818 857
819 for_each_subsys(cgrp->root, ss) 858 for_each_subsys(cgrp->root, ss) {
820 if (ss->pre_destroy) { 859 if (!ss->pre_destroy)
821 ret = ss->pre_destroy(cgrp); 860 continue;
822 if (ret) 861
823 break; 862 ret = ss->pre_destroy(cgrp);
863 if (ret) {
864 /* ->pre_destroy() failure is being deprecated */
865 WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
866 break;
824 } 867 }
868 }
825 869
826 return ret; 870 return ret;
827} 871}
@@ -864,6 +908,14 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
864 BUG_ON(!list_empty(&cgrp->pidlists)); 908 BUG_ON(!list_empty(&cgrp->pidlists));
865 909
866 kfree_rcu(cgrp, rcu_head); 910 kfree_rcu(cgrp, rcu_head);
911 } else {
912 struct cfent *cfe = __d_cfe(dentry);
913 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
914
915 WARN_ONCE(!list_empty(&cfe->node) &&
916 cgrp != &cgrp->root->top_cgroup,
917 "cfe still linked for %s\n", cfe->type->name);
918 kfree(cfe);
867 } 919 }
868 iput(inode); 920 iput(inode);
869} 921}
@@ -882,34 +934,36 @@ static void remove_dir(struct dentry *d)
882 dput(parent); 934 dput(parent);
883} 935}
884 936
885static void cgroup_clear_directory(struct dentry *dentry) 937static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
886{ 938{
887 struct list_head *node; 939 struct cfent *cfe;
888 940
889 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); 941 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
890 spin_lock(&dentry->d_lock); 942 lockdep_assert_held(&cgroup_mutex);
891 node = dentry->d_subdirs.next; 943
892 while (node != &dentry->d_subdirs) { 944 list_for_each_entry(cfe, &cgrp->files, node) {
893 struct dentry *d = list_entry(node, struct dentry, d_u.d_child); 945 struct dentry *d = cfe->dentry;
894 946
895 spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); 947 if (cft && cfe->type != cft)
896 list_del_init(node); 948 continue;
897 if (d->d_inode) { 949
898 /* This should never be called on a cgroup 950 dget(d);
899 * directory with child cgroups */ 951 d_delete(d);
900 BUG_ON(d->d_inode->i_mode & S_IFDIR); 952 simple_unlink(d->d_inode, d);
901 dget_dlock(d); 953 list_del_init(&cfe->node);
902 spin_unlock(&d->d_lock); 954 dput(d);
903 spin_unlock(&dentry->d_lock); 955
904 d_delete(d); 956 return 0;
905 simple_unlink(dentry->d_inode, d);
906 dput(d);
907 spin_lock(&dentry->d_lock);
908 } else
909 spin_unlock(&d->d_lock);
910 node = dentry->d_subdirs.next;
911 } 957 }
912 spin_unlock(&dentry->d_lock); 958 return -ENOENT;
959}
960
961static void cgroup_clear_directory(struct dentry *dir)
962{
963 struct cgroup *cgrp = __d_cgrp(dir);
964
965 while (!list_empty(&cgrp->files))
966 cgroup_rm_file(cgrp, NULL);
913} 967}
914 968
915/* 969/*
@@ -1294,6 +1348,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1294 if (ret) 1348 if (ret)
1295 goto out_unlock; 1349 goto out_unlock;
1296 1350
1351 /* See feature-removal-schedule.txt */
1352 if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent)
1353 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1354 task_tgid_nr(current), current->comm);
1355
1297 /* Don't allow flags or name to change at remount */ 1356 /* Don't allow flags or name to change at remount */
1298 if (opts.flags != root->flags || 1357 if (opts.flags != root->flags ||
1299 (opts.name && strcmp(opts.name, root->name))) { 1358 (opts.name && strcmp(opts.name, root->name))) {
@@ -1308,7 +1367,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1308 goto out_unlock; 1367 goto out_unlock;
1309 } 1368 }
1310 1369
1311 /* (re)populate subsystem files */ 1370 /* clear out any existing files and repopulate subsystem files */
1371 cgroup_clear_directory(cgrp->dentry);
1312 cgroup_populate_dir(cgrp); 1372 cgroup_populate_dir(cgrp);
1313 1373
1314 if (opts.release_agent) 1374 if (opts.release_agent)
@@ -1333,6 +1393,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1333{ 1393{
1334 INIT_LIST_HEAD(&cgrp->sibling); 1394 INIT_LIST_HEAD(&cgrp->sibling);
1335 INIT_LIST_HEAD(&cgrp->children); 1395 INIT_LIST_HEAD(&cgrp->children);
1396 INIT_LIST_HEAD(&cgrp->files);
1336 INIT_LIST_HEAD(&cgrp->css_sets); 1397 INIT_LIST_HEAD(&cgrp->css_sets);
1337 INIT_LIST_HEAD(&cgrp->release_list); 1398 INIT_LIST_HEAD(&cgrp->release_list);
1338 INIT_LIST_HEAD(&cgrp->pidlists); 1399 INIT_LIST_HEAD(&cgrp->pidlists);
@@ -1344,11 +1405,14 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1344static void init_cgroup_root(struct cgroupfs_root *root) 1405static void init_cgroup_root(struct cgroupfs_root *root)
1345{ 1406{
1346 struct cgroup *cgrp = &root->top_cgroup; 1407 struct cgroup *cgrp = &root->top_cgroup;
1408
1347 INIT_LIST_HEAD(&root->subsys_list); 1409 INIT_LIST_HEAD(&root->subsys_list);
1348 INIT_LIST_HEAD(&root->root_list); 1410 INIT_LIST_HEAD(&root->root_list);
1411 INIT_LIST_HEAD(&root->allcg_list);
1349 root->number_of_cgroups = 1; 1412 root->number_of_cgroups = 1;
1350 cgrp->root = root; 1413 cgrp->root = root;
1351 cgrp->top_cgroup = cgrp; 1414 cgrp->top_cgroup = cgrp;
1415 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1352 init_cgroup_housekeeping(cgrp); 1416 init_cgroup_housekeeping(cgrp);
1353} 1417}
1354 1418
@@ -1692,16 +1756,6 @@ static struct file_system_type cgroup_fs_type = {
1692 1756
1693static struct kobject *cgroup_kobj; 1757static struct kobject *cgroup_kobj;
1694 1758
1695static inline struct cgroup *__d_cgrp(struct dentry *dentry)
1696{
1697 return dentry->d_fsdata;
1698}
1699
1700static inline struct cftype *__d_cft(struct dentry *dentry)
1701{
1702 return dentry->d_fsdata;
1703}
1704
1705/** 1759/**
1706 * cgroup_path - generate the path of a cgroup 1760 * cgroup_path - generate the path of a cgroup
1707 * @cgrp: the cgroup in question 1761 * @cgrp: the cgroup in question
@@ -2160,9 +2214,9 @@ retry_find_task:
2160 * only need to check permissions on one of them. 2214 * only need to check permissions on one of them.
2161 */ 2215 */
2162 tcred = __task_cred(tsk); 2216 tcred = __task_cred(tsk);
2163 if (cred->euid && 2217 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
2164 cred->euid != tcred->uid && 2218 !uid_eq(cred->euid, tcred->uid) &&
2165 cred->euid != tcred->suid) { 2219 !uid_eq(cred->euid, tcred->suid)) {
2166 rcu_read_unlock(); 2220 rcu_read_unlock();
2167 ret = -EACCES; 2221 ret = -EACCES;
2168 goto out_unlock_cgroup; 2222 goto out_unlock_cgroup;
@@ -2172,6 +2226,18 @@ retry_find_task:
2172 2226
2173 if (threadgroup) 2227 if (threadgroup)
2174 tsk = tsk->group_leader; 2228 tsk = tsk->group_leader;
2229
2230 /*
2231 * Workqueue threads may acquire PF_THREAD_BOUND and become
2232 * trapped in a cpuset, or RT worker may be born in a cgroup
2233 * with no rt_runtime allocated. Just say no.
2234 */
2235 if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) {
2236 ret = -EINVAL;
2237 rcu_read_unlock();
2238 goto out_unlock_cgroup;
2239 }
2240
2175 get_task_struct(tsk); 2241 get_task_struct(tsk);
2176 rcu_read_unlock(); 2242 rcu_read_unlock();
2177 2243
@@ -2603,50 +2669,191 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2603 return mode; 2669 return mode;
2604} 2670}
2605 2671
2606int cgroup_add_file(struct cgroup *cgrp, 2672static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2607 struct cgroup_subsys *subsys, 2673 const struct cftype *cft)
2608 const struct cftype *cft)
2609{ 2674{
2610 struct dentry *dir = cgrp->dentry; 2675 struct dentry *dir = cgrp->dentry;
2676 struct cgroup *parent = __d_cgrp(dir);
2611 struct dentry *dentry; 2677 struct dentry *dentry;
2678 struct cfent *cfe;
2612 int error; 2679 int error;
2613 umode_t mode; 2680 umode_t mode;
2614
2615 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2681 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2682
2683 /* does @cft->flags tell us to skip creation on @cgrp? */
2684 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2685 return 0;
2686 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2687 return 0;
2688
2616 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 2689 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
2617 strcpy(name, subsys->name); 2690 strcpy(name, subsys->name);
2618 strcat(name, "."); 2691 strcat(name, ".");
2619 } 2692 }
2620 strcat(name, cft->name); 2693 strcat(name, cft->name);
2694
2621 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); 2695 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
2696
2697 cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
2698 if (!cfe)
2699 return -ENOMEM;
2700
2622 dentry = lookup_one_len(name, dir, strlen(name)); 2701 dentry = lookup_one_len(name, dir, strlen(name));
2623 if (!IS_ERR(dentry)) { 2702 if (IS_ERR(dentry)) {
2624 mode = cgroup_file_mode(cft);
2625 error = cgroup_create_file(dentry, mode | S_IFREG,
2626 cgrp->root->sb);
2627 if (!error)
2628 dentry->d_fsdata = (void *)cft;
2629 dput(dentry);
2630 } else
2631 error = PTR_ERR(dentry); 2703 error = PTR_ERR(dentry);
2704 goto out;
2705 }
2706
2707 mode = cgroup_file_mode(cft);
2708 error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
2709 if (!error) {
2710 cfe->type = (void *)cft;
2711 cfe->dentry = dentry;
2712 dentry->d_fsdata = cfe;
2713 list_add_tail(&cfe->node, &parent->files);
2714 cfe = NULL;
2715 }
2716 dput(dentry);
2717out:
2718 kfree(cfe);
2632 return error; 2719 return error;
2633} 2720}
2634EXPORT_SYMBOL_GPL(cgroup_add_file);
2635 2721
2636int cgroup_add_files(struct cgroup *cgrp, 2722static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2637 struct cgroup_subsys *subsys, 2723 const struct cftype cfts[], bool is_add)
2638 const struct cftype cft[],
2639 int count)
2640{ 2724{
2641 int i, err; 2725 const struct cftype *cft;
2642 for (i = 0; i < count; i++) { 2726 int err, ret = 0;
2643 err = cgroup_add_file(cgrp, subsys, &cft[i]); 2727
2644 if (err) 2728 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2645 return err; 2729 if (is_add)
2730 err = cgroup_add_file(cgrp, subsys, cft);
2731 else
2732 err = cgroup_rm_file(cgrp, cft);
2733 if (err) {
2734 pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
2735 is_add ? "add" : "remove", cft->name, err);
2736 ret = err;
2737 }
2738 }
2739 return ret;
2740}
2741
2742static DEFINE_MUTEX(cgroup_cft_mutex);
2743
2744static void cgroup_cfts_prepare(void)
2745 __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex)
2746{
2747 /*
2748 * Thanks to the entanglement with vfs inode locking, we can't walk
2749 * the existing cgroups under cgroup_mutex and create files.
2750 * Instead, we increment reference on all cgroups and build list of
2751 * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure
2752 * exclusive access to the field.
2753 */
2754 mutex_lock(&cgroup_cft_mutex);
2755 mutex_lock(&cgroup_mutex);
2756}
2757
2758static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2759 const struct cftype *cfts, bool is_add)
2760 __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
2761{
2762 LIST_HEAD(pending);
2763 struct cgroup *cgrp, *n;
2764
2765 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2766 if (cfts && ss->root != &rootnode) {
2767 list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) {
2768 dget(cgrp->dentry);
2769 list_add_tail(&cgrp->cft_q_node, &pending);
2770 }
2771 }
2772
2773 mutex_unlock(&cgroup_mutex);
2774
2775 /*
2776 * All new cgroups will see @cfts update on @ss->cftsets. Add/rm
2777 * files for all cgroups which were created before.
2778 */
2779 list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) {
2780 struct inode *inode = cgrp->dentry->d_inode;
2781
2782 mutex_lock(&inode->i_mutex);
2783 mutex_lock(&cgroup_mutex);
2784 if (!cgroup_is_removed(cgrp))
2785 cgroup_addrm_files(cgrp, ss, cfts, is_add);
2786 mutex_unlock(&cgroup_mutex);
2787 mutex_unlock(&inode->i_mutex);
2788
2789 list_del_init(&cgrp->cft_q_node);
2790 dput(cgrp->dentry);
2646 } 2791 }
2792
2793 mutex_unlock(&cgroup_cft_mutex);
2794}
2795
2796/**
2797 * cgroup_add_cftypes - add an array of cftypes to a subsystem
2798 * @ss: target cgroup subsystem
2799 * @cfts: zero-length name terminated array of cftypes
2800 *
2801 * Register @cfts to @ss. Files described by @cfts are created for all
2802 * existing cgroups to which @ss is attached and all future cgroups will
2803 * have them too. This function can be called anytime whether @ss is
2804 * attached or not.
2805 *
2806 * Returns 0 on successful registration, -errno on failure. Note that this
2807 * function currently returns 0 as long as @cfts registration is successful
2808 * even if some file creation attempts on existing cgroups fail.
2809 */
2810int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
2811{
2812 struct cftype_set *set;
2813
2814 set = kzalloc(sizeof(*set), GFP_KERNEL);
2815 if (!set)
2816 return -ENOMEM;
2817
2818 cgroup_cfts_prepare();
2819 set->cfts = cfts;
2820 list_add_tail(&set->node, &ss->cftsets);
2821 cgroup_cfts_commit(ss, cfts, true);
2822
2647 return 0; 2823 return 0;
2648} 2824}
2649EXPORT_SYMBOL_GPL(cgroup_add_files); 2825EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2826
2827/**
2828 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
2829 * @ss: target cgroup subsystem
2830 * @cfts: zero-length name terminated array of cftypes
2831 *
2832 * Unregister @cfts from @ss. Files described by @cfts are removed from
2833 * all existing cgroups to which @ss is attached and all future cgroups
2834 * won't have them either. This function can be called anytime whether @ss
2835 * is attached or not.
2836 *
2837 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2838 * registered with @ss.
2839 */
2840int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
2841{
2842 struct cftype_set *set;
2843
2844 cgroup_cfts_prepare();
2845
2846 list_for_each_entry(set, &ss->cftsets, node) {
2847 if (set->cfts == cfts) {
2848 list_del_init(&set->node);
2849 cgroup_cfts_commit(ss, cfts, false);
2850 return 0;
2851 }
2852 }
2853
2854 cgroup_cfts_commit(ss, NULL, false);
2855 return -ENOENT;
2856}
2650 2857
2651/** 2858/**
2652 * cgroup_task_count - count the number of tasks in a cgroup. 2859 * cgroup_task_count - count the number of tasks in a cgroup.
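
With cftsets in place, a controller registers its files declaratively instead of implementing ->populate(). A minimal sketch, assuming a controller named example_subsys and a read handler example_show (both illustrative, not from this diff):

static struct cftype example_cftypes[] = {
	{
		.name = "example.stat",
		.read_seq_string = example_show,	/* assumed handler */
	},
	{ }	/* zero-length name terminates the array */
};

static int __init example_register_files(void)
{
	/* files appear in every existing and future cgroup of the subsystem */
	return cgroup_add_cftypes(&example_subsys, example_cftypes);
}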
@@ -3625,13 +3832,14 @@ static struct cftype files[] = {
3625 .read_u64 = cgroup_clone_children_read, 3832 .read_u64 = cgroup_clone_children_read,
3626 .write_u64 = cgroup_clone_children_write, 3833 .write_u64 = cgroup_clone_children_write,
3627 }, 3834 },
3628}; 3835 {
3629 3836 .name = "release_agent",
3630static struct cftype cft_release_agent = { 3837 .flags = CFTYPE_ONLY_ON_ROOT,
3631 .name = "release_agent", 3838 .read_seq_string = cgroup_release_agent_show,
3632 .read_seq_string = cgroup_release_agent_show, 3839 .write_string = cgroup_release_agent_write,
3633 .write_string = cgroup_release_agent_write, 3840 .max_write_len = PATH_MAX,
3634 .max_write_len = PATH_MAX, 3841 },
3842 { } /* terminate */
3635}; 3843};
3636 3844
3637static int cgroup_populate_dir(struct cgroup *cgrp) 3845static int cgroup_populate_dir(struct cgroup *cgrp)
@@ -3639,22 +3847,18 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
3639 int err; 3847 int err;
3640 struct cgroup_subsys *ss; 3848 struct cgroup_subsys *ss;
3641 3849
3642 /* First clear out any existing files */ 3850 err = cgroup_addrm_files(cgrp, NULL, files, true);
3643 cgroup_clear_directory(cgrp->dentry);
3644
3645 err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
3646 if (err < 0) 3851 if (err < 0)
3647 return err; 3852 return err;
3648 3853
3649 if (cgrp == cgrp->top_cgroup) { 3854 /* process cftsets of each subsystem */
3650 if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
3651 return err;
3652 }
3653
3654 for_each_subsys(cgrp->root, ss) { 3855 for_each_subsys(cgrp->root, ss) {
3655 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) 3856 struct cftype_set *set;
3656 return err; 3857
3858 list_for_each_entry(set, &ss->cftsets, node)
3859 cgroup_addrm_files(cgrp, ss, set->cfts, true);
3657 } 3860 }
3861
3658 /* This cgroup is ready now */ 3862 /* This cgroup is ready now */
3659 for_each_subsys(cgrp->root, ss) { 3863 for_each_subsys(cgrp->root, ss) {
3660 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 3864 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
@@ -3670,6 +3874,14 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
3670 return 0; 3874 return 0;
3671} 3875}
3672 3876
3877static void css_dput_fn(struct work_struct *work)
3878{
3879 struct cgroup_subsys_state *css =
3880 container_of(work, struct cgroup_subsys_state, dput_work);
3881
3882 dput(css->cgroup->dentry);
3883}
3884
3673static void init_cgroup_css(struct cgroup_subsys_state *css, 3885static void init_cgroup_css(struct cgroup_subsys_state *css,
3674 struct cgroup_subsys *ss, 3886 struct cgroup_subsys *ss,
3675 struct cgroup *cgrp) 3887 struct cgroup *cgrp)
@@ -3682,6 +3894,16 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
3682 set_bit(CSS_ROOT, &css->flags); 3894 set_bit(CSS_ROOT, &css->flags);
3683 BUG_ON(cgrp->subsys[ss->subsys_id]); 3895 BUG_ON(cgrp->subsys[ss->subsys_id]);
3684 cgrp->subsys[ss->subsys_id] = css; 3896 cgrp->subsys[ss->subsys_id] = css;
3897
3898 /*
3899 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
3900 * which is put on the last css_put(). dput() requires process
3901 * context, which css_put() may be called without. @css->dput_work
3902 * will be used to invoke dput() asynchronously from css_put().
3903 */
3904 INIT_WORK(&css->dput_work, css_dput_fn);
3905 if (ss->__DEPRECATED_clear_css_refs)
3906 set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
3685} 3907}
3686 3908
3687static void cgroup_lock_hierarchy(struct cgroupfs_root *root) 3909static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
@@ -3784,9 +4006,16 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3784 if (err < 0) 4006 if (err < 0)
3785 goto err_remove; 4007 goto err_remove;
3786 4008
4009 /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
4010 for_each_subsys(root, ss)
4011 if (!ss->__DEPRECATED_clear_css_refs)
4012 dget(dentry);
4013
3787 /* The cgroup directory was pre-locked for us */ 4014 /* The cgroup directory was pre-locked for us */
3788 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); 4015 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
3789 4016
4017 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4018
3790 err = cgroup_populate_dir(cgrp); 4019 err = cgroup_populate_dir(cgrp);
3791 /* If err < 0, we have a half-filled directory - oh well ;) */ 4020 /* If err < 0, we have a half-filled directory - oh well ;) */
3792 4021
@@ -3826,18 +4055,19 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
3826 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4055 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
3827} 4056}
3828 4057
4058/*
4059 * Check the reference count on each subsystem. Since we already
4060 * established that there are no tasks in the cgroup, if the css refcount
4061 * is also 1, then there should be no outstanding references, so the
4062 * subsystem is safe to destroy. We scan across all subsystems rather than
4063 * using the per-hierarchy linked list of mounted subsystems since we can
4064 * be called via check_for_release() with no synchronization other than
4065 * RCU, and the subsystem linked list isn't RCU-safe.
4066 */
3829static int cgroup_has_css_refs(struct cgroup *cgrp) 4067static int cgroup_has_css_refs(struct cgroup *cgrp)
3830{ 4068{
3831 /* Check the reference count on each subsystem. Since we
3832 * already established that there are no tasks in the
3833 * cgroup, if the css refcount is also 1, then there should
3834 * be no outstanding references, so the subsystem is safe to
3835 * destroy. We scan across all subsystems rather than using
3836 * the per-hierarchy linked list of mounted subsystems since
3837 * we can be called via check_for_release() with no
3838 * synchronization other than RCU, and the subsystem linked
3839 * list isn't RCU-safe */
3840 int i; 4069 int i;
4070
3841 /* 4071 /*
3842 * We won't need to lock the subsys array, because the subsystems 4072 * We won't need to lock the subsys array, because the subsystems
3843 * we're concerned about aren't going anywhere since our cgroup root 4073 * we're concerned about aren't going anywhere since our cgroup root
@@ -3846,17 +4076,21 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
3846 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4076 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3847 struct cgroup_subsys *ss = subsys[i]; 4077 struct cgroup_subsys *ss = subsys[i];
3848 struct cgroup_subsys_state *css; 4078 struct cgroup_subsys_state *css;
4079
3849 /* Skip subsystems not present or not in this hierarchy */ 4080 /* Skip subsystems not present or not in this hierarchy */
3850 if (ss == NULL || ss->root != cgrp->root) 4081 if (ss == NULL || ss->root != cgrp->root)
3851 continue; 4082 continue;
4083
3852 css = cgrp->subsys[ss->subsys_id]; 4084 css = cgrp->subsys[ss->subsys_id];
3853 /* When called from check_for_release() it's possible 4085 /*
4086 * When called from check_for_release() it's possible
3854 * that by this point the cgroup has been removed 4087 * that by this point the cgroup has been removed
3855 * and the css deleted. But a false-positive doesn't 4088 * and the css deleted. But a false-positive doesn't
3856 * matter, since it can only happen if the cgroup 4089 * matter, since it can only happen if the cgroup
3857 * has been deleted and hence no longer needs the 4090 * has been deleted and hence no longer needs the
3858 * release agent to be called anyway. */ 4091 * release agent to be called anyway.
3859 if (css && (atomic_read(&css->refcnt) > 1)) 4092 */
4093 if (css && css_refcnt(css) > 1)
3860 return 1; 4094 return 1;
3861 } 4095 }
3862 return 0; 4096 return 0;
@@ -3866,51 +4100,63 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
3866 * Atomically mark all (or else none) of the cgroup's CSS objects as 4100 * Atomically mark all (or else none) of the cgroup's CSS objects as
3867 * CSS_REMOVED. Return true on success, or false if the cgroup has 4101 * CSS_REMOVED. Return true on success, or false if the cgroup has
3868 * busy subsystems. Call with cgroup_mutex held 4102 * busy subsystems. Call with cgroup_mutex held
4103 *
4104 * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
4105 * not, cgroup removal behaves differently.
4106 *
4107 * If clear is set, css refcnt for the subsystem should be zero before
4108 * cgroup removal can be committed. This is implemented by
4109 * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
4110 * called multiple times until all css refcnts reach zero and is allowed to
4111 * veto removal on any invocation. This behavior is deprecated and will be
4112 * removed as soon as the existing user (memcg) is updated.
4113 *
4114 * If clear is not set, each css holds an extra reference to the cgroup's
4115 * dentry and cgroup removal proceeds regardless of css refs.
4116 * ->pre_destroy() will be called at least once and is not allowed to fail.
4117 * On the last put of each css, whenever that may be, the extra dentry ref
4118 * is put so that dentry destruction happens only after all css's are
4119 * released.
3869 */ 4120 */
3870
3871static int cgroup_clear_css_refs(struct cgroup *cgrp) 4121static int cgroup_clear_css_refs(struct cgroup *cgrp)
3872{ 4122{
3873 struct cgroup_subsys *ss; 4123 struct cgroup_subsys *ss;
3874 unsigned long flags; 4124 unsigned long flags;
3875 bool failed = false; 4125 bool failed = false;
4126
3876 local_irq_save(flags); 4127 local_irq_save(flags);
4128
4129 /*
4130 * Block new css_tryget() by deactivating refcnt. If all refcnts
4131 * for subsystems w/ clear_css_refs set were 1 at the moment of
4132 * deactivation, we succeeded.
4133 */
3877 for_each_subsys(cgrp->root, ss) { 4134 for_each_subsys(cgrp->root, ss) {
3878 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4135 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3879 int refcnt; 4136
3880 while (1) { 4137 WARN_ON(atomic_read(&css->refcnt) < 0);
3881 /* We can only remove a CSS with a refcnt==1 */ 4138 atomic_add(CSS_DEACT_BIAS, &css->refcnt);
3882 refcnt = atomic_read(&css->refcnt); 4139
3883 if (refcnt > 1) { 4140 if (ss->__DEPRECATED_clear_css_refs)
3884 failed = true; 4141 failed |= css_refcnt(css) != 1;
3885 goto done;
3886 }
3887 BUG_ON(!refcnt);
3888 /*
3889 * Drop the refcnt to 0 while we check other
3890 * subsystems. This will cause any racing
3891 * css_tryget() to spin until we set the
3892 * CSS_REMOVED bits or abort
3893 */
3894 if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
3895 break;
3896 cpu_relax();
3897 }
3898 } 4142 }
3899 done: 4143
4144 /*
4145 * If succeeded, set REMOVED and put all the base refs; otherwise,
4146 * restore refcnts to positive values. Either way, all in-progress
4147 * css_tryget() will be released.
4148 */
3900 for_each_subsys(cgrp->root, ss) { 4149 for_each_subsys(cgrp->root, ss) {
3901 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4150 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3902 if (failed) { 4151
3903 /* 4152 if (!failed) {
3904 * Restore old refcnt if we previously managed
3905 * to clear it from 1 to 0
3906 */
3907 if (!atomic_read(&css->refcnt))
3908 atomic_set(&css->refcnt, 1);
3909 } else {
3910 /* Commit the fact that the CSS is removed */
3911 set_bit(CSS_REMOVED, &css->flags); 4153 set_bit(CSS_REMOVED, &css->flags);
4154 css_put(css);
4155 } else {
4156 atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
3912 } 4157 }
3913 } 4158 }
4159
3914 local_irq_restore(flags); 4160 local_irq_restore(flags);
3915 return !failed; 4161 return !failed;
3916} 4162}
@@ -3995,6 +4241,8 @@ again:
3995 list_del_init(&cgrp->sibling); 4241 list_del_init(&cgrp->sibling);
3996 cgroup_unlock_hierarchy(cgrp->root); 4242 cgroup_unlock_hierarchy(cgrp->root);
3997 4243
4244 list_del_init(&cgrp->allcg_node);
4245
3998 d = dget(cgrp->dentry); 4246 d = dget(cgrp->dentry);
3999 4247
4000 cgroup_d_remove_dir(d); 4248 cgroup_d_remove_dir(d);
@@ -4021,12 +4269,29 @@ again:
4021 return 0; 4269 return 0;
4022} 4270}
4023 4271
4272static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4273{
4274 INIT_LIST_HEAD(&ss->cftsets);
4275
4276 /*
4277 * base_cftset is embedded in subsys itself, no need to worry about
4278 * deregistration.
4279 */
4280 if (ss->base_cftypes) {
4281 ss->base_cftset.cfts = ss->base_cftypes;
4282 list_add_tail(&ss->base_cftset.node, &ss->cftsets);
4283 }
4284}
4285
4024static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4286static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4025{ 4287{
4026 struct cgroup_subsys_state *css; 4288 struct cgroup_subsys_state *css;
4027 4289
4028 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4290 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4029 4291
4292 /* init base cftset */
4293 cgroup_init_cftsets(ss);
4294
4030 /* Create the top cgroup state for this subsystem */ 4295 /* Create the top cgroup state for this subsystem */
4031 list_add(&ss->sibling, &rootnode.subsys_list); 4296 list_add(&ss->sibling, &rootnode.subsys_list);
4032 ss->root = &rootnode; 4297 ss->root = &rootnode;
@@ -4096,6 +4361,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4096 return 0; 4361 return 0;
4097 } 4362 }
4098 4363
4364 /* init base cftset */
4365 cgroup_init_cftsets(ss);
4366
4099 /* 4367 /*
4100 * need to register a subsys id before anything else - for example, 4368 * need to register a subsys id before anything else - for example,
4101 * init_cgroup_css needs it. 4369 * init_cgroup_css needs it.
@@ -4685,21 +4953,41 @@ static void check_for_release(struct cgroup *cgrp)
4685} 4953}
4686 4954
4687/* Caller must verify that the css is not for root cgroup */ 4955/* Caller must verify that the css is not for root cgroup */
4688void __css_put(struct cgroup_subsys_state *css, int count) 4956bool __css_tryget(struct cgroup_subsys_state *css)
4957{
4958 do {
4959 int v = css_refcnt(css);
4960
4961 if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
4962 return true;
4963 cpu_relax();
4964 } while (!test_bit(CSS_REMOVED, &css->flags));
4965
4966 return false;
4967}
4968EXPORT_SYMBOL_GPL(__css_tryget);
4969
4970/* Caller must verify that the css is not for root cgroup */
4971void __css_put(struct cgroup_subsys_state *css)
4689{ 4972{
4690 struct cgroup *cgrp = css->cgroup; 4973 struct cgroup *cgrp = css->cgroup;
4691 int val; 4974
4692 rcu_read_lock(); 4975 rcu_read_lock();
4693 val = atomic_sub_return(count, &css->refcnt); 4976 atomic_dec(&css->refcnt);
4694 if (val == 1) { 4977 switch (css_refcnt(css)) {
4978 case 1:
4695 if (notify_on_release(cgrp)) { 4979 if (notify_on_release(cgrp)) {
4696 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4980 set_bit(CGRP_RELEASABLE, &cgrp->flags);
4697 check_for_release(cgrp); 4981 check_for_release(cgrp);
4698 } 4982 }
4699 cgroup_wakeup_rmdir_waiter(cgrp); 4983 cgroup_wakeup_rmdir_waiter(cgrp);
4984 break;
4985 case 0:
4986 if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
4987 schedule_work(&css->dput_work);
4988 break;
4700 } 4989 }
4701 rcu_read_unlock(); 4990 rcu_read_unlock();
4702 WARN_ON_ONCE(val < 1);
4703} 4991}
4704EXPORT_SYMBOL_GPL(__css_put); 4992EXPORT_SYMBOL_GPL(__css_put);
4705 4993
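
From a caller's point of view the pattern remains the usual tryget/put pair; css_tryget() and css_put() are the inline wrappers in cgroup.h that fall through to the functions above for non-root groups. A hedged sketch:

static void example_walk(struct cgroup_subsys_state *css)
{
	if (!css_tryget(css))
		return;			/* CSS_REMOVED set, cgroup is going away */

	/* ... use css-protected state ... */

	css_put(css);			/* last put may schedule css->dput_work */
}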
@@ -4818,7 +5106,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
4818 * on this or this is under rcu_read_lock(). Once css->id is allocated, 5106 * on this or this is under rcu_read_lock(). Once css->id is allocated,
4819 * it's unchanged until freed. 5107 * it's unchanged until freed.
4820 */ 5108 */
4821 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); 5109 cssid = rcu_dereference_check(css->id, css_refcnt(css));
4822 5110
4823 if (cssid) 5111 if (cssid)
4824 return cssid->id; 5112 return cssid->id;
@@ -4830,7 +5118,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
4830{ 5118{
4831 struct css_id *cssid; 5119 struct css_id *cssid;
4832 5120
4833 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); 5121 cssid = rcu_dereference_check(css->id, css_refcnt(css));
4834 5122
4835 if (cssid) 5123 if (cssid)
4836 return cssid->depth; 5124 return cssid->depth;
@@ -5211,19 +5499,15 @@ static struct cftype debug_files[] = {
5211 .name = "releasable", 5499 .name = "releasable",
5212 .read_u64 = releasable_read, 5500 .read_u64 = releasable_read,
5213 }, 5501 },
5214};
5215 5502
5216static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) 5503 { } /* terminate */
5217{ 5504};
5218 return cgroup_add_files(cont, ss, debug_files,
5219 ARRAY_SIZE(debug_files));
5220}
5221 5505
5222struct cgroup_subsys debug_subsys = { 5506struct cgroup_subsys debug_subsys = {
5223 .name = "debug", 5507 .name = "debug",
5224 .create = debug_create, 5508 .create = debug_create,
5225 .destroy = debug_destroy, 5509 .destroy = debug_destroy,
5226 .populate = debug_populate,
5227 .subsys_id = debug_subsys_id, 5510 .subsys_id = debug_subsys_id,
5511 .base_cftypes = debug_files,
5228}; 5512};
5229#endif /* CONFIG_CGROUP_DEBUG */ 5513#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index f86e93920b6..3649fc6b3ea 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -358,24 +358,19 @@ static int freezer_write(struct cgroup *cgroup,
358static struct cftype files[] = { 358static struct cftype files[] = {
359 { 359 {
360 .name = "state", 360 .name = "state",
361 .flags = CFTYPE_NOT_ON_ROOT,
361 .read_seq_string = freezer_read, 362 .read_seq_string = freezer_read,
362 .write_string = freezer_write, 363 .write_string = freezer_write,
363 }, 364 },
365 { } /* terminate */
364}; 366};
365 367
366static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
367{
368 if (!cgroup->parent)
369 return 0;
370 return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
371}
372
373struct cgroup_subsys freezer_subsys = { 368struct cgroup_subsys freezer_subsys = {
374 .name = "freezer", 369 .name = "freezer",
375 .create = freezer_create, 370 .create = freezer_create,
376 .destroy = freezer_destroy, 371 .destroy = freezer_destroy,
377 .populate = freezer_populate,
378 .subsys_id = freezer_subsys_id, 372 .subsys_id = freezer_subsys_id,
379 .can_attach = freezer_can_attach, 373 .can_attach = freezer_can_attach,
380 .fork = freezer_fork, 374 .fork = freezer_fork,
375 .base_cftypes = files,
381}; 376};
diff --git a/kernel/compat.c b/kernel/compat.c
index 74ff8498809..c28a306ae05 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -372,25 +372,54 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
372 372
373#ifdef __ARCH_WANT_SYS_SIGPROCMASK 373#ifdef __ARCH_WANT_SYS_SIGPROCMASK
374 374
375asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, 375/*
376 compat_old_sigset_t __user *oset) 376 * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the
377 * blocked set of signals to the supplied signal set
378 */
379static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
377{ 380{
378 old_sigset_t s; 381 memcpy(blocked->sig, &set, sizeof(set));
379 long ret; 382}
380 mm_segment_t old_fs;
381 383
382 if (set && get_user(s, set)) 384asmlinkage long compat_sys_sigprocmask(int how,
383 return -EFAULT; 385 compat_old_sigset_t __user *nset,
384 old_fs = get_fs(); 386 compat_old_sigset_t __user *oset)
385 set_fs(KERNEL_DS); 387{
386 ret = sys_sigprocmask(how, 388 old_sigset_t old_set, new_set;
387 set ? (old_sigset_t __user *) &s : NULL, 389 sigset_t new_blocked;
388 oset ? (old_sigset_t __user *) &s : NULL); 390
389 set_fs(old_fs); 391 old_set = current->blocked.sig[0];
390 if (ret == 0) 392
391 if (oset) 393 if (nset) {
392 ret = put_user(s, oset); 394 if (get_user(new_set, nset))
393 return ret; 395 return -EFAULT;
396 new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
397
398 new_blocked = current->blocked;
399
400 switch (how) {
401 case SIG_BLOCK:
402 sigaddsetmask(&new_blocked, new_set);
403 break;
404 case SIG_UNBLOCK:
405 sigdelsetmask(&new_blocked, new_set);
406 break;
407 case SIG_SETMASK:
408 compat_sig_setmask(&new_blocked, new_set);
409 break;
410 default:
411 return -EINVAL;
412 }
413
414 set_current_blocked(&new_blocked);
415 }
416
417 if (oset) {
418 if (put_user(old_set, oset))
419 return -EFAULT;
420 }
421
422 return 0;
394} 423}
395 424
396#endif 425#endif
@@ -1044,15 +1073,7 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
1044 if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) 1073 if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
1045 return -EFAULT; 1074 return -EFAULT;
1046 sigset_from_compat(&newset, &newset32); 1075 sigset_from_compat(&newset, &newset32);
1047 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 1076 return sigsuspend(&newset);
1048
1049 current->saved_sigmask = current->blocked;
1050 set_current_blocked(&newset);
1051
1052 current->state = TASK_INTERRUPTIBLE;
1053 schedule();
1054 set_restore_sigmask();
1055 return -ERESTARTNOHAND;
1056} 1077}
1057#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ 1078#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
1058 1079
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2060c6e5702..0e6353cf147 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -17,6 +17,8 @@
17#include <linux/gfp.h> 17#include <linux/gfp.h>
18#include <linux/suspend.h> 18#include <linux/suspend.h>
19 19
20#include "smpboot.h"
21
20#ifdef CONFIG_SMP 22#ifdef CONFIG_SMP
21/* Serializes the updates to cpu_online_mask, cpu_present_mask */ 23/* Serializes the updates to cpu_online_mask, cpu_present_mask */
22static DEFINE_MUTEX(cpu_add_remove_lock); 24static DEFINE_MUTEX(cpu_add_remove_lock);
@@ -295,11 +297,19 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
295 int ret, nr_calls = 0; 297 int ret, nr_calls = 0;
296 void *hcpu = (void *)(long)cpu; 298 void *hcpu = (void *)(long)cpu;
297 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 299 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
300 struct task_struct *idle;
298 301
299 if (cpu_online(cpu) || !cpu_present(cpu)) 302 if (cpu_online(cpu) || !cpu_present(cpu))
300 return -EINVAL; 303 return -EINVAL;
301 304
302 cpu_hotplug_begin(); 305 cpu_hotplug_begin();
306
307 idle = idle_thread_get(cpu);
308 if (IS_ERR(idle)) {
309 ret = PTR_ERR(idle);
310 goto out;
311 }
312
303 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); 313 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
304 if (ret) { 314 if (ret) {
305 nr_calls--; 315 nr_calls--;
@@ -309,7 +319,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
309 } 319 }
310 320
311 /* Arch-specific enabling code. */ 321 /* Arch-specific enabling code. */
312 ret = __cpu_up(cpu); 322 ret = __cpu_up(cpu, idle);
313 if (ret != 0) 323 if (ret != 0)
314 goto out_notify; 324 goto out_notify;
315 BUG_ON(!cpu_online(cpu)); 325 BUG_ON(!cpu_online(cpu));
@@ -320,6 +330,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
320out_notify: 330out_notify:
321 if (ret != 0) 331 if (ret != 0)
322 __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); 332 __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
333out:
323 cpu_hotplug_done(); 334 cpu_hotplug_done();
324 335
325 return ret; 336 return ret;
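
idle_thread_get() comes from the new kernel/smpboot.h (listed in the diffstat but not shown here). A plausible sketch of the interface _cpu_up() relies on; the exact declarations and the config guard are assumptions, not copied from this diff:

/* kernel/smpboot.h (sketch) */
#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
struct task_struct *idle_thread_get(unsigned int cpu);
void __init idle_threads_init(void);
#else
static inline struct task_struct *idle_thread_get(unsigned int cpu)
{
	return NULL;	/* arch manages its own idle threads */
}
static inline void idle_threads_init(void) { }
#endif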
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 14f7070b4ba..8c8bd652dd1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1765,28 +1765,17 @@ static struct cftype files[] = {
1765 .write_u64 = cpuset_write_u64, 1765 .write_u64 = cpuset_write_u64,
1766 .private = FILE_SPREAD_SLAB, 1766 .private = FILE_SPREAD_SLAB,
1767 }, 1767 },
1768};
1769
1770static struct cftype cft_memory_pressure_enabled = {
1771 .name = "memory_pressure_enabled",
1772 .read_u64 = cpuset_read_u64,
1773 .write_u64 = cpuset_write_u64,
1774 .private = FILE_MEMORY_PRESSURE_ENABLED,
1775};
1776 1768
1777static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) 1769 {
1778{ 1770 .name = "memory_pressure_enabled",
1779 int err; 1771 .flags = CFTYPE_ONLY_ON_ROOT,
1772 .read_u64 = cpuset_read_u64,
1773 .write_u64 = cpuset_write_u64,
1774 .private = FILE_MEMORY_PRESSURE_ENABLED,
1775 },
1780 1776
1781 err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); 1777 { } /* terminate */
1782 if (err) 1778};
1783 return err;
1784 /* memory_pressure_enabled is in root cpuset only */
1785 if (!cont->parent)
1786 err = cgroup_add_file(cont, ss,
1787 &cft_memory_pressure_enabled);
1788 return err;
1789}
1790 1779
1791/* 1780/*
1792 * post_clone() is called during cgroup_create() when the 1781 * post_clone() is called during cgroup_create() when the
@@ -1887,9 +1876,9 @@ struct cgroup_subsys cpuset_subsys = {
1887 .destroy = cpuset_destroy, 1876 .destroy = cpuset_destroy,
1888 .can_attach = cpuset_can_attach, 1877 .can_attach = cpuset_can_attach,
1889 .attach = cpuset_attach, 1878 .attach = cpuset_attach,
1890 .populate = cpuset_populate,
1891 .post_clone = cpuset_post_clone, 1879 .post_clone = cpuset_post_clone,
1892 .subsys_id = cpuset_subsys_id, 1880 .subsys_id = cpuset_subsys_id,
1881 .base_cftypes = files,
1893 .early_init = 1, 1882 .early_init = 1,
1894}; 1883};
1895 1884
diff --git a/kernel/cred.c b/kernel/cred.c
index e70683d9ec3..430557ea488 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -49,6 +49,14 @@ struct cred init_cred = {
49 .subscribers = ATOMIC_INIT(2), 49 .subscribers = ATOMIC_INIT(2),
50 .magic = CRED_MAGIC, 50 .magic = CRED_MAGIC,
51#endif 51#endif
52 .uid = GLOBAL_ROOT_UID,
53 .gid = GLOBAL_ROOT_GID,
54 .suid = GLOBAL_ROOT_UID,
55 .sgid = GLOBAL_ROOT_GID,
56 .euid = GLOBAL_ROOT_UID,
57 .egid = GLOBAL_ROOT_GID,
58 .fsuid = GLOBAL_ROOT_UID,
59 .fsgid = GLOBAL_ROOT_GID,
52 .securebits = SECUREBITS_DEFAULT, 60 .securebits = SECUREBITS_DEFAULT,
53 .cap_inheritable = CAP_EMPTY_SET, 61 .cap_inheritable = CAP_EMPTY_SET,
54 .cap_permitted = CAP_FULL_SET, 62 .cap_permitted = CAP_FULL_SET,
@@ -148,6 +156,7 @@ static void put_cred_rcu(struct rcu_head *rcu)
148 if (cred->group_info) 156 if (cred->group_info)
149 put_group_info(cred->group_info); 157 put_group_info(cred->group_info);
150 free_uid(cred->user); 158 free_uid(cred->user);
159 put_user_ns(cred->user_ns);
151 kmem_cache_free(cred_jar, cred); 160 kmem_cache_free(cred_jar, cred);
152} 161}
153 162
@@ -303,6 +312,7 @@ struct cred *prepare_creds(void)
303 set_cred_subscribers(new, 0); 312 set_cred_subscribers(new, 0);
304 get_group_info(new->group_info); 313 get_group_info(new->group_info);
305 get_uid(new->user); 314 get_uid(new->user);
315 get_user_ns(new->user_ns);
306 316
307#ifdef CONFIG_KEYS 317#ifdef CONFIG_KEYS
308 key_get(new->thread_keyring); 318 key_get(new->thread_keyring);
@@ -414,11 +424,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
414 goto error_put; 424 goto error_put;
415 } 425 }
416 426
417 /* cache user_ns in cred. Doesn't need a refcount because it will
418 * stay pinned by cred->user
419 */
420 new->user_ns = new->user->user_ns;
421
422#ifdef CONFIG_KEYS 427#ifdef CONFIG_KEYS
423 /* new threads get their own thread keyrings if their parent already 428 /* new threads get their own thread keyrings if their parent already
424 * had one */ 429 * had one */
@@ -493,10 +498,10 @@ int commit_creds(struct cred *new)
493 get_cred(new); /* we will require a ref for the subj creds too */ 498 get_cred(new); /* we will require a ref for the subj creds too */
494 499
495 /* dumpability changes */ 500 /* dumpability changes */
496 if (old->euid != new->euid || 501 if (!uid_eq(old->euid, new->euid) ||
497 old->egid != new->egid || 502 !gid_eq(old->egid, new->egid) ||
498 old->fsuid != new->fsuid || 503 !uid_eq(old->fsuid, new->fsuid) ||
499 old->fsgid != new->fsgid || 504 !gid_eq(old->fsgid, new->fsgid) ||
500 !cap_issubset(new->cap_permitted, old->cap_permitted)) { 505 !cap_issubset(new->cap_permitted, old->cap_permitted)) {
501 if (task->mm) 506 if (task->mm)
502 set_dumpable(task->mm, suid_dumpable); 507 set_dumpable(task->mm, suid_dumpable);
@@ -505,9 +510,9 @@ int commit_creds(struct cred *new)
505 } 510 }
506 511
507 /* alter the thread keyring */ 512 /* alter the thread keyring */
508 if (new->fsuid != old->fsuid) 513 if (!uid_eq(new->fsuid, old->fsuid))
509 key_fsuid_changed(task); 514 key_fsuid_changed(task);
510 if (new->fsgid != old->fsgid) 515 if (!gid_eq(new->fsgid, old->fsgid))
511 key_fsgid_changed(task); 516 key_fsgid_changed(task);
512 517
513 /* do it 518 /* do it
@@ -524,16 +529,16 @@ int commit_creds(struct cred *new)
524 alter_cred_subscribers(old, -2); 529 alter_cred_subscribers(old, -2);
525 530
526 /* send notifications */ 531 /* send notifications */
527 if (new->uid != old->uid || 532 if (!uid_eq(new->uid, old->uid) ||
528 new->euid != old->euid || 533 !uid_eq(new->euid, old->euid) ||
529 new->suid != old->suid || 534 !uid_eq(new->suid, old->suid) ||
530 new->fsuid != old->fsuid) 535 !uid_eq(new->fsuid, old->fsuid))
531 proc_id_connector(task, PROC_EVENT_UID); 536 proc_id_connector(task, PROC_EVENT_UID);
532 537
533 if (new->gid != old->gid || 538 if (!gid_eq(new->gid, old->gid) ||
534 new->egid != old->egid || 539 !gid_eq(new->egid, old->egid) ||
535 new->sgid != old->sgid || 540 !gid_eq(new->sgid, old->sgid) ||
536 new->fsgid != old->fsgid) 541 !gid_eq(new->fsgid, old->fsgid))
537 proc_id_connector(task, PROC_EVENT_GID); 542 proc_id_connector(task, PROC_EVENT_GID);
538 543
539 /* release the old obj and subj refs both */ 544 /* release the old obj and subj refs both */
@@ -678,6 +683,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
678 atomic_set(&new->usage, 1); 683 atomic_set(&new->usage, 1);
679 set_cred_subscribers(new, 0); 684 set_cred_subscribers(new, 0);
680 get_uid(new->user); 685 get_uid(new->user);
686 get_user_ns(new->user_ns);
681 get_group_info(new->group_info); 687 get_group_info(new->group_info);
682 688
683#ifdef CONFIG_KEYS 689#ifdef CONFIG_KEYS
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e82c7a1face..5b06cbbf693 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2039,8 +2039,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2039 * accessing the event control register. If a NMI hits, then it will 2039 * accessing the event control register. If a NMI hits, then it will
2040 * not restart the event. 2040 * not restart the event.
2041 */ 2041 */
2042static void __perf_event_task_sched_out(struct task_struct *task, 2042void __perf_event_task_sched_out(struct task_struct *task,
2043 struct task_struct *next) 2043 struct task_struct *next)
2044{ 2044{
2045 int ctxn; 2045 int ctxn;
2046 2046
@@ -2279,8 +2279,8 @@ static void perf_branch_stack_sched_in(struct task_struct *prev,
2279 * accessing the event control register. If a NMI hits, then it will 2279 * accessing the event control register. If a NMI hits, then it will
2280 * keep the event running. 2280 * keep the event running.
2281 */ 2281 */
2282static void __perf_event_task_sched_in(struct task_struct *prev, 2282void __perf_event_task_sched_in(struct task_struct *prev,
2283 struct task_struct *task) 2283 struct task_struct *task)
2284{ 2284{
2285 struct perf_event_context *ctx; 2285 struct perf_event_context *ctx;
2286 int ctxn; 2286 int ctxn;
@@ -2305,12 +2305,6 @@ static void __perf_event_task_sched_in(struct task_struct *prev,
2305 perf_branch_stack_sched_in(prev, task); 2305 perf_branch_stack_sched_in(prev, task);
2306} 2306}
2307 2307
2308void __perf_event_task_sched(struct task_struct *prev, struct task_struct *next)
2309{
2310 __perf_event_task_sched_out(prev, next);
2311 __perf_event_task_sched_in(prev, next);
2312}
2313
2314static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2308static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
2315{ 2309{
2316 u64 frequency = event->attr.sample_freq; 2310 u64 frequency = event->attr.sample_freq;
@@ -3189,7 +3183,7 @@ static void perf_event_for_each(struct perf_event *event,
3189 perf_event_for_each_child(event, func); 3183 perf_event_for_each_child(event, func);
3190 func(event); 3184 func(event);
3191 list_for_each_entry(sibling, &event->sibling_list, group_entry) 3185 list_for_each_entry(sibling, &event->sibling_list, group_entry)
3192 perf_event_for_each_child(event, func); 3186 perf_event_for_each_child(sibling, func);
3193 mutex_unlock(&ctx->mutex); 3187 mutex_unlock(&ctx->mutex);
3194} 3188}
3195 3189
diff --git a/kernel/exit.c b/kernel/exit.c
index d8bd3b425fa..910a0716e17 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1214,7 +1214,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1214 unsigned long state; 1214 unsigned long state;
1215 int retval, status, traced; 1215 int retval, status, traced;
1216 pid_t pid = task_pid_vnr(p); 1216 pid_t pid = task_pid_vnr(p);
1217 uid_t uid = __task_cred(p)->uid; 1217 uid_t uid = from_kuid_munged(current_user_ns(), __task_cred(p)->uid);
1218 struct siginfo __user *infop; 1218 struct siginfo __user *infop;
1219 1219
1220 if (!likely(wo->wo_flags & WEXITED)) 1220 if (!likely(wo->wo_flags & WEXITED))
@@ -1427,7 +1427,7 @@ static int wait_task_stopped(struct wait_opts *wo,
1427 if (!unlikely(wo->wo_flags & WNOWAIT)) 1427 if (!unlikely(wo->wo_flags & WNOWAIT))
1428 *p_code = 0; 1428 *p_code = 0;
1429 1429
1430 uid = task_uid(p); 1430 uid = from_kuid_munged(current_user_ns(), task_uid(p));
1431unlock_sig: 1431unlock_sig:
1432 spin_unlock_irq(&p->sighand->siglock); 1432 spin_unlock_irq(&p->sighand->siglock);
1433 if (!exit_code) 1433 if (!exit_code)
@@ -1500,7 +1500,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1500 } 1500 }
1501 if (!unlikely(wo->wo_flags & WNOWAIT)) 1501 if (!unlikely(wo->wo_flags & WNOWAIT))
1502 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1502 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1503 uid = task_uid(p); 1503 uid = from_kuid_munged(current_user_ns(), task_uid(p));
1504 spin_unlock_irq(&p->sighand->siglock); 1504 spin_unlock_irq(&p->sighand->siglock);
1505 1505
1506 pid = task_pid_vnr(p); 1506 pid = task_pid_vnr(p);
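All three hunks above follow the same pattern: credentials now carry kernel-internal kuid_t values, and the uid reported to user space is translated through the caller's user namespace first. The conversion in isolation (a sketch; the helper name is illustrative):

#include <linux/cred.h>
#include <linux/uidgid.h>
#include <linux/user_namespace.h>

/* Illustrative: turn a kernel kuid_t into the uid_t current should see. */
static uid_t example_report_uid(kuid_t kuid)
{
        /*
         * from_kuid_munged() cannot fail; a uid with no mapping in the
         * namespace is reported as the overflow uid instead.
         */
        return from_kuid_munged(current_user_ns(), kuid);
}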
diff --git a/kernel/extable.c b/kernel/extable.c
index 5339705b824..fe35a634bf7 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -35,10 +35,16 @@ DEFINE_MUTEX(text_mutex);
35extern struct exception_table_entry __start___ex_table[]; 35extern struct exception_table_entry __start___ex_table[];
36extern struct exception_table_entry __stop___ex_table[]; 36extern struct exception_table_entry __stop___ex_table[];
37 37
38/* Cleared by build-time tools if the table is already sorted. */
39u32 __initdata main_extable_sort_needed = 1;
40
38/* Sort the kernel's built-in exception table */ 41/* Sort the kernel's built-in exception table */
39void __init sort_main_extable(void) 42void __init sort_main_extable(void)
40{ 43{
41 sort_extable(__start___ex_table, __stop___ex_table); 44 if (main_extable_sort_needed)
45 sort_extable(__start___ex_table, __stop___ex_table);
46 else
47 pr_notice("__ex_table already sorted, skipping sort\n");
42} 48}
43 49
44/* Given an address, look for it in the exception tables. */ 50/* Given an address, look for it in the exception tables. */
diff --git a/kernel/fork.c b/kernel/fork.c
index ca9a3845ef3..47b4e4f379f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -34,6 +34,7 @@
34#include <linux/cgroup.h> 34#include <linux/cgroup.h>
35#include <linux/security.h> 35#include <linux/security.h>
36#include <linux/hugetlb.h> 36#include <linux/hugetlb.h>
37#include <linux/seccomp.h>
37#include <linux/swap.h> 38#include <linux/swap.h>
38#include <linux/syscalls.h> 39#include <linux/syscalls.h>
39#include <linux/jiffies.h> 40#include <linux/jiffies.h>
@@ -47,6 +48,7 @@
47#include <linux/audit.h> 48#include <linux/audit.h>
48#include <linux/memcontrol.h> 49#include <linux/memcontrol.h>
49#include <linux/ftrace.h> 50#include <linux/ftrace.h>
51#include <linux/proc_fs.h>
50#include <linux/profile.h> 52#include <linux/profile.h>
51#include <linux/rmap.h> 53#include <linux/rmap.h>
52#include <linux/ksm.h> 54#include <linux/ksm.h>
@@ -112,32 +114,67 @@ int nr_processes(void)
112 return total; 114 return total;
113} 115}
114 116
115#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR 117#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
116# define alloc_task_struct_node(node) \
117 kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
118# define free_task_struct(tsk) \
119 kmem_cache_free(task_struct_cachep, (tsk))
120static struct kmem_cache *task_struct_cachep; 118static struct kmem_cache *task_struct_cachep;
119
120static inline struct task_struct *alloc_task_struct_node(int node)
121{
122 return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
123}
124
125void __weak arch_release_task_struct(struct task_struct *tsk) { }
126
127static inline void free_task_struct(struct task_struct *tsk)
128{
129 arch_release_task_struct(tsk);
130 kmem_cache_free(task_struct_cachep, tsk);
131}
121#endif 132#endif
122 133
123#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR 134#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
135void __weak arch_release_thread_info(struct thread_info *ti) { }
136
137/*
138 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
139 * kmem_cache-based allocator.
140 */
141# if THREAD_SIZE >= PAGE_SIZE
124static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, 142static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
125 int node) 143 int node)
126{ 144{
127#ifdef CONFIG_DEBUG_STACK_USAGE 145 struct page *page = alloc_pages_node(node, THREADINFO_GFP,
128 gfp_t mask = GFP_KERNEL | __GFP_ZERO; 146 THREAD_SIZE_ORDER);
129#else
130 gfp_t mask = GFP_KERNEL;
131#endif
132 struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
133 147
134 return page ? page_address(page) : NULL; 148 return page ? page_address(page) : NULL;
135} 149}
136 150
137static inline void free_thread_info(struct thread_info *ti) 151static inline void free_thread_info(struct thread_info *ti)
138{ 152{
153 arch_release_thread_info(ti);
139 free_pages((unsigned long)ti, THREAD_SIZE_ORDER); 154 free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
140} 155}
156# else
157static struct kmem_cache *thread_info_cache;
158
159static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
160 int node)
161{
162 return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
163}
164
165static void free_thread_info(struct thread_info *ti)
166{
167 arch_release_thread_info(ti);
168 kmem_cache_free(thread_info_cache, ti);
169}
170
171void thread_info_cache_init(void)
172{
173 thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
174 THREAD_SIZE, 0, NULL);
175 BUG_ON(thread_info_cache == NULL);
176}
177# endif
141#endif 178#endif
142 179
143/* SLAB cache for signal_struct structures (tsk->signal) */ 180/* SLAB cache for signal_struct structures (tsk->signal) */
@@ -171,6 +208,7 @@ void free_task(struct task_struct *tsk)
171 free_thread_info(tsk->stack); 208 free_thread_info(tsk->stack);
172 rt_mutex_debug_task_free(tsk); 209 rt_mutex_debug_task_free(tsk);
173 ftrace_graph_exit_task(tsk); 210 ftrace_graph_exit_task(tsk);
211 put_seccomp_filter(tsk);
174 free_task_struct(tsk); 212 free_task_struct(tsk);
175} 213}
176EXPORT_SYMBOL(free_task); 214EXPORT_SYMBOL(free_task);
@@ -204,17 +242,11 @@ void __put_task_struct(struct task_struct *tsk)
204} 242}
205EXPORT_SYMBOL_GPL(__put_task_struct); 243EXPORT_SYMBOL_GPL(__put_task_struct);
206 244
207/* 245void __init __weak arch_task_cache_init(void) { }
208 * macro override instead of weak attribute alias, to workaround
209 * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
210 */
211#ifndef arch_task_cache_init
212#define arch_task_cache_init()
213#endif
214 246
215void __init fork_init(unsigned long mempages) 247void __init fork_init(unsigned long mempages)
216{ 248{
217#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR 249#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
218#ifndef ARCH_MIN_TASKALIGN 250#ifndef ARCH_MIN_TASKALIGN
219#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES 251#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
220#endif 252#endif
@@ -261,8 +293,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
261 int node = tsk_fork_get_node(orig); 293 int node = tsk_fork_get_node(orig);
262 int err; 294 int err;
263 295
264 prepare_to_copy(orig);
265
266 tsk = alloc_task_struct_node(node); 296 tsk = alloc_task_struct_node(node);
267 if (!tsk) 297 if (!tsk)
268 return NULL; 298 return NULL;
@@ -1170,6 +1200,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1170 goto fork_out; 1200 goto fork_out;
1171 1201
1172 ftrace_graph_init_task(p); 1202 ftrace_graph_init_task(p);
1203 get_seccomp_filter(p);
1173 1204
1174 rt_mutex_init_task(p); 1205 rt_mutex_init_task(p);
1175 1206
@@ -1473,6 +1504,8 @@ bad_fork_cleanup_io:
1473 if (p->io_context) 1504 if (p->io_context)
1474 exit_io_context(p); 1505 exit_io_context(p);
1475bad_fork_cleanup_namespaces: 1506bad_fork_cleanup_namespaces:
1507 if (unlikely(clone_flags & CLONE_NEWPID))
1508 pid_ns_release_proc(p->nsproxy->pid_ns);
1476 exit_task_namespaces(p); 1509 exit_task_namespaces(p);
1477bad_fork_cleanup_mm: 1510bad_fork_cleanup_mm:
1478 if (p->mm) 1511 if (p->mm)
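Besides wiring up seccomp filter refcounting, the fork.c changes replace the old allocator macros with inline helpers and add weak arch_release_task_struct()/arch_release_thread_info() hooks that run on every free. A hypothetical architecture override could look like the sketch below; only the hook name and signature come from the hunk above, the body is an assumption for illustration:

#include <linux/sched.h>
#include <linux/thread_info.h>

/*
 * Hypothetical arch-side definition: a strong symbol here replaces the
 * __weak stub in kernel/fork.c and is called from free_thread_info().
 */
void arch_release_thread_info(struct thread_info *ti)
{
        /* e.g. release per-thread arch state attached to the stack pages */
}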
diff --git a/kernel/groups.c b/kernel/groups.c
index 99b53d1eb7e..6b2588dd04f 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -31,7 +31,7 @@ struct group_info *groups_alloc(int gidsetsize)
31 group_info->blocks[0] = group_info->small_block; 31 group_info->blocks[0] = group_info->small_block;
32 else { 32 else {
33 for (i = 0; i < nblocks; i++) { 33 for (i = 0; i < nblocks; i++) {
34 gid_t *b; 34 kgid_t *b;
35 b = (void *)__get_free_page(GFP_USER); 35 b = (void *)__get_free_page(GFP_USER);
36 if (!b) 36 if (!b)
37 goto out_undo_partial_alloc; 37 goto out_undo_partial_alloc;
@@ -66,18 +66,15 @@ EXPORT_SYMBOL(groups_free);
66static int groups_to_user(gid_t __user *grouplist, 66static int groups_to_user(gid_t __user *grouplist,
67 const struct group_info *group_info) 67 const struct group_info *group_info)
68{ 68{
69 struct user_namespace *user_ns = current_user_ns();
69 int i; 70 int i;
70 unsigned int count = group_info->ngroups; 71 unsigned int count = group_info->ngroups;
71 72
72 for (i = 0; i < group_info->nblocks; i++) { 73 for (i = 0; i < count; i++) {
73 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); 74 gid_t gid;
74 unsigned int len = cp_count * sizeof(*grouplist); 75 gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i));
75 76 if (put_user(gid, grouplist+i))
76 if (copy_to_user(grouplist, group_info->blocks[i], len))
77 return -EFAULT; 77 return -EFAULT;
78
79 grouplist += NGROUPS_PER_BLOCK;
80 count -= cp_count;
81 } 78 }
82 return 0; 79 return 0;
83} 80}
@@ -86,18 +83,21 @@ static int groups_to_user(gid_t __user *grouplist,
86static int groups_from_user(struct group_info *group_info, 83static int groups_from_user(struct group_info *group_info,
87 gid_t __user *grouplist) 84 gid_t __user *grouplist)
88{ 85{
86 struct user_namespace *user_ns = current_user_ns();
89 int i; 87 int i;
90 unsigned int count = group_info->ngroups; 88 unsigned int count = group_info->ngroups;
91 89
92 for (i = 0; i < group_info->nblocks; i++) { 90 for (i = 0; i < count; i++) {
93 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); 91 gid_t gid;
94 unsigned int len = cp_count * sizeof(*grouplist); 92 kgid_t kgid;
95 93 if (get_user(gid, grouplist+i))
96 if (copy_from_user(group_info->blocks[i], grouplist, len))
97 return -EFAULT; 94 return -EFAULT;
98 95
99 grouplist += NGROUPS_PER_BLOCK; 96 kgid = make_kgid(user_ns, gid);
100 count -= cp_count; 97 if (!gid_valid(kgid))
98 return -EINVAL;
99
100 GROUP_AT(group_info, i) = kgid;
101 } 101 }
102 return 0; 102 return 0;
103} 103}
@@ -117,9 +117,9 @@ static void groups_sort(struct group_info *group_info)
117 for (base = 0; base < max; base++) { 117 for (base = 0; base < max; base++) {
118 int left = base; 118 int left = base;
119 int right = left + stride; 119 int right = left + stride;
120 gid_t tmp = GROUP_AT(group_info, right); 120 kgid_t tmp = GROUP_AT(group_info, right);
121 121
122 while (left >= 0 && GROUP_AT(group_info, left) > tmp) { 122 while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) {
123 GROUP_AT(group_info, right) = 123 GROUP_AT(group_info, right) =
124 GROUP_AT(group_info, left); 124 GROUP_AT(group_info, left);
125 right = left; 125 right = left;
@@ -132,7 +132,7 @@ static void groups_sort(struct group_info *group_info)
132} 132}
133 133
134/* a simple bsearch */ 134/* a simple bsearch */
135int groups_search(const struct group_info *group_info, gid_t grp) 135int groups_search(const struct group_info *group_info, kgid_t grp)
136{ 136{
137 unsigned int left, right; 137 unsigned int left, right;
138 138
@@ -143,9 +143,9 @@ int groups_search(const struct group_info *group_info, gid_t grp)
143 right = group_info->ngroups; 143 right = group_info->ngroups;
144 while (left < right) { 144 while (left < right) {
145 unsigned int mid = (left+right)/2; 145 unsigned int mid = (left+right)/2;
146 if (grp > GROUP_AT(group_info, mid)) 146 if (gid_gt(grp, GROUP_AT(group_info, mid)))
147 left = mid + 1; 147 left = mid + 1;
148 else if (grp < GROUP_AT(group_info, mid)) 148 else if (gid_lt(grp, GROUP_AT(group_info, mid)))
149 right = mid; 149 right = mid;
150 else 150 else
151 return 1; 151 return 1;
@@ -256,24 +256,24 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
256/* 256/*
257 * Check whether we're fsgid/egid or in the supplemental group.. 257 * Check whether we're fsgid/egid or in the supplemental group..
258 */ 258 */
259int in_group_p(gid_t grp) 259int in_group_p(kgid_t grp)
260{ 260{
261 const struct cred *cred = current_cred(); 261 const struct cred *cred = current_cred();
262 int retval = 1; 262 int retval = 1;
263 263
264 if (grp != cred->fsgid) 264 if (!gid_eq(grp, cred->fsgid))
265 retval = groups_search(cred->group_info, grp); 265 retval = groups_search(cred->group_info, grp);
266 return retval; 266 return retval;
267} 267}
268 268
269EXPORT_SYMBOL(in_group_p); 269EXPORT_SYMBOL(in_group_p);
270 270
271int in_egroup_p(gid_t grp) 271int in_egroup_p(kgid_t grp)
272{ 272{
273 const struct cred *cred = current_cred(); 273 const struct cred *cred = current_cred();
274 int retval = 1; 274 int retval = 1;
275 275
276 if (grp != cred->egid) 276 if (!gid_eq(grp, cred->egid))
277 retval = groups_search(cred->group_info, grp); 277 retval = groups_search(cred->group_info, grp);
278 return retval; 278 return retval;
279} 279}
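groups_to_user() and groups_from_user() now convert each gid through the caller's user namespace instead of copying raw blocks, and the comparison helpers (gid_eq/gid_gt/gid_lt) operate on kgid_t. The round trip in isolation (sketch only; the function name is illustrative):

#include <linux/cred.h>
#include <linux/uidgid.h>
#include <linux/user_namespace.h>

/* Illustrative: user-space gid_t -> kernel kgid_t -> back to user space. */
static int example_gid_roundtrip(gid_t gid, gid_t *out)
{
        struct user_namespace *ns = current_user_ns();
        kgid_t kgid = make_kgid(ns, gid);

        if (!gid_valid(kgid))
                return -EINVAL;   /* no mapping for this gid in the namespace */

        *out = from_kgid_munged(ns, kgid);
        return 0;
}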
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index c21449f85a2..6df614912b9 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -108,8 +108,10 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
108 108
109 touch_nmi_watchdog(); 109 touch_nmi_watchdog();
110 110
111 if (sysctl_hung_task_panic) 111 if (sysctl_hung_task_panic) {
112 trigger_all_cpu_backtrace();
112 panic("hung_task: blocked tasks"); 113 panic("hung_task: blocked tasks");
114 }
113} 115}
114 116
115/* 117/*
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6080f6bc8c3..fc275e4f629 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -379,8 +379,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
379 * If it's disabled or no action available 379 * If it's disabled or no action available
380 * keep it masked and get out of here 380 * keep it masked and get out of here
381 */ 381 */
382 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) 382 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
383 desc->istate |= IRQS_PENDING;
383 goto out_unlock; 384 goto out_unlock;
385 }
384 386
385 handle_irq_event(desc); 387 handle_irq_event(desc);
386 388
@@ -518,6 +520,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
518out_unlock: 520out_unlock:
519 raw_spin_unlock(&desc->lock); 521 raw_spin_unlock(&desc->lock);
520} 522}
523EXPORT_SYMBOL(handle_edge_irq);
521 524
522#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER 525#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
523/** 526/**
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index 97a8bfadc88..e75e29e4434 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -4,10 +4,10 @@
4 4
5#include <linux/kallsyms.h> 5#include <linux/kallsyms.h>
6 6
7#define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) 7#define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f)
8#define PS(f) if (desc->istate & f) printk("%14s set\n", #f) 8#define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f)
9/* FIXME */ 9/* FIXME */
10#define PD(f) do { } while (0) 10#define ___PD(f) do { } while (0)
11 11
12static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) 12static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
13{ 13{
@@ -23,23 +23,23 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
23 print_symbol("%s\n", (unsigned long)desc->action->handler); 23 print_symbol("%s\n", (unsigned long)desc->action->handler);
24 } 24 }
25 25
26 P(IRQ_LEVEL); 26 ___P(IRQ_LEVEL);
27 P(IRQ_PER_CPU); 27 ___P(IRQ_PER_CPU);
28 P(IRQ_NOPROBE); 28 ___P(IRQ_NOPROBE);
29 P(IRQ_NOREQUEST); 29 ___P(IRQ_NOREQUEST);
30 P(IRQ_NOTHREAD); 30 ___P(IRQ_NOTHREAD);
31 P(IRQ_NOAUTOEN); 31 ___P(IRQ_NOAUTOEN);
32 32
33 PS(IRQS_AUTODETECT); 33 ___PS(IRQS_AUTODETECT);
34 PS(IRQS_REPLAY); 34 ___PS(IRQS_REPLAY);
35 PS(IRQS_WAITING); 35 ___PS(IRQS_WAITING);
36 PS(IRQS_PENDING); 36 ___PS(IRQS_PENDING);
37 37
38 PD(IRQS_INPROGRESS); 38 ___PD(IRQS_INPROGRESS);
39 PD(IRQS_DISABLED); 39 ___PD(IRQS_DISABLED);
40 PD(IRQS_MASKED); 40 ___PD(IRQS_MASKED);
41} 41}
42 42
43#undef P 43#undef ___P
44#undef PS 44#undef ___PS
45#undef PD 45#undef ___PD
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index d86e254b95e..192a302d6cf 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -112,6 +112,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
112{ 112{
113 return radix_tree_lookup(&irq_desc_tree, irq); 113 return radix_tree_lookup(&irq_desc_tree, irq);
114} 114}
115EXPORT_SYMBOL(irq_to_desc);
115 116
116static void delete_irq_desc(unsigned int irq) 117static void delete_irq_desc(unsigned int irq)
117{ 118{
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 89a3ea82569..bb32326afe8 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -565,8 +565,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
565 * IRQF_TRIGGER_* but the PIC does not support multiple 565 * IRQF_TRIGGER_* but the PIC does not support multiple
566 * flow-types? 566 * flow-types?
567 */ 567 */
568 pr_debug("No set_type function for IRQ %d (%s)\n", irq, 568 pr_debug("genirq: No set_type function for IRQ %d (%s)\n", irq,
569 chip ? (chip->name ? : "unknown") : "unknown"); 569 chip ? (chip->name ? : "unknown") : "unknown");
570 return 0; 570 return 0;
571 } 571 }
572 572
@@ -600,7 +600,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
600 ret = 0; 600 ret = 0;
601 break; 601 break;
602 default: 602 default:
603 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", 603 pr_err("genirq: Setting trigger mode %lu for irq %u failed (%pF)\n",
604 flags, irq, chip->irq_set_type); 604 flags, irq, chip->irq_set_type);
605 } 605 }
606 if (unmask) 606 if (unmask)
@@ -837,8 +837,7 @@ void exit_irq_thread(void)
837 837
838 action = kthread_data(tsk); 838 action = kthread_data(tsk);
839 839
840 printk(KERN_ERR 840 pr_err("genirq: exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
841 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
842 tsk->comm ? tsk->comm : "", tsk->pid, action->irq); 841 tsk->comm ? tsk->comm : "", tsk->pid, action->irq);
843 842
844 desc = irq_to_desc(action->irq); 843 desc = irq_to_desc(action->irq);
@@ -878,7 +877,6 @@ static int
878__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) 877__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
879{ 878{
880 struct irqaction *old, **old_ptr; 879 struct irqaction *old, **old_ptr;
881 const char *old_name = NULL;
882 unsigned long flags, thread_mask = 0; 880 unsigned long flags, thread_mask = 0;
883 int ret, nested, shared = 0; 881 int ret, nested, shared = 0;
884 cpumask_var_t mask; 882 cpumask_var_t mask;
@@ -972,10 +970,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
972 */ 970 */
973 if (!((old->flags & new->flags) & IRQF_SHARED) || 971 if (!((old->flags & new->flags) & IRQF_SHARED) ||
974 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || 972 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
975 ((old->flags ^ new->flags) & IRQF_ONESHOT)) { 973 ((old->flags ^ new->flags) & IRQF_ONESHOT))
976 old_name = old->name;
977 goto mismatch; 974 goto mismatch;
978 }
979 975
980 /* All handlers must agree on per-cpuness */ 976 /* All handlers must agree on per-cpuness */
981 if ((old->flags & IRQF_PERCPU) != 977 if ((old->flags & IRQF_PERCPU) !=
@@ -1031,6 +1027,27 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1031 * all existing action->thread_mask bits. 1027 * all existing action->thread_mask bits.
1032 */ 1028 */
1033 new->thread_mask = 1 << ffz(thread_mask); 1029 new->thread_mask = 1 << ffz(thread_mask);
1030
1031 } else if (new->handler == irq_default_primary_handler) {
1032 /*
1033 * The interrupt was requested with handler = NULL, so
1034 * we use the default primary handler for it. But it
1035 * does not have the oneshot flag set. In combination
1036 * with level interrupts this is deadly, because the
1037 * default primary handler just wakes the thread, then
1038 * the irq line is re-enabled, but the device still
1039 * has the level irq asserted. Rinse and repeat....
1040 *
1041 * While this works for edge type interrupts, we play
1042 * it safe and reject unconditionally because we can't
1043 * say for sure which type this interrupt really
1044 * has. The type flags are unreliable as the
1045 * underlying chip implementation can override them.
1046 */
1047 pr_err("genirq: Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n",
1048 irq);
1049 ret = -EINVAL;
1050 goto out_mask;
1034 } 1051 }
1035 1052
1036 if (!shared) { 1053 if (!shared) {
@@ -1078,7 +1095,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1078 1095
1079 if (nmsk != omsk) 1096 if (nmsk != omsk)
1080 /* hope the handler works with current trigger mode */ 1097 /* hope the handler works with current trigger mode */
1081 pr_warning("IRQ %d uses trigger mode %u; requested %u\n", 1098 pr_warning("genirq: irq %d uses trigger mode %u; requested %u\n",
1082 irq, nmsk, omsk); 1099 irq, nmsk, omsk);
1083 } 1100 }
1084 1101
@@ -1115,14 +1132,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1115 return 0; 1132 return 0;
1116 1133
1117mismatch: 1134mismatch:
1118#ifdef CONFIG_DEBUG_SHIRQ
1119 if (!(new->flags & IRQF_PROBE_SHARED)) { 1135 if (!(new->flags & IRQF_PROBE_SHARED)) {
1120 printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); 1136 pr_err("genirq: Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n",
1121 if (old_name) 1137 irq, new->flags, new->name, old->flags, old->name);
1122 printk(KERN_ERR "current handler: %s\n", old_name); 1138#ifdef CONFIG_DEBUG_SHIRQ
1123 dump_stack(); 1139 dump_stack();
1124 }
1125#endif 1140#endif
1141 }
1126 ret = -EBUSY; 1142 ret = -EBUSY;
1127 1143
1128out_mask: 1144out_mask:
@@ -1204,12 +1220,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1204 /* Found it - now remove it from the list of entries: */ 1220 /* Found it - now remove it from the list of entries: */
1205 *action_ptr = action->next; 1221 *action_ptr = action->next;
1206 1222
1207 /* Currently used only by UML, might disappear one day: */
1208#ifdef CONFIG_IRQ_RELEASE_METHOD
1209 if (desc->irq_data.chip->release)
1210 desc->irq_data.chip->release(irq, dev_id);
1211#endif
1212
1213 /* If this was the last handler, shut down the IRQ line: */ 1223 /* If this was the last handler, shut down the IRQ line: */
1214 if (!desc->action) 1224 if (!desc->action)
1215 irq_shutdown(desc); 1225 irq_shutdown(desc);
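The new check in __setup_irq() rejects a threaded request whose primary handler is NULL unless IRQF_ONESHOT is set, because the default primary handler only wakes the thread and a still-asserted level interrupt would immediately fire again. A minimal request that satisfies the check might look like this sketch (irq number, device name and handler body are placeholders):

#include <linux/interrupt.h>

static irqreturn_t example_thread_fn(int irq, void *dev_id)
{
        /* slow-path handling runs in the interrupt thread */
        return IRQ_HANDLED;
}

static int example_request(unsigned int irq, void *dev)
{
        /* handler == NULL, so IRQF_ONESHOT is mandatory after this change */
        return request_threaded_irq(irq, NULL, example_thread_fn,
                                    IRQF_ONESHOT, "example-dev", dev);
}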
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 15e53b1766a..cb228bf2176 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -103,8 +103,13 @@ int check_wakeup_irqs(void)
103 int irq; 103 int irq;
104 104
105 for_each_irq_desc(irq, desc) { 105 for_each_irq_desc(irq, desc) {
106 /*
107 * Only interrupts which are marked as wakeup source
108 * and have not been disabled before the suspend check
109 * can abort suspend.
110 */
106 if (irqd_is_wakeup_set(&desc->irq_data)) { 111 if (irqd_is_wakeup_set(&desc->irq_data)) {
107 if (desc->istate & IRQS_PENDING) 112 if (desc->depth == 1 && desc->istate & IRQS_PENDING)
108 return -EBUSY; 113 return -EBUSY;
109 continue; 114 continue;
110 } 115 }
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 14dd5761e8c..6454db7b6a4 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -58,10 +58,13 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
58 /* 58 /*
59 * We do not resend level type interrupts. Level type 59 * We do not resend level type interrupts. Level type
60 * interrupts are resent by hardware when they are still 60 * interrupts are resent by hardware when they are still
61 * active. 61 * active. Clear the pending bit so suspend/resume does not
62 * get confused.
62 */ 63 */
63 if (irq_settings_is_level(desc)) 64 if (irq_settings_is_level(desc)) {
65 desc->istate &= ~IRQS_PENDING;
64 return; 66 return;
67 }
65 if (desc->istate & IRQS_REPLAY) 68 if (desc->istate & IRQS_REPLAY)
66 return; 69 return;
67 if (desc->istate & IRQS_PENDING) { 70 if (desc->istate & IRQS_PENDING) {
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index c744b88c44e..59dcf5b81d2 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -402,6 +402,7 @@ unsigned int __kfifo_max_r(unsigned int len, size_t recsize)
402 return max; 402 return max;
403 return len; 403 return len;
404} 404}
405EXPORT_SYMBOL(__kfifo_max_r);
405 406
406#define __KFIFO_PEEK(data, out, mask) \ 407#define __KFIFO_PEEK(data, out, mask) \
407 ((data)[(out) & (mask)]) 408 ((data)[(out) & (mask)])
diff --git a/kernel/module.c b/kernel/module.c
index 78ac6ec1e42..4edbd9c11ac 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2429,7 +2429,8 @@ static int copy_and_check(struct load_info *info,
2429 goto free_hdr; 2429 goto free_hdr;
2430 } 2430 }
2431 2431
2432 if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) { 2432 if (hdr->e_shoff >= len ||
2433 hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) {
2433 err = -ENOEXEC; 2434 err = -ENOEXEC;
2434 goto free_hdr; 2435 goto free_hdr;
2435 } 2436 }
@@ -2953,7 +2954,7 @@ static struct module *load_module(void __user *umod,
2953 2954
2954 /* Module is ready to execute: parsing args may do that. */ 2955 /* Module is ready to execute: parsing args may do that. */
2955 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, 2956 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
2956 -32768, 32767, NULL); 2957 -32768, 32767, &ddebug_dyndbg_module_param_cb);
2957 if (err < 0) 2958 if (err < 0)
2958 goto unlink; 2959 goto unlink;
2959 2960
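The copy_and_check() hunk splits the section-table bound into two comparisons so a huge e_shoff cannot wrap the addition and slip past the check. The same idea in standalone form (a sketch with illustrative names, not the module loader's code):

#include <stdbool.h>
#include <stddef.h>

/* Overflow-safe check: does [off, off + count * size) lie inside len bytes? */
static bool range_fits(size_t len, size_t off, size_t count, size_t size)
{
        if (off >= len)
                return false;
        if (size != 0 && count > (len - off) / size)
                return false;            /* avoids computing count * size */
        return true;
}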
diff --git a/kernel/params.c b/kernel/params.c
index f37d8263134..ed35345be53 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -85,11 +85,13 @@ bool parameq(const char *a, const char *b)
85 85
86static int parse_one(char *param, 86static int parse_one(char *param,
87 char *val, 87 char *val,
88 const char *doing,
88 const struct kernel_param *params, 89 const struct kernel_param *params,
89 unsigned num_params, 90 unsigned num_params,
90 s16 min_level, 91 s16 min_level,
91 s16 max_level, 92 s16 max_level,
92 int (*handle_unknown)(char *param, char *val)) 93 int (*handle_unknown)(char *param, char *val,
94 const char *doing))
93{ 95{
94 unsigned int i; 96 unsigned int i;
95 int err; 97 int err;
@@ -104,8 +106,8 @@ static int parse_one(char *param,
104 if (!val && params[i].ops->set != param_set_bool 106 if (!val && params[i].ops->set != param_set_bool
105 && params[i].ops->set != param_set_bint) 107 && params[i].ops->set != param_set_bint)
106 return -EINVAL; 108 return -EINVAL;
107 pr_debug("They are equal! Calling %p\n", 109 pr_debug("handling %s with %p\n", param,
108 params[i].ops->set); 110 params[i].ops->set);
109 mutex_lock(&param_lock); 111 mutex_lock(&param_lock);
110 err = params[i].ops->set(val, &params[i]); 112 err = params[i].ops->set(val, &params[i]);
111 mutex_unlock(&param_lock); 113 mutex_unlock(&param_lock);
@@ -114,11 +116,11 @@ static int parse_one(char *param,
114 } 116 }
115 117
116 if (handle_unknown) { 118 if (handle_unknown) {
117 pr_debug("Unknown argument: calling %p\n", handle_unknown); 119 pr_debug("doing %s: %s='%s'\n", doing, param, val);
118 return handle_unknown(param, val); 120 return handle_unknown(param, val, doing);
119 } 121 }
120 122
121 pr_debug("Unknown argument `%s'\n", param); 123 pr_debug("Unknown argument '%s'\n", param);
122 return -ENOENT; 124 return -ENOENT;
123} 125}
124 126
@@ -175,49 +177,47 @@ static char *next_arg(char *args, char **param, char **val)
175} 177}
176 178
177/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ 179/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
178int parse_args(const char *name, 180int parse_args(const char *doing,
179 char *args, 181 char *args,
180 const struct kernel_param *params, 182 const struct kernel_param *params,
181 unsigned num, 183 unsigned num,
182 s16 min_level, 184 s16 min_level,
183 s16 max_level, 185 s16 max_level,
184 int (*unknown)(char *param, char *val)) 186 int (*unknown)(char *param, char *val, const char *doing))
185{ 187{
186 char *param, *val; 188 char *param, *val;
187 189
188 pr_debug("Parsing ARGS: %s\n", args);
189
190 /* Chew leading spaces */ 190 /* Chew leading spaces */
191 args = skip_spaces(args); 191 args = skip_spaces(args);
192 192
193 if (*args)
194 pr_debug("doing %s, parsing ARGS: '%s'\n", doing, args);
195
193 while (*args) { 196 while (*args) {
194 int ret; 197 int ret;
195 int irq_was_disabled; 198 int irq_was_disabled;
196 199
197 args = next_arg(args, &param, &val); 200 args = next_arg(args, &param, &val);
198 irq_was_disabled = irqs_disabled(); 201 irq_was_disabled = irqs_disabled();
199 ret = parse_one(param, val, params, num, 202 ret = parse_one(param, val, doing, params, num,
200 min_level, max_level, unknown); 203 min_level, max_level, unknown);
201 if (irq_was_disabled && !irqs_disabled()) { 204 if (irq_was_disabled && !irqs_disabled())
202 printk(KERN_WARNING "parse_args(): option '%s' enabled " 205 pr_warn("%s: option '%s' enabled irq's!\n",
203 "irq's!\n", param); 206 doing, param);
204 } 207
205 switch (ret) { 208 switch (ret) {
206 case -ENOENT: 209 case -ENOENT:
207 printk(KERN_ERR "%s: Unknown parameter `%s'\n", 210 pr_err("%s: Unknown parameter `%s'\n", doing, param);
208 name, param);
209 return ret; 211 return ret;
210 case -ENOSPC: 212 case -ENOSPC:
211 printk(KERN_ERR 213 pr_err("%s: `%s' too large for parameter `%s'\n",
212 "%s: `%s' too large for parameter `%s'\n", 214 doing, val ?: "", param);
213 name, val ?: "", param);
214 return ret; 215 return ret;
215 case 0: 216 case 0:
216 break; 217 break;
217 default: 218 default:
218 printk(KERN_ERR 219 pr_err("%s: `%s' invalid for parameter `%s'\n",
219 "%s: `%s' invalid for parameter `%s'\n", 220 doing, val ?: "", param);
220 name, val ?: "", param);
221 return ret; 221 return ret;
222 } 222 }
223 } 223 }
@@ -263,8 +263,7 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
263int param_set_charp(const char *val, const struct kernel_param *kp) 263int param_set_charp(const char *val, const struct kernel_param *kp)
264{ 264{
265 if (strlen(val) > 1024) { 265 if (strlen(val) > 1024) {
266 printk(KERN_ERR "%s: string parameter too long\n", 266 pr_err("%s: string parameter too long\n", kp->name);
267 kp->name);
268 return -ENOSPC; 267 return -ENOSPC;
269 } 268 }
270 269
@@ -400,8 +399,7 @@ static int param_array(const char *name,
400 int len; 399 int len;
401 400
402 if (*num == max) { 401 if (*num == max) {
403 printk(KERN_ERR "%s: can only take %i arguments\n", 402 pr_err("%s: can only take %i arguments\n", name, max);
404 name, max);
405 return -EINVAL; 403 return -EINVAL;
406 } 404 }
407 len = strcspn(val, ","); 405 len = strcspn(val, ",");
@@ -420,8 +418,7 @@ static int param_array(const char *name,
420 } while (save == ','); 418 } while (save == ',');
421 419
422 if (*num < min) { 420 if (*num < min) {
423 printk(KERN_ERR "%s: needs at least %i arguments\n", 421 pr_err("%s: needs at least %i arguments\n", name, min);
424 name, min);
425 return -EINVAL; 422 return -EINVAL;
426 } 423 }
427 return 0; 424 return 0;
@@ -480,7 +477,7 @@ int param_set_copystring(const char *val, const struct kernel_param *kp)
480 const struct kparam_string *kps = kp->str; 477 const struct kparam_string *kps = kp->str;
481 478
482 if (strlen(val)+1 > kps->maxlen) { 479 if (strlen(val)+1 > kps->maxlen) {
483 printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", 480 pr_err("%s: string doesn't fit in %u chars.\n",
484 kp->name, kps->maxlen-1); 481 kp->name, kps->maxlen-1);
485 return -ENOSPC; 482 return -ENOSPC;
486 } 483 }
@@ -750,11 +747,8 @@ static struct module_kobject * __init locate_module_kobject(const char *name)
750#endif 747#endif
751 if (err) { 748 if (err) {
752 kobject_put(&mk->kobj); 749 kobject_put(&mk->kobj);
753 printk(KERN_ERR 750 pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n",
754 "Module '%s' failed add to sysfs, error number %d\n",
755 name, err); 751 name, err);
756 printk(KERN_ERR
757 "The system will be unstable now.\n");
758 return NULL; 752 return NULL;
759 } 753 }
760 754
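parse_args() now carries a "doing" string (the name of the module or command line being parsed) and passes it to the unknown-parameter callback, which is how the module.c hunk above can hand unrecognised options to ddebug_dyndbg_module_param_cb(). A callback matching the new signature might look like this sketch (the body is illustrative):

#include <linux/kernel.h>
#include <linux/printk.h>

/* Illustrative handler for: int (*)(char *param, char *val, const char *doing) */
static int example_unknown_param(char *param, char *val, const char *doing)
{
        pr_debug("%s: unhandled option '%s'='%s'\n",
                 doing, param, val ? val : "");
        return 0;   /* 0 accepts/ignores it; a negative errno rejects it */
}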
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index deb5461e321..8f9b4eb974e 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -103,6 +103,33 @@ config PM_SLEEP_SMP
103 select HOTPLUG 103 select HOTPLUG
104 select HOTPLUG_CPU 104 select HOTPLUG_CPU
105 105
106config PM_AUTOSLEEP
107 bool "Opportunistic sleep"
108 depends on PM_SLEEP
109 default n
110 ---help---
111 Allow the kernel to trigger a system transition into a global sleep
112 state automatically whenever there are no active wakeup sources.
113
114config PM_WAKELOCKS
115 bool "User space wakeup sources interface"
116 depends on PM_SLEEP
117 default n
118 ---help---
119 Allow user space to create, activate and deactivate wakeup source
120 objects with the help of a sysfs-based interface.
121
122config PM_WAKELOCKS_LIMIT
123 int "Maximum number of user space wakeup sources (0 = no limit)"
124 range 0 100000
125 default 100
126 depends on PM_WAKELOCKS
127
128config PM_WAKELOCKS_GC
129 bool "Garbage collector for user space wakeup sources"
130 depends on PM_WAKELOCKS
131 default y
132
106config PM_RUNTIME 133config PM_RUNTIME
107 bool "Run-time PM core functionality" 134 bool "Run-time PM core functionality"
108 depends on !IA64_HP_SIM 135 depends on !IA64_HP_SIM
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 66d808ec525..29472bff11e 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -9,5 +9,7 @@ obj-$(CONFIG_SUSPEND) += suspend.o
9obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 9obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
10obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ 10obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
11 block_io.o 11 block_io.o
12obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o
13obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o
12 14
13obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 15obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
new file mode 100644
index 00000000000..ca304046d9e
--- /dev/null
+++ b/kernel/power/autosleep.c
@@ -0,0 +1,127 @@
1/*
2 * kernel/power/autosleep.c
3 *
4 * Opportunistic sleep support.
5 *
6 * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl>
7 */
8
9#include <linux/device.h>
10#include <linux/mutex.h>
11#include <linux/pm_wakeup.h>
12
13#include "power.h"
14
15static suspend_state_t autosleep_state;
16static struct workqueue_struct *autosleep_wq;
17/*
18 * Note: it is only safe to mutex_lock(&autosleep_lock) if a wakeup_source
19 * is active, otherwise a deadlock with try_to_suspend() is possible.
20 * Alternatively, mutex_lock_interruptible() can be used; it will then fail
21 * if an autosleep cycle tries to freeze processes.
22 */
23static DEFINE_MUTEX(autosleep_lock);
24static struct wakeup_source *autosleep_ws;
25
26static void try_to_suspend(struct work_struct *work)
27{
28 unsigned int initial_count, final_count;
29
30 if (!pm_get_wakeup_count(&initial_count, true))
31 goto out;
32
33 mutex_lock(&autosleep_lock);
34
35 if (!pm_save_wakeup_count(initial_count)) {
36 mutex_unlock(&autosleep_lock);
37 goto out;
38 }
39
40 if (autosleep_state == PM_SUSPEND_ON) {
41 mutex_unlock(&autosleep_lock);
42 return;
43 }
44 if (autosleep_state >= PM_SUSPEND_MAX)
45 hibernate();
46 else
47 pm_suspend(autosleep_state);
48
49 mutex_unlock(&autosleep_lock);
50
51 if (!pm_get_wakeup_count(&final_count, false))
52 goto out;
53
54 /*
55 * If the wakeup occurred for an unknown reason, wait to prevent the
56 * system from trying to suspend and waking up in a tight loop.
57 */
58 if (final_count == initial_count)
59 schedule_timeout_uninterruptible(HZ / 2);
60
61 out:
62 queue_up_suspend_work();
63}
64
65static DECLARE_WORK(suspend_work, try_to_suspend);
66
67void queue_up_suspend_work(void)
68{
69 if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON)
70 queue_work(autosleep_wq, &suspend_work);
71}
72
73suspend_state_t pm_autosleep_state(void)
74{
75 return autosleep_state;
76}
77
78int pm_autosleep_lock(void)
79{
80 return mutex_lock_interruptible(&autosleep_lock);
81}
82
83void pm_autosleep_unlock(void)
84{
85 mutex_unlock(&autosleep_lock);
86}
87
88int pm_autosleep_set_state(suspend_state_t state)
89{
90
91#ifndef CONFIG_HIBERNATION
92 if (state >= PM_SUSPEND_MAX)
93 return -EINVAL;
94#endif
95
96 __pm_stay_awake(autosleep_ws);
97
98 mutex_lock(&autosleep_lock);
99
100 autosleep_state = state;
101
102 __pm_relax(autosleep_ws);
103
104 if (state > PM_SUSPEND_ON) {
105 pm_wakep_autosleep_enabled(true);
106 queue_up_suspend_work();
107 } else {
108 pm_wakep_autosleep_enabled(false);
109 }
110
111 mutex_unlock(&autosleep_lock);
112 return 0;
113}
114
115int __init pm_autosleep_init(void)
116{
117 autosleep_ws = wakeup_source_register("autosleep");
118 if (!autosleep_ws)
119 return -ENOMEM;
120
121 autosleep_wq = alloc_ordered_workqueue("autosleep", 0);
122 if (autosleep_wq)
123 return 0;
124
125 wakeup_source_unregister(autosleep_ws);
126 return -ENOMEM;
127}
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index e09dfbfeece..8b53db38a27 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -25,6 +25,8 @@
25#include <linux/freezer.h> 25#include <linux/freezer.h>
26#include <linux/gfp.h> 26#include <linux/gfp.h>
27#include <linux/syscore_ops.h> 27#include <linux/syscore_ops.h>
28#include <linux/ctype.h>
29#include <linux/genhd.h>
28#include <scsi/scsi_scan.h> 30#include <scsi/scsi_scan.h>
29 31
30#include "power.h" 32#include "power.h"
@@ -722,6 +724,17 @@ static int software_resume(void)
722 724
723 /* Check if the device is there */ 725 /* Check if the device is there */
724 swsusp_resume_device = name_to_dev_t(resume_file); 726 swsusp_resume_device = name_to_dev_t(resume_file);
727
728 /*
729 * name_to_dev_t() cannot verify the partition when resume_file is in
730 * integer format (e.g. major:minor).
731 */
732 if (isdigit(resume_file[0]) && resume_wait) {
733 int partno;
734 while (!get_gendisk(swsusp_resume_device, &partno))
735 msleep(10);
736 }
737
725 if (!swsusp_resume_device) { 738 if (!swsusp_resume_device) {
726 /* 739 /*
727 * Some device discovery might still be in progress; we need 740 * Some device discovery might still be in progress; we need
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 1c12581f1c6..428f8a034e9 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -269,8 +269,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
269 return (s - buf); 269 return (s - buf);
270} 270}
271 271
272static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, 272static suspend_state_t decode_state(const char *buf, size_t n)
273 const char *buf, size_t n)
274{ 273{
275#ifdef CONFIG_SUSPEND 274#ifdef CONFIG_SUSPEND
276 suspend_state_t state = PM_SUSPEND_STANDBY; 275 suspend_state_t state = PM_SUSPEND_STANDBY;
@@ -278,27 +277,48 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
278#endif 277#endif
279 char *p; 278 char *p;
280 int len; 279 int len;
281 int error = -EINVAL;
282 280
283 p = memchr(buf, '\n', n); 281 p = memchr(buf, '\n', n);
284 len = p ? p - buf : n; 282 len = p ? p - buf : n;
285 283
286 /* First, check if we are requested to hibernate */ 284 /* Check hibernation first. */
287 if (len == 4 && !strncmp(buf, "disk", len)) { 285 if (len == 4 && !strncmp(buf, "disk", len))
288 error = hibernate(); 286 return PM_SUSPEND_MAX;
289 goto Exit;
290 }
291 287
292#ifdef CONFIG_SUSPEND 288#ifdef CONFIG_SUSPEND
293 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { 289 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++)
294 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { 290 if (*s && len == strlen(*s) && !strncmp(buf, *s, len))
295 error = pm_suspend(state); 291 return state;
296 break;
297 }
298 }
299#endif 292#endif
300 293
301 Exit: 294 return PM_SUSPEND_ON;
295}
296
297static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
298 const char *buf, size_t n)
299{
300 suspend_state_t state;
301 int error;
302
303 error = pm_autosleep_lock();
304 if (error)
305 return error;
306
307 if (pm_autosleep_state() > PM_SUSPEND_ON) {
308 error = -EBUSY;
309 goto out;
310 }
311
312 state = decode_state(buf, n);
313 if (state < PM_SUSPEND_MAX)
314 error = pm_suspend(state);
315 else if (state == PM_SUSPEND_MAX)
316 error = hibernate();
317 else
318 error = -EINVAL;
319
320 out:
321 pm_autosleep_unlock();
302 return error ? error : n; 322 return error ? error : n;
303} 323}
304 324
@@ -339,7 +359,8 @@ static ssize_t wakeup_count_show(struct kobject *kobj,
339{ 359{
340 unsigned int val; 360 unsigned int val;
341 361
342 return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR; 362 return pm_get_wakeup_count(&val, true) ?
363 sprintf(buf, "%u\n", val) : -EINTR;
343} 364}
344 365
345static ssize_t wakeup_count_store(struct kobject *kobj, 366static ssize_t wakeup_count_store(struct kobject *kobj,
@@ -347,15 +368,106 @@ static ssize_t wakeup_count_store(struct kobject *kobj,
347 const char *buf, size_t n) 368 const char *buf, size_t n)
348{ 369{
349 unsigned int val; 370 unsigned int val;
371 int error;
372
373 error = pm_autosleep_lock();
374 if (error)
375 return error;
376
377 if (pm_autosleep_state() > PM_SUSPEND_ON) {
378 error = -EBUSY;
379 goto out;
380 }
350 381
382 error = -EINVAL;
351 if (sscanf(buf, "%u", &val) == 1) { 383 if (sscanf(buf, "%u", &val) == 1) {
352 if (pm_save_wakeup_count(val)) 384 if (pm_save_wakeup_count(val))
353 return n; 385 error = n;
354 } 386 }
355 return -EINVAL; 387
388 out:
389 pm_autosleep_unlock();
390 return error;
356} 391}
357 392
358power_attr(wakeup_count); 393power_attr(wakeup_count);
394
395#ifdef CONFIG_PM_AUTOSLEEP
396static ssize_t autosleep_show(struct kobject *kobj,
397 struct kobj_attribute *attr,
398 char *buf)
399{
400 suspend_state_t state = pm_autosleep_state();
401
402 if (state == PM_SUSPEND_ON)
403 return sprintf(buf, "off\n");
404
405#ifdef CONFIG_SUSPEND
406 if (state < PM_SUSPEND_MAX)
407 return sprintf(buf, "%s\n", valid_state(state) ?
408 pm_states[state] : "error");
409#endif
410#ifdef CONFIG_HIBERNATION
411 return sprintf(buf, "disk\n");
412#else
413 return sprintf(buf, "error");
414#endif
415}
416
417static ssize_t autosleep_store(struct kobject *kobj,
418 struct kobj_attribute *attr,
419 const char *buf, size_t n)
420{
421 suspend_state_t state = decode_state(buf, n);
422 int error;
423
424 if (state == PM_SUSPEND_ON
425 && strcmp(buf, "off") && strcmp(buf, "off\n"))
426 return -EINVAL;
427
428 error = pm_autosleep_set_state(state);
429 return error ? error : n;
430}
431
432power_attr(autosleep);
433#endif /* CONFIG_PM_AUTOSLEEP */
434
435#ifdef CONFIG_PM_WAKELOCKS
436static ssize_t wake_lock_show(struct kobject *kobj,
437 struct kobj_attribute *attr,
438 char *buf)
439{
440 return pm_show_wakelocks(buf, true);
441}
442
443static ssize_t wake_lock_store(struct kobject *kobj,
444 struct kobj_attribute *attr,
445 const char *buf, size_t n)
446{
447 int error = pm_wake_lock(buf);
448 return error ? error : n;
449}
450
451power_attr(wake_lock);
452
453static ssize_t wake_unlock_show(struct kobject *kobj,
454 struct kobj_attribute *attr,
455 char *buf)
456{
457 return pm_show_wakelocks(buf, false);
458}
459
460static ssize_t wake_unlock_store(struct kobject *kobj,
461 struct kobj_attribute *attr,
462 const char *buf, size_t n)
463{
464 int error = pm_wake_unlock(buf);
465 return error ? error : n;
466}
467
468power_attr(wake_unlock);
469
470#endif /* CONFIG_PM_WAKELOCKS */
359#endif /* CONFIG_PM_SLEEP */ 471#endif /* CONFIG_PM_SLEEP */
360 472
361#ifdef CONFIG_PM_TRACE 473#ifdef CONFIG_PM_TRACE
@@ -409,6 +521,13 @@ static struct attribute * g[] = {
409#ifdef CONFIG_PM_SLEEP 521#ifdef CONFIG_PM_SLEEP
410 &pm_async_attr.attr, 522 &pm_async_attr.attr,
411 &wakeup_count_attr.attr, 523 &wakeup_count_attr.attr,
524#ifdef CONFIG_PM_AUTOSLEEP
525 &autosleep_attr.attr,
526#endif
527#ifdef CONFIG_PM_WAKELOCKS
528 &wake_lock_attr.attr,
529 &wake_unlock_attr.attr,
530#endif
412#ifdef CONFIG_PM_DEBUG 531#ifdef CONFIG_PM_DEBUG
413 &pm_test_attr.attr, 532 &pm_test_attr.attr,
414#endif 533#endif
@@ -444,7 +563,10 @@ static int __init pm_init(void)
444 power_kobj = kobject_create_and_add("power", NULL); 563 power_kobj = kobject_create_and_add("power", NULL);
445 if (!power_kobj) 564 if (!power_kobj)
446 return -ENOMEM; 565 return -ENOMEM;
447 return sysfs_create_group(power_kobj, &attr_group); 566 error = sysfs_create_group(power_kobj, &attr_group);
567 if (error)
568 return error;
569 return pm_autosleep_init();
448} 570}
449 571
450core_initcall(pm_init); 572core_initcall(pm_init);
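With these changes /sys/power/state refuses direct transitions while autosleep is armed, and the new autosleep attribute accepts the same strings as state plus "off". A small user-space sketch of arming opportunistic suspend-to-RAM (assumes a kernel built with CONFIG_PM_AUTOSLEEP and a platform that supports the "mem" state; error handling is trimmed):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Arm opportunistic suspend; writing "off" disarms it again. */
        int fd = open("/sys/power/autosleep", O_WRONLY);

        if (fd < 0) {
                perror("open /sys/power/autosleep");
                return 1;
        }
        if (write(fd, "mem", 3) != 3)
                perror("write");
        close(fd);
        return 0;
}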
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 98f3622d740..b0bd4beaebf 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -264,3 +264,30 @@ static inline void suspend_thaw_processes(void)
264{ 264{
265} 265}
266#endif 266#endif
267
268#ifdef CONFIG_PM_AUTOSLEEP
269
270/* kernel/power/autosleep.c */
271extern int pm_autosleep_init(void);
272extern int pm_autosleep_lock(void);
273extern void pm_autosleep_unlock(void);
274extern suspend_state_t pm_autosleep_state(void);
275extern int pm_autosleep_set_state(suspend_state_t state);
276
277#else /* !CONFIG_PM_AUTOSLEEP */
278
279static inline int pm_autosleep_init(void) { return 0; }
280static inline int pm_autosleep_lock(void) { return 0; }
281static inline void pm_autosleep_unlock(void) {}
282static inline suspend_state_t pm_autosleep_state(void) { return PM_SUSPEND_ON; }
283
284#endif /* !CONFIG_PM_AUTOSLEEP */
285
286#ifdef CONFIG_PM_WAKELOCKS
287
288/* kernel/power/wakelock.c */
289extern ssize_t pm_show_wakelocks(char *buf, bool show_active);
290extern int pm_wake_lock(const char *buf);
291extern int pm_wake_unlock(const char *buf);
292
293#endif /* !CONFIG_PM_WAKELOCKS */
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8742fd013a9..11e22c068e8 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -6,7 +6,7 @@
6 * 6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> 7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com> 9 * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com>
10 * 10 *
11 * This file is released under the GPLv2. 11 * This file is released under the GPLv2.
12 * 12 *
@@ -51,6 +51,23 @@
51 51
52#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) 52#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
53 53
54/*
55 * Number of free pages that are not in high memory.
56 */
57static inline unsigned long low_free_pages(void)
58{
59 return nr_free_pages() - nr_free_highpages();
60}
61
62/*
63 * Number of pages required to be kept free while writing the image. Always
64 * half of all available low pages before the writing starts.
65 */
66static inline unsigned long reqd_free_pages(void)
67{
68 return low_free_pages() / 2;
69}
70
54struct swap_map_page { 71struct swap_map_page {
55 sector_t entries[MAP_PAGE_ENTRIES]; 72 sector_t entries[MAP_PAGE_ENTRIES];
56 sector_t next_swap; 73 sector_t next_swap;
@@ -72,7 +89,7 @@ struct swap_map_handle {
72 sector_t cur_swap; 89 sector_t cur_swap;
73 sector_t first_sector; 90 sector_t first_sector;
74 unsigned int k; 91 unsigned int k;
75 unsigned long nr_free_pages, written; 92 unsigned long reqd_free_pages;
76 u32 crc32; 93 u32 crc32;
77}; 94};
78 95
@@ -265,14 +282,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
265 return -ENOSPC; 282 return -ENOSPC;
266 283
267 if (bio_chain) { 284 if (bio_chain) {
268 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 285 src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN |
286 __GFP_NORETRY);
269 if (src) { 287 if (src) {
270 copy_page(src, buf); 288 copy_page(src, buf);
271 } else { 289 } else {
272 ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ 290 ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */
273 if (ret) 291 if (ret)
274 return ret; 292 return ret;
275 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 293 src = (void *)__get_free_page(__GFP_WAIT |
294 __GFP_NOWARN |
295 __GFP_NORETRY);
276 if (src) { 296 if (src) {
277 copy_page(src, buf); 297 copy_page(src, buf);
278 } else { 298 } else {
@@ -316,8 +336,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
316 goto err_rel; 336 goto err_rel;
317 } 337 }
318 handle->k = 0; 338 handle->k = 0;
319 handle->nr_free_pages = nr_free_pages() >> 1; 339 handle->reqd_free_pages = reqd_free_pages();
320 handle->written = 0;
321 handle->first_sector = handle->cur_swap; 340 handle->first_sector = handle->cur_swap;
322 return 0; 341 return 0;
323err_rel: 342err_rel:
@@ -351,12 +370,17 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
351 clear_page(handle->cur); 370 clear_page(handle->cur);
352 handle->cur_swap = offset; 371 handle->cur_swap = offset;
353 handle->k = 0; 372 handle->k = 0;
354 } 373
355 if (bio_chain && ++handle->written > handle->nr_free_pages) { 374 if (bio_chain && low_free_pages() <= handle->reqd_free_pages) {
356 error = hib_wait_on_bio_chain(bio_chain); 375 error = hib_wait_on_bio_chain(bio_chain);
357 if (error) 376 if (error)
358 goto out; 377 goto out;
359 handle->written = 0; 378 /*
379 * Recalculate the number of required free pages, to
380 * make sure we never take more than half.
381 */
382 handle->reqd_free_pages = reqd_free_pages();
383 }
360 } 384 }
361 out: 385 out:
362 return error; 386 return error;
@@ -403,8 +427,9 @@ static int swap_writer_finish(struct swap_map_handle *handle,
403/* Maximum number of threads for compression/decompression. */ 427/* Maximum number of threads for compression/decompression. */
404#define LZO_THREADS 3 428#define LZO_THREADS 3
405 429
406/* Maximum number of pages for read buffering. */ 430/* Minimum/maximum number of pages for read buffering. */
407#define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8) 431#define LZO_MIN_RD_PAGES 1024
432#define LZO_MAX_RD_PAGES 8192
408 433
409 434
410/** 435/**
@@ -615,12 +640,6 @@ static int save_image_lzo(struct swap_map_handle *handle,
615 } 640 }
616 641
617 /* 642 /*
618 * Adjust number of free pages after all allocations have been done.
619 * We don't want to run out of pages when writing.
620 */
621 handle->nr_free_pages = nr_free_pages() >> 1;
622
623 /*
624 * Start the CRC32 thread. 643 * Start the CRC32 thread.
625 */ 644 */
626 init_waitqueue_head(&crc->go); 645 init_waitqueue_head(&crc->go);
@@ -641,6 +660,12 @@ static int save_image_lzo(struct swap_map_handle *handle,
641 goto out_clean; 660 goto out_clean;
642 } 661 }
643 662
663 /*
664 * Adjust the number of required free pages after all allocations have
665 * been done. We don't want to run out of pages when writing.
666 */
667 handle->reqd_free_pages = reqd_free_pages();
668
644 printk(KERN_INFO 669 printk(KERN_INFO
645 "PM: Using %u thread(s) for compression.\n" 670 "PM: Using %u thread(s) for compression.\n"
646 "PM: Compressing and saving image data (%u pages) ... ", 671 "PM: Compressing and saving image data (%u pages) ... ",
@@ -1051,7 +1076,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
1051 unsigned i, thr, run_threads, nr_threads; 1076 unsigned i, thr, run_threads, nr_threads;
1052 unsigned ring = 0, pg = 0, ring_size = 0, 1077 unsigned ring = 0, pg = 0, ring_size = 0,
1053 have = 0, want, need, asked = 0; 1078 have = 0, want, need, asked = 0;
1054 unsigned long read_pages; 1079 unsigned long read_pages = 0;
1055 unsigned char **page = NULL; 1080 unsigned char **page = NULL;
1056 struct dec_data *data = NULL; 1081 struct dec_data *data = NULL;
1057 struct crc_data *crc = NULL; 1082 struct crc_data *crc = NULL;
@@ -1063,7 +1088,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
1063 nr_threads = num_online_cpus() - 1; 1088 nr_threads = num_online_cpus() - 1;
1064 nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); 1089 nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
1065 1090
1066 page = vmalloc(sizeof(*page) * LZO_READ_PAGES); 1091 page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES);
1067 if (!page) { 1092 if (!page) {
1068 printk(KERN_ERR "PM: Failed to allocate LZO page\n"); 1093 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
1069 ret = -ENOMEM; 1094 ret = -ENOMEM;
@@ -1128,15 +1153,22 @@ static int load_image_lzo(struct swap_map_handle *handle,
1128 } 1153 }
1129 1154
1130 /* 1155 /*
1131 * Adjust number of pages for read buffering, in case we are short. 1156 * Set the number of pages for read buffering.
1157 * This is complete guesswork, because we'll only know the real
1158 * picture once prepare_image() is called, which is much later on
1159 * during the image load phase. We'll assume the worst case and
1160 * say that none of the image pages are from high memory.
1132 */ 1161 */
1133 read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1; 1162 if (low_free_pages() > snapshot_get_image_size())
1134 read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES); 1163 read_pages = (low_free_pages() - snapshot_get_image_size()) / 2;
1164 read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES);
1135 1165
1136 for (i = 0; i < read_pages; i++) { 1166 for (i = 0; i < read_pages; i++) {
1137 page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? 1167 page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
1138 __GFP_WAIT | __GFP_HIGH : 1168 __GFP_WAIT | __GFP_HIGH :
1139 __GFP_WAIT); 1169 __GFP_WAIT | __GFP_NOWARN |
1170 __GFP_NORETRY);
1171
1140 if (!page[i]) { 1172 if (!page[i]) {
1141 if (i < LZO_CMP_PAGES) { 1173 if (i < LZO_CMP_PAGES) {
1142 ring_size = i; 1174 ring_size = i;
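The read-buffer sizing above is a heuristic: take half of the low pages left over after reserving room for the image, then clamp the result to [LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES]. With illustrative numbers: 600000 free low pages and a 500000-page image give (600000 - 500000) / 2 = 50000, clamped down to 8192 buffer pages; a margin of only 3000 pages gives 1500, which stays inside the clamp; and if the image is larger than the available low pages, read_pages stays 0 and is clamped up to the 1024-page minimum.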
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
new file mode 100644
index 00000000000..c8fba338007
--- /dev/null
+++ b/kernel/power/wakelock.c
@@ -0,0 +1,259 @@
1/*
2 * kernel/power/wakelock.c
3 *
4 * User space wakeup sources support.
5 *
6 * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl>
7 *
8 * This code is based on the analogous interface allowing user space to
9 * manipulate wakelocks on Android.
10 */
11
12#include <linux/ctype.h>
13#include <linux/device.h>
14#include <linux/err.h>
15#include <linux/hrtimer.h>
16#include <linux/list.h>
17#include <linux/rbtree.h>
18#include <linux/slab.h>
19
20static DEFINE_MUTEX(wakelocks_lock);
21
22struct wakelock {
23 char *name;
24 struct rb_node node;
25 struct wakeup_source ws;
26#ifdef CONFIG_PM_WAKELOCKS_GC
27 struct list_head lru;
28#endif
29};
30
31static struct rb_root wakelocks_tree = RB_ROOT;
32
33ssize_t pm_show_wakelocks(char *buf, bool show_active)
34{
35 struct rb_node *node;
36 struct wakelock *wl;
37 char *str = buf;
38 char *end = buf + PAGE_SIZE;
39
40 mutex_lock(&wakelocks_lock);
41
42 for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) {
43 wl = rb_entry(node, struct wakelock, node);
44 if (wl->ws.active == show_active)
45 str += scnprintf(str, end - str, "%s ", wl->name);
46 }
47 if (str > buf)
48 str--;
49
50 str += scnprintf(str, end - str, "\n");
51
52 mutex_unlock(&wakelocks_lock);
53 return (str - buf);
54}
55
56#if CONFIG_PM_WAKELOCKS_LIMIT > 0
57static unsigned int number_of_wakelocks;
58
59static inline bool wakelocks_limit_exceeded(void)
60{
61 return number_of_wakelocks > CONFIG_PM_WAKELOCKS_LIMIT;
62}
63
64static inline void increment_wakelocks_number(void)
65{
66 number_of_wakelocks++;
67}
68
69static inline void decrement_wakelocks_number(void)
70{
71 number_of_wakelocks--;
72}
73#else /* CONFIG_PM_WAKELOCKS_LIMIT = 0 */
74static inline bool wakelocks_limit_exceeded(void) { return false; }
75static inline void increment_wakelocks_number(void) {}
76static inline void decrement_wakelocks_number(void) {}
77#endif /* CONFIG_PM_WAKELOCKS_LIMIT */
78
79#ifdef CONFIG_PM_WAKELOCKS_GC
80#define WL_GC_COUNT_MAX 100
81#define WL_GC_TIME_SEC 300
82
83static LIST_HEAD(wakelocks_lru_list);
84static unsigned int wakelocks_gc_count;
85
86static inline void wakelocks_lru_add(struct wakelock *wl)
87{
88 list_add(&wl->lru, &wakelocks_lru_list);
89}
90
91static inline void wakelocks_lru_most_recent(struct wakelock *wl)
92{
93 list_move(&wl->lru, &wakelocks_lru_list);
94}
95
96static void wakelocks_gc(void)
97{
98 struct wakelock *wl, *aux;
99 ktime_t now;
100
101 if (++wakelocks_gc_count <= WL_GC_COUNT_MAX)
102 return;
103
104 now = ktime_get();
105 list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) {
106 u64 idle_time_ns;
107 bool active;
108
109 spin_lock_irq(&wl->ws.lock);
110 idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time));
111 active = wl->ws.active;
112 spin_unlock_irq(&wl->ws.lock);
113
114 if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC))
115 break;
116
117 if (!active) {
118 wakeup_source_remove(&wl->ws);
119 rb_erase(&wl->node, &wakelocks_tree);
120 list_del(&wl->lru);
121 kfree(wl->name);
122 kfree(wl);
123 decrement_wakelocks_number();
124 }
125 }
126 wakelocks_gc_count = 0;
127}
128#else /* !CONFIG_PM_WAKELOCKS_GC */
129static inline void wakelocks_lru_add(struct wakelock *wl) {}
130static inline void wakelocks_lru_most_recent(struct wakelock *wl) {}
131static inline void wakelocks_gc(void) {}
132#endif /* !CONFIG_PM_WAKELOCKS_GC */
133
134static struct wakelock *wakelock_lookup_add(const char *name, size_t len,
135 bool add_if_not_found)
136{
137 struct rb_node **node = &wakelocks_tree.rb_node;
138 struct rb_node *parent = *node;
139 struct wakelock *wl;
140
141 while (*node) {
142 int diff;
143
144 parent = *node;
145 wl = rb_entry(*node, struct wakelock, node);
146 diff = strncmp(name, wl->name, len);
147 if (diff == 0) {
148 if (wl->name[len])
149 diff = -1;
150 else
151 return wl;
152 }
153 if (diff < 0)
154 node = &(*node)->rb_left;
155 else
156 node = &(*node)->rb_right;
157 }
158 if (!add_if_not_found)
159 return ERR_PTR(-EINVAL);
160
161 if (wakelocks_limit_exceeded())
162 return ERR_PTR(-ENOSPC);
163
164 /* Not found, we have to add a new one. */
165 wl = kzalloc(sizeof(*wl), GFP_KERNEL);
166 if (!wl)
167 return ERR_PTR(-ENOMEM);
168
169 wl->name = kstrndup(name, len, GFP_KERNEL);
170 if (!wl->name) {
171 kfree(wl);
172 return ERR_PTR(-ENOMEM);
173 }
174 wl->ws.name = wl->name;
175 wakeup_source_add(&wl->ws);
176 rb_link_node(&wl->node, parent, node);
177 rb_insert_color(&wl->node, &wakelocks_tree);
178 wakelocks_lru_add(wl);
179 increment_wakelocks_number();
180 return wl;
181}
182
183int pm_wake_lock(const char *buf)
184{
185 const char *str = buf;
186 struct wakelock *wl;
187 u64 timeout_ns = 0;
188 size_t len;
189 int ret = 0;
190
191 while (*str && !isspace(*str))
192 str++;
193
194 len = str - buf;
195 if (!len)
196 return -EINVAL;
197
198 if (*str && *str != '\n') {
199 /* Find out if there's a valid timeout string appended. */
200 ret = kstrtou64(skip_spaces(str), 10, &timeout_ns);
201 if (ret)
202 return -EINVAL;
203 }
204
205 mutex_lock(&wakelocks_lock);
206
207 wl = wakelock_lookup_add(buf, len, true);
208 if (IS_ERR(wl)) {
209 ret = PTR_ERR(wl);
210 goto out;
211 }
212 if (timeout_ns) {
213 u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1;
214
215 do_div(timeout_ms, NSEC_PER_MSEC);
216 __pm_wakeup_event(&wl->ws, timeout_ms);
217 } else {
218 __pm_stay_awake(&wl->ws);
219 }
220
221 wakelocks_lru_most_recent(wl);
222
223 out:
224 mutex_unlock(&wakelocks_lock);
225 return ret;
226}
227
228int pm_wake_unlock(const char *buf)
229{
230 struct wakelock *wl;
231 size_t len;
232 int ret = 0;
233
234 len = strlen(buf);
235 if (!len)
236 return -EINVAL;
237
238 if (buf[len-1] == '\n')
239 len--;
240
241 if (!len)
242 return -EINVAL;
243
244 mutex_lock(&wakelocks_lock);
245
246 wl = wakelock_lookup_add(buf, len, false);
247 if (IS_ERR(wl)) {
248 ret = PTR_ERR(wl);
249 goto out;
250 }
251 __pm_relax(&wl->ws);
252
253 wakelocks_lru_most_recent(wl);
254 wakelocks_gc();
255
256 out:
257 mutex_unlock(&wakelocks_lock);
258 return ret;
259}
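
For illustration, here is a rough user-space sketch of the argument format pm_wake_lock() above accepts ("<name>[ <timeout>]", timeout in nanoseconds) and of the round-up to milliseconds it performs before calling __pm_wakeup_event(). The helper name and the standalone program are hypothetical; only the parsing and the rounding mirror the kernel code above.

#include <ctype.h>
#include <errno.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NSEC_PER_MSEC 1000000ULL

/* Hypothetical user-space mirror of the parsing done by pm_wake_lock():
 * the name runs up to the first whitespace, an optional decimal timeout
 * in nanoseconds may follow, and the timeout is rounded up to milliseconds. */
static int parse_wakelock_arg(const char *buf, char *name, size_t name_sz,
                              uint64_t *timeout_ms)
{
        const char *str = buf;
        size_t len;

        while (*str && !isspace((unsigned char)*str))
                str++;
        len = (size_t)(str - buf);
        if (!len || len >= name_sz)
                return -EINVAL;
        memcpy(name, buf, len);
        name[len] = '\0';

        *timeout_ms = 0;
        if (*str && *str != '\n') {
                char *end;
                uint64_t timeout_ns = strtoull(str, &end, 10);

                if (end == str)
                        return -EINVAL;
                /* same round-up as the kernel: (ns + NSEC_PER_MSEC - 1) / NSEC_PER_MSEC */
                *timeout_ms = (timeout_ns + NSEC_PER_MSEC - 1) / NSEC_PER_MSEC;
        }
        return 0;
}

int main(void)
{
        char name[64];
        uint64_t ms;

        if (!parse_wakelock_arg("mylock 1500000\n", name, sizeof(name), &ms))
                printf("%s -> %" PRIu64 " ms\n", name, ms); /* "mylock -> 2 ms" */
        return 0;
}
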
diff --git a/kernel/printk.c b/kernel/printk.c
index b663c2c95d3..32462d2b364 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -41,6 +41,7 @@
41#include <linux/cpu.h> 41#include <linux/cpu.h>
42#include <linux/notifier.h> 42#include <linux/notifier.h>
43#include <linux/rculist.h> 43#include <linux/rculist.h>
44#include <linux/poll.h>
44 45
45#include <asm/uaccess.h> 46#include <asm/uaccess.h>
46 47
@@ -54,8 +55,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
54{ 55{
55} 56}
56 57
57#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
58
59/* printk's without a loglevel use this.. */ 58/* printk's without a loglevel use this.. */
60#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL 59#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
61 60
@@ -99,24 +98,6 @@ EXPORT_SYMBOL_GPL(console_drivers);
99static int console_locked, console_suspended; 98static int console_locked, console_suspended;
100 99
101/* 100/*
102 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
103 * It is also used in interesting ways to provide interlocking in
104 * console_unlock();.
105 */
106static DEFINE_RAW_SPINLOCK(logbuf_lock);
107
108#define LOG_BUF_MASK (log_buf_len-1)
109#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
110
111/*
112 * The indices into log_buf are not constrained to log_buf_len - they
113 * must be masked before subscripting
114 */
115static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */
116static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */
117static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
118
119/*
120 * If exclusive_console is non-NULL then only this console is to be printed to. 101 * If exclusive_console is non-NULL then only this console is to be printed to.
121 */ 102 */
122static struct console *exclusive_console; 103static struct console *exclusive_console;
@@ -145,13 +126,491 @@ EXPORT_SYMBOL(console_set_on_cmdline);
145/* Flag: console code may call schedule() */ 126/* Flag: console code may call schedule() */
146static int console_may_schedule; 127static int console_may_schedule;
147 128
129/*
130 * The printk log buffer consists of a chain of concatenated variable
131 * length records. Every record starts with a record header, containing
132 * the overall length of the record.
133 *
134 * The heads of the first and last entries in the buffer, as well as the
135 * sequence numbers of both entries, are maintained when messages
136 * are stored.
137 *
138 * If the heads indicate available messages, the length in the header
139 * tells the start of the next message. A length == 0 for the next message
140 * indicates a wrap-around to the beginning of the buffer.
141 *
142 * Every record carries the monotonic timestamp in nanoseconds, as well as
143 * the standard userspace syslog level and syslog facility. The usual
144 * kernel messages use LOG_KERN; userspace-injected messages always carry
145 * a matching syslog facility, by default LOG_USER. The origin of every
146 * message can be reliably determined that way.
147 *
148 * The human readable log message directly follows the message header. The
149 * length of the message text is stored in the header; the stored message
150 * is not terminated.
151 *
152 * Optionally, a message can carry a dictionary of properties (key/value pairs),
153 * to provide userspace with a machine-readable message context.
154 *
155 * Examples for well-defined, commonly used property names are:
156 * DEVICE=b12:8 device identifier
157 * b12:8 block dev_t
158 * c127:3 char dev_t
159 * n8 netdev ifindex
160 * +sound:card0 subsystem:devname
161 * SUBSYSTEM=pci driver-core subsystem name
162 *
163 * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value
164 * follows directly after a '=' character. Every property is terminated by
165 * a '\0' character. The last property is not terminated.
166 *
167 * Example of a message structure:
168 * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec
169 * 0008 34 00 record is 52 bytes long
170 * 000a 0b 00 text is 11 bytes long
171 * 000c 1f 00 dictionary is 23 bytes long
172 * 000e 03 00 LOG_KERN (facility) LOG_ERR (level)
173 * 0010 69 74 27 73 20 61 20 6c "it's a l"
174 * 69 6e 65 "ine"
175 * 001b 44 45 56 49 43 "DEVIC"
176 * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D"
177 * 52 49 56 45 52 3d 62 75 "RIVER=bu"
178 * 67 "g"
179 * 0032 00 00 padding to next message header
180 *
181 * The 'struct log' buffer header must never be directly exported to
182 * userspace; it is a kernel-private implementation detail that might
183 * need to be changed in the future, when the requirements change.
184 *
185 * /dev/kmsg exports the structured data in the following line format:
186 * "level,seqnum,timestamp;<message text>\n"
187 *
188 * The optional key/value pairs are attached as continuation lines starting
189 * with a space character and terminated by a newline. All possible
190 * non-printable characters are escaped in the "\xff" notation.
191 *
192 * Users of the export format should ignore possible additional values
193 * separated by ',', and find the message after the ';' character.
194 */
195
196struct log {
197 u64 ts_nsec; /* timestamp in nanoseconds */
198 u16 len; /* length of entire record */
199 u16 text_len; /* length of text buffer */
200 u16 dict_len; /* length of dictionary buffer */
201 u16 level; /* syslog level + facility */
202};
203
204/*
205 * The logbuf_lock protects kmsg buffer, indices, counters. It is also
206 * used in interesting ways to provide interlocking in console_unlock();
207 */
208static DEFINE_RAW_SPINLOCK(logbuf_lock);
209
210/* the next printk record to read by syslog(READ) or /proc/kmsg */
211static u64 syslog_seq;
212static u32 syslog_idx;
213
214/* index and sequence number of the first record stored in the buffer */
215static u64 log_first_seq;
216static u32 log_first_idx;
217
218/* index and sequence number of the next record to store in the buffer */
219static u64 log_next_seq;
148#ifdef CONFIG_PRINTK 220#ifdef CONFIG_PRINTK
221static u32 log_next_idx;
222
223/* the next printk record to read after the last 'clear' command */
224static u64 clear_seq;
225static u32 clear_idx;
226
227#define LOG_LINE_MAX 1024
149 228
150static char __log_buf[__LOG_BUF_LEN]; 229/* record buffer */
230#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
231#define LOG_ALIGN 4
232#else
233#define LOG_ALIGN 8
234#endif
235#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
236static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
151static char *log_buf = __log_buf; 237static char *log_buf = __log_buf;
152static int log_buf_len = __LOG_BUF_LEN; 238static u32 log_buf_len = __LOG_BUF_LEN;
153static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ 239
154static int saved_console_loglevel = -1; 240/* cpu currently holding logbuf_lock */
241static volatile unsigned int logbuf_cpu = UINT_MAX;
242
243/* human readable text of the record */
244static char *log_text(const struct log *msg)
245{
246 return (char *)msg + sizeof(struct log);
247}
248
249/* optional key/value pair dictionary attached to the record */
250static char *log_dict(const struct log *msg)
251{
252 return (char *)msg + sizeof(struct log) + msg->text_len;
253}
254
255/* get record by index; idx must point to valid msg */
256static struct log *log_from_idx(u32 idx)
257{
258 struct log *msg = (struct log *)(log_buf + idx);
259
260 /*
261 * A length == 0 record is the end of buffer marker. Wrap around and
262 * read the message at the start of the buffer.
263 */
264 if (!msg->len)
265 return (struct log *)log_buf;
266 return msg;
267}
268
269/* get next record; idx must point to valid msg */
270static u32 log_next(u32 idx)
271{
272 struct log *msg = (struct log *)(log_buf + idx);
273
274 /* length == 0 indicates the end of the buffer; wrap */
275 /*
276 * A length == 0 record is the end of buffer marker. Wrap around and
277 * read the message at the start of the buffer as *this* one, and
278 * return the one after that.
279 */
280 if (!msg->len) {
281 msg = (struct log *)log_buf;
282 return msg->len;
283 }
284 return idx + msg->len;
285}
286
287/* insert record into the buffer, discard old ones, update heads */
288static void log_store(int facility, int level,
289 const char *dict, u16 dict_len,
290 const char *text, u16 text_len)
291{
292 struct log *msg;
293 u32 size, pad_len;
294
295 /* number of '\0' padding bytes to next message */
296 size = sizeof(struct log) + text_len + dict_len;
297 pad_len = (-size) & (LOG_ALIGN - 1);
298 size += pad_len;
299
300 while (log_first_seq < log_next_seq) {
301 u32 free;
302
303 if (log_next_idx > log_first_idx)
304 free = max(log_buf_len - log_next_idx, log_first_idx);
305 else
306 free = log_first_idx - log_next_idx;
307
308 if (free > size + sizeof(struct log))
309 break;
310
311 /* drop old messages until we have enough contiguous space */
312 log_first_idx = log_next(log_first_idx);
313 log_first_seq++;
314 }
315
316 if (log_next_idx + size + sizeof(struct log) >= log_buf_len) {
317 /*
318 * This message + an additional empty header does not fit
319 * at the end of the buffer. Add an empty header with len == 0
320 * to signify a wrap around.
321 */
322 memset(log_buf + log_next_idx, 0, sizeof(struct log));
323 log_next_idx = 0;
324 }
325
326 /* fill message */
327 msg = (struct log *)(log_buf + log_next_idx);
328 memcpy(log_text(msg), text, text_len);
329 msg->text_len = text_len;
330 memcpy(log_dict(msg), dict, dict_len);
331 msg->dict_len = dict_len;
332 msg->level = (facility << 3) | (level & 7);
333 msg->ts_nsec = local_clock();
334 memset(log_dict(msg) + dict_len, 0, pad_len);
335 msg->len = sizeof(struct log) + text_len + dict_len + pad_len;
336
337 /* insert message */
338 log_next_idx += msg->len;
339 log_next_seq++;
340}
341
342/* /dev/kmsg - userspace message inject/listen interface */
343struct devkmsg_user {
344 u64 seq;
345 u32 idx;
346 struct mutex lock;
347 char buf[8192];
348};
349
350static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
351 unsigned long count, loff_t pos)
352{
353 char *buf, *line;
354 int i;
355 int level = default_message_loglevel;
356 int facility = 1; /* LOG_USER */
357 size_t len = iov_length(iv, count);
358 ssize_t ret = len;
359
360 if (len > LOG_LINE_MAX)
361 return -EINVAL;
362 buf = kmalloc(len+1, GFP_KERNEL);
363 if (buf == NULL)
364 return -ENOMEM;
365
366 line = buf;
367 for (i = 0; i < count; i++) {
368 if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len))
369 goto out;
370 line += iv[i].iov_len;
371 }
372
373 /*
374 * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace,
375 * the decimal value represents a 32-bit quantity: the lower 3 bits are
376 * the log level, the rest is the log facility.
377 *
378 * If no prefix or no userspace facility is specified, we
379 * enforce LOG_USER, to be able to reliably distinguish
380 * kernel-generated messages from userspace-injected ones.
381 */
382 line = buf;
383 if (line[0] == '<') {
384 char *endp = NULL;
385
386 i = simple_strtoul(line+1, &endp, 10);
387 if (endp && endp[0] == '>') {
388 level = i & 7;
389 if (i >> 3)
390 facility = i >> 3;
391 endp++;
392 len -= endp - line;
393 line = endp;
394 }
395 }
396 line[len] = '\0';
397
398 printk_emit(facility, level, NULL, 0, "%s", line);
399out:
400 kfree(buf);
401 return ret;
402}
403
404static ssize_t devkmsg_read(struct file *file, char __user *buf,
405 size_t count, loff_t *ppos)
406{
407 struct devkmsg_user *user = file->private_data;
408 struct log *msg;
409 u64 ts_usec;
410 size_t i;
411 size_t len;
412 ssize_t ret;
413
414 if (!user)
415 return -EBADF;
416
417 mutex_lock(&user->lock);
418 raw_spin_lock(&logbuf_lock);
419 while (user->seq == log_next_seq) {
420 if (file->f_flags & O_NONBLOCK) {
421 ret = -EAGAIN;
422 raw_spin_unlock(&logbuf_lock);
423 goto out;
424 }
425
426 raw_spin_unlock(&logbuf_lock);
427 ret = wait_event_interruptible(log_wait,
428 user->seq != log_next_seq);
429 if (ret)
430 goto out;
431 raw_spin_lock(&logbuf_lock);
432 }
433
434 if (user->seq < log_first_seq) {
435 /* our last seen message is gone, return error and reset */
436 user->idx = log_first_idx;
437 user->seq = log_first_seq;
438 ret = -EPIPE;
439 raw_spin_unlock(&logbuf_lock);
440 goto out;
441 }
442
443 msg = log_from_idx(user->idx);
444 ts_usec = msg->ts_nsec;
445 do_div(ts_usec, 1000);
446 len = sprintf(user->buf, "%u,%llu,%llu;",
447 msg->level, user->seq, ts_usec);
448
449 /* escape non-printable characters */
450 for (i = 0; i < msg->text_len; i++) {
451 unsigned char c = log_text(msg)[i];
452
453 if (c < ' ' || c >= 128)
454 len += sprintf(user->buf + len, "\\x%02x", c);
455 else
456 user->buf[len++] = c;
457 }
458 user->buf[len++] = '\n';
459
460 if (msg->dict_len) {
461 bool line = true;
462
463 for (i = 0; i < msg->dict_len; i++) {
464 unsigned char c = log_dict(msg)[i];
465
466 if (line) {
467 user->buf[len++] = ' ';
468 line = false;
469 }
470
471 if (c == '\0') {
472 user->buf[len++] = '\n';
473 line = true;
474 continue;
475 }
476
477 if (c < ' ' || c >= 128) {
478 len += sprintf(user->buf + len, "\\x%02x", c);
479 continue;
480 }
481
482 user->buf[len++] = c;
483 }
484 user->buf[len++] = '\n';
485 }
486
487 user->idx = log_next(user->idx);
488 user->seq++;
489 raw_spin_unlock(&logbuf_lock);
490
491 if (len > count) {
492 ret = -EINVAL;
493 goto out;
494 }
495
496 if (copy_to_user(buf, user->buf, len)) {
497 ret = -EFAULT;
498 goto out;
499 }
500 ret = len;
501out:
502 mutex_unlock(&user->lock);
503 return ret;
504}
505
506static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
507{
508 struct devkmsg_user *user = file->private_data;
509 loff_t ret = 0;
510
511 if (!user)
512 return -EBADF;
513 if (offset)
514 return -ESPIPE;
515
516 raw_spin_lock(&logbuf_lock);
517 switch (whence) {
518 case SEEK_SET:
519 /* the first record */
520 user->idx = log_first_idx;
521 user->seq = log_first_seq;
522 break;
523 case SEEK_DATA:
524 /*
525 * The first record after the last SYSLOG_ACTION_CLEAR,
526 * as issued by 'dmesg -c'. Reading /dev/kmsg itself
527 * changes no global state, and does not clear anything.
528 */
529 user->idx = clear_idx;
530 user->seq = clear_seq;
531 break;
532 case SEEK_END:
533 /* after the last record */
534 user->idx = log_next_idx;
535 user->seq = log_next_seq;
536 break;
537 default:
538 ret = -EINVAL;
539 }
540 raw_spin_unlock(&logbuf_lock);
541 return ret;
542}
543
544static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
545{
546 struct devkmsg_user *user = file->private_data;
547 int ret = 0;
548
549 if (!user)
550 return POLLERR|POLLNVAL;
551
552 poll_wait(file, &log_wait, wait);
553
554 raw_spin_lock(&logbuf_lock);
555 if (user->seq < log_next_seq) {
556 /* return error when data has vanished underneath us */
557 if (user->seq < log_first_seq)
558 ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
559 else ret = POLLIN|POLLRDNORM;
560 }
561 raw_spin_unlock(&logbuf_lock);
562
563 return ret;
564}
565
566static int devkmsg_open(struct inode *inode, struct file *file)
567{
568 struct devkmsg_user *user;
569 int err;
570
571 /* write-only does not need any file context */
572 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
573 return 0;
574
575 err = security_syslog(SYSLOG_ACTION_READ_ALL);
576 if (err)
577 return err;
578
579 user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
580 if (!user)
581 return -ENOMEM;
582
583 mutex_init(&user->lock);
584
585 raw_spin_lock(&logbuf_lock);
586 user->idx = log_first_idx;
587 user->seq = log_first_seq;
588 raw_spin_unlock(&logbuf_lock);
589
590 file->private_data = user;
591 return 0;
592}
593
594static int devkmsg_release(struct inode *inode, struct file *file)
595{
596 struct devkmsg_user *user = file->private_data;
597
598 if (!user)
599 return 0;
600
601 mutex_destroy(&user->lock);
602 kfree(user);
603 return 0;
604}
605
606const struct file_operations kmsg_fops = {
607 .open = devkmsg_open,
608 .read = devkmsg_read,
609 .aio_write = devkmsg_writev,
610 .llseek = devkmsg_llseek,
611 .poll = devkmsg_poll,
612 .release = devkmsg_release,
613};
155 614
156#ifdef CONFIG_KEXEC 615#ifdef CONFIG_KEXEC
157/* 616/*
@@ -165,9 +624,9 @@ static int saved_console_loglevel = -1;
165void log_buf_kexec_setup(void) 624void log_buf_kexec_setup(void)
166{ 625{
167 VMCOREINFO_SYMBOL(log_buf); 626 VMCOREINFO_SYMBOL(log_buf);
168 VMCOREINFO_SYMBOL(log_end);
169 VMCOREINFO_SYMBOL(log_buf_len); 627 VMCOREINFO_SYMBOL(log_buf_len);
170 VMCOREINFO_SYMBOL(logged_chars); 628 VMCOREINFO_SYMBOL(log_first_idx);
629 VMCOREINFO_SYMBOL(log_next_idx);
171} 630}
172#endif 631#endif
173 632
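
To make the record layout used by log_store() earlier in this patch concrete: a record occupies sizeof(struct log) + text_len + dict_len bytes, padded up to LOG_ALIGN so that the next header stays aligned. A minimal standalone sketch of that arithmetic, assuming the 4-byte LOG_ALIGN case and the 16-byte header implied by struct log; it reproduces the 52-byte record from the example comment block above.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define LOG_ALIGN       4       /* assumed: the !64-bit / efficient-unaligned case */
#define LOG_HDR_SIZE    16      /* sizeof(struct log): one u64 plus four u16 fields */

/* mirror of the size/padding computation in log_store() */
static uint32_t record_size(uint16_t text_len, uint16_t dict_len)
{
        uint32_t size = LOG_HDR_SIZE + text_len + dict_len;
        uint32_t pad_len = (0u - size) & (LOG_ALIGN - 1);

        return size + pad_len;
}

int main(void)
{
        /* the example record above: 11 bytes of text, 23 bytes of dictionary
         * -> 16 + 11 + 23 = 50 bytes, padded to 52 (0x34) */
        printf("%" PRIu32 "\n", record_size(11, 23));
        return 0;
}
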
@@ -191,7 +650,6 @@ early_param("log_buf_len", log_buf_len_setup);
191void __init setup_log_buf(int early) 650void __init setup_log_buf(int early)
192{ 651{
193 unsigned long flags; 652 unsigned long flags;
194 unsigned start, dest_idx, offset;
195 char *new_log_buf; 653 char *new_log_buf;
196 int free; 654 int free;
197 655
@@ -219,20 +677,8 @@ void __init setup_log_buf(int early)
219 log_buf_len = new_log_buf_len; 677 log_buf_len = new_log_buf_len;
220 log_buf = new_log_buf; 678 log_buf = new_log_buf;
221 new_log_buf_len = 0; 679 new_log_buf_len = 0;
222 free = __LOG_BUF_LEN - log_end; 680 free = __LOG_BUF_LEN - log_next_idx;
223 681 memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
224 offset = start = min(con_start, log_start);
225 dest_idx = 0;
226 while (start != log_end) {
227 unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1);
228
229 log_buf[dest_idx] = __log_buf[log_idx_mask];
230 start++;
231 dest_idx++;
232 }
233 log_start -= offset;
234 con_start -= offset;
235 log_end -= offset;
236 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 682 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
237 683
238 pr_info("log_buf_len: %d\n", log_buf_len); 684 pr_info("log_buf_len: %d\n", log_buf_len);
@@ -332,11 +778,202 @@ static int check_syslog_permissions(int type, bool from_file)
332 return 0; 778 return 0;
333} 779}
334 780
781#if defined(CONFIG_PRINTK_TIME)
782static bool printk_time = 1;
783#else
784static bool printk_time;
785#endif
786module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
787
788static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
789{
790 size_t len = 0;
791
792 if (syslog) {
793 if (buf) {
794 len += sprintf(buf, "<%u>", msg->level);
795 } else {
796 len += 3;
797 if (msg->level > 9)
798 len++;
799 if (msg->level > 99)
800 len++;
801 }
802 }
803
804 if (printk_time) {
805 if (buf) {
806 unsigned long long ts = msg->ts_nsec;
807 unsigned long rem_nsec = do_div(ts, 1000000000);
808
809 len += sprintf(buf + len, "[%5lu.%06lu] ",
810 (unsigned long) ts, rem_nsec / 1000);
811 } else {
812 len += 15;
813 }
814 }
815
816 return len;
817}
818
819static size_t msg_print_text(const struct log *msg, bool syslog,
820 char *buf, size_t size)
821{
822 const char *text = log_text(msg);
823 size_t text_size = msg->text_len;
824 size_t len = 0;
825
826 do {
827 const char *next = memchr(text, '\n', text_size);
828 size_t text_len;
829
830 if (next) {
831 text_len = next - text;
832 next++;
833 text_size -= next - text;
834 } else {
835 text_len = text_size;
836 }
837
838 if (buf) {
839 if (print_prefix(msg, syslog, NULL) +
840 text_len + 1 >= size - len)
841 break;
842
843 len += print_prefix(msg, syslog, buf + len);
844 memcpy(buf + len, text, text_len);
845 len += text_len;
846 buf[len++] = '\n';
847 } else {
848 /* size-only calculation for SYSLOG_ACTION_* buffers */
849 len += print_prefix(msg, syslog, NULL);
850 len += text_len + 1;
851 }
852
853 text = next;
854 } while (text);
855
856 return len;
857}
858
859static int syslog_print(char __user *buf, int size)
860{
861 char *text;
862 struct log *msg;
863 int len;
864
865 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
866 if (!text)
867 return -ENOMEM;
868
869 raw_spin_lock_irq(&logbuf_lock);
870 if (syslog_seq < log_first_seq) {
871 /* messages are gone, move to first one */
872 syslog_seq = log_first_seq;
873 syslog_idx = log_first_idx;
874 }
875 msg = log_from_idx(syslog_idx);
876 len = msg_print_text(msg, true, text, LOG_LINE_MAX);
877 syslog_idx = log_next(syslog_idx);
878 syslog_seq++;
879 raw_spin_unlock_irq(&logbuf_lock);
880
881 if (len > 0 && copy_to_user(buf, text, len))
882 len = -EFAULT;
883
884 kfree(text);
885 return len;
886}
887
888static int syslog_print_all(char __user *buf, int size, bool clear)
889{
890 char *text;
891 int len = 0;
892
893 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
894 if (!text)
895 return -ENOMEM;
896
897 raw_spin_lock_irq(&logbuf_lock);
898 if (buf) {
899 u64 next_seq;
900 u64 seq;
901 u32 idx;
902
903 if (clear_seq < log_first_seq) {
904 /* messages are gone, move to first available one */
905 clear_seq = log_first_seq;
906 clear_idx = log_first_idx;
907 }
908
909 /*
910 * Find the first record that, together with all following records,
911 * fits into the user-provided buffer for this dump.
912 */
913 seq = clear_seq;
914 idx = clear_idx;
915 while (seq < log_next_seq) {
916 struct log *msg = log_from_idx(idx);
917
918 len += msg_print_text(msg, true, NULL, 0);
919 idx = log_next(idx);
920 seq++;
921 }
922 seq = clear_seq;
923 idx = clear_idx;
924 while (len > size && seq < log_next_seq) {
925 struct log *msg = log_from_idx(idx);
926
927 len -= msg_print_text(msg, true, NULL, 0);
928 idx = log_next(idx);
929 seq++;
930 }
931
932 /* last message in this dump */
933 next_seq = log_next_seq;
934
935 len = 0;
936 while (len >= 0 && seq < next_seq) {
937 struct log *msg = log_from_idx(idx);
938 int textlen;
939
940 textlen = msg_print_text(msg, true, text, LOG_LINE_MAX);
941 if (textlen < 0) {
942 len = textlen;
943 break;
944 }
945 idx = log_next(idx);
946 seq++;
947
948 raw_spin_unlock_irq(&logbuf_lock);
949 if (copy_to_user(buf + len, text, textlen))
950 len = -EFAULT;
951 else
952 len += textlen;
953 raw_spin_lock_irq(&logbuf_lock);
954
955 if (seq < log_first_seq) {
956 /* messages are gone, move to next one */
957 seq = log_first_seq;
958 idx = log_first_idx;
959 }
960 }
961 }
962
963 if (clear) {
964 clear_seq = log_next_seq;
965 clear_idx = log_next_idx;
966 }
967 raw_spin_unlock_irq(&logbuf_lock);
968
969 kfree(text);
970 return len;
971}
972
335int do_syslog(int type, char __user *buf, int len, bool from_file) 973int do_syslog(int type, char __user *buf, int len, bool from_file)
336{ 974{
337 unsigned i, j, limit, count; 975 bool clear = false;
338 int do_clear = 0; 976 static int saved_console_loglevel = -1;
339 char c;
340 int error; 977 int error;
341 978
342 error = check_syslog_permissions(type, from_file); 979 error = check_syslog_permissions(type, from_file);
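
The /dev/kmsg export format documented earlier in this patch ("level,seqnum,timestamp;<message text>", with dictionary entries on continuation lines that start with a space) can be consumed from user space roughly as follows. This is a sketch only, in the spirit of 'dmesg -w': it assumes read access to /dev/kmsg and does not undo the \x escaping that devkmsg_read() applies to non-printable characters.

#include <stdio.h>
#include <string.h>

int main(void)
{
        FILE *f = fopen("/dev/kmsg", "r");
        char line[8192];

        if (!f) {
                perror("/dev/kmsg");
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                if (line[0] == ' ') {
                        /* continuation line: one "KEY=value" dictionary entry */
                        printf("    dict: %s", line + 1);
                        continue;
                }
                unsigned int level;
                unsigned long long seq, ts_usec;
                char *msg = strchr(line, ';');

                /* ignore possible extra ','-separated fields, as the comment
                 * block recommends, and take the text after the ';' */
                if (!msg || sscanf(line, "%u,%llu,%llu", &level, &seq, &ts_usec) != 3)
                        continue;
                printf("seq=%llu level=%u t=%llu.%06llus msg=%s",
                       seq, level & 7, ts_usec / 1000000, ts_usec % 1000000, msg + 1);
        }
        fclose(f);
        return 0;
}
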
@@ -364,28 +1001,14 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
364 goto out; 1001 goto out;
365 } 1002 }
366 error = wait_event_interruptible(log_wait, 1003 error = wait_event_interruptible(log_wait,
367 (log_start - log_end)); 1004 syslog_seq != log_next_seq);
368 if (error) 1005 if (error)
369 goto out; 1006 goto out;
370 i = 0; 1007 error = syslog_print(buf, len);
371 raw_spin_lock_irq(&logbuf_lock);
372 while (!error && (log_start != log_end) && i < len) {
373 c = LOG_BUF(log_start);
374 log_start++;
375 raw_spin_unlock_irq(&logbuf_lock);
376 error = __put_user(c,buf);
377 buf++;
378 i++;
379 cond_resched();
380 raw_spin_lock_irq(&logbuf_lock);
381 }
382 raw_spin_unlock_irq(&logbuf_lock);
383 if (!error)
384 error = i;
385 break; 1008 break;
386 /* Read/clear last kernel messages */ 1009 /* Read/clear last kernel messages */
387 case SYSLOG_ACTION_READ_CLEAR: 1010 case SYSLOG_ACTION_READ_CLEAR:
388 do_clear = 1; 1011 clear = true;
389 /* FALL THRU */ 1012 /* FALL THRU */
390 /* Read last kernel messages */ 1013 /* Read last kernel messages */
391 case SYSLOG_ACTION_READ_ALL: 1014 case SYSLOG_ACTION_READ_ALL:
@@ -399,52 +1022,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
399 error = -EFAULT; 1022 error = -EFAULT;
400 goto out; 1023 goto out;
401 } 1024 }
402 count = len; 1025 error = syslog_print_all(buf, len, clear);
403 if (count > log_buf_len)
404 count = log_buf_len;
405 raw_spin_lock_irq(&logbuf_lock);
406 if (count > logged_chars)
407 count = logged_chars;
408 if (do_clear)
409 logged_chars = 0;
410 limit = log_end;
411 /*
412 * __put_user() could sleep, and while we sleep
413 * printk() could overwrite the messages
414 * we try to copy to user space. Therefore
415 * the messages are copied in reverse. <manfreds>
416 */
417 for (i = 0; i < count && !error; i++) {
418 j = limit-1-i;
419 if (j + log_buf_len < log_end)
420 break;
421 c = LOG_BUF(j);
422 raw_spin_unlock_irq(&logbuf_lock);
423 error = __put_user(c,&buf[count-1-i]);
424 cond_resched();
425 raw_spin_lock_irq(&logbuf_lock);
426 }
427 raw_spin_unlock_irq(&logbuf_lock);
428 if (error)
429 break;
430 error = i;
431 if (i != count) {
432 int offset = count-error;
433 /* buffer overflow during copy, correct user buffer. */
434 for (i = 0; i < error; i++) {
435 if (__get_user(c,&buf[i+offset]) ||
436 __put_user(c,&buf[i])) {
437 error = -EFAULT;
438 break;
439 }
440 cond_resched();
441 }
442 }
443 break; 1026 break;
444 /* Clear ring buffer */ 1027 /* Clear ring buffer */
445 case SYSLOG_ACTION_CLEAR: 1028 case SYSLOG_ACTION_CLEAR:
446 logged_chars = 0; 1029 syslog_print_all(NULL, 0, true);
447 break;
448 /* Disable logging to console */ 1030 /* Disable logging to console */
449 case SYSLOG_ACTION_CONSOLE_OFF: 1031 case SYSLOG_ACTION_CONSOLE_OFF:
450 if (saved_console_loglevel == -1) 1032 if (saved_console_loglevel == -1)
@@ -472,7 +1054,35 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
472 break; 1054 break;
473 /* Number of chars in the log buffer */ 1055 /* Number of chars in the log buffer */
474 case SYSLOG_ACTION_SIZE_UNREAD: 1056 case SYSLOG_ACTION_SIZE_UNREAD:
475 error = log_end - log_start; 1057 raw_spin_lock_irq(&logbuf_lock);
1058 if (syslog_seq < log_first_seq) {
1059 /* messages are gone, move to first one */
1060 syslog_seq = log_first_seq;
1061 syslog_idx = log_first_idx;
1062 }
1063 if (from_file) {
1064 /*
1065 * Short-cut for poll() on /proc/kmsg, which simply checks
1066 * for pending data, not the size; return the count of
1067 * records, not the length.
1068 */
1069 error = log_next_idx - syslog_idx;
1070 } else {
1071 u64 seq;
1072 u32 idx;
1073
1074 error = 0;
1075 seq = syslog_seq;
1076 idx = syslog_idx;
1077 while (seq < log_next_seq) {
1078 struct log *msg = log_from_idx(idx);
1079
1080 error += msg_print_text(msg, true, NULL, 0);
1081 idx = log_next(idx);
1082 seq++;
1083 }
1084 }
1085 raw_spin_unlock_irq(&logbuf_lock);
476 break; 1086 break;
477 /* Size of the log buffer */ 1087 /* Size of the log buffer */
478 case SYSLOG_ACTION_SIZE_BUFFER: 1088 case SYSLOG_ACTION_SIZE_BUFFER:
@@ -501,29 +1111,11 @@ void kdb_syslog_data(char *syslog_data[4])
501{ 1111{
502 syslog_data[0] = log_buf; 1112 syslog_data[0] = log_buf;
503 syslog_data[1] = log_buf + log_buf_len; 1113 syslog_data[1] = log_buf + log_buf_len;
504 syslog_data[2] = log_buf + log_end - 1114 syslog_data[2] = log_buf + log_first_idx;
505 (logged_chars < log_buf_len ? logged_chars : log_buf_len); 1115 syslog_data[3] = log_buf + log_next_idx;
506 syslog_data[3] = log_buf + log_end;
507} 1116}
508#endif /* CONFIG_KGDB_KDB */ 1117#endif /* CONFIG_KGDB_KDB */
509 1118
510/*
511 * Call the console drivers on a range of log_buf
512 */
513static void __call_console_drivers(unsigned start, unsigned end)
514{
515 struct console *con;
516
517 for_each_console(con) {
518 if (exclusive_console && con != exclusive_console)
519 continue;
520 if ((con->flags & CON_ENABLED) && con->write &&
521 (cpu_online(smp_processor_id()) ||
522 (con->flags & CON_ANYTIME)))
523 con->write(con, &LOG_BUF(start), end - start);
524 }
525}
526
527static bool __read_mostly ignore_loglevel; 1119static bool __read_mostly ignore_loglevel;
528 1120
529static int __init ignore_loglevel_setup(char *str) 1121static int __init ignore_loglevel_setup(char *str)
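
For reference, the "[seconds.microseconds]" prefix that print_prefix() earlier in this patch derives from a record's ts_nsec splits the nanosecond timestamp with a divide by 10^9 and reduces the remainder to microseconds. A tiny standalone illustration of the same arithmetic, using a made-up timestamp:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
        uint64_t ts_nsec = 5123456789ULL;               /* example: ~5.12s after boot */
        uint64_t sec = ts_nsec / 1000000000ULL;         /* whole seconds */
        uint64_t rem_nsec = ts_nsec % 1000000000ULL;

        /* same layout as print_prefix(): "[%5lu.%06lu] " */
        printf("[%5" PRIu64 ".%06" PRIu64 "] \n", sec, rem_nsec / 1000);
        return 0;                                       /* prints "[    5.123456] " */
}
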
@@ -540,142 +1132,33 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
540 "print all kernel messages to the console."); 1132 "print all kernel messages to the console.");
541 1133
542/* 1134/*
543 * Write out chars from start to end - 1 inclusive
544 */
545static void _call_console_drivers(unsigned start,
546 unsigned end, int msg_log_level)
547{
548 trace_console(&LOG_BUF(0), start, end, log_buf_len);
549
550 if ((msg_log_level < console_loglevel || ignore_loglevel) &&
551 console_drivers && start != end) {
552 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
553 /* wrapped write */
554 __call_console_drivers(start & LOG_BUF_MASK,
555 log_buf_len);
556 __call_console_drivers(0, end & LOG_BUF_MASK);
557 } else {
558 __call_console_drivers(start, end);
559 }
560 }
561}
562
563/*
564 * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the
565 * lower 3 bit are the log level, the rest are the log facility. In case
566 * userspace passes usual userspace syslog messages to /dev/kmsg or
567 * /dev/ttyprintk, the log prefix might contain the facility. Printk needs
568 * to extract the correct log level for in-kernel processing, and not mangle
569 * the original value.
570 *
571 * If a prefix is found, the length of the prefix is returned. If 'level' is
572 * passed, it will be filled in with the log level without a possible facility
573 * value. If 'special' is passed, the special printk prefix chars are accepted
574 * and returned. If no valid header is found, 0 is returned and the passed
575 * variables are not touched.
576 */
577static size_t log_prefix(const char *p, unsigned int *level, char *special)
578{
579 unsigned int lev = 0;
580 char sp = '\0';
581 size_t len;
582
583 if (p[0] != '<' || !p[1])
584 return 0;
585 if (p[2] == '>') {
586 /* usual single digit level number or special char */
587 switch (p[1]) {
588 case '0' ... '7':
589 lev = p[1] - '0';
590 break;
591 case 'c': /* KERN_CONT */
592 case 'd': /* KERN_DEFAULT */
593 sp = p[1];
594 break;
595 default:
596 return 0;
597 }
598 len = 3;
599 } else {
600 /* multi digit including the level and facility number */
601 char *endp = NULL;
602
603 lev = (simple_strtoul(&p[1], &endp, 10) & 7);
604 if (endp == NULL || endp[0] != '>')
605 return 0;
606 len = (endp + 1) - p;
607 }
608
609 /* do not accept special char if not asked for */
610 if (sp && !special)
611 return 0;
612
613 if (special) {
614 *special = sp;
615 /* return special char, do not touch level */
616 if (sp)
617 return len;
618 }
619
620 if (level)
621 *level = lev;
622 return len;
623}
624
625/*
626 * Call the console drivers, asking them to write out 1135 * Call the console drivers, asking them to write out
627 * log_buf[start] to log_buf[end - 1]. 1136 * log_buf[start] to log_buf[end - 1].
628 * The console_lock must be held. 1137 * The console_lock must be held.
629 */ 1138 */
630static void call_console_drivers(unsigned start, unsigned end) 1139static void call_console_drivers(int level, const char *text, size_t len)
631{ 1140{
632 unsigned cur_index, start_print; 1141 struct console *con;
633 static int msg_level = -1;
634 1142
635 BUG_ON(((int)(start - end)) > 0); 1143 trace_console(text, 0, len, len);
636 1144
637 cur_index = start; 1145 if (level >= console_loglevel && !ignore_loglevel)
638 start_print = start; 1146 return;
639 while (cur_index != end) { 1147 if (!console_drivers)
640 if (msg_level < 0 && ((end - cur_index) > 2)) { 1148 return;
641 /* strip log prefix */
642 cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL);
643 start_print = cur_index;
644 }
645 while (cur_index != end) {
646 char c = LOG_BUF(cur_index);
647
648 cur_index++;
649 if (c == '\n') {
650 if (msg_level < 0) {
651 /*
652 * printk() has already given us loglevel tags in
653 * the buffer. This code is here in case the
654 * log buffer has wrapped right round and scribbled
655 * on those tags
656 */
657 msg_level = default_message_loglevel;
658 }
659 _call_console_drivers(start_print, cur_index, msg_level);
660 msg_level = -1;
661 start_print = cur_index;
662 break;
663 }
664 }
665 }
666 _call_console_drivers(start_print, end, msg_level);
667}
668 1149
669static void emit_log_char(char c) 1150 for_each_console(con) {
670{ 1151 if (exclusive_console && con != exclusive_console)
671 LOG_BUF(log_end) = c; 1152 continue;
672 log_end++; 1153 if (!(con->flags & CON_ENABLED))
673 if (log_end - log_start > log_buf_len) 1154 continue;
674 log_start = log_end - log_buf_len; 1155 if (!con->write)
675 if (log_end - con_start > log_buf_len) 1156 continue;
676 con_start = log_end - log_buf_len; 1157 if (!cpu_online(smp_processor_id()) &&
677 if (logged_chars < log_buf_len) 1158 !(con->flags & CON_ANYTIME))
678 logged_chars++; 1159 continue;
1160 con->write(con, text, len);
1161 }
679} 1162}
680 1163
681/* 1164/*
@@ -700,16 +1183,6 @@ static void zap_locks(void)
700 sema_init(&console_sem, 1); 1183 sema_init(&console_sem, 1);
701} 1184}
702 1185
703#if defined(CONFIG_PRINTK_TIME)
704static bool printk_time = 1;
705#else
706static bool printk_time = 0;
707#endif
708module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
709
710static bool always_kmsg_dump;
711module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
712
713/* Check if we have any console registered that can be called early in boot. */ 1186/* Check if we have any console registered that can be called early in boot. */
714static int have_callable_console(void) 1187static int have_callable_console(void)
715{ 1188{
@@ -722,51 +1195,6 @@ static int have_callable_console(void)
722 return 0; 1195 return 0;
723} 1196}
724 1197
725/**
726 * printk - print a kernel message
727 * @fmt: format string
728 *
729 * This is printk(). It can be called from any context. We want it to work.
730 *
731 * We try to grab the console_lock. If we succeed, it's easy - we log the output and
732 * call the console drivers. If we fail to get the semaphore we place the output
733 * into the log buffer and return. The current holder of the console_sem will
734 * notice the new output in console_unlock(); and will send it to the
735 * consoles before releasing the lock.
736 *
737 * One effect of this deferred printing is that code which calls printk() and
738 * then changes console_loglevel may break. This is because console_loglevel
739 * is inspected when the actual printing occurs.
740 *
741 * See also:
742 * printf(3)
743 *
744 * See the vsnprintf() documentation for format string extensions over C99.
745 */
746
747asmlinkage int printk(const char *fmt, ...)
748{
749 va_list args;
750 int r;
751
752#ifdef CONFIG_KGDB_KDB
753 if (unlikely(kdb_trap_printk)) {
754 va_start(args, fmt);
755 r = vkdb_printf(fmt, args);
756 va_end(args);
757 return r;
758 }
759#endif
760 va_start(args, fmt);
761 r = vprintk(fmt, args);
762 va_end(args);
763
764 return r;
765}
766
767/* cpu currently holding logbuf_lock */
768static volatile unsigned int printk_cpu = UINT_MAX;
769
770/* 1198/*
771 * Can we actually use the console at this time on this cpu? 1199 * Can we actually use the console at this time on this cpu?
772 * 1200 *
@@ -810,17 +1238,12 @@ static int console_trylock_for_printk(unsigned int cpu)
810 retval = 0; 1238 retval = 0;
811 } 1239 }
812 } 1240 }
813 printk_cpu = UINT_MAX; 1241 logbuf_cpu = UINT_MAX;
814 if (wake) 1242 if (wake)
815 up(&console_sem); 1243 up(&console_sem);
816 raw_spin_unlock(&logbuf_lock); 1244 raw_spin_unlock(&logbuf_lock);
817 return retval; 1245 return retval;
818} 1246}
819static const char recursion_bug_msg [] =
820 KERN_CRIT "BUG: recent printk recursion!\n";
821static int recursion_bug;
822static int new_text_line = 1;
823static char printk_buf[1024];
824 1247
825int printk_delay_msec __read_mostly; 1248int printk_delay_msec __read_mostly;
826 1249
@@ -836,15 +1259,23 @@ static inline void printk_delay(void)
836 } 1259 }
837} 1260}
838 1261
839asmlinkage int vprintk(const char *fmt, va_list args) 1262asmlinkage int vprintk_emit(int facility, int level,
1263 const char *dict, size_t dictlen,
1264 const char *fmt, va_list args)
840{ 1265{
841 int printed_len = 0; 1266 static int recursion_bug;
842 int current_log_level = default_message_loglevel; 1267 static char cont_buf[LOG_LINE_MAX];
1268 static size_t cont_len;
1269 static int cont_level;
1270 static struct task_struct *cont_task;
1271 static char textbuf[LOG_LINE_MAX];
1272 char *text = textbuf;
1273 size_t text_len;
843 unsigned long flags; 1274 unsigned long flags;
844 int this_cpu; 1275 int this_cpu;
845 char *p; 1276 bool newline = false;
846 size_t plen; 1277 bool prefix = false;
847 char special; 1278 int printed_len = 0;
848 1279
849 boot_delay_msec(); 1280 boot_delay_msec();
850 printk_delay(); 1281 printk_delay();
@@ -856,7 +1287,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
856 /* 1287 /*
857 * Ouch, printk recursed into itself! 1288 * Ouch, printk recursed into itself!
858 */ 1289 */
859 if (unlikely(printk_cpu == this_cpu)) { 1290 if (unlikely(logbuf_cpu == this_cpu)) {
860 /* 1291 /*
861 * If a crash is occurring during printk() on this CPU, 1292 * If a crash is occurring during printk() on this CPU,
862 * then try to get the crash message out but make sure 1293 * then try to get the crash message out but make sure
@@ -873,97 +1304,110 @@ asmlinkage int vprintk(const char *fmt, va_list args)
873 1304
874 lockdep_off(); 1305 lockdep_off();
875 raw_spin_lock(&logbuf_lock); 1306 raw_spin_lock(&logbuf_lock);
876 printk_cpu = this_cpu; 1307 logbuf_cpu = this_cpu;
877 1308
878 if (recursion_bug) { 1309 if (recursion_bug) {
1310 static const char recursion_msg[] =
1311 "BUG: recent printk recursion!";
1312
879 recursion_bug = 0; 1313 recursion_bug = 0;
880 strcpy(printk_buf, recursion_bug_msg); 1314 printed_len += strlen(recursion_msg);
881 printed_len = strlen(recursion_bug_msg); 1315 /* emit KERN_CRIT message */
1316 log_store(0, 2, NULL, 0, recursion_msg, printed_len);
882 } 1317 }
883 /* Emit the output into the temporary buffer */
884 printed_len += vscnprintf(printk_buf + printed_len,
885 sizeof(printk_buf) - printed_len, fmt, args);
886 1318
887 p = printk_buf; 1319 /*
1320 * The printf needs to come first; we need the syslog
1321 * prefix which might be passed-in as a parameter.
1322 */
1323 text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
888 1324
889 /* Read log level and handle special printk prefix */ 1325 /* mark and strip a trailing newline */
890 plen = log_prefix(p, &current_log_level, &special); 1326 if (text_len && text[text_len-1] == '\n') {
891 if (plen) { 1327 text_len--;
892 p += plen; 1328 newline = true;
1329 }
893 1330
894 switch (special) { 1331 /* strip syslog prefix and extract log level or control flags */
895 case 'c': /* Strip <c> KERN_CONT, continue line */ 1332 if (text[0] == '<' && text[1] && text[2] == '>') {
896 plen = 0; 1333 switch (text[1]) {
897 break; 1334 case '0' ... '7':
898 case 'd': /* Strip <d> KERN_DEFAULT, start new line */ 1335 if (level == -1)
899 plen = 0; 1336 level = text[1] - '0';
900 default: 1337 case 'd': /* KERN_DEFAULT */
901 if (!new_text_line) { 1338 prefix = true;
902 emit_log_char('\n'); 1339 case 'c': /* KERN_CONT */
903 new_text_line = 1; 1340 text += 3;
904 } 1341 text_len -= 3;
905 } 1342 }
906 } 1343 }
907 1344
908 /* 1345 if (level == -1)
909 * Copy the output into log_buf. If the caller didn't provide 1346 level = default_message_loglevel;
910 * the appropriate log prefix, we insert them here
911 */
912 for (; *p; p++) {
913 if (new_text_line) {
914 new_text_line = 0;
915
916 if (plen) {
917 /* Copy original log prefix */
918 int i;
919
920 for (i = 0; i < plen; i++)
921 emit_log_char(printk_buf[i]);
922 printed_len += plen;
923 } else {
924 /* Add log prefix */
925 emit_log_char('<');
926 emit_log_char(current_log_level + '0');
927 emit_log_char('>');
928 printed_len += 3;
929 }
930 1347
931 if (printk_time) { 1348 if (dict) {
932 /* Add the current time stamp */ 1349 prefix = true;
933 char tbuf[50], *tp; 1350 newline = true;
934 unsigned tlen; 1351 }
935 unsigned long long t;
936 unsigned long nanosec_rem;
937
938 t = cpu_clock(printk_cpu);
939 nanosec_rem = do_div(t, 1000000000);
940 tlen = sprintf(tbuf, "[%5lu.%06lu] ",
941 (unsigned long) t,
942 nanosec_rem / 1000);
943
944 for (tp = tbuf; tp < tbuf + tlen; tp++)
945 emit_log_char(*tp);
946 printed_len += tlen;
947 }
948 1352
949 if (!*p) 1353 if (!newline) {
950 break; 1354 if (cont_len && (prefix || cont_task != current)) {
1355 /*
1356 * Flush the earlier buffer: it either came from a
1357 * different thread, or a new prefix forces a flush.
1358 */
1359 log_store(facility, cont_level, NULL, 0, cont_buf, cont_len);
1360 cont_len = 0;
951 } 1361 }
952 1362
953 emit_log_char(*p); 1363 if (!cont_len) {
954 if (*p == '\n') 1364 cont_level = level;
955 new_text_line = 1; 1365 cont_task = current;
1366 }
1367
1368 /* buffer or append to earlier buffer from the same thread */
1369 if (cont_len + text_len > sizeof(cont_buf))
1370 text_len = sizeof(cont_buf) - cont_len;
1371 memcpy(cont_buf + cont_len, text, text_len);
1372 cont_len += text_len;
1373 } else {
1374 if (cont_len && cont_task == current) {
1375 if (prefix) {
1376 /*
1377 * New prefix from the same thread; flush. We
1378 * either got no earlier newline, or we race
1379 * with an interrupt.
1380 */
1381 log_store(facility, cont_level,
1382 NULL, 0, cont_buf, cont_len);
1383 cont_len = 0;
1384 }
1385
1386 /* append to the earlier buffer and flush */
1387 if (cont_len + text_len > sizeof(cont_buf))
1388 text_len = sizeof(cont_buf) - cont_len;
1389 memcpy(cont_buf + cont_len, text, text_len);
1390 cont_len += text_len;
1391 log_store(facility, cont_level,
1392 NULL, 0, cont_buf, cont_len);
1393 printed_len = cont_len;
1394 cont_len = 0;
1395 cont_task = NULL;
1396 } else {
1397 /* ordinary single and terminated line */
1398 log_store(facility, level,
1399 dict, dictlen, text, text_len);
1400 printed_len = text_len;
1401 }
956 } 1402 }
957 1403
958 /* 1404 /*
959 * Try to acquire and then immediately release the 1405 * Try to acquire and then immediately release the console semaphore.
960 * console semaphore. The release will do all the 1406 * The release will print out buffers and wake up /dev/kmsg and syslog()
961 * actual magic (print out buffers, wake up klogd, 1407 * users.
962 * etc).
963 * 1408 *
964 * The console_trylock_for_printk() function 1409 * The console_trylock_for_printk() function will release 'logbuf_lock'
965 * will release 'logbuf_lock' regardless of whether it 1410 * regardless of whether it actually gets the console semaphore or not.
966 * actually gets the semaphore or not.
967 */ 1411 */
968 if (console_trylock_for_printk(this_cpu)) 1412 if (console_trylock_for_printk(this_cpu))
969 console_unlock(); 1413 console_unlock();
@@ -974,16 +1418,81 @@ out_restore_irqs:
974 1418
975 return printed_len; 1419 return printed_len;
976} 1420}
977EXPORT_SYMBOL(printk); 1421EXPORT_SYMBOL(vprintk_emit);
978EXPORT_SYMBOL(vprintk);
979 1422
980#else 1423asmlinkage int vprintk(const char *fmt, va_list args)
1424{
1425 return vprintk_emit(0, -1, NULL, 0, fmt, args);
1426}
1427EXPORT_SYMBOL(vprintk);
981 1428
982static void call_console_drivers(unsigned start, unsigned end) 1429asmlinkage int printk_emit(int facility, int level,
1430 const char *dict, size_t dictlen,
1431 const char *fmt, ...)
983{ 1432{
1433 va_list args;
1434 int r;
1435
1436 va_start(args, fmt);
1437 r = vprintk_emit(facility, level, dict, dictlen, fmt, args);
1438 va_end(args);
1439
1440 return r;
984} 1441}
1442EXPORT_SYMBOL(printk_emit);
985 1443
1444/**
1445 * printk - print a kernel message
1446 * @fmt: format string
1447 *
1448 * This is printk(). It can be called from any context. We want it to work.
1449 *
1450 * We try to grab the console_lock. If we succeed, it's easy - we log the
1451 * output and call the console drivers. If we fail to get the semaphore, we
1452 * place the output into the log buffer and return. The current holder of
1453 * the console_sem will notice the new output in console_unlock(); and will
1454 * send it to the consoles before releasing the lock.
1455 *
1456 * One effect of this deferred printing is that code which calls printk() and
1457 * then changes console_loglevel may break. This is because console_loglevel
1458 * is inspected when the actual printing occurs.
1459 *
1460 * See also:
1461 * printf(3)
1462 *
1463 * See the vsnprintf() documentation for format string extensions over C99.
1464 */
1465asmlinkage int printk(const char *fmt, ...)
1466{
1467 va_list args;
1468 int r;
1469
1470#ifdef CONFIG_KGDB_KDB
1471 if (unlikely(kdb_trap_printk)) {
1472 va_start(args, fmt);
1473 r = vkdb_printf(fmt, args);
1474 va_end(args);
1475 return r;
1476 }
986#endif 1477#endif
1478 va_start(args, fmt);
1479 r = vprintk_emit(0, -1, NULL, 0, fmt, args);
1480 va_end(args);
1481
1482 return r;
1483}
1484EXPORT_SYMBOL(printk);
1485
1486#else
1487
1488#define LOG_LINE_MAX 0
1489static struct log *log_from_idx(u32 idx) { return NULL; }
1490static u32 log_next(u32 idx) { return 0; }
1491static void call_console_drivers(int level, const char *text, size_t len) {}
1492static size_t msg_print_text(const struct log *msg, bool syslog,
1493 char *buf, size_t size) { return 0; }
1494
1495#endif /* CONFIG_PRINTK */
987 1496
988static int __add_preferred_console(char *name, int idx, char *options, 1497static int __add_preferred_console(char *name, int idx, char *options,
989 char *brl_options) 1498 char *brl_options)
@@ -1217,7 +1726,7 @@ int is_console_locked(void)
1217} 1726}
1218 1727
1219/* 1728/*
1220 * Delayed printk facility, for scheduler-internal messages: 1729 * Delayed printk version, for scheduler-internal messages:
1221 */ 1730 */
1222#define PRINTK_BUF_SIZE 512 1731#define PRINTK_BUF_SIZE 512
1223 1732
@@ -1253,6 +1762,10 @@ void wake_up_klogd(void)
1253 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); 1762 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
1254} 1763}
1255 1764
1765/* the next printk record to write to the console */
1766static u64 console_seq;
1767static u32 console_idx;
1768
1256/** 1769/**
1257 * console_unlock - unlock the console system 1770 * console_unlock - unlock the console system
1258 * 1771 *
@@ -1263,15 +1776,16 @@ void wake_up_klogd(void)
1263 * by printk(). If this is the case, console_unlock(); emits 1776 * by printk(). If this is the case, console_unlock(); emits
1264 * the output prior to releasing the lock. 1777 * the output prior to releasing the lock.
1265 * 1778 *
1266 * If there is output waiting for klogd, we wake it up. 1779 * If there is output waiting, we wake /dev/kmsg and syslog() users.
1267 * 1780 *
1268 * console_unlock(); may be called from any context. 1781 * console_unlock(); may be called from any context.
1269 */ 1782 */
1270void console_unlock(void) 1783void console_unlock(void)
1271{ 1784{
1785 static u64 seen_seq;
1272 unsigned long flags; 1786 unsigned long flags;
1273 unsigned _con_start, _log_end; 1787 bool wake_klogd = false;
1274 unsigned wake_klogd = 0, retry = 0; 1788 bool retry;
1275 1789
1276 if (console_suspended) { 1790 if (console_suspended) {
1277 up(&console_sem); 1791 up(&console_sem);
@@ -1281,17 +1795,38 @@ void console_unlock(void)
1281 console_may_schedule = 0; 1795 console_may_schedule = 0;
1282 1796
1283again: 1797again:
1284 for ( ; ; ) { 1798 for (;;) {
1799 struct log *msg;
1800 static char text[LOG_LINE_MAX];
1801 size_t len;
1802 int level;
1803
1285 raw_spin_lock_irqsave(&logbuf_lock, flags); 1804 raw_spin_lock_irqsave(&logbuf_lock, flags);
1286 wake_klogd |= log_start - log_end; 1805 if (seen_seq != log_next_seq) {
1287 if (con_start == log_end) 1806 wake_klogd = true;
1288 break; /* Nothing to print */ 1807 seen_seq = log_next_seq;
1289 _con_start = con_start; 1808 }
1290 _log_end = log_end; 1809
1291 con_start = log_end; /* Flush */ 1810 if (console_seq < log_first_seq) {
1811 /* messages are gone, move to first one */
1812 console_seq = log_first_seq;
1813 console_idx = log_first_idx;
1814 }
1815
1816 if (console_seq == log_next_seq)
1817 break;
1818
1819 msg = log_from_idx(console_idx);
1820 level = msg->level & 7;
1821
1822 len = msg_print_text(msg, false, text, sizeof(text));
1823
1824 console_idx = log_next(console_idx);
1825 console_seq++;
1292 raw_spin_unlock(&logbuf_lock); 1826 raw_spin_unlock(&logbuf_lock);
1827
1293 stop_critical_timings(); /* don't trace print latency */ 1828 stop_critical_timings(); /* don't trace print latency */
1294 call_console_drivers(_con_start, _log_end); 1829 call_console_drivers(level, text, len);
1295 start_critical_timings(); 1830 start_critical_timings();
1296 local_irq_restore(flags); 1831 local_irq_restore(flags);
1297 } 1832 }
@@ -1312,8 +1847,7 @@ again:
1312 * flush, no worries. 1847 * flush, no worries.
1313 */ 1848 */
1314 raw_spin_lock(&logbuf_lock); 1849 raw_spin_lock(&logbuf_lock);
1315 if (con_start != log_end) 1850 retry = console_seq != log_next_seq;
1316 retry = 1;
1317 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 1851 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1318 1852
1319 if (retry && console_trylock()) 1853 if (retry && console_trylock())
@@ -1549,7 +2083,8 @@ void register_console(struct console *newcon)
1549 * for us. 2083 * for us.
1550 */ 2084 */
1551 raw_spin_lock_irqsave(&logbuf_lock, flags); 2085 raw_spin_lock_irqsave(&logbuf_lock, flags);
1552 con_start = log_start; 2086 console_seq = syslog_seq;
2087 console_idx = syslog_idx;
1553 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2088 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1554 /* 2089 /*
1555 * We're about to replay the log buffer. Only do this to the 2090 * We're about to replay the log buffer. Only do this to the
@@ -1758,6 +2293,9 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1758} 2293}
1759EXPORT_SYMBOL_GPL(kmsg_dump_unregister); 2294EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1760 2295
2296static bool always_kmsg_dump;
2297module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
2298
1761/** 2299/**
1762 * kmsg_dump - dump kernel log to kernel message dumpers. 2300 * kmsg_dump - dump kernel log to kernel message dumpers.
1763 * @reason: the reason (oops, panic etc) for dumping 2301 * @reason: the reason (oops, panic etc) for dumping
@@ -1767,8 +2305,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1767 */ 2305 */
1768void kmsg_dump(enum kmsg_dump_reason reason) 2306void kmsg_dump(enum kmsg_dump_reason reason)
1769{ 2307{
1770 unsigned long end; 2308 u64 idx;
1771 unsigned chars;
1772 struct kmsg_dumper *dumper; 2309 struct kmsg_dumper *dumper;
1773 const char *s1, *s2; 2310 const char *s1, *s2;
1774 unsigned long l1, l2; 2311 unsigned long l1, l2;
@@ -1780,24 +2317,27 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1780 /* Theoretically, the log could move on after we do this, but 2317 /* Theoretically, the log could move on after we do this, but
1781 there's not a lot we can do about that. The new messages 2318 there's not a lot we can do about that. The new messages
1782 will overwrite the start of what we dump. */ 2319 will overwrite the start of what we dump. */
2320
1783 raw_spin_lock_irqsave(&logbuf_lock, flags); 2321 raw_spin_lock_irqsave(&logbuf_lock, flags);
1784 end = log_end & LOG_BUF_MASK; 2322 if (syslog_seq < log_first_seq)
1785 chars = logged_chars; 2323 idx = syslog_idx;
1786 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2324 else
2325 idx = log_first_idx;
1787 2326
1788 if (chars > end) { 2327 if (idx > log_next_idx) {
1789 s1 = log_buf + log_buf_len - chars + end; 2328 s1 = log_buf;
1790 l1 = chars - end; 2329 l1 = log_next_idx;
1791 2330
1792 s2 = log_buf; 2331 s2 = log_buf + idx;
1793 l2 = end; 2332 l2 = log_buf_len - idx;
1794 } else { 2333 } else {
1795 s1 = ""; 2334 s1 = "";
1796 l1 = 0; 2335 l1 = 0;
1797 2336
1798 s2 = log_buf + end - chars; 2337 s2 = log_buf + idx;
1799 l2 = chars; 2338 l2 = log_next_idx - idx;
1800 } 2339 }
2340 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1801 2341
1802 rcu_read_lock(); 2342 rcu_read_lock();
1803 list_for_each_entry_rcu(dumper, &dump_list, list) 2343 list_for_each_entry_rcu(dumper, &dump_list, list)
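
The s1/l1 and s2/l2 pair that kmsg_dump() above hands to the dumpers describes the live log as at most two contiguous byte ranges, because the record buffer can wrap. A standalone sketch of the same split for a generic byte ring, where 'first' is the offset of the oldest live byte and 'next' is one past the newest; the names and the demo buffer are illustrative, not kernel API.

#include <stddef.h>
#include <stdio.h>

/*
 * Emit the live region of a ring buffer as at most two contiguous
 * segments, mirroring the idea behind the s1/l1, s2/l2 split above.
 */
static void dump_ring(const char *buf, size_t len, size_t first, size_t next)
{
        if (first > next) {
                /* wrapped: oldest data runs to the end of the buffer ... */
                fwrite(buf + first, 1, len - first, stdout);
                /* ... and continues from the start up to 'next' */
                fwrite(buf, 1, next, stdout);
        } else {
                fwrite(buf + first, 1, next - first, stdout);
        }
}

int main(void)
{
        char ring[8] = { 'E', 'F', '.', '.', 'A', 'B', 'C', 'D' };

        dump_ring(ring, sizeof(ring), 4, 2);    /* prints "ABCDEF" */
        putchar('\n');
        return 0;
}
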
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index ee8d49b9c30..a232bb59d93 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -198,15 +198,14 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
198 return 0; 198 return 0;
199 rcu_read_lock(); 199 rcu_read_lock();
200 tcred = __task_cred(task); 200 tcred = __task_cred(task);
201 if (cred->user->user_ns == tcred->user->user_ns && 201 if (uid_eq(cred->uid, tcred->euid) &&
202 (cred->uid == tcred->euid && 202 uid_eq(cred->uid, tcred->suid) &&
203 cred->uid == tcred->suid && 203 uid_eq(cred->uid, tcred->uid) &&
204 cred->uid == tcred->uid && 204 gid_eq(cred->gid, tcred->egid) &&
205 cred->gid == tcred->egid && 205 gid_eq(cred->gid, tcred->sgid) &&
206 cred->gid == tcred->sgid && 206 gid_eq(cred->gid, tcred->gid))
207 cred->gid == tcred->gid))
208 goto ok; 207 goto ok;
209 if (ptrace_has_cap(tcred->user->user_ns, mode)) 208 if (ptrace_has_cap(tcred->user_ns, mode))
210 goto ok; 209 goto ok;
211 rcu_read_unlock(); 210 rcu_read_unlock();
212 return -EPERM; 211 return -EPERM;
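
The ptrace hunk above switches from '==' on raw uid_t/gid_t values to uid_eq()/gid_eq() because, with the user-namespace work in this series, struct cred carries kernel-internal kuid_t/kgid_t values. Roughly, the id is wrapped in a struct so that stray integer comparisons no longer compile; a simplified sketch of that pattern (not the kernel's exact definitions):

#include <stdbool.h>
#include <stdio.h>

/*
 * Simplified stand-in for the kernel's kuid_t/uid_eq() pattern: wrapping
 * the id in a struct makes a bare '==' against a plain integer a compile
 * error, so every comparison goes through an explicit helper.
 */
typedef struct { unsigned int val; } kuid_t;

static inline bool uid_eq(kuid_t left, kuid_t right)
{
        return left.val == right.val;
}

int main(void)
{
        kuid_t a = { 1000 }, b = { 1000 };

        printf("%d\n", uid_eq(a, b));   /* prints 1 */
        /* "a == 1000" would not compile, which is the point of the wrapper */
        return 0;
}
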
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a86f1741cc2..95cba41ce1e 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -51,6 +51,34 @@
51 51
52#include "rcu.h" 52#include "rcu.h"
53 53
54#ifdef CONFIG_PREEMPT_RCU
55
56/*
57 * Check for a task exiting while in a preemptible-RCU read-side
58 * critical section, clean up if so. No need to issue warnings,
59 * as debug_check_no_locks_held() already does this if lockdep
60 * is enabled.
61 */
62void exit_rcu(void)
63{
64 struct task_struct *t = current;
65
66 if (likely(list_empty(&current->rcu_node_entry)))
67 return;
68 t->rcu_read_lock_nesting = 1;
69 barrier();
70 t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
71 __rcu_read_unlock();
72}
73
74#else /* #ifdef CONFIG_PREEMPT_RCU */
75
76void exit_rcu(void)
77{
78}
79
80#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
81
54#ifdef CONFIG_DEBUG_LOCK_ALLOC 82#ifdef CONFIG_DEBUG_LOCK_ALLOC
55static struct lock_class_key rcu_lock_key; 83static struct lock_class_key rcu_lock_key;
56struct lockdep_map rcu_lock_map = 84struct lockdep_map rcu_lock_map =
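exit_rcu() is consolidated here from the per-flavor plugins: a task exiting while queued as a blocked preemptible-RCU reader sets its nesting to exactly one, flags RCU_READ_UNLOCK_BLOCKED, and lets the ordinary __rcu_read_unlock() path do the dequeue and cleanup rather than duplicating it. A toy analogue of that reuse-the-unlock-path-on-exit idea, with purely illustrative state and names:

#include <stdio.h>

/* Illustrative reader-nesting state, loosely mirroring
 * t->rcu_read_lock_nesting; not the kernel's implementation. */
static int read_nesting;

static void reader_unlock(void)
{
        if (--read_nesting == 0)
                printf("outermost unlock: run blocked-reader cleanup\n");
}

/* On task exit, collapse any outstanding nesting to one level and
 * reuse the ordinary unlock path instead of open-coding cleanup. */
static void exit_cleanup(void)
{
        if (read_nesting == 0)
                return;
        read_nesting = 1;
        reader_unlock();
}

int main(void)
{
        read_nesting = 3;       /* task dies three levels deep */
        exit_cleanup();
        return 0;
}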
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 22ecea0dfb6..fc31a2d6510 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -851,22 +851,6 @@ int rcu_preempt_needs_cpu(void)
851 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; 851 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
852} 852}
853 853
854/*
855 * Check for a task exiting while in a preemptible -RCU read-side
856 * critical section, clean up if so. No need to issue warnings,
857 * as debug_check_no_locks_held() already does this if lockdep
858 * is enabled.
859 */
860void exit_rcu(void)
861{
862 struct task_struct *t = current;
863
864 if (t->rcu_read_lock_nesting == 0)
865 return;
866 t->rcu_read_lock_nesting = 1;
867 __rcu_read_unlock();
868}
869
870#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 854#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
871 855
872#ifdef CONFIG_RCU_TRACE 856#ifdef CONFIG_RCU_TRACE
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index a89b381a8c6..e66b34ab755 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -64,6 +64,7 @@ static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ 64static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff; /* Hold time within burst (us). */ 65static int fqs_holdoff; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 66static int fqs_stutter = 3; /* Wait time between bursts (s). */
67static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */
67static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ 68static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
68static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ 69static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */
69static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ 70static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
@@ -96,6 +97,8 @@ module_param(fqs_holdoff, int, 0444);
96MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 97MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
97module_param(fqs_stutter, int, 0444); 98module_param(fqs_stutter, int, 0444);
98MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 99MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
100module_param(n_barrier_cbs, int, 0444);
101MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing");
99module_param(onoff_interval, int, 0444); 102module_param(onoff_interval, int, 0444);
100MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); 103MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
101module_param(onoff_holdoff, int, 0444); 104module_param(onoff_holdoff, int, 0444);
@@ -139,6 +142,8 @@ static struct task_struct *shutdown_task;
139static struct task_struct *onoff_task; 142static struct task_struct *onoff_task;
140#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 143#endif /* #ifdef CONFIG_HOTPLUG_CPU */
141static struct task_struct *stall_task; 144static struct task_struct *stall_task;
145static struct task_struct **barrier_cbs_tasks;
146static struct task_struct *barrier_task;
142 147
143#define RCU_TORTURE_PIPE_LEN 10 148#define RCU_TORTURE_PIPE_LEN 10
144 149
@@ -164,6 +169,7 @@ static atomic_t n_rcu_torture_alloc_fail;
164static atomic_t n_rcu_torture_free; 169static atomic_t n_rcu_torture_free;
165static atomic_t n_rcu_torture_mberror; 170static atomic_t n_rcu_torture_mberror;
166static atomic_t n_rcu_torture_error; 171static atomic_t n_rcu_torture_error;
172static long n_rcu_torture_barrier_error;
167static long n_rcu_torture_boost_ktrerror; 173static long n_rcu_torture_boost_ktrerror;
168static long n_rcu_torture_boost_rterror; 174static long n_rcu_torture_boost_rterror;
169static long n_rcu_torture_boost_failure; 175static long n_rcu_torture_boost_failure;
@@ -173,6 +179,8 @@ static long n_offline_attempts;
173static long n_offline_successes; 179static long n_offline_successes;
174static long n_online_attempts; 180static long n_online_attempts;
175static long n_online_successes; 181static long n_online_successes;
182static long n_barrier_attempts;
183static long n_barrier_successes;
176static struct list_head rcu_torture_removed; 184static struct list_head rcu_torture_removed;
177static cpumask_var_t shuffle_tmp_mask; 185static cpumask_var_t shuffle_tmp_mask;
178 186
@@ -197,6 +205,10 @@ static unsigned long shutdown_time; /* jiffies to system shutdown. */
197static unsigned long boost_starttime; /* jiffies of next boost test start. */ 205static unsigned long boost_starttime; /* jiffies of next boost test start. */
198DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 206DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
199 /* and boost task create/destroy. */ 207 /* and boost task create/destroy. */
208static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */
209static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
210static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
211static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
200 212
201/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ 213/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
202 214
@@ -327,6 +339,7 @@ struct rcu_torture_ops {
327 int (*completed)(void); 339 int (*completed)(void);
328 void (*deferred_free)(struct rcu_torture *p); 340 void (*deferred_free)(struct rcu_torture *p);
329 void (*sync)(void); 341 void (*sync)(void);
342 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
330 void (*cb_barrier)(void); 343 void (*cb_barrier)(void);
331 void (*fqs)(void); 344 void (*fqs)(void);
332 int (*stats)(char *page); 345 int (*stats)(char *page);
@@ -417,6 +430,7 @@ static struct rcu_torture_ops rcu_ops = {
417 .completed = rcu_torture_completed, 430 .completed = rcu_torture_completed,
418 .deferred_free = rcu_torture_deferred_free, 431 .deferred_free = rcu_torture_deferred_free,
419 .sync = synchronize_rcu, 432 .sync = synchronize_rcu,
433 .call = call_rcu,
420 .cb_barrier = rcu_barrier, 434 .cb_barrier = rcu_barrier,
421 .fqs = rcu_force_quiescent_state, 435 .fqs = rcu_force_quiescent_state,
422 .stats = NULL, 436 .stats = NULL,
@@ -460,6 +474,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
460 .completed = rcu_torture_completed, 474 .completed = rcu_torture_completed,
461 .deferred_free = rcu_sync_torture_deferred_free, 475 .deferred_free = rcu_sync_torture_deferred_free,
462 .sync = synchronize_rcu, 476 .sync = synchronize_rcu,
477 .call = NULL,
463 .cb_barrier = NULL, 478 .cb_barrier = NULL,
464 .fqs = rcu_force_quiescent_state, 479 .fqs = rcu_force_quiescent_state,
465 .stats = NULL, 480 .stats = NULL,
@@ -477,6 +492,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
477 .completed = rcu_no_completed, 492 .completed = rcu_no_completed,
478 .deferred_free = rcu_sync_torture_deferred_free, 493 .deferred_free = rcu_sync_torture_deferred_free,
479 .sync = synchronize_rcu_expedited, 494 .sync = synchronize_rcu_expedited,
495 .call = NULL,
480 .cb_barrier = NULL, 496 .cb_barrier = NULL,
481 .fqs = rcu_force_quiescent_state, 497 .fqs = rcu_force_quiescent_state,
482 .stats = NULL, 498 .stats = NULL,
@@ -519,6 +535,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
519 .completed = rcu_bh_torture_completed, 535 .completed = rcu_bh_torture_completed,
520 .deferred_free = rcu_bh_torture_deferred_free, 536 .deferred_free = rcu_bh_torture_deferred_free,
521 .sync = synchronize_rcu_bh, 537 .sync = synchronize_rcu_bh,
538 .call = call_rcu_bh,
522 .cb_barrier = rcu_barrier_bh, 539 .cb_barrier = rcu_barrier_bh,
523 .fqs = rcu_bh_force_quiescent_state, 540 .fqs = rcu_bh_force_quiescent_state,
524 .stats = NULL, 541 .stats = NULL,
@@ -535,6 +552,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
535 .completed = rcu_bh_torture_completed, 552 .completed = rcu_bh_torture_completed,
536 .deferred_free = rcu_sync_torture_deferred_free, 553 .deferred_free = rcu_sync_torture_deferred_free,
537 .sync = synchronize_rcu_bh, 554 .sync = synchronize_rcu_bh,
555 .call = NULL,
538 .cb_barrier = NULL, 556 .cb_barrier = NULL,
539 .fqs = rcu_bh_force_quiescent_state, 557 .fqs = rcu_bh_force_quiescent_state,
540 .stats = NULL, 558 .stats = NULL,
@@ -551,6 +569,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = {
551 .completed = rcu_bh_torture_completed, 569 .completed = rcu_bh_torture_completed,
552 .deferred_free = rcu_sync_torture_deferred_free, 570 .deferred_free = rcu_sync_torture_deferred_free,
553 .sync = synchronize_rcu_bh_expedited, 571 .sync = synchronize_rcu_bh_expedited,
572 .call = NULL,
554 .cb_barrier = NULL, 573 .cb_barrier = NULL,
555 .fqs = rcu_bh_force_quiescent_state, 574 .fqs = rcu_bh_force_quiescent_state,
556 .stats = NULL, 575 .stats = NULL,
@@ -606,6 +625,11 @@ static int srcu_torture_completed(void)
606 return srcu_batches_completed(&srcu_ctl); 625 return srcu_batches_completed(&srcu_ctl);
607} 626}
608 627
628static void srcu_torture_deferred_free(struct rcu_torture *rp)
629{
630 call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb);
631}
632
609static void srcu_torture_synchronize(void) 633static void srcu_torture_synchronize(void)
610{ 634{
611 synchronize_srcu(&srcu_ctl); 635 synchronize_srcu(&srcu_ctl);
@@ -620,7 +644,7 @@ static int srcu_torture_stats(char *page)
620 cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", 644 cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):",
621 torture_type, TORTURE_FLAG, idx); 645 torture_type, TORTURE_FLAG, idx);
622 for_each_possible_cpu(cpu) { 646 for_each_possible_cpu(cpu) {
623 cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, 647 cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu,
624 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], 648 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
625 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); 649 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
626 } 650 }
@@ -635,13 +659,29 @@ static struct rcu_torture_ops srcu_ops = {
635 .read_delay = srcu_read_delay, 659 .read_delay = srcu_read_delay,
636 .readunlock = srcu_torture_read_unlock, 660 .readunlock = srcu_torture_read_unlock,
637 .completed = srcu_torture_completed, 661 .completed = srcu_torture_completed,
638 .deferred_free = rcu_sync_torture_deferred_free, 662 .deferred_free = srcu_torture_deferred_free,
639 .sync = srcu_torture_synchronize, 663 .sync = srcu_torture_synchronize,
664 .call = NULL,
640 .cb_barrier = NULL, 665 .cb_barrier = NULL,
641 .stats = srcu_torture_stats, 666 .stats = srcu_torture_stats,
642 .name = "srcu" 667 .name = "srcu"
643}; 668};
644 669
670static struct rcu_torture_ops srcu_sync_ops = {
671 .init = srcu_torture_init,
672 .cleanup = srcu_torture_cleanup,
673 .readlock = srcu_torture_read_lock,
674 .read_delay = srcu_read_delay,
675 .readunlock = srcu_torture_read_unlock,
676 .completed = srcu_torture_completed,
677 .deferred_free = rcu_sync_torture_deferred_free,
678 .sync = srcu_torture_synchronize,
679 .call = NULL,
680 .cb_barrier = NULL,
681 .stats = srcu_torture_stats,
682 .name = "srcu_sync"
683};
684
645static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) 685static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
646{ 686{
647 return srcu_read_lock_raw(&srcu_ctl); 687 return srcu_read_lock_raw(&srcu_ctl);
@@ -659,13 +699,29 @@ static struct rcu_torture_ops srcu_raw_ops = {
659 .read_delay = srcu_read_delay, 699 .read_delay = srcu_read_delay,
660 .readunlock = srcu_torture_read_unlock_raw, 700 .readunlock = srcu_torture_read_unlock_raw,
661 .completed = srcu_torture_completed, 701 .completed = srcu_torture_completed,
662 .deferred_free = rcu_sync_torture_deferred_free, 702 .deferred_free = srcu_torture_deferred_free,
663 .sync = srcu_torture_synchronize, 703 .sync = srcu_torture_synchronize,
704 .call = NULL,
664 .cb_barrier = NULL, 705 .cb_barrier = NULL,
665 .stats = srcu_torture_stats, 706 .stats = srcu_torture_stats,
666 .name = "srcu_raw" 707 .name = "srcu_raw"
667}; 708};
668 709
710static struct rcu_torture_ops srcu_raw_sync_ops = {
711 .init = srcu_torture_init,
712 .cleanup = srcu_torture_cleanup,
713 .readlock = srcu_torture_read_lock_raw,
714 .read_delay = srcu_read_delay,
715 .readunlock = srcu_torture_read_unlock_raw,
716 .completed = srcu_torture_completed,
717 .deferred_free = rcu_sync_torture_deferred_free,
718 .sync = srcu_torture_synchronize,
719 .call = NULL,
720 .cb_barrier = NULL,
721 .stats = srcu_torture_stats,
722 .name = "srcu_raw_sync"
723};
724
669static void srcu_torture_synchronize_expedited(void) 725static void srcu_torture_synchronize_expedited(void)
670{ 726{
671 synchronize_srcu_expedited(&srcu_ctl); 727 synchronize_srcu_expedited(&srcu_ctl);
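With call_srcu() available, srcu_ops and srcu_raw_ops now exercise asynchronous callbacks through srcu_torture_deferred_free(), while the new srcu_sync_ops and srcu_raw_sync_ops keep the old synchronous-only coverage. The ops tables treat .call as optional, and later code (rcu_torture_barrier_init()) has to cope with it being NULL. A small stand-alone sketch of that optional-hook pattern, with made-up names:

#include <stdio.h>

/* Illustrative ops table with an optional asynchronous hook, in the
 * spirit of rcu_torture_ops->call; the names are invented. */
struct flavor_ops {
        const char *name;
        void (*call)(void (*func)(void));       /* may be NULL */
        void (*sync)(void);
};

static void fake_async(void (*func)(void)) { func(); }
static void fake_sync(void)                { puts("synchronous grace period"); }
static void done(void)                     { puts("deferred callback invoked"); }

/* Callers must tolerate flavors that only support the synchronous
 * path -- the reason barrier testing bails out when ->call is NULL. */
static void defer_or_sync(const struct flavor_ops *ops)
{
        printf("%s: ", ops->name);
        if (ops->call)
                ops->call(done);
        else
                ops->sync();
}

int main(void)
{
        struct flavor_ops srcu_like      = { "srcu",      fake_async, fake_sync };
        struct flavor_ops srcu_sync_like = { "srcu_sync", NULL,       fake_sync };

        defer_or_sync(&srcu_like);
        defer_or_sync(&srcu_sync_like);
        return 0;
}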
@@ -680,6 +736,7 @@ static struct rcu_torture_ops srcu_expedited_ops = {
680 .completed = srcu_torture_completed, 736 .completed = srcu_torture_completed,
681 .deferred_free = rcu_sync_torture_deferred_free, 737 .deferred_free = rcu_sync_torture_deferred_free,
682 .sync = srcu_torture_synchronize_expedited, 738 .sync = srcu_torture_synchronize_expedited,
739 .call = NULL,
683 .cb_barrier = NULL, 740 .cb_barrier = NULL,
684 .stats = srcu_torture_stats, 741 .stats = srcu_torture_stats,
685 .name = "srcu_expedited" 742 .name = "srcu_expedited"
@@ -1129,7 +1186,8 @@ rcu_torture_printk(char *page)
1129 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " 1186 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1130 "rtmbe: %d rtbke: %ld rtbre: %ld " 1187 "rtmbe: %d rtbke: %ld rtbre: %ld "
1131 "rtbf: %ld rtb: %ld nt: %ld " 1188 "rtbf: %ld rtb: %ld nt: %ld "
1132 "onoff: %ld/%ld:%ld/%ld", 1189 "onoff: %ld/%ld:%ld/%ld "
1190 "barrier: %ld/%ld:%ld",
1133 rcu_torture_current, 1191 rcu_torture_current,
1134 rcu_torture_current_version, 1192 rcu_torture_current_version,
1135 list_empty(&rcu_torture_freelist), 1193 list_empty(&rcu_torture_freelist),
@@ -1145,14 +1203,17 @@ rcu_torture_printk(char *page)
1145 n_online_successes, 1203 n_online_successes,
1146 n_online_attempts, 1204 n_online_attempts,
1147 n_offline_successes, 1205 n_offline_successes,
1148 n_offline_attempts); 1206 n_offline_attempts,
1207 n_barrier_successes,
1208 n_barrier_attempts,
1209 n_rcu_torture_barrier_error);
1210 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
1149 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1211 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1212 n_rcu_torture_barrier_error != 0 ||
1150 n_rcu_torture_boost_ktrerror != 0 || 1213 n_rcu_torture_boost_ktrerror != 0 ||
1151 n_rcu_torture_boost_rterror != 0 || 1214 n_rcu_torture_boost_rterror != 0 ||
1152 n_rcu_torture_boost_failure != 0) 1215 n_rcu_torture_boost_failure != 0 ||
1153 cnt += sprintf(&page[cnt], " !!!"); 1216 i > 1) {
1154 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
1155 if (i > 1) {
1156 cnt += sprintf(&page[cnt], "!!! "); 1217 cnt += sprintf(&page[cnt], "!!! ");
1157 atomic_inc(&n_rcu_torture_error); 1218 atomic_inc(&n_rcu_torture_error);
1158 WARN_ON_ONCE(1); 1219 WARN_ON_ONCE(1);
@@ -1337,6 +1398,7 @@ static void rcutorture_booster_cleanup(int cpu)
1337 1398
1338 /* This must be outside of the mutex, otherwise deadlock! */ 1399 /* This must be outside of the mutex, otherwise deadlock! */
1339 kthread_stop(t); 1400 kthread_stop(t);
1401 boost_tasks[cpu] = NULL;
1340} 1402}
1341 1403
1342static int rcutorture_booster_init(int cpu) 1404static int rcutorture_booster_init(int cpu)
@@ -1484,13 +1546,15 @@ static void rcu_torture_onoff_cleanup(void)
1484 return; 1546 return;
1485 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); 1547 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
1486 kthread_stop(onoff_task); 1548 kthread_stop(onoff_task);
1549 onoff_task = NULL;
1487} 1550}
1488 1551
1489#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1552#else /* #ifdef CONFIG_HOTPLUG_CPU */
1490 1553
1491static void 1554static int
1492rcu_torture_onoff_init(void) 1555rcu_torture_onoff_init(void)
1493{ 1556{
1557 return 0;
1494} 1558}
1495 1559
1496static void rcu_torture_onoff_cleanup(void) 1560static void rcu_torture_onoff_cleanup(void)
@@ -1554,6 +1618,152 @@ static void rcu_torture_stall_cleanup(void)
1554 return; 1618 return;
1555 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); 1619 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task.");
1556 kthread_stop(stall_task); 1620 kthread_stop(stall_task);
1621 stall_task = NULL;
1622}
1623
1624/* Callback function for RCU barrier testing. */
1625void rcu_torture_barrier_cbf(struct rcu_head *rcu)
1626{
1627 atomic_inc(&barrier_cbs_invoked);
1628}
1629
1630/* kthread function to register callbacks used to test RCU barriers. */
1631static int rcu_torture_barrier_cbs(void *arg)
1632{
1633 long myid = (long)arg;
1634 struct rcu_head rcu;
1635
1636 init_rcu_head_on_stack(&rcu);
1637 VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started");
1638 set_user_nice(current, 19);
1639 do {
1640 wait_event(barrier_cbs_wq[myid],
1641 atomic_read(&barrier_cbs_count) == n_barrier_cbs ||
1642 kthread_should_stop() ||
1643 fullstop != FULLSTOP_DONTSTOP);
1644 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1645 break;
1646 cur_ops->call(&rcu, rcu_torture_barrier_cbf);
1647 if (atomic_dec_and_test(&barrier_cbs_count))
1648 wake_up(&barrier_wq);
1649 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1650 VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping");
1651 rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
1652 while (!kthread_should_stop())
1653 schedule_timeout_interruptible(1);
1654 cur_ops->cb_barrier();
1655 destroy_rcu_head_on_stack(&rcu);
1656 return 0;
1657}
1658
1659/* kthread function to drive and coordinate RCU barrier testing. */
1660static int rcu_torture_barrier(void *arg)
1661{
1662 int i;
1663
1664 VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting");
1665 do {
1666 atomic_set(&barrier_cbs_invoked, 0);
1667 atomic_set(&barrier_cbs_count, n_barrier_cbs);
1668 /* wake_up() path contains the required barriers. */
1669 for (i = 0; i < n_barrier_cbs; i++)
1670 wake_up(&barrier_cbs_wq[i]);
1671 wait_event(barrier_wq,
1672 atomic_read(&barrier_cbs_count) == 0 ||
1673 kthread_should_stop() ||
1674 fullstop != FULLSTOP_DONTSTOP);
1675 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1676 break;
1677 n_barrier_attempts++;
1678 cur_ops->cb_barrier();
1679 if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
1680 n_rcu_torture_barrier_error++;
1681 WARN_ON_ONCE(1);
1682 }
1683 n_barrier_successes++;
1684 schedule_timeout_interruptible(HZ / 10);
1685 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1686 VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping");
1687 rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
1688 while (!kthread_should_stop())
1689 schedule_timeout_interruptible(1);
1690 return 0;
1691}
1692
1693/* Initialize RCU barrier testing. */
1694static int rcu_torture_barrier_init(void)
1695{
1696 int i;
1697 int ret;
1698
1699 if (n_barrier_cbs == 0)
1700 return 0;
1701 if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
1702 printk(KERN_ALERT "%s" TORTURE_FLAG
1703 " Call or barrier ops missing for %s,\n",
1704 torture_type, cur_ops->name);
1705 printk(KERN_ALERT "%s" TORTURE_FLAG
1706 " RCU barrier testing omitted from run.\n",
1707 torture_type);
1708 return 0;
1709 }
1710 atomic_set(&barrier_cbs_count, 0);
1711 atomic_set(&barrier_cbs_invoked, 0);
1712 barrier_cbs_tasks =
1713 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]),
1714 GFP_KERNEL);
1715 barrier_cbs_wq =
1716 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
1717 GFP_KERNEL);
1718 if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0)
1719 return -ENOMEM;
1720 for (i = 0; i < n_barrier_cbs; i++) {
1721 init_waitqueue_head(&barrier_cbs_wq[i]);
1722 barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs,
1723 (void *)(long)i,
1724 "rcu_torture_barrier_cbs");
1725 if (IS_ERR(barrier_cbs_tasks[i])) {
1726 ret = PTR_ERR(barrier_cbs_tasks[i]);
1727 VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs");
1728 barrier_cbs_tasks[i] = NULL;
1729 return ret;
1730 }
1731 }
1732 barrier_task = kthread_run(rcu_torture_barrier, NULL,
1733 "rcu_torture_barrier");
1734 if (IS_ERR(barrier_task)) {
1735 ret = PTR_ERR(barrier_task);
1736 VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier");
1737 barrier_task = NULL;
1738 }
1739 return 0;
1740}
1741
1742/* Clean up after RCU barrier testing. */
1743static void rcu_torture_barrier_cleanup(void)
1744{
1745 int i;
1746
1747 if (barrier_task != NULL) {
1748 VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task");
1749 kthread_stop(barrier_task);
1750 barrier_task = NULL;
1751 }
1752 if (barrier_cbs_tasks != NULL) {
1753 for (i = 0; i < n_barrier_cbs; i++) {
1754 if (barrier_cbs_tasks[i] != NULL) {
1755 VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task");
1756 kthread_stop(barrier_cbs_tasks[i]);
1757 barrier_cbs_tasks[i] = NULL;
1758 }
1759 }
1760 kfree(barrier_cbs_tasks);
1761 barrier_cbs_tasks = NULL;
1762 }
1763 if (barrier_cbs_wq != NULL) {
1764 kfree(barrier_cbs_wq);
1765 barrier_cbs_wq = NULL;
1766 }
1557} 1767}
1558 1768
1559static int rcutorture_cpu_notify(struct notifier_block *self, 1769static int rcutorture_cpu_notify(struct notifier_block *self,
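The barrier test added above runs in rounds: rcu_torture_barrier() arms barrier_cbs_count, wakes every rcu_torture_barrier_cbs() kthread, waits until each of them has posted a callback, then invokes cb_barrier() and insists that every posted callback has already run. A compressed user-space analogue of that arm/post/wake handshake, with pthreads standing in for kthreads and wait queues (structure only, not the kernel code):

#include <pthread.h>
#include <stdio.h>

#define NWORKERS 4

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  all_posted = PTHREAD_COND_INITIALIZER;
static int pending = NWORKERS;          /* barrier_cbs_count analogue */
static int invoked;                     /* barrier_cbs_invoked analogue */

static void *worker(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        invoked++;                      /* stands in for posting the callback */
        if (--pending == 0)
                pthread_cond_signal(&all_posted);
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t tid[NWORKERS];
        int i;

        for (i = 0; i < NWORKERS; i++)
                pthread_create(&tid[i], NULL, worker, NULL);

        pthread_mutex_lock(&lock);
        while (pending != 0)            /* wait_event(barrier_wq, ...) analogue */
                pthread_cond_wait(&all_posted, &lock);
        pthread_mutex_unlock(&lock);

        for (i = 0; i < NWORKERS; i++)
                pthread_join(tid[i], NULL);

        /* The real test then runs cb_barrier() and requires that
         * invoked == n_barrier_cbs; anything less is a failure. */
        printf("invoked %d of %d callbacks\n", invoked, NWORKERS);
        return 0;
}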
@@ -1598,6 +1808,7 @@ rcu_torture_cleanup(void)
1598 fullstop = FULLSTOP_RMMOD; 1808 fullstop = FULLSTOP_RMMOD;
1599 mutex_unlock(&fullstop_mutex); 1809 mutex_unlock(&fullstop_mutex);
1600 unregister_reboot_notifier(&rcutorture_shutdown_nb); 1810 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1811 rcu_torture_barrier_cleanup();
1601 rcu_torture_stall_cleanup(); 1812 rcu_torture_stall_cleanup();
1602 if (stutter_task) { 1813 if (stutter_task) {
1603 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 1814 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
@@ -1665,6 +1876,7 @@ rcu_torture_cleanup(void)
1665 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); 1876 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
1666 kthread_stop(shutdown_task); 1877 kthread_stop(shutdown_task);
1667 } 1878 }
1879 shutdown_task = NULL;
1668 rcu_torture_onoff_cleanup(); 1880 rcu_torture_onoff_cleanup();
1669 1881
1670 /* Wait for all RCU callbacks to fire. */ 1882 /* Wait for all RCU callbacks to fire. */
@@ -1676,7 +1888,7 @@ rcu_torture_cleanup(void)
1676 1888
1677 if (cur_ops->cleanup) 1889 if (cur_ops->cleanup)
1678 cur_ops->cleanup(); 1890 cur_ops->cleanup();
1679 if (atomic_read(&n_rcu_torture_error)) 1891 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
1680 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1892 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1681 else if (n_online_successes != n_online_attempts || 1893 else if (n_online_successes != n_online_attempts ||
1682 n_offline_successes != n_offline_attempts) 1894 n_offline_successes != n_offline_attempts)
@@ -1692,10 +1904,12 @@ rcu_torture_init(void)
1692 int i; 1904 int i;
1693 int cpu; 1905 int cpu;
1694 int firsterr = 0; 1906 int firsterr = 0;
1907 int retval;
1695 static struct rcu_torture_ops *torture_ops[] = 1908 static struct rcu_torture_ops *torture_ops[] =
1696 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1909 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1697 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1910 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1698 &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops, 1911 &srcu_ops, &srcu_sync_ops, &srcu_raw_ops,
1912 &srcu_raw_sync_ops, &srcu_expedited_ops,
1699 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1913 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1700 1914
1701 mutex_lock(&fullstop_mutex); 1915 mutex_lock(&fullstop_mutex);
@@ -1749,6 +1963,7 @@ rcu_torture_init(void)
1749 atomic_set(&n_rcu_torture_free, 0); 1963 atomic_set(&n_rcu_torture_free, 0);
1750 atomic_set(&n_rcu_torture_mberror, 0); 1964 atomic_set(&n_rcu_torture_mberror, 0);
1751 atomic_set(&n_rcu_torture_error, 0); 1965 atomic_set(&n_rcu_torture_error, 0);
1966 n_rcu_torture_barrier_error = 0;
1752 n_rcu_torture_boost_ktrerror = 0; 1967 n_rcu_torture_boost_ktrerror = 0;
1753 n_rcu_torture_boost_rterror = 0; 1968 n_rcu_torture_boost_rterror = 0;
1754 n_rcu_torture_boost_failure = 0; 1969 n_rcu_torture_boost_failure = 0;
@@ -1872,7 +2087,6 @@ rcu_torture_init(void)
1872 test_boost_duration = 2; 2087 test_boost_duration = 2;
1873 if ((test_boost == 1 && cur_ops->can_boost) || 2088 if ((test_boost == 1 && cur_ops->can_boost) ||
1874 test_boost == 2) { 2089 test_boost == 2) {
1875 int retval;
1876 2090
1877 boost_starttime = jiffies + test_boost_interval * HZ; 2091 boost_starttime = jiffies + test_boost_interval * HZ;
1878 register_cpu_notifier(&rcutorture_cpu_nb); 2092 register_cpu_notifier(&rcutorture_cpu_nb);
@@ -1897,9 +2111,22 @@ rcu_torture_init(void)
1897 goto unwind; 2111 goto unwind;
1898 } 2112 }
1899 } 2113 }
1900 rcu_torture_onoff_init(); 2114 i = rcu_torture_onoff_init();
2115 if (i != 0) {
2116 firsterr = i;
2117 goto unwind;
2118 }
1901 register_reboot_notifier(&rcutorture_shutdown_nb); 2119 register_reboot_notifier(&rcutorture_shutdown_nb);
1902 rcu_torture_stall_init(); 2120 i = rcu_torture_stall_init();
2121 if (i != 0) {
2122 firsterr = i;
2123 goto unwind;
2124 }
2125 retval = rcu_torture_barrier_init();
2126 if (retval != 0) {
2127 firsterr = retval;
2128 goto unwind;
2129 }
1903 rcutorture_record_test_transition(); 2130 rcutorture_record_test_transition();
1904 mutex_unlock(&fullstop_mutex); 2131 mutex_unlock(&fullstop_mutex);
1905 return 0; 2132 return 0;
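rcu_torture_init() now propagates failures from the onoff, stall and barrier setup paths instead of ignoring them, so a broken sub-test aborts the module load via the unwind path. Because n_barrier_cbs is declared with 0444 permissions it can only be set at load time; enabling barrier testing would look roughly like modprobe rcutorture torture_type=rcu n_barrier_cbs=4 (an assumed invocation). Flavors whose .call is NULL, such as the *_sync and expedited variants, skip barrier testing with the alert printed by rcu_torture_barrier_init().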
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 1050d6d3922..0da7b88d92d 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -75,6 +75,8 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
75 .gpnum = -300, \ 75 .gpnum = -300, \
76 .completed = -300, \ 76 .completed = -300, \
77 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ 77 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
78 .orphan_nxttail = &structname##_state.orphan_nxtlist, \
79 .orphan_donetail = &structname##_state.orphan_donelist, \
78 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ 80 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
79 .n_force_qs = 0, \ 81 .n_force_qs = 0, \
80 .n_force_qs_ngp = 0, \ 82 .n_force_qs_ngp = 0, \
@@ -145,6 +147,13 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
145unsigned long rcutorture_testseq; 147unsigned long rcutorture_testseq;
146unsigned long rcutorture_vernum; 148unsigned long rcutorture_vernum;
147 149
150/* State information for rcu_barrier() and friends. */
151
152static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
153static atomic_t rcu_barrier_cpu_count;
154static DEFINE_MUTEX(rcu_barrier_mutex);
155static struct completion rcu_barrier_completion;
156
148/* 157/*
149 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 158 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
150 * permit this function to be invoked without holding the root rcu_node 159 * permit this function to be invoked without holding the root rcu_node
@@ -192,7 +201,6 @@ void rcu_note_context_switch(int cpu)
192{ 201{
193 trace_rcu_utilization("Start context switch"); 202 trace_rcu_utilization("Start context switch");
194 rcu_sched_qs(cpu); 203 rcu_sched_qs(cpu);
195 rcu_preempt_note_context_switch(cpu);
196 trace_rcu_utilization("End context switch"); 204 trace_rcu_utilization("End context switch");
197} 205}
198EXPORT_SYMBOL_GPL(rcu_note_context_switch); 206EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -1311,95 +1319,133 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1311#ifdef CONFIG_HOTPLUG_CPU 1319#ifdef CONFIG_HOTPLUG_CPU
1312 1320
1313/* 1321/*
1314 * Move a dying CPU's RCU callbacks to online CPU's callback list. 1322 * Send the specified CPU's RCU callbacks to the orphanage. The
1315 * Also record a quiescent state for this CPU for the current grace period. 1323 * specified CPU must be offline, and the caller must hold the
1316 * Synchronization and interrupt disabling are not required because 1324 * ->onofflock.
1317 * this function executes in stop_machine() context. Therefore, cleanup
1318 * operations that might block must be done later from the CPU_DEAD
1319 * notifier.
1320 *
1321 * Note that the outgoing CPU's bit has already been cleared in the
1322 * cpu_online_mask. This allows us to randomly pick a callback
1323 * destination from the bits set in that mask.
1324 */ 1325 */
1325static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 1326static void
1327rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1328 struct rcu_node *rnp, struct rcu_data *rdp)
1326{ 1329{
1327 int i; 1330 int i;
1328 unsigned long mask;
1329 int receive_cpu = cpumask_any(cpu_online_mask);
1330 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1331 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
1332 RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */
1333 1331
1334 /* First, adjust the counts. */ 1332 /*
1333 * Orphan the callbacks. First adjust the counts. This is safe
1334 * because ->onofflock excludes _rcu_barrier()'s adoption of
1335 * the callbacks, thus no memory barrier is required.
1336 */
1335 if (rdp->nxtlist != NULL) { 1337 if (rdp->nxtlist != NULL) {
1336 receive_rdp->qlen_lazy += rdp->qlen_lazy; 1338 rsp->qlen_lazy += rdp->qlen_lazy;
1337 receive_rdp->qlen += rdp->qlen; 1339 rsp->qlen += rdp->qlen;
1340 rdp->n_cbs_orphaned += rdp->qlen;
1338 rdp->qlen_lazy = 0; 1341 rdp->qlen_lazy = 0;
1339 rdp->qlen = 0; 1342 rdp->qlen = 0;
1340 } 1343 }
1341 1344
1342 /* 1345 /*
1343 * Next, move ready-to-invoke callbacks to be invoked on some 1346 * Next, move those callbacks still needing a grace period to
1344 * other CPU. These will not be required to pass through another 1347 * the orphanage, where some other CPU will pick them up.
1345 * grace period: They are done, regardless of CPU. 1348 * Some of the callbacks might have gone partway through a grace
1349 * period, but that is too bad. They get to start over because we
1350 * cannot assume that grace periods are synchronized across CPUs.
1351 * We don't bother updating the ->nxttail[] array yet, instead
1352 * we just reset the whole thing later on.
1346 */ 1353 */
1347 if (rdp->nxtlist != NULL && 1354 if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) {
1348 rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) { 1355 *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL];
1349 struct rcu_head *oldhead; 1356 rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL];
1350 struct rcu_head **oldtail; 1357 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
1351 struct rcu_head **newtail;
1352
1353 oldhead = rdp->nxtlist;
1354 oldtail = receive_rdp->nxttail[RCU_DONE_TAIL];
1355 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1356 *rdp->nxttail[RCU_DONE_TAIL] = *oldtail;
1357 *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead;
1358 newtail = rdp->nxttail[RCU_DONE_TAIL];
1359 for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) {
1360 if (receive_rdp->nxttail[i] == oldtail)
1361 receive_rdp->nxttail[i] = newtail;
1362 if (rdp->nxttail[i] == newtail)
1363 rdp->nxttail[i] = &rdp->nxtlist;
1364 }
1365 } 1358 }
1366 1359
1367 /* 1360 /*
1368 * Finally, put the rest of the callbacks at the end of the list. 1361 * Then move the ready-to-invoke callbacks to the orphanage,
1369 * The ones that made it partway through get to start over: We 1362 * where some other CPU will pick them up. These will not be
1370 * cannot assume that grace periods are synchronized across CPUs. 1363 * required to pass through another grace period: They are done.

1371 * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but
1372 * this does not seem compelling. Not yet, anyway.)
1373 */ 1364 */
1374 if (rdp->nxtlist != NULL) { 1365 if (rdp->nxtlist != NULL) {
1375 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; 1366 *rsp->orphan_donetail = rdp->nxtlist;
1376 receive_rdp->nxttail[RCU_NEXT_TAIL] = 1367 rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
1377 rdp->nxttail[RCU_NEXT_TAIL];
1378 receive_rdp->n_cbs_adopted += rdp->qlen;
1379 rdp->n_cbs_orphaned += rdp->qlen;
1380
1381 rdp->nxtlist = NULL;
1382 for (i = 0; i < RCU_NEXT_SIZE; i++)
1383 rdp->nxttail[i] = &rdp->nxtlist;
1384 } 1368 }
1385 1369
1370 /* Finally, initialize the rcu_data structure's list to empty. */
1371 rdp->nxtlist = NULL;
1372 for (i = 0; i < RCU_NEXT_SIZE; i++)
1373 rdp->nxttail[i] = &rdp->nxtlist;
1374}
1375
1376/*
1377 * Adopt the RCU callbacks from the specified rcu_state structure's
1378 * orphanage. The caller must hold the ->onofflock.
1379 */
1380static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1381{
1382 int i;
1383 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
1384
1386 /* 1385 /*
1387 * Record a quiescent state for the dying CPU. This is safe 1386 * If there is an rcu_barrier() operation in progress, then
1388 * only because we have already cleared out the callbacks. 1387 * only the task doing that operation is permitted to adopt
1389 * (Otherwise, the RCU core might try to schedule the invocation 1388 * callbacks. To do otherwise breaks rcu_barrier() and friends
1390 * of callbacks on this now-offline CPU, which would be bad.) 1389 * by causing them to fail to wait for the callbacks in the
1390 * orphanage.
1391 */ 1391 */
1392 mask = rdp->grpmask; /* rnp->grplo is constant. */ 1392 if (rsp->rcu_barrier_in_progress &&
1393 rsp->rcu_barrier_in_progress != current)
1394 return;
1395
1396 /* Do the accounting first. */
1397 rdp->qlen_lazy += rsp->qlen_lazy;
1398 rdp->qlen += rsp->qlen;
1399 rdp->n_cbs_adopted += rsp->qlen;
1400 rsp->qlen_lazy = 0;
1401 rsp->qlen = 0;
1402
1403 /*
1404 * We do not need a memory barrier here because the only way we
1405 * can get here if there is an rcu_barrier() in flight is if
1406 * we are the task doing the rcu_barrier().
1407 */
1408
1409 /* First adopt the ready-to-invoke callbacks. */
1410 if (rsp->orphan_donelist != NULL) {
1411 *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL];
1412 *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist;
1413 for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--)
1414 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
1415 rdp->nxttail[i] = rsp->orphan_donetail;
1416 rsp->orphan_donelist = NULL;
1417 rsp->orphan_donetail = &rsp->orphan_donelist;
1418 }
1419
1420 /* And then adopt the callbacks that still need a grace period. */
1421 if (rsp->orphan_nxtlist != NULL) {
1422 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist;
1423 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail;
1424 rsp->orphan_nxtlist = NULL;
1425 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
1426 }
1427}
1428
1429/*
1430 * Trace the fact that this CPU is going offline.
1431 */
1432static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1433{
1434 RCU_TRACE(unsigned long mask);
1435 RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
1436 RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
1437
1438 RCU_TRACE(mask = rdp->grpmask);
1393 trace_rcu_grace_period(rsp->name, 1439 trace_rcu_grace_period(rsp->name,
1394 rnp->gpnum + 1 - !!(rnp->qsmask & mask), 1440 rnp->gpnum + 1 - !!(rnp->qsmask & mask),
1395 "cpuofl"); 1441 "cpuofl");
1396 rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum);
1397 /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */
1398} 1442}
1399 1443
1400/* 1444/*
1401 * The CPU has been completely removed, and some other CPU is reporting 1445 * The CPU has been completely removed, and some other CPU is reporting
1402 * this fact from process context. Do the remainder of the cleanup. 1446 * this fact from process context. Do the remainder of the cleanup,
1447 * including orphaning the outgoing CPU's RCU callbacks, and also
1448 * adopting them, if there is no _rcu_barrier() instance running.
1403 * There can only be one CPU hotplug operation at a time, so no other 1449 * There can only be one CPU hotplug operation at a time, so no other
1404 * CPU can be attempting to update rcu_cpu_kthread_task. 1450 * CPU can be attempting to update rcu_cpu_kthread_task.
1405 */ 1451 */
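The orphanage introduced above works because callback lists are kept as a head pointer plus pointers to the last ->next field (the nxttail[] array), so donating or adopting an entire sublist is a constant-time splice: store the donor's head through the destination's tail, take over the donor's tail, then reset the donor. A stand-alone sketch of that tail-pointer splice on a generic list (the single-tail simplification and names are mine, not the kernel's):

#include <stdio.h>
#include <stddef.h>

struct cb {
        struct cb *next;
        int id;
};

/* A callback list kept the way rcu_data/rcu_state keep theirs:
 * head pointer plus a pointer to the last ->next field, so both
 * append and whole-list splice are O(1). */
struct cb_list {
        struct cb *head;
        struct cb **tail;
};

static void list_init(struct cb_list *l)
{
        l->head = NULL;
        l->tail = &l->head;
}

static void list_append(struct cb_list *l, struct cb *c)
{
        c->next = NULL;
        *l->tail = c;
        l->tail = &c->next;
}

/* Move everything from 'from' onto the end of 'to' -- the same shape
 * as rcu_send_cbs_to_orphanage()/rcu_adopt_orphan_cbs(). */
static void list_splice_tail(struct cb_list *to, struct cb_list *from)
{
        if (from->head == NULL)
                return;
        *to->tail = from->head;
        to->tail = from->tail;
        list_init(from);
}

int main(void)
{
        struct cb a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
        struct cb_list cpu_list, orphanage;
        struct cb *p;

        list_init(&cpu_list);
        list_init(&orphanage);
        list_append(&cpu_list, &a);
        list_append(&cpu_list, &b);
        list_append(&orphanage, &c);

        list_splice_tail(&orphanage, &cpu_list);        /* CPU goes offline */

        for (p = orphanage.head; p; p = p->next)
                printf("cb %d\n", p->id);
        return 0;
}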
@@ -1409,17 +1455,21 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1409 unsigned long mask; 1455 unsigned long mask;
1410 int need_report = 0; 1456 int need_report = 0;
1411 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1457 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1412 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */ 1458 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
1413 1459
1414 /* Adjust any no-longer-needed kthreads. */ 1460 /* Adjust any no-longer-needed kthreads. */
1415 rcu_stop_cpu_kthread(cpu); 1461 rcu_stop_cpu_kthread(cpu);
1416 rcu_node_kthread_setaffinity(rnp, -1); 1462 rcu_node_kthread_setaffinity(rnp, -1);
1417 1463
1418 /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */ 1464 /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
1419 1465
1420 /* Exclude any attempts to start a new grace period. */ 1466 /* Exclude any attempts to start a new grace period. */
1421 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1467 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1422 1468
1469 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
1470 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
1471 rcu_adopt_orphan_cbs(rsp);
1472
1423 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 1473 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
1424 mask = rdp->grpmask; /* rnp->grplo is constant. */ 1474 mask = rdp->grpmask; /* rnp->grplo is constant. */
1425 do { 1475 do {
@@ -1456,6 +1506,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1456 1506
1457#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1507#else /* #ifdef CONFIG_HOTPLUG_CPU */
1458 1508
1509static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1510{
1511}
1512
1459static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 1513static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1460{ 1514{
1461} 1515}
@@ -1524,9 +1578,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1524 rcu_is_callbacks_kthread()); 1578 rcu_is_callbacks_kthread());
1525 1579
1526 /* Update count, and requeue any remaining callbacks. */ 1580 /* Update count, and requeue any remaining callbacks. */
1527 rdp->qlen_lazy -= count_lazy;
1528 rdp->qlen -= count;
1529 rdp->n_cbs_invoked += count;
1530 if (list != NULL) { 1581 if (list != NULL) {
1531 *tail = rdp->nxtlist; 1582 *tail = rdp->nxtlist;
1532 rdp->nxtlist = list; 1583 rdp->nxtlist = list;
@@ -1536,6 +1587,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1536 else 1587 else
1537 break; 1588 break;
1538 } 1589 }
1590 smp_mb(); /* List handling before counting for rcu_barrier(). */
1591 rdp->qlen_lazy -= count_lazy;
1592 rdp->qlen -= count;
1593 rdp->n_cbs_invoked += count;
1539 1594
1540 /* Reinstate batch limit if we have worked down the excess. */ 1595 /* Reinstate batch limit if we have worked down the excess. */
1541 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) 1596 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
@@ -1820,15 +1875,17 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1820 * a quiescent state betweentimes. 1875 * a quiescent state betweentimes.
1821 */ 1876 */
1822 local_irq_save(flags); 1877 local_irq_save(flags);
1823 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
1824 rdp = this_cpu_ptr(rsp->rda); 1878 rdp = this_cpu_ptr(rsp->rda);
1825 1879
1826 /* Add the callback to our list. */ 1880 /* Add the callback to our list. */
1827 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1828 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1829 rdp->qlen++; 1881 rdp->qlen++;
1830 if (lazy) 1882 if (lazy)
1831 rdp->qlen_lazy++; 1883 rdp->qlen_lazy++;
1884 else
1885 rcu_idle_count_callbacks_posted();
1886 smp_mb(); /* Count before adding callback for rcu_barrier(). */
1887 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1888 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1832 1889
1833 if (__is_kfree_rcu_offset((unsigned long)func)) 1890 if (__is_kfree_rcu_offset((unsigned long)func))
1834 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, 1891 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
@@ -1894,6 +1951,38 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1894} 1951}
1895EXPORT_SYMBOL_GPL(call_rcu_bh); 1952EXPORT_SYMBOL_GPL(call_rcu_bh);
1896 1953
1954/*
1955 * Because a context switch is a grace period for RCU-sched and RCU-bh,
1956 * any blocking grace-period wait automatically implies a grace period
1957 * if there is only one CPU online at any point in time during execution
1958 * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to
1959 * occasionally incorrectly indicate that there are multiple CPUs online
1960 * when there was in fact only one the whole time, as this just adds
1961 * some overhead: RCU still operates correctly.
1962 *
1963 * Of course, sampling num_online_cpus() with preemption enabled can
1964 * give erroneous results if there are concurrent CPU-hotplug operations.
1965 * For example, given a demonic sequence of preemptions in num_online_cpus()
1966 * and CPU-hotplug operations, there could be two or more CPUs online at
1967 * all times, but num_online_cpus() might well return one (or even zero).
1968 *
1969 * However, all such demonic sequences require at least one CPU-offline
1970 * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer
1971 * is only a problem if there is an RCU read-side critical section executing
1972 * throughout. But RCU-sched and RCU-bh read-side critical sections
1973 * disable either preemption or bh, which prevents a CPU from going offline.
1974 * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return
1975 * that there is only one CPU when in fact there was more than one throughout
1976 * is when there were no RCU readers in the system. If there are no
1977 * RCU readers, the grace period by definition can be of zero length,
1978 * regardless of the number of online CPUs.
1979 */
1980static inline int rcu_blocking_is_gp(void)
1981{
1982 might_sleep(); /* Check for RCU read-side critical section. */
1983 return num_online_cpus() <= 1;
1984}
1985
1897/** 1986/**
1898 * synchronize_sched - wait until an rcu-sched grace period has elapsed. 1987 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
1899 * 1988 *
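The long comment above argues that a stale num_online_cpus() sample only matters when an RCU read-side critical section spans the whole window, which the sched and bh flavors rule out by disabling preemption or bh; with no readers, a zero-length grace period is valid anyway. The blocking primitives therefore use rcu_blocking_is_gp() purely as an early exit. A stubbed, runnable rendition of that call pattern (the helper bodies are fakes; only the shape matches the kernel):

#include <stdio.h>
#include <stdbool.h>

/* Stubs mimicking the kernel calls they are named after. */
static int  num_online_cpus(void)       { return 1; }
static bool rcu_blocking_is_gp(void)    { return num_online_cpus() <= 1; }
static void wait_for_grace_period(void) { puts("waiting for a full grace period"); }

static void synchronize_sched_like(void)
{
        if (rcu_blocking_is_gp()) {
                puts("single CPU online: the grace period is free");
                return;
        }
        wait_for_grace_period();
}

int main(void)
{
        synchronize_sched_like();
        return 0;
}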
@@ -2167,11 +2256,10 @@ static int rcu_cpu_has_callbacks(int cpu)
2167 rcu_preempt_cpu_has_callbacks(cpu); 2256 rcu_preempt_cpu_has_callbacks(cpu);
2168} 2257}
2169 2258
2170static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 2259/*
2171static atomic_t rcu_barrier_cpu_count; 2260 * RCU callback function for _rcu_barrier(). If we are last, wake
2172static DEFINE_MUTEX(rcu_barrier_mutex); 2261 * up the task executing _rcu_barrier().
2173static struct completion rcu_barrier_completion; 2262 */
2174
2175static void rcu_barrier_callback(struct rcu_head *notused) 2263static void rcu_barrier_callback(struct rcu_head *notused)
2176{ 2264{
2177 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 2265 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -2201,27 +2289,94 @@ static void _rcu_barrier(struct rcu_state *rsp,
2201 void (*call_rcu_func)(struct rcu_head *head, 2289 void (*call_rcu_func)(struct rcu_head *head,
2202 void (*func)(struct rcu_head *head))) 2290 void (*func)(struct rcu_head *head)))
2203{ 2291{
2204 BUG_ON(in_interrupt()); 2292 int cpu;
2293 unsigned long flags;
2294 struct rcu_data *rdp;
2295 struct rcu_head rh;
2296
2297 init_rcu_head_on_stack(&rh);
2298
2205 /* Take mutex to serialize concurrent rcu_barrier() requests. */ 2299 /* Take mutex to serialize concurrent rcu_barrier() requests. */
2206 mutex_lock(&rcu_barrier_mutex); 2300 mutex_lock(&rcu_barrier_mutex);
2207 init_completion(&rcu_barrier_completion); 2301
2302 smp_mb(); /* Prevent any prior operations from leaking in. */
2303
2208 /* 2304 /*
2209 * Initialize rcu_barrier_cpu_count to 1, then invoke 2305 * Initialize the count to one rather than to zero in order to
2210 * rcu_barrier_func() on each CPU, so that each CPU also has 2306 * avoid a too-soon return to zero in case of a short grace period
2211 * incremented rcu_barrier_cpu_count. Only then is it safe to 2307 * (or preemption of this task). Also flag this task as doing
2212 * decrement rcu_barrier_cpu_count -- otherwise the first CPU 2308 * an rcu_barrier(). This will prevent anyone else from adopting
 2213 * might complete its grace period before all of the other CPUs 2309 * orphaned callbacks, which could otherwise cause failure if a
2214 * did their increment, causing this function to return too 2310 * CPU went offline and quickly came back online. To see this,
2215 * early. Note that on_each_cpu() disables irqs, which prevents 2311 * consider the following sequence of events:
2216 * any CPUs from coming online or going offline until each online 2312 *
2217 * CPU has queued its RCU-barrier callback. 2313 * 1. We cause CPU 0 to post an rcu_barrier_callback() callback.
2314 * 2. CPU 1 goes offline, orphaning its callbacks.
2315 * 3. CPU 0 adopts CPU 1's orphaned callbacks.
2316 * 4. CPU 1 comes back online.
2317 * 5. We cause CPU 1 to post an rcu_barrier_callback() callback.
2318 * 6. Both rcu_barrier_callback() callbacks are invoked, awakening
2319 * us -- but before CPU 1's orphaned callbacks are invoked!!!
2218 */ 2320 */
2321 init_completion(&rcu_barrier_completion);
2219 atomic_set(&rcu_barrier_cpu_count, 1); 2322 atomic_set(&rcu_barrier_cpu_count, 1);
2220 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); 2323 raw_spin_lock_irqsave(&rsp->onofflock, flags);
2324 rsp->rcu_barrier_in_progress = current;
2325 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2326
2327 /*
2328 * Force every CPU with callbacks to register a new callback
2329 * that will tell us when all the preceding callbacks have
2330 * been invoked. If an offline CPU has callbacks, wait for
2331 * it to either come back online or to finish orphaning those
2332 * callbacks.
2333 */
2334 for_each_possible_cpu(cpu) {
2335 preempt_disable();
2336 rdp = per_cpu_ptr(rsp->rda, cpu);
2337 if (cpu_is_offline(cpu)) {
2338 preempt_enable();
2339 while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen))
2340 schedule_timeout_interruptible(1);
2341 } else if (ACCESS_ONCE(rdp->qlen)) {
2342 smp_call_function_single(cpu, rcu_barrier_func,
2343 (void *)call_rcu_func, 1);
2344 preempt_enable();
2345 } else {
2346 preempt_enable();
2347 }
2348 }
2349
2350 /*
2351 * Now that all online CPUs have rcu_barrier_callback() callbacks
2352 * posted, we can adopt all of the orphaned callbacks and place
2353 * an rcu_barrier_callback() callback after them. When that is done,
2354 * we are guaranteed to have an rcu_barrier_callback() callback
2355 * following every callback that could possibly have been
2356 * registered before _rcu_barrier() was called.
2357 */
2358 raw_spin_lock_irqsave(&rsp->onofflock, flags);
2359 rcu_adopt_orphan_cbs(rsp);
2360 rsp->rcu_barrier_in_progress = NULL;
2361 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2362 atomic_inc(&rcu_barrier_cpu_count);
2363 smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */
2364 call_rcu_func(&rh, rcu_barrier_callback);
2365
2366 /*
2367 * Now that we have an rcu_barrier_callback() callback on each
2368 * CPU, and thus each counted, remove the initial count.
2369 */
2221 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 2370 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
2222 complete(&rcu_barrier_completion); 2371 complete(&rcu_barrier_completion);
2372
2373 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
2223 wait_for_completion(&rcu_barrier_completion); 2374 wait_for_completion(&rcu_barrier_completion);
2375
2376 /* Other rcu_barrier() invocations can now safely proceed. */
2224 mutex_unlock(&rcu_barrier_mutex); 2377 mutex_unlock(&rcu_barrier_mutex);
2378
2379 destroy_rcu_head_on_stack(&rh);
2225} 2380}
2226 2381
2227/** 2382/**
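rcu_barrier_cpu_count is primed to 1 before any callbacks are posted and only dropped after the orphanage has been adopted and the final callback queued; otherwise an early-completing callback could drive the count to zero while later CPUs were still being processed. A tiny sketch that walks a counter through the same steps (plain ints stand in for the atomic and the completion):

#include <stdio.h>

static int count;
static int completed;

static void barrier_callback(void)
{
        if (--count == 0)
                completed = 1;          /* complete() analogue */
}

int main(void)
{
        int cpu, ncpus = 3;

        count = 1;                      /* initial count, dropped at the end */
        for (cpu = 0; cpu < ncpus; cpu++) {
                count++;                /* post a callback for this CPU */
                barrier_callback();     /* ...which may run immediately */
        }
        printf("completed too early? %s\n", completed ? "yes" : "no");

        if (--count == 0)               /* remove the initial count */
                completed = 1;
        printf("completed after final decrement? %s\n", completed ? "yes" : "no");
        return 0;
}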
@@ -2418,7 +2573,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2418 2573
2419 for (i = NUM_RCU_LVLS - 1; i > 0; i--) 2574 for (i = NUM_RCU_LVLS - 1; i > 0; i--)
2420 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 2575 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
2421 rsp->levelspread[0] = RCU_FANOUT_LEAF; 2576 rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF;
2422} 2577}
2423#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 2578#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
2424static void __init rcu_init_levelspread(struct rcu_state *rsp) 2579static void __init rcu_init_levelspread(struct rcu_state *rsp)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index cdd1be0a407..7f5d138dedf 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -29,18 +29,14 @@
29#include <linux/seqlock.h> 29#include <linux/seqlock.h>
30 30
31/* 31/*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. 32 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
33 * CONFIG_RCU_FANOUT_LEAF.
33 * In theory, it should be possible to add more levels straightforwardly. 34 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this did work well going from three levels to four. 35 * In practice, this did work well going from three levels to four.
35 * Of course, your mileage may vary. 36 * Of course, your mileage may vary.
36 */ 37 */
37#define MAX_RCU_LVLS 4 38#define MAX_RCU_LVLS 4
38#if CONFIG_RCU_FANOUT > 16 39#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF)
39#define RCU_FANOUT_LEAF 16
40#else /* #if CONFIG_RCU_FANOUT > 16 */
41#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
42#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
43#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
44#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) 40#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
45#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) 41#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
46#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) 42#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
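The leaf fanout was previously derived from CONFIG_RCU_FANOUT and capped at 16; it is now its own CONFIG_RCU_FANOUT_LEAF knob, and RCU_FANOUT_1 through RCU_FANOUT_4 give the CPU capacity of a one- to four-level tree as leaf * fanout^(levels-1). A quick stand-alone check of those bounds, with example values standing in for the two Kconfig options:

#include <stdio.h>

#define RCU_FANOUT       64     /* stand-in for CONFIG_RCU_FANOUT */
#define RCU_FANOUT_LEAF  16     /* stand-in for CONFIG_RCU_FANOUT_LEAF */

int main(void)
{
        long cap = RCU_FANOUT_LEAF;     /* RCU_FANOUT_1 */
        int level;

        for (level = 1; level <= 4; level++) {
                printf("levels=%d -> up to %ld CPUs\n", level, cap);
                cap *= RCU_FANOUT;      /* next RCU_FANOUT_n */
        }
        return 0;
}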
@@ -371,6 +367,17 @@ struct rcu_state {
371 367
372 raw_spinlock_t onofflock; /* exclude on/offline and */ 368 raw_spinlock_t onofflock; /* exclude on/offline and */
373 /* starting new GP. */ 369 /* starting new GP. */
370 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
371 /* need a grace period. */
372 struct rcu_head **orphan_nxttail; /* Tail of above. */
373 struct rcu_head *orphan_donelist; /* Orphaned callbacks that */
374 /* are ready to invoke. */
375 struct rcu_head **orphan_donetail; /* Tail of above. */
376 long qlen_lazy; /* Number of lazy callbacks. */
377 long qlen; /* Total number of callbacks. */
378 struct task_struct *rcu_barrier_in_progress;
379 /* Task doing rcu_barrier(), */
380 /* or NULL if no barrier. */
374 raw_spinlock_t fqslock; /* Only one task forcing */ 381 raw_spinlock_t fqslock; /* Only one task forcing */
375 /* quiescent states. */ 382 /* quiescent states. */
376 unsigned long jiffies_force_qs; /* Time at which to invoke */ 383 unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -423,7 +430,6 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
423/* Forward declarations for rcutree_plugin.h */ 430/* Forward declarations for rcutree_plugin.h */
424static void rcu_bootup_announce(void); 431static void rcu_bootup_announce(void);
425long rcu_batches_completed(void); 432long rcu_batches_completed(void);
426static void rcu_preempt_note_context_switch(int cpu);
427static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); 433static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
428#ifdef CONFIG_HOTPLUG_CPU 434#ifdef CONFIG_HOTPLUG_CPU
429static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 435static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
@@ -471,6 +477,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu);
471static void rcu_prepare_for_idle_init(int cpu); 477static void rcu_prepare_for_idle_init(int cpu);
472static void rcu_cleanup_after_idle(int cpu); 478static void rcu_cleanup_after_idle(int cpu);
473static void rcu_prepare_for_idle(int cpu); 479static void rcu_prepare_for_idle(int cpu);
480static void rcu_idle_count_callbacks_posted(void);
474static void print_cpu_stall_info_begin(void); 481static void print_cpu_stall_info_begin(void);
475static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); 482static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
476static void print_cpu_stall_info_end(void); 483static void print_cpu_stall_info_end(void);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c023464816b..2411000d986 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu)
153 * 153 *
154 * Caller must disable preemption. 154 * Caller must disable preemption.
155 */ 155 */
156static void rcu_preempt_note_context_switch(int cpu) 156void rcu_preempt_note_context_switch(void)
157{ 157{
158 struct task_struct *t = current; 158 struct task_struct *t = current;
159 unsigned long flags; 159 unsigned long flags;
@@ -164,7 +164,7 @@ static void rcu_preempt_note_context_switch(int cpu)
164 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 164 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
165 165
166 /* Possibly blocking in an RCU read-side critical section. */ 166 /* Possibly blocking in an RCU read-side critical section. */
167 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); 167 rdp = __this_cpu_ptr(rcu_preempt_state.rda);
168 rnp = rdp->mynode; 168 rnp = rdp->mynode;
169 raw_spin_lock_irqsave(&rnp->lock, flags); 169 raw_spin_lock_irqsave(&rnp->lock, flags);
170 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 170 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -228,7 +228,7 @@ static void rcu_preempt_note_context_switch(int cpu)
228 * means that we continue to block the current grace period. 228 * means that we continue to block the current grace period.
229 */ 229 */
230 local_irq_save(flags); 230 local_irq_save(flags);
231 rcu_preempt_qs(cpu); 231 rcu_preempt_qs(smp_processor_id());
232 local_irq_restore(flags); 232 local_irq_restore(flags);
233} 233}
234 234
@@ -969,22 +969,6 @@ static void __init __rcu_init_preempt(void)
969 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); 969 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
970} 970}
971 971
972/*
973 * Check for a task exiting while in a preemptible-RCU read-side
974 * critical section, clean up if so. No need to issue warnings,
975 * as debug_check_no_locks_held() already does this if lockdep
976 * is enabled.
977 */
978void exit_rcu(void)
979{
980 struct task_struct *t = current;
981
982 if (t->rcu_read_lock_nesting == 0)
983 return;
984 t->rcu_read_lock_nesting = 1;
985 __rcu_read_unlock();
986}
987
988#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 972#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
989 973
990static struct rcu_state *rcu_state = &rcu_sched_state; 974static struct rcu_state *rcu_state = &rcu_sched_state;
@@ -1018,14 +1002,6 @@ void rcu_force_quiescent_state(void)
1018EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 1002EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
1019 1003
1020/* 1004/*
1021 * Because preemptible RCU does not exist, we never have to check for
1022 * CPUs being in quiescent states.
1023 */
1024static void rcu_preempt_note_context_switch(int cpu)
1025{
1026}
1027
1028/*
1029 * Because preemptible RCU does not exist, there are never any preempted 1005 * Because preemptible RCU does not exist, there are never any preempted
1030 * RCU readers. 1006 * RCU readers.
1031 */ 1007 */
@@ -1938,6 +1914,14 @@ static void rcu_prepare_for_idle(int cpu)
1938{ 1914{
1939} 1915}
1940 1916
1917/*
1918 * Don't bother keeping a running count of the number of RCU callbacks
1919 * posted because CONFIG_RCU_FAST_NO_HZ=n.
1920 */
1921static void rcu_idle_count_callbacks_posted(void)
1922{
1923}
1924
1941#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1925#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1942 1926
1943/* 1927/*
@@ -1978,11 +1962,20 @@ static void rcu_prepare_for_idle(int cpu)
1978#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ 1962#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
1979#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ 1963#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
1980 1964
1965/* Loop counter for rcu_prepare_for_idle(). */
1981static DEFINE_PER_CPU(int, rcu_dyntick_drain); 1966static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1967/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */
1982static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); 1968static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1983static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); 1969/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */
1984static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */ 1970static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer);
1985static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ 1971/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */
1972static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires);
1973/* Enable special processing on first attempt to enter dyntick-idle mode. */
1974static DEFINE_PER_CPU(bool, rcu_idle_first_pass);
1975/* Running count of non-lazy callbacks posted, never decremented. */
1976static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted);
1977/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */
1978static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap);
1986 1979
1987/* 1980/*
1988 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no 1981 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
@@ -1995,6 +1988,8 @@ static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */
1995 */ 1988 */
1996int rcu_needs_cpu(int cpu) 1989int rcu_needs_cpu(int cpu)
1997{ 1990{
1991 /* Flag a new idle sojourn to the idle-entry state machine. */
1992 per_cpu(rcu_idle_first_pass, cpu) = 1;
1998 /* If no callbacks, RCU doesn't need the CPU. */ 1993 /* If no callbacks, RCU doesn't need the CPU. */
1999 if (!rcu_cpu_has_callbacks(cpu)) 1994 if (!rcu_cpu_has_callbacks(cpu))
2000 return 0; 1995 return 0;
@@ -2045,16 +2040,34 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
2045} 2040}
2046 2041
2047/* 2042/*
2043 * Handler for smp_call_function_single(). The only point of this
2044 * handler is to wake the CPU up, so the handler does only tracing.
2045 */
2046void rcu_idle_demigrate(void *unused)
2047{
2048 trace_rcu_prep_idle("Demigrate");
2049}
2050
2051/*
2048 * Timer handler used to force CPU to start pushing its remaining RCU 2052 * Timer handler used to force CPU to start pushing its remaining RCU
2049 * callbacks in the case where it entered dyntick-idle mode with callbacks 2053 * callbacks in the case where it entered dyntick-idle mode with callbacks
2050 * pending. The hander doesn't really need to do anything because the 2054 * pending. The hander doesn't really need to do anything because the
2051 * real work is done upon re-entry to idle, or by the next scheduling-clock 2055 * real work is done upon re-entry to idle, or by the next scheduling-clock
2052 * interrupt should idle not be re-entered. 2056 * interrupt should idle not be re-entered.
2057 *
2058 * One special case: the timer gets migrated without awakening the CPU
2059 * on which the timer was scheduled. In this case, we must wake up
2060 * that CPU. We do so with smp_call_function_single().
2053 */ 2061 */
2054static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) 2062static void rcu_idle_gp_timer_func(unsigned long cpu_in)
2055{ 2063{
2064 int cpu = (int)cpu_in;
2065
2056 trace_rcu_prep_idle("Timer"); 2066 trace_rcu_prep_idle("Timer");
2057 return HRTIMER_NORESTART; 2067 if (cpu != smp_processor_id())
2068 smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
2069 else
2070 WARN_ON_ONCE(1); /* Getting here can hang the system... */
2058} 2071}
2059 2072
2060/* 2073/*
@@ -2062,19 +2075,11 @@ static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
2062 */ 2075 */
2063static void rcu_prepare_for_idle_init(int cpu) 2076static void rcu_prepare_for_idle_init(int cpu)
2064{ 2077{
2065 static int firsttime = 1; 2078 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
2066 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); 2079 setup_timer(&per_cpu(rcu_idle_gp_timer, cpu),
2067 2080 rcu_idle_gp_timer_func, cpu);
2068 hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 2081 per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1;
2069 hrtp->function = rcu_idle_gp_timer_func; 2082 per_cpu(rcu_idle_first_pass, cpu) = 1;
2070 if (firsttime) {
2071 unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
2072
2073 rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
2074 upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY);
2075 rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000);
2076 firsttime = 0;
2077 }
2078} 2083}
2079 2084
2080/* 2085/*
@@ -2084,7 +2089,8 @@ static void rcu_prepare_for_idle_init(int cpu)
2084 */ 2089 */
2085static void rcu_cleanup_after_idle(int cpu) 2090static void rcu_cleanup_after_idle(int cpu)
2086{ 2091{
2087 hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); 2092 del_timer(&per_cpu(rcu_idle_gp_timer, cpu));
2093 trace_rcu_prep_idle("Cleanup after idle");
2088} 2094}
2089 2095
2090/* 2096/*
@@ -2108,6 +2114,29 @@ static void rcu_cleanup_after_idle(int cpu)
2108 */ 2114 */
2109static void rcu_prepare_for_idle(int cpu) 2115static void rcu_prepare_for_idle(int cpu)
2110{ 2116{
2117 struct timer_list *tp;
2118
2119 /*
2120 * If this is an idle re-entry, for example, due to use of
2121 * RCU_NONIDLE() or the new idle-loop tracing API within the idle
2122 * loop, then don't take any state-machine actions, unless the
2123 * momentary exit from idle queued additional non-lazy callbacks.
2124 * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks
2125 * pending.
2126 */
2127 if (!per_cpu(rcu_idle_first_pass, cpu) &&
2128 (per_cpu(rcu_nonlazy_posted, cpu) ==
2129 per_cpu(rcu_nonlazy_posted_snap, cpu))) {
2130 if (rcu_cpu_has_callbacks(cpu)) {
2131 tp = &per_cpu(rcu_idle_gp_timer, cpu);
2132 mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu));
2133 }
2134 return;
2135 }
2136 per_cpu(rcu_idle_first_pass, cpu) = 0;
2137 per_cpu(rcu_nonlazy_posted_snap, cpu) =
2138 per_cpu(rcu_nonlazy_posted, cpu) - 1;
2139
2111 /* 2140 /*
2112 * If there are no callbacks on this CPU, enter dyntick-idle mode. 2141 * If there are no callbacks on this CPU, enter dyntick-idle mode.
2113 * Also reset state to avoid prejudicing later attempts. 2142 * Also reset state to avoid prejudicing later attempts.
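
The hunk above runs the full prepare-for-idle state machine only on a genuine first pass into idle; a momentary exit that posted no new non-lazy callbacks merely re-arms the pinned timer. A minimal single-CPU sketch of that first-pass/snapshot bookkeeping, simplified and with invented names (not the kernel code):

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the per-CPU state used by the hunk above. */
static bool idle_first_pass = true;   /* set again on each rcu_needs_cpu()-style entry */
static unsigned long nonlazy_posted;  /* running count, never decremented */
static unsigned long nonlazy_snap;    /* snapshot taken when idle work was last scheduled */

static void rearm_timer(void)
{
	puts("re-arm pinned timer at the saved expiry");
}

static void run_state_machine(void)
{
	puts("run the full prepare-for-idle state machine");
}

static void prepare_for_idle(void)
{
	/* Idle re-entry with no new non-lazy callbacks: just re-arm the timer. */
	if (!idle_first_pass && nonlazy_posted == nonlazy_snap) {
		rearm_timer();
		return;
	}
	idle_first_pass = false;
	nonlazy_snap = nonlazy_posted;
	run_state_machine();
}

int main(void)
{
	prepare_for_idle();        /* first pass: full state machine */
	prepare_for_idle();        /* quiet re-entry: timer re-armed only */
	nonlazy_posted++;          /* a non-lazy callback arrived meanwhile */
	prepare_for_idle();        /* state machine runs again */
	return 0;
}
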
@@ -2140,11 +2169,15 @@ static void rcu_prepare_for_idle(int cpu)
2140 per_cpu(rcu_dyntick_drain, cpu) = 0; 2169 per_cpu(rcu_dyntick_drain, cpu) = 0;
2141 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2170 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
2142 if (rcu_cpu_has_nonlazy_callbacks(cpu)) 2171 if (rcu_cpu_has_nonlazy_callbacks(cpu))
2143 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), 2172 per_cpu(rcu_idle_gp_timer_expires, cpu) =
2144 rcu_idle_gp_wait, HRTIMER_MODE_REL); 2173 jiffies + RCU_IDLE_GP_DELAY;
2145 else 2174 else
2146 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), 2175 per_cpu(rcu_idle_gp_timer_expires, cpu) =
2147 rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL); 2176 jiffies + RCU_IDLE_LAZY_GP_DELAY;
2177 tp = &per_cpu(rcu_idle_gp_timer, cpu);
2178 mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu));
2179 per_cpu(rcu_nonlazy_posted_snap, cpu) =
2180 per_cpu(rcu_nonlazy_posted, cpu);
2148 return; /* Nothing more to do immediately. */ 2181 return; /* Nothing more to do immediately. */
2149 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2182 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
2150 /* We have hit the limit, so time to give up. */ 2183 /* We have hit the limit, so time to give up. */
@@ -2184,6 +2217,19 @@ static void rcu_prepare_for_idle(int cpu)
2184 trace_rcu_prep_idle("Callbacks drained"); 2217 trace_rcu_prep_idle("Callbacks drained");
2185} 2218}
2186 2219
2220/*
2221 * Keep a running count of the number of non-lazy callbacks posted
2222 * on this CPU. This running counter (which is never decremented) allows
2223 * rcu_prepare_for_idle() to detect when something out of the idle loop
2224 * posts a callback, even if an equal number of callbacks are invoked.
2225 * Of course, callbacks should only be posted from within a trace event
2226 * designed to be called from idle or from within RCU_NONIDLE().
2227 */
2228static void rcu_idle_count_callbacks_posted(void)
2229{
2230 __this_cpu_add(rcu_nonlazy_posted, 1);
2231}
2232
2187#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2233#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
2188 2234
2189#ifdef CONFIG_RCU_CPU_STALL_INFO 2235#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -2192,14 +2238,12 @@ static void rcu_prepare_for_idle(int cpu)
2192 2238
2193static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 2239static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2194{ 2240{
2195 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); 2241 struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu);
2196 2242
2197 sprintf(cp, "drain=%d %c timer=%lld", 2243 sprintf(cp, "drain=%d %c timer=%lu",
2198 per_cpu(rcu_dyntick_drain, cpu), 2244 per_cpu(rcu_dyntick_drain, cpu),
2199 per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', 2245 per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.',
2200 hrtimer_active(hrtp) 2246 timer_pending(tltp) ? tltp->expires - jiffies : -1);
2201 ? ktime_to_us(hrtimer_get_remaining(hrtp))
2202 : -1);
2203} 2247}
2204 2248
2205#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 2249#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index ed459edeff4..d4bc16ddd1d 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -271,13 +271,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
271 271
272 gpnum = rsp->gpnum; 272 gpnum = rsp->gpnum;
273 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 273 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
274 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", 274 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
275 rsp->completed, gpnum, rsp->fqs_state, 275 rsp->completed, gpnum, rsp->fqs_state,
276 (long)(rsp->jiffies_force_qs - jiffies), 276 (long)(rsp->jiffies_force_qs - jiffies),
277 (int)(jiffies & 0xffff), 277 (int)(jiffies & 0xffff),
278 rsp->n_force_qs, rsp->n_force_qs_ngp, 278 rsp->n_force_qs, rsp->n_force_qs_ngp,
279 rsp->n_force_qs - rsp->n_force_qs_ngp, 279 rsp->n_force_qs - rsp->n_force_qs_ngp,
280 rsp->n_force_qs_lh); 280 rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen);
281 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 281 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
282 if (rnp->level != level) { 282 if (rnp->level != level) {
283 seq_puts(m, "\n"); 283 seq_puts(m, "\n");
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index d508363858b..bebe2b170d4 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -22,75 +22,70 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
22 counter->parent = parent; 22 counter->parent = parent;
23} 23}
24 24
25int res_counter_charge_locked(struct res_counter *counter, unsigned long val) 25int res_counter_charge_locked(struct res_counter *counter, unsigned long val,
26 bool force)
26{ 27{
28 int ret = 0;
29
27 if (counter->usage + val > counter->limit) { 30 if (counter->usage + val > counter->limit) {
28 counter->failcnt++; 31 counter->failcnt++;
29 return -ENOMEM; 32 ret = -ENOMEM;
33 if (!force)
34 return ret;
30 } 35 }
31 36
32 counter->usage += val; 37 counter->usage += val;
33 if (counter->usage > counter->max_usage) 38 if (counter->usage > counter->max_usage)
34 counter->max_usage = counter->usage; 39 counter->max_usage = counter->usage;
35 return 0; 40 return ret;
36} 41}
37 42
38int res_counter_charge(struct res_counter *counter, unsigned long val, 43static int __res_counter_charge(struct res_counter *counter, unsigned long val,
39 struct res_counter **limit_fail_at) 44 struct res_counter **limit_fail_at, bool force)
40{ 45{
41 int ret; 46 int ret, r;
42 unsigned long flags; 47 unsigned long flags;
43 struct res_counter *c, *u; 48 struct res_counter *c, *u;
44 49
50 r = ret = 0;
45 *limit_fail_at = NULL; 51 *limit_fail_at = NULL;
46 local_irq_save(flags); 52 local_irq_save(flags);
47 for (c = counter; c != NULL; c = c->parent) { 53 for (c = counter; c != NULL; c = c->parent) {
48 spin_lock(&c->lock); 54 spin_lock(&c->lock);
49 ret = res_counter_charge_locked(c, val); 55 r = res_counter_charge_locked(c, val, force);
50 spin_unlock(&c->lock); 56 spin_unlock(&c->lock);
51 if (ret < 0) { 57 if (r < 0 && !ret) {
58 ret = r;
52 *limit_fail_at = c; 59 *limit_fail_at = c;
53 goto undo; 60 if (!force)
61 break;
54 } 62 }
55 } 63 }
56 ret = 0; 64
57 goto done; 65 if (ret < 0 && !force) {
58undo: 66 for (u = counter; u != c; u = u->parent) {
59 for (u = counter; u != c; u = u->parent) { 67 spin_lock(&u->lock);
60 spin_lock(&u->lock); 68 res_counter_uncharge_locked(u, val);
61 res_counter_uncharge_locked(u, val); 69 spin_unlock(&u->lock);
62 spin_unlock(&u->lock); 70 }
63 } 71 }
64done:
65 local_irq_restore(flags); 72 local_irq_restore(flags);
73
66 return ret; 74 return ret;
67} 75}
68 76
77int res_counter_charge(struct res_counter *counter, unsigned long val,
78 struct res_counter **limit_fail_at)
79{
80 return __res_counter_charge(counter, val, limit_fail_at, false);
81}
82
69int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, 83int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
70 struct res_counter **limit_fail_at) 84 struct res_counter **limit_fail_at)
71{ 85{
72 int ret, r; 86 return __res_counter_charge(counter, val, limit_fail_at, true);
73 unsigned long flags;
74 struct res_counter *c;
75
76 r = ret = 0;
77 *limit_fail_at = NULL;
78 local_irq_save(flags);
79 for (c = counter; c != NULL; c = c->parent) {
80 spin_lock(&c->lock);
81 r = res_counter_charge_locked(c, val);
82 if (r)
83 c->usage += val;
84 spin_unlock(&c->lock);
85 if (r < 0 && ret == 0) {
86 *limit_fail_at = c;
87 ret = r;
88 }
89 }
90 local_irq_restore(flags);
91
92 return ret;
93} 87}
88
94void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) 89void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
95{ 90{
96 if (WARN_ON(counter->usage < val)) 91 if (WARN_ON(counter->usage < val))
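
The rewritten charge path above walks the counter hierarchy once, records the first failing level, and then either rolls back the levels already charged (normal charge) or keeps charging past the limit (nofail). A simplified userspace model of the same control flow, with locking and the kernel types omitted:

#include <errno.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct counter {
	unsigned long usage, limit, failcnt;
	struct counter *parent;
};

static int charge_one(struct counter *c, unsigned long val, bool force)
{
	int ret = 0;

	if (c->usage + val > c->limit) {
		c->failcnt++;
		ret = -ENOMEM;
		if (!force)
			return ret;       /* refuse the charge */
	}
	c->usage += val;                  /* forced charges may exceed the limit */
	return ret;
}

static int charge(struct counter *counter, unsigned long val, bool force)
{
	struct counter *c, *u;
	int ret = 0, r;

	for (c = counter; c != NULL; c = c->parent) {
		r = charge_one(c, val, force);
		if (r < 0 && !ret) {
			ret = r;
			if (!force)
				break;    /* stop at the first failing level */
		}
	}
	/* Non-forced failure: undo the levels that were already charged. */
	if (ret < 0 && !force)
		for (u = counter; u != c; u = u->parent)
			u->usage -= val;
	return ret;
}

int main(void)
{
	struct counter root = { .limit = 100 };
	struct counter child = { .limit = 50, .parent = &root };

	printf("charge 40: %d\n", charge(&child, 40, false));  /* succeeds */
	printf("charge 20: %d\n", charge(&child, 20, false));  /* fails, nothing left charged */
	printf("forced 20: %d\n", charge(&child, 20, true));   /* reports failure but charges anyway */
	printf("child usage=%lu root usage=%lu\n", child.usage, root.usage);
	return 0;
}
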
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a7dd35102a..173ea52f3af 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,5 +16,3 @@ obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o 17obj-$(CONFIG_SCHEDSTATS) += stats.o
18obj-$(CONFIG_SCHED_DEBUG) += debug.o 18obj-$(CONFIG_SCHED_DEBUG) += debug.o
19
20
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5c692a0a555..39eb6011bc3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -83,6 +83,7 @@
83 83
84#include "sched.h" 84#include "sched.h"
85#include "../workqueue_sched.h" 85#include "../workqueue_sched.h"
86#include "../smpboot.h"
86 87
87#define CREATE_TRACE_POINTS 88#define CREATE_TRACE_POINTS
88#include <trace/events/sched.h> 89#include <trace/events/sched.h>
@@ -692,8 +693,6 @@ int tg_nop(struct task_group *tg, void *data)
692} 693}
693#endif 694#endif
694 695
695void update_cpu_load(struct rq *this_rq);
696
697static void set_load_weight(struct task_struct *p) 696static void set_load_weight(struct task_struct *p)
698{ 697{
699 int prio = p->static_prio - MAX_RT_PRIO; 698 int prio = p->static_prio - MAX_RT_PRIO;
@@ -1913,7 +1912,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
1913 struct task_struct *next) 1912 struct task_struct *next)
1914{ 1913{
1915 sched_info_switch(prev, next); 1914 sched_info_switch(prev, next);
1916 perf_event_task_sched(prev, next); 1915 perf_event_task_sched_out(prev, next);
1917 fire_sched_out_preempt_notifiers(prev, next); 1916 fire_sched_out_preempt_notifiers(prev, next);
1918 prepare_lock_switch(rq, next); 1917 prepare_lock_switch(rq, next);
1919 prepare_arch_switch(next); 1918 prepare_arch_switch(next);
@@ -1956,6 +1955,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1956 */ 1955 */
1957 prev_state = prev->state; 1956 prev_state = prev->state;
1958 finish_arch_switch(prev); 1957 finish_arch_switch(prev);
1958#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1959 local_irq_disable();
1960#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1961 perf_event_task_sched_in(prev, current);
1962#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1963 local_irq_enable();
1964#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1959 finish_lock_switch(rq, prev); 1965 finish_lock_switch(rq, prev);
1960 finish_arch_post_lock_switch(); 1966 finish_arch_post_lock_switch();
1961 1967
@@ -2076,6 +2082,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2076#endif 2082#endif
2077 2083
2078 /* Here we just switch the register state and the stack. */ 2084 /* Here we just switch the register state and the stack. */
2085 rcu_switch_from(prev);
2079 switch_to(prev, next, prev); 2086 switch_to(prev, next, prev);
2080 2087
2081 barrier(); 2088 barrier();
@@ -2479,22 +2486,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2479 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2486 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
2480 * every tick. We fix it up based on jiffies. 2487 * every tick. We fix it up based on jiffies.
2481 */ 2488 */
2482void update_cpu_load(struct rq *this_rq) 2489static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2490 unsigned long pending_updates)
2483{ 2491{
2484 unsigned long this_load = this_rq->load.weight;
2485 unsigned long curr_jiffies = jiffies;
2486 unsigned long pending_updates;
2487 int i, scale; 2492 int i, scale;
2488 2493
2489 this_rq->nr_load_updates++; 2494 this_rq->nr_load_updates++;
2490 2495
2491 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
2492 if (curr_jiffies == this_rq->last_load_update_tick)
2493 return;
2494
2495 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2496 this_rq->last_load_update_tick = curr_jiffies;
2497
2498 /* Update our load: */ 2496 /* Update our load: */
2499 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ 2497 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
2500 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 2498 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
@@ -2519,9 +2517,45 @@ void update_cpu_load(struct rq *this_rq)
2519 sched_avg_update(this_rq); 2517 sched_avg_update(this_rq);
2520} 2518}
2521 2519
2520/*
2521 * Called from nohz_idle_balance() to update the load ratings before doing the
2522 * idle balance.
2523 */
2524void update_idle_cpu_load(struct rq *this_rq)
2525{
2526 unsigned long curr_jiffies = jiffies;
2527 unsigned long load = this_rq->load.weight;
2528 unsigned long pending_updates;
2529
2530 /*
2531 * Bloody broken means of dealing with nohz, but better than nothing..
2532 * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
2533 * update and see 0 difference the one time and 2 the next, even though
2534 * we ticked at roughly the same rate.
2535 *
2536 * Hence we only use this from nohz_idle_balance() and skip this
2537 * nonsense when called from the scheduler_tick() since that's
2538 * guaranteed a stable rate.
2539 */
2540 if (load || curr_jiffies == this_rq->last_load_update_tick)
2541 return;
2542
2543 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2544 this_rq->last_load_update_tick = curr_jiffies;
2545
2546 __update_cpu_load(this_rq, load, pending_updates);
2547}
2548
2549/*
2550 * Called from scheduler_tick()
2551 */
2522static void update_cpu_load_active(struct rq *this_rq) 2552static void update_cpu_load_active(struct rq *this_rq)
2523{ 2553{
2524 update_cpu_load(this_rq); 2554 /*
2555 * See the mess in update_idle_cpu_load().
2556 */
2557 this_rq->last_load_update_tick = jiffies;
2558 __update_cpu_load(this_rq, this_rq->load.weight, 1);
2525 2559
2526 calc_load_account_active(this_rq); 2560 calc_load_account_active(this_rq);
2527} 2561}
@@ -3106,6 +3140,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
3106 if (irqs_disabled()) 3140 if (irqs_disabled())
3107 print_irqtrace_events(prev); 3141 print_irqtrace_events(prev);
3108 dump_stack(); 3142 dump_stack();
3143 add_taint(TAINT_WARN);
3109} 3144}
3110 3145
3111/* 3146/*
@@ -4035,11 +4070,8 @@ static bool check_same_owner(struct task_struct *p)
4035 4070
4036 rcu_read_lock(); 4071 rcu_read_lock();
4037 pcred = __task_cred(p); 4072 pcred = __task_cred(p);
4038 if (cred->user->user_ns == pcred->user->user_ns) 4073 match = (uid_eq(cred->euid, pcred->euid) ||
4039 match = (cred->euid == pcred->euid || 4074 uid_eq(cred->euid, pcred->uid));
4040 cred->euid == pcred->uid);
4041 else
4042 match = false;
4043 rcu_read_unlock(); 4075 rcu_read_unlock();
4044 return match; 4076 return match;
4045} 4077}
@@ -5553,7 +5585,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5553 break; 5585 break;
5554 } 5586 }
5555 5587
5556 if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 5588 if (!(sd->flags & SD_OVERLAP) &&
5589 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5557 printk(KERN_CONT "\n"); 5590 printk(KERN_CONT "\n");
5558 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5591 printk(KERN_ERR "ERROR: repeated CPUs\n");
5559 break; 5592 break;
@@ -5891,99 +5924,11 @@ static int __init isolated_cpu_setup(char *str)
5891 5924
5892__setup("isolcpus=", isolated_cpu_setup); 5925__setup("isolcpus=", isolated_cpu_setup);
5893 5926
5894#ifdef CONFIG_NUMA
5895
5896/**
5897 * find_next_best_node - find the next node to include in a sched_domain
5898 * @node: node whose sched_domain we're building
5899 * @used_nodes: nodes already in the sched_domain
5900 *
5901 * Find the next node to include in a given scheduling domain. Simply
5902 * finds the closest node not already in the @used_nodes map.
5903 *
5904 * Should use nodemask_t.
5905 */
5906static int find_next_best_node(int node, nodemask_t *used_nodes)
5907{
5908 int i, n, val, min_val, best_node = -1;
5909
5910 min_val = INT_MAX;
5911
5912 for (i = 0; i < nr_node_ids; i++) {
5913 /* Start at @node */
5914 n = (node + i) % nr_node_ids;
5915
5916 if (!nr_cpus_node(n))
5917 continue;
5918
5919 /* Skip already used nodes */
5920 if (node_isset(n, *used_nodes))
5921 continue;
5922
5923 /* Simple min distance search */
5924 val = node_distance(node, n);
5925
5926 if (val < min_val) {
5927 min_val = val;
5928 best_node = n;
5929 }
5930 }
5931
5932 if (best_node != -1)
5933 node_set(best_node, *used_nodes);
5934 return best_node;
5935}
5936
5937/**
5938 * sched_domain_node_span - get a cpumask for a node's sched_domain
5939 * @node: node whose cpumask we're constructing
5940 * @span: resulting cpumask
5941 *
5942 * Given a node, construct a good cpumask for its sched_domain to span. It
5943 * should be one that prevents unnecessary balancing, but also spreads tasks
5944 * out optimally.
5945 */
5946static void sched_domain_node_span(int node, struct cpumask *span)
5947{
5948 nodemask_t used_nodes;
5949 int i;
5950
5951 cpumask_clear(span);
5952 nodes_clear(used_nodes);
5953
5954 cpumask_or(span, span, cpumask_of_node(node));
5955 node_set(node, used_nodes);
5956
5957 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5958 int next_node = find_next_best_node(node, &used_nodes);
5959 if (next_node < 0)
5960 break;
5961 cpumask_or(span, span, cpumask_of_node(next_node));
5962 }
5963}
5964
5965static const struct cpumask *cpu_node_mask(int cpu)
5966{
5967 lockdep_assert_held(&sched_domains_mutex);
5968
5969 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
5970
5971 return sched_domains_tmpmask;
5972}
5973
5974static const struct cpumask *cpu_allnodes_mask(int cpu)
5975{
5976 return cpu_possible_mask;
5977}
5978#endif /* CONFIG_NUMA */
5979
5980static const struct cpumask *cpu_cpu_mask(int cpu) 5927static const struct cpumask *cpu_cpu_mask(int cpu)
5981{ 5928{
5982 return cpumask_of_node(cpu_to_node(cpu)); 5929 return cpumask_of_node(cpu_to_node(cpu));
5983} 5930}
5984 5931
5985int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5986
5987struct sd_data { 5932struct sd_data {
5988 struct sched_domain **__percpu sd; 5933 struct sched_domain **__percpu sd;
5989 struct sched_group **__percpu sg; 5934 struct sched_group **__percpu sg;
@@ -6013,6 +5958,7 @@ struct sched_domain_topology_level {
6013 sched_domain_init_f init; 5958 sched_domain_init_f init;
6014 sched_domain_mask_f mask; 5959 sched_domain_mask_f mask;
6015 int flags; 5960 int flags;
5961 int numa_level;
6016 struct sd_data data; 5962 struct sd_data data;
6017}; 5963};
6018 5964
@@ -6204,10 +6150,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
6204} 6150}
6205 6151
6206SD_INIT_FUNC(CPU) 6152SD_INIT_FUNC(CPU)
6207#ifdef CONFIG_NUMA
6208 SD_INIT_FUNC(ALLNODES)
6209 SD_INIT_FUNC(NODE)
6210#endif
6211#ifdef CONFIG_SCHED_SMT 6153#ifdef CONFIG_SCHED_SMT
6212 SD_INIT_FUNC(SIBLING) 6154 SD_INIT_FUNC(SIBLING)
6213#endif 6155#endif
@@ -6329,15 +6271,184 @@ static struct sched_domain_topology_level default_topology[] = {
6329 { sd_init_BOOK, cpu_book_mask, }, 6271 { sd_init_BOOK, cpu_book_mask, },
6330#endif 6272#endif
6331 { sd_init_CPU, cpu_cpu_mask, }, 6273 { sd_init_CPU, cpu_cpu_mask, },
6332#ifdef CONFIG_NUMA
6333 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
6334 { sd_init_ALLNODES, cpu_allnodes_mask, },
6335#endif
6336 { NULL, }, 6274 { NULL, },
6337}; 6275};
6338 6276
6339static struct sched_domain_topology_level *sched_domain_topology = default_topology; 6277static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6340 6278
6279#ifdef CONFIG_NUMA
6280
6281static int sched_domains_numa_levels;
6282static int sched_domains_numa_scale;
6283static int *sched_domains_numa_distance;
6284static struct cpumask ***sched_domains_numa_masks;
6285static int sched_domains_curr_level;
6286
6287static inline int sd_local_flags(int level)
6288{
6289 if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
6290 return 0;
6291
6292 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
6293}
6294
6295static struct sched_domain *
6296sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
6297{
6298 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
6299 int level = tl->numa_level;
6300 int sd_weight = cpumask_weight(
6301 sched_domains_numa_masks[level][cpu_to_node(cpu)]);
6302
6303 *sd = (struct sched_domain){
6304 .min_interval = sd_weight,
6305 .max_interval = 2*sd_weight,
6306 .busy_factor = 32,
6307 .imbalance_pct = 125,
6308 .cache_nice_tries = 2,
6309 .busy_idx = 3,
6310 .idle_idx = 2,
6311 .newidle_idx = 0,
6312 .wake_idx = 0,
6313 .forkexec_idx = 0,
6314
6315 .flags = 1*SD_LOAD_BALANCE
6316 | 1*SD_BALANCE_NEWIDLE
6317 | 0*SD_BALANCE_EXEC
6318 | 0*SD_BALANCE_FORK
6319 | 0*SD_BALANCE_WAKE
6320 | 0*SD_WAKE_AFFINE
6321 | 0*SD_PREFER_LOCAL
6322 | 0*SD_SHARE_CPUPOWER
6323 | 0*SD_SHARE_PKG_RESOURCES
6324 | 1*SD_SERIALIZE
6325 | 0*SD_PREFER_SIBLING
6326 | sd_local_flags(level)
6327 ,
6328 .last_balance = jiffies,
6329 .balance_interval = sd_weight,
6330 };
6331 SD_INIT_NAME(sd, NUMA);
6332 sd->private = &tl->data;
6333
6334 /*
6335 * Ugly hack to pass state to sd_numa_mask()...
6336 */
6337 sched_domains_curr_level = tl->numa_level;
6338
6339 return sd;
6340}
6341
6342static const struct cpumask *sd_numa_mask(int cpu)
6343{
6344 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6345}
6346
6347static void sched_init_numa(void)
6348{
6349 int next_distance, curr_distance = node_distance(0, 0);
6350 struct sched_domain_topology_level *tl;
6351 int level = 0;
6352 int i, j, k;
6353
6354 sched_domains_numa_scale = curr_distance;
6355 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6356 if (!sched_domains_numa_distance)
6357 return;
6358
6359 /*
6360 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
6361 * unique distances in the node_distance() table.
6362 *
6363 * Assumes node_distance(0,j) includes all distances in
6364 * node_distance(i,j) in order to avoid cubic time.
6365 *
6366 * XXX: could be optimized to O(n log n) by using sort()
6367 */
6368 next_distance = curr_distance;
6369 for (i = 0; i < nr_node_ids; i++) {
6370 for (j = 0; j < nr_node_ids; j++) {
6371 int distance = node_distance(0, j);
6372 if (distance > curr_distance &&
6373 (distance < next_distance ||
6374 next_distance == curr_distance))
6375 next_distance = distance;
6376 }
6377 if (next_distance != curr_distance) {
6378 sched_domains_numa_distance[level++] = next_distance;
6379 sched_domains_numa_levels = level;
6380 curr_distance = next_distance;
6381 } else break;
6382 }
6383 /*
6384 * 'level' contains the number of unique distances, excluding the
6385 * identity distance node_distance(i,i).
6386 *
6387 * The sched_domains_numa_distance[] array includes the actual distance
6388 * numbers.
6389 */
6390
6391 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6392 if (!sched_domains_numa_masks)
6393 return;
6394
6395 /*
6396 * Now for each level, construct a mask per node which contains all
6397 * cpus of nodes that are that many hops away from us.
6398 */
6399 for (i = 0; i < level; i++) {
6400 sched_domains_numa_masks[i] =
6401 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6402 if (!sched_domains_numa_masks[i])
6403 return;
6404
6405 for (j = 0; j < nr_node_ids; j++) {
6406 struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
6407 if (!mask)
6408 return;
6409
6410 sched_domains_numa_masks[i][j] = mask;
6411
6412 for (k = 0; k < nr_node_ids; k++) {
6413 if (node_distance(j, k) > sched_domains_numa_distance[i])
6414 continue;
6415
6416 cpumask_or(mask, mask, cpumask_of_node(k));
6417 }
6418 }
6419 }
6420
6421 tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
6422 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6423 if (!tl)
6424 return;
6425
6426 /*
6427 * Copy the default topology bits..
6428 */
6429 for (i = 0; default_topology[i].init; i++)
6430 tl[i] = default_topology[i];
6431
6432 /*
6433 * .. and append 'j' levels of NUMA goodness.
6434 */
6435 for (j = 0; j < level; i++, j++) {
6436 tl[i] = (struct sched_domain_topology_level){
6437 .init = sd_numa_init,
6438 .mask = sd_numa_mask,
6439 .flags = SDTL_OVERLAP,
6440 .numa_level = j,
6441 };
6442 }
6443
6444 sched_domain_topology = tl;
6445}
6446#else
6447static inline void sched_init_numa(void)
6448{
6449}
6450#endif /* CONFIG_NUMA */
6451
6341static int __sdt_alloc(const struct cpumask *cpu_map) 6452static int __sdt_alloc(const struct cpumask *cpu_map)
6342{ 6453{
6343 struct sched_domain_topology_level *tl; 6454 struct sched_domain_topology_level *tl;
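
sched_init_numa() above first extracts the unique inter-node distances and then, for each distance level, builds the mask of nodes within that distance. A toy userspace version of both passes on an invented 4-node distance table (not the kernel's data structures):

#include <stdio.h>

#define N 4
static const int dist[N][N] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};

int main(void)
{
	int curr = dist[0][0], next, level = 0, i, j, k;
	int levels[N];

	/* Pass 1: collect the unique distances above the local one (row 0). */
	for (;;) {
		next = curr;
		for (j = 0; j < N; j++)
			if (dist[0][j] > curr &&
			    (dist[0][j] < next || next == curr))
				next = dist[0][j];
		if (next == curr)
			break;
		levels[level++] = next;
		curr = next;
	}

	/* Pass 2: per level, the mask of nodes within that distance of each node. */
	for (i = 0; i < level; i++)
		for (j = 0; j < N; j++) {
			unsigned mask = 0;

			for (k = 0; k < N; k++)
				if (dist[j][k] <= levels[i])
					mask |= 1u << k;
			printf("level %d (<=%d) node %d: mask 0x%x\n",
			       i, levels[i], j, mask);
		}
	return 0;
}
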
@@ -6375,6 +6486,8 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6375 if (!sg) 6486 if (!sg)
6376 return -ENOMEM; 6487 return -ENOMEM;
6377 6488
6489 sg->next = sg;
6490
6378 *per_cpu_ptr(sdd->sg, j) = sg; 6491 *per_cpu_ptr(sdd->sg, j) = sg;
6379 6492
6380 sgp = kzalloc_node(sizeof(struct sched_group_power), 6493 sgp = kzalloc_node(sizeof(struct sched_group_power),
@@ -6398,16 +6511,26 @@ static void __sdt_free(const struct cpumask *cpu_map)
6398 struct sd_data *sdd = &tl->data; 6511 struct sd_data *sdd = &tl->data;
6399 6512
6400 for_each_cpu(j, cpu_map) { 6513 for_each_cpu(j, cpu_map) {
6401 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); 6514 struct sched_domain *sd;
6402 if (sd && (sd->flags & SD_OVERLAP)) 6515
6403 free_sched_groups(sd->groups, 0); 6516 if (sdd->sd) {
6404 kfree(*per_cpu_ptr(sdd->sd, j)); 6517 sd = *per_cpu_ptr(sdd->sd, j);
6405 kfree(*per_cpu_ptr(sdd->sg, j)); 6518 if (sd && (sd->flags & SD_OVERLAP))
6406 kfree(*per_cpu_ptr(sdd->sgp, j)); 6519 free_sched_groups(sd->groups, 0);
6520 kfree(*per_cpu_ptr(sdd->sd, j));
6521 }
6522
6523 if (sdd->sg)
6524 kfree(*per_cpu_ptr(sdd->sg, j));
6525 if (sdd->sgp)
6526 kfree(*per_cpu_ptr(sdd->sgp, j));
6407 } 6527 }
6408 free_percpu(sdd->sd); 6528 free_percpu(sdd->sd);
6529 sdd->sd = NULL;
6409 free_percpu(sdd->sg); 6530 free_percpu(sdd->sg);
6531 sdd->sg = NULL;
6410 free_percpu(sdd->sgp); 6532 free_percpu(sdd->sgp);
6533 sdd->sgp = NULL;
6411 } 6534 }
6412} 6535}
6413 6536
@@ -6693,97 +6816,6 @@ match2:
6693 mutex_unlock(&sched_domains_mutex); 6816 mutex_unlock(&sched_domains_mutex);
6694} 6817}
6695 6818
6696#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6697static void reinit_sched_domains(void)
6698{
6699 get_online_cpus();
6700
6701 /* Destroy domains first to force the rebuild */
6702 partition_sched_domains(0, NULL, NULL);
6703
6704 rebuild_sched_domains();
6705 put_online_cpus();
6706}
6707
6708static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6709{
6710 unsigned int level = 0;
6711
6712 if (sscanf(buf, "%u", &level) != 1)
6713 return -EINVAL;
6714
6715 /*
6716 * level is always positive so don't check for
6717 * level < POWERSAVINGS_BALANCE_NONE which is 0
6718 * What happens on 0 or 1 byte write,
6719 * need to check for count as well?
6720 */
6721
6722 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
6723 return -EINVAL;
6724
6725 if (smt)
6726 sched_smt_power_savings = level;
6727 else
6728 sched_mc_power_savings = level;
6729
6730 reinit_sched_domains();
6731
6732 return count;
6733}
6734
6735#ifdef CONFIG_SCHED_MC
6736static ssize_t sched_mc_power_savings_show(struct device *dev,
6737 struct device_attribute *attr,
6738 char *buf)
6739{
6740 return sprintf(buf, "%u\n", sched_mc_power_savings);
6741}
6742static ssize_t sched_mc_power_savings_store(struct device *dev,
6743 struct device_attribute *attr,
6744 const char *buf, size_t count)
6745{
6746 return sched_power_savings_store(buf, count, 0);
6747}
6748static DEVICE_ATTR(sched_mc_power_savings, 0644,
6749 sched_mc_power_savings_show,
6750 sched_mc_power_savings_store);
6751#endif
6752
6753#ifdef CONFIG_SCHED_SMT
6754static ssize_t sched_smt_power_savings_show(struct device *dev,
6755 struct device_attribute *attr,
6756 char *buf)
6757{
6758 return sprintf(buf, "%u\n", sched_smt_power_savings);
6759}
6760static ssize_t sched_smt_power_savings_store(struct device *dev,
6761 struct device_attribute *attr,
6762 const char *buf, size_t count)
6763{
6764 return sched_power_savings_store(buf, count, 1);
6765}
6766static DEVICE_ATTR(sched_smt_power_savings, 0644,
6767 sched_smt_power_savings_show,
6768 sched_smt_power_savings_store);
6769#endif
6770
6771int __init sched_create_sysfs_power_savings_entries(struct device *dev)
6772{
6773 int err = 0;
6774
6775#ifdef CONFIG_SCHED_SMT
6776 if (smt_capable())
6777 err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
6778#endif
6779#ifdef CONFIG_SCHED_MC
6780 if (!err && mc_capable())
6781 err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
6782#endif
6783 return err;
6784}
6785#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
6786
6787/* 6819/*
6788 * Update cpusets according to cpu_active mask. If cpusets are 6820 * Update cpusets according to cpu_active mask. If cpusets are
6789 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 6821 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
@@ -6821,6 +6853,8 @@ void __init sched_init_smp(void)
6821 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 6853 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6822 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 6854 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6823 6855
6856 sched_init_numa();
6857
6824 get_online_cpus(); 6858 get_online_cpus();
6825 mutex_lock(&sched_domains_mutex); 6859 mutex_lock(&sched_domains_mutex);
6826 init_sched_domains(cpu_active_mask); 6860 init_sched_domains(cpu_active_mask);
@@ -7042,6 +7076,7 @@ void __init sched_init(void)
7042 /* May be allocated at isolcpus cmdline parse time */ 7076 /* May be allocated at isolcpus cmdline parse time */
7043 if (cpu_isolated_map == NULL) 7077 if (cpu_isolated_map == NULL)
7044 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7078 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
7079 idle_thread_set_boot_cpu();
7045#endif 7080#endif
7046 init_sched_fair_class(); 7081 init_sched_fair_class();
7047 7082
@@ -7963,13 +7998,9 @@ static struct cftype cpu_files[] = {
7963 .write_u64 = cpu_rt_period_write_uint, 7998 .write_u64 = cpu_rt_period_write_uint,
7964 }, 7999 },
7965#endif 8000#endif
8001 { } /* terminate */
7966}; 8002};
7967 8003
7968static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
7969{
7970 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
7971}
7972
7973struct cgroup_subsys cpu_cgroup_subsys = { 8004struct cgroup_subsys cpu_cgroup_subsys = {
7974 .name = "cpu", 8005 .name = "cpu",
7975 .create = cpu_cgroup_create, 8006 .create = cpu_cgroup_create,
@@ -7977,8 +8008,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7977 .can_attach = cpu_cgroup_can_attach, 8008 .can_attach = cpu_cgroup_can_attach,
7978 .attach = cpu_cgroup_attach, 8009 .attach = cpu_cgroup_attach,
7979 .exit = cpu_cgroup_exit, 8010 .exit = cpu_cgroup_exit,
7980 .populate = cpu_cgroup_populate,
7981 .subsys_id = cpu_cgroup_subsys_id, 8011 .subsys_id = cpu_cgroup_subsys_id,
8012 .base_cftypes = cpu_files,
7982 .early_init = 1, 8013 .early_init = 1,
7983}; 8014};
7984 8015
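
The cpu cgroup subsystem now hands the core a sentinel-terminated cftype array through .base_cftypes instead of registering files from a .populate() callback. A generic sketch of iterating such a { }-terminated array, with invented names rather than the real cgroup API:

#include <stdio.h>

struct cftype_like {
	const char *name;               /* zeroed entry acts as the terminator */
};

static void add_files(const struct cftype_like *cft)
{
	for (; cft->name; cft++)
		printf("registering file: %s\n", cft->name);
}

int main(void)
{
	static const struct cftype_like cpu_files_like[] = {
		{ .name = "shares" },
		{ .name = "rt_runtime_us" },
		{ }                     /* terminate, as in the diff above */
	};

	add_files(cpu_files_like);
	return 0;
}
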
@@ -8163,13 +8194,9 @@ static struct cftype files[] = {
8163 .name = "stat", 8194 .name = "stat",
8164 .read_map = cpuacct_stats_show, 8195 .read_map = cpuacct_stats_show,
8165 }, 8196 },
8197 { } /* terminate */
8166}; 8198};
8167 8199
8168static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
8169{
8170 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
8171}
8172
8173/* 8200/*
8174 * charge this task's execution time to its accounting group. 8201 * charge this task's execution time to its accounting group.
8175 * 8202 *
@@ -8201,7 +8228,7 @@ struct cgroup_subsys cpuacct_subsys = {
8201 .name = "cpuacct", 8228 .name = "cpuacct",
8202 .create = cpuacct_create, 8229 .create = cpuacct_create,
8203 .destroy = cpuacct_destroy, 8230 .destroy = cpuacct_destroy,
8204 .populate = cpuacct_populate,
8205 .subsys_id = cpuacct_subsys_id, 8231 .subsys_id = cpuacct_subsys_id,
8232 .base_cftypes = files,
8206}; 8233};
8207#endif /* CONFIG_CGROUP_CPUACCT */ 8234#endif /* CONFIG_CGROUP_CPUACCT */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 09acaa15161..6f79596e0ea 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -202,7 +202,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
202 SPLIT_NS(spread0)); 202 SPLIT_NS(spread0));
203 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 203 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
204 cfs_rq->nr_spread_over); 204 cfs_rq->nr_spread_over);
205 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); 205 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207#ifdef CONFIG_FAIR_GROUP_SCHED 207#ifdef CONFIG_FAIR_GROUP_SCHED
208#ifdef CONFIG_SMP 208#ifdef CONFIG_SMP
@@ -260,8 +260,14 @@ static void print_cpu(struct seq_file *m, int cpu)
260 SEQ_printf(m, "\ncpu#%d\n", cpu); 260 SEQ_printf(m, "\ncpu#%d\n", cpu);
261#endif 261#endif
262 262
263#define P(x) \ 263#define P(x) \
264 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) 264do { \
265 if (sizeof(rq->x) == 4) \
266 SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
267 else \
268 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
269} while (0)
270
265#define PN(x) \ 271#define PN(x) \
266 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) 272 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
267 273
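
The reworked P() macro above picks the format string from the field's size so a 32-bit rq member is never pushed through a 64-bit conversion. The same dispatch in a stand-alone program (struct and field names invented; %lld used in place of the kernel's %Ld):

#include <stdio.h>

struct rq_like {
	unsigned int nr_running;        /* 4 bytes */
	unsigned long long clock;       /* 8 bytes */
};

#define P(rq, x)							\
do {									\
	if (sizeof((rq)->x) == 4)					\
		printf("  .%-30s: %ld\n", #x, (long)(rq)->x);		\
	else								\
		printf("  .%-30s: %lld\n", #x, (long long)(rq)->x);	\
} while (0)

int main(void)
{
	struct rq_like rq = { .nr_running = 3, .clock = 123456789ULL };

	P(&rq, nr_running);
	P(&rq, clock);
	return 0;
}
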
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0d97ebdc58f..940e6d17cf9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -784,7 +784,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
784 update_load_add(&rq_of(cfs_rq)->load, se->load.weight); 784 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
785#ifdef CONFIG_SMP 785#ifdef CONFIG_SMP
786 if (entity_is_task(se)) 786 if (entity_is_task(se))
787 list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); 787 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
788#endif 788#endif
789 cfs_rq->nr_running++; 789 cfs_rq->nr_running++;
790} 790}
@@ -2721,7 +2721,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2721 * If power savings logic is enabled for a domain, see if we 2721 * If power savings logic is enabled for a domain, see if we
2722 * are not overloaded, if so, don't balance wider. 2722 * are not overloaded, if so, don't balance wider.
2723 */ 2723 */
2724 if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { 2724 if (tmp->flags & (SD_PREFER_LOCAL)) {
2725 unsigned long power = 0; 2725 unsigned long power = 0;
2726 unsigned long nr_running = 0; 2726 unsigned long nr_running = 0;
2727 unsigned long capacity; 2727 unsigned long capacity;
@@ -2734,9 +2734,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2734 2734
2735 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); 2735 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
2736 2736
2737 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2738 nr_running /= 2;
2739
2740 if (nr_running < capacity) 2737 if (nr_running < capacity)
2741 want_sd = 0; 2738 want_sd = 0;
2742 } 2739 }
@@ -3082,7 +3079,7 @@ struct lb_env {
3082 struct rq *dst_rq; 3079 struct rq *dst_rq;
3083 3080
3084 enum cpu_idle_type idle; 3081 enum cpu_idle_type idle;
3085 long load_move; 3082 long imbalance;
3086 unsigned int flags; 3083 unsigned int flags;
3087 3084
3088 unsigned int loop; 3085 unsigned int loop;
@@ -3215,8 +3212,10 @@ static int move_one_task(struct lb_env *env)
3215 3212
3216static unsigned long task_h_load(struct task_struct *p); 3213static unsigned long task_h_load(struct task_struct *p);
3217 3214
3215static const unsigned int sched_nr_migrate_break = 32;
3216
3218/* 3217/*
3219 * move_tasks tries to move up to load_move weighted load from busiest to 3218 * move_tasks tries to move up to imbalance weighted load from busiest to
3220 * this_rq, as part of a balancing operation within domain "sd". 3219 * this_rq, as part of a balancing operation within domain "sd".
3221 * Returns 1 if successful and 0 otherwise. 3220 * Returns 1 if successful and 0 otherwise.
3222 * 3221 *
@@ -3229,7 +3228,7 @@ static int move_tasks(struct lb_env *env)
3229 unsigned long load; 3228 unsigned long load;
3230 int pulled = 0; 3229 int pulled = 0;
3231 3230
3232 if (env->load_move <= 0) 3231 if (env->imbalance <= 0)
3233 return 0; 3232 return 0;
3234 3233
3235 while (!list_empty(tasks)) { 3234 while (!list_empty(tasks)) {
@@ -3242,7 +3241,7 @@ static int move_tasks(struct lb_env *env)
3242 3241
3243 /* take a breather every nr_migrate tasks */ 3242 /* take a breather every nr_migrate tasks */
3244 if (env->loop > env->loop_break) { 3243 if (env->loop > env->loop_break) {
3245 env->loop_break += sysctl_sched_nr_migrate; 3244 env->loop_break += sched_nr_migrate_break;
3246 env->flags |= LBF_NEED_BREAK; 3245 env->flags |= LBF_NEED_BREAK;
3247 break; 3246 break;
3248 } 3247 }
@@ -3252,10 +3251,10 @@ static int move_tasks(struct lb_env *env)
3252 3251
3253 load = task_h_load(p); 3252 load = task_h_load(p);
3254 3253
3255 if (load < 16 && !env->sd->nr_balance_failed) 3254 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
3256 goto next; 3255 goto next;
3257 3256
3258 if ((load / 2) > env->load_move) 3257 if ((load / 2) > env->imbalance)
3259 goto next; 3258 goto next;
3260 3259
3261 if (!can_migrate_task(p, env)) 3260 if (!can_migrate_task(p, env))
@@ -3263,7 +3262,7 @@ static int move_tasks(struct lb_env *env)
3263 3262
3264 move_task(p, env); 3263 move_task(p, env);
3265 pulled++; 3264 pulled++;
3266 env->load_move -= load; 3265 env->imbalance -= load;
3267 3266
3268#ifdef CONFIG_PREEMPT 3267#ifdef CONFIG_PREEMPT
3269 /* 3268 /*
@@ -3279,7 +3278,7 @@ static int move_tasks(struct lb_env *env)
3279 * We only want to steal up to the prescribed amount of 3278 * We only want to steal up to the prescribed amount of
3280 * weighted load. 3279 * weighted load.
3281 */ 3280 */
3282 if (env->load_move <= 0) 3281 if (env->imbalance <= 0)
3283 break; 3282 break;
3284 3283
3285 continue; 3284 continue;
@@ -3433,14 +3432,6 @@ struct sd_lb_stats {
3433 unsigned int busiest_group_weight; 3432 unsigned int busiest_group_weight;
3434 3433
3435 int group_imb; /* Is there imbalance in this sd */ 3434 int group_imb; /* Is there imbalance in this sd */
3436#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3437 int power_savings_balance; /* Is powersave balance needed for this sd */
3438 struct sched_group *group_min; /* Least loaded group in sd */
3439 struct sched_group *group_leader; /* Group which relieves group_min */
3440 unsigned long min_load_per_task; /* load_per_task in group_min */
3441 unsigned long leader_nr_running; /* Nr running of group_leader */
3442 unsigned long min_nr_running; /* Nr running of group_min */
3443#endif
3444}; 3435};
3445 3436
3446/* 3437/*
@@ -3484,148 +3475,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
3484 return load_idx; 3475 return load_idx;
3485} 3476}
3486 3477
3487
3488#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3489/**
3490 * init_sd_power_savings_stats - Initialize power savings statistics for
3491 * the given sched_domain, during load balancing.
3492 *
3493 * @sd: Sched domain whose power-savings statistics are to be initialized.
3494 * @sds: Variable containing the statistics for sd.
3495 * @idle: Idle status of the CPU at which we're performing load-balancing.
3496 */
3497static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3498 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3499{
3500 /*
3501 * Busy processors will not participate in power savings
3502 * balance.
3503 */
3504 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3505 sds->power_savings_balance = 0;
3506 else {
3507 sds->power_savings_balance = 1;
3508 sds->min_nr_running = ULONG_MAX;
3509 sds->leader_nr_running = 0;
3510 }
3511}
3512
3513/**
3514 * update_sd_power_savings_stats - Update the power saving stats for a
3515 * sched_domain while performing load balancing.
3516 *
3517 * @group: sched_group belonging to the sched_domain under consideration.
3518 * @sds: Variable containing the statistics of the sched_domain
3519 * @local_group: Does group contain the CPU for which we're performing
3520 * load balancing ?
3521 * @sgs: Variable containing the statistics of the group.
3522 */
3523static inline void update_sd_power_savings_stats(struct sched_group *group,
3524 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3525{
3526
3527 if (!sds->power_savings_balance)
3528 return;
3529
3530 /*
3531 * If the local group is idle or completely loaded
3532 * no need to do power savings balance at this domain
3533 */
3534 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3535 !sds->this_nr_running))
3536 sds->power_savings_balance = 0;
3537
3538 /*
3539 * If a group is already running at full capacity or idle,
3540 * don't include that group in power savings calculations
3541 */
3542 if (!sds->power_savings_balance ||
3543 sgs->sum_nr_running >= sgs->group_capacity ||
3544 !sgs->sum_nr_running)
3545 return;
3546
3547 /*
3548 * Calculate the group which has the least non-idle load.
3549 * This is the group from where we need to pick up the load
3550 * for saving power
3551 */
3552 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3553 (sgs->sum_nr_running == sds->min_nr_running &&
3554 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3555 sds->group_min = group;
3556 sds->min_nr_running = sgs->sum_nr_running;
3557 sds->min_load_per_task = sgs->sum_weighted_load /
3558 sgs->sum_nr_running;
3559 }
3560
3561 /*
3562 * Calculate the group which is almost near its
3563 * capacity but still has some space to pick up some load
3564 * from other group and save more power
3565 */
3566 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3567 return;
3568
3569 if (sgs->sum_nr_running > sds->leader_nr_running ||
3570 (sgs->sum_nr_running == sds->leader_nr_running &&
3571 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3572 sds->group_leader = group;
3573 sds->leader_nr_running = sgs->sum_nr_running;
3574 }
3575}
3576
3577/**
3578 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3579 * @sds: Variable containing the statistics of the sched_domain
3580 * under consideration.
3581 * @this_cpu: Cpu at which we're currently performing load-balancing.
3582 * @imbalance: Variable to store the imbalance.
3583 *
3584 * Description:
3585 * Check if we have potential to perform some power-savings balance.
3586 * If yes, set the busiest group to be the least loaded group in the
3587 * sched_domain, so that its CPUs can be put to idle.
3588 *
3589 * Returns 1 if there is potential to perform power-savings balance.
3590 * Else returns 0.
3591 */
3592static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3593 int this_cpu, unsigned long *imbalance)
3594{
3595 if (!sds->power_savings_balance)
3596 return 0;
3597
3598 if (sds->this != sds->group_leader ||
3599 sds->group_leader == sds->group_min)
3600 return 0;
3601
3602 *imbalance = sds->min_load_per_task;
3603 sds->busiest = sds->group_min;
3604
3605 return 1;
3606
3607}
3608#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3609static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3610 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3611{
3612 return;
3613}
3614
3615static inline void update_sd_power_savings_stats(struct sched_group *group,
3616 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3617{
3618 return;
3619}
3620
3621static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3622 int this_cpu, unsigned long *imbalance)
3623{
3624 return 0;
3625}
3626#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3627
3628
3629unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 3478unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3630{ 3479{
3631 return SCHED_POWER_SCALE; 3480 return SCHED_POWER_SCALE;
@@ -3763,24 +3612,22 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
3763 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3612 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3764 * @sd: The sched_domain whose statistics are to be updated. 3613 * @sd: The sched_domain whose statistics are to be updated.
3765 * @group: sched_group whose statistics are to be updated. 3614 * @group: sched_group whose statistics are to be updated.
3766 * @this_cpu: Cpu for which load balance is currently performed.
3767 * @idle: Idle status of this_cpu
3768 * @load_idx: Load index of sched_domain of this_cpu for load calc. 3615 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3769 * @local_group: Does group contain this_cpu. 3616 * @local_group: Does group contain this_cpu.
3770 * @cpus: Set of cpus considered for load balancing. 3617 * @cpus: Set of cpus considered for load balancing.
3771 * @balance: Should we balance. 3618 * @balance: Should we balance.
3772 * @sgs: variable to hold the statistics for this group. 3619 * @sgs: variable to hold the statistics for this group.
3773 */ 3620 */
3774static inline void update_sg_lb_stats(struct sched_domain *sd, 3621static inline void update_sg_lb_stats(struct lb_env *env,
3775 struct sched_group *group, int this_cpu, 3622 struct sched_group *group, int load_idx,
3776 enum cpu_idle_type idle, int load_idx,
3777 int local_group, const struct cpumask *cpus, 3623 int local_group, const struct cpumask *cpus,
3778 int *balance, struct sg_lb_stats *sgs) 3624 int *balance, struct sg_lb_stats *sgs)
3779{ 3625{
3780 unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; 3626 unsigned long nr_running, max_nr_running, min_nr_running;
3781 int i; 3627 unsigned long load, max_cpu_load, min_cpu_load;
3782 unsigned int balance_cpu = -1, first_idle_cpu = 0; 3628 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3783 unsigned long avg_load_per_task = 0; 3629 unsigned long avg_load_per_task = 0;
3630 int i;
3784 3631
3785 if (local_group) 3632 if (local_group)
3786 balance_cpu = group_first_cpu(group); 3633 balance_cpu = group_first_cpu(group);
@@ -3789,10 +3636,13 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3789 max_cpu_load = 0; 3636 max_cpu_load = 0;
3790 min_cpu_load = ~0UL; 3637 min_cpu_load = ~0UL;
3791 max_nr_running = 0; 3638 max_nr_running = 0;
3639 min_nr_running = ~0UL;
3792 3640
3793 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 3641 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3794 struct rq *rq = cpu_rq(i); 3642 struct rq *rq = cpu_rq(i);
3795 3643
3644 nr_running = rq->nr_running;
3645
3796 /* Bias balancing toward cpus of our domain */ 3646 /* Bias balancing toward cpus of our domain */
3797 if (local_group) { 3647 if (local_group) {
3798 if (idle_cpu(i) && !first_idle_cpu) { 3648 if (idle_cpu(i) && !first_idle_cpu) {
@@ -3803,16 +3653,19 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3803 load = target_load(i, load_idx); 3653 load = target_load(i, load_idx);
3804 } else { 3654 } else {
3805 load = source_load(i, load_idx); 3655 load = source_load(i, load_idx);
3806 if (load > max_cpu_load) { 3656 if (load > max_cpu_load)
3807 max_cpu_load = load; 3657 max_cpu_load = load;
3808 max_nr_running = rq->nr_running;
3809 }
3810 if (min_cpu_load > load) 3658 if (min_cpu_load > load)
3811 min_cpu_load = load; 3659 min_cpu_load = load;
3660
3661 if (nr_running > max_nr_running)
3662 max_nr_running = nr_running;
3663 if (min_nr_running > nr_running)
3664 min_nr_running = nr_running;
3812 } 3665 }
3813 3666
3814 sgs->group_load += load; 3667 sgs->group_load += load;
3815 sgs->sum_nr_running += rq->nr_running; 3668 sgs->sum_nr_running += nr_running;
3816 sgs->sum_weighted_load += weighted_cpuload(i); 3669 sgs->sum_weighted_load += weighted_cpuload(i);
3817 if (idle_cpu(i)) 3670 if (idle_cpu(i))
3818 sgs->idle_cpus++; 3671 sgs->idle_cpus++;
@@ -3825,14 +3678,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3825 * to do the newly idle load balance. 3678 * to do the newly idle load balance.
3826 */ 3679 */
3827 if (local_group) { 3680 if (local_group) {
3828 if (idle != CPU_NEWLY_IDLE) { 3681 if (env->idle != CPU_NEWLY_IDLE) {
3829 if (balance_cpu != this_cpu) { 3682 if (balance_cpu != env->dst_cpu) {
3830 *balance = 0; 3683 *balance = 0;
3831 return; 3684 return;
3832 } 3685 }
3833 update_group_power(sd, this_cpu); 3686 update_group_power(env->sd, env->dst_cpu);
3834 } else if (time_after_eq(jiffies, group->sgp->next_update)) 3687 } else if (time_after_eq(jiffies, group->sgp->next_update))
3835 update_group_power(sd, this_cpu); 3688 update_group_power(env->sd, env->dst_cpu);
3836 } 3689 }
3837 3690
3838 /* Adjust by relative CPU power of the group */ 3691 /* Adjust by relative CPU power of the group */
@@ -3850,13 +3703,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3850 if (sgs->sum_nr_running) 3703 if (sgs->sum_nr_running)
3851 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 3704 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
3852 3705
3853 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) 3706 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
3707 (max_nr_running - min_nr_running) > 1)
3854 sgs->group_imb = 1; 3708 sgs->group_imb = 1;
3855 3709
3856 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, 3710 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
3857 SCHED_POWER_SCALE); 3711 SCHED_POWER_SCALE);
3858 if (!sgs->group_capacity) 3712 if (!sgs->group_capacity)
3859 sgs->group_capacity = fix_small_capacity(sd, group); 3713 sgs->group_capacity = fix_small_capacity(env->sd, group);
3860 sgs->group_weight = group->group_weight; 3714 sgs->group_weight = group->group_weight;
3861 3715
3862 if (sgs->group_capacity > sgs->sum_nr_running) 3716 if (sgs->group_capacity > sgs->sum_nr_running)
@@ -3874,11 +3728,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3874 * Determine if @sg is a busier group than the previously selected 3728 * Determine if @sg is a busier group than the previously selected
3875 * busiest group. 3729 * busiest group.
3876 */ 3730 */
3877static bool update_sd_pick_busiest(struct sched_domain *sd, 3731static bool update_sd_pick_busiest(struct lb_env *env,
3878 struct sd_lb_stats *sds, 3732 struct sd_lb_stats *sds,
3879 struct sched_group *sg, 3733 struct sched_group *sg,
3880 struct sg_lb_stats *sgs, 3734 struct sg_lb_stats *sgs)
3881 int this_cpu)
3882{ 3735{
3883 if (sgs->avg_load <= sds->max_load) 3736 if (sgs->avg_load <= sds->max_load)
3884 return false; 3737 return false;
@@ -3894,8 +3747,8 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
3894 * numbered CPUs in the group, therefore mark all groups 3747 * numbered CPUs in the group, therefore mark all groups
3895 * higher than ourself as busy. 3748 * higher than ourself as busy.
3896 */ 3749 */
3897 if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && 3750 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
3898 this_cpu < group_first_cpu(sg)) { 3751 env->dst_cpu < group_first_cpu(sg)) {
3899 if (!sds->busiest) 3752 if (!sds->busiest)
3900 return true; 3753 return true;
3901 3754
@@ -3915,28 +3768,27 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
3915 * @balance: Should we balance. 3768 * @balance: Should we balance.
3916 * @sds: variable to hold the statistics for this sched_domain. 3769 * @sds: variable to hold the statistics for this sched_domain.
3917 */ 3770 */
3918static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, 3771static inline void update_sd_lb_stats(struct lb_env *env,
3919 enum cpu_idle_type idle, const struct cpumask *cpus, 3772 const struct cpumask *cpus,
3920 int *balance, struct sd_lb_stats *sds) 3773 int *balance, struct sd_lb_stats *sds)
3921{ 3774{
3922 struct sched_domain *child = sd->child; 3775 struct sched_domain *child = env->sd->child;
3923 struct sched_group *sg = sd->groups; 3776 struct sched_group *sg = env->sd->groups;
3924 struct sg_lb_stats sgs; 3777 struct sg_lb_stats sgs;
3925 int load_idx, prefer_sibling = 0; 3778 int load_idx, prefer_sibling = 0;
3926 3779
3927 if (child && child->flags & SD_PREFER_SIBLING) 3780 if (child && child->flags & SD_PREFER_SIBLING)
3928 prefer_sibling = 1; 3781 prefer_sibling = 1;
3929 3782
3930 init_sd_power_savings_stats(sd, sds, idle); 3783 load_idx = get_sd_load_idx(env->sd, env->idle);
3931 load_idx = get_sd_load_idx(sd, idle);
3932 3784
3933 do { 3785 do {
3934 int local_group; 3786 int local_group;
3935 3787
3936 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); 3788 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
3937 memset(&sgs, 0, sizeof(sgs)); 3789 memset(&sgs, 0, sizeof(sgs));
3938 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, 3790 update_sg_lb_stats(env, sg, load_idx, local_group,
3939 local_group, cpus, balance, &sgs); 3791 cpus, balance, &sgs);
3940 3792
3941 if (local_group && !(*balance)) 3793 if (local_group && !(*balance))
3942 return; 3794 return;
@@ -3964,7 +3816,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3964 sds->this_load_per_task = sgs.sum_weighted_load; 3816 sds->this_load_per_task = sgs.sum_weighted_load;
3965 sds->this_has_capacity = sgs.group_has_capacity; 3817 sds->this_has_capacity = sgs.group_has_capacity;
3966 sds->this_idle_cpus = sgs.idle_cpus; 3818 sds->this_idle_cpus = sgs.idle_cpus;
3967 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 3819 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
3968 sds->max_load = sgs.avg_load; 3820 sds->max_load = sgs.avg_load;
3969 sds->busiest = sg; 3821 sds->busiest = sg;
3970 sds->busiest_nr_running = sgs.sum_nr_running; 3822 sds->busiest_nr_running = sgs.sum_nr_running;
@@ -3976,9 +3828,8 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3976 sds->group_imb = sgs.group_imb; 3828 sds->group_imb = sgs.group_imb;
3977 } 3829 }
3978 3830
3979 update_sd_power_savings_stats(sg, sds, local_group, &sgs);
3980 sg = sg->next; 3831 sg = sg->next;
3981 } while (sg != sd->groups); 3832 } while (sg != env->sd->groups);
3982} 3833}
3983 3834
3984/** 3835/**
@@ -4006,24 +3857,23 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
4006 * @this_cpu: The cpu at whose sched_domain we're performing load-balance. 3857 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
4007 * @imbalance: returns amount of imbalance due to packing. 3858 * @imbalance: returns amount of imbalance due to packing.
4008 */ 3859 */
4009static int check_asym_packing(struct sched_domain *sd, 3860static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
4010 struct sd_lb_stats *sds,
4011 int this_cpu, unsigned long *imbalance)
4012{ 3861{
4013 int busiest_cpu; 3862 int busiest_cpu;
4014 3863
4015 if (!(sd->flags & SD_ASYM_PACKING)) 3864 if (!(env->sd->flags & SD_ASYM_PACKING))
4016 return 0; 3865 return 0;
4017 3866
4018 if (!sds->busiest) 3867 if (!sds->busiest)
4019 return 0; 3868 return 0;
4020 3869
4021 busiest_cpu = group_first_cpu(sds->busiest); 3870 busiest_cpu = group_first_cpu(sds->busiest);
4022 if (this_cpu > busiest_cpu) 3871 if (env->dst_cpu > busiest_cpu)
4023 return 0; 3872 return 0;
4024 3873
4025 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, 3874 env->imbalance = DIV_ROUND_CLOSEST(
4026 SCHED_POWER_SCALE); 3875 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
3876
4027 return 1; 3877 return 1;
4028} 3878}
4029 3879
@@ -4035,8 +3885,8 @@ static int check_asym_packing(struct sched_domain *sd,
4035 * @this_cpu: The cpu at whose sched_domain we're performing load-balance. 3885 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
4036 * @imbalance: Variable to store the imbalance. 3886 * @imbalance: Variable to store the imbalance.
4037 */ 3887 */
4038static inline void fix_small_imbalance(struct sd_lb_stats *sds, 3888static inline
4039 int this_cpu, unsigned long *imbalance) 3889void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4040{ 3890{
4041 unsigned long tmp, pwr_now = 0, pwr_move = 0; 3891 unsigned long tmp, pwr_now = 0, pwr_move = 0;
4042 unsigned int imbn = 2; 3892 unsigned int imbn = 2;
@@ -4047,9 +3897,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4047 if (sds->busiest_load_per_task > 3897 if (sds->busiest_load_per_task >
4048 sds->this_load_per_task) 3898 sds->this_load_per_task)
4049 imbn = 1; 3899 imbn = 1;
4050 } else 3900 } else {
4051 sds->this_load_per_task = 3901 sds->this_load_per_task =
4052 cpu_avg_load_per_task(this_cpu); 3902 cpu_avg_load_per_task(env->dst_cpu);
3903 }
4053 3904
4054 scaled_busy_load_per_task = sds->busiest_load_per_task 3905 scaled_busy_load_per_task = sds->busiest_load_per_task
4055 * SCHED_POWER_SCALE; 3906 * SCHED_POWER_SCALE;
@@ -4057,7 +3908,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4057 3908
4058 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= 3909 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
4059 (scaled_busy_load_per_task * imbn)) { 3910 (scaled_busy_load_per_task * imbn)) {
4060 *imbalance = sds->busiest_load_per_task; 3911 env->imbalance = sds->busiest_load_per_task;
4061 return; 3912 return;
4062 } 3913 }
4063 3914
@@ -4094,18 +3945,16 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4094 3945
4095 /* Move if we gain throughput */ 3946 /* Move if we gain throughput */
4096 if (pwr_move > pwr_now) 3947 if (pwr_move > pwr_now)
4097 *imbalance = sds->busiest_load_per_task; 3948 env->imbalance = sds->busiest_load_per_task;
4098} 3949}
4099 3950
4100/** 3951/**
4101 * calculate_imbalance - Calculate the amount of imbalance present within the 3952 * calculate_imbalance - Calculate the amount of imbalance present within the
4102 * groups of a given sched_domain during load balance. 3953 * groups of a given sched_domain during load balance.
3954 * @env: load balance environment
4103 * @sds: statistics of the sched_domain whose imbalance is to be calculated. 3955 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
4104 * @this_cpu: Cpu for which currently load balance is being performed.
4105 * @imbalance: The variable to store the imbalance.
4106 */ 3956 */
4107static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, 3957static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4108 unsigned long *imbalance)
4109{ 3958{
4110 unsigned long max_pull, load_above_capacity = ~0UL; 3959 unsigned long max_pull, load_above_capacity = ~0UL;
4111 3960
@@ -4121,8 +3970,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4121 * its cpu_power, while calculating max_load..) 3970 * its cpu_power, while calculating max_load..)
4122 */ 3971 */
4123 if (sds->max_load < sds->avg_load) { 3972 if (sds->max_load < sds->avg_load) {
4124 *imbalance = 0; 3973 env->imbalance = 0;
4125 return fix_small_imbalance(sds, this_cpu, imbalance); 3974 return fix_small_imbalance(env, sds);
4126 } 3975 }
4127 3976
4128 if (!sds->group_imb) { 3977 if (!sds->group_imb) {
@@ -4150,7 +3999,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4150 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); 3999 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
4151 4000
4152 /* How much load to actually move to equalise the imbalance */ 4001 /* How much load to actually move to equalise the imbalance */
4153 *imbalance = min(max_pull * sds->busiest->sgp->power, 4002 env->imbalance = min(max_pull * sds->busiest->sgp->power,
4154 (sds->avg_load - sds->this_load) * sds->this->sgp->power) 4003 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
4155 / SCHED_POWER_SCALE; 4004 / SCHED_POWER_SCALE;
4156 4005
@@ -4160,8 +4009,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4160 * a think about bumping its value to force at least one task to be 4009 * a think about bumping its value to force at least one task to be
4161 * moved 4010 * moved
4162 */ 4011 */
4163 if (*imbalance < sds->busiest_load_per_task) 4012 if (env->imbalance < sds->busiest_load_per_task)
4164 return fix_small_imbalance(sds, this_cpu, imbalance); 4013 return fix_small_imbalance(env, sds);
4165 4014
4166} 4015}
4167 4016
@@ -4192,9 +4041,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4192 * put to idle by rebalancing its tasks onto our group. 4041 * put to idle by rebalancing its tasks onto our group.
4193 */ 4042 */
4194static struct sched_group * 4043static struct sched_group *
4195find_busiest_group(struct sched_domain *sd, int this_cpu, 4044find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
4196 unsigned long *imbalance, enum cpu_idle_type idle,
4197 const struct cpumask *cpus, int *balance)
4198{ 4045{
4199 struct sd_lb_stats sds; 4046 struct sd_lb_stats sds;
4200 4047
@@ -4204,7 +4051,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4204 * Compute the various statistics relevant for load balancing at 4051 * Compute the various statistics relevant for load balancing at
4205 * this level. 4052 * this level.
4206 */ 4053 */
4207 update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); 4054 update_sd_lb_stats(env, cpus, balance, &sds);
4208 4055
4209 /* 4056 /*
4210 * this_cpu is not the appropriate cpu to perform load balancing at 4057 * this_cpu is not the appropriate cpu to perform load balancing at
@@ -4213,8 +4060,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4213 if (!(*balance)) 4060 if (!(*balance))
4214 goto ret; 4061 goto ret;
4215 4062
4216 if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && 4063 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4217 check_asym_packing(sd, &sds, this_cpu, imbalance)) 4064 check_asym_packing(env, &sds))
4218 return sds.busiest; 4065 return sds.busiest;
4219 4066
4220 /* There is no busy sibling group to pull tasks from */ 4067 /* There is no busy sibling group to pull tasks from */
@@ -4232,7 +4079,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4232 goto force_balance; 4079 goto force_balance;
4233 4080
4234 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 4081 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
4235 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && 4082 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
4236 !sds.busiest_has_capacity) 4083 !sds.busiest_has_capacity)
4237 goto force_balance; 4084 goto force_balance;
4238 4085
@@ -4250,7 +4097,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4250 if (sds.this_load >= sds.avg_load) 4097 if (sds.this_load >= sds.avg_load)
4251 goto out_balanced; 4098 goto out_balanced;
4252 4099
4253 if (idle == CPU_IDLE) { 4100 if (env->idle == CPU_IDLE) {
4254 /* 4101 /*
4255 * This cpu is idle. If the busiest group load doesn't 4102 * This cpu is idle. If the busiest group load doesn't
4256 * have more tasks than the number of available cpu's and 4103 * have more tasks than the number of available cpu's and
@@ -4265,34 +4112,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4265 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use 4112 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4266 * imbalance_pct to be conservative. 4113 * imbalance_pct to be conservative.
4267 */ 4114 */
4268 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 4115 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
4269 goto out_balanced; 4116 goto out_balanced;
4270 } 4117 }
4271 4118
4272force_balance: 4119force_balance:
4273 /* Looks like there is an imbalance. Compute it */ 4120 /* Looks like there is an imbalance. Compute it */
4274 calculate_imbalance(&sds, this_cpu, imbalance); 4121 calculate_imbalance(env, &sds);
4275 return sds.busiest; 4122 return sds.busiest;
4276 4123
4277out_balanced: 4124out_balanced:
4278 /*
4279 * There is no obvious imbalance. But check if we can do some balancing
4280 * to save power.
4281 */
4282 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4283 return sds.busiest;
4284ret: 4125ret:
4285 *imbalance = 0; 4126 env->imbalance = 0;
4286 return NULL; 4127 return NULL;
4287} 4128}
4288 4129
4289/* 4130/*
4290 * find_busiest_queue - find the busiest runqueue among the cpus in group. 4131 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4291 */ 4132 */
4292static struct rq * 4133static struct rq *find_busiest_queue(struct lb_env *env,
4293find_busiest_queue(struct sched_domain *sd, struct sched_group *group, 4134 struct sched_group *group,
4294 enum cpu_idle_type idle, unsigned long imbalance, 4135 const struct cpumask *cpus)
4295 const struct cpumask *cpus)
4296{ 4136{
4297 struct rq *busiest = NULL, *rq; 4137 struct rq *busiest = NULL, *rq;
4298 unsigned long max_load = 0; 4138 unsigned long max_load = 0;
@@ -4305,7 +4145,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4305 unsigned long wl; 4145 unsigned long wl;
4306 4146
4307 if (!capacity) 4147 if (!capacity)
4308 capacity = fix_small_capacity(sd, group); 4148 capacity = fix_small_capacity(env->sd, group);
4309 4149
4310 if (!cpumask_test_cpu(i, cpus)) 4150 if (!cpumask_test_cpu(i, cpus))
4311 continue; 4151 continue;
@@ -4317,7 +4157,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4317 * When comparing with imbalance, use weighted_cpuload() 4157 * When comparing with imbalance, use weighted_cpuload()
4318 * which is not scaled with the cpu power. 4158 * which is not scaled with the cpu power.
4319 */ 4159 */
4320 if (capacity && rq->nr_running == 1 && wl > imbalance) 4160 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
4321 continue; 4161 continue;
4322 4162
4323 /* 4163 /*
@@ -4346,40 +4186,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4346/* Working cpumask for load_balance and load_balance_newidle. */ 4186/* Working cpumask for load_balance and load_balance_newidle. */
4347DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4187DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4348 4188
4349static int need_active_balance(struct sched_domain *sd, int idle, 4189static int need_active_balance(struct lb_env *env)
4350 int busiest_cpu, int this_cpu)
4351{ 4190{
4352 if (idle == CPU_NEWLY_IDLE) { 4191 struct sched_domain *sd = env->sd;
4192
4193 if (env->idle == CPU_NEWLY_IDLE) {
4353 4194
4354 /* 4195 /*
4355 * ASYM_PACKING needs to force migrate tasks from busy but 4196 * ASYM_PACKING needs to force migrate tasks from busy but
4356 * higher numbered CPUs in order to pack all tasks in the 4197 * higher numbered CPUs in order to pack all tasks in the
4357 * lowest numbered CPUs. 4198 * lowest numbered CPUs.
4358 */ 4199 */
4359 if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) 4200 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
4360 return 1; 4201 return 1;
4361
4362 /*
4363 * The only task running in a non-idle cpu can be moved to this
4364 * cpu in an attempt to completely freeup the other CPU
4365 * package.
4366 *
4367 * The package power saving logic comes from
4368 * find_busiest_group(). If there are no imbalance, then
4369 * f_b_g() will return NULL. However when sched_mc={1,2} then
4370 * f_b_g() will select a group from which a running task may be
4371 * pulled to this cpu in order to make the other package idle.
4372 * If there is no opportunity to make a package idle and if
4373 * there are no imbalance, then f_b_g() will return NULL and no
4374 * action will be taken in load_balance_newidle().
4375 *
4376 * Under normal task pull operation due to imbalance, there
4377 * will be more than one task in the source run queue and
4378 * move_tasks() will succeed. ld_moved will be true and this
4379 * active balance code will not be triggered.
4380 */
4381 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4382 return 0;
4383 } 4202 }
4384 4203
4385 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); 4204 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
@@ -4397,7 +4216,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4397{ 4216{
4398 int ld_moved, active_balance = 0; 4217 int ld_moved, active_balance = 0;
4399 struct sched_group *group; 4218 struct sched_group *group;
4400 unsigned long imbalance;
4401 struct rq *busiest; 4219 struct rq *busiest;
4402 unsigned long flags; 4220 unsigned long flags;
4403 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4221 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
@@ -4407,7 +4225,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4407 .dst_cpu = this_cpu, 4225 .dst_cpu = this_cpu,
4408 .dst_rq = this_rq, 4226 .dst_rq = this_rq,
4409 .idle = idle, 4227 .idle = idle,
4410 .loop_break = sysctl_sched_nr_migrate, 4228 .loop_break = sched_nr_migrate_break,
4411 }; 4229 };
4412 4230
4413 cpumask_copy(cpus, cpu_active_mask); 4231 cpumask_copy(cpus, cpu_active_mask);
@@ -4415,8 +4233,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4415 schedstat_inc(sd, lb_count[idle]); 4233 schedstat_inc(sd, lb_count[idle]);
4416 4234
4417redo: 4235redo:
4418 group = find_busiest_group(sd, this_cpu, &imbalance, idle, 4236 group = find_busiest_group(&env, cpus, balance);
4419 cpus, balance);
4420 4237
4421 if (*balance == 0) 4238 if (*balance == 0)
4422 goto out_balanced; 4239 goto out_balanced;
@@ -4426,7 +4243,7 @@ redo:
4426 goto out_balanced; 4243 goto out_balanced;
4427 } 4244 }
4428 4245
4429 busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); 4246 busiest = find_busiest_queue(&env, group, cpus);
4430 if (!busiest) { 4247 if (!busiest) {
4431 schedstat_inc(sd, lb_nobusyq[idle]); 4248 schedstat_inc(sd, lb_nobusyq[idle]);
4432 goto out_balanced; 4249 goto out_balanced;
@@ -4434,7 +4251,7 @@ redo:
4434 4251
4435 BUG_ON(busiest == this_rq); 4252 BUG_ON(busiest == this_rq);
4436 4253
4437 schedstat_add(sd, lb_imbalance[idle], imbalance); 4254 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
4438 4255
4439 ld_moved = 0; 4256 ld_moved = 0;
4440 if (busiest->nr_running > 1) { 4257 if (busiest->nr_running > 1) {
@@ -4445,10 +4262,9 @@ redo:
4445 * correctly treated as an imbalance. 4262 * correctly treated as an imbalance.
4446 */ 4263 */
4447 env.flags |= LBF_ALL_PINNED; 4264 env.flags |= LBF_ALL_PINNED;
4448 env.load_move = imbalance; 4265 env.src_cpu = busiest->cpu;
4449 env.src_cpu = busiest->cpu; 4266 env.src_rq = busiest;
4450 env.src_rq = busiest; 4267 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
4451 env.loop_max = busiest->nr_running;
4452 4268
4453more_balance: 4269more_balance:
4454 local_irq_save(flags); 4270 local_irq_save(flags);
@@ -4490,7 +4306,7 @@ more_balance:
4490 if (idle != CPU_NEWLY_IDLE) 4306 if (idle != CPU_NEWLY_IDLE)
4491 sd->nr_balance_failed++; 4307 sd->nr_balance_failed++;
4492 4308
4493 if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { 4309 if (need_active_balance(&env)) {
4494 raw_spin_lock_irqsave(&busiest->lock, flags); 4310 raw_spin_lock_irqsave(&busiest->lock, flags);
4495 4311
4496 /* don't kick the active_load_balance_cpu_stop, 4312 /* don't kick the active_load_balance_cpu_stop,
@@ -4517,10 +4333,11 @@ more_balance:
4517 } 4333 }
4518 raw_spin_unlock_irqrestore(&busiest->lock, flags); 4334 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4519 4335
4520 if (active_balance) 4336 if (active_balance) {
4521 stop_one_cpu_nowait(cpu_of(busiest), 4337 stop_one_cpu_nowait(cpu_of(busiest),
4522 active_load_balance_cpu_stop, busiest, 4338 active_load_balance_cpu_stop, busiest,
4523 &busiest->active_balance_work); 4339 &busiest->active_balance_work);
4340 }
4524 4341
4525 /* 4342 /*
4526 * We've kicked active balancing, reset the failure 4343 * We've kicked active balancing, reset the failure
@@ -4701,104 +4518,15 @@ static struct {
4701 unsigned long next_balance; /* in jiffy units */ 4518 unsigned long next_balance; /* in jiffy units */
4702} nohz ____cacheline_aligned; 4519} nohz ____cacheline_aligned;
4703 4520
4704#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 4521static inline int find_new_ilb(int call_cpu)
4705/**
4706 * lowest_flag_domain - Return lowest sched_domain containing flag.
4707 * @cpu: The cpu whose lowest level of sched domain is to
4708 * be returned.
4709 * @flag: The flag to check for the lowest sched_domain
4710 * for the given cpu.
4711 *
4712 * Returns the lowest sched_domain of a cpu which contains the given flag.
4713 */
4714static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4715{
4716 struct sched_domain *sd;
4717
4718 for_each_domain(cpu, sd)
4719 if (sd->flags & flag)
4720 break;
4721
4722 return sd;
4723}
4724
4725/**
4726 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4727 * @cpu: The cpu whose domains we're iterating over.
4728 * @sd: variable holding the value of the power_savings_sd
4729 * for cpu.
4730 * @flag: The flag to filter the sched_domains to be iterated.
4731 *
4732 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4733 * set, starting from the lowest sched_domain to the highest.
4734 */
4735#define for_each_flag_domain(cpu, sd, flag) \
4736 for (sd = lowest_flag_domain(cpu, flag); \
4737 (sd && (sd->flags & flag)); sd = sd->parent)
4738
4739/**
4740 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4741 * @cpu: The cpu which is nominating a new idle_load_balancer.
4742 *
4743 * Returns: Returns the id of the idle load balancer if it exists,
4744 * Else, returns >= nr_cpu_ids.
4745 *
4746 * This algorithm picks the idle load balancer such that it belongs to a
4747 * semi-idle powersavings sched_domain. The idea is to try and avoid
4748 * completely idle packages/cores just for the purpose of idle load balancing
4749 * when there are other idle cpu's which are better suited for that job.
4750 */
4751static int find_new_ilb(int cpu)
4752{ 4522{
4753 int ilb = cpumask_first(nohz.idle_cpus_mask); 4523 int ilb = cpumask_first(nohz.idle_cpus_mask);
4754 struct sched_group *ilbg;
4755 struct sched_domain *sd;
4756
4757 /*
4758 * Have idle load balancer selection from semi-idle packages only
4759 * when power-aware load balancing is enabled
4760 */
4761 if (!(sched_smt_power_savings || sched_mc_power_savings))
4762 goto out_done;
4763
4764 /*
4765 * Optimize for the case when we have no idle CPUs or only one
4766 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4767 */
4768 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
4769 goto out_done;
4770
4771 rcu_read_lock();
4772 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4773 ilbg = sd->groups;
4774
4775 do {
4776 if (ilbg->group_weight !=
4777 atomic_read(&ilbg->sgp->nr_busy_cpus)) {
4778 ilb = cpumask_first_and(nohz.idle_cpus_mask,
4779 sched_group_cpus(ilbg));
4780 goto unlock;
4781 }
4782
4783 ilbg = ilbg->next;
4784
4785 } while (ilbg != sd->groups);
4786 }
4787unlock:
4788 rcu_read_unlock();
4789 4524
4790out_done:
4791 if (ilb < nr_cpu_ids && idle_cpu(ilb)) 4525 if (ilb < nr_cpu_ids && idle_cpu(ilb))
4792 return ilb; 4526 return ilb;
4793 4527
4794 return nr_cpu_ids; 4528 return nr_cpu_ids;
4795} 4529}
4796#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4797static inline int find_new_ilb(int call_cpu)
4798{
4799 return nr_cpu_ids;
4800}
4801#endif
4802 4530
4803/* 4531/*
4804 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the 4532 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
@@ -5021,7 +4749,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
5021 4749
5022 raw_spin_lock_irq(&this_rq->lock); 4750 raw_spin_lock_irq(&this_rq->lock);
5023 update_rq_clock(this_rq); 4751 update_rq_clock(this_rq);
5024 update_cpu_load(this_rq); 4752 update_idle_cpu_load(this_rq);
5025 raw_spin_unlock_irq(&this_rq->lock); 4753 raw_spin_unlock_irq(&this_rq->lock);
5026 4754
5027 rebalance_domains(balance_cpu, CPU_IDLE); 4755 rebalance_domains(balance_cpu, CPU_IDLE);
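The fair.c hunks above are largely one mechanical refactor: parameters that previously threaded through find_busiest_group(), find_busiest_queue(), update_sd_lb_stats(), update_sg_lb_stats(), check_asym_packing(), calculate_imbalance() and need_active_balance() as separate arguments (sd, this_cpu, idle, *imbalance) now travel in the single struct lb_env that load_balance() fills in once, while the power-savings balancing paths (init_sd_power_savings_stats(), check_power_save_busiest_group(), the SD_POWERSAVINGS_BALANCE idle-load-balancer selection) are removed. A condensed sketch of the structure these helpers now share, abridged to the members visible in these hunks and with types paraphrased (the authoritative definition appears earlier in this fair.c diff):

        struct lb_env {
                struct sched_domain     *sd;        /* domain being balanced           */
                int                     src_cpu;    /* busiest cpu (pull source)       */
                struct rq               *src_rq;
                int                     dst_cpu;    /* this cpu (pull destination)     */
                struct rq               *dst_rq;
                enum cpu_idle_type      idle;       /* CPU_IDLE, CPU_NEWLY_IDLE, ...   */
                unsigned long           imbalance;  /* replaces the *imbalance out-arg */
                unsigned int            flags;      /* LBF_ALL_PINNED etc.             */
                unsigned int            loop, loop_break, loop_max;
        };

Helpers such as check_asym_packing(env, &sds) then read env->dst_cpu and write env->imbalance instead of taking this_cpu and an unsigned long *imbalance, which accounts for most of the churn in this file.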
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index e61fd73913d..de00a486c5c 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -68,3 +68,4 @@ SCHED_FEAT(TTWU_QUEUE, true)
68 68
69SCHED_FEAT(FORCE_SD_OVERLAP, false) 69SCHED_FEAT(FORCE_SD_OVERLAP, false)
70SCHED_FEAT(RT_RUNTIME_SHARE, true) 70SCHED_FEAT(RT_RUNTIME_SHARE, true)
71SCHED_FEAT(LB_MIN, false)
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 91b4c957f28..b44d604b35d 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -4,7 +4,7 @@
4 * idle-task scheduling class. 4 * idle-task scheduling class.
5 * 5 *
6 * (NOTE: these are not related to SCHED_IDLE tasks which are 6 * (NOTE: these are not related to SCHED_IDLE tasks which are
7 * handled in sched_fair.c) 7 * handled in sched/fair.c)
8 */ 8 */
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 44af55e6d5d..c5565c3c515 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1803,44 +1803,40 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1803static void set_cpus_allowed_rt(struct task_struct *p, 1803static void set_cpus_allowed_rt(struct task_struct *p,
1804 const struct cpumask *new_mask) 1804 const struct cpumask *new_mask)
1805{ 1805{
1806 int weight = cpumask_weight(new_mask); 1806 struct rq *rq;
1807 int weight;
1807 1808
1808 BUG_ON(!rt_task(p)); 1809 BUG_ON(!rt_task(p));
1809 1810
1810 /* 1811 if (!p->on_rq)
1811 * Update the migration status of the RQ if we have an RT task 1812 return;
1812 * which is running AND changing its weight value.
1813 */
1814 if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
1815 struct rq *rq = task_rq(p);
1816
1817 if (!task_current(rq, p)) {
1818 /*
1819 * Make sure we dequeue this task from the pushable list
1820 * before going further. It will either remain off of
1821 * the list because we are no longer pushable, or it
1822 * will be requeued.
1823 */
1824 if (p->rt.nr_cpus_allowed > 1)
1825 dequeue_pushable_task(rq, p);
1826 1813
1827 /* 1814 weight = cpumask_weight(new_mask);
1828 * Requeue if our weight is changing and still > 1
1829 */
1830 if (weight > 1)
1831 enqueue_pushable_task(rq, p);
1832 1815
1833 } 1816 /*
1817 * Only update if the process changes its state from whether it
1818 * can migrate or not.
1819 */
1820 if ((p->rt.nr_cpus_allowed > 1) == (weight > 1))
1821 return;
1834 1822
1835 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { 1823 rq = task_rq(p);
1836 rq->rt.rt_nr_migratory++;
1837 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
1838 BUG_ON(!rq->rt.rt_nr_migratory);
1839 rq->rt.rt_nr_migratory--;
1840 }
1841 1824
1842 update_rt_migration(&rq->rt); 1825 /*
1826 * The process used to be able to migrate OR it can now migrate
1827 */
1828 if (weight <= 1) {
1829 if (!task_current(rq, p))
1830 dequeue_pushable_task(rq, p);
1831 BUG_ON(!rq->rt.rt_nr_migratory);
1832 rq->rt.rt_nr_migratory--;
1833 } else {
1834 if (!task_current(rq, p))
1835 enqueue_pushable_task(rq, p);
1836 rq->rt.rt_nr_migratory++;
1843 } 1837 }
1838
1839 update_rt_migration(&rq->rt);
1844} 1840}
1845 1841
1846/* Assumes rq->lock is held */ 1842/* Assumes rq->lock is held */
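The rewritten set_cpus_allowed_rt() above only does accounting when a task actually crosses the migratory boundary, i.e. when the weight of its affinity mask moves between 1 and greater than 1; every other affinity change returns early without touching rq->rt.rt_nr_migratory or the pushable list. A small illustration of that predicate, using made-up masks rather than anything from the patch:

        /*
         * Same side of the boundary -> early return, no accounting:
         *   {0-3} -> {0-7}   (4 > 1) == (8 > 1)   still migratory
         *   {2}   -> {3}     (1 > 1) == (1 > 1)   still pinned
         * Boundary crossed -> update rt_nr_migratory and the pushable list:
         *   {2}   -> {2,3}   pinned -> migratory   rt_nr_migratory++
         *   {2,3} -> {3}     migratory -> pinned   rt_nr_migratory--
         */
        if ((p->rt.nr_cpus_allowed > 1) == (weight > 1))
                return;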
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fb3acba4d52..ba9dccfd24c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -201,7 +201,7 @@ struct cfs_bandwidth { };
201/* CFS-related fields in a runqueue */ 201/* CFS-related fields in a runqueue */
202struct cfs_rq { 202struct cfs_rq {
203 struct load_weight load; 203 struct load_weight load;
204 unsigned long nr_running, h_nr_running; 204 unsigned int nr_running, h_nr_running;
205 205
206 u64 exec_clock; 206 u64 exec_clock;
207 u64 min_vruntime; 207 u64 min_vruntime;
@@ -279,7 +279,7 @@ static inline int rt_bandwidth_enabled(void)
279/* Real-Time classes' related field in a runqueue: */ 279/* Real-Time classes' related field in a runqueue: */
280struct rt_rq { 280struct rt_rq {
281 struct rt_prio_array active; 281 struct rt_prio_array active;
282 unsigned long rt_nr_running; 282 unsigned int rt_nr_running;
283#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 283#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
284 struct { 284 struct {
285 int curr; /* highest queued rt task prio */ 285 int curr; /* highest queued rt task prio */
@@ -353,7 +353,7 @@ struct rq {
353 * nr_running and cpu_load should be in the same cacheline because 353 * nr_running and cpu_load should be in the same cacheline because
354 * remote CPUs use both these fields when doing load calculation. 354 * remote CPUs use both these fields when doing load calculation.
355 */ 355 */
356 unsigned long nr_running; 356 unsigned int nr_running;
357 #define CPU_LOAD_IDX_MAX 5 357 #define CPU_LOAD_IDX_MAX 5
358 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 358 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
359 unsigned long last_load_update_tick; 359 unsigned long last_load_update_tick;
@@ -876,7 +876,7 @@ extern void resched_cpu(int cpu);
876extern struct rt_bandwidth def_rt_bandwidth; 876extern struct rt_bandwidth def_rt_bandwidth;
877extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); 877extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
878 878
879extern void update_cpu_load(struct rq *this_rq); 879extern void update_idle_cpu_load(struct rq *this_rq);
880 880
881#ifdef CONFIG_CGROUP_CPUACCT 881#ifdef CONFIG_CGROUP_CPUACCT
882#include <linux/cgroup.h> 882#include <linux/cgroup.h>
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e8d76c5895e..ee376beedaf 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -3,16 +3,357 @@
3 * 3 *
4 * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> 4 * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
5 * 5 *
6 * This defines a simple but solid secure-computing mode. 6 * Copyright (C) 2012 Google, Inc.
7 * Will Drewry <wad@chromium.org>
8 *
9 * This defines a simple but solid secure-computing facility.
10 *
11 * Mode 1 uses a fixed list of allowed system calls.
12 * Mode 2 allows user-defined system call filters in the form
13 * of Berkeley Packet Filters/Linux Socket Filters.
7 */ 14 */
8 15
16#include <linux/atomic.h>
9#include <linux/audit.h> 17#include <linux/audit.h>
10#include <linux/seccomp.h>
11#include <linux/sched.h>
12#include <linux/compat.h> 18#include <linux/compat.h>
19#include <linux/sched.h>
20#include <linux/seccomp.h>
13 21
14/* #define SECCOMP_DEBUG 1 */ 22/* #define SECCOMP_DEBUG 1 */
15#define NR_SECCOMP_MODES 1 23
24#ifdef CONFIG_SECCOMP_FILTER
25#include <asm/syscall.h>
26#include <linux/filter.h>
27#include <linux/ptrace.h>
28#include <linux/security.h>
29#include <linux/slab.h>
30#include <linux/tracehook.h>
31#include <linux/uaccess.h>
32
33/**
34 * struct seccomp_filter - container for seccomp BPF programs
35 *
36 * @usage: reference count to manage the object lifetime.
37 * get/put helpers should be used when accessing an instance
38 * outside of a lifetime-guarded section. In general, this
39 * is only needed for handling filters shared across tasks.
40 * @prev: points to a previously installed, or inherited, filter
41 * @len: the number of instructions in the program
42 * @insns: the BPF program instructions to evaluate
43 *
44 * seccomp_filter objects are organized in a tree linked via the @prev
45 * pointer. For any task, it appears to be a singly-linked list starting
46 * with current->seccomp.filter, the most recently attached or inherited filter.
47 * However, multiple filters may share a @prev node, by way of fork(), which
48 * results in a unidirectional tree existing in memory. This is similar to
49 * how namespaces work.
50 *
51 * seccomp_filter objects should never be modified after being attached
52 * to a task_struct (other than @usage).
53 */
54struct seccomp_filter {
55 atomic_t usage;
56 struct seccomp_filter *prev;
57 unsigned short len; /* Instruction count */
58 struct sock_filter insns[];
59};
60
61/* Limit any path through the tree to 256KB worth of instructions. */
62#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
63
64/**
65 * get_u32 - returns a u32 offset into data
66 * @data: an unsigned 64-bit value
67 * @index: 0 or 1 to return the first or second 32-bits
68 *
69 * This inline exists to hide the length of unsigned long. If a 32-bit
70 * unsigned long is passed in, it will be extended and the top 32-bits will be
71 * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
72 * properly returned.
73 *
74 * Endianness is explicitly ignored and left for BPF program authors to manage
75 * as per the specific architecture.
76 */
77static inline u32 get_u32(u64 data, int index)
78{
79 return ((u32 *)&data)[index];
80}
81
82/* Helper for bpf_load below. */
83#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
84/**
85 * bpf_load: checks and returns a pointer to the requested offset
86 * @off: offset into struct seccomp_data to load from
87 *
88 * Returns the requested 32-bits of data.
89 * seccomp_check_filter() should assure that @off is 32-bit aligned
90 * and not out of bounds. Failure to do so is a BUG.
91 */
92u32 seccomp_bpf_load(int off)
93{
94 struct pt_regs *regs = task_pt_regs(current);
95 if (off == BPF_DATA(nr))
96 return syscall_get_nr(current, regs);
97 if (off == BPF_DATA(arch))
98 return syscall_get_arch(current, regs);
99 if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
100 unsigned long value;
101 int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
102 int index = !!(off % sizeof(u64));
103 syscall_get_arguments(current, regs, arg, 1, &value);
104 return get_u32(value, index);
105 }
106 if (off == BPF_DATA(instruction_pointer))
107 return get_u32(KSTK_EIP(current), 0);
108 if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
109 return get_u32(KSTK_EIP(current), 1);
110 /* seccomp_check_filter should make this impossible. */
111 BUG();
112}
113
114/**
115 * seccomp_check_filter - verify seccomp filter code
116 * @filter: filter to verify
117 * @flen: length of filter
118 *
119 * Takes a previously checked filter (by sk_chk_filter) and
120 * redirects all filter code that loads struct sk_buff data
121 * and related data through seccomp_bpf_load. It also
122 * enforces length and alignment checking of those loads.
123 *
124 * Returns 0 if the rule set is legal or -EINVAL if not.
125 */
126static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
127{
128 int pc;
129 for (pc = 0; pc < flen; pc++) {
130 struct sock_filter *ftest = &filter[pc];
131 u16 code = ftest->code;
132 u32 k = ftest->k;
133
134 switch (code) {
135 case BPF_S_LD_W_ABS:
136 ftest->code = BPF_S_ANC_SECCOMP_LD_W;
137 /* 32-bit aligned and not out of bounds. */
138 if (k >= sizeof(struct seccomp_data) || k & 3)
139 return -EINVAL;
140 continue;
141 case BPF_S_LD_W_LEN:
142 ftest->code = BPF_S_LD_IMM;
143 ftest->k = sizeof(struct seccomp_data);
144 continue;
145 case BPF_S_LDX_W_LEN:
146 ftest->code = BPF_S_LDX_IMM;
147 ftest->k = sizeof(struct seccomp_data);
148 continue;
149 /* Explicitly include allowed calls. */
150 case BPF_S_RET_K:
151 case BPF_S_RET_A:
152 case BPF_S_ALU_ADD_K:
153 case BPF_S_ALU_ADD_X:
154 case BPF_S_ALU_SUB_K:
155 case BPF_S_ALU_SUB_X:
156 case BPF_S_ALU_MUL_K:
157 case BPF_S_ALU_MUL_X:
158 case BPF_S_ALU_DIV_X:
159 case BPF_S_ALU_AND_K:
160 case BPF_S_ALU_AND_X:
161 case BPF_S_ALU_OR_K:
162 case BPF_S_ALU_OR_X:
163 case BPF_S_ALU_LSH_K:
164 case BPF_S_ALU_LSH_X:
165 case BPF_S_ALU_RSH_K:
166 case BPF_S_ALU_RSH_X:
167 case BPF_S_ALU_NEG:
168 case BPF_S_LD_IMM:
169 case BPF_S_LDX_IMM:
170 case BPF_S_MISC_TAX:
171 case BPF_S_MISC_TXA:
172 case BPF_S_ALU_DIV_K:
173 case BPF_S_LD_MEM:
174 case BPF_S_LDX_MEM:
175 case BPF_S_ST:
176 case BPF_S_STX:
177 case BPF_S_JMP_JA:
178 case BPF_S_JMP_JEQ_K:
179 case BPF_S_JMP_JEQ_X:
180 case BPF_S_JMP_JGE_K:
181 case BPF_S_JMP_JGE_X:
182 case BPF_S_JMP_JGT_K:
183 case BPF_S_JMP_JGT_X:
184 case BPF_S_JMP_JSET_K:
185 case BPF_S_JMP_JSET_X:
186 continue;
187 default:
188 return -EINVAL;
189 }
190 }
191 return 0;
192}
193
194/**
195 * seccomp_run_filters - evaluates all seccomp filters against @syscall
196 * @syscall: number of the current system call
197 *
198 * Returns valid seccomp BPF response codes.
199 */
200static u32 seccomp_run_filters(int syscall)
201{
202 struct seccomp_filter *f;
203 u32 ret = SECCOMP_RET_ALLOW;
204
205 /* Ensure unexpected behavior doesn't result in failing open. */
206 if (WARN_ON(current->seccomp.filter == NULL))
207 return SECCOMP_RET_KILL;
208
209 /*
210 * All filters in the list are evaluated and the lowest BPF return
211 * value always takes priority (ignoring the DATA).
212 */
213 for (f = current->seccomp.filter; f; f = f->prev) {
214 u32 cur_ret = sk_run_filter(NULL, f->insns);
215 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
216 ret = cur_ret;
217 }
218 return ret;
219}
220
221/**
222 * seccomp_attach_filter: Attaches a seccomp filter to current.
223 * @fprog: BPF program to install
224 *
225 * Returns 0 on success or an errno on failure.
226 */
227static long seccomp_attach_filter(struct sock_fprog *fprog)
228{
229 struct seccomp_filter *filter;
230 unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
231 unsigned long total_insns = fprog->len;
232 long ret;
233
234 if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
235 return -EINVAL;
236
237 for (filter = current->seccomp.filter; filter; filter = filter->prev)
238 total_insns += filter->len + 4; /* include a 4 instr penalty */
239 if (total_insns > MAX_INSNS_PER_PATH)
240 return -ENOMEM;
241
242 /*
243 * Installing a seccomp filter requires that the task have
244 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
245 * This avoids scenarios where unprivileged tasks can affect the
246 * behavior of privileged children.
247 */
248 if (!current->no_new_privs &&
249 security_capable_noaudit(current_cred(), current_user_ns(),
250 CAP_SYS_ADMIN) != 0)
251 return -EACCES;
252
253 /* Allocate a new seccomp_filter */
254 filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
255 GFP_KERNEL|__GFP_NOWARN);
256 if (!filter)
257 return -ENOMEM;
258 atomic_set(&filter->usage, 1);
259 filter->len = fprog->len;
260
261 /* Copy the instructions from fprog. */
262 ret = -EFAULT;
263 if (copy_from_user(filter->insns, fprog->filter, fp_size))
264 goto fail;
265
266 /* Check and rewrite the fprog via the skb checker */
267 ret = sk_chk_filter(filter->insns, filter->len);
268 if (ret)
269 goto fail;
270
271 /* Check and rewrite the fprog for seccomp use */
272 ret = seccomp_check_filter(filter->insns, filter->len);
273 if (ret)
274 goto fail;
275
276 /*
277 * If there is an existing filter, make it the prev and don't drop its
278 * task reference.
279 */
280 filter->prev = current->seccomp.filter;
281 current->seccomp.filter = filter;
282 return 0;
283fail:
284 kfree(filter);
285 return ret;
286}
287
288/**
289 * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
290 * @user_filter: pointer to the user data containing a sock_fprog.
291 *
292 * Returns 0 on success and non-zero otherwise.
293 */
294long seccomp_attach_user_filter(char __user *user_filter)
295{
296 struct sock_fprog fprog;
297 long ret = -EFAULT;
298
299#ifdef CONFIG_COMPAT
300 if (is_compat_task()) {
301 struct compat_sock_fprog fprog32;
302 if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
303 goto out;
304 fprog.len = fprog32.len;
305 fprog.filter = compat_ptr(fprog32.filter);
306 } else /* falls through to the if below. */
307#endif
308 if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
309 goto out;
310 ret = seccomp_attach_filter(&fprog);
311out:
312 return ret;
313}
314
315/* get_seccomp_filter - increments the reference count of the filter on @tsk */
316void get_seccomp_filter(struct task_struct *tsk)
317{
318 struct seccomp_filter *orig = tsk->seccomp.filter;
319 if (!orig)
320 return;
321 /* Reference count is bounded by the number of total processes. */
322 atomic_inc(&orig->usage);
323}
324
325/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
326void put_seccomp_filter(struct task_struct *tsk)
327{
328 struct seccomp_filter *orig = tsk->seccomp.filter;
329 /* Clean up single-reference branches iteratively. */
330 while (orig && atomic_dec_and_test(&orig->usage)) {
331 struct seccomp_filter *freeme = orig;
332 orig = orig->prev;
333 kfree(freeme);
334 }
335}
336
337/**
338 * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
339 * @syscall: syscall number to send to userland
340 * @reason: filter-supplied reason code to send to userland (via si_errno)
341 *
342 * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
343 */
344static void seccomp_send_sigsys(int syscall, int reason)
345{
346 struct siginfo info;
347 memset(&info, 0, sizeof(info));
348 info.si_signo = SIGSYS;
349 info.si_code = SYS_SECCOMP;
350 info.si_call_addr = (void __user *)KSTK_EIP(current);
351 info.si_errno = reason;
352 info.si_arch = syscall_get_arch(current, task_pt_regs(current));
353 info.si_syscall = syscall;
354 force_sig_info(SIGSYS, &info, current);
355}
356#endif /* CONFIG_SECCOMP_FILTER */
16 357
17/* 358/*
18 * Secure computing mode 1 allows only read/write/exit/sigreturn. 359 * Secure computing mode 1 allows only read/write/exit/sigreturn.
@@ -31,13 +372,15 @@ static int mode1_syscalls_32[] = {
31}; 372};
32#endif 373#endif
33 374
34void __secure_computing(int this_syscall) 375int __secure_computing(int this_syscall)
35{ 376{
36 int mode = current->seccomp.mode; 377 int mode = current->seccomp.mode;
37 int * syscall; 378 int exit_sig = 0;
379 int *syscall;
380 u32 ret;
38 381
39 switch (mode) { 382 switch (mode) {
40 case 1: 383 case SECCOMP_MODE_STRICT:
41 syscall = mode1_syscalls; 384 syscall = mode1_syscalls;
42#ifdef CONFIG_COMPAT 385#ifdef CONFIG_COMPAT
43 if (is_compat_task()) 386 if (is_compat_task())
@@ -45,9 +388,54 @@ void __secure_computing(int this_syscall)
45#endif 388#endif
46 do { 389 do {
47 if (*syscall == this_syscall) 390 if (*syscall == this_syscall)
48 return; 391 return 0;
49 } while (*++syscall); 392 } while (*++syscall);
393 exit_sig = SIGKILL;
394 ret = SECCOMP_RET_KILL;
395 break;
396#ifdef CONFIG_SECCOMP_FILTER
397 case SECCOMP_MODE_FILTER: {
398 int data;
399 ret = seccomp_run_filters(this_syscall);
400 data = ret & SECCOMP_RET_DATA;
401 ret &= SECCOMP_RET_ACTION;
402 switch (ret) {
403 case SECCOMP_RET_ERRNO:
404 /* Set the low-order 16 bits as an errno. */
405 syscall_set_return_value(current, task_pt_regs(current),
406 -data, 0);
407 goto skip;
408 case SECCOMP_RET_TRAP:
409 /* Show the handler the original registers. */
410 syscall_rollback(current, task_pt_regs(current));
411 /* Let the filter pass back 16 bits of data. */
412 seccomp_send_sigsys(this_syscall, data);
413 goto skip;
414 case SECCOMP_RET_TRACE:
415 /* Skip these calls if there is no tracer. */
416 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP))
417 goto skip;
418 /* Allow the BPF to provide the event message */
419 ptrace_event(PTRACE_EVENT_SECCOMP, data);
420 /*
421 * The delivery of a fatal signal during event
422 * notification may silently skip tracer notification.
423 * Terminating the task now avoids executing a system
424 * call that may not be intended.
425 */
426 if (fatal_signal_pending(current))
427 break;
428 return 0;
429 case SECCOMP_RET_ALLOW:
430 return 0;
431 case SECCOMP_RET_KILL:
432 default:
433 break;
434 }
435 exit_sig = SIGSYS;
50 break; 436 break;
437 }
438#endif
51 default: 439 default:
52 BUG(); 440 BUG();
53 } 441 }
@@ -55,8 +443,13 @@ void __secure_computing(int this_syscall)
55#ifdef SECCOMP_DEBUG 443#ifdef SECCOMP_DEBUG
56 dump_stack(); 444 dump_stack();
57#endif 445#endif
58 audit_seccomp(this_syscall); 446 audit_seccomp(this_syscall, exit_sig, ret);
59 do_exit(SIGKILL); 447 do_exit(exit_sig);
448#ifdef CONFIG_SECCOMP_FILTER
449skip:
450 audit_seccomp(this_syscall, exit_sig, ret);
451#endif
452 return -1;
60} 453}
61 454
62long prctl_get_seccomp(void) 455long prctl_get_seccomp(void)
@@ -64,25 +457,48 @@ long prctl_get_seccomp(void)
64 return current->seccomp.mode; 457 return current->seccomp.mode;
65} 458}
66 459
67long prctl_set_seccomp(unsigned long seccomp_mode) 460/**
461 * prctl_set_seccomp: configures current->seccomp.mode
462 * @seccomp_mode: requested mode to use
463 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
464 *
465 * This function may be called repeatedly with a @seccomp_mode of
466 * SECCOMP_MODE_FILTER to install additional filters. Every filter
467 * successfully installed will be evaluated (in reverse order) for each system
468 * call the task makes.
469 *
470 * Once current->seccomp.mode is non-zero, it may not be changed.
471 *
472 * Returns 0 on success or -EINVAL on failure.
473 */
474long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
68{ 475{
69 long ret; 476 long ret = -EINVAL;
70 477
71 /* can set it only once to be even more secure */ 478 if (current->seccomp.mode &&
72 ret = -EPERM; 479 current->seccomp.mode != seccomp_mode)
73 if (unlikely(current->seccomp.mode))
74 goto out; 480 goto out;
75 481
76 ret = -EINVAL; 482 switch (seccomp_mode) {
77 if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { 483 case SECCOMP_MODE_STRICT:
78 current->seccomp.mode = seccomp_mode; 484 ret = 0;
79 set_thread_flag(TIF_SECCOMP);
80#ifdef TIF_NOTSC 485#ifdef TIF_NOTSC
81 disable_TSC(); 486 disable_TSC();
82#endif 487#endif
83 ret = 0; 488 break;
489#ifdef CONFIG_SECCOMP_FILTER
490 case SECCOMP_MODE_FILTER:
491 ret = seccomp_attach_user_filter(filter);
492 if (ret)
493 goto out;
494 break;
495#endif
496 default:
497 goto out;
84 } 498 }
85 499
86 out: 500 current->seccomp.mode = seccomp_mode;
501 set_thread_flag(TIF_SECCOMP);
502out:
87 return ret; 503 return ret;
88} 504}
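For orientation, here is a minimal userspace sketch of the SECCOMP_MODE_FILTER interface this patch introduces: build a classic-BPF program against struct seccomp_data, then hand it to prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog). This is a demo only: error handling and the seccomp_data->arch check that any real filter needs are omitted, and it assumes headers new enough to carry SECCOMP_MODE_FILTER, struct seccomp_data and PR_SET_NO_NEW_PRIVS (the no_new_privs prctl from the companion patches; without it the task needs CAP_SYS_ADMIN, per seccomp_attach_filter() above).

        #include <stddef.h>
        #include <errno.h>
        #include <unistd.h>
        #include <sys/prctl.h>
        #include <sys/syscall.h>
        #include <linux/filter.h>
        #include <linux/seccomp.h>

        static int install_demo_filter(void)
        {
                struct sock_filter insns[] = {
                        /* A = seccomp_data.nr (the syscall number) */
                        BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
                                 offsetof(struct seccomp_data, nr)),
                        /* fail getpid() with EPERM, allow everything else */
                        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
                        BPF_STMT(BPF_RET | BPF_K,
                                 SECCOMP_RET_ERRNO | (EPERM & SECCOMP_RET_DATA)),
                        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
                };
                struct sock_fprog prog = {
                        .len    = (unsigned short)(sizeof(insns) / sizeof(insns[0])),
                        .filter = insns,
                };

                if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
                        return -1;
                return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
        }

Once installed, a later getpid() in this task returns -1 with errno set to EPERM while other system calls behave normally; further filters can be stacked with additional PR_SET_SECCOMP calls, and the lowest (most restrictive) action wins, as seccomp_run_filters() above shows.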
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 60636a4e25c..4567fc020fe 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -118,7 +118,7 @@ EXPORT_SYMBOL(down_killable);
118 * down_trylock - try to acquire the semaphore, without waiting 118 * down_trylock - try to acquire the semaphore, without waiting
119 * @sem: the semaphore to be acquired 119 * @sem: the semaphore to be acquired
120 * 120 *
121 * Try to acquire the semaphore atomically. Returns 0 if the mutex has 121 * Try to acquire the semaphore atomically. Returns 0 if the semaphore has
122 * been acquired successfully or 1 if it cannot be acquired. 122 * been acquired successfully or 1 if it cannot be acquired.
123 * 123 *
124 * NOTE: This return value is inverted from both spin_trylock and 124 * NOTE: This return value is inverted from both spin_trylock and
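Since the kerneldoc fix is the only change to semaphore.c here, a short reminder of the convention it documents: down_trylock() returns 0 when the semaphore was acquired and 1 when it was not, the inverse of spin_trylock() and mutex_trylock(), so the usual calling pattern is:

        if (down_trylock(&sem))
                return -EBUSY;  /* non-zero means the semaphore was NOT acquired */
        /* ... critical section ..., then up(&sem) */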
diff --git a/kernel/signal.c b/kernel/signal.c
index 60d80ab2601..f7b41821763 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -161,7 +161,7 @@ void recalc_sigpending(void)
161 161
162#define SYNCHRONOUS_MASK \ 162#define SYNCHRONOUS_MASK \
163 (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ 163 (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
164 sigmask(SIGTRAP) | sigmask(SIGFPE)) 164 sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS))
165 165
166int next_signal(struct sigpending *pending, sigset_t *mask) 166int next_signal(struct sigpending *pending, sigset_t *mask)
167{ 167{
@@ -768,14 +768,13 @@ static int kill_ok_by_cred(struct task_struct *t)
768 const struct cred *cred = current_cred(); 768 const struct cred *cred = current_cred();
769 const struct cred *tcred = __task_cred(t); 769 const struct cred *tcred = __task_cred(t);
770 770
771 if (cred->user->user_ns == tcred->user->user_ns && 771 if (uid_eq(cred->euid, tcred->suid) ||
772 (cred->euid == tcred->suid || 772 uid_eq(cred->euid, tcred->uid) ||
773 cred->euid == tcred->uid || 773 uid_eq(cred->uid, tcred->suid) ||
774 cred->uid == tcred->suid || 774 uid_eq(cred->uid, tcred->uid))
775 cred->uid == tcred->uid))
776 return 1; 775 return 1;
777 776
778 if (ns_capable(tcred->user->user_ns, CAP_KILL)) 777 if (ns_capable(tcred->user_ns, CAP_KILL))
779 return 1; 778 return 1;
780 779
781 return 0; 780 return 0;
@@ -1021,15 +1020,6 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
1021 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); 1020 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
1022} 1021}
1023 1022
1024/*
1025 * map the uid in struct cred into user namespace *ns
1026 */
1027static inline uid_t map_cred_ns(const struct cred *cred,
1028 struct user_namespace *ns)
1029{
1030 return user_ns_map_uid(ns, cred, cred->uid);
1031}
1032
1033#ifdef CONFIG_USER_NS 1023#ifdef CONFIG_USER_NS
1034static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) 1024static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
1035{ 1025{
@@ -1039,8 +1029,10 @@ static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_str
1039 if (SI_FROMKERNEL(info)) 1029 if (SI_FROMKERNEL(info))
1040 return; 1030 return;
1041 1031
1042 info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns), 1032 rcu_read_lock();
1043 current_cred(), info->si_uid); 1033 info->si_uid = from_kuid_munged(task_cred_xxx(t, user_ns),
1034 make_kuid(current_user_ns(), info->si_uid));
1035 rcu_read_unlock();
1044} 1036}
1045#else 1037#else
1046static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) 1038static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
@@ -1107,7 +1099,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1107 q->info.si_code = SI_USER; 1099 q->info.si_code = SI_USER;
1108 q->info.si_pid = task_tgid_nr_ns(current, 1100 q->info.si_pid = task_tgid_nr_ns(current,
1109 task_active_pid_ns(t)); 1101 task_active_pid_ns(t));
1110 q->info.si_uid = current_uid(); 1102 q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
1111 break; 1103 break;
1112 case (unsigned long) SEND_SIG_PRIV: 1104 case (unsigned long) SEND_SIG_PRIV:
1113 q->info.si_signo = sig; 1105 q->info.si_signo = sig;
@@ -1388,10 +1380,8 @@ static int kill_as_cred_perm(const struct cred *cred,
1388 struct task_struct *target) 1380 struct task_struct *target)
1389{ 1381{
1390 const struct cred *pcred = __task_cred(target); 1382 const struct cred *pcred = __task_cred(target);
1391 if (cred->user_ns != pcred->user_ns) 1383 if (!uid_eq(cred->euid, pcred->suid) && !uid_eq(cred->euid, pcred->uid) &&
1392 return 0; 1384 !uid_eq(cred->uid, pcred->suid) && !uid_eq(cred->uid, pcred->uid))
1393 if (cred->euid != pcred->suid && cred->euid != pcred->uid &&
1394 cred->uid != pcred->suid && cred->uid != pcred->uid)
1395 return 0; 1385 return 0;
1396 return 1; 1386 return 1;
1397} 1387}
@@ -1679,8 +1669,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1679 */ 1669 */
1680 rcu_read_lock(); 1670 rcu_read_lock();
1681 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); 1671 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
1682 info.si_uid = map_cred_ns(__task_cred(tsk), 1672 info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns),
1683 task_cred_xxx(tsk->parent, user_ns)); 1673 task_uid(tsk));
1684 rcu_read_unlock(); 1674 rcu_read_unlock();
1685 1675
1686 info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); 1676 info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime);
@@ -1763,8 +1753,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1763 */ 1753 */
1764 rcu_read_lock(); 1754 rcu_read_lock();
1765 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); 1755 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
1766 info.si_uid = map_cred_ns(__task_cred(tsk), 1756 info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
1767 task_cred_xxx(parent, user_ns));
1768 rcu_read_unlock(); 1757 rcu_read_unlock();
1769 1758
1770 info.si_utime = cputime_to_clock_t(tsk->utime); 1759 info.si_utime = cputime_to_clock_t(tsk->utime);
@@ -1974,7 +1963,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
1974 info.si_signo = signr; 1963 info.si_signo = signr;
1975 info.si_code = exit_code; 1964 info.si_code = exit_code;
1976 info.si_pid = task_pid_vnr(current); 1965 info.si_pid = task_pid_vnr(current);
1977 info.si_uid = current_uid(); 1966 info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
1978 1967
1979 /* Let the debugger run. */ 1968 /* Let the debugger run. */
1980 ptrace_stop(exit_code, why, 1, &info); 1969 ptrace_stop(exit_code, why, 1, &info);
@@ -2182,8 +2171,8 @@ static int ptrace_signal(int signr, siginfo_t *info,
2182 info->si_code = SI_USER; 2171 info->si_code = SI_USER;
2183 rcu_read_lock(); 2172 rcu_read_lock();
2184 info->si_pid = task_pid_vnr(current->parent); 2173 info->si_pid = task_pid_vnr(current->parent);
2185 info->si_uid = map_cred_ns(__task_cred(current->parent), 2174 info->si_uid = from_kuid_munged(current_user_ns(),
2186 current_user_ns()); 2175 task_uid(current->parent));
2187 rcu_read_unlock(); 2176 rcu_read_unlock();
2188 } 2177 }
2189 2178
@@ -2710,6 +2699,13 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2710 err |= __put_user(from->si_uid, &to->si_uid); 2699 err |= __put_user(from->si_uid, &to->si_uid);
2711 err |= __put_user(from->si_ptr, &to->si_ptr); 2700 err |= __put_user(from->si_ptr, &to->si_ptr);
2712 break; 2701 break;
2702#ifdef __ARCH_SIGSYS
2703 case __SI_SYS:
2704 err |= __put_user(from->si_call_addr, &to->si_call_addr);
2705 err |= __put_user(from->si_syscall, &to->si_syscall);
2706 err |= __put_user(from->si_arch, &to->si_arch);
2707 break;
2708#endif
2713 default: /* this is just in case for now ... */ 2709 default: /* this is just in case for now ... */
2714 err |= __put_user(from->si_pid, &to->si_pid); 2710 err |= __put_user(from->si_pid, &to->si_pid);
2715 err |= __put_user(from->si_uid, &to->si_uid); 2711 err |= __put_user(from->si_uid, &to->si_uid);
@@ -2832,7 +2828,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
2832 info.si_errno = 0; 2828 info.si_errno = 0;
2833 info.si_code = SI_USER; 2829 info.si_code = SI_USER;
2834 info.si_pid = task_tgid_vnr(current); 2830 info.si_pid = task_tgid_vnr(current);
2835 info.si_uid = current_uid(); 2831 info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
2836 2832
2837 return kill_something_info(sig, &info, pid); 2833 return kill_something_info(sig, &info, pid);
2838} 2834}
@@ -2875,7 +2871,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
2875 info.si_errno = 0; 2871 info.si_errno = 0;
2876 info.si_code = SI_TKILL; 2872 info.si_code = SI_TKILL;
2877 info.si_pid = task_tgid_vnr(current); 2873 info.si_pid = task_tgid_vnr(current);
2878 info.si_uid = current_uid(); 2874 info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
2879 2875
2880 return do_send_specific(tgid, pid, sig, &info); 2876 return do_send_specific(tgid, pid, sig, &info);
2881} 2877}
@@ -3240,6 +3236,21 @@ SYSCALL_DEFINE0(pause)
3240 3236
3241#endif 3237#endif
3242 3238
3239#ifdef HAVE_SET_RESTORE_SIGMASK
3240int sigsuspend(sigset_t *set)
3241{
3242 sigdelsetmask(set, sigmask(SIGKILL)|sigmask(SIGSTOP));
3243
3244 current->saved_sigmask = current->blocked;
3245 set_current_blocked(set);
3246
3247 current->state = TASK_INTERRUPTIBLE;
3248 schedule();
3249 set_restore_sigmask();
3250 return -ERESTARTNOHAND;
3251}
3252#endif
3253
3243#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND 3254#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
3244/** 3255/**
3245 * sys_rt_sigsuspend - replace the signal mask for a value with the 3256 * sys_rt_sigsuspend - replace the signal mask for a value with the
@@ -3257,15 +3268,7 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
3257 3268
3258 if (copy_from_user(&newset, unewset, sizeof(newset))) 3269 if (copy_from_user(&newset, unewset, sizeof(newset)))
3259 return -EFAULT; 3270 return -EFAULT;
3260 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 3271 return sigsuspend(&newset);
3261
3262 current->saved_sigmask = current->blocked;
3263 set_current_blocked(&newset);
3264
3265 current->state = TASK_INTERRUPTIBLE;
3266 schedule();
3267 set_restore_sigmask();
3268 return -ERESTARTNOHAND;
3269} 3272}
3270#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ 3273#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
3271 3274
diff --git a/kernel/smp.c b/kernel/smp.c
index 2f8b10ecf75..d0ae5b24875 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,8 @@
13#include <linux/smp.h> 13#include <linux/smp.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15 15
16#include "smpboot.h"
17
16#ifdef CONFIG_USE_GENERIC_SMP_HELPERS 18#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
17static struct { 19static struct {
18 struct list_head queue; 20 struct list_head queue;
@@ -669,6 +671,8 @@ void __init smp_init(void)
669{ 671{
670 unsigned int cpu; 672 unsigned int cpu;
671 673
674 idle_threads_init();
675
672 /* FIXME: This should be done in userspace --RR */ 676 /* FIXME: This should be done in userspace --RR */
673 for_each_present_cpu(cpu) { 677 for_each_present_cpu(cpu) {
674 if (num_online_cpus() >= setup_max_cpus) 678 if (num_online_cpus() >= setup_max_cpus)
@@ -791,3 +795,26 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
791 } 795 }
792} 796}
793EXPORT_SYMBOL(on_each_cpu_cond); 797EXPORT_SYMBOL(on_each_cpu_cond);
798
799static void do_nothing(void *unused)
800{
801}
802
803/**
804 * kick_all_cpus_sync - Force all cpus out of idle
805 *
806 * Used to synchronize the update of the pm_idle function pointer. It is
807 * called after the pointer is updated and returns after the dummy
808 * callback function has been executed on all cpus. The callback can
809 * only run on a remote cpu after that cpu has left the idle routine
810 * it entered through the old pm_idle pointer, so it is guaranteed
811 * that on return nothing uses the previous pointer
812 * anymore.
813 */
814void kick_all_cpus_sync(void)
815{
816 /* Make sure the change is visible before we kick the cpus */
817 smp_mb();
818 smp_call_function(do_nothing, NULL, 1);
819}
820EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
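
The kernel-doc above describes the intended pattern. A minimal sketch, assuming an architecture that exposes the pm_idle hook mentioned there; my_new_idle and switch_idle_routine are illustrative names only:

static void my_new_idle(void)                   /* hypothetical replacement routine */
{
        cpu_relax();
}

static void switch_idle_routine(void)
{
        pm_idle = my_new_idle;  /* publish the new pointer */
        kick_all_cpus_sync();   /* returns once no cpu can still be in the old routine */
}
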
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
new file mode 100644
index 00000000000..e1a797e028a
--- /dev/null
+++ b/kernel/smpboot.c
@@ -0,0 +1,62 @@
1/*
2 * Common SMP CPU bringup/teardown functions
3 */
4#include <linux/err.h>
5#include <linux/smp.h>
6#include <linux/init.h>
7#include <linux/sched.h>
8#include <linux/percpu.h>
9
10#include "smpboot.h"
11
12#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
13/*
14 * For the hotplug case we keep the task structs around and reuse
15 * them.
16 */
17static DEFINE_PER_CPU(struct task_struct *, idle_threads);
18
19struct task_struct * __cpuinit idle_thread_get(unsigned int cpu)
20{
21 struct task_struct *tsk = per_cpu(idle_threads, cpu);
22
23 if (!tsk)
24 return ERR_PTR(-ENOMEM);
25 init_idle(tsk, cpu);
26 return tsk;
27}
28
29void __init idle_thread_set_boot_cpu(void)
30{
31 per_cpu(idle_threads, smp_processor_id()) = current;
32}
33
34static inline void idle_init(unsigned int cpu)
35{
36 struct task_struct *tsk = per_cpu(idle_threads, cpu);
37
38 if (!tsk) {
39 tsk = fork_idle(cpu);
40 if (IS_ERR(tsk))
41 pr_err("SMP: fork_idle() failed for CPU %u\n", cpu);
42 else
43 per_cpu(idle_threads, cpu) = tsk;
44 }
45}
46
47/**
48 * idle_threads_init - Initialize idle threads for all cpus
49 *
50 * Creates an idle thread for each possible cpu that does not already
51 * have one; the boot cpu's is registered via idle_thread_set_boot_cpu().
52 */
53void __init idle_threads_init(void)
54{
55 unsigned int cpu;
56
57 for_each_possible_cpu(cpu) {
58 if (cpu != smp_processor_id())
59 idle_init(cpu);
60 }
61}
62#endif
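
idle_thread_get() exists so the CPU-hotplug bring-up path can hand the cached idle task to architecture code instead of forking a fresh one on every online operation. A hedged sketch of such a caller; arch_bringup_cpu() is a placeholder for the arch entry point, not a real interface:

static int bringup_cpu_sketch(unsigned int cpu)
{
        struct task_struct *idle = idle_thread_get(cpu);

        if (IS_ERR(idle))
                return PTR_ERR(idle);           /* no cached idle task for this cpu */

        return arch_bringup_cpu(cpu, idle);     /* hypothetical arch hook */
}

The init_idle() call inside idle_thread_get() re-initializes the recycled task, which is what makes reuse across repeated offline/online cycles safe.
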
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
new file mode 100644
index 00000000000..80c0acfb847
--- /dev/null
+++ b/kernel/smpboot.h
@@ -0,0 +1,18 @@
1#ifndef SMPBOOT_H
2#define SMPBOOT_H
3
4struct task_struct;
5
6int smpboot_prepare(unsigned int cpu);
7
8#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
9struct task_struct *idle_thread_get(unsigned int cpu);
10void idle_thread_set_boot_cpu(void);
11void idle_threads_init(void);
12#else
13static inline struct task_struct *idle_thread_get(unsigned int cpu) { return NULL; }
14static inline void idle_thread_set_boot_cpu(void) { }
15static inline void idle_threads_init(void) { }
16#endif
17
18#endif
diff --git a/kernel/srcu.c b/kernel/srcu.c
index ba35f3a4a1f..2095be3318d 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -34,10 +34,77 @@
34#include <linux/delay.h> 34#include <linux/delay.h>
35#include <linux/srcu.h> 35#include <linux/srcu.h>
36 36
37/*
38 * Initialize an rcu_batch structure to empty.
39 */
40static inline void rcu_batch_init(struct rcu_batch *b)
41{
42 b->head = NULL;
43 b->tail = &b->head;
44}
45
46/*
47 * Enqueue a callback onto the tail of the specified rcu_batch structure.
48 */
49static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head)
50{
51 *b->tail = head;
52 b->tail = &head->next;
53}
54
55/*
56 * Is the specified rcu_batch structure empty?
57 */
58static inline bool rcu_batch_empty(struct rcu_batch *b)
59{
60 return b->tail == &b->head;
61}
62
63/*
64 * Remove the callback at the head of the specified rcu_batch structure
65 * and return a pointer to it, or return NULL if the structure is empty.
66 */
67static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b)
68{
69 struct rcu_head *head;
70
71 if (rcu_batch_empty(b))
72 return NULL;
73
74 head = b->head;
75 b->head = head->next;
76 if (b->tail == &head->next)
77 rcu_batch_init(b);
78
79 return head;
80}
81
82/*
83 * Move all callbacks from the rcu_batch structure specified by "from" to
84 * the structure specified by "to".
85 */
86static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
87{
88 if (!rcu_batch_empty(from)) {
89 *to->tail = from->head;
90 to->tail = from->tail;
91 rcu_batch_init(from);
92 }
93}
94
95/* single-thread state-machine */
96static void process_srcu(struct work_struct *work);
97
37static int init_srcu_struct_fields(struct srcu_struct *sp) 98static int init_srcu_struct_fields(struct srcu_struct *sp)
38{ 99{
39 sp->completed = 0; 100 sp->completed = 0;
40 mutex_init(&sp->mutex); 101 spin_lock_init(&sp->queue_lock);
102 sp->running = false;
103 rcu_batch_init(&sp->batch_queue);
104 rcu_batch_init(&sp->batch_check0);
105 rcu_batch_init(&sp->batch_check1);
106 rcu_batch_init(&sp->batch_done);
107 INIT_DELAYED_WORK(&sp->work, process_srcu);
41 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); 108 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
42 return sp->per_cpu_ref ? 0 : -ENOMEM; 109 return sp->per_cpu_ref ? 0 : -ENOMEM;
43} 110}
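
The rcu_batch helpers above are a singly linked list whose tail pointer addresses the last ->next field, so enqueue, the emptiness test, and whole-list splicing are all O(1) with no back-pointers. A stand-alone userspace model of the same idiom (not kernel code; the names only mirror the kernel ones):

#include <assert.h>
#include <stddef.h>

struct node  { struct node *next; };
struct batch { struct node *head; struct node **tail; };

static void batch_init(struct batch *b)  { b->head = NULL; b->tail = &b->head; }
static int  batch_empty(struct batch *b) { return b->tail == &b->head; }

static void batch_queue(struct batch *b, struct node *n)
{
        n->next = NULL;
        *b->tail = n;           /* link behind the current last element */
        b->tail = &n->next;     /* tail now addresses the new last ->next */
}

static void batch_move(struct batch *to, struct batch *from)
{
        if (!batch_empty(from)) {
                *to->tail = from->head; /* splice the whole list in O(1) */
                to->tail = from->tail;
                batch_init(from);
        }
}

int main(void)
{
        struct batch a, b;
        struct node n1, n2;

        batch_init(&a);
        batch_init(&b);
        batch_queue(&a, &n1);
        batch_queue(&a, &n2);
        batch_move(&b, &a);
        assert(batch_empty(&a) && !batch_empty(&b) && b.head == &n1 && b.head->next == &n2);
        return 0;
}

In the kernel version, call_srcu() fills in head->next and head->func before queueing, so rcu_batch_queue() can skip the NULL assignment done here.
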
@@ -73,21 +140,116 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
73#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 140#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
74 141
75/* 142/*
76 * srcu_readers_active_idx -- returns approximate number of readers 143 * Returns approximate total of the readers' ->seq[] values for the
77 * active on the specified rank of per-CPU counters. 144 * rank of per-CPU counters specified by idx.
78 */ 145 */
146static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
147{
148 int cpu;
149 unsigned long sum = 0;
150 unsigned long t;
79 151
80static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) 152 for_each_possible_cpu(cpu) {
153 t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
154 sum += t;
155 }
156 return sum;
157}
158
159/*
160 * Returns approximate number of readers active on the specified rank
161 * of the per-CPU ->c[] counters.
162 */
163static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
81{ 164{
82 int cpu; 165 int cpu;
83 int sum; 166 unsigned long sum = 0;
167 unsigned long t;
84 168
85 sum = 0; 169 for_each_possible_cpu(cpu) {
86 for_each_possible_cpu(cpu) 170 t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
87 sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]; 171 sum += t;
172 }
88 return sum; 173 return sum;
89} 174}
90 175
176/*
177 * Return true if the number of pre-existing readers is determined to
178 * be stably zero. An example unstable zero can occur if the call
179 * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
180 * but due to task migration, sees the corresponding __srcu_read_unlock()
181 * decrement. This can happen because srcu_readers_active_idx() takes
182 * time to sum the array, and might in fact be interrupted or preempted
183 * partway through the summation.
184 */
185static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
186{
187 unsigned long seq;
188
189 seq = srcu_readers_seq_idx(sp, idx);
190
191 /*
192 * The following smp_mb() A pairs with the smp_mb() B located in
193 * __srcu_read_lock(). This pairing ensures that if an
194 * __srcu_read_lock() increments its counter after the summation
195 * in srcu_readers_active_idx(), then the corresponding SRCU read-side
196 * critical section will see any changes made prior to the start
197 * of the current SRCU grace period.
198 *
199 * Also, if the above call to srcu_readers_seq_idx() saw the
200 * increment of ->seq[], then the call to srcu_readers_active_idx()
201 * must see the increment of ->c[].
202 */
203 smp_mb(); /* A */
204
205 /*
206 * Note that srcu_readers_active_idx() can incorrectly return
207 * zero even though there is a pre-existing reader throughout.
208 * To see this, suppose that task A is in a very long SRCU
209 * read-side critical section that started on CPU 0, and that
210 * no other reader exists, so that the sum of the counters
211 * is equal to one. Then suppose that task B starts executing
212 * srcu_readers_active_idx(), summing up to CPU 1, and then that
213 * task C starts reading on CPU 0, so that its increment is not
214 * summed, but finishes reading on CPU 2, so that its decrement
215 * -is- summed. Then when task B completes its sum, it will
216 * incorrectly get zero, despite the fact that task A has been
217 * in its SRCU read-side critical section the whole time.
218 *
219 * We therefore do a validation step should srcu_readers_active_idx()
220 * return zero.
221 */
222 if (srcu_readers_active_idx(sp, idx) != 0)
223 return false;
224
225 /*
226 * The remainder of this function is the validation step.
227 * The following smp_mb() D pairs with the smp_mb() C in
228 * __srcu_read_unlock(). If the __srcu_read_unlock() was seen
229 * by srcu_readers_active_idx() above, then any destructive
230 * operation performed after the grace period will happen after
231 * the corresponding SRCU read-side critical section.
232 *
233 * Note that there can be at most NR_CPUS worth of readers using
234 * the old index, which is not enough to overflow even a 32-bit
235 * integer. (Yes, this does mean that systems having more than
236 * a billion or so CPUs need to be 64-bit systems.) Therefore,
237 * the sum of the ->seq[] counters cannot possibly overflow.
238 * Therefore, the only way that the return values of the two
239 * calls to srcu_readers_seq_idx() can be equal is if there were
240 * no increments of the corresponding rank of ->seq[] counts
241 * in the interim. But the missed-increment scenario laid out
242 * above includes an increment of the ->seq[] counter by
243 * the corresponding __srcu_read_lock(). Therefore, if this
244 * scenario occurs, the return values from the two calls to
245 * srcu_readers_seq_idx() will differ, and thus the validation
246 * step below suffices.
247 */
248 smp_mb(); /* D */
249
250 return srcu_readers_seq_idx(sp, idx) == seq;
251}
252
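
The scenario in the comment above can be played through deterministically. The toy program below (plain userspace C, one counter index, three "cpus") reproduces the task A/B/C interleaving: B's sum of ->c[] reaches zero even though A is still a reader, and the mismatch between the two ->seq[] sums is what forces the retry:

#include <stdio.h>

#define NCPU 3

static long c[NCPU], seq[NCPU]; /* per-cpu ->c[idx] and ->seq[idx] for one index */

static long sum(const long *a)
{
        long s = 0;
        int i;

        for (i = 0; i < NCPU; i++)
                s += a[i];
        return s;
}

int main(void)
{
        long seq_before, partial;

        c[0]++; seq[0]++;               /* task A: long-running reader, locked on cpu 0 */

        seq_before = sum(seq);          /* task B: sample ->seq[], start summing ->c[] */
        partial = c[0] + c[1];          /* B has covered cpus 0 and 1 so far */

        c[0]++; seq[0]++;               /* task C: locks on cpu 0, missed by B's sum */
        c[2]--;                         /* ...migrates and unlocks on cpu 2 */

        partial += c[2];                /* B finishes: sees 1 + 0 - 1 == 0 */
        printf("B's sum of ->c[]: %ld (task A is still a reader)\n", partial);
        printf("->seq[] sums: %ld before, %ld after -> %s\n",
               seq_before, sum(seq),
               seq_before == sum(seq) ? "stable zero" : "unstable, retry");
        return 0;
}
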
91/** 253/**
92 * srcu_readers_active - returns approximate number of readers. 254 * srcu_readers_active - returns approximate number of readers.
93 * @sp: which srcu_struct to count active readers (holding srcu_read_lock). 255 * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
@@ -98,7 +260,14 @@ static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
98 */ 260 */
99static int srcu_readers_active(struct srcu_struct *sp) 261static int srcu_readers_active(struct srcu_struct *sp)
100{ 262{
101 return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); 263 int cpu;
264 unsigned long sum = 0;
265
266 for_each_possible_cpu(cpu) {
267 sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
268 sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
269 }
270 return sum;
102} 271}
103 272
104/** 273/**
@@ -131,10 +300,11 @@ int __srcu_read_lock(struct srcu_struct *sp)
131 int idx; 300 int idx;
132 301
133 preempt_disable(); 302 preempt_disable();
134 idx = sp->completed & 0x1; 303 idx = rcu_dereference_index_check(sp->completed,
135 barrier(); /* ensure compiler looks -once- at sp->completed. */ 304 rcu_read_lock_sched_held()) & 0x1;
136 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++; 305 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
137 srcu_barrier(); /* ensure compiler won't misorder critical section. */ 306 smp_mb(); /* B */ /* Avoid leaking the critical section. */
307 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
138 preempt_enable(); 308 preempt_enable();
139 return idx; 309 return idx;
140} 310}
@@ -149,8 +319,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
149void __srcu_read_unlock(struct srcu_struct *sp, int idx) 319void __srcu_read_unlock(struct srcu_struct *sp, int idx)
150{ 320{
151 preempt_disable(); 321 preempt_disable();
152 srcu_barrier(); /* ensure compiler won't misorder critical section. */ 322 smp_mb(); /* C */ /* Avoid leaking the critical section. */
153 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; 323 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1;
154 preempt_enable(); 324 preempt_enable();
155} 325}
156EXPORT_SYMBOL_GPL(__srcu_read_unlock); 326EXPORT_SYMBOL_GPL(__srcu_read_unlock);
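
For reference, the per-cpu counters manipulated above sit behind the usual srcu_read_lock()/srcu_read_unlock() wrappers. A minimal usage sketch; my_srcu, my_ptr and my_data are illustrative names, and the srcu_struct is assumed to have been set up with init_srcu_struct():

struct my_data { int value; };

static struct srcu_struct my_srcu;      /* init_srcu_struct(&my_srcu) at init time */
static struct my_data __rcu *my_ptr;

static int read_value(void)
{
        struct my_data *p;
        int idx, val = -1;

        idx = srcu_read_lock(&my_srcu);         /* increments ->c[idx] and ->seq[idx] */
        p = srcu_dereference(my_ptr, &my_srcu);
        if (p)
                val = p->value;
        srcu_read_unlock(&my_srcu, idx);        /* decrements ->c[idx] */
        return val;
}
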
@@ -163,106 +333,119 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
163 * we repeatedly block for 1-millisecond time periods. This approach 333 * we repeatedly block for 1-millisecond time periods. This approach
164 * has done well in testing, so there is no need for a config parameter. 334 * has done well in testing, so there is no need for a config parameter.
165 */ 335 */
166#define SYNCHRONIZE_SRCU_READER_DELAY 10 336#define SRCU_RETRY_CHECK_DELAY 5
337#define SYNCHRONIZE_SRCU_TRYCOUNT 2
338#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12
167 339
168/* 340/*
169 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 341 * Wait until all pre-existing readers complete. Such readers
342 * will have used the index specified by "idx". The caller must ensure
343 * that ->completed is not changed while checking, and that
344 * idx == (->completed & 1) ^ 1.
170 */ 345 */
171static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) 346static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
172{ 347{
173 int idx; 348 for (;;) {
174 349 if (srcu_readers_active_idx_check(sp, idx))
175 rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && 350 return true;
176 !lock_is_held(&rcu_bh_lock_map) && 351 if (--trycount <= 0)
177 !lock_is_held(&rcu_lock_map) && 352 return false;
178 !lock_is_held(&rcu_sched_lock_map), 353 udelay(SRCU_RETRY_CHECK_DELAY);
179 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); 354 }
180 355}
181 idx = sp->completed;
182 mutex_lock(&sp->mutex);
183 356
184 /* 357/*
185 * Check to see if someone else did the work for us while we were 358 * Increment the ->completed counter so that future SRCU readers will
186 * waiting to acquire the lock. We need -two- advances of 359 * use the other rank of the ->c[] and ->seq[] arrays. This allows
187 * the counter, not just one. If there was but one, we might have 360 * us to wait for pre-existing readers in a starvation-free manner.
188 * shown up -after- our helper's first synchronize_sched(), thus 361 */
189 * having failed to prevent CPU-reordering races with concurrent 362static void srcu_flip(struct srcu_struct *sp)
190 * srcu_read_unlock()s on other CPUs (see comment below). So we 363{
191 * either (1) wait for two or (2) supply the second ourselves. 364 sp->completed++;
192 */ 365}
193 366
194 if ((sp->completed - idx) >= 2) { 367/*
195 mutex_unlock(&sp->mutex); 368 * Enqueue an SRCU callback on the specified srcu_struct structure,
196 return; 369 * initiating grace-period processing if it is not already running.
370 */
371void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
372 void (*func)(struct rcu_head *head))
373{
374 unsigned long flags;
375
376 head->next = NULL;
377 head->func = func;
378 spin_lock_irqsave(&sp->queue_lock, flags);
379 rcu_batch_queue(&sp->batch_queue, head);
380 if (!sp->running) {
381 sp->running = true;
382 queue_delayed_work(system_nrt_wq, &sp->work, 0);
197 } 383 }
384 spin_unlock_irqrestore(&sp->queue_lock, flags);
385}
386EXPORT_SYMBOL_GPL(call_srcu);
198 387
199 sync_func(); /* Force memory barrier on all CPUs. */ 388struct rcu_synchronize {
389 struct rcu_head head;
390 struct completion completion;
391};
200 392
201 /* 393/*
202 * The preceding synchronize_sched() ensures that any CPU that 394 * Awaken the corresponding synchronize_srcu() instance now that a
203 * sees the new value of sp->completed will also see any preceding 395 * grace period has elapsed.
204 * changes to data structures made by this CPU. This prevents 396 */
205 * some other CPU from reordering the accesses in its SRCU 397static void wakeme_after_rcu(struct rcu_head *head)
206 * read-side critical section to precede the corresponding 398{
207 * srcu_read_lock() -- ensuring that such references will in 399 struct rcu_synchronize *rcu;
208 * fact be protected.
209 *
210 * So it is now safe to do the flip.
211 */
212 400
213 idx = sp->completed & 0x1; 401 rcu = container_of(head, struct rcu_synchronize, head);
214 sp->completed++; 402 complete(&rcu->completion);
403}
215 404
216 sync_func(); /* Force memory barrier on all CPUs. */ 405static void srcu_advance_batches(struct srcu_struct *sp, int trycount);
406static void srcu_reschedule(struct srcu_struct *sp);
217 407
218 /* 408/*
219 * At this point, because of the preceding synchronize_sched(), 409 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
220 * all srcu_read_lock() calls using the old counters have completed. 410 */
221 * Their corresponding critical sections might well be still 411static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
222 * executing, but the srcu_read_lock() primitives themselves 412{
223 * will have finished executing. We initially give readers 413 struct rcu_synchronize rcu;
224 * an arbitrarily chosen 10 microseconds to get out of their 414 struct rcu_head *head = &rcu.head;
225 * SRCU read-side critical sections, then loop waiting 1/HZ 415 bool done = false;
226 * seconds per iteration. The 10-microsecond value has done
227 * very well in testing.
228 */
229
230 if (srcu_readers_active_idx(sp, idx))
231 udelay(SYNCHRONIZE_SRCU_READER_DELAY);
232 while (srcu_readers_active_idx(sp, idx))
233 schedule_timeout_interruptible(1);
234 416
235 sync_func(); /* Force memory barrier on all CPUs. */ 417 rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
418 !lock_is_held(&rcu_bh_lock_map) &&
419 !lock_is_held(&rcu_lock_map) &&
420 !lock_is_held(&rcu_sched_lock_map),
421 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
236 422
237 /* 423 init_completion(&rcu.completion);
238 * The preceding synchronize_sched() forces all srcu_read_unlock() 424
239 * primitives that were executing concurrently with the preceding 425 head->next = NULL;
240 * for_each_possible_cpu() loop to have completed by this point. 426 head->func = wakeme_after_rcu;
241 * More importantly, it also forces the corresponding SRCU read-side 427 spin_lock_irq(&sp->queue_lock);
242 * critical sections to have also completed, and the corresponding 428 if (!sp->running) {
243 * references to SRCU-protected data items to be dropped. 429 /* steal the processing owner */
244 * 430 sp->running = true;
245 * Note: 431 rcu_batch_queue(&sp->batch_check0, head);
246 * 432 spin_unlock_irq(&sp->queue_lock);
247 * Despite what you might think at first glance, the 433
248 * preceding synchronize_sched() -must- be within the 434 srcu_advance_batches(sp, trycount);
249 * critical section ended by the following mutex_unlock(). 435 if (!rcu_batch_empty(&sp->batch_done)) {
250 * Otherwise, a task taking the early exit can race 436 BUG_ON(sp->batch_done.head != head);
251 * with a srcu_read_unlock(), which might have executed 437 rcu_batch_dequeue(&sp->batch_done);
252 * just before the preceding srcu_readers_active() check, 438 done = true;
253 * and whose CPU might have reordered the srcu_read_unlock() 439 }
254 * with the preceding critical section. In this case, there 440 /* give the processing owner to work_struct */
255 * is nothing preventing the synchronize_sched() task that is 441 srcu_reschedule(sp);
256 * taking the early exit from freeing a data structure that 442 } else {
257 * is still being referenced (out of order) by the task 443 rcu_batch_queue(&sp->batch_queue, head);
258 * doing the srcu_read_unlock(). 444 spin_unlock_irq(&sp->queue_lock);
259 * 445 }
260 * Alternatively, the comparison with "2" on the early exit
261 * could be changed to "3", but this increases synchronize_srcu()
262 * latency for bulk loads. So the current code is preferred.
263 */
264 446
265 mutex_unlock(&sp->mutex); 447 if (!done)
448 wait_for_completion(&rcu.completion);
266} 449}
267 450
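
On the update side, the call_srcu() interface added above is normally used with an rcu_head embedded in the protected object. Continuing the illustrative my_srcu/my_ptr names from the read-side sketch earlier, with my_data extended to embed an rcu_head (free_my_data and replace_my_data are made up for this example):

struct my_data {
        int value;
        struct rcu_head rcu;    /* handed to call_srcu() */
};

static void free_my_data(struct rcu_head *head)
{
        kfree(container_of(head, struct my_data, rcu));
}

static void replace_my_data(struct my_data *newp)
{
        struct my_data *old;

        old = rcu_dereference_protected(my_ptr, 1);     /* caller serializes updates */
        rcu_assign_pointer(my_ptr, newp);
        if (old)
                call_srcu(&my_srcu, &old->rcu, free_my_data);   /* freed after a GP */
}
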
268/** 451/**
@@ -281,7 +464,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
281 */ 464 */
282void synchronize_srcu(struct srcu_struct *sp) 465void synchronize_srcu(struct srcu_struct *sp)
283{ 466{
284 __synchronize_srcu(sp, synchronize_sched); 467 __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT);
285} 468}
286EXPORT_SYMBOL_GPL(synchronize_srcu); 469EXPORT_SYMBOL_GPL(synchronize_srcu);
287 470
@@ -289,18 +472,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
289 * synchronize_srcu_expedited - Brute-force SRCU grace period 472 * synchronize_srcu_expedited - Brute-force SRCU grace period
290 * @sp: srcu_struct with which to synchronize. 473 * @sp: srcu_struct with which to synchronize.
291 * 474 *
292 * Wait for an SRCU grace period to elapse, but use a "big hammer" 475 * Wait for an SRCU grace period to elapse, but be more aggressive about
293 * approach to force the grace period to end quickly. This consumes 476 * spinning rather than blocking when waiting.
294 * significant time on all CPUs and is unfriendly to real-time workloads,
295 * so is thus not recommended for any sort of common-case code. In fact,
296 * if you are using synchronize_srcu_expedited() in a loop, please
297 * restructure your code to batch your updates, and then use a single
298 * synchronize_srcu() instead.
299 * 477 *
300 * Note that it is illegal to call this function while holding any lock 478 * Note that it is illegal to call this function while holding any lock
301 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal 479 * that is acquired by a CPU-hotplug notifier. It is also illegal to call
302 * to call this function from a CPU-hotplug notifier. Failing to observe
303 * these restriction will result in deadlock. It is also illegal to call
304 * synchronize_srcu_expedited() from the corresponding SRCU read-side 480 * synchronize_srcu_expedited() from the corresponding SRCU read-side
305 * critical section; doing so will result in deadlock. However, it is 481 * critical section; doing so will result in deadlock. However, it is
306 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct 482 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct
@@ -309,20 +485,166 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
309 */ 485 */
310void synchronize_srcu_expedited(struct srcu_struct *sp) 486void synchronize_srcu_expedited(struct srcu_struct *sp)
311{ 487{
312 __synchronize_srcu(sp, synchronize_sched_expedited); 488 __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT);
313} 489}
314EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); 490EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
315 491
316/** 492/**
493 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
494 */
495void srcu_barrier(struct srcu_struct *sp)
496{
497 synchronize_srcu(sp);
498}
499EXPORT_SYMBOL_GPL(srcu_barrier);
500
501/**
317 * srcu_batches_completed - return batches completed. 502 * srcu_batches_completed - return batches completed.
318 * @sp: srcu_struct on which to report batch completion. 503 * @sp: srcu_struct on which to report batch completion.
319 * 504 *
320 * Report the number of batches, correlated with, but not necessarily 505 * Report the number of batches, correlated with, but not necessarily
321 * precisely the same as, the number of grace periods that have elapsed. 506 * precisely the same as, the number of grace periods that have elapsed.
322 */ 507 */
323
324long srcu_batches_completed(struct srcu_struct *sp) 508long srcu_batches_completed(struct srcu_struct *sp)
325{ 509{
326 return sp->completed; 510 return sp->completed;
327} 511}
328EXPORT_SYMBOL_GPL(srcu_batches_completed); 512EXPORT_SYMBOL_GPL(srcu_batches_completed);
513
514#define SRCU_CALLBACK_BATCH 10
515#define SRCU_INTERVAL 1
516
517/*
518 * Move any new SRCU callbacks to the first stage of the SRCU grace
519 * period pipeline.
520 */
521static void srcu_collect_new(struct srcu_struct *sp)
522{
523 if (!rcu_batch_empty(&sp->batch_queue)) {
524 spin_lock_irq(&sp->queue_lock);
525 rcu_batch_move(&sp->batch_check0, &sp->batch_queue);
526 spin_unlock_irq(&sp->queue_lock);
527 }
528}
529
530/*
531 * Core SRCU state machine. Advance callbacks from ->batch_check0 to
532 * ->batch_check1 and then to ->batch_done as readers drain.
533 */
534static void srcu_advance_batches(struct srcu_struct *sp, int trycount)
535{
536 int idx = 1 ^ (sp->completed & 1);
537
538 /*
539 * Because readers might be delayed for an extended period after
540 * fetching ->completed for their index, at any point in time there
541 * might well be readers using both idx=0 and idx=1. We therefore
542 * need to wait for readers to clear from both index values before
543 * invoking a callback.
544 */
545
546 if (rcu_batch_empty(&sp->batch_check0) &&
547 rcu_batch_empty(&sp->batch_check1))
548 return; /* no callbacks need to be advanced */
549
550 if (!try_check_zero(sp, idx, trycount))
551 return; /* failed to advance, will try after SRCU_INTERVAL */
552
553 /*
554 * The callbacks in ->batch_check1 already had their first zero
555 * check and flip done while they sat on ->batch_check0 during a
556 * previous invocation of srcu_advance_batches().
557 * (Presumably try_check_zero() returned false during that
558 * invocation, leaving the callbacks stranded on ->batch_check1.)
559 * They are therefore ready to invoke, so move them to ->batch_done.
560 */
561 rcu_batch_move(&sp->batch_done, &sp->batch_check1);
562
563 if (rcu_batch_empty(&sp->batch_check0))
564 return; /* no callbacks need to be advanced */
565 srcu_flip(sp);
566
567 /*
568 * The callbacks in ->batch_check0 just finished their
569 * first zero check and flip, so move them to ->batch_check1
570 * for future checking on the other idx.
571 */
572 rcu_batch_move(&sp->batch_check1, &sp->batch_check0);
573
574 /*
575 * SRCU read-side critical sections are normally short, so check
576 * at least twice in quick succession after a flip.
577 */
578 trycount = trycount < 2 ? 2 : trycount;
579 if (!try_check_zero(sp, idx^1, trycount))
580 return; /* failed to advance, will try after SRCU_INTERVAL */
581
582 /*
583 * The callbacks in ->batch_check1 have now waited for all
584 * pre-existing readers using both idx values. They are therefore
585 * ready to invoke, so move them to ->batch_done.
586 */
587 rcu_batch_move(&sp->batch_done, &sp->batch_check1);
588}
589
590/*
591 * Invoke a limited number of SRCU callbacks that have passed through
592 * their grace period. If there are more to do, SRCU will reschedule
593 * the workqueue.
594 */
595static void srcu_invoke_callbacks(struct srcu_struct *sp)
596{
597 int i;
598 struct rcu_head *head;
599
600 for (i = 0; i < SRCU_CALLBACK_BATCH; i++) {
601 head = rcu_batch_dequeue(&sp->batch_done);
602 if (!head)
603 break;
604 local_bh_disable();
605 head->func(head);
606 local_bh_enable();
607 }
608}
609
610/*
611 * Finished one round of SRCU grace period. Start another if there are
612 * more SRCU callbacks queued, otherwise put SRCU into not-running state.
613 */
614static void srcu_reschedule(struct srcu_struct *sp)
615{
616 bool pending = true;
617
618 if (rcu_batch_empty(&sp->batch_done) &&
619 rcu_batch_empty(&sp->batch_check1) &&
620 rcu_batch_empty(&sp->batch_check0) &&
621 rcu_batch_empty(&sp->batch_queue)) {
622 spin_lock_irq(&sp->queue_lock);
623 if (rcu_batch_empty(&sp->batch_done) &&
624 rcu_batch_empty(&sp->batch_check1) &&
625 rcu_batch_empty(&sp->batch_check0) &&
626 rcu_batch_empty(&sp->batch_queue)) {
627 sp->running = false;
628 pending = false;
629 }
630 spin_unlock_irq(&sp->queue_lock);
631 }
632
633 if (pending)
634 queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL);
635}
636
637/*
638 * This is the work-queue function that handles SRCU grace periods.
639 */
640static void process_srcu(struct work_struct *work)
641{
642 struct srcu_struct *sp;
643
644 sp = container_of(work, struct srcu_struct, work.work);
645
646 srcu_collect_new(sp);
647 srcu_advance_batches(sp, 1);
648 srcu_invoke_callbacks(sp);
649 srcu_reschedule(sp);
650}
diff --git a/kernel/sys.c b/kernel/sys.c
index e7006eb6c1e..6df42624e45 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -93,10 +93,8 @@
93int overflowuid = DEFAULT_OVERFLOWUID; 93int overflowuid = DEFAULT_OVERFLOWUID;
94int overflowgid = DEFAULT_OVERFLOWGID; 94int overflowgid = DEFAULT_OVERFLOWGID;
95 95
96#ifdef CONFIG_UID16
97EXPORT_SYMBOL(overflowuid); 96EXPORT_SYMBOL(overflowuid);
98EXPORT_SYMBOL(overflowgid); 97EXPORT_SYMBOL(overflowgid);
99#endif
100 98
101/* 99/*
102 * the same as above, but for filesystems which can only store a 16-bit 100 * the same as above, but for filesystems which can only store a 16-bit
@@ -133,11 +131,10 @@ static bool set_one_prio_perm(struct task_struct *p)
133{ 131{
134 const struct cred *cred = current_cred(), *pcred = __task_cred(p); 132 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
135 133
136 if (pcred->user->user_ns == cred->user->user_ns && 134 if (uid_eq(pcred->uid, cred->euid) ||
137 (pcred->uid == cred->euid || 135 uid_eq(pcred->euid, cred->euid))
138 pcred->euid == cred->euid))
139 return true; 136 return true;
140 if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) 137 if (ns_capable(pcred->user_ns, CAP_SYS_NICE))
141 return true; 138 return true;
142 return false; 139 return false;
143} 140}
@@ -177,6 +174,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
177 const struct cred *cred = current_cred(); 174 const struct cred *cred = current_cred();
178 int error = -EINVAL; 175 int error = -EINVAL;
179 struct pid *pgrp; 176 struct pid *pgrp;
177 kuid_t uid;
180 178
181 if (which > PRIO_USER || which < PRIO_PROCESS) 179 if (which > PRIO_USER || which < PRIO_PROCESS)
182 goto out; 180 goto out;
@@ -209,18 +207,19 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
209 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 207 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
210 break; 208 break;
211 case PRIO_USER: 209 case PRIO_USER:
212 user = (struct user_struct *) cred->user; 210 uid = make_kuid(cred->user_ns, who);
211 user = cred->user;
213 if (!who) 212 if (!who)
214 who = cred->uid; 213 uid = cred->uid;
215 else if ((who != cred->uid) && 214 else if (!uid_eq(uid, cred->uid) &&
216 !(user = find_user(who))) 215 !(user = find_user(uid)))
217 goto out_unlock; /* No processes for this user */ 216 goto out_unlock; /* No processes for this user */
218 217
219 do_each_thread(g, p) { 218 do_each_thread(g, p) {
220 if (__task_cred(p)->uid == who) 219 if (uid_eq(task_uid(p), uid))
221 error = set_one_prio(p, niceval, error); 220 error = set_one_prio(p, niceval, error);
222 } while_each_thread(g, p); 221 } while_each_thread(g, p);
223 if (who != cred->uid) 222 if (!uid_eq(uid, cred->uid))
224 free_uid(user); /* For find_user() */ 223 free_uid(user); /* For find_user() */
225 break; 224 break;
226 } 225 }
@@ -244,6 +243,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
244 const struct cred *cred = current_cred(); 243 const struct cred *cred = current_cred();
245 long niceval, retval = -ESRCH; 244 long niceval, retval = -ESRCH;
246 struct pid *pgrp; 245 struct pid *pgrp;
246 kuid_t uid;
247 247
248 if (which > PRIO_USER || which < PRIO_PROCESS) 248 if (which > PRIO_USER || which < PRIO_PROCESS)
249 return -EINVAL; 249 return -EINVAL;
@@ -274,21 +274,22 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
274 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 274 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
275 break; 275 break;
276 case PRIO_USER: 276 case PRIO_USER:
277 user = (struct user_struct *) cred->user; 277 uid = make_kuid(cred->user_ns, who);
278 user = cred->user;
278 if (!who) 279 if (!who)
279 who = cred->uid; 280 uid = cred->uid;
280 else if ((who != cred->uid) && 281 else if (!uid_eq(uid, cred->uid) &&
281 !(user = find_user(who))) 282 !(user = find_user(uid)))
282 goto out_unlock; /* No processes for this user */ 283 goto out_unlock; /* No processes for this user */
283 284
284 do_each_thread(g, p) { 285 do_each_thread(g, p) {
285 if (__task_cred(p)->uid == who) { 286 if (uid_eq(task_uid(p), uid)) {
286 niceval = 20 - task_nice(p); 287 niceval = 20 - task_nice(p);
287 if (niceval > retval) 288 if (niceval > retval)
288 retval = niceval; 289 retval = niceval;
289 } 290 }
290 } while_each_thread(g, p); 291 } while_each_thread(g, p);
291 if (who != cred->uid) 292 if (!uid_eq(uid, cred->uid))
292 free_uid(user); /* for find_user() */ 293 free_uid(user); /* for find_user() */
293 break; 294 break;
294 } 295 }
@@ -553,9 +554,19 @@ void ctrl_alt_del(void)
553 */ 554 */
554SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) 555SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
555{ 556{
557 struct user_namespace *ns = current_user_ns();
556 const struct cred *old; 558 const struct cred *old;
557 struct cred *new; 559 struct cred *new;
558 int retval; 560 int retval;
561 kgid_t krgid, kegid;
562
563 krgid = make_kgid(ns, rgid);
564 kegid = make_kgid(ns, egid);
565
566 if ((rgid != (gid_t) -1) && !gid_valid(krgid))
567 return -EINVAL;
568 if ((egid != (gid_t) -1) && !gid_valid(kegid))
569 return -EINVAL;
559 570
560 new = prepare_creds(); 571 new = prepare_creds();
561 if (!new) 572 if (!new)
@@ -564,25 +575,25 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
564 575
565 retval = -EPERM; 576 retval = -EPERM;
566 if (rgid != (gid_t) -1) { 577 if (rgid != (gid_t) -1) {
567 if (old->gid == rgid || 578 if (gid_eq(old->gid, krgid) ||
568 old->egid == rgid || 579 gid_eq(old->egid, krgid) ||
569 nsown_capable(CAP_SETGID)) 580 nsown_capable(CAP_SETGID))
570 new->gid = rgid; 581 new->gid = krgid;
571 else 582 else
572 goto error; 583 goto error;
573 } 584 }
574 if (egid != (gid_t) -1) { 585 if (egid != (gid_t) -1) {
575 if (old->gid == egid || 586 if (gid_eq(old->gid, kegid) ||
576 old->egid == egid || 587 gid_eq(old->egid, kegid) ||
577 old->sgid == egid || 588 gid_eq(old->sgid, kegid) ||
578 nsown_capable(CAP_SETGID)) 589 nsown_capable(CAP_SETGID))
579 new->egid = egid; 590 new->egid = kegid;
580 else 591 else
581 goto error; 592 goto error;
582 } 593 }
583 594
584 if (rgid != (gid_t) -1 || 595 if (rgid != (gid_t) -1 ||
585 (egid != (gid_t) -1 && egid != old->gid)) 596 (egid != (gid_t) -1 && !gid_eq(kegid, old->gid)))
586 new->sgid = new->egid; 597 new->sgid = new->egid;
587 new->fsgid = new->egid; 598 new->fsgid = new->egid;
588 599
@@ -600,9 +611,15 @@ error:
600 */ 611 */
601SYSCALL_DEFINE1(setgid, gid_t, gid) 612SYSCALL_DEFINE1(setgid, gid_t, gid)
602{ 613{
614 struct user_namespace *ns = current_user_ns();
603 const struct cred *old; 615 const struct cred *old;
604 struct cred *new; 616 struct cred *new;
605 int retval; 617 int retval;
618 kgid_t kgid;
619
620 kgid = make_kgid(ns, gid);
621 if (!gid_valid(kgid))
622 return -EINVAL;
606 623
607 new = prepare_creds(); 624 new = prepare_creds();
608 if (!new) 625 if (!new)
@@ -611,9 +628,9 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
611 628
612 retval = -EPERM; 629 retval = -EPERM;
613 if (nsown_capable(CAP_SETGID)) 630 if (nsown_capable(CAP_SETGID))
614 new->gid = new->egid = new->sgid = new->fsgid = gid; 631 new->gid = new->egid = new->sgid = new->fsgid = kgid;
615 else if (gid == old->gid || gid == old->sgid) 632 else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid))
616 new->egid = new->fsgid = gid; 633 new->egid = new->fsgid = kgid;
617 else 634 else
618 goto error; 635 goto error;
619 636
@@ -631,7 +648,7 @@ static int set_user(struct cred *new)
631{ 648{
632 struct user_struct *new_user; 649 struct user_struct *new_user;
633 650
634 new_user = alloc_uid(current_user_ns(), new->uid); 651 new_user = alloc_uid(new->uid);
635 if (!new_user) 652 if (!new_user)
636 return -EAGAIN; 653 return -EAGAIN;
637 654
@@ -670,9 +687,19 @@ static int set_user(struct cred *new)
670 */ 687 */
671SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) 688SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
672{ 689{
690 struct user_namespace *ns = current_user_ns();
673 const struct cred *old; 691 const struct cred *old;
674 struct cred *new; 692 struct cred *new;
675 int retval; 693 int retval;
694 kuid_t kruid, keuid;
695
696 kruid = make_kuid(ns, ruid);
697 keuid = make_kuid(ns, euid);
698
699 if ((ruid != (uid_t) -1) && !uid_valid(kruid))
700 return -EINVAL;
701 if ((euid != (uid_t) -1) && !uid_valid(keuid))
702 return -EINVAL;
676 703
677 new = prepare_creds(); 704 new = prepare_creds();
678 if (!new) 705 if (!new)
@@ -681,29 +708,29 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
681 708
682 retval = -EPERM; 709 retval = -EPERM;
683 if (ruid != (uid_t) -1) { 710 if (ruid != (uid_t) -1) {
684 new->uid = ruid; 711 new->uid = kruid;
685 if (old->uid != ruid && 712 if (!uid_eq(old->uid, kruid) &&
686 old->euid != ruid && 713 !uid_eq(old->euid, kruid) &&
687 !nsown_capable(CAP_SETUID)) 714 !nsown_capable(CAP_SETUID))
688 goto error; 715 goto error;
689 } 716 }
690 717
691 if (euid != (uid_t) -1) { 718 if (euid != (uid_t) -1) {
692 new->euid = euid; 719 new->euid = keuid;
693 if (old->uid != euid && 720 if (!uid_eq(old->uid, keuid) &&
694 old->euid != euid && 721 !uid_eq(old->euid, keuid) &&
695 old->suid != euid && 722 !uid_eq(old->suid, keuid) &&
696 !nsown_capable(CAP_SETUID)) 723 !nsown_capable(CAP_SETUID))
697 goto error; 724 goto error;
698 } 725 }
699 726
700 if (new->uid != old->uid) { 727 if (!uid_eq(new->uid, old->uid)) {
701 retval = set_user(new); 728 retval = set_user(new);
702 if (retval < 0) 729 if (retval < 0)
703 goto error; 730 goto error;
704 } 731 }
705 if (ruid != (uid_t) -1 || 732 if (ruid != (uid_t) -1 ||
706 (euid != (uid_t) -1 && euid != old->uid)) 733 (euid != (uid_t) -1 && !uid_eq(keuid, old->uid)))
707 new->suid = new->euid; 734 new->suid = new->euid;
708 new->fsuid = new->euid; 735 new->fsuid = new->euid;
709 736
@@ -731,9 +758,15 @@ error:
731 */ 758 */
732SYSCALL_DEFINE1(setuid, uid_t, uid) 759SYSCALL_DEFINE1(setuid, uid_t, uid)
733{ 760{
761 struct user_namespace *ns = current_user_ns();
734 const struct cred *old; 762 const struct cred *old;
735 struct cred *new; 763 struct cred *new;
736 int retval; 764 int retval;
765 kuid_t kuid;
766
767 kuid = make_kuid(ns, uid);
768 if (!uid_valid(kuid))
769 return -EINVAL;
737 770
738 new = prepare_creds(); 771 new = prepare_creds();
739 if (!new) 772 if (!new)
@@ -742,17 +775,17 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
742 775
743 retval = -EPERM; 776 retval = -EPERM;
744 if (nsown_capable(CAP_SETUID)) { 777 if (nsown_capable(CAP_SETUID)) {
745 new->suid = new->uid = uid; 778 new->suid = new->uid = kuid;
746 if (uid != old->uid) { 779 if (!uid_eq(kuid, old->uid)) {
747 retval = set_user(new); 780 retval = set_user(new);
748 if (retval < 0) 781 if (retval < 0)
749 goto error; 782 goto error;
750 } 783 }
751 } else if (uid != old->uid && uid != new->suid) { 784 } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) {
752 goto error; 785 goto error;
753 } 786 }
754 787
755 new->fsuid = new->euid = uid; 788 new->fsuid = new->euid = kuid;
756 789
757 retval = security_task_fix_setuid(new, old, LSM_SETID_ID); 790 retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
758 if (retval < 0) 791 if (retval < 0)
@@ -772,9 +805,24 @@ error:
772 */ 805 */
773SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) 806SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
774{ 807{
808 struct user_namespace *ns = current_user_ns();
775 const struct cred *old; 809 const struct cred *old;
776 struct cred *new; 810 struct cred *new;
777 int retval; 811 int retval;
812 kuid_t kruid, keuid, ksuid;
813
814 kruid = make_kuid(ns, ruid);
815 keuid = make_kuid(ns, euid);
816 ksuid = make_kuid(ns, suid);
817
818 if ((ruid != (uid_t) -1) && !uid_valid(kruid))
819 return -EINVAL;
820
821 if ((euid != (uid_t) -1) && !uid_valid(keuid))
822 return -EINVAL;
823
824 if ((suid != (uid_t) -1) && !uid_valid(ksuid))
825 return -EINVAL;
778 826
779 new = prepare_creds(); 827 new = prepare_creds();
780 if (!new) 828 if (!new)
@@ -784,29 +832,29 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
784 832
785 retval = -EPERM; 833 retval = -EPERM;
786 if (!nsown_capable(CAP_SETUID)) { 834 if (!nsown_capable(CAP_SETUID)) {
787 if (ruid != (uid_t) -1 && ruid != old->uid && 835 if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
788 ruid != old->euid && ruid != old->suid) 836 !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
789 goto error; 837 goto error;
790 if (euid != (uid_t) -1 && euid != old->uid && 838 if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) &&
791 euid != old->euid && euid != old->suid) 839 !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid))
792 goto error; 840 goto error;
793 if (suid != (uid_t) -1 && suid != old->uid && 841 if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) &&
794 suid != old->euid && suid != old->suid) 842 !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid))
795 goto error; 843 goto error;
796 } 844 }
797 845
798 if (ruid != (uid_t) -1) { 846 if (ruid != (uid_t) -1) {
799 new->uid = ruid; 847 new->uid = kruid;
800 if (ruid != old->uid) { 848 if (!uid_eq(kruid, old->uid)) {
801 retval = set_user(new); 849 retval = set_user(new);
802 if (retval < 0) 850 if (retval < 0)
803 goto error; 851 goto error;
804 } 852 }
805 } 853 }
806 if (euid != (uid_t) -1) 854 if (euid != (uid_t) -1)
807 new->euid = euid; 855 new->euid = keuid;
808 if (suid != (uid_t) -1) 856 if (suid != (uid_t) -1)
809 new->suid = suid; 857 new->suid = ksuid;
810 new->fsuid = new->euid; 858 new->fsuid = new->euid;
811 859
812 retval = security_task_fix_setuid(new, old, LSM_SETID_RES); 860 retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
@@ -820,14 +868,19 @@ error:
820 return retval; 868 return retval;
821} 869}
822 870
823SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid) 871SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp)
824{ 872{
825 const struct cred *cred = current_cred(); 873 const struct cred *cred = current_cred();
826 int retval; 874 int retval;
875 uid_t ruid, euid, suid;
827 876
828 if (!(retval = put_user(cred->uid, ruid)) && 877 ruid = from_kuid_munged(cred->user_ns, cred->uid);
829 !(retval = put_user(cred->euid, euid))) 878 euid = from_kuid_munged(cred->user_ns, cred->euid);
830 retval = put_user(cred->suid, suid); 879 suid = from_kuid_munged(cred->user_ns, cred->suid);
880
881 if (!(retval = put_user(ruid, ruidp)) &&
882 !(retval = put_user(euid, euidp)))
883 retval = put_user(suid, suidp);
831 884
832 return retval; 885 return retval;
833} 886}
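
The conversions above all follow one pattern: values coming from userspace are turned into kernel-internal kuid_t/kgid_t with make_kuid()/make_kgid() and rejected if the caller's namespace has no mapping, while values reported back go through from_kuid_munged()/from_kgid_munged(), which falls back to the overflow id. A hedged sketch of that round trip in a hypothetical helper (struct my_object and both functions are illustrative):

struct my_object { kuid_t owner; };             /* always store kuid_t internally */

static int store_owner(struct my_object *obj, uid_t uid)
{
        kuid_t kuid = make_kuid(current_user_ns(), uid);

        if (!uid_valid(kuid))
                return -EINVAL;                 /* no mapping in the caller's ns */
        obj->owner = kuid;
        return 0;
}

static uid_t report_owner(const struct my_object *obj)
{
        /* May hand back the overflow uid if the caller's ns cannot map it. */
        return from_kuid_munged(current_user_ns(), obj->owner);
}
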
@@ -837,9 +890,22 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __u
837 */ 890 */
838SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) 891SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
839{ 892{
893 struct user_namespace *ns = current_user_ns();
840 const struct cred *old; 894 const struct cred *old;
841 struct cred *new; 895 struct cred *new;
842 int retval; 896 int retval;
897 kgid_t krgid, kegid, ksgid;
898
899 krgid = make_kgid(ns, rgid);
900 kegid = make_kgid(ns, egid);
901 ksgid = make_kgid(ns, sgid);
902
903 if ((rgid != (gid_t) -1) && !gid_valid(krgid))
904 return -EINVAL;
905 if ((egid != (gid_t) -1) && !gid_valid(kegid))
906 return -EINVAL;
907 if ((sgid != (gid_t) -1) && !gid_valid(ksgid))
908 return -EINVAL;
843 909
844 new = prepare_creds(); 910 new = prepare_creds();
845 if (!new) 911 if (!new)
@@ -848,23 +914,23 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
848 914
849 retval = -EPERM; 915 retval = -EPERM;
850 if (!nsown_capable(CAP_SETGID)) { 916 if (!nsown_capable(CAP_SETGID)) {
851 if (rgid != (gid_t) -1 && rgid != old->gid && 917 if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) &&
852 rgid != old->egid && rgid != old->sgid) 918 !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid))
853 goto error; 919 goto error;
854 if (egid != (gid_t) -1 && egid != old->gid && 920 if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) &&
855 egid != old->egid && egid != old->sgid) 921 !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid))
856 goto error; 922 goto error;
857 if (sgid != (gid_t) -1 && sgid != old->gid && 923 if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) &&
858 sgid != old->egid && sgid != old->sgid) 924 !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid))
859 goto error; 925 goto error;
860 } 926 }
861 927
862 if (rgid != (gid_t) -1) 928 if (rgid != (gid_t) -1)
863 new->gid = rgid; 929 new->gid = krgid;
864 if (egid != (gid_t) -1) 930 if (egid != (gid_t) -1)
865 new->egid = egid; 931 new->egid = kegid;
866 if (sgid != (gid_t) -1) 932 if (sgid != (gid_t) -1)
867 new->sgid = sgid; 933 new->sgid = ksgid;
868 new->fsgid = new->egid; 934 new->fsgid = new->egid;
869 935
870 return commit_creds(new); 936 return commit_creds(new);
@@ -874,14 +940,19 @@ error:
874 return retval; 940 return retval;
875} 941}
876 942
877SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid) 943SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp)
878{ 944{
879 const struct cred *cred = current_cred(); 945 const struct cred *cred = current_cred();
880 int retval; 946 int retval;
947 gid_t rgid, egid, sgid;
948
949 rgid = from_kgid_munged(cred->user_ns, cred->gid);
950 egid = from_kgid_munged(cred->user_ns, cred->egid);
951 sgid = from_kgid_munged(cred->user_ns, cred->sgid);
881 952
882 if (!(retval = put_user(cred->gid, rgid)) && 953 if (!(retval = put_user(rgid, rgidp)) &&
883 !(retval = put_user(cred->egid, egid))) 954 !(retval = put_user(egid, egidp)))
884 retval = put_user(cred->sgid, sgid); 955 retval = put_user(sgid, sgidp);
885 956
886 return retval; 957 return retval;
887} 958}
@@ -898,18 +969,24 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
898 const struct cred *old; 969 const struct cred *old;
899 struct cred *new; 970 struct cred *new;
900 uid_t old_fsuid; 971 uid_t old_fsuid;
972 kuid_t kuid;
973
974 old = current_cred();
975 old_fsuid = from_kuid_munged(old->user_ns, old->fsuid);
976
977 kuid = make_kuid(old->user_ns, uid);
978 if (!uid_valid(kuid))
979 return old_fsuid;
901 980
902 new = prepare_creds(); 981 new = prepare_creds();
903 if (!new) 982 if (!new)
904 return current_fsuid(); 983 return old_fsuid;
905 old = current_cred();
906 old_fsuid = old->fsuid;
907 984
908 if (uid == old->uid || uid == old->euid || 985 if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) ||
909 uid == old->suid || uid == old->fsuid || 986 uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) ||
910 nsown_capable(CAP_SETUID)) { 987 nsown_capable(CAP_SETUID)) {
911 if (uid != old_fsuid) { 988 if (!uid_eq(kuid, old->fsuid)) {
912 new->fsuid = uid; 989 new->fsuid = kuid;
913 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) 990 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
914 goto change_okay; 991 goto change_okay;
915 } 992 }
@@ -931,18 +1008,24 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
931 const struct cred *old; 1008 const struct cred *old;
932 struct cred *new; 1009 struct cred *new;
933 gid_t old_fsgid; 1010 gid_t old_fsgid;
1011 kgid_t kgid;
1012
1013 old = current_cred();
1014 old_fsgid = from_kgid_munged(old->user_ns, old->fsgid);
1015
1016 kgid = make_kgid(old->user_ns, gid);
1017 if (!gid_valid(kgid))
1018 return old_fsgid;
934 1019
935 new = prepare_creds(); 1020 new = prepare_creds();
936 if (!new) 1021 if (!new)
937 return current_fsgid(); 1022 return old_fsgid;
938 old = current_cred();
939 old_fsgid = old->fsgid;
940 1023
941 if (gid == old->gid || gid == old->egid || 1024 if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) ||
942 gid == old->sgid || gid == old->fsgid || 1025 gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) ||
943 nsown_capable(CAP_SETGID)) { 1026 nsown_capable(CAP_SETGID)) {
944 if (gid != old_fsgid) { 1027 if (!gid_eq(kgid, old->fsgid)) {
945 new->fsgid = gid; 1028 new->fsgid = kgid;
946 goto change_okay; 1029 goto change_okay;
947 } 1030 }
948 } 1031 }
@@ -1498,15 +1581,14 @@ static int check_prlimit_permission(struct task_struct *task)
1498 return 0; 1581 return 0;
1499 1582
1500 tcred = __task_cred(task); 1583 tcred = __task_cred(task);
1501 if (cred->user->user_ns == tcred->user->user_ns && 1584 if (uid_eq(cred->uid, tcred->euid) &&
1502 (cred->uid == tcred->euid && 1585 uid_eq(cred->uid, tcred->suid) &&
1503 cred->uid == tcred->suid && 1586 uid_eq(cred->uid, tcred->uid) &&
1504 cred->uid == tcred->uid && 1587 gid_eq(cred->gid, tcred->egid) &&
1505 cred->gid == tcred->egid && 1588 gid_eq(cred->gid, tcred->sgid) &&
1506 cred->gid == tcred->sgid && 1589 gid_eq(cred->gid, tcred->gid))
1507 cred->gid == tcred->gid))
1508 return 0; 1590 return 0;
1509 if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE)) 1591 if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE))
1510 return 0; 1592 return 0;
1511 1593
1512 return -EPERM; 1594 return -EPERM;
@@ -1908,7 +1990,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1908 error = prctl_get_seccomp(); 1990 error = prctl_get_seccomp();
1909 break; 1991 break;
1910 case PR_SET_SECCOMP: 1992 case PR_SET_SECCOMP:
1911 error = prctl_set_seccomp(arg2); 1993 error = prctl_set_seccomp(arg2, (char __user *)arg3);
1912 break; 1994 break;
1913 case PR_GET_TSC: 1995 case PR_GET_TSC:
1914 error = GET_TSC_CTL(arg2); 1996 error = GET_TSC_CTL(arg2);
@@ -1979,6 +2061,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1979 error = put_user(me->signal->is_child_subreaper, 2061 error = put_user(me->signal->is_child_subreaper,
1980 (int __user *) arg2); 2062 (int __user *) arg2);
1981 break; 2063 break;
2064 case PR_SET_NO_NEW_PRIVS:
2065 if (arg2 != 1 || arg3 || arg4 || arg5)
2066 return -EINVAL;
2067
2068 current->no_new_privs = 1;
2069 break;
2070 case PR_GET_NO_NEW_PRIVS:
2071 if (arg2 || arg3 || arg4 || arg5)
2072 return -EINVAL;
2073 return current->no_new_privs ? 1 : 0;
1982 default: 2074 default:
1983 error = -EINVAL; 2075 error = -EINVAL;
1984 break; 2076 break;
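
From userspace, the new pair of prctl options is used as below; the numeric fallbacks are assumptions for headers that predate this series (the real definitions live in the uapi prctl header added elsewhere in the patch set):

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_NO_NEW_PRIVS
#define PR_SET_NO_NEW_PRIVS 38  /* assumed value, provided by <linux/prctl.h> in this series */
#define PR_GET_NO_NEW_PRIVS 39
#endif

int main(void)
{
        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
                perror("PR_SET_NO_NEW_PRIVS");
                return 1;
        }
        /* From here on, execve() can no longer grant privileges to this task. */
        printf("no_new_privs = %d\n", (int)prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0));
        return 0;
}
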
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 8a538c55fc7..aa27d391bfc 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -59,7 +59,7 @@ static DEFINE_SPINLOCK(rtcdev_lock);
59 * If one has not already been chosen, it checks to see if a 59 * If one has not already been chosen, it checks to see if a
60 * functional rtc device is available. 60 * functional rtc device is available.
61 */ 61 */
62static struct rtc_device *alarmtimer_get_rtcdev(void) 62struct rtc_device *alarmtimer_get_rtcdev(void)
63{ 63{
64 unsigned long flags; 64 unsigned long flags;
65 struct rtc_device *ret; 65 struct rtc_device *ret;
@@ -115,7 +115,7 @@ static void alarmtimer_rtc_interface_remove(void)
115 class_interface_unregister(&alarmtimer_rtc_interface); 115 class_interface_unregister(&alarmtimer_rtc_interface);
116} 116}
117#else 117#else
118static inline struct rtc_device *alarmtimer_get_rtcdev(void) 118struct rtc_device *alarmtimer_get_rtcdev(void)
119{ 119{
120 return NULL; 120 return NULL;
121} 121}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index bf57abdc7bd..f113755695e 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -346,7 +346,8 @@ int tick_resume_broadcast(void)
346 tick_get_broadcast_mask()); 346 tick_get_broadcast_mask());
347 break; 347 break;
348 case TICKDEV_MODE_ONESHOT: 348 case TICKDEV_MODE_ONESHOT:
349 broadcast = tick_resume_broadcast_oneshot(bc); 349 if (!cpumask_empty(tick_get_broadcast_mask()))
350 broadcast = tick_resume_broadcast_oneshot(bc);
350 break; 351 break;
351 } 352 }
352 } 353 }
@@ -373,6 +374,9 @@ static int tick_broadcast_set_event(ktime_t expires, int force)
373{ 374{
374 struct clock_event_device *bc = tick_broadcast_device.evtdev; 375 struct clock_event_device *bc = tick_broadcast_device.evtdev;
375 376
377 if (bc->mode != CLOCK_EVT_MODE_ONESHOT)
378 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
379
376 return clockevents_program_event(bc, expires, force); 380 return clockevents_program_event(bc, expires, force);
377} 381}
378 382
@@ -531,7 +535,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
531 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; 535 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
532 536
533 bc->event_handler = tick_handle_oneshot_broadcast; 537 bc->event_handler = tick_handle_oneshot_broadcast;
534 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
535 538
536 /* Take the do_timer update */ 539 /* Take the do_timer update */
537 tick_do_timer_cpu = cpu; 540 tick_do_timer_cpu = cpu;
@@ -549,6 +552,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
549 to_cpumask(tmpmask)); 552 to_cpumask(tmpmask));
550 553
551 if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { 554 if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) {
555 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
552 tick_broadcast_init_next_event(to_cpumask(tmpmask), 556 tick_broadcast_init_next_event(to_cpumask(tmpmask),
553 tick_next_period); 557 tick_next_period);
554 tick_broadcast_set_event(tick_next_period, 1); 558 tick_broadcast_set_event(tick_next_period, 1);
@@ -577,15 +581,10 @@ void tick_broadcast_switch_to_oneshot(void)
577 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 581 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
578 582
579 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; 583 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
580
581 if (cpumask_empty(tick_get_broadcast_mask()))
582 goto end;
583
584 bc = tick_broadcast_device.evtdev; 584 bc = tick_broadcast_device.evtdev;
585 if (bc) 585 if (bc)
586 tick_broadcast_setup_oneshot(bc); 586 tick_broadcast_setup_oneshot(bc);
587 587
588end:
589 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 588 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
590} 589}
591 590
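The broadcast device is no longer forced into oneshot mode at setup time: tick_broadcast_set_event() now switches it lazily when the first event is actually programmed, and the resume path only reprograms it when the broadcast mask is non-empty. A toy model of that lazy mode switch, in plain userspace C rather than kernel code:

#include <stdio.h>

enum clock_mode { MODE_PERIODIC, MODE_ONESHOT };

struct clock_dev { enum clock_mode mode; };

static void set_mode(struct clock_dev *dev, enum clock_mode mode)
{
    dev->mode = mode;
    printf("mode -> %s\n", mode == MODE_ONESHOT ? "oneshot" : "periodic");
}

static void program_event(struct clock_dev *dev, long expires)
{
    if (dev->mode != MODE_ONESHOT)      /* switch lazily, on first use */
        set_mode(dev, MODE_ONESHOT);
    printf("programmed event at %ld\n", expires);
}

int main(void)
{
    struct clock_dev bc = { MODE_PERIODIC };
    program_event(&bc, 100);    /* switches the mode, then programs */
    program_event(&bc, 200);    /* already oneshot, just programs */
    return 0;
}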
diff --git a/kernel/timer.c b/kernel/timer.c
index a297ffcf888..6ec7e7e0db4 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -861,7 +861,13 @@ EXPORT_SYMBOL(mod_timer);
861 * 861 *
862 * mod_timer_pinned() is a way to update the expire field of an 862 * mod_timer_pinned() is a way to update the expire field of an
863 * active timer (if the timer is inactive it will be activated) 863 * active timer (if the timer is inactive it will be activated)
864 * and not allow the timer to be migrated to a different CPU. 864 * and to ensure that the timer is scheduled on the current CPU.
865 *
866 * Note that this does not prevent the timer from being migrated
867 * when the current CPU goes offline. If this is a problem for
868 * you, use CPU-hotplug notifiers to handle it correctly, for
869 * example, cancelling the timer when the corresponding CPU goes
870 * offline.
865 * 871 *
866 * mod_timer_pinned(timer, expires) is equivalent to: 872 * mod_timer_pinned(timer, expires) is equivalent to:
867 * 873 *
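The expanded comment above recommends pairing mod_timer_pinned() with a CPU-hotplug notifier so the timer is cancelled before its CPU goes away. A kernel-style sketch of that pattern; the names my_timer, my_timer_fn and my_cpu_notify are hypothetical driver code, not part of this patch.

#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/timer.h>
#include <linux/jiffies.h>
#include <linux/smp.h>

static struct timer_list my_timer;      /* hypothetical per-driver timer */
static int my_timer_cpu;                /* CPU the timer was pinned to */

static void my_timer_fn(unsigned long data)
{
        /* ... do the CPU-local work, then re-arm on the same CPU ... */
        mod_timer_pinned(&my_timer, jiffies + HZ);
}

static int my_cpu_notify(struct notifier_block *nb, unsigned long action,
                         void *hcpu)
{
        long cpu = (long)hcpu;

        /* cancel the pinned timer before its CPU disappears */
        if ((action & ~CPU_TASKS_FROZEN) == CPU_DOWN_PREPARE &&
            cpu == my_timer_cpu)
                del_timer_sync(&my_timer);
        return NOTIFY_OK;
}

static struct notifier_block my_cpu_nb = {
        .notifier_call = my_cpu_notify,
};

static void my_start(void)
{
        my_timer_cpu = get_cpu();               /* pin to this CPU */
        setup_timer(&my_timer, my_timer_fn, 0);
        mod_timer_pinned(&my_timer, jiffies + HZ);
        put_cpu();
        register_cpu_notifier(&my_cpu_nb);
}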
@@ -1102,7 +1108,9 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1102 * warnings as well as problems when looking into 1108 * warnings as well as problems when looking into
1103 * timer->lockdep_map, make a copy and use that here. 1109 * timer->lockdep_map, make a copy and use that here.
1104 */ 1110 */
1105 struct lockdep_map lockdep_map = timer->lockdep_map; 1111 struct lockdep_map lockdep_map;
1112
1113 lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
1106#endif 1114#endif
1107 /* 1115 /*
1108 * Couple the lock chain with the lock chain at 1116 * Couple the lock chain with the lock chain at
@@ -1427,25 +1435,25 @@ SYSCALL_DEFINE0(getppid)
1427SYSCALL_DEFINE0(getuid) 1435SYSCALL_DEFINE0(getuid)
1428{ 1436{
1429 /* Only we change this so SMP safe */ 1437 /* Only we change this so SMP safe */
1430 return current_uid(); 1438 return from_kuid_munged(current_user_ns(), current_uid());
1431} 1439}
1432 1440
1433SYSCALL_DEFINE0(geteuid) 1441SYSCALL_DEFINE0(geteuid)
1434{ 1442{
1435 /* Only we change this so SMP safe */ 1443 /* Only we change this so SMP safe */
1436 return current_euid(); 1444 return from_kuid_munged(current_user_ns(), current_euid());
1437} 1445}
1438 1446
1439SYSCALL_DEFINE0(getgid) 1447SYSCALL_DEFINE0(getgid)
1440{ 1448{
1441 /* Only we change this so SMP safe */ 1449 /* Only we change this so SMP safe */
1442 return current_gid(); 1450 return from_kgid_munged(current_user_ns(), current_gid());
1443} 1451}
1444 1452
1445SYSCALL_DEFINE0(getegid) 1453SYSCALL_DEFINE0(getegid)
1446{ 1454{
1447 /* Only we change this so SMP safe */ 1455 /* Only we change this so SMP safe */
1448 return current_egid(); 1456 return from_kgid_munged(current_user_ns(), current_egid());
1449} 1457}
1450 1458
1451#endif 1459#endif
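These syscalls now translate the kernel-internal kuid/kgid back into the caller's user namespace, reporting the overflow id when no mapping exists instead of failing. A toy userspace model of what from_kuid_munged() does, using the same first/lower_first/count extent layout as /proc/<pid>/uid_map; this is an illustration of the mapping idea, not kernel code.

#include <stdio.h>

struct uid_extent { unsigned first, lower_first, count; };

static unsigned from_kuid_munged_model(const struct uid_extent *map, int n,
                                       unsigned kuid, unsigned overflowuid)
{
    for (int i = 0; i < n; i++)
        if (kuid >= map[i].lower_first &&
            kuid < map[i].lower_first + map[i].count)
            return map[i].first + (kuid - map[i].lower_first);
    return overflowuid; /* unmapped: report the overflow uid, never fail */
}

int main(void)
{
    /* a container mapping its uids 0..65535 onto global 100000.. */
    struct uid_extent map[] = { { 0, 100000, 65536 } };

    printf("%u\n", from_kuid_munged_model(map, 1, 100000, 65534)); /* 0 */
    printf("%u\n", from_kuid_munged_model(map, 1, 42,     65534)); /* 65534 */
    return 0;
}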
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index ea4bff6295f..8c4c07071cc 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -141,7 +141,6 @@ if FTRACE
141config FUNCTION_TRACER 141config FUNCTION_TRACER
142 bool "Kernel Function Tracer" 142 bool "Kernel Function Tracer"
143 depends on HAVE_FUNCTION_TRACER 143 depends on HAVE_FUNCTION_TRACER
144 select FRAME_POINTER if !ARM_UNWIND && !PPC && !S390 && !MICROBLAZE
145 select KALLSYMS 144 select KALLSYMS
146 select GENERIC_TRACER 145 select GENERIC_TRACER
147 select CONTEXT_SWITCH_TRACER 146 select CONTEXT_SWITCH_TRACER
@@ -272,7 +271,7 @@ config PROFILE_ANNOTATED_BRANCHES
272 bool "Trace likely/unlikely profiler" 271 bool "Trace likely/unlikely profiler"
273 select TRACE_BRANCH_PROFILING 272 select TRACE_BRANCH_PROFILING
274 help 273 help
275 This tracer profiles all the the likely and unlikely macros 274 This tracer profiles all likely and unlikely macros
276 in the kernel. It will display the results in: 275 in the kernel. It will display the results in:
277 276
278 /sys/kernel/debug/tracing/trace_stat/branch_annotated 277 /sys/kernel/debug/tracing/trace_stat/branch_annotated
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 1734c03e048..b831087c820 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,7 +41,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o
41obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o 41obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
45obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 44obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
46ifeq ($(CONFIG_BLOCK),y) 45ifeq ($(CONFIG_BLOCK),y)
47obj-$(CONFIG_EVENT_TRACING) += blktrace.o 46obj-$(CONFIG_EVENT_TRACING) += blktrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index cf81f27ce6c..a008663d86c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1383,44 +1383,73 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1383 1383
1384static int ftrace_cmp_recs(const void *a, const void *b) 1384static int ftrace_cmp_recs(const void *a, const void *b)
1385{ 1385{
1386 const struct dyn_ftrace *reca = a; 1386 const struct dyn_ftrace *key = a;
1387 const struct dyn_ftrace *recb = b; 1387 const struct dyn_ftrace *rec = b;
1388 1388
1389 if (reca->ip > recb->ip) 1389 if (key->flags < rec->ip)
1390 return 1;
1391 if (reca->ip < recb->ip)
1392 return -1; 1390 return -1;
1391 if (key->ip >= rec->ip + MCOUNT_INSN_SIZE)
1392 return 1;
1393 return 0; 1393 return 0;
1394} 1394}
1395 1395
1396/** 1396static unsigned long ftrace_location_range(unsigned long start, unsigned long end)
1397 * ftrace_location - return true if the ip giving is a traced location
1398 * @ip: the instruction pointer to check
1399 *
1400 * Returns 1 if @ip given is a pointer to a ftrace location.
1401 * That is, the instruction that is either a NOP or call to
1402 * the function tracer. It checks the ftrace internal tables to
1403 * determine if the address belongs or not.
1404 */
1405int ftrace_location(unsigned long ip)
1406{ 1397{
1407 struct ftrace_page *pg; 1398 struct ftrace_page *pg;
1408 struct dyn_ftrace *rec; 1399 struct dyn_ftrace *rec;
1409 struct dyn_ftrace key; 1400 struct dyn_ftrace key;
1410 1401
1411 key.ip = ip; 1402 key.ip = start;
1403 key.flags = end; /* overload flags, as it is unsigned long */
1412 1404
1413 for (pg = ftrace_pages_start; pg; pg = pg->next) { 1405 for (pg = ftrace_pages_start; pg; pg = pg->next) {
1406 if (end < pg->records[0].ip ||
1407 start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
1408 continue;
1414 rec = bsearch(&key, pg->records, pg->index, 1409 rec = bsearch(&key, pg->records, pg->index,
1415 sizeof(struct dyn_ftrace), 1410 sizeof(struct dyn_ftrace),
1416 ftrace_cmp_recs); 1411 ftrace_cmp_recs);
1417 if (rec) 1412 if (rec)
1418 return 1; 1413 return rec->ip;
1419 } 1414 }
1420 1415
1421 return 0; 1416 return 0;
1422} 1417}
1423 1418
1419/**
 1420 * ftrace_location - return the address if the ip given is a traced location
1421 * @ip: the instruction pointer to check
1422 *
 1423 * Returns rec->ip if the @ip given points to an ftrace location.
1424 * That is, the instruction that is either a NOP or call to
1425 * the function tracer. It checks the ftrace internal tables to
1426 * determine if the address belongs or not.
1427 */
1428unsigned long ftrace_location(unsigned long ip)
1429{
1430 return ftrace_location_range(ip, ip);
1431}
1432
1433/**
1434 * ftrace_text_reserved - return true if range contains an ftrace location
1435 * @start: start of range to search
1436 * @end: end of range to search (inclusive). @end points to the last byte to check.
1437 *
 1438 * Returns 1 if the range from @start to @end contains an ftrace location.
1439 * That is, the instruction that is either a NOP or call to
1440 * the function tracer. It checks the ftrace internal tables to
1441 * determine if the address belongs or not.
1442 */
1443int ftrace_text_reserved(void *start, void *end)
1444{
1445 unsigned long ret;
1446
1447 ret = ftrace_location_range((unsigned long)start,
1448 (unsigned long)end);
1449
1450 return (int)!!ret;
1451}
1452
1424static void __ftrace_hash_rec_update(struct ftrace_ops *ops, 1453static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1425 int filter_hash, 1454 int filter_hash,
1426 bool inc) 1455 bool inc)
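ftrace_location_range() reuses ftrace_cmp_recs() as the bsearch comparator by overloading key.flags to carry the end of the range, so a record matches whenever its [ip, ip + MCOUNT_INSN_SIZE) window overlaps [start, end]. A small userspace model of that comparator trick; the MCOUNT_INSN_SIZE value and addresses are purely illustrative.

#include <stdio.h>
#include <stdlib.h>

#define MCOUNT_INSN_SIZE 5  /* assumed x86-style call size, for illustration */

struct dyn_rec { unsigned long ip; unsigned long flags; };

static int cmp_recs(const void *a, const void *b)
{
    const struct dyn_rec *key = a;  /* key->ip = start, key->flags = end */
    const struct dyn_rec *rec = b;

    if (key->flags < rec->ip)
        return -1;
    if (key->ip >= rec->ip + MCOUNT_INSN_SIZE)
        return 1;
    return 0;                       /* the ranges overlap: a hit */
}

int main(void)
{
    struct dyn_rec recs[] = { { 0x1000 }, { 0x1040 }, { 0x1080 } }; /* sorted */
    struct dyn_rec key = { .ip = 0x1042, .flags = 0x1042 };  /* single address */
    struct dyn_rec *hit;

    hit = bsearch(&key, recs, 3, sizeof(recs[0]), cmp_recs);
    printf("hit: %#lx\n", hit ? hit->ip : 0UL);   /* prints 0x1040 */
    return 0;
}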
@@ -1520,35 +1549,6 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
1520 __ftrace_hash_rec_update(ops, filter_hash, 1); 1549 __ftrace_hash_rec_update(ops, filter_hash, 1);
1521} 1550}
1522 1551
1523static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
1524{
1525 if (ftrace_pages->index == ftrace_pages->size) {
1526 /* We should have allocated enough */
1527 if (WARN_ON(!ftrace_pages->next))
1528 return NULL;
1529 ftrace_pages = ftrace_pages->next;
1530 }
1531
1532 return &ftrace_pages->records[ftrace_pages->index++];
1533}
1534
1535static struct dyn_ftrace *
1536ftrace_record_ip(unsigned long ip)
1537{
1538 struct dyn_ftrace *rec;
1539
1540 if (ftrace_disabled)
1541 return NULL;
1542
1543 rec = ftrace_alloc_dyn_node(ip);
1544 if (!rec)
1545 return NULL;
1546
1547 rec->ip = ip;
1548
1549 return rec;
1550}
1551
1552static void print_ip_ins(const char *fmt, unsigned char *p) 1552static void print_ip_ins(const char *fmt, unsigned char *p)
1553{ 1553{
1554 int i; 1554 int i;
@@ -1598,21 +1598,6 @@ void ftrace_bug(int failed, unsigned long ip)
1598 } 1598 }
1599} 1599}
1600 1600
1601
1602/* Return 1 if the address range is reserved for ftrace */
1603int ftrace_text_reserved(void *start, void *end)
1604{
1605 struct dyn_ftrace *rec;
1606 struct ftrace_page *pg;
1607
1608 do_for_each_ftrace_rec(pg, rec) {
1609 if (rec->ip <= (unsigned long)end &&
1610 rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start)
1611 return 1;
1612 } while_for_each_ftrace_rec();
1613 return 0;
1614}
1615
1616static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) 1601static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1617{ 1602{
1618 unsigned long flag = 0UL; 1603 unsigned long flag = 0UL;
@@ -1698,7 +1683,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 1698 return -1; /* unknown ftrace bug */ 1683
1699} 1684}
1700 1685
1701static void ftrace_replace_code(int update) 1686void __weak ftrace_replace_code(int enable)
1702{ 1687{
1703 struct dyn_ftrace *rec; 1688 struct dyn_ftrace *rec;
1704 struct ftrace_page *pg; 1689 struct ftrace_page *pg;
@@ -1708,7 +1693,7 @@ static void ftrace_replace_code(int update)
1708 return; 1693 return;
1709 1694
1710 do_for_each_ftrace_rec(pg, rec) { 1695 do_for_each_ftrace_rec(pg, rec) {
1711 failed = __ftrace_replace_code(rec, update); 1696 failed = __ftrace_replace_code(rec, enable);
1712 if (failed) { 1697 if (failed) {
1713 ftrace_bug(failed, rec->ip); 1698 ftrace_bug(failed, rec->ip);
1714 /* Stop processing */ 1699 /* Stop processing */
@@ -1826,22 +1811,27 @@ int __weak ftrace_arch_code_modify_post_process(void)
1826 return 0; 1811 return 0;
1827} 1812}
1828 1813
1829static int __ftrace_modify_code(void *data) 1814void ftrace_modify_all_code(int command)
1830{ 1815{
1831 int *command = data; 1816 if (command & FTRACE_UPDATE_CALLS)
1832
1833 if (*command & FTRACE_UPDATE_CALLS)
1834 ftrace_replace_code(1); 1817 ftrace_replace_code(1);
1835 else if (*command & FTRACE_DISABLE_CALLS) 1818 else if (command & FTRACE_DISABLE_CALLS)
1836 ftrace_replace_code(0); 1819 ftrace_replace_code(0);
1837 1820
1838 if (*command & FTRACE_UPDATE_TRACE_FUNC) 1821 if (command & FTRACE_UPDATE_TRACE_FUNC)
1839 ftrace_update_ftrace_func(ftrace_trace_function); 1822 ftrace_update_ftrace_func(ftrace_trace_function);
1840 1823
1841 if (*command & FTRACE_START_FUNC_RET) 1824 if (command & FTRACE_START_FUNC_RET)
1842 ftrace_enable_ftrace_graph_caller(); 1825 ftrace_enable_ftrace_graph_caller();
1843 else if (*command & FTRACE_STOP_FUNC_RET) 1826 else if (command & FTRACE_STOP_FUNC_RET)
1844 ftrace_disable_ftrace_graph_caller(); 1827 ftrace_disable_ftrace_graph_caller();
1828}
1829
1830static int __ftrace_modify_code(void *data)
1831{
1832 int *command = data;
1833
1834 ftrace_modify_all_code(*command);
1845 1835
1846 return 0; 1836 return 0;
1847} 1837}
@@ -3666,22 +3656,36 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
3666 return 0; 3656 return 0;
3667} 3657}
3668 3658
3669static void ftrace_swap_recs(void *a, void *b, int size) 3659static int ftrace_cmp_ips(const void *a, const void *b)
3660{
3661 const unsigned long *ipa = a;
3662 const unsigned long *ipb = b;
3663
3664 if (*ipa > *ipb)
3665 return 1;
3666 if (*ipa < *ipb)
3667 return -1;
3668 return 0;
3669}
3670
3671static void ftrace_swap_ips(void *a, void *b, int size)
3670{ 3672{
3671 struct dyn_ftrace *reca = a; 3673 unsigned long *ipa = a;
3672 struct dyn_ftrace *recb = b; 3674 unsigned long *ipb = b;
3673 struct dyn_ftrace t; 3675 unsigned long t;
3674 3676
3675 t = *reca; 3677 t = *ipa;
3676 *reca = *recb; 3678 *ipa = *ipb;
3677 *recb = t; 3679 *ipb = t;
3678} 3680}
3679 3681
3680static int ftrace_process_locs(struct module *mod, 3682static int ftrace_process_locs(struct module *mod,
3681 unsigned long *start, 3683 unsigned long *start,
3682 unsigned long *end) 3684 unsigned long *end)
3683{ 3685{
3686 struct ftrace_page *start_pg;
3684 struct ftrace_page *pg; 3687 struct ftrace_page *pg;
3688 struct dyn_ftrace *rec;
3685 unsigned long count; 3689 unsigned long count;
3686 unsigned long *p; 3690 unsigned long *p;
3687 unsigned long addr; 3691 unsigned long addr;
@@ -3693,8 +3697,11 @@ static int ftrace_process_locs(struct module *mod,
3693 if (!count) 3697 if (!count)
3694 return 0; 3698 return 0;
3695 3699
3696 pg = ftrace_allocate_pages(count); 3700 sort(start, count, sizeof(*start),
3697 if (!pg) 3701 ftrace_cmp_ips, ftrace_swap_ips);
3702
3703 start_pg = ftrace_allocate_pages(count);
3704 if (!start_pg)
3698 return -ENOMEM; 3705 return -ENOMEM;
3699 3706
3700 mutex_lock(&ftrace_lock); 3707 mutex_lock(&ftrace_lock);
@@ -3707,7 +3714,7 @@ static int ftrace_process_locs(struct module *mod,
3707 if (!mod) { 3714 if (!mod) {
3708 WARN_ON(ftrace_pages || ftrace_pages_start); 3715 WARN_ON(ftrace_pages || ftrace_pages_start);
3709 /* First initialization */ 3716 /* First initialization */
3710 ftrace_pages = ftrace_pages_start = pg; 3717 ftrace_pages = ftrace_pages_start = start_pg;
3711 } else { 3718 } else {
3712 if (!ftrace_pages) 3719 if (!ftrace_pages)
3713 goto out; 3720 goto out;
@@ -3718,11 +3725,11 @@ static int ftrace_process_locs(struct module *mod,
3718 ftrace_pages = ftrace_pages->next; 3725 ftrace_pages = ftrace_pages->next;
3719 } 3726 }
3720 3727
3721 ftrace_pages->next = pg; 3728 ftrace_pages->next = start_pg;
3722 ftrace_pages = pg;
3723 } 3729 }
3724 3730
3725 p = start; 3731 p = start;
3732 pg = start_pg;
3726 while (p < end) { 3733 while (p < end) {
3727 addr = ftrace_call_adjust(*p++); 3734 addr = ftrace_call_adjust(*p++);
3728 /* 3735 /*
@@ -3733,17 +3740,26 @@ static int ftrace_process_locs(struct module *mod,
3733 */ 3740 */
3734 if (!addr) 3741 if (!addr)
3735 continue; 3742 continue;
3736 if (!ftrace_record_ip(addr)) 3743
3737 break; 3744 if (pg->index == pg->size) {
3745 /* We should have allocated enough */
3746 if (WARN_ON(!pg->next))
3747 break;
3748 pg = pg->next;
3749 }
3750
3751 rec = &pg->records[pg->index++];
3752 rec->ip = addr;
3738 } 3753 }
3739 3754
3740 /* These new locations need to be initialized */ 3755 /* We should have used all pages */
3741 ftrace_new_pgs = pg; 3756 WARN_ON(pg->next);
3742 3757
3743 /* Make each individual set of pages sorted by ips */ 3758 /* Assign the last page to ftrace_pages */
3744 for (; pg; pg = pg->next) 3759 ftrace_pages = pg;
3745 sort(pg->records, pg->index, sizeof(struct dyn_ftrace), 3760
3746 ftrace_cmp_recs, ftrace_swap_recs); 3761 /* These new locations need to be initialized */
3762 ftrace_new_pgs = start_pg;
3747 3763
3748 /* 3764 /*
3749 * We only need to disable interrupts on start up 3765 * We only need to disable interrupts on start up
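ftrace_process_locs() now sorts the raw mcount addresses once up front and then fills the pre-allocated record pages sequentially, instead of sorting each page of records afterwards, so every page comes out sorted and pages are ordered relative to each other. A toy userspace model of that flow; the tiny page size is chosen only for illustration.

#include <stdio.h>
#include <stdlib.h>

#define RECS_PER_PAGE 4  /* illustrative "page" capacity */

static int cmp_ips(const void *a, const void *b)
{
    unsigned long ipa = *(const unsigned long *)a;
    unsigned long ipb = *(const unsigned long *)b;
    return (ipa > ipb) - (ipa < ipb);
}

int main(void)
{
    unsigned long ips[] = { 0x1080, 0x1000, 0x1040, 0x10c0, 0x1020 };
    size_t count = sizeof(ips) / sizeof(ips[0]);

    /* sort the addresses first, as the new code does before allocation */
    qsort(ips, count, sizeof(ips[0]), cmp_ips);

    /* then fill fixed-size "pages" sequentially */
    for (size_t i = 0; i < count; i++) {
        if (i % RECS_PER_PAGE == 0)
            printf("page %zu:", i / RECS_PER_PAGE);
        printf(" %#lx", ips[i]);
        if ((i + 1) % RECS_PER_PAGE == 0 || i + 1 == count)
            printf("\n");
    }
    return 0;
}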
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2d5eb332082..6420cda6233 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -23,6 +23,8 @@
23#include <asm/local.h> 23#include <asm/local.h>
24#include "trace.h" 24#include "trace.h"
25 25
26static void update_pages_handler(struct work_struct *work);
27
26/* 28/*
27 * The ring buffer header is special. We must manually up keep it. 29 * The ring buffer header is special. We must manually up keep it.
28 */ 30 */
@@ -470,12 +472,15 @@ struct ring_buffer_per_cpu {
470 /* ring buffer pages to update, > 0 to add, < 0 to remove */ 472 /* ring buffer pages to update, > 0 to add, < 0 to remove */
471 int nr_pages_to_update; 473 int nr_pages_to_update;
472 struct list_head new_pages; /* new pages to add */ 474 struct list_head new_pages; /* new pages to add */
475 struct work_struct update_pages_work;
476 struct completion update_done;
473}; 477};
474 478
475struct ring_buffer { 479struct ring_buffer {
476 unsigned flags; 480 unsigned flags;
477 int cpus; 481 int cpus;
478 atomic_t record_disabled; 482 atomic_t record_disabled;
483 atomic_t resize_disabled;
479 cpumask_var_t cpumask; 484 cpumask_var_t cpumask;
480 485
481 struct lock_class_key *reader_lock_key; 486 struct lock_class_key *reader_lock_key;
@@ -940,6 +945,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
940 struct list_head *head = cpu_buffer->pages; 945 struct list_head *head = cpu_buffer->pages;
941 struct buffer_page *bpage, *tmp; 946 struct buffer_page *bpage, *tmp;
942 947
948 /* Reset the head page if it exists */
949 if (cpu_buffer->head_page)
950 rb_set_head_page(cpu_buffer);
951
943 rb_head_page_deactivate(cpu_buffer); 952 rb_head_page_deactivate(cpu_buffer);
944 953
945 if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) 954 if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
@@ -1048,6 +1057,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
1048 raw_spin_lock_init(&cpu_buffer->reader_lock); 1057 raw_spin_lock_init(&cpu_buffer->reader_lock);
1049 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 1058 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
1050 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 1059 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1060 INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);
1061 init_completion(&cpu_buffer->update_done);
1051 1062
1052 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1063 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1053 GFP_KERNEL, cpu_to_node(cpu)); 1064 GFP_KERNEL, cpu_to_node(cpu));
@@ -1235,70 +1246,222 @@ void ring_buffer_set_clock(struct ring_buffer *buffer,
1235 1246
1236static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); 1247static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
1237 1248
1238static void 1249static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1239rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1240{ 1250{
1241 struct buffer_page *bpage; 1251 return local_read(&bpage->entries) & RB_WRITE_MASK;
1242 struct list_head *p; 1252}
1243 unsigned i; 1253
1254static inline unsigned long rb_page_write(struct buffer_page *bpage)
1255{
1256 return local_read(&bpage->write) & RB_WRITE_MASK;
1257}
1258
1259static int
1260rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
1261{
1262 struct list_head *tail_page, *to_remove, *next_page;
1263 struct buffer_page *to_remove_page, *tmp_iter_page;
1264 struct buffer_page *last_page, *first_page;
1265 unsigned int nr_removed;
1266 unsigned long head_bit;
1267 int page_entries;
1268
1269 head_bit = 0;
1244 1270
1245 raw_spin_lock_irq(&cpu_buffer->reader_lock); 1271 raw_spin_lock_irq(&cpu_buffer->reader_lock);
1246 rb_head_page_deactivate(cpu_buffer); 1272 atomic_inc(&cpu_buffer->record_disabled);
1273 /*
1274 * We don't race with the readers since we have acquired the reader
1275 * lock. We also don't race with writers after disabling recording.
1276 * This makes it easy to figure out the first and the last page to be
1277 * removed from the list. We unlink all the pages in between including
1278 * the first and last pages. This is done in a busy loop so that we
1279 * lose the least number of traces.
1280 * The pages are freed after we restart recording and unlock readers.
1281 */
1282 tail_page = &cpu_buffer->tail_page->list;
1247 1283
1248 for (i = 0; i < nr_pages; i++) { 1284 /*
1249 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) 1285 * tail page might be on reader page, we remove the next page
1250 goto out; 1286 * from the ring buffer
1251 p = cpu_buffer->pages->next; 1287 */
1252 bpage = list_entry(p, struct buffer_page, list); 1288 if (cpu_buffer->tail_page == cpu_buffer->reader_page)
1253 list_del_init(&bpage->list); 1289 tail_page = rb_list_head(tail_page->next);
1254 free_buffer_page(bpage); 1290 to_remove = tail_page;
1291
1292 /* start of pages to remove */
1293 first_page = list_entry(rb_list_head(to_remove->next),
1294 struct buffer_page, list);
1295
1296 for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) {
1297 to_remove = rb_list_head(to_remove)->next;
1298 head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD;
1255 } 1299 }
1256 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1257 goto out;
1258 1300
1259 rb_reset_cpu(cpu_buffer); 1301 next_page = rb_list_head(to_remove)->next;
1260 rb_check_pages(cpu_buffer);
1261 1302
1262out: 1303 /*
1304 * Now we remove all pages between tail_page and next_page.
1305 * Make sure that we have head_bit value preserved for the
1306 * next page
1307 */
1308 tail_page->next = (struct list_head *)((unsigned long)next_page |
1309 head_bit);
1310 next_page = rb_list_head(next_page);
1311 next_page->prev = tail_page;
1312
1313 /* make sure pages points to a valid page in the ring buffer */
1314 cpu_buffer->pages = next_page;
1315
1316 /* update head page */
1317 if (head_bit)
1318 cpu_buffer->head_page = list_entry(next_page,
1319 struct buffer_page, list);
1320
1321 /*
1322 * change read pointer to make sure any read iterators reset
1323 * themselves
1324 */
1325 cpu_buffer->read = 0;
1326
1327 /* pages are removed, resume tracing and then free the pages */
1328 atomic_dec(&cpu_buffer->record_disabled);
1263 raw_spin_unlock_irq(&cpu_buffer->reader_lock); 1329 raw_spin_unlock_irq(&cpu_buffer->reader_lock);
1330
1331 RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages));
1332
1333 /* last buffer page to remove */
1334 last_page = list_entry(rb_list_head(to_remove), struct buffer_page,
1335 list);
1336 tmp_iter_page = first_page;
1337
1338 do {
1339 to_remove_page = tmp_iter_page;
1340 rb_inc_page(cpu_buffer, &tmp_iter_page);
1341
1342 /* update the counters */
1343 page_entries = rb_page_entries(to_remove_page);
1344 if (page_entries) {
1345 /*
1346 * If something was added to this page, it was full
1347 * since it is not the tail page. So we deduct the
1348 * bytes consumed in ring buffer from here.
1349 * No need to update overruns, since this page is
1350 * deleted from ring buffer and its entries are
1351 * already accounted for.
1352 */
1353 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
1354 }
1355
1356 /*
1357 * We have already removed references to this list item, just
1358 * free up the buffer_page and its page
1359 */
1360 free_buffer_page(to_remove_page);
1361 nr_removed--;
1362
1363 } while (to_remove_page != last_page);
1364
1365 RB_WARN_ON(cpu_buffer, nr_removed);
1366
1367 return nr_removed == 0;
1264} 1368}
1265 1369
1266static void 1370static int
1267rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, 1371rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
1268 struct list_head *pages, unsigned nr_pages)
1269{ 1372{
1270 struct buffer_page *bpage; 1373 struct list_head *pages = &cpu_buffer->new_pages;
1271 struct list_head *p; 1374 int retries, success;
1272 unsigned i;
1273 1375
1274 raw_spin_lock_irq(&cpu_buffer->reader_lock); 1376 raw_spin_lock_irq(&cpu_buffer->reader_lock);
1275 rb_head_page_deactivate(cpu_buffer); 1377 /*
1378 * We are holding the reader lock, so the reader page won't be swapped
1379 * in the ring buffer. Now we are racing with the writer trying to
1380 * move head page and the tail page.
1381 * We are going to adapt the reader page update process where:
1382 * 1. We first splice the start and end of list of new pages between
1383 * the head page and its previous page.
1384 * 2. We cmpxchg the prev_page->next to point from head page to the
1385 * start of new pages list.
1386 * 3. Finally, we update the head->prev to the end of new list.
1387 *
1388 * We will try this process 10 times, to make sure that we don't keep
1389 * spinning.
1390 */
1391 retries = 10;
1392 success = 0;
1393 while (retries--) {
1394 struct list_head *head_page, *prev_page, *r;
1395 struct list_head *last_page, *first_page;
1396 struct list_head *head_page_with_bit;
1276 1397
1277 for (i = 0; i < nr_pages; i++) { 1398 head_page = &rb_set_head_page(cpu_buffer)->list;
1278 if (RB_WARN_ON(cpu_buffer, list_empty(pages))) 1399 prev_page = head_page->prev;
1279 goto out; 1400
1280 p = pages->next; 1401 first_page = pages->next;
1281 bpage = list_entry(p, struct buffer_page, list); 1402 last_page = pages->prev;
1282 list_del_init(&bpage->list); 1403
1283 list_add_tail(&bpage->list, cpu_buffer->pages); 1404 head_page_with_bit = (struct list_head *)
1405 ((unsigned long)head_page | RB_PAGE_HEAD);
1406
1407 last_page->next = head_page_with_bit;
1408 first_page->prev = prev_page;
1409
1410 r = cmpxchg(&prev_page->next, head_page_with_bit, first_page);
1411
1412 if (r == head_page_with_bit) {
1413 /*
1414 * yay, we replaced the page pointer to our new list,
 1415 * now we just have to update the head page's prev
 1416 * pointer to point to the end of the new list
1417 */
1418 head_page->prev = last_page;
1419 success = 1;
1420 break;
1421 }
1284 } 1422 }
1285 rb_reset_cpu(cpu_buffer);
1286 rb_check_pages(cpu_buffer);
1287 1423
1288out: 1424 if (success)
1425 INIT_LIST_HEAD(pages);
1426 /*
1427 * If we weren't successful in adding in new pages, warn and stop
1428 * tracing
1429 */
1430 RB_WARN_ON(cpu_buffer, !success);
1289 raw_spin_unlock_irq(&cpu_buffer->reader_lock); 1431 raw_spin_unlock_irq(&cpu_buffer->reader_lock);
1432
1433 /* free pages if they weren't inserted */
1434 if (!success) {
1435 struct buffer_page *bpage, *tmp;
1436 list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages,
1437 list) {
1438 list_del_init(&bpage->list);
1439 free_buffer_page(bpage);
1440 }
1441 }
1442 return success;
1290} 1443}
1291 1444
1292static void update_pages_handler(struct ring_buffer_per_cpu *cpu_buffer) 1445static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer)
1293{ 1446{
1447 int success;
1448
1294 if (cpu_buffer->nr_pages_to_update > 0) 1449 if (cpu_buffer->nr_pages_to_update > 0)
1295 rb_insert_pages(cpu_buffer, &cpu_buffer->new_pages, 1450 success = rb_insert_pages(cpu_buffer);
1296 cpu_buffer->nr_pages_to_update);
1297 else 1451 else
1298 rb_remove_pages(cpu_buffer, -cpu_buffer->nr_pages_to_update); 1452 success = rb_remove_pages(cpu_buffer,
1299 cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update; 1453 -cpu_buffer->nr_pages_to_update);
1300 /* reset this value */ 1454
1301 cpu_buffer->nr_pages_to_update = 0; 1455 if (success)
1456 cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update;
1457}
1458
1459static void update_pages_handler(struct work_struct *work)
1460{
1461 struct ring_buffer_per_cpu *cpu_buffer = container_of(work,
1462 struct ring_buffer_per_cpu, update_pages_work);
1463 rb_update_pages(cpu_buffer);
1464 complete(&cpu_buffer->update_done);
1302} 1465}
1303 1466
1304/** 1467/**
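rb_insert_pages() splices the new pages in with a cmpxchg on the previous page's next pointer, retrying a bounded number of times if a writer moves the head in the meantime. A simplified userspace sketch of that pattern using C11 atomics; the RB_PAGE_HEAD flag bit that the real code keeps in the pointer's low bits is omitted, and prev is assumed to stay fixed across retries, which the real code does not assume.

#include <stdatomic.h>
#include <stdio.h>

struct node {
    _Atomic(struct node *) next;
    struct node *prev;
    int id;
};

static int splice_after(struct node *prev, struct node *first, struct node *last)
{
    int retries = 10;

    while (retries--) {
        struct node *head = atomic_load(&prev->next);

        last->next = head;      /* pre-link the sublist to the current head */
        first->prev = prev;

        if (atomic_compare_exchange_strong(&prev->next, &head, first)) {
            head->prev = last;  /* we own the link, fix the back pointer */
            return 1;
        }
        /* somebody moved the head; loop and try again */
    }
    return 0;   /* on failure the caller frees the new nodes, as above */
}

int main(void)
{
    struct node tail = { .id = 0 }, head = { .id = 1 };
    struct node n1 = { .id = 2 }, n2 = { .id = 3 };

    atomic_init(&tail.next, &head);
    atomic_init(&head.next, NULL);
    head.prev = &tail;
    atomic_init(&n1.next, &n2);
    atomic_init(&n2.next, NULL);
    n2.prev = &n1;

    printf("spliced: %d\n", splice_after(&tail, &n1, &n2));
    for (struct node *p = atomic_load(&tail.next); p; p = atomic_load(&p->next))
        printf("node %d\n", p->id);
    return 0;
}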
@@ -1308,14 +1471,14 @@ static void update_pages_handler(struct ring_buffer_per_cpu *cpu_buffer)
1308 * 1471 *
1309 * Minimum size is 2 * BUF_PAGE_SIZE. 1472 * Minimum size is 2 * BUF_PAGE_SIZE.
1310 * 1473 *
1311 * Returns -1 on failure. 1474 * Returns 0 on success and < 0 on failure.
1312 */ 1475 */
1313int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, 1476int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1314 int cpu_id) 1477 int cpu_id)
1315{ 1478{
1316 struct ring_buffer_per_cpu *cpu_buffer; 1479 struct ring_buffer_per_cpu *cpu_buffer;
1317 unsigned nr_pages; 1480 unsigned nr_pages;
1318 int cpu; 1481 int cpu, err = 0;
1319 1482
1320 /* 1483 /*
1321 * Always succeed at resizing a non-existent buffer: 1484 * Always succeed at resizing a non-existent buffer:
@@ -1330,15 +1493,18 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1330 if (size < BUF_PAGE_SIZE * 2) 1493 if (size < BUF_PAGE_SIZE * 2)
1331 size = BUF_PAGE_SIZE * 2; 1494 size = BUF_PAGE_SIZE * 2;
1332 1495
1333 atomic_inc(&buffer->record_disabled); 1496 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
1334 1497
1335 /* Make sure all writers are done with this buffer. */ 1498 /*
1336 synchronize_sched(); 1499 * Don't succeed if resizing is disabled, as a reader might be
1500 * manipulating the ring buffer and is expecting a sane state while
1501 * this is true.
1502 */
1503 if (atomic_read(&buffer->resize_disabled))
1504 return -EBUSY;
1337 1505
1506 /* prevent another thread from changing buffer sizes */
1338 mutex_lock(&buffer->mutex); 1507 mutex_lock(&buffer->mutex);
1339 get_online_cpus();
1340
1341 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
1342 1508
1343 if (cpu_id == RING_BUFFER_ALL_CPUS) { 1509 if (cpu_id == RING_BUFFER_ALL_CPUS) {
1344 /* calculate the pages to update */ 1510 /* calculate the pages to update */
@@ -1347,33 +1513,57 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1347 1513
1348 cpu_buffer->nr_pages_to_update = nr_pages - 1514 cpu_buffer->nr_pages_to_update = nr_pages -
1349 cpu_buffer->nr_pages; 1515 cpu_buffer->nr_pages;
1350
1351 /* 1516 /*
1352 * nothing more to do for removing pages or no update 1517 * nothing more to do for removing pages or no update
1353 */ 1518 */
1354 if (cpu_buffer->nr_pages_to_update <= 0) 1519 if (cpu_buffer->nr_pages_to_update <= 0)
1355 continue; 1520 continue;
1356
1357 /* 1521 /*
1358 * to add pages, make sure all new pages can be 1522 * to add pages, make sure all new pages can be
1359 * allocated without receiving ENOMEM 1523 * allocated without receiving ENOMEM
1360 */ 1524 */
1361 INIT_LIST_HEAD(&cpu_buffer->new_pages); 1525 INIT_LIST_HEAD(&cpu_buffer->new_pages);
1362 if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update, 1526 if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update,
1363 &cpu_buffer->new_pages, cpu)) 1527 &cpu_buffer->new_pages, cpu)) {
1364 /* not enough memory for new pages */ 1528 /* not enough memory for new pages */
1365 goto no_mem; 1529 err = -ENOMEM;
1530 goto out_err;
1531 }
1532 }
1533
1534 get_online_cpus();
1535 /*
1536 * Fire off all the required work handlers
1537 * We can't schedule on offline CPUs, but it's not necessary
1538 * since we can change their buffer sizes without any race.
1539 */
1540 for_each_buffer_cpu(buffer, cpu) {
1541 cpu_buffer = buffer->buffers[cpu];
1542 if (!cpu_buffer->nr_pages_to_update)
1543 continue;
1544
1545 if (cpu_online(cpu))
1546 schedule_work_on(cpu,
1547 &cpu_buffer->update_pages_work);
1548 else
1549 rb_update_pages(cpu_buffer);
1366 } 1550 }
1367 1551
1368 /* wait for all the updates to complete */ 1552 /* wait for all the updates to complete */
1369 for_each_buffer_cpu(buffer, cpu) { 1553 for_each_buffer_cpu(buffer, cpu) {
1370 cpu_buffer = buffer->buffers[cpu]; 1554 cpu_buffer = buffer->buffers[cpu];
1371 if (cpu_buffer->nr_pages_to_update) { 1555 if (!cpu_buffer->nr_pages_to_update)
1372 update_pages_handler(cpu_buffer); 1556 continue;
1373 } 1557
1558 if (cpu_online(cpu))
1559 wait_for_completion(&cpu_buffer->update_done);
1560 cpu_buffer->nr_pages_to_update = 0;
1374 } 1561 }
1562
1563 put_online_cpus();
1375 } else { 1564 } else {
1376 cpu_buffer = buffer->buffers[cpu_id]; 1565 cpu_buffer = buffer->buffers[cpu_id];
1566
1377 if (nr_pages == cpu_buffer->nr_pages) 1567 if (nr_pages == cpu_buffer->nr_pages)
1378 goto out; 1568 goto out;
1379 1569
@@ -1383,38 +1573,69 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1383 INIT_LIST_HEAD(&cpu_buffer->new_pages); 1573 INIT_LIST_HEAD(&cpu_buffer->new_pages);
1384 if (cpu_buffer->nr_pages_to_update > 0 && 1574 if (cpu_buffer->nr_pages_to_update > 0 &&
1385 __rb_allocate_pages(cpu_buffer->nr_pages_to_update, 1575 __rb_allocate_pages(cpu_buffer->nr_pages_to_update,
1386 &cpu_buffer->new_pages, cpu_id)) 1576 &cpu_buffer->new_pages, cpu_id)) {
1387 goto no_mem; 1577 err = -ENOMEM;
1578 goto out_err;
1579 }
1388 1580
1389 update_pages_handler(cpu_buffer); 1581 get_online_cpus();
1582
1583 if (cpu_online(cpu_id)) {
1584 schedule_work_on(cpu_id,
1585 &cpu_buffer->update_pages_work);
1586 wait_for_completion(&cpu_buffer->update_done);
1587 } else
1588 rb_update_pages(cpu_buffer);
1589
1590 cpu_buffer->nr_pages_to_update = 0;
1591 put_online_cpus();
1390 } 1592 }
1391 1593
1392 out: 1594 out:
1393 put_online_cpus(); 1595 /*
1394 mutex_unlock(&buffer->mutex); 1596 * The ring buffer resize can happen with the ring buffer
1395 1597 * enabled, so that the update disturbs the tracing as little
1396 atomic_dec(&buffer->record_disabled); 1598 * as possible. But if the buffer is disabled, we do not need
1599 * to worry about that, and we can take the time to verify
1600 * that the buffer is not corrupt.
1601 */
1602 if (atomic_read(&buffer->record_disabled)) {
1603 atomic_inc(&buffer->record_disabled);
1604 /*
1605 * Even though the buffer was disabled, we must make sure
1606 * that it is truly disabled before calling rb_check_pages.
1607 * There could have been a race between checking
1608 * record_disable and incrementing it.
1609 */
1610 synchronize_sched();
1611 for_each_buffer_cpu(buffer, cpu) {
1612 cpu_buffer = buffer->buffers[cpu];
1613 rb_check_pages(cpu_buffer);
1614 }
1615 atomic_dec(&buffer->record_disabled);
1616 }
1397 1617
1618 mutex_unlock(&buffer->mutex);
1398 return size; 1619 return size;
1399 1620
1400 no_mem: 1621 out_err:
1401 for_each_buffer_cpu(buffer, cpu) { 1622 for_each_buffer_cpu(buffer, cpu) {
1402 struct buffer_page *bpage, *tmp; 1623 struct buffer_page *bpage, *tmp;
1624
1403 cpu_buffer = buffer->buffers[cpu]; 1625 cpu_buffer = buffer->buffers[cpu];
1404 /* reset this number regardless */
1405 cpu_buffer->nr_pages_to_update = 0; 1626 cpu_buffer->nr_pages_to_update = 0;
1627
1406 if (list_empty(&cpu_buffer->new_pages)) 1628 if (list_empty(&cpu_buffer->new_pages))
1407 continue; 1629 continue;
1630
1408 list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, 1631 list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages,
1409 list) { 1632 list) {
1410 list_del_init(&bpage->list); 1633 list_del_init(&bpage->list);
1411 free_buffer_page(bpage); 1634 free_buffer_page(bpage);
1412 } 1635 }
1413 } 1636 }
1414 put_online_cpus();
1415 mutex_unlock(&buffer->mutex); 1637 mutex_unlock(&buffer->mutex);
1416 atomic_dec(&buffer->record_disabled); 1638 return err;
1417 return -ENOMEM;
1418} 1639}
1419EXPORT_SYMBOL_GPL(ring_buffer_resize); 1640EXPORT_SYMBOL_GPL(ring_buffer_resize);
1420 1641
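ring_buffer_resize() now runs each per-CPU update from a work item scheduled on that CPU, falls back to a direct call for offline CPUs, and waits on a completion, instead of disabling recording globally. A kernel-style sketch of that idiom with hypothetical names (percpu_update, do_update, run_update_on); it illustrates the pattern and is not code from this patch.

#include <linux/workqueue.h>
#include <linux/completion.h>
#include <linux/cpu.h>
#include <linux/kernel.h>

struct percpu_update {
        struct work_struct work;
        struct completion done;
        int cpu;
};

static void do_update(struct work_struct *work)
{
        struct percpu_update *u = container_of(work, struct percpu_update, work);

        /* ... touch the CPU-local state for u->cpu here ... */
        complete(&u->done);
}

static void run_update_on(struct percpu_update *u, int cpu)
{
        u->cpu = cpu;
        INIT_WORK(&u->work, do_update);
        init_completion(&u->done);

        if (cpu_online(cpu)) {
                schedule_work_on(cpu, &u->work);
                wait_for_completion(&u->done);
        } else {
                do_update(&u->work);    /* offline CPU: no race, call directly */
        }
}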
@@ -1453,21 +1674,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
1453 return __rb_page_index(iter->head_page, iter->head); 1674 return __rb_page_index(iter->head_page, iter->head);
1454} 1675}
1455 1676
1456static inline unsigned long rb_page_write(struct buffer_page *bpage)
1457{
1458 return local_read(&bpage->write) & RB_WRITE_MASK;
1459}
1460
1461static inline unsigned rb_page_commit(struct buffer_page *bpage) 1677static inline unsigned rb_page_commit(struct buffer_page *bpage)
1462{ 1678{
1463 return local_read(&bpage->page->commit); 1679 return local_read(&bpage->page->commit);
1464} 1680}
1465 1681
1466static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1467{
1468 return local_read(&bpage->entries) & RB_WRITE_MASK;
1469}
1470
1471/* Size is determined by what has been committed */ 1682/* Size is determined by what has been committed */
1472static inline unsigned rb_page_size(struct buffer_page *bpage) 1683static inline unsigned rb_page_size(struct buffer_page *bpage)
1473{ 1684{
@@ -3492,6 +3703,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
3492 3703
3493 iter->cpu_buffer = cpu_buffer; 3704 iter->cpu_buffer = cpu_buffer;
3494 3705
3706 atomic_inc(&buffer->resize_disabled);
3495 atomic_inc(&cpu_buffer->record_disabled); 3707 atomic_inc(&cpu_buffer->record_disabled);
3496 3708
3497 return iter; 3709 return iter;
@@ -3554,7 +3766,14 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter)
3554{ 3766{
3555 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3767 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3556 3768
3769 /*
3770 * Ring buffer is disabled from recording, here's a good place
3771 * to check the integrity of the ring buffer.
3772 */
3773 rb_check_pages(cpu_buffer);
3774
3557 atomic_dec(&cpu_buffer->record_disabled); 3775 atomic_dec(&cpu_buffer->record_disabled);
3776 atomic_dec(&cpu_buffer->buffer->resize_disabled);
3558 kfree(iter); 3777 kfree(iter);
3559} 3778}
3560EXPORT_SYMBOL_GPL(ring_buffer_read_finish); 3779EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
@@ -3626,6 +3845,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3626 cpu_buffer->commit_page = cpu_buffer->head_page; 3845 cpu_buffer->commit_page = cpu_buffer->head_page;
3627 3846
3628 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 3847 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
3848 INIT_LIST_HEAD(&cpu_buffer->new_pages);
3629 local_set(&cpu_buffer->reader_page->write, 0); 3849 local_set(&cpu_buffer->reader_page->write, 0);
3630 local_set(&cpu_buffer->reader_page->entries, 0); 3850 local_set(&cpu_buffer->reader_page->entries, 0);
3631 local_set(&cpu_buffer->reader_page->page->commit, 0); 3851 local_set(&cpu_buffer->reader_page->page->commit, 0);
@@ -3662,8 +3882,12 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3662 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 3882 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3663 return; 3883 return;
3664 3884
3885 atomic_inc(&buffer->resize_disabled);
3665 atomic_inc(&cpu_buffer->record_disabled); 3886 atomic_inc(&cpu_buffer->record_disabled);
3666 3887
3888 /* Make sure all commits have finished */
3889 synchronize_sched();
3890
3667 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3891 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3668 3892
3669 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) 3893 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
@@ -3679,6 +3903,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3679 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3903 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3680 3904
3681 atomic_dec(&cpu_buffer->record_disabled); 3905 atomic_dec(&cpu_buffer->record_disabled);
3906 atomic_dec(&buffer->resize_disabled);
3682} 3907}
3683EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); 3908EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
3684 3909
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 48ef4960ec9..68032c6177d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -763,8 +763,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
763 * Register a new plugin tracer. 763 * Register a new plugin tracer.
764 */ 764 */
765int register_tracer(struct tracer *type) 765int register_tracer(struct tracer *type)
766__releases(kernel_lock)
767__acquires(kernel_lock)
768{ 766{
769 struct tracer *t; 767 struct tracer *t;
770 int ret = 0; 768 int ret = 0;
@@ -2669,10 +2667,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2669 if (cpumask_test_cpu(cpu, tracing_cpumask) && 2667 if (cpumask_test_cpu(cpu, tracing_cpumask) &&
2670 !cpumask_test_cpu(cpu, tracing_cpumask_new)) { 2668 !cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2671 atomic_inc(&global_trace.data[cpu]->disabled); 2669 atomic_inc(&global_trace.data[cpu]->disabled);
2670 ring_buffer_record_disable_cpu(global_trace.buffer, cpu);
2672 } 2671 }
2673 if (!cpumask_test_cpu(cpu, tracing_cpumask) && 2672 if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
2674 cpumask_test_cpu(cpu, tracing_cpumask_new)) { 2673 cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2675 atomic_dec(&global_trace.data[cpu]->disabled); 2674 atomic_dec(&global_trace.data[cpu]->disabled);
2675 ring_buffer_record_enable_cpu(global_trace.buffer, cpu);
2676 } 2676 }
2677 } 2677 }
2678 arch_spin_unlock(&ftrace_max_lock); 2678 arch_spin_unlock(&ftrace_max_lock);
@@ -3076,20 +3076,10 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3076 3076
3077static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) 3077static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id)
3078{ 3078{
3079 int cpu, ret = size; 3079 int ret = size;
3080 3080
3081 mutex_lock(&trace_types_lock); 3081 mutex_lock(&trace_types_lock);
3082 3082
3083 tracing_stop();
3084
3085 /* disable all cpu buffers */
3086 for_each_tracing_cpu(cpu) {
3087 if (global_trace.data[cpu])
3088 atomic_inc(&global_trace.data[cpu]->disabled);
3089 if (max_tr.data[cpu])
3090 atomic_inc(&max_tr.data[cpu]->disabled);
3091 }
3092
3093 if (cpu_id != RING_BUFFER_ALL_CPUS) { 3083 if (cpu_id != RING_BUFFER_ALL_CPUS) {
3094 /* make sure, this cpu is enabled in the mask */ 3084 /* make sure, this cpu is enabled in the mask */
3095 if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) { 3085 if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) {
@@ -3103,14 +3093,6 @@ static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id)
3103 ret = -ENOMEM; 3093 ret = -ENOMEM;
3104 3094
3105out: 3095out:
3106 for_each_tracing_cpu(cpu) {
3107 if (global_trace.data[cpu])
3108 atomic_dec(&global_trace.data[cpu]->disabled);
3109 if (max_tr.data[cpu])
3110 atomic_dec(&max_tr.data[cpu]->disabled);
3111 }
3112
3113 tracing_start();
3114 mutex_unlock(&trace_types_lock); 3096 mutex_unlock(&trace_types_lock);
3115 3097
3116 return ret; 3098 return ret;
@@ -3875,14 +3857,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3875 struct print_entry *entry; 3857 struct print_entry *entry;
3876 unsigned long irq_flags; 3858 unsigned long irq_flags;
3877 struct page *pages[2]; 3859 struct page *pages[2];
3860 void *map_page[2];
3878 int nr_pages = 1; 3861 int nr_pages = 1;
3879 ssize_t written; 3862 ssize_t written;
3880 void *page1;
3881 void *page2;
3882 int offset; 3863 int offset;
3883 int size; 3864 int size;
3884 int len; 3865 int len;
3885 int ret; 3866 int ret;
3867 int i;
3886 3868
3887 if (tracing_disabled) 3869 if (tracing_disabled)
3888 return -EINVAL; 3870 return -EINVAL;
@@ -3921,9 +3903,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3921 goto out; 3903 goto out;
3922 } 3904 }
3923 3905
3924 page1 = kmap_atomic(pages[0]); 3906 for (i = 0; i < nr_pages; i++)
3925 if (nr_pages == 2) 3907 map_page[i] = kmap_atomic(pages[i]);
3926 page2 = kmap_atomic(pages[1]);
3927 3908
3928 local_save_flags(irq_flags); 3909 local_save_flags(irq_flags);
3929 size = sizeof(*entry) + cnt + 2; /* possible \n added */ 3910 size = sizeof(*entry) + cnt + 2; /* possible \n added */
@@ -3941,10 +3922,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3941 3922
3942 if (nr_pages == 2) { 3923 if (nr_pages == 2) {
3943 len = PAGE_SIZE - offset; 3924 len = PAGE_SIZE - offset;
3944 memcpy(&entry->buf, page1 + offset, len); 3925 memcpy(&entry->buf, map_page[0] + offset, len);
3945 memcpy(&entry->buf[len], page2, cnt - len); 3926 memcpy(&entry->buf[len], map_page[1], cnt - len);
3946 } else 3927 } else
3947 memcpy(&entry->buf, page1 + offset, cnt); 3928 memcpy(&entry->buf, map_page[0] + offset, cnt);
3948 3929
3949 if (entry->buf[cnt - 1] != '\n') { 3930 if (entry->buf[cnt - 1] != '\n') {
3950 entry->buf[cnt] = '\n'; 3931 entry->buf[cnt] = '\n';
@@ -3959,11 +3940,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3959 *fpos += written; 3940 *fpos += written;
3960 3941
3961 out_unlock: 3942 out_unlock:
 3962 if (nr_pages == 2) 3943 for (i = 0; i < nr_pages; i++) {
3963 kunmap_atomic(page2); 3944 kunmap_atomic(map_page[i]);
3964 kunmap_atomic(page1); 3945 put_page(pages[i]);
3965 while (nr_pages > 0) 3946 }
3966 put_page(pages[--nr_pages]);
3967 out: 3947 out:
3968 return written; 3948 return written;
3969} 3949}
@@ -4494,6 +4474,9 @@ static void tracing_init_debugfs_percpu(long cpu)
4494 struct dentry *d_cpu; 4474 struct dentry *d_cpu;
4495 char cpu_dir[30]; /* 30 characters should be more than enough */ 4475 char cpu_dir[30]; /* 30 characters should be more than enough */
4496 4476
4477 if (!d_percpu)
4478 return;
4479
4497 snprintf(cpu_dir, 30, "cpu%ld", cpu); 4480 snprintf(cpu_dir, 30, "cpu%ld", cpu);
4498 d_cpu = debugfs_create_dir(cpu_dir, d_percpu); 4481 d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
4499 if (!d_cpu) { 4482 if (!d_cpu) {
@@ -4759,7 +4742,8 @@ static ssize_t
4759rb_simple_read(struct file *filp, char __user *ubuf, 4742rb_simple_read(struct file *filp, char __user *ubuf,
4760 size_t cnt, loff_t *ppos) 4743 size_t cnt, loff_t *ppos)
4761{ 4744{
4762 struct ring_buffer *buffer = filp->private_data; 4745 struct trace_array *tr = filp->private_data;
4746 struct ring_buffer *buffer = tr->buffer;
4763 char buf[64]; 4747 char buf[64];
4764 int r; 4748 int r;
4765 4749
@@ -4777,7 +4761,8 @@ static ssize_t
4777rb_simple_write(struct file *filp, const char __user *ubuf, 4761rb_simple_write(struct file *filp, const char __user *ubuf,
4778 size_t cnt, loff_t *ppos) 4762 size_t cnt, loff_t *ppos)
4779{ 4763{
4780 struct ring_buffer *buffer = filp->private_data; 4764 struct trace_array *tr = filp->private_data;
4765 struct ring_buffer *buffer = tr->buffer;
4781 unsigned long val; 4766 unsigned long val;
4782 int ret; 4767 int ret;
4783 4768
@@ -4864,7 +4849,7 @@ static __init int tracer_init_debugfs(void)
4864 &trace_clock_fops); 4849 &trace_clock_fops);
4865 4850
4866 trace_create_file("tracing_on", 0644, d_tracer, 4851 trace_create_file("tracing_on", 0644, d_tracer,
4867 global_trace.buffer, &rb_simple_fops); 4852 &global_trace, &rb_simple_fops);
4868 4853
4869#ifdef CONFIG_DYNAMIC_FTRACE 4854#ifdef CONFIG_DYNAMIC_FTRACE
4870 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4855 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
@@ -5127,7 +5112,8 @@ __init static int tracer_alloc_buffers(void)
5127 max_tr.data[i] = &per_cpu(max_tr_data, i); 5112 max_tr.data[i] = &per_cpu(max_tr_data, i);
5128 } 5113 }
5129 5114
5130 set_buffer_entries(&global_trace, ring_buf_size); 5115 set_buffer_entries(&global_trace,
5116 ring_buffer_size(global_trace.buffer, 0));
5131#ifdef CONFIG_TRACER_MAX_TRACE 5117#ifdef CONFIG_TRACER_MAX_TRACE
5132 set_buffer_entries(&max_tr, 1); 5118 set_buffer_entries(&max_tr, 1);
5133#endif 5119#endif
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index a7d28e033a9..5aec220d2de 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -843,11 +843,11 @@ void trace_printk_init_buffers(void);
843 filter) 843 filter)
844#include "trace_entries.h" 844#include "trace_entries.h"
845 845
846#ifdef CONFIG_FUNCTION_TRACER 846#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER)
847int perf_ftrace_event_register(struct ftrace_event_call *call, 847int perf_ftrace_event_register(struct ftrace_event_call *call,
848 enum trace_reg type, void *data); 848 enum trace_reg type, void *data);
849#else 849#else
850#define perf_ftrace_event_register NULL 850#define perf_ftrace_event_register NULL
851#endif /* CONFIG_FUNCTION_TRACER */ 851#endif
852 852
853#endif /* _LINUX_KERNEL_TRACE_H */ 853#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 079a93ae8a9..29111da1d10 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -294,6 +294,9 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
294 if (!call->name || !call->class || !call->class->reg) 294 if (!call->name || !call->class || !call->class->reg)
295 continue; 295 continue;
296 296
297 if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
298 continue;
299
297 if (match && 300 if (match &&
298 strcmp(match, call->name) != 0 && 301 strcmp(match, call->name) != 0 &&
299 strcmp(match, call->class->system) != 0) 302 strcmp(match, call->class->system) != 0)
@@ -1164,7 +1167,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
1164 return -1; 1167 return -1;
1165 } 1168 }
1166 1169
1167 if (call->class->reg) 1170 if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
1168 trace_create_file("enable", 0644, call->dir, call, 1171 trace_create_file("enable", 0644, call->dir, call,
1169 enable); 1172 enable);
1170 1173
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 3dd15e8bc85..e039906b037 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -180,6 +180,7 @@ struct ftrace_event_call __used event_##call = { \
180 .event.type = etype, \ 180 .event.type = etype, \
181 .class = &event_class_ftrace_##call, \ 181 .class = &event_class_ftrace_##call, \
182 .print_fmt = print, \ 182 .print_fmt = print, \
183 .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \
183}; \ 184}; \
184struct ftrace_event_call __used \ 185struct ftrace_event_call __used \
185__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; 186__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 859fae6b182..df611a0e76c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -652,6 +652,8 @@ int trace_print_lat_context(struct trace_iterator *iter)
652{ 652{
653 u64 next_ts; 653 u64 next_ts;
654 int ret; 654 int ret;
655 /* trace_find_next_entry will reset ent_size */
656 int ent_size = iter->ent_size;
655 struct trace_seq *s = &iter->seq; 657 struct trace_seq *s = &iter->seq;
656 struct trace_entry *entry = iter->ent, 658 struct trace_entry *entry = iter->ent,
657 *next_entry = trace_find_next_entry(iter, NULL, 659 *next_entry = trace_find_next_entry(iter, NULL,
@@ -660,6 +662,9 @@ int trace_print_lat_context(struct trace_iterator *iter)
660 unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); 662 unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
661 unsigned long rel_usecs; 663 unsigned long rel_usecs;
662 664
665 /* Restore the original ent_size */
666 iter->ent_size = ent_size;
667
663 if (!next_entry) 668 if (!next_entry)
664 next_ts = iter->ts; 669 next_ts = iter->ts;
665 rel_usecs = ns2usecs(next_ts - iter->ts); 670 rel_usecs = ns2usecs(next_ts - iter->ts);
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
deleted file mode 100644
index 209b379a472..00000000000
--- a/kernel/trace/trace_workqueue.c
+++ /dev/null
@@ -1,300 +0,0 @@
1/*
2 * Workqueue statistical tracer.
3 *
4 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 *
6 */
7
8
9#include <trace/events/workqueue.h>
10#include <linux/list.h>
11#include <linux/percpu.h>
12#include <linux/slab.h>
13#include <linux/kref.h>
14#include "trace_stat.h"
15#include "trace.h"
16
17
18/* A cpu workqueue thread */
19struct cpu_workqueue_stats {
20 struct list_head list;
21 struct kref kref;
22 int cpu;
23 pid_t pid;
24/* Can be inserted from interrupt or user context, need to be atomic */
25 atomic_t inserted;
26/*
27 * Don't need to be atomic, works are serialized in a single workqueue thread
28 * on a single CPU.
29 */
30 unsigned int executed;
31};
32
33/* List of workqueue threads on one cpu */
34struct workqueue_global_stats {
35 struct list_head list;
36 spinlock_t lock;
37};
38
39/* Don't need a global lock because allocated before the workqueues, and
40 * never freed.
41 */
42static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
43#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
44
45static void cpu_workqueue_stat_free(struct kref *kref)
46{
47 kfree(container_of(kref, struct cpu_workqueue_stats, kref));
48}
49
50/* Insertion of a work */
51static void
52probe_workqueue_insertion(void *ignore,
53 struct task_struct *wq_thread,
54 struct work_struct *work)
55{
56 int cpu = cpumask_first(&wq_thread->cpus_allowed);
57 struct cpu_workqueue_stats *node;
58 unsigned long flags;
59
60 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
61 list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
62 if (node->pid == wq_thread->pid) {
63 atomic_inc(&node->inserted);
64 goto found;
65 }
66 }
67 pr_debug("trace_workqueue: entry not found\n");
68found:
69 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
70}
71
72/* Execution of a work */
73static void
74probe_workqueue_execution(void *ignore,
75 struct task_struct *wq_thread,
76 struct work_struct *work)
77{
78 int cpu = cpumask_first(&wq_thread->cpus_allowed);
79 struct cpu_workqueue_stats *node;
80 unsigned long flags;
81
82 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
83 list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
84 if (node->pid == wq_thread->pid) {
85 node->executed++;
86 goto found;
87 }
88 }
89 pr_debug("trace_workqueue: entry not found\n");
90found:
91 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
92}
93
94/* Creation of a cpu workqueue thread */
95static void probe_workqueue_creation(void *ignore,
96 struct task_struct *wq_thread, int cpu)
97{
98 struct cpu_workqueue_stats *cws;
99 unsigned long flags;
100
101 WARN_ON(cpu < 0);
102
103 /* Workqueues are sometimes created in atomic context */
104 cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC);
105 if (!cws) {
106 pr_warning("trace_workqueue: not enough memory\n");
107 return;
108 }
109 INIT_LIST_HEAD(&cws->list);
110 kref_init(&cws->kref);
111 cws->cpu = cpu;
112 cws->pid = wq_thread->pid;
113
114 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
115 list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
116 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
117}
118
119/* Destruction of a cpu workqueue thread */
120static void
121probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread)
122{
123 /* Workqueue only execute on one cpu */
124 int cpu = cpumask_first(&wq_thread->cpus_allowed);
125 struct cpu_workqueue_stats *node, *next;
126 unsigned long flags;
127
128 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
129 list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
130 list) {
131 if (node->pid == wq_thread->pid) {
132 list_del(&node->list);
133 kref_put(&node->kref, cpu_workqueue_stat_free);
134 goto found;
135 }
136 }
137
138 pr_debug("trace_workqueue: don't find workqueue to destroy\n");
139found:
140 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
141
142}
143
144static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
145{
146 unsigned long flags;
147 struct cpu_workqueue_stats *ret = NULL;
148
149
150 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
151
152 if (!list_empty(&workqueue_cpu_stat(cpu)->list)) {
153 ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
154 struct cpu_workqueue_stats, list);
155 kref_get(&ret->kref);
156 }
157
158 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
159
160 return ret;
161}
162
163static void *workqueue_stat_start(struct tracer_stat *trace)
164{
165 int cpu;
166 void *ret = NULL;
167
168 for_each_possible_cpu(cpu) {
169 ret = workqueue_stat_start_cpu(cpu);
170 if (ret)
171 return ret;
172 }
173 return NULL;
174}
175
176static void *workqueue_stat_next(void *prev, int idx)
177{
178 struct cpu_workqueue_stats *prev_cws = prev;
179 struct cpu_workqueue_stats *ret;
180 int cpu = prev_cws->cpu;
181 unsigned long flags;
182
183 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
184 if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
185 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
186 do {
187 cpu = cpumask_next(cpu, cpu_possible_mask);
188 if (cpu >= nr_cpu_ids)
189 return NULL;
190 } while (!(ret = workqueue_stat_start_cpu(cpu)));
191 return ret;
192 } else {
193 ret = list_entry(prev_cws->list.next,
194 struct cpu_workqueue_stats, list);
195 kref_get(&ret->kref);
196 }
197 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
198
199 return ret;
200}
201
202static int workqueue_stat_show(struct seq_file *s, void *p)
203{
204 struct cpu_workqueue_stats *cws = p;
205 struct pid *pid;
206 struct task_struct *tsk;
207
208 pid = find_get_pid(cws->pid);
209 if (pid) {
210 tsk = get_pid_task(pid, PIDTYPE_PID);
211 if (tsk) {
212 seq_printf(s, "%3d %6d %6u %s\n", cws->cpu,
213 atomic_read(&cws->inserted), cws->executed,
214 tsk->comm);
215 put_task_struct(tsk);
216 }
217 put_pid(pid);
218 }
219
220 return 0;
221}
222
223static void workqueue_stat_release(void *stat)
224{
225 struct cpu_workqueue_stats *node = stat;
226
227 kref_put(&node->kref, cpu_workqueue_stat_free);
228}
229
230static int workqueue_stat_headers(struct seq_file *s)
231{
232 seq_printf(s, "# CPU INSERTED EXECUTED NAME\n");
233 seq_printf(s, "# | | | |\n");
234 return 0;
235}
236
237struct tracer_stat workqueue_stats __read_mostly = {
238 .name = "workqueues",
239 .stat_start = workqueue_stat_start,
240 .stat_next = workqueue_stat_next,
241 .stat_show = workqueue_stat_show,
242 .stat_release = workqueue_stat_release,
243 .stat_headers = workqueue_stat_headers
244};
245
246
247int __init stat_workqueue_init(void)
248{
249 if (register_stat_tracer(&workqueue_stats)) {
250 pr_warning("Unable to register workqueue stat tracer\n");
251 return 1;
252 }
253
254 return 0;
255}
256fs_initcall(stat_workqueue_init);
257
258/*
259 * Workqueues are created very early, just after pre-smp initcalls.
260 * So we must register our tracepoints at this stage.
261 */
262int __init trace_workqueue_early_init(void)
263{
264 int ret, cpu;
265
266 for_each_possible_cpu(cpu) {
267 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
268 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
269 }
270
271 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
272 if (ret)
273 goto out;
274
275 ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL);
276 if (ret)
277 goto no_insertion;
278
279 ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL);
280 if (ret)
281 goto no_execution;
282
283 ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL);
284 if (ret)
285 goto no_creation;
286
287 return 0;
288
289no_creation:
290 unregister_trace_workqueue_creation(probe_workqueue_creation, NULL);
291no_execution:
292 unregister_trace_workqueue_execution(probe_workqueue_execution, NULL);
293no_insertion:
294 unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
295out:
296 pr_warning("trace_workqueue: unable to trace workqueues\n");
297
298 return 1;
299}
300early_initcall(trace_workqueue_early_init);
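
The goto-based unwind in trace_workqueue_early_init() above is the usual pattern for registering several tracepoint probes and backing out the ones that succeeded when a later registration fails. A minimal user-space sketch of the same shape; register_a()/register_b()/register_c() and the unregister_* helpers are hypothetical stand-ins, not kernel APIs:

#include <stdio.h>

static int register_a(void) { return 0; }
static int register_b(void) { return 0; }
static int register_c(void) { return 1; }	/* pretend the last one fails */

static void unregister_a(void) { puts("unregister a"); }
static void unregister_b(void) { puts("unregister b"); }

static int init_all(void)
{
	if (register_a())
		goto out;
	if (register_b())
		goto no_a;
	if (register_c())
		goto no_b;
	return 0;

no_b:
	unregister_b();		/* fall through and unwind in reverse order */
no_a:
	unregister_a();
out:
	fprintf(stderr, "unable to register all probes\n");
	return 1;
}

int main(void)
{
	return init_all();
}
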
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 51c6e89e861..d7948eb1022 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -81,14 +81,19 @@ SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid)
81 return ret; 81 return ret;
82} 82}
83 83
84SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruid, old_uid_t __user *, euid, old_uid_t __user *, suid) 84SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp)
85{ 85{
86 const struct cred *cred = current_cred(); 86 const struct cred *cred = current_cred();
87 int retval; 87 int retval;
88 old_uid_t ruid, euid, suid;
88 89
89 if (!(retval = put_user(high2lowuid(cred->uid), ruid)) && 90 ruid = high2lowuid(from_kuid_munged(cred->user_ns, cred->uid));
90 !(retval = put_user(high2lowuid(cred->euid), euid))) 91 euid = high2lowuid(from_kuid_munged(cred->user_ns, cred->euid));
91 retval = put_user(high2lowuid(cred->suid), suid); 92 suid = high2lowuid(from_kuid_munged(cred->user_ns, cred->suid));
93
94 if (!(retval = put_user(ruid, ruidp)) &&
95 !(retval = put_user(euid, euidp)))
96 retval = put_user(suid, suidp);
92 97
93 return retval; 98 return retval;
94} 99}
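
The rewritten getresuid16() converts each credential in two steps before anything is copied to user space: from_kuid_munged() maps the kernel-internal kuid into the caller's user namespace (falling back to overflowuid when there is no mapping), then high2lowuid() squeezes the result into the legacy 16-bit type. A simplified user-space mirror of that pipeline; fake_from_kuid_munged() and fake_high2lowuid() stand in for the kernel helpers, and 65534 is assumed as the overflow id:

#include <stdio.h>
#include <stdint.h>

typedef uint16_t old_uid_t;

static const uint32_t overflowuid = 65534;	/* assumed default */
static const old_uid_t overflowuid16 = 65534;

/* Stand-in for from_kuid_munged(): an unmapped id becomes overflowuid. */
static uint32_t fake_from_kuid_munged(uint32_t kuid, int has_mapping)
{
	return has_mapping ? kuid : overflowuid;
}

/* Stand-in for high2lowuid(): ids that do not fit in 16 bits overflow. */
static old_uid_t fake_high2lowuid(uint32_t uid)
{
	return (uid & ~0xFFFFu) ? overflowuid16 : (old_uid_t)uid;
}

int main(void)
{
	uint32_t kuids[] = { 1000, 100000 };	/* e.g. a host uid and a container uid */

	for (int i = 0; i < 2; i++) {
		uint32_t uid = fake_from_kuid_munged(kuids[i], 1);
		old_uid_t old = fake_high2lowuid(uid);

		printf("kuid %u -> uid %u -> old_uid %u\n",
		       (unsigned)kuids[i], (unsigned)uid, (unsigned)old);
	}
	return 0;
}
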
@@ -103,14 +108,19 @@ SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid)
103} 108}
104 109
105 110
106SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgid, old_gid_t __user *, egid, old_gid_t __user *, sgid) 111SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp)
107{ 112{
108 const struct cred *cred = current_cred(); 113 const struct cred *cred = current_cred();
109 int retval; 114 int retval;
115 old_gid_t rgid, egid, sgid;
116
117 rgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->gid));
118 egid = high2lowgid(from_kgid_munged(cred->user_ns, cred->egid));
119 sgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->sgid));
110 120
111 if (!(retval = put_user(high2lowgid(cred->gid), rgid)) && 121 if (!(retval = put_user(rgid, rgidp)) &&
112 !(retval = put_user(high2lowgid(cred->egid), egid))) 122 !(retval = put_user(egid, egidp)))
113 retval = put_user(high2lowgid(cred->sgid), sgid); 123 retval = put_user(sgid, sgidp);
114 124
115 return retval; 125 return retval;
116} 126}
@@ -134,11 +144,14 @@ SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid)
134static int groups16_to_user(old_gid_t __user *grouplist, 144static int groups16_to_user(old_gid_t __user *grouplist,
135 struct group_info *group_info) 145 struct group_info *group_info)
136{ 146{
147 struct user_namespace *user_ns = current_user_ns();
137 int i; 148 int i;
138 old_gid_t group; 149 old_gid_t group;
150 kgid_t kgid;
139 151
140 for (i = 0; i < group_info->ngroups; i++) { 152 for (i = 0; i < group_info->ngroups; i++) {
141 group = high2lowgid(GROUP_AT(group_info, i)); 153 kgid = GROUP_AT(group_info, i);
154 group = high2lowgid(from_kgid_munged(user_ns, kgid));
142 if (put_user(group, grouplist+i)) 155 if (put_user(group, grouplist+i))
143 return -EFAULT; 156 return -EFAULT;
144 } 157 }
@@ -149,13 +162,20 @@ static int groups16_to_user(old_gid_t __user *grouplist,
149static int groups16_from_user(struct group_info *group_info, 162static int groups16_from_user(struct group_info *group_info,
150 old_gid_t __user *grouplist) 163 old_gid_t __user *grouplist)
151{ 164{
165 struct user_namespace *user_ns = current_user_ns();
152 int i; 166 int i;
153 old_gid_t group; 167 old_gid_t group;
168 kgid_t kgid;
154 169
155 for (i = 0; i < group_info->ngroups; i++) { 170 for (i = 0; i < group_info->ngroups; i++) {
156 if (get_user(group, grouplist+i)) 171 if (get_user(group, grouplist+i))
157 return -EFAULT; 172 return -EFAULT;
158 GROUP_AT(group_info, i) = low2highgid(group); 173
174 kgid = make_kgid(user_ns, low2highgid(group));
175 if (!gid_valid(kgid))
176 return -EINVAL;
177
178 GROUP_AT(group_info, i) = kgid;
159 } 179 }
160 180
161 return 0; 181 return 0;
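
groups16_from_user() now validates each gid with make_kgid()/gid_valid() before storing it, so a gid with no mapping in the caller's namespace fails the whole call with -EINVAL instead of landing in group_info. A small sketch of that validate-before-store loop; fake_make_kgid() and the "gids below 1000 are mapped" rule are made up for illustration:

#include <errno.h>
#include <stdio.h>
#include <stdint.h>

#define INVALID_GID UINT32_MAX

/* Hypothetical mapping: pretend only gids below 1000 exist in this namespace. */
static uint32_t fake_make_kgid(uint32_t gid)
{
	return gid < 1000 ? gid : INVALID_GID;
}

static int import_groups(const uint32_t *in, uint32_t *out, int ngroups)
{
	for (int i = 0; i < ngroups; i++) {
		uint32_t kgid = fake_make_kgid(in[i]);

		if (kgid == INVALID_GID)
			return -EINVAL;		/* unmapped gid: fail the whole call */
		out[i] = kgid;
	}
	return 0;
}

int main(void)
{
	uint32_t in[] = { 4, 20, 5000 };
	uint32_t out[3];

	printf("import_groups -> %d\n", import_groups(in, out, 3));	/* -EINVAL */
	return 0;
}
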
@@ -211,20 +231,20 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
211 231
212SYSCALL_DEFINE0(getuid16) 232SYSCALL_DEFINE0(getuid16)
213{ 233{
214 return high2lowuid(current_uid()); 234 return high2lowuid(from_kuid_munged(current_user_ns(), current_uid()));
215} 235}
216 236
217SYSCALL_DEFINE0(geteuid16) 237SYSCALL_DEFINE0(geteuid16)
218{ 238{
219 return high2lowuid(current_euid()); 239 return high2lowuid(from_kuid_munged(current_user_ns(), current_euid()));
220} 240}
221 241
222SYSCALL_DEFINE0(getgid16) 242SYSCALL_DEFINE0(getgid16)
223{ 243{
224 return high2lowgid(current_gid()); 244 return high2lowgid(from_kgid_munged(current_user_ns(), current_gid()));
225} 245}
226 246
227SYSCALL_DEFINE0(getegid16) 247SYSCALL_DEFINE0(getegid16)
228{ 248{
229 return high2lowgid(current_egid()); 249 return high2lowgid(from_kgid_munged(current_user_ns(), current_egid()));
230} 250}
diff --git a/kernel/user.c b/kernel/user.c
index 71dd2363ab0..b815fefbe76 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -22,10 +22,27 @@
22 * and 1 for... ? 22 * and 1 for... ?
23 */ 23 */
24struct user_namespace init_user_ns = { 24struct user_namespace init_user_ns = {
25 .uid_map = {
26 .nr_extents = 1,
27 .extent[0] = {
28 .first = 0,
29 .lower_first = 0,
30 .count = 4294967295U,
31 },
32 },
33 .gid_map = {
34 .nr_extents = 1,
35 .extent[0] = {
36 .first = 0,
37 .lower_first = 0,
38 .count = 4294967295U,
39 },
40 },
25 .kref = { 41 .kref = {
26 .refcount = ATOMIC_INIT(3), 42 .refcount = ATOMIC_INIT(3),
27 }, 43 },
28 .creator = &root_user, 44 .owner = GLOBAL_ROOT_UID,
45 .group = GLOBAL_ROOT_GID,
29}; 46};
30EXPORT_SYMBOL_GPL(init_user_ns); 47EXPORT_SYMBOL_GPL(init_user_ns);
31 48
@@ -34,11 +51,14 @@ EXPORT_SYMBOL_GPL(init_user_ns);
34 * when changing user ID's (ie setuid() and friends). 51 * when changing user ID's (ie setuid() and friends).
35 */ 52 */
36 53
54#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 7)
55#define UIDHASH_SZ (1 << UIDHASH_BITS)
37#define UIDHASH_MASK (UIDHASH_SZ - 1) 56#define UIDHASH_MASK (UIDHASH_SZ - 1)
38#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) 57#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
39#define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid))) 58#define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid))))
40 59
41static struct kmem_cache *uid_cachep; 60static struct kmem_cache *uid_cachep;
61struct hlist_head uidhash_table[UIDHASH_SZ];
42 62
43/* 63/*
44 * The uidhash_lock is mostly taken from process context, but it is 64 * The uidhash_lock is mostly taken from process context, but it is
@@ -51,14 +71,14 @@ static struct kmem_cache *uid_cachep;
51 */ 71 */
52static DEFINE_SPINLOCK(uidhash_lock); 72static DEFINE_SPINLOCK(uidhash_lock);
53 73
54/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */ 74/* root_user.__count is 1, for init task cred */
55struct user_struct root_user = { 75struct user_struct root_user = {
56 .__count = ATOMIC_INIT(2), 76 .__count = ATOMIC_INIT(1),
57 .processes = ATOMIC_INIT(1), 77 .processes = ATOMIC_INIT(1),
58 .files = ATOMIC_INIT(0), 78 .files = ATOMIC_INIT(0),
59 .sigpending = ATOMIC_INIT(0), 79 .sigpending = ATOMIC_INIT(0),
60 .locked_shm = 0, 80 .locked_shm = 0,
61 .user_ns = &init_user_ns, 81 .uid = GLOBAL_ROOT_UID,
62}; 82};
63 83
64/* 84/*
@@ -72,16 +92,15 @@ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
72static void uid_hash_remove(struct user_struct *up) 92static void uid_hash_remove(struct user_struct *up)
73{ 93{
74 hlist_del_init(&up->uidhash_node); 94 hlist_del_init(&up->uidhash_node);
75 put_user_ns(up->user_ns);
76} 95}
77 96
78static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) 97static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
79{ 98{
80 struct user_struct *user; 99 struct user_struct *user;
81 struct hlist_node *h; 100 struct hlist_node *h;
82 101
83 hlist_for_each_entry(user, h, hashent, uidhash_node) { 102 hlist_for_each_entry(user, h, hashent, uidhash_node) {
84 if (user->uid == uid) { 103 if (uid_eq(user->uid, uid)) {
85 atomic_inc(&user->__count); 104 atomic_inc(&user->__count);
86 return user; 105 return user;
87 } 106 }
@@ -110,14 +129,13 @@ static void free_user(struct user_struct *up, unsigned long flags)
110 * 129 *
111 * If the user_struct could not be found, return NULL. 130 * If the user_struct could not be found, return NULL.
112 */ 131 */
113struct user_struct *find_user(uid_t uid) 132struct user_struct *find_user(kuid_t uid)
114{ 133{
115 struct user_struct *ret; 134 struct user_struct *ret;
116 unsigned long flags; 135 unsigned long flags;
117 struct user_namespace *ns = current_user_ns();
118 136
119 spin_lock_irqsave(&uidhash_lock, flags); 137 spin_lock_irqsave(&uidhash_lock, flags);
120 ret = uid_hash_find(uid, uidhashentry(ns, uid)); 138 ret = uid_hash_find(uid, uidhashentry(uid));
121 spin_unlock_irqrestore(&uidhash_lock, flags); 139 spin_unlock_irqrestore(&uidhash_lock, flags);
122 return ret; 140 return ret;
123} 141}
@@ -136,9 +154,9 @@ void free_uid(struct user_struct *up)
136 local_irq_restore(flags); 154 local_irq_restore(flags);
137} 155}
138 156
139struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) 157struct user_struct *alloc_uid(kuid_t uid)
140{ 158{
141 struct hlist_head *hashent = uidhashentry(ns, uid); 159 struct hlist_head *hashent = uidhashentry(uid);
142 struct user_struct *up, *new; 160 struct user_struct *up, *new;
143 161
144 spin_lock_irq(&uidhash_lock); 162 spin_lock_irq(&uidhash_lock);
@@ -153,8 +171,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
153 new->uid = uid; 171 new->uid = uid;
154 atomic_set(&new->__count, 1); 172 atomic_set(&new->__count, 1);
155 173
156 new->user_ns = get_user_ns(ns);
157
158 /* 174 /*
159 * Before adding this, check whether we raced 175 * Before adding this, check whether we raced
160 * on adding the same user already.. 176 * on adding the same user already..
@@ -162,7 +178,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
162 spin_lock_irq(&uidhash_lock); 178 spin_lock_irq(&uidhash_lock);
163 up = uid_hash_find(uid, hashent); 179 up = uid_hash_find(uid, hashent);
164 if (up) { 180 if (up) {
165 put_user_ns(ns);
166 key_put(new->uid_keyring); 181 key_put(new->uid_keyring);
167 key_put(new->session_keyring); 182 key_put(new->session_keyring);
168 kmem_cache_free(uid_cachep, new); 183 kmem_cache_free(uid_cachep, new);
@@ -187,11 +202,11 @@ static int __init uid_cache_init(void)
187 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 202 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
188 203
189 for(n = 0; n < UIDHASH_SZ; ++n) 204 for(n = 0; n < UIDHASH_SZ; ++n)
190 INIT_HLIST_HEAD(init_user_ns.uidhash_table + n); 205 INIT_HLIST_HEAD(uidhash_table + n);
191 206
192 /* Insert the root user immediately (init already runs as root) */ 207 /* Insert the root user immediately (init already runs as root) */
193 spin_lock_irq(&uidhash_lock); 208 spin_lock_irq(&uidhash_lock);
194 uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0)); 209 uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
195 spin_unlock_irq(&uidhash_lock); 210 spin_unlock_irq(&uidhash_lock);
196 211
197 return 0; 212 return 0;
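
With the per-namespace hash gone, every user_struct now lives in the single global uidhash_table, indexed by the kuid value through the __uidhashfn() shown earlier in this diff. A quick user-space reproduction of that bucket computation, assuming the non-CONFIG_BASE_SMALL value UIDHASH_BITS = 7:

#include <stdio.h>
#include <stdint.h>

#define UIDHASH_BITS	7			/* non-CONFIG_BASE_SMALL value */
#define UIDHASH_SZ	(1 << UIDHASH_BITS)
#define UIDHASH_MASK	(UIDHASH_SZ - 1)

/* Mirrors __uidhashfn(); the kernel feeds it __kuid_val(uid). */
static unsigned int uidhashfn(uint32_t uid)
{
	return ((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK;
}

int main(void)
{
	uint32_t uids[] = { 0, 1000, 65534, 100000 };

	for (int i = 0; i < 4; i++)
		printf("kuid %u -> bucket %u of %d\n",
		       (unsigned)uids[i], uidhashfn(uids[i]), UIDHASH_SZ);
	return 0;
}
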
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 3b906e98b1d..86602316422 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -11,9 +11,20 @@
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12#include <linux/highuid.h> 12#include <linux/highuid.h>
13#include <linux/cred.h> 13#include <linux/cred.h>
14#include <linux/securebits.h>
15#include <linux/keyctl.h>
16#include <linux/key-type.h>
17#include <keys/user-type.h>
18#include <linux/seq_file.h>
19#include <linux/fs.h>
20#include <linux/uaccess.h>
21#include <linux/ctype.h>
14 22
15static struct kmem_cache *user_ns_cachep __read_mostly; 23static struct kmem_cache *user_ns_cachep __read_mostly;
16 24
25static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
26 struct uid_gid_map *map);
27
17/* 28/*
18 * Create a new user namespace, deriving the creator from the user in the 29 * Create a new user namespace, deriving the creator from the user in the
19 * passed credentials, and replacing that user with the new root user for the 30 * passed credentials, and replacing that user with the new root user for the
@@ -24,109 +35,565 @@ static struct kmem_cache *user_ns_cachep __read_mostly;
24 */ 35 */
25int create_user_ns(struct cred *new) 36int create_user_ns(struct cred *new)
26{ 37{
27 struct user_namespace *ns; 38 struct user_namespace *ns, *parent_ns = new->user_ns;
28 struct user_struct *root_user; 39 kuid_t owner = new->euid;
29 int n; 40 kgid_t group = new->egid;
41
42 /* The creator needs a mapping in the parent user namespace
43 * or else we won't be able to reasonably tell userspace who
44 * created a user_namespace.
45 */
46 if (!kuid_has_mapping(parent_ns, owner) ||
47 !kgid_has_mapping(parent_ns, group))
48 return -EPERM;
30 49
31 ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL); 50 ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
32 if (!ns) 51 if (!ns)
33 return -ENOMEM; 52 return -ENOMEM;
34 53
35 kref_init(&ns->kref); 54 kref_init(&ns->kref);
55 ns->parent = parent_ns;
56 ns->owner = owner;
57 ns->group = group;
36 58
37 for (n = 0; n < UIDHASH_SZ; ++n) 59 /* Start with the same capabilities as init but useless for doing
38 INIT_HLIST_HEAD(ns->uidhash_table + n); 60 * anything as the capabilities are bound to the new user namespace.
39 61 */
40 /* Alloc new root user. */ 62 new->securebits = SECUREBITS_DEFAULT;
41 root_user = alloc_uid(ns, 0); 63 new->cap_inheritable = CAP_EMPTY_SET;
42 if (!root_user) { 64 new->cap_permitted = CAP_FULL_SET;
43 kmem_cache_free(user_ns_cachep, ns); 65 new->cap_effective = CAP_FULL_SET;
44 return -ENOMEM; 66 new->cap_bset = CAP_FULL_SET;
45 }
46
47 /* set the new root user in the credentials under preparation */
48 ns->creator = new->user;
49 new->user = root_user;
50 new->uid = new->euid = new->suid = new->fsuid = 0;
51 new->gid = new->egid = new->sgid = new->fsgid = 0;
52 put_group_info(new->group_info);
53 new->group_info = get_group_info(&init_groups);
54#ifdef CONFIG_KEYS 67#ifdef CONFIG_KEYS
55 key_put(new->request_key_auth); 68 key_put(new->request_key_auth);
56 new->request_key_auth = NULL; 69 new->request_key_auth = NULL;
57#endif 70#endif
58 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ 71 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
59 72
60 /* root_user holds a reference to ns, our reference can be dropped */ 73 /* Leave the new->user_ns reference with the new user namespace. */
61 put_user_ns(ns); 74 /* Leave the reference to our user_ns with the new cred. */
75 new->user_ns = ns;
62 76
63 return 0; 77 return 0;
64} 78}
65 79
66/* 80void free_user_ns(struct kref *kref)
67 * Deferred destructor for a user namespace. This is required because
68 * free_user_ns() may be called with uidhash_lock held, but we need to call
69 * back to free_uid() which will want to take the lock again.
70 */
71static void free_user_ns_work(struct work_struct *work)
72{ 81{
73 struct user_namespace *ns = 82 struct user_namespace *parent, *ns =
74 container_of(work, struct user_namespace, destroyer); 83 container_of(kref, struct user_namespace, kref);
75 free_uid(ns->creator); 84
85 parent = ns->parent;
76 kmem_cache_free(user_ns_cachep, ns); 86 kmem_cache_free(user_ns_cachep, ns);
87 put_user_ns(parent);
77} 88}
89EXPORT_SYMBOL(free_user_ns);
78 90
79void free_user_ns(struct kref *kref) 91static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
80{ 92{
81 struct user_namespace *ns = 93 unsigned idx, extents;
82 container_of(kref, struct user_namespace, kref); 94 u32 first, last, id2;
95
96 id2 = id + count - 1;
97
98 /* Find the matching extent */
99 extents = map->nr_extents;
100 smp_read_barrier_depends();
101 for (idx = 0; idx < extents; idx++) {
102 first = map->extent[idx].first;
103 last = first + map->extent[idx].count - 1;
104 if (id >= first && id <= last &&
105 (id2 >= first && id2 <= last))
106 break;
107 }
108 /* Map the id or note failure */
109 if (idx < extents)
110 id = (id - first) + map->extent[idx].lower_first;
111 else
112 id = (u32) -1;
83 113
84 INIT_WORK(&ns->destroyer, free_user_ns_work); 114 return id;
85 schedule_work(&ns->destroyer);
86} 115}
87EXPORT_SYMBOL(free_user_ns);
88 116
89uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid) 117static u32 map_id_down(struct uid_gid_map *map, u32 id)
90{ 118{
91 struct user_namespace *tmp; 119 unsigned idx, extents;
120 u32 first, last;
92 121
93 if (likely(to == cred->user->user_ns)) 122 /* Find the matching extent */
94 return uid; 123 extents = map->nr_extents;
124 smp_read_barrier_depends();
125 for (idx = 0; idx < extents; idx++) {
126 first = map->extent[idx].first;
127 last = first + map->extent[idx].count - 1;
128 if (id >= first && id <= last)
129 break;
130 }
131 /* Map the id or note failure */
132 if (idx < extents)
133 id = (id - first) + map->extent[idx].lower_first;
134 else
135 id = (u32) -1;
95 136
137 return id;
138}
96 139
97 /* Is cred->user the creator of the target user_ns 140static u32 map_id_up(struct uid_gid_map *map, u32 id)
98 * or the creator of one of it's parents? 141{
99 */ 142 unsigned idx, extents;
100 for ( tmp = to; tmp != &init_user_ns; 143 u32 first, last;
101 tmp = tmp->creator->user_ns ) { 144
102 if (cred->user == tmp->creator) { 145 /* Find the matching extent */
103 return (uid_t)0; 146 extents = map->nr_extents;
104 } 147 smp_read_barrier_depends();
148 for (idx = 0; idx < extents; idx++) {
149 first = map->extent[idx].lower_first;
150 last = first + map->extent[idx].count - 1;
151 if (id >= first && id <= last)
152 break;
105 } 153 }
154 /* Map the id or note failure */
155 if (idx < extents)
156 id = (id - first) + map->extent[idx].first;
157 else
158 id = (u32) -1;
159
160 return id;
161}
162
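
map_id_range_down(), map_id_down() and map_id_up() all reduce to the same extent arithmetic: find the extent containing the id, then add the id's offset within the extent to the other side's base. A standalone mirror of that arithmetic with a single hypothetical extent mapping namespace ids 0..65535 onto kernel ids 100000..165535 (the identity extent in init_user_ns is just the degenerate case first = lower_first = 0, count = 4294967295):

#include <stdio.h>
#include <stdint.h>

struct extent { uint32_t first, lower_first, count; };

/* Mirrors map_id_down(): namespace id -> kernel id, or (u32)-1 if unmapped. */
static uint32_t map_down(const struct extent *e, uint32_t id)
{
	uint32_t last = e->first + e->count - 1;

	if (id >= e->first && id <= last)
		return (id - e->first) + e->lower_first;
	return (uint32_t)-1;
}

/* Mirrors map_id_up(): kernel id -> namespace id, or (u32)-1 if unmapped. */
static uint32_t map_up(const struct extent *e, uint32_t id)
{
	uint32_t last = e->lower_first + e->count - 1;

	if (id >= e->lower_first && id <= last)
		return (id - e->lower_first) + e->first;
	return (uint32_t)-1;
}

int main(void)
{
	/* hypothetical extent: child uid 0..65535 backed by kernel uid 100000.. */
	struct extent e = { .first = 0, .lower_first = 100000, .count = 65536 };

	printf("down(0)     = %u\n", (unsigned)map_down(&e, 0));	/* 100000 */
	printf("down(1000)  = %u\n", (unsigned)map_down(&e, 1000));	/* 101000 */
	printf("up(101000)  = %u\n", (unsigned)map_up(&e, 101000));	/* 1000 */
	printf("down(70000) = %u\n", (unsigned)map_down(&e, 70000));	/* 4294967295, unmapped */
	return 0;
}
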
163/**
164 * make_kuid - Map a user-namespace uid pair into a kuid.
165 * @ns: User namespace that the uid is in
166 * @uid: User identifier
167 *
168 * Maps a user-namespace uid pair into a kernel internal kuid,
169 * and returns that kuid.
170 *
171 * When there is no mapping defined for the user-namespace uid
172 * pair INVALID_UID is returned. Callers are expected to test
173 * for and handle handle INVALID_UID being returned. INVALID_UID
174 * may be tested for using uid_valid().
175 */
176kuid_t make_kuid(struct user_namespace *ns, uid_t uid)
177{
178 /* Map the uid to a global kernel uid */
179 return KUIDT_INIT(map_id_down(&ns->uid_map, uid));
180}
181EXPORT_SYMBOL(make_kuid);
182
183/**
184 * from_kuid - Create a uid from a kuid user-namespace pair.
185 * @targ: The user namespace we want a uid in.
186 * @kuid: The kernel internal uid to start with.
187 *
188 * Map @kuid into the user-namespace specified by @targ and
189 * return the resulting uid.
190 *
191 * There is always a mapping into the initial user_namespace.
192 *
193 * If @kuid has no mapping in @targ (uid_t)-1 is returned.
194 */
195uid_t from_kuid(struct user_namespace *targ, kuid_t kuid)
196{
197 /* Map the uid from a global kernel uid */
198 return map_id_up(&targ->uid_map, __kuid_val(kuid));
199}
200EXPORT_SYMBOL(from_kuid);
106 201
107 /* No useful relationship so no mapping */ 202/**
108 return overflowuid; 203 * from_kuid_munged - Create a uid from a kuid user-namespace pair.
204 * @targ: The user namespace we want a uid in.
205 * @kuid: The kernel internal uid to start with.
206 *
207 * Map @kuid into the user-namespace specified by @targ and
208 * return the resulting uid.
209 *
210 * There is always a mapping into the initial user_namespace.
211 *
 212 * Unlike from_kuid, from_kuid_munged never fails and always
213 * returns a valid uid. This makes from_kuid_munged appropriate
214 * for use in syscalls like stat and getuid where failing the
 215 * system call and failing to provide a valid uid are not
216 * options.
217 *
218 * If @kuid has no mapping in @targ overflowuid is returned.
219 */
220uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid)
221{
222 uid_t uid;
223 uid = from_kuid(targ, kuid);
224
225 if (uid == (uid_t) -1)
226 uid = overflowuid;
227 return uid;
109} 228}
229EXPORT_SYMBOL(from_kuid_munged);
110 230
111gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid) 231/**
232 * make_kgid - Map a user-namespace gid pair into a kgid.
233 * @ns: User namespace that the gid is in
 234 * @gid: Group identifier
235 *
236 * Maps a user-namespace gid pair into a kernel internal kgid,
237 * and returns that kgid.
238 *
239 * When there is no mapping defined for the user-namespace gid
240 * pair INVALID_GID is returned. Callers are expected to test
241 * for and handle INVALID_GID being returned. INVALID_GID may be
242 * tested for using gid_valid().
243 */
244kgid_t make_kgid(struct user_namespace *ns, gid_t gid)
112{ 245{
113 struct user_namespace *tmp; 246 /* Map the gid to a global kernel gid */
247 return KGIDT_INIT(map_id_down(&ns->gid_map, gid));
248}
249EXPORT_SYMBOL(make_kgid);
114 250
115 if (likely(to == cred->user->user_ns)) 251/**
116 return gid; 252 * from_kgid - Create a gid from a kgid user-namespace pair.
253 * @targ: The user namespace we want a gid in.
254 * @kgid: The kernel internal gid to start with.
255 *
256 * Map @kgid into the user-namespace specified by @targ and
257 * return the resulting gid.
258 *
259 * There is always a mapping into the initial user_namespace.
260 *
261 * If @kgid has no mapping in @targ (gid_t)-1 is returned.
262 */
263gid_t from_kgid(struct user_namespace *targ, kgid_t kgid)
264{
265 /* Map the gid from a global kernel gid */
266 return map_id_up(&targ->gid_map, __kgid_val(kgid));
267}
268EXPORT_SYMBOL(from_kgid);
269
270/**
271 * from_kgid_munged - Create a gid from a kgid user-namespace pair.
272 * @targ: The user namespace we want a gid in.
273 * @kgid: The kernel internal gid to start with.
274 *
275 * Map @kgid into the user-namespace specified by @targ and
276 * return the resulting gid.
277 *
278 * There is always a mapping into the initial user_namespace.
279 *
 280 * Unlike from_kgid, from_kgid_munged never fails and always
281 * returns a valid gid. This makes from_kgid_munged appropriate
282 * for use in syscalls like stat and getgid where failing the
283 * system call and failing to provide a valid gid are not options.
284 *
285 * If @kgid has no mapping in @targ overflowgid is returned.
286 */
287gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid)
288{
289 gid_t gid;
290 gid = from_kgid(targ, kgid);
117 291
118 /* Is cred->user the creator of the target user_ns 292 if (gid == (gid_t) -1)
119 * or the creator of one of it's parents? 293 gid = overflowgid;
294 return gid;
295}
296EXPORT_SYMBOL(from_kgid_munged);
297
298static int uid_m_show(struct seq_file *seq, void *v)
299{
300 struct user_namespace *ns = seq->private;
301 struct uid_gid_extent *extent = v;
302 struct user_namespace *lower_ns;
303 uid_t lower;
304
305 lower_ns = current_user_ns();
306 if ((lower_ns == ns) && lower_ns->parent)
307 lower_ns = lower_ns->parent;
308
309 lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first));
310
311 seq_printf(seq, "%10u %10u %10u\n",
312 extent->first,
313 lower,
314 extent->count);
315
316 return 0;
317}
318
319static int gid_m_show(struct seq_file *seq, void *v)
320{
321 struct user_namespace *ns = seq->private;
322 struct uid_gid_extent *extent = v;
323 struct user_namespace *lower_ns;
324 gid_t lower;
325
326 lower_ns = current_user_ns();
327 if ((lower_ns == ns) && lower_ns->parent)
328 lower_ns = lower_ns->parent;
329
330 lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first));
331
332 seq_printf(seq, "%10u %10u %10u\n",
333 extent->first,
334 lower,
335 extent->count);
336
337 return 0;
338}
339
340static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map)
341{
342 struct uid_gid_extent *extent = NULL;
343 loff_t pos = *ppos;
344
345 if (pos < map->nr_extents)
346 extent = &map->extent[pos];
347
348 return extent;
349}
350
351static void *uid_m_start(struct seq_file *seq, loff_t *ppos)
352{
353 struct user_namespace *ns = seq->private;
354
355 return m_start(seq, ppos, &ns->uid_map);
356}
357
358static void *gid_m_start(struct seq_file *seq, loff_t *ppos)
359{
360 struct user_namespace *ns = seq->private;
361
362 return m_start(seq, ppos, &ns->gid_map);
363}
364
365static void *m_next(struct seq_file *seq, void *v, loff_t *pos)
366{
367 (*pos)++;
368 return seq->op->start(seq, pos);
369}
370
371static void m_stop(struct seq_file *seq, void *v)
372{
373 return;
374}
375
376struct seq_operations proc_uid_seq_operations = {
377 .start = uid_m_start,
378 .stop = m_stop,
379 .next = m_next,
380 .show = uid_m_show,
381};
382
383struct seq_operations proc_gid_seq_operations = {
384 .start = gid_m_start,
385 .stop = m_stop,
386 .next = m_next,
387 .show = gid_m_show,
388};
389
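
The seq_printf() format used by uid_m_show()/gid_m_show() above ("%10u %10u %10u\n") is what a read of the per-process uid_map/gid_map file produces: one first/lower/count triple per extent, each field right-aligned in ten columns. A trivial reproduction of that layout for a hypothetical extent:

#include <stdio.h>

int main(void)
{
	unsigned int first = 0, lower = 100000, count = 65536;	/* hypothetical extent */

	printf("%10u %10u %10u\n", first, lower, count);
	return 0;
}
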
390static DEFINE_MUTEX(id_map_mutex);
391
392static ssize_t map_write(struct file *file, const char __user *buf,
393 size_t count, loff_t *ppos,
394 int cap_setid,
395 struct uid_gid_map *map,
396 struct uid_gid_map *parent_map)
397{
398 struct seq_file *seq = file->private_data;
399 struct user_namespace *ns = seq->private;
400 struct uid_gid_map new_map;
401 unsigned idx;
402 struct uid_gid_extent *extent, *last = NULL;
403 unsigned long page = 0;
404 char *kbuf, *pos, *next_line;
405 ssize_t ret = -EINVAL;
406
407 /*
408 * The id_map_mutex serializes all writes to any given map.
409 *
410 * Any map is only ever written once.
411 *
412 * An id map fits within 1 cache line on most architectures.
413 *
414 * On read nothing needs to be done unless you are on an
415 * architecture with a crazy cache coherency model like alpha.
416 *
417 * There is a one time data dependency between reading the
418 * count of the extents and the values of the extents. The
419 * desired behavior is to see the values of the extents that
420 * were written before the count of the extents.
421 *
 422 * To achieve this, smp_wmb() is used to guarantee the write
 423 * order and smp_read_barrier_depends() guarantees that we
 424 * do not see stale data on architectures with weaker ordering.
425 *
120 */ 426 */
121 for ( tmp = to; tmp != &init_user_ns; 427 mutex_lock(&id_map_mutex);
122 tmp = tmp->creator->user_ns ) { 428
123 if (cred->user == tmp->creator) { 429 ret = -EPERM;
124 return (gid_t)0; 430 /* Only allow one successful write to the map */
431 if (map->nr_extents != 0)
432 goto out;
433
434 /* Require the appropriate privilege CAP_SETUID or CAP_SETGID
435 * over the user namespace in order to set the id mapping.
436 */
437 if (!ns_capable(ns, cap_setid))
438 goto out;
439
440 /* Get a buffer */
441 ret = -ENOMEM;
442 page = __get_free_page(GFP_TEMPORARY);
443 kbuf = (char *) page;
444 if (!page)
445 goto out;
446
447 /* Only allow <= page size writes at the beginning of the file */
448 ret = -EINVAL;
449 if ((*ppos != 0) || (count >= PAGE_SIZE))
450 goto out;
451
452 /* Slurp in the user data */
453 ret = -EFAULT;
454 if (copy_from_user(kbuf, buf, count))
455 goto out;
456 kbuf[count] = '\0';
457
458 /* Parse the user data */
459 ret = -EINVAL;
460 pos = kbuf;
461 new_map.nr_extents = 0;
462 for (;pos; pos = next_line) {
463 extent = &new_map.extent[new_map.nr_extents];
464
465 /* Find the end of line and ensure I don't look past it */
466 next_line = strchr(pos, '\n');
467 if (next_line) {
468 *next_line = '\0';
469 next_line++;
470 if (*next_line == '\0')
471 next_line = NULL;
125 } 472 }
473
474 pos = skip_spaces(pos);
475 extent->first = simple_strtoul(pos, &pos, 10);
476 if (!isspace(*pos))
477 goto out;
478
479 pos = skip_spaces(pos);
480 extent->lower_first = simple_strtoul(pos, &pos, 10);
481 if (!isspace(*pos))
482 goto out;
483
484 pos = skip_spaces(pos);
485 extent->count = simple_strtoul(pos, &pos, 10);
486 if (*pos && !isspace(*pos))
487 goto out;
488
489 /* Verify there is not trailing junk on the line */
 490 /* Verify there is no trailing junk on the line */
491 if (*pos != '\0')
492 goto out;
493
494 /* Verify we have been given valid starting values */
495 if ((extent->first == (u32) -1) ||
496 (extent->lower_first == (u32) -1 ))
497 goto out;
498
499 /* Verify count is not zero and does not cause the extent to wrap */
500 if ((extent->first + extent->count) <= extent->first)
501 goto out;
502 if ((extent->lower_first + extent->count) <= extent->lower_first)
503 goto out;
504
505 /* For now only accept extents that are strictly in order */
506 if (last &&
507 (((last->first + last->count) > extent->first) ||
508 ((last->lower_first + last->count) > extent->lower_first)))
509 goto out;
510
511 new_map.nr_extents++;
512 last = extent;
513
514 /* Fail if the file contains too many extents */
515 if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) &&
516 (next_line != NULL))
517 goto out;
518 }
 519 /* Be very certain the new map actually exists */
520 if (new_map.nr_extents == 0)
521 goto out;
522
523 ret = -EPERM;
 524 /* Validate that the user is allowed to use the ids being mapped to. */
525 if (!new_idmap_permitted(ns, cap_setid, &new_map))
526 goto out;
527
528 /* Map the lower ids from the parent user namespace to the
529 * kernel global id space.
530 */
531 for (idx = 0; idx < new_map.nr_extents; idx++) {
532 u32 lower_first;
533 extent = &new_map.extent[idx];
534
535 lower_first = map_id_range_down(parent_map,
536 extent->lower_first,
537 extent->count);
538
539 /* Fail if we can not map the specified extent to
540 * the kernel global id space.
541 */
542 if (lower_first == (u32) -1)
543 goto out;
544
545 extent->lower_first = lower_first;
126 } 546 }
127 547
128 /* No useful relationship so no mapping */ 548 /* Install the map */
129 return overflowgid; 549 memcpy(map->extent, new_map.extent,
550 new_map.nr_extents*sizeof(new_map.extent[0]));
551 smp_wmb();
552 map->nr_extents = new_map.nr_extents;
553
554 *ppos = count;
555 ret = count;
556out:
557 mutex_unlock(&id_map_mutex);
558 if (page)
559 free_page(page);
560 return ret;
561}
562
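
map_write() accepts exactly one successful write, at offset zero, smaller than a page, containing up to UID_GID_MAP_MAX_EXTENTS lines of "first lower_first count". An illustrative user-space write of a single-extent mapping; the pid and the values are hypothetical, only the line format and the single-write rule come from the parser above:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/proc/1234/uid_map";	/* hypothetical pid */
	const char *map  = "0 100000 65536\n";		/* first lower_first count */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* The whole mapping must go in one write; the map can only be set once. */
	if (write(fd, map, strlen(map)) < 0)
		perror("write");
	close(fd);
	return 0;
}
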
563ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos)
564{
565 struct seq_file *seq = file->private_data;
566 struct user_namespace *ns = seq->private;
567
568 if (!ns->parent)
569 return -EPERM;
570
571 return map_write(file, buf, size, ppos, CAP_SETUID,
572 &ns->uid_map, &ns->parent->uid_map);
573}
574
575ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos)
576{
577 struct seq_file *seq = file->private_data;
578 struct user_namespace *ns = seq->private;
579
580 if (!ns->parent)
581 return -EPERM;
582
583 return map_write(file, buf, size, ppos, CAP_SETGID,
584 &ns->gid_map, &ns->parent->gid_map);
585}
586
587static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
588 struct uid_gid_map *new_map)
589{
590 /* Allow the specified ids if we have the appropriate capability
591 * (CAP_SETUID or CAP_SETGID) over the parent user namespace.
592 */
593 if (ns_capable(ns->parent, cap_setid))
594 return true;
595
596 return false;
130} 597}
131 598
132static __init int user_namespaces_init(void) 599static __init int user_namespaces_init(void)
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 405caf91aad..679d97a5d3f 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -43,7 +43,7 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
43 43
44 down_read(&uts_sem); 44 down_read(&uts_sem);
45 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 45 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
46 ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns); 46 ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns));
47 up_read(&uts_sem); 47 up_read(&uts_sem);
48 return ns; 48 return ns;
49} 49}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5abf42f63c0..9a3128dc67d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1032,7 +1032,10 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1032 cwq = get_cwq(gcwq->cpu, wq); 1032 cwq = get_cwq(gcwq->cpu, wq);
1033 trace_workqueue_queue_work(cpu, cwq, work); 1033 trace_workqueue_queue_work(cpu, cwq, work);
1034 1034
1035 BUG_ON(!list_empty(&work->entry)); 1035 if (WARN_ON(!list_empty(&work->entry))) {
1036 spin_unlock_irqrestore(&gcwq->lock, flags);
1037 return;
1038 }
1036 1039
1037 cwq->nr_in_flight[cwq->work_color]++; 1040 cwq->nr_in_flight[cwq->work_color]++;
1038 work_flags = work_color_to_flags(cwq->work_color); 1041 work_flags = work_color_to_flags(cwq->work_color);
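
The hunk above turns a fatal BUG_ON() into a WARN_ON() that drops gcwq->lock and returns, so a work item queued while still on a list produces a backtrace instead of halting the machine, and the lock stays balanced. A small user-space sketch of that recover-instead-of-crash shape, using a pthread mutex and a hypothetical queue_item():

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

struct item { bool queued; };

static void queue_item(struct item *it)
{
	pthread_mutex_lock(&pool_lock);

	if (it->queued) {			/* invariant violated */
		fprintf(stderr, "WARN: item already queued\n");
		pthread_mutex_unlock(&pool_lock);
		return;				/* recover instead of aborting */
	}

	it->queued = true;
	/* ... the real queueing work would happen here ... */
	pthread_mutex_unlock(&pool_lock);
}

int main(void)
{
	struct item it = { .queued = false };

	queue_item(&it);
	queue_item(&it);	/* second call takes the warning path */
	return 0;
}
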
@@ -1210,8 +1213,13 @@ static void worker_enter_idle(struct worker *worker)
1210 } else 1213 } else
1211 wake_up_all(&gcwq->trustee_wait); 1214 wake_up_all(&gcwq->trustee_wait);
1212 1215
1213 /* sanity check nr_running */ 1216 /*
1214 WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && 1217 * Sanity check nr_running. Because trustee releases gcwq->lock
1218 * between setting %WORKER_ROGUE and zapping nr_running, the
1219 * warning may trigger spuriously. Check iff trustee is idle.
1220 */
1221 WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE &&
1222 gcwq->nr_workers == gcwq->nr_idle &&
1215 atomic_read(get_gcwq_nr_running(gcwq->cpu))); 1223 atomic_read(get_gcwq_nr_running(gcwq->cpu)));
1216} 1224}
1217 1225
@@ -1810,7 +1818,9 @@ __acquires(&gcwq->lock)
1810 * lock freed" warnings as well as problems when looking into 1818 * lock freed" warnings as well as problems when looking into
1811 * work->lockdep_map, make a copy and use that here. 1819 * work->lockdep_map, make a copy and use that here.
1812 */ 1820 */
1813 struct lockdep_map lockdep_map = work->lockdep_map; 1821 struct lockdep_map lockdep_map;
1822
1823 lockdep_copy_map(&lockdep_map, &work->lockdep_map);
1814#endif 1824#endif
1815 /* 1825 /*
1816 * A single work shouldn't be executed concurrently by 1826 * A single work shouldn't be executed concurrently by
@@ -2506,6 +2516,9 @@ bool flush_work(struct work_struct *work)
2506{ 2516{
2507 struct wq_barrier barr; 2517 struct wq_barrier barr;
2508 2518
2519 lock_map_acquire(&work->lockdep_map);
2520 lock_map_release(&work->lockdep_map);
2521
2509 if (start_flush_work(work, &barr, true)) { 2522 if (start_flush_work(work, &barr, true)) {
2510 wait_for_completion(&barr.done); 2523 wait_for_completion(&barr.done);
2511 destroy_work_on_stack(&barr.work); 2524 destroy_work_on_stack(&barr.work);