path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/auditsc.c | 8
-rw-r--r--  kernel/cgroup.c | 564
-rw-r--r--  kernel/cgroup_freezer.c | 11
-rw-r--r--  kernel/compat.c | 63
-rw-r--r--  kernel/cpu.c | 13
-rw-r--r--  kernel/cpuset.c | 41
-rw-r--r--  kernel/cred.c | 2
-rw-r--r--  kernel/debug/debug_core.c | 53
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 2
-rw-r--r--  kernel/events/core.c | 13
-rw-r--r--  kernel/extable.c | 8
-rw-r--r--  kernel/fork.c | 77
-rw-r--r--  kernel/hung_task.c | 4
-rw-r--r--  kernel/irq/Kconfig | 2
-rw-r--r--  kernel/irq/chip.c | 5
-rw-r--r--  kernel/irq/debug.h | 38
-rw-r--r--  kernel/irq/irqdesc.c | 1
-rw-r--r--  kernel/irq/irqdomain.c | 47
-rw-r--r--  kernel/irq/manage.c | 52
-rw-r--r--  kernel/irq/pm.c | 7
-rw-r--r--  kernel/irq/resend.c | 7
-rw-r--r--  kernel/irq_work.c | 2
-rw-r--r--  kernel/itimer.c | 8
-rw-r--r--  kernel/kmod.c | 117
-rw-r--r--  kernel/module.c | 5
-rw-r--r--  kernel/padata.c | 13
-rw-r--r--  kernel/panic.c | 2
-rw-r--r--  kernel/params.c | 62
-rw-r--r--  kernel/power/Kconfig | 27
-rw-r--r--  kernel/power/Makefile | 2
-rw-r--r--  kernel/power/autosleep.c | 127
-rw-r--r--  kernel/power/hibernate.c | 31
-rw-r--r--  kernel/power/main.c | 160
-rw-r--r--  kernel/power/power.h | 27
-rw-r--r--  kernel/power/process.c | 8
-rw-r--r--  kernel/power/qos.c | 50
-rw-r--r--  kernel/power/suspend.c | 7
-rw-r--r--  kernel/power/swap.c | 84
-rw-r--r--  kernel/power/user.c | 10
-rw-r--r--  kernel/power/wakelock.c | 259
-rw-r--r--  kernel/printk.c | 1390
-rw-r--r--  kernel/rcupdate.c | 28
-rw-r--r--  kernel/rcutiny_plugin.h | 16
-rw-r--r--  kernel/rcutorture.c | 257
-rw-r--r--  kernel/rcutree.c | 333
-rw-r--r--  kernel/rcutree.h | 23
-rw-r--r--  kernel/rcutree_plugin.h | 154
-rw-r--r--  kernel/rcutree_trace.c | 4
-rw-r--r--  kernel/res_counter.c | 71
-rw-r--r--  kernel/sched/Makefile | 2
-rw-r--r--  kernel/sched/core.c | 463
-rw-r--r--  kernel/sched/debug.c | 12
-rw-r--r--  kernel/sched/fair.c | 472
-rw-r--r--  kernel/sched/features.h | 1
-rw-r--r--  kernel/sched/idle_task.c | 2
-rw-r--r--  kernel/sched/rt.c | 56
-rw-r--r--  kernel/sched/sched.h | 8
-rw-r--r--  kernel/seccomp.c | 458
-rw-r--r--  kernel/semaphore.c | 2
-rw-r--r--  kernel/signal.c | 9
-rw-r--r--  kernel/smp.c | 27
-rw-r--r--  kernel/smpboot.c | 62
-rw-r--r--  kernel/smpboot.h | 18
-rw-r--r--  kernel/srcu.c | 548
-rw-r--r--  kernel/sys.c | 12
-rw-r--r--  kernel/sysctl.c | 8
-rw-r--r--  kernel/time/Kconfig | 4
-rw-r--r--  kernel/time/alarmtimer.c | 4
-rw-r--r--  kernel/time/tick-broadcast.c | 11
-rw-r--r--  kernel/time/tick-sched.c | 4
-rw-r--r--  kernel/timer.c | 12
-rw-r--r--  kernel/trace/Kconfig | 3
-rw-r--r--  kernel/trace/Makefile | 1
-rw-r--r--  kernel/trace/blktrace.c | 18
-rw-r--r--  kernel/trace/ftrace.c | 242
-rw-r--r--  kernel/trace/ring_buffer.c | 585
-rw-r--r--  kernel/trace/trace.c | 511
-rw-r--r--  kernel/trace/trace.h | 8
-rw-r--r--  kernel/trace/trace_events.c | 5
-rw-r--r--  kernel/trace/trace_export.c | 1
-rw-r--r--  kernel/trace/trace_output.c | 5
-rw-r--r--  kernel/trace/trace_printk.c | 4
-rw-r--r--  kernel/trace/trace_workqueue.c | 300
-rw-r--r--  kernel/workqueue.c | 21
85 files changed, 5474 insertions(+), 2721 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index cb41b9547c9f..6c07f30fa9b7 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_SMP) += smpboot.o
 ifneq ($(CONFIG_SMP),y)
 obj-y += up.o
 endif
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index af1de0f34eae..4b96415527b8 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -67,6 +67,7 @@
 #include <linux/syscalls.h>
 #include <linux/capability.h>
 #include <linux/fs_struct.h>
+#include <linux/compat.h>
 
 #include "audit.h"
 
@@ -2710,13 +2711,16 @@ void audit_core_dumps(long signr)
 	audit_log_end(ab);
 }
 
-void __audit_seccomp(unsigned long syscall)
+void __audit_seccomp(unsigned long syscall, long signr, int code)
 {
 	struct audit_buffer *ab;
 
 	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
-	audit_log_abend(ab, "seccomp", SIGKILL);
+	audit_log_abend(ab, "seccomp", signr);
 	audit_log_format(ab, " syscall=%ld", syscall);
+	audit_log_format(ab, " compat=%d", is_compat_task());
+	audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
+	audit_log_format(ab, " code=0x%x", code);
 	audit_log_end(ab);
 }
 
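The widened __audit_seccomp() above lets the seccomp path report the delivered signal and the filter's return value alongside the syscall number, compat flag and instruction pointer. A minimal, hypothetical call-site sketch (the helper name and its surroundings are illustrative, not the actual kernel/seccomp.c change that pairs with this):

/* Illustrative only: forward a seccomp denial into the new audit record. */
static inline void report_seccomp_denial(unsigned long this_syscall,
					 long signr, int action)
{
	/* emits AUDIT_ANOM_ABEND with syscall=, compat=, ip= and code= fields */
	__audit_seccomp(this_syscall, signr, action);
}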
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c8329b0c2576..a0c6af34d500 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,9 +60,13 @@
60#include <linux/eventfd.h> 60#include <linux/eventfd.h>
61#include <linux/poll.h> 61#include <linux/poll.h>
62#include <linux/flex_array.h> /* used in cgroup_attach_proc */ 62#include <linux/flex_array.h> /* used in cgroup_attach_proc */
63#include <linux/kthread.h>
63 64
64#include <linux/atomic.h> 65#include <linux/atomic.h>
65 66
67/* css deactivation bias, makes css->refcnt negative to deny new trygets */
68#define CSS_DEACT_BIAS INT_MIN
69
66/* 70/*
67 * cgroup_mutex is the master lock. Any modification to cgroup or its 71 * cgroup_mutex is the master lock. Any modification to cgroup or its
68 * hierarchy must be performed while holding it. 72 * hierarchy must be performed while holding it.
@@ -127,6 +131,9 @@ struct cgroupfs_root {
127 /* A list running through the active hierarchies */ 131 /* A list running through the active hierarchies */
128 struct list_head root_list; 132 struct list_head root_list;
129 133
134 /* All cgroups on this root, cgroup_mutex protected */
135 struct list_head allcg_list;
136
130 /* Hierarchy-specific flags */ 137 /* Hierarchy-specific flags */
131 unsigned long flags; 138 unsigned long flags;
132 139
@@ -145,6 +152,15 @@ struct cgroupfs_root {
145static struct cgroupfs_root rootnode; 152static struct cgroupfs_root rootnode;
146 153
147/* 154/*
155 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
156 */
157struct cfent {
158 struct list_head node;
159 struct dentry *dentry;
160 struct cftype *type;
161};
162
163/*
148 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when 164 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
149 * cgroup_subsys->use_id != 0. 165 * cgroup_subsys->use_id != 0.
150 */ 166 */
@@ -239,6 +255,14 @@ int cgroup_lock_is_held(void)
239 255
240EXPORT_SYMBOL_GPL(cgroup_lock_is_held); 256EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
241 257
258/* the current nr of refs, always >= 0 whether @css is deactivated or not */
259static int css_refcnt(struct cgroup_subsys_state *css)
260{
261 int v = atomic_read(&css->refcnt);
262
263 return v >= 0 ? v : v - CSS_DEACT_BIAS;
264}
265
242/* convenient tests for these bits */ 266/* convenient tests for these bits */
243inline int cgroup_is_removed(const struct cgroup *cgrp) 267inline int cgroup_is_removed(const struct cgroup *cgrp)
244{ 268{
@@ -279,6 +303,21 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)
279#define for_each_active_root(_root) \ 303#define for_each_active_root(_root) \
280list_for_each_entry(_root, &roots, root_list) 304list_for_each_entry(_root, &roots, root_list)
281 305
306static inline struct cgroup *__d_cgrp(struct dentry *dentry)
307{
308 return dentry->d_fsdata;
309}
310
311static inline struct cfent *__d_cfe(struct dentry *dentry)
312{
313 return dentry->d_fsdata;
314}
315
316static inline struct cftype *__d_cft(struct dentry *dentry)
317{
318 return __d_cfe(dentry)->type;
319}
320
282/* the list of cgroups eligible for automatic release. Protected by 321/* the list of cgroups eligible for automatic release. Protected by
283 * release_list_lock */ 322 * release_list_lock */
284static LIST_HEAD(release_list); 323static LIST_HEAD(release_list);
@@ -816,12 +855,17 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
816 struct cgroup_subsys *ss; 855 struct cgroup_subsys *ss;
817 int ret = 0; 856 int ret = 0;
818 857
819 for_each_subsys(cgrp->root, ss) 858 for_each_subsys(cgrp->root, ss) {
820 if (ss->pre_destroy) { 859 if (!ss->pre_destroy)
821 ret = ss->pre_destroy(cgrp); 860 continue;
822 if (ret) 861
823 break; 862 ret = ss->pre_destroy(cgrp);
863 if (ret) {
864 /* ->pre_destroy() failure is being deprecated */
865 WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
866 break;
824 } 867 }
868 }
825 869
826 return ret; 870 return ret;
827} 871}
@@ -864,6 +908,14 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
864 BUG_ON(!list_empty(&cgrp->pidlists)); 908 BUG_ON(!list_empty(&cgrp->pidlists));
865 909
866 kfree_rcu(cgrp, rcu_head); 910 kfree_rcu(cgrp, rcu_head);
911 } else {
912 struct cfent *cfe = __d_cfe(dentry);
913 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
914
915 WARN_ONCE(!list_empty(&cfe->node) &&
916 cgrp != &cgrp->root->top_cgroup,
917 "cfe still linked for %s\n", cfe->type->name);
918 kfree(cfe);
867 } 919 }
868 iput(inode); 920 iput(inode);
869} 921}
@@ -882,34 +934,36 @@ static void remove_dir(struct dentry *d)
882 dput(parent); 934 dput(parent);
883} 935}
884 936
885static void cgroup_clear_directory(struct dentry *dentry) 937static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
886{ 938{
887 struct list_head *node; 939 struct cfent *cfe;
888 940
889 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); 941 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
890 spin_lock(&dentry->d_lock); 942 lockdep_assert_held(&cgroup_mutex);
891 node = dentry->d_subdirs.next; 943
892 while (node != &dentry->d_subdirs) { 944 list_for_each_entry(cfe, &cgrp->files, node) {
893 struct dentry *d = list_entry(node, struct dentry, d_u.d_child); 945 struct dentry *d = cfe->dentry;
894 946
895 spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); 947 if (cft && cfe->type != cft)
896 list_del_init(node); 948 continue;
897 if (d->d_inode) { 949
898 /* This should never be called on a cgroup 950 dget(d);
899 * directory with child cgroups */ 951 d_delete(d);
900 BUG_ON(d->d_inode->i_mode & S_IFDIR); 952 simple_unlink(d->d_inode, d);
901 dget_dlock(d); 953 list_del_init(&cfe->node);
902 spin_unlock(&d->d_lock); 954 dput(d);
903 spin_unlock(&dentry->d_lock); 955
904 d_delete(d); 956 return 0;
905 simple_unlink(dentry->d_inode, d);
906 dput(d);
907 spin_lock(&dentry->d_lock);
908 } else
909 spin_unlock(&d->d_lock);
910 node = dentry->d_subdirs.next;
911 } 957 }
912 spin_unlock(&dentry->d_lock); 958 return -ENOENT;
959}
960
961static void cgroup_clear_directory(struct dentry *dir)
962{
963 struct cgroup *cgrp = __d_cgrp(dir);
964
965 while (!list_empty(&cgrp->files))
966 cgroup_rm_file(cgrp, NULL);
913} 967}
914 968
915/* 969/*
@@ -1294,6 +1348,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1294 if (ret) 1348 if (ret)
1295 goto out_unlock; 1349 goto out_unlock;
1296 1350
1351 /* See feature-removal-schedule.txt */
1352 if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent)
1353 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1354 task_tgid_nr(current), current->comm);
1355
1297 /* Don't allow flags or name to change at remount */ 1356 /* Don't allow flags or name to change at remount */
1298 if (opts.flags != root->flags || 1357 if (opts.flags != root->flags ||
1299 (opts.name && strcmp(opts.name, root->name))) { 1358 (opts.name && strcmp(opts.name, root->name))) {
@@ -1308,7 +1367,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1308 goto out_unlock; 1367 goto out_unlock;
1309 } 1368 }
1310 1369
1311 /* (re)populate subsystem files */ 1370 /* clear out any existing files and repopulate subsystem files */
1371 cgroup_clear_directory(cgrp->dentry);
1312 cgroup_populate_dir(cgrp); 1372 cgroup_populate_dir(cgrp);
1313 1373
1314 if (opts.release_agent) 1374 if (opts.release_agent)
@@ -1333,6 +1393,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1333{ 1393{
1334 INIT_LIST_HEAD(&cgrp->sibling); 1394 INIT_LIST_HEAD(&cgrp->sibling);
1335 INIT_LIST_HEAD(&cgrp->children); 1395 INIT_LIST_HEAD(&cgrp->children);
1396 INIT_LIST_HEAD(&cgrp->files);
1336 INIT_LIST_HEAD(&cgrp->css_sets); 1397 INIT_LIST_HEAD(&cgrp->css_sets);
1337 INIT_LIST_HEAD(&cgrp->release_list); 1398 INIT_LIST_HEAD(&cgrp->release_list);
1338 INIT_LIST_HEAD(&cgrp->pidlists); 1399 INIT_LIST_HEAD(&cgrp->pidlists);
@@ -1344,11 +1405,14 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1344static void init_cgroup_root(struct cgroupfs_root *root) 1405static void init_cgroup_root(struct cgroupfs_root *root)
1345{ 1406{
1346 struct cgroup *cgrp = &root->top_cgroup; 1407 struct cgroup *cgrp = &root->top_cgroup;
1408
1347 INIT_LIST_HEAD(&root->subsys_list); 1409 INIT_LIST_HEAD(&root->subsys_list);
1348 INIT_LIST_HEAD(&root->root_list); 1410 INIT_LIST_HEAD(&root->root_list);
1411 INIT_LIST_HEAD(&root->allcg_list);
1349 root->number_of_cgroups = 1; 1412 root->number_of_cgroups = 1;
1350 cgrp->root = root; 1413 cgrp->root = root;
1351 cgrp->top_cgroup = cgrp; 1414 cgrp->top_cgroup = cgrp;
1415 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1352 init_cgroup_housekeeping(cgrp); 1416 init_cgroup_housekeeping(cgrp);
1353} 1417}
1354 1418
@@ -1692,16 +1756,6 @@ static struct file_system_type cgroup_fs_type = {
1692 1756
1693static struct kobject *cgroup_kobj; 1757static struct kobject *cgroup_kobj;
1694 1758
1695static inline struct cgroup *__d_cgrp(struct dentry *dentry)
1696{
1697 return dentry->d_fsdata;
1698}
1699
1700static inline struct cftype *__d_cft(struct dentry *dentry)
1701{
1702 return dentry->d_fsdata;
1703}
1704
1705/** 1759/**
1706 * cgroup_path - generate the path of a cgroup 1760 * cgroup_path - generate the path of a cgroup
1707 * @cgrp: the cgroup in question 1761 * @cgrp: the cgroup in question
@@ -2172,6 +2226,18 @@ retry_find_task:
2172 2226
2173 if (threadgroup) 2227 if (threadgroup)
2174 tsk = tsk->group_leader; 2228 tsk = tsk->group_leader;
2229
2230 /*
2231 * Workqueue threads may acquire PF_THREAD_BOUND and become
2232 * trapped in a cpuset, or RT worker may be born in a cgroup
2233 * with no rt_runtime allocated. Just say no.
2234 */
2235 if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) {
2236 ret = -EINVAL;
2237 rcu_read_unlock();
2238 goto out_unlock_cgroup;
2239 }
2240
2175 get_task_struct(tsk); 2241 get_task_struct(tsk);
2176 rcu_read_unlock(); 2242 rcu_read_unlock();
2177 2243
@@ -2603,50 +2669,191 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2603 return mode; 2669 return mode;
2604} 2670}
2605 2671
2606int cgroup_add_file(struct cgroup *cgrp, 2672static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2607 struct cgroup_subsys *subsys, 2673 const struct cftype *cft)
2608 const struct cftype *cft)
2609{ 2674{
2610 struct dentry *dir = cgrp->dentry; 2675 struct dentry *dir = cgrp->dentry;
2676 struct cgroup *parent = __d_cgrp(dir);
2611 struct dentry *dentry; 2677 struct dentry *dentry;
2678 struct cfent *cfe;
2612 int error; 2679 int error;
2613 umode_t mode; 2680 umode_t mode;
2614
2615 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2681 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2682
2683 /* does @cft->flags tell us to skip creation on @cgrp? */
2684 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2685 return 0;
2686 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2687 return 0;
2688
2616 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 2689 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
2617 strcpy(name, subsys->name); 2690 strcpy(name, subsys->name);
2618 strcat(name, "."); 2691 strcat(name, ".");
2619 } 2692 }
2620 strcat(name, cft->name); 2693 strcat(name, cft->name);
2694
2621 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); 2695 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
2696
2697 cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
2698 if (!cfe)
2699 return -ENOMEM;
2700
2622 dentry = lookup_one_len(name, dir, strlen(name)); 2701 dentry = lookup_one_len(name, dir, strlen(name));
2623 if (!IS_ERR(dentry)) { 2702 if (IS_ERR(dentry)) {
2624 mode = cgroup_file_mode(cft);
2625 error = cgroup_create_file(dentry, mode | S_IFREG,
2626 cgrp->root->sb);
2627 if (!error)
2628 dentry->d_fsdata = (void *)cft;
2629 dput(dentry);
2630 } else
2631 error = PTR_ERR(dentry); 2703 error = PTR_ERR(dentry);
2704 goto out;
2705 }
2706
2707 mode = cgroup_file_mode(cft);
2708 error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
2709 if (!error) {
2710 cfe->type = (void *)cft;
2711 cfe->dentry = dentry;
2712 dentry->d_fsdata = cfe;
2713 list_add_tail(&cfe->node, &parent->files);
2714 cfe = NULL;
2715 }
2716 dput(dentry);
2717out:
2718 kfree(cfe);
2632 return error; 2719 return error;
2633} 2720}
2634EXPORT_SYMBOL_GPL(cgroup_add_file);
2635 2721
2636int cgroup_add_files(struct cgroup *cgrp, 2722static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2637 struct cgroup_subsys *subsys, 2723 const struct cftype cfts[], bool is_add)
2638 const struct cftype cft[],
2639 int count)
2640{ 2724{
2641 int i, err; 2725 const struct cftype *cft;
2642 for (i = 0; i < count; i++) { 2726 int err, ret = 0;
2643 err = cgroup_add_file(cgrp, subsys, &cft[i]); 2727
2644 if (err) 2728 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2645 return err; 2729 if (is_add)
2730 err = cgroup_add_file(cgrp, subsys, cft);
2731 else
2732 err = cgroup_rm_file(cgrp, cft);
2733 if (err) {
2734 pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
2735 is_add ? "add" : "remove", cft->name, err);
2736 ret = err;
2737 }
2738 }
2739 return ret;
2740}
2741
2742static DEFINE_MUTEX(cgroup_cft_mutex);
2743
2744static void cgroup_cfts_prepare(void)
2745 __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex)
2746{
2747 /*
2748 * Thanks to the entanglement with vfs inode locking, we can't walk
2749 * the existing cgroups under cgroup_mutex and create files.
2750 * Instead, we increment reference on all cgroups and build list of
2751 * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure
2752 * exclusive access to the field.
2753 */
2754 mutex_lock(&cgroup_cft_mutex);
2755 mutex_lock(&cgroup_mutex);
2756}
2757
2758static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2759 const struct cftype *cfts, bool is_add)
2760 __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
2761{
2762 LIST_HEAD(pending);
2763 struct cgroup *cgrp, *n;
2764
2765 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2766 if (cfts && ss->root != &rootnode) {
2767 list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) {
2768 dget(cgrp->dentry);
2769 list_add_tail(&cgrp->cft_q_node, &pending);
2770 }
2771 }
2772
2773 mutex_unlock(&cgroup_mutex);
2774
2775 /*
2776 * All new cgroups will see @cfts update on @ss->cftsets. Add/rm
2777 * files for all cgroups which were created before.
2778 */
2779 list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) {
2780 struct inode *inode = cgrp->dentry->d_inode;
2781
2782 mutex_lock(&inode->i_mutex);
2783 mutex_lock(&cgroup_mutex);
2784 if (!cgroup_is_removed(cgrp))
2785 cgroup_addrm_files(cgrp, ss, cfts, is_add);
2786 mutex_unlock(&cgroup_mutex);
2787 mutex_unlock(&inode->i_mutex);
2788
2789 list_del_init(&cgrp->cft_q_node);
2790 dput(cgrp->dentry);
2646 } 2791 }
2792
2793 mutex_unlock(&cgroup_cft_mutex);
2794}
2795
2796/**
2797 * cgroup_add_cftypes - add an array of cftypes to a subsystem
2798 * @ss: target cgroup subsystem
2799 * @cfts: zero-length name terminated array of cftypes
2800 *
2801 * Register @cfts to @ss. Files described by @cfts are created for all
2802 * existing cgroups to which @ss is attached and all future cgroups will
2803 * have them too. This function can be called anytime whether @ss is
2804 * attached or not.
2805 *
2806 * Returns 0 on successful registration, -errno on failure. Note that this
2807 * function currently returns 0 as long as @cfts registration is successful
2808 * even if some file creation attempts on existing cgroups fail.
2809 */
2810int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
2811{
2812 struct cftype_set *set;
2813
2814 set = kzalloc(sizeof(*set), GFP_KERNEL);
2815 if (!set)
2816 return -ENOMEM;
2817
2818 cgroup_cfts_prepare();
2819 set->cfts = cfts;
2820 list_add_tail(&set->node, &ss->cftsets);
2821 cgroup_cfts_commit(ss, cfts, true);
2822
2647 return 0; 2823 return 0;
2648} 2824}
2649EXPORT_SYMBOL_GPL(cgroup_add_files); 2825EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2826
2827/**
2828 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
2829 * @ss: target cgroup subsystem
2830 * @cfts: zero-length name terminated array of cftypes
2831 *
2832 * Unregister @cfts from @ss. Files described by @cfts are removed from
2833 * all existing cgroups to which @ss is attached and all future cgroups
2834 * won't have them either. This function can be called anytime whether @ss
2835 * is attached or not.
2836 *
2837 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2838 * registered with @ss.
2839 */
2840int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
2841{
2842 struct cftype_set *set;
2843
2844 cgroup_cfts_prepare();
2845
2846 list_for_each_entry(set, &ss->cftsets, node) {
2847 if (set->cfts == cfts) {
2848 list_del_init(&set->node);
2849 cgroup_cfts_commit(ss, cfts, false);
2850 return 0;
2851 }
2852 }
2853
2854 cgroup_cfts_commit(ss, NULL, false);
2855 return -ENOENT;
2856}
2650 2857
2651/** 2858/**
2652 * cgroup_task_count - count the number of tasks in a cgroup. 2859 * cgroup_task_count - count the number of tasks in a cgroup.
@@ -3625,13 +3832,14 @@ static struct cftype files[] = {
3625 .read_u64 = cgroup_clone_children_read, 3832 .read_u64 = cgroup_clone_children_read,
3626 .write_u64 = cgroup_clone_children_write, 3833 .write_u64 = cgroup_clone_children_write,
3627 }, 3834 },
3628}; 3835 {
3629 3836 .name = "release_agent",
3630static struct cftype cft_release_agent = { 3837 .flags = CFTYPE_ONLY_ON_ROOT,
3631 .name = "release_agent", 3838 .read_seq_string = cgroup_release_agent_show,
3632 .read_seq_string = cgroup_release_agent_show, 3839 .write_string = cgroup_release_agent_write,
3633 .write_string = cgroup_release_agent_write, 3840 .max_write_len = PATH_MAX,
3634 .max_write_len = PATH_MAX, 3841 },
3842 { } /* terminate */
3635}; 3843};
3636 3844
3637static int cgroup_populate_dir(struct cgroup *cgrp) 3845static int cgroup_populate_dir(struct cgroup *cgrp)
@@ -3639,22 +3847,18 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
3639 int err; 3847 int err;
3640 struct cgroup_subsys *ss; 3848 struct cgroup_subsys *ss;
3641 3849
3642 /* First clear out any existing files */ 3850 err = cgroup_addrm_files(cgrp, NULL, files, true);
3643 cgroup_clear_directory(cgrp->dentry);
3644
3645 err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
3646 if (err < 0) 3851 if (err < 0)
3647 return err; 3852 return err;
3648 3853
3649 if (cgrp == cgrp->top_cgroup) { 3854 /* process cftsets of each subsystem */
3650 if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
3651 return err;
3652 }
3653
3654 for_each_subsys(cgrp->root, ss) { 3855 for_each_subsys(cgrp->root, ss) {
3655 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) 3856 struct cftype_set *set;
3656 return err; 3857
3858 list_for_each_entry(set, &ss->cftsets, node)
3859 cgroup_addrm_files(cgrp, ss, set->cfts, true);
3657 } 3860 }
3861
3658 /* This cgroup is ready now */ 3862 /* This cgroup is ready now */
3659 for_each_subsys(cgrp->root, ss) { 3863 for_each_subsys(cgrp->root, ss) {
3660 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 3864 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
@@ -3670,6 +3874,14 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
3670 return 0; 3874 return 0;
3671} 3875}
3672 3876
3877static void css_dput_fn(struct work_struct *work)
3878{
3879 struct cgroup_subsys_state *css =
3880 container_of(work, struct cgroup_subsys_state, dput_work);
3881
3882 dput(css->cgroup->dentry);
3883}
3884
3673static void init_cgroup_css(struct cgroup_subsys_state *css, 3885static void init_cgroup_css(struct cgroup_subsys_state *css,
3674 struct cgroup_subsys *ss, 3886 struct cgroup_subsys *ss,
3675 struct cgroup *cgrp) 3887 struct cgroup *cgrp)
@@ -3682,6 +3894,16 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
3682 set_bit(CSS_ROOT, &css->flags); 3894 set_bit(CSS_ROOT, &css->flags);
3683 BUG_ON(cgrp->subsys[ss->subsys_id]); 3895 BUG_ON(cgrp->subsys[ss->subsys_id]);
3684 cgrp->subsys[ss->subsys_id] = css; 3896 cgrp->subsys[ss->subsys_id] = css;
3897
3898 /*
3899 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
3900 * which is put on the last css_put(). dput() requires process
3901 * context, which css_put() may be called without. @css->dput_work
3902 * will be used to invoke dput() asynchronously from css_put().
3903 */
3904 INIT_WORK(&css->dput_work, css_dput_fn);
3905 if (ss->__DEPRECATED_clear_css_refs)
3906 set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
3685} 3907}
3686 3908
3687static void cgroup_lock_hierarchy(struct cgroupfs_root *root) 3909static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
@@ -3784,9 +4006,16 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3784 if (err < 0) 4006 if (err < 0)
3785 goto err_remove; 4007 goto err_remove;
3786 4008
4009 /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
4010 for_each_subsys(root, ss)
4011 if (!ss->__DEPRECATED_clear_css_refs)
4012 dget(dentry);
4013
3787 /* The cgroup directory was pre-locked for us */ 4014 /* The cgroup directory was pre-locked for us */
3788 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); 4015 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
3789 4016
4017 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4018
3790 err = cgroup_populate_dir(cgrp); 4019 err = cgroup_populate_dir(cgrp);
3791 /* If err < 0, we have a half-filled directory - oh well ;) */ 4020 /* If err < 0, we have a half-filled directory - oh well ;) */
3792 4021
@@ -3826,18 +4055,19 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
3826 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4055 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
3827} 4056}
3828 4057
4058/*
4059 * Check the reference count on each subsystem. Since we already
4060 * established that there are no tasks in the cgroup, if the css refcount
4061 * is also 1, then there should be no outstanding references, so the
4062 * subsystem is safe to destroy. We scan across all subsystems rather than
4063 * using the per-hierarchy linked list of mounted subsystems since we can
4064 * be called via check_for_release() with no synchronization other than
4065 * RCU, and the subsystem linked list isn't RCU-safe.
4066 */
3829static int cgroup_has_css_refs(struct cgroup *cgrp) 4067static int cgroup_has_css_refs(struct cgroup *cgrp)
3830{ 4068{
3831 /* Check the reference count on each subsystem. Since we
3832 * already established that there are no tasks in the
3833 * cgroup, if the css refcount is also 1, then there should
3834 * be no outstanding references, so the subsystem is safe to
3835 * destroy. We scan across all subsystems rather than using
3836 * the per-hierarchy linked list of mounted subsystems since
3837 * we can be called via check_for_release() with no
3838 * synchronization other than RCU, and the subsystem linked
3839 * list isn't RCU-safe */
3840 int i; 4069 int i;
4070
3841 /* 4071 /*
3842 * We won't need to lock the subsys array, because the subsystems 4072 * We won't need to lock the subsys array, because the subsystems
3843 * we're concerned about aren't going anywhere since our cgroup root 4073 * we're concerned about aren't going anywhere since our cgroup root
@@ -3846,17 +4076,21 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
3846 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4076 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3847 struct cgroup_subsys *ss = subsys[i]; 4077 struct cgroup_subsys *ss = subsys[i];
3848 struct cgroup_subsys_state *css; 4078 struct cgroup_subsys_state *css;
4079
3849 /* Skip subsystems not present or not in this hierarchy */ 4080 /* Skip subsystems not present or not in this hierarchy */
3850 if (ss == NULL || ss->root != cgrp->root) 4081 if (ss == NULL || ss->root != cgrp->root)
3851 continue; 4082 continue;
4083
3852 css = cgrp->subsys[ss->subsys_id]; 4084 css = cgrp->subsys[ss->subsys_id];
3853 /* When called from check_for_release() it's possible 4085 /*
4086 * When called from check_for_release() it's possible
3854 * that by this point the cgroup has been removed 4087 * that by this point the cgroup has been removed
3855 * and the css deleted. But a false-positive doesn't 4088 * and the css deleted. But a false-positive doesn't
3856 * matter, since it can only happen if the cgroup 4089 * matter, since it can only happen if the cgroup
3857 * has been deleted and hence no longer needs the 4090 * has been deleted and hence no longer needs the
3858 * release agent to be called anyway. */ 4091 * release agent to be called anyway.
3859 if (css && (atomic_read(&css->refcnt) > 1)) 4092 */
4093 if (css && css_refcnt(css) > 1)
3860 return 1; 4094 return 1;
3861 } 4095 }
3862 return 0; 4096 return 0;
@@ -3866,51 +4100,63 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
3866 * Atomically mark all (or else none) of the cgroup's CSS objects as 4100 * Atomically mark all (or else none) of the cgroup's CSS objects as
3867 * CSS_REMOVED. Return true on success, or false if the cgroup has 4101 * CSS_REMOVED. Return true on success, or false if the cgroup has
3868 * busy subsystems. Call with cgroup_mutex held 4102 * busy subsystems. Call with cgroup_mutex held
4103 *
4104 * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
4105 * not, cgroup removal behaves differently.
4106 *
4107 * If clear is set, css refcnt for the subsystem should be zero before
4108 * cgroup removal can be committed. This is implemented by
4109 * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
4110 * called multiple times until all css refcnts reach zero and is allowed to
4111 * veto removal on any invocation. This behavior is deprecated and will be
4112 * removed as soon as the existing user (memcg) is updated.
4113 *
4114 * If clear is not set, each css holds an extra reference to the cgroup's
4115 * dentry and cgroup removal proceeds regardless of css refs.
4116 * ->pre_destroy() will be called at least once and is not allowed to fail.
4117 * On the last put of each css, whenever that may be, the extra dentry ref
4118 * is put so that dentry destruction happens only after all css's are
4119 * released.
3869 */ 4120 */
3870
3871static int cgroup_clear_css_refs(struct cgroup *cgrp) 4121static int cgroup_clear_css_refs(struct cgroup *cgrp)
3872{ 4122{
3873 struct cgroup_subsys *ss; 4123 struct cgroup_subsys *ss;
3874 unsigned long flags; 4124 unsigned long flags;
3875 bool failed = false; 4125 bool failed = false;
4126
3876 local_irq_save(flags); 4127 local_irq_save(flags);
4128
4129 /*
4130 * Block new css_tryget() by deactivating refcnt. If all refcnts
4131 * for subsystems w/ clear_css_refs set were 1 at the moment of
4132 * deactivation, we succeeded.
4133 */
3877 for_each_subsys(cgrp->root, ss) { 4134 for_each_subsys(cgrp->root, ss) {
3878 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4135 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3879 int refcnt; 4136
3880 while (1) { 4137 WARN_ON(atomic_read(&css->refcnt) < 0);
3881 /* We can only remove a CSS with a refcnt==1 */ 4138 atomic_add(CSS_DEACT_BIAS, &css->refcnt);
3882 refcnt = atomic_read(&css->refcnt); 4139
3883 if (refcnt > 1) { 4140 if (ss->__DEPRECATED_clear_css_refs)
3884 failed = true; 4141 failed |= css_refcnt(css) != 1;
3885 goto done;
3886 }
3887 BUG_ON(!refcnt);
3888 /*
3889 * Drop the refcnt to 0 while we check other
3890 * subsystems. This will cause any racing
3891 * css_tryget() to spin until we set the
3892 * CSS_REMOVED bits or abort
3893 */
3894 if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
3895 break;
3896 cpu_relax();
3897 }
3898 } 4142 }
3899 done: 4143
4144 /*
4145 * If succeeded, set REMOVED and put all the base refs; otherwise,
4146 * restore refcnts to positive values. Either way, all in-progress
4147 * css_tryget() will be released.
4148 */
3900 for_each_subsys(cgrp->root, ss) { 4149 for_each_subsys(cgrp->root, ss) {
3901 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4150 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3902 if (failed) { 4151
3903 /* 4152 if (!failed) {
3904 * Restore old refcnt if we previously managed
3905 * to clear it from 1 to 0
3906 */
3907 if (!atomic_read(&css->refcnt))
3908 atomic_set(&css->refcnt, 1);
3909 } else {
3910 /* Commit the fact that the CSS is removed */
3911 set_bit(CSS_REMOVED, &css->flags); 4153 set_bit(CSS_REMOVED, &css->flags);
4154 css_put(css);
4155 } else {
4156 atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
3912 } 4157 }
3913 } 4158 }
4159
3914 local_irq_restore(flags); 4160 local_irq_restore(flags);
3915 return !failed; 4161 return !failed;
3916} 4162}
@@ -3995,6 +4241,8 @@ again:
3995 list_del_init(&cgrp->sibling); 4241 list_del_init(&cgrp->sibling);
3996 cgroup_unlock_hierarchy(cgrp->root); 4242 cgroup_unlock_hierarchy(cgrp->root);
3997 4243
4244 list_del_init(&cgrp->allcg_node);
4245
3998 d = dget(cgrp->dentry); 4246 d = dget(cgrp->dentry);
3999 4247
4000 cgroup_d_remove_dir(d); 4248 cgroup_d_remove_dir(d);
@@ -4021,12 +4269,29 @@ again:
4021 return 0; 4269 return 0;
4022} 4270}
4023 4271
4272static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4273{
4274 INIT_LIST_HEAD(&ss->cftsets);
4275
4276 /*
4277 * base_cftset is embedded in subsys itself, no need to worry about
4278 * deregistration.
4279 */
4280 if (ss->base_cftypes) {
4281 ss->base_cftset.cfts = ss->base_cftypes;
4282 list_add_tail(&ss->base_cftset.node, &ss->cftsets);
4283 }
4284}
4285
4024static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4286static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4025{ 4287{
4026 struct cgroup_subsys_state *css; 4288 struct cgroup_subsys_state *css;
4027 4289
4028 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4290 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4029 4291
4292 /* init base cftset */
4293 cgroup_init_cftsets(ss);
4294
4030 /* Create the top cgroup state for this subsystem */ 4295 /* Create the top cgroup state for this subsystem */
4031 list_add(&ss->sibling, &rootnode.subsys_list); 4296 list_add(&ss->sibling, &rootnode.subsys_list);
4032 ss->root = &rootnode; 4297 ss->root = &rootnode;
@@ -4096,6 +4361,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4096 return 0; 4361 return 0;
4097 } 4362 }
4098 4363
4364 /* init base cftset */
4365 cgroup_init_cftsets(ss);
4366
4099 /* 4367 /*
4100 * need to register a subsys id before anything else - for example, 4368 * need to register a subsys id before anything else - for example,
4101 * init_cgroup_css needs it. 4369 * init_cgroup_css needs it.
@@ -4685,21 +4953,41 @@ static void check_for_release(struct cgroup *cgrp)
4685} 4953}
4686 4954
4687/* Caller must verify that the css is not for root cgroup */ 4955/* Caller must verify that the css is not for root cgroup */
4688void __css_put(struct cgroup_subsys_state *css, int count) 4956bool __css_tryget(struct cgroup_subsys_state *css)
4957{
4958 do {
4959 int v = css_refcnt(css);
4960
4961 if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
4962 return true;
4963 cpu_relax();
4964 } while (!test_bit(CSS_REMOVED, &css->flags));
4965
4966 return false;
4967}
4968EXPORT_SYMBOL_GPL(__css_tryget);
4969
4970/* Caller must verify that the css is not for root cgroup */
4971void __css_put(struct cgroup_subsys_state *css)
4689{ 4972{
4690 struct cgroup *cgrp = css->cgroup; 4973 struct cgroup *cgrp = css->cgroup;
4691 int val; 4974
4692 rcu_read_lock(); 4975 rcu_read_lock();
4693 val = atomic_sub_return(count, &css->refcnt); 4976 atomic_dec(&css->refcnt);
4694 if (val == 1) { 4977 switch (css_refcnt(css)) {
4978 case 1:
4695 if (notify_on_release(cgrp)) { 4979 if (notify_on_release(cgrp)) {
4696 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4980 set_bit(CGRP_RELEASABLE, &cgrp->flags);
4697 check_for_release(cgrp); 4981 check_for_release(cgrp);
4698 } 4982 }
4699 cgroup_wakeup_rmdir_waiter(cgrp); 4983 cgroup_wakeup_rmdir_waiter(cgrp);
4984 break;
4985 case 0:
4986 if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
4987 schedule_work(&css->dput_work);
4988 break;
4700 } 4989 }
4701 rcu_read_unlock(); 4990 rcu_read_unlock();
4702 WARN_ON_ONCE(val < 1);
4703} 4991}
4704EXPORT_SYMBOL_GPL(__css_put); 4992EXPORT_SYMBOL_GPL(__css_put);
4705 4993
@@ -4818,7 +5106,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
4818 * on this or this is under rcu_read_lock(). Once css->id is allocated, 5106 * on this or this is under rcu_read_lock(). Once css->id is allocated,
4819 * it's unchanged until freed. 5107 * it's unchanged until freed.
4820 */ 5108 */
4821 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); 5109 cssid = rcu_dereference_check(css->id, css_refcnt(css));
4822 5110
4823 if (cssid) 5111 if (cssid)
4824 return cssid->id; 5112 return cssid->id;
@@ -4830,7 +5118,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
4830{ 5118{
4831 struct css_id *cssid; 5119 struct css_id *cssid;
4832 5120
4833 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); 5121 cssid = rcu_dereference_check(css->id, css_refcnt(css));
4834 5122
4835 if (cssid) 5123 if (cssid)
4836 return cssid->depth; 5124 return cssid->depth;
@@ -5211,19 +5499,15 @@ static struct cftype debug_files[] = {
5211 .name = "releasable", 5499 .name = "releasable",
5212 .read_u64 = releasable_read, 5500 .read_u64 = releasable_read,
5213 }, 5501 },
5214};
5215 5502
5216static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) 5503 { } /* terminate */
5217{ 5504};
5218 return cgroup_add_files(cont, ss, debug_files,
5219 ARRAY_SIZE(debug_files));
5220}
5221 5505
5222struct cgroup_subsys debug_subsys = { 5506struct cgroup_subsys debug_subsys = {
5223 .name = "debug", 5507 .name = "debug",
5224 .create = debug_create, 5508 .create = debug_create,
5225 .destroy = debug_destroy, 5509 .destroy = debug_destroy,
5226 .populate = debug_populate,
5227 .subsys_id = debug_subsys_id, 5510 .subsys_id = debug_subsys_id,
5511 .base_cftypes = debug_files,
5228}; 5512};
5229#endif /* CONFIG_CGROUP_DEBUG */ 5513#endif /* CONFIG_CGROUP_DEBUG */
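The cgroup.c changes above replace the old ->populate()/cgroup_add_files() scheme with registered cftype sets: arrays are zero-name terminated, per-file CFTYPE_ONLY_ON_ROOT / CFTYPE_NOT_ON_ROOT flags replace open-coded parent checks, and a subsystem either points .base_cftypes at its array or calls cgroup_add_cftypes()/cgroup_rm_cftypes() at any time. A minimal sketch of the new pattern, mirroring the debug_subsys conversion in this diff (all "example" names are hypothetical):

static u64 example_count_read(struct cgroup *cgrp, struct cftype *cft)
{
	return 0;	/* placeholder value, illustration only */
}

static struct cftype example_files[] = {
	{
		.name = "count",		/* hypothetical control file */
		.flags = CFTYPE_NOT_ON_ROOT,	/* not created in the root cgroup */
		.read_u64 = example_count_read,
	},
	{ }	/* terminate */
};

/*
 * Built-in files: set .base_cftypes = example_files in the cgroup_subsys.
 * Files owned by a module can instead be added and removed at runtime:
 *
 *	cgroup_add_cftypes(&example_subsys, example_files);
 *	cgroup_rm_cftypes(&example_subsys, example_files);
 */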
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index f86e93920b62..3649fc6b3eaa 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -358,24 +358,19 @@ static int freezer_write(struct cgroup *cgroup,
 static struct cftype files[] = {
 	{
 		.name = "state",
+		.flags = CFTYPE_NOT_ON_ROOT,
 		.read_seq_string = freezer_read,
 		.write_string = freezer_write,
 	},
+	{ }	/* terminate */
 };
 
-static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
-{
-	if (!cgroup->parent)
-		return 0;
-	return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
-}
-
 struct cgroup_subsys freezer_subsys = {
 	.name = "freezer",
 	.create = freezer_create,
 	.destroy = freezer_destroy,
-	.populate = freezer_populate,
 	.subsys_id = freezer_subsys_id,
 	.can_attach = freezer_can_attach,
 	.fork = freezer_fork,
+	.base_cftypes = files,
 };
diff --git a/kernel/compat.c b/kernel/compat.c
index 74ff8498809a..d2c67aa49ae6 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -372,25 +372,54 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
 
 #ifdef __ARCH_WANT_SYS_SIGPROCMASK
 
-asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
-		compat_old_sigset_t __user *oset)
+/*
+ * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the
+ * blocked set of signals to the supplied signal set
+ */
+static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
 {
-	old_sigset_t s;
-	long ret;
-	mm_segment_t old_fs;
-
-	if (set && get_user(s, set))
-		return -EFAULT;
-	old_fs = get_fs();
-	set_fs(KERNEL_DS);
-	ret = sys_sigprocmask(how,
-			set ? (old_sigset_t __user *) &s : NULL,
-			oset ? (old_sigset_t __user *) &s : NULL);
-	set_fs(old_fs);
-	if (ret == 0)
-		if (oset)
-			ret = put_user(s, oset);
-	return ret;
+	memcpy(blocked->sig, &set, sizeof(set));
+}
+
+asmlinkage long compat_sys_sigprocmask(int how,
+				       compat_old_sigset_t __user *nset,
+				       compat_old_sigset_t __user *oset)
+{
+	old_sigset_t old_set, new_set;
+	sigset_t new_blocked;
+
+	old_set = current->blocked.sig[0];
+
+	if (nset) {
+		if (get_user(new_set, nset))
+			return -EFAULT;
+		new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
+
+		new_blocked = current->blocked;
+
+		switch (how) {
+		case SIG_BLOCK:
+			sigaddsetmask(&new_blocked, new_set);
+			break;
+		case SIG_UNBLOCK:
+			sigdelsetmask(&new_blocked, new_set);
+			break;
+		case SIG_SETMASK:
+			compat_sig_setmask(&new_blocked, new_set);
+			break;
+		default:
+			return -EINVAL;
+		}
+
+		set_current_blocked(&new_blocked);
+	}
+
+	if (oset) {
+		if (put_user(old_set, oset))
+			return -EFAULT;
+	}
+
+	return 0;
 }
 
 #endif
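The rewritten compat_sys_sigprocmask() above manipulates current->blocked directly instead of bouncing through sys_sigprocmask() under set_fs(KERNEL_DS), and SIG_SETMASK now replaces only the low (compat) word of the blocked mask via compat_sig_setmask(). A standalone illustration of the three `how` cases, using plain integers in place of sigset_t (userspace sketch, not kernel code):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t blocked = 0x00f0;	/* current blocked mask */
	uint32_t set = 0x000f;		/* mask supplied by the caller */

	assert((blocked | set) == 0x00ff);		/* SIG_BLOCK: add bits */
	assert((blocked & ~(uint64_t)set) == 0x00f0);	/* SIG_UNBLOCK: clear bits */
	/* SIG_SETMASK replaces only the low word, cf. compat_sig_setmask() */
	assert(((blocked & ~0xffffffffull) | set) == 0x000f);
	return 0;
}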
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2060c6e57027..0e6353cf147a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -17,6 +17,8 @@
 #include <linux/gfp.h>
 #include <linux/suspend.h>
 
+#include "smpboot.h"
+
 #ifdef CONFIG_SMP
 /* Serializes the updates to cpu_online_mask, cpu_present_mask */
 static DEFINE_MUTEX(cpu_add_remove_lock);
@@ -295,11 +297,19 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 	int ret, nr_calls = 0;
 	void *hcpu = (void *)(long)cpu;
 	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
+	struct task_struct *idle;
 
 	if (cpu_online(cpu) || !cpu_present(cpu))
 		return -EINVAL;
 
 	cpu_hotplug_begin();
+
+	idle = idle_thread_get(cpu);
+	if (IS_ERR(idle)) {
+		ret = PTR_ERR(idle);
+		goto out;
+	}
+
 	ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
 	if (ret) {
 		nr_calls--;
@@ -309,7 +319,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 	}
 
 	/* Arch-specific enabling code. */
-	ret = __cpu_up(cpu);
+	ret = __cpu_up(cpu, idle);
 	if (ret != 0)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
@@ -320,6 +330,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 out_notify:
 	if (ret != 0)
 		__cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+out:
 	cpu_hotplug_done();
 
 	return ret;
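With the change above, the generic hotplug core obtains a CPU's idle task from the new smpboot code (idle_thread_get()) and hands it to the architecture, so __cpu_up() implementations no longer fork their own idle thread. A hypothetical arch-side sketch of the new calling convention (the per-CPU slot and boot helper names are made up; real implementations live in each architecture's SMP boot code):

/* Illustrative only: the arch receives the idle task instead of creating it. */
int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *idle)
{
	/* remember which task the new CPU should run as its idle loop */
	per_cpu(example_idle_task, cpu) = idle;		/* hypothetical per-CPU slot */

	return example_boot_secondary(cpu, idle);	/* hypothetical arch helper */
}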
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b96ad75b7e64..8c8bd652dd12 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -270,11 +270,11 @@ static struct file_system_type cpuset_fs_type = {
  * are online. If none are online, walk up the cpuset hierarchy
  * until we find one that does have some online cpus. If we get
  * all the way to the top and still haven't found any online cpus,
- * return cpu_online_map. Or if passed a NULL cs from an exit'ing
- * task, return cpu_online_map.
+ * return cpu_online_mask. Or if passed a NULL cs from an exit'ing
+ * task, return cpu_online_mask.
  *
  * One way or another, we guarantee to return some non-empty subset
- * of cpu_online_map.
+ * of cpu_online_mask.
  *
  * Call with callback_mutex held.
  */
@@ -867,7 +867,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	int retval;
 	int is_load_balanced;
 
-	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
+	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
 	if (cs == &top_cpuset)
 		return -EACCES;
 
@@ -1765,28 +1765,17 @@ static struct cftype files[] = {
 		.write_u64 = cpuset_write_u64,
 		.private = FILE_SPREAD_SLAB,
 	},
-};
-
-static struct cftype cft_memory_pressure_enabled = {
-	.name = "memory_pressure_enabled",
-	.read_u64 = cpuset_read_u64,
-	.write_u64 = cpuset_write_u64,
-	.private = FILE_MEMORY_PRESSURE_ENABLED,
-};
 
-static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
-	int err;
+	{
+		.name = "memory_pressure_enabled",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_MEMORY_PRESSURE_ENABLED,
+	},
 
-	err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
-	if (err)
-		return err;
-	/* memory_pressure_enabled is in root cpuset only */
-	if (!cont->parent)
-		err = cgroup_add_file(cont, ss,
-					&cft_memory_pressure_enabled);
-	return err;
-}
+	{ } /* terminate */
+};
 
 /*
  * post_clone() is called during cgroup_create() when the
@@ -1887,9 +1876,9 @@ struct cgroup_subsys cpuset_subsys = {
 	.destroy = cpuset_destroy,
 	.can_attach = cpuset_can_attach,
 	.attach = cpuset_attach,
-	.populate = cpuset_populate,
 	.post_clone = cpuset_post_clone,
 	.subsys_id = cpuset_subsys_id,
+	.base_cftypes = files,
 	.early_init = 1,
 };
1895 1884
@@ -2149,7 +2138,7 @@ void __init cpuset_init_smp(void)
2149 * 2138 *
2150 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset 2139 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
2151 * attached to the specified @tsk. Guaranteed to return some non-empty 2140 * attached to the specified @tsk. Guaranteed to return some non-empty
2152 * subset of cpu_online_map, even if this means going outside the 2141 * subset of cpu_online_mask, even if this means going outside the
2153 * tasks cpuset. 2142 * tasks cpuset.
2154 **/ 2143 **/
2155 2144
diff --git a/kernel/cred.c b/kernel/cred.c
index eddc5e2e9587..430557ea488f 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -396,6 +396,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 	struct cred *new;
 	int ret;
 
+	p->replacement_session_keyring = NULL;
+
 	if (
 #ifdef CONFIG_KEYS
 		!p->cred->thread_keyring &&
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 1dc53bae56e1..0557f24c6bca 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -160,37 +160,39 @@ early_param("nokgdbroundup", opt_nokgdbroundup);
  * Weak aliases for breakpoint management,
  * can be overriden by architectures when needed:
  */
-int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
+int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
 {
 	int err;
 
-	err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE);
+	err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
+				BREAK_INSTR_SIZE);
 	if (err)
 		return err;
-
-	return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr,
-				  BREAK_INSTR_SIZE);
+	err = probe_kernel_write((char *)bpt->bpt_addr,
+				 arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
+	return err;
 }
 
-int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
+int __weak kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
 {
-	return probe_kernel_write((char *)addr,
-				  (char *)bundle, BREAK_INSTR_SIZE);
+	return probe_kernel_write((char *)bpt->bpt_addr,
+				  (char *)bpt->saved_instr, BREAK_INSTR_SIZE);
 }
 
 int __weak kgdb_validate_break_address(unsigned long addr)
 {
-	char tmp_variable[BREAK_INSTR_SIZE];
+	struct kgdb_bkpt tmp;
 	int err;
-	/* Validate setting the breakpoint and then removing it. In the
+	/* Validate setting the breakpoint and then removing it. If the
 	 * remove fails, the kernel needs to emit a bad message because we
 	 * are deep trouble not being able to put things back the way we
 	 * found them.
 	 */
-	err = kgdb_arch_set_breakpoint(addr, tmp_variable);
+	tmp.bpt_addr = addr;
+	err = kgdb_arch_set_breakpoint(&tmp);
 	if (err)
 		return err;
-	err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
+	err = kgdb_arch_remove_breakpoint(&tmp);
 	if (err)
 		printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
 		       "memory destroyed at: %lx", addr);
@@ -234,7 +236,6 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
  */
 int dbg_activate_sw_breakpoints(void)
 {
-	unsigned long addr;
 	int error;
 	int ret = 0;
 	int i;
@@ -243,16 +244,15 @@ int dbg_activate_sw_breakpoints(void)
 		if (kgdb_break[i].state != BP_SET)
 			continue;
 
-		addr = kgdb_break[i].bpt_addr;
-		error = kgdb_arch_set_breakpoint(addr,
-						 kgdb_break[i].saved_instr);
+		error = kgdb_arch_set_breakpoint(&kgdb_break[i]);
 		if (error) {
 			ret = error;
-			printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
+			printk(KERN_INFO "KGDB: BP install failed: %lx",
+			       kgdb_break[i].bpt_addr);
 			continue;
 		}
 
-		kgdb_flush_swbreak_addr(addr);
+		kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr);
 		kgdb_break[i].state = BP_ACTIVE;
 	}
 	return ret;
@@ -301,7 +301,6 @@ int dbg_set_sw_break(unsigned long addr)
 
 int dbg_deactivate_sw_breakpoints(void)
 {
-	unsigned long addr;
 	int error;
 	int ret = 0;
 	int i;
@@ -309,15 +308,14 @@ int dbg_deactivate_sw_breakpoints(void)
 	for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
 		if (kgdb_break[i].state != BP_ACTIVE)
 			continue;
-		addr = kgdb_break[i].bpt_addr;
-		error = kgdb_arch_remove_breakpoint(addr,
-						    kgdb_break[i].saved_instr);
+		error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
 		if (error) {
-			printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
+			printk(KERN_INFO "KGDB: BP remove failed: %lx\n",
+			       kgdb_break[i].bpt_addr);
 			ret = error;
 		}
 
-		kgdb_flush_swbreak_addr(addr);
+		kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr);
 		kgdb_break[i].state = BP_SET;
 	}
 	return ret;
@@ -351,7 +349,6 @@ int kgdb_isremovedbreak(unsigned long addr)
 
 int dbg_remove_all_break(void)
 {
-	unsigned long addr;
 	int error;
 	int i;
 
@@ -359,12 +356,10 @@ int dbg_remove_all_break(void)
 	for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
 		if (kgdb_break[i].state != BP_ACTIVE)
 			goto setundefined;
-		addr = kgdb_break[i].bpt_addr;
-		error = kgdb_arch_remove_breakpoint(addr,
-						    kgdb_break[i].saved_instr);
+		error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
 		if (error)
 			printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
-			       addr);
+			       kgdb_break[i].bpt_addr);
 setundefined:
 		kgdb_break[i].state = BP_UNDEFINED;
 	}
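Because the weak helpers above now take the whole struct kgdb_bkpt, an architecture that overrides them receives the breakpoint address and the save buffer in one argument. A hypothetical override sketch following the same contract (ARCH_BREAK_INSTR stands in for the architecture's trap opcode bytes; bpt_addr and saved_instr are the real fields used above):

/* Illustrative only: an arch-specific replacement for the weak helper. */
int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
{
	int err;

	/* save the original bytes so kgdb_arch_remove_breakpoint() can restore them */
	err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
				BREAK_INSTR_SIZE);
	if (err)
		return err;

	return probe_kernel_write((char *)bpt->bpt_addr, ARCH_BREAK_INSTR,
				  BREAK_INSTR_SIZE);
}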
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 9b5f17da1c56..bb9520f0f6ff 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -743,7 +743,7 @@ kdb_printit:
 		kdb_input_flush();
 		c = console_drivers;
 
-		if (!dbg_io_ops->is_console) {
+		if (dbg_io_ops && !dbg_io_ops->is_console) {
 			len = strlen(moreprompt);
 			cp = moreprompt;
 			while (len--) {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a6a9ec4cd8f5..5b06cbbf6931 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3183,7 +3183,7 @@ static void perf_event_for_each(struct perf_event *event,
 	perf_event_for_each_child(event, func);
 	func(event);
 	list_for_each_entry(sibling, &event->sibling_list, group_entry)
-		perf_event_for_each_child(event, func);
+		perf_event_for_each_child(sibling, func);
 	mutex_unlock(&ctx->mutex);
 }
 
@@ -4957,7 +4957,7 @@ void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
 	if (rctx < 0)
 		return;
 
-	perf_sample_data_init(&data, addr);
+	perf_sample_data_init(&data, addr, 0);
 
 	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
 
@@ -5215,7 +5215,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
 		.data = record,
 	};
 
-	perf_sample_data_init(&data, addr);
+	perf_sample_data_init(&data, addr, 0);
 	data.raw = &raw;
 
 	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
@@ -5318,7 +5318,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
5318 struct perf_sample_data sample; 5318 struct perf_sample_data sample;
5319 struct pt_regs *regs = data; 5319 struct pt_regs *regs = data;
5320 5320
5321 perf_sample_data_init(&sample, bp->attr.bp_addr); 5321 perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
5322 5322
5323 if (!bp->hw.state && !perf_exclude_event(bp, regs)) 5323 if (!bp->hw.state && !perf_exclude_event(bp, regs))
5324 perf_swevent_event(bp, 1, &sample, regs); 5324 perf_swevent_event(bp, 1, &sample, regs);
@@ -5344,13 +5344,12 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5344 5344
5345 event->pmu->read(event); 5345 event->pmu->read(event);
5346 5346
5347 perf_sample_data_init(&data, 0); 5347 perf_sample_data_init(&data, 0, event->hw.last_period);
5348 data.period = event->hw.last_period;
5349 regs = get_irq_regs(); 5348 regs = get_irq_regs();
5350 5349
5351 if (regs && !perf_exclude_event(event, regs)) { 5350 if (regs && !perf_exclude_event(event, regs)) {
5352 if (!(event->attr.exclude_idle && is_idle_task(current))) 5351 if (!(event->attr.exclude_idle && is_idle_task(current)))
5353 if (perf_event_overflow(event, &data, regs)) 5352 if (__perf_event_overflow(event, 1, &data, regs))
5354 ret = HRTIMER_NORESTART; 5353 ret = HRTIMER_NORESTART;
5355 } 5354 }
5356 5355
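
The perf hunks above reflect perf_sample_data_init() taking the sampling period as a third argument, so callers no longer assign data.period separately. A minimal sketch of a hypothetical overflow path using the new form (my_pmu_overflow and the surrounding driver are invented for illustration):

#include <linux/perf_event.h>
#include <asm/irq_regs.h>

static void my_pmu_overflow(struct perf_event *event)
{
	struct perf_sample_data data;
	struct pt_regs *regs = get_irq_regs();

	/* old: perf_sample_data_init(&data, 0); data.period = event->hw.last_period; */
	perf_sample_data_init(&data, 0, event->hw.last_period);

	if (regs)
		perf_event_overflow(event, &data, regs);
}
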
diff --git a/kernel/extable.c b/kernel/extable.c
index 5339705b8241..fe35a634bf76 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -35,10 +35,16 @@ DEFINE_MUTEX(text_mutex);
35extern struct exception_table_entry __start___ex_table[]; 35extern struct exception_table_entry __start___ex_table[];
36extern struct exception_table_entry __stop___ex_table[]; 36extern struct exception_table_entry __stop___ex_table[];
37 37
38/* Cleared by build time tools if the table is already sorted. */
39u32 __initdata main_extable_sort_needed = 1;
40
38/* Sort the kernel's built-in exception table */ 41/* Sort the kernel's built-in exception table */
39void __init sort_main_extable(void) 42void __init sort_main_extable(void)
40{ 43{
41 sort_extable(__start___ex_table, __stop___ex_table); 44 if (main_extable_sort_needed)
45 sort_extable(__start___ex_table, __stop___ex_table);
46 else
47 pr_notice("__ex_table already sorted, skipping sort\n");
42} 48}
43 49
44/* Given an address, look for it in the exception tables. */ 50/* Given an address, look for it in the exception tables. */
diff --git a/kernel/fork.c b/kernel/fork.c
index b9372a0bff18..05c813dc9ecc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -34,6 +34,7 @@
34#include <linux/cgroup.h> 34#include <linux/cgroup.h>
35#include <linux/security.h> 35#include <linux/security.h>
36#include <linux/hugetlb.h> 36#include <linux/hugetlb.h>
37#include <linux/seccomp.h>
37#include <linux/swap.h> 38#include <linux/swap.h>
38#include <linux/syscalls.h> 39#include <linux/syscalls.h>
39#include <linux/jiffies.h> 40#include <linux/jiffies.h>
@@ -47,6 +48,7 @@
47#include <linux/audit.h> 48#include <linux/audit.h>
48#include <linux/memcontrol.h> 49#include <linux/memcontrol.h>
49#include <linux/ftrace.h> 50#include <linux/ftrace.h>
51#include <linux/proc_fs.h>
50#include <linux/profile.h> 52#include <linux/profile.h>
51#include <linux/rmap.h> 53#include <linux/rmap.h>
52#include <linux/ksm.h> 54#include <linux/ksm.h>
@@ -111,32 +113,67 @@ int nr_processes(void)
111 return total; 113 return total;
112} 114}
113 115
114#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR 116#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
115# define alloc_task_struct_node(node) \
116 kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
117# define free_task_struct(tsk) \
118 kmem_cache_free(task_struct_cachep, (tsk))
119static struct kmem_cache *task_struct_cachep; 117static struct kmem_cache *task_struct_cachep;
118
119static inline struct task_struct *alloc_task_struct_node(int node)
120{
121 return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
122}
123
124void __weak arch_release_task_struct(struct task_struct *tsk) { }
125
126static inline void free_task_struct(struct task_struct *tsk)
127{
128 arch_release_task_struct(tsk);
129 kmem_cache_free(task_struct_cachep, tsk);
130}
120#endif 131#endif
121 132
122#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR 133#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
134void __weak arch_release_thread_info(struct thread_info *ti) { }
135
136/*
137 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
138 * kmemcache based allocator.
139 */
140# if THREAD_SIZE >= PAGE_SIZE
123static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, 141static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
124 int node) 142 int node)
125{ 143{
126#ifdef CONFIG_DEBUG_STACK_USAGE 144 struct page *page = alloc_pages_node(node, THREADINFO_GFP,
127 gfp_t mask = GFP_KERNEL | __GFP_ZERO; 145 THREAD_SIZE_ORDER);
128#else
129 gfp_t mask = GFP_KERNEL;
130#endif
131 struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
132 146
133 return page ? page_address(page) : NULL; 147 return page ? page_address(page) : NULL;
134} 148}
135 149
136static inline void free_thread_info(struct thread_info *ti) 150static inline void free_thread_info(struct thread_info *ti)
137{ 151{
152 arch_release_thread_info(ti);
138 free_pages((unsigned long)ti, THREAD_SIZE_ORDER); 153 free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
139} 154}
155# else
156static struct kmem_cache *thread_info_cache;
157
158static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
159 int node)
160{
161 return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
162}
163
164static void free_thread_info(struct thread_info *ti)
165{
166 arch_release_thread_info(ti);
167 kmem_cache_free(thread_info_cache, ti);
168}
169
170void thread_info_cache_init(void)
171{
172 thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
173 THREAD_SIZE, 0, NULL);
174 BUG_ON(thread_info_cache == NULL);
175}
176# endif
140#endif 177#endif
141 178
142/* SLAB cache for signal_struct structures (tsk->signal) */ 179/* SLAB cache for signal_struct structures (tsk->signal) */
@@ -170,6 +207,7 @@ void free_task(struct task_struct *tsk)
170 free_thread_info(tsk->stack); 207 free_thread_info(tsk->stack);
171 rt_mutex_debug_task_free(tsk); 208 rt_mutex_debug_task_free(tsk);
172 ftrace_graph_exit_task(tsk); 209 ftrace_graph_exit_task(tsk);
210 put_seccomp_filter(tsk);
173 free_task_struct(tsk); 211 free_task_struct(tsk);
174} 212}
175EXPORT_SYMBOL(free_task); 213EXPORT_SYMBOL(free_task);
@@ -203,17 +241,11 @@ void __put_task_struct(struct task_struct *tsk)
203} 241}
204EXPORT_SYMBOL_GPL(__put_task_struct); 242EXPORT_SYMBOL_GPL(__put_task_struct);
205 243
206/* 244void __init __weak arch_task_cache_init(void) { }
207 * macro override instead of weak attribute alias, to workaround
208 * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
209 */
210#ifndef arch_task_cache_init
211#define arch_task_cache_init()
212#endif
213 245
214void __init fork_init(unsigned long mempages) 246void __init fork_init(unsigned long mempages)
215{ 247{
216#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR 248#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
217#ifndef ARCH_MIN_TASKALIGN 249#ifndef ARCH_MIN_TASKALIGN
218#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES 250#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
219#endif 251#endif
@@ -260,8 +292,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
260 int node = tsk_fork_get_node(orig); 292 int node = tsk_fork_get_node(orig);
261 int err; 293 int err;
262 294
263 prepare_to_copy(orig);
264
265 tsk = alloc_task_struct_node(node); 295 tsk = alloc_task_struct_node(node);
266 if (!tsk) 296 if (!tsk)
267 return NULL; 297 return NULL;
@@ -1162,6 +1192,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1162 goto fork_out; 1192 goto fork_out;
1163 1193
1164 ftrace_graph_init_task(p); 1194 ftrace_graph_init_task(p);
1195 get_seccomp_filter(p);
1165 1196
1166 rt_mutex_init_task(p); 1197 rt_mutex_init_task(p);
1167 1198
@@ -1464,6 +1495,8 @@ bad_fork_cleanup_io:
1464 if (p->io_context) 1495 if (p->io_context)
1465 exit_io_context(p); 1496 exit_io_context(p);
1466bad_fork_cleanup_namespaces: 1497bad_fork_cleanup_namespaces:
1498 if (unlikely(clone_flags & CLONE_NEWPID))
1499 pid_ns_release_proc(p->nsproxy->pid_ns);
1467 exit_task_namespaces(p); 1500 exit_task_namespaces(p);
1468bad_fork_cleanup_mm: 1501bad_fork_cleanup_mm:
1469 if (p->mm) 1502 if (p->mm)
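
The fork.c changes replace the old macro/ifdef allocators with inline helpers plus weak arch_release_task_struct()/arch_release_thread_info() hooks. A minimal sketch of how an architecture could override the weak thread_info hook; the body is hypothetical, only the signature comes from the hunk above:

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/thread_info.h>

/* Overrides the __weak default; a real architecture would tear down
 * whatever per-thread state it hangs off the thread_info here. */
void arch_release_thread_info(struct thread_info *ti)
{
	pr_debug("arch: releasing thread_info %p before its stack is freed\n", ti);
}
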
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index c21449f85a2a..6df614912b9d 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -108,8 +108,10 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
108 108
109 touch_nmi_watchdog(); 109 touch_nmi_watchdog();
110 110
111 if (sysctl_hung_task_panic) 111 if (sysctl_hung_task_panic) {
112 trigger_all_cpu_backtrace();
112 panic("hung_task: blocked tasks"); 113 panic("hung_task: blocked tasks");
114 }
113} 115}
114 116
115/* 117/*
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index cf1a4a68ce44..d1a758bc972a 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -62,7 +62,7 @@ config IRQ_DOMAIN_DEBUG
62 help 62 help
63 This option will show the mapping relationship between hardware irq 63 This option will show the mapping relationship between hardware irq
64 numbers and Linux irq numbers. The mapping is exposed via debugfs 64 numbers and Linux irq numbers. The mapping is exposed via debugfs
65 in the file "virq_mapping". 65 in the file "irq_domain_mapping".
66 66
67 If you don't know what this means you don't need it. 67 If you don't know what this means you don't need it.
68 68
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6080f6bc8c33..fc275e4f629b 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -379,8 +379,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
 379 * If it's disabled or no action available 379 * If it's disabled or no action available
380 * keep it masked and get out of here 380 * keep it masked and get out of here
381 */ 381 */
382 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) 382 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
383 desc->istate |= IRQS_PENDING;
383 goto out_unlock; 384 goto out_unlock;
385 }
384 386
385 handle_irq_event(desc); 387 handle_irq_event(desc);
386 388
@@ -518,6 +520,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
518out_unlock: 520out_unlock:
519 raw_spin_unlock(&desc->lock); 521 raw_spin_unlock(&desc->lock);
520} 522}
523EXPORT_SYMBOL(handle_edge_irq);
521 524
522#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER 525#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
523/** 526/**
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index 97a8bfadc88a..e75e29e4434a 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -4,10 +4,10 @@
4 4
5#include <linux/kallsyms.h> 5#include <linux/kallsyms.h>
6 6
7#define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) 7#define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f)
8#define PS(f) if (desc->istate & f) printk("%14s set\n", #f) 8#define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f)
9/* FIXME */ 9/* FIXME */
10#define PD(f) do { } while (0) 10#define ___PD(f) do { } while (0)
11 11
12static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) 12static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
13{ 13{
@@ -23,23 +23,23 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
23 print_symbol("%s\n", (unsigned long)desc->action->handler); 23 print_symbol("%s\n", (unsigned long)desc->action->handler);
24 } 24 }
25 25
26 P(IRQ_LEVEL); 26 ___P(IRQ_LEVEL);
27 P(IRQ_PER_CPU); 27 ___P(IRQ_PER_CPU);
28 P(IRQ_NOPROBE); 28 ___P(IRQ_NOPROBE);
29 P(IRQ_NOREQUEST); 29 ___P(IRQ_NOREQUEST);
30 P(IRQ_NOTHREAD); 30 ___P(IRQ_NOTHREAD);
31 P(IRQ_NOAUTOEN); 31 ___P(IRQ_NOAUTOEN);
32 32
33 PS(IRQS_AUTODETECT); 33 ___PS(IRQS_AUTODETECT);
34 PS(IRQS_REPLAY); 34 ___PS(IRQS_REPLAY);
35 PS(IRQS_WAITING); 35 ___PS(IRQS_WAITING);
36 PS(IRQS_PENDING); 36 ___PS(IRQS_PENDING);
37 37
38 PD(IRQS_INPROGRESS); 38 ___PD(IRQS_INPROGRESS);
39 PD(IRQS_DISABLED); 39 ___PD(IRQS_DISABLED);
40 PD(IRQS_MASKED); 40 ___PD(IRQS_MASKED);
41} 41}
42 42
43#undef P 43#undef ___P
44#undef PS 44#undef ___PS
45#undef PD 45#undef ___PD
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index d86e254b95eb..192a302d6cfd 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -112,6 +112,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
112{ 112{
113 return radix_tree_lookup(&irq_desc_tree, irq); 113 return radix_tree_lookup(&irq_desc_tree, irq);
114} 114}
115EXPORT_SYMBOL(irq_to_desc);
115 116
116static void delete_irq_desc(unsigned int irq) 117static void delete_irq_desc(unsigned int irq)
117{ 118{
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 3601f3fbf67c..0e0ba5f840b2 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -23,7 +23,6 @@ static LIST_HEAD(irq_domain_list);
23static DEFINE_MUTEX(irq_domain_mutex); 23static DEFINE_MUTEX(irq_domain_mutex);
24 24
25static DEFINE_MUTEX(revmap_trees_mutex); 25static DEFINE_MUTEX(revmap_trees_mutex);
26static unsigned int irq_virq_count = NR_IRQS;
27static struct irq_domain *irq_default_domain; 26static struct irq_domain *irq_default_domain;
28 27
29/** 28/**
@@ -184,13 +183,16 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node,
184} 183}
185 184
186struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, 185struct irq_domain *irq_domain_add_nomap(struct device_node *of_node,
186 unsigned int max_irq,
187 const struct irq_domain_ops *ops, 187 const struct irq_domain_ops *ops,
188 void *host_data) 188 void *host_data)
189{ 189{
190 struct irq_domain *domain = irq_domain_alloc(of_node, 190 struct irq_domain *domain = irq_domain_alloc(of_node,
191 IRQ_DOMAIN_MAP_NOMAP, ops, host_data); 191 IRQ_DOMAIN_MAP_NOMAP, ops, host_data);
192 if (domain) 192 if (domain) {
193 domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0;
193 irq_domain_add(domain); 194 irq_domain_add(domain);
195 }
194 return domain; 196 return domain;
195} 197}
196 198
@@ -262,22 +264,6 @@ void irq_set_default_host(struct irq_domain *domain)
262 irq_default_domain = domain; 264 irq_default_domain = domain;
263} 265}
264 266
265/**
266 * irq_set_virq_count() - Set the maximum number of linux irqs
267 * @count: number of linux irqs, capped with NR_IRQS
268 *
269 * This is mainly for use by platforms like iSeries who want to program
270 * the virtual irq number in the controller to avoid the reverse mapping
271 */
272void irq_set_virq_count(unsigned int count)
273{
274 pr_debug("irq: Trying to set virq count to %d\n", count);
275
276 BUG_ON(count < NUM_ISA_INTERRUPTS);
277 if (count < NR_IRQS)
278 irq_virq_count = count;
279}
280
281static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, 267static int irq_setup_virq(struct irq_domain *domain, unsigned int virq,
282 irq_hw_number_t hwirq) 268 irq_hw_number_t hwirq)
283{ 269{
@@ -320,13 +306,12 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
320 pr_debug("irq: create_direct virq allocation failed\n"); 306 pr_debug("irq: create_direct virq allocation failed\n");
321 return 0; 307 return 0;
322 } 308 }
323 if (virq >= irq_virq_count) { 309 if (virq >= domain->revmap_data.nomap.max_irq) {
324 pr_err("ERROR: no free irqs available below %i maximum\n", 310 pr_err("ERROR: no free irqs available below %i maximum\n",
325 irq_virq_count); 311 domain->revmap_data.nomap.max_irq);
326 irq_free_desc(virq); 312 irq_free_desc(virq);
327 return 0; 313 return 0;
328 } 314 }
329
330 pr_debug("irq: create_direct obtained virq %d\n", virq); 315 pr_debug("irq: create_direct obtained virq %d\n", virq);
331 316
332 if (irq_setup_virq(domain, virq, virq)) { 317 if (irq_setup_virq(domain, virq, virq)) {
@@ -350,7 +335,8 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
350unsigned int irq_create_mapping(struct irq_domain *domain, 335unsigned int irq_create_mapping(struct irq_domain *domain,
351 irq_hw_number_t hwirq) 336 irq_hw_number_t hwirq)
352{ 337{
353 unsigned int virq, hint; 338 unsigned int hint;
339 int virq;
354 340
355 pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); 341 pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
356 342
@@ -377,13 +363,13 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
377 return irq_domain_legacy_revmap(domain, hwirq); 363 return irq_domain_legacy_revmap(domain, hwirq);
378 364
379 /* Allocate a virtual interrupt number */ 365 /* Allocate a virtual interrupt number */
380 hint = hwirq % irq_virq_count; 366 hint = hwirq % nr_irqs;
381 if (hint == 0) 367 if (hint == 0)
382 hint++; 368 hint++;
383 virq = irq_alloc_desc_from(hint, 0); 369 virq = irq_alloc_desc_from(hint, 0);
384 if (!virq) 370 if (virq <= 0)
385 virq = irq_alloc_desc_from(1, 0); 371 virq = irq_alloc_desc_from(1, 0);
386 if (!virq) { 372 if (virq <= 0) {
387 pr_debug("irq: -> virq allocation failed\n"); 373 pr_debug("irq: -> virq allocation failed\n");
388 return 0; 374 return 0;
389 } 375 }
@@ -515,7 +501,7 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
515 irq_hw_number_t hwirq) 501 irq_hw_number_t hwirq)
516{ 502{
517 unsigned int i; 503 unsigned int i;
518 unsigned int hint = hwirq % irq_virq_count; 504 unsigned int hint = hwirq % nr_irqs;
519 505
 520 /* Look for default domain if necessary */ 506 /* Look for default domain if necessary */
521 if (domain == NULL) 507 if (domain == NULL)
@@ -536,7 +522,7 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
536 if (data && (data->domain == domain) && (data->hwirq == hwirq)) 522 if (data && (data->domain == domain) && (data->hwirq == hwirq))
537 return i; 523 return i;
538 i++; 524 i++;
539 if (i >= irq_virq_count) 525 if (i >= nr_irqs)
540 i = 1; 526 i = 1;
541 } while(i != hint); 527 } while(i != hint);
542 return 0; 528 return 0;
@@ -642,8 +628,9 @@ static int virq_debug_show(struct seq_file *m, void *private)
642 void *data; 628 void *data;
643 int i; 629 int i;
644 630
645 seq_printf(m, "%-5s %-7s %-15s %-18s %s\n", "virq", "hwirq", 631 seq_printf(m, "%-5s %-7s %-15s %-*s %s\n", "irq", "hwirq",
646 "chip name", "chip data", "domain name"); 632 "chip name", (int)(2 * sizeof(void *) + 2), "chip data",
633 "domain name");
647 634
648 for (i = 1; i < nr_irqs; i++) { 635 for (i = 1; i < nr_irqs; i++) {
649 desc = irq_to_desc(i); 636 desc = irq_to_desc(i);
@@ -666,7 +653,7 @@ static int virq_debug_show(struct seq_file *m, void *private)
666 seq_printf(m, "%-15s ", p); 653 seq_printf(m, "%-15s ", p);
667 654
668 data = irq_desc_get_chip_data(desc); 655 data = irq_desc_get_chip_data(desc);
669 seq_printf(m, "0x%16p ", data); 656 seq_printf(m, data ? "0x%p " : " %p ", data);
670 657
671 if (desc->irq_data.domain && desc->irq_data.domain->of_node) 658 if (desc->irq_data.domain && desc->irq_data.domain->of_node)
672 p = desc->irq_data.domain->of_node->full_name; 659 p = desc->irq_data.domain->of_node->full_name;
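
irq_domain_add_nomap() now carries a per-domain ceiling (max_irq) instead of relying on the removed global irq_virq_count. A hedged sketch of a hypothetical interrupt-controller driver using the new signature (the driver names and the 128 ceiling are made up):

#include <linux/irqdomain.h>
#include <linux/of.h>

static int my_nomap_map(struct irq_domain *d, unsigned int virq,
			irq_hw_number_t hwirq)
{
	/* hypothetical per-irq setup (chip/handler wiring) would go here */
	return 0;
}

static const struct irq_domain_ops my_nomap_ops = {
	.map = my_nomap_map,
};

static int my_intc_probe(struct device_node *np)
{
	/* cap directly created virqs for this controller at 128 */
	struct irq_domain *d = irq_domain_add_nomap(np, 128, &my_nomap_ops, NULL);

	return d ? 0 : -ENOMEM;
}
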
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 89a3ea82569b..bb32326afe87 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -565,8 +565,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
565 * IRQF_TRIGGER_* but the PIC does not support multiple 565 * IRQF_TRIGGER_* but the PIC does not support multiple
566 * flow-types? 566 * flow-types?
567 */ 567 */
568 pr_debug("No set_type function for IRQ %d (%s)\n", irq, 568 pr_debug("genirq: No set_type function for IRQ %d (%s)\n", irq,
569 chip ? (chip->name ? : "unknown") : "unknown"); 569 chip ? (chip->name ? : "unknown") : "unknown");
570 return 0; 570 return 0;
571 } 571 }
572 572
@@ -600,7 +600,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
600 ret = 0; 600 ret = 0;
601 break; 601 break;
602 default: 602 default:
603 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", 603 pr_err("genirq: Setting trigger mode %lu for irq %u failed (%pF)\n",
604 flags, irq, chip->irq_set_type); 604 flags, irq, chip->irq_set_type);
605 } 605 }
606 if (unmask) 606 if (unmask)
@@ -837,8 +837,7 @@ void exit_irq_thread(void)
837 837
838 action = kthread_data(tsk); 838 action = kthread_data(tsk);
839 839
840 printk(KERN_ERR 840 pr_err("genirq: exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
841 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
842 tsk->comm ? tsk->comm : "", tsk->pid, action->irq); 841 tsk->comm ? tsk->comm : "", tsk->pid, action->irq);
843 842
844 desc = irq_to_desc(action->irq); 843 desc = irq_to_desc(action->irq);
@@ -878,7 +877,6 @@ static int
878__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) 877__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
879{ 878{
880 struct irqaction *old, **old_ptr; 879 struct irqaction *old, **old_ptr;
881 const char *old_name = NULL;
882 unsigned long flags, thread_mask = 0; 880 unsigned long flags, thread_mask = 0;
883 int ret, nested, shared = 0; 881 int ret, nested, shared = 0;
884 cpumask_var_t mask; 882 cpumask_var_t mask;
@@ -972,10 +970,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
972 */ 970 */
973 if (!((old->flags & new->flags) & IRQF_SHARED) || 971 if (!((old->flags & new->flags) & IRQF_SHARED) ||
974 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || 972 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
975 ((old->flags ^ new->flags) & IRQF_ONESHOT)) { 973 ((old->flags ^ new->flags) & IRQF_ONESHOT))
976 old_name = old->name;
977 goto mismatch; 974 goto mismatch;
978 }
979 975
980 /* All handlers must agree on per-cpuness */ 976 /* All handlers must agree on per-cpuness */
981 if ((old->flags & IRQF_PERCPU) != 977 if ((old->flags & IRQF_PERCPU) !=
@@ -1031,6 +1027,27 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1031 * all existing action->thread_mask bits. 1027 * all existing action->thread_mask bits.
1032 */ 1028 */
1033 new->thread_mask = 1 << ffz(thread_mask); 1029 new->thread_mask = 1 << ffz(thread_mask);
1030
1031 } else if (new->handler == irq_default_primary_handler) {
1032 /*
1033 * The interrupt was requested with handler = NULL, so
1034 * we use the default primary handler for it. But it
1035 * does not have the oneshot flag set. In combination
1036 * with level interrupts this is deadly, because the
1037 * default primary handler just wakes the thread, then
 1038 * the irq line is re-enabled, but the device still
1039 * has the level irq asserted. Rinse and repeat....
1040 *
1041 * While this works for edge type interrupts, we play
1042 * it safe and reject unconditionally because we can't
1043 * say for sure which type this interrupt really
1044 * has. The type flags are unreliable as the
1045 * underlying chip implementation can override them.
1046 */
1047 pr_err("genirq: Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n",
1048 irq);
1049 ret = -EINVAL;
1050 goto out_mask;
1034 } 1051 }
1035 1052
1036 if (!shared) { 1053 if (!shared) {
@@ -1078,7 +1095,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1078 1095
1079 if (nmsk != omsk) 1096 if (nmsk != omsk)
1080 /* hope the handler works with current trigger mode */ 1097 /* hope the handler works with current trigger mode */
1081 pr_warning("IRQ %d uses trigger mode %u; requested %u\n", 1098 pr_warning("genirq: irq %d uses trigger mode %u; requested %u\n",
1082 irq, nmsk, omsk); 1099 irq, nmsk, omsk);
1083 } 1100 }
1084 1101
@@ -1115,14 +1132,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1115 return 0; 1132 return 0;
1116 1133
1117mismatch: 1134mismatch:
1118#ifdef CONFIG_DEBUG_SHIRQ
1119 if (!(new->flags & IRQF_PROBE_SHARED)) { 1135 if (!(new->flags & IRQF_PROBE_SHARED)) {
1120 printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); 1136 pr_err("genirq: Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n",
1121 if (old_name) 1137 irq, new->flags, new->name, old->flags, old->name);
1122 printk(KERN_ERR "current handler: %s\n", old_name); 1138#ifdef CONFIG_DEBUG_SHIRQ
1123 dump_stack(); 1139 dump_stack();
1124 }
1125#endif 1140#endif
1141 }
1126 ret = -EBUSY; 1142 ret = -EBUSY;
1127 1143
1128out_mask: 1144out_mask:
@@ -1204,12 +1220,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1204 /* Found it - now remove it from the list of entries: */ 1220 /* Found it - now remove it from the list of entries: */
1205 *action_ptr = action->next; 1221 *action_ptr = action->next;
1206 1222
1207 /* Currently used only by UML, might disappear one day: */
1208#ifdef CONFIG_IRQ_RELEASE_METHOD
1209 if (desc->irq_data.chip->release)
1210 desc->irq_data.chip->release(irq, dev_id);
1211#endif
1212
1213 /* If this was the last handler, shut down the IRQ line: */ 1223 /* If this was the last handler, shut down the IRQ line: */
1214 if (!desc->action) 1224 if (!desc->action)
1215 irq_shutdown(desc); 1225 irq_shutdown(desc);
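
The long comment in the __setup_irq() hunk explains why a threaded request without a primary handler must also pass IRQF_ONESHOT: the default primary handler only wakes the thread, so a still-asserted level interrupt would fire again immediately. A minimal sketch of the request a hypothetical driver would make to satisfy the new -EINVAL check (device name, trigger type and callbacks are invented):

#include <linux/interrupt.h>

static irqreturn_t my_thread_fn(int irq, void *dev_id)
{
	/* the level-triggered source is serviced here; the line stays
	 * masked until this handler returns because of IRQF_ONESHOT */
	return IRQ_HANDLED;
}

static int my_request_irq(unsigned int irq, void *dev)
{
	return request_threaded_irq(irq, NULL /* default primary handler */,
				    my_thread_fn,
				    IRQF_ONESHOT | IRQF_TRIGGER_LOW,
				    "my-dev", dev);
}
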
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 15e53b1766a6..cb228bf21760 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -103,8 +103,13 @@ int check_wakeup_irqs(void)
103 int irq; 103 int irq;
104 104
105 for_each_irq_desc(irq, desc) { 105 for_each_irq_desc(irq, desc) {
106 /*
107 * Only interrupts which are marked as wakeup source
108 * and have not been disabled before the suspend check
109 * can abort suspend.
110 */
106 if (irqd_is_wakeup_set(&desc->irq_data)) { 111 if (irqd_is_wakeup_set(&desc->irq_data)) {
107 if (desc->istate & IRQS_PENDING) 112 if (desc->depth == 1 && desc->istate & IRQS_PENDING)
108 return -EBUSY; 113 return -EBUSY;
109 continue; 114 continue;
110 } 115 }
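
Per the comment added in check_wakeup_irqs(), only an interrupt that is marked as a wakeup source, still enabled, and left pending can abort the suspend. A hedged sketch of the driver side, assuming the usual enable_irq_wake()/disable_irq_wake() calls; my_dev and the suspend/resume wrappers are hypothetical:

#include <linux/interrupt.h>

struct my_dev {
	int irq;		/* hypothetical device state */
};

static int my_suspend(struct my_dev *md)
{
	/* mark the interrupt as a wakeup source so a pending edge can
	 * abort the transition in check_wakeup_irqs() */
	return enable_irq_wake(md->irq);
}

static int my_resume(struct my_dev *md)
{
	return disable_irq_wake(md->irq);
}
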
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 14dd5761e8c9..6454db7b6a4d 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -58,10 +58,13 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
58 /* 58 /*
59 * We do not resend level type interrupts. Level type 59 * We do not resend level type interrupts. Level type
60 * interrupts are resent by hardware when they are still 60 * interrupts are resent by hardware when they are still
61 * active. 61 * active. Clear the pending bit so suspend/resume does not
62 * get confused.
62 */ 63 */
63 if (irq_settings_is_level(desc)) 64 if (irq_settings_is_level(desc)) {
65 desc->istate &= ~IRQS_PENDING;
64 return; 66 return;
67 }
65 if (desc->istate & IRQS_REPLAY) 68 if (desc->istate & IRQS_REPLAY)
66 return; 69 return;
67 if (desc->istate & IRQS_PENDING) { 70 if (desc->istate & IRQS_PENDING) {
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index c3c46c72046e..1588e3b2871b 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -5,11 +5,13 @@
5 * context. The enqueueing is NMI-safe. 5 * context. The enqueueing is NMI-safe.
6 */ 6 */
7 7
8#include <linux/bug.h>
8#include <linux/kernel.h> 9#include <linux/kernel.h>
9#include <linux/export.h> 10#include <linux/export.h>
10#include <linux/irq_work.h> 11#include <linux/irq_work.h>
11#include <linux/percpu.h> 12#include <linux/percpu.h>
12#include <linux/hardirq.h> 13#include <linux/hardirq.h>
14#include <linux/irqflags.h>
13#include <asm/processor.h> 15#include <asm/processor.h>
14 16
15/* 17/*
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 22000c3db0dd..8d262b467573 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -284,8 +284,12 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value,
284 if (value) { 284 if (value) {
285 if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) 285 if(copy_from_user(&set_buffer, value, sizeof(set_buffer)))
286 return -EFAULT; 286 return -EFAULT;
287 } else 287 } else {
288 memset((char *) &set_buffer, 0, sizeof(set_buffer)); 288 memset(&set_buffer, 0, sizeof(set_buffer));
289 printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer."
290 " Misfeature support will be removed\n",
291 current->comm);
292 }
289 293
290 error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); 294 error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL);
291 if (error || !ovalue) 295 if (error || !ovalue)
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 957a7aab8ebc..05698a7415fe 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -322,7 +322,7 @@ static void __call_usermodehelper(struct work_struct *work)
322 * land has been frozen during a system-wide hibernation or suspend operation). 322 * land has been frozen during a system-wide hibernation or suspend operation).
323 * Should always be manipulated under umhelper_sem acquired for write. 323 * Should always be manipulated under umhelper_sem acquired for write.
324 */ 324 */
325static int usermodehelper_disabled = 1; 325static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED;
326 326
327/* Number of helpers running */ 327/* Number of helpers running */
328static atomic_t running_helpers = ATOMIC_INIT(0); 328static atomic_t running_helpers = ATOMIC_INIT(0);
@@ -334,32 +334,110 @@ static atomic_t running_helpers = ATOMIC_INIT(0);
334static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); 334static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
335 335
336/* 336/*
337 * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled
338 * to become 'false'.
339 */
340static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq);
341
342/*
337 * Time to wait for running_helpers to become zero before the setting of 343 * Time to wait for running_helpers to become zero before the setting of
338 * usermodehelper_disabled in usermodehelper_disable() fails 344 * usermodehelper_disabled in usermodehelper_disable() fails
339 */ 345 */
340#define RUNNING_HELPERS_TIMEOUT (5 * HZ) 346#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
341 347
342void read_lock_usermodehelper(void) 348int usermodehelper_read_trylock(void)
343{ 349{
350 DEFINE_WAIT(wait);
351 int ret = 0;
352
344 down_read(&umhelper_sem); 353 down_read(&umhelper_sem);
354 for (;;) {
355 prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
356 TASK_INTERRUPTIBLE);
357 if (!usermodehelper_disabled)
358 break;
359
360 if (usermodehelper_disabled == UMH_DISABLED)
361 ret = -EAGAIN;
362
363 up_read(&umhelper_sem);
364
365 if (ret)
366 break;
367
368 schedule();
369 try_to_freeze();
370
371 down_read(&umhelper_sem);
372 }
373 finish_wait(&usermodehelper_disabled_waitq, &wait);
374 return ret;
375}
376EXPORT_SYMBOL_GPL(usermodehelper_read_trylock);
377
378long usermodehelper_read_lock_wait(long timeout)
379{
380 DEFINE_WAIT(wait);
381
382 if (timeout < 0)
383 return -EINVAL;
384
385 down_read(&umhelper_sem);
386 for (;;) {
387 prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
388 TASK_UNINTERRUPTIBLE);
389 if (!usermodehelper_disabled)
390 break;
391
392 up_read(&umhelper_sem);
393
394 timeout = schedule_timeout(timeout);
395 if (!timeout)
396 break;
397
398 down_read(&umhelper_sem);
399 }
400 finish_wait(&usermodehelper_disabled_waitq, &wait);
401 return timeout;
345} 402}
346EXPORT_SYMBOL_GPL(read_lock_usermodehelper); 403EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait);
347 404
348void read_unlock_usermodehelper(void) 405void usermodehelper_read_unlock(void)
349{ 406{
350 up_read(&umhelper_sem); 407 up_read(&umhelper_sem);
351} 408}
352EXPORT_SYMBOL_GPL(read_unlock_usermodehelper); 409EXPORT_SYMBOL_GPL(usermodehelper_read_unlock);
353 410
354/** 411/**
355 * usermodehelper_disable - prevent new helpers from being started 412 * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled.
413 * depth: New value to assign to usermodehelper_disabled.
414 *
415 * Change the value of usermodehelper_disabled (under umhelper_sem locked for
416 * writing) and wakeup tasks waiting for it to change.
356 */ 417 */
357int usermodehelper_disable(void) 418void __usermodehelper_set_disable_depth(enum umh_disable_depth depth)
419{
420 down_write(&umhelper_sem);
421 usermodehelper_disabled = depth;
422 wake_up(&usermodehelper_disabled_waitq);
423 up_write(&umhelper_sem);
424}
425
426/**
427 * __usermodehelper_disable - Prevent new helpers from being started.
428 * @depth: New value to assign to usermodehelper_disabled.
429 *
430 * Set usermodehelper_disabled to @depth and wait for running helpers to exit.
431 */
432int __usermodehelper_disable(enum umh_disable_depth depth)
358{ 433{
359 long retval; 434 long retval;
360 435
436 if (!depth)
437 return -EINVAL;
438
361 down_write(&umhelper_sem); 439 down_write(&umhelper_sem);
362 usermodehelper_disabled = 1; 440 usermodehelper_disabled = depth;
363 up_write(&umhelper_sem); 441 up_write(&umhelper_sem);
364 442
365 /* 443 /*
@@ -374,31 +452,10 @@ int usermodehelper_disable(void)
374 if (retval) 452 if (retval)
375 return 0; 453 return 0;
376 454
377 down_write(&umhelper_sem); 455 __usermodehelper_set_disable_depth(UMH_ENABLED);
378 usermodehelper_disabled = 0;
379 up_write(&umhelper_sem);
380 return -EAGAIN; 456 return -EAGAIN;
381} 457}
382 458
383/**
384 * usermodehelper_enable - allow new helpers to be started again
385 */
386void usermodehelper_enable(void)
387{
388 down_write(&umhelper_sem);
389 usermodehelper_disabled = 0;
390 up_write(&umhelper_sem);
391}
392
393/**
394 * usermodehelper_is_disabled - check if new helpers are allowed to be started
395 */
396bool usermodehelper_is_disabled(void)
397{
398 return usermodehelper_disabled;
399}
400EXPORT_SYMBOL_GPL(usermodehelper_is_disabled);
401
402static void helper_lock(void) 459static void helper_lock(void)
403{ 460{
404 atomic_inc(&running_helpers); 461 atomic_inc(&running_helpers);
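
The kmod.c rework replaces the unconditional read_lock_usermodehelper()/read_unlock_usermodehelper() pair with usermodehelper_read_trylock(), which fails with -EAGAIN while helpers are fully disabled, plus a timed usermodehelper_read_lock_wait() variant. A minimal sketch of a hypothetical caller; the helper binary and arguments are invented, and call_usermodehelper() is the usual kmod API:

#include <linux/kmod.h>

static int run_my_helper(void)
{
	char *argv[] = { "/sbin/my-helper", NULL };	/* hypothetical binary */
	char *envp[] = { "HOME=/", NULL };
	int ret;

	/* take the read side only if user mode helpers are currently allowed */
	ret = usermodehelper_read_trylock();
	if (ret)			/* -EAGAIN: helpers are UMH_DISABLED */
		return ret;

	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);

	usermodehelper_read_unlock();
	return ret;
}
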
diff --git a/kernel/module.c b/kernel/module.c
index 78ac6ec1e425..4edbd9c11aca 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2429,7 +2429,8 @@ static int copy_and_check(struct load_info *info,
2429 goto free_hdr; 2429 goto free_hdr;
2430 } 2430 }
2431 2431
2432 if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) { 2432 if (hdr->e_shoff >= len ||
2433 hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) {
2433 err = -ENOEXEC; 2434 err = -ENOEXEC;
2434 goto free_hdr; 2435 goto free_hdr;
2435 } 2436 }
@@ -2953,7 +2954,7 @@ static struct module *load_module(void __user *umod,
2953 2954
2954 /* Module is ready to execute: parsing args may do that. */ 2955 /* Module is ready to execute: parsing args may do that. */
2955 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, 2956 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
2956 -32768, 32767, NULL); 2957 -32768, 32767, &ddebug_dyndbg_module_param_cb);
2957 if (err < 0) 2958 if (err < 0)
2958 goto unlink; 2959 goto unlink;
2959 2960
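
The copy_and_check() hunk replaces "len < e_shoff + e_shnum * sizeof(Elf_Shdr)" with two comparisons so that a huge e_shoff cannot wrap the addition past len. The same idiom as a standalone sketch (the helper name and parameters are invented):

#include <linux/types.h>

/* Compare against "len - off" after ruling out off >= len, instead of
 * adding to "off" where a large offset could wrap.  (In copy_and_check()
 * the count is the 16-bit e_shnum, so the multiplication cannot wrap.) */
static bool table_fits(unsigned long len, unsigned long off,
		       unsigned long count, unsigned long entsize)
{
	if (off >= len)
		return false;
	return count * entsize <= len - off;
}
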
diff --git a/kernel/padata.c b/kernel/padata.c
index 6f10eb285ece..89fe3d1b9efb 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -1,6 +1,8 @@
1/* 1/*
2 * padata.c - generic interface to process data streams in parallel 2 * padata.c - generic interface to process data streams in parallel
3 * 3 *
 4 * See Documentation/padata.txt for the API documentation.
5 *
4 * Copyright (C) 2008, 2009 secunet Security Networks AG 6 * Copyright (C) 2008, 2009 secunet Security Networks AG
5 * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com> 7 * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
6 * 8 *
@@ -354,13 +356,13 @@ static int padata_setup_cpumasks(struct parallel_data *pd,
354 if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) 356 if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
355 return -ENOMEM; 357 return -ENOMEM;
356 358
357 cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask); 359 cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask);
358 if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { 360 if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
359 free_cpumask_var(pd->cpumask.cbcpu); 361 free_cpumask_var(pd->cpumask.cbcpu);
360 return -ENOMEM; 362 return -ENOMEM;
361 } 363 }
362 364
363 cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask); 365 cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_online_mask);
364 return 0; 366 return 0;
365} 367}
366 368
@@ -564,7 +566,7 @@ EXPORT_SYMBOL(padata_unregister_cpumask_notifier);
564static bool padata_validate_cpumask(struct padata_instance *pinst, 566static bool padata_validate_cpumask(struct padata_instance *pinst,
565 const struct cpumask *cpumask) 567 const struct cpumask *cpumask)
566{ 568{
567 if (!cpumask_intersects(cpumask, cpu_active_mask)) { 569 if (!cpumask_intersects(cpumask, cpu_online_mask)) {
568 pinst->flags |= PADATA_INVALID; 570 pinst->flags |= PADATA_INVALID;
569 return false; 571 return false;
570 } 572 }
@@ -678,7 +680,7 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
678{ 680{
679 struct parallel_data *pd; 681 struct parallel_data *pd;
680 682
681 if (cpumask_test_cpu(cpu, cpu_active_mask)) { 683 if (cpumask_test_cpu(cpu, cpu_online_mask)) {
682 pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, 684 pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
683 pinst->cpumask.cbcpu); 685 pinst->cpumask.cbcpu);
684 if (!pd) 686 if (!pd)
@@ -746,6 +748,9 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
746 return -ENOMEM; 748 return -ENOMEM;
747 749
748 padata_replace(pinst, pd); 750 padata_replace(pinst, pd);
751
752 cpumask_clear_cpu(cpu, pd->cpumask.cbcpu);
753 cpumask_clear_cpu(cpu, pd->cpumask.pcpu);
749 } 754 }
750 755
751 return 0; 756 return 0;
diff --git a/kernel/panic.c b/kernel/panic.c
index 80aed44e345a..8ed89a175d79 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -97,7 +97,7 @@ void panic(const char *fmt, ...)
97 /* 97 /*
98 * Avoid nested stack-dumping if a panic occurs during oops processing 98 * Avoid nested stack-dumping if a panic occurs during oops processing
99 */ 99 */
100 if (!oops_in_progress) 100 if (!test_taint(TAINT_DIE) && oops_in_progress <= 1)
101 dump_stack(); 101 dump_stack();
102#endif 102#endif
103 103
diff --git a/kernel/params.c b/kernel/params.c
index f37d82631347..ed35345be536 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -85,11 +85,13 @@ bool parameq(const char *a, const char *b)
85 85
86static int parse_one(char *param, 86static int parse_one(char *param,
87 char *val, 87 char *val,
88 const char *doing,
88 const struct kernel_param *params, 89 const struct kernel_param *params,
89 unsigned num_params, 90 unsigned num_params,
90 s16 min_level, 91 s16 min_level,
91 s16 max_level, 92 s16 max_level,
92 int (*handle_unknown)(char *param, char *val)) 93 int (*handle_unknown)(char *param, char *val,
94 const char *doing))
93{ 95{
94 unsigned int i; 96 unsigned int i;
95 int err; 97 int err;
@@ -104,8 +106,8 @@ static int parse_one(char *param,
104 if (!val && params[i].ops->set != param_set_bool 106 if (!val && params[i].ops->set != param_set_bool
105 && params[i].ops->set != param_set_bint) 107 && params[i].ops->set != param_set_bint)
106 return -EINVAL; 108 return -EINVAL;
107 pr_debug("They are equal! Calling %p\n", 109 pr_debug("handling %s with %p\n", param,
108 params[i].ops->set); 110 params[i].ops->set);
109 mutex_lock(&param_lock); 111 mutex_lock(&param_lock);
110 err = params[i].ops->set(val, &params[i]); 112 err = params[i].ops->set(val, &params[i]);
111 mutex_unlock(&param_lock); 113 mutex_unlock(&param_lock);
@@ -114,11 +116,11 @@ static int parse_one(char *param,
114 } 116 }
115 117
116 if (handle_unknown) { 118 if (handle_unknown) {
117 pr_debug("Unknown argument: calling %p\n", handle_unknown); 119 pr_debug("doing %s: %s='%s'\n", doing, param, val);
118 return handle_unknown(param, val); 120 return handle_unknown(param, val, doing);
119 } 121 }
120 122
121 pr_debug("Unknown argument `%s'\n", param); 123 pr_debug("Unknown argument '%s'\n", param);
122 return -ENOENT; 124 return -ENOENT;
123} 125}
124 126
@@ -175,49 +177,47 @@ static char *next_arg(char *args, char **param, char **val)
175} 177}
176 178
177/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ 179/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
178int parse_args(const char *name, 180int parse_args(const char *doing,
179 char *args, 181 char *args,
180 const struct kernel_param *params, 182 const struct kernel_param *params,
181 unsigned num, 183 unsigned num,
182 s16 min_level, 184 s16 min_level,
183 s16 max_level, 185 s16 max_level,
184 int (*unknown)(char *param, char *val)) 186 int (*unknown)(char *param, char *val, const char *doing))
185{ 187{
186 char *param, *val; 188 char *param, *val;
187 189
188 pr_debug("Parsing ARGS: %s\n", args);
189
190 /* Chew leading spaces */ 190 /* Chew leading spaces */
191 args = skip_spaces(args); 191 args = skip_spaces(args);
192 192
193 if (*args)
194 pr_debug("doing %s, parsing ARGS: '%s'\n", doing, args);
195
193 while (*args) { 196 while (*args) {
194 int ret; 197 int ret;
195 int irq_was_disabled; 198 int irq_was_disabled;
196 199
197 args = next_arg(args, &param, &val); 200 args = next_arg(args, &param, &val);
198 irq_was_disabled = irqs_disabled(); 201 irq_was_disabled = irqs_disabled();
199 ret = parse_one(param, val, params, num, 202 ret = parse_one(param, val, doing, params, num,
200 min_level, max_level, unknown); 203 min_level, max_level, unknown);
201 if (irq_was_disabled && !irqs_disabled()) { 204 if (irq_was_disabled && !irqs_disabled())
202 printk(KERN_WARNING "parse_args(): option '%s' enabled " 205 pr_warn("%s: option '%s' enabled irq's!\n",
203 "irq's!\n", param); 206 doing, param);
204 } 207
205 switch (ret) { 208 switch (ret) {
206 case -ENOENT: 209 case -ENOENT:
207 printk(KERN_ERR "%s: Unknown parameter `%s'\n", 210 pr_err("%s: Unknown parameter `%s'\n", doing, param);
208 name, param);
209 return ret; 211 return ret;
210 case -ENOSPC: 212 case -ENOSPC:
211 printk(KERN_ERR 213 pr_err("%s: `%s' too large for parameter `%s'\n",
212 "%s: `%s' too large for parameter `%s'\n", 214 doing, val ?: "", param);
213 name, val ?: "", param);
214 return ret; 215 return ret;
215 case 0: 216 case 0:
216 break; 217 break;
217 default: 218 default:
218 printk(KERN_ERR 219 pr_err("%s: `%s' invalid for parameter `%s'\n",
219 "%s: `%s' invalid for parameter `%s'\n", 220 doing, val ?: "", param);
220 name, val ?: "", param);
221 return ret; 221 return ret;
222 } 222 }
223 } 223 }
@@ -263,8 +263,7 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
263int param_set_charp(const char *val, const struct kernel_param *kp) 263int param_set_charp(const char *val, const struct kernel_param *kp)
264{ 264{
265 if (strlen(val) > 1024) { 265 if (strlen(val) > 1024) {
266 printk(KERN_ERR "%s: string parameter too long\n", 266 pr_err("%s: string parameter too long\n", kp->name);
267 kp->name);
268 return -ENOSPC; 267 return -ENOSPC;
269 } 268 }
270 269
@@ -400,8 +399,7 @@ static int param_array(const char *name,
400 int len; 399 int len;
401 400
402 if (*num == max) { 401 if (*num == max) {
403 printk(KERN_ERR "%s: can only take %i arguments\n", 402 pr_err("%s: can only take %i arguments\n", name, max);
404 name, max);
405 return -EINVAL; 403 return -EINVAL;
406 } 404 }
407 len = strcspn(val, ","); 405 len = strcspn(val, ",");
@@ -420,8 +418,7 @@ static int param_array(const char *name,
420 } while (save == ','); 418 } while (save == ',');
421 419
422 if (*num < min) { 420 if (*num < min) {
423 printk(KERN_ERR "%s: needs at least %i arguments\n", 421 pr_err("%s: needs at least %i arguments\n", name, min);
424 name, min);
425 return -EINVAL; 422 return -EINVAL;
426 } 423 }
427 return 0; 424 return 0;
@@ -480,7 +477,7 @@ int param_set_copystring(const char *val, const struct kernel_param *kp)
480 const struct kparam_string *kps = kp->str; 477 const struct kparam_string *kps = kp->str;
481 478
482 if (strlen(val)+1 > kps->maxlen) { 479 if (strlen(val)+1 > kps->maxlen) {
483 printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", 480 pr_err("%s: string doesn't fit in %u chars.\n",
484 kp->name, kps->maxlen-1); 481 kp->name, kps->maxlen-1);
485 return -ENOSPC; 482 return -ENOSPC;
486 } 483 }
@@ -750,11 +747,8 @@ static struct module_kobject * __init locate_module_kobject(const char *name)
750#endif 747#endif
751 if (err) { 748 if (err) {
752 kobject_put(&mk->kobj); 749 kobject_put(&mk->kobj);
753 printk(KERN_ERR 750 pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n",
754 "Module '%s' failed add to sysfs, error number %d\n",
755 name, err); 751 name, err);
756 printk(KERN_ERR
757 "The system will be unstable now.\n");
758 return NULL; 752 return NULL;
759 } 753 }
760 754
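
parse_one()/parse_args() now carry a "doing" string (the kernel command line or a module name) and pass it to the unknown-parameter callback. A minimal sketch of a hypothetical callback using the new three-argument signature:

#include <linux/kernel.h>
#include <linux/moduleparam.h>

/* "doing" lets the callback produce a sensible log prefix on its own. */
static int my_unknown_param(char *param, char *val, const char *doing)
{
	pr_warn("%s: ignoring unknown parameter '%s%s%s'\n",
		doing, param, val ? "=" : "", val ?: "");
	return 0;
}

/*
 * A hypothetical caller passes it as the last argument:
 *	parse_args("mydriver", args, params, num, -32768, 32767,
 *		   my_unknown_param);
 */
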
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index deb5461e3216..8f9b4eb974e0 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -103,6 +103,33 @@ config PM_SLEEP_SMP
103 select HOTPLUG 103 select HOTPLUG
104 select HOTPLUG_CPU 104 select HOTPLUG_CPU
105 105
106config PM_AUTOSLEEP
107 bool "Opportunistic sleep"
108 depends on PM_SLEEP
109 default n
110 ---help---
111 Allow the kernel to trigger a system transition into a global sleep
112 state automatically whenever there are no active wakeup sources.
113
114config PM_WAKELOCKS
115 bool "User space wakeup sources interface"
116 depends on PM_SLEEP
117 default n
118 ---help---
119 Allow user space to create, activate and deactivate wakeup source
120 objects with the help of a sysfs-based interface.
121
122config PM_WAKELOCKS_LIMIT
123 int "Maximum number of user space wakeup sources (0 = no limit)"
124 range 0 100000
125 default 100
126 depends on PM_WAKELOCKS
127
128config PM_WAKELOCKS_GC
129 bool "Garbage collector for user space wakeup sources"
130 depends on PM_WAKELOCKS
131 default y
132
106config PM_RUNTIME 133config PM_RUNTIME
107 bool "Run-time PM core functionality" 134 bool "Run-time PM core functionality"
108 depends on !IA64_HP_SIM 135 depends on !IA64_HP_SIM
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 66d808ec5252..29472bff11ef 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -9,5 +9,7 @@ obj-$(CONFIG_SUSPEND) += suspend.o
9obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 9obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
10obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ 10obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
11 block_io.o 11 block_io.o
12obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o
13obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o
12 14
13obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 15obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
new file mode 100644
index 000000000000..ca304046d9e2
--- /dev/null
+++ b/kernel/power/autosleep.c
@@ -0,0 +1,127 @@
1/*
2 * kernel/power/autosleep.c
3 *
4 * Opportunistic sleep support.
5 *
6 * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl>
7 */
8
9#include <linux/device.h>
10#include <linux/mutex.h>
11#include <linux/pm_wakeup.h>
12
13#include "power.h"
14
15static suspend_state_t autosleep_state;
16static struct workqueue_struct *autosleep_wq;
17/*
18 * Note: it is only safe to mutex_lock(&autosleep_lock) if a wakeup_source
19 * is active, otherwise a deadlock with try_to_suspend() is possible.
20 * Alternatively mutex_lock_interruptible() can be used. This will then fail
21 * if an auto_sleep cycle tries to freeze processes.
22 */
23static DEFINE_MUTEX(autosleep_lock);
24static struct wakeup_source *autosleep_ws;
25
26static void try_to_suspend(struct work_struct *work)
27{
28 unsigned int initial_count, final_count;
29
30 if (!pm_get_wakeup_count(&initial_count, true))
31 goto out;
32
33 mutex_lock(&autosleep_lock);
34
35 if (!pm_save_wakeup_count(initial_count)) {
36 mutex_unlock(&autosleep_lock);
37 goto out;
38 }
39
40 if (autosleep_state == PM_SUSPEND_ON) {
41 mutex_unlock(&autosleep_lock);
42 return;
43 }
44 if (autosleep_state >= PM_SUSPEND_MAX)
45 hibernate();
46 else
47 pm_suspend(autosleep_state);
48
49 mutex_unlock(&autosleep_lock);
50
51 if (!pm_get_wakeup_count(&final_count, false))
52 goto out;
53
54 /*
 55 * If the wakeup occurred for an unknown reason, wait to prevent the
56 * system from trying to suspend and waking up in a tight loop.
57 */
58 if (final_count == initial_count)
59 schedule_timeout_uninterruptible(HZ / 2);
60
61 out:
62 queue_up_suspend_work();
63}
64
65static DECLARE_WORK(suspend_work, try_to_suspend);
66
67void queue_up_suspend_work(void)
68{
69 if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON)
70 queue_work(autosleep_wq, &suspend_work);
71}
72
73suspend_state_t pm_autosleep_state(void)
74{
75 return autosleep_state;
76}
77
78int pm_autosleep_lock(void)
79{
80 return mutex_lock_interruptible(&autosleep_lock);
81}
82
83void pm_autosleep_unlock(void)
84{
85 mutex_unlock(&autosleep_lock);
86}
87
88int pm_autosleep_set_state(suspend_state_t state)
89{
90
91#ifndef CONFIG_HIBERNATION
92 if (state >= PM_SUSPEND_MAX)
93 return -EINVAL;
94#endif
95
96 __pm_stay_awake(autosleep_ws);
97
98 mutex_lock(&autosleep_lock);
99
100 autosleep_state = state;
101
102 __pm_relax(autosleep_ws);
103
104 if (state > PM_SUSPEND_ON) {
105 pm_wakep_autosleep_enabled(true);
106 queue_up_suspend_work();
107 } else {
108 pm_wakep_autosleep_enabled(false);
109 }
110
111 mutex_unlock(&autosleep_lock);
112 return 0;
113}
114
115int __init pm_autosleep_init(void)
116{
117 autosleep_ws = wakeup_source_register("autosleep");
118 if (!autosleep_ws)
119 return -ENOMEM;
120
121 autosleep_wq = alloc_ordered_workqueue("autosleep", 0);
122 if (autosleep_wq)
123 return 0;
124
125 wakeup_source_unregister(autosleep_ws);
126 return -ENOMEM;
127}
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 0a186cfde788..8b53db38a279 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -16,7 +16,6 @@
16#include <linux/string.h> 16#include <linux/string.h>
17#include <linux/device.h> 17#include <linux/device.h>
18#include <linux/async.h> 18#include <linux/async.h>
19#include <linux/kmod.h>
20#include <linux/delay.h> 19#include <linux/delay.h>
21#include <linux/fs.h> 20#include <linux/fs.h>
22#include <linux/mount.h> 21#include <linux/mount.h>
@@ -26,6 +25,8 @@
26#include <linux/freezer.h> 25#include <linux/freezer.h>
27#include <linux/gfp.h> 26#include <linux/gfp.h>
28#include <linux/syscore_ops.h> 27#include <linux/syscore_ops.h>
28#include <linux/ctype.h>
29#include <linux/genhd.h>
29#include <scsi/scsi_scan.h> 30#include <scsi/scsi_scan.h>
30 31
31#include "power.h" 32#include "power.h"
@@ -611,14 +612,10 @@ int hibernate(void)
611 if (error) 612 if (error)
612 goto Exit; 613 goto Exit;
613 614
614 error = usermodehelper_disable();
615 if (error)
616 goto Exit;
617
618 /* Allocate memory management structures */ 615 /* Allocate memory management structures */
619 error = create_basic_memory_bitmaps(); 616 error = create_basic_memory_bitmaps();
620 if (error) 617 if (error)
621 goto Enable_umh; 618 goto Exit;
622 619
623 printk(KERN_INFO "PM: Syncing filesystems ... "); 620 printk(KERN_INFO "PM: Syncing filesystems ... ");
624 sys_sync(); 621 sys_sync();
@@ -661,8 +658,6 @@ int hibernate(void)
661 658
662 Free_bitmaps: 659 Free_bitmaps:
663 free_basic_memory_bitmaps(); 660 free_basic_memory_bitmaps();
664 Enable_umh:
665 usermodehelper_enable();
666 Exit: 661 Exit:
667 pm_notifier_call_chain(PM_POST_HIBERNATION); 662 pm_notifier_call_chain(PM_POST_HIBERNATION);
668 pm_restore_console(); 663 pm_restore_console();
@@ -729,6 +724,17 @@ static int software_resume(void)
729 724
730 /* Check if the device is there */ 725 /* Check if the device is there */
731 swsusp_resume_device = name_to_dev_t(resume_file); 726 swsusp_resume_device = name_to_dev_t(resume_file);
727
728 /*
 729 * name_to_dev_t() cannot verify the partition if resume_file is in
 730 * integer format (e.g. major:minor).
731 */
732 if (isdigit(resume_file[0]) && resume_wait) {
733 int partno;
734 while (!get_gendisk(swsusp_resume_device, &partno))
735 msleep(10);
736 }
737
732 if (!swsusp_resume_device) { 738 if (!swsusp_resume_device) {
733 /* 739 /*
734 * Some device discovery might still be in progress; we need 740 * Some device discovery might still be in progress; we need
@@ -777,15 +783,9 @@ static int software_resume(void)
777 if (error) 783 if (error)
778 goto close_finish; 784 goto close_finish;
779 785
780 error = usermodehelper_disable();
781 if (error)
782 goto close_finish;
783
784 error = create_basic_memory_bitmaps(); 786 error = create_basic_memory_bitmaps();
785 if (error) { 787 if (error)
786 usermodehelper_enable();
787 goto close_finish; 788 goto close_finish;
788 }
789 789
790 pr_debug("PM: Preparing processes for restore.\n"); 790 pr_debug("PM: Preparing processes for restore.\n");
791 error = freeze_processes(); 791 error = freeze_processes();
@@ -806,7 +806,6 @@ static int software_resume(void)
806 thaw_processes(); 806 thaw_processes();
807 Done: 807 Done:
808 free_basic_memory_bitmaps(); 808 free_basic_memory_bitmaps();
809 usermodehelper_enable();
810 Finish: 809 Finish:
811 pm_notifier_call_chain(PM_POST_RESTORE); 810 pm_notifier_call_chain(PM_POST_RESTORE);
812 pm_restore_console(); 811 pm_restore_console();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 1c12581f1c62..428f8a034e96 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -269,8 +269,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
269 return (s - buf); 269 return (s - buf);
270} 270}
271 271
272static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, 272static suspend_state_t decode_state(const char *buf, size_t n)
273 const char *buf, size_t n)
274{ 273{
275#ifdef CONFIG_SUSPEND 274#ifdef CONFIG_SUSPEND
276 suspend_state_t state = PM_SUSPEND_STANDBY; 275 suspend_state_t state = PM_SUSPEND_STANDBY;
@@ -278,27 +277,48 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
278#endif 277#endif
279 char *p; 278 char *p;
280 int len; 279 int len;
281 int error = -EINVAL;
282 280
283 p = memchr(buf, '\n', n); 281 p = memchr(buf, '\n', n);
284 len = p ? p - buf : n; 282 len = p ? p - buf : n;
285 283
286 /* First, check if we are requested to hibernate */ 284 /* Check hibernation first. */
287 if (len == 4 && !strncmp(buf, "disk", len)) { 285 if (len == 4 && !strncmp(buf, "disk", len))
288 error = hibernate(); 286 return PM_SUSPEND_MAX;
289 goto Exit;
290 }
291 287
292#ifdef CONFIG_SUSPEND 288#ifdef CONFIG_SUSPEND
293 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { 289 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++)
294 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { 290 if (*s && len == strlen(*s) && !strncmp(buf, *s, len))
295 error = pm_suspend(state); 291 return state;
296 break;
297 }
298 }
299#endif 292#endif
300 293
301 Exit: 294 return PM_SUSPEND_ON;
295}
296
297static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
298 const char *buf, size_t n)
299{
300 suspend_state_t state;
301 int error;
302
303 error = pm_autosleep_lock();
304 if (error)
305 return error;
306
307 if (pm_autosleep_state() > PM_SUSPEND_ON) {
308 error = -EBUSY;
309 goto out;
310 }
311
312 state = decode_state(buf, n);
313 if (state < PM_SUSPEND_MAX)
314 error = pm_suspend(state);
315 else if (state == PM_SUSPEND_MAX)
316 error = hibernate();
317 else
318 error = -EINVAL;
319
320 out:
321 pm_autosleep_unlock();
302 return error ? error : n; 322 return error ? error : n;
303} 323}
304 324
@@ -339,7 +359,8 @@ static ssize_t wakeup_count_show(struct kobject *kobj,
339{ 359{
340 unsigned int val; 360 unsigned int val;
341 361
342 return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR; 362 return pm_get_wakeup_count(&val, true) ?
363 sprintf(buf, "%u\n", val) : -EINTR;
343} 364}
344 365
345static ssize_t wakeup_count_store(struct kobject *kobj, 366static ssize_t wakeup_count_store(struct kobject *kobj,
@@ -347,15 +368,106 @@ static ssize_t wakeup_count_store(struct kobject *kobj,
347 const char *buf, size_t n) 368 const char *buf, size_t n)
348{ 369{
349 unsigned int val; 370 unsigned int val;
371 int error;
372
373 error = pm_autosleep_lock();
374 if (error)
375 return error;
376
377 if (pm_autosleep_state() > PM_SUSPEND_ON) {
378 error = -EBUSY;
379 goto out;
380 }
350 381
382 error = -EINVAL;
351 if (sscanf(buf, "%u", &val) == 1) { 383 if (sscanf(buf, "%u", &val) == 1) {
352 if (pm_save_wakeup_count(val)) 384 if (pm_save_wakeup_count(val))
353 return n; 385 error = n;
354 } 386 }
355 return -EINVAL; 387
388 out:
389 pm_autosleep_unlock();
390 return error;
356} 391}
357 392
358power_attr(wakeup_count); 393power_attr(wakeup_count);
394
395#ifdef CONFIG_PM_AUTOSLEEP
396static ssize_t autosleep_show(struct kobject *kobj,
397 struct kobj_attribute *attr,
398 char *buf)
399{
400 suspend_state_t state = pm_autosleep_state();
401
402 if (state == PM_SUSPEND_ON)
403 return sprintf(buf, "off\n");
404
405#ifdef CONFIG_SUSPEND
406 if (state < PM_SUSPEND_MAX)
407 return sprintf(buf, "%s\n", valid_state(state) ?
408 pm_states[state] : "error");
409#endif
410#ifdef CONFIG_HIBERNATION
411 return sprintf(buf, "disk\n");
412#else
413 return sprintf(buf, "error");
414#endif
415}
416
417static ssize_t autosleep_store(struct kobject *kobj,
418 struct kobj_attribute *attr,
419 const char *buf, size_t n)
420{
421 suspend_state_t state = decode_state(buf, n);
422 int error;
423
424 if (state == PM_SUSPEND_ON
425 && strcmp(buf, "off") && strcmp(buf, "off\n"))
426 return -EINVAL;
427
428 error = pm_autosleep_set_state(state);
429 return error ? error : n;
430}
431
432power_attr(autosleep);
433#endif /* CONFIG_PM_AUTOSLEEP */
434
435#ifdef CONFIG_PM_WAKELOCKS
436static ssize_t wake_lock_show(struct kobject *kobj,
437 struct kobj_attribute *attr,
438 char *buf)
439{
440 return pm_show_wakelocks(buf, true);
441}
442
443static ssize_t wake_lock_store(struct kobject *kobj,
444 struct kobj_attribute *attr,
445 const char *buf, size_t n)
446{
447 int error = pm_wake_lock(buf);
448 return error ? error : n;
449}
450
451power_attr(wake_lock);
452
453static ssize_t wake_unlock_show(struct kobject *kobj,
454 struct kobj_attribute *attr,
455 char *buf)
456{
457 return pm_show_wakelocks(buf, false);
458}
459
460static ssize_t wake_unlock_store(struct kobject *kobj,
461 struct kobj_attribute *attr,
462 const char *buf, size_t n)
463{
464 int error = pm_wake_unlock(buf);
465 return error ? error : n;
466}
467
468power_attr(wake_unlock);
469
470#endif /* CONFIG_PM_WAKELOCKS */
359#endif /* CONFIG_PM_SLEEP */ 471#endif /* CONFIG_PM_SLEEP */
360 472
361#ifdef CONFIG_PM_TRACE 473#ifdef CONFIG_PM_TRACE
@@ -409,6 +521,13 @@ static struct attribute * g[] = {
409#ifdef CONFIG_PM_SLEEP 521#ifdef CONFIG_PM_SLEEP
410 &pm_async_attr.attr, 522 &pm_async_attr.attr,
411 &wakeup_count_attr.attr, 523 &wakeup_count_attr.attr,
524#ifdef CONFIG_PM_AUTOSLEEP
525 &autosleep_attr.attr,
526#endif
527#ifdef CONFIG_PM_WAKELOCKS
528 &wake_lock_attr.attr,
529 &wake_unlock_attr.attr,
530#endif
412#ifdef CONFIG_PM_DEBUG 531#ifdef CONFIG_PM_DEBUG
413 &pm_test_attr.attr, 532 &pm_test_attr.attr,
414#endif 533#endif
@@ -444,7 +563,10 @@ static int __init pm_init(void)
444 power_kobj = kobject_create_and_add("power", NULL); 563 power_kobj = kobject_create_and_add("power", NULL);
445 if (!power_kobj) 564 if (!power_kobj)
446 return -ENOMEM; 565 return -ENOMEM;
447 return sysfs_create_group(power_kobj, &attr_group); 566 error = sysfs_create_group(power_kobj, &attr_group);
567 if (error)
568 return error;
569 return pm_autosleep_init();
448} 570}
449 571
450core_initcall(pm_init); 572core_initcall(pm_init);
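
The state, wakeup_count, autosleep and wake_lock/wake_unlock attributes added above are plain sysfs text files, so they can be driven from user space with ordinary writes. A minimal user-space sketch (not part of the patch; it assumes CONFIG_PM_AUTOSLEEP and CONFIG_PM_WAKELOCKS are enabled and follows the name-plus-optional-nanosecond-timeout format parsed by pm_wake_lock() in kernel/power/wakelock.c further below):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, val, strlen(val));
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	/* Hold a wakeup source named "update" for at most 5 s (timeout in ns). */
	if (write_str("/sys/power/wake_lock", "update 5000000000"))
		perror("wake_lock");

	/* ... do work that must not race with suspend ... */

	if (write_str("/sys/power/wake_unlock", "update"))
		perror("wake_unlock");

	/* Enable opportunistic suspend to memory (write "off" to disable). */
	if (write_str("/sys/power/autosleep", "mem"))
		perror("autosleep");
	return 0;
}

Writes to these files go through pm_autosleep_lock(), so a pending autosleep state and a direct write to /sys/power/state cannot race; that is why state_store() above returns -EBUSY while autosleep is active.
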
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 98f3622d7407..b0bd4beaebfe 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -264,3 +264,30 @@ static inline void suspend_thaw_processes(void)
264{ 264{
265} 265}
266#endif 266#endif
267
268#ifdef CONFIG_PM_AUTOSLEEP
269
270/* kernel/power/autosleep.c */
271extern int pm_autosleep_init(void);
272extern int pm_autosleep_lock(void);
273extern void pm_autosleep_unlock(void);
274extern suspend_state_t pm_autosleep_state(void);
275extern int pm_autosleep_set_state(suspend_state_t state);
276
277#else /* !CONFIG_PM_AUTOSLEEP */
278
279static inline int pm_autosleep_init(void) { return 0; }
280static inline int pm_autosleep_lock(void) { return 0; }
281static inline void pm_autosleep_unlock(void) {}
282static inline suspend_state_t pm_autosleep_state(void) { return PM_SUSPEND_ON; }
283
284#endif /* !CONFIG_PM_AUTOSLEEP */
285
286#ifdef CONFIG_PM_WAKELOCKS
287
288/* kernel/power/wakelock.c */
289extern ssize_t pm_show_wakelocks(char *buf, bool show_active);
290extern int pm_wake_lock(const char *buf);
291extern int pm_wake_unlock(const char *buf);
292
293#endif /* !CONFIG_PM_WAKELOCKS */
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0d2aeb226108..19db29f67558 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -16,6 +16,7 @@
16#include <linux/freezer.h> 16#include <linux/freezer.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/workqueue.h> 18#include <linux/workqueue.h>
19#include <linux/kmod.h>
19 20
20/* 21/*
21 * Timeout for stopping processes 22 * Timeout for stopping processes
@@ -122,6 +123,10 @@ int freeze_processes(void)
122{ 123{
123 int error; 124 int error;
124 125
126 error = __usermodehelper_disable(UMH_FREEZING);
127 if (error)
128 return error;
129
125 if (!pm_freezing) 130 if (!pm_freezing)
126 atomic_inc(&system_freezing_cnt); 131 atomic_inc(&system_freezing_cnt);
127 132
@@ -130,6 +135,7 @@ int freeze_processes(void)
130 error = try_to_freeze_tasks(true); 135 error = try_to_freeze_tasks(true);
131 if (!error) { 136 if (!error) {
132 printk("done."); 137 printk("done.");
138 __usermodehelper_set_disable_depth(UMH_DISABLED);
133 oom_killer_disable(); 139 oom_killer_disable();
134 } 140 }
135 printk("\n"); 141 printk("\n");
@@ -187,6 +193,8 @@ void thaw_processes(void)
187 } while_each_thread(g, p); 193 } while_each_thread(g, p);
188 read_unlock(&tasklist_lock); 194 read_unlock(&tasklist_lock);
189 195
196 usermodehelper_enable();
197
190 schedule(); 198 schedule();
191 printk("done.\n"); 199 printk("done.\n");
192} 200}
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index d6d6dbd1ecc0..6a031e684026 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -230,6 +230,21 @@ int pm_qos_request_active(struct pm_qos_request *req)
230EXPORT_SYMBOL_GPL(pm_qos_request_active); 230EXPORT_SYMBOL_GPL(pm_qos_request_active);
231 231
232/** 232/**
233 * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout
234 * @work: work struct for the delayed work (timeout)
235 *
236 * At timeout, this handler cancels the temporary request by restoring the default value.
237 */
238static void pm_qos_work_fn(struct work_struct *work)
239{
240 struct pm_qos_request *req = container_of(to_delayed_work(work),
241 struct pm_qos_request,
242 work);
243
244 pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE);
245}
246
247/**
233 * pm_qos_add_request - inserts new qos request into the list 248 * pm_qos_add_request - inserts new qos request into the list
234 * @req: pointer to a preallocated handle 249 * @req: pointer to a preallocated handle
235 * @pm_qos_class: identifies which list of qos request to use 250 * @pm_qos_class: identifies which list of qos request to use
@@ -253,6 +268,7 @@ void pm_qos_add_request(struct pm_qos_request *req,
253 return; 268 return;
254 } 269 }
255 req->pm_qos_class = pm_qos_class; 270 req->pm_qos_class = pm_qos_class;
271 INIT_DELAYED_WORK(&req->work, pm_qos_work_fn);
256 pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, 272 pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
257 &req->node, PM_QOS_ADD_REQ, value); 273 &req->node, PM_QOS_ADD_REQ, value);
258} 274}
@@ -279,6 +295,9 @@ void pm_qos_update_request(struct pm_qos_request *req,
279 return; 295 return;
280 } 296 }
281 297
298 if (delayed_work_pending(&req->work))
299 cancel_delayed_work_sync(&req->work);
300
282 if (new_value != req->node.prio) 301 if (new_value != req->node.prio)
283 pm_qos_update_target( 302 pm_qos_update_target(
284 pm_qos_array[req->pm_qos_class]->constraints, 303 pm_qos_array[req->pm_qos_class]->constraints,
@@ -287,6 +306,34 @@ void pm_qos_update_request(struct pm_qos_request *req,
287EXPORT_SYMBOL_GPL(pm_qos_update_request); 306EXPORT_SYMBOL_GPL(pm_qos_update_request);
288 307
289/** 308/**
309 * pm_qos_update_request_timeout - modifies an existing qos request temporarily.
310 * @req : handle to list element holding a pm_qos request to use
311 * @new_value: defines the temporary qos request
312 * @timeout_us: the effective duration of this qos request in usecs.
313 *
314 * After timeout_us, this qos request is cancelled automatically.
315 */
316void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
317 unsigned long timeout_us)
318{
319 if (!req)
320 return;
321 if (WARN(!pm_qos_request_active(req),
322 "%s called for unknown object.", __func__))
323 return;
324
325 if (delayed_work_pending(&req->work))
326 cancel_delayed_work_sync(&req->work);
327
328 if (new_value != req->node.prio)
329 pm_qos_update_target(
330 pm_qos_array[req->pm_qos_class]->constraints,
331 &req->node, PM_QOS_UPDATE_REQ, new_value);
332
333 schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us));
334}
335
336/**
290 * pm_qos_remove_request - modifies an existing qos request 337 * pm_qos_remove_request - modifies an existing qos request
291 * @req: handle to request list element 338 * @req: handle to request list element
292 * 339 *
@@ -305,6 +352,9 @@ void pm_qos_remove_request(struct pm_qos_request *req)
305 return; 352 return;
306 } 353 }
307 354
355 if (delayed_work_pending(&req->work))
356 cancel_delayed_work_sync(&req->work);
357
308 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, 358 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
309 &req->node, PM_QOS_REMOVE_REQ, 359 &req->node, PM_QOS_REMOVE_REQ,
310 PM_QOS_DEFAULT_VALUE); 360 PM_QOS_DEFAULT_VALUE);
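
For context, a hedged sketch of how a driver might use the pm_qos_update_request_timeout() helper introduced above; the function names and the 100 us / 2000 us values are made up for the example and are not part of the patch:

#include <linux/pm_qos.h>

static struct pm_qos_request example_req;

static void example_init(void)
{
	/* Start unconstrained; this also initializes the delayed work above. */
	pm_qos_add_request(&example_req, PM_QOS_CPU_DMA_LATENCY,
			   PM_QOS_DEFAULT_VALUE);
}

static void example_burst(void)
{
	/*
	 * Request <= 100 us CPU DMA latency for the next 2 ms only; when
	 * the timeout fires, pm_qos_work_fn() drops the request back to
	 * PM_QOS_DEFAULT_VALUE without any further action by the driver.
	 */
	pm_qos_update_request_timeout(&example_req, 100, 2000);
}

static void example_exit(void)
{
	pm_qos_remove_request(&example_req);
}

The delayed work is cancelled on every explicit update or removal, so a timed request can always be overridden or withdrawn early.
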
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 88e5c967370d..396d262b8fd0 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -12,7 +12,6 @@
12#include <linux/delay.h> 12#include <linux/delay.h>
13#include <linux/errno.h> 13#include <linux/errno.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/kmod.h>
16#include <linux/console.h> 15#include <linux/console.h>
17#include <linux/cpu.h> 16#include <linux/cpu.h>
18#include <linux/syscalls.h> 17#include <linux/syscalls.h>
@@ -102,17 +101,12 @@ static int suspend_prepare(void)
102 if (error) 101 if (error)
103 goto Finish; 102 goto Finish;
104 103
105 error = usermodehelper_disable();
106 if (error)
107 goto Finish;
108
109 error = suspend_freeze_processes(); 104 error = suspend_freeze_processes();
110 if (!error) 105 if (!error)
111 return 0; 106 return 0;
112 107
113 suspend_stats.failed_freeze++; 108 suspend_stats.failed_freeze++;
114 dpm_save_failed_step(SUSPEND_FREEZE); 109 dpm_save_failed_step(SUSPEND_FREEZE);
115 usermodehelper_enable();
116 Finish: 110 Finish:
117 pm_notifier_call_chain(PM_POST_SUSPEND); 111 pm_notifier_call_chain(PM_POST_SUSPEND);
118 pm_restore_console(); 112 pm_restore_console();
@@ -259,7 +253,6 @@ int suspend_devices_and_enter(suspend_state_t state)
259static void suspend_finish(void) 253static void suspend_finish(void)
260{ 254{
261 suspend_thaw_processes(); 255 suspend_thaw_processes();
262 usermodehelper_enable();
263 pm_notifier_call_chain(PM_POST_SUSPEND); 256 pm_notifier_call_chain(PM_POST_SUSPEND);
264 pm_restore_console(); 257 pm_restore_console();
265} 258}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8742fd013a94..11e22c068e8b 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -6,7 +6,7 @@
6 * 6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> 7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com> 9 * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com>
10 * 10 *
11 * This file is released under the GPLv2. 11 * This file is released under the GPLv2.
12 * 12 *
@@ -51,6 +51,23 @@
51 51
52#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) 52#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
53 53
54/*
55 * Number of free pages that are not in high memory.
56 */
57static inline unsigned long low_free_pages(void)
58{
59 return nr_free_pages() - nr_free_highpages();
60}
61
62/*
63 * Number of pages required to be kept free while writing the image. Always
64 * half of all available low pages before the writing starts.
65 */
66static inline unsigned long reqd_free_pages(void)
67{
68 return low_free_pages() / 2;
69}
70
54struct swap_map_page { 71struct swap_map_page {
55 sector_t entries[MAP_PAGE_ENTRIES]; 72 sector_t entries[MAP_PAGE_ENTRIES];
56 sector_t next_swap; 73 sector_t next_swap;
@@ -72,7 +89,7 @@ struct swap_map_handle {
72 sector_t cur_swap; 89 sector_t cur_swap;
73 sector_t first_sector; 90 sector_t first_sector;
74 unsigned int k; 91 unsigned int k;
75 unsigned long nr_free_pages, written; 92 unsigned long reqd_free_pages;
76 u32 crc32; 93 u32 crc32;
77}; 94};
78 95
@@ -265,14 +282,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
265 return -ENOSPC; 282 return -ENOSPC;
266 283
267 if (bio_chain) { 284 if (bio_chain) {
268 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 285 src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN |
286 __GFP_NORETRY);
269 if (src) { 287 if (src) {
270 copy_page(src, buf); 288 copy_page(src, buf);
271 } else { 289 } else {
272 ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ 290 ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */
273 if (ret) 291 if (ret)
274 return ret; 292 return ret;
275 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 293 src = (void *)__get_free_page(__GFP_WAIT |
294 __GFP_NOWARN |
295 __GFP_NORETRY);
276 if (src) { 296 if (src) {
277 copy_page(src, buf); 297 copy_page(src, buf);
278 } else { 298 } else {
@@ -316,8 +336,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
316 goto err_rel; 336 goto err_rel;
317 } 337 }
318 handle->k = 0; 338 handle->k = 0;
319 handle->nr_free_pages = nr_free_pages() >> 1; 339 handle->reqd_free_pages = reqd_free_pages();
320 handle->written = 0;
321 handle->first_sector = handle->cur_swap; 340 handle->first_sector = handle->cur_swap;
322 return 0; 341 return 0;
323err_rel: 342err_rel:
@@ -351,12 +370,17 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
351 clear_page(handle->cur); 370 clear_page(handle->cur);
352 handle->cur_swap = offset; 371 handle->cur_swap = offset;
353 handle->k = 0; 372 handle->k = 0;
354 } 373
355 if (bio_chain && ++handle->written > handle->nr_free_pages) { 374 if (bio_chain && low_free_pages() <= handle->reqd_free_pages) {
356 error = hib_wait_on_bio_chain(bio_chain); 375 error = hib_wait_on_bio_chain(bio_chain);
357 if (error) 376 if (error)
358 goto out; 377 goto out;
359 handle->written = 0; 378 /*
379 * Recalculate the number of required free pages, to
380 * make sure we never take more than half.
381 */
382 handle->reqd_free_pages = reqd_free_pages();
383 }
360 } 384 }
361 out: 385 out:
362 return error; 386 return error;
@@ -403,8 +427,9 @@ static int swap_writer_finish(struct swap_map_handle *handle,
403/* Maximum number of threads for compression/decompression. */ 427/* Maximum number of threads for compression/decompression. */
404#define LZO_THREADS 3 428#define LZO_THREADS 3
405 429
406/* Maximum number of pages for read buffering. */ 430/* Minimum/maximum number of pages for read buffering. */
407#define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8) 431#define LZO_MIN_RD_PAGES 1024
432#define LZO_MAX_RD_PAGES 8192
408 433
409 434
410/** 435/**
@@ -615,12 +640,6 @@ static int save_image_lzo(struct swap_map_handle *handle,
615 } 640 }
616 641
617 /* 642 /*
618 * Adjust number of free pages after all allocations have been done.
619 * We don't want to run out of pages when writing.
620 */
621 handle->nr_free_pages = nr_free_pages() >> 1;
622
623 /*
624 * Start the CRC32 thread. 643 * Start the CRC32 thread.
625 */ 644 */
626 init_waitqueue_head(&crc->go); 645 init_waitqueue_head(&crc->go);
@@ -641,6 +660,12 @@ static int save_image_lzo(struct swap_map_handle *handle,
641 goto out_clean; 660 goto out_clean;
642 } 661 }
643 662
663 /*
664 * Adjust the number of required free pages after all allocations have
665 * been done. We don't want to run out of pages when writing.
666 */
667 handle->reqd_free_pages = reqd_free_pages();
668
644 printk(KERN_INFO 669 printk(KERN_INFO
645 "PM: Using %u thread(s) for compression.\n" 670 "PM: Using %u thread(s) for compression.\n"
646 "PM: Compressing and saving image data (%u pages) ... ", 671 "PM: Compressing and saving image data (%u pages) ... ",
@@ -1051,7 +1076,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
1051 unsigned i, thr, run_threads, nr_threads; 1076 unsigned i, thr, run_threads, nr_threads;
1052 unsigned ring = 0, pg = 0, ring_size = 0, 1077 unsigned ring = 0, pg = 0, ring_size = 0,
1053 have = 0, want, need, asked = 0; 1078 have = 0, want, need, asked = 0;
1054 unsigned long read_pages; 1079 unsigned long read_pages = 0;
1055 unsigned char **page = NULL; 1080 unsigned char **page = NULL;
1056 struct dec_data *data = NULL; 1081 struct dec_data *data = NULL;
1057 struct crc_data *crc = NULL; 1082 struct crc_data *crc = NULL;
@@ -1063,7 +1088,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
1063 nr_threads = num_online_cpus() - 1; 1088 nr_threads = num_online_cpus() - 1;
1064 nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); 1089 nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
1065 1090
1066 page = vmalloc(sizeof(*page) * LZO_READ_PAGES); 1091 page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES);
1067 if (!page) { 1092 if (!page) {
1068 printk(KERN_ERR "PM: Failed to allocate LZO page\n"); 1093 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
1069 ret = -ENOMEM; 1094 ret = -ENOMEM;
@@ -1128,15 +1153,22 @@ static int load_image_lzo(struct swap_map_handle *handle,
1128 } 1153 }
1129 1154
1130 /* 1155 /*
1131 * Adjust number of pages for read buffering, in case we are short. 1156 * Set the number of pages for read buffering.
1157 * This is complete guesswork, because we'll only know the real
1158 * picture once prepare_image() is called, which is much later on
1159 * during the image load phase. We'll assume the worst case and
1160 * say that none of the image pages are from high memory.
1132 */ 1161 */
1133 read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1; 1162 if (low_free_pages() > snapshot_get_image_size())
1134 read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES); 1163 read_pages = (low_free_pages() - snapshot_get_image_size()) / 2;
1164 read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES);
1135 1165
1136 for (i = 0; i < read_pages; i++) { 1166 for (i = 0; i < read_pages; i++) {
1137 page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? 1167 page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
1138 __GFP_WAIT | __GFP_HIGH : 1168 __GFP_WAIT | __GFP_HIGH :
1139 __GFP_WAIT); 1169 __GFP_WAIT | __GFP_NOWARN |
1170 __GFP_NORETRY);
1171
1140 if (!page[i]) { 1172 if (!page[i]) {
1141 if (i < LZO_CMP_PAGES) { 1173 if (i < LZO_CMP_PAGES) {
1142 ring_size = i; 1174 ring_size = i;
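
A worked example (with hypothetical page counts, not from the patch) of the read-buffer sizing rule above: free lowmem pages minus the image size, halved, then clamped to the new LZO_MIN_RD_PAGES/LZO_MAX_RD_PAGES bounds:

#include <stdio.h>

#define LZO_MIN_RD_PAGES 1024
#define LZO_MAX_RD_PAGES 8192

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	unsigned long low_free = 60000;	/* assumed free non-highmem pages */
	unsigned long image = 50000;	/* assumed snapshot_get_image_size() */
	unsigned long read_pages = 0;

	if (low_free > image)
		read_pages = (low_free - image) / 2;
	read_pages = clamp_ul(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES);
	printf("LZO read buffering: %lu pages\n", read_pages);	/* -> 5000 */
	return 0;
}

The same "never use more than half of the low pages" rule drives the new reqd_free_pages() throttle on the image write side.
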
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 33c4329205af..91b0fd021a95 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -12,7 +12,6 @@
12#include <linux/suspend.h> 12#include <linux/suspend.h>
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/reboot.h> 14#include <linux/reboot.h>
15#include <linux/kmod.h>
16#include <linux/string.h> 15#include <linux/string.h>
17#include <linux/device.h> 16#include <linux/device.h>
18#include <linux/miscdevice.h> 17#include <linux/miscdevice.h>
@@ -222,14 +221,8 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
222 sys_sync(); 221 sys_sync();
223 printk("done.\n"); 222 printk("done.\n");
224 223
225 error = usermodehelper_disable();
226 if (error)
227 break;
228
229 error = freeze_processes(); 224 error = freeze_processes();
230 if (error) 225 if (!error)
231 usermodehelper_enable();
232 else
233 data->frozen = 1; 226 data->frozen = 1;
234 break; 227 break;
235 228
@@ -238,7 +231,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
238 break; 231 break;
239 pm_restore_gfp_mask(); 232 pm_restore_gfp_mask();
240 thaw_processes(); 233 thaw_processes();
241 usermodehelper_enable();
242 data->frozen = 0; 234 data->frozen = 0;
243 break; 235 break;
244 236
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
new file mode 100644
index 000000000000..c8fba3380076
--- /dev/null
+++ b/kernel/power/wakelock.c
@@ -0,0 +1,259 @@
1/*
2 * kernel/power/wakelock.c
3 *
4 * User space wakeup sources support.
5 *
6 * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl>
7 *
8 * This code is based on the analogous interface allowing user space to
9 * manipulate wakelocks on Android.
10 */
11
12#include <linux/ctype.h>
13#include <linux/device.h>
14#include <linux/err.h>
15#include <linux/hrtimer.h>
16#include <linux/list.h>
17#include <linux/rbtree.h>
18#include <linux/slab.h>
19
20static DEFINE_MUTEX(wakelocks_lock);
21
22struct wakelock {
23 char *name;
24 struct rb_node node;
25 struct wakeup_source ws;
26#ifdef CONFIG_PM_WAKELOCKS_GC
27 struct list_head lru;
28#endif
29};
30
31static struct rb_root wakelocks_tree = RB_ROOT;
32
33ssize_t pm_show_wakelocks(char *buf, bool show_active)
34{
35 struct rb_node *node;
36 struct wakelock *wl;
37 char *str = buf;
38 char *end = buf + PAGE_SIZE;
39
40 mutex_lock(&wakelocks_lock);
41
42 for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) {
43 wl = rb_entry(node, struct wakelock, node);
44 if (wl->ws.active == show_active)
45 str += scnprintf(str, end - str, "%s ", wl->name);
46 }
47 if (str > buf)
48 str--;
49
50 str += scnprintf(str, end - str, "\n");
51
52 mutex_unlock(&wakelocks_lock);
53 return (str - buf);
54}
55
56#if CONFIG_PM_WAKELOCKS_LIMIT > 0
57static unsigned int number_of_wakelocks;
58
59static inline bool wakelocks_limit_exceeded(void)
60{
61 return number_of_wakelocks > CONFIG_PM_WAKELOCKS_LIMIT;
62}
63
64static inline void increment_wakelocks_number(void)
65{
66 number_of_wakelocks++;
67}
68
69static inline void decrement_wakelocks_number(void)
70{
71 number_of_wakelocks--;
72}
73#else /* CONFIG_PM_WAKELOCKS_LIMIT = 0 */
74static inline bool wakelocks_limit_exceeded(void) { return false; }
75static inline void increment_wakelocks_number(void) {}
76static inline void decrement_wakelocks_number(void) {}
77#endif /* CONFIG_PM_WAKELOCKS_LIMIT */
78
79#ifdef CONFIG_PM_WAKELOCKS_GC
80#define WL_GC_COUNT_MAX 100
81#define WL_GC_TIME_SEC 300
82
83static LIST_HEAD(wakelocks_lru_list);
84static unsigned int wakelocks_gc_count;
85
86static inline void wakelocks_lru_add(struct wakelock *wl)
87{
88 list_add(&wl->lru, &wakelocks_lru_list);
89}
90
91static inline void wakelocks_lru_most_recent(struct wakelock *wl)
92{
93 list_move(&wl->lru, &wakelocks_lru_list);
94}
95
96static void wakelocks_gc(void)
97{
98 struct wakelock *wl, *aux;
99 ktime_t now;
100
101 if (++wakelocks_gc_count <= WL_GC_COUNT_MAX)
102 return;
103
104 now = ktime_get();
105 list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) {
106 u64 idle_time_ns;
107 bool active;
108
109 spin_lock_irq(&wl->ws.lock);
110 idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time));
111 active = wl->ws.active;
112 spin_unlock_irq(&wl->ws.lock);
113
114 if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC))
115 break;
116
117 if (!active) {
118 wakeup_source_remove(&wl->ws);
119 rb_erase(&wl->node, &wakelocks_tree);
120 list_del(&wl->lru);
121 kfree(wl->name);
122 kfree(wl);
123 decrement_wakelocks_number();
124 }
125 }
126 wakelocks_gc_count = 0;
127}
128#else /* !CONFIG_PM_WAKELOCKS_GC */
129static inline void wakelocks_lru_add(struct wakelock *wl) {}
130static inline void wakelocks_lru_most_recent(struct wakelock *wl) {}
131static inline void wakelocks_gc(void) {}
132#endif /* !CONFIG_PM_WAKELOCKS_GC */
133
134static struct wakelock *wakelock_lookup_add(const char *name, size_t len,
135 bool add_if_not_found)
136{
137 struct rb_node **node = &wakelocks_tree.rb_node;
138 struct rb_node *parent = *node;
139 struct wakelock *wl;
140
141 while (*node) {
142 int diff;
143
144 parent = *node;
145 wl = rb_entry(*node, struct wakelock, node);
146 diff = strncmp(name, wl->name, len);
147 if (diff == 0) {
148 if (wl->name[len])
149 diff = -1;
150 else
151 return wl;
152 }
153 if (diff < 0)
154 node = &(*node)->rb_left;
155 else
156 node = &(*node)->rb_right;
157 }
158 if (!add_if_not_found)
159 return ERR_PTR(-EINVAL);
160
161 if (wakelocks_limit_exceeded())
162 return ERR_PTR(-ENOSPC);
163
164 /* Not found, we have to add a new one. */
165 wl = kzalloc(sizeof(*wl), GFP_KERNEL);
166 if (!wl)
167 return ERR_PTR(-ENOMEM);
168
169 wl->name = kstrndup(name, len, GFP_KERNEL);
170 if (!wl->name) {
171 kfree(wl);
172 return ERR_PTR(-ENOMEM);
173 }
174 wl->ws.name = wl->name;
175 wakeup_source_add(&wl->ws);
176 rb_link_node(&wl->node, parent, node);
177 rb_insert_color(&wl->node, &wakelocks_tree);
178 wakelocks_lru_add(wl);
179 increment_wakelocks_number();
180 return wl;
181}
182
183int pm_wake_lock(const char *buf)
184{
185 const char *str = buf;
186 struct wakelock *wl;
187 u64 timeout_ns = 0;
188 size_t len;
189 int ret = 0;
190
191 while (*str && !isspace(*str))
192 str++;
193
194 len = str - buf;
195 if (!len)
196 return -EINVAL;
197
198 if (*str && *str != '\n') {
199 /* Find out if there's a valid timeout string appended. */
200 ret = kstrtou64(skip_spaces(str), 10, &timeout_ns);
201 if (ret)
202 return -EINVAL;
203 }
204
205 mutex_lock(&wakelocks_lock);
206
207 wl = wakelock_lookup_add(buf, len, true);
208 if (IS_ERR(wl)) {
209 ret = PTR_ERR(wl);
210 goto out;
211 }
212 if (timeout_ns) {
213 u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1;
214
215 do_div(timeout_ms, NSEC_PER_MSEC);
216 __pm_wakeup_event(&wl->ws, timeout_ms);
217 } else {
218 __pm_stay_awake(&wl->ws);
219 }
220
221 wakelocks_lru_most_recent(wl);
222
223 out:
224 mutex_unlock(&wakelocks_lock);
225 return ret;
226}
227
228int pm_wake_unlock(const char *buf)
229{
230 struct wakelock *wl;
231 size_t len;
232 int ret = 0;
233
234 len = strlen(buf);
235 if (!len)
236 return -EINVAL;
237
238 if (buf[len-1] == '\n')
239 len--;
240
241 if (!len)
242 return -EINVAL;
243
244 mutex_lock(&wakelocks_lock);
245
246 wl = wakelock_lookup_add(buf, len, false);
247 if (IS_ERR(wl)) {
248 ret = PTR_ERR(wl);
249 goto out;
250 }
251 __pm_relax(&wl->ws);
252
253 wakelocks_lru_most_recent(wl);
254 wakelocks_gc();
255
256 out:
257 mutex_unlock(&wakelocks_lock);
258 return ret;
259}
diff --git a/kernel/printk.c b/kernel/printk.c
index b663c2c95d39..32462d2b364a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -41,6 +41,7 @@
41#include <linux/cpu.h> 41#include <linux/cpu.h>
42#include <linux/notifier.h> 42#include <linux/notifier.h>
43#include <linux/rculist.h> 43#include <linux/rculist.h>
44#include <linux/poll.h>
44 45
45#include <asm/uaccess.h> 46#include <asm/uaccess.h>
46 47
@@ -54,8 +55,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
54{ 55{
55} 56}
56 57
57#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
58
59/* printk's without a loglevel use this.. */ 58/* printk's without a loglevel use this.. */
60#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL 59#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
61 60
@@ -99,24 +98,6 @@ EXPORT_SYMBOL_GPL(console_drivers);
99static int console_locked, console_suspended; 98static int console_locked, console_suspended;
100 99
101/* 100/*
102 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
103 * It is also used in interesting ways to provide interlocking in
104 * console_unlock();.
105 */
106static DEFINE_RAW_SPINLOCK(logbuf_lock);
107
108#define LOG_BUF_MASK (log_buf_len-1)
109#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
110
111/*
112 * The indices into log_buf are not constrained to log_buf_len - they
113 * must be masked before subscripting
114 */
115static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */
116static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */
117static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
118
119/*
120 * If exclusive_console is non-NULL then only this console is to be printed to. 101 * If exclusive_console is non-NULL then only this console is to be printed to.
121 */ 102 */
122static struct console *exclusive_console; 103static struct console *exclusive_console;
@@ -145,13 +126,491 @@ EXPORT_SYMBOL(console_set_on_cmdline);
145/* Flag: console code may call schedule() */ 126/* Flag: console code may call schedule() */
146static int console_may_schedule; 127static int console_may_schedule;
147 128
129/*
130 * The printk log buffer consists of a chain of concatenated variable
131 * length records. Every record starts with a record header, containing
132 * the overall length of the record.
133 *
134 * The heads to the first and last entry in the buffer, as well as the
135 * sequence numbers of both of these entries, are maintained when messages
136 * are stored.
137 *
138 * If the heads indicate available messages, the length in the header
139 * tells where the next message starts. A length == 0 for the next message
140 * indicates a wrap-around to the beginning of the buffer.
141 *
142 * Every record carries the monotonic timestamp in nanoseconds, as well as
143 * the standard userspace syslog level and syslog facility. The usual
144 * kernel messages use LOG_KERN; userspace-injected messages always carry
145 * a matching syslog facility, by default LOG_USER. The origin of every
146 * message can be reliably determined that way.
147 *
148 * The human readable log message directly follows the message header. The
149 * length of the message text is stored in the header, the stored message
150 * is not terminated.
151 *
152 * Optionally, a message can carry a dictionary of properties (key/value pairs),
153 * to provide userspace with a machine-readable message context.
154 *
155 * Examples for well-defined, commonly used property names are:
156 * DEVICE=b12:8 device identifier
157 * b12:8 block dev_t
158 * c127:3 char dev_t
159 * n8 netdev ifindex
160 * +sound:card0 subsystem:devname
161 * SUBSYSTEM=pci driver-core subsystem name
162 *
163 * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value
164 * follows directly after a '=' character. Every property is terminated by
165 * a '\0' character. The last property is not terminated.
166 *
167 * Example of a message structure:
168 * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec
169 * 0008 34 00 record is 52 bytes long
170 * 000a 0b 00 text is 11 bytes long
171 * 000c 1f 00 dictionary is 23 bytes long
172 * 000e 03 00 LOG_KERN (facility) LOG_ERR (level)
173 * 0010 69 74 27 73 20 61 20 6c "it's a l"
174 * 69 6e 65 "ine"
175 * 001b 44 45 56 49 43 "DEVIC"
176 * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D"
177 * 52 49 56 45 52 3d 62 75 "RIVER=bu"
178 * 67 "g"
179 * 0032 00 00 00 padding to next message header
180 *
181 * The 'struct log' buffer header must never be directly exported to
182 * userspace; it is a kernel-private implementation detail that might
183 * need to be changed in the future, when the requirements change.
184 *
185 * /dev/kmsg exports the structured data in the following line format:
186 * "level,seqnum,timestamp;<message text>\n"
187 *
188 * The optional key/value pairs are attached as continuation lines starting
189 * with a space character and terminated by a newline. All possible
190 * non-printable characters are escaped in the "\xff" notation.
191 *
192 * Users of the export format should ignore possible additional values
193 * separated by ',', and find the message after the ';' character.
194 */
195
196struct log {
197 u64 ts_nsec; /* timestamp in nanoseconds */
198 u16 len; /* length of entire record */
199 u16 text_len; /* length of text buffer */
200 u16 dict_len; /* length of dictionary buffer */
201 u16 level; /* syslog level + facility */
202};
203
204/*
205 * The logbuf_lock protects the kmsg buffer, indices and counters. It is
206 * also used in interesting ways to provide interlocking in console_unlock().
207 */
208static DEFINE_RAW_SPINLOCK(logbuf_lock);
209
210/* the next printk record to read by syslog(READ) or /proc/kmsg */
211static u64 syslog_seq;
212static u32 syslog_idx;
213
214/* index and sequence number of the first record stored in the buffer */
215static u64 log_first_seq;
216static u32 log_first_idx;
217
218/* index and sequence number of the next record to store in the buffer */
219static u64 log_next_seq;
148#ifdef CONFIG_PRINTK 220#ifdef CONFIG_PRINTK
221static u32 log_next_idx;
222
223/* the next printk record to read after the last 'clear' command */
224static u64 clear_seq;
225static u32 clear_idx;
226
227#define LOG_LINE_MAX 1024
149 228
150static char __log_buf[__LOG_BUF_LEN]; 229/* record buffer */
230#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
231#define LOG_ALIGN 4
232#else
233#define LOG_ALIGN 8
234#endif
235#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
236static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
151static char *log_buf = __log_buf; 237static char *log_buf = __log_buf;
152static int log_buf_len = __LOG_BUF_LEN; 238static u32 log_buf_len = __LOG_BUF_LEN;
153static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ 239
154static int saved_console_loglevel = -1; 240/* cpu currently holding logbuf_lock */
241static volatile unsigned int logbuf_cpu = UINT_MAX;
242
243/* human readable text of the record */
244static char *log_text(const struct log *msg)
245{
246 return (char *)msg + sizeof(struct log);
247}
248
249/* optional key/value pair dictionary attached to the record */
250static char *log_dict(const struct log *msg)
251{
252 return (char *)msg + sizeof(struct log) + msg->text_len;
253}
254
255/* get record by index; idx must point to valid msg */
256static struct log *log_from_idx(u32 idx)
257{
258 struct log *msg = (struct log *)(log_buf + idx);
259
260 /*
261 * A length == 0 record is the end of buffer marker. Wrap around and
262 * read the message at the start of the buffer.
263 */
264 if (!msg->len)
265 return (struct log *)log_buf;
266 return msg;
267}
268
269/* get next record; idx must point to valid msg */
270static u32 log_next(u32 idx)
271{
272 struct log *msg = (struct log *)(log_buf + idx);
273
274 /* length == 0 indicates the end of the buffer; wrap */
275 /*
276 * A length == 0 record is the end of buffer marker. Wrap around and
277 * read the message at the start of the buffer as *this* one, and
278 * return the one after that.
279 */
280 if (!msg->len) {
281 msg = (struct log *)log_buf;
282 return msg->len;
283 }
284 return idx + msg->len;
285}
286
287/* insert record into the buffer, discard old ones, update heads */
288static void log_store(int facility, int level,
289 const char *dict, u16 dict_len,
290 const char *text, u16 text_len)
291{
292 struct log *msg;
293 u32 size, pad_len;
294
295 /* number of '\0' padding bytes to next message */
296 size = sizeof(struct log) + text_len + dict_len;
297 pad_len = (-size) & (LOG_ALIGN - 1);
298 size += pad_len;
299
300 while (log_first_seq < log_next_seq) {
301 u32 free;
302
303 if (log_next_idx > log_first_idx)
304 free = max(log_buf_len - log_next_idx, log_first_idx);
305 else
306 free = log_first_idx - log_next_idx;
307
308 if (free > size + sizeof(struct log))
309 break;
310
311 /* drop old messages until we have enough contiguous space */
312 log_first_idx = log_next(log_first_idx);
313 log_first_seq++;
314 }
315
316 if (log_next_idx + size + sizeof(struct log) >= log_buf_len) {
317 /*
318 * This message + an additional empty header does not fit
319 * at the end of the buffer. Add an empty header with len == 0
320 * to signify a wrap around.
321 */
322 memset(log_buf + log_next_idx, 0, sizeof(struct log));
323 log_next_idx = 0;
324 }
325
326 /* fill message */
327 msg = (struct log *)(log_buf + log_next_idx);
328 memcpy(log_text(msg), text, text_len);
329 msg->text_len = text_len;
330 memcpy(log_dict(msg), dict, dict_len);
331 msg->dict_len = dict_len;
332 msg->level = (facility << 3) | (level & 7);
333 msg->ts_nsec = local_clock();
334 memset(log_dict(msg) + dict_len, 0, pad_len);
335 msg->len = sizeof(struct log) + text_len + dict_len + pad_len;
336
337 /* insert message */
338 log_next_idx += msg->len;
339 log_next_seq++;
340}
341
342/* /dev/kmsg - userspace message inject/listen interface */
343struct devkmsg_user {
344 u64 seq;
345 u32 idx;
346 struct mutex lock;
347 char buf[8192];
348};
349
350static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
351 unsigned long count, loff_t pos)
352{
353 char *buf, *line;
354 int i;
355 int level = default_message_loglevel;
356 int facility = 1; /* LOG_USER */
357 size_t len = iov_length(iv, count);
358 ssize_t ret = len;
359
360 if (len > LOG_LINE_MAX)
361 return -EINVAL;
362 buf = kmalloc(len+1, GFP_KERNEL);
363 if (buf == NULL)
364 return -ENOMEM;
365
366 line = buf;
367 for (i = 0; i < count; i++) {
368 if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len))
369 goto out;
370 line += iv[i].iov_len;
371 }
372
373 /*
374 * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace
375 * the decimal value represents 32bit, the lower 3 bit are the log
376 * level, the rest are the log facility.
377 *
378 * If no prefix or no userspace facility is specified, we
379 * enforce LOG_USER, to be able to reliably distinguish
380 * kernel-generated messages from userspace-injected ones.
381 */
382 line = buf;
383 if (line[0] == '<') {
384 char *endp = NULL;
385
386 i = simple_strtoul(line+1, &endp, 10);
387 if (endp && endp[0] == '>') {
388 level = i & 7;
389 if (i >> 3)
390 facility = i >> 3;
391 endp++;
392 len -= endp - line;
393 line = endp;
394 }
395 }
396 line[len] = '\0';
397
398 printk_emit(facility, level, NULL, 0, "%s", line);
399out:
400 kfree(buf);
401 return ret;
402}
403
404static ssize_t devkmsg_read(struct file *file, char __user *buf,
405 size_t count, loff_t *ppos)
406{
407 struct devkmsg_user *user = file->private_data;
408 struct log *msg;
409 u64 ts_usec;
410 size_t i;
411 size_t len;
412 ssize_t ret;
413
414 if (!user)
415 return -EBADF;
416
417 mutex_lock(&user->lock);
418 raw_spin_lock(&logbuf_lock);
419 while (user->seq == log_next_seq) {
420 if (file->f_flags & O_NONBLOCK) {
421 ret = -EAGAIN;
422 raw_spin_unlock(&logbuf_lock);
423 goto out;
424 }
425
426 raw_spin_unlock(&logbuf_lock);
427 ret = wait_event_interruptible(log_wait,
428 user->seq != log_next_seq);
429 if (ret)
430 goto out;
431 raw_spin_lock(&logbuf_lock);
432 }
433
434 if (user->seq < log_first_seq) {
435 /* our last seen message is gone, return error and reset */
436 user->idx = log_first_idx;
437 user->seq = log_first_seq;
438 ret = -EPIPE;
439 raw_spin_unlock(&logbuf_lock);
440 goto out;
441 }
442
443 msg = log_from_idx(user->idx);
444 ts_usec = msg->ts_nsec;
445 do_div(ts_usec, 1000);
446 len = sprintf(user->buf, "%u,%llu,%llu;",
447 msg->level, user->seq, ts_usec);
448
449 /* escape non-printable characters */
450 for (i = 0; i < msg->text_len; i++) {
451 unsigned char c = log_text(msg)[i];
452
453 if (c < ' ' || c >= 128)
454 len += sprintf(user->buf + len, "\\x%02x", c);
455 else
456 user->buf[len++] = c;
457 }
458 user->buf[len++] = '\n';
459
460 if (msg->dict_len) {
461 bool line = true;
462
463 for (i = 0; i < msg->dict_len; i++) {
464 unsigned char c = log_dict(msg)[i];
465
466 if (line) {
467 user->buf[len++] = ' ';
468 line = false;
469 }
470
471 if (c == '\0') {
472 user->buf[len++] = '\n';
473 line = true;
474 continue;
475 }
476
477 if (c < ' ' || c >= 128) {
478 len += sprintf(user->buf + len, "\\x%02x", c);
479 continue;
480 }
481
482 user->buf[len++] = c;
483 }
484 user->buf[len++] = '\n';
485 }
486
487 user->idx = log_next(user->idx);
488 user->seq++;
489 raw_spin_unlock(&logbuf_lock);
490
491 if (len > count) {
492 ret = -EINVAL;
493 goto out;
494 }
495
496 if (copy_to_user(buf, user->buf, len)) {
497 ret = -EFAULT;
498 goto out;
499 }
500 ret = len;
501out:
502 mutex_unlock(&user->lock);
503 return ret;
504}
505
506static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
507{
508 struct devkmsg_user *user = file->private_data;
509 loff_t ret = 0;
510
511 if (!user)
512 return -EBADF;
513 if (offset)
514 return -ESPIPE;
515
516 raw_spin_lock(&logbuf_lock);
517 switch (whence) {
518 case SEEK_SET:
519 /* the first record */
520 user->idx = log_first_idx;
521 user->seq = log_first_seq;
522 break;
523 case SEEK_DATA:
524 /*
525 * The first record after the last SYSLOG_ACTION_CLEAR,
526 * like issued by 'dmesg -c'. Reading /dev/kmsg itself
527 * changes no global state, and does not clear anything.
528 */
529 user->idx = clear_idx;
530 user->seq = clear_seq;
531 break;
532 case SEEK_END:
533 /* after the last record */
534 user->idx = log_next_idx;
535 user->seq = log_next_seq;
536 break;
537 default:
538 ret = -EINVAL;
539 }
540 raw_spin_unlock(&logbuf_lock);
541 return ret;
542}
543
544static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
545{
546 struct devkmsg_user *user = file->private_data;
547 int ret = 0;
548
549 if (!user)
550 return POLLERR|POLLNVAL;
551
552 poll_wait(file, &log_wait, wait);
553
554 raw_spin_lock(&logbuf_lock);
555 if (user->seq < log_next_seq) {
556 /* return error when data has vanished underneath us */
557 if (user->seq < log_first_seq)
558 ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
559 else ret = POLLIN|POLLRDNORM;
560 }
561 raw_spin_unlock(&logbuf_lock);
562
563 return ret;
564}
565
566static int devkmsg_open(struct inode *inode, struct file *file)
567{
568 struct devkmsg_user *user;
569 int err;
570
571 /* write-only does not need any file context */
572 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
573 return 0;
574
575 err = security_syslog(SYSLOG_ACTION_READ_ALL);
576 if (err)
577 return err;
578
579 user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
580 if (!user)
581 return -ENOMEM;
582
583 mutex_init(&user->lock);
584
585 raw_spin_lock(&logbuf_lock);
586 user->idx = log_first_idx;
587 user->seq = log_first_seq;
588 raw_spin_unlock(&logbuf_lock);
589
590 file->private_data = user;
591 return 0;
592}
593
594static int devkmsg_release(struct inode *inode, struct file *file)
595{
596 struct devkmsg_user *user = file->private_data;
597
598 if (!user)
599 return 0;
600
601 mutex_destroy(&user->lock);
602 kfree(user);
603 return 0;
604}
605
606const struct file_operations kmsg_fops = {
607 .open = devkmsg_open,
608 .read = devkmsg_read,
609 .aio_write = devkmsg_writev,
610 .llseek = devkmsg_llseek,
611 .poll = devkmsg_poll,
612 .release = devkmsg_release,
613};
155 614
156#ifdef CONFIG_KEXEC 615#ifdef CONFIG_KEXEC
157/* 616/*
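
The devkmsg_read() path above emits one record per read() in the documented "level,seqnum,timestamp;<text>" form, with any key/value pairs attached as space-indented continuation lines. A hedged user-space sketch of a reader (not part of the patch; it only interprets the fields before the ';', as the format comment recommends):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[8192];
	int fd = open("/dev/kmsg", O_RDONLY | O_NONBLOCK);
	ssize_t len;

	if (fd < 0) {
		perror("open /dev/kmsg");
		return 1;
	}
	/* with O_NONBLOCK, read() fails with EAGAIN once the buffer is drained */
	while ((len = read(fd, buf, sizeof(buf) - 1)) > 0) {
		unsigned int level;
		unsigned long long seq, ts_usec;
		char *text;

		buf[len] = '\0';
		text = strchr(buf, ';');
		if (!text || sscanf(buf, "%u,%llu,%llu", &level, &seq, &ts_usec) != 3)
			continue;
		/* text includes the trailing newline and any dictionary lines */
		printf("seq=%llu prio=%u ts=%llu.%06llus: %s",
		       seq, level & 7, ts_usec / 1000000, ts_usec % 1000000,
		       text + 1);
	}
	close(fd);
	return 0;
}
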
@@ -165,9 +624,9 @@ static int saved_console_loglevel = -1;
165void log_buf_kexec_setup(void) 624void log_buf_kexec_setup(void)
166{ 625{
167 VMCOREINFO_SYMBOL(log_buf); 626 VMCOREINFO_SYMBOL(log_buf);
168 VMCOREINFO_SYMBOL(log_end);
169 VMCOREINFO_SYMBOL(log_buf_len); 627 VMCOREINFO_SYMBOL(log_buf_len);
170 VMCOREINFO_SYMBOL(logged_chars); 628 VMCOREINFO_SYMBOL(log_first_idx);
629 VMCOREINFO_SYMBOL(log_next_idx);
171} 630}
172#endif 631#endif
173 632
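
With log_end and logged_chars gone, the VMCOREINFO entries above now describe a ring of variable-length records. Illustration only (not part of the patch): how a dump tool could walk a saved copy of log_buf using log_first_idx/log_next_idx; the struct mirrors the kernel-private 'struct log' introduced above and may change, and main() just fabricates a two-record buffer to exercise the walk:

#include <stdio.h>
#include <string.h>

struct dump_log {			/* mirrors 'struct log' above */
	unsigned long long ts_nsec;
	unsigned short len;		/* length of the entire record */
	unsigned short text_len;	/* length of the text */
	unsigned short dict_len;
	unsigned short level;
};

static void dump_walk(const char *log_buf, unsigned int first_idx,
		      unsigned int next_idx)
{
	unsigned int idx = first_idx;

	while (idx != next_idx) {
		const struct dump_log *msg =
			(const struct dump_log *)(log_buf + idx);

		if (!msg->len) {	/* len == 0 marks a wrap-around */
			idx = 0;
			continue;
		}
		/* the text follows the header and is not NUL-terminated */
		printf("%.*s\n", msg->text_len, log_buf + idx + sizeof(*msg));
		idx += msg->len;
	}
}

static unsigned int add_record(char *buf, unsigned int idx, const char *text)
{
	struct dump_log hdr = { 0 };

	hdr.text_len = strlen(text);
	hdr.len = (sizeof(hdr) + hdr.text_len + 3) & ~3u;	/* LOG_ALIGN == 4 */
	memcpy(buf + idx, &hdr, sizeof(hdr));
	memcpy(buf + idx + sizeof(hdr), text, hdr.text_len);
	return idx + hdr.len;
}

int main(void)
{
	char buf[256] = { 0 };
	unsigned int next = 0;

	next = add_record(buf, next, "first record");
	next = add_record(buf, next, "second record");
	dump_walk(buf, 0, next);	/* prints both records */
	return 0;
}
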
@@ -191,7 +650,6 @@ early_param("log_buf_len", log_buf_len_setup);
191void __init setup_log_buf(int early) 650void __init setup_log_buf(int early)
192{ 651{
193 unsigned long flags; 652 unsigned long flags;
194 unsigned start, dest_idx, offset;
195 char *new_log_buf; 653 char *new_log_buf;
196 int free; 654 int free;
197 655
@@ -219,20 +677,8 @@ void __init setup_log_buf(int early)
219 log_buf_len = new_log_buf_len; 677 log_buf_len = new_log_buf_len;
220 log_buf = new_log_buf; 678 log_buf = new_log_buf;
221 new_log_buf_len = 0; 679 new_log_buf_len = 0;
222 free = __LOG_BUF_LEN - log_end; 680 free = __LOG_BUF_LEN - log_next_idx;
223 681 memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
224 offset = start = min(con_start, log_start);
225 dest_idx = 0;
226 while (start != log_end) {
227 unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1);
228
229 log_buf[dest_idx] = __log_buf[log_idx_mask];
230 start++;
231 dest_idx++;
232 }
233 log_start -= offset;
234 con_start -= offset;
235 log_end -= offset;
236 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 682 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
237 683
238 pr_info("log_buf_len: %d\n", log_buf_len); 684 pr_info("log_buf_len: %d\n", log_buf_len);
@@ -332,11 +778,202 @@ static int check_syslog_permissions(int type, bool from_file)
332 return 0; 778 return 0;
333} 779}
334 780
781#if defined(CONFIG_PRINTK_TIME)
782static bool printk_time = 1;
783#else
784static bool printk_time;
785#endif
786module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
787
788static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
789{
790 size_t len = 0;
791
792 if (syslog) {
793 if (buf) {
794 len += sprintf(buf, "<%u>", msg->level);
795 } else {
796 len += 3;
797 if (msg->level > 9)
798 len++;
799 if (msg->level > 99)
800 len++;
801 }
802 }
803
804 if (printk_time) {
805 if (buf) {
806 unsigned long long ts = msg->ts_nsec;
807 unsigned long rem_nsec = do_div(ts, 1000000000);
808
809 len += sprintf(buf + len, "[%5lu.%06lu] ",
810 (unsigned long) ts, rem_nsec / 1000);
811 } else {
812 len += 15;
813 }
814 }
815
816 return len;
817}
818
819static size_t msg_print_text(const struct log *msg, bool syslog,
820 char *buf, size_t size)
821{
822 const char *text = log_text(msg);
823 size_t text_size = msg->text_len;
824 size_t len = 0;
825
826 do {
827 const char *next = memchr(text, '\n', text_size);
828 size_t text_len;
829
830 if (next) {
831 text_len = next - text;
832 next++;
833 text_size -= next - text;
834 } else {
835 text_len = text_size;
836 }
837
838 if (buf) {
839 if (print_prefix(msg, syslog, NULL) +
840 text_len + 1>= size - len)
841 break;
842
843 len += print_prefix(msg, syslog, buf + len);
844 memcpy(buf + len, text, text_len);
845 len += text_len;
846 buf[len++] = '\n';
847 } else {
848 /* SYSLOG_ACTION_* buffer size only calculation */
849 len += print_prefix(msg, syslog, NULL);
850 len += text_len + 1;
851 }
852
853 text = next;
854 } while (text);
855
856 return len;
857}
858
859static int syslog_print(char __user *buf, int size)
860{
861 char *text;
862 struct log *msg;
863 int len;
864
865 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
866 if (!text)
867 return -ENOMEM;
868
869 raw_spin_lock_irq(&logbuf_lock);
870 if (syslog_seq < log_first_seq) {
871 /* messages are gone, move to first one */
872 syslog_seq = log_first_seq;
873 syslog_idx = log_first_idx;
874 }
875 msg = log_from_idx(syslog_idx);
876 len = msg_print_text(msg, true, text, LOG_LINE_MAX);
877 syslog_idx = log_next(syslog_idx);
878 syslog_seq++;
879 raw_spin_unlock_irq(&logbuf_lock);
880
881 if (len > 0 && copy_to_user(buf, text, len))
882 len = -EFAULT;
883
884 kfree(text);
885 return len;
886}
887
888static int syslog_print_all(char __user *buf, int size, bool clear)
889{
890 char *text;
891 int len = 0;
892
893 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
894 if (!text)
895 return -ENOMEM;
896
897 raw_spin_lock_irq(&logbuf_lock);
898 if (buf) {
899 u64 next_seq;
900 u64 seq;
901 u32 idx;
902
903 if (clear_seq < log_first_seq) {
904 /* messages are gone, move to first available one */
905 clear_seq = log_first_seq;
906 clear_idx = log_first_idx;
907 }
908
909 /*
910 * Find first record that fits, including all following records,
911 * into the user-provided buffer for this dump.
912 */
913 seq = clear_seq;
914 idx = clear_idx;
915 while (seq < log_next_seq) {
916 struct log *msg = log_from_idx(idx);
917
918 len += msg_print_text(msg, true, NULL, 0);
919 idx = log_next(idx);
920 seq++;
921 }
922 seq = clear_seq;
923 idx = clear_idx;
924 while (len > size && seq < log_next_seq) {
925 struct log *msg = log_from_idx(idx);
926
927 len -= msg_print_text(msg, true, NULL, 0);
928 idx = log_next(idx);
929 seq++;
930 }
931
932 /* last message in this dump */
933 next_seq = log_next_seq;
934
935 len = 0;
936 while (len >= 0 && seq < next_seq) {
937 struct log *msg = log_from_idx(idx);
938 int textlen;
939
940 textlen = msg_print_text(msg, true, text, LOG_LINE_MAX);
941 if (textlen < 0) {
942 len = textlen;
943 break;
944 }
945 idx = log_next(idx);
946 seq++;
947
948 raw_spin_unlock_irq(&logbuf_lock);
949 if (copy_to_user(buf + len, text, textlen))
950 len = -EFAULT;
951 else
952 len += textlen;
953 raw_spin_lock_irq(&logbuf_lock);
954
955 if (seq < log_first_seq) {
956 /* messages are gone, move to next one */
957 seq = log_first_seq;
958 idx = log_first_idx;
959 }
960 }
961 }
962
963 if (clear) {
964 clear_seq = log_next_seq;
965 clear_idx = log_next_idx;
966 }
967 raw_spin_unlock_irq(&logbuf_lock);
968
969 kfree(text);
970 return len;
971}
972
335int do_syslog(int type, char __user *buf, int len, bool from_file) 973int do_syslog(int type, char __user *buf, int len, bool from_file)
336{ 974{
337 unsigned i, j, limit, count; 975 bool clear = false;
338 int do_clear = 0; 976 static int saved_console_loglevel = -1;
339 char c;
340 int error; 977 int error;
341 978
342 error = check_syslog_permissions(type, from_file); 979 error = check_syslog_permissions(type, from_file);
@@ -364,28 +1001,14 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
364 goto out; 1001 goto out;
365 } 1002 }
366 error = wait_event_interruptible(log_wait, 1003 error = wait_event_interruptible(log_wait,
367 (log_start - log_end)); 1004 syslog_seq != log_next_seq);
368 if (error) 1005 if (error)
369 goto out; 1006 goto out;
370 i = 0; 1007 error = syslog_print(buf, len);
371 raw_spin_lock_irq(&logbuf_lock);
372 while (!error && (log_start != log_end) && i < len) {
373 c = LOG_BUF(log_start);
374 log_start++;
375 raw_spin_unlock_irq(&logbuf_lock);
376 error = __put_user(c,buf);
377 buf++;
378 i++;
379 cond_resched();
380 raw_spin_lock_irq(&logbuf_lock);
381 }
382 raw_spin_unlock_irq(&logbuf_lock);
383 if (!error)
384 error = i;
385 break; 1008 break;
386 /* Read/clear last kernel messages */ 1009 /* Read/clear last kernel messages */
387 case SYSLOG_ACTION_READ_CLEAR: 1010 case SYSLOG_ACTION_READ_CLEAR:
388 do_clear = 1; 1011 clear = true;
389 /* FALL THRU */ 1012 /* FALL THRU */
390 /* Read last kernel messages */ 1013 /* Read last kernel messages */
391 case SYSLOG_ACTION_READ_ALL: 1014 case SYSLOG_ACTION_READ_ALL:
@@ -399,52 +1022,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
399 error = -EFAULT; 1022 error = -EFAULT;
400 goto out; 1023 goto out;
401 } 1024 }
402 count = len; 1025 error = syslog_print_all(buf, len, clear);
403 if (count > log_buf_len)
404 count = log_buf_len;
405 raw_spin_lock_irq(&logbuf_lock);
406 if (count > logged_chars)
407 count = logged_chars;
408 if (do_clear)
409 logged_chars = 0;
410 limit = log_end;
411 /*
412 * __put_user() could sleep, and while we sleep
413 * printk() could overwrite the messages
414 * we try to copy to user space. Therefore
415 * the messages are copied in reverse. <manfreds>
416 */
417 for (i = 0; i < count && !error; i++) {
418 j = limit-1-i;
419 if (j + log_buf_len < log_end)
420 break;
421 c = LOG_BUF(j);
422 raw_spin_unlock_irq(&logbuf_lock);
423 error = __put_user(c,&buf[count-1-i]);
424 cond_resched();
425 raw_spin_lock_irq(&logbuf_lock);
426 }
427 raw_spin_unlock_irq(&logbuf_lock);
428 if (error)
429 break;
430 error = i;
431 if (i != count) {
432 int offset = count-error;
433 /* buffer overflow during copy, correct user buffer. */
434 for (i = 0; i < error; i++) {
435 if (__get_user(c,&buf[i+offset]) ||
436 __put_user(c,&buf[i])) {
437 error = -EFAULT;
438 break;
439 }
440 cond_resched();
441 }
442 }
443 break; 1026 break;
444 /* Clear ring buffer */ 1027 /* Clear ring buffer */
445 case SYSLOG_ACTION_CLEAR: 1028 case SYSLOG_ACTION_CLEAR:
446 logged_chars = 0; 1029 syslog_print_all(NULL, 0, true);
447 break;
448 /* Disable logging to console */ 1030 /* Disable logging to console */
449 case SYSLOG_ACTION_CONSOLE_OFF: 1031 case SYSLOG_ACTION_CONSOLE_OFF:
450 if (saved_console_loglevel == -1) 1032 if (saved_console_loglevel == -1)
@@ -472,7 +1054,35 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
472 break; 1054 break;
473 /* Number of chars in the log buffer */ 1055 /* Number of chars in the log buffer */
474 case SYSLOG_ACTION_SIZE_UNREAD: 1056 case SYSLOG_ACTION_SIZE_UNREAD:
475 error = log_end - log_start; 1057 raw_spin_lock_irq(&logbuf_lock);
1058 if (syslog_seq < log_first_seq) {
1059 /* messages are gone, move to first one */
1060 syslog_seq = log_first_seq;
1061 syslog_idx = log_first_idx;
1062 }
1063 if (from_file) {
1064 /*
1065 * Short-cut for poll() of /proc/kmsg, which simply checks
1066 * for pending data, not the size; return the count of
1067 * records, not the length.
1068 */
1069 error = log_next_idx - syslog_idx;
1070 } else {
1071 u64 seq;
1072 u32 idx;
1073
1074 error = 0;
1075 seq = syslog_seq;
1076 idx = syslog_idx;
1077 while (seq < log_next_seq) {
1078 struct log *msg = log_from_idx(idx);
1079
1080 error += msg_print_text(msg, true, NULL, 0);
1081 idx = log_next(idx);
1082 seq++;
1083 }
1084 }
1085 raw_spin_unlock_irq(&logbuf_lock);
476 break; 1086 break;
477 /* Size of the log buffer */ 1087 /* Size of the log buffer */
478 case SYSLOG_ACTION_SIZE_BUFFER: 1088 case SYSLOG_ACTION_SIZE_BUFFER:
@@ -501,29 +1111,11 @@ void kdb_syslog_data(char *syslog_data[4])
501{ 1111{
502 syslog_data[0] = log_buf; 1112 syslog_data[0] = log_buf;
503 syslog_data[1] = log_buf + log_buf_len; 1113 syslog_data[1] = log_buf + log_buf_len;
504 syslog_data[2] = log_buf + log_end - 1114 syslog_data[2] = log_buf + log_first_idx;
505 (logged_chars < log_buf_len ? logged_chars : log_buf_len); 1115 syslog_data[3] = log_buf + log_next_idx;
506 syslog_data[3] = log_buf + log_end;
507} 1116}
508#endif /* CONFIG_KGDB_KDB */ 1117#endif /* CONFIG_KGDB_KDB */
509 1118
510/*
511 * Call the console drivers on a range of log_buf
512 */
513static void __call_console_drivers(unsigned start, unsigned end)
514{
515 struct console *con;
516
517 for_each_console(con) {
518 if (exclusive_console && con != exclusive_console)
519 continue;
520 if ((con->flags & CON_ENABLED) && con->write &&
521 (cpu_online(smp_processor_id()) ||
522 (con->flags & CON_ANYTIME)))
523 con->write(con, &LOG_BUF(start), end - start);
524 }
525}
526
527static bool __read_mostly ignore_loglevel; 1119static bool __read_mostly ignore_loglevel;
528 1120
529static int __init ignore_loglevel_setup(char *str) 1121static int __init ignore_loglevel_setup(char *str)
@@ -540,142 +1132,33 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
540 "print all kernel messages to the console."); 1132 "print all kernel messages to the console.");
541 1133
542/* 1134/*
543 * Write out chars from start to end - 1 inclusive
544 */
545static void _call_console_drivers(unsigned start,
546 unsigned end, int msg_log_level)
547{
548 trace_console(&LOG_BUF(0), start, end, log_buf_len);
549
550 if ((msg_log_level < console_loglevel || ignore_loglevel) &&
551 console_drivers && start != end) {
552 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
553 /* wrapped write */
554 __call_console_drivers(start & LOG_BUF_MASK,
555 log_buf_len);
556 __call_console_drivers(0, end & LOG_BUF_MASK);
557 } else {
558 __call_console_drivers(start, end);
559 }
560 }
561}
562
563/*
564 * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the
565 * lower 3 bit are the log level, the rest are the log facility. In case
566 * userspace passes usual userspace syslog messages to /dev/kmsg or
567 * /dev/ttyprintk, the log prefix might contain the facility. Printk needs
568 * to extract the correct log level for in-kernel processing, and not mangle
569 * the original value.
570 *
571 * If a prefix is found, the length of the prefix is returned. If 'level' is
572 * passed, it will be filled in with the log level without a possible facility
573 * value. If 'special' is passed, the special printk prefix chars are accepted
574 * and returned. If no valid header is found, 0 is returned and the passed
575 * variables are not touched.
576 */
577static size_t log_prefix(const char *p, unsigned int *level, char *special)
578{
579 unsigned int lev = 0;
580 char sp = '\0';
581 size_t len;
582
583 if (p[0] != '<' || !p[1])
584 return 0;
585 if (p[2] == '>') {
586 /* usual single digit level number or special char */
587 switch (p[1]) {
588 case '0' ... '7':
589 lev = p[1] - '0';
590 break;
591 case 'c': /* KERN_CONT */
592 case 'd': /* KERN_DEFAULT */
593 sp = p[1];
594 break;
595 default:
596 return 0;
597 }
598 len = 3;
599 } else {
600 /* multi digit including the level and facility number */
601 char *endp = NULL;
602
603 lev = (simple_strtoul(&p[1], &endp, 10) & 7);
604 if (endp == NULL || endp[0] != '>')
605 return 0;
606 len = (endp + 1) - p;
607 }
608
609 /* do not accept special char if not asked for */
610 if (sp && !special)
611 return 0;
612
613 if (special) {
614 *special = sp;
615 /* return special char, do not touch level */
616 if (sp)
617 return len;
618 }
619
620 if (level)
621 *level = lev;
622 return len;
623}
624
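For reference, the prefix format parsed by log_prefix() above packs severity and facility into one decimal value. A minimal, purely illustrative split of such a value (the "<30>" example and the "daemon" facility name follow the usual syslog numbering and are not taken from this file):

        /* illustrative: "<30>" parsed from a userspace message */
        unsigned int prefix_val = 30;
        unsigned int level      = prefix_val & 7;   /* 30 & 7  == 6, i.e. KERN_INFO   */
        unsigned int facility   = prefix_val >> 3;  /* 30 >> 3 == 3, syslog "daemon"  */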
625/*
626 * Call the console drivers, asking them to write out 1135 * Call the console drivers, asking them to write out
627 * log_buf[start] to log_buf[end - 1]. 1136 * log_buf[start] to log_buf[end - 1].
628 * The console_lock must be held. 1137 * The console_lock must be held.
629 */ 1138 */
630static void call_console_drivers(unsigned start, unsigned end) 1139static void call_console_drivers(int level, const char *text, size_t len)
631{ 1140{
632 unsigned cur_index, start_print; 1141 struct console *con;
633 static int msg_level = -1;
634 1142
635 BUG_ON(((int)(start - end)) > 0); 1143 trace_console(text, 0, len, len);
636 1144
637 cur_index = start; 1145 if (level >= console_loglevel && !ignore_loglevel)
638 start_print = start; 1146 return;
639 while (cur_index != end) { 1147 if (!console_drivers)
640 if (msg_level < 0 && ((end - cur_index) > 2)) { 1148 return;
641 /* strip log prefix */
642 cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL);
643 start_print = cur_index;
644 }
645 while (cur_index != end) {
646 char c = LOG_BUF(cur_index);
647
648 cur_index++;
649 if (c == '\n') {
650 if (msg_level < 0) {
651 /*
652 * printk() has already given us loglevel tags in
653 * the buffer. This code is here in case the
654 * log buffer has wrapped right round and scribbled
655 * on those tags
656 */
657 msg_level = default_message_loglevel;
658 }
659 _call_console_drivers(start_print, cur_index, msg_level);
660 msg_level = -1;
661 start_print = cur_index;
662 break;
663 }
664 }
665 }
666 _call_console_drivers(start_print, end, msg_level);
667}
668 1149
669static void emit_log_char(char c) 1150 for_each_console(con) {
670{ 1151 if (exclusive_console && con != exclusive_console)
671 LOG_BUF(log_end) = c; 1152 continue;
672 log_end++; 1153 if (!(con->flags & CON_ENABLED))
673 if (log_end - log_start > log_buf_len) 1154 continue;
674 log_start = log_end - log_buf_len; 1155 if (!con->write)
675 if (log_end - con_start > log_buf_len) 1156 continue;
676 con_start = log_end - log_buf_len; 1157 if (!cpu_online(smp_processor_id()) &&
677 if (logged_chars < log_buf_len) 1158 !(con->flags & CON_ANYTIME))
678 logged_chars++; 1159 continue;
1160 con->write(con, text, len);
1161 }
679} 1162}
680 1163
681/* 1164/*
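The rewritten call_console_drivers() above now receives a fully formatted text/len pair and walks the registered consoles itself, skipping disabled ones and, on a CPU that is going offline, anything without CON_ANYTIME. As a rough sketch of the other side of that contract (names invented, flags as used in the loop above), a console backend driven by it could look like:

        static void my_con_write(struct console *con, const char *s, unsigned int n)
        {
                /* push n bytes of already-formatted log text to the device */
        }

        static struct console my_con = {
                .name  = "mycon",
                .write = my_con_write,
                .flags = CON_PRINTBUFFER | CON_ANYTIME,
                .index = -1,
        };

        /* register_console(&my_con); the core enables it once it is selected */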
@@ -700,16 +1183,6 @@ static void zap_locks(void)
700 sema_init(&console_sem, 1); 1183 sema_init(&console_sem, 1);
701} 1184}
702 1185
703#if defined(CONFIG_PRINTK_TIME)
704static bool printk_time = 1;
705#else
706static bool printk_time = 0;
707#endif
708module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
709
710static bool always_kmsg_dump;
711module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
712
713/* Check if we have any console registered that can be called early in boot. */ 1186/* Check if we have any console registered that can be called early in boot. */
714static int have_callable_console(void) 1187static int have_callable_console(void)
715{ 1188{
@@ -722,51 +1195,6 @@ static int have_callable_console(void)
722 return 0; 1195 return 0;
723} 1196}
724 1197
725/**
726 * printk - print a kernel message
727 * @fmt: format string
728 *
729 * This is printk(). It can be called from any context. We want it to work.
730 *
731 * We try to grab the console_lock. If we succeed, it's easy - we log the output and
732 * call the console drivers. If we fail to get the semaphore we place the output
733 * into the log buffer and return. The current holder of the console_sem will
734 * notice the new output in console_unlock(); and will send it to the
735 * consoles before releasing the lock.
736 *
737 * One effect of this deferred printing is that code which calls printk() and
738 * then changes console_loglevel may break. This is because console_loglevel
739 * is inspected when the actual printing occurs.
740 *
741 * See also:
742 * printf(3)
743 *
744 * See the vsnprintf() documentation for format string extensions over C99.
745 */
746
747asmlinkage int printk(const char *fmt, ...)
748{
749 va_list args;
750 int r;
751
752#ifdef CONFIG_KGDB_KDB
753 if (unlikely(kdb_trap_printk)) {
754 va_start(args, fmt);
755 r = vkdb_printf(fmt, args);
756 va_end(args);
757 return r;
758 }
759#endif
760 va_start(args, fmt);
761 r = vprintk(fmt, args);
762 va_end(args);
763
764 return r;
765}
766
767/* cpu currently holding logbuf_lock */
768static volatile unsigned int printk_cpu = UINT_MAX;
769
770/* 1198/*
771 * Can we actually use the console at this time on this cpu? 1199 * Can we actually use the console at this time on this cpu?
772 * 1200 *
@@ -810,17 +1238,12 @@ static int console_trylock_for_printk(unsigned int cpu)
810 retval = 0; 1238 retval = 0;
811 } 1239 }
812 } 1240 }
813 printk_cpu = UINT_MAX; 1241 logbuf_cpu = UINT_MAX;
814 if (wake) 1242 if (wake)
815 up(&console_sem); 1243 up(&console_sem);
816 raw_spin_unlock(&logbuf_lock); 1244 raw_spin_unlock(&logbuf_lock);
817 return retval; 1245 return retval;
818} 1246}
819static const char recursion_bug_msg [] =
820 KERN_CRIT "BUG: recent printk recursion!\n";
821static int recursion_bug;
822static int new_text_line = 1;
823static char printk_buf[1024];
824 1247
825int printk_delay_msec __read_mostly; 1248int printk_delay_msec __read_mostly;
826 1249
@@ -836,15 +1259,23 @@ static inline void printk_delay(void)
836 } 1259 }
837} 1260}
838 1261
839asmlinkage int vprintk(const char *fmt, va_list args) 1262asmlinkage int vprintk_emit(int facility, int level,
1263 const char *dict, size_t dictlen,
1264 const char *fmt, va_list args)
840{ 1265{
841 int printed_len = 0; 1266 static int recursion_bug;
842 int current_log_level = default_message_loglevel; 1267 static char cont_buf[LOG_LINE_MAX];
1268 static size_t cont_len;
1269 static int cont_level;
1270 static struct task_struct *cont_task;
1271 static char textbuf[LOG_LINE_MAX];
1272 char *text = textbuf;
1273 size_t text_len;
843 unsigned long flags; 1274 unsigned long flags;
844 int this_cpu; 1275 int this_cpu;
845 char *p; 1276 bool newline = false;
846 size_t plen; 1277 bool prefix = false;
847 char special; 1278 int printed_len = 0;
848 1279
849 boot_delay_msec(); 1280 boot_delay_msec();
850 printk_delay(); 1281 printk_delay();
@@ -856,7 +1287,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
856 /* 1287 /*
857 * Ouch, printk recursed into itself! 1288 * Ouch, printk recursed into itself!
858 */ 1289 */
859 if (unlikely(printk_cpu == this_cpu)) { 1290 if (unlikely(logbuf_cpu == this_cpu)) {
860 /* 1291 /*
861 * If a crash is occurring during printk() on this CPU, 1292 * If a crash is occurring during printk() on this CPU,
862 * then try to get the crash message out but make sure 1293 * then try to get the crash message out but make sure
@@ -873,97 +1304,110 @@ asmlinkage int vprintk(const char *fmt, va_list args)
873 1304
874 lockdep_off(); 1305 lockdep_off();
875 raw_spin_lock(&logbuf_lock); 1306 raw_spin_lock(&logbuf_lock);
876 printk_cpu = this_cpu; 1307 logbuf_cpu = this_cpu;
877 1308
878 if (recursion_bug) { 1309 if (recursion_bug) {
1310 static const char recursion_msg[] =
1311 "BUG: recent printk recursion!";
1312
879 recursion_bug = 0; 1313 recursion_bug = 0;
880 strcpy(printk_buf, recursion_bug_msg); 1314 printed_len += strlen(recursion_msg);
881 printed_len = strlen(recursion_bug_msg); 1315 /* emit KERN_CRIT message */
1316 log_store(0, 2, NULL, 0, recursion_msg, printed_len);
882 } 1317 }
883 /* Emit the output into the temporary buffer */
884 printed_len += vscnprintf(printk_buf + printed_len,
885 sizeof(printk_buf) - printed_len, fmt, args);
886 1318
887 p = printk_buf; 1319 /*
1320 * The printf needs to come first; we need the syslog
1321 * prefix which might be passed-in as a parameter.
1322 */
1323 text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
888 1324
889 /* Read log level and handle special printk prefix */ 1325 /* mark and strip a trailing newline */
890 plen = log_prefix(p, &current_log_level, &special); 1326 if (text_len && text[text_len-1] == '\n') {
891 if (plen) { 1327 text_len--;
892 p += plen; 1328 newline = true;
1329 }
893 1330
894 switch (special) { 1331 /* strip syslog prefix and extract log level or control flags */
895 case 'c': /* Strip <c> KERN_CONT, continue line */ 1332 if (text[0] == '<' && text[1] && text[2] == '>') {
896 plen = 0; 1333 switch (text[1]) {
897 break; 1334 case '0' ... '7':
898 case 'd': /* Strip <d> KERN_DEFAULT, start new line */ 1335 if (level == -1)
899 plen = 0; 1336 level = text[1] - '0';
900 default: 1337 case 'd': /* KERN_DEFAULT */
901 if (!new_text_line) { 1338 prefix = true;
902 emit_log_char('\n'); 1339 case 'c': /* KERN_CONT */
903 new_text_line = 1; 1340 text += 3;
904 } 1341 text_len -= 3;
905 } 1342 }
906 } 1343 }
907 1344
908 /* 1345 if (level == -1)
909 * Copy the output into log_buf. If the caller didn't provide 1346 level = default_message_loglevel;
910 * the appropriate log prefix, we insert them here
911 */
912 for (; *p; p++) {
913 if (new_text_line) {
914 new_text_line = 0;
915
916 if (plen) {
917 /* Copy original log prefix */
918 int i;
919
920 for (i = 0; i < plen; i++)
921 emit_log_char(printk_buf[i]);
922 printed_len += plen;
923 } else {
924 /* Add log prefix */
925 emit_log_char('<');
926 emit_log_char(current_log_level + '0');
927 emit_log_char('>');
928 printed_len += 3;
929 }
930 1347
931 if (printk_time) { 1348 if (dict) {
932 /* Add the current time stamp */ 1349 prefix = true;
933 char tbuf[50], *tp; 1350 newline = true;
934 unsigned tlen; 1351 }
935 unsigned long long t;
936 unsigned long nanosec_rem;
937
938 t = cpu_clock(printk_cpu);
939 nanosec_rem = do_div(t, 1000000000);
940 tlen = sprintf(tbuf, "[%5lu.%06lu] ",
941 (unsigned long) t,
942 nanosec_rem / 1000);
943
944 for (tp = tbuf; tp < tbuf + tlen; tp++)
945 emit_log_char(*tp);
946 printed_len += tlen;
947 }
948 1352
949 if (!*p) 1353 if (!newline) {
950 break; 1354 if (cont_len && (prefix || cont_task != current)) {
1355 /*
 1356 * Flush the earlier buffer: it is either from a
 1357 * different thread, or we got a new prefix.
1358 */
1359 log_store(facility, cont_level, NULL, 0, cont_buf, cont_len);
1360 cont_len = 0;
951 } 1361 }
952 1362
953 emit_log_char(*p); 1363 if (!cont_len) {
954 if (*p == '\n') 1364 cont_level = level;
955 new_text_line = 1; 1365 cont_task = current;
1366 }
1367
1368 /* buffer or append to earlier buffer from the same thread */
1369 if (cont_len + text_len > sizeof(cont_buf))
1370 text_len = sizeof(cont_buf) - cont_len;
1371 memcpy(cont_buf + cont_len, text, text_len);
1372 cont_len += text_len;
1373 } else {
1374 if (cont_len && cont_task == current) {
1375 if (prefix) {
1376 /*
1377 * New prefix from the same thread; flush. We
1378 * either got no earlier newline, or we race
1379 * with an interrupt.
1380 */
1381 log_store(facility, cont_level,
1382 NULL, 0, cont_buf, cont_len);
1383 cont_len = 0;
1384 }
1385
1386 /* append to the earlier buffer and flush */
1387 if (cont_len + text_len > sizeof(cont_buf))
1388 text_len = sizeof(cont_buf) - cont_len;
1389 memcpy(cont_buf + cont_len, text, text_len);
1390 cont_len += text_len;
1391 log_store(facility, cont_level,
1392 NULL, 0, cont_buf, cont_len);
1393 cont_len = 0;
1394 cont_task = NULL;
1395 printed_len = cont_len;
1396 } else {
1397 /* ordinary single and terminated line */
1398 log_store(facility, level,
1399 dict, dictlen, text, text_len);
1400 printed_len = text_len;
1401 }
956 } 1402 }
957 1403
958 /* 1404 /*
959 * Try to acquire and then immediately release the 1405 * Try to acquire and then immediately release the console semaphore.
960 * console semaphore. The release will do all the 1406 * The release will print out buffers and wake up /dev/kmsg and syslog()
961 * actual magic (print out buffers, wake up klogd, 1407 * users.
962 * etc).
963 * 1408 *
964 * The console_trylock_for_printk() function 1409 * The console_trylock_for_printk() function will release 'logbuf_lock'
965 * will release 'logbuf_lock' regardless of whether it 1410 * regardless of whether it actually gets the console semaphore or not.
966 * actually gets the semaphore or not.
967 */ 1411 */
968 if (console_trylock_for_printk(this_cpu)) 1412 if (console_trylock_for_printk(this_cpu))
969 console_unlock(); 1413 console_unlock();
@@ -974,16 +1418,81 @@ out_restore_irqs:
974 1418
975 return printed_len; 1419 return printed_len;
976} 1420}
977EXPORT_SYMBOL(printk); 1421EXPORT_SYMBOL(vprintk_emit);
978EXPORT_SYMBOL(vprintk);
979 1422
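The continuation handling in vprintk_emit() above buffers unterminated fragments per task (cont_buf) and stores them as one record once a newline arrives or another prefix forces a flush. In practice, a pair of calls like the following (message text made up) therefore ends up as a single log record:

        printk(KERN_INFO "checking widget %d:", 3); /* no '\n' yet: buffered in cont_buf */
        printk(KERN_CONT " ok\n");                  /* same task: appended, then stored  */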
980#else 1423asmlinkage int vprintk(const char *fmt, va_list args)
1424{
1425 return vprintk_emit(0, -1, NULL, 0, fmt, args);
1426}
1427EXPORT_SYMBOL(vprintk);
981 1428
982static void call_console_drivers(unsigned start, unsigned end) 1429asmlinkage int printk_emit(int facility, int level,
1430 const char *dict, size_t dictlen,
1431 const char *fmt, ...)
983{ 1432{
1433 va_list args;
1434 int r;
1435
1436 va_start(args, fmt);
1437 r = vprintk_emit(facility, level, dict, dictlen, fmt, args);
1438 va_end(args);
1439
1440 return r;
984} 1441}
1442EXPORT_SYMBOL(printk_emit);
985 1443
1444/**
1445 * printk - print a kernel message
1446 * @fmt: format string
1447 *
1448 * This is printk(). It can be called from any context. We want it to work.
1449 *
1450 * We try to grab the console_lock. If we succeed, it's easy - we log the
1451 * output and call the console drivers. If we fail to get the semaphore, we
1452 * place the output into the log buffer and return. The current holder of
1453 * the console_sem will notice the new output in console_unlock(); and will
1454 * send it to the consoles before releasing the lock.
1455 *
1456 * One effect of this deferred printing is that code which calls printk() and
1457 * then changes console_loglevel may break. This is because console_loglevel
1458 * is inspected when the actual printing occurs.
1459 *
1460 * See also:
1461 * printf(3)
1462 *
1463 * See the vsnprintf() documentation for format string extensions over C99.
1464 */
1465asmlinkage int printk(const char *fmt, ...)
1466{
1467 va_list args;
1468 int r;
1469
1470#ifdef CONFIG_KGDB_KDB
1471 if (unlikely(kdb_trap_printk)) {
1472 va_start(args, fmt);
1473 r = vkdb_printf(fmt, args);
1474 va_end(args);
1475 return r;
1476 }
986#endif 1477#endif
1478 va_start(args, fmt);
1479 r = vprintk_emit(0, -1, NULL, 0, fmt, args);
1480 va_end(args);
1481
1482 return r;
1483}
1484EXPORT_SYMBOL(printk);
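printk_emit() differs from plain printk() only in taking the facility, the level and an optional dictionary explicitly rather than encoding the level in the format string. As a hedged illustration (message text made up), these two calls should produce equivalent records:

        printk(KERN_WARNING "disk %d is nearly full\n", 3);
        printk_emit(0, 4, NULL, 0, "disk %d is nearly full\n", 3); /* facility 0, level 4 == KERN_WARNING */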
1485
1486#else
1487
1488#define LOG_LINE_MAX 0
1489static struct log *log_from_idx(u32 idx) { return NULL; }
1490static u32 log_next(u32 idx) { return 0; }
1491static void call_console_drivers(int level, const char *text, size_t len) {}
1492static size_t msg_print_text(const struct log *msg, bool syslog,
1493 char *buf, size_t size) { return 0; }
1494
1495#endif /* CONFIG_PRINTK */
987 1496
988static int __add_preferred_console(char *name, int idx, char *options, 1497static int __add_preferred_console(char *name, int idx, char *options,
989 char *brl_options) 1498 char *brl_options)
@@ -1217,7 +1726,7 @@ int is_console_locked(void)
1217} 1726}
1218 1727
1219/* 1728/*
1220 * Delayed printk facility, for scheduler-internal messages: 1729 * Delayed printk version, for scheduler-internal messages:
1221 */ 1730 */
1222#define PRINTK_BUF_SIZE 512 1731#define PRINTK_BUF_SIZE 512
1223 1732
@@ -1253,6 +1762,10 @@ void wake_up_klogd(void)
1253 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); 1762 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
1254} 1763}
1255 1764
1765/* the next printk record to write to the console */
1766static u64 console_seq;
1767static u32 console_idx;
1768
1256/** 1769/**
1257 * console_unlock - unlock the console system 1770 * console_unlock - unlock the console system
1258 * 1771 *
@@ -1263,15 +1776,16 @@ void wake_up_klogd(void)
1263 * by printk(). If this is the case, console_unlock(); emits 1776 * by printk(). If this is the case, console_unlock(); emits
1264 * the output prior to releasing the lock. 1777 * the output prior to releasing the lock.
1265 * 1778 *
1266 * If there is output waiting for klogd, we wake it up. 1779 * If there is output waiting, we wake /dev/kmsg and syslog() users.
1267 * 1780 *
1268 * console_unlock(); may be called from any context. 1781 * console_unlock(); may be called from any context.
1269 */ 1782 */
1270void console_unlock(void) 1783void console_unlock(void)
1271{ 1784{
1785 static u64 seen_seq;
1272 unsigned long flags; 1786 unsigned long flags;
1273 unsigned _con_start, _log_end; 1787 bool wake_klogd = false;
1274 unsigned wake_klogd = 0, retry = 0; 1788 bool retry;
1275 1789
1276 if (console_suspended) { 1790 if (console_suspended) {
1277 up(&console_sem); 1791 up(&console_sem);
@@ -1281,17 +1795,38 @@ void console_unlock(void)
1281 console_may_schedule = 0; 1795 console_may_schedule = 0;
1282 1796
1283again: 1797again:
1284 for ( ; ; ) { 1798 for (;;) {
1799 struct log *msg;
1800 static char text[LOG_LINE_MAX];
1801 size_t len;
1802 int level;
1803
1285 raw_spin_lock_irqsave(&logbuf_lock, flags); 1804 raw_spin_lock_irqsave(&logbuf_lock, flags);
1286 wake_klogd |= log_start - log_end; 1805 if (seen_seq != log_next_seq) {
1287 if (con_start == log_end) 1806 wake_klogd = true;
1288 break; /* Nothing to print */ 1807 seen_seq = log_next_seq;
1289 _con_start = con_start; 1808 }
1290 _log_end = log_end; 1809
1291 con_start = log_end; /* Flush */ 1810 if (console_seq < log_first_seq) {
1811 /* messages are gone, move to first one */
1812 console_seq = log_first_seq;
1813 console_idx = log_first_idx;
1814 }
1815
1816 if (console_seq == log_next_seq)
1817 break;
1818
1819 msg = log_from_idx(console_idx);
1820 level = msg->level & 7;
1821
1822 len = msg_print_text(msg, false, text, sizeof(text));
1823
1824 console_idx = log_next(console_idx);
1825 console_seq++;
1292 raw_spin_unlock(&logbuf_lock); 1826 raw_spin_unlock(&logbuf_lock);
1827
1293 stop_critical_timings(); /* don't trace print latency */ 1828 stop_critical_timings(); /* don't trace print latency */
1294 call_console_drivers(_con_start, _log_end); 1829 call_console_drivers(level, text, len);
1295 start_critical_timings(); 1830 start_critical_timings();
1296 local_irq_restore(flags); 1831 local_irq_restore(flags);
1297 } 1832 }
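Stripped of the locking, tracing and irq juggling, the new inner loop of console_unlock() above reduces to the following shape (a paraphrased sketch of the lines just shown, not a verbatim extract):

        if (console_seq < log_first_seq) {        /* fell behind: old records are gone */
                console_seq = log_first_seq;      /* jump to the oldest surviving one  */
                console_idx = log_first_idx;
        }
        while (console_seq != log_next_seq) {
                struct log *msg = log_from_idx(console_idx);
                size_t len = msg_print_text(msg, false, text, sizeof(text));

                call_console_drivers(msg->level & 7, text, len);
                console_idx = log_next(console_idx);
                console_seq++;
        }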
@@ -1312,8 +1847,7 @@ again:
1312 * flush, no worries. 1847 * flush, no worries.
1313 */ 1848 */
1314 raw_spin_lock(&logbuf_lock); 1849 raw_spin_lock(&logbuf_lock);
1315 if (con_start != log_end) 1850 retry = console_seq != log_next_seq;
1316 retry = 1;
1317 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 1851 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1318 1852
1319 if (retry && console_trylock()) 1853 if (retry && console_trylock())
@@ -1549,7 +2083,8 @@ void register_console(struct console *newcon)
1549 * for us. 2083 * for us.
1550 */ 2084 */
1551 raw_spin_lock_irqsave(&logbuf_lock, flags); 2085 raw_spin_lock_irqsave(&logbuf_lock, flags);
1552 con_start = log_start; 2086 console_seq = syslog_seq;
2087 console_idx = syslog_idx;
1553 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2088 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1554 /* 2089 /*
1555 * We're about to replay the log buffer. Only do this to the 2090 * We're about to replay the log buffer. Only do this to the
@@ -1758,6 +2293,9 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1758} 2293}
1759EXPORT_SYMBOL_GPL(kmsg_dump_unregister); 2294EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1760 2295
2296static bool always_kmsg_dump;
2297module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
2298
1761/** 2299/**
1762 * kmsg_dump - dump kernel log to kernel message dumpers. 2300 * kmsg_dump - dump kernel log to kernel message dumpers.
1763 * @reason: the reason (oops, panic etc) for dumping 2301 * @reason: the reason (oops, panic etc) for dumping
@@ -1767,8 +2305,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1767 */ 2305 */
1768void kmsg_dump(enum kmsg_dump_reason reason) 2306void kmsg_dump(enum kmsg_dump_reason reason)
1769{ 2307{
1770 unsigned long end; 2308 u64 idx;
1771 unsigned chars;
1772 struct kmsg_dumper *dumper; 2309 struct kmsg_dumper *dumper;
1773 const char *s1, *s2; 2310 const char *s1, *s2;
1774 unsigned long l1, l2; 2311 unsigned long l1, l2;
@@ -1780,24 +2317,27 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1780 /* Theoretically, the log could move on after we do this, but 2317 /* Theoretically, the log could move on after we do this, but
1781 there's not a lot we can do about that. The new messages 2318 there's not a lot we can do about that. The new messages
1782 will overwrite the start of what we dump. */ 2319 will overwrite the start of what we dump. */
2320
1783 raw_spin_lock_irqsave(&logbuf_lock, flags); 2321 raw_spin_lock_irqsave(&logbuf_lock, flags);
1784 end = log_end & LOG_BUF_MASK; 2322 if (syslog_seq < log_first_seq)
1785 chars = logged_chars; 2323 idx = syslog_idx;
1786 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2324 else
2325 idx = log_first_idx;
1787 2326
1788 if (chars > end) { 2327 if (idx > log_next_idx) {
1789 s1 = log_buf + log_buf_len - chars + end; 2328 s1 = log_buf;
1790 l1 = chars - end; 2329 l1 = log_next_idx;
1791 2330
1792 s2 = log_buf; 2331 s2 = log_buf + idx;
1793 l2 = end; 2332 l2 = log_buf_len - idx;
1794 } else { 2333 } else {
1795 s1 = ""; 2334 s1 = "";
1796 l1 = 0; 2335 l1 = 0;
1797 2336
1798 s2 = log_buf + end - chars; 2337 s2 = log_buf + idx;
1799 l2 = chars; 2338 l2 = log_next_idx - idx;
1800 } 2339 }
2340 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1801 2341
1802 rcu_read_lock(); 2342 rcu_read_lock();
1803 list_for_each_entry_rcu(dumper, &dump_list, list) 2343 list_for_each_entry_rcu(dumper, &dump_list, list)
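kmsg_dump() above hands each registered dumper the buffer as at most two contiguous segments: s1/l1 is only non-empty when the valid region wraps past the end of log_buf, and s2/l2 covers the rest. A rough sketch of a consumer, assuming the two-segment dump() callback that this call site implies (the dumper struct layout itself is not shown in this hunk), might be:

        static void my_dump(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason,
                            const char *s1, unsigned long l1,
                            const char *s2, unsigned long l2)
        {
                /* persist s1[0..l1) followed by s2[0..l2), e.g. to a panic partition */
        }

        static struct kmsg_dumper my_dumper = {
                .dump = my_dump,
        };

        /* kmsg_dump_register(&my_dumper); paired with kmsg_dump_unregister() above */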
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a86f1741cc27..95cba41ce1e9 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -51,6 +51,34 @@
51 51
52#include "rcu.h" 52#include "rcu.h"
53 53
54#ifdef CONFIG_PREEMPT_RCU
55
56/*
57 * Check for a task exiting while in a preemptible-RCU read-side
58 * critical section, clean up if so. No need to issue warnings,
59 * as debug_check_no_locks_held() already does this if lockdep
60 * is enabled.
61 */
62void exit_rcu(void)
63{
64 struct task_struct *t = current;
65
66 if (likely(list_empty(&current->rcu_node_entry)))
67 return;
68 t->rcu_read_lock_nesting = 1;
69 barrier();
70 t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
71 __rcu_read_unlock();
72}
73
74#else /* #ifdef CONFIG_PREEMPT_RCU */
75
76void exit_rcu(void)
77{
78}
79
80#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
81
54#ifdef CONFIG_DEBUG_LOCK_ALLOC 82#ifdef CONFIG_DEBUG_LOCK_ALLOC
55static struct lock_class_key rcu_lock_key; 83static struct lock_class_key rcu_lock_key;
56struct lockdep_map rcu_lock_map = 84struct lockdep_map rcu_lock_map =
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 22ecea0dfb62..fc31a2d65100 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -851,22 +851,6 @@ int rcu_preempt_needs_cpu(void)
851 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; 851 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
852} 852}
853 853
854/*
855 * Check for a task exiting while in a preemptible -RCU read-side
856 * critical section, clean up if so. No need to issue warnings,
857 * as debug_check_no_locks_held() already does this if lockdep
858 * is enabled.
859 */
860void exit_rcu(void)
861{
862 struct task_struct *t = current;
863
864 if (t->rcu_read_lock_nesting == 0)
865 return;
866 t->rcu_read_lock_nesting = 1;
867 __rcu_read_unlock();
868}
869
870#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 854#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
871 855
872#ifdef CONFIG_RCU_TRACE 856#ifdef CONFIG_RCU_TRACE
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index a89b381a8c6e..e66b34ab7555 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -64,6 +64,7 @@ static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ 64static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff; /* Hold time within burst (us). */ 65static int fqs_holdoff; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 66static int fqs_stutter = 3; /* Wait time between bursts (s). */
67static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */
67static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ 68static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
68static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ 69static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */
69static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ 70static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
@@ -96,6 +97,8 @@ module_param(fqs_holdoff, int, 0444);
96MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 97MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
97module_param(fqs_stutter, int, 0444); 98module_param(fqs_stutter, int, 0444);
98MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 99MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
100module_param(n_barrier_cbs, int, 0444);
101MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing");
99module_param(onoff_interval, int, 0444); 102module_param(onoff_interval, int, 0444);
100MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); 103MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
101module_param(onoff_holdoff, int, 0444); 104module_param(onoff_holdoff, int, 0444);
@@ -139,6 +142,8 @@ static struct task_struct *shutdown_task;
139static struct task_struct *onoff_task; 142static struct task_struct *onoff_task;
140#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 143#endif /* #ifdef CONFIG_HOTPLUG_CPU */
141static struct task_struct *stall_task; 144static struct task_struct *stall_task;
145static struct task_struct **barrier_cbs_tasks;
146static struct task_struct *barrier_task;
142 147
143#define RCU_TORTURE_PIPE_LEN 10 148#define RCU_TORTURE_PIPE_LEN 10
144 149
@@ -164,6 +169,7 @@ static atomic_t n_rcu_torture_alloc_fail;
164static atomic_t n_rcu_torture_free; 169static atomic_t n_rcu_torture_free;
165static atomic_t n_rcu_torture_mberror; 170static atomic_t n_rcu_torture_mberror;
166static atomic_t n_rcu_torture_error; 171static atomic_t n_rcu_torture_error;
172static long n_rcu_torture_barrier_error;
167static long n_rcu_torture_boost_ktrerror; 173static long n_rcu_torture_boost_ktrerror;
168static long n_rcu_torture_boost_rterror; 174static long n_rcu_torture_boost_rterror;
169static long n_rcu_torture_boost_failure; 175static long n_rcu_torture_boost_failure;
@@ -173,6 +179,8 @@ static long n_offline_attempts;
173static long n_offline_successes; 179static long n_offline_successes;
174static long n_online_attempts; 180static long n_online_attempts;
175static long n_online_successes; 181static long n_online_successes;
182static long n_barrier_attempts;
183static long n_barrier_successes;
176static struct list_head rcu_torture_removed; 184static struct list_head rcu_torture_removed;
177static cpumask_var_t shuffle_tmp_mask; 185static cpumask_var_t shuffle_tmp_mask;
178 186
@@ -197,6 +205,10 @@ static unsigned long shutdown_time; /* jiffies to system shutdown. */
197static unsigned long boost_starttime; /* jiffies of next boost test start. */ 205static unsigned long boost_starttime; /* jiffies of next boost test start. */
198DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 206DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
199 /* and boost task create/destroy. */ 207 /* and boost task create/destroy. */
208static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */
209static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
210static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
211static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
200 212
201/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ 213/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
202 214
@@ -327,6 +339,7 @@ struct rcu_torture_ops {
327 int (*completed)(void); 339 int (*completed)(void);
328 void (*deferred_free)(struct rcu_torture *p); 340 void (*deferred_free)(struct rcu_torture *p);
329 void (*sync)(void); 341 void (*sync)(void);
342 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
330 void (*cb_barrier)(void); 343 void (*cb_barrier)(void);
331 void (*fqs)(void); 344 void (*fqs)(void);
332 int (*stats)(char *page); 345 int (*stats)(char *page);
@@ -417,6 +430,7 @@ static struct rcu_torture_ops rcu_ops = {
417 .completed = rcu_torture_completed, 430 .completed = rcu_torture_completed,
418 .deferred_free = rcu_torture_deferred_free, 431 .deferred_free = rcu_torture_deferred_free,
419 .sync = synchronize_rcu, 432 .sync = synchronize_rcu,
433 .call = call_rcu,
420 .cb_barrier = rcu_barrier, 434 .cb_barrier = rcu_barrier,
421 .fqs = rcu_force_quiescent_state, 435 .fqs = rcu_force_quiescent_state,
422 .stats = NULL, 436 .stats = NULL,
@@ -460,6 +474,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
460 .completed = rcu_torture_completed, 474 .completed = rcu_torture_completed,
461 .deferred_free = rcu_sync_torture_deferred_free, 475 .deferred_free = rcu_sync_torture_deferred_free,
462 .sync = synchronize_rcu, 476 .sync = synchronize_rcu,
477 .call = NULL,
463 .cb_barrier = NULL, 478 .cb_barrier = NULL,
464 .fqs = rcu_force_quiescent_state, 479 .fqs = rcu_force_quiescent_state,
465 .stats = NULL, 480 .stats = NULL,
@@ -477,6 +492,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
477 .completed = rcu_no_completed, 492 .completed = rcu_no_completed,
478 .deferred_free = rcu_sync_torture_deferred_free, 493 .deferred_free = rcu_sync_torture_deferred_free,
479 .sync = synchronize_rcu_expedited, 494 .sync = synchronize_rcu_expedited,
495 .call = NULL,
480 .cb_barrier = NULL, 496 .cb_barrier = NULL,
481 .fqs = rcu_force_quiescent_state, 497 .fqs = rcu_force_quiescent_state,
482 .stats = NULL, 498 .stats = NULL,
@@ -519,6 +535,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
519 .completed = rcu_bh_torture_completed, 535 .completed = rcu_bh_torture_completed,
520 .deferred_free = rcu_bh_torture_deferred_free, 536 .deferred_free = rcu_bh_torture_deferred_free,
521 .sync = synchronize_rcu_bh, 537 .sync = synchronize_rcu_bh,
538 .call = call_rcu_bh,
522 .cb_barrier = rcu_barrier_bh, 539 .cb_barrier = rcu_barrier_bh,
523 .fqs = rcu_bh_force_quiescent_state, 540 .fqs = rcu_bh_force_quiescent_state,
524 .stats = NULL, 541 .stats = NULL,
@@ -535,6 +552,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
535 .completed = rcu_bh_torture_completed, 552 .completed = rcu_bh_torture_completed,
536 .deferred_free = rcu_sync_torture_deferred_free, 553 .deferred_free = rcu_sync_torture_deferred_free,
537 .sync = synchronize_rcu_bh, 554 .sync = synchronize_rcu_bh,
555 .call = NULL,
538 .cb_barrier = NULL, 556 .cb_barrier = NULL,
539 .fqs = rcu_bh_force_quiescent_state, 557 .fqs = rcu_bh_force_quiescent_state,
540 .stats = NULL, 558 .stats = NULL,
@@ -551,6 +569,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = {
551 .completed = rcu_bh_torture_completed, 569 .completed = rcu_bh_torture_completed,
552 .deferred_free = rcu_sync_torture_deferred_free, 570 .deferred_free = rcu_sync_torture_deferred_free,
553 .sync = synchronize_rcu_bh_expedited, 571 .sync = synchronize_rcu_bh_expedited,
572 .call = NULL,
554 .cb_barrier = NULL, 573 .cb_barrier = NULL,
555 .fqs = rcu_bh_force_quiescent_state, 574 .fqs = rcu_bh_force_quiescent_state,
556 .stats = NULL, 575 .stats = NULL,
@@ -606,6 +625,11 @@ static int srcu_torture_completed(void)
606 return srcu_batches_completed(&srcu_ctl); 625 return srcu_batches_completed(&srcu_ctl);
607} 626}
608 627
628static void srcu_torture_deferred_free(struct rcu_torture *rp)
629{
630 call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb);
631}
632
609static void srcu_torture_synchronize(void) 633static void srcu_torture_synchronize(void)
610{ 634{
611 synchronize_srcu(&srcu_ctl); 635 synchronize_srcu(&srcu_ctl);
@@ -620,7 +644,7 @@ static int srcu_torture_stats(char *page)
620 cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", 644 cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):",
621 torture_type, TORTURE_FLAG, idx); 645 torture_type, TORTURE_FLAG, idx);
622 for_each_possible_cpu(cpu) { 646 for_each_possible_cpu(cpu) {
623 cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, 647 cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu,
624 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], 648 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
625 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); 649 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
626 } 650 }
@@ -635,13 +659,29 @@ static struct rcu_torture_ops srcu_ops = {
635 .read_delay = srcu_read_delay, 659 .read_delay = srcu_read_delay,
636 .readunlock = srcu_torture_read_unlock, 660 .readunlock = srcu_torture_read_unlock,
637 .completed = srcu_torture_completed, 661 .completed = srcu_torture_completed,
638 .deferred_free = rcu_sync_torture_deferred_free, 662 .deferred_free = srcu_torture_deferred_free,
639 .sync = srcu_torture_synchronize, 663 .sync = srcu_torture_synchronize,
664 .call = NULL,
640 .cb_barrier = NULL, 665 .cb_barrier = NULL,
641 .stats = srcu_torture_stats, 666 .stats = srcu_torture_stats,
642 .name = "srcu" 667 .name = "srcu"
643}; 668};
644 669
670static struct rcu_torture_ops srcu_sync_ops = {
671 .init = srcu_torture_init,
672 .cleanup = srcu_torture_cleanup,
673 .readlock = srcu_torture_read_lock,
674 .read_delay = srcu_read_delay,
675 .readunlock = srcu_torture_read_unlock,
676 .completed = srcu_torture_completed,
677 .deferred_free = rcu_sync_torture_deferred_free,
678 .sync = srcu_torture_synchronize,
679 .call = NULL,
680 .cb_barrier = NULL,
681 .stats = srcu_torture_stats,
682 .name = "srcu_sync"
683};
684
645static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) 685static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
646{ 686{
647 return srcu_read_lock_raw(&srcu_ctl); 687 return srcu_read_lock_raw(&srcu_ctl);
@@ -659,13 +699,29 @@ static struct rcu_torture_ops srcu_raw_ops = {
659 .read_delay = srcu_read_delay, 699 .read_delay = srcu_read_delay,
660 .readunlock = srcu_torture_read_unlock_raw, 700 .readunlock = srcu_torture_read_unlock_raw,
661 .completed = srcu_torture_completed, 701 .completed = srcu_torture_completed,
662 .deferred_free = rcu_sync_torture_deferred_free, 702 .deferred_free = srcu_torture_deferred_free,
663 .sync = srcu_torture_synchronize, 703 .sync = srcu_torture_synchronize,
704 .call = NULL,
664 .cb_barrier = NULL, 705 .cb_barrier = NULL,
665 .stats = srcu_torture_stats, 706 .stats = srcu_torture_stats,
666 .name = "srcu_raw" 707 .name = "srcu_raw"
667}; 708};
668 709
710static struct rcu_torture_ops srcu_raw_sync_ops = {
711 .init = srcu_torture_init,
712 .cleanup = srcu_torture_cleanup,
713 .readlock = srcu_torture_read_lock_raw,
714 .read_delay = srcu_read_delay,
715 .readunlock = srcu_torture_read_unlock_raw,
716 .completed = srcu_torture_completed,
717 .deferred_free = rcu_sync_torture_deferred_free,
718 .sync = srcu_torture_synchronize,
719 .call = NULL,
720 .cb_barrier = NULL,
721 .stats = srcu_torture_stats,
722 .name = "srcu_raw_sync"
723};
724
669static void srcu_torture_synchronize_expedited(void) 725static void srcu_torture_synchronize_expedited(void)
670{ 726{
671 synchronize_srcu_expedited(&srcu_ctl); 727 synchronize_srcu_expedited(&srcu_ctl);
@@ -680,6 +736,7 @@ static struct rcu_torture_ops srcu_expedited_ops = {
680 .completed = srcu_torture_completed, 736 .completed = srcu_torture_completed,
681 .deferred_free = rcu_sync_torture_deferred_free, 737 .deferred_free = rcu_sync_torture_deferred_free,
682 .sync = srcu_torture_synchronize_expedited, 738 .sync = srcu_torture_synchronize_expedited,
739 .call = NULL,
683 .cb_barrier = NULL, 740 .cb_barrier = NULL,
684 .stats = srcu_torture_stats, 741 .stats = srcu_torture_stats,
685 .name = "srcu_expedited" 742 .name = "srcu_expedited"
@@ -1129,7 +1186,8 @@ rcu_torture_printk(char *page)
1129 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " 1186 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1130 "rtmbe: %d rtbke: %ld rtbre: %ld " 1187 "rtmbe: %d rtbke: %ld rtbre: %ld "
1131 "rtbf: %ld rtb: %ld nt: %ld " 1188 "rtbf: %ld rtb: %ld nt: %ld "
1132 "onoff: %ld/%ld:%ld/%ld", 1189 "onoff: %ld/%ld:%ld/%ld "
1190 "barrier: %ld/%ld:%ld",
1133 rcu_torture_current, 1191 rcu_torture_current,
1134 rcu_torture_current_version, 1192 rcu_torture_current_version,
1135 list_empty(&rcu_torture_freelist), 1193 list_empty(&rcu_torture_freelist),
@@ -1145,14 +1203,17 @@ rcu_torture_printk(char *page)
1145 n_online_successes, 1203 n_online_successes,
1146 n_online_attempts, 1204 n_online_attempts,
1147 n_offline_successes, 1205 n_offline_successes,
1148 n_offline_attempts); 1206 n_offline_attempts,
1207 n_barrier_successes,
1208 n_barrier_attempts,
1209 n_rcu_torture_barrier_error);
1210 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
1149 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1211 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1212 n_rcu_torture_barrier_error != 0 ||
1150 n_rcu_torture_boost_ktrerror != 0 || 1213 n_rcu_torture_boost_ktrerror != 0 ||
1151 n_rcu_torture_boost_rterror != 0 || 1214 n_rcu_torture_boost_rterror != 0 ||
1152 n_rcu_torture_boost_failure != 0) 1215 n_rcu_torture_boost_failure != 0 ||
1153 cnt += sprintf(&page[cnt], " !!!"); 1216 i > 1) {
1154 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
1155 if (i > 1) {
1156 cnt += sprintf(&page[cnt], "!!! "); 1217 cnt += sprintf(&page[cnt], "!!! ");
1157 atomic_inc(&n_rcu_torture_error); 1218 atomic_inc(&n_rcu_torture_error);
1158 WARN_ON_ONCE(1); 1219 WARN_ON_ONCE(1);
@@ -1337,6 +1398,7 @@ static void rcutorture_booster_cleanup(int cpu)
1337 1398
1338 /* This must be outside of the mutex, otherwise deadlock! */ 1399 /* This must be outside of the mutex, otherwise deadlock! */
1339 kthread_stop(t); 1400 kthread_stop(t);
1401 boost_tasks[cpu] = NULL;
1340} 1402}
1341 1403
1342static int rcutorture_booster_init(int cpu) 1404static int rcutorture_booster_init(int cpu)
@@ -1484,13 +1546,15 @@ static void rcu_torture_onoff_cleanup(void)
1484 return; 1546 return;
1485 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); 1547 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
1486 kthread_stop(onoff_task); 1548 kthread_stop(onoff_task);
1549 onoff_task = NULL;
1487} 1550}
1488 1551
1489#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1552#else /* #ifdef CONFIG_HOTPLUG_CPU */
1490 1553
1491static void 1554static int
1492rcu_torture_onoff_init(void) 1555rcu_torture_onoff_init(void)
1493{ 1556{
1557 return 0;
1494} 1558}
1495 1559
1496static void rcu_torture_onoff_cleanup(void) 1560static void rcu_torture_onoff_cleanup(void)
@@ -1554,6 +1618,152 @@ static void rcu_torture_stall_cleanup(void)
1554 return; 1618 return;
1555 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); 1619 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task.");
1556 kthread_stop(stall_task); 1620 kthread_stop(stall_task);
1621 stall_task = NULL;
1622}
1623
1624/* Callback function for RCU barrier testing. */
1625void rcu_torture_barrier_cbf(struct rcu_head *rcu)
1626{
1627 atomic_inc(&barrier_cbs_invoked);
1628}
1629
1630/* kthread function to register callbacks used to test RCU barriers. */
1631static int rcu_torture_barrier_cbs(void *arg)
1632{
1633 long myid = (long)arg;
1634 struct rcu_head rcu;
1635
1636 init_rcu_head_on_stack(&rcu);
1637 VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started");
1638 set_user_nice(current, 19);
1639 do {
1640 wait_event(barrier_cbs_wq[myid],
1641 atomic_read(&barrier_cbs_count) == n_barrier_cbs ||
1642 kthread_should_stop() ||
1643 fullstop != FULLSTOP_DONTSTOP);
1644 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1645 break;
1646 cur_ops->call(&rcu, rcu_torture_barrier_cbf);
1647 if (atomic_dec_and_test(&barrier_cbs_count))
1648 wake_up(&barrier_wq);
1649 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1650 VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping");
1651 rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
1652 while (!kthread_should_stop())
1653 schedule_timeout_interruptible(1);
1654 cur_ops->cb_barrier();
1655 destroy_rcu_head_on_stack(&rcu);
1656 return 0;
1657}
1658
1659/* kthread function to drive and coordinate RCU barrier testing. */
1660static int rcu_torture_barrier(void *arg)
1661{
1662 int i;
1663
1664 VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting");
1665 do {
1666 atomic_set(&barrier_cbs_invoked, 0);
1667 atomic_set(&barrier_cbs_count, n_barrier_cbs);
1668 /* wake_up() path contains the required barriers. */
1669 for (i = 0; i < n_barrier_cbs; i++)
1670 wake_up(&barrier_cbs_wq[i]);
1671 wait_event(barrier_wq,
1672 atomic_read(&barrier_cbs_count) == 0 ||
1673 kthread_should_stop() ||
1674 fullstop != FULLSTOP_DONTSTOP);
1675 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1676 break;
1677 n_barrier_attempts++;
1678 cur_ops->cb_barrier();
1679 if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
1680 n_rcu_torture_barrier_error++;
1681 WARN_ON_ONCE(1);
1682 }
1683 n_barrier_successes++;
1684 schedule_timeout_interruptible(HZ / 10);
1685 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1686 VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping");
1687 rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
1688 while (!kthread_should_stop())
1689 schedule_timeout_interruptible(1);
1690 return 0;
1691}
1692
1693/* Initialize RCU barrier testing. */
1694static int rcu_torture_barrier_init(void)
1695{
1696 int i;
1697 int ret;
1698
1699 if (n_barrier_cbs == 0)
1700 return 0;
1701 if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
1702 printk(KERN_ALERT "%s" TORTURE_FLAG
1703 " Call or barrier ops missing for %s,\n",
1704 torture_type, cur_ops->name);
1705 printk(KERN_ALERT "%s" TORTURE_FLAG
1706 " RCU barrier testing omitted from run.\n",
1707 torture_type);
1708 return 0;
1709 }
1710 atomic_set(&barrier_cbs_count, 0);
1711 atomic_set(&barrier_cbs_invoked, 0);
1712 barrier_cbs_tasks =
1713 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]),
1714 GFP_KERNEL);
1715 barrier_cbs_wq =
1716 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
1717 GFP_KERNEL);
1718 if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0)
1719 return -ENOMEM;
1720 for (i = 0; i < n_barrier_cbs; i++) {
1721 init_waitqueue_head(&barrier_cbs_wq[i]);
1722 barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs,
1723 (void *)(long)i,
1724 "rcu_torture_barrier_cbs");
1725 if (IS_ERR(barrier_cbs_tasks[i])) {
1726 ret = PTR_ERR(barrier_cbs_tasks[i]);
1727 VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs");
1728 barrier_cbs_tasks[i] = NULL;
1729 return ret;
1730 }
1731 }
1732 barrier_task = kthread_run(rcu_torture_barrier, NULL,
1733 "rcu_torture_barrier");
1734 if (IS_ERR(barrier_task)) {
1735 ret = PTR_ERR(barrier_task);
1736 VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier");
1737 barrier_task = NULL;
1738 }
1739 return 0;
1740}
1741
1742/* Clean up after RCU barrier testing. */
1743static void rcu_torture_barrier_cleanup(void)
1744{
1745 int i;
1746
1747 if (barrier_task != NULL) {
1748 VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task");
1749 kthread_stop(barrier_task);
1750 barrier_task = NULL;
1751 }
1752 if (barrier_cbs_tasks != NULL) {
1753 for (i = 0; i < n_barrier_cbs; i++) {
1754 if (barrier_cbs_tasks[i] != NULL) {
1755 VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task");
1756 kthread_stop(barrier_cbs_tasks[i]);
1757 barrier_cbs_tasks[i] = NULL;
1758 }
1759 }
1760 kfree(barrier_cbs_tasks);
1761 barrier_cbs_tasks = NULL;
1762 }
1763 if (barrier_cbs_wq != NULL) {
1764 kfree(barrier_cbs_wq);
1765 barrier_cbs_wq = NULL;
1766 }
1557} 1767}
1558 1768
1559static int rcutorture_cpu_notify(struct notifier_block *self, 1769static int rcutorture_cpu_notify(struct notifier_block *self,
@@ -1598,6 +1808,7 @@ rcu_torture_cleanup(void)
1598 fullstop = FULLSTOP_RMMOD; 1808 fullstop = FULLSTOP_RMMOD;
1599 mutex_unlock(&fullstop_mutex); 1809 mutex_unlock(&fullstop_mutex);
1600 unregister_reboot_notifier(&rcutorture_shutdown_nb); 1810 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1811 rcu_torture_barrier_cleanup();
1601 rcu_torture_stall_cleanup(); 1812 rcu_torture_stall_cleanup();
1602 if (stutter_task) { 1813 if (stutter_task) {
1603 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 1814 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
@@ -1665,6 +1876,7 @@ rcu_torture_cleanup(void)
1665 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); 1876 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
1666 kthread_stop(shutdown_task); 1877 kthread_stop(shutdown_task);
1667 } 1878 }
1879 shutdown_task = NULL;
1668 rcu_torture_onoff_cleanup(); 1880 rcu_torture_onoff_cleanup();
1669 1881
1670 /* Wait for all RCU callbacks to fire. */ 1882 /* Wait for all RCU callbacks to fire. */
@@ -1676,7 +1888,7 @@ rcu_torture_cleanup(void)
1676 1888
1677 if (cur_ops->cleanup) 1889 if (cur_ops->cleanup)
1678 cur_ops->cleanup(); 1890 cur_ops->cleanup();
1679 if (atomic_read(&n_rcu_torture_error)) 1891 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
1680 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1892 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1681 else if (n_online_successes != n_online_attempts || 1893 else if (n_online_successes != n_online_attempts ||
1682 n_offline_successes != n_offline_attempts) 1894 n_offline_successes != n_offline_attempts)
@@ -1692,10 +1904,12 @@ rcu_torture_init(void)
1692 int i; 1904 int i;
1693 int cpu; 1905 int cpu;
1694 int firsterr = 0; 1906 int firsterr = 0;
1907 int retval;
1695 static struct rcu_torture_ops *torture_ops[] = 1908 static struct rcu_torture_ops *torture_ops[] =
1696 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1909 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1697 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1910 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1698 &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops, 1911 &srcu_ops, &srcu_sync_ops, &srcu_raw_ops,
1912 &srcu_raw_sync_ops, &srcu_expedited_ops,
1699 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1913 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1700 1914
1701 mutex_lock(&fullstop_mutex); 1915 mutex_lock(&fullstop_mutex);
@@ -1749,6 +1963,7 @@ rcu_torture_init(void)
1749 atomic_set(&n_rcu_torture_free, 0); 1963 atomic_set(&n_rcu_torture_free, 0);
1750 atomic_set(&n_rcu_torture_mberror, 0); 1964 atomic_set(&n_rcu_torture_mberror, 0);
1751 atomic_set(&n_rcu_torture_error, 0); 1965 atomic_set(&n_rcu_torture_error, 0);
1966 n_rcu_torture_barrier_error = 0;
1752 n_rcu_torture_boost_ktrerror = 0; 1967 n_rcu_torture_boost_ktrerror = 0;
1753 n_rcu_torture_boost_rterror = 0; 1968 n_rcu_torture_boost_rterror = 0;
1754 n_rcu_torture_boost_failure = 0; 1969 n_rcu_torture_boost_failure = 0;
@@ -1872,7 +2087,6 @@ rcu_torture_init(void)
1872 test_boost_duration = 2; 2087 test_boost_duration = 2;
1873 if ((test_boost == 1 && cur_ops->can_boost) || 2088 if ((test_boost == 1 && cur_ops->can_boost) ||
1874 test_boost == 2) { 2089 test_boost == 2) {
1875 int retval;
1876 2090
1877 boost_starttime = jiffies + test_boost_interval * HZ; 2091 boost_starttime = jiffies + test_boost_interval * HZ;
1878 register_cpu_notifier(&rcutorture_cpu_nb); 2092 register_cpu_notifier(&rcutorture_cpu_nb);
@@ -1897,9 +2111,22 @@ rcu_torture_init(void)
1897 goto unwind; 2111 goto unwind;
1898 } 2112 }
1899 } 2113 }
1900 rcu_torture_onoff_init(); 2114 i = rcu_torture_onoff_init();
2115 if (i != 0) {
2116 firsterr = i;
2117 goto unwind;
2118 }
1901 register_reboot_notifier(&rcutorture_shutdown_nb); 2119 register_reboot_notifier(&rcutorture_shutdown_nb);
1902 rcu_torture_stall_init(); 2120 i = rcu_torture_stall_init();
2121 if (i != 0) {
2122 firsterr = i;
2123 goto unwind;
2124 }
2125 retval = rcu_torture_barrier_init();
2126 if (retval != 0) {
2127 firsterr = retval;
2128 goto unwind;
2129 }
1903 rcutorture_record_test_transition(); 2130 rcutorture_record_test_transition();
1904 mutex_unlock(&fullstop_mutex); 2131 mutex_unlock(&fullstop_mutex);
1905 return 0; 2132 return 0;
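The barrier test added above checks one invariant: once every tester kthread has posted its callback via cur_ops->call(), cur_ops->cb_barrier() (rcu_barrier() for the plain flavor) must not return before all of those callbacks have run. A minimal stand-alone sketch of the same check, using the plain RCU flavor and made-up names:

        static atomic_t invoked;
        static struct rcu_head heads[8];          /* one per tester; size illustrative */

        static void count_cb(struct rcu_head *rhp)
        {
                atomic_inc(&invoked);             /* mirrors rcu_torture_barrier_cbf() */
        }

        static void barrier_check(void)
        {
                int i;

                for (i = 0; i < ARRAY_SIZE(heads); i++)
                        call_rcu(&heads[i], count_cb);    /* testers post their callbacks */

                rcu_barrier();                            /* must wait for every one of them */
                WARN_ON_ONCE(atomic_read(&invoked) != ARRAY_SIZE(heads));
        }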
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 1050d6d3922c..0da7b88d92d0 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -75,6 +75,8 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
75 .gpnum = -300, \ 75 .gpnum = -300, \
76 .completed = -300, \ 76 .completed = -300, \
77 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ 77 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
78 .orphan_nxttail = &structname##_state.orphan_nxtlist, \
79 .orphan_donetail = &structname##_state.orphan_donelist, \
78 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ 80 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
79 .n_force_qs = 0, \ 81 .n_force_qs = 0, \
80 .n_force_qs_ngp = 0, \ 82 .n_force_qs_ngp = 0, \
@@ -145,6 +147,13 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
145unsigned long rcutorture_testseq; 147unsigned long rcutorture_testseq;
146unsigned long rcutorture_vernum; 148unsigned long rcutorture_vernum;
147 149
150/* State information for rcu_barrier() and friends. */
151
152static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
153static atomic_t rcu_barrier_cpu_count;
154static DEFINE_MUTEX(rcu_barrier_mutex);
155static struct completion rcu_barrier_completion;
156
148/* 157/*
149 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 158 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
150 * permit this function to be invoked without holding the root rcu_node 159 * permit this function to be invoked without holding the root rcu_node
@@ -192,7 +201,6 @@ void rcu_note_context_switch(int cpu)
192{ 201{
193 trace_rcu_utilization("Start context switch"); 202 trace_rcu_utilization("Start context switch");
194 rcu_sched_qs(cpu); 203 rcu_sched_qs(cpu);
195 rcu_preempt_note_context_switch(cpu);
196 trace_rcu_utilization("End context switch"); 204 trace_rcu_utilization("End context switch");
197} 205}
198EXPORT_SYMBOL_GPL(rcu_note_context_switch); 206EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -1311,95 +1319,133 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1311#ifdef CONFIG_HOTPLUG_CPU 1319#ifdef CONFIG_HOTPLUG_CPU
1312 1320
1313/* 1321/*
1314 * Move a dying CPU's RCU callbacks to online CPU's callback list. 1322 * Send the specified CPU's RCU callbacks to the orphanage. The
1315 * Also record a quiescent state for this CPU for the current grace period. 1323 * specified CPU must be offline, and the caller must hold the
1316 * Synchronization and interrupt disabling are not required because 1324 * ->onofflock.
1317 * this function executes in stop_machine() context. Therefore, cleanup
1318 * operations that might block must be done later from the CPU_DEAD
1319 * notifier.
1320 *
1321 * Note that the outgoing CPU's bit has already been cleared in the
1322 * cpu_online_mask. This allows us to randomly pick a callback
1323 * destination from the bits set in that mask.
1324 */ 1325 */
1325static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 1326static void
1327rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1328 struct rcu_node *rnp, struct rcu_data *rdp)
1326{ 1329{
1327 int i; 1330 int i;
1328 unsigned long mask;
1329 int receive_cpu = cpumask_any(cpu_online_mask);
1330 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1331 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
1332 RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */
1333 1331
1334 /* First, adjust the counts. */ 1332 /*
1333 * Orphan the callbacks. First adjust the counts. This is safe
1334 * because ->onofflock excludes _rcu_barrier()'s adoption of
1335 * the callbacks, thus no memory barrier is required.
1336 */
1335 if (rdp->nxtlist != NULL) { 1337 if (rdp->nxtlist != NULL) {
1336 receive_rdp->qlen_lazy += rdp->qlen_lazy; 1338 rsp->qlen_lazy += rdp->qlen_lazy;
1337 receive_rdp->qlen += rdp->qlen; 1339 rsp->qlen += rdp->qlen;
1340 rdp->n_cbs_orphaned += rdp->qlen;
1338 rdp->qlen_lazy = 0; 1341 rdp->qlen_lazy = 0;
1339 rdp->qlen = 0; 1342 rdp->qlen = 0;
1340 } 1343 }
1341 1344
1342 /* 1345 /*
1343 * Next, move ready-to-invoke callbacks to be invoked on some 1346 * Next, move those callbacks still needing a grace period to
1344 * other CPU. These will not be required to pass through another 1347 * the orphanage, where some other CPU will pick them up.
1345 * grace period: They are done, regardless of CPU. 1348 * Some of the callbacks might have gone partway through a grace
1349 * period, but that is too bad. They get to start over because we
1350 * cannot assume that grace periods are synchronized across CPUs.
1351 * We don't bother updating the ->nxttail[] array yet, instead
1352 * we just reset the whole thing later on.
1346 */ 1353 */
1347 if (rdp->nxtlist != NULL && 1354 if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) {
1348 rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) { 1355 *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL];
1349 struct rcu_head *oldhead; 1356 rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL];
1350 struct rcu_head **oldtail; 1357 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
1351 struct rcu_head **newtail;
1352
1353 oldhead = rdp->nxtlist;
1354 oldtail = receive_rdp->nxttail[RCU_DONE_TAIL];
1355 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1356 *rdp->nxttail[RCU_DONE_TAIL] = *oldtail;
1357 *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead;
1358 newtail = rdp->nxttail[RCU_DONE_TAIL];
1359 for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) {
1360 if (receive_rdp->nxttail[i] == oldtail)
1361 receive_rdp->nxttail[i] = newtail;
1362 if (rdp->nxttail[i] == newtail)
1363 rdp->nxttail[i] = &rdp->nxtlist;
1364 }
1365 } 1358 }
1366 1359
1367 /* 1360 /*
1368 * Finally, put the rest of the callbacks at the end of the list. 1361 * Then move the ready-to-invoke callbacks to the orphanage,
1369 * The ones that made it partway through get to start over: We 1362 * where some other CPU will pick them up. These will not be
1370 * cannot assume that grace periods are synchronized across CPUs. 1363 * required to pass though another grace period: They are done.
1371 * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but
1372 * this does not seem compelling. Not yet, anyway.)
1373 */ 1364 */
1374 if (rdp->nxtlist != NULL) { 1365 if (rdp->nxtlist != NULL) {
1375 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; 1366 *rsp->orphan_donetail = rdp->nxtlist;
1376 receive_rdp->nxttail[RCU_NEXT_TAIL] = 1367 rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
1377 rdp->nxttail[RCU_NEXT_TAIL];
1378 receive_rdp->n_cbs_adopted += rdp->qlen;
1379 rdp->n_cbs_orphaned += rdp->qlen;
1380
1381 rdp->nxtlist = NULL;
1382 for (i = 0; i < RCU_NEXT_SIZE; i++)
1383 rdp->nxttail[i] = &rdp->nxtlist;
1384 } 1368 }
1385 1369
1370 /* Finally, initialize the rcu_data structure's list to empty. */
1371 rdp->nxtlist = NULL;
1372 for (i = 0; i < RCU_NEXT_SIZE; i++)
1373 rdp->nxttail[i] = &rdp->nxtlist;
1374}
1375
1376/*
1377 * Adopt the RCU callbacks from the specified rcu_state structure's
1378 * orphanage. The caller must hold the ->onofflock.
1379 */
1380static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1381{
1382 int i;
1383 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
1384
1386 /* 1385 /*
1387 * Record a quiescent state for the dying CPU. This is safe 1386 * If there is an rcu_barrier() operation in progress, then
1388 * only because we have already cleared out the callbacks. 1387 * only the task doing that operation is permitted to adopt
1389 * (Otherwise, the RCU core might try to schedule the invocation 1388 * callbacks. To do otherwise breaks rcu_barrier() and friends
1390 * of callbacks on this now-offline CPU, which would be bad.) 1389 * by causing them to fail to wait for the callbacks in the
1390 * orphanage.
1391 */ 1391 */
1392 mask = rdp->grpmask; /* rnp->grplo is constant. */ 1392 if (rsp->rcu_barrier_in_progress &&
1393 rsp->rcu_barrier_in_progress != current)
1394 return;
1395
1396 /* Do the accounting first. */
1397 rdp->qlen_lazy += rsp->qlen_lazy;
1398 rdp->qlen += rsp->qlen;
1399 rdp->n_cbs_adopted += rsp->qlen;
1400 rsp->qlen_lazy = 0;
1401 rsp->qlen = 0;
1402
1403 /*
1404 * We do not need a memory barrier here because the only way we
1405 * can get here if there is an rcu_barrier() in flight is if
1406 * we are the task doing the rcu_barrier().
1407 */
1408
1409 /* First adopt the ready-to-invoke callbacks. */
1410 if (rsp->orphan_donelist != NULL) {
1411 *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL];
1412 *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist;
1413 for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--)
1414 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
1415 rdp->nxttail[i] = rsp->orphan_donetail;
1416 rsp->orphan_donelist = NULL;
1417 rsp->orphan_donetail = &rsp->orphan_donelist;
1418 }
1419
1420 /* And then adopt the callbacks that still need a grace period. */
1421 if (rsp->orphan_nxtlist != NULL) {
1422 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist;
1423 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail;
1424 rsp->orphan_nxtlist = NULL;
1425 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
1426 }
1427}
1428
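The splice above is easier to see in isolation: each callback list is a singly linked list whose tail field points at the last ->next pointer, so moving an entire list onto another costs two pointer assignments. Below is a minimal userspace sketch of that tail-pointer splice; the cblist type and the function names are illustrative, not the kernel's API.

#include <stdio.h>
#include <stdlib.h>

/* Simplified analogue of the kernel's head/tail-pointer callback lists. */
struct cb { int id; struct cb *next; };
struct cblist { struct cb *head; struct cb **tail; };

static void cblist_init(struct cblist *l) { l->head = NULL; l->tail = &l->head; }

static void cblist_enqueue(struct cblist *l, struct cb *c)
{
	c->next = NULL;
	*l->tail = c;           /* append at the current tail */
	l->tail = &c->next;     /* tail now points at the new element's ->next */
}

/* Splice everything on @src onto the end of @dst, then empty @src. */
static void cblist_splice(struct cblist *dst, struct cblist *src)
{
	if (src->head == NULL)
		return;
	*dst->tail = src->head;
	dst->tail = src->tail;
	cblist_init(src);
}

int main(void)
{
	struct cblist dying, orphanage, survivor;
	struct cb cbs[4] = { {0}, {1}, {2}, {3} };
	struct cb *c;
	int i;

	cblist_init(&dying);
	cblist_init(&orphanage);
	cblist_init(&survivor);
	for (i = 0; i < 4; i++)
		cblist_enqueue(&dying, &cbs[i]);

	cblist_splice(&orphanage, &dying);     /* rcu_send_cbs_to_orphanage() step */
	cblist_splice(&survivor, &orphanage);  /* rcu_adopt_orphan_cbs() step */

	for (c = survivor.head; c; c = c->next)
		printf("adopted callback %d\n", c->id);
	return 0;
}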
1429/*
1430 * Trace the fact that this CPU is going offline.
1431 */
1432static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1433{
1434 RCU_TRACE(unsigned long mask);
1435 RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
1436 RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
1437
1438 RCU_TRACE(mask = rdp->grpmask);
1393 trace_rcu_grace_period(rsp->name, 1439 trace_rcu_grace_period(rsp->name,
1394 rnp->gpnum + 1 - !!(rnp->qsmask & mask), 1440 rnp->gpnum + 1 - !!(rnp->qsmask & mask),
1395 "cpuofl"); 1441 "cpuofl");
1396 rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum);
1397 /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */
1398} 1442}
1399 1443
1400/* 1444/*
1401 * The CPU has been completely removed, and some other CPU is reporting 1445 * The CPU has been completely removed, and some other CPU is reporting
1402 * this fact from process context. Do the remainder of the cleanup. 1446 * this fact from process context. Do the remainder of the cleanup,
1447 * including orphaning the outgoing CPU's RCU callbacks, and also
1448 * adopting them, if there is no _rcu_barrier() instance running.
1403 * There can only be one CPU hotplug operation at a time, so no other 1449 * There can only be one CPU hotplug operation at a time, so no other
1404 * CPU can be attempting to update rcu_cpu_kthread_task. 1450 * CPU can be attempting to update rcu_cpu_kthread_task.
1405 */ 1451 */
@@ -1409,17 +1455,21 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1409 unsigned long mask; 1455 unsigned long mask;
1410 int need_report = 0; 1456 int need_report = 0;
1411 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1457 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1412 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */ 1458 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
1413 1459
1414 /* Adjust any no-longer-needed kthreads. */ 1460 /* Adjust any no-longer-needed kthreads. */
1415 rcu_stop_cpu_kthread(cpu); 1461 rcu_stop_cpu_kthread(cpu);
1416 rcu_node_kthread_setaffinity(rnp, -1); 1462 rcu_node_kthread_setaffinity(rnp, -1);
1417 1463
1418 /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */ 1464 /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
1419 1465
1420 /* Exclude any attempts to start a new grace period. */ 1466 /* Exclude any attempts to start a new grace period. */
1421 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1467 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1422 1468
1469 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
1470 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
1471 rcu_adopt_orphan_cbs(rsp);
1472
1423 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 1473 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
1424 mask = rdp->grpmask; /* rnp->grplo is constant. */ 1474 mask = rdp->grpmask; /* rnp->grplo is constant. */
1425 do { 1475 do {
@@ -1456,6 +1506,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1456 1506
1457#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1507#else /* #ifdef CONFIG_HOTPLUG_CPU */
1458 1508
1509static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1510{
1511}
1512
1459static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 1513static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1460{ 1514{
1461} 1515}
@@ -1524,9 +1578,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1524 rcu_is_callbacks_kthread()); 1578 rcu_is_callbacks_kthread());
1525 1579
1526 /* Update count, and requeue any remaining callbacks. */ 1580 /* Update count, and requeue any remaining callbacks. */
1527 rdp->qlen_lazy -= count_lazy;
1528 rdp->qlen -= count;
1529 rdp->n_cbs_invoked += count;
1530 if (list != NULL) { 1581 if (list != NULL) {
1531 *tail = rdp->nxtlist; 1582 *tail = rdp->nxtlist;
1532 rdp->nxtlist = list; 1583 rdp->nxtlist = list;
@@ -1536,6 +1587,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1536 else 1587 else
1537 break; 1588 break;
1538 } 1589 }
1590 smp_mb(); /* List handling before counting for rcu_barrier(). */
1591 rdp->qlen_lazy -= count_lazy;
1592 rdp->qlen -= count;
1593 rdp->n_cbs_invoked += count;
1539 1594
1540 /* Reinstate batch limit if we have worked down the excess. */ 1595 /* Reinstate batch limit if we have worked down the excess. */
1541 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) 1596 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
@@ -1820,15 +1875,17 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1820 * a quiescent state betweentimes. 1875 * a quiescent state betweentimes.
1821 */ 1876 */
1822 local_irq_save(flags); 1877 local_irq_save(flags);
1823 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
1824 rdp = this_cpu_ptr(rsp->rda); 1878 rdp = this_cpu_ptr(rsp->rda);
1825 1879
1826 /* Add the callback to our list. */ 1880 /* Add the callback to our list. */
1827 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1828 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1829 rdp->qlen++; 1881 rdp->qlen++;
1830 if (lazy) 1882 if (lazy)
1831 rdp->qlen_lazy++; 1883 rdp->qlen_lazy++;
1884 else
1885 rcu_idle_count_callbacks_posted();
1886 smp_mb(); /* Count before adding callback for rcu_barrier(). */
1887 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1888 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1832 1889
1833 if (__is_kfree_rcu_offset((unsigned long)func)) 1890 if (__is_kfree_rcu_offset((unsigned long)func))
1834 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, 1891 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
@@ -1894,6 +1951,38 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1894} 1951}
1895EXPORT_SYMBOL_GPL(call_rcu_bh); 1952EXPORT_SYMBOL_GPL(call_rcu_bh);
1896 1953
1954/*
1955 * Because a context switch is a grace period for RCU-sched and RCU-bh,
1956 * any blocking grace-period wait automatically implies a grace period
1957 * if there is only one CPU online at any point in time during execution
1958 * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to
1959 * occasionally incorrectly indicate that there are multiple CPUs online
1960 * when there was in fact only one the whole time, as this just adds
1961 * some overhead: RCU still operates correctly.
1962 *
1963 * Of course, sampling num_online_cpus() with preemption enabled can
1964 * give erroneous results if there are concurrent CPU-hotplug operations.
1965 * For example, given a demonic sequence of preemptions in num_online_cpus()
1966 * and CPU-hotplug operations, there could be two or more CPUs online at
1967 * all times, but num_online_cpus() might well return one (or even zero).
1968 *
1969 * However, all such demonic sequences require at least one CPU-offline
1970 * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer
1971 * is only a problem if there is an RCU read-side critical section executing
1972 * throughout. But RCU-sched and RCU-bh read-side critical sections
1973 * disable either preemption or bh, which prevents a CPU from going offline.
1974 * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return
1975 * that there is only one CPU when in fact there was more than one throughout
1976 * is when there were no RCU readers in the system. If there are no
1977 * RCU readers, the grace period by definition can be of zero length,
1978 * regardless of the number of online CPUs.
1979 */
1980static inline int rcu_blocking_is_gp(void)
1981{
1982 might_sleep(); /* Check for RCU read-side critical section. */
1983 return num_online_cpus() <= 1;
1984}
1985
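The argument above boils down to a cheap short-circuit: with a single online CPU and no readers, the blocking wait itself already is a grace period. Here is a small standalone sketch of how a synchronize_sched()-style caller uses that test; the stubs below are illustrative placeholders, not kernel code.

#include <stdio.h>

/* Stubs standing in for kernel facilities. */
static int num_online_cpus(void) { return 1; }
static void might_sleep(void) { /* no-op stub; the real one debug-checks context */ }
static void wait_rcu_gp(void) { printf("waiting for a full grace period\n"); }

static int rcu_blocking_is_gp(void)
{
	might_sleep();
	return num_online_cpus() <= 1;  /* one CPU: the blocking wait is itself a GP */
}

/* Sketch of the caller-side short-circuit performed by synchronize_sched(). */
static void synchronize_sched_sketch(void)
{
	if (rcu_blocking_is_gp())
		return;                 /* nothing to wait for */
	wait_rcu_gp();
}

int main(void)
{
	synchronize_sched_sketch();
	return 0;
}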
1897/** 1986/**
1898 * synchronize_sched - wait until an rcu-sched grace period has elapsed. 1987 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
1899 * 1988 *
@@ -2167,11 +2256,10 @@ static int rcu_cpu_has_callbacks(int cpu)
2167 rcu_preempt_cpu_has_callbacks(cpu); 2256 rcu_preempt_cpu_has_callbacks(cpu);
2168} 2257}
2169 2258
2170static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 2259/*
2171static atomic_t rcu_barrier_cpu_count; 2260 * RCU callback function for _rcu_barrier(). If we are last, wake
2172static DEFINE_MUTEX(rcu_barrier_mutex); 2261 * up the task executing _rcu_barrier().
2173static struct completion rcu_barrier_completion; 2262 */
2174
2175static void rcu_barrier_callback(struct rcu_head *notused) 2263static void rcu_barrier_callback(struct rcu_head *notused)
2176{ 2264{
2177 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 2265 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -2201,27 +2289,94 @@ static void _rcu_barrier(struct rcu_state *rsp,
2201 void (*call_rcu_func)(struct rcu_head *head, 2289 void (*call_rcu_func)(struct rcu_head *head,
2202 void (*func)(struct rcu_head *head))) 2290 void (*func)(struct rcu_head *head)))
2203{ 2291{
2204 BUG_ON(in_interrupt()); 2292 int cpu;
2293 unsigned long flags;
2294 struct rcu_data *rdp;
2295 struct rcu_head rh;
2296
2297 init_rcu_head_on_stack(&rh);
2298
2205 /* Take mutex to serialize concurrent rcu_barrier() requests. */ 2299 /* Take mutex to serialize concurrent rcu_barrier() requests. */
2206 mutex_lock(&rcu_barrier_mutex); 2300 mutex_lock(&rcu_barrier_mutex);
2207 init_completion(&rcu_barrier_completion); 2301
2302 smp_mb(); /* Prevent any prior operations from leaking in. */
2303
2208 /* 2304 /*
2209 * Initialize rcu_barrier_cpu_count to 1, then invoke 2305 * Initialize the count to one rather than to zero in order to
2210 * rcu_barrier_func() on each CPU, so that each CPU also has 2306 * avoid a too-soon return to zero in case of a short grace period
2211 * incremented rcu_barrier_cpu_count. Only then is it safe to 2307 * (or preemption of this task). Also flag this task as doing
2212 * decrement rcu_barrier_cpu_count -- otherwise the first CPU 2308 * an rcu_barrier(). This will prevent anyone else from adopting
2213 * might complete its grace period before all of the other CPUs 2309 * orphaned callbacks, which could otherwise cause a failure if a
2214 * did their increment, causing this function to return too 2310 * CPU went offline and quickly came back online. To see this,
2215 * early. Note that on_each_cpu() disables irqs, which prevents 2311 * consider the following sequence of events:
2216 * any CPUs from coming online or going offline until each online 2312 *
2217 * CPU has queued its RCU-barrier callback. 2313 * 1. We cause CPU 0 to post an rcu_barrier_callback() callback.
2314 * 2. CPU 1 goes offline, orphaning its callbacks.
2315 * 3. CPU 0 adopts CPU 1's orphaned callbacks.
2316 * 4. CPU 1 comes back online.
2317 * 5. We cause CPU 1 to post an rcu_barrier_callback() callback.
2318 * 6. Both rcu_barrier_callback() callbacks are invoked, awakening
2319 * us -- but before CPU 1's orphaned callbacks are invoked!!!
2218 */ 2320 */
2321 init_completion(&rcu_barrier_completion);
2219 atomic_set(&rcu_barrier_cpu_count, 1); 2322 atomic_set(&rcu_barrier_cpu_count, 1);
2220 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); 2323 raw_spin_lock_irqsave(&rsp->onofflock, flags);
2324 rsp->rcu_barrier_in_progress = current;
2325 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2326
2327 /*
2328 * Force every CPU with callbacks to register a new callback
2329 * that will tell us when all the preceding callbacks have
2330 * been invoked. If an offline CPU has callbacks, wait for
2331 * it to either come back online or to finish orphaning those
2332 * callbacks.
2333 */
2334 for_each_possible_cpu(cpu) {
2335 preempt_disable();
2336 rdp = per_cpu_ptr(rsp->rda, cpu);
2337 if (cpu_is_offline(cpu)) {
2338 preempt_enable();
2339 while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen))
2340 schedule_timeout_interruptible(1);
2341 } else if (ACCESS_ONCE(rdp->qlen)) {
2342 smp_call_function_single(cpu, rcu_barrier_func,
2343 (void *)call_rcu_func, 1);
2344 preempt_enable();
2345 } else {
2346 preempt_enable();
2347 }
2348 }
2349
2350 /*
2351 * Now that all online CPUs have rcu_barrier_callback() callbacks
2352 * posted, we can adopt all of the orphaned callbacks and place
2353 * an rcu_barrier_callback() callback after them. When that is done,
2354 * we are guaranteed to have an rcu_barrier_callback() callback
2355 * following every callback that could possibly have been
2356 * registered before _rcu_barrier() was called.
2357 */
2358 raw_spin_lock_irqsave(&rsp->onofflock, flags);
2359 rcu_adopt_orphan_cbs(rsp);
2360 rsp->rcu_barrier_in_progress = NULL;
2361 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2362 atomic_inc(&rcu_barrier_cpu_count);
2363 smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */
2364 call_rcu_func(&rh, rcu_barrier_callback);
2365
2366 /*
2367 * Now that we have an rcu_barrier_callback() callback on each
2368 * CPU, and thus each counted, remove the initial count.
2369 */
2221 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 2370 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
2222 complete(&rcu_barrier_completion); 2371 complete(&rcu_barrier_completion);
2372
2373 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
2223 wait_for_completion(&rcu_barrier_completion); 2374 wait_for_completion(&rcu_barrier_completion);
2375
2376 /* Other rcu_barrier() invocations can now safely proceed. */
2224 mutex_unlock(&rcu_barrier_mutex); 2377 mutex_unlock(&rcu_barrier_mutex);
2378
2379 destroy_rcu_head_on_stack(&rh);
2225} 2380}
2226 2381
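The _rcu_barrier() rework above relies on a counter that starts at one: the extra reference keeps the completion from firing while callbacks are still being posted, and it is dropped only after every CPU has been handled. A minimal userspace analogue of that counting pattern, using pthreads and C11 atomics in place of the kernel's atomic_t and completion (all names here are illustrative):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* Analogue of rcu_barrier_cpu_count / rcu_barrier_completion. */
static atomic_int barrier_count;
static pthread_mutex_t done_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done_cond = PTHREAD_COND_INITIALIZER;
static int done;

static void barrier_callback(void)
{
	/* The drop that takes the count to zero signals the waiter. */
	if (atomic_fetch_sub(&barrier_count, 1) == 1) {
		pthread_mutex_lock(&done_lock);
		done = 1;
		pthread_cond_signal(&done_cond);
		pthread_mutex_unlock(&done_lock);
	}
}

static void *cpu_worker(void *arg)
{
	(void)arg;
	barrier_callback();   /* stands in for invoking the posted callback */
	return NULL;
}

int main(void)
{
	pthread_t workers[4];
	int i;

	/* Start at 1 so the count cannot hit zero before all posts are done. */
	atomic_store(&barrier_count, 1);
	for (i = 0; i < 4; i++) {
		atomic_fetch_add(&barrier_count, 1);   /* "post" a callback */
		pthread_create(&workers[i], NULL, cpu_worker, NULL);
	}
	barrier_callback();                            /* drop the initial count */

	pthread_mutex_lock(&done_lock);
	while (!done)
		pthread_cond_wait(&done_cond, &done_lock);
	pthread_mutex_unlock(&done_lock);

	for (i = 0; i < 4; i++)
		pthread_join(workers[i], NULL);
	printf("all callbacks accounted for\n");
	return 0;
}

In the kernel, rcu_barrier_in_progress additionally keeps other CPUs from adopting orphans mid-barrier, for the reasons spelled out in the six-step scenario in the comment above.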
2227/** 2382/**
@@ -2418,7 +2573,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2418 2573
2419 for (i = NUM_RCU_LVLS - 1; i > 0; i--) 2574 for (i = NUM_RCU_LVLS - 1; i > 0; i--)
2420 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 2575 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
2421 rsp->levelspread[0] = RCU_FANOUT_LEAF; 2576 rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF;
2422} 2577}
2423#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 2578#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
2424static void __init rcu_init_levelspread(struct rcu_state *rsp) 2579static void __init rcu_init_levelspread(struct rcu_state *rsp)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index cdd1be0a4072..7f5d138dedf5 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -29,18 +29,14 @@
29#include <linux/seqlock.h> 29#include <linux/seqlock.h>
30 30
31/* 31/*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. 32 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
33 * CONFIG_RCU_FANOUT_LEAF.
33 * In theory, it should be possible to add more levels straightforwardly. 34 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this did work well going from three levels to four. 35 * In practice, this did work well going from three levels to four.
35 * Of course, your mileage may vary. 36 * Of course, your mileage may vary.
36 */ 37 */
37#define MAX_RCU_LVLS 4 38#define MAX_RCU_LVLS 4
38#if CONFIG_RCU_FANOUT > 16 39#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF)
39#define RCU_FANOUT_LEAF 16
40#else /* #if CONFIG_RCU_FANOUT > 16 */
41#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
42#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
43#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
44#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) 40#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
45#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) 41#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
46#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) 42#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
@@ -371,6 +367,17 @@ struct rcu_state {
371 367
372 raw_spinlock_t onofflock; /* exclude on/offline and */ 368 raw_spinlock_t onofflock; /* exclude on/offline and */
373 /* starting new GP. */ 369 /* starting new GP. */
370 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
371 /* need a grace period. */
372 struct rcu_head **orphan_nxttail; /* Tail of above. */
373 struct rcu_head *orphan_donelist; /* Orphaned callbacks that */
374 /* are ready to invoke. */
375 struct rcu_head **orphan_donetail; /* Tail of above. */
376 long qlen_lazy; /* Number of lazy callbacks. */
377 long qlen; /* Total number of callbacks. */
378 struct task_struct *rcu_barrier_in_progress;
379 /* Task doing rcu_barrier(), */
380 /* or NULL if no barrier. */
374 raw_spinlock_t fqslock; /* Only one task forcing */ 381 raw_spinlock_t fqslock; /* Only one task forcing */
375 /* quiescent states. */ 382 /* quiescent states. */
376 unsigned long jiffies_force_qs; /* Time at which to invoke */ 383 unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -423,7 +430,6 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
423/* Forward declarations for rcutree_plugin.h */ 430/* Forward declarations for rcutree_plugin.h */
424static void rcu_bootup_announce(void); 431static void rcu_bootup_announce(void);
425long rcu_batches_completed(void); 432long rcu_batches_completed(void);
426static void rcu_preempt_note_context_switch(int cpu);
427static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); 433static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
428#ifdef CONFIG_HOTPLUG_CPU 434#ifdef CONFIG_HOTPLUG_CPU
429static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 435static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
@@ -471,6 +477,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu);
471static void rcu_prepare_for_idle_init(int cpu); 477static void rcu_prepare_for_idle_init(int cpu);
472static void rcu_cleanup_after_idle(int cpu); 478static void rcu_cleanup_after_idle(int cpu);
473static void rcu_prepare_for_idle(int cpu); 479static void rcu_prepare_for_idle(int cpu);
480static void rcu_idle_count_callbacks_posted(void);
474static void print_cpu_stall_info_begin(void); 481static void print_cpu_stall_info_begin(void);
475static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); 482static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
476static void print_cpu_stall_info_end(void); 483static void print_cpu_stall_info_end(void);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c023464816be..2411000d9869 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu)
153 * 153 *
154 * Caller must disable preemption. 154 * Caller must disable preemption.
155 */ 155 */
156static void rcu_preempt_note_context_switch(int cpu) 156void rcu_preempt_note_context_switch(void)
157{ 157{
158 struct task_struct *t = current; 158 struct task_struct *t = current;
159 unsigned long flags; 159 unsigned long flags;
@@ -164,7 +164,7 @@ static void rcu_preempt_note_context_switch(int cpu)
164 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 164 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
165 165
166 /* Possibly blocking in an RCU read-side critical section. */ 166 /* Possibly blocking in an RCU read-side critical section. */
167 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); 167 rdp = __this_cpu_ptr(rcu_preempt_state.rda);
168 rnp = rdp->mynode; 168 rnp = rdp->mynode;
169 raw_spin_lock_irqsave(&rnp->lock, flags); 169 raw_spin_lock_irqsave(&rnp->lock, flags);
170 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 170 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -228,7 +228,7 @@ static void rcu_preempt_note_context_switch(int cpu)
228 * means that we continue to block the current grace period. 228 * means that we continue to block the current grace period.
229 */ 229 */
230 local_irq_save(flags); 230 local_irq_save(flags);
231 rcu_preempt_qs(cpu); 231 rcu_preempt_qs(smp_processor_id());
232 local_irq_restore(flags); 232 local_irq_restore(flags);
233} 233}
234 234
@@ -969,22 +969,6 @@ static void __init __rcu_init_preempt(void)
969 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); 969 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
970} 970}
971 971
972/*
973 * Check for a task exiting while in a preemptible-RCU read-side
974 * critical section, clean up if so. No need to issue warnings,
975 * as debug_check_no_locks_held() already does this if lockdep
976 * is enabled.
977 */
978void exit_rcu(void)
979{
980 struct task_struct *t = current;
981
982 if (t->rcu_read_lock_nesting == 0)
983 return;
984 t->rcu_read_lock_nesting = 1;
985 __rcu_read_unlock();
986}
987
988#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 972#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
989 973
990static struct rcu_state *rcu_state = &rcu_sched_state; 974static struct rcu_state *rcu_state = &rcu_sched_state;
@@ -1018,14 +1002,6 @@ void rcu_force_quiescent_state(void)
1018EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 1002EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
1019 1003
1020/* 1004/*
1021 * Because preemptible RCU does not exist, we never have to check for
1022 * CPUs being in quiescent states.
1023 */
1024static void rcu_preempt_note_context_switch(int cpu)
1025{
1026}
1027
1028/*
1029 * Because preemptible RCU does not exist, there are never any preempted 1005 * Because preemptible RCU does not exist, there are never any preempted
1030 * RCU readers. 1006 * RCU readers.
1031 */ 1007 */
@@ -1938,6 +1914,14 @@ static void rcu_prepare_for_idle(int cpu)
1938{ 1914{
1939} 1915}
1940 1916
1917/*
1918 * Don't bother keeping a running count of the number of RCU callbacks
1919 * posted because CONFIG_RCU_FAST_NO_HZ=n.
1920 */
1921static void rcu_idle_count_callbacks_posted(void)
1922{
1923}
1924
1941#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1925#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1942 1926
1943/* 1927/*
@@ -1978,11 +1962,20 @@ static void rcu_prepare_for_idle(int cpu)
1978#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ 1962#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
1979#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ 1963#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
1980 1964
1965/* Loop counter for rcu_prepare_for_idle(). */
1981static DEFINE_PER_CPU(int, rcu_dyntick_drain); 1966static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1967/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */
1982static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); 1968static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1983static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); 1969/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */
1984static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */ 1970static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer);
1985static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ 1971/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */
1972static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires);
1973/* Enable special processing on first attempt to enter dyntick-idle mode. */
1974static DEFINE_PER_CPU(bool, rcu_idle_first_pass);
1975/* Running count of non-lazy callbacks posted, never decremented. */
1976static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted);
1977/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */
1978static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap);
1986 1979
1987/* 1980/*
1988 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no 1981 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
@@ -1995,6 +1988,8 @@ static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */
1995 */ 1988 */
1996int rcu_needs_cpu(int cpu) 1989int rcu_needs_cpu(int cpu)
1997{ 1990{
1991 /* Flag a new idle sojourn to the idle-entry state machine. */
1992 per_cpu(rcu_idle_first_pass, cpu) = 1;
1998 /* If no callbacks, RCU doesn't need the CPU. */ 1993 /* If no callbacks, RCU doesn't need the CPU. */
1999 if (!rcu_cpu_has_callbacks(cpu)) 1994 if (!rcu_cpu_has_callbacks(cpu))
2000 return 0; 1995 return 0;
@@ -2045,16 +2040,34 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
2045} 2040}
2046 2041
2047/* 2042/*
2043 * Handler for smp_call_function_single(). The only point of this
2044 * handler is to wake the CPU up, so the handler does only tracing.
2045 */
2046void rcu_idle_demigrate(void *unused)
2047{
2048 trace_rcu_prep_idle("Demigrate");
2049}
2050
2051/*
2048 * Timer handler used to force CPU to start pushing its remaining RCU 2052 * Timer handler used to force CPU to start pushing its remaining RCU
2049 * callbacks in the case where it entered dyntick-idle mode with callbacks 2053 * callbacks in the case where it entered dyntick-idle mode with callbacks
2050 * pending. The handler doesn't really need to do anything because the 2054 * pending. The handler doesn't really need to do anything because the
2051 * real work is done upon re-entry to idle, or by the next scheduling-clock 2055 * real work is done upon re-entry to idle, or by the next scheduling-clock
2052 * interrupt should idle not be re-entered. 2056 * interrupt should idle not be re-entered.
2057 *
2058 * One special case: the timer gets migrated without awakening the CPU
2059 * on which the timer was scheduled on. In this case, we must wake up
2060 * that CPU. We do so with smp_call_function_single().
2053 */ 2061 */
2054static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) 2062static void rcu_idle_gp_timer_func(unsigned long cpu_in)
2055{ 2063{
2064 int cpu = (int)cpu_in;
2065
2056 trace_rcu_prep_idle("Timer"); 2066 trace_rcu_prep_idle("Timer");
2057 return HRTIMER_NORESTART; 2067 if (cpu != smp_processor_id())
2068 smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
2069 else
2070 WARN_ON_ONCE(1); /* Getting here can hang the system... */
2058} 2071}
2059 2072
2060/* 2073/*
@@ -2062,19 +2075,11 @@ static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
2062 */ 2075 */
2063static void rcu_prepare_for_idle_init(int cpu) 2076static void rcu_prepare_for_idle_init(int cpu)
2064{ 2077{
2065 static int firsttime = 1; 2078 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
2066 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); 2079 setup_timer(&per_cpu(rcu_idle_gp_timer, cpu),
2067 2080 rcu_idle_gp_timer_func, cpu);
2068 hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 2081 per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1;
2069 hrtp->function = rcu_idle_gp_timer_func; 2082 per_cpu(rcu_idle_first_pass, cpu) = 1;
2070 if (firsttime) {
2071 unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
2072
2073 rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
2074 upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY);
2075 rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000);
2076 firsttime = 0;
2077 }
2078} 2083}
2079 2084
2080/* 2085/*
@@ -2084,7 +2089,8 @@ static void rcu_prepare_for_idle_init(int cpu)
2084 */ 2089 */
2085static void rcu_cleanup_after_idle(int cpu) 2090static void rcu_cleanup_after_idle(int cpu)
2086{ 2091{
2087 hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); 2092 del_timer(&per_cpu(rcu_idle_gp_timer, cpu));
2093 trace_rcu_prep_idle("Cleanup after idle");
2088} 2094}
2089 2095
2090/* 2096/*
@@ -2108,6 +2114,29 @@ static void rcu_cleanup_after_idle(int cpu)
2108 */ 2114 */
2109static void rcu_prepare_for_idle(int cpu) 2115static void rcu_prepare_for_idle(int cpu)
2110{ 2116{
2117 struct timer_list *tp;
2118
2119 /*
2120 * If this is an idle re-entry, for example, due to use of
2121 * RCU_NONIDLE() or the new idle-loop tracing API within the idle
2122 * loop, then don't take any state-machine actions, unless the
2123 * momentary exit from idle queued additional non-lazy callbacks.
2124 * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks
2125 * pending.
2126 */
2127 if (!per_cpu(rcu_idle_first_pass, cpu) &&
2128 (per_cpu(rcu_nonlazy_posted, cpu) ==
2129 per_cpu(rcu_nonlazy_posted_snap, cpu))) {
2130 if (rcu_cpu_has_callbacks(cpu)) {
2131 tp = &per_cpu(rcu_idle_gp_timer, cpu);
2132 mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu));
2133 }
2134 return;
2135 }
2136 per_cpu(rcu_idle_first_pass, cpu) = 0;
2137 per_cpu(rcu_nonlazy_posted_snap, cpu) =
2138 per_cpu(rcu_nonlazy_posted, cpu) - 1;
2139
2111 /* 2140 /*
2112 * If there are no callbacks on this CPU, enter dyntick-idle mode. 2141 * If there are no callbacks on this CPU, enter dyntick-idle mode.
2113 * Also reset state to avoid prejudicing later attempts. 2142 * Also reset state to avoid prejudicing later attempts.
@@ -2140,11 +2169,15 @@ static void rcu_prepare_for_idle(int cpu)
2140 per_cpu(rcu_dyntick_drain, cpu) = 0; 2169 per_cpu(rcu_dyntick_drain, cpu) = 0;
2141 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2170 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
2142 if (rcu_cpu_has_nonlazy_callbacks(cpu)) 2171 if (rcu_cpu_has_nonlazy_callbacks(cpu))
2143 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), 2172 per_cpu(rcu_idle_gp_timer_expires, cpu) =
2144 rcu_idle_gp_wait, HRTIMER_MODE_REL); 2173 jiffies + RCU_IDLE_GP_DELAY;
2145 else 2174 else
2146 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), 2175 per_cpu(rcu_idle_gp_timer_expires, cpu) =
2147 rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL); 2176 jiffies + RCU_IDLE_LAZY_GP_DELAY;
2177 tp = &per_cpu(rcu_idle_gp_timer, cpu);
2178 mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu));
2179 per_cpu(rcu_nonlazy_posted_snap, cpu) =
2180 per_cpu(rcu_nonlazy_posted, cpu);
2148 return; /* Nothing more to do immediately. */ 2181 return; /* Nothing more to do immediately. */
2149 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2182 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
2150 /* We have hit the limit, so time to give up. */ 2183 /* We have hit the limit, so time to give up. */
@@ -2184,6 +2217,19 @@ static void rcu_prepare_for_idle(int cpu)
2184 trace_rcu_prep_idle("Callbacks drained"); 2217 trace_rcu_prep_idle("Callbacks drained");
2185} 2218}
2186 2219
2220/*
2221 * Keep a running count of the number of non-lazy callbacks posted
2222 * on this CPU. This running counter (which is never decremented) allows
2223 * rcu_prepare_for_idle() to detect when something out of the idle loop
2224 * posts a callback, even if an equal number of callbacks are invoked.
2225 * Of course, callbacks should only be posted from within a trace event
2226 * designed to be called from idle or from within RCU_NONIDLE().
2227 */
2228static void rcu_idle_count_callbacks_posted(void)
2229{
2230 __this_cpu_add(rcu_nonlazy_posted, 1);
2231}
2232
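Because this counter only ever grows, comparing it against a snapshot taken on the previous idle pass distinguishes "new callbacks were posted meanwhile" from "the same backlog is still here", even when invocations kept the queue length unchanged. A tiny sketch of that counter-and-snapshot pattern (variable names are illustrative):

#include <stdio.h>

static unsigned long nonlazy_posted;       /* running count, never decremented */
static unsigned long nonlazy_posted_snap;  /* value seen on the previous idle pass */

static void post_callback(void) { nonlazy_posted++; }

static int new_work_since_last_idle_pass(void)
{
	return nonlazy_posted != nonlazy_posted_snap;
}

int main(void)
{
	nonlazy_posted_snap = nonlazy_posted;   /* entering idle: take a snapshot */
	printf("new work? %d\n", new_work_since_last_idle_pass()); /* prints 0 */
	post_callback();                        /* momentary exit posts a callback */
	printf("new work? %d\n", new_work_since_last_idle_pass()); /* prints 1 */
	return 0;
}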
2187#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2233#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
2188 2234
2189#ifdef CONFIG_RCU_CPU_STALL_INFO 2235#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -2192,14 +2238,12 @@ static void rcu_prepare_for_idle(int cpu)
2192 2238
2193static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 2239static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2194{ 2240{
2195 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); 2241 struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu);
2196 2242
2197 sprintf(cp, "drain=%d %c timer=%lld", 2243 sprintf(cp, "drain=%d %c timer=%lu",
2198 per_cpu(rcu_dyntick_drain, cpu), 2244 per_cpu(rcu_dyntick_drain, cpu),
2199 per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', 2245 per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.',
2200 hrtimer_active(hrtp) 2246 timer_pending(tltp) ? tltp->expires - jiffies : -1);
2201 ? ktime_to_us(hrtimer_get_remaining(hrtp))
2202 : -1);
2203} 2247}
2204 2248
2205#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 2249#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index ed459edeff43..d4bc16ddd1d4 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -271,13 +271,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
271 271
272 gpnum = rsp->gpnum; 272 gpnum = rsp->gpnum;
273 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 273 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
274 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", 274 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
275 rsp->completed, gpnum, rsp->fqs_state, 275 rsp->completed, gpnum, rsp->fqs_state,
276 (long)(rsp->jiffies_force_qs - jiffies), 276 (long)(rsp->jiffies_force_qs - jiffies),
277 (int)(jiffies & 0xffff), 277 (int)(jiffies & 0xffff),
278 rsp->n_force_qs, rsp->n_force_qs_ngp, 278 rsp->n_force_qs, rsp->n_force_qs_ngp,
279 rsp->n_force_qs - rsp->n_force_qs_ngp, 279 rsp->n_force_qs - rsp->n_force_qs_ngp,
280 rsp->n_force_qs_lh); 280 rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen);
281 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 281 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
282 if (rnp->level != level) { 282 if (rnp->level != level) {
283 seq_puts(m, "\n"); 283 seq_puts(m, "\n");
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index d508363858b3..bebe2b170d49 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -22,75 +22,70 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
22 counter->parent = parent; 22 counter->parent = parent;
23} 23}
24 24
25int res_counter_charge_locked(struct res_counter *counter, unsigned long val) 25int res_counter_charge_locked(struct res_counter *counter, unsigned long val,
26 bool force)
26{ 27{
28 int ret = 0;
29
27 if (counter->usage + val > counter->limit) { 30 if (counter->usage + val > counter->limit) {
28 counter->failcnt++; 31 counter->failcnt++;
29 return -ENOMEM; 32 ret = -ENOMEM;
33 if (!force)
34 return ret;
30 } 35 }
31 36
32 counter->usage += val; 37 counter->usage += val;
33 if (counter->usage > counter->max_usage) 38 if (counter->usage > counter->max_usage)
34 counter->max_usage = counter->usage; 39 counter->max_usage = counter->usage;
35 return 0; 40 return ret;
36} 41}
37 42
38int res_counter_charge(struct res_counter *counter, unsigned long val, 43static int __res_counter_charge(struct res_counter *counter, unsigned long val,
39 struct res_counter **limit_fail_at) 44 struct res_counter **limit_fail_at, bool force)
40{ 45{
41 int ret; 46 int ret, r;
42 unsigned long flags; 47 unsigned long flags;
43 struct res_counter *c, *u; 48 struct res_counter *c, *u;
44 49
50 r = ret = 0;
45 *limit_fail_at = NULL; 51 *limit_fail_at = NULL;
46 local_irq_save(flags); 52 local_irq_save(flags);
47 for (c = counter; c != NULL; c = c->parent) { 53 for (c = counter; c != NULL; c = c->parent) {
48 spin_lock(&c->lock); 54 spin_lock(&c->lock);
49 ret = res_counter_charge_locked(c, val); 55 r = res_counter_charge_locked(c, val, force);
50 spin_unlock(&c->lock); 56 spin_unlock(&c->lock);
51 if (ret < 0) { 57 if (r < 0 && !ret) {
58 ret = r;
52 *limit_fail_at = c; 59 *limit_fail_at = c;
53 goto undo; 60 if (!force)
61 break;
54 } 62 }
55 } 63 }
56 ret = 0; 64
57 goto done; 65 if (ret < 0 && !force) {
58undo: 66 for (u = counter; u != c; u = u->parent) {
59 for (u = counter; u != c; u = u->parent) { 67 spin_lock(&u->lock);
60 spin_lock(&u->lock); 68 res_counter_uncharge_locked(u, val);
61 res_counter_uncharge_locked(u, val); 69 spin_unlock(&u->lock);
62 spin_unlock(&u->lock); 70 }
63 } 71 }
64done:
65 local_irq_restore(flags); 72 local_irq_restore(flags);
73
66 return ret; 74 return ret;
67} 75}
68 76
77int res_counter_charge(struct res_counter *counter, unsigned long val,
78 struct res_counter **limit_fail_at)
79{
80 return __res_counter_charge(counter, val, limit_fail_at, false);
81}
82
69int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, 83int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
70 struct res_counter **limit_fail_at) 84 struct res_counter **limit_fail_at)
71{ 85{
72 int ret, r; 86 return __res_counter_charge(counter, val, limit_fail_at, true);
73 unsigned long flags;
74 struct res_counter *c;
75
76 r = ret = 0;
77 *limit_fail_at = NULL;
78 local_irq_save(flags);
79 for (c = counter; c != NULL; c = c->parent) {
80 spin_lock(&c->lock);
81 r = res_counter_charge_locked(c, val);
82 if (r)
83 c->usage += val;
84 spin_unlock(&c->lock);
85 if (r < 0 && ret == 0) {
86 *limit_fail_at = c;
87 ret = r;
88 }
89 }
90 local_irq_restore(flags);
91
92 return ret;
93} 87}
88
94void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) 89void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
95{ 90{
96 if (WARN_ON(counter->usage < val)) 91 if (WARN_ON(counter->usage < val))
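The res_counter rework above folds the ordinary and nofail charge paths into one walk up the hierarchy: a forced charge records the failure but keeps charging, while an ordinary charge stops at the first counter over its limit and rolls back the ancestors already charged. A simplified, lock-free userspace sketch of that walk (the struct and function names are illustrative, and max_usage tracking is omitted):

#include <stdio.h>

struct counter {
	const char *name;
	unsigned long usage, limit, failcnt;
	struct counter *parent;
};

static int charge_one(struct counter *c, unsigned long val, int force)
{
	int ret = 0;

	if (c->usage + val > c->limit) {
		c->failcnt++;
		ret = -1;
		if (!force)
			return ret;     /* hard failure unless forced */
	}
	c->usage += val;                /* forced charges may exceed the limit */
	return ret;
}

static int charge(struct counter *c, unsigned long val, int force)
{
	struct counter *p, *failed = NULL;
	int ret = 0;

	for (p = c; p != NULL; p = p->parent) {
		int r = charge_one(p, val, force);
		if (r < 0 && !ret) {
			ret = r;
			failed = p;
			if (!force)
				break;  /* stop walking up at the first failure */
		}
	}
	if (ret < 0 && !force)          /* roll back the partial charges */
		for (p = c; p != failed; p = p->parent)
			p->usage -= val;
	return ret;
}

int main(void)
{
	struct counter root = { "root", 0, 100, 0, NULL };
	struct counter child = { "child", 0, 10, 0, &root };
	int r;

	r = charge(&child, 8, 0);
	printf("charge 8 -> %d (child=%lu root=%lu)\n", r, child.usage, root.usage);
	r = charge(&child, 8, 0);          /* exceeds child's limit: rolled back */
	printf("charge 8 -> %d (child=%lu root=%lu)\n", r, child.usage, root.usage);
	r = charge(&child, 8, 1);          /* forced: charged despite the limit */
	printf("forced 8 -> %d (child=%lu root=%lu)\n", r, child.usage, root.usage);
	return 0;
}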
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a7dd35102a3..173ea52f3af0 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,5 +16,3 @@ obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o 17obj-$(CONFIG_SCHEDSTATS) += stats.o
18obj-$(CONFIG_SCHED_DEBUG) += debug.o 18obj-$(CONFIG_SCHED_DEBUG) += debug.o
19
20
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b189fecaef90..39eb6011bc38 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -83,6 +83,7 @@
83 83
84#include "sched.h" 84#include "sched.h"
85#include "../workqueue_sched.h" 85#include "../workqueue_sched.h"
86#include "../smpboot.h"
86 87
87#define CREATE_TRACE_POINTS 88#define CREATE_TRACE_POINTS
88#include <trace/events/sched.h> 89#include <trace/events/sched.h>
@@ -692,8 +693,6 @@ int tg_nop(struct task_group *tg, void *data)
692} 693}
693#endif 694#endif
694 695
695void update_cpu_load(struct rq *this_rq);
696
697static void set_load_weight(struct task_struct *p) 696static void set_load_weight(struct task_struct *p)
698{ 697{
699 int prio = p->static_prio - MAX_RT_PRIO; 698 int prio = p->static_prio - MAX_RT_PRIO;
@@ -2083,6 +2082,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2083#endif 2082#endif
2084 2083
2085 /* Here we just switch the register state and the stack. */ 2084 /* Here we just switch the register state and the stack. */
2085 rcu_switch_from(prev);
2086 switch_to(prev, next, prev); 2086 switch_to(prev, next, prev);
2087 2087
2088 barrier(); 2088 barrier();
@@ -2486,22 +2486,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2486 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2486 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
2487 * every tick. We fix it up based on jiffies. 2487 * every tick. We fix it up based on jiffies.
2488 */ 2488 */
2489void update_cpu_load(struct rq *this_rq) 2489static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2490 unsigned long pending_updates)
2490{ 2491{
2491 unsigned long this_load = this_rq->load.weight;
2492 unsigned long curr_jiffies = jiffies;
2493 unsigned long pending_updates;
2494 int i, scale; 2492 int i, scale;
2495 2493
2496 this_rq->nr_load_updates++; 2494 this_rq->nr_load_updates++;
2497 2495
2498 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
2499 if (curr_jiffies == this_rq->last_load_update_tick)
2500 return;
2501
2502 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2503 this_rq->last_load_update_tick = curr_jiffies;
2504
2505 /* Update our load: */ 2496 /* Update our load: */
2506 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ 2497 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
2507 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 2498 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
@@ -2526,9 +2517,45 @@ void update_cpu_load(struct rq *this_rq)
2526 sched_avg_update(this_rq); 2517 sched_avg_update(this_rq);
2527} 2518}
2528 2519
2520/*
2521 * Called from nohz_idle_balance() to update the load ratings before doing the
2522 * idle balance.
2523 */
2524void update_idle_cpu_load(struct rq *this_rq)
2525{
2526 unsigned long curr_jiffies = jiffies;
2527 unsigned long load = this_rq->load.weight;
2528 unsigned long pending_updates;
2529
2530 /*
2531 * Bloody broken means of dealing with nohz, but better than nothing..
2532 * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
2533 * update and see 0 difference the one time and 2 the next, even though
2534 * we ticked at roughly the same rate.
2535 *
2536 * Hence we only use this from nohz_idle_balance() and skip this
2537 * nonsense when called from the scheduler_tick() since that's
2538 * guaranteed a stable rate.
2539 */
2540 if (load || curr_jiffies == this_rq->last_load_update_tick)
2541 return;
2542
2543 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2544 this_rq->last_load_update_tick = curr_jiffies;
2545
2546 __update_cpu_load(this_rq, load, pending_updates);
2547}
2548
2549/*
2550 * Called from scheduler_tick()
2551 */
2529static void update_cpu_load_active(struct rq *this_rq) 2552static void update_cpu_load_active(struct rq *this_rq)
2530{ 2553{
2531 update_cpu_load(this_rq); 2554 /*
2555 * See the mess in update_idle_cpu_load().
2556 */
2557 this_rq->last_load_update_tick = jiffies;
2558 __update_cpu_load(this_rq, this_rq->load.weight, 1);
2532 2559
2533 calc_load_account_active(this_rq); 2560 calc_load_account_active(this_rq);
2534} 2561}
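Splitting __update_cpu_load() out lets the nohz path pass an explicit pending_updates count: each load index first decays its stale value once per missed tick, then blends in one tick's worth of the current load, with index 0 fast-tracked. The sketch below shows that shape; its halving-based decay_load_missed() is a deliberately simplified stand-in for the kernel's table-driven degrade factors.

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

/* Illustrative stand-in for the kernel's decay_load_missed(): apply the
 * per-index geometric decay once for every missed tick. */
static unsigned long decay_load_missed(unsigned long load,
				       unsigned long missed, int idx)
{
	while (missed-- && load)
		load -= load >> idx;    /* idx 1: halve, idx 2: drop 1/4, ... */
	return load;
}

static void update_cpu_load(unsigned long cpu_load[], unsigned long this_load,
			    unsigned long pending_updates)
{
	int i;

	cpu_load[0] = this_load;        /* fasttrack for idx 0 */
	for (i = 1; i < CPU_LOAD_IDX_MAX; i++) {
		unsigned long old = decay_load_missed(cpu_load[i],
						      pending_updates - 1, i);
		/* blend one tick's worth of the new load into the average */
		cpu_load[i] = (old * ((1UL << i) - 1) + this_load) >> i;
	}
}

int main(void)
{
	unsigned long load[CPU_LOAD_IDX_MAX] = { 0, 1024, 1024, 1024, 1024 };

	update_cpu_load(load, 0, 4);    /* idle for 4 ticks, current load is 0 */
	printf("cpu_load: %lu %lu %lu %lu %lu\n",
	       load[0], load[1], load[2], load[3], load[4]);
	return 0;
}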
@@ -3113,6 +3140,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
3113 if (irqs_disabled()) 3140 if (irqs_disabled())
3114 print_irqtrace_events(prev); 3141 print_irqtrace_events(prev);
3115 dump_stack(); 3142 dump_stack();
3143 add_taint(TAINT_WARN);
3116} 3144}
3117 3145
3118/* 3146/*
@@ -5557,7 +5585,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5557 break; 5585 break;
5558 } 5586 }
5559 5587
5560 if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 5588 if (!(sd->flags & SD_OVERLAP) &&
5589 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5561 printk(KERN_CONT "\n"); 5590 printk(KERN_CONT "\n");
5562 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5591 printk(KERN_ERR "ERROR: repeated CPUs\n");
5563 break; 5592 break;
@@ -5895,99 +5924,11 @@ static int __init isolated_cpu_setup(char *str)
5895 5924
5896__setup("isolcpus=", isolated_cpu_setup); 5925__setup("isolcpus=", isolated_cpu_setup);
5897 5926
5898#ifdef CONFIG_NUMA
5899
5900/**
5901 * find_next_best_node - find the next node to include in a sched_domain
5902 * @node: node whose sched_domain we're building
5903 * @used_nodes: nodes already in the sched_domain
5904 *
5905 * Find the next node to include in a given scheduling domain. Simply
5906 * finds the closest node not already in the @used_nodes map.
5907 *
5908 * Should use nodemask_t.
5909 */
5910static int find_next_best_node(int node, nodemask_t *used_nodes)
5911{
5912 int i, n, val, min_val, best_node = -1;
5913
5914 min_val = INT_MAX;
5915
5916 for (i = 0; i < nr_node_ids; i++) {
5917 /* Start at @node */
5918 n = (node + i) % nr_node_ids;
5919
5920 if (!nr_cpus_node(n))
5921 continue;
5922
5923 /* Skip already used nodes */
5924 if (node_isset(n, *used_nodes))
5925 continue;
5926
5927 /* Simple min distance search */
5928 val = node_distance(node, n);
5929
5930 if (val < min_val) {
5931 min_val = val;
5932 best_node = n;
5933 }
5934 }
5935
5936 if (best_node != -1)
5937 node_set(best_node, *used_nodes);
5938 return best_node;
5939}
5940
5941/**
5942 * sched_domain_node_span - get a cpumask for a node's sched_domain
5943 * @node: node whose cpumask we're constructing
5944 * @span: resulting cpumask
5945 *
5946 * Given a node, construct a good cpumask for its sched_domain to span. It
5947 * should be one that prevents unnecessary balancing, but also spreads tasks
5948 * out optimally.
5949 */
5950static void sched_domain_node_span(int node, struct cpumask *span)
5951{
5952 nodemask_t used_nodes;
5953 int i;
5954
5955 cpumask_clear(span);
5956 nodes_clear(used_nodes);
5957
5958 cpumask_or(span, span, cpumask_of_node(node));
5959 node_set(node, used_nodes);
5960
5961 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5962 int next_node = find_next_best_node(node, &used_nodes);
5963 if (next_node < 0)
5964 break;
5965 cpumask_or(span, span, cpumask_of_node(next_node));
5966 }
5967}
5968
5969static const struct cpumask *cpu_node_mask(int cpu)
5970{
5971 lockdep_assert_held(&sched_domains_mutex);
5972
5973 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
5974
5975 return sched_domains_tmpmask;
5976}
5977
5978static const struct cpumask *cpu_allnodes_mask(int cpu)
5979{
5980 return cpu_possible_mask;
5981}
5982#endif /* CONFIG_NUMA */
5983
5984static const struct cpumask *cpu_cpu_mask(int cpu) 5927static const struct cpumask *cpu_cpu_mask(int cpu)
5985{ 5928{
5986 return cpumask_of_node(cpu_to_node(cpu)); 5929 return cpumask_of_node(cpu_to_node(cpu));
5987} 5930}
5988 5931
5989int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5990
5991struct sd_data { 5932struct sd_data {
5992 struct sched_domain **__percpu sd; 5933 struct sched_domain **__percpu sd;
5993 struct sched_group **__percpu sg; 5934 struct sched_group **__percpu sg;
@@ -6017,6 +5958,7 @@ struct sched_domain_topology_level {
6017 sched_domain_init_f init; 5958 sched_domain_init_f init;
6018 sched_domain_mask_f mask; 5959 sched_domain_mask_f mask;
6019 int flags; 5960 int flags;
5961 int numa_level;
6020 struct sd_data data; 5962 struct sd_data data;
6021}; 5963};
6022 5964
@@ -6208,10 +6150,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
6208} 6150}
6209 6151
6210SD_INIT_FUNC(CPU) 6152SD_INIT_FUNC(CPU)
6211#ifdef CONFIG_NUMA
6212 SD_INIT_FUNC(ALLNODES)
6213 SD_INIT_FUNC(NODE)
6214#endif
6215#ifdef CONFIG_SCHED_SMT 6153#ifdef CONFIG_SCHED_SMT
6216 SD_INIT_FUNC(SIBLING) 6154 SD_INIT_FUNC(SIBLING)
6217#endif 6155#endif
@@ -6333,15 +6271,184 @@ static struct sched_domain_topology_level default_topology[] = {
6333 { sd_init_BOOK, cpu_book_mask, }, 6271 { sd_init_BOOK, cpu_book_mask, },
6334#endif 6272#endif
6335 { sd_init_CPU, cpu_cpu_mask, }, 6273 { sd_init_CPU, cpu_cpu_mask, },
6336#ifdef CONFIG_NUMA
6337 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
6338 { sd_init_ALLNODES, cpu_allnodes_mask, },
6339#endif
6340 { NULL, }, 6274 { NULL, },
6341}; 6275};
6342 6276
6343static struct sched_domain_topology_level *sched_domain_topology = default_topology; 6277static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6344 6278
6279#ifdef CONFIG_NUMA
6280
6281static int sched_domains_numa_levels;
6282static int sched_domains_numa_scale;
6283static int *sched_domains_numa_distance;
6284static struct cpumask ***sched_domains_numa_masks;
6285static int sched_domains_curr_level;
6286
6287static inline int sd_local_flags(int level)
6288{
6289 if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
6290 return 0;
6291
6292 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
6293}
6294
6295static struct sched_domain *
6296sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
6297{
6298 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
6299 int level = tl->numa_level;
6300 int sd_weight = cpumask_weight(
6301 sched_domains_numa_masks[level][cpu_to_node(cpu)]);
6302
6303 *sd = (struct sched_domain){
6304 .min_interval = sd_weight,
6305 .max_interval = 2*sd_weight,
6306 .busy_factor = 32,
6307 .imbalance_pct = 125,
6308 .cache_nice_tries = 2,
6309 .busy_idx = 3,
6310 .idle_idx = 2,
6311 .newidle_idx = 0,
6312 .wake_idx = 0,
6313 .forkexec_idx = 0,
6314
6315 .flags = 1*SD_LOAD_BALANCE
6316 | 1*SD_BALANCE_NEWIDLE
6317 | 0*SD_BALANCE_EXEC
6318 | 0*SD_BALANCE_FORK
6319 | 0*SD_BALANCE_WAKE
6320 | 0*SD_WAKE_AFFINE
6321 | 0*SD_PREFER_LOCAL
6322 | 0*SD_SHARE_CPUPOWER
6323 | 0*SD_SHARE_PKG_RESOURCES
6324 | 1*SD_SERIALIZE
6325 | 0*SD_PREFER_SIBLING
6326 | sd_local_flags(level)
6327 ,
6328 .last_balance = jiffies,
6329 .balance_interval = sd_weight,
6330 };
6331 SD_INIT_NAME(sd, NUMA);
6332 sd->private = &tl->data;
6333
6334 /*
6335 * Ugly hack to pass state to sd_numa_mask()...
6336 */
6337 sched_domains_curr_level = tl->numa_level;
6338
6339 return sd;
6340}
6341
6342static const struct cpumask *sd_numa_mask(int cpu)
6343{
6344 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6345}
6346
6347static void sched_init_numa(void)
6348{
6349 int next_distance, curr_distance = node_distance(0, 0);
6350 struct sched_domain_topology_level *tl;
6351 int level = 0;
6352 int i, j, k;
6353
6354 sched_domains_numa_scale = curr_distance;
6355 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6356 if (!sched_domains_numa_distance)
6357 return;
6358
6359 /*
6360 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
6361 * unique distances in the node_distance() table.
6362 *
6363 * Assumes node_distance(0,j) includes all distances in
6364 * node_distance(i,j) in order to avoid cubic time.
6365 *
6366 * XXX: could be optimized to O(n log n) by using sort()
6367 */
6368 next_distance = curr_distance;
6369 for (i = 0; i < nr_node_ids; i++) {
6370 for (j = 0; j < nr_node_ids; j++) {
6371 int distance = node_distance(0, j);
6372 if (distance > curr_distance &&
6373 (distance < next_distance ||
6374 next_distance == curr_distance))
6375 next_distance = distance;
6376 }
6377 if (next_distance != curr_distance) {
6378 sched_domains_numa_distance[level++] = next_distance;
6379 sched_domains_numa_levels = level;
6380 curr_distance = next_distance;
6381 } else break;
6382 }
6383 /*
6384 * 'level' contains the number of unique distances, excluding the
6385 * identity distance node_distance(i,i).
6386 *
6387 * The sched_domains_numa_distance[] array includes the actual distance
6388 * numbers.
6389 */
6390
6391 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6392 if (!sched_domains_numa_masks)
6393 return;
6394
6395 /*
6396 * Now for each level, construct a mask per node which contains all
6397 * cpus of nodes that are that many hops away from us.
6398 */
6399 for (i = 0; i < level; i++) {
6400 sched_domains_numa_masks[i] =
6401 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6402 if (!sched_domains_numa_masks[i])
6403 return;
6404
6405 for (j = 0; j < nr_node_ids; j++) {
6406 struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
6407 if (!mask)
6408 return;
6409
6410 sched_domains_numa_masks[i][j] = mask;
6411
6412 for (k = 0; k < nr_node_ids; k++) {
6413 if (node_distance(j, k) > sched_domains_numa_distance[i])
6414 continue;
6415
6416 cpumask_or(mask, mask, cpumask_of_node(k));
6417 }
6418 }
6419 }
6420
6421 tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
6422 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6423 if (!tl)
6424 return;
6425
6426 /*
6427 * Copy the default topology bits..
6428 */
6429 for (i = 0; default_topology[i].init; i++)
6430 tl[i] = default_topology[i];
6431
6432 /*
6433 * .. and append 'j' levels of NUMA goodness.
6434 */
6435 for (j = 0; j < level; i++, j++) {
6436 tl[i] = (struct sched_domain_topology_level){
6437 .init = sd_numa_init,
6438 .mask = sd_numa_mask,
6439 .flags = SDTL_OVERLAP,
6440 .numa_level = j,
6441 };
6442 }
6443
6444 sched_domain_topology = tl;
6445}
6446#else
6447static inline void sched_init_numa(void)
6448{
6449}
6450#endif /* CONFIG_NUMA */
6451
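sched_init_numa() above does two things: it first extracts the unique off-node distances from the SLIT-style node_distance() table (assuming row 0 contains every distance value), then builds, for each distance level, a per-node mask of all nodes within that distance; those masks later become the spans of the appended NUMA topology levels. A small self-contained sketch of both steps with a toy four-node distance table (the table values are illustrative):

#include <stdio.h>

#define NR_NODES 4

/* Toy node_distance() table: two sockets, two nodes each (10 == local). */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};

int main(void)
{
	int uniq[NR_NODES];
	int levels = 0, curr = dist[0][0], next;
	int i, j, k;

	/* Deduplicating selection "sort": repeatedly pick the smallest
	 * distance strictly greater than the previous one, scanning row 0
	 * only, as sched_init_numa() does. */
	for (;;) {
		next = curr;
		for (j = 0; j < NR_NODES; j++)
			if (dist[0][j] > curr &&
			    (dist[0][j] < next || next == curr))
				next = dist[0][j];
		if (next == curr)
			break;
		uniq[levels++] = next;
		curr = next;
	}

	/* For each level, node j's mask holds every node within that
	 * distance -- the kernel then ORs in those nodes' CPU masks. */
	for (i = 0; i < levels; i++) {
		printf("level %d (distance <= %d):\n", i, uniq[i]);
		for (j = 0; j < NR_NODES; j++) {
			printf("  node %d: {", j);
			for (k = 0; k < NR_NODES; k++)
				if (dist[j][k] <= uniq[i])
					printf(" %d", k);
			printf(" }\n");
		}
	}
	return 0;
}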
6345static int __sdt_alloc(const struct cpumask *cpu_map) 6452static int __sdt_alloc(const struct cpumask *cpu_map)
6346{ 6453{
6347 struct sched_domain_topology_level *tl; 6454 struct sched_domain_topology_level *tl;
@@ -6379,6 +6486,8 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6379 if (!sg) 6486 if (!sg)
6380 return -ENOMEM; 6487 return -ENOMEM;
6381 6488
6489 sg->next = sg;
6490
6382 *per_cpu_ptr(sdd->sg, j) = sg; 6491 *per_cpu_ptr(sdd->sg, j) = sg;
6383 6492
6384 sgp = kzalloc_node(sizeof(struct sched_group_power), 6493 sgp = kzalloc_node(sizeof(struct sched_group_power),
@@ -6402,16 +6511,26 @@ static void __sdt_free(const struct cpumask *cpu_map)
6402 struct sd_data *sdd = &tl->data; 6511 struct sd_data *sdd = &tl->data;
6403 6512
6404 for_each_cpu(j, cpu_map) { 6513 for_each_cpu(j, cpu_map) {
6405 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); 6514 struct sched_domain *sd;
6406 if (sd && (sd->flags & SD_OVERLAP)) 6515
6407 free_sched_groups(sd->groups, 0); 6516 if (sdd->sd) {
6408 kfree(*per_cpu_ptr(sdd->sd, j)); 6517 sd = *per_cpu_ptr(sdd->sd, j);
6409 kfree(*per_cpu_ptr(sdd->sg, j)); 6518 if (sd && (sd->flags & SD_OVERLAP))
6410 kfree(*per_cpu_ptr(sdd->sgp, j)); 6519 free_sched_groups(sd->groups, 0);
6520 kfree(*per_cpu_ptr(sdd->sd, j));
6521 }
6522
6523 if (sdd->sg)
6524 kfree(*per_cpu_ptr(sdd->sg, j));
6525 if (sdd->sgp)
6526 kfree(*per_cpu_ptr(sdd->sgp, j));
6411 } 6527 }
6412 free_percpu(sdd->sd); 6528 free_percpu(sdd->sd);
6529 sdd->sd = NULL;
6413 free_percpu(sdd->sg); 6530 free_percpu(sdd->sg);
6531 sdd->sg = NULL;
6414 free_percpu(sdd->sgp); 6532 free_percpu(sdd->sgp);
6533 sdd->sgp = NULL;
6415 } 6534 }
6416} 6535}
6417 6536
@@ -6697,97 +6816,6 @@ match2:
6697 mutex_unlock(&sched_domains_mutex); 6816 mutex_unlock(&sched_domains_mutex);
6698} 6817}
6699 6818
6700#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6701static void reinit_sched_domains(void)
6702{
6703 get_online_cpus();
6704
6705 /* Destroy domains first to force the rebuild */
6706 partition_sched_domains(0, NULL, NULL);
6707
6708 rebuild_sched_domains();
6709 put_online_cpus();
6710}
6711
6712static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6713{
6714 unsigned int level = 0;
6715
6716 if (sscanf(buf, "%u", &level) != 1)
6717 return -EINVAL;
6718
6719 /*
6720 * level is always be positive so don't check for
6721 * level < POWERSAVINGS_BALANCE_NONE which is 0
6722 * What happens on 0 or 1 byte write,
6723 * need to check for count as well?
6724 */
6725
6726 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
6727 return -EINVAL;
6728
6729 if (smt)
6730 sched_smt_power_savings = level;
6731 else
6732 sched_mc_power_savings = level;
6733
6734 reinit_sched_domains();
6735
6736 return count;
6737}
6738
6739#ifdef CONFIG_SCHED_MC
6740static ssize_t sched_mc_power_savings_show(struct device *dev,
6741 struct device_attribute *attr,
6742 char *buf)
6743{
6744 return sprintf(buf, "%u\n", sched_mc_power_savings);
6745}
6746static ssize_t sched_mc_power_savings_store(struct device *dev,
6747 struct device_attribute *attr,
6748 const char *buf, size_t count)
6749{
6750 return sched_power_savings_store(buf, count, 0);
6751}
6752static DEVICE_ATTR(sched_mc_power_savings, 0644,
6753 sched_mc_power_savings_show,
6754 sched_mc_power_savings_store);
6755#endif
6756
6757#ifdef CONFIG_SCHED_SMT
6758static ssize_t sched_smt_power_savings_show(struct device *dev,
6759 struct device_attribute *attr,
6760 char *buf)
6761{
6762 return sprintf(buf, "%u\n", sched_smt_power_savings);
6763}
6764static ssize_t sched_smt_power_savings_store(struct device *dev,
6765 struct device_attribute *attr,
6766 const char *buf, size_t count)
6767{
6768 return sched_power_savings_store(buf, count, 1);
6769}
6770static DEVICE_ATTR(sched_smt_power_savings, 0644,
6771 sched_smt_power_savings_show,
6772 sched_smt_power_savings_store);
6773#endif
6774
6775int __init sched_create_sysfs_power_savings_entries(struct device *dev)
6776{
6777 int err = 0;
6778
6779#ifdef CONFIG_SCHED_SMT
6780 if (smt_capable())
6781 err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
6782#endif
6783#ifdef CONFIG_SCHED_MC
6784 if (!err && mc_capable())
6785 err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
6786#endif
6787 return err;
6788}
6789#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
6790
6791/* 6819/*
6792 * Update cpusets according to cpu_active mask. If cpusets are 6820 * Update cpusets according to cpu_active mask. If cpusets are
6793 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 6821 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
@@ -6825,6 +6853,8 @@ void __init sched_init_smp(void)
6825 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 6853 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6826 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 6854 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6827 6855
6856 sched_init_numa();
6857
6828 get_online_cpus(); 6858 get_online_cpus();
6829 mutex_lock(&sched_domains_mutex); 6859 mutex_lock(&sched_domains_mutex);
6830 init_sched_domains(cpu_active_mask); 6860 init_sched_domains(cpu_active_mask);
@@ -7046,6 +7076,7 @@ void __init sched_init(void)
7046 /* May be allocated at isolcpus cmdline parse time */ 7076 /* May be allocated at isolcpus cmdline parse time */
7047 if (cpu_isolated_map == NULL) 7077 if (cpu_isolated_map == NULL)
7048 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7078 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
7079 idle_thread_set_boot_cpu();
7049#endif 7080#endif
7050 init_sched_fair_class(); 7081 init_sched_fair_class();
7051 7082
@@ -7967,13 +7998,9 @@ static struct cftype cpu_files[] = {
7967 .write_u64 = cpu_rt_period_write_uint, 7998 .write_u64 = cpu_rt_period_write_uint,
7968 }, 7999 },
7969#endif 8000#endif
8001 { } /* terminate */
7970}; 8002};
7971 8003
7972static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
7973{
7974 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
7975}
7976
7977struct cgroup_subsys cpu_cgroup_subsys = { 8004struct cgroup_subsys cpu_cgroup_subsys = {
7978 .name = "cpu", 8005 .name = "cpu",
7979 .create = cpu_cgroup_create, 8006 .create = cpu_cgroup_create,
@@ -7981,8 +8008,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7981 .can_attach = cpu_cgroup_can_attach, 8008 .can_attach = cpu_cgroup_can_attach,
7982 .attach = cpu_cgroup_attach, 8009 .attach = cpu_cgroup_attach,
7983 .exit = cpu_cgroup_exit, 8010 .exit = cpu_cgroup_exit,
7984 .populate = cpu_cgroup_populate,
7985 .subsys_id = cpu_cgroup_subsys_id, 8011 .subsys_id = cpu_cgroup_subsys_id,
8012 .base_cftypes = cpu_files,
7986 .early_init = 1, 8013 .early_init = 1,
7987}; 8014};
7988 8015
@@ -8167,13 +8194,9 @@ static struct cftype files[] = {
8167 .name = "stat", 8194 .name = "stat",
8168 .read_map = cpuacct_stats_show, 8195 .read_map = cpuacct_stats_show,
8169 }, 8196 },
8197 { } /* terminate */
8170}; 8198};
8171 8199
8172static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
8173{
8174 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
8175}
8176
8177/* 8200/*
8178 * charge this task's execution time to its accounting group. 8201 * charge this task's execution time to its accounting group.
8179 * 8202 *
@@ -8205,7 +8228,7 @@ struct cgroup_subsys cpuacct_subsys = {
8205 .name = "cpuacct", 8228 .name = "cpuacct",
8206 .create = cpuacct_create, 8229 .create = cpuacct_create,
8207 .destroy = cpuacct_destroy, 8230 .destroy = cpuacct_destroy,
8208 .populate = cpuacct_populate,
8209 .subsys_id = cpuacct_subsys_id, 8231 .subsys_id = cpuacct_subsys_id,
8232 .base_cftypes = files,
8210}; 8233};
8211#endif /* CONFIG_CGROUP_CPUACCT */ 8234#endif /* CONFIG_CGROUP_CPUACCT */
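The cpu and cpuacct hunks above drop the .populate() callbacks: instead of registering files with cgroup_add_files(..., ARRAY_SIZE(...)), each subsystem now exposes its cftype table through .base_cftypes and ends the table with an empty { } entry, so the cgroup core can walk the array up to the sentinel instead of being told its length. A minimal userspace sketch of that sentinel-terminated-array pattern (the struct and function names are invented stand-ins, not the cgroup API):

#include <stdio.h>

struct cftype {                 /* simplified stand-in for the kernel struct */
        const char *name;
        int mode;
};

/* Walk entries until the all-zero terminator, like the new base_cftypes path. */
static void register_files(const struct cftype *cft)
{
        for (; cft->name; cft++)
                printf("registering %s (mode %o)\n", cft->name, cft->mode);
}

static const struct cftype cpu_files[] = {
        { .name = "shares",        .mode = 0644 },
        { .name = "rt_runtime_us", .mode = 0644 },
        { }     /* terminate, like the { } sentinel in the hunk above */
};

int main(void)
{
        register_files(cpu_files);
        return 0;
}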
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 09acaa15161d..6f79596e0ea9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -202,7 +202,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
202 SPLIT_NS(spread0)); 202 SPLIT_NS(spread0));
203 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 203 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
204 cfs_rq->nr_spread_over); 204 cfs_rq->nr_spread_over);
205 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); 205 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207#ifdef CONFIG_FAIR_GROUP_SCHED 207#ifdef CONFIG_FAIR_GROUP_SCHED
208#ifdef CONFIG_SMP 208#ifdef CONFIG_SMP
@@ -260,8 +260,14 @@ static void print_cpu(struct seq_file *m, int cpu)
260 SEQ_printf(m, "\ncpu#%d\n", cpu); 260 SEQ_printf(m, "\ncpu#%d\n", cpu);
261#endif 261#endif
262 262
263#define P(x) \ 263#define P(x) \
264 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) 264do { \
265 if (sizeof(rq->x) == 4) \
266 SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
267 else \
268 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
269} while (0)
270
265#define PN(x) \ 271#define PN(x) \
266 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) 272 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
267 273
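The reworked P(x) macro above picks the printf conversion from sizeof(rq->x), so the same debug output stays correct after rq->nr_running is narrowed from unsigned long to unsigned int later in this series (see the sched.h hunk below). A standalone sketch of the same sizeof dispatch, using plain printf in place of SEQ_printf and a made-up struct:

#include <stdio.h>
#include <stdint.h>

struct rq_sample {              /* invented stand-in for a few struct rq fields */
        uint32_t nr_running;
        uint64_t clock;
};

#define P(rq, x)                                                        \
do {                                                                    \
        if (sizeof((rq)->x) == 4)                                       \
                printf("  .%-30s: %ld\n", #x, (long)(rq)->x);           \
        else                                                            \
                printf("  .%-30s: %lld\n", #x, (long long)(rq)->x);     \
} while (0)

int main(void)
{
        struct rq_sample rq = { .nr_running = 3, .clock = 123456789ULL };

        P(&rq, nr_running);     /* takes the 32-bit branch */
        P(&rq, clock);          /* takes the 64-bit branch */
        return 0;
}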
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0d97ebdc58f0..940e6d17cf96 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -784,7 +784,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
784 update_load_add(&rq_of(cfs_rq)->load, se->load.weight); 784 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
785#ifdef CONFIG_SMP 785#ifdef CONFIG_SMP
786 if (entity_is_task(se)) 786 if (entity_is_task(se))
787 list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); 787 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
788#endif 788#endif
789 cfs_rq->nr_running++; 789 cfs_rq->nr_running++;
790} 790}
@@ -2721,7 +2721,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2721 * If power savings logic is enabled for a domain, see if we 2721 * If power savings logic is enabled for a domain, see if we
2722 * are not overloaded, if so, don't balance wider. 2722 * are not overloaded, if so, don't balance wider.
2723 */ 2723 */
2724 if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { 2724 if (tmp->flags & (SD_PREFER_LOCAL)) {
2725 unsigned long power = 0; 2725 unsigned long power = 0;
2726 unsigned long nr_running = 0; 2726 unsigned long nr_running = 0;
2727 unsigned long capacity; 2727 unsigned long capacity;
@@ -2734,9 +2734,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2734 2734
2735 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); 2735 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
2736 2736
2737 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2738 nr_running /= 2;
2739
2740 if (nr_running < capacity) 2737 if (nr_running < capacity)
2741 want_sd = 0; 2738 want_sd = 0;
2742 } 2739 }
@@ -3082,7 +3079,7 @@ struct lb_env {
3082 struct rq *dst_rq; 3079 struct rq *dst_rq;
3083 3080
3084 enum cpu_idle_type idle; 3081 enum cpu_idle_type idle;
3085 long load_move; 3082 long imbalance;
3086 unsigned int flags; 3083 unsigned int flags;
3087 3084
3088 unsigned int loop; 3085 unsigned int loop;
@@ -3215,8 +3212,10 @@ static int move_one_task(struct lb_env *env)
3215 3212
3216static unsigned long task_h_load(struct task_struct *p); 3213static unsigned long task_h_load(struct task_struct *p);
3217 3214
3215static const unsigned int sched_nr_migrate_break = 32;
3216
3218/* 3217/*
3219 * move_tasks tries to move up to load_move weighted load from busiest to 3218 * move_tasks tries to move up to imbalance weighted load from busiest to
3220 * this_rq, as part of a balancing operation within domain "sd". 3219 * this_rq, as part of a balancing operation within domain "sd".
3221 * Returns 1 if successful and 0 otherwise. 3220 * Returns 1 if successful and 0 otherwise.
3222 * 3221 *
@@ -3229,7 +3228,7 @@ static int move_tasks(struct lb_env *env)
3229 unsigned long load; 3228 unsigned long load;
3230 int pulled = 0; 3229 int pulled = 0;
3231 3230
3232 if (env->load_move <= 0) 3231 if (env->imbalance <= 0)
3233 return 0; 3232 return 0;
3234 3233
3235 while (!list_empty(tasks)) { 3234 while (!list_empty(tasks)) {
@@ -3242,7 +3241,7 @@ static int move_tasks(struct lb_env *env)
3242 3241
3243 /* take a breather every nr_migrate tasks */ 3242 /* take a breather every nr_migrate tasks */
3244 if (env->loop > env->loop_break) { 3243 if (env->loop > env->loop_break) {
3245 env->loop_break += sysctl_sched_nr_migrate; 3244 env->loop_break += sched_nr_migrate_break;
3246 env->flags |= LBF_NEED_BREAK; 3245 env->flags |= LBF_NEED_BREAK;
3247 break; 3246 break;
3248 } 3247 }
@@ -3252,10 +3251,10 @@ static int move_tasks(struct lb_env *env)
3252 3251
3253 load = task_h_load(p); 3252 load = task_h_load(p);
3254 3253
3255 if (load < 16 && !env->sd->nr_balance_failed) 3254 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
3256 goto next; 3255 goto next;
3257 3256
3258 if ((load / 2) > env->load_move) 3257 if ((load / 2) > env->imbalance)
3259 goto next; 3258 goto next;
3260 3259
3261 if (!can_migrate_task(p, env)) 3260 if (!can_migrate_task(p, env))
@@ -3263,7 +3262,7 @@ static int move_tasks(struct lb_env *env)
3263 3262
3264 move_task(p, env); 3263 move_task(p, env);
3265 pulled++; 3264 pulled++;
3266 env->load_move -= load; 3265 env->imbalance -= load;
3267 3266
3268#ifdef CONFIG_PREEMPT 3267#ifdef CONFIG_PREEMPT
3269 /* 3268 /*
@@ -3279,7 +3278,7 @@ static int move_tasks(struct lb_env *env)
3279 * We only want to steal up to the prescribed amount of 3278 * We only want to steal up to the prescribed amount of
3280 * weighted load. 3279 * weighted load.
3281 */ 3280 */
3282 if (env->load_move <= 0) 3281 if (env->imbalance <= 0)
3283 break; 3282 break;
3284 3283
3285 continue; 3284 continue;
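In the move_tasks() hunks above, the amount of weighted load still to pull now lives in env->imbalance, and the periodic "breather" is taken after a fixed sched_nr_migrate_break (32) iterations rather than the sysctl value. A rough userspace model of that control flow, with invented task and flag types rather than the kernel's:

#include <stdio.h>

#define NR_MIGRATE_BREAK 32     /* mirrors sched_nr_migrate_break */

struct task { long load; };

/*
 * Pull tasks until the requested imbalance is covered, but set need_break
 * every NR_MIGRATE_BREAK iterations so the caller can drop locks and retry,
 * as move_tasks() does with LBF_NEED_BREAK.
 */
static int move_tasks(struct task *tasks, int nr, long *imbalance, int *need_break)
{
        int pulled = 0, loop = 0;

        for (int i = 0; i < nr && *imbalance > 0; i++) {
                if (++loop > NR_MIGRATE_BREAK) {
                        *need_break = 1;
                        break;
                }
                if (tasks[i].load / 2 > *imbalance)     /* too heavy to help */
                        continue;
                *imbalance -= tasks[i].load;
                pulled++;
        }
        return pulled;
}

int main(void)
{
        struct task tasks[] = { {100}, {300}, {40}, {40} };
        long imbalance = 200;
        int need_break = 0;

        int moved = move_tasks(tasks, 4, &imbalance, &need_break);
        printf("moved %d tasks, %ld imbalance left\n", moved, imbalance);
        return 0;
}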
@@ -3433,14 +3432,6 @@ struct sd_lb_stats {
3433 unsigned int busiest_group_weight; 3432 unsigned int busiest_group_weight;
3434 3433
3435 int group_imb; /* Is there imbalance in this sd */ 3434 int group_imb; /* Is there imbalance in this sd */
3436#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3437 int power_savings_balance; /* Is powersave balance needed for this sd */
3438 struct sched_group *group_min; /* Least loaded group in sd */
3439 struct sched_group *group_leader; /* Group which relieves group_min */
3440 unsigned long min_load_per_task; /* load_per_task in group_min */
3441 unsigned long leader_nr_running; /* Nr running of group_leader */
3442 unsigned long min_nr_running; /* Nr running of group_min */
3443#endif
3444}; 3435};
3445 3436
3446/* 3437/*
@@ -3484,148 +3475,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
3484 return load_idx; 3475 return load_idx;
3485} 3476}
3486 3477
3487
3488#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3489/**
3490 * init_sd_power_savings_stats - Initialize power savings statistics for
3491 * the given sched_domain, during load balancing.
3492 *
3493 * @sd: Sched domain whose power-savings statistics are to be initialized.
3494 * @sds: Variable containing the statistics for sd.
3495 * @idle: Idle status of the CPU at which we're performing load-balancing.
3496 */
3497static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3498 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3499{
3500 /*
3501 * Busy processors will not participate in power savings
3502 * balance.
3503 */
3504 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3505 sds->power_savings_balance = 0;
3506 else {
3507 sds->power_savings_balance = 1;
3508 sds->min_nr_running = ULONG_MAX;
3509 sds->leader_nr_running = 0;
3510 }
3511}
3512
3513/**
3514 * update_sd_power_savings_stats - Update the power saving stats for a
3515 * sched_domain while performing load balancing.
3516 *
3517 * @group: sched_group belonging to the sched_domain under consideration.
3518 * @sds: Variable containing the statistics of the sched_domain
3519 * @local_group: Does group contain the CPU for which we're performing
3520 * load balancing ?
3521 * @sgs: Variable containing the statistics of the group.
3522 */
3523static inline void update_sd_power_savings_stats(struct sched_group *group,
3524 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3525{
3526
3527 if (!sds->power_savings_balance)
3528 return;
3529
3530 /*
3531 * If the local group is idle or completely loaded
3532 * no need to do power savings balance at this domain
3533 */
3534 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3535 !sds->this_nr_running))
3536 sds->power_savings_balance = 0;
3537
3538 /*
3539 * If a group is already running at full capacity or idle,
3540 * don't include that group in power savings calculations
3541 */
3542 if (!sds->power_savings_balance ||
3543 sgs->sum_nr_running >= sgs->group_capacity ||
3544 !sgs->sum_nr_running)
3545 return;
3546
3547 /*
3548 * Calculate the group which has the least non-idle load.
3549 * This is the group from where we need to pick up the load
3550 * for saving power
3551 */
3552 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3553 (sgs->sum_nr_running == sds->min_nr_running &&
3554 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3555 sds->group_min = group;
3556 sds->min_nr_running = sgs->sum_nr_running;
3557 sds->min_load_per_task = sgs->sum_weighted_load /
3558 sgs->sum_nr_running;
3559 }
3560
3561 /*
3562 * Calculate the group which is almost near its
3563 * capacity but still has some space to pick up some load
3564 * from other group and save more power
3565 */
3566 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3567 return;
3568
3569 if (sgs->sum_nr_running > sds->leader_nr_running ||
3570 (sgs->sum_nr_running == sds->leader_nr_running &&
3571 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3572 sds->group_leader = group;
3573 sds->leader_nr_running = sgs->sum_nr_running;
3574 }
3575}
3576
3577/**
3578 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3579 * @sds: Variable containing the statistics of the sched_domain
3580 * under consideration.
3581 * @this_cpu: Cpu at which we're currently performing load-balancing.
3582 * @imbalance: Variable to store the imbalance.
3583 *
3584 * Description:
3585 * Check if we have potential to perform some power-savings balance.
3586 * If yes, set the busiest group to be the least loaded group in the
3587 * sched_domain, so that it's CPUs can be put to idle.
3588 *
3589 * Returns 1 if there is potential to perform power-savings balance.
3590 * Else returns 0.
3591 */
3592static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3593 int this_cpu, unsigned long *imbalance)
3594{
3595 if (!sds->power_savings_balance)
3596 return 0;
3597
3598 if (sds->this != sds->group_leader ||
3599 sds->group_leader == sds->group_min)
3600 return 0;
3601
3602 *imbalance = sds->min_load_per_task;
3603 sds->busiest = sds->group_min;
3604
3605 return 1;
3606
3607}
3608#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3609static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3610 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3611{
3612 return;
3613}
3614
3615static inline void update_sd_power_savings_stats(struct sched_group *group,
3616 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3617{
3618 return;
3619}
3620
3621static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3622 int this_cpu, unsigned long *imbalance)
3623{
3624 return 0;
3625}
3626#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3627
3628
3629unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 3478unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3630{ 3479{
3631 return SCHED_POWER_SCALE; 3480 return SCHED_POWER_SCALE;
@@ -3763,24 +3612,22 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
3763 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3612 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3764 * @sd: The sched_domain whose statistics are to be updated. 3613 * @sd: The sched_domain whose statistics are to be updated.
3765 * @group: sched_group whose statistics are to be updated. 3614 * @group: sched_group whose statistics are to be updated.
3766 * @this_cpu: Cpu for which load balance is currently performed.
3767 * @idle: Idle status of this_cpu
3768 * @load_idx: Load index of sched_domain of this_cpu for load calc. 3615 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3769 * @local_group: Does group contain this_cpu. 3616 * @local_group: Does group contain this_cpu.
3770 * @cpus: Set of cpus considered for load balancing. 3617 * @cpus: Set of cpus considered for load balancing.
3771 * @balance: Should we balance. 3618 * @balance: Should we balance.
3772 * @sgs: variable to hold the statistics for this group. 3619 * @sgs: variable to hold the statistics for this group.
3773 */ 3620 */
3774static inline void update_sg_lb_stats(struct sched_domain *sd, 3621static inline void update_sg_lb_stats(struct lb_env *env,
3775 struct sched_group *group, int this_cpu, 3622 struct sched_group *group, int load_idx,
3776 enum cpu_idle_type idle, int load_idx,
3777 int local_group, const struct cpumask *cpus, 3623 int local_group, const struct cpumask *cpus,
3778 int *balance, struct sg_lb_stats *sgs) 3624 int *balance, struct sg_lb_stats *sgs)
3779{ 3625{
3780 unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; 3626 unsigned long nr_running, max_nr_running, min_nr_running;
3781 int i; 3627 unsigned long load, max_cpu_load, min_cpu_load;
3782 unsigned int balance_cpu = -1, first_idle_cpu = 0; 3628 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3783 unsigned long avg_load_per_task = 0; 3629 unsigned long avg_load_per_task = 0;
3630 int i;
3784 3631
3785 if (local_group) 3632 if (local_group)
3786 balance_cpu = group_first_cpu(group); 3633 balance_cpu = group_first_cpu(group);
@@ -3789,10 +3636,13 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3789 max_cpu_load = 0; 3636 max_cpu_load = 0;
3790 min_cpu_load = ~0UL; 3637 min_cpu_load = ~0UL;
3791 max_nr_running = 0; 3638 max_nr_running = 0;
3639 min_nr_running = ~0UL;
3792 3640
3793 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 3641 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3794 struct rq *rq = cpu_rq(i); 3642 struct rq *rq = cpu_rq(i);
3795 3643
3644 nr_running = rq->nr_running;
3645
3796 /* Bias balancing toward cpus of our domain */ 3646 /* Bias balancing toward cpus of our domain */
3797 if (local_group) { 3647 if (local_group) {
3798 if (idle_cpu(i) && !first_idle_cpu) { 3648 if (idle_cpu(i) && !first_idle_cpu) {
@@ -3803,16 +3653,19 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3803 load = target_load(i, load_idx); 3653 load = target_load(i, load_idx);
3804 } else { 3654 } else {
3805 load = source_load(i, load_idx); 3655 load = source_load(i, load_idx);
3806 if (load > max_cpu_load) { 3656 if (load > max_cpu_load)
3807 max_cpu_load = load; 3657 max_cpu_load = load;
3808 max_nr_running = rq->nr_running;
3809 }
3810 if (min_cpu_load > load) 3658 if (min_cpu_load > load)
3811 min_cpu_load = load; 3659 min_cpu_load = load;
3660
3661 if (nr_running > max_nr_running)
3662 max_nr_running = nr_running;
3663 if (min_nr_running > nr_running)
3664 min_nr_running = nr_running;
3812 } 3665 }
3813 3666
3814 sgs->group_load += load; 3667 sgs->group_load += load;
3815 sgs->sum_nr_running += rq->nr_running; 3668 sgs->sum_nr_running += nr_running;
3816 sgs->sum_weighted_load += weighted_cpuload(i); 3669 sgs->sum_weighted_load += weighted_cpuload(i);
3817 if (idle_cpu(i)) 3670 if (idle_cpu(i))
3818 sgs->idle_cpus++; 3671 sgs->idle_cpus++;
@@ -3825,14 +3678,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3825 * to do the newly idle load balance. 3678 * to do the newly idle load balance.
3826 */ 3679 */
3827 if (local_group) { 3680 if (local_group) {
3828 if (idle != CPU_NEWLY_IDLE) { 3681 if (env->idle != CPU_NEWLY_IDLE) {
3829 if (balance_cpu != this_cpu) { 3682 if (balance_cpu != env->dst_cpu) {
3830 *balance = 0; 3683 *balance = 0;
3831 return; 3684 return;
3832 } 3685 }
3833 update_group_power(sd, this_cpu); 3686 update_group_power(env->sd, env->dst_cpu);
3834 } else if (time_after_eq(jiffies, group->sgp->next_update)) 3687 } else if (time_after_eq(jiffies, group->sgp->next_update))
3835 update_group_power(sd, this_cpu); 3688 update_group_power(env->sd, env->dst_cpu);
3836 } 3689 }
3837 3690
3838 /* Adjust by relative CPU power of the group */ 3691 /* Adjust by relative CPU power of the group */
@@ -3850,13 +3703,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3850 if (sgs->sum_nr_running) 3703 if (sgs->sum_nr_running)
3851 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 3704 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
3852 3705
3853 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) 3706 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
3707 (max_nr_running - min_nr_running) > 1)
3854 sgs->group_imb = 1; 3708 sgs->group_imb = 1;
3855 3709
3856 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, 3710 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
3857 SCHED_POWER_SCALE); 3711 SCHED_POWER_SCALE);
3858 if (!sgs->group_capacity) 3712 if (!sgs->group_capacity)
3859 sgs->group_capacity = fix_small_capacity(sd, group); 3713 sgs->group_capacity = fix_small_capacity(env->sd, group);
3860 sgs->group_weight = group->group_weight; 3714 sgs->group_weight = group->group_weight;
3861 3715
3862 if (sgs->group_capacity > sgs->sum_nr_running) 3716 if (sgs->group_capacity > sgs->sum_nr_running)
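The update_sg_lb_stats() hunk above now tracks both the minimum and maximum nr_running seen across the group's CPUs, and only sets group_imb when the load spread is at least one average task and the task-count spread is greater than one. A self-contained illustration of that test with sample per-CPU numbers (plain C, not the kernel statistics structures, and using raw load where the kernel uses weighted load):

#include <stdio.h>

/* Returns 1 if the sampled CPUs look imbalanced, following the test above. */
static int group_imbalanced(const unsigned long *load,
                            const unsigned int *nr_running, int cpus)
{
        unsigned long max_load = 0, min_load = ~0UL, sum_load = 0;
        unsigned int max_nr = 0, min_nr = ~0U, sum_nr = 0;
        unsigned long avg_load_per_task;

        for (int i = 0; i < cpus; i++) {
                if (load[i] > max_load) max_load = load[i];
                if (load[i] < min_load) min_load = load[i];
                if (nr_running[i] > max_nr) max_nr = nr_running[i];
                if (nr_running[i] < min_nr) min_nr = nr_running[i];
                sum_load += load[i];
                sum_nr += nr_running[i];
        }
        if (!sum_nr)
                return 0;
        avg_load_per_task = sum_load / sum_nr;

        return (max_load - min_load) >= avg_load_per_task &&
               (max_nr - min_nr) > 1;
}

int main(void)
{
        unsigned long load[] = { 3072, 1024, 1024, 1024 };
        unsigned int  nr[]   = { 4, 1, 1, 1 };

        printf("group_imb = %d\n", group_imbalanced(load, nr, 4));
        return 0;
}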
@@ -3874,11 +3728,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3874 * Determine if @sg is a busier group than the previously selected 3728 * Determine if @sg is a busier group than the previously selected
3875 * busiest group. 3729 * busiest group.
3876 */ 3730 */
3877static bool update_sd_pick_busiest(struct sched_domain *sd, 3731static bool update_sd_pick_busiest(struct lb_env *env,
3878 struct sd_lb_stats *sds, 3732 struct sd_lb_stats *sds,
3879 struct sched_group *sg, 3733 struct sched_group *sg,
3880 struct sg_lb_stats *sgs, 3734 struct sg_lb_stats *sgs)
3881 int this_cpu)
3882{ 3735{
3883 if (sgs->avg_load <= sds->max_load) 3736 if (sgs->avg_load <= sds->max_load)
3884 return false; 3737 return false;
@@ -3894,8 +3747,8 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
3894 * numbered CPUs in the group, therefore mark all groups 3747 * numbered CPUs in the group, therefore mark all groups
3895 * higher than ourself as busy. 3748 * higher than ourself as busy.
3896 */ 3749 */
3897 if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && 3750 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
3898 this_cpu < group_first_cpu(sg)) { 3751 env->dst_cpu < group_first_cpu(sg)) {
3899 if (!sds->busiest) 3752 if (!sds->busiest)
3900 return true; 3753 return true;
3901 3754
@@ -3915,28 +3768,27 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
3915 * @balance: Should we balance. 3768 * @balance: Should we balance.
3916 * @sds: variable to hold the statistics for this sched_domain. 3769 * @sds: variable to hold the statistics for this sched_domain.
3917 */ 3770 */
3918static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, 3771static inline void update_sd_lb_stats(struct lb_env *env,
3919 enum cpu_idle_type idle, const struct cpumask *cpus, 3772 const struct cpumask *cpus,
3920 int *balance, struct sd_lb_stats *sds) 3773 int *balance, struct sd_lb_stats *sds)
3921{ 3774{
3922 struct sched_domain *child = sd->child; 3775 struct sched_domain *child = env->sd->child;
3923 struct sched_group *sg = sd->groups; 3776 struct sched_group *sg = env->sd->groups;
3924 struct sg_lb_stats sgs; 3777 struct sg_lb_stats sgs;
3925 int load_idx, prefer_sibling = 0; 3778 int load_idx, prefer_sibling = 0;
3926 3779
3927 if (child && child->flags & SD_PREFER_SIBLING) 3780 if (child && child->flags & SD_PREFER_SIBLING)
3928 prefer_sibling = 1; 3781 prefer_sibling = 1;
3929 3782
3930 init_sd_power_savings_stats(sd, sds, idle); 3783 load_idx = get_sd_load_idx(env->sd, env->idle);
3931 load_idx = get_sd_load_idx(sd, idle);
3932 3784
3933 do { 3785 do {
3934 int local_group; 3786 int local_group;
3935 3787
3936 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); 3788 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
3937 memset(&sgs, 0, sizeof(sgs)); 3789 memset(&sgs, 0, sizeof(sgs));
3938 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, 3790 update_sg_lb_stats(env, sg, load_idx, local_group,
3939 local_group, cpus, balance, &sgs); 3791 cpus, balance, &sgs);
3940 3792
3941 if (local_group && !(*balance)) 3793 if (local_group && !(*balance))
3942 return; 3794 return;
@@ -3964,7 +3816,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3964 sds->this_load_per_task = sgs.sum_weighted_load; 3816 sds->this_load_per_task = sgs.sum_weighted_load;
3965 sds->this_has_capacity = sgs.group_has_capacity; 3817 sds->this_has_capacity = sgs.group_has_capacity;
3966 sds->this_idle_cpus = sgs.idle_cpus; 3818 sds->this_idle_cpus = sgs.idle_cpus;
3967 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 3819 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
3968 sds->max_load = sgs.avg_load; 3820 sds->max_load = sgs.avg_load;
3969 sds->busiest = sg; 3821 sds->busiest = sg;
3970 sds->busiest_nr_running = sgs.sum_nr_running; 3822 sds->busiest_nr_running = sgs.sum_nr_running;
@@ -3976,9 +3828,8 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3976 sds->group_imb = sgs.group_imb; 3828 sds->group_imb = sgs.group_imb;
3977 } 3829 }
3978 3830
3979 update_sd_power_savings_stats(sg, sds, local_group, &sgs);
3980 sg = sg->next; 3831 sg = sg->next;
3981 } while (sg != sd->groups); 3832 } while (sg != env->sd->groups);
3982} 3833}
3983 3834
3984/** 3835/**
@@ -4006,24 +3857,23 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
4006 * @this_cpu: The cpu at whose sched_domain we're performing load-balance. 3857 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
4007 * @imbalance: returns amount of imbalanced due to packing. 3858 * @imbalance: returns amount of imbalanced due to packing.
4008 */ 3859 */
4009static int check_asym_packing(struct sched_domain *sd, 3860static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
4010 struct sd_lb_stats *sds,
4011 int this_cpu, unsigned long *imbalance)
4012{ 3861{
4013 int busiest_cpu; 3862 int busiest_cpu;
4014 3863
4015 if (!(sd->flags & SD_ASYM_PACKING)) 3864 if (!(env->sd->flags & SD_ASYM_PACKING))
4016 return 0; 3865 return 0;
4017 3866
4018 if (!sds->busiest) 3867 if (!sds->busiest)
4019 return 0; 3868 return 0;
4020 3869
4021 busiest_cpu = group_first_cpu(sds->busiest); 3870 busiest_cpu = group_first_cpu(sds->busiest);
4022 if (this_cpu > busiest_cpu) 3871 if (env->dst_cpu > busiest_cpu)
4023 return 0; 3872 return 0;
4024 3873
4025 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, 3874 env->imbalance = DIV_ROUND_CLOSEST(
4026 SCHED_POWER_SCALE); 3875 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
3876
4027 return 1; 3877 return 1;
4028} 3878}
4029 3879
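check_asym_packing() above now writes its result straight into env->imbalance, scaling the busiest group's load by that group's cpu power and rounding to the nearest SCHED_POWER_SCALE unit. A quick sketch of that fixed-point step; DIV_ROUND_CLOSEST is shown here in its simple non-negative form, and 1024 is assumed as the nominal power unit:

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL        /* assumed nominal cpu power unit */

/* Round-to-nearest division for non-negative values. */
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

int main(void)
{
        unsigned long max_load = 1536;  /* busiest group's load, power-scaled */
        unsigned long power    = 2048;  /* two full CPUs worth of power */

        /* imbalance = load scaled back from power units to task-load units */
        unsigned long imbalance =
                DIV_ROUND_CLOSEST(max_load * power, SCHED_POWER_SCALE);

        printf("imbalance = %lu\n", imbalance);         /* prints 3072 */
        return 0;
}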
@@ -4035,8 +3885,8 @@ static int check_asym_packing(struct sched_domain *sd,
4035 * @this_cpu: The cpu at whose sched_domain we're performing load-balance. 3885 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
4036 * @imbalance: Variable to store the imbalance. 3886 * @imbalance: Variable to store the imbalance.
4037 */ 3887 */
4038static inline void fix_small_imbalance(struct sd_lb_stats *sds, 3888static inline
4039 int this_cpu, unsigned long *imbalance) 3889void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4040{ 3890{
4041 unsigned long tmp, pwr_now = 0, pwr_move = 0; 3891 unsigned long tmp, pwr_now = 0, pwr_move = 0;
4042 unsigned int imbn = 2; 3892 unsigned int imbn = 2;
@@ -4047,9 +3897,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4047 if (sds->busiest_load_per_task > 3897 if (sds->busiest_load_per_task >
4048 sds->this_load_per_task) 3898 sds->this_load_per_task)
4049 imbn = 1; 3899 imbn = 1;
4050 } else 3900 } else {
4051 sds->this_load_per_task = 3901 sds->this_load_per_task =
4052 cpu_avg_load_per_task(this_cpu); 3902 cpu_avg_load_per_task(env->dst_cpu);
3903 }
4053 3904
4054 scaled_busy_load_per_task = sds->busiest_load_per_task 3905 scaled_busy_load_per_task = sds->busiest_load_per_task
4055 * SCHED_POWER_SCALE; 3906 * SCHED_POWER_SCALE;
@@ -4057,7 +3908,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4057 3908
4058 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= 3909 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
4059 (scaled_busy_load_per_task * imbn)) { 3910 (scaled_busy_load_per_task * imbn)) {
4060 *imbalance = sds->busiest_load_per_task; 3911 env->imbalance = sds->busiest_load_per_task;
4061 return; 3912 return;
4062 } 3913 }
4063 3914
@@ -4094,18 +3945,16 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4094 3945
4095 /* Move if we gain throughput */ 3946 /* Move if we gain throughput */
4096 if (pwr_move > pwr_now) 3947 if (pwr_move > pwr_now)
4097 *imbalance = sds->busiest_load_per_task; 3948 env->imbalance = sds->busiest_load_per_task;
4098} 3949}
4099 3950
4100/** 3951/**
4101 * calculate_imbalance - Calculate the amount of imbalance present within the 3952 * calculate_imbalance - Calculate the amount of imbalance present within the
4102 * groups of a given sched_domain during load balance. 3953 * groups of a given sched_domain during load balance.
3954 * @env: load balance environment
4103 * @sds: statistics of the sched_domain whose imbalance is to be calculated. 3955 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
4104 * @this_cpu: Cpu for which currently load balance is being performed.
4105 * @imbalance: The variable to store the imbalance.
4106 */ 3956 */
4107static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, 3957static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4108 unsigned long *imbalance)
4109{ 3958{
4110 unsigned long max_pull, load_above_capacity = ~0UL; 3959 unsigned long max_pull, load_above_capacity = ~0UL;
4111 3960
@@ -4121,8 +3970,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4121 * its cpu_power, while calculating max_load..) 3970 * its cpu_power, while calculating max_load..)
4122 */ 3971 */
4123 if (sds->max_load < sds->avg_load) { 3972 if (sds->max_load < sds->avg_load) {
4124 *imbalance = 0; 3973 env->imbalance = 0;
4125 return fix_small_imbalance(sds, this_cpu, imbalance); 3974 return fix_small_imbalance(env, sds);
4126 } 3975 }
4127 3976
4128 if (!sds->group_imb) { 3977 if (!sds->group_imb) {
@@ -4150,7 +3999,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4150 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); 3999 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
4151 4000
4152 /* How much load to actually move to equalise the imbalance */ 4001 /* How much load to actually move to equalise the imbalance */
4153 *imbalance = min(max_pull * sds->busiest->sgp->power, 4002 env->imbalance = min(max_pull * sds->busiest->sgp->power,
4154 (sds->avg_load - sds->this_load) * sds->this->sgp->power) 4003 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
4155 / SCHED_POWER_SCALE; 4004 / SCHED_POWER_SCALE;
4156 4005
@@ -4160,8 +4009,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4160 * a think about bumping its value to force at least one task to be 4009 * a think about bumping its value to force at least one task to be
4161 * moved 4010 * moved
4162 */ 4011 */
4163 if (*imbalance < sds->busiest_load_per_task) 4012 if (env->imbalance < sds->busiest_load_per_task)
4164 return fix_small_imbalance(sds, this_cpu, imbalance); 4013 return fix_small_imbalance(env, sds);
4165 4014
4166} 4015}
4167 4016
@@ -4192,9 +4041,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4192 * put to idle by rebalancing its tasks onto our group. 4041 * put to idle by rebalancing its tasks onto our group.
4193 */ 4042 */
4194static struct sched_group * 4043static struct sched_group *
4195find_busiest_group(struct sched_domain *sd, int this_cpu, 4044find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
4196 unsigned long *imbalance, enum cpu_idle_type idle,
4197 const struct cpumask *cpus, int *balance)
4198{ 4045{
4199 struct sd_lb_stats sds; 4046 struct sd_lb_stats sds;
4200 4047
@@ -4204,7 +4051,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4204 * Compute the various statistics relavent for load balancing at 4051 * Compute the various statistics relavent for load balancing at
4205 * this level. 4052 * this level.
4206 */ 4053 */
4207 update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); 4054 update_sd_lb_stats(env, cpus, balance, &sds);
4208 4055
4209 /* 4056 /*
4210 * this_cpu is not the appropriate cpu to perform load balancing at 4057 * this_cpu is not the appropriate cpu to perform load balancing at
@@ -4213,8 +4060,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4213 if (!(*balance)) 4060 if (!(*balance))
4214 goto ret; 4061 goto ret;
4215 4062
4216 if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && 4063 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4217 check_asym_packing(sd, &sds, this_cpu, imbalance)) 4064 check_asym_packing(env, &sds))
4218 return sds.busiest; 4065 return sds.busiest;
4219 4066
4220 /* There is no busy sibling group to pull tasks from */ 4067 /* There is no busy sibling group to pull tasks from */
@@ -4232,7 +4079,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4232 goto force_balance; 4079 goto force_balance;
4233 4080
4234 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 4081 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
4235 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && 4082 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
4236 !sds.busiest_has_capacity) 4083 !sds.busiest_has_capacity)
4237 goto force_balance; 4084 goto force_balance;
4238 4085
@@ -4250,7 +4097,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4250 if (sds.this_load >= sds.avg_load) 4097 if (sds.this_load >= sds.avg_load)
4251 goto out_balanced; 4098 goto out_balanced;
4252 4099
4253 if (idle == CPU_IDLE) { 4100 if (env->idle == CPU_IDLE) {
4254 /* 4101 /*
4255 * This cpu is idle. If the busiest group load doesn't 4102 * This cpu is idle. If the busiest group load doesn't
4256 * have more tasks than the number of available cpu's and 4103 * have more tasks than the number of available cpu's and
@@ -4265,34 +4112,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4265 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use 4112 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4266 * imbalance_pct to be conservative. 4113 * imbalance_pct to be conservative.
4267 */ 4114 */
4268 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 4115 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
4269 goto out_balanced; 4116 goto out_balanced;
4270 } 4117 }
4271 4118
4272force_balance: 4119force_balance:
4273 /* Looks like there is an imbalance. Compute it */ 4120 /* Looks like there is an imbalance. Compute it */
4274 calculate_imbalance(&sds, this_cpu, imbalance); 4121 calculate_imbalance(env, &sds);
4275 return sds.busiest; 4122 return sds.busiest;
4276 4123
4277out_balanced: 4124out_balanced:
4278 /*
4279 * There is no obvious imbalance. But check if we can do some balancing
4280 * to save power.
4281 */
4282 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4283 return sds.busiest;
4284ret: 4125ret:
4285 *imbalance = 0; 4126 env->imbalance = 0;
4286 return NULL; 4127 return NULL;
4287} 4128}
4288 4129
4289/* 4130/*
4290 * find_busiest_queue - find the busiest runqueue among the cpus in group. 4131 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4291 */ 4132 */
4292static struct rq * 4133static struct rq *find_busiest_queue(struct lb_env *env,
4293find_busiest_queue(struct sched_domain *sd, struct sched_group *group, 4134 struct sched_group *group,
4294 enum cpu_idle_type idle, unsigned long imbalance, 4135 const struct cpumask *cpus)
4295 const struct cpumask *cpus)
4296{ 4136{
4297 struct rq *busiest = NULL, *rq; 4137 struct rq *busiest = NULL, *rq;
4298 unsigned long max_load = 0; 4138 unsigned long max_load = 0;
@@ -4305,7 +4145,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4305 unsigned long wl; 4145 unsigned long wl;
4306 4146
4307 if (!capacity) 4147 if (!capacity)
4308 capacity = fix_small_capacity(sd, group); 4148 capacity = fix_small_capacity(env->sd, group);
4309 4149
4310 if (!cpumask_test_cpu(i, cpus)) 4150 if (!cpumask_test_cpu(i, cpus))
4311 continue; 4151 continue;
@@ -4317,7 +4157,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4317 * When comparing with imbalance, use weighted_cpuload() 4157 * When comparing with imbalance, use weighted_cpuload()
4318 * which is not scaled with the cpu power. 4158 * which is not scaled with the cpu power.
4319 */ 4159 */
4320 if (capacity && rq->nr_running == 1 && wl > imbalance) 4160 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
4321 continue; 4161 continue;
4322 4162
4323 /* 4163 /*
@@ -4346,40 +4186,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4346/* Working cpumask for load_balance and load_balance_newidle. */ 4186/* Working cpumask for load_balance and load_balance_newidle. */
4347DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4187DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4348 4188
4349static int need_active_balance(struct sched_domain *sd, int idle, 4189static int need_active_balance(struct lb_env *env)
4350 int busiest_cpu, int this_cpu)
4351{ 4190{
4352 if (idle == CPU_NEWLY_IDLE) { 4191 struct sched_domain *sd = env->sd;
4192
4193 if (env->idle == CPU_NEWLY_IDLE) {
4353 4194
4354 /* 4195 /*
4355 * ASYM_PACKING needs to force migrate tasks from busy but 4196 * ASYM_PACKING needs to force migrate tasks from busy but
4356 * higher numbered CPUs in order to pack all tasks in the 4197 * higher numbered CPUs in order to pack all tasks in the
4357 * lowest numbered CPUs. 4198 * lowest numbered CPUs.
4358 */ 4199 */
4359 if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) 4200 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
4360 return 1; 4201 return 1;
4361
4362 /*
4363 * The only task running in a non-idle cpu can be moved to this
4364 * cpu in an attempt to completely freeup the other CPU
4365 * package.
4366 *
4367 * The package power saving logic comes from
4368 * find_busiest_group(). If there are no imbalance, then
4369 * f_b_g() will return NULL. However when sched_mc={1,2} then
4370 * f_b_g() will select a group from which a running task may be
4371 * pulled to this cpu in order to make the other package idle.
4372 * If there is no opportunity to make a package idle and if
4373 * there are no imbalance, then f_b_g() will return NULL and no
4374 * action will be taken in load_balance_newidle().
4375 *
4376 * Under normal task pull operation due to imbalance, there
4377 * will be more than one task in the source run queue and
4378 * move_tasks() will succeed. ld_moved will be true and this
4379 * active balance code will not be triggered.
4380 */
4381 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4382 return 0;
4383 } 4202 }
4384 4203
4385 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); 4204 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
@@ -4397,7 +4216,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4397{ 4216{
4398 int ld_moved, active_balance = 0; 4217 int ld_moved, active_balance = 0;
4399 struct sched_group *group; 4218 struct sched_group *group;
4400 unsigned long imbalance;
4401 struct rq *busiest; 4219 struct rq *busiest;
4402 unsigned long flags; 4220 unsigned long flags;
4403 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4221 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
@@ -4407,7 +4225,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4407 .dst_cpu = this_cpu, 4225 .dst_cpu = this_cpu,
4408 .dst_rq = this_rq, 4226 .dst_rq = this_rq,
4409 .idle = idle, 4227 .idle = idle,
4410 .loop_break = sysctl_sched_nr_migrate, 4228 .loop_break = sched_nr_migrate_break,
4411 }; 4229 };
4412 4230
4413 cpumask_copy(cpus, cpu_active_mask); 4231 cpumask_copy(cpus, cpu_active_mask);
@@ -4415,8 +4233,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4415 schedstat_inc(sd, lb_count[idle]); 4233 schedstat_inc(sd, lb_count[idle]);
4416 4234
4417redo: 4235redo:
4418 group = find_busiest_group(sd, this_cpu, &imbalance, idle, 4236 group = find_busiest_group(&env, cpus, balance);
4419 cpus, balance);
4420 4237
4421 if (*balance == 0) 4238 if (*balance == 0)
4422 goto out_balanced; 4239 goto out_balanced;
@@ -4426,7 +4243,7 @@ redo:
4426 goto out_balanced; 4243 goto out_balanced;
4427 } 4244 }
4428 4245
4429 busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); 4246 busiest = find_busiest_queue(&env, group, cpus);
4430 if (!busiest) { 4247 if (!busiest) {
4431 schedstat_inc(sd, lb_nobusyq[idle]); 4248 schedstat_inc(sd, lb_nobusyq[idle]);
4432 goto out_balanced; 4249 goto out_balanced;
@@ -4434,7 +4251,7 @@ redo:
4434 4251
4435 BUG_ON(busiest == this_rq); 4252 BUG_ON(busiest == this_rq);
4436 4253
4437 schedstat_add(sd, lb_imbalance[idle], imbalance); 4254 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
4438 4255
4439 ld_moved = 0; 4256 ld_moved = 0;
4440 if (busiest->nr_running > 1) { 4257 if (busiest->nr_running > 1) {
@@ -4445,10 +4262,9 @@ redo:
4445 * correctly treated as an imbalance. 4262 * correctly treated as an imbalance.
4446 */ 4263 */
4447 env.flags |= LBF_ALL_PINNED; 4264 env.flags |= LBF_ALL_PINNED;
4448 env.load_move = imbalance; 4265 env.src_cpu = busiest->cpu;
4449 env.src_cpu = busiest->cpu; 4266 env.src_rq = busiest;
4450 env.src_rq = busiest; 4267 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
4451 env.loop_max = busiest->nr_running;
4452 4268
4453more_balance: 4269more_balance:
4454 local_irq_save(flags); 4270 local_irq_save(flags);
@@ -4490,7 +4306,7 @@ more_balance:
4490 if (idle != CPU_NEWLY_IDLE) 4306 if (idle != CPU_NEWLY_IDLE)
4491 sd->nr_balance_failed++; 4307 sd->nr_balance_failed++;
4492 4308
4493 if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { 4309 if (need_active_balance(&env)) {
4494 raw_spin_lock_irqsave(&busiest->lock, flags); 4310 raw_spin_lock_irqsave(&busiest->lock, flags);
4495 4311
4496 /* don't kick the active_load_balance_cpu_stop, 4312 /* don't kick the active_load_balance_cpu_stop,
@@ -4517,10 +4333,11 @@ more_balance:
4517 } 4333 }
4518 raw_spin_unlock_irqrestore(&busiest->lock, flags); 4334 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4519 4335
4520 if (active_balance) 4336 if (active_balance) {
4521 stop_one_cpu_nowait(cpu_of(busiest), 4337 stop_one_cpu_nowait(cpu_of(busiest),
4522 active_load_balance_cpu_stop, busiest, 4338 active_load_balance_cpu_stop, busiest,
4523 &busiest->active_balance_work); 4339 &busiest->active_balance_work);
4340 }
4524 4341
4525 /* 4342 /*
4526 * We've kicked active balancing, reset the failure 4343 * We've kicked active balancing, reset the failure
@@ -4701,104 +4518,15 @@ static struct {
4701 unsigned long next_balance; /* in jiffy units */ 4518 unsigned long next_balance; /* in jiffy units */
4702} nohz ____cacheline_aligned; 4519} nohz ____cacheline_aligned;
4703 4520
4704#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 4521static inline int find_new_ilb(int call_cpu)
4705/**
4706 * lowest_flag_domain - Return lowest sched_domain containing flag.
4707 * @cpu: The cpu whose lowest level of sched domain is to
4708 * be returned.
4709 * @flag: The flag to check for the lowest sched_domain
4710 * for the given cpu.
4711 *
4712 * Returns the lowest sched_domain of a cpu which contains the given flag.
4713 */
4714static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4715{
4716 struct sched_domain *sd;
4717
4718 for_each_domain(cpu, sd)
4719 if (sd->flags & flag)
4720 break;
4721
4722 return sd;
4723}
4724
4725/**
4726 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4727 * @cpu: The cpu whose domains we're iterating over.
4728 * @sd: variable holding the value of the power_savings_sd
4729 * for cpu.
4730 * @flag: The flag to filter the sched_domains to be iterated.
4731 *
4732 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4733 * set, starting from the lowest sched_domain to the highest.
4734 */
4735#define for_each_flag_domain(cpu, sd, flag) \
4736 for (sd = lowest_flag_domain(cpu, flag); \
4737 (sd && (sd->flags & flag)); sd = sd->parent)
4738
4739/**
4740 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4741 * @cpu: The cpu which is nominating a new idle_load_balancer.
4742 *
4743 * Returns: Returns the id of the idle load balancer if it exists,
4744 * Else, returns >= nr_cpu_ids.
4745 *
4746 * This algorithm picks the idle load balancer such that it belongs to a
4747 * semi-idle powersavings sched_domain. The idea is to try and avoid
4748 * completely idle packages/cores just for the purpose of idle load balancing
4749 * when there are other idle cpu's which are better suited for that job.
4750 */
4751static int find_new_ilb(int cpu)
4752{ 4522{
4753 int ilb = cpumask_first(nohz.idle_cpus_mask); 4523 int ilb = cpumask_first(nohz.idle_cpus_mask);
4754 struct sched_group *ilbg;
4755 struct sched_domain *sd;
4756
4757 /*
4758 * Have idle load balancer selection from semi-idle packages only
4759 * when power-aware load balancing is enabled
4760 */
4761 if (!(sched_smt_power_savings || sched_mc_power_savings))
4762 goto out_done;
4763
4764 /*
4765 * Optimize for the case when we have no idle CPUs or only one
4766 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4767 */
4768 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
4769 goto out_done;
4770
4771 rcu_read_lock();
4772 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4773 ilbg = sd->groups;
4774
4775 do {
4776 if (ilbg->group_weight !=
4777 atomic_read(&ilbg->sgp->nr_busy_cpus)) {
4778 ilb = cpumask_first_and(nohz.idle_cpus_mask,
4779 sched_group_cpus(ilbg));
4780 goto unlock;
4781 }
4782
4783 ilbg = ilbg->next;
4784
4785 } while (ilbg != sd->groups);
4786 }
4787unlock:
4788 rcu_read_unlock();
4789 4524
4790out_done:
4791 if (ilb < nr_cpu_ids && idle_cpu(ilb)) 4525 if (ilb < nr_cpu_ids && idle_cpu(ilb))
4792 return ilb; 4526 return ilb;
4793 4527
4794 return nr_cpu_ids; 4528 return nr_cpu_ids;
4795} 4529}
4796#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4797static inline int find_new_ilb(int call_cpu)
4798{
4799 return nr_cpu_ids;
4800}
4801#endif
4802 4530
4803/* 4531/*
4804 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the 4532 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
@@ -5021,7 +4749,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
5021 4749
5022 raw_spin_lock_irq(&this_rq->lock); 4750 raw_spin_lock_irq(&this_rq->lock);
5023 update_rq_clock(this_rq); 4751 update_rq_clock(this_rq);
5024 update_cpu_load(this_rq); 4752 update_idle_cpu_load(this_rq);
5025 raw_spin_unlock_irq(&this_rq->lock); 4753 raw_spin_unlock_irq(&this_rq->lock);
5026 4754
5027 rebalance_domains(balance_cpu, CPU_IDLE); 4755 rebalance_domains(balance_cpu, CPU_IDLE);
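Taken together, the fair.c changes above replace the long (sd, this_cpu, idle, *imbalance, ...) parameter lists with a single struct lb_env that is threaded through find_busiest_group(), find_busiest_queue(), move_tasks() and need_active_balance(), with the computed imbalance kept in the env instead of an out-parameter. A compact userspace sketch of that refactoring pattern, with simplified, invented fields:

#include <stdio.h>

/* Hypothetical, stripped-down stand-ins for the kernel types. */
enum cpu_idle_type { CPU_IDLE, CPU_NOT_IDLE, CPU_NEWLY_IDLE };

struct lb_env {
        int                dst_cpu;     /* CPU we balance towards */
        int                src_cpu;     /* busiest CPU we pull from */
        enum cpu_idle_type idle;
        long               imbalance;   /* weighted load still to move */
};

/*
 * Old style: each helper took (sd, this_cpu, idle, *imbalance, ...).
 * New style: helpers share one context object and update env->imbalance.
 */
static void calculate_imbalance(struct lb_env *env, long max_load, long avg_load)
{
        env->imbalance = max_load > avg_load ? max_load - avg_load : 0;
}

int main(void)
{
        struct lb_env env = { .dst_cpu = 0, .src_cpu = 3, .idle = CPU_NEWLY_IDLE };

        calculate_imbalance(&env, 2048, 1024);
        printf("move up to %ld weighted load from cpu%d to cpu%d\n",
               env.imbalance, env.src_cpu, env.dst_cpu);
        return 0;
}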
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index e61fd73913d0..de00a486c5c6 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -68,3 +68,4 @@ SCHED_FEAT(TTWU_QUEUE, true)
68 68
69SCHED_FEAT(FORCE_SD_OVERLAP, false) 69SCHED_FEAT(FORCE_SD_OVERLAP, false)
70SCHED_FEAT(RT_RUNTIME_SHARE, true) 70SCHED_FEAT(RT_RUNTIME_SHARE, true)
71SCHED_FEAT(LB_MIN, false)
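The new LB_MIN feature above (default off) gates the move_tasks() heuristic earlier in this diff that skips tasks contributing less than 16 units of load while the domain has not yet failed a balance attempt. A toy sketch of a boolean feature flag guarding that filter; the real sched_feat() plumbing is not reproduced here:

#include <stdbool.h>
#include <stdio.h>

static bool feat_lb_min = false;        /* SCHED_FEAT(LB_MIN, false) */

/* Skip near-idle tasks only when the feature is on and balancing is healthy. */
static bool skip_small_task(long load, int nr_balance_failed)
{
        return feat_lb_min && load < 16 && !nr_balance_failed;
}

int main(void)
{
        printf("default: skip=%d\n", skip_small_task(8, 0));    /* 0: feature off */
        feat_lb_min = true;
        printf("LB_MIN:  skip=%d\n", skip_small_task(8, 0));    /* 1: now filtered */
        printf("failed:  skip=%d\n", skip_small_task(8, 3));    /* 0: must balance */
        return 0;
}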
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 91b4c957f289..b44d604b35d1 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -4,7 +4,7 @@
4 * idle-task scheduling class. 4 * idle-task scheduling class.
5 * 5 *
6 * (NOTE: these are not related to SCHED_IDLE tasks which are 6 * (NOTE: these are not related to SCHED_IDLE tasks which are
7 * handled in sched_fair.c) 7 * handled in sched/fair.c)
8 */ 8 */
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 44af55e6d5d0..c5565c3c515f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1803,44 +1803,40 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1803static void set_cpus_allowed_rt(struct task_struct *p, 1803static void set_cpus_allowed_rt(struct task_struct *p,
1804 const struct cpumask *new_mask) 1804 const struct cpumask *new_mask)
1805{ 1805{
1806 int weight = cpumask_weight(new_mask); 1806 struct rq *rq;
1807 int weight;
1807 1808
1808 BUG_ON(!rt_task(p)); 1809 BUG_ON(!rt_task(p));
1809 1810
1810 /* 1811 if (!p->on_rq)
1811 * Update the migration status of the RQ if we have an RT task 1812 return;
1812 * which is running AND changing its weight value.
1813 */
1814 if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
1815 struct rq *rq = task_rq(p);
1816
1817 if (!task_current(rq, p)) {
1818 /*
1819 * Make sure we dequeue this task from the pushable list
1820 * before going further. It will either remain off of
1821 * the list because we are no longer pushable, or it
1822 * will be requeued.
1823 */
1824 if (p->rt.nr_cpus_allowed > 1)
1825 dequeue_pushable_task(rq, p);
1826 1813
1827 /* 1814 weight = cpumask_weight(new_mask);
1828 * Requeue if our weight is changing and still > 1
1829 */
1830 if (weight > 1)
1831 enqueue_pushable_task(rq, p);
1832 1815
1833 } 1816 /*
1817 * Only update if the process changes its state from whether it
1818 * can migrate or not.
1819 */
1820 if ((p->rt.nr_cpus_allowed > 1) == (weight > 1))
1821 return;
1834 1822
1835 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { 1823 rq = task_rq(p);
1836 rq->rt.rt_nr_migratory++;
1837 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
1838 BUG_ON(!rq->rt.rt_nr_migratory);
1839 rq->rt.rt_nr_migratory--;
1840 }
1841 1824
1842 update_rt_migration(&rq->rt); 1825 /*
1826 * The process used to be able to migrate OR it can now migrate
1827 */
1828 if (weight <= 1) {
1829 if (!task_current(rq, p))
1830 dequeue_pushable_task(rq, p);
1831 BUG_ON(!rq->rt.rt_nr_migratory);
1832 rq->rt.rt_nr_migratory--;
1833 } else {
1834 if (!task_current(rq, p))
1835 enqueue_pushable_task(rq, p);
1836 rq->rt.rt_nr_migratory++;
1843 } 1837 }
1838
1839 update_rt_migration(&rq->rt);
1844} 1840}
1845 1841
1846/* Assumes rq->lock is held */ 1842/* Assumes rq->lock is held */
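The rewritten set_cpus_allowed_rt() above returns early unless the task actually crosses the migratable boundary, i.e. unless the truth of nr_cpus_allowed > 1 changes, and only then adjusts rt_nr_migratory and the pushable list. A simplified userspace model of that edge-triggered bookkeeping (invented types; the pushable-list handling is omitted):

#include <stdio.h>

struct rq_rt { unsigned int nr_migratory; };
struct task  { int nr_cpus_allowed; };

/*
 * Update the migratory count only when the task flips between pinned
 * (<= 1 CPU) and migratable (> 1 CPU); otherwise there is nothing to account.
 */
static void set_allowed(struct rq_rt *rt, struct task *p, int new_weight)
{
        int was_migratable = p->nr_cpus_allowed > 1;
        int now_migratable = new_weight > 1;

        p->nr_cpus_allowed = new_weight;
        if (was_migratable == now_migratable)
                return;                         /* no state change */

        if (now_migratable)
                rt->nr_migratory++;
        else
                rt->nr_migratory--;
}

int main(void)
{
        struct rq_rt rt = { 0 };
        struct task p = { .nr_cpus_allowed = 1 };

        set_allowed(&rt, &p, 4);        /* pinned -> migratable: count 1 */
        set_allowed(&rt, &p, 2);        /* still migratable: no change  */
        set_allowed(&rt, &p, 1);        /* back to pinned: count 0      */
        printf("nr_migratory = %u\n", rt.nr_migratory);
        return 0;
}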
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fb3acba4d52e..ba9dccfd24ce 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -201,7 +201,7 @@ struct cfs_bandwidth { };
201/* CFS-related fields in a runqueue */ 201/* CFS-related fields in a runqueue */
202struct cfs_rq { 202struct cfs_rq {
203 struct load_weight load; 203 struct load_weight load;
204 unsigned long nr_running, h_nr_running; 204 unsigned int nr_running, h_nr_running;
205 205
206 u64 exec_clock; 206 u64 exec_clock;
207 u64 min_vruntime; 207 u64 min_vruntime;
@@ -279,7 +279,7 @@ static inline int rt_bandwidth_enabled(void)
279/* Real-Time classes' related field in a runqueue: */ 279/* Real-Time classes' related field in a runqueue: */
280struct rt_rq { 280struct rt_rq {
281 struct rt_prio_array active; 281 struct rt_prio_array active;
282 unsigned long rt_nr_running; 282 unsigned int rt_nr_running;
283#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 283#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
284 struct { 284 struct {
285 int curr; /* highest queued rt task prio */ 285 int curr; /* highest queued rt task prio */
@@ -353,7 +353,7 @@ struct rq {
353 * nr_running and cpu_load should be in the same cacheline because 353 * nr_running and cpu_load should be in the same cacheline because
354 * remote CPUs use both these fields when doing load calculation. 354 * remote CPUs use both these fields when doing load calculation.
355 */ 355 */
356 unsigned long nr_running; 356 unsigned int nr_running;
357 #define CPU_LOAD_IDX_MAX 5 357 #define CPU_LOAD_IDX_MAX 5
358 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 358 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
359 unsigned long last_load_update_tick; 359 unsigned long last_load_update_tick;
@@ -876,7 +876,7 @@ extern void resched_cpu(int cpu);
876extern struct rt_bandwidth def_rt_bandwidth; 876extern struct rt_bandwidth def_rt_bandwidth;
877extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); 877extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
878 878
879extern void update_cpu_load(struct rq *this_rq); 879extern void update_idle_cpu_load(struct rq *this_rq);
880 880
881#ifdef CONFIG_CGROUP_CPUACCT 881#ifdef CONFIG_CGROUP_CPUACCT
882#include <linux/cgroup.h> 882#include <linux/cgroup.h>
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e8d76c5895ea..ee376beedaf9 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -3,16 +3,357 @@
3 * 3 *
4 * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> 4 * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
5 * 5 *
6 * This defines a simple but solid secure-computing mode. 6 * Copyright (C) 2012 Google, Inc.
7 * Will Drewry <wad@chromium.org>
8 *
9 * This defines a simple but solid secure-computing facility.
10 *
11 * Mode 1 uses a fixed list of allowed system calls.
12 * Mode 2 allows user-defined system call filters in the form
13 * of Berkeley Packet Filters/Linux Socket Filters.
7 */ 14 */
8 15
16#include <linux/atomic.h>
9#include <linux/audit.h> 17#include <linux/audit.h>
10#include <linux/seccomp.h>
11#include <linux/sched.h>
12#include <linux/compat.h> 18#include <linux/compat.h>
19#include <linux/sched.h>
20#include <linux/seccomp.h>
13 21
14/* #define SECCOMP_DEBUG 1 */ 22/* #define SECCOMP_DEBUG 1 */
15#define NR_SECCOMP_MODES 1 23
24#ifdef CONFIG_SECCOMP_FILTER
25#include <asm/syscall.h>
26#include <linux/filter.h>
27#include <linux/ptrace.h>
28#include <linux/security.h>
29#include <linux/slab.h>
30#include <linux/tracehook.h>
31#include <linux/uaccess.h>
32
33/**
34 * struct seccomp_filter - container for seccomp BPF programs
35 *
36 * @usage: reference count to manage the object lifetime.
37 * get/put helpers should be used when accessing an instance
38 * outside of a lifetime-guarded section. In general, this
39 * is only needed for handling filters shared across tasks.
40 * @prev: points to a previously installed, or inherited, filter
41 * @len: the number of instructions in the program
42 * @insns: the BPF program instructions to evaluate
43 *
44 * seccomp_filter objects are organized in a tree linked via the @prev
45 * pointer. For any task, it appears to be a singly-linked list starting
46 * with current->seccomp.filter, the most recently attached or inherited filter.
47 * However, multiple filters may share a @prev node, by way of fork(), which
48 * results in a unidirectional tree existing in memory. This is similar to
49 * how namespaces work.
50 *
51 * seccomp_filter objects should never be modified after being attached
52 * to a task_struct (other than @usage).
53 */
54struct seccomp_filter {
55 atomic_t usage;
56 struct seccomp_filter *prev;
57 unsigned short len; /* Instruction count */
58 struct sock_filter insns[];
59};
60
61/* Limit any path through the tree to 256KB worth of instructions. */
62#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
63
64/**
65 * get_u32 - returns a u32 offset into data
66 * @data: a unsigned 64 bit value
67 * @index: 0 or 1 to return the first or second 32-bits
68 *
69 * This inline exists to hide the length of unsigned long. If a 32-bit
70 * unsigned long is passed in, it will be extended and the top 32-bits will be
71 * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
72 * properly returned.
73 *
74 * Endianness is explicitly ignored and left for BPF program authors to manage
75 * as per the specific architecture.
76 */
77static inline u32 get_u32(u64 data, int index)
78{
79 return ((u32 *)&data)[index];
80}
81
82/* Helper for bpf_load below. */
83#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
84/**
85 * bpf_load: checks and returns a pointer to the requested offset
86 * @off: offset into struct seccomp_data to load from
87 *
88 * Returns the requested 32-bits of data.
89 * seccomp_check_filter() should assure that @off is 32-bit aligned
90 * and not out of bounds. Failure to do so is a BUG.
91 */
92u32 seccomp_bpf_load(int off)
93{
94 struct pt_regs *regs = task_pt_regs(current);
95 if (off == BPF_DATA(nr))
96 return syscall_get_nr(current, regs);
97 if (off == BPF_DATA(arch))
98 return syscall_get_arch(current, regs);
99 if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
100 unsigned long value;
101 int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
102 int index = !!(off % sizeof(u64));
103 syscall_get_arguments(current, regs, arg, 1, &value);
104 return get_u32(value, index);
105 }
106 if (off == BPF_DATA(instruction_pointer))
107 return get_u32(KSTK_EIP(current), 0);
108 if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
109 return get_u32(KSTK_EIP(current), 1);
110 /* seccomp_check_filter should make this impossible. */
111 BUG();
112}
113
114/**
115 * seccomp_check_filter - verify seccomp filter code
116 * @filter: filter to verify
117 * @flen: length of filter
118 *
119 * Takes a previously checked filter (by sk_chk_filter) and
120 * redirects all filter code that loads struct sk_buff data
121 * and related data through seccomp_bpf_load. It also
122 * enforces length and alignment checking of those loads.
123 *
124 * Returns 0 if the rule set is legal or -EINVAL if not.
125 */
126static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
127{
128 int pc;
129 for (pc = 0; pc < flen; pc++) {
130 struct sock_filter *ftest = &filter[pc];
131 u16 code = ftest->code;
132 u32 k = ftest->k;
133
134 switch (code) {
135 case BPF_S_LD_W_ABS:
136 ftest->code = BPF_S_ANC_SECCOMP_LD_W;
137 /* 32-bit aligned and not out of bounds. */
138 if (k >= sizeof(struct seccomp_data) || k & 3)
139 return -EINVAL;
140 continue;
141 case BPF_S_LD_W_LEN:
142 ftest->code = BPF_S_LD_IMM;
143 ftest->k = sizeof(struct seccomp_data);
144 continue;
145 case BPF_S_LDX_W_LEN:
146 ftest->code = BPF_S_LDX_IMM;
147 ftest->k = sizeof(struct seccomp_data);
148 continue;
149 /* Explicitly include allowed calls. */
150 case BPF_S_RET_K:
151 case BPF_S_RET_A:
152 case BPF_S_ALU_ADD_K:
153 case BPF_S_ALU_ADD_X:
154 case BPF_S_ALU_SUB_K:
155 case BPF_S_ALU_SUB_X:
156 case BPF_S_ALU_MUL_K:
157 case BPF_S_ALU_MUL_X:
158 case BPF_S_ALU_DIV_X:
159 case BPF_S_ALU_AND_K:
160 case BPF_S_ALU_AND_X:
161 case BPF_S_ALU_OR_K:
162 case BPF_S_ALU_OR_X:
163 case BPF_S_ALU_LSH_K:
164 case BPF_S_ALU_LSH_X:
165 case BPF_S_ALU_RSH_K:
166 case BPF_S_ALU_RSH_X:
167 case BPF_S_ALU_NEG:
168 case BPF_S_LD_IMM:
169 case BPF_S_LDX_IMM:
170 case BPF_S_MISC_TAX:
171 case BPF_S_MISC_TXA:
172 case BPF_S_ALU_DIV_K:
173 case BPF_S_LD_MEM:
174 case BPF_S_LDX_MEM:
175 case BPF_S_ST:
176 case BPF_S_STX:
177 case BPF_S_JMP_JA:
178 case BPF_S_JMP_JEQ_K:
179 case BPF_S_JMP_JEQ_X:
180 case BPF_S_JMP_JGE_K:
181 case BPF_S_JMP_JGE_X:
182 case BPF_S_JMP_JGT_K:
183 case BPF_S_JMP_JGT_X:
184 case BPF_S_JMP_JSET_K:
185 case BPF_S_JMP_JSET_X:
186 continue;
187 default:
188 return -EINVAL;
189 }
190 }
191 return 0;
192}
193
194/**
195 * seccomp_run_filters - evaluates all seccomp filters against @syscall
196 * @syscall: number of the current system call
197 *
198 * Returns valid seccomp BPF response codes.
199 */
200static u32 seccomp_run_filters(int syscall)
201{
202 struct seccomp_filter *f;
203 u32 ret = SECCOMP_RET_ALLOW;
204
205 /* Ensure unexpected behavior doesn't result in failing open. */
206 if (WARN_ON(current->seccomp.filter == NULL))
207 return SECCOMP_RET_KILL;
208
209 /*
210 * All filters in the list are evaluated and the lowest BPF return
211 * value always takes priority (ignoring the DATA).
212 */
213 for (f = current->seccomp.filter; f; f = f->prev) {
214 u32 cur_ret = sk_run_filter(NULL, f->insns);
215 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
216 ret = cur_ret;
217 }
218 return ret;
219}
220
221/**
222 * seccomp_attach_filter: Attaches a seccomp filter to current.
223 * @fprog: BPF program to install
224 *
225 * Returns 0 on success or an errno on failure.
226 */
227static long seccomp_attach_filter(struct sock_fprog *fprog)
228{
229 struct seccomp_filter *filter;
230 unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
231 unsigned long total_insns = fprog->len;
232 long ret;
233
234 if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
235 return -EINVAL;
236
237 for (filter = current->seccomp.filter; filter; filter = filter->prev)
238 total_insns += filter->len + 4; /* include a 4 instr penalty */
239 if (total_insns > MAX_INSNS_PER_PATH)
240 return -ENOMEM;
241
242 /*
243 * Installing a seccomp filter requires that the task have
244 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
245 * This avoids scenarios where unprivileged tasks can affect the
246 * behavior of privileged children.
247 */
248 if (!current->no_new_privs &&
249 security_capable_noaudit(current_cred(), current_user_ns(),
250 CAP_SYS_ADMIN) != 0)
251 return -EACCES;
252
253 /* Allocate a new seccomp_filter */
254 filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
255 GFP_KERNEL|__GFP_NOWARN);
256 if (!filter)
257 return -ENOMEM;
258 atomic_set(&filter->usage, 1);
259 filter->len = fprog->len;
260
261 /* Copy the instructions from fprog. */
262 ret = -EFAULT;
263 if (copy_from_user(filter->insns, fprog->filter, fp_size))
264 goto fail;
265
266 /* Check and rewrite the fprog via the skb checker */
267 ret = sk_chk_filter(filter->insns, filter->len);
268 if (ret)
269 goto fail;
270
271 /* Check and rewrite the fprog for seccomp use */
272 ret = seccomp_check_filter(filter->insns, filter->len);
273 if (ret)
274 goto fail;
275
276 /*
277 * If there is an existing filter, make it the prev and don't drop its
278 * task reference.
279 */
280 filter->prev = current->seccomp.filter;
281 current->seccomp.filter = filter;
282 return 0;
283fail:
284 kfree(filter);
285 return ret;
286}
287
288/**
289 * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
290 * @user_filter: pointer to the user data containing a sock_fprog.
291 *
292 * Returns 0 on success and non-zero otherwise.
293 */
294long seccomp_attach_user_filter(char __user *user_filter)
295{
296 struct sock_fprog fprog;
297 long ret = -EFAULT;
298
299#ifdef CONFIG_COMPAT
300 if (is_compat_task()) {
301 struct compat_sock_fprog fprog32;
302 if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
303 goto out;
304 fprog.len = fprog32.len;
305 fprog.filter = compat_ptr(fprog32.filter);
306 } else /* falls through to the if below. */
307#endif
308 if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
309 goto out;
310 ret = seccomp_attach_filter(&fprog);
311out:
312 return ret;
313}
314
315/* get_seccomp_filter - increments the reference count of the filter on @tsk */
316void get_seccomp_filter(struct task_struct *tsk)
317{
318 struct seccomp_filter *orig = tsk->seccomp.filter;
319 if (!orig)
320 return;
321 /* Reference count is bounded by the number of total processes. */
322 atomic_inc(&orig->usage);
323}
324
325/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
326void put_seccomp_filter(struct task_struct *tsk)
327{
328 struct seccomp_filter *orig = tsk->seccomp.filter;
329 /* Clean up single-reference branches iteratively. */
330 while (orig && atomic_dec_and_test(&orig->usage)) {
331 struct seccomp_filter *freeme = orig;
332 orig = orig->prev;
333 kfree(freeme);
334 }
335}
336
337/**
338 * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
339 * @syscall: syscall number to send to userland
340 * @reason: filter-supplied reason code to send to userland (via si_errno)
341 *
342 * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
343 */
344static void seccomp_send_sigsys(int syscall, int reason)
345{
346 struct siginfo info;
347 memset(&info, 0, sizeof(info));
348 info.si_signo = SIGSYS;
349 info.si_code = SYS_SECCOMP;
350 info.si_call_addr = (void __user *)KSTK_EIP(current);
351 info.si_errno = reason;
352 info.si_arch = syscall_get_arch(current, task_pt_regs(current));
353 info.si_syscall = syscall;
354 force_sig_info(SIGSYS, &info, current);
355}
356#endif /* CONFIG_SECCOMP_FILTER */
16 357
17/* 358/*
18 * Secure computing mode 1 allows only read/write/exit/sigreturn. 359 * Secure computing mode 1 allows only read/write/exit/sigreturn.
@@ -31,13 +372,15 @@ static int mode1_syscalls_32[] = {
31}; 372};
32#endif 373#endif
33 374
34void __secure_computing(int this_syscall) 375int __secure_computing(int this_syscall)
35{ 376{
36 int mode = current->seccomp.mode; 377 int mode = current->seccomp.mode;
37 int * syscall; 378 int exit_sig = 0;
379 int *syscall;
380 u32 ret;
38 381
39 switch (mode) { 382 switch (mode) {
40 case 1: 383 case SECCOMP_MODE_STRICT:
41 syscall = mode1_syscalls; 384 syscall = mode1_syscalls;
42#ifdef CONFIG_COMPAT 385#ifdef CONFIG_COMPAT
43 if (is_compat_task()) 386 if (is_compat_task())
@@ -45,9 +388,54 @@ void __secure_computing(int this_syscall)
45#endif 388#endif
46 do { 389 do {
47 if (*syscall == this_syscall) 390 if (*syscall == this_syscall)
48 return; 391 return 0;
49 } while (*++syscall); 392 } while (*++syscall);
393 exit_sig = SIGKILL;
394 ret = SECCOMP_RET_KILL;
395 break;
396#ifdef CONFIG_SECCOMP_FILTER
397 case SECCOMP_MODE_FILTER: {
398 int data;
399 ret = seccomp_run_filters(this_syscall);
400 data = ret & SECCOMP_RET_DATA;
401 ret &= SECCOMP_RET_ACTION;
402 switch (ret) {
403 case SECCOMP_RET_ERRNO:
404 /* Set the low-order 16 bits as an errno. */
405 syscall_set_return_value(current, task_pt_regs(current),
406 -data, 0);
407 goto skip;
408 case SECCOMP_RET_TRAP:
409 /* Show the handler the original registers. */
410 syscall_rollback(current, task_pt_regs(current));
411 /* Let the filter pass back 16 bits of data. */
412 seccomp_send_sigsys(this_syscall, data);
413 goto skip;
414 case SECCOMP_RET_TRACE:
415 /* Skip these calls if there is no tracer. */
416 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP))
417 goto skip;
418 /* Allow the BPF to provide the event message */
419 ptrace_event(PTRACE_EVENT_SECCOMP, data);
420 /*
421 * The delivery of a fatal signal during event
422 * notification may silently skip tracer notification.
423 * Terminating the task now avoids executing a system
424 * call that may not be intended.
425 */
426 if (fatal_signal_pending(current))
427 break;
428 return 0;
429 case SECCOMP_RET_ALLOW:
430 return 0;
431 case SECCOMP_RET_KILL:
432 default:
433 break;
434 }
435 exit_sig = SIGSYS;
50 break; 436 break;
437 }
438#endif
51 default: 439 default:
52 BUG(); 440 BUG();
53 } 441 }
@@ -55,8 +443,13 @@ void __secure_computing(int this_syscall)
55#ifdef SECCOMP_DEBUG 443#ifdef SECCOMP_DEBUG
56 dump_stack(); 444 dump_stack();
57#endif 445#endif
58 audit_seccomp(this_syscall); 446 audit_seccomp(this_syscall, exit_sig, ret);
59 do_exit(SIGKILL); 447 do_exit(exit_sig);
448#ifdef CONFIG_SECCOMP_FILTER
449skip:
450 audit_seccomp(this_syscall, exit_sig, ret);
451#endif
452 return -1;
60} 453}
61 454
62long prctl_get_seccomp(void) 455long prctl_get_seccomp(void)
@@ -64,25 +457,48 @@ long prctl_get_seccomp(void)
64 return current->seccomp.mode; 457 return current->seccomp.mode;
65} 458}
66 459
67long prctl_set_seccomp(unsigned long seccomp_mode) 460/**
461 * prctl_set_seccomp: configures current->seccomp.mode
462 * @seccomp_mode: requested mode to use
463 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
464 *
465 * This function may be called repeatedly with a @seccomp_mode of
466 * SECCOMP_MODE_FILTER to install additional filters. Every filter
467 * successfully installed will be evaluated (in reverse order) for each system
468 * call the task makes.
469 *
470 * Once current->seccomp.mode is non-zero, it may not be changed.
471 *
472 * Returns 0 on success or -EINVAL on failure.
473 */
474long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
68{ 475{
69 long ret; 476 long ret = -EINVAL;
70 477
71 /* can set it only once to be even more secure */ 478 if (current->seccomp.mode &&
72 ret = -EPERM; 479 current->seccomp.mode != seccomp_mode)
73 if (unlikely(current->seccomp.mode))
74 goto out; 480 goto out;
75 481
76 ret = -EINVAL; 482 switch (seccomp_mode) {
77 if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { 483 case SECCOMP_MODE_STRICT:
78 current->seccomp.mode = seccomp_mode; 484 ret = 0;
79 set_thread_flag(TIF_SECCOMP);
80#ifdef TIF_NOTSC 485#ifdef TIF_NOTSC
81 disable_TSC(); 486 disable_TSC();
82#endif 487#endif
83 ret = 0; 488 break;
489#ifdef CONFIG_SECCOMP_FILTER
490 case SECCOMP_MODE_FILTER:
491 ret = seccomp_attach_user_filter(filter);
492 if (ret)
493 goto out;
494 break;
495#endif
496 default:
497 goto out;
84 } 498 }
85 499
86 out: 500 current->seccomp.mode = seccomp_mode;
501 set_thread_flag(TIF_SECCOMP);
502out:
87 return ret; 503 return ret;
88} 504}
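For context only (not part of the patch): a minimal, hypothetical userspace sketch of how the SECCOMP_MODE_FILTER path above would be exercised. It assumes the definitions this series introduces (struct seccomp_data and SECCOMP_RET_* in linux/seccomp.h, PR_SET_NO_NEW_PRIVS in linux/prctl.h); __NR_uname is only an example syscall.

#include <stddef.h>
#include <errno.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

static int install_example_filter(void)
{
	struct sock_filter insns[] = {
		/* Load the syscall number from struct seccomp_data. */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		/* Fail uname with EPERM, allow everything else. */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_uname, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | EPERM),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(insns) / sizeof(insns[0]),
		.filter = insns,
	};

	/* Required unless the caller has CAP_SYS_ADMIN, see seccomp_attach_filter(). */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		return -1;
	return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
}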
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 60636a4e25c3..4567fc020fe3 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -118,7 +118,7 @@ EXPORT_SYMBOL(down_killable);
118 * down_trylock - try to acquire the semaphore, without waiting 118 * down_trylock - try to acquire the semaphore, without waiting
119 * @sem: the semaphore to be acquired 119 * @sem: the semaphore to be acquired
120 * 120 *
121 * Try to acquire the semaphore atomically. Returns 0 if the mutex has 121 * Try to acquire the semaphore atomically. Returns 0 if the semaphore has
122 * been acquired successfully or 1 if it cannot be acquired. 122 * been acquired successfully or 1 if it cannot be acquired.
123 * 123 *
124 * NOTE: This return value is inverted from both spin_trylock and 124 * NOTE: This return value is inverted from both spin_trylock and
diff --git a/kernel/signal.c b/kernel/signal.c
index 833ea5166855..21ebe75ff85f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -160,7 +160,7 @@ void recalc_sigpending(void)
160 160
161#define SYNCHRONOUS_MASK \ 161#define SYNCHRONOUS_MASK \
162 (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ 162 (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
163 sigmask(SIGTRAP) | sigmask(SIGFPE)) 163 sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS))
164 164
165int next_signal(struct sigpending *pending, sigset_t *mask) 165int next_signal(struct sigpending *pending, sigset_t *mask)
166{ 166{
@@ -2695,6 +2695,13 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2695 err |= __put_user(from->si_uid, &to->si_uid); 2695 err |= __put_user(from->si_uid, &to->si_uid);
2696 err |= __put_user(from->si_ptr, &to->si_ptr); 2696 err |= __put_user(from->si_ptr, &to->si_ptr);
2697 break; 2697 break;
2698#ifdef __ARCH_SIGSYS
2699 case __SI_SYS:
2700 err |= __put_user(from->si_call_addr, &to->si_call_addr);
2701 err |= __put_user(from->si_syscall, &to->si_syscall);
2702 err |= __put_user(from->si_arch, &to->si_arch);
2703 break;
2704#endif
2698 default: /* this is just in case for now ... */ 2705 default: /* this is just in case for now ... */
2699 err |= __put_user(from->si_pid, &to->si_pid); 2706 err |= __put_user(from->si_pid, &to->si_pid);
2700 err |= __put_user(from->si_uid, &to->si_uid); 2707 err |= __put_user(from->si_uid, &to->si_uid);
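Again not part of the diff: a hypothetical sketch of the userspace side of SECCOMP_RET_TRAP, showing how the siginfo fields copied above (si_syscall, plus si_errno carrying the filter data) would be consumed by an SA_SIGINFO handler. SYS_SECCOMP is the new SIGSYS si_code added by this series; the fallback define of 1 and the visibility of si_syscall depend on the libc/kernel headers installed, so treat both as assumptions.

#include <signal.h>
#include <string.h>

#ifndef SYS_SECCOMP
#define SYS_SECCOMP 1	/* assumed si_code set by seccomp_send_sigsys() above */
#endif

static volatile sig_atomic_t trapped_nr = -1;

static void sigsys_handler(int sig, siginfo_t *info, void *ucontext)
{
	/* Keep it async-signal-safe: just record what the filter reported. */
	if (info->si_code == SYS_SECCOMP)
		trapped_nr = info->si_syscall;
}

static int install_sigsys_handler(void)
{
	struct sigaction act;

	memset(&act, 0, sizeof(act));
	act.sa_sigaction = sigsys_handler;
	act.sa_flags = SA_SIGINFO;
	return sigaction(SIGSYS, &act, NULL);
}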
diff --git a/kernel/smp.c b/kernel/smp.c
index 2f8b10ecf759..d0ae5b24875e 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,8 @@
13#include <linux/smp.h> 13#include <linux/smp.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15 15
16#include "smpboot.h"
17
16#ifdef CONFIG_USE_GENERIC_SMP_HELPERS 18#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
17static struct { 19static struct {
18 struct list_head queue; 20 struct list_head queue;
@@ -669,6 +671,8 @@ void __init smp_init(void)
669{ 671{
670 unsigned int cpu; 672 unsigned int cpu;
671 673
674 idle_threads_init();
675
672 /* FIXME: This should be done in userspace --RR */ 676 /* FIXME: This should be done in userspace --RR */
673 for_each_present_cpu(cpu) { 677 for_each_present_cpu(cpu) {
674 if (num_online_cpus() >= setup_max_cpus) 678 if (num_online_cpus() >= setup_max_cpus)
@@ -791,3 +795,26 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
791 } 795 }
792} 796}
793EXPORT_SYMBOL(on_each_cpu_cond); 797EXPORT_SYMBOL(on_each_cpu_cond);
798
799static void do_nothing(void *unused)
800{
801}
802
803/**
804 * kick_all_cpus_sync - Force all cpus out of idle
805 *
806 * Used to synchronize the update of the pm_idle function pointer. It's
807 * called after the pointer is updated and returns after the dummy
808 * callback function has been executed on all cpus. The execution of
809 * the function can only happen on the remote cpus after they have
810 * left the idle function which had been called via pm_idle function
811 * pointer. So it's guaranteed that nothing uses the previous pointer
812 * anymore.
813 */
814void kick_all_cpus_sync(void)
815{
816 /* Make sure the change is visible before we kick the cpus */
817 smp_mb();
818 smp_call_function(do_nothing, NULL, 1);
819}
820EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
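As an aside, a minimal sketch of the usage pattern the kick_all_cpus_sync() comment describes: publish a new function pointer for the idle path, then wait until no CPU can still be executing through the old one. idle_hook and set_idle_hook() are made-up names standing in for pm_idle and its updater.

#include <linux/smp.h>

static void (*idle_hook)(void);		/* stands in for pm_idle */

static void set_idle_hook(void (*new_hook)(void))
{
	idle_hook = new_hook;	/* publish the new pointer */
	/* Returns only after every CPU has run the dummy IPI callback,
	 * i.e. has left any idle routine reached via the old pointer. */
	kick_all_cpus_sync();
}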
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
new file mode 100644
index 000000000000..e1a797e028a3
--- /dev/null
+++ b/kernel/smpboot.c
@@ -0,0 +1,62 @@
1/*
2 * Common SMP CPU bringup/teardown functions
3 */
4#include <linux/err.h>
5#include <linux/smp.h>
6#include <linux/init.h>
7#include <linux/sched.h>
8#include <linux/percpu.h>
9
10#include "smpboot.h"
11
12#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
13/*
14 * For the hotplug case we keep the task structs around and reuse
15 * them.
16 */
17static DEFINE_PER_CPU(struct task_struct *, idle_threads);
18
19struct task_struct * __cpuinit idle_thread_get(unsigned int cpu)
20{
21 struct task_struct *tsk = per_cpu(idle_threads, cpu);
22
23 if (!tsk)
24 return ERR_PTR(-ENOMEM);
25 init_idle(tsk, cpu);
26 return tsk;
27}
28
29void __init idle_thread_set_boot_cpu(void)
30{
31 per_cpu(idle_threads, smp_processor_id()) = current;
32}
33
34static inline void idle_init(unsigned int cpu)
35{
36 struct task_struct *tsk = per_cpu(idle_threads, cpu);
37
38 if (!tsk) {
39 tsk = fork_idle(cpu);
40 if (IS_ERR(tsk))
41 pr_err("SMP: fork_idle() failed for CPU %u\n", cpu);
42 else
43 per_cpu(idle_threads, cpu) = tsk;
44 }
45}
46
47/**
48 * idle_threads_init - Initialize idle threads for all cpus
49 *
50 * Creates an idle thread for each possible cpu, other than the boot
51 * cpu, if one does not already exist.
52 */
53void __init idle_threads_init(void)
54{
55 unsigned int cpu;
56
57 for_each_possible_cpu(cpu) {
58 if (cpu != smp_processor_id())
59 idle_init(cpu);
60 }
61}
62#endif
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
new file mode 100644
index 000000000000..80c0acfb8472
--- /dev/null
+++ b/kernel/smpboot.h
@@ -0,0 +1,18 @@
1#ifndef SMPBOOT_H
2#define SMPBOOT_H
3
4struct task_struct;
5
6int smpboot_prepare(unsigned int cpu);
7
8#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
9struct task_struct *idle_thread_get(unsigned int cpu);
10void idle_thread_set_boot_cpu(void);
11void idle_threads_init(void);
12#else
13static inline struct task_struct *idle_thread_get(unsigned int cpu) { return NULL; }
14static inline void idle_thread_set_boot_cpu(void) { }
15static inline void idle_threads_init(void) { }
16#endif
17
18#endif
diff --git a/kernel/srcu.c b/kernel/srcu.c
index ba35f3a4a1f4..2095be3318d5 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -34,10 +34,77 @@
34#include <linux/delay.h> 34#include <linux/delay.h>
35#include <linux/srcu.h> 35#include <linux/srcu.h>
36 36
37/*
38 * Initialize an rcu_batch structure to empty.
39 */
40static inline void rcu_batch_init(struct rcu_batch *b)
41{
42 b->head = NULL;
43 b->tail = &b->head;
44}
45
46/*
47 * Enqueue a callback onto the tail of the specified rcu_batch structure.
48 */
49static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head)
50{
51 *b->tail = head;
52 b->tail = &head->next;
53}
54
55/*
56 * Is the specified rcu_batch structure empty?
57 */
58static inline bool rcu_batch_empty(struct rcu_batch *b)
59{
60 return b->tail == &b->head;
61}
62
63/*
64 * Remove the callback at the head of the specified rcu_batch structure
65 * and return a pointer to it, or return NULL if the structure is empty.
66 */
67static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b)
68{
69 struct rcu_head *head;
70
71 if (rcu_batch_empty(b))
72 return NULL;
73
74 head = b->head;
75 b->head = head->next;
76 if (b->tail == &head->next)
77 rcu_batch_init(b);
78
79 return head;
80}
81
82/*
83 * Move all callbacks from the rcu_batch structure specified by "from" to
84 * the structure specified by "to".
85 */
86static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
87{
88 if (!rcu_batch_empty(from)) {
89 *to->tail = from->head;
90 to->tail = from->tail;
91 rcu_batch_init(from);
92 }
93}
94
95/* single-thread state-machine */
96static void process_srcu(struct work_struct *work);
97
37static int init_srcu_struct_fields(struct srcu_struct *sp) 98static int init_srcu_struct_fields(struct srcu_struct *sp)
38{ 99{
39 sp->completed = 0; 100 sp->completed = 0;
40 mutex_init(&sp->mutex); 101 spin_lock_init(&sp->queue_lock);
102 sp->running = false;
103 rcu_batch_init(&sp->batch_queue);
104 rcu_batch_init(&sp->batch_check0);
105 rcu_batch_init(&sp->batch_check1);
106 rcu_batch_init(&sp->batch_done);
107 INIT_DELAYED_WORK(&sp->work, process_srcu);
41 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); 108 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
42 return sp->per_cpu_ref ? 0 : -ENOMEM; 109 return sp->per_cpu_ref ? 0 : -ENOMEM;
43} 110}
@@ -73,21 +140,116 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
73#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 140#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
74 141
75/* 142/*
76 * srcu_readers_active_idx -- returns approximate number of readers 143 * Returns approximate total of the readers' ->seq[] values for the
77 * active on the specified rank of per-CPU counters. 144 * rank of per-CPU counters specified by idx.
78 */ 145 */
146static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
147{
148 int cpu;
149 unsigned long sum = 0;
150 unsigned long t;
79 151
80static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) 152 for_each_possible_cpu(cpu) {
153 t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
154 sum += t;
155 }
156 return sum;
157}
158
159/*
160 * Returns approximate number of readers active on the specified rank
161 * of the per-CPU ->c[] counters.
162 */
163static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
81{ 164{
82 int cpu; 165 int cpu;
83 int sum; 166 unsigned long sum = 0;
167 unsigned long t;
84 168
85 sum = 0; 169 for_each_possible_cpu(cpu) {
86 for_each_possible_cpu(cpu) 170 t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
87 sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]; 171 sum += t;
172 }
88 return sum; 173 return sum;
89} 174}
90 175
176/*
177 * Return true if the number of pre-existing readers is determined to
178 * be stably zero. An example unstable zero can occur if the call
179 * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
180 * but due to task migration, sees the corresponding __srcu_read_unlock()
181 * decrement. This can happen because srcu_readers_active_idx() takes
182 * time to sum the array, and might in fact be interrupted or preempted
183 * partway through the summation.
184 */
185static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
186{
187 unsigned long seq;
188
189 seq = srcu_readers_seq_idx(sp, idx);
190
191 /*
192 * The following smp_mb() A pairs with the smp_mb() B located in
193 * __srcu_read_lock(). This pairing ensures that if an
194 * __srcu_read_lock() increments its counter after the summation
195 * in srcu_readers_active_idx(), then the corresponding SRCU read-side
196 * critical section will see any changes made prior to the start
197 * of the current SRCU grace period.
198 *
199 * Also, if the above call to srcu_readers_seq_idx() saw the
200 * increment of ->seq[], then the call to srcu_readers_active_idx()
201 * must see the increment of ->c[].
202 */
203 smp_mb(); /* A */
204
205 /*
206 * Note that srcu_readers_active_idx() can incorrectly return
207 * zero even though there is a pre-existing reader throughout.
208 * To see this, suppose that task A is in a very long SRCU
209 * read-side critical section that started on CPU 0, and that
210 * no other reader exists, so that the sum of the counters
211 * is equal to one. Then suppose that task B starts executing
212 * srcu_readers_active_idx(), summing up to CPU 1, and then that
213 * task C starts reading on CPU 0, so that its increment is not
214 * summed, but finishes reading on CPU 2, so that its decrement
215 * -is- summed. Then when task B completes its sum, it will
216 * incorrectly get zero, despite the fact that task A has been
217 * in its SRCU read-side critical section the whole time.
218 *
219 * We therefore do a validation step should srcu_readers_active_idx()
220 * return zero.
221 */
222 if (srcu_readers_active_idx(sp, idx) != 0)
223 return false;
224
225 /*
226 * The remainder of this function is the validation step.
227 * The following smp_mb() D pairs with the smp_mb() C in
228 * __srcu_read_unlock(). If the __srcu_read_unlock() was seen
229 * by srcu_readers_active_idx() above, then any destructive
230 * operation performed after the grace period will happen after
231 * the corresponding SRCU read-side critical section.
232 *
233 * Note that there can be at most NR_CPUS worth of readers using
234 * the old index, which is not enough to overflow even a 32-bit
235 * integer. (Yes, this does mean that systems having more than
236 * a billion or so CPUs need to be 64-bit systems.) Therefore,
237 * the sum of the ->seq[] counters cannot possibly overflow.
238 * Therefore, the only way that the return values of the two
239 * calls to srcu_readers_seq_idx() can be equal is if there were
240 * no increments of the corresponding rank of ->seq[] counts
241 * in the interim. But the missed-increment scenario laid out
242 * above includes an increment of the ->seq[] counter by
243 * the corresponding __srcu_read_lock(). Therefore, if this
244 * scenario occurs, the return values from the two calls to
245 * srcu_readers_seq_idx() will differ, and thus the validation
246 * step below suffices.
247 */
248 smp_mb(); /* D */
249
250 return srcu_readers_seq_idx(sp, idx) == seq;
251}
252
91/** 253/**
92 * srcu_readers_active - returns approximate number of readers. 254 * srcu_readers_active - returns approximate number of readers.
93 * @sp: which srcu_struct to count active readers (holding srcu_read_lock). 255 * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
@@ -98,7 +260,14 @@ static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
98 */ 260 */
99static int srcu_readers_active(struct srcu_struct *sp) 261static int srcu_readers_active(struct srcu_struct *sp)
100{ 262{
101 return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); 263 int cpu;
264 unsigned long sum = 0;
265
266 for_each_possible_cpu(cpu) {
267 sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
268 sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
269 }
270 return sum;
102} 271}
103 272
104/** 273/**
@@ -131,10 +300,11 @@ int __srcu_read_lock(struct srcu_struct *sp)
131 int idx; 300 int idx;
132 301
133 preempt_disable(); 302 preempt_disable();
134 idx = sp->completed & 0x1; 303 idx = rcu_dereference_index_check(sp->completed,
135 barrier(); /* ensure compiler looks -once- at sp->completed. */ 304 rcu_read_lock_sched_held()) & 0x1;
136 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++; 305 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
137 srcu_barrier(); /* ensure compiler won't misorder critical section. */ 306 smp_mb(); /* B */ /* Avoid leaking the critical section. */
307 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
138 preempt_enable(); 308 preempt_enable();
139 return idx; 309 return idx;
140} 310}
@@ -149,8 +319,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
149void __srcu_read_unlock(struct srcu_struct *sp, int idx) 319void __srcu_read_unlock(struct srcu_struct *sp, int idx)
150{ 320{
151 preempt_disable(); 321 preempt_disable();
152 srcu_barrier(); /* ensure compiler won't misorder critical section. */ 322 smp_mb(); /* C */ /* Avoid leaking the critical section. */
153 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; 323 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1;
154 preempt_enable(); 324 preempt_enable();
155} 325}
156EXPORT_SYMBOL_GPL(__srcu_read_unlock); 326EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@@ -163,106 +333,119 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
163 * we repeatedly block for 1-millisecond time periods. This approach 333 * we repeatedly block for 1-millisecond time periods. This approach
164 * has done well in testing, so there is no need for a config parameter. 334 * has done well in testing, so there is no need for a config parameter.
165 */ 335 */
166#define SYNCHRONIZE_SRCU_READER_DELAY 10 336#define SRCU_RETRY_CHECK_DELAY 5
337#define SYNCHRONIZE_SRCU_TRYCOUNT 2
338#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12
167 339
168/* 340/*
169 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 341 * @@@ Wait until all pre-existing readers complete. Such readers
342 * will have used the index specified by "idx".
343 * The caller should ensure that ->completed is not changed while checking
344 * and that idx == (->completed & 1) ^ 1.
170 */ 345 */
171static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) 346static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
172{ 347{
173 int idx; 348 for (;;) {
174 349 if (srcu_readers_active_idx_check(sp, idx))
175 rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && 350 return true;
176 !lock_is_held(&rcu_bh_lock_map) && 351 if (--trycount <= 0)
177 !lock_is_held(&rcu_lock_map) && 352 return false;
178 !lock_is_held(&rcu_sched_lock_map), 353 udelay(SRCU_RETRY_CHECK_DELAY);
179 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); 354 }
180 355}
181 idx = sp->completed;
182 mutex_lock(&sp->mutex);
183 356
184 /* 357/*
185 * Check to see if someone else did the work for us while we were 358 * Increment the ->completed counter so that future SRCU readers will
186 * waiting to acquire the lock. We need -two- advances of 359 * use the other rank of the ->c[] and ->seq[] arrays. This allows
187 * the counter, not just one. If there was but one, we might have 360 * us to wait for pre-existing readers in a starvation-free manner.
188 * shown up -after- our helper's first synchronize_sched(), thus 361 */
189 * having failed to prevent CPU-reordering races with concurrent 362static void srcu_flip(struct srcu_struct *sp)
190 * srcu_read_unlock()s on other CPUs (see comment below). So we 363{
191 * either (1) wait for two or (2) supply the second ourselves. 364 sp->completed++;
192 */ 365}
193 366
194 if ((sp->completed - idx) >= 2) { 367/*
195 mutex_unlock(&sp->mutex); 368 * Enqueue an SRCU callback on the specified srcu_struct structure,
196 return; 369 * initiating grace-period processing if it is not already running.
370 */
371void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
372 void (*func)(struct rcu_head *head))
373{
374 unsigned long flags;
375
376 head->next = NULL;
377 head->func = func;
378 spin_lock_irqsave(&sp->queue_lock, flags);
379 rcu_batch_queue(&sp->batch_queue, head);
380 if (!sp->running) {
381 sp->running = true;
382 queue_delayed_work(system_nrt_wq, &sp->work, 0);
197 } 383 }
384 spin_unlock_irqrestore(&sp->queue_lock, flags);
385}
386EXPORT_SYMBOL_GPL(call_srcu);
198 387
199 sync_func(); /* Force memory barrier on all CPUs. */ 388struct rcu_synchronize {
389 struct rcu_head head;
390 struct completion completion;
391};
200 392
201 /* 393/*
202 * The preceding synchronize_sched() ensures that any CPU that 394 * Awaken the corresponding synchronize_srcu() instance now that a
203 * sees the new value of sp->completed will also see any preceding 395 * grace period has elapsed.
204 * changes to data structures made by this CPU. This prevents 396 */
205 * some other CPU from reordering the accesses in its SRCU 397static void wakeme_after_rcu(struct rcu_head *head)
206 * read-side critical section to precede the corresponding 398{
207 * srcu_read_lock() -- ensuring that such references will in 399 struct rcu_synchronize *rcu;
208 * fact be protected.
209 *
210 * So it is now safe to do the flip.
211 */
212 400
213 idx = sp->completed & 0x1; 401 rcu = container_of(head, struct rcu_synchronize, head);
214 sp->completed++; 402 complete(&rcu->completion);
403}
215 404
216 sync_func(); /* Force memory barrier on all CPUs. */ 405static void srcu_advance_batches(struct srcu_struct *sp, int trycount);
406static void srcu_reschedule(struct srcu_struct *sp);
217 407
218 /* 408/*
219 * At this point, because of the preceding synchronize_sched(), 409 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
220 * all srcu_read_lock() calls using the old counters have completed. 410 */
221 * Their corresponding critical sections might well be still 411static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
222 * executing, but the srcu_read_lock() primitives themselves 412{
223 * will have finished executing. We initially give readers 413 struct rcu_synchronize rcu;
224 * an arbitrarily chosen 10 microseconds to get out of their 414 struct rcu_head *head = &rcu.head;
225 * SRCU read-side critical sections, then loop waiting 1/HZ 415 bool done = false;
226 * seconds per iteration. The 10-microsecond value has done
227 * very well in testing.
228 */
229
230 if (srcu_readers_active_idx(sp, idx))
231 udelay(SYNCHRONIZE_SRCU_READER_DELAY);
232 while (srcu_readers_active_idx(sp, idx))
233 schedule_timeout_interruptible(1);
234 416
235 sync_func(); /* Force memory barrier on all CPUs. */ 417 rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
418 !lock_is_held(&rcu_bh_lock_map) &&
419 !lock_is_held(&rcu_lock_map) &&
420 !lock_is_held(&rcu_sched_lock_map),
421 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
236 422
237 /* 423 init_completion(&rcu.completion);
238 * The preceding synchronize_sched() forces all srcu_read_unlock() 424
239 * primitives that were executing concurrently with the preceding 425 head->next = NULL;
240 * for_each_possible_cpu() loop to have completed by this point. 426 head->func = wakeme_after_rcu;
241 * More importantly, it also forces the corresponding SRCU read-side 427 spin_lock_irq(&sp->queue_lock);
242 * critical sections to have also completed, and the corresponding 428 if (!sp->running) {
243 * references to SRCU-protected data items to be dropped. 429 /* steal the processing owner */
244 * 430 sp->running = true;
245 * Note: 431 rcu_batch_queue(&sp->batch_check0, head);
246 * 432 spin_unlock_irq(&sp->queue_lock);
247 * Despite what you might think at first glance, the 433
248 * preceding synchronize_sched() -must- be within the 434 srcu_advance_batches(sp, trycount);
249 * critical section ended by the following mutex_unlock(). 435 if (!rcu_batch_empty(&sp->batch_done)) {
250 * Otherwise, a task taking the early exit can race 436 BUG_ON(sp->batch_done.head != head);
251 * with a srcu_read_unlock(), which might have executed 437 rcu_batch_dequeue(&sp->batch_done);
252 * just before the preceding srcu_readers_active() check, 438 done = true;
253 * and whose CPU might have reordered the srcu_read_unlock() 439 }
254 * with the preceding critical section. In this case, there 440 /* give the processing owner to work_struct */
255 * is nothing preventing the synchronize_sched() task that is 441 srcu_reschedule(sp);
256 * taking the early exit from freeing a data structure that 442 } else {
257 * is still being referenced (out of order) by the task 443 rcu_batch_queue(&sp->batch_queue, head);
258 * doing the srcu_read_unlock(). 444 spin_unlock_irq(&sp->queue_lock);
259 * 445 }
260 * Alternatively, the comparison with "2" on the early exit
261 * could be changed to "3", but this increases synchronize_srcu()
262 * latency for bulk loads. So the current code is preferred.
263 */
264 446
265 mutex_unlock(&sp->mutex); 447 if (!done)
448 wait_for_completion(&rcu.completion);
266} 449}
267 450
268/** 451/**
@@ -281,7 +464,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
281 */ 464 */
282void synchronize_srcu(struct srcu_struct *sp) 465void synchronize_srcu(struct srcu_struct *sp)
283{ 466{
284 __synchronize_srcu(sp, synchronize_sched); 467 __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT);
285} 468}
286EXPORT_SYMBOL_GPL(synchronize_srcu); 469EXPORT_SYMBOL_GPL(synchronize_srcu);
287 470
@@ -289,18 +472,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
289 * synchronize_srcu_expedited - Brute-force SRCU grace period 472 * synchronize_srcu_expedited - Brute-force SRCU grace period
290 * @sp: srcu_struct with which to synchronize. 473 * @sp: srcu_struct with which to synchronize.
291 * 474 *
292 * Wait for an SRCU grace period to elapse, but use a "big hammer" 475 * Wait for an SRCU grace period to elapse, but be more aggressive about
293 * approach to force the grace period to end quickly. This consumes 476 * spinning rather than blocking when waiting.
294 * significant time on all CPUs and is unfriendly to real-time workloads,
295 * so is thus not recommended for any sort of common-case code. In fact,
296 * if you are using synchronize_srcu_expedited() in a loop, please
297 * restructure your code to batch your updates, and then use a single
298 * synchronize_srcu() instead.
299 * 477 *
300 * Note that it is illegal to call this function while holding any lock 478 * Note that it is illegal to call this function while holding any lock
301 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal 479 * that is acquired by a CPU-hotplug notifier. It is also illegal to call
302 * to call this function from a CPU-hotplug notifier. Failing to observe
303 * these restriction will result in deadlock. It is also illegal to call
304 * synchronize_srcu_expedited() from the corresponding SRCU read-side 480 * synchronize_srcu_expedited() from the corresponding SRCU read-side
305 * critical section; doing so will result in deadlock. However, it is 481 * critical section; doing so will result in deadlock. However, it is
306 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct 482 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct
@@ -309,20 +485,166 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
309 */ 485 */
310void synchronize_srcu_expedited(struct srcu_struct *sp) 486void synchronize_srcu_expedited(struct srcu_struct *sp)
311{ 487{
312 __synchronize_srcu(sp, synchronize_sched_expedited); 488 __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT);
313} 489}
314EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); 490EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
315 491
316/** 492/**
493 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
494 */
495void srcu_barrier(struct srcu_struct *sp)
496{
497 synchronize_srcu(sp);
498}
499EXPORT_SYMBOL_GPL(srcu_barrier);
500
501/**
317 * srcu_batches_completed - return batches completed. 502 * srcu_batches_completed - return batches completed.
318 * @sp: srcu_struct on which to report batch completion. 503 * @sp: srcu_struct on which to report batch completion.
319 * 504 *
320 * Report the number of batches, correlated with, but not necessarily 505 * Report the number of batches, correlated with, but not necessarily
321 * precisely the same as, the number of grace periods that have elapsed. 506 * precisely the same as, the number of grace periods that have elapsed.
322 */ 507 */
323
324long srcu_batches_completed(struct srcu_struct *sp) 508long srcu_batches_completed(struct srcu_struct *sp)
325{ 509{
326 return sp->completed; 510 return sp->completed;
327} 511}
328EXPORT_SYMBOL_GPL(srcu_batches_completed); 512EXPORT_SYMBOL_GPL(srcu_batches_completed);
513
514#define SRCU_CALLBACK_BATCH 10
515#define SRCU_INTERVAL 1
516
517/*
518 * Move any new SRCU callbacks to the first stage of the SRCU grace
519 * period pipeline.
520 */
521static void srcu_collect_new(struct srcu_struct *sp)
522{
523 if (!rcu_batch_empty(&sp->batch_queue)) {
524 spin_lock_irq(&sp->queue_lock);
525 rcu_batch_move(&sp->batch_check0, &sp->batch_queue);
526 spin_unlock_irq(&sp->queue_lock);
527 }
528}
529
530/*
531 * Core SRCU state machine. Advance callbacks from ->batch_check0 to
532 * ->batch_check1 and then to ->batch_done as readers drain.
533 */
534static void srcu_advance_batches(struct srcu_struct *sp, int trycount)
535{
536 int idx = 1 ^ (sp->completed & 1);
537
538 /*
539 * Because readers might be delayed for an extended period after
540 * fetching ->completed for their index, at any point in time there
541 * might well be readers using both idx=0 and idx=1. We therefore
542 * need to wait for readers to clear from both index values before
543 * invoking a callback.
544 */
545
546 if (rcu_batch_empty(&sp->batch_check0) &&
547 rcu_batch_empty(&sp->batch_check1))
548 return; /* no callbacks need to be advanced */
549
550 if (!try_check_zero(sp, idx, trycount))
551 return; /* failed to advance, will try after SRCU_INTERVAL */
552
553 /*
554 * The callbacks in ->batch_check1 already completed their first zero
555 * check and flip when they were enqueued on
556 * ->batch_check0 in a previous invocation of srcu_advance_batches().
557 * (Presumably try_check_zero() returned false during that
558 * invocation, leaving the callbacks stranded on ->batch_check1.)
559 * They are therefore ready to invoke, so move them to ->batch_done.
560 */
561 rcu_batch_move(&sp->batch_done, &sp->batch_check1);
562
563 if (rcu_batch_empty(&sp->batch_check0))
564 return; /* no callbacks need to be advanced */
565 srcu_flip(sp);
566
567 /*
568 * The callbacks in ->batch_check0 just finished their
569 * first zero check and flip, so move them to ->batch_check1
570 * for future checking on the other idx.
571 */
572 rcu_batch_move(&sp->batch_check1, &sp->batch_check0);
573
574 /*
575 * SRCU read-side critical sections are normally short, so check
576 * at least twice in quick succession after a flip.
577 */
578 trycount = trycount < 2 ? 2 : trycount;
579 if (!try_check_zero(sp, idx^1, trycount))
580 return; /* failed to advance, will try after SRCU_INTERVAL */
581
582 /*
583 * The callbacks in ->batch_check1 have now waited for all
584 * pre-existing readers using both idx values. They are therefore
585 * ready to invoke, so move them to ->batch_done.
586 */
587 rcu_batch_move(&sp->batch_done, &sp->batch_check1);
588}
589
590/*
591 * Invoke a limited number of SRCU callbacks that have passed through
592 * their grace period. If there are more to do, SRCU will reschedule
593 * the workqueue.
594 */
595static void srcu_invoke_callbacks(struct srcu_struct *sp)
596{
597 int i;
598 struct rcu_head *head;
599
600 for (i = 0; i < SRCU_CALLBACK_BATCH; i++) {
601 head = rcu_batch_dequeue(&sp->batch_done);
602 if (!head)
603 break;
604 local_bh_disable();
605 head->func(head);
606 local_bh_enable();
607 }
608}
609
610/*
611 * Finished one round of SRCU grace period. Start another if there are
612 * more SRCU callbacks queued, otherwise put SRCU into not-running state.
613 */
614static void srcu_reschedule(struct srcu_struct *sp)
615{
616 bool pending = true;
617
618 if (rcu_batch_empty(&sp->batch_done) &&
619 rcu_batch_empty(&sp->batch_check1) &&
620 rcu_batch_empty(&sp->batch_check0) &&
621 rcu_batch_empty(&sp->batch_queue)) {
622 spin_lock_irq(&sp->queue_lock);
623 if (rcu_batch_empty(&sp->batch_done) &&
624 rcu_batch_empty(&sp->batch_check1) &&
625 rcu_batch_empty(&sp->batch_check0) &&
626 rcu_batch_empty(&sp->batch_queue)) {
627 sp->running = false;
628 pending = false;
629 }
630 spin_unlock_irq(&sp->queue_lock);
631 }
632
633 if (pending)
634 queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL);
635}
636
637/*
638 * This is the work-queue function that handles SRCU grace periods.
639 */
640static void process_srcu(struct work_struct *work)
641{
642 struct srcu_struct *sp;
643
644 sp = container_of(work, struct srcu_struct, work.work);
645
646 srcu_collect_new(sp);
647 srcu_advance_batches(sp, 1);
648 srcu_invoke_callbacks(sp);
649 srcu_reschedule(sp);
650}
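For illustration only: a hedged sketch of how a caller might use the call_srcu() interface added above, together with the existing SRCU read-side primitives. struct foo, foo_srcu and the helper names are invented for this example.

#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>

struct foo {
	int data;
	struct rcu_head rcu;
};

static struct srcu_struct foo_srcu;	/* init_srcu_struct(&foo_srcu) at init time */
static struct foo __rcu *global_foo;

static int read_foo(void)
{
	int idx, val = -1;
	struct foo *p;

	idx = srcu_read_lock(&foo_srcu);
	p = srcu_dereference(global_foo, &foo_srcu);
	if (p)
		val = p->data;
	srcu_read_unlock(&foo_srcu, idx);
	return val;
}

static void free_foo_cb(struct rcu_head *head)
{
	kfree(container_of(head, struct foo, rcu));
}

static void retire_foo(struct foo *old)
{
	/* Readers that entered their SRCU section before this call still
	 * see a valid *old; it is freed only after they have finished. */
	call_srcu(&foo_srcu, &old->rcu, free_foo_cb);
}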
diff --git a/kernel/sys.c b/kernel/sys.c
index f484077b6b14..6df42624e454 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1990,7 +1990,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1990 error = prctl_get_seccomp(); 1990 error = prctl_get_seccomp();
1991 break; 1991 break;
1992 case PR_SET_SECCOMP: 1992 case PR_SET_SECCOMP:
1993 error = prctl_set_seccomp(arg2); 1993 error = prctl_set_seccomp(arg2, (char __user *)arg3);
1994 break; 1994 break;
1995 case PR_GET_TSC: 1995 case PR_GET_TSC:
1996 error = GET_TSC_CTL(arg2); 1996 error = GET_TSC_CTL(arg2);
@@ -2061,6 +2061,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2061 error = put_user(me->signal->is_child_subreaper, 2061 error = put_user(me->signal->is_child_subreaper,
2062 (int __user *) arg2); 2062 (int __user *) arg2);
2063 break; 2063 break;
2064 case PR_SET_NO_NEW_PRIVS:
2065 if (arg2 != 1 || arg3 || arg4 || arg5)
2066 return -EINVAL;
2067
2068 current->no_new_privs = 1;
2069 break;
2070 case PR_GET_NO_NEW_PRIVS:
2071 if (arg2 || arg3 || arg4 || arg5)
2072 return -EINVAL;
2073 return current->no_new_privs ? 1 : 0;
2064 default: 2074 default:
2065 error = -EINVAL; 2075 error = -EINVAL;
2066 break; 2076 break;
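Not part of the diff: a small hypothetical userspace sketch of the two prctl options added above, assuming PR_SET_NO_NEW_PRIVS and PR_GET_NO_NEW_PRIVS are available from linux/prctl.h as introduced by this series. The argument layout mirrors the checks in the hunk: the setter takes 1 followed by three zeros, the getter takes four zeros and returns the current value.

#include <sys/prctl.h>

static int enter_no_new_privs(void)
{
	/* Once set, the flag cannot be cleared again for this task. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		return -1;
	return prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0) == 1 ? 0 : -1;
}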
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 52b3a06a02f8..4ab11879aeb4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -170,7 +170,7 @@ static int proc_taint(struct ctl_table *table, int write,
170#endif 170#endif
171 171
172#ifdef CONFIG_PRINTK 172#ifdef CONFIG_PRINTK
173static int proc_dmesg_restrict(struct ctl_table *table, int write, 173static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
174 void __user *buffer, size_t *lenp, loff_t *ppos); 174 void __user *buffer, size_t *lenp, loff_t *ppos);
175#endif 175#endif
176 176
@@ -703,7 +703,7 @@ static struct ctl_table kern_table[] = {
703 .data = &dmesg_restrict, 703 .data = &dmesg_restrict,
704 .maxlen = sizeof(int), 704 .maxlen = sizeof(int),
705 .mode = 0644, 705 .mode = 0644,
706 .proc_handler = proc_dointvec_minmax, 706 .proc_handler = proc_dointvec_minmax_sysadmin,
707 .extra1 = &zero, 707 .extra1 = &zero,
708 .extra2 = &one, 708 .extra2 = &one,
709 }, 709 },
@@ -712,7 +712,7 @@ static struct ctl_table kern_table[] = {
712 .data = &kptr_restrict, 712 .data = &kptr_restrict,
713 .maxlen = sizeof(int), 713 .maxlen = sizeof(int),
714 .mode = 0644, 714 .mode = 0644,
715 .proc_handler = proc_dmesg_restrict, 715 .proc_handler = proc_dointvec_minmax_sysadmin,
716 .extra1 = &zero, 716 .extra1 = &zero,
717 .extra2 = &two, 717 .extra2 = &two,
718 }, 718 },
@@ -1943,7 +1943,7 @@ static int proc_taint(struct ctl_table *table, int write,
1943} 1943}
1944 1944
1945#ifdef CONFIG_PRINTK 1945#ifdef CONFIG_PRINTK
1946static int proc_dmesg_restrict(struct ctl_table *table, int write, 1946static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
1947 void __user *buffer, size_t *lenp, loff_t *ppos) 1947 void __user *buffer, size_t *lenp, loff_t *ppos)
1948{ 1948{
1949 if (write && !capable(CAP_SYS_ADMIN)) 1949 if (write && !capable(CAP_SYS_ADMIN))
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 2cf9cc7aa103..a20dc8a3c949 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -1,6 +1,10 @@
1# 1#
2# Timer subsystem related configuration options 2# Timer subsystem related configuration options
3# 3#
4
5# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is
6# only related to the tick functionality. Oneshot clockevent devices
7# are supported independent of this.
4config TICK_ONESHOT 8config TICK_ONESHOT
5 bool 9 bool
6 10
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 8a538c55fc7b..aa27d391bfc8 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -59,7 +59,7 @@ static DEFINE_SPINLOCK(rtcdev_lock);
59 * If one has not already been chosen, it checks to see if a 59 * If one has not already been chosen, it checks to see if a
60 * functional rtc device is available. 60 * functional rtc device is available.
61 */ 61 */
62static struct rtc_device *alarmtimer_get_rtcdev(void) 62struct rtc_device *alarmtimer_get_rtcdev(void)
63{ 63{
64 unsigned long flags; 64 unsigned long flags;
65 struct rtc_device *ret; 65 struct rtc_device *ret;
@@ -115,7 +115,7 @@ static void alarmtimer_rtc_interface_remove(void)
115 class_interface_unregister(&alarmtimer_rtc_interface); 115 class_interface_unregister(&alarmtimer_rtc_interface);
116} 116}
117#else 117#else
118static inline struct rtc_device *alarmtimer_get_rtcdev(void) 118struct rtc_device *alarmtimer_get_rtcdev(void)
119{ 119{
120 return NULL; 120 return NULL;
121} 121}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index e883f57a3cd3..f113755695e2 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -346,7 +346,8 @@ int tick_resume_broadcast(void)
346 tick_get_broadcast_mask()); 346 tick_get_broadcast_mask());
347 break; 347 break;
348 case TICKDEV_MODE_ONESHOT: 348 case TICKDEV_MODE_ONESHOT:
349 broadcast = tick_resume_broadcast_oneshot(bc); 349 if (!cpumask_empty(tick_get_broadcast_mask()))
350 broadcast = tick_resume_broadcast_oneshot(bc);
350 break; 351 break;
351 } 352 }
352 } 353 }
@@ -373,6 +374,9 @@ static int tick_broadcast_set_event(ktime_t expires, int force)
373{ 374{
374 struct clock_event_device *bc = tick_broadcast_device.evtdev; 375 struct clock_event_device *bc = tick_broadcast_device.evtdev;
375 376
377 if (bc->mode != CLOCK_EVT_MODE_ONESHOT)
378 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
379
376 return clockevents_program_event(bc, expires, force); 380 return clockevents_program_event(bc, expires, force);
377} 381}
378 382
@@ -531,7 +535,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
531 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; 535 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
532 536
533 bc->event_handler = tick_handle_oneshot_broadcast; 537 bc->event_handler = tick_handle_oneshot_broadcast;
534 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
535 538
536 /* Take the do_timer update */ 539 /* Take the do_timer update */
537 tick_do_timer_cpu = cpu; 540 tick_do_timer_cpu = cpu;
@@ -549,6 +552,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
549 to_cpumask(tmpmask)); 552 to_cpumask(tmpmask));
550 553
551 if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { 554 if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) {
555 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
552 tick_broadcast_init_next_event(to_cpumask(tmpmask), 556 tick_broadcast_init_next_event(to_cpumask(tmpmask),
553 tick_next_period); 557 tick_next_period);
554 tick_broadcast_set_event(tick_next_period, 1); 558 tick_broadcast_set_event(tick_next_period, 1);
@@ -575,15 +579,12 @@ void tick_broadcast_switch_to_oneshot(void)
575 unsigned long flags; 579 unsigned long flags;
576 580
577 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 581 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
578 if (cpumask_empty(tick_get_broadcast_mask()))
579 goto end;
580 582
581 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; 583 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
582 bc = tick_broadcast_device.evtdev; 584 bc = tick_broadcast_device.evtdev;
583 if (bc) 585 if (bc)
584 tick_broadcast_setup_oneshot(bc); 586 tick_broadcast_setup_oneshot(bc);
585 587
586end:
587 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 588 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
588} 589}
589 590
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3526038f2836..6a3a5b9ff561 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -534,9 +534,9 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
534 hrtimer_get_expires(&ts->sched_timer), 0)) 534 hrtimer_get_expires(&ts->sched_timer), 0))
535 break; 535 break;
536 } 536 }
537 /* Update jiffies and reread time */ 537 /* Reread time and update jiffies */
538 tick_do_update_jiffies64(now);
539 now = ktime_get(); 538 now = ktime_get();
539 tick_do_update_jiffies64(now);
540 } 540 }
541} 541}
542 542
diff --git a/kernel/timer.c b/kernel/timer.c
index 67316cb6a777..6ec7e7e0db43 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -861,7 +861,13 @@ EXPORT_SYMBOL(mod_timer);
861 * 861 *
862 * mod_timer_pinned() is a way to update the expire field of an 862 * mod_timer_pinned() is a way to update the expire field of an
863 * active timer (if the timer is inactive it will be activated) 863 * active timer (if the timer is inactive it will be activated)
864 * and not allow the timer to be migrated to a different CPU. 864 * and to ensure that the timer is scheduled on the current CPU.
865 *
866 * Note that this does not prevent the timer from being migrated
867 * when the current CPU goes offline. If this is a problem for
868 * you, use CPU-hotplug notifiers to handle it correctly, for
869 * example, cancelling the timer when the corresponding CPU goes
870 * offline.
865 * 871 *
866 * mod_timer_pinned(timer, expires) is equivalent to: 872 * mod_timer_pinned(timer, expires) is equivalent to:
867 * 873 *
@@ -1102,7 +1108,9 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1102 * warnings as well as problems when looking into 1108 * warnings as well as problems when looking into
1103 * timer->lockdep_map, make a copy and use that here. 1109 * timer->lockdep_map, make a copy and use that here.
1104 */ 1110 */
1105 struct lockdep_map lockdep_map = timer->lockdep_map; 1111 struct lockdep_map lockdep_map;
1112
1113 lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
1106#endif 1114#endif
1107 /* 1115 /*
1108 * Couple the lock chain with the lock chain at 1116 * Couple the lock chain with the lock chain at
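To illustrate the advice in the mod_timer_pinned() comment above, here is a hedged sketch of pairing a pinned per-CPU timer with a CPU-hotplug notifier that cancels it before the CPU goes offline. my_timer and the notifier names are invented; the notifier interface shown is the one in use at the time of this patch.

#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/percpu.h>
#include <linux/timer.h>

static DEFINE_PER_CPU(struct timer_list, my_timer);

static int my_timer_cpu_callback(struct notifier_block *nb,
				 unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DOWN_PREPARE:
		/* The timer was pinned to this cpu; cancel it before the cpu goes away. */
		del_timer_sync(&per_cpu(my_timer, cpu));
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_timer_cpu_nb = {
	.notifier_call = my_timer_cpu_callback,
};
/* register_cpu_notifier(&my_timer_cpu_nb) during initialization. */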
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a1d2849f2473..f347ac91292d 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -141,7 +141,6 @@ if FTRACE
141config FUNCTION_TRACER 141config FUNCTION_TRACER
142 bool "Kernel Function Tracer" 142 bool "Kernel Function Tracer"
143 depends on HAVE_FUNCTION_TRACER 143 depends on HAVE_FUNCTION_TRACER
144 select FRAME_POINTER if !ARM_UNWIND && !PPC && !S390 && !MICROBLAZE
145 select KALLSYMS 144 select KALLSYMS
146 select GENERIC_TRACER 145 select GENERIC_TRACER
147 select CONTEXT_SWITCH_TRACER 146 select CONTEXT_SWITCH_TRACER
@@ -272,7 +271,7 @@ config PROFILE_ANNOTATED_BRANCHES
272 bool "Trace likely/unlikely profiler" 271 bool "Trace likely/unlikely profiler"
273 select TRACE_BRANCH_PROFILING 272 select TRACE_BRANCH_PROFILING
274 help 273 help
275 This tracer profiles all the the likely and unlikely macros 274 This tracer profiles all likely and unlikely macros
276 in the kernel. It will display the results in: 275 in the kernel. It will display the results in:
277 276
278 /sys/kernel/debug/tracing/trace_stat/branch_annotated 277 /sys/kernel/debug/tracing/trace_stat/branch_annotated
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 5f39a07fe5ea..b3afe0e76f79 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,7 +41,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o
41obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o 41obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
45obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 44obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
46ifeq ($(CONFIG_BLOCK),y) 45ifeq ($(CONFIG_BLOCK),y)
47obj-$(CONFIG_EVENT_TRACING) += blktrace.o 46obj-$(CONFIG_EVENT_TRACING) += blktrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index cdea7b56b0c9..c0bd0308741c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -311,13 +311,6 @@ int blk_trace_remove(struct request_queue *q)
311} 311}
312EXPORT_SYMBOL_GPL(blk_trace_remove); 312EXPORT_SYMBOL_GPL(blk_trace_remove);
313 313
314static int blk_dropped_open(struct inode *inode, struct file *filp)
315{
316 filp->private_data = inode->i_private;
317
318 return 0;
319}
320
321static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, 314static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
322 size_t count, loff_t *ppos) 315 size_t count, loff_t *ppos)
323{ 316{
@@ -331,18 +324,11 @@ static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
331 324
332static const struct file_operations blk_dropped_fops = { 325static const struct file_operations blk_dropped_fops = {
333 .owner = THIS_MODULE, 326 .owner = THIS_MODULE,
334 .open = blk_dropped_open, 327 .open = simple_open,
335 .read = blk_dropped_read, 328 .read = blk_dropped_read,
336 .llseek = default_llseek, 329 .llseek = default_llseek,
337}; 330};
338 331
339static int blk_msg_open(struct inode *inode, struct file *filp)
340{
341 filp->private_data = inode->i_private;
342
343 return 0;
344}
345
346static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, 332static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
347 size_t count, loff_t *ppos) 333 size_t count, loff_t *ppos)
348{ 334{
@@ -371,7 +357,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
371 357
372static const struct file_operations blk_msg_fops = { 358static const struct file_operations blk_msg_fops = {
373 .owner = THIS_MODULE, 359 .owner = THIS_MODULE,
374 .open = blk_msg_open, 360 .open = simple_open,
375 .write = blk_msg_write, 361 .write = blk_msg_write,
376 .llseek = noop_llseek, 362 .llseek = noop_llseek,
377}; 363};
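
Both of the deleted open handlers did nothing but copy inode->i_private into filp->private_data, which is exactly the job of the generic simple_open() helper, so they collapse into a library call. For reference, simple_open() in fs/libfs.c is essentially the following (quoted from memory, so treat it as a sketch rather than the authoritative source):

int simple_open(struct inode *inode, struct file *file)
{
	if (inode->i_private)
		file->private_data = inode->i_private;
	return 0;
}
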
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0fa92f677c92..a008663d86c8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1383,44 +1383,73 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1383 1383
1384static int ftrace_cmp_recs(const void *a, const void *b) 1384static int ftrace_cmp_recs(const void *a, const void *b)
1385{ 1385{
1386 const struct dyn_ftrace *reca = a; 1386 const struct dyn_ftrace *key = a;
1387 const struct dyn_ftrace *recb = b; 1387 const struct dyn_ftrace *rec = b;
1388 1388
1389 if (reca->ip > recb->ip) 1389 if (key->flags < rec->ip)
1390 return 1;
1391 if (reca->ip < recb->ip)
1392 return -1; 1390 return -1;
1391 if (key->ip >= rec->ip + MCOUNT_INSN_SIZE)
1392 return 1;
1393 return 0; 1393 return 0;
1394} 1394}
1395 1395
1396/** 1396static unsigned long ftrace_location_range(unsigned long start, unsigned long end)
1397 * ftrace_location - return true if the ip giving is a traced location
1398 * @ip: the instruction pointer to check
1399 *
1400 * Returns 1 if @ip given is a pointer to a ftrace location.
1401 * That is, the instruction that is either a NOP or call to
1402 * the function tracer. It checks the ftrace internal tables to
1403 * determine if the address belongs or not.
1404 */
1405int ftrace_location(unsigned long ip)
1406{ 1397{
1407 struct ftrace_page *pg; 1398 struct ftrace_page *pg;
1408 struct dyn_ftrace *rec; 1399 struct dyn_ftrace *rec;
1409 struct dyn_ftrace key; 1400 struct dyn_ftrace key;
1410 1401
1411 key.ip = ip; 1402 key.ip = start;
1403 key.flags = end; /* overload flags, as it is unsigned long */
1412 1404
1413 for (pg = ftrace_pages_start; pg; pg = pg->next) { 1405 for (pg = ftrace_pages_start; pg; pg = pg->next) {
1406 if (end < pg->records[0].ip ||
1407 start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
1408 continue;
1414 rec = bsearch(&key, pg->records, pg->index, 1409 rec = bsearch(&key, pg->records, pg->index,
1415 sizeof(struct dyn_ftrace), 1410 sizeof(struct dyn_ftrace),
1416 ftrace_cmp_recs); 1411 ftrace_cmp_recs);
1417 if (rec) 1412 if (rec)
1418 return 1; 1413 return rec->ip;
1419 } 1414 }
1420 1415
1421 return 0; 1416 return 0;
1422} 1417}
1423 1418
1419/**
1420 * ftrace_location - return true if the ip giving is a traced location
1421 * @ip: the instruction pointer to check
1422 *
1423 * Returns rec->ip if @ip given is a pointer to a ftrace location.
1424 * That is, the instruction that is either a NOP or call to
1425 * the function tracer. It checks the ftrace internal tables to
1426 * determine if the address belongs or not.
1427 */
1428unsigned long ftrace_location(unsigned long ip)
1429{
1430 return ftrace_location_range(ip, ip);
1431}
1432
1433/**
1434 * ftrace_text_reserved - return true if range contains an ftrace location
1435 * @start: start of range to search
1436 * @end: end of range to search (inclusive). @end points to the last byte to check.
1437 *
1438 * Returns 1 if @start and @end contains a ftrace location.
1439 * That is, the instruction that is either a NOP or call to
1440 * the function tracer. It checks the ftrace internal tables to
1441 * determine if the address belongs or not.
1442 */
1443int ftrace_text_reserved(void *start, void *end)
1444{
1445 unsigned long ret;
1446
1447 ret = ftrace_location_range((unsigned long)start,
1448 (unsigned long)end);
1449
1450 return (int)!!ret;
1451}
1452
1424static void __ftrace_hash_rec_update(struct ftrace_ops *ops, 1453static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1425 int filter_hash, 1454 int filter_hash,
1426 bool inc) 1455 bool inc)
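
The rewritten comparator no longer compares two records; it compares a search key, whose ip and flags fields are overloaded to carry the start and end of a range, against a record that covers [ip, ip + MCOUNT_INSN_SIZE). That lets one bsearch() serve both the single-address lookup in ftrace_location() and the range test in ftrace_text_reserved(). A small userspace sketch of the same overlap comparator (INSN_SIZE and the addresses are made up for illustration):

#include <stdio.h>
#include <stdlib.h>

#define INSN_SIZE 5UL		/* stand-in for MCOUNT_INSN_SIZE */

struct rec {
	unsigned long ip;
	unsigned long flags;	/* overloaded in the key to hold the range end */
};

/* first argument is the search key: ip = start of range, flags = end */
static int cmp_recs(const void *a, const void *b)
{
	const struct rec *key = a;
	const struct rec *rec = b;

	if (key->flags < rec->ip)			/* range ends before rec */
		return -1;
	if (key->ip >= rec->ip + INSN_SIZE)		/* range starts after rec */
		return 1;
	return 0;					/* overlap: found */
}

int main(void)
{
	/* records are kept sorted by ip, which is what makes bsearch() legal */
	struct rec recs[] = { { 0x100 }, { 0x140 }, { 0x200 } };
	struct rec key = { .ip = 0x142, .flags = 0x150 };
	struct rec *hit;

	hit = bsearch(&key, recs, sizeof(recs) / sizeof(recs[0]),
		      sizeof(recs[0]), cmp_recs);
	printf("hit: %#lx\n", hit ? hit->ip : 0UL);	/* hit: 0x140 */
	return 0;
}
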
@@ -1520,35 +1549,6 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
1520 __ftrace_hash_rec_update(ops, filter_hash, 1); 1549 __ftrace_hash_rec_update(ops, filter_hash, 1);
1521} 1550}
1522 1551
1523static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
1524{
1525 if (ftrace_pages->index == ftrace_pages->size) {
1526 /* We should have allocated enough */
1527 if (WARN_ON(!ftrace_pages->next))
1528 return NULL;
1529 ftrace_pages = ftrace_pages->next;
1530 }
1531
1532 return &ftrace_pages->records[ftrace_pages->index++];
1533}
1534
1535static struct dyn_ftrace *
1536ftrace_record_ip(unsigned long ip)
1537{
1538 struct dyn_ftrace *rec;
1539
1540 if (ftrace_disabled)
1541 return NULL;
1542
1543 rec = ftrace_alloc_dyn_node(ip);
1544 if (!rec)
1545 return NULL;
1546
1547 rec->ip = ip;
1548
1549 return rec;
1550}
1551
1552static void print_ip_ins(const char *fmt, unsigned char *p) 1552static void print_ip_ins(const char *fmt, unsigned char *p)
1553{ 1553{
1554 int i; 1554 int i;
@@ -1598,21 +1598,6 @@ void ftrace_bug(int failed, unsigned long ip)
1598 } 1598 }
1599} 1599}
1600 1600
1601
1602/* Return 1 if the address range is reserved for ftrace */
1603int ftrace_text_reserved(void *start, void *end)
1604{
1605 struct dyn_ftrace *rec;
1606 struct ftrace_page *pg;
1607
1608 do_for_each_ftrace_rec(pg, rec) {
1609 if (rec->ip <= (unsigned long)end &&
1610 rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start)
1611 return 1;
1612 } while_for_each_ftrace_rec();
1613 return 0;
1614}
1615
1616static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) 1601static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1617{ 1602{
1618 unsigned long flag = 0UL; 1603 unsigned long flag = 0UL;
@@ -1698,7 +1683,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1698 return -1; /* unknow ftrace bug */ 1683 return -1; /* unknow ftrace bug */
1699} 1684}
1700 1685
1701static void ftrace_replace_code(int update) 1686void __weak ftrace_replace_code(int enable)
1702{ 1687{
1703 struct dyn_ftrace *rec; 1688 struct dyn_ftrace *rec;
1704 struct ftrace_page *pg; 1689 struct ftrace_page *pg;
@@ -1708,7 +1693,7 @@ static void ftrace_replace_code(int update)
1708 return; 1693 return;
1709 1694
1710 do_for_each_ftrace_rec(pg, rec) { 1695 do_for_each_ftrace_rec(pg, rec) {
1711 failed = __ftrace_replace_code(rec, update); 1696 failed = __ftrace_replace_code(rec, enable);
1712 if (failed) { 1697 if (failed) {
1713 ftrace_bug(failed, rec->ip); 1698 ftrace_bug(failed, rec->ip);
1714 /* Stop processing */ 1699 /* Stop processing */
@@ -1826,22 +1811,27 @@ int __weak ftrace_arch_code_modify_post_process(void)
1826 return 0; 1811 return 0;
1827} 1812}
1828 1813
1829static int __ftrace_modify_code(void *data) 1814void ftrace_modify_all_code(int command)
1830{ 1815{
1831 int *command = data; 1816 if (command & FTRACE_UPDATE_CALLS)
1832
1833 if (*command & FTRACE_UPDATE_CALLS)
1834 ftrace_replace_code(1); 1817 ftrace_replace_code(1);
1835 else if (*command & FTRACE_DISABLE_CALLS) 1818 else if (command & FTRACE_DISABLE_CALLS)
1836 ftrace_replace_code(0); 1819 ftrace_replace_code(0);
1837 1820
1838 if (*command & FTRACE_UPDATE_TRACE_FUNC) 1821 if (command & FTRACE_UPDATE_TRACE_FUNC)
1839 ftrace_update_ftrace_func(ftrace_trace_function); 1822 ftrace_update_ftrace_func(ftrace_trace_function);
1840 1823
1841 if (*command & FTRACE_START_FUNC_RET) 1824 if (command & FTRACE_START_FUNC_RET)
1842 ftrace_enable_ftrace_graph_caller(); 1825 ftrace_enable_ftrace_graph_caller();
1843 else if (*command & FTRACE_STOP_FUNC_RET) 1826 else if (command & FTRACE_STOP_FUNC_RET)
1844 ftrace_disable_ftrace_graph_caller(); 1827 ftrace_disable_ftrace_graph_caller();
1828}
1829
1830static int __ftrace_modify_code(void *data)
1831{
1832 int *command = data;
1833
1834 ftrace_modify_all_code(*command);
1845 1835
1846 return 0; 1836 return 0;
1847} 1837}
@@ -2469,57 +2459,35 @@ static int
2469ftrace_avail_open(struct inode *inode, struct file *file) 2459ftrace_avail_open(struct inode *inode, struct file *file)
2470{ 2460{
2471 struct ftrace_iterator *iter; 2461 struct ftrace_iterator *iter;
2472 int ret;
2473 2462
2474 if (unlikely(ftrace_disabled)) 2463 if (unlikely(ftrace_disabled))
2475 return -ENODEV; 2464 return -ENODEV;
2476 2465
2477 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2466 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
2478 if (!iter) 2467 if (iter) {
2479 return -ENOMEM; 2468 iter->pg = ftrace_pages_start;
2480 2469 iter->ops = &global_ops;
2481 iter->pg = ftrace_pages_start;
2482 iter->ops = &global_ops;
2483
2484 ret = seq_open(file, &show_ftrace_seq_ops);
2485 if (!ret) {
2486 struct seq_file *m = file->private_data;
2487
2488 m->private = iter;
2489 } else {
2490 kfree(iter);
2491 } 2470 }
2492 2471
2493 return ret; 2472 return iter ? 0 : -ENOMEM;
2494} 2473}
2495 2474
2496static int 2475static int
2497ftrace_enabled_open(struct inode *inode, struct file *file) 2476ftrace_enabled_open(struct inode *inode, struct file *file)
2498{ 2477{
2499 struct ftrace_iterator *iter; 2478 struct ftrace_iterator *iter;
2500 int ret;
2501 2479
2502 if (unlikely(ftrace_disabled)) 2480 if (unlikely(ftrace_disabled))
2503 return -ENODEV; 2481 return -ENODEV;
2504 2482
2505 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2483 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
2506 if (!iter) 2484 if (iter) {
2507 return -ENOMEM; 2485 iter->pg = ftrace_pages_start;
2508 2486 iter->flags = FTRACE_ITER_ENABLED;
2509 iter->pg = ftrace_pages_start; 2487 iter->ops = &global_ops;
2510 iter->flags = FTRACE_ITER_ENABLED;
2511 iter->ops = &global_ops;
2512
2513 ret = seq_open(file, &show_ftrace_seq_ops);
2514 if (!ret) {
2515 struct seq_file *m = file->private_data;
2516
2517 m->private = iter;
2518 } else {
2519 kfree(iter);
2520 } 2488 }
2521 2489
2522 return ret; 2490 return iter ? 0 : -ENOMEM;
2523} 2491}
2524 2492
2525static void ftrace_filter_reset(struct ftrace_hash *hash) 2493static void ftrace_filter_reset(struct ftrace_hash *hash)
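
Both ->open() methods shrink because __seq_open_private() bundles the kzalloc(), the seq_open() call and the m->private assignment, returning the zeroed private area or NULL. A typical usage sketch; struct my_iter and my_seq_ops are placeholders, not symbols from this file:

#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>

struct my_iter {
	loff_t pos;
};

extern const struct seq_operations my_seq_ops;	/* assumed to exist elsewhere */

static int example_open(struct inode *inode, struct file *file)
{
	struct my_iter *iter;

	/* allocates a zeroed iter, calls seq_open() and stores the pointer
	 * in ((struct seq_file *)file->private_data)->private */
	iter = __seq_open_private(file, &my_seq_ops, sizeof(*iter));
	if (!iter)
		return -ENOMEM;

	iter->pos = 0;		/* any extra per-open initialization */
	return 0;
}
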
@@ -3688,22 +3656,36 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
3688 return 0; 3656 return 0;
3689} 3657}
3690 3658
3691static void ftrace_swap_recs(void *a, void *b, int size) 3659static int ftrace_cmp_ips(const void *a, const void *b)
3660{
3661 const unsigned long *ipa = a;
3662 const unsigned long *ipb = b;
3663
3664 if (*ipa > *ipb)
3665 return 1;
3666 if (*ipa < *ipb)
3667 return -1;
3668 return 0;
3669}
3670
3671static void ftrace_swap_ips(void *a, void *b, int size)
3692{ 3672{
3693 struct dyn_ftrace *reca = a; 3673 unsigned long *ipa = a;
3694 struct dyn_ftrace *recb = b; 3674 unsigned long *ipb = b;
3695 struct dyn_ftrace t; 3675 unsigned long t;
3696 3676
3697 t = *reca; 3677 t = *ipa;
3698 *reca = *recb; 3678 *ipa = *ipb;
3699 *recb = t; 3679 *ipb = t;
3700} 3680}
3701 3681
3702static int ftrace_process_locs(struct module *mod, 3682static int ftrace_process_locs(struct module *mod,
3703 unsigned long *start, 3683 unsigned long *start,
3704 unsigned long *end) 3684 unsigned long *end)
3705{ 3685{
3686 struct ftrace_page *start_pg;
3706 struct ftrace_page *pg; 3687 struct ftrace_page *pg;
3688 struct dyn_ftrace *rec;
3707 unsigned long count; 3689 unsigned long count;
3708 unsigned long *p; 3690 unsigned long *p;
3709 unsigned long addr; 3691 unsigned long addr;
@@ -3715,8 +3697,11 @@ static int ftrace_process_locs(struct module *mod,
3715 if (!count) 3697 if (!count)
3716 return 0; 3698 return 0;
3717 3699
3718 pg = ftrace_allocate_pages(count); 3700 sort(start, count, sizeof(*start),
3719 if (!pg) 3701 ftrace_cmp_ips, ftrace_swap_ips);
3702
3703 start_pg = ftrace_allocate_pages(count);
3704 if (!start_pg)
3720 return -ENOMEM; 3705 return -ENOMEM;
3721 3706
3722 mutex_lock(&ftrace_lock); 3707 mutex_lock(&ftrace_lock);
@@ -3729,7 +3714,7 @@ static int ftrace_process_locs(struct module *mod,
3729 if (!mod) { 3714 if (!mod) {
3730 WARN_ON(ftrace_pages || ftrace_pages_start); 3715 WARN_ON(ftrace_pages || ftrace_pages_start);
3731 /* First initialization */ 3716 /* First initialization */
3732 ftrace_pages = ftrace_pages_start = pg; 3717 ftrace_pages = ftrace_pages_start = start_pg;
3733 } else { 3718 } else {
3734 if (!ftrace_pages) 3719 if (!ftrace_pages)
3735 goto out; 3720 goto out;
@@ -3740,11 +3725,11 @@ static int ftrace_process_locs(struct module *mod,
3740 ftrace_pages = ftrace_pages->next; 3725 ftrace_pages = ftrace_pages->next;
3741 } 3726 }
3742 3727
3743 ftrace_pages->next = pg; 3728 ftrace_pages->next = start_pg;
3744 ftrace_pages = pg;
3745 } 3729 }
3746 3730
3747 p = start; 3731 p = start;
3732 pg = start_pg;
3748 while (p < end) { 3733 while (p < end) {
3749 addr = ftrace_call_adjust(*p++); 3734 addr = ftrace_call_adjust(*p++);
3750 /* 3735 /*
@@ -3755,17 +3740,26 @@ static int ftrace_process_locs(struct module *mod,
3755 */ 3740 */
3756 if (!addr) 3741 if (!addr)
3757 continue; 3742 continue;
3758 if (!ftrace_record_ip(addr)) 3743
3759 break; 3744 if (pg->index == pg->size) {
3745 /* We should have allocated enough */
3746 if (WARN_ON(!pg->next))
3747 break;
3748 pg = pg->next;
3749 }
3750
3751 rec = &pg->records[pg->index++];
3752 rec->ip = addr;
3760 } 3753 }
3761 3754
3762 /* These new locations need to be initialized */ 3755 /* We should have used all pages */
3763 ftrace_new_pgs = pg; 3756 WARN_ON(pg->next);
3757
3758 /* Assign the last page to ftrace_pages */
3759 ftrace_pages = pg;
3764 3760
3765 /* Make each individual set of pages sorted by ips */ 3761 /* These new locations need to be initialized */
3766 for (; pg; pg = pg->next) 3762 ftrace_new_pgs = start_pg;
3767 sort(pg->records, pg->index, sizeof(struct dyn_ftrace),
3768 ftrace_cmp_recs, ftrace_swap_recs);
3769 3763
3770 /* 3764 /*
3771 * We only need to disable interrupts on start up 3765 * We only need to disable interrupts on start up
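
Instead of sorting each record page after the fact, the mcount location array is now sorted once up front, so the pages can simply be filled in order and remain sorted by construction. A userspace sketch of that reordering with qsort() standing in for the kernel's sort():

#include <stdio.h>
#include <stdlib.h>

static int cmp_ips(const void *a, const void *b)
{
	unsigned long ipa = *(const unsigned long *)a;
	unsigned long ipb = *(const unsigned long *)b;

	if (ipa > ipb)
		return 1;
	if (ipa < ipb)
		return -1;
	return 0;
}

int main(void)
{
	/* stand-in for the __mcount_loc section contents */
	unsigned long ips[] = { 0x400, 0x100, 0x300, 0x200 };
	size_t count = sizeof(ips) / sizeof(ips[0]);
	size_t i;

	/* sort the raw address list once, up front ... */
	qsort(ips, count, sizeof(ips[0]), cmp_ips);

	/* ... then the records can be filled sequentially, and every page of
	 * records comes out sorted without a per-page sort pass afterwards */
	for (i = 0; i < count; i++)
		printf("rec[%zu].ip = %#lx\n", i, ips[i]);
	return 0;
}
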
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index cf8d11e91efd..6420cda62336 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -23,6 +23,8 @@
23#include <asm/local.h> 23#include <asm/local.h>
24#include "trace.h" 24#include "trace.h"
25 25
26static void update_pages_handler(struct work_struct *work);
27
26/* 28/*
27 * The ring buffer header is special. We must manually up keep it. 29 * The ring buffer header is special. We must manually up keep it.
28 */ 30 */
@@ -449,6 +451,7 @@ struct ring_buffer_per_cpu {
449 raw_spinlock_t reader_lock; /* serialize readers */ 451 raw_spinlock_t reader_lock; /* serialize readers */
450 arch_spinlock_t lock; 452 arch_spinlock_t lock;
451 struct lock_class_key lock_key; 453 struct lock_class_key lock_key;
454 unsigned int nr_pages;
452 struct list_head *pages; 455 struct list_head *pages;
453 struct buffer_page *head_page; /* read from head */ 456 struct buffer_page *head_page; /* read from head */
454 struct buffer_page *tail_page; /* write to tail */ 457 struct buffer_page *tail_page; /* write to tail */
@@ -466,13 +469,18 @@ struct ring_buffer_per_cpu {
466 unsigned long read_bytes; 469 unsigned long read_bytes;
467 u64 write_stamp; 470 u64 write_stamp;
468 u64 read_stamp; 471 u64 read_stamp;
472 /* ring buffer pages to update, > 0 to add, < 0 to remove */
473 int nr_pages_to_update;
474 struct list_head new_pages; /* new pages to add */
475 struct work_struct update_pages_work;
476 struct completion update_done;
469}; 477};
470 478
471struct ring_buffer { 479struct ring_buffer {
472 unsigned pages;
473 unsigned flags; 480 unsigned flags;
474 int cpus; 481 int cpus;
475 atomic_t record_disabled; 482 atomic_t record_disabled;
483 atomic_t resize_disabled;
476 cpumask_var_t cpumask; 484 cpumask_var_t cpumask;
477 485
478 struct lock_class_key *reader_lock_key; 486 struct lock_class_key *reader_lock_key;
@@ -937,6 +945,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
937 struct list_head *head = cpu_buffer->pages; 945 struct list_head *head = cpu_buffer->pages;
938 struct buffer_page *bpage, *tmp; 946 struct buffer_page *bpage, *tmp;
939 947
948 /* Reset the head page if it exists */
949 if (cpu_buffer->head_page)
950 rb_set_head_page(cpu_buffer);
951
940 rb_head_page_deactivate(cpu_buffer); 952 rb_head_page_deactivate(cpu_buffer);
941 953
942 if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) 954 if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
@@ -963,14 +975,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
963 return 0; 975 return 0;
964} 976}
965 977
966static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, 978static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu)
967 unsigned nr_pages)
968{ 979{
980 int i;
969 struct buffer_page *bpage, *tmp; 981 struct buffer_page *bpage, *tmp;
970 LIST_HEAD(pages);
971 unsigned i;
972
973 WARN_ON(!nr_pages);
974 982
975 for (i = 0; i < nr_pages; i++) { 983 for (i = 0; i < nr_pages; i++) {
976 struct page *page; 984 struct page *page;
@@ -981,15 +989,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
981 */ 989 */
982 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 990 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
983 GFP_KERNEL | __GFP_NORETRY, 991 GFP_KERNEL | __GFP_NORETRY,
984 cpu_to_node(cpu_buffer->cpu)); 992 cpu_to_node(cpu));
985 if (!bpage) 993 if (!bpage)
986 goto free_pages; 994 goto free_pages;
987 995
988 rb_check_bpage(cpu_buffer, bpage); 996 list_add(&bpage->list, pages);
989 997
990 list_add(&bpage->list, &pages); 998 page = alloc_pages_node(cpu_to_node(cpu),
991
992 page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
993 GFP_KERNEL | __GFP_NORETRY, 0); 999 GFP_KERNEL | __GFP_NORETRY, 0);
994 if (!page) 1000 if (!page)
995 goto free_pages; 1001 goto free_pages;
@@ -997,6 +1003,27 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
997 rb_init_page(bpage->page); 1003 rb_init_page(bpage->page);
998 } 1004 }
999 1005
1006 return 0;
1007
1008free_pages:
1009 list_for_each_entry_safe(bpage, tmp, pages, list) {
1010 list_del_init(&bpage->list);
1011 free_buffer_page(bpage);
1012 }
1013
1014 return -ENOMEM;
1015}
1016
1017static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
1018 unsigned nr_pages)
1019{
1020 LIST_HEAD(pages);
1021
1022 WARN_ON(!nr_pages);
1023
1024 if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu))
1025 return -ENOMEM;
1026
1000 /* 1027 /*
1001 * The ring buffer page list is a circular list that does not 1028 * The ring buffer page list is a circular list that does not
1002 * start and end with a list head. All page list items point to 1029 * start and end with a list head. All page list items point to
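
The allocation loop moves into __rb_allocate_pages(), which builds the requested pages on a caller-supplied list and unwinds everything it allocated on failure; the resize path reuses it to pre-allocate pages before committing to a size change. A simplified userspace version of that allocate-onto-a-private-list-or-clean-up pattern:

#include <stdio.h>
#include <stdlib.h>

struct bpage {
	struct bpage *next;
	void *data;
};

/*
 * Allocate nr pages onto a private list. On failure, free everything that
 * was built here and leave the caller's existing buffer untouched, which
 * mirrors how __rb_allocate_pages() lets the resize path pre-allocate
 * before committing to anything.
 */
static int allocate_pages(int nr, struct bpage **head)
{
	struct bpage *list = NULL, *bp;
	int i;

	for (i = 0; i < nr; i++) {
		bp = calloc(1, sizeof(*bp));
		if (!bp)
			goto free_pages;
		bp->data = malloc(4096);
		if (!bp->data) {
			free(bp);
			goto free_pages;
		}
		bp->next = list;
		list = bp;
	}
	*head = list;
	return 0;

free_pages:
	while (list) {
		bp = list;
		list = list->next;
		free(bp->data);
		free(bp);
	}
	return -1;	/* the kernel version returns -ENOMEM */
}

int main(void)
{
	struct bpage *pages = NULL, *bp;

	if (allocate_pages(4, &pages))
		return 1;
	printf("allocated 4 pages onto a private list\n");

	while (pages) {
		bp = pages;
		pages = pages->next;
		free(bp->data);
		free(bp);
	}
	return 0;
}
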
@@ -1005,20 +1032,15 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
1005 cpu_buffer->pages = pages.next; 1032 cpu_buffer->pages = pages.next;
1006 list_del(&pages); 1033 list_del(&pages);
1007 1034
1035 cpu_buffer->nr_pages = nr_pages;
1036
1008 rb_check_pages(cpu_buffer); 1037 rb_check_pages(cpu_buffer);
1009 1038
1010 return 0; 1039 return 0;
1011
1012 free_pages:
1013 list_for_each_entry_safe(bpage, tmp, &pages, list) {
1014 list_del_init(&bpage->list);
1015 free_buffer_page(bpage);
1016 }
1017 return -ENOMEM;
1018} 1040}
1019 1041
1020static struct ring_buffer_per_cpu * 1042static struct ring_buffer_per_cpu *
1021rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) 1043rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
1022{ 1044{
1023 struct ring_buffer_per_cpu *cpu_buffer; 1045 struct ring_buffer_per_cpu *cpu_buffer;
1024 struct buffer_page *bpage; 1046 struct buffer_page *bpage;
@@ -1035,6 +1057,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1035 raw_spin_lock_init(&cpu_buffer->reader_lock); 1057 raw_spin_lock_init(&cpu_buffer->reader_lock);
1036 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 1058 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
1037 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 1059 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1060 INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);
1061 init_completion(&cpu_buffer->update_done);
1038 1062
1039 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1063 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1040 GFP_KERNEL, cpu_to_node(cpu)); 1064 GFP_KERNEL, cpu_to_node(cpu));
@@ -1052,7 +1076,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1052 1076
1053 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 1077 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
1054 1078
1055 ret = rb_allocate_pages(cpu_buffer, buffer->pages); 1079 ret = rb_allocate_pages(cpu_buffer, nr_pages);
1056 if (ret < 0) 1080 if (ret < 0)
1057 goto fail_free_reader; 1081 goto fail_free_reader;
1058 1082
@@ -1113,7 +1137,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1113{ 1137{
1114 struct ring_buffer *buffer; 1138 struct ring_buffer *buffer;
1115 int bsize; 1139 int bsize;
1116 int cpu; 1140 int cpu, nr_pages;
1117 1141
1118 /* keep it in its own cache line */ 1142 /* keep it in its own cache line */
1119 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), 1143 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
@@ -1124,14 +1148,14 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1124 if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) 1148 if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
1125 goto fail_free_buffer; 1149 goto fail_free_buffer;
1126 1150
1127 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 1151 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
1128 buffer->flags = flags; 1152 buffer->flags = flags;
1129 buffer->clock = trace_clock_local; 1153 buffer->clock = trace_clock_local;
1130 buffer->reader_lock_key = key; 1154 buffer->reader_lock_key = key;
1131 1155
1132 /* need at least two pages */ 1156 /* need at least two pages */
1133 if (buffer->pages < 2) 1157 if (nr_pages < 2)
1134 buffer->pages = 2; 1158 nr_pages = 2;
1135 1159
1136 /* 1160 /*
1137 * In case of non-hotplug cpu, if the ring-buffer is allocated 1161 * In case of non-hotplug cpu, if the ring-buffer is allocated
@@ -1154,7 +1178,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1154 1178
1155 for_each_buffer_cpu(buffer, cpu) { 1179 for_each_buffer_cpu(buffer, cpu) {
1156 buffer->buffers[cpu] = 1180 buffer->buffers[cpu] =
1157 rb_allocate_cpu_buffer(buffer, cpu); 1181 rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
1158 if (!buffer->buffers[cpu]) 1182 if (!buffer->buffers[cpu])
1159 goto fail_free_buffers; 1183 goto fail_free_buffers;
1160 } 1184 }
@@ -1222,58 +1246,222 @@ void ring_buffer_set_clock(struct ring_buffer *buffer,
1222 1246
1223static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); 1247static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
1224 1248
1225static void 1249static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1226rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1227{ 1250{
1228 struct buffer_page *bpage; 1251 return local_read(&bpage->entries) & RB_WRITE_MASK;
1229 struct list_head *p; 1252}
1230 unsigned i; 1253
1254static inline unsigned long rb_page_write(struct buffer_page *bpage)
1255{
1256 return local_read(&bpage->write) & RB_WRITE_MASK;
1257}
1258
1259static int
1260rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
1261{
1262 struct list_head *tail_page, *to_remove, *next_page;
1263 struct buffer_page *to_remove_page, *tmp_iter_page;
1264 struct buffer_page *last_page, *first_page;
1265 unsigned int nr_removed;
1266 unsigned long head_bit;
1267 int page_entries;
1268
1269 head_bit = 0;
1231 1270
1232 raw_spin_lock_irq(&cpu_buffer->reader_lock); 1271 raw_spin_lock_irq(&cpu_buffer->reader_lock);
1233 rb_head_page_deactivate(cpu_buffer); 1272 atomic_inc(&cpu_buffer->record_disabled);
1273 /*
1274 * We don't race with the readers since we have acquired the reader
1275 * lock. We also don't race with writers after disabling recording.
1276 * This makes it easy to figure out the first and the last page to be
1277 * removed from the list. We unlink all the pages in between including
1278 * the first and last pages. This is done in a busy loop so that we
1279 * lose the least number of traces.
1280 * The pages are freed after we restart recording and unlock readers.
1281 */
1282 tail_page = &cpu_buffer->tail_page->list;
1234 1283
1235 for (i = 0; i < nr_pages; i++) { 1284 /*
1236 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) 1285 * tail page might be on reader page, we remove the next page
1237 goto out; 1286 * from the ring buffer
1238 p = cpu_buffer->pages->next; 1287 */
1239 bpage = list_entry(p, struct buffer_page, list); 1288 if (cpu_buffer->tail_page == cpu_buffer->reader_page)
1240 list_del_init(&bpage->list); 1289 tail_page = rb_list_head(tail_page->next);
1241 free_buffer_page(bpage); 1290 to_remove = tail_page;
1291
1292 /* start of pages to remove */
1293 first_page = list_entry(rb_list_head(to_remove->next),
1294 struct buffer_page, list);
1295
1296 for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) {
1297 to_remove = rb_list_head(to_remove)->next;
1298 head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD;
1242 } 1299 }
1243 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1244 goto out;
1245 1300
1246 rb_reset_cpu(cpu_buffer); 1301 next_page = rb_list_head(to_remove)->next;
1247 rb_check_pages(cpu_buffer);
1248 1302
1249out: 1303 /*
1304 * Now we remove all pages between tail_page and next_page.
1305 * Make sure that we have head_bit value preserved for the
1306 * next page
1307 */
1308 tail_page->next = (struct list_head *)((unsigned long)next_page |
1309 head_bit);
1310 next_page = rb_list_head(next_page);
1311 next_page->prev = tail_page;
1312
1313 /* make sure pages points to a valid page in the ring buffer */
1314 cpu_buffer->pages = next_page;
1315
1316 /* update head page */
1317 if (head_bit)
1318 cpu_buffer->head_page = list_entry(next_page,
1319 struct buffer_page, list);
1320
1321 /*
1322 * change read pointer to make sure any read iterators reset
1323 * themselves
1324 */
1325 cpu_buffer->read = 0;
1326
1327 /* pages are removed, resume tracing and then free the pages */
1328 atomic_dec(&cpu_buffer->record_disabled);
1250 raw_spin_unlock_irq(&cpu_buffer->reader_lock); 1329 raw_spin_unlock_irq(&cpu_buffer->reader_lock);
1330
1331 RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages));
1332
1333 /* last buffer page to remove */
1334 last_page = list_entry(rb_list_head(to_remove), struct buffer_page,
1335 list);
1336 tmp_iter_page = first_page;
1337
1338 do {
1339 to_remove_page = tmp_iter_page;
1340 rb_inc_page(cpu_buffer, &tmp_iter_page);
1341
1342 /* update the counters */
1343 page_entries = rb_page_entries(to_remove_page);
1344 if (page_entries) {
1345 /*
1346 * If something was added to this page, it was full
1347 * since it is not the tail page. So we deduct the
1348 * bytes consumed in ring buffer from here.
1349 * No need to update overruns, since this page is
1350 * deleted from ring buffer and its entries are
1351 * already accounted for.
1352 */
1353 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
1354 }
1355
1356 /*
1357 * We have already removed references to this list item, just
1358 * free up the buffer_page and its page
1359 */
1360 free_buffer_page(to_remove_page);
1361 nr_removed--;
1362
1363 } while (to_remove_page != last_page);
1364
1365 RB_WARN_ON(cpu_buffer, nr_removed);
1366
1367 return nr_removed == 0;
1251} 1368}
1252 1369
1253static void 1370static int
1254rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, 1371rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
1255 struct list_head *pages, unsigned nr_pages)
1256{ 1372{
1257 struct buffer_page *bpage; 1373 struct list_head *pages = &cpu_buffer->new_pages;
1258 struct list_head *p; 1374 int retries, success;
1259 unsigned i;
1260 1375
1261 raw_spin_lock_irq(&cpu_buffer->reader_lock); 1376 raw_spin_lock_irq(&cpu_buffer->reader_lock);
1262 rb_head_page_deactivate(cpu_buffer); 1377 /*
1378 * We are holding the reader lock, so the reader page won't be swapped
1379 * in the ring buffer. Now we are racing with the writer trying to
1380 * move head page and the tail page.
1381 * We are going to adapt the reader page update process where:
1382 * 1. We first splice the start and end of list of new pages between
1383 * the head page and its previous page.
1384 * 2. We cmpxchg the prev_page->next to point from head page to the
1385 * start of new pages list.
1386 * 3. Finally, we update the head->prev to the end of new list.
1387 *
1388 * We will try this process 10 times, to make sure that we don't keep
1389 * spinning.
1390 */
1391 retries = 10;
1392 success = 0;
1393 while (retries--) {
1394 struct list_head *head_page, *prev_page, *r;
1395 struct list_head *last_page, *first_page;
1396 struct list_head *head_page_with_bit;
1263 1397
1264 for (i = 0; i < nr_pages; i++) { 1398 head_page = &rb_set_head_page(cpu_buffer)->list;
1265 if (RB_WARN_ON(cpu_buffer, list_empty(pages))) 1399 prev_page = head_page->prev;
1266 goto out; 1400
1267 p = pages->next; 1401 first_page = pages->next;
1268 bpage = list_entry(p, struct buffer_page, list); 1402 last_page = pages->prev;
1269 list_del_init(&bpage->list); 1403
1270 list_add_tail(&bpage->list, cpu_buffer->pages); 1404 head_page_with_bit = (struct list_head *)
1405 ((unsigned long)head_page | RB_PAGE_HEAD);
1406
1407 last_page->next = head_page_with_bit;
1408 first_page->prev = prev_page;
1409
1410 r = cmpxchg(&prev_page->next, head_page_with_bit, first_page);
1411
1412 if (r == head_page_with_bit) {
1413 /*
1414 * yay, we replaced the page pointer to our new list,
1415 * now, we just have to update to head page's prev
1416 * pointer to point to end of list
1417 */
1418 head_page->prev = last_page;
1419 success = 1;
1420 break;
1421 }
1271 } 1422 }
1272 rb_reset_cpu(cpu_buffer);
1273 rb_check_pages(cpu_buffer);
1274 1423
1275out: 1424 if (success)
1425 INIT_LIST_HEAD(pages);
1426 /*
1427 * If we weren't successful in adding in new pages, warn and stop
1428 * tracing
1429 */
1430 RB_WARN_ON(cpu_buffer, !success);
1276 raw_spin_unlock_irq(&cpu_buffer->reader_lock); 1431 raw_spin_unlock_irq(&cpu_buffer->reader_lock);
1432
1433 /* free pages if they weren't inserted */
1434 if (!success) {
1435 struct buffer_page *bpage, *tmp;
1436 list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages,
1437 list) {
1438 list_del_init(&bpage->list);
1439 free_buffer_page(bpage);
1440 }
1441 }
1442 return success;
1443}
1444
1445static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer)
1446{
1447 int success;
1448
1449 if (cpu_buffer->nr_pages_to_update > 0)
1450 success = rb_insert_pages(cpu_buffer);
1451 else
1452 success = rb_remove_pages(cpu_buffer,
1453 -cpu_buffer->nr_pages_to_update);
1454
1455 if (success)
1456 cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update;
1457}
1458
1459static void update_pages_handler(struct work_struct *work)
1460{
1461 struct ring_buffer_per_cpu *cpu_buffer = container_of(work,
1462 struct ring_buffer_per_cpu, update_pages_work);
1463 rb_update_pages(cpu_buffer);
1464 complete(&cpu_buffer->update_done);
1277} 1465}
1278 1466
1279/** 1467/**
@@ -1283,16 +1471,14 @@ out:
1283 * 1471 *
1284 * Minimum size is 2 * BUF_PAGE_SIZE. 1472 * Minimum size is 2 * BUF_PAGE_SIZE.
1285 * 1473 *
1286 * Returns -1 on failure. 1474 * Returns 0 on success and < 0 on failure.
1287 */ 1475 */
1288int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) 1476int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1477 int cpu_id)
1289{ 1478{
1290 struct ring_buffer_per_cpu *cpu_buffer; 1479 struct ring_buffer_per_cpu *cpu_buffer;
1291 unsigned nr_pages, rm_pages, new_pages; 1480 unsigned nr_pages;
1292 struct buffer_page *bpage, *tmp; 1481 int cpu, err = 0;
1293 unsigned long buffer_size;
1294 LIST_HEAD(pages);
1295 int i, cpu;
1296 1482
1297 /* 1483 /*
1298 * Always succeed at resizing a non-existent buffer: 1484 * Always succeed at resizing a non-existent buffer:
@@ -1302,113 +1488,154 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1302 1488
1303 size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 1489 size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
1304 size *= BUF_PAGE_SIZE; 1490 size *= BUF_PAGE_SIZE;
1305 buffer_size = buffer->pages * BUF_PAGE_SIZE;
1306 1491
1307 /* we need a minimum of two pages */ 1492 /* we need a minimum of two pages */
1308 if (size < BUF_PAGE_SIZE * 2) 1493 if (size < BUF_PAGE_SIZE * 2)
1309 size = BUF_PAGE_SIZE * 2; 1494 size = BUF_PAGE_SIZE * 2;
1310 1495
1311 if (size == buffer_size) 1496 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
1312 return size;
1313
1314 atomic_inc(&buffer->record_disabled);
1315 1497
1316 /* Make sure all writers are done with this buffer. */ 1498 /*
1317 synchronize_sched(); 1499 * Don't succeed if resizing is disabled, as a reader might be
1500 * manipulating the ring buffer and is expecting a sane state while
1501 * this is true.
1502 */
1503 if (atomic_read(&buffer->resize_disabled))
1504 return -EBUSY;
1318 1505
1506 /* prevent another thread from changing buffer sizes */
1319 mutex_lock(&buffer->mutex); 1507 mutex_lock(&buffer->mutex);
1320 get_online_cpus();
1321
1322 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
1323 1508
1324 if (size < buffer_size) { 1509 if (cpu_id == RING_BUFFER_ALL_CPUS) {
1510 /* calculate the pages to update */
1511 for_each_buffer_cpu(buffer, cpu) {
1512 cpu_buffer = buffer->buffers[cpu];
1325 1513
1326 /* easy case, just free pages */ 1514 cpu_buffer->nr_pages_to_update = nr_pages -
1327 if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) 1515 cpu_buffer->nr_pages;
1328 goto out_fail; 1516 /*
1517 * nothing more to do for removing pages or no update
1518 */
1519 if (cpu_buffer->nr_pages_to_update <= 0)
1520 continue;
1521 /*
1522 * to add pages, make sure all new pages can be
1523 * allocated without receiving ENOMEM
1524 */
1525 INIT_LIST_HEAD(&cpu_buffer->new_pages);
1526 if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update,
1527 &cpu_buffer->new_pages, cpu)) {
1528 /* not enough memory for new pages */
1529 err = -ENOMEM;
1530 goto out_err;
1531 }
1532 }
1329 1533
1330 rm_pages = buffer->pages - nr_pages; 1534 get_online_cpus();
1535 /*
1536 * Fire off all the required work handlers
1537 * We can't schedule on offline CPUs, but it's not necessary
1538 * since we can change their buffer sizes without any race.
1539 */
1540 for_each_buffer_cpu(buffer, cpu) {
1541 cpu_buffer = buffer->buffers[cpu];
1542 if (!cpu_buffer->nr_pages_to_update)
1543 continue;
1544
1545 if (cpu_online(cpu))
1546 schedule_work_on(cpu,
1547 &cpu_buffer->update_pages_work);
1548 else
1549 rb_update_pages(cpu_buffer);
1550 }
1331 1551
1552 /* wait for all the updates to complete */
1332 for_each_buffer_cpu(buffer, cpu) { 1553 for_each_buffer_cpu(buffer, cpu) {
1333 cpu_buffer = buffer->buffers[cpu]; 1554 cpu_buffer = buffer->buffers[cpu];
1334 rb_remove_pages(cpu_buffer, rm_pages); 1555 if (!cpu_buffer->nr_pages_to_update)
1556 continue;
1557
1558 if (cpu_online(cpu))
1559 wait_for_completion(&cpu_buffer->update_done);
1560 cpu_buffer->nr_pages_to_update = 0;
1335 } 1561 }
1336 goto out;
1337 }
1338 1562
1339 /* 1563 put_online_cpus();
1340 * This is a bit more difficult. We only want to add pages 1564 } else {
1341 * when we can allocate enough for all CPUs. We do this 1565 cpu_buffer = buffer->buffers[cpu_id];
1342 * by allocating all the pages and storing them on a local
1343 * link list. If we succeed in our allocation, then we
1344 * add these pages to the cpu_buffers. Otherwise we just free
1345 * them all and return -ENOMEM;
1346 */
1347 if (RB_WARN_ON(buffer, nr_pages <= buffer->pages))
1348 goto out_fail;
1349 1566
1350 new_pages = nr_pages - buffer->pages; 1567 if (nr_pages == cpu_buffer->nr_pages)
1568 goto out;
1351 1569
1352 for_each_buffer_cpu(buffer, cpu) { 1570 cpu_buffer->nr_pages_to_update = nr_pages -
1353 for (i = 0; i < new_pages; i++) { 1571 cpu_buffer->nr_pages;
1354 struct page *page; 1572
1355 /* 1573 INIT_LIST_HEAD(&cpu_buffer->new_pages);
1356 * __GFP_NORETRY flag makes sure that the allocation 1574 if (cpu_buffer->nr_pages_to_update > 0 &&
1357 * fails gracefully without invoking oom-killer and 1575 __rb_allocate_pages(cpu_buffer->nr_pages_to_update,
1358 * the system is not destabilized. 1576 &cpu_buffer->new_pages, cpu_id)) {
1359 */ 1577 err = -ENOMEM;
1360 bpage = kzalloc_node(ALIGN(sizeof(*bpage), 1578 goto out_err;
1361 cache_line_size()),
1362 GFP_KERNEL | __GFP_NORETRY,
1363 cpu_to_node(cpu));
1364 if (!bpage)
1365 goto free_pages;
1366 list_add(&bpage->list, &pages);
1367 page = alloc_pages_node(cpu_to_node(cpu),
1368 GFP_KERNEL | __GFP_NORETRY, 0);
1369 if (!page)
1370 goto free_pages;
1371 bpage->page = page_address(page);
1372 rb_init_page(bpage->page);
1373 } 1579 }
1374 }
1375 1580
1376 for_each_buffer_cpu(buffer, cpu) { 1581 get_online_cpus();
1377 cpu_buffer = buffer->buffers[cpu];
1378 rb_insert_pages(cpu_buffer, &pages, new_pages);
1379 }
1380 1582
1381 if (RB_WARN_ON(buffer, !list_empty(&pages))) 1583 if (cpu_online(cpu_id)) {
1382 goto out_fail; 1584 schedule_work_on(cpu_id,
1585 &cpu_buffer->update_pages_work);
1586 wait_for_completion(&cpu_buffer->update_done);
1587 } else
1588 rb_update_pages(cpu_buffer);
1589
1590 cpu_buffer->nr_pages_to_update = 0;
1591 put_online_cpus();
1592 }
1383 1593
1384 out: 1594 out:
1385 buffer->pages = nr_pages; 1595 /*
1386 put_online_cpus(); 1596 * The ring buffer resize can happen with the ring buffer
1597 * enabled, so that the update disturbs the tracing as little
1598 * as possible. But if the buffer is disabled, we do not need
1599 * to worry about that, and we can take the time to verify
1600 * that the buffer is not corrupt.
1601 */
1602 if (atomic_read(&buffer->record_disabled)) {
1603 atomic_inc(&buffer->record_disabled);
1604 /*
1605 * Even though the buffer was disabled, we must make sure
1606 * that it is truly disabled before calling rb_check_pages.
1607 * There could have been a race between checking
1608 * record_disable and incrementing it.
1609 */
1610 synchronize_sched();
1611 for_each_buffer_cpu(buffer, cpu) {
1612 cpu_buffer = buffer->buffers[cpu];
1613 rb_check_pages(cpu_buffer);
1614 }
1615 atomic_dec(&buffer->record_disabled);
1616 }
1617
1387 mutex_unlock(&buffer->mutex); 1618 mutex_unlock(&buffer->mutex);
1619 return size;
1388 1620
1389 atomic_dec(&buffer->record_disabled); 1621 out_err:
1622 for_each_buffer_cpu(buffer, cpu) {
1623 struct buffer_page *bpage, *tmp;
1390 1624
1391 return size; 1625 cpu_buffer = buffer->buffers[cpu];
1626 cpu_buffer->nr_pages_to_update = 0;
1392 1627
1393 free_pages: 1628 if (list_empty(&cpu_buffer->new_pages))
1394 list_for_each_entry_safe(bpage, tmp, &pages, list) { 1629 continue;
1395 list_del_init(&bpage->list);
1396 free_buffer_page(bpage);
1397 }
1398 put_online_cpus();
1399 mutex_unlock(&buffer->mutex);
1400 atomic_dec(&buffer->record_disabled);
1401 return -ENOMEM;
1402 1630
1403 /* 1631 list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages,
1404 * Something went totally wrong, and we are too paranoid 1632 list) {
1405 * to even clean up the mess. 1633 list_del_init(&bpage->list);
1406 */ 1634 free_buffer_page(bpage);
1407 out_fail: 1635 }
1408 put_online_cpus(); 1636 }
1409 mutex_unlock(&buffer->mutex); 1637 mutex_unlock(&buffer->mutex);
1410 atomic_dec(&buffer->record_disabled); 1638 return err;
1411 return -1;
1412} 1639}
1413EXPORT_SYMBOL_GPL(ring_buffer_resize); 1640EXPORT_SYMBOL_GPL(ring_buffer_resize);
1414 1641
@@ -1447,21 +1674,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
1447 return __rb_page_index(iter->head_page, iter->head); 1674 return __rb_page_index(iter->head_page, iter->head);
1448} 1675}
1449 1676
1450static inline unsigned long rb_page_write(struct buffer_page *bpage)
1451{
1452 return local_read(&bpage->write) & RB_WRITE_MASK;
1453}
1454
1455static inline unsigned rb_page_commit(struct buffer_page *bpage) 1677static inline unsigned rb_page_commit(struct buffer_page *bpage)
1456{ 1678{
1457 return local_read(&bpage->page->commit); 1679 return local_read(&bpage->page->commit);
1458} 1680}
1459 1681
1460static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1461{
1462 return local_read(&bpage->entries) & RB_WRITE_MASK;
1463}
1464
1465/* Size is determined by what has been committed */ 1682/* Size is determined by what has been committed */
1466static inline unsigned rb_page_size(struct buffer_page *bpage) 1683static inline unsigned rb_page_size(struct buffer_page *bpage)
1467{ 1684{
@@ -1510,7 +1727,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1510 * assign the commit to the tail. 1727 * assign the commit to the tail.
1511 */ 1728 */
1512 again: 1729 again:
1513 max_count = cpu_buffer->buffer->pages * 100; 1730 max_count = cpu_buffer->nr_pages * 100;
1514 1731
1515 while (cpu_buffer->commit_page != cpu_buffer->tail_page) { 1732 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
1516 if (RB_WARN_ON(cpu_buffer, !(--max_count))) 1733 if (RB_WARN_ON(cpu_buffer, !(--max_count)))
@@ -3486,6 +3703,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
3486 3703
3487 iter->cpu_buffer = cpu_buffer; 3704 iter->cpu_buffer = cpu_buffer;
3488 3705
3706 atomic_inc(&buffer->resize_disabled);
3489 atomic_inc(&cpu_buffer->record_disabled); 3707 atomic_inc(&cpu_buffer->record_disabled);
3490 3708
3491 return iter; 3709 return iter;
@@ -3548,7 +3766,14 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter)
3548{ 3766{
3549 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3767 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3550 3768
3769 /*
3770 * Ring buffer is disabled from recording, here's a good place
3771 * to check the integrity of the ring buffer.
3772 */
3773 rb_check_pages(cpu_buffer);
3774
3551 atomic_dec(&cpu_buffer->record_disabled); 3775 atomic_dec(&cpu_buffer->record_disabled);
3776 atomic_dec(&cpu_buffer->buffer->resize_disabled);
3552 kfree(iter); 3777 kfree(iter);
3553} 3778}
3554EXPORT_SYMBOL_GPL(ring_buffer_read_finish); 3779EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
@@ -3588,9 +3813,18 @@ EXPORT_SYMBOL_GPL(ring_buffer_read);
3588 * ring_buffer_size - return the size of the ring buffer (in bytes) 3813 * ring_buffer_size - return the size of the ring buffer (in bytes)
3589 * @buffer: The ring buffer. 3814 * @buffer: The ring buffer.
3590 */ 3815 */
3591unsigned long ring_buffer_size(struct ring_buffer *buffer) 3816unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu)
3592{ 3817{
3593 return BUF_PAGE_SIZE * buffer->pages; 3818 /*
3819 * Earlier, this method returned
3820 * BUF_PAGE_SIZE * buffer->nr_pages
3821 * Since the nr_pages field is now removed, we have converted this to
3822 * return the per cpu buffer value.
3823 */
3824 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3825 return 0;
3826
3827 return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;
3594} 3828}
3595EXPORT_SYMBOL_GPL(ring_buffer_size); 3829EXPORT_SYMBOL_GPL(ring_buffer_size);
3596 3830
@@ -3611,6 +3845,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3611 cpu_buffer->commit_page = cpu_buffer->head_page; 3845 cpu_buffer->commit_page = cpu_buffer->head_page;
3612 3846
3613 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 3847 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
3848 INIT_LIST_HEAD(&cpu_buffer->new_pages);
3614 local_set(&cpu_buffer->reader_page->write, 0); 3849 local_set(&cpu_buffer->reader_page->write, 0);
3615 local_set(&cpu_buffer->reader_page->entries, 0); 3850 local_set(&cpu_buffer->reader_page->entries, 0);
3616 local_set(&cpu_buffer->reader_page->page->commit, 0); 3851 local_set(&cpu_buffer->reader_page->page->commit, 0);
@@ -3647,8 +3882,12 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3647 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 3882 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3648 return; 3883 return;
3649 3884
3885 atomic_inc(&buffer->resize_disabled);
3650 atomic_inc(&cpu_buffer->record_disabled); 3886 atomic_inc(&cpu_buffer->record_disabled);
3651 3887
3888 /* Make sure all commits have finished */
3889 synchronize_sched();
3890
3652 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3891 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3653 3892
3654 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) 3893 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
@@ -3664,6 +3903,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3664 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3903 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3665 3904
3666 atomic_dec(&cpu_buffer->record_disabled); 3905 atomic_dec(&cpu_buffer->record_disabled);
3906 atomic_dec(&buffer->resize_disabled);
3667} 3907}
3668EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); 3908EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
3669 3909
@@ -3765,8 +4005,11 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
3765 !cpumask_test_cpu(cpu, buffer_b->cpumask)) 4005 !cpumask_test_cpu(cpu, buffer_b->cpumask))
3766 goto out; 4006 goto out;
3767 4007
4008 cpu_buffer_a = buffer_a->buffers[cpu];
4009 cpu_buffer_b = buffer_b->buffers[cpu];
4010
3768 /* At least make sure the two buffers are somewhat the same */ 4011 /* At least make sure the two buffers are somewhat the same */
3769 if (buffer_a->pages != buffer_b->pages) 4012 if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
3770 goto out; 4013 goto out;
3771 4014
3772 ret = -EAGAIN; 4015 ret = -EAGAIN;
@@ -3780,9 +4023,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
3780 if (atomic_read(&buffer_b->record_disabled)) 4023 if (atomic_read(&buffer_b->record_disabled))
3781 goto out; 4024 goto out;
3782 4025
3783 cpu_buffer_a = buffer_a->buffers[cpu];
3784 cpu_buffer_b = buffer_b->buffers[cpu];
3785
3786 if (atomic_read(&cpu_buffer_a->record_disabled)) 4026 if (atomic_read(&cpu_buffer_a->record_disabled))
3787 goto out; 4027 goto out;
3788 4028
@@ -4071,6 +4311,8 @@ static int rb_cpu_notify(struct notifier_block *self,
4071 struct ring_buffer *buffer = 4311 struct ring_buffer *buffer =
4072 container_of(self, struct ring_buffer, cpu_notify); 4312 container_of(self, struct ring_buffer, cpu_notify);
4073 long cpu = (long)hcpu; 4313 long cpu = (long)hcpu;
4314 int cpu_i, nr_pages_same;
4315 unsigned int nr_pages;
4074 4316
4075 switch (action) { 4317 switch (action) {
4076 case CPU_UP_PREPARE: 4318 case CPU_UP_PREPARE:
@@ -4078,8 +4320,23 @@ static int rb_cpu_notify(struct notifier_block *self,
4078 if (cpumask_test_cpu(cpu, buffer->cpumask)) 4320 if (cpumask_test_cpu(cpu, buffer->cpumask))
4079 return NOTIFY_OK; 4321 return NOTIFY_OK;
4080 4322
4323 nr_pages = 0;
4324 nr_pages_same = 1;
4325 /* check if all cpu sizes are same */
4326 for_each_buffer_cpu(buffer, cpu_i) {
4327 /* fill in the size from first enabled cpu */
4328 if (nr_pages == 0)
4329 nr_pages = buffer->buffers[cpu_i]->nr_pages;
4330 if (nr_pages != buffer->buffers[cpu_i]->nr_pages) {
4331 nr_pages_same = 0;
4332 break;
4333 }
4334 }
4335 /* allocate minimum pages, user can later expand it */
4336 if (!nr_pages_same)
4337 nr_pages = 2;
4081 buffer->buffers[cpu] = 4338 buffer->buffers[cpu] =
4082 rb_allocate_cpu_buffer(buffer, cpu); 4339 rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
4083 if (!buffer->buffers[cpu]) { 4340 if (!buffer->buffers[cpu]) {
4084 WARN(1, "failed to allocate ring buffer on CPU %ld\n", 4341 WARN(1, "failed to allocate ring buffer on CPU %ld\n",
4085 cpu); 4342 cpu);
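
With the global buffer->pages field gone, the hotplug notifier has to decide how big the new CPU's buffer should be: reuse the common size if every existing per-cpu buffer agrees, otherwise fall back to the two-page minimum and let the user resize later. The decision in isolation:

#include <stdio.h>

#define MIN_PAGES 2

/* pick the page count for a newly onlined CPU: reuse the common size if
 * every existing buffer agrees, otherwise fall back to the minimum */
static unsigned int pick_nr_pages(const unsigned int *nr_pages, int nr_cpus)
{
	unsigned int pages = 0;
	int cpu;

	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (pages == 0)
			pages = nr_pages[cpu];
		if (nr_pages[cpu] != pages)
			return MIN_PAGES;
	}
	return pages ? pages : MIN_PAGES;
}

int main(void)
{
	unsigned int same[]  = { 8, 8, 8 };
	unsigned int mixed[] = { 8, 16, 8 };

	printf("%u %u\n", pick_nr_pages(same, 3), pick_nr_pages(mixed, 3)); /* 8 2 */
	return 0;
}
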
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ed7b5d1e12f4..68032c6177db 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -87,18 +87,6 @@ static int tracing_disabled = 1;
87 87
88DEFINE_PER_CPU(int, ftrace_cpu_disabled); 88DEFINE_PER_CPU(int, ftrace_cpu_disabled);
89 89
90static inline void ftrace_disable_cpu(void)
91{
92 preempt_disable();
93 __this_cpu_inc(ftrace_cpu_disabled);
94}
95
96static inline void ftrace_enable_cpu(void)
97{
98 __this_cpu_dec(ftrace_cpu_disabled);
99 preempt_enable();
100}
101
102cpumask_var_t __read_mostly tracing_buffer_mask; 90cpumask_var_t __read_mostly tracing_buffer_mask;
103 91
104/* 92/*
@@ -629,7 +617,6 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
629static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) 617static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
630{ 618{
631 int len; 619 int len;
632 void *ret;
633 620
634 if (s->len <= s->readpos) 621 if (s->len <= s->readpos)
635 return -EBUSY; 622 return -EBUSY;
@@ -637,9 +624,7 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
637 len = s->len - s->readpos; 624 len = s->len - s->readpos;
638 if (cnt > len) 625 if (cnt > len)
639 cnt = len; 626 cnt = len;
640 ret = memcpy(buf, s->buffer + s->readpos, cnt); 627 memcpy(buf, s->buffer + s->readpos, cnt);
641 if (!ret)
642 return -EFAULT;
643 628
644 s->readpos += cnt; 629 s->readpos += cnt;
645 return cnt; 630 return cnt;
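
The removed check could never fire: memcpy() is defined to return its destination argument, so testing the result for NULL (and returning -EFAULT, an error meant for failed user-space copies) was dead code. Trivially:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char dst[8];
	const char src[] = "abc";

	/* memcpy() always returns its first argument, so checking the
	 * result for NULL can never catch anything */
	char *ret = memcpy(dst, src, sizeof(src));

	printf("%d\n", ret == dst);	/* always prints 1 */
	return 0;
}
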
@@ -751,8 +736,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
751 736
752 arch_spin_lock(&ftrace_max_lock); 737 arch_spin_lock(&ftrace_max_lock);
753 738
754 ftrace_disable_cpu();
755
756 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); 739 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
757 740
758 if (ret == -EBUSY) { 741 if (ret == -EBUSY) {
@@ -766,8 +749,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
766 "Failed to swap buffers due to commit in progress\n"); 749 "Failed to swap buffers due to commit in progress\n");
767 } 750 }
768 751
769 ftrace_enable_cpu();
770
771 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); 752 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
772 753
773 __update_max_tr(tr, tsk, cpu); 754 __update_max_tr(tr, tsk, cpu);
@@ -782,8 +763,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
782 * Register a new plugin tracer. 763 * Register a new plugin tracer.
783 */ 764 */
784int register_tracer(struct tracer *type) 765int register_tracer(struct tracer *type)
785__releases(kernel_lock)
786__acquires(kernel_lock)
787{ 766{
788 struct tracer *t; 767 struct tracer *t;
789 int ret = 0; 768 int ret = 0;
@@ -841,7 +820,8 @@ __acquires(kernel_lock)
841 820
842 /* If we expanded the buffers, make sure the max is expanded too */ 821 /* If we expanded the buffers, make sure the max is expanded too */
843 if (ring_buffer_expanded && type->use_max_tr) 822 if (ring_buffer_expanded && type->use_max_tr)
844 ring_buffer_resize(max_tr.buffer, trace_buf_size); 823 ring_buffer_resize(max_tr.buffer, trace_buf_size,
824 RING_BUFFER_ALL_CPUS);
845 825
846 /* the test is responsible for initializing and enabling */ 826 /* the test is responsible for initializing and enabling */
847 pr_info("Testing tracer %s: ", type->name); 827 pr_info("Testing tracer %s: ", type->name);
@@ -857,7 +837,8 @@ __acquires(kernel_lock)
857 837
858 /* Shrink the max buffer again */ 838 /* Shrink the max buffer again */
859 if (ring_buffer_expanded && type->use_max_tr) 839 if (ring_buffer_expanded && type->use_max_tr)
860 ring_buffer_resize(max_tr.buffer, 1); 840 ring_buffer_resize(max_tr.buffer, 1,
841 RING_BUFFER_ALL_CPUS);
861 842
862 printk(KERN_CONT "PASSED\n"); 843 printk(KERN_CONT "PASSED\n");
863 } 844 }
@@ -917,13 +898,6 @@ out:
917 mutex_unlock(&trace_types_lock); 898 mutex_unlock(&trace_types_lock);
918} 899}
919 900
920static void __tracing_reset(struct ring_buffer *buffer, int cpu)
921{
922 ftrace_disable_cpu();
923 ring_buffer_reset_cpu(buffer, cpu);
924 ftrace_enable_cpu();
925}
926
927void tracing_reset(struct trace_array *tr, int cpu) 901void tracing_reset(struct trace_array *tr, int cpu)
928{ 902{
929 struct ring_buffer *buffer = tr->buffer; 903 struct ring_buffer *buffer = tr->buffer;
@@ -932,7 +906,7 @@ void tracing_reset(struct trace_array *tr, int cpu)
932 906
933 /* Make sure all commits have finished */ 907 /* Make sure all commits have finished */
934 synchronize_sched(); 908 synchronize_sched();
935 __tracing_reset(buffer, cpu); 909 ring_buffer_reset_cpu(buffer, cpu);
936 910
937 ring_buffer_record_enable(buffer); 911 ring_buffer_record_enable(buffer);
938} 912}
@@ -950,7 +924,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
950 tr->time_start = ftrace_now(tr->cpu); 924 tr->time_start = ftrace_now(tr->cpu);
951 925
952 for_each_online_cpu(cpu) 926 for_each_online_cpu(cpu)
953 __tracing_reset(buffer, cpu); 927 ring_buffer_reset_cpu(buffer, cpu);
954 928
955 ring_buffer_record_enable(buffer); 929 ring_buffer_record_enable(buffer);
956} 930}
@@ -1498,25 +1472,119 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
1498 1472
1499#endif /* CONFIG_STACKTRACE */ 1473#endif /* CONFIG_STACKTRACE */
1500 1474
1475/* created for use with alloc_percpu */
1476struct trace_buffer_struct {
1477 char buffer[TRACE_BUF_SIZE];
1478};
1479
1480static struct trace_buffer_struct *trace_percpu_buffer;
1481static struct trace_buffer_struct *trace_percpu_sirq_buffer;
1482static struct trace_buffer_struct *trace_percpu_irq_buffer;
1483static struct trace_buffer_struct *trace_percpu_nmi_buffer;
1484
1485/*
1486 * The buffer used is dependent on the context. There is a per cpu
1487 * buffer for normal context, softirq context, hard irq context and
1488 * for NMI context. This allows for lockless recording.
1489 *
1490 * Note, if the buffers failed to be allocated, then this returns NULL
1491 */
1492static char *get_trace_buf(void)
1493{
1494 struct trace_buffer_struct *percpu_buffer;
1495 struct trace_buffer_struct *buffer;
1496
1497 /*
1498 * If we have allocated per cpu buffers, then we do not
1499 * need to do any locking.
1500 */
1501 if (in_nmi())
1502 percpu_buffer = trace_percpu_nmi_buffer;
1503 else if (in_irq())
1504 percpu_buffer = trace_percpu_irq_buffer;
1505 else if (in_softirq())
1506 percpu_buffer = trace_percpu_sirq_buffer;
1507 else
1508 percpu_buffer = trace_percpu_buffer;
1509
1510 if (!percpu_buffer)
1511 return NULL;
1512
1513 buffer = per_cpu_ptr(percpu_buffer, smp_processor_id());
1514
1515 return buffer->buffer;
1516}
1517
1518static int alloc_percpu_trace_buffer(void)
1519{
1520 struct trace_buffer_struct *buffers;
1521 struct trace_buffer_struct *sirq_buffers;
1522 struct trace_buffer_struct *irq_buffers;
1523 struct trace_buffer_struct *nmi_buffers;
1524
1525 buffers = alloc_percpu(struct trace_buffer_struct);
1526 if (!buffers)
1527 goto err_warn;
1528
1529 sirq_buffers = alloc_percpu(struct trace_buffer_struct);
1530 if (!sirq_buffers)
1531 goto err_sirq;
1532
1533 irq_buffers = alloc_percpu(struct trace_buffer_struct);
1534 if (!irq_buffers)
1535 goto err_irq;
1536
1537 nmi_buffers = alloc_percpu(struct trace_buffer_struct);
1538 if (!nmi_buffers)
1539 goto err_nmi;
1540
1541 trace_percpu_buffer = buffers;
1542 trace_percpu_sirq_buffer = sirq_buffers;
1543 trace_percpu_irq_buffer = irq_buffers;
1544 trace_percpu_nmi_buffer = nmi_buffers;
1545
1546 return 0;
1547
1548 err_nmi:
1549 free_percpu(irq_buffers);
1550 err_irq:
1551 free_percpu(sirq_buffers);
1552 err_sirq:
1553 free_percpu(buffers);
1554 err_warn:
1555 WARN(1, "Could not allocate percpu trace_printk buffer");
1556 return -ENOMEM;
1557}
1558
1559void trace_printk_init_buffers(void)
1560{
1561 static int buffers_allocated;
1562
1563 if (buffers_allocated)
1564 return;
1565
1566 if (alloc_percpu_trace_buffer())
1567 return;
1568
1569 pr_info("ftrace: Allocated trace_printk buffers\n");
1570
1571 buffers_allocated = 1;
1572}
1573
1501/** 1574/**
1502 * trace_vbprintk - write binary msg to tracing buffer 1575 * trace_vbprintk - write binary msg to tracing buffer
1503 * 1576 *
1504 */ 1577 */
1505int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) 1578int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1506{ 1579{
1507 static arch_spinlock_t trace_buf_lock =
1508 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1509 static u32 trace_buf[TRACE_BUF_SIZE];
1510
1511 struct ftrace_event_call *call = &event_bprint; 1580 struct ftrace_event_call *call = &event_bprint;
1512 struct ring_buffer_event *event; 1581 struct ring_buffer_event *event;
1513 struct ring_buffer *buffer; 1582 struct ring_buffer *buffer;
1514 struct trace_array *tr = &global_trace; 1583 struct trace_array *tr = &global_trace;
1515 struct trace_array_cpu *data;
1516 struct bprint_entry *entry; 1584 struct bprint_entry *entry;
1517 unsigned long flags; 1585 unsigned long flags;
1518 int disable; 1586 char *tbuffer;
1519 int cpu, len = 0, size, pc; 1587 int len = 0, size, pc;
1520 1588
1521 if (unlikely(tracing_selftest_running || tracing_disabled)) 1589 if (unlikely(tracing_selftest_running || tracing_disabled))
1522 return 0; 1590 return 0;
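The hunk above replaces trace_printk()'s single spinlock-protected static buffer with four per-CPU buffers, one per execution context, so a writer never takes a lock: it simply picks the buffer that matches in_nmi()/in_irq()/in_softirq() and writes with preemption disabled. Below is a minimal kernel-style sketch of that pattern, mirroring get_trace_buf()/alloc_percpu_trace_buffer() from the patch; the demo_* names and DEMO_BUF_SIZE are invented for illustration and the snippet is meant to be read, not applied.

/* Illustrative only: per-context per-CPU scratch buffers (hypothetical names). */
#include <linux/percpu.h>
#include <linux/hardirq.h>	/* in_nmi(), in_irq(), in_softirq() */
#include <linux/smp.h>		/* smp_processor_id() */
#include <linux/errno.h>

#define DEMO_BUF_SIZE 1024

struct demo_buf {
	char buf[DEMO_BUF_SIZE];
};

static struct demo_buf *demo_normal;	/* process context */
static struct demo_buf *demo_sirq;	/* softirq context */
static struct demo_buf *demo_irq;	/* hard irq context */
static struct demo_buf *demo_nmi;	/* NMI context */

/* Caller runs with preemption disabled, exactly as trace_vbprintk() does. */
static char *demo_get_buf(void)
{
	struct demo_buf *pcp;

	if (in_nmi())
		pcp = demo_nmi;
	else if (in_irq())
		pcp = demo_irq;
	else if (in_softirq())
		pcp = demo_sirq;
	else
		pcp = demo_normal;

	if (!pcp)
		return NULL;	/* buffers not allocated (or allocation failed) */

	return per_cpu_ptr(pcp, smp_processor_id())->buf;
}

static int demo_alloc_bufs(void)
{
	demo_normal = alloc_percpu(struct demo_buf);
	demo_sirq   = alloc_percpu(struct demo_buf);
	demo_irq    = alloc_percpu(struct demo_buf);
	demo_nmi    = alloc_percpu(struct demo_buf);

	if (demo_normal && demo_sirq && demo_irq && demo_nmi)
		return 0;

	/* free_percpu() accepts NULL, so a partial failure unwinds cleanly */
	free_percpu(demo_normal);
	free_percpu(demo_sirq);
	free_percpu(demo_irq);
	free_percpu(demo_nmi);
	return -ENOMEM;
}

Because each context has its own per-CPU slot, an NMI that interrupts an IRQ handler in the middle of vbin_printf() lands in a different buffer, which is what makes the lockless scheme safe.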
@@ -1526,43 +1594,36 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1526 1594
1527 pc = preempt_count(); 1595 pc = preempt_count();
1528 preempt_disable_notrace(); 1596 preempt_disable_notrace();
1529 cpu = raw_smp_processor_id();
1530 data = tr->data[cpu];
1531 1597
1532 disable = atomic_inc_return(&data->disabled); 1598 tbuffer = get_trace_buf();
1533 if (unlikely(disable != 1)) 1599 if (!tbuffer) {
1600 len = 0;
1534 goto out; 1601 goto out;
1602 }
1535 1603
1536 /* Lockdep uses trace_printk for lock tracing */ 1604 len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args);
1537 local_irq_save(flags);
1538 arch_spin_lock(&trace_buf_lock);
1539 len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1540 1605
1541 if (len > TRACE_BUF_SIZE || len < 0) 1606 if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0)
1542 goto out_unlock; 1607 goto out;
1543 1608
1609 local_save_flags(flags);
1544 size = sizeof(*entry) + sizeof(u32) * len; 1610 size = sizeof(*entry) + sizeof(u32) * len;
1545 buffer = tr->buffer; 1611 buffer = tr->buffer;
1546 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, 1612 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
1547 flags, pc); 1613 flags, pc);
1548 if (!event) 1614 if (!event)
1549 goto out_unlock; 1615 goto out;
1550 entry = ring_buffer_event_data(event); 1616 entry = ring_buffer_event_data(event);
1551 entry->ip = ip; 1617 entry->ip = ip;
1552 entry->fmt = fmt; 1618 entry->fmt = fmt;
1553 1619
1554 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1620 memcpy(entry->buf, tbuffer, sizeof(u32) * len);
1555 if (!filter_check_discard(call, entry, buffer, event)) { 1621 if (!filter_check_discard(call, entry, buffer, event)) {
1556 ring_buffer_unlock_commit(buffer, event); 1622 ring_buffer_unlock_commit(buffer, event);
1557 ftrace_trace_stack(buffer, flags, 6, pc); 1623 ftrace_trace_stack(buffer, flags, 6, pc);
1558 } 1624 }
1559 1625
1560out_unlock:
1561 arch_spin_unlock(&trace_buf_lock);
1562 local_irq_restore(flags);
1563
1564out: 1626out:
1565 atomic_dec_return(&data->disabled);
1566 preempt_enable_notrace(); 1627 preempt_enable_notrace();
1567 unpause_graph_tracing(); 1628 unpause_graph_tracing();
1568 1629
@@ -1588,58 +1649,53 @@ int trace_array_printk(struct trace_array *tr,
1588int trace_array_vprintk(struct trace_array *tr, 1649int trace_array_vprintk(struct trace_array *tr,
1589 unsigned long ip, const char *fmt, va_list args) 1650 unsigned long ip, const char *fmt, va_list args)
1590{ 1651{
1591 static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED;
1592 static char trace_buf[TRACE_BUF_SIZE];
1593
1594 struct ftrace_event_call *call = &event_print; 1652 struct ftrace_event_call *call = &event_print;
1595 struct ring_buffer_event *event; 1653 struct ring_buffer_event *event;
1596 struct ring_buffer *buffer; 1654 struct ring_buffer *buffer;
1597 struct trace_array_cpu *data; 1655 int len = 0, size, pc;
1598 int cpu, len = 0, size, pc;
1599 struct print_entry *entry; 1656 struct print_entry *entry;
1600 unsigned long irq_flags; 1657 unsigned long flags;
1601 int disable; 1658 char *tbuffer;
1602 1659
1603 if (tracing_disabled || tracing_selftest_running) 1660 if (tracing_disabled || tracing_selftest_running)
1604 return 0; 1661 return 0;
1605 1662
1663 /* Don't pollute graph traces with trace_vprintk internals */
1664 pause_graph_tracing();
1665
1606 pc = preempt_count(); 1666 pc = preempt_count();
1607 preempt_disable_notrace(); 1667 preempt_disable_notrace();
1608 cpu = raw_smp_processor_id();
1609 data = tr->data[cpu];
1610 1668
1611 disable = atomic_inc_return(&data->disabled); 1669
1612 if (unlikely(disable != 1)) 1670 tbuffer = get_trace_buf();
1671 if (!tbuffer) {
1672 len = 0;
1613 goto out; 1673 goto out;
1674 }
1614 1675
1615 pause_graph_tracing(); 1676 len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
1616 raw_local_irq_save(irq_flags); 1677 if (len > TRACE_BUF_SIZE)
1617 arch_spin_lock(&trace_buf_lock); 1678 goto out;
1618 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1619 1679
1680 local_save_flags(flags);
1620 size = sizeof(*entry) + len + 1; 1681 size = sizeof(*entry) + len + 1;
1621 buffer = tr->buffer; 1682 buffer = tr->buffer;
1622 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, 1683 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
1623 irq_flags, pc); 1684 flags, pc);
1624 if (!event) 1685 if (!event)
1625 goto out_unlock; 1686 goto out;
1626 entry = ring_buffer_event_data(event); 1687 entry = ring_buffer_event_data(event);
1627 entry->ip = ip; 1688 entry->ip = ip;
1628 1689
1629 memcpy(&entry->buf, trace_buf, len); 1690 memcpy(&entry->buf, tbuffer, len);
1630 entry->buf[len] = '\0'; 1691 entry->buf[len] = '\0';
1631 if (!filter_check_discard(call, entry, buffer, event)) { 1692 if (!filter_check_discard(call, entry, buffer, event)) {
1632 ring_buffer_unlock_commit(buffer, event); 1693 ring_buffer_unlock_commit(buffer, event);
1633 ftrace_trace_stack(buffer, irq_flags, 6, pc); 1694 ftrace_trace_stack(buffer, flags, 6, pc);
1634 } 1695 }
1635
1636 out_unlock:
1637 arch_spin_unlock(&trace_buf_lock);
1638 raw_local_irq_restore(irq_flags);
1639 unpause_graph_tracing();
1640 out: 1696 out:
1641 atomic_dec_return(&data->disabled);
1642 preempt_enable_notrace(); 1697 preempt_enable_notrace();
1698 unpause_graph_tracing();
1643 1699
1644 return len; 1700 return len;
1645} 1701}
@@ -1652,14 +1708,9 @@ EXPORT_SYMBOL_GPL(trace_vprintk);
1652 1708
1653static void trace_iterator_increment(struct trace_iterator *iter) 1709static void trace_iterator_increment(struct trace_iterator *iter)
1654{ 1710{
1655 /* Don't allow ftrace to trace into the ring buffers */
1656 ftrace_disable_cpu();
1657
1658 iter->idx++; 1711 iter->idx++;
1659 if (iter->buffer_iter[iter->cpu]) 1712 if (iter->buffer_iter[iter->cpu])
1660 ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); 1713 ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
1661
1662 ftrace_enable_cpu();
1663} 1714}
1664 1715
1665static struct trace_entry * 1716static struct trace_entry *
@@ -1669,17 +1720,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1669 struct ring_buffer_event *event; 1720 struct ring_buffer_event *event;
1670 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; 1721 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
1671 1722
1672 /* Don't allow ftrace to trace into the ring buffers */
1673 ftrace_disable_cpu();
1674
1675 if (buf_iter) 1723 if (buf_iter)
1676 event = ring_buffer_iter_peek(buf_iter, ts); 1724 event = ring_buffer_iter_peek(buf_iter, ts);
1677 else 1725 else
1678 event = ring_buffer_peek(iter->tr->buffer, cpu, ts, 1726 event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
1679 lost_events); 1727 lost_events);
1680 1728
1681 ftrace_enable_cpu();
1682
1683 if (event) { 1729 if (event) {
1684 iter->ent_size = ring_buffer_event_length(event); 1730 iter->ent_size = ring_buffer_event_length(event);
1685 return ring_buffer_event_data(event); 1731 return ring_buffer_event_data(event);
@@ -1769,11 +1815,8 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter)
1769 1815
1770static void trace_consume(struct trace_iterator *iter) 1816static void trace_consume(struct trace_iterator *iter)
1771{ 1817{
1772 /* Don't allow ftrace to trace into the ring buffers */
1773 ftrace_disable_cpu();
1774 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, 1818 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
1775 &iter->lost_events); 1819 &iter->lost_events);
1776 ftrace_enable_cpu();
1777} 1820}
1778 1821
1779static void *s_next(struct seq_file *m, void *v, loff_t *pos) 1822static void *s_next(struct seq_file *m, void *v, loff_t *pos)
@@ -1862,16 +1905,12 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1862 iter->cpu = 0; 1905 iter->cpu = 0;
1863 iter->idx = -1; 1906 iter->idx = -1;
1864 1907
1865 ftrace_disable_cpu();
1866
1867 if (cpu_file == TRACE_PIPE_ALL_CPU) { 1908 if (cpu_file == TRACE_PIPE_ALL_CPU) {
1868 for_each_tracing_cpu(cpu) 1909 for_each_tracing_cpu(cpu)
1869 tracing_iter_reset(iter, cpu); 1910 tracing_iter_reset(iter, cpu);
1870 } else 1911 } else
1871 tracing_iter_reset(iter, cpu_file); 1912 tracing_iter_reset(iter, cpu_file);
1872 1913
1873 ftrace_enable_cpu();
1874
1875 iter->leftover = 0; 1914 iter->leftover = 0;
1876 for (p = iter; p && l < *pos; p = s_next(m, p, &l)) 1915 for (p = iter; p && l < *pos; p = s_next(m, p, &l))
1877 ; 1916 ;
@@ -2332,15 +2371,13 @@ static struct trace_iterator *
2332__tracing_open(struct inode *inode, struct file *file) 2371__tracing_open(struct inode *inode, struct file *file)
2333{ 2372{
2334 long cpu_file = (long) inode->i_private; 2373 long cpu_file = (long) inode->i_private;
2335 void *fail_ret = ERR_PTR(-ENOMEM);
2336 struct trace_iterator *iter; 2374 struct trace_iterator *iter;
2337 struct seq_file *m; 2375 int cpu;
2338 int cpu, ret;
2339 2376
2340 if (tracing_disabled) 2377 if (tracing_disabled)
2341 return ERR_PTR(-ENODEV); 2378 return ERR_PTR(-ENODEV);
2342 2379
2343 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2380 iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter));
2344 if (!iter) 2381 if (!iter)
2345 return ERR_PTR(-ENOMEM); 2382 return ERR_PTR(-ENOMEM);
2346 2383
@@ -2397,32 +2434,15 @@ __tracing_open(struct inode *inode, struct file *file)
2397 tracing_iter_reset(iter, cpu); 2434 tracing_iter_reset(iter, cpu);
2398 } 2435 }
2399 2436
2400 ret = seq_open(file, &tracer_seq_ops);
2401 if (ret < 0) {
2402 fail_ret = ERR_PTR(ret);
2403 goto fail_buffer;
2404 }
2405
2406 m = file->private_data;
2407 m->private = iter;
2408
2409 mutex_unlock(&trace_types_lock); 2437 mutex_unlock(&trace_types_lock);
2410 2438
2411 return iter; 2439 return iter;
2412 2440
2413 fail_buffer:
2414 for_each_tracing_cpu(cpu) {
2415 if (iter->buffer_iter[cpu])
2416 ring_buffer_read_finish(iter->buffer_iter[cpu]);
2417 }
2418 free_cpumask_var(iter->started);
2419 tracing_start();
2420 fail: 2441 fail:
2421 mutex_unlock(&trace_types_lock); 2442 mutex_unlock(&trace_types_lock);
2422 kfree(iter->trace); 2443 kfree(iter->trace);
2423 kfree(iter); 2444 seq_release_private(inode, file);
2424 2445 return ERR_PTR(-ENOMEM);
2425 return fail_ret;
2426} 2446}
2427 2447
2428int tracing_open_generic(struct inode *inode, struct file *filp) 2448int tracing_open_generic(struct inode *inode, struct file *filp)
@@ -2458,11 +2478,10 @@ static int tracing_release(struct inode *inode, struct file *file)
2458 tracing_start(); 2478 tracing_start();
2459 mutex_unlock(&trace_types_lock); 2479 mutex_unlock(&trace_types_lock);
2460 2480
2461 seq_release(inode, file);
2462 mutex_destroy(&iter->mutex); 2481 mutex_destroy(&iter->mutex);
2463 free_cpumask_var(iter->started); 2482 free_cpumask_var(iter->started);
2464 kfree(iter->trace); 2483 kfree(iter->trace);
2465 kfree(iter); 2484 seq_release_private(inode, file);
2466 return 0; 2485 return 0;
2467} 2486}
2468 2487
@@ -2648,10 +2667,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2648 if (cpumask_test_cpu(cpu, tracing_cpumask) && 2667 if (cpumask_test_cpu(cpu, tracing_cpumask) &&
2649 !cpumask_test_cpu(cpu, tracing_cpumask_new)) { 2668 !cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2650 atomic_inc(&global_trace.data[cpu]->disabled); 2669 atomic_inc(&global_trace.data[cpu]->disabled);
2670 ring_buffer_record_disable_cpu(global_trace.buffer, cpu);
2651 } 2671 }
2652 if (!cpumask_test_cpu(cpu, tracing_cpumask) && 2672 if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
2653 cpumask_test_cpu(cpu, tracing_cpumask_new)) { 2673 cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2654 atomic_dec(&global_trace.data[cpu]->disabled); 2674 atomic_dec(&global_trace.data[cpu]->disabled);
2675 ring_buffer_record_enable_cpu(global_trace.buffer, cpu);
2655 } 2676 }
2656 } 2677 }
2657 arch_spin_unlock(&ftrace_max_lock); 2678 arch_spin_unlock(&ftrace_max_lock);
@@ -2974,7 +2995,14 @@ int tracer_init(struct tracer *t, struct trace_array *tr)
2974 return t->init(tr); 2995 return t->init(tr);
2975} 2996}
2976 2997
2977static int __tracing_resize_ring_buffer(unsigned long size) 2998static void set_buffer_entries(struct trace_array *tr, unsigned long val)
2999{
3000 int cpu;
3001 for_each_tracing_cpu(cpu)
3002 tr->data[cpu]->entries = val;
3003}
3004
3005static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
2978{ 3006{
2979 int ret; 3007 int ret;
2980 3008
@@ -2985,19 +3013,32 @@ static int __tracing_resize_ring_buffer(unsigned long size)
2985 */ 3013 */
2986 ring_buffer_expanded = 1; 3014 ring_buffer_expanded = 1;
2987 3015
2988 ret = ring_buffer_resize(global_trace.buffer, size); 3016 ret = ring_buffer_resize(global_trace.buffer, size, cpu);
2989 if (ret < 0) 3017 if (ret < 0)
2990 return ret; 3018 return ret;
2991 3019
2992 if (!current_trace->use_max_tr) 3020 if (!current_trace->use_max_tr)
2993 goto out; 3021 goto out;
2994 3022
2995 ret = ring_buffer_resize(max_tr.buffer, size); 3023 ret = ring_buffer_resize(max_tr.buffer, size, cpu);
2996 if (ret < 0) { 3024 if (ret < 0) {
2997 int r; 3025 int r = 0;
3026
3027 if (cpu == RING_BUFFER_ALL_CPUS) {
3028 int i;
3029 for_each_tracing_cpu(i) {
3030 r = ring_buffer_resize(global_trace.buffer,
3031 global_trace.data[i]->entries,
3032 i);
3033 if (r < 0)
3034 break;
3035 }
3036 } else {
3037 r = ring_buffer_resize(global_trace.buffer,
3038 global_trace.data[cpu]->entries,
3039 cpu);
3040 }
2998 3041
2999 r = ring_buffer_resize(global_trace.buffer,
3000 global_trace.entries);
3001 if (r < 0) { 3042 if (r < 0) {
3002 /* 3043 /*
3003 * AARGH! We are left with different 3044 * AARGH! We are left with different
@@ -3019,43 +3060,39 @@ static int __tracing_resize_ring_buffer(unsigned long size)
3019 return ret; 3060 return ret;
3020 } 3061 }
3021 3062
3022 max_tr.entries = size; 3063 if (cpu == RING_BUFFER_ALL_CPUS)
3064 set_buffer_entries(&max_tr, size);
3065 else
3066 max_tr.data[cpu]->entries = size;
3067
3023 out: 3068 out:
3024 global_trace.entries = size; 3069 if (cpu == RING_BUFFER_ALL_CPUS)
3070 set_buffer_entries(&global_trace, size);
3071 else
3072 global_trace.data[cpu]->entries = size;
3025 3073
3026 return ret; 3074 return ret;
3027} 3075}
3028 3076
3029static ssize_t tracing_resize_ring_buffer(unsigned long size) 3077static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id)
3030{ 3078{
3031 int cpu, ret = size; 3079 int ret = size;
3032 3080
3033 mutex_lock(&trace_types_lock); 3081 mutex_lock(&trace_types_lock);
3034 3082
3035 tracing_stop(); 3083 if (cpu_id != RING_BUFFER_ALL_CPUS) {
3036 3084 /* make sure this cpu is enabled in the mask */
3037 /* disable all cpu buffers */ 3085 if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) {
3038 for_each_tracing_cpu(cpu) { 3086 ret = -EINVAL;
3039 if (global_trace.data[cpu]) 3087 goto out;
3040 atomic_inc(&global_trace.data[cpu]->disabled); 3088 }
3041 if (max_tr.data[cpu])
3042 atomic_inc(&max_tr.data[cpu]->disabled);
3043 } 3089 }
3044 3090
3045 if (size != global_trace.entries) 3091 ret = __tracing_resize_ring_buffer(size, cpu_id);
3046 ret = __tracing_resize_ring_buffer(size);
3047
3048 if (ret < 0) 3092 if (ret < 0)
3049 ret = -ENOMEM; 3093 ret = -ENOMEM;
3050 3094
3051 for_each_tracing_cpu(cpu) { 3095out:
3052 if (global_trace.data[cpu])
3053 atomic_dec(&global_trace.data[cpu]->disabled);
3054 if (max_tr.data[cpu])
3055 atomic_dec(&max_tr.data[cpu]->disabled);
3056 }
3057
3058 tracing_start();
3059 mutex_unlock(&trace_types_lock); 3096 mutex_unlock(&trace_types_lock);
3060 3097
3061 return ret; 3098 return ret;
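With the resize path now taking a CPU argument, __tracing_resize_ring_buffer() has to keep global_trace and max_tr the same size per CPU: if the max_tr resize fails after the main buffer has already grown, the main buffer is resized back so the pair never end up with different entry counts. A rough userspace sketch of that keep-in-sync-or-roll-back rule follows; try_resize(), NR_CPUS, the limits and the sizes are invented for the example and are not kernel API.

#include <stdio.h>
#include <errno.h>

#define NR_CPUS  4
#define ALL_CPUS -1			/* stands in for RING_BUFFER_ALL_CPUS */
#define MAIN_LIMIT (1L << 20)
#define SNAP_LIMIT 2048			/* pretend the snapshot pool is smaller */

static long main_entries[NR_CPUS];	/* global_trace side */
static long snap_entries[NR_CPUS];	/* max_tr side */

/* Stand-in for ring_buffer_resize(): resize one CPU or all of them. */
static int try_resize(long *entries, long size, int cpu, long limit)
{
	if (size > limit)
		return -ENOMEM;
	if (cpu == ALL_CPUS) {
		for (int i = 0; i < NR_CPUS; i++)
			entries[i] = size;
	} else {
		entries[cpu] = size;
	}
	return 0;
}

static int resize_both(long size, int cpu)
{
	int ret = try_resize(main_entries, size, cpu, MAIN_LIMIT);
	if (ret < 0)
		return ret;

	ret = try_resize(snap_entries, size, cpu, SNAP_LIMIT);
	if (ret < 0) {
		/*
		 * The snapshot side kept its old sizes (it failed before
		 * touching anything), so shrink the main buffer back to
		 * match it, CPU by CPU, as the patch does.
		 */
		if (cpu == ALL_CPUS) {
			for (int i = 0; i < NR_CPUS; i++)
				try_resize(main_entries, snap_entries[i], i, MAIN_LIMIT);
		} else {
			try_resize(main_entries, snap_entries[cpu], cpu, MAIN_LIMIT);
		}
		return ret;
	}
	return 0;
}

int main(void)
{
	for (int i = 0; i < NR_CPUS; i++)
		main_entries[i] = snap_entries[i] = 1408;

	printf("grow all to 2048: %d  (cpu0 main=%ld snap=%ld)\n",
	       resize_both(2048, ALL_CPUS), main_entries[0], snap_entries[0]);
	printf("grow cpu 2 to 4096: %d  (cpu2 main=%ld snap=%ld)\n",
	       resize_both(4096, 2), main_entries[2], snap_entries[2]);
	return 0;
}

The second call fails on the snapshot side, rolls the main buffer on CPU 2 back to 2048, and both arrays stay identical, which is the invariant the "AARGH!" comment in the patch is protecting.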
@@ -3078,7 +3115,8 @@ int tracing_update_buffers(void)
3078 3115
3079 mutex_lock(&trace_types_lock); 3116 mutex_lock(&trace_types_lock);
3080 if (!ring_buffer_expanded) 3117 if (!ring_buffer_expanded)
3081 ret = __tracing_resize_ring_buffer(trace_buf_size); 3118 ret = __tracing_resize_ring_buffer(trace_buf_size,
3119 RING_BUFFER_ALL_CPUS);
3082 mutex_unlock(&trace_types_lock); 3120 mutex_unlock(&trace_types_lock);
3083 3121
3084 return ret; 3122 return ret;
@@ -3102,7 +3140,8 @@ static int tracing_set_tracer(const char *buf)
3102 mutex_lock(&trace_types_lock); 3140 mutex_lock(&trace_types_lock);
3103 3141
3104 if (!ring_buffer_expanded) { 3142 if (!ring_buffer_expanded) {
3105 ret = __tracing_resize_ring_buffer(trace_buf_size); 3143 ret = __tracing_resize_ring_buffer(trace_buf_size,
3144 RING_BUFFER_ALL_CPUS);
3106 if (ret < 0) 3145 if (ret < 0)
3107 goto out; 3146 goto out;
3108 ret = 0; 3147 ret = 0;
@@ -3128,8 +3167,8 @@ static int tracing_set_tracer(const char *buf)
3128 * The max_tr ring buffer has some state (e.g. ring->clock) and 3167 * The max_tr ring buffer has some state (e.g. ring->clock) and
3129 * we want to preserve it. 3168 * we want to preserve it.
3130 */ 3169 */
3131 ring_buffer_resize(max_tr.buffer, 1); 3170 ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);
3132 max_tr.entries = 1; 3171 set_buffer_entries(&max_tr, 1);
3133 } 3172 }
3134 destroy_trace_option_files(topts); 3173 destroy_trace_option_files(topts);
3135 3174
@@ -3137,10 +3176,17 @@ static int tracing_set_tracer(const char *buf)
3137 3176
3138 topts = create_trace_option_files(current_trace); 3177 topts = create_trace_option_files(current_trace);
3139 if (current_trace->use_max_tr) { 3178 if (current_trace->use_max_tr) {
3140 ret = ring_buffer_resize(max_tr.buffer, global_trace.entries); 3179 int cpu;
3141 if (ret < 0) 3180 /* we need to make per cpu buffer sizes equivalent */
3142 goto out; 3181 for_each_tracing_cpu(cpu) {
3143 max_tr.entries = global_trace.entries; 3182 ret = ring_buffer_resize(max_tr.buffer,
3183 global_trace.data[cpu]->entries,
3184 cpu);
3185 if (ret < 0)
3186 goto out;
3187 max_tr.data[cpu]->entries =
3188 global_trace.data[cpu]->entries;
3189 }
3144 } 3190 }
3145 3191
3146 if (t->init) { 3192 if (t->init) {
@@ -3642,30 +3688,82 @@ out_err:
3642 goto out; 3688 goto out;
3643} 3689}
3644 3690
3691struct ftrace_entries_info {
3692 struct trace_array *tr;
3693 int cpu;
3694};
3695
3696static int tracing_entries_open(struct inode *inode, struct file *filp)
3697{
3698 struct ftrace_entries_info *info;
3699
3700 if (tracing_disabled)
3701 return -ENODEV;
3702
3703 info = kzalloc(sizeof(*info), GFP_KERNEL);
3704 if (!info)
3705 return -ENOMEM;
3706
3707 info->tr = &global_trace;
3708 info->cpu = (unsigned long)inode->i_private;
3709
3710 filp->private_data = info;
3711
3712 return 0;
3713}
3714
3645static ssize_t 3715static ssize_t
3646tracing_entries_read(struct file *filp, char __user *ubuf, 3716tracing_entries_read(struct file *filp, char __user *ubuf,
3647 size_t cnt, loff_t *ppos) 3717 size_t cnt, loff_t *ppos)
3648{ 3718{
3649 struct trace_array *tr = filp->private_data; 3719 struct ftrace_entries_info *info = filp->private_data;
3650 char buf[96]; 3720 struct trace_array *tr = info->tr;
3651 int r; 3721 char buf[64];
3722 int r = 0;
3723 ssize_t ret;
3652 3724
3653 mutex_lock(&trace_types_lock); 3725 mutex_lock(&trace_types_lock);
3654 if (!ring_buffer_expanded) 3726
3655 r = sprintf(buf, "%lu (expanded: %lu)\n", 3727 if (info->cpu == RING_BUFFER_ALL_CPUS) {
3656 tr->entries >> 10, 3728 int cpu, buf_size_same;
3657 trace_buf_size >> 10); 3729 unsigned long size;
3658 else 3730
3659 r = sprintf(buf, "%lu\n", tr->entries >> 10); 3731 size = 0;
3732 buf_size_same = 1;
3733 /* check if all cpu sizes are the same */
3734 for_each_tracing_cpu(cpu) {
3735 /* fill in the size from first enabled cpu */
3736 if (size == 0)
3737 size = tr->data[cpu]->entries;
3738 if (size != tr->data[cpu]->entries) {
3739 buf_size_same = 0;
3740 break;
3741 }
3742 }
3743
3744 if (buf_size_same) {
3745 if (!ring_buffer_expanded)
3746 r = sprintf(buf, "%lu (expanded: %lu)\n",
3747 size >> 10,
3748 trace_buf_size >> 10);
3749 else
3750 r = sprintf(buf, "%lu\n", size >> 10);
3751 } else
3752 r = sprintf(buf, "X\n");
3753 } else
3754 r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10);
3755
3660 mutex_unlock(&trace_types_lock); 3756 mutex_unlock(&trace_types_lock);
3661 3757
3662 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3758 ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3759 return ret;
3663} 3760}
3664 3761
3665static ssize_t 3762static ssize_t
3666tracing_entries_write(struct file *filp, const char __user *ubuf, 3763tracing_entries_write(struct file *filp, const char __user *ubuf,
3667 size_t cnt, loff_t *ppos) 3764 size_t cnt, loff_t *ppos)
3668{ 3765{
3766 struct ftrace_entries_info *info = filp->private_data;
3669 unsigned long val; 3767 unsigned long val;
3670 int ret; 3768 int ret;
3671 3769
@@ -3680,7 +3778,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3680 /* value is in KB */ 3778 /* value is in KB */
3681 val <<= 10; 3779 val <<= 10;
3682 3780
3683 ret = tracing_resize_ring_buffer(val); 3781 ret = tracing_resize_ring_buffer(val, info->cpu);
3684 if (ret < 0) 3782 if (ret < 0)
3685 return ret; 3783 return ret;
3686 3784
@@ -3689,6 +3787,16 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3689 return cnt; 3787 return cnt;
3690} 3788}
3691 3789
3790static int
3791tracing_entries_release(struct inode *inode, struct file *filp)
3792{
3793 struct ftrace_entries_info *info = filp->private_data;
3794
3795 kfree(info);
3796
3797 return 0;
3798}
3799
3692static ssize_t 3800static ssize_t
3693tracing_total_entries_read(struct file *filp, char __user *ubuf, 3801tracing_total_entries_read(struct file *filp, char __user *ubuf,
3694 size_t cnt, loff_t *ppos) 3802 size_t cnt, loff_t *ppos)
@@ -3700,7 +3808,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
3700 3808
3701 mutex_lock(&trace_types_lock); 3809 mutex_lock(&trace_types_lock);
3702 for_each_tracing_cpu(cpu) { 3810 for_each_tracing_cpu(cpu) {
3703 size += tr->entries >> 10; 3811 size += tr->data[cpu]->entries >> 10;
3704 if (!ring_buffer_expanded) 3812 if (!ring_buffer_expanded)
3705 expanded_size += trace_buf_size >> 10; 3813 expanded_size += trace_buf_size >> 10;
3706 } 3814 }
@@ -3734,7 +3842,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
3734 if (trace_flags & TRACE_ITER_STOP_ON_FREE) 3842 if (trace_flags & TRACE_ITER_STOP_ON_FREE)
3735 tracing_off(); 3843 tracing_off();
3736 /* resize the ring buffer to 0 */ 3844 /* resize the ring buffer to 0 */
3737 tracing_resize_ring_buffer(0); 3845 tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS);
3738 3846
3739 return 0; 3847 return 0;
3740} 3848}
@@ -3749,14 +3857,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3749 struct print_entry *entry; 3857 struct print_entry *entry;
3750 unsigned long irq_flags; 3858 unsigned long irq_flags;
3751 struct page *pages[2]; 3859 struct page *pages[2];
3860 void *map_page[2];
3752 int nr_pages = 1; 3861 int nr_pages = 1;
3753 ssize_t written; 3862 ssize_t written;
3754 void *page1;
3755 void *page2;
3756 int offset; 3863 int offset;
3757 int size; 3864 int size;
3758 int len; 3865 int len;
3759 int ret; 3866 int ret;
3867 int i;
3760 3868
3761 if (tracing_disabled) 3869 if (tracing_disabled)
3762 return -EINVAL; 3870 return -EINVAL;
@@ -3795,9 +3903,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3795 goto out; 3903 goto out;
3796 } 3904 }
3797 3905
3798 page1 = kmap_atomic(pages[0]); 3906 for (i = 0; i < nr_pages; i++)
3799 if (nr_pages == 2) 3907 map_page[i] = kmap_atomic(pages[i]);
3800 page2 = kmap_atomic(pages[1]);
3801 3908
3802 local_save_flags(irq_flags); 3909 local_save_flags(irq_flags);
3803 size = sizeof(*entry) + cnt + 2; /* possible \n added */ 3910 size = sizeof(*entry) + cnt + 2; /* possible \n added */
@@ -3815,10 +3922,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3815 3922
3816 if (nr_pages == 2) { 3923 if (nr_pages == 2) {
3817 len = PAGE_SIZE - offset; 3924 len = PAGE_SIZE - offset;
3818 memcpy(&entry->buf, page1 + offset, len); 3925 memcpy(&entry->buf, map_page[0] + offset, len);
3819 memcpy(&entry->buf[len], page2, cnt - len); 3926 memcpy(&entry->buf[len], map_page[1], cnt - len);
3820 } else 3927 } else
3821 memcpy(&entry->buf, page1 + offset, cnt); 3928 memcpy(&entry->buf, map_page[0] + offset, cnt);
3822 3929
3823 if (entry->buf[cnt - 1] != '\n') { 3930 if (entry->buf[cnt - 1] != '\n') {
3824 entry->buf[cnt] = '\n'; 3931 entry->buf[cnt] = '\n';
@@ -3833,11 +3940,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3833 *fpos += written; 3940 *fpos += written;
3834 3941
3835 out_unlock: 3942 out_unlock:
3836 if (nr_pages == 2) 3943 for (i = 0; i < nr_pages; i++) {
3837 kunmap_atomic(page2); 3944 kunmap_atomic(map_page[i]);
3838 kunmap_atomic(page1); 3945 put_page(pages[i]);
3839 while (nr_pages > 0) 3946 }
3840 put_page(pages[--nr_pages]);
3841 out: 3947 out:
3842 return written; 3948 return written;
3843} 3949}
@@ -3933,9 +4039,10 @@ static const struct file_operations tracing_pipe_fops = {
3933}; 4039};
3934 4040
3935static const struct file_operations tracing_entries_fops = { 4041static const struct file_operations tracing_entries_fops = {
3936 .open = tracing_open_generic, 4042 .open = tracing_entries_open,
3937 .read = tracing_entries_read, 4043 .read = tracing_entries_read,
3938 .write = tracing_entries_write, 4044 .write = tracing_entries_write,
4045 .release = tracing_entries_release,
3939 .llseek = generic_file_llseek, 4046 .llseek = generic_file_llseek,
3940}; 4047};
3941 4048
@@ -4367,6 +4474,9 @@ static void tracing_init_debugfs_percpu(long cpu)
4367 struct dentry *d_cpu; 4474 struct dentry *d_cpu;
4368 char cpu_dir[30]; /* 30 characters should be more than enough */ 4475 char cpu_dir[30]; /* 30 characters should be more than enough */
4369 4476
4477 if (!d_percpu)
4478 return;
4479
4370 snprintf(cpu_dir, 30, "cpu%ld", cpu); 4480 snprintf(cpu_dir, 30, "cpu%ld", cpu);
4371 d_cpu = debugfs_create_dir(cpu_dir, d_percpu); 4481 d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
4372 if (!d_cpu) { 4482 if (!d_cpu) {
@@ -4387,6 +4497,9 @@ static void tracing_init_debugfs_percpu(long cpu)
4387 4497
4388 trace_create_file("stats", 0444, d_cpu, 4498 trace_create_file("stats", 0444, d_cpu,
4389 (void *) cpu, &tracing_stats_fops); 4499 (void *) cpu, &tracing_stats_fops);
4500
4501 trace_create_file("buffer_size_kb", 0444, d_cpu,
4502 (void *) cpu, &tracing_entries_fops);
4390} 4503}
4391 4504
4392#ifdef CONFIG_FTRACE_SELFTEST 4505#ifdef CONFIG_FTRACE_SELFTEST
@@ -4629,7 +4742,8 @@ static ssize_t
4629rb_simple_read(struct file *filp, char __user *ubuf, 4742rb_simple_read(struct file *filp, char __user *ubuf,
4630 size_t cnt, loff_t *ppos) 4743 size_t cnt, loff_t *ppos)
4631{ 4744{
4632 struct ring_buffer *buffer = filp->private_data; 4745 struct trace_array *tr = filp->private_data;
4746 struct ring_buffer *buffer = tr->buffer;
4633 char buf[64]; 4747 char buf[64];
4634 int r; 4748 int r;
4635 4749
@@ -4647,7 +4761,8 @@ static ssize_t
4647rb_simple_write(struct file *filp, const char __user *ubuf, 4761rb_simple_write(struct file *filp, const char __user *ubuf,
4648 size_t cnt, loff_t *ppos) 4762 size_t cnt, loff_t *ppos)
4649{ 4763{
4650 struct ring_buffer *buffer = filp->private_data; 4764 struct trace_array *tr = filp->private_data;
4765 struct ring_buffer *buffer = tr->buffer;
4651 unsigned long val; 4766 unsigned long val;
4652 int ret; 4767 int ret;
4653 4768
@@ -4716,7 +4831,7 @@ static __init int tracer_init_debugfs(void)
4716 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); 4831 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
4717 4832
4718 trace_create_file("buffer_size_kb", 0644, d_tracer, 4833 trace_create_file("buffer_size_kb", 0644, d_tracer,
4719 &global_trace, &tracing_entries_fops); 4834 (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops);
4720 4835
4721 trace_create_file("buffer_total_size_kb", 0444, d_tracer, 4836 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
4722 &global_trace, &tracing_total_entries_fops); 4837 &global_trace, &tracing_total_entries_fops);
@@ -4734,7 +4849,7 @@ static __init int tracer_init_debugfs(void)
4734 &trace_clock_fops); 4849 &trace_clock_fops);
4735 4850
4736 trace_create_file("tracing_on", 0644, d_tracer, 4851 trace_create_file("tracing_on", 0644, d_tracer,
4737 global_trace.buffer, &rb_simple_fops); 4852 &global_trace, &rb_simple_fops);
4738 4853
4739#ifdef CONFIG_DYNAMIC_FTRACE 4854#ifdef CONFIG_DYNAMIC_FTRACE
4740 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4855 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
@@ -4955,6 +5070,10 @@ __init static int tracer_alloc_buffers(void)
4955 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 5070 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
4956 goto out_free_buffer_mask; 5071 goto out_free_buffer_mask;
4957 5072
5073 /* Only allocate trace_printk buffers if a trace_printk exists */
5074 if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt)
5075 trace_printk_init_buffers();
5076
4958 /* To save memory, keep the ring buffer size to its minimum */ 5077 /* To save memory, keep the ring buffer size to its minimum */
4959 if (ring_buffer_expanded) 5078 if (ring_buffer_expanded)
4960 ring_buf_size = trace_buf_size; 5079 ring_buf_size = trace_buf_size;
@@ -4973,7 +5092,6 @@ __init static int tracer_alloc_buffers(void)
4973 WARN_ON(1); 5092 WARN_ON(1);
4974 goto out_free_cpumask; 5093 goto out_free_cpumask;
4975 } 5094 }
4976 global_trace.entries = ring_buffer_size(global_trace.buffer);
4977 if (global_trace.buffer_disabled) 5095 if (global_trace.buffer_disabled)
4978 tracing_off(); 5096 tracing_off();
4979 5097
@@ -4986,7 +5104,6 @@ __init static int tracer_alloc_buffers(void)
4986 ring_buffer_free(global_trace.buffer); 5104 ring_buffer_free(global_trace.buffer);
4987 goto out_free_cpumask; 5105 goto out_free_cpumask;
4988 } 5106 }
4989 max_tr.entries = 1;
4990#endif 5107#endif
4991 5108
4992 /* Allocate the first page for all buffers */ 5109 /* Allocate the first page for all buffers */
@@ -4995,6 +5112,12 @@ __init static int tracer_alloc_buffers(void)
4995 max_tr.data[i] = &per_cpu(max_tr_data, i); 5112 max_tr.data[i] = &per_cpu(max_tr_data, i);
4996 } 5113 }
4997 5114
5115 set_buffer_entries(&global_trace,
5116 ring_buffer_size(global_trace.buffer, 0));
5117#ifdef CONFIG_TRACER_MAX_TRACE
5118 set_buffer_entries(&max_tr, 1);
5119#endif
5120
4998 trace_init_cmdlines(); 5121 trace_init_cmdlines();
4999 5122
5000 register_tracer(&nop_trace); 5123 register_tracer(&nop_trace);
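The trace.c changes above expose buffer_size_kb both at the top level (bound to RING_BUFFER_ALL_CPUS) and under per_cpu/cpuN/, so the global read has to cope with CPUs that were resized individually: it prints one number when every CPU agrees and "X" otherwise, as in the rewritten tracing_entries_read(). A small userspace sketch of that reporting rule follows; report_buffer_size_kb() and the sample sizes are invented for illustration.

#include <stdio.h>

/*
 * Report the ring-buffer size the way the patched tracing_entries_read()
 * does: a single number (in KB) when every CPU agrees, "X" otherwise.
 * Per-CPU entry counts are given in bytes.
 */
static void report_buffer_size_kb(const unsigned long *entries, int nr_cpus)
{
	unsigned long size = 0;
	int same = 1;

	for (int cpu = 0; cpu < nr_cpus; cpu++) {
		if (size == 0)
			size = entries[cpu];	/* take the first CPU's size */
		if (entries[cpu] != size) {
			same = 0;
			break;
		}
	}

	if (same)
		printf("%lu\n", size >> 10);	/* KB, as the debugfs file prints */
	else
		printf("X\n");
}

int main(void)
{
	unsigned long even[]   = { 1441792, 1441792, 1441792, 1441792 };
	unsigned long uneven[] = { 1441792, 4194304, 1441792, 1441792 };

	report_buffer_size_kb(even, 4);		/* -> 1408 */
	report_buffer_size_kb(uneven, 4);	/* -> X   */
	return 0;
}

Reads on a per-CPU file skip the comparison entirely and print that CPU's own size, which is why the open path records info->cpu.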
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 95059f091a24..6c6f7933eede 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -131,6 +131,7 @@ struct trace_array_cpu {
131 atomic_t disabled; 131 atomic_t disabled;
132 void *buffer_page; /* ring buffer spare */ 132 void *buffer_page; /* ring buffer spare */
133 133
134 unsigned long entries;
134 unsigned long saved_latency; 135 unsigned long saved_latency;
135 unsigned long critical_start; 136 unsigned long critical_start;
136 unsigned long critical_end; 137 unsigned long critical_end;
@@ -152,7 +153,6 @@ struct trace_array_cpu {
152 */ 153 */
153struct trace_array { 154struct trace_array {
154 struct ring_buffer *buffer; 155 struct ring_buffer *buffer;
155 unsigned long entries;
156 int cpu; 156 int cpu;
157 int buffer_disabled; 157 int buffer_disabled;
158 cycle_t time_start; 158 cycle_t time_start;
@@ -826,6 +826,8 @@ extern struct list_head ftrace_events;
826extern const char *__start___trace_bprintk_fmt[]; 826extern const char *__start___trace_bprintk_fmt[];
827extern const char *__stop___trace_bprintk_fmt[]; 827extern const char *__stop___trace_bprintk_fmt[];
828 828
829void trace_printk_init_buffers(void);
830
829#undef FTRACE_ENTRY 831#undef FTRACE_ENTRY
830#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ 832#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
831 extern struct ftrace_event_call \ 833 extern struct ftrace_event_call \
@@ -836,11 +838,11 @@ extern const char *__stop___trace_bprintk_fmt[];
836 filter) 838 filter)
837#include "trace_entries.h" 839#include "trace_entries.h"
838 840
839#ifdef CONFIG_FUNCTION_TRACER 841#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER)
840int perf_ftrace_event_register(struct ftrace_event_call *call, 842int perf_ftrace_event_register(struct ftrace_event_call *call,
841 enum trace_reg type, void *data); 843 enum trace_reg type, void *data);
842#else 844#else
843#define perf_ftrace_event_register NULL 845#define perf_ftrace_event_register NULL
844#endif /* CONFIG_FUNCTION_TRACER */ 846#endif
845 847
846#endif /* _LINUX_KERNEL_TRACE_H */ 848#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 079a93ae8a9d..29111da1d100 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -294,6 +294,9 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
294 if (!call->name || !call->class || !call->class->reg) 294 if (!call->name || !call->class || !call->class->reg)
295 continue; 295 continue;
296 296
297 if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
298 continue;
299
297 if (match && 300 if (match &&
298 strcmp(match, call->name) != 0 && 301 strcmp(match, call->name) != 0 &&
299 strcmp(match, call->class->system) != 0) 302 strcmp(match, call->class->system) != 0)
@@ -1164,7 +1167,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
1164 return -1; 1167 return -1;
1165 } 1168 }
1166 1169
1167 if (call->class->reg) 1170 if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
1168 trace_create_file("enable", 0644, call->dir, call, 1171 trace_create_file("enable", 0644, call->dir, call,
1169 enable); 1172 enable);
1170 1173
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 3dd15e8bc856..e039906b037d 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -180,6 +180,7 @@ struct ftrace_event_call __used event_##call = { \
180 .event.type = etype, \ 180 .event.type = etype, \
181 .class = &event_class_ftrace_##call, \ 181 .class = &event_class_ftrace_##call, \
182 .print_fmt = print, \ 182 .print_fmt = print, \
183 .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \
183}; \ 184}; \
184struct ftrace_event_call __used \ 185struct ftrace_event_call __used \
185__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; 186__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 859fae6b1825..df611a0e76c5 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -652,6 +652,8 @@ int trace_print_lat_context(struct trace_iterator *iter)
652{ 652{
653 u64 next_ts; 653 u64 next_ts;
654 int ret; 654 int ret;
655 /* trace_find_next_entry will reset ent_size */
656 int ent_size = iter->ent_size;
655 struct trace_seq *s = &iter->seq; 657 struct trace_seq *s = &iter->seq;
656 struct trace_entry *entry = iter->ent, 658 struct trace_entry *entry = iter->ent,
657 *next_entry = trace_find_next_entry(iter, NULL, 659 *next_entry = trace_find_next_entry(iter, NULL,
@@ -660,6 +662,9 @@ int trace_print_lat_context(struct trace_iterator *iter)
660 unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); 662 unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
661 unsigned long rel_usecs; 663 unsigned long rel_usecs;
662 664
665 /* Restore the original ent_size */
666 iter->ent_size = ent_size;
667
663 if (!next_entry) 668 if (!next_entry)
664 next_ts = iter->ts; 669 next_ts = iter->ts;
665 rel_usecs = ns2usecs(next_ts - iter->ts); 670 rel_usecs = ns2usecs(next_ts - iter->ts);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 6fd4ffd042f9..a9077c1b4ad3 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -51,6 +51,10 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
51 const char **iter; 51 const char **iter;
52 char *fmt; 52 char *fmt;
53 53
54 /* allocate the trace_printk per cpu buffers */
55 if (start != end)
56 trace_printk_init_buffers();
57
54 mutex_lock(&btrace_mutex); 58 mutex_lock(&btrace_mutex);
55 for (iter = start; iter < end; iter++) { 59 for (iter = start; iter < end; iter++) {
56 struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); 60 struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
deleted file mode 100644
index 209b379a4721..000000000000
--- a/kernel/trace/trace_workqueue.c
+++ /dev/null
@@ -1,300 +0,0 @@
1/*
2 * Workqueue statistical tracer.
3 *
4 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 *
6 */
7
8
9#include <trace/events/workqueue.h>
10#include <linux/list.h>
11#include <linux/percpu.h>
12#include <linux/slab.h>
13#include <linux/kref.h>
14#include "trace_stat.h"
15#include "trace.h"
16
17
18/* A cpu workqueue thread */
19struct cpu_workqueue_stats {
20 struct list_head list;
21 struct kref kref;
22 int cpu;
23 pid_t pid;
24/* Can be inserted from interrupt or user context, need to be atomic */
25 atomic_t inserted;
26/*
27 * Don't need to be atomic, works are serialized in a single workqueue thread
28 * on a single CPU.
29 */
30 unsigned int executed;
31};
32
33/* List of workqueue threads on one cpu */
34struct workqueue_global_stats {
35 struct list_head list;
36 spinlock_t lock;
37};
38
39/* Don't need a global lock because allocated before the workqueues, and
40 * never freed.
41 */
42static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
43#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
44
45static void cpu_workqueue_stat_free(struct kref *kref)
46{
47 kfree(container_of(kref, struct cpu_workqueue_stats, kref));
48}
49
50/* Insertion of a work */
51static void
52probe_workqueue_insertion(void *ignore,
53 struct task_struct *wq_thread,
54 struct work_struct *work)
55{
56 int cpu = cpumask_first(&wq_thread->cpus_allowed);
57 struct cpu_workqueue_stats *node;
58 unsigned long flags;
59
60 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
61 list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
62 if (node->pid == wq_thread->pid) {
63 atomic_inc(&node->inserted);
64 goto found;
65 }
66 }
67 pr_debug("trace_workqueue: entry not found\n");
68found:
69 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
70}
71
72/* Execution of a work */
73static void
74probe_workqueue_execution(void *ignore,
75 struct task_struct *wq_thread,
76 struct work_struct *work)
77{
78 int cpu = cpumask_first(&wq_thread->cpus_allowed);
79 struct cpu_workqueue_stats *node;
80 unsigned long flags;
81
82 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
83 list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
84 if (node->pid == wq_thread->pid) {
85 node->executed++;
86 goto found;
87 }
88 }
89 pr_debug("trace_workqueue: entry not found\n");
90found:
91 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
92}
93
94/* Creation of a cpu workqueue thread */
95static void probe_workqueue_creation(void *ignore,
96 struct task_struct *wq_thread, int cpu)
97{
98 struct cpu_workqueue_stats *cws;
99 unsigned long flags;
100
101 WARN_ON(cpu < 0);
102
103 /* Workqueues are sometimes created in atomic context */
104 cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC);
105 if (!cws) {
106 pr_warning("trace_workqueue: not enough memory\n");
107 return;
108 }
109 INIT_LIST_HEAD(&cws->list);
110 kref_init(&cws->kref);
111 cws->cpu = cpu;
112 cws->pid = wq_thread->pid;
113
114 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
115 list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
116 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
117}
118
119/* Destruction of a cpu workqueue thread */
120static void
121probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread)
122{
123 /* Workqueue only execute on one cpu */
124 int cpu = cpumask_first(&wq_thread->cpus_allowed);
125 struct cpu_workqueue_stats *node, *next;
126 unsigned long flags;
127
128 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
129 list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
130 list) {
131 if (node->pid == wq_thread->pid) {
132 list_del(&node->list);
133 kref_put(&node->kref, cpu_workqueue_stat_free);
134 goto found;
135 }
136 }
137
138 pr_debug("trace_workqueue: don't find workqueue to destroy\n");
139found:
140 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
141
142}
143
144static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
145{
146 unsigned long flags;
147 struct cpu_workqueue_stats *ret = NULL;
148
149
150 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
151
152 if (!list_empty(&workqueue_cpu_stat(cpu)->list)) {
153 ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
154 struct cpu_workqueue_stats, list);
155 kref_get(&ret->kref);
156 }
157
158 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
159
160 return ret;
161}
162
163static void *workqueue_stat_start(struct tracer_stat *trace)
164{
165 int cpu;
166 void *ret = NULL;
167
168 for_each_possible_cpu(cpu) {
169 ret = workqueue_stat_start_cpu(cpu);
170 if (ret)
171 return ret;
172 }
173 return NULL;
174}
175
176static void *workqueue_stat_next(void *prev, int idx)
177{
178 struct cpu_workqueue_stats *prev_cws = prev;
179 struct cpu_workqueue_stats *ret;
180 int cpu = prev_cws->cpu;
181 unsigned long flags;
182
183 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
184 if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
185 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
186 do {
187 cpu = cpumask_next(cpu, cpu_possible_mask);
188 if (cpu >= nr_cpu_ids)
189 return NULL;
190 } while (!(ret = workqueue_stat_start_cpu(cpu)));
191 return ret;
192 } else {
193 ret = list_entry(prev_cws->list.next,
194 struct cpu_workqueue_stats, list);
195 kref_get(&ret->kref);
196 }
197 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
198
199 return ret;
200}
201
202static int workqueue_stat_show(struct seq_file *s, void *p)
203{
204 struct cpu_workqueue_stats *cws = p;
205 struct pid *pid;
206 struct task_struct *tsk;
207
208 pid = find_get_pid(cws->pid);
209 if (pid) {
210 tsk = get_pid_task(pid, PIDTYPE_PID);
211 if (tsk) {
212 seq_printf(s, "%3d %6d %6u %s\n", cws->cpu,
213 atomic_read(&cws->inserted), cws->executed,
214 tsk->comm);
215 put_task_struct(tsk);
216 }
217 put_pid(pid);
218 }
219
220 return 0;
221}
222
223static void workqueue_stat_release(void *stat)
224{
225 struct cpu_workqueue_stats *node = stat;
226
227 kref_put(&node->kref, cpu_workqueue_stat_free);
228}
229
230static int workqueue_stat_headers(struct seq_file *s)
231{
232 seq_printf(s, "# CPU INSERTED EXECUTED NAME\n");
233 seq_printf(s, "# | | | |\n");
234 return 0;
235}
236
237struct tracer_stat workqueue_stats __read_mostly = {
238 .name = "workqueues",
239 .stat_start = workqueue_stat_start,
240 .stat_next = workqueue_stat_next,
241 .stat_show = workqueue_stat_show,
242 .stat_release = workqueue_stat_release,
243 .stat_headers = workqueue_stat_headers
244};
245
246
247int __init stat_workqueue_init(void)
248{
249 if (register_stat_tracer(&workqueue_stats)) {
250 pr_warning("Unable to register workqueue stat tracer\n");
251 return 1;
252 }
253
254 return 0;
255}
256fs_initcall(stat_workqueue_init);
257
258/*
259 * Workqueues are created very early, just after pre-smp initcalls.
260 * So we must register our tracepoints at this stage.
261 */
262int __init trace_workqueue_early_init(void)
263{
264 int ret, cpu;
265
266 for_each_possible_cpu(cpu) {
267 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
268 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
269 }
270
271 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
272 if (ret)
273 goto out;
274
275 ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL);
276 if (ret)
277 goto no_insertion;
278
279 ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL);
280 if (ret)
281 goto no_execution;
282
283 ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL);
284 if (ret)
285 goto no_creation;
286
287 return 0;
288
289no_creation:
290 unregister_trace_workqueue_creation(probe_workqueue_creation, NULL);
291no_execution:
292 unregister_trace_workqueue_execution(probe_workqueue_execution, NULL);
293no_insertion:
294 unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
295out:
296 pr_warning("trace_workqueue: unable to trace workqueues\n");
297
298 return 1;
299}
300early_initcall(trace_workqueue_early_init);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5abf42f63c08..9a3128dc67df 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1032,7 +1032,10 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1032 cwq = get_cwq(gcwq->cpu, wq); 1032 cwq = get_cwq(gcwq->cpu, wq);
1033 trace_workqueue_queue_work(cpu, cwq, work); 1033 trace_workqueue_queue_work(cpu, cwq, work);
1034 1034
1035 BUG_ON(!list_empty(&work->entry)); 1035 if (WARN_ON(!list_empty(&work->entry))) {
1036 spin_unlock_irqrestore(&gcwq->lock, flags);
1037 return;
1038 }
1036 1039
1037 cwq->nr_in_flight[cwq->work_color]++; 1040 cwq->nr_in_flight[cwq->work_color]++;
1038 work_flags = work_color_to_flags(cwq->work_color); 1041 work_flags = work_color_to_flags(cwq->work_color);
@@ -1210,8 +1213,13 @@ static void worker_enter_idle(struct worker *worker)
1210 } else 1213 } else
1211 wake_up_all(&gcwq->trustee_wait); 1214 wake_up_all(&gcwq->trustee_wait);
1212 1215
1213 /* sanity check nr_running */ 1216 /*
1214 WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && 1217 * Sanity check nr_running. Because trustee releases gcwq->lock
1218 * between setting %WORKER_ROGUE and zapping nr_running, the
1219 * warning may trigger spuriously. Check iff trustee is idle.
1220 */
1221 WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE &&
1222 gcwq->nr_workers == gcwq->nr_idle &&
1215 atomic_read(get_gcwq_nr_running(gcwq->cpu))); 1223 atomic_read(get_gcwq_nr_running(gcwq->cpu)));
1216} 1224}
1217 1225
@@ -1810,7 +1818,9 @@ __acquires(&gcwq->lock)
1810 * lock freed" warnings as well as problems when looking into 1818 * lock freed" warnings as well as problems when looking into
1811 * work->lockdep_map, make a copy and use that here. 1819 * work->lockdep_map, make a copy and use that here.
1812 */ 1820 */
1813 struct lockdep_map lockdep_map = work->lockdep_map; 1821 struct lockdep_map lockdep_map;
1822
1823 lockdep_copy_map(&lockdep_map, &work->lockdep_map);
1814#endif 1824#endif
1815 /* 1825 /*
1816 * A single work shouldn't be executed concurrently by 1826 * A single work shouldn't be executed concurrently by
@@ -2506,6 +2516,9 @@ bool flush_work(struct work_struct *work)
2506{ 2516{
2507 struct wq_barrier barr; 2517 struct wq_barrier barr;
2508 2518
2519 lock_map_acquire(&work->lockdep_map);
2520 lock_map_release(&work->lockdep_map);
2521
2509 if (start_flush_work(work, &barr, true)) { 2522 if (start_flush_work(work, &barr, true)) {
2510 wait_for_completion(&barr.done); 2523 wait_for_completion(&barr.done);
2511 destroy_work_on_stack(&barr.work); 2524 destroy_work_on_stack(&barr.work);