path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile                 |    1
-rw-r--r--  kernel/auditsc.c                |    8
-rw-r--r--  kernel/cgroup.c                 |  564
-rw-r--r--  kernel/cgroup_freezer.c         |   11
-rw-r--r--  kernel/compat.c                 |   63
-rw-r--r--  kernel/cpu.c                    |   13
-rw-r--r--  kernel/cpuset.c                 |   31
-rw-r--r--  kernel/events/core.c            |   25
-rw-r--r--  kernel/fork.c                   |   75
-rw-r--r--  kernel/hung_task.c              |    4
-rw-r--r--  kernel/irq/chip.c               |    5
-rw-r--r--  kernel/irq/irqdesc.c            |    1
-rw-r--r--  kernel/irq/manage.c             |   46
-rw-r--r--  kernel/irq/pm.c                 |    7
-rw-r--r--  kernel/irq/resend.c             |    7
-rw-r--r--  kernel/module.c                 |    2
-rw-r--r--  kernel/params.c                 |   62
-rw-r--r--  kernel/printk.c                 | 1390
-rw-r--r--  kernel/rcupdate.c               |   28
-rw-r--r--  kernel/rcutiny_plugin.h         |   16
-rw-r--r--  kernel/rcutorture.c             |  257
-rw-r--r--  kernel/rcutree.c                |  332
-rw-r--r--  kernel/rcutree.h                |   23
-rw-r--r--  kernel/rcutree_plugin.h         |  154
-rw-r--r--  kernel/rcutree_trace.c          |    4
-rw-r--r--  kernel/res_counter.c            |   71
-rw-r--r--  kernel/sched/Makefile           |    2
-rw-r--r--  kernel/sched/core.c             |   28
-rw-r--r--  kernel/seccomp.c                |  458
-rw-r--r--  kernel/signal.c                 |    9
-rw-r--r--  kernel/smp.c                    |   27
-rw-r--r--  kernel/smpboot.c                |   62
-rw-r--r--  kernel/smpboot.h                |   18
-rw-r--r--  kernel/srcu.c                   |  548
-rw-r--r--  kernel/sys.c                    |   12
-rw-r--r--  kernel/time/alarmtimer.c        |    4
-rw-r--r--  kernel/timer.c                  |   12
-rw-r--r--  kernel/trace/Kconfig            |    1
-rw-r--r--  kernel/trace/Makefile           |    1
-rw-r--r--  kernel/trace/ftrace.c           |  242
-rw-r--r--  kernel/trace/ring_buffer.c      |  585
-rw-r--r--  kernel/trace/trace.c            |  503
-rw-r--r--  kernel/trace/trace.h            |    4
-rw-r--r--  kernel/trace/trace_events.c     |    5
-rw-r--r--  kernel/trace/trace_export.c     |    1
-rw-r--r--  kernel/trace/trace_printk.c     |    4
-rw-r--r--  kernel/trace/trace_workqueue.c  |  300
-rw-r--r--  kernel/workqueue.c              |   21
48 files changed, 4177 insertions(+), 1870 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index cb41b9547c9f..6c07f30fa9b7 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_SMP) += smpboot.o
 ifneq ($(CONFIG_SMP),y)
 obj-y += up.o
 endif
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index af1de0f34eae..4b96415527b8 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -67,6 +67,7 @@
 #include <linux/syscalls.h>
 #include <linux/capability.h>
 #include <linux/fs_struct.h>
+#include <linux/compat.h>
 
 #include "audit.h"
 
@@ -2710,13 +2711,16 @@ void audit_core_dumps(long signr)
 	audit_log_end(ab);
 }
 
-void __audit_seccomp(unsigned long syscall)
+void __audit_seccomp(unsigned long syscall, long signr, int code)
 {
 	struct audit_buffer *ab;
 
 	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
-	audit_log_abend(ab, "seccomp", SIGKILL);
+	audit_log_abend(ab, "seccomp", signr);
 	audit_log_format(ab, " syscall=%ld", syscall);
+	audit_log_format(ab, " compat=%d", is_compat_task());
+	audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
+	audit_log_format(ab, " code=0x%x", code);
 	audit_log_end(ab);
 }
 
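The hunk above widens __audit_seccomp() so the audit record carries the delivered signal and the seccomp return code, along with the compat flag and instruction pointer, instead of a hard-coded SIGKILL. A minimal sketch of a call site under the new signature follows; the wrapper name and the SIGSYS/SECCOMP_RET_KILL values are illustrative assumptions, not taken from this patch.

/*
 * Illustrative only: how a seccomp reject path might log an event through
 * the widened hook. The signal and return-code constants are assumptions.
 */
static void example_report_seccomp(unsigned long this_syscall)
{
	/* signr and code replace the old hard-coded SIGKILL */
	__audit_seccomp(this_syscall, SIGSYS, SECCOMP_RET_KILL);
}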
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ed64ccac67c9..ad8eae5bb801 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,9 +60,13 @@
 #include <linux/eventfd.h>
 #include <linux/poll.h>
 #include <linux/flex_array.h> /* used in cgroup_attach_proc */
+#include <linux/kthread.h>
 
 #include <linux/atomic.h>
 
+/* css deactivation bias, makes css->refcnt negative to deny new trygets */
+#define CSS_DEACT_BIAS		INT_MIN
+
 /*
  * cgroup_mutex is the master lock. Any modification to cgroup or its
  * hierarchy must be performed while holding it.
@@ -127,6 +131,9 @@ struct cgroupfs_root {
 	/* A list running through the active hierarchies */
 	struct list_head root_list;
 
+	/* All cgroups on this root, cgroup_mutex protected */
+	struct list_head allcg_list;
+
 	/* Hierarchy-specific flags */
 	unsigned long flags;
 
@@ -145,6 +152,15 @@ struct cgroupfs_root {
 static struct cgroupfs_root rootnode;
 
 /*
+ * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
+ */
+struct cfent {
+	struct list_head		node;
+	struct dentry			*dentry;
+	struct cftype			*type;
+};
+
+/*
  * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
  * cgroup_subsys->use_id != 0.
  */
@@ -239,6 +255,14 @@ int cgroup_lock_is_held(void)
 
 EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
 
+/* the current nr of refs, always >= 0 whether @css is deactivated or not */
+static int css_refcnt(struct cgroup_subsys_state *css)
+{
+	int v = atomic_read(&css->refcnt);
+
+	return v >= 0 ? v : v - CSS_DEACT_BIAS;
+}
+
 /* convenient tests for these bits */
 inline int cgroup_is_removed(const struct cgroup *cgrp)
 {
@@ -279,6 +303,21 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)
 #define for_each_active_root(_root) \
 list_for_each_entry(_root, &roots, root_list)
 
+static inline struct cgroup *__d_cgrp(struct dentry *dentry)
+{
+	return dentry->d_fsdata;
+}
+
+static inline struct cfent *__d_cfe(struct dentry *dentry)
+{
+	return dentry->d_fsdata;
+}
+
+static inline struct cftype *__d_cft(struct dentry *dentry)
+{
+	return __d_cfe(dentry)->type;
+}
+
 /* the list of cgroups eligible for automatic release. Protected by
  * release_list_lock */
 static LIST_HEAD(release_list);
@@ -816,12 +855,17 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
 	struct cgroup_subsys *ss;
 	int ret = 0;
 
-	for_each_subsys(cgrp->root, ss)
-		if (ss->pre_destroy) {
-			ret = ss->pre_destroy(cgrp);
-			if (ret)
-				break;
+	for_each_subsys(cgrp->root, ss) {
+		if (!ss->pre_destroy)
+			continue;
+
+		ret = ss->pre_destroy(cgrp);
+		if (ret) {
+			/* ->pre_destroy() failure is being deprecated */
+			WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
+			break;
 		}
+	}
 
 	return ret;
 }
@@ -864,6 +908,14 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 		BUG_ON(!list_empty(&cgrp->pidlists));
 
 		kfree_rcu(cgrp, rcu_head);
+	} else {
+		struct cfent *cfe = __d_cfe(dentry);
+		struct cgroup *cgrp = dentry->d_parent->d_fsdata;
+
+		WARN_ONCE(!list_empty(&cfe->node) &&
+			  cgrp != &cgrp->root->top_cgroup,
+			  "cfe still linked for %s\n", cfe->type->name);
+		kfree(cfe);
 	}
 	iput(inode);
 }
@@ -882,34 +934,36 @@ static void remove_dir(struct dentry *d)
 	dput(parent);
 }
 
-static void cgroup_clear_directory(struct dentry *dentry)
+static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 {
-	struct list_head *node;
+	struct cfent *cfe;
 
-	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
-	spin_lock(&dentry->d_lock);
-	node = dentry->d_subdirs.next;
-	while (node != &dentry->d_subdirs) {
-		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
+	lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
+	lockdep_assert_held(&cgroup_mutex);
+
+	list_for_each_entry(cfe, &cgrp->files, node) {
+		struct dentry *d = cfe->dentry;
 
-		spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
-		list_del_init(node);
-		if (d->d_inode) {
-			/* This should never be called on a cgroup
-			 * directory with child cgroups */
-			BUG_ON(d->d_inode->i_mode & S_IFDIR);
-			dget_dlock(d);
-			spin_unlock(&d->d_lock);
-			spin_unlock(&dentry->d_lock);
-			d_delete(d);
-			simple_unlink(dentry->d_inode, d);
-			dput(d);
-			spin_lock(&dentry->d_lock);
-		} else
-			spin_unlock(&d->d_lock);
-		node = dentry->d_subdirs.next;
+		if (cft && cfe->type != cft)
+			continue;
+
+		dget(d);
+		d_delete(d);
+		simple_unlink(d->d_inode, d);
+		list_del_init(&cfe->node);
+		dput(d);
+
+		return 0;
 	}
-	spin_unlock(&dentry->d_lock);
+	return -ENOENT;
+}
+
+static void cgroup_clear_directory(struct dentry *dir)
+{
+	struct cgroup *cgrp = __d_cgrp(dir);
+
+	while (!list_empty(&cgrp->files))
+		cgroup_rm_file(cgrp, NULL);
 }
 
 /*
@@ -1294,6 +1348,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	if (ret)
 		goto out_unlock;
 
+	/* See feature-removal-schedule.txt */
+	if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent)
+		pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
+			   task_tgid_nr(current), current->comm);
+
 	/* Don't allow flags or name to change at remount */
 	if (opts.flags != root->flags ||
 	    (opts.name && strcmp(opts.name, root->name))) {
@@ -1308,7 +1367,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 		goto out_unlock;
 	}
 
-	/* (re)populate subsystem files */
+	/* clear out any existing files and repopulate subsystem files */
+	cgroup_clear_directory(cgrp->dentry);
 	cgroup_populate_dir(cgrp);
 
 	if (opts.release_agent)
@@ -1333,6 +1393,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 {
 	INIT_LIST_HEAD(&cgrp->sibling);
 	INIT_LIST_HEAD(&cgrp->children);
+	INIT_LIST_HEAD(&cgrp->files);
 	INIT_LIST_HEAD(&cgrp->css_sets);
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
@@ -1344,11 +1405,14 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 static void init_cgroup_root(struct cgroupfs_root *root)
 {
 	struct cgroup *cgrp = &root->top_cgroup;
+
 	INIT_LIST_HEAD(&root->subsys_list);
 	INIT_LIST_HEAD(&root->root_list);
+	INIT_LIST_HEAD(&root->allcg_list);
 	root->number_of_cgroups = 1;
 	cgrp->root = root;
 	cgrp->top_cgroup = cgrp;
+	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 	init_cgroup_housekeeping(cgrp);
 }
 
@@ -1692,16 +1756,6 @@ static struct file_system_type cgroup_fs_type = {
 
 static struct kobject *cgroup_kobj;
 
-static inline struct cgroup *__d_cgrp(struct dentry *dentry)
-{
-	return dentry->d_fsdata;
-}
-
-static inline struct cftype *__d_cft(struct dentry *dentry)
-{
-	return dentry->d_fsdata;
-}
-
 /**
  * cgroup_path - generate the path of a cgroup
  * @cgrp: the cgroup in question
@@ -2172,6 +2226,18 @@ retry_find_task:
 
 	if (threadgroup)
 		tsk = tsk->group_leader;
+
+	/*
+	 * Workqueue threads may acquire PF_THREAD_BOUND and become
+	 * trapped in a cpuset, or RT worker may be born in a cgroup
+	 * with no rt_runtime allocated. Just say no.
+	 */
+	if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) {
+		ret = -EINVAL;
+		rcu_read_unlock();
+		goto out_unlock_cgroup;
+	}
+
 	get_task_struct(tsk);
 	rcu_read_unlock();
 
@@ -2603,50 +2669,191 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
 	return mode;
 }
 
-int cgroup_add_file(struct cgroup *cgrp,
-		    struct cgroup_subsys *subsys,
-		    const struct cftype *cft)
+static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+			   const struct cftype *cft)
 {
 	struct dentry *dir = cgrp->dentry;
+	struct cgroup *parent = __d_cgrp(dir);
 	struct dentry *dentry;
+	struct cfent *cfe;
 	int error;
 	umode_t mode;
-
 	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
+
+	/* does @cft->flags tell us to skip creation on @cgrp? */
+	if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
+		return 0;
+	if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
+		return 0;
+
 	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
 		strcpy(name, subsys->name);
 		strcat(name, ".");
 	}
 	strcat(name, cft->name);
+
 	BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
+
+	cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
+	if (!cfe)
+		return -ENOMEM;
+
 	dentry = lookup_one_len(name, dir, strlen(name));
-	if (!IS_ERR(dentry)) {
-		mode = cgroup_file_mode(cft);
-		error = cgroup_create_file(dentry, mode | S_IFREG,
-					   cgrp->root->sb);
-		if (!error)
-			dentry->d_fsdata = (void *)cft;
-		dput(dentry);
-	} else
+	if (IS_ERR(dentry)) {
 		error = PTR_ERR(dentry);
+		goto out;
+	}
+
+	mode = cgroup_file_mode(cft);
+	error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
+	if (!error) {
+		cfe->type = (void *)cft;
+		cfe->dentry = dentry;
+		dentry->d_fsdata = cfe;
+		list_add_tail(&cfe->node, &parent->files);
+		cfe = NULL;
+	}
+	dput(dentry);
+out:
+	kfree(cfe);
 	return error;
 }
-EXPORT_SYMBOL_GPL(cgroup_add_file);
 
-int cgroup_add_files(struct cgroup *cgrp,
-		     struct cgroup_subsys *subsys,
-		     const struct cftype cft[],
-		     int count)
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+			      const struct cftype cfts[], bool is_add)
 {
-	int i, err;
-	for (i = 0; i < count; i++) {
-		err = cgroup_add_file(cgrp, subsys, &cft[i]);
-		if (err)
-			return err;
+	const struct cftype *cft;
+	int err, ret = 0;
+
+	for (cft = cfts; cft->name[0] != '\0'; cft++) {
+		if (is_add)
+			err = cgroup_add_file(cgrp, subsys, cft);
+		else
+			err = cgroup_rm_file(cgrp, cft);
+		if (err) {
+			pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
+				   is_add ? "add" : "remove", cft->name, err);
+			ret = err;
+		}
+	}
+	return ret;
+}
+
+static DEFINE_MUTEX(cgroup_cft_mutex);
+
+static void cgroup_cfts_prepare(void)
+	__acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex)
+{
+	/*
+	 * Thanks to the entanglement with vfs inode locking, we can't walk
+	 * the existing cgroups under cgroup_mutex and create files.
+	 * Instead, we increment reference on all cgroups and build list of
+	 * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure
+	 * exclusive access to the field.
+	 */
+	mutex_lock(&cgroup_cft_mutex);
+	mutex_lock(&cgroup_mutex);
+}
+
+static void cgroup_cfts_commit(struct cgroup_subsys *ss,
+			       const struct cftype *cfts, bool is_add)
+	__releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
+{
+	LIST_HEAD(pending);
+	struct cgroup *cgrp, *n;
+
+	/* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
+	if (cfts && ss->root != &rootnode) {
+		list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) {
+			dget(cgrp->dentry);
+			list_add_tail(&cgrp->cft_q_node, &pending);
+		}
+	}
+
+	mutex_unlock(&cgroup_mutex);
+
+	/*
+	 * All new cgroups will see @cfts update on @ss->cftsets. Add/rm
+	 * files for all cgroups which were created before.
+	 */
+	list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) {
+		struct inode *inode = cgrp->dentry->d_inode;
+
+		mutex_lock(&inode->i_mutex);
+		mutex_lock(&cgroup_mutex);
+		if (!cgroup_is_removed(cgrp))
+			cgroup_addrm_files(cgrp, ss, cfts, is_add);
+		mutex_unlock(&cgroup_mutex);
+		mutex_unlock(&inode->i_mutex);
+
+		list_del_init(&cgrp->cft_q_node);
+		dput(cgrp->dentry);
 	}
+
+	mutex_unlock(&cgroup_cft_mutex);
+}
+
+/**
+ * cgroup_add_cftypes - add an array of cftypes to a subsystem
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Register @cfts to @ss. Files described by @cfts are created for all
+ * existing cgroups to which @ss is attached and all future cgroups will
+ * have them too. This function can be called anytime whether @ss is
+ * attached or not.
+ *
+ * Returns 0 on successful registration, -errno on failure. Note that this
+ * function currently returns 0 as long as @cfts registration is successful
+ * even if some file creation attempts on existing cgroups fail.
+ */
+int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
+{
+	struct cftype_set *set;
+
+	set = kzalloc(sizeof(*set), GFP_KERNEL);
+	if (!set)
+		return -ENOMEM;
+
+	cgroup_cfts_prepare();
+	set->cfts = cfts;
+	list_add_tail(&set->node, &ss->cftsets);
+	cgroup_cfts_commit(ss, cfts, true);
+
 	return 0;
 }
-EXPORT_SYMBOL_GPL(cgroup_add_files);
+EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
+
+/**
+ * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Unregister @cfts from @ss. Files described by @cfts are removed from
+ * all existing cgroups to which @ss is attached and all future cgroups
+ * won't have them either. This function can be called anytime whether @ss
+ * is attached or not.
+ *
+ * Returns 0 on successful unregistration, -ENOENT if @cfts is not
+ * registered with @ss.
+ */
+int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
+{
+	struct cftype_set *set;
+
+	cgroup_cfts_prepare();
+
+	list_for_each_entry(set, &ss->cftsets, node) {
+		if (set->cfts == cfts) {
+			list_del_init(&set->node);
+			cgroup_cfts_commit(ss, cfts, false);
+			return 0;
+		}
+	}
+
+	cgroup_cfts_commit(ss, NULL, false);
+	return -ENOENT;
+}
 
 /**
  * cgroup_task_count - count the number of tasks in a cgroup.
@@ -3625,13 +3832,14 @@ static struct cftype files[] = {
 		.read_u64 = cgroup_clone_children_read,
 		.write_u64 = cgroup_clone_children_write,
 	},
-};
-
-static struct cftype cft_release_agent = {
-	.name = "release_agent",
-	.read_seq_string = cgroup_release_agent_show,
-	.write_string = cgroup_release_agent_write,
-	.max_write_len = PATH_MAX,
+	{
+		.name = "release_agent",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.read_seq_string = cgroup_release_agent_show,
+		.write_string = cgroup_release_agent_write,
+		.max_write_len = PATH_MAX,
+	},
+	{ }	/* terminate */
 };
 
 static int cgroup_populate_dir(struct cgroup *cgrp)
@@ -3639,22 +3847,18 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 	int err;
 	struct cgroup_subsys *ss;
 
-	/* First clear out any existing files */
-	cgroup_clear_directory(cgrp->dentry);
-
-	err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
+	err = cgroup_addrm_files(cgrp, NULL, files, true);
 	if (err < 0)
 		return err;
 
-	if (cgrp == cgrp->top_cgroup) {
-		if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
-			return err;
-	}
-
+	/* process cftsets of each subsystem */
 	for_each_subsys(cgrp->root, ss) {
-		if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
-			return err;
+		struct cftype_set *set;
+
+		list_for_each_entry(set, &ss->cftsets, node)
+			cgroup_addrm_files(cgrp, ss, set->cfts, true);
 	}
+
 	/* This cgroup is ready now */
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
@@ -3670,6 +3874,14 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 	return 0;
 }
 
+static void css_dput_fn(struct work_struct *work)
+{
+	struct cgroup_subsys_state *css =
+		container_of(work, struct cgroup_subsys_state, dput_work);
+
+	dput(css->cgroup->dentry);
+}
+
 static void init_cgroup_css(struct cgroup_subsys_state *css,
 			    struct cgroup_subsys *ss,
 			    struct cgroup *cgrp)
@@ -3682,6 +3894,16 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
 		set_bit(CSS_ROOT, &css->flags);
 	BUG_ON(cgrp->subsys[ss->subsys_id]);
 	cgrp->subsys[ss->subsys_id] = css;
+
+	/*
+	 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
+	 * which is put on the last css_put(). dput() requires process
+	 * context, which css_put() may be called without. @css->dput_work
+	 * will be used to invoke dput() asynchronously from css_put().
+	 */
+	INIT_WORK(&css->dput_work, css_dput_fn);
+	if (ss->__DEPRECATED_clear_css_refs)
+		set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
 }
 
 static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
@@ -3784,9 +4006,16 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	if (err < 0)
 		goto err_remove;
 
+	/* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
+	for_each_subsys(root, ss)
+		if (!ss->__DEPRECATED_clear_css_refs)
+			dget(dentry);
+
 	/* The cgroup directory was pre-locked for us */
 	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
 
+	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
+
 	err = cgroup_populate_dir(cgrp);
 	/* If err < 0, we have a half-filled directory - oh well ;) */
 
@@ -3826,18 +4055,19 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
 }
 
+/*
+ * Check the reference count on each subsystem. Since we already
+ * established that there are no tasks in the cgroup, if the css refcount
+ * is also 1, then there should be no outstanding references, so the
+ * subsystem is safe to destroy. We scan across all subsystems rather than
+ * using the per-hierarchy linked list of mounted subsystems since we can
+ * be called via check_for_release() with no synchronization other than
+ * RCU, and the subsystem linked list isn't RCU-safe.
+ */
 static int cgroup_has_css_refs(struct cgroup *cgrp)
 {
-	/* Check the reference count on each subsystem. Since we
-	 * already established that there are no tasks in the
-	 * cgroup, if the css refcount is also 1, then there should
-	 * be no outstanding references, so the subsystem is safe to
-	 * destroy. We scan across all subsystems rather than using
-	 * the per-hierarchy linked list of mounted subsystems since
-	 * we can be called via check_for_release() with no
-	 * synchronization other than RCU, and the subsystem linked
-	 * list isn't RCU-safe */
 	int i;
+
 	/*
 	 * We won't need to lock the subsys array, because the subsystems
 	 * we're concerned about aren't going anywhere since our cgroup root
@@ -3846,17 +4076,21 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
 		struct cgroup_subsys_state *css;
+
 		/* Skip subsystems not present or not in this hierarchy */
 		if (ss == NULL || ss->root != cgrp->root)
 			continue;
+
 		css = cgrp->subsys[ss->subsys_id];
-		/* When called from check_for_release() it's possible
+		/*
+		 * When called from check_for_release() it's possible
 		 * that by this point the cgroup has been removed
 		 * and the css deleted. But a false-positive doesn't
 		 * matter, since it can only happen if the cgroup
 		 * has been deleted and hence no longer needs the
-		 * release agent to be called anyway. */
-		if (css && (atomic_read(&css->refcnt) > 1))
+		 * release agent to be called anyway.
+		 */
+		if (css && css_refcnt(css) > 1)
 			return 1;
 	}
 	return 0;
@@ -3866,51 +4100,63 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
  * Atomically mark all (or else none) of the cgroup's CSS objects as
  * CSS_REMOVED. Return true on success, or false if the cgroup has
  * busy subsystems. Call with cgroup_mutex held
+ *
+ * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
+ * not, cgroup removal behaves differently.
+ *
+ * If clear is set, css refcnt for the subsystem should be zero before
+ * cgroup removal can be committed. This is implemented by
+ * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
+ * called multiple times until all css refcnts reach zero and is allowed to
+ * veto removal on any invocation. This behavior is deprecated and will be
+ * removed as soon as the existing user (memcg) is updated.
+ *
+ * If clear is not set, each css holds an extra reference to the cgroup's
+ * dentry and cgroup removal proceeds regardless of css refs.
+ * ->pre_destroy() will be called at least once and is not allowed to fail.
+ * On the last put of each css, whenever that may be, the extra dentry ref
+ * is put so that dentry destruction happens only after all css's are
+ * released.
  */
-
 static int cgroup_clear_css_refs(struct cgroup *cgrp)
 {
 	struct cgroup_subsys *ss;
 	unsigned long flags;
 	bool failed = false;
+
 	local_irq_save(flags);
+
+	/*
+	 * Block new css_tryget() by deactivating refcnt. If all refcnts
+	 * for subsystems w/ clear_css_refs set were 1 at the moment of
+	 * deactivation, we succeeded.
+	 */
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-		int refcnt;
-		while (1) {
-			/* We can only remove a CSS with a refcnt==1 */
-			refcnt = atomic_read(&css->refcnt);
-			if (refcnt > 1) {
-				failed = true;
-				goto done;
-			}
-			BUG_ON(!refcnt);
-			/*
-			 * Drop the refcnt to 0 while we check other
-			 * subsystems. This will cause any racing
-			 * css_tryget() to spin until we set the
-			 * CSS_REMOVED bits or abort
-			 */
-			if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
-				break;
-			cpu_relax();
-		}
+
+		WARN_ON(atomic_read(&css->refcnt) < 0);
+		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
+
+		if (ss->__DEPRECATED_clear_css_refs)
+			failed |= css_refcnt(css) != 1;
 	}
- done:
+
+	/*
+	 * If succeeded, set REMOVED and put all the base refs; otherwise,
+	 * restore refcnts to positive values. Either way, all in-progress
+	 * css_tryget() will be released.
+	 */
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-		if (failed) {
-			/*
-			 * Restore old refcnt if we previously managed
-			 * to clear it from 1 to 0
-			 */
-			if (!atomic_read(&css->refcnt))
-				atomic_set(&css->refcnt, 1);
-		} else {
-			/* Commit the fact that the CSS is removed */
+
+		if (!failed) {
 			set_bit(CSS_REMOVED, &css->flags);
+			css_put(css);
+		} else {
+			atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
 		}
 	}
+
 	local_irq_restore(flags);
 	return !failed;
 }
@@ -3995,6 +4241,8 @@ again:
 	list_del_init(&cgrp->sibling);
 	cgroup_unlock_hierarchy(cgrp->root);
 
+	list_del_init(&cgrp->allcg_node);
+
 	d = dget(cgrp->dentry);
 
 	cgroup_d_remove_dir(d);
@@ -4021,12 +4269,29 @@ again:
 	return 0;
 }
 
+static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
+{
+	INIT_LIST_HEAD(&ss->cftsets);
+
+	/*
+	 * base_cftset is embedded in subsys itself, no need to worry about
+	 * deregistration.
+	 */
+	if (ss->base_cftypes) {
+		ss->base_cftset.cfts = ss->base_cftypes;
+		list_add_tail(&ss->base_cftset.node, &ss->cftsets);
+	}
+}
+
 static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 {
 	struct cgroup_subsys_state *css;
 
 	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
 
+	/* init base cftset */
+	cgroup_init_cftsets(ss);
+
 	/* Create the top cgroup state for this subsystem */
 	list_add(&ss->sibling, &rootnode.subsys_list);
 	ss->root = &rootnode;
@@ -4096,6 +4361,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 		return 0;
 	}
 
+	/* init base cftset */
+	cgroup_init_cftsets(ss);
+
 	/*
 	 * need to register a subsys id before anything else - for example,
 	 * init_cgroup_css needs it.
@@ -4685,21 +4953,41 @@ static void check_for_release(struct cgroup *cgrp)
 }
 
 /* Caller must verify that the css is not for root cgroup */
-void __css_put(struct cgroup_subsys_state *css, int count)
+bool __css_tryget(struct cgroup_subsys_state *css)
+{
+	do {
+		int v = css_refcnt(css);
+
+		if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
+			return true;
+		cpu_relax();
+	} while (!test_bit(CSS_REMOVED, &css->flags));
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(__css_tryget);
+
+/* Caller must verify that the css is not for root cgroup */
+void __css_put(struct cgroup_subsys_state *css)
 {
 	struct cgroup *cgrp = css->cgroup;
-	int val;
+
 	rcu_read_lock();
-	val = atomic_sub_return(count, &css->refcnt);
-	if (val == 1) {
+	atomic_dec(&css->refcnt);
+	switch (css_refcnt(css)) {
+	case 1:
 		if (notify_on_release(cgrp)) {
 			set_bit(CGRP_RELEASABLE, &cgrp->flags);
 			check_for_release(cgrp);
 		}
 		cgroup_wakeup_rmdir_waiter(cgrp);
+		break;
+	case 0:
+		if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
+			schedule_work(&css->dput_work);
+		break;
 	}
 	rcu_read_unlock();
-	WARN_ON_ONCE(val < 1);
 }
 EXPORT_SYMBOL_GPL(__css_put);
 
@@ -4818,7 +5106,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
 	 * on this or this is under rcu_read_lock(). Once css->id is allocated,
 	 * it's unchanged until freed.
 	 */
-	cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
+	cssid = rcu_dereference_check(css->id, css_refcnt(css));
 
 	if (cssid)
 		return cssid->id;
@@ -4830,7 +5118,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
 {
 	struct css_id *cssid;
 
-	cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
+	cssid = rcu_dereference_check(css->id, css_refcnt(css));
 
 	if (cssid)
 		return cssid->depth;
@@ -5211,19 +5499,15 @@ static struct cftype debug_files[] = {
 		.name = "releasable",
 		.read_u64 = releasable_read,
 	},
-};
 
-static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
-	return cgroup_add_files(cont, ss, debug_files,
-				ARRAY_SIZE(debug_files));
-}
+	{ }	/* terminate */
+};
 
 struct cgroup_subsys debug_subsys = {
 	.name = "debug",
 	.create = debug_create,
 	.destroy = debug_destroy,
-	.populate = debug_populate,
 	.subsys_id = debug_subsys_id,
+	.base_cftypes = debug_files,
 };
 #endif /* CONFIG_CGROUP_DEBUG */
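Taken together, the cgroup.c changes replace the per-subsystem ->populate() callback with cftype arrays that are terminated by an empty-named entry and registered either via ->base_cftypes or at runtime with cgroup_add_cftypes()/cgroup_rm_cftypes(). A minimal sketch of the new pattern follows; the "example" subsystem and its read/write handlers are hypothetical, not part of this patch.

/* Hypothetical subsystem showing the registration pattern introduced here. */
static struct cftype example_files[] = {
	{
		.name = "weight",
		.flags = CFTYPE_NOT_ON_ROOT,	/* skip creation in the root cgroup */
		.read_u64 = example_weight_read,
		.write_u64 = example_weight_write,
	},
	{ }	/* terminate */
};

struct cgroup_subsys example_subsys = {
	.name		= "example",
	.subsys_id	= example_subsys_id,
	.base_cftypes	= example_files,	/* replaces ->populate() */
};

/* Extra files can also be added or removed while the subsystem is live:
 *	err = cgroup_add_cftypes(&example_subsys, example_extra_files);
 *	err = cgroup_rm_cftypes(&example_subsys, example_extra_files);
 */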
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index f86e93920b62..3649fc6b3eaa 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -358,24 +358,19 @@ static int freezer_write(struct cgroup *cgroup,
 static struct cftype files[] = {
 	{
 		.name = "state",
+		.flags = CFTYPE_NOT_ON_ROOT,
 		.read_seq_string = freezer_read,
 		.write_string = freezer_write,
 	},
+	{ }	/* terminate */
 };
 
-static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
-{
-	if (!cgroup->parent)
-		return 0;
-	return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
-}
-
 struct cgroup_subsys freezer_subsys = {
 	.name		= "freezer",
 	.create		= freezer_create,
 	.destroy	= freezer_destroy,
-	.populate	= freezer_populate,
 	.subsys_id	= freezer_subsys_id,
 	.can_attach	= freezer_can_attach,
 	.fork		= freezer_fork,
+	.base_cftypes	= files,
 };
diff --git a/kernel/compat.c b/kernel/compat.c
index 74ff8498809a..d2c67aa49ae6 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -372,25 +372,54 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
 
 #ifdef __ARCH_WANT_SYS_SIGPROCMASK
 
-asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
-		compat_old_sigset_t __user *oset)
+/*
+ * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the
+ * blocked set of signals to the supplied signal set
+ */
+static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
 {
-	old_sigset_t s;
-	long ret;
-	mm_segment_t old_fs;
+	memcpy(blocked->sig, &set, sizeof(set));
+}
 
-	if (set && get_user(s, set))
-		return -EFAULT;
-	old_fs = get_fs();
-	set_fs(KERNEL_DS);
-	ret = sys_sigprocmask(how,
-			      set ? (old_sigset_t __user *) &s : NULL,
-			      oset ? (old_sigset_t __user *) &s : NULL);
-	set_fs(old_fs);
-	if (ret == 0)
-		if (oset)
-			ret = put_user(s, oset);
-	return ret;
+asmlinkage long compat_sys_sigprocmask(int how,
+				       compat_old_sigset_t __user *nset,
+				       compat_old_sigset_t __user *oset)
+{
+	old_sigset_t old_set, new_set;
+	sigset_t new_blocked;
+
+	old_set = current->blocked.sig[0];
+
+	if (nset) {
+		if (get_user(new_set, nset))
+			return -EFAULT;
+		new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
+
+		new_blocked = current->blocked;
+
+		switch (how) {
+		case SIG_BLOCK:
+			sigaddsetmask(&new_blocked, new_set);
+			break;
+		case SIG_UNBLOCK:
+			sigdelsetmask(&new_blocked, new_set);
+			break;
+		case SIG_SETMASK:
+			compat_sig_setmask(&new_blocked, new_set);
+			break;
+		default:
+			return -EINVAL;
+		}
+
+		set_current_blocked(&new_blocked);
+	}
+
+	if (oset) {
+		if (put_user(old_set, oset))
+			return -EFAULT;
+	}
+
+	return 0;
 }
 
 #endif
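The rewritten compat_sys_sigprocmask() manipulates current->blocked directly instead of bouncing through sys_sigprocmask() under set_fs(KERNEL_DS). A small sketch of the SIG_SETMASK semantics follows, assuming a 64-bit kernel with a 32-bit caller; the helper function name is hypothetical.

/*
 * Sketch (assumption: LP64 kernel, 32-bit compat caller) of the SIG_SETMASK
 * path above: SIGKILL/SIGSTOP are stripped from the supplied word, only the
 * first compat word of the blocked set is replaced, and the upper words are
 * preserved before the new mask is committed.
 */
static void example_compat_setmask(compat_old_sigset_t word)
{
	sigset_t new_blocked = current->blocked;	/* keep the upper words */

	word &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
	compat_sig_setmask(&new_blocked, word);		/* overwrite sig[0] only */
	set_current_blocked(&new_blocked);
}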
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2060c6e57027..0e6353cf147a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -17,6 +17,8 @@
 #include <linux/gfp.h>
 #include <linux/suspend.h>
 
+#include "smpboot.h"
+
 #ifdef CONFIG_SMP
 /* Serializes the updates to cpu_online_mask, cpu_present_mask */
 static DEFINE_MUTEX(cpu_add_remove_lock);
@@ -295,11 +297,19 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 	int ret, nr_calls = 0;
 	void *hcpu = (void *)(long)cpu;
 	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
+	struct task_struct *idle;
 
 	if (cpu_online(cpu) || !cpu_present(cpu))
 		return -EINVAL;
 
 	cpu_hotplug_begin();
+
+	idle = idle_thread_get(cpu);
+	if (IS_ERR(idle)) {
+		ret = PTR_ERR(idle);
+		goto out;
+	}
+
 	ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
 	if (ret) {
 		nr_calls--;
@@ -309,7 +319,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 	}
 
 	/* Arch-specific enabling code. */
-	ret = __cpu_up(cpu);
+	ret = __cpu_up(cpu, idle);
 	if (ret != 0)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
@@ -320,6 +330,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 out_notify:
 	if (ret != 0)
 		__cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+out:
 	cpu_hotplug_done();
 
 	return ret;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 14f7070b4ba2..8c8bd652dd12 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1765,28 +1765,17 @@ static struct cftype files[] = {
 		.write_u64 = cpuset_write_u64,
 		.private = FILE_SPREAD_SLAB,
 	},
-};
-
-static struct cftype cft_memory_pressure_enabled = {
-	.name = "memory_pressure_enabled",
-	.read_u64 = cpuset_read_u64,
-	.write_u64 = cpuset_write_u64,
-	.private = FILE_MEMORY_PRESSURE_ENABLED,
-};
 
-static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
-	int err;
+	{
+		.name = "memory_pressure_enabled",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_MEMORY_PRESSURE_ENABLED,
+	},
 
-	err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
-	if (err)
-		return err;
-	/* memory_pressure_enabled is in root cpuset only */
-	if (!cont->parent)
-		err = cgroup_add_file(cont, ss,
-				      &cft_memory_pressure_enabled);
-	return err;
-}
+	{ }	/* terminate */
+};
 
 /*
  * post_clone() is called during cgroup_create() when the
@@ -1887,9 +1876,9 @@ struct cgroup_subsys cpuset_subsys = {
 	.destroy = cpuset_destroy,
 	.can_attach = cpuset_can_attach,
 	.attach = cpuset_attach,
-	.populate = cpuset_populate,
 	.post_clone = cpuset_post_clone,
 	.subsys_id = cpuset_subsys_id,
+	.base_cftypes = files,
 	.early_init = 1,
 };
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fd126f82b57c..91a445925855 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2039,8 +2039,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
  * accessing the event control register. If a NMI hits, then it will
  * not restart the event.
  */
-void __perf_event_task_sched_out(struct task_struct *task,
-				 struct task_struct *next)
+static void __perf_event_task_sched_out(struct task_struct *task,
+					struct task_struct *next)
 {
 	int ctxn;
 
@@ -2279,8 +2279,8 @@ static void perf_branch_stack_sched_in(struct task_struct *prev,
  * accessing the event control register. If a NMI hits, then it will
  * keep the event running.
  */
-void __perf_event_task_sched_in(struct task_struct *prev,
-				struct task_struct *task)
+static void __perf_event_task_sched_in(struct task_struct *prev,
+				       struct task_struct *task)
 {
 	struct perf_event_context *ctx;
 	int ctxn;
@@ -2305,6 +2305,12 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 		perf_branch_stack_sched_in(prev, task);
 }
 
+void __perf_event_task_sched(struct task_struct *prev, struct task_struct *next)
+{
+	__perf_event_task_sched_out(prev, next);
+	__perf_event_task_sched_in(prev, next);
+}
+
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
 {
 	u64 frequency = event->attr.sample_freq;
@@ -4957,7 +4963,7 @@ void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
 	if (rctx < 0)
 		return;
 
-	perf_sample_data_init(&data, addr);
+	perf_sample_data_init(&data, addr, 0);
 
 	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
 
@@ -5215,7 +5221,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
 		.data = record,
 	};
 
-	perf_sample_data_init(&data, addr);
+	perf_sample_data_init(&data, addr, 0);
 	data.raw = &raw;
 
 	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
@@ -5318,7 +5324,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
 	struct perf_sample_data sample;
 	struct pt_regs *regs = data;
 
-	perf_sample_data_init(&sample, bp->attr.bp_addr);
+	perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
 
 	if (!bp->hw.state && !perf_exclude_event(bp, regs))
 		perf_swevent_event(bp, 1, &sample, regs);
@@ -5344,13 +5350,12 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 
 	event->pmu->read(event);
 
-	perf_sample_data_init(&data, 0);
-	data.period = event->hw.last_period;
+	perf_sample_data_init(&data, 0, event->hw.last_period);
 	regs = get_irq_regs();
 
 	if (regs && !perf_exclude_event(event, regs)) {
 		if (!(event->attr.exclude_idle && is_idle_task(current)))
-			if (perf_event_overflow(event, &data, regs))
+			if (__perf_event_overflow(event, 1, &data, regs))
 				ret = HRTIMER_NORESTART;
 	}
 
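The perf changes above fold the sample period into perf_sample_data_init(), which now takes (data, addr, period) instead of requiring callers to assign data.period separately. A minimal sketch of a converted call site follows; the surrounding function is illustrative only.

/* Illustrative converted call site: the period travels with the init call. */
static void example_emit_sample(struct perf_event *event, struct pt_regs *regs)
{
	struct perf_sample_data data;

	/* old form: perf_sample_data_init(&data, 0); data.period = ...; */
	perf_sample_data_init(&data, 0, event->hw.last_period);
	if (regs)
		perf_event_overflow(event, &data, regs);
}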
diff --git a/kernel/fork.c b/kernel/fork.c
index b9372a0bff18..ad54c833116a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -34,6 +34,7 @@
 #include <linux/cgroup.h>
 #include <linux/security.h>
 #include <linux/hugetlb.h>
+#include <linux/seccomp.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
@@ -47,6 +48,7 @@
 #include <linux/audit.h>
 #include <linux/memcontrol.h>
 #include <linux/ftrace.h>
+#include <linux/proc_fs.h>
 #include <linux/profile.h>
 #include <linux/rmap.h>
 #include <linux/ksm.h>
@@ -111,32 +113,67 @@ int nr_processes(void)
 	return total;
 }
 
-#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
-# define alloc_task_struct_node(node)		\
-		kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
-# define free_task_struct(tsk)			\
-		kmem_cache_free(task_struct_cachep, (tsk))
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
 static struct kmem_cache *task_struct_cachep;
+
+static inline struct task_struct *alloc_task_struct_node(int node)
+{
+	return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
+}
+
+void __weak arch_release_task_struct(struct task_struct *tsk) { }
+
+static inline void free_task_struct(struct task_struct *tsk)
+{
+	arch_release_task_struct(tsk);
+	kmem_cache_free(task_struct_cachep, tsk);
+}
 #endif
 
-#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
+#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
+void __weak arch_release_thread_info(struct thread_info *ti) { }
+
+/*
+ * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
+ * kmemcache based allocator.
+ */
+# if THREAD_SIZE >= PAGE_SIZE
 static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
 						  int node)
 {
-#ifdef CONFIG_DEBUG_STACK_USAGE
-	gfp_t mask = GFP_KERNEL | __GFP_ZERO;
-#else
-	gfp_t mask = GFP_KERNEL;
-#endif
-	struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
+	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
+					     THREAD_SIZE_ORDER);
 
 	return page ? page_address(page) : NULL;
 }
 
 static inline void free_thread_info(struct thread_info *ti)
 {
+	arch_release_thread_info(ti);
 	free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
 }
+# else
+static struct kmem_cache *thread_info_cache;
+
+static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+						  int node)
+{
+	return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
+}
+
+static void free_thread_info(struct thread_info *ti)
+{
+	arch_release_thread_info(ti);
+	kmem_cache_free(thread_info_cache, ti);
+}
+
+void thread_info_cache_init(void)
+{
+	thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
+					      THREAD_SIZE, 0, NULL);
+	BUG_ON(thread_info_cache == NULL);
+}
+# endif
 #endif
 
 /* SLAB cache for signal_struct structures (tsk->signal) */
@@ -170,6 +207,7 @@ void free_task(struct task_struct *tsk)
 	free_thread_info(tsk->stack);
 	rt_mutex_debug_task_free(tsk);
 	ftrace_graph_exit_task(tsk);
+	put_seccomp_filter(tsk);
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -203,17 +241,11 @@ void __put_task_struct(struct task_struct *tsk)
 }
 EXPORT_SYMBOL_GPL(__put_task_struct);
 
-/*
- * macro override instead of weak attribute alias, to workaround
- * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
- */
-#ifndef arch_task_cache_init
-#define arch_task_cache_init()
-#endif
+void __init __weak arch_task_cache_init(void) { }
 
 void __init fork_init(unsigned long mempages)
 {
-#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
 #define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
 #endif
@@ -1162,6 +1194,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto fork_out;
 
 	ftrace_graph_init_task(p);
+	get_seccomp_filter(p);
 
 	rt_mutex_init_task(p);
 
@@ -1464,6 +1497,8 @@ bad_fork_cleanup_io:
 	if (p->io_context)
 		exit_io_context(p);
 bad_fork_cleanup_namespaces:
+	if (unlikely(clone_flags & CLONE_NEWPID))
+		pid_ns_release_proc(p->nsproxy->pid_ns);
 	exit_task_namespaces(p);
 bad_fork_cleanup_mm:
 	if (p->mm)
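The fork.c hunks replace the old macro and Kconfig workarounds with __weak stubs (arch_task_cache_init(), arch_release_task_struct(), arch_release_thread_info()) and select a thread_info allocator based on THREAD_SIZE. A sketch of how an architecture would now hook the release path follows; the body is a placeholder, not code from this patch.

/*
 * Hypothetical arch override: defining a strong symbol replaces the __weak
 * stub added above, with no macro or header glue required.
 */
void arch_release_thread_info(struct thread_info *ti)
{
	/* e.g. tear down per-thread arch state that lives alongside the stack */
}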
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index c21449f85a2a..6df614912b9d 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -108,8 +108,10 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
 
 	touch_nmi_watchdog();
 
-	if (sysctl_hung_task_panic)
+	if (sysctl_hung_task_panic) {
+		trigger_all_cpu_backtrace();
 		panic("hung_task: blocked tasks");
+	}
 }
 
 /*
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6080f6bc8c33..fc275e4f629b 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -379,8 +379,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
 	 * If its disabled or no action available
 	 * keep it masked and get out of here
 	 */
-	if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
+	if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+		desc->istate |= IRQS_PENDING;
 		goto out_unlock;
+	}
 
 	handle_irq_event(desc);
 
@@ -518,6 +520,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 out_unlock:
 	raw_spin_unlock(&desc->lock);
 }
+EXPORT_SYMBOL(handle_edge_irq);
 
 #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
 /**
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index d86e254b95eb..192a302d6cfd 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -112,6 +112,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
112{ 112{
113 return radix_tree_lookup(&irq_desc_tree, irq); 113 return radix_tree_lookup(&irq_desc_tree, irq);
114} 114}
115EXPORT_SYMBOL(irq_to_desc);
115 116
116static void delete_irq_desc(unsigned int irq) 117static void delete_irq_desc(unsigned int irq)
117{ 118{
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 89a3ea82569b..585f6381f8e4 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -565,8 +565,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
565 * IRQF_TRIGGER_* but the PIC does not support multiple 565 * IRQF_TRIGGER_* but the PIC does not support multiple
566 * flow-types? 566 * flow-types?
567 */ 567 */
568 pr_debug("No set_type function for IRQ %d (%s)\n", irq, 568 pr_debug("genirq: No set_type function for IRQ %d (%s)\n", irq,
569 chip ? (chip->name ? : "unknown") : "unknown"); 569 chip ? (chip->name ? : "unknown") : "unknown");
570 return 0; 570 return 0;
571 } 571 }
572 572
@@ -600,7 +600,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
600 ret = 0; 600 ret = 0;
601 break; 601 break;
602 default: 602 default:
603 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", 603 pr_err("genirq: Setting trigger mode %lu for irq %u failed (%pF)\n",
604 flags, irq, chip->irq_set_type); 604 flags, irq, chip->irq_set_type);
605 } 605 }
606 if (unmask) 606 if (unmask)
@@ -837,8 +837,7 @@ void exit_irq_thread(void)
837 837
838 action = kthread_data(tsk); 838 action = kthread_data(tsk);
839 839
840 printk(KERN_ERR 840 pr_err("genirq: exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
841 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
842 tsk->comm ? tsk->comm : "", tsk->pid, action->irq); 841 tsk->comm ? tsk->comm : "", tsk->pid, action->irq);
843 842
844 desc = irq_to_desc(action->irq); 843 desc = irq_to_desc(action->irq);
@@ -878,7 +877,6 @@ static int
878__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) 877__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
879{ 878{
880 struct irqaction *old, **old_ptr; 879 struct irqaction *old, **old_ptr;
881 const char *old_name = NULL;
882 unsigned long flags, thread_mask = 0; 880 unsigned long flags, thread_mask = 0;
883 int ret, nested, shared = 0; 881 int ret, nested, shared = 0;
884 cpumask_var_t mask; 882 cpumask_var_t mask;
@@ -972,10 +970,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
972 */ 970 */
973 if (!((old->flags & new->flags) & IRQF_SHARED) || 971 if (!((old->flags & new->flags) & IRQF_SHARED) ||
974 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || 972 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
975 ((old->flags ^ new->flags) & IRQF_ONESHOT)) { 973 ((old->flags ^ new->flags) & IRQF_ONESHOT))
976 old_name = old->name;
977 goto mismatch; 974 goto mismatch;
978 }
979 975
980 /* All handlers must agree on per-cpuness */ 976 /* All handlers must agree on per-cpuness */
981 if ((old->flags & IRQF_PERCPU) != 977 if ((old->flags & IRQF_PERCPU) !=
@@ -1031,6 +1027,27 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1031 * all existing action->thread_mask bits. 1027 * all existing action->thread_mask bits.
1032 */ 1028 */
1033 new->thread_mask = 1 << ffz(thread_mask); 1029 new->thread_mask = 1 << ffz(thread_mask);
1030
1031 } else if (new->handler == irq_default_primary_handler) {
1032 /*
1033 * The interrupt was requested with handler = NULL, so
1034 * we use the default primary handler for it. But it
1035 * does not have the oneshot flag set. In combination
1036 * with level interrupts this is deadly, because the
1037 * default primary handler just wakes the thread, then
1038 * the irq lines is reenabled, but the device still
1039 * has the level irq asserted. Rinse and repeat....
1040 *
1041 * While this works for edge type interrupts, we play
1042 * it safe and reject unconditionally because we can't
1043 * say for sure which type this interrupt really
1044 * has. The type flags are unreliable as the
1045 * underlying chip implementation can override them.
1046 */
1047 pr_err("genirq: Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n",
1048 irq);
1049 ret = -EINVAL;
1050 goto out_mask;
1034 } 1051 }
1035 1052
1036 if (!shared) { 1053 if (!shared) {
@@ -1078,7 +1095,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1078 1095
1079 if (nmsk != omsk) 1096 if (nmsk != omsk)
1080 /* hope the handler works with current trigger mode */ 1097 /* hope the handler works with current trigger mode */
1081 pr_warning("IRQ %d uses trigger mode %u; requested %u\n", 1098 pr_warning("genirq: irq %d uses trigger mode %u; requested %u\n",
1082 irq, nmsk, omsk); 1099 irq, nmsk, omsk);
1083 } 1100 }
1084 1101
@@ -1115,14 +1132,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1115 return 0; 1132 return 0;
1116 1133
1117mismatch: 1134mismatch:
1118#ifdef CONFIG_DEBUG_SHIRQ
1119 if (!(new->flags & IRQF_PROBE_SHARED)) { 1135 if (!(new->flags & IRQF_PROBE_SHARED)) {
1120 printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); 1136 pr_err("genirq: Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n",
1121 if (old_name) 1137 irq, new->flags, new->name, old->flags, old->name);
1122 printk(KERN_ERR "current handler: %s\n", old_name); 1138#ifdef CONFIG_DEBUG_SHIRQ
1123 dump_stack(); 1139 dump_stack();
1124 }
1125#endif 1140#endif
1141 }
1126 ret = -EBUSY; 1142 ret = -EBUSY;
1127 1143
1128out_mask: 1144out_mask:
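
The new check above means a request_threaded_irq() call that passes handler == NULL must also pass IRQF_ONESHOT, so the line stays masked until the thread has run. A minimal driver-side sketch of the accepted pattern follows; the "foo" names and the empty thread body are invented for illustration, only request_threaded_irq() and its flags are the real API.

#include <linux/interrupt.h>

struct foo_device {
	int dummy;			/* placeholder driver state */
};

static irqreturn_t foo_irq_thread(int irq, void *dev_id)
{
	/* Slow handling runs here in process context; with IRQF_ONESHOT
	 * a level-triggered line stays masked until this returns. */
	return IRQ_HANDLED;
}

static int foo_setup_irq(struct foo_device *foo, int irq)
{
	/* handler == NULL selects irq_default_primary_handler(); without
	 * IRQF_ONESHOT, __setup_irq() now rejects this with -EINVAL. */
	return request_threaded_irq(irq, NULL, foo_irq_thread,
				    IRQF_ONESHOT, "foo", foo);
}
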
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 15e53b1766a6..cb228bf21760 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -103,8 +103,13 @@ int check_wakeup_irqs(void)
103 int irq; 103 int irq;
104 104
105 for_each_irq_desc(irq, desc) { 105 for_each_irq_desc(irq, desc) {
106 /*
107 * Only interrupts which are marked as wakeup source
108 * and have not been disabled before the suspend check
109 * can abort suspend.
110 */
106 if (irqd_is_wakeup_set(&desc->irq_data)) { 111 if (irqd_is_wakeup_set(&desc->irq_data)) {
107 if (desc->istate & IRQS_PENDING) 112 if (desc->depth == 1 && desc->istate & IRQS_PENDING)
108 return -EBUSY; 113 return -EBUSY;
109 continue; 114 continue;
110 } 115 }
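
The comment added here spells out that only interrupts armed as wakeup sources, and not disabled before the suspend check, can abort suspend. For reference, a hedged sketch of how a driver arms its line that way; the names and the drvdata layout are invented, while enable_irq_wake()/disable_irq_wake() are the real calls.

#include <linux/device.h>
#include <linux/interrupt.h>
#include <linux/pm.h>

static int foo_suspend(struct device *dev)
{
	int *foo_irq = dev_get_drvdata(dev);	/* irq stored at probe time */

	/* Mark the line as a wakeup source; a pending, still-enabled
	 * wakeup irq then makes check_wakeup_irqs() return -EBUSY. */
	return enable_irq_wake(*foo_irq);
}

static int foo_resume(struct device *dev)
{
	int *foo_irq = dev_get_drvdata(dev);

	return disable_irq_wake(*foo_irq);
}

static const struct dev_pm_ops foo_pm_ops = {
	.suspend = foo_suspend,
	.resume  = foo_resume,
};
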
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 14dd5761e8c9..6454db7b6a4d 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -58,10 +58,13 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
58 /* 58 /*
59 * We do not resend level type interrupts. Level type 59 * We do not resend level type interrupts. Level type
60 * interrupts are resent by hardware when they are still 60 * interrupts are resent by hardware when they are still
61 * active. 61 * active. Clear the pending bit so suspend/resume does not
62 * get confused.
62 */ 63 */
63 if (irq_settings_is_level(desc)) 64 if (irq_settings_is_level(desc)) {
65 desc->istate &= ~IRQS_PENDING;
64 return; 66 return;
67 }
65 if (desc->istate & IRQS_REPLAY) 68 if (desc->istate & IRQS_REPLAY)
66 return; 69 return;
67 if (desc->istate & IRQS_PENDING) { 70 if (desc->istate & IRQS_PENDING) {
diff --git a/kernel/module.c b/kernel/module.c
index 78ac6ec1e425..a4e60973ca73 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2953,7 +2953,7 @@ static struct module *load_module(void __user *umod,
2953 2953
2954 /* Module is ready to execute: parsing args may do that. */ 2954 /* Module is ready to execute: parsing args may do that. */
2955 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, 2955 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
2956 -32768, 32767, NULL); 2956 -32768, 32767, &ddebug_dyndbg_module_param_cb);
2957 if (err < 0) 2957 if (err < 0)
2958 goto unlink; 2958 goto unlink;
2959 2959
diff --git a/kernel/params.c b/kernel/params.c
index f37d82631347..ed35345be536 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -85,11 +85,13 @@ bool parameq(const char *a, const char *b)
85 85
86static int parse_one(char *param, 86static int parse_one(char *param,
87 char *val, 87 char *val,
88 const char *doing,
88 const struct kernel_param *params, 89 const struct kernel_param *params,
89 unsigned num_params, 90 unsigned num_params,
90 s16 min_level, 91 s16 min_level,
91 s16 max_level, 92 s16 max_level,
92 int (*handle_unknown)(char *param, char *val)) 93 int (*handle_unknown)(char *param, char *val,
94 const char *doing))
93{ 95{
94 unsigned int i; 96 unsigned int i;
95 int err; 97 int err;
@@ -104,8 +106,8 @@ static int parse_one(char *param,
104 if (!val && params[i].ops->set != param_set_bool 106 if (!val && params[i].ops->set != param_set_bool
105 && params[i].ops->set != param_set_bint) 107 && params[i].ops->set != param_set_bint)
106 return -EINVAL; 108 return -EINVAL;
107 pr_debug("They are equal! Calling %p\n", 109 pr_debug("handling %s with %p\n", param,
108 params[i].ops->set); 110 params[i].ops->set);
109 mutex_lock(&param_lock); 111 mutex_lock(&param_lock);
110 err = params[i].ops->set(val, &params[i]); 112 err = params[i].ops->set(val, &params[i]);
111 mutex_unlock(&param_lock); 113 mutex_unlock(&param_lock);
@@ -114,11 +116,11 @@ static int parse_one(char *param,
114 } 116 }
115 117
116 if (handle_unknown) { 118 if (handle_unknown) {
117 pr_debug("Unknown argument: calling %p\n", handle_unknown); 119 pr_debug("doing %s: %s='%s'\n", doing, param, val);
118 return handle_unknown(param, val); 120 return handle_unknown(param, val, doing);
119 } 121 }
120 122
121 pr_debug("Unknown argument `%s'\n", param); 123 pr_debug("Unknown argument '%s'\n", param);
122 return -ENOENT; 124 return -ENOENT;
123} 125}
124 126
@@ -175,49 +177,47 @@ static char *next_arg(char *args, char **param, char **val)
175} 177}
176 178
177/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ 179/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
178int parse_args(const char *name, 180int parse_args(const char *doing,
179 char *args, 181 char *args,
180 const struct kernel_param *params, 182 const struct kernel_param *params,
181 unsigned num, 183 unsigned num,
182 s16 min_level, 184 s16 min_level,
183 s16 max_level, 185 s16 max_level,
184 int (*unknown)(char *param, char *val)) 186 int (*unknown)(char *param, char *val, const char *doing))
185{ 187{
186 char *param, *val; 188 char *param, *val;
187 189
188 pr_debug("Parsing ARGS: %s\n", args);
189
190 /* Chew leading spaces */ 190 /* Chew leading spaces */
191 args = skip_spaces(args); 191 args = skip_spaces(args);
192 192
193 if (*args)
194 pr_debug("doing %s, parsing ARGS: '%s'\n", doing, args);
195
193 while (*args) { 196 while (*args) {
194 int ret; 197 int ret;
195 int irq_was_disabled; 198 int irq_was_disabled;
196 199
197 args = next_arg(args, &param, &val); 200 args = next_arg(args, &param, &val);
198 irq_was_disabled = irqs_disabled(); 201 irq_was_disabled = irqs_disabled();
199 ret = parse_one(param, val, params, num, 202 ret = parse_one(param, val, doing, params, num,
200 min_level, max_level, unknown); 203 min_level, max_level, unknown);
201 if (irq_was_disabled && !irqs_disabled()) { 204 if (irq_was_disabled && !irqs_disabled())
202 printk(KERN_WARNING "parse_args(): option '%s' enabled " 205 pr_warn("%s: option '%s' enabled irq's!\n",
203 "irq's!\n", param); 206 doing, param);
204 } 207
205 switch (ret) { 208 switch (ret) {
206 case -ENOENT: 209 case -ENOENT:
207 printk(KERN_ERR "%s: Unknown parameter `%s'\n", 210 pr_err("%s: Unknown parameter `%s'\n", doing, param);
208 name, param);
209 return ret; 211 return ret;
210 case -ENOSPC: 212 case -ENOSPC:
211 printk(KERN_ERR 213 pr_err("%s: `%s' too large for parameter `%s'\n",
212 "%s: `%s' too large for parameter `%s'\n", 214 doing, val ?: "", param);
213 name, val ?: "", param);
214 return ret; 215 return ret;
215 case 0: 216 case 0:
216 break; 217 break;
217 default: 218 default:
218 printk(KERN_ERR 219 pr_err("%s: `%s' invalid for parameter `%s'\n",
219 "%s: `%s' invalid for parameter `%s'\n", 220 doing, val ?: "", param);
220 name, val ?: "", param);
221 return ret; 221 return ret;
222 } 222 }
223 } 223 }
@@ -263,8 +263,7 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
263int param_set_charp(const char *val, const struct kernel_param *kp) 263int param_set_charp(const char *val, const struct kernel_param *kp)
264{ 264{
265 if (strlen(val) > 1024) { 265 if (strlen(val) > 1024) {
266 printk(KERN_ERR "%s: string parameter too long\n", 266 pr_err("%s: string parameter too long\n", kp->name);
267 kp->name);
268 return -ENOSPC; 267 return -ENOSPC;
269 } 268 }
270 269
@@ -400,8 +399,7 @@ static int param_array(const char *name,
400 int len; 399 int len;
401 400
402 if (*num == max) { 401 if (*num == max) {
403 printk(KERN_ERR "%s: can only take %i arguments\n", 402 pr_err("%s: can only take %i arguments\n", name, max);
404 name, max);
405 return -EINVAL; 403 return -EINVAL;
406 } 404 }
407 len = strcspn(val, ","); 405 len = strcspn(val, ",");
@@ -420,8 +418,7 @@ static int param_array(const char *name,
420 } while (save == ','); 418 } while (save == ',');
421 419
422 if (*num < min) { 420 if (*num < min) {
423 printk(KERN_ERR "%s: needs at least %i arguments\n", 421 pr_err("%s: needs at least %i arguments\n", name, min);
424 name, min);
425 return -EINVAL; 422 return -EINVAL;
426 } 423 }
427 return 0; 424 return 0;
@@ -480,7 +477,7 @@ int param_set_copystring(const char *val, const struct kernel_param *kp)
480 const struct kparam_string *kps = kp->str; 477 const struct kparam_string *kps = kp->str;
481 478
482 if (strlen(val)+1 > kps->maxlen) { 479 if (strlen(val)+1 > kps->maxlen) {
483 printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", 480 pr_err("%s: string doesn't fit in %u chars.\n",
484 kp->name, kps->maxlen-1); 481 kp->name, kps->maxlen-1);
485 return -ENOSPC; 482 return -ENOSPC;
486 } 483 }
@@ -750,11 +747,8 @@ static struct module_kobject * __init locate_module_kobject(const char *name)
750#endif 747#endif
751 if (err) { 748 if (err) {
752 kobject_put(&mk->kobj); 749 kobject_put(&mk->kobj);
753 printk(KERN_ERR 750 pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n",
754 "Module '%s' failed add to sysfs, error number %d\n",
755 name, err); 751 name, err);
756 printk(KERN_ERR
757 "The system will be unstable now.\n");
758 return NULL; 752 return NULL;
759 } 753 }
760 754
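
With parse_args() now passing the 'doing' context string through to the unknown-argument callback, handlers gain a usable prefix for their diagnostics, as the dynamic-debug callback wired up in module.c above does. Below is a minimal sketch of a callback with the new three-argument signature; the name and the ignore-unknowns policy are invented, only the prototype comes from this patch.

#include <linux/kernel.h>
#include <linux/moduleparam.h>

static int foo_unknown_param(char *param, char *val, const char *doing)
{
	/* 'doing' is whatever string the parse_args() caller passed,
	 * e.g. the module name when called from load_module(). */
	pr_warn("%s: ignoring unknown parameter '%s=%s'\n",
		doing, param, val ? val : "");
	return 0;	/* 0 tells parse_one() the argument was handled */
}

A caller passes such a function as the last parse_args() argument, exactly where load_module() above now passes ddebug_dyndbg_module_param_cb.
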
diff --git a/kernel/printk.c b/kernel/printk.c
index b663c2c95d39..32462d2b364a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -41,6 +41,7 @@
41#include <linux/cpu.h> 41#include <linux/cpu.h>
42#include <linux/notifier.h> 42#include <linux/notifier.h>
43#include <linux/rculist.h> 43#include <linux/rculist.h>
44#include <linux/poll.h>
44 45
45#include <asm/uaccess.h> 46#include <asm/uaccess.h>
46 47
@@ -54,8 +55,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
54{ 55{
55} 56}
56 57
57#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
58
59/* printk's without a loglevel use this.. */ 58/* printk's without a loglevel use this.. */
60#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL 59#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
61 60
@@ -99,24 +98,6 @@ EXPORT_SYMBOL_GPL(console_drivers);
99static int console_locked, console_suspended; 98static int console_locked, console_suspended;
100 99
101/* 100/*
102 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
103 * It is also used in interesting ways to provide interlocking in
104 * console_unlock();.
105 */
106static DEFINE_RAW_SPINLOCK(logbuf_lock);
107
108#define LOG_BUF_MASK (log_buf_len-1)
109#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
110
111/*
112 * The indices into log_buf are not constrained to log_buf_len - they
113 * must be masked before subscripting
114 */
115static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */
116static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */
117static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
118
119/*
120 * If exclusive_console is non-NULL then only this console is to be printed to. 101 * If exclusive_console is non-NULL then only this console is to be printed to.
121 */ 102 */
122static struct console *exclusive_console; 103static struct console *exclusive_console;
@@ -145,13 +126,491 @@ EXPORT_SYMBOL(console_set_on_cmdline);
145/* Flag: console code may call schedule() */ 126/* Flag: console code may call schedule() */
146static int console_may_schedule; 127static int console_may_schedule;
147 128
129/*
130 * The printk log buffer consists of a chain of concatenated variable
131 * length records. Every record starts with a record header, containing
132 * the overall length of the record.
133 *
134 * The heads to the first and last entry in the buffer, as well as the
135 * sequence numbers of both entries, are maintained when messages
136 * are stored.
137 *
138 * If the heads indicate available messages, the length in the header
139 * tells the start of the next message. A length == 0 for the next message
140 * indicates a wrap-around to the beginning of the buffer.
141 *
142 * Every record carries the monotonic timestamp in nanoseconds, as well as
143 * the standard userspace syslog level and syslog facility. The usual
144 * kernel messages use LOG_KERN; userspace-injected messages always carry
145 * a matching syslog facility, by default LOG_USER. The origin of every
146 * message can be reliably determined that way.
147 *
148 * The human readable log message directly follows the message header. The
149 * length of the message text is stored in the header; the stored message
150 * is not terminated.
151 *
152 * Optionally, a message can carry a dictionary of properties (key/value pairs),
153 * to provide userspace with a machine-readable message context.
154 *
155 * Examples for well-defined, commonly used property names are:
156 * DEVICE=b12:8 device identifier
157 * b12:8 block dev_t
158 * c127:3 char dev_t
159 * n8 netdev ifindex
160 * +sound:card0 subsystem:devname
161 * SUBSYSTEM=pci driver-core subsystem name
162 *
163 * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value
164 * follows directly after a '=' character. Every property is terminated by
165 * a '\0' character. The last property is not terminated.
166 *
167 * Example of a message structure:
168 * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec
169 * 0008 34 00 record is 52 bytes long
170 * 000a 0b 00 text is 11 bytes long
171 * 000c 1f 00 dictionary is 23 bytes long
172 * 000e 03 00 LOG_KERN (facility) LOG_ERR (level)
173 * 0010 69 74 27 73 20 61 20 6c "it's a l"
174 * 69 6e 65 "ine"
175 * 001b 44 45 56 49 43 "DEVIC"
176 * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D"
177 * 52 49 56 45 52 3d 62 75 "RIVER=bu"
178 * 67 "g"
179 * 0032 00 00 00 padding to next message header
180 *
181 * The 'struct log' buffer header must never be directly exported to
182 * userspace, it is a kernel-private implementation detail that might
183 * need to be changed in the future, when the requirements change.
184 *
185 * /dev/kmsg exports the structured data in the following line format:
186 * "level,sequnum,timestamp;<message text>\n"
187 *
188 * The optional key/value pairs are attached as continuation lines starting
189 * with a space character and terminated by a newline. All possible
190 * non-printable characters are escaped in the "\xff" notation.
191 *
192 * Users of the export format should ignore possible additional values
193 * separated by ',', and find the message after the ';' character.
194 */
195
196struct log {
197 u64 ts_nsec; /* timestamp in nanoseconds */
198 u16 len; /* length of entire record */
199 u16 text_len; /* length of text buffer */
200 u16 dict_len; /* length of dictionary buffer */
201 u16 level; /* syslog level + facility */
202};
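
Each record is therefore a struct log header, the unterminated text, the optional dictionary, and '\0' padding up to LOG_ALIGN, exactly the arithmetic log_store() applies below. A standalone sketch of that size calculation with sample lengths; the userspace re-declaration of the struct is only there to obtain a comparable header size.

/* Userspace sketch of the record-length arithmetic used by log_store(). */
#include <stdio.h>
#include <stdint.h>

#define LOG_ALIGN 4	/* 4 unless 64-bit without efficient unaligned access (then 8) */

struct log {
	uint64_t ts_nsec;	/* timestamp in nanoseconds */
	uint16_t len;		/* length of entire record */
	uint16_t text_len;	/* length of text buffer */
	uint16_t dict_len;	/* length of dictionary buffer */
	uint16_t level;		/* syslog level + facility */
};

int main(void)
{
	unsigned int text_len = 11;	/* e.g. "it's a line" */
	unsigned int dict_len = 22;	/* e.g. "DEVICE=b8:2\0DRIVER=bug" */
	unsigned int size = sizeof(struct log) + text_len + dict_len;
	unsigned int pad_len = (-size) & (LOG_ALIGN - 1);

	/* prints: header=16 text=11 dict=22 pad=3 record=52 */
	printf("header=%u text=%u dict=%u pad=%u record=%u\n",
	       (unsigned int)sizeof(struct log), text_len, dict_len,
	       pad_len, size + pad_len);
	return 0;
}
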
203
204/*
205 * The logbuf_lock protects kmsg buffer, indices, counters. It is also
206 * used in interesting ways to provide interlocking in console_unlock();
207 */
208static DEFINE_RAW_SPINLOCK(logbuf_lock);
209
210/* the next printk record to read by syslog(READ) or /proc/kmsg */
211static u64 syslog_seq;
212static u32 syslog_idx;
213
214/* index and sequence number of the first record stored in the buffer */
215static u64 log_first_seq;
216static u32 log_first_idx;
217
218/* index and sequence number of the next record to store in the buffer */
219static u64 log_next_seq;
148#ifdef CONFIG_PRINTK 220#ifdef CONFIG_PRINTK
221static u32 log_next_idx;
222
223/* the next printk record to read after the last 'clear' command */
224static u64 clear_seq;
225static u32 clear_idx;
226
227#define LOG_LINE_MAX 1024
149 228
150static char __log_buf[__LOG_BUF_LEN]; 229/* record buffer */
230#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
231#define LOG_ALIGN 4
232#else
233#define LOG_ALIGN 8
234#endif
235#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
236static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
151static char *log_buf = __log_buf; 237static char *log_buf = __log_buf;
152static int log_buf_len = __LOG_BUF_LEN; 238static u32 log_buf_len = __LOG_BUF_LEN;
153static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ 239
154static int saved_console_loglevel = -1; 240/* cpu currently holding logbuf_lock */
241static volatile unsigned int logbuf_cpu = UINT_MAX;
242
243/* human readable text of the record */
244static char *log_text(const struct log *msg)
245{
246 return (char *)msg + sizeof(struct log);
247}
248
249/* optional key/value pair dictionary attached to the record */
250static char *log_dict(const struct log *msg)
251{
252 return (char *)msg + sizeof(struct log) + msg->text_len;
253}
254
255/* get record by index; idx must point to valid msg */
256static struct log *log_from_idx(u32 idx)
257{
258 struct log *msg = (struct log *)(log_buf + idx);
259
260 /*
261 * A length == 0 record is the end of buffer marker. Wrap around and
262 * read the message at the start of the buffer.
263 */
264 if (!msg->len)
265 return (struct log *)log_buf;
266 return msg;
267}
268
269/* get next record; idx must point to valid msg */
270static u32 log_next(u32 idx)
271{
272 struct log *msg = (struct log *)(log_buf + idx);
273
274 /* length == 0 indicates the end of the buffer; wrap */
275 /*
276 * A length == 0 record is the end of buffer marker. Wrap around and
277 * read the message at the start of the buffer as *this* one, and
278 * return the one after that.
279 */
280 if (!msg->len) {
281 msg = (struct log *)log_buf;
282 return msg->len;
283 }
284 return idx + msg->len;
285}
286
287/* insert record into the buffer, discard old ones, update heads */
288static void log_store(int facility, int level,
289 const char *dict, u16 dict_len,
290 const char *text, u16 text_len)
291{
292 struct log *msg;
293 u32 size, pad_len;
294
295 /* number of '\0' padding bytes to next message */
296 size = sizeof(struct log) + text_len + dict_len;
297 pad_len = (-size) & (LOG_ALIGN - 1);
298 size += pad_len;
299
300 while (log_first_seq < log_next_seq) {
301 u32 free;
302
303 if (log_next_idx > log_first_idx)
304 free = max(log_buf_len - log_next_idx, log_first_idx);
305 else
306 free = log_first_idx - log_next_idx;
307
308 if (free > size + sizeof(struct log))
309 break;
310
311 /* drop old messages until we have enough continuous space */
312 log_first_idx = log_next(log_first_idx);
313 log_first_seq++;
314 }
315
316 if (log_next_idx + size + sizeof(struct log) >= log_buf_len) {
317 /*
318 * This message + an additional empty header does not fit
319 * at the end of the buffer. Add an empty header with len == 0
320 * to signify a wrap around.
321 */
322 memset(log_buf + log_next_idx, 0, sizeof(struct log));
323 log_next_idx = 0;
324 }
325
326 /* fill message */
327 msg = (struct log *)(log_buf + log_next_idx);
328 memcpy(log_text(msg), text, text_len);
329 msg->text_len = text_len;
330 memcpy(log_dict(msg), dict, dict_len);
331 msg->dict_len = dict_len;
332 msg->level = (facility << 3) | (level & 7);
333 msg->ts_nsec = local_clock();
334 memset(log_dict(msg) + dict_len, 0, pad_len);
335 msg->len = sizeof(struct log) + text_len + dict_len + pad_len;
336
337 /* insert message */
338 log_next_idx += msg->len;
339 log_next_seq++;
340}
341
342/* /dev/kmsg - userspace message inject/listen interface */
343struct devkmsg_user {
344 u64 seq;
345 u32 idx;
346 struct mutex lock;
347 char buf[8192];
348};
349
350static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
351 unsigned long count, loff_t pos)
352{
353 char *buf, *line;
354 int i;
355 int level = default_message_loglevel;
356 int facility = 1; /* LOG_USER */
357 size_t len = iov_length(iv, count);
358 ssize_t ret = len;
359
360 if (len > LOG_LINE_MAX)
361 return -EINVAL;
362 buf = kmalloc(len+1, GFP_KERNEL);
363 if (buf == NULL)
364 return -ENOMEM;
365
366 line = buf;
367 for (i = 0; i < count; i++) {
368 if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len))
369 goto out;
370 line += iv[i].iov_len;
371 }
372
373 /*
374 * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace
375 * the decimal value represents 32bit, the lower 3 bit are the log
376 * level, the rest are the log facility.
377 *
378 * If no prefix or no userspace facility is specified, we
379 * enforce LOG_USER, to be able to reliably distinguish
380 * kernel-generated messages from userspace-injected ones.
381 */
382 line = buf;
383 if (line[0] == '<') {
384 char *endp = NULL;
385
386 i = simple_strtoul(line+1, &endp, 10);
387 if (endp && endp[0] == '>') {
388 level = i & 7;
389 if (i >> 3)
390 facility = i >> 3;
391 endp++;
392 len -= endp - line;
393 line = endp;
394 }
395 }
396 line[len] = '\0';
397
398 printk_emit(facility, level, NULL, 0, "%s", line);
399out:
400 kfree(buf);
401 return ret;
402}
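
devkmsg_writev() accepts an optional syslog "<N>" prefix where the low three bits of N are the level and the remaining bits the facility, falling back to LOG_USER so injected messages stay distinguishable from kernel ones. A small userspace sketch that injects one record; the message text is arbitrary, and 14 == (1 << 3) | 6, i.e. facility LOG_USER, level 6 (info).

/* Inject a message into the kernel log through /dev/kmsg. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char msg[] = "<14>hello from userspace\n";
	int fd = open("/dev/kmsg", O_WRONLY);

	if (fd < 0) {
		perror("open /dev/kmsg");
		return 1;
	}
	if (write(fd, msg, strlen(msg)) < 0)
		perror("write /dev/kmsg");
	close(fd);
	return 0;
}
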
403
404static ssize_t devkmsg_read(struct file *file, char __user *buf,
405 size_t count, loff_t *ppos)
406{
407 struct devkmsg_user *user = file->private_data;
408 struct log *msg;
409 u64 ts_usec;
410 size_t i;
411 size_t len;
412 ssize_t ret;
413
414 if (!user)
415 return -EBADF;
416
417 mutex_lock(&user->lock);
418 raw_spin_lock(&logbuf_lock);
419 while (user->seq == log_next_seq) {
420 if (file->f_flags & O_NONBLOCK) {
421 ret = -EAGAIN;
422 raw_spin_unlock(&logbuf_lock);
423 goto out;
424 }
425
426 raw_spin_unlock(&logbuf_lock);
427 ret = wait_event_interruptible(log_wait,
428 user->seq != log_next_seq);
429 if (ret)
430 goto out;
431 raw_spin_lock(&logbuf_lock);
432 }
433
434 if (user->seq < log_first_seq) {
435 /* our last seen message is gone, return error and reset */
436 user->idx = log_first_idx;
437 user->seq = log_first_seq;
438 ret = -EPIPE;
439 raw_spin_unlock(&logbuf_lock);
440 goto out;
441 }
442
443 msg = log_from_idx(user->idx);
444 ts_usec = msg->ts_nsec;
445 do_div(ts_usec, 1000);
446 len = sprintf(user->buf, "%u,%llu,%llu;",
447 msg->level, user->seq, ts_usec);
448
449 /* escape non-printable characters */
450 for (i = 0; i < msg->text_len; i++) {
451 unsigned char c = log_text(msg)[i];
452
453 if (c < ' ' || c >= 128)
454 len += sprintf(user->buf + len, "\\x%02x", c);
455 else
456 user->buf[len++] = c;
457 }
458 user->buf[len++] = '\n';
459
460 if (msg->dict_len) {
461 bool line = true;
462
463 for (i = 0; i < msg->dict_len; i++) {
464 unsigned char c = log_dict(msg)[i];
465
466 if (line) {
467 user->buf[len++] = ' ';
468 line = false;
469 }
470
471 if (c == '\0') {
472 user->buf[len++] = '\n';
473 line = true;
474 continue;
475 }
476
477 if (c < ' ' || c >= 128) {
478 len += sprintf(user->buf + len, "\\x%02x", c);
479 continue;
480 }
481
482 user->buf[len++] = c;
483 }
484 user->buf[len++] = '\n';
485 }
486
487 user->idx = log_next(user->idx);
488 user->seq++;
489 raw_spin_unlock(&logbuf_lock);
490
491 if (len > count) {
492 ret = -EINVAL;
493 goto out;
494 }
495
496 if (copy_to_user(buf, user->buf, len)) {
497 ret = -EFAULT;
498 goto out;
499 }
500 ret = len;
501out:
502 mutex_unlock(&user->lock);
503 return ret;
504}
505
506static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
507{
508 struct devkmsg_user *user = file->private_data;
509 loff_t ret = 0;
510
511 if (!user)
512 return -EBADF;
513 if (offset)
514 return -ESPIPE;
515
516 raw_spin_lock(&logbuf_lock);
517 switch (whence) {
518 case SEEK_SET:
519 /* the first record */
520 user->idx = log_first_idx;
521 user->seq = log_first_seq;
522 break;
523 case SEEK_DATA:
524 /*
525 * The first record after the last SYSLOG_ACTION_CLEAR,
526 * like issued by 'dmesg -c'. Reading /dev/kmsg itself
527 * changes no global state, and does not clear anything.
528 */
529 user->idx = clear_idx;
530 user->seq = clear_seq;
531 break;
532 case SEEK_END:
533 /* after the last record */
534 user->idx = log_next_idx;
535 user->seq = log_next_seq;
536 break;
537 default:
538 ret = -EINVAL;
539 }
540 raw_spin_unlock(&logbuf_lock);
541 return ret;
542}
543
544static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
545{
546 struct devkmsg_user *user = file->private_data;
547 int ret = 0;
548
549 if (!user)
550 return POLLERR|POLLNVAL;
551
552 poll_wait(file, &log_wait, wait);
553
554 raw_spin_lock(&logbuf_lock);
555 if (user->seq < log_next_seq) {
556 /* return error when data has vanished underneath us */
557 if (user->seq < log_first_seq)
558 ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
559 ret = POLLIN|POLLRDNORM;
560 }
561 raw_spin_unlock(&logbuf_lock);
562
563 return ret;
564}
565
566static int devkmsg_open(struct inode *inode, struct file *file)
567{
568 struct devkmsg_user *user;
569 int err;
570
571 /* write-only does not need any file context */
572 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
573 return 0;
574
575 err = security_syslog(SYSLOG_ACTION_READ_ALL);
576 if (err)
577 return err;
578
579 user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
580 if (!user)
581 return -ENOMEM;
582
583 mutex_init(&user->lock);
584
585 raw_spin_lock(&logbuf_lock);
586 user->idx = log_first_idx;
587 user->seq = log_first_seq;
588 raw_spin_unlock(&logbuf_lock);
589
590 file->private_data = user;
591 return 0;
592}
593
594static int devkmsg_release(struct inode *inode, struct file *file)
595{
596 struct devkmsg_user *user = file->private_data;
597
598 if (!user)
599 return 0;
600
601 mutex_destroy(&user->lock);
602 kfree(user);
603 return 0;
604}
605
606const struct file_operations kmsg_fops = {
607 .open = devkmsg_open,
608 .read = devkmsg_read,
609 .aio_write = devkmsg_writev,
610 .llseek = devkmsg_llseek,
611 .poll = devkmsg_poll,
612 .release = devkmsg_release,
613};
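
On the read side, each read() on /dev/kmsg returns one record in the "level,seq,timestamp;<text>" form described in the format comment near the top of this file, with non-printable bytes escaped and any dictionary attached as space-prefixed continuation lines. A hedged reader sketch with minimal error handling; it follows the comment's advice to ignore extra comma-separated prefix values and take everything after the first ';'.

/* Minimal /dev/kmsg reader: one read() returns one whole record. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[8192];
	int fd = open("/dev/kmsg", O_RDONLY | O_NONBLOCK);

	if (fd < 0) {
		perror("open /dev/kmsg");
		return 1;
	}

	for (;;) {
		ssize_t n = read(fd, buf, sizeof(buf) - 1);

		if (n <= 0)		/* no more records (or EPIPE on overrun) */
			break;
		buf[n] = '\0';

		/* prefix is "level,seq,usec;": the exported level field
		 * still carries the facility in its upper bits */
		unsigned int level_facility = (unsigned int)atoi(buf);
		char *text = strchr(buf, ';');

		if (!text)
			continue;
		printf("facility=%u level=%u text=%s",
		       level_facility >> 3, level_facility & 7, text + 1);
	}

	close(fd);
	return 0;
}
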
155 614
156#ifdef CONFIG_KEXEC 615#ifdef CONFIG_KEXEC
157/* 616/*
@@ -165,9 +624,9 @@ static int saved_console_loglevel = -1;
165void log_buf_kexec_setup(void) 624void log_buf_kexec_setup(void)
166{ 625{
167 VMCOREINFO_SYMBOL(log_buf); 626 VMCOREINFO_SYMBOL(log_buf);
168 VMCOREINFO_SYMBOL(log_end);
169 VMCOREINFO_SYMBOL(log_buf_len); 627 VMCOREINFO_SYMBOL(log_buf_len);
170 VMCOREINFO_SYMBOL(logged_chars); 628 VMCOREINFO_SYMBOL(log_first_idx);
629 VMCOREINFO_SYMBOL(log_next_idx);
171} 630}
172#endif 631#endif
173 632
@@ -191,7 +650,6 @@ early_param("log_buf_len", log_buf_len_setup);
191void __init setup_log_buf(int early) 650void __init setup_log_buf(int early)
192{ 651{
193 unsigned long flags; 652 unsigned long flags;
194 unsigned start, dest_idx, offset;
195 char *new_log_buf; 653 char *new_log_buf;
196 int free; 654 int free;
197 655
@@ -219,20 +677,8 @@ void __init setup_log_buf(int early)
219 log_buf_len = new_log_buf_len; 677 log_buf_len = new_log_buf_len;
220 log_buf = new_log_buf; 678 log_buf = new_log_buf;
221 new_log_buf_len = 0; 679 new_log_buf_len = 0;
222 free = __LOG_BUF_LEN - log_end; 680 free = __LOG_BUF_LEN - log_next_idx;
223 681 memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
224 offset = start = min(con_start, log_start);
225 dest_idx = 0;
226 while (start != log_end) {
227 unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1);
228
229 log_buf[dest_idx] = __log_buf[log_idx_mask];
230 start++;
231 dest_idx++;
232 }
233 log_start -= offset;
234 con_start -= offset;
235 log_end -= offset;
236 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 682 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
237 683
238 pr_info("log_buf_len: %d\n", log_buf_len); 684 pr_info("log_buf_len: %d\n", log_buf_len);
@@ -332,11 +778,202 @@ static int check_syslog_permissions(int type, bool from_file)
332 return 0; 778 return 0;
333} 779}
334 780
781#if defined(CONFIG_PRINTK_TIME)
782static bool printk_time = 1;
783#else
784static bool printk_time;
785#endif
786module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
787
788static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
789{
790 size_t len = 0;
791
792 if (syslog) {
793 if (buf) {
794 len += sprintf(buf, "<%u>", msg->level);
795 } else {
796 len += 3;
797 if (msg->level > 9)
798 len++;
799 if (msg->level > 99)
800 len++;
801 }
802 }
803
804 if (printk_time) {
805 if (buf) {
806 unsigned long long ts = msg->ts_nsec;
807 unsigned long rem_nsec = do_div(ts, 1000000000);
808
809 len += sprintf(buf + len, "[%5lu.%06lu] ",
810 (unsigned long) ts, rem_nsec / 1000);
811 } else {
812 len += 15;
813 }
814 }
815
816 return len;
817}
818
819static size_t msg_print_text(const struct log *msg, bool syslog,
820 char *buf, size_t size)
821{
822 const char *text = log_text(msg);
823 size_t text_size = msg->text_len;
824 size_t len = 0;
825
826 do {
827 const char *next = memchr(text, '\n', text_size);
828 size_t text_len;
829
830 if (next) {
831 text_len = next - text;
832 next++;
833 text_size -= next - text;
834 } else {
835 text_len = text_size;
836 }
837
838 if (buf) {
839 if (print_prefix(msg, syslog, NULL) +
840 text_len + 1 >= size - len)
841 break;
842
843 len += print_prefix(msg, syslog, buf + len);
844 memcpy(buf + len, text, text_len);
845 len += text_len;
846 buf[len++] = '\n';
847 } else {
848 /* SYSLOG_ACTION_* buffer size only calculation */
849 len += print_prefix(msg, syslog, NULL);
850 len += text_len + 1;
851 }
852
853 text = next;
854 } while (text);
855
856 return len;
857}
858
859static int syslog_print(char __user *buf, int size)
860{
861 char *text;
862 struct log *msg;
863 int len;
864
865 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
866 if (!text)
867 return -ENOMEM;
868
869 raw_spin_lock_irq(&logbuf_lock);
870 if (syslog_seq < log_first_seq) {
871 /* messages are gone, move to first one */
872 syslog_seq = log_first_seq;
873 syslog_idx = log_first_idx;
874 }
875 msg = log_from_idx(syslog_idx);
876 len = msg_print_text(msg, true, text, LOG_LINE_MAX);
877 syslog_idx = log_next(syslog_idx);
878 syslog_seq++;
879 raw_spin_unlock_irq(&logbuf_lock);
880
881 if (len > 0 && copy_to_user(buf, text, len))
882 len = -EFAULT;
883
884 kfree(text);
885 return len;
886}
887
888static int syslog_print_all(char __user *buf, int size, bool clear)
889{
890 char *text;
891 int len = 0;
892
893 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
894 if (!text)
895 return -ENOMEM;
896
897 raw_spin_lock_irq(&logbuf_lock);
898 if (buf) {
899 u64 next_seq;
900 u64 seq;
901 u32 idx;
902
903 if (clear_seq < log_first_seq) {
904 /* messages are gone, move to first available one */
905 clear_seq = log_first_seq;
906 clear_idx = log_first_idx;
907 }
908
909 /*
910 * Find first record that fits, including all following records,
911 * into the user-provided buffer for this dump.
912 */
913 seq = clear_seq;
914 idx = clear_idx;
915 while (seq < log_next_seq) {
916 struct log *msg = log_from_idx(idx);
917
918 len += msg_print_text(msg, true, NULL, 0);
919 idx = log_next(idx);
920 seq++;
921 }
922 seq = clear_seq;
923 idx = clear_idx;
924 while (len > size && seq < log_next_seq) {
925 struct log *msg = log_from_idx(idx);
926
927 len -= msg_print_text(msg, true, NULL, 0);
928 idx = log_next(idx);
929 seq++;
930 }
931
932 /* last message in this dump */
933 next_seq = log_next_seq;
934
935 len = 0;
936 while (len >= 0 && seq < next_seq) {
937 struct log *msg = log_from_idx(idx);
938 int textlen;
939
940 textlen = msg_print_text(msg, true, text, LOG_LINE_MAX);
941 if (textlen < 0) {
942 len = textlen;
943 break;
944 }
945 idx = log_next(idx);
946 seq++;
947
948 raw_spin_unlock_irq(&logbuf_lock);
949 if (copy_to_user(buf + len, text, textlen))
950 len = -EFAULT;
951 else
952 len += textlen;
953 raw_spin_lock_irq(&logbuf_lock);
954
955 if (seq < log_first_seq) {
956 /* messages are gone, move to next one */
957 seq = log_first_seq;
958 idx = log_first_idx;
959 }
960 }
961 }
962
963 if (clear) {
964 clear_seq = log_next_seq;
965 clear_idx = log_next_idx;
966 }
967 raw_spin_unlock_irq(&logbuf_lock);
968
969 kfree(text);
970 return len;
971}
972
335int do_syslog(int type, char __user *buf, int len, bool from_file) 973int do_syslog(int type, char __user *buf, int len, bool from_file)
336{ 974{
337 unsigned i, j, limit, count; 975 bool clear = false;
338 int do_clear = 0; 976 static int saved_console_loglevel = -1;
339 char c;
340 int error; 977 int error;
341 978
342 error = check_syslog_permissions(type, from_file); 979 error = check_syslog_permissions(type, from_file);
@@ -364,28 +1001,14 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
364 goto out; 1001 goto out;
365 } 1002 }
366 error = wait_event_interruptible(log_wait, 1003 error = wait_event_interruptible(log_wait,
367 (log_start - log_end)); 1004 syslog_seq != log_next_seq);
368 if (error) 1005 if (error)
369 goto out; 1006 goto out;
370 i = 0; 1007 error = syslog_print(buf, len);
371 raw_spin_lock_irq(&logbuf_lock);
372 while (!error && (log_start != log_end) && i < len) {
373 c = LOG_BUF(log_start);
374 log_start++;
375 raw_spin_unlock_irq(&logbuf_lock);
376 error = __put_user(c,buf);
377 buf++;
378 i++;
379 cond_resched();
380 raw_spin_lock_irq(&logbuf_lock);
381 }
382 raw_spin_unlock_irq(&logbuf_lock);
383 if (!error)
384 error = i;
385 break; 1008 break;
386 /* Read/clear last kernel messages */ 1009 /* Read/clear last kernel messages */
387 case SYSLOG_ACTION_READ_CLEAR: 1010 case SYSLOG_ACTION_READ_CLEAR:
388 do_clear = 1; 1011 clear = true;
389 /* FALL THRU */ 1012 /* FALL THRU */
390 /* Read last kernel messages */ 1013 /* Read last kernel messages */
391 case SYSLOG_ACTION_READ_ALL: 1014 case SYSLOG_ACTION_READ_ALL:
@@ -399,52 +1022,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
399 error = -EFAULT; 1022 error = -EFAULT;
400 goto out; 1023 goto out;
401 } 1024 }
402 count = len; 1025 error = syslog_print_all(buf, len, clear);
403 if (count > log_buf_len)
404 count = log_buf_len;
405 raw_spin_lock_irq(&logbuf_lock);
406 if (count > logged_chars)
407 count = logged_chars;
408 if (do_clear)
409 logged_chars = 0;
410 limit = log_end;
411 /*
412 * __put_user() could sleep, and while we sleep
413 * printk() could overwrite the messages
414 * we try to copy to user space. Therefore
415 * the messages are copied in reverse. <manfreds>
416 */
417 for (i = 0; i < count && !error; i++) {
418 j = limit-1-i;
419 if (j + log_buf_len < log_end)
420 break;
421 c = LOG_BUF(j);
422 raw_spin_unlock_irq(&logbuf_lock);
423 error = __put_user(c,&buf[count-1-i]);
424 cond_resched();
425 raw_spin_lock_irq(&logbuf_lock);
426 }
427 raw_spin_unlock_irq(&logbuf_lock);
428 if (error)
429 break;
430 error = i;
431 if (i != count) {
432 int offset = count-error;
433 /* buffer overflow during copy, correct user buffer. */
434 for (i = 0; i < error; i++) {
435 if (__get_user(c,&buf[i+offset]) ||
436 __put_user(c,&buf[i])) {
437 error = -EFAULT;
438 break;
439 }
440 cond_resched();
441 }
442 }
443 break; 1026 break;
444 /* Clear ring buffer */ 1027 /* Clear ring buffer */
445 case SYSLOG_ACTION_CLEAR: 1028 case SYSLOG_ACTION_CLEAR:
446 logged_chars = 0; 1029 syslog_print_all(NULL, 0, true);
447 break;
448 /* Disable logging to console */ 1030 /* Disable logging to console */
449 case SYSLOG_ACTION_CONSOLE_OFF: 1031 case SYSLOG_ACTION_CONSOLE_OFF:
450 if (saved_console_loglevel == -1) 1032 if (saved_console_loglevel == -1)
@@ -472,7 +1054,35 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
472 break; 1054 break;
473 /* Number of chars in the log buffer */ 1055 /* Number of chars in the log buffer */
474 case SYSLOG_ACTION_SIZE_UNREAD: 1056 case SYSLOG_ACTION_SIZE_UNREAD:
475 error = log_end - log_start; 1057 raw_spin_lock_irq(&logbuf_lock);
1058 if (syslog_seq < log_first_seq) {
1059 /* messages are gone, move to first one */
1060 syslog_seq = log_first_seq;
1061 syslog_idx = log_first_idx;
1062 }
1063 if (from_file) {
1064 /*
1065 * Short-cut for poll(/proc/kmsg) which simply checks
1066 * for pending data, not the size; return the count of
1067 * records, not the length.
1068 */
1069 error = log_next_idx - syslog_idx;
1070 } else {
1071 u64 seq;
1072 u32 idx;
1073
1074 error = 0;
1075 seq = syslog_seq;
1076 idx = syslog_idx;
1077 while (seq < log_next_seq) {
1078 struct log *msg = log_from_idx(idx);
1079
1080 error += msg_print_text(msg, true, NULL, 0);
1081 idx = log_next(idx);
1082 seq++;
1083 }
1084 }
1085 raw_spin_unlock_irq(&logbuf_lock);
476 break; 1086 break;
477 /* Size of the log buffer */ 1087 /* Size of the log buffer */
478 case SYSLOG_ACTION_SIZE_BUFFER: 1088 case SYSLOG_ACTION_SIZE_BUFFER:
@@ -501,29 +1111,11 @@ void kdb_syslog_data(char *syslog_data[4])
501{ 1111{
502 syslog_data[0] = log_buf; 1112 syslog_data[0] = log_buf;
503 syslog_data[1] = log_buf + log_buf_len; 1113 syslog_data[1] = log_buf + log_buf_len;
504 syslog_data[2] = log_buf + log_end - 1114 syslog_data[2] = log_buf + log_first_idx;
505 (logged_chars < log_buf_len ? logged_chars : log_buf_len); 1115 syslog_data[3] = log_buf + log_next_idx;
506 syslog_data[3] = log_buf + log_end;
507} 1116}
508#endif /* CONFIG_KGDB_KDB */ 1117#endif /* CONFIG_KGDB_KDB */
509 1118
510/*
511 * Call the console drivers on a range of log_buf
512 */
513static void __call_console_drivers(unsigned start, unsigned end)
514{
515 struct console *con;
516
517 for_each_console(con) {
518 if (exclusive_console && con != exclusive_console)
519 continue;
520 if ((con->flags & CON_ENABLED) && con->write &&
521 (cpu_online(smp_processor_id()) ||
522 (con->flags & CON_ANYTIME)))
523 con->write(con, &LOG_BUF(start), end - start);
524 }
525}
526
527static bool __read_mostly ignore_loglevel; 1119static bool __read_mostly ignore_loglevel;
528 1120
529static int __init ignore_loglevel_setup(char *str) 1121static int __init ignore_loglevel_setup(char *str)
@@ -540,142 +1132,33 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
540 "print all kernel messages to the console."); 1132 "print all kernel messages to the console.");
541 1133
542/* 1134/*
543 * Write out chars from start to end - 1 inclusive
544 */
545static void _call_console_drivers(unsigned start,
546 unsigned end, int msg_log_level)
547{
548 trace_console(&LOG_BUF(0), start, end, log_buf_len);
549
550 if ((msg_log_level < console_loglevel || ignore_loglevel) &&
551 console_drivers && start != end) {
552 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
553 /* wrapped write */
554 __call_console_drivers(start & LOG_BUF_MASK,
555 log_buf_len);
556 __call_console_drivers(0, end & LOG_BUF_MASK);
557 } else {
558 __call_console_drivers(start, end);
559 }
560 }
561}
562
563/*
564 * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the
565 * lower 3 bit are the log level, the rest are the log facility. In case
566 * userspace passes usual userspace syslog messages to /dev/kmsg or
567 * /dev/ttyprintk, the log prefix might contain the facility. Printk needs
568 * to extract the correct log level for in-kernel processing, and not mangle
569 * the original value.
570 *
571 * If a prefix is found, the length of the prefix is returned. If 'level' is
572 * passed, it will be filled in with the log level without a possible facility
573 * value. If 'special' is passed, the special printk prefix chars are accepted
574 * and returned. If no valid header is found, 0 is returned and the passed
575 * variables are not touched.
576 */
577static size_t log_prefix(const char *p, unsigned int *level, char *special)
578{
579 unsigned int lev = 0;
580 char sp = '\0';
581 size_t len;
582
583 if (p[0] != '<' || !p[1])
584 return 0;
585 if (p[2] == '>') {
586 /* usual single digit level number or special char */
587 switch (p[1]) {
588 case '0' ... '7':
589 lev = p[1] - '0';
590 break;
591 case 'c': /* KERN_CONT */
592 case 'd': /* KERN_DEFAULT */
593 sp = p[1];
594 break;
595 default:
596 return 0;
597 }
598 len = 3;
599 } else {
600 /* multi digit including the level and facility number */
601 char *endp = NULL;
602
603 lev = (simple_strtoul(&p[1], &endp, 10) & 7);
604 if (endp == NULL || endp[0] != '>')
605 return 0;
606 len = (endp + 1) - p;
607 }
608
609 /* do not accept special char if not asked for */
610 if (sp && !special)
611 return 0;
612
613 if (special) {
614 *special = sp;
615 /* return special char, do not touch level */
616 if (sp)
617 return len;
618 }
619
620 if (level)
621 *level = lev;
622 return len;
623}
624
625/*
626 * Call the console drivers, asking them to write out 1135 * Call the console drivers, asking them to write out
627 * log_buf[start] to log_buf[end - 1]. 1136 * log_buf[start] to log_buf[end - 1].
628 * The console_lock must be held. 1137 * The console_lock must be held.
629 */ 1138 */
630static void call_console_drivers(unsigned start, unsigned end) 1139static void call_console_drivers(int level, const char *text, size_t len)
631{ 1140{
632 unsigned cur_index, start_print; 1141 struct console *con;
633 static int msg_level = -1;
634 1142
635 BUG_ON(((int)(start - end)) > 0); 1143 trace_console(text, 0, len, len);
636 1144
637 cur_index = start; 1145 if (level >= console_loglevel && !ignore_loglevel)
638 start_print = start; 1146 return;
639 while (cur_index != end) { 1147 if (!console_drivers)
640 if (msg_level < 0 && ((end - cur_index) > 2)) { 1148 return;
641 /* strip log prefix */
642 cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL);
643 start_print = cur_index;
644 }
645 while (cur_index != end) {
646 char c = LOG_BUF(cur_index);
647
648 cur_index++;
649 if (c == '\n') {
650 if (msg_level < 0) {
651 /*
652 * printk() has already given us loglevel tags in
653 * the buffer. This code is here in case the
654 * log buffer has wrapped right round and scribbled
655 * on those tags
656 */
657 msg_level = default_message_loglevel;
658 }
659 _call_console_drivers(start_print, cur_index, msg_level);
660 msg_level = -1;
661 start_print = cur_index;
662 break;
663 }
664 }
665 }
666 _call_console_drivers(start_print, end, msg_level);
667}
668 1149
669static void emit_log_char(char c) 1150 for_each_console(con) {
670{ 1151 if (exclusive_console && con != exclusive_console)
671 LOG_BUF(log_end) = c; 1152 continue;
672 log_end++; 1153 if (!(con->flags & CON_ENABLED))
673 if (log_end - log_start > log_buf_len) 1154 continue;
674 log_start = log_end - log_buf_len; 1155 if (!con->write)
675 if (log_end - con_start > log_buf_len) 1156 continue;
676 con_start = log_end - log_buf_len; 1157 if (!cpu_online(smp_processor_id()) &&
677 if (logged_chars < log_buf_len) 1158 !(con->flags & CON_ANYTIME))
678 logged_chars++; 1159 continue;
1160 con->write(con, text, len);
1161 }
679} 1162}
680 1163
681/* 1164/*
@@ -700,16 +1183,6 @@ static void zap_locks(void)
700 sema_init(&console_sem, 1); 1183 sema_init(&console_sem, 1);
701} 1184}
702 1185
703#if defined(CONFIG_PRINTK_TIME)
704static bool printk_time = 1;
705#else
706static bool printk_time = 0;
707#endif
708module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
709
710static bool always_kmsg_dump;
711module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
712
713/* Check if we have any console registered that can be called early in boot. */ 1186/* Check if we have any console registered that can be called early in boot. */
714static int have_callable_console(void) 1187static int have_callable_console(void)
715{ 1188{
@@ -722,51 +1195,6 @@ static int have_callable_console(void)
722 return 0; 1195 return 0;
723} 1196}
724 1197
725/**
726 * printk - print a kernel message
727 * @fmt: format string
728 *
729 * This is printk(). It can be called from any context. We want it to work.
730 *
731 * We try to grab the console_lock. If we succeed, it's easy - we log the output and
732 * call the console drivers. If we fail to get the semaphore we place the output
733 * into the log buffer and return. The current holder of the console_sem will
734 * notice the new output in console_unlock(); and will send it to the
735 * consoles before releasing the lock.
736 *
737 * One effect of this deferred printing is that code which calls printk() and
738 * then changes console_loglevel may break. This is because console_loglevel
739 * is inspected when the actual printing occurs.
740 *
741 * See also:
742 * printf(3)
743 *
744 * See the vsnprintf() documentation for format string extensions over C99.
745 */
746
747asmlinkage int printk(const char *fmt, ...)
748{
749 va_list args;
750 int r;
751
752#ifdef CONFIG_KGDB_KDB
753 if (unlikely(kdb_trap_printk)) {
754 va_start(args, fmt);
755 r = vkdb_printf(fmt, args);
756 va_end(args);
757 return r;
758 }
759#endif
760 va_start(args, fmt);
761 r = vprintk(fmt, args);
762 va_end(args);
763
764 return r;
765}
766
767/* cpu currently holding logbuf_lock */
768static volatile unsigned int printk_cpu = UINT_MAX;
769
770/* 1198/*
771 * Can we actually use the console at this time on this cpu? 1199 * Can we actually use the console at this time on this cpu?
772 * 1200 *
@@ -810,17 +1238,12 @@ static int console_trylock_for_printk(unsigned int cpu)
810 retval = 0; 1238 retval = 0;
811 } 1239 }
812 } 1240 }
813 printk_cpu = UINT_MAX; 1241 logbuf_cpu = UINT_MAX;
814 if (wake) 1242 if (wake)
815 up(&console_sem); 1243 up(&console_sem);
816 raw_spin_unlock(&logbuf_lock); 1244 raw_spin_unlock(&logbuf_lock);
817 return retval; 1245 return retval;
818} 1246}
819static const char recursion_bug_msg [] =
820 KERN_CRIT "BUG: recent printk recursion!\n";
821static int recursion_bug;
822static int new_text_line = 1;
823static char printk_buf[1024];
824 1247
825int printk_delay_msec __read_mostly; 1248int printk_delay_msec __read_mostly;
826 1249
@@ -836,15 +1259,23 @@ static inline void printk_delay(void)
836 } 1259 }
837} 1260}
838 1261
839asmlinkage int vprintk(const char *fmt, va_list args) 1262asmlinkage int vprintk_emit(int facility, int level,
1263 const char *dict, size_t dictlen,
1264 const char *fmt, va_list args)
840{ 1265{
841 int printed_len = 0; 1266 static int recursion_bug;
842 int current_log_level = default_message_loglevel; 1267 static char cont_buf[LOG_LINE_MAX];
1268 static size_t cont_len;
1269 static int cont_level;
1270 static struct task_struct *cont_task;
1271 static char textbuf[LOG_LINE_MAX];
1272 char *text = textbuf;
1273 size_t text_len;
843 unsigned long flags; 1274 unsigned long flags;
844 int this_cpu; 1275 int this_cpu;
845 char *p; 1276 bool newline = false;
846 size_t plen; 1277 bool prefix = false;
847 char special; 1278 int printed_len = 0;
848 1279
849 boot_delay_msec(); 1280 boot_delay_msec();
850 printk_delay(); 1281 printk_delay();
@@ -856,7 +1287,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
856 /* 1287 /*
857 * Ouch, printk recursed into itself! 1288 * Ouch, printk recursed into itself!
858 */ 1289 */
859 if (unlikely(printk_cpu == this_cpu)) { 1290 if (unlikely(logbuf_cpu == this_cpu)) {
860 /* 1291 /*
861 * If a crash is occurring during printk() on this CPU, 1292 * If a crash is occurring during printk() on this CPU,
862 * then try to get the crash message out but make sure 1293 * then try to get the crash message out but make sure
@@ -873,97 +1304,110 @@ asmlinkage int vprintk(const char *fmt, va_list args)
873 1304
874 lockdep_off(); 1305 lockdep_off();
875 raw_spin_lock(&logbuf_lock); 1306 raw_spin_lock(&logbuf_lock);
876 printk_cpu = this_cpu; 1307 logbuf_cpu = this_cpu;
877 1308
878 if (recursion_bug) { 1309 if (recursion_bug) {
1310 static const char recursion_msg[] =
1311 "BUG: recent printk recursion!";
1312
879 recursion_bug = 0; 1313 recursion_bug = 0;
880 strcpy(printk_buf, recursion_bug_msg); 1314 printed_len += strlen(recursion_msg);
881 printed_len = strlen(recursion_bug_msg); 1315 /* emit KERN_CRIT message */
1316 log_store(0, 2, NULL, 0, recursion_msg, printed_len);
882 } 1317 }
883 /* Emit the output into the temporary buffer */
884 printed_len += vscnprintf(printk_buf + printed_len,
885 sizeof(printk_buf) - printed_len, fmt, args);
886 1318
887 p = printk_buf; 1319 /*
1320 * The printf needs to come first; we need the syslog
1321 * prefix which might be passed-in as a parameter.
1322 */
1323 text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
888 1324
889 /* Read log level and handle special printk prefix */ 1325 /* mark and strip a trailing newline */
890 plen = log_prefix(p, &current_log_level, &special); 1326 if (text_len && text[text_len-1] == '\n') {
891 if (plen) { 1327 text_len--;
892 p += plen; 1328 newline = true;
1329 }
893 1330
894 switch (special) { 1331 /* strip syslog prefix and extract log level or control flags */
895 case 'c': /* Strip <c> KERN_CONT, continue line */ 1332 if (text[0] == '<' && text[1] && text[2] == '>') {
896 plen = 0; 1333 switch (text[1]) {
897 break; 1334 case '0' ... '7':
898 case 'd': /* Strip <d> KERN_DEFAULT, start new line */ 1335 if (level == -1)
899 plen = 0; 1336 level = text[1] - '0';
900 default: 1337 case 'd': /* KERN_DEFAULT */
901 if (!new_text_line) { 1338 prefix = true;
902 emit_log_char('\n'); 1339 case 'c': /* KERN_CONT */
903 new_text_line = 1; 1340 text += 3;
904 } 1341 text_len -= 3;
905 } 1342 }
906 } 1343 }
907 1344
908 /* 1345 if (level == -1)
909 * Copy the output into log_buf. If the caller didn't provide 1346 level = default_message_loglevel;
910 * the appropriate log prefix, we insert them here
911 */
912 for (; *p; p++) {
913 if (new_text_line) {
914 new_text_line = 0;
915
916 if (plen) {
917 /* Copy original log prefix */
918 int i;
919
920 for (i = 0; i < plen; i++)
921 emit_log_char(printk_buf[i]);
922 printed_len += plen;
923 } else {
924 /* Add log prefix */
925 emit_log_char('<');
926 emit_log_char(current_log_level + '0');
927 emit_log_char('>');
928 printed_len += 3;
929 }
930 1347
931 if (printk_time) { 1348 if (dict) {
932 /* Add the current time stamp */ 1349 prefix = true;
933 char tbuf[50], *tp; 1350 newline = true;
934 unsigned tlen; 1351 }
935 unsigned long long t;
936 unsigned long nanosec_rem;
937
938 t = cpu_clock(printk_cpu);
939 nanosec_rem = do_div(t, 1000000000);
940 tlen = sprintf(tbuf, "[%5lu.%06lu] ",
941 (unsigned long) t,
942 nanosec_rem / 1000);
943
944 for (tp = tbuf; tp < tbuf + tlen; tp++)
945 emit_log_char(*tp);
946 printed_len += tlen;
947 }
948 1352
949 if (!*p) 1353 if (!newline) {
950 break; 1354 if (cont_len && (prefix || cont_task != current)) {
1355 /*
1356 * Flush earlier buffer, which is either from a
1357 * different thread, or when we got a new prefix.
1358 */
1359 log_store(facility, cont_level, NULL, 0, cont_buf, cont_len);
1360 cont_len = 0;
951 } 1361 }
952 1362
953 emit_log_char(*p); 1363 if (!cont_len) {
954 if (*p == '\n') 1364 cont_level = level;
955 new_text_line = 1; 1365 cont_task = current;
1366 }
1367
1368 /* buffer or append to earlier buffer from the same thread */
1369 if (cont_len + text_len > sizeof(cont_buf))
1370 text_len = sizeof(cont_buf) - cont_len;
1371 memcpy(cont_buf + cont_len, text, text_len);
1372 cont_len += text_len;
1373 } else {
1374 if (cont_len && cont_task == current) {
1375 if (prefix) {
1376 /*
1377 * New prefix from the same thread; flush. We
1378 * either got no earlier newline, or we race
1379 * with an interrupt.
1380 */
1381 log_store(facility, cont_level,
1382 NULL, 0, cont_buf, cont_len);
1383 cont_len = 0;
1384 }
1385
1386 /* append to the earlier buffer and flush */
1387 if (cont_len + text_len > sizeof(cont_buf))
1388 text_len = sizeof(cont_buf) - cont_len;
1389 memcpy(cont_buf + cont_len, text, text_len);
1390 cont_len += text_len;
1391 log_store(facility, cont_level,
1392 NULL, 0, cont_buf, cont_len);
1393 cont_len = 0;
1394 cont_task = NULL;
1395 printed_len = cont_len;
1396 } else {
1397 /* ordinary single and terminated line */
1398 log_store(facility, level,
1399 dict, dictlen, text, text_len);
1400 printed_len = text_len;
1401 }
956 } 1402 }
957 1403
958 /* 1404 /*
959 * Try to acquire and then immediately release the 1405 * Try to acquire and then immediately release the console semaphore.
960 * console semaphore. The release will do all the 1406 * The release will print out buffers and wake up /dev/kmsg and syslog()
961 * actual magic (print out buffers, wake up klogd, 1407 * users.
962 * etc).
963 * 1408 *
964 * The console_trylock_for_printk() function 1409 * The console_trylock_for_printk() function will release 'logbuf_lock'
965 * will release 'logbuf_lock' regardless of whether it 1410 * regardless of whether it actually gets the console semaphore or not.
966 * actually gets the semaphore or not.
967 */ 1411 */
968 if (console_trylock_for_printk(this_cpu)) 1412 if (console_trylock_for_printk(this_cpu))
969 console_unlock(); 1413 console_unlock();
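
The vprintk_emit() hunk above buffers line fragments from a single task in cont_buf and only emits a complete record when a newline arrives, flushing early if a different task or an explicit prefix shows up. The following is a minimal userspace model of that continuation-buffer idea; the helper names (cont_add(), store_record()) are invented for illustration and none of the kernel's locking or facility handling is reproduced.

#include <stdio.h>
#include <string.h>

static char cont_buf[256];
static size_t cont_len;
static int cont_owner = -1;             /* id of the task that owns the buffer */

static void store_record(const char *text, size_t len)
{
        printf("record: %.*s\n", (int)len, text);
}

static void cont_add(int task, const char *text, int has_newline, int has_prefix)
{
        size_t len = strlen(text);

        /* flush data buffered for another task, or when a new prefix arrives */
        if (cont_len && (has_prefix || cont_owner != task)) {
                store_record(cont_buf, cont_len);
                cont_len = 0;
        }
        if (!cont_len)
                cont_owner = task;
        if (cont_len + len > sizeof(cont_buf))
                len = sizeof(cont_buf) - cont_len;      /* truncate, as the kernel does */
        memcpy(cont_buf + cont_len, text, len);
        cont_len += len;
        if (has_newline) {
                store_record(cont_buf, cont_len);       /* complete line: emit one record */
                cont_len = 0;
                cont_owner = -1;
        }
}

int main(void)
{
        cont_add(1, "disk sda: ", 0, 0);
        cont_add(1, "ok", 1, 0);                /* one record: "disk sda: ok" */
        cont_add(2, "partial ", 0, 0);
        cont_add(3, "other line", 1, 1);        /* flushes task 2's fragment first */
        return 0;
}

The point of the per-task owner check is the same as in the kernel code: fragments from two printk() callers that race never get merged into one record.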
@@ -974,16 +1418,81 @@ out_restore_irqs:
974 1418
975 return printed_len; 1419 return printed_len;
976} 1420}
977EXPORT_SYMBOL(printk); 1421EXPORT_SYMBOL(vprintk_emit);
978EXPORT_SYMBOL(vprintk);
979 1422
980#else 1423asmlinkage int vprintk(const char *fmt, va_list args)
1424{
1425 return vprintk_emit(0, -1, NULL, 0, fmt, args);
1426}
1427EXPORT_SYMBOL(vprintk);
981 1428
982static void call_console_drivers(unsigned start, unsigned end) 1429asmlinkage int printk_emit(int facility, int level,
1430 const char *dict, size_t dictlen,
1431 const char *fmt, ...)
983{ 1432{
1433 va_list args;
1434 int r;
1435
1436 va_start(args, fmt);
1437 r = vprintk_emit(facility, level, dict, dictlen, fmt, args);
1438 va_end(args);
1439
1440 return r;
984} 1441}
1442EXPORT_SYMBOL(printk_emit);
985 1443
1444/**
1445 * printk - print a kernel message
1446 * @fmt: format string
1447 *
1448 * This is printk(). It can be called from any context. We want it to work.
1449 *
1450 * We try to grab the console_lock. If we succeed, it's easy - we log the
1451 * output and call the console drivers. If we fail to get the semaphore, we
1452 * place the output into the log buffer and return. The current holder of
1453 * the console_sem will notice the new output in console_unlock(); and will
1454 * send it to the consoles before releasing the lock.
1455 *
1456 * One effect of this deferred printing is that code which calls printk() and
1457 * then changes console_loglevel may break. This is because console_loglevel
1458 * is inspected when the actual printing occurs.
1459 *
1460 * See also:
1461 * printf(3)
1462 *
1463 * See the vsnprintf() documentation for format string extensions over C99.
1464 */
1465asmlinkage int printk(const char *fmt, ...)
1466{
1467 va_list args;
1468 int r;
1469
1470#ifdef CONFIG_KGDB_KDB
1471 if (unlikely(kdb_trap_printk)) {
1472 va_start(args, fmt);
1473 r = vkdb_printf(fmt, args);
1474 va_end(args);
1475 return r;
1476 }
986#endif 1477#endif
1478 va_start(args, fmt);
1479 r = vprintk_emit(0, -1, NULL, 0, fmt, args);
1480 va_end(args);
1481
1482 return r;
1483}
1484EXPORT_SYMBOL(printk);
1485
1486#else
1487
1488#define LOG_LINE_MAX 0
1489static struct log *log_from_idx(u32 idx) { return NULL; }
1490static u32 log_next(u32 idx) { return 0; }
1491static void call_console_drivers(int level, const char *text, size_t len) {}
1492static size_t msg_print_text(const struct log *msg, bool syslog,
1493 char *buf, size_t size) { return 0; }
1494
1495#endif /* CONFIG_PRINTK */
987 1496
988static int __add_preferred_console(char *name, int idx, char *options, 1497static int __add_preferred_console(char *name, int idx, char *options,
989 char *brl_options) 1498 char *brl_options)
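
All of the front ends added in the hunk above (printk(), vprintk(), printk_emit()) are thin varargs shims over the single vprintk_emit() core, differing only in the facility, level, and dictionary they pass down. A userspace sketch of the same layering, with made-up names, is shown below purely for illustration; it is not the kernel implementation.

#include <stdarg.h>
#include <stdio.h>

static int my_vemit(int facility, int level, const char *fmt, va_list args)
{
        int n = printf("<%d,%d> ", facility, level);

        return n + vprintf(fmt, args);
}

static int my_emit(int facility, int level, const char *fmt, ...)
{
        va_list args;
        int r;

        va_start(args, fmt);
        r = my_vemit(facility, level, fmt, args);
        va_end(args);
        return r;
}

static int my_printk(const char *fmt, ...)
{
        va_list args;
        int r;

        va_start(args, fmt);
        r = my_vemit(0, -1, fmt, args);         /* facility 0, "default" level */
        va_end(args);
        return r;
}

int main(void)
{
        my_printk("hello %s\n", "world");
        my_emit(1, 4, "a level-%d message\n", 4);
        return 0;
}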
@@ -1217,7 +1726,7 @@ int is_console_locked(void)
1217} 1726}
1218 1727
1219/* 1728/*
1220 * Delayed printk facility, for scheduler-internal messages: 1729 * Delayed printk version, for scheduler-internal messages:
1221 */ 1730 */
1222#define PRINTK_BUF_SIZE 512 1731#define PRINTK_BUF_SIZE 512
1223 1732
@@ -1253,6 +1762,10 @@ void wake_up_klogd(void)
1253 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); 1762 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
1254} 1763}
1255 1764
1765/* the next printk record to write to the console */
1766static u64 console_seq;
1767static u32 console_idx;
1768
1256/** 1769/**
1257 * console_unlock - unlock the console system 1770 * console_unlock - unlock the console system
1258 * 1771 *
@@ -1263,15 +1776,16 @@ void wake_up_klogd(void)
1263 * by printk(). If this is the case, console_unlock(); emits 1776 * by printk(). If this is the case, console_unlock(); emits
1264 * the output prior to releasing the lock. 1777 * the output prior to releasing the lock.
1265 * 1778 *
1266 * If there is output waiting for klogd, we wake it up. 1779 * If there is output waiting, we wake /dev/kmsg and syslog() users.
1267 * 1780 *
1268 * console_unlock(); may be called from any context. 1781 * console_unlock(); may be called from any context.
1269 */ 1782 */
1270void console_unlock(void) 1783void console_unlock(void)
1271{ 1784{
1785 static u64 seen_seq;
1272 unsigned long flags; 1786 unsigned long flags;
1273 unsigned _con_start, _log_end; 1787 bool wake_klogd = false;
1274 unsigned wake_klogd = 0, retry = 0; 1788 bool retry;
1275 1789
1276 if (console_suspended) { 1790 if (console_suspended) {
1277 up(&console_sem); 1791 up(&console_sem);
@@ -1281,17 +1795,38 @@ void console_unlock(void)
1281 console_may_schedule = 0; 1795 console_may_schedule = 0;
1282 1796
1283again: 1797again:
1284 for ( ; ; ) { 1798 for (;;) {
1799 struct log *msg;
1800 static char text[LOG_LINE_MAX];
1801 size_t len;
1802 int level;
1803
1285 raw_spin_lock_irqsave(&logbuf_lock, flags); 1804 raw_spin_lock_irqsave(&logbuf_lock, flags);
1286 wake_klogd |= log_start - log_end; 1805 if (seen_seq != log_next_seq) {
1287 if (con_start == log_end) 1806 wake_klogd = true;
1288 break; /* Nothing to print */ 1807 seen_seq = log_next_seq;
1289 _con_start = con_start; 1808 }
1290 _log_end = log_end; 1809
1291 con_start = log_end; /* Flush */ 1810 if (console_seq < log_first_seq) {
1811 /* messages are gone, move to first one */
1812 console_seq = log_first_seq;
1813 console_idx = log_first_idx;
1814 }
1815
1816 if (console_seq == log_next_seq)
1817 break;
1818
1819 msg = log_from_idx(console_idx);
1820 level = msg->level & 7;
1821
1822 len = msg_print_text(msg, false, text, sizeof(text));
1823
1824 console_idx = log_next(console_idx);
1825 console_seq++;
1292 raw_spin_unlock(&logbuf_lock); 1826 raw_spin_unlock(&logbuf_lock);
1827
1293 stop_critical_timings(); /* don't trace print latency */ 1828 stop_critical_timings(); /* don't trace print latency */
1294 call_console_drivers(_con_start, _log_end); 1829 call_console_drivers(level, text, len);
1295 start_critical_timings(); 1830 start_critical_timings();
1296 local_irq_restore(flags); 1831 local_irq_restore(flags);
1297 } 1832 }
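
console_unlock() now drains the record log with a (console_seq, console_idx) cursor, jumping ahead to log_first_seq when the consumer has fallen so far behind that records were overwritten. The userspace sketch below models only that cursor logic over a tiny ring of records; the names mirror the kernel's, but the code is an illustrative model, not the real implementation.

#include <stdio.h>

#define NRECS 8
static const char *ring[NRECS];         /* record text, indexed by seq % NRECS */
static unsigned long log_first_seq;     /* oldest record still present */
static unsigned long log_next_seq;      /* next record to be written */

static void log_store(const char *text)
{
        ring[log_next_seq % NRECS] = text;
        log_next_seq++;
        if (log_next_seq - log_first_seq > NRECS)
                log_first_seq++;        /* the oldest record was overwritten */
}

static void console_drain(unsigned long *console_seq)
{
        if (*console_seq < log_first_seq)
                *console_seq = log_first_seq;   /* messages are gone, skip ahead */
        while (*console_seq != log_next_seq) {
                printf("console: %s\n", ring[*console_seq % NRECS]);
                (*console_seq)++;
        }
}

int main(void)
{
        static char text[12][16];
        unsigned long console_seq = 0;
        int i;

        for (i = 0; i < 12; i++) {
                snprintf(text[i], sizeof(text[i]), "msg %d", i);
                log_store(text[i]);
        }
        console_drain(&console_seq);    /* starts at seq 4, the oldest survivor */
        return 0;
}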
@@ -1312,8 +1847,7 @@ again:
1312 * flush, no worries. 1847 * flush, no worries.
1313 */ 1848 */
1314 raw_spin_lock(&logbuf_lock); 1849 raw_spin_lock(&logbuf_lock);
1315 if (con_start != log_end) 1850 retry = console_seq != log_next_seq;
1316 retry = 1;
1317 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 1851 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1318 1852
1319 if (retry && console_trylock()) 1853 if (retry && console_trylock())
@@ -1549,7 +2083,8 @@ void register_console(struct console *newcon)
1549 * for us. 2083 * for us.
1550 */ 2084 */
1551 raw_spin_lock_irqsave(&logbuf_lock, flags); 2085 raw_spin_lock_irqsave(&logbuf_lock, flags);
1552 con_start = log_start; 2086 console_seq = syslog_seq;
2087 console_idx = syslog_idx;
1553 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2088 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1554 /* 2089 /*
1555 * We're about to replay the log buffer. Only do this to the 2090 * We're about to replay the log buffer. Only do this to the
@@ -1758,6 +2293,9 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1758} 2293}
1759EXPORT_SYMBOL_GPL(kmsg_dump_unregister); 2294EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1760 2295
2296static bool always_kmsg_dump;
2297module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
2298
1761/** 2299/**
1762 * kmsg_dump - dump kernel log to kernel message dumpers. 2300 * kmsg_dump - dump kernel log to kernel message dumpers.
1763 * @reason: the reason (oops, panic etc) for dumping 2301 * @reason: the reason (oops, panic etc) for dumping
@@ -1767,8 +2305,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1767 */ 2305 */
1768void kmsg_dump(enum kmsg_dump_reason reason) 2306void kmsg_dump(enum kmsg_dump_reason reason)
1769{ 2307{
1770 unsigned long end; 2308 u64 idx;
1771 unsigned chars;
1772 struct kmsg_dumper *dumper; 2309 struct kmsg_dumper *dumper;
1773 const char *s1, *s2; 2310 const char *s1, *s2;
1774 unsigned long l1, l2; 2311 unsigned long l1, l2;
@@ -1780,24 +2317,27 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1780 /* Theoretically, the log could move on after we do this, but 2317 /* Theoretically, the log could move on after we do this, but
1781 there's not a lot we can do about that. The new messages 2318 there's not a lot we can do about that. The new messages
1782 will overwrite the start of what we dump. */ 2319 will overwrite the start of what we dump. */
2320
1783 raw_spin_lock_irqsave(&logbuf_lock, flags); 2321 raw_spin_lock_irqsave(&logbuf_lock, flags);
1784 end = log_end & LOG_BUF_MASK; 2322 if (syslog_seq < log_first_seq)
1785 chars = logged_chars; 2323 idx = syslog_idx;
1786 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2324 else
2325 idx = log_first_idx;
1787 2326
1788 if (chars > end) { 2327 if (idx > log_next_idx) {
1789 s1 = log_buf + log_buf_len - chars + end; 2328 s1 = log_buf;
1790 l1 = chars - end; 2329 l1 = log_next_idx;
1791 2330
1792 s2 = log_buf; 2331 s2 = log_buf + idx;
1793 l2 = end; 2332 l2 = log_buf_len - idx;
1794 } else { 2333 } else {
1795 s1 = ""; 2334 s1 = "";
1796 l1 = 0; 2335 l1 = 0;
1797 2336
1798 s2 = log_buf + end - chars; 2337 s2 = log_buf + idx;
1799 l2 = chars; 2338 l2 = log_next_idx - idx;
1800 } 2339 }
2340 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1801 2341
1802 rcu_read_lock(); 2342 rcu_read_lock();
1803 list_for_each_entry_rcu(dumper, &dump_list, list) 2343 list_for_each_entry_rcu(dumper, &dump_list, list)
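
With the record buffer, kmsg_dump() still hands each dumper at most two contiguous byte ranges (s1/l1 and s2/l2), because the live data may wrap around the end of the circular log_buf. The helper below is a generic userspace sketch of that two-segment split, assuming a simple byte ring described by 'first' and 'next' offsets; it illustrates the shape of the computation and does not reproduce the kernel's exact index handling.

#include <stdio.h>

static void ring_segments(const char *buf, size_t buflen,
                          size_t first, size_t next,
                          const char **s1, size_t *l1,
                          const char **s2, size_t *l2)
{
        if (first > next) {             /* data wraps around the end of the buffer */
                *s1 = buf + first;
                *l1 = buflen - first;
                *s2 = buf;
                *l2 = next;
        } else {                        /* one contiguous run */
                *s1 = "";
                *l1 = 0;
                *s2 = buf + first;
                *l2 = next - first;
        }
}

int main(void)
{
        char ring[8] = { 'F', 'G', 'H', '.', 'A', 'B', 'C', 'D' };
        const char *s1, *s2;
        size_t l1, l2;

        ring_segments(ring, sizeof(ring), 4, 3, &s1, &l1, &s2, &l2);
        printf("%.*s%.*s\n", (int)l1, s1, (int)l2, s2);         /* prints ABCDFGH */
        return 0;
}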
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a86f1741cc27..95cba41ce1e9 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -51,6 +51,34 @@
51 51
52#include "rcu.h" 52#include "rcu.h"
53 53
54#ifdef CONFIG_PREEMPT_RCU
55
56/*
57 * Check for a task exiting while in a preemptible-RCU read-side
58 * critical section, clean up if so. No need to issue warnings,
59 * as debug_check_no_locks_held() already does this if lockdep
60 * is enabled.
61 */
62void exit_rcu(void)
63{
64 struct task_struct *t = current;
65
66 if (likely(list_empty(&current->rcu_node_entry)))
67 return;
68 t->rcu_read_lock_nesting = 1;
69 barrier();
70 t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
71 __rcu_read_unlock();
72}
73
74#else /* #ifdef CONFIG_PREEMPT_RCU */
75
76void exit_rcu(void)
77{
78}
79
80#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
81
54#ifdef CONFIG_DEBUG_LOCK_ALLOC 82#ifdef CONFIG_DEBUG_LOCK_ALLOC
55static struct lock_class_key rcu_lock_key; 83static struct lock_class_key rcu_lock_key;
56struct lockdep_map rcu_lock_map = 84struct lockdep_map rcu_lock_map =
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 22ecea0dfb62..fc31a2d65100 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -851,22 +851,6 @@ int rcu_preempt_needs_cpu(void)
851 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; 851 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
852} 852}
853 853
854/*
855 * Check for a task exiting while in a preemptible -RCU read-side
856 * critical section, clean up if so. No need to issue warnings,
857 * as debug_check_no_locks_held() already does this if lockdep
858 * is enabled.
859 */
860void exit_rcu(void)
861{
862 struct task_struct *t = current;
863
864 if (t->rcu_read_lock_nesting == 0)
865 return;
866 t->rcu_read_lock_nesting = 1;
867 __rcu_read_unlock();
868}
869
870#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 854#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
871 855
872#ifdef CONFIG_RCU_TRACE 856#ifdef CONFIG_RCU_TRACE
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index a89b381a8c6e..e66b34ab7555 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -64,6 +64,7 @@ static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ 64static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff; /* Hold time within burst (us). */ 65static int fqs_holdoff; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 66static int fqs_stutter = 3; /* Wait time between bursts (s). */
67static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */
67static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ 68static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
68static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ 69static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */
69static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ 70static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
@@ -96,6 +97,8 @@ module_param(fqs_holdoff, int, 0444);
96MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 97MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
97module_param(fqs_stutter, int, 0444); 98module_param(fqs_stutter, int, 0444);
98MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 99MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
100module_param(n_barrier_cbs, int, 0444);
101MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing");
99module_param(onoff_interval, int, 0444); 102module_param(onoff_interval, int, 0444);
100MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); 103MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
101module_param(onoff_holdoff, int, 0444); 104module_param(onoff_holdoff, int, 0444);
@@ -139,6 +142,8 @@ static struct task_struct *shutdown_task;
139static struct task_struct *onoff_task; 142static struct task_struct *onoff_task;
140#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 143#endif /* #ifdef CONFIG_HOTPLUG_CPU */
141static struct task_struct *stall_task; 144static struct task_struct *stall_task;
145static struct task_struct **barrier_cbs_tasks;
146static struct task_struct *barrier_task;
142 147
143#define RCU_TORTURE_PIPE_LEN 10 148#define RCU_TORTURE_PIPE_LEN 10
144 149
@@ -164,6 +169,7 @@ static atomic_t n_rcu_torture_alloc_fail;
164static atomic_t n_rcu_torture_free; 169static atomic_t n_rcu_torture_free;
165static atomic_t n_rcu_torture_mberror; 170static atomic_t n_rcu_torture_mberror;
166static atomic_t n_rcu_torture_error; 171static atomic_t n_rcu_torture_error;
172static long n_rcu_torture_barrier_error;
167static long n_rcu_torture_boost_ktrerror; 173static long n_rcu_torture_boost_ktrerror;
168static long n_rcu_torture_boost_rterror; 174static long n_rcu_torture_boost_rterror;
169static long n_rcu_torture_boost_failure; 175static long n_rcu_torture_boost_failure;
@@ -173,6 +179,8 @@ static long n_offline_attempts;
173static long n_offline_successes; 179static long n_offline_successes;
174static long n_online_attempts; 180static long n_online_attempts;
175static long n_online_successes; 181static long n_online_successes;
182static long n_barrier_attempts;
183static long n_barrier_successes;
176static struct list_head rcu_torture_removed; 184static struct list_head rcu_torture_removed;
177static cpumask_var_t shuffle_tmp_mask; 185static cpumask_var_t shuffle_tmp_mask;
178 186
@@ -197,6 +205,10 @@ static unsigned long shutdown_time; /* jiffies to system shutdown. */
197static unsigned long boost_starttime; /* jiffies of next boost test start. */ 205static unsigned long boost_starttime; /* jiffies of next boost test start. */
198DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 206DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
199 /* and boost task create/destroy. */ 207 /* and boost task create/destroy. */
208static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */
209static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
210static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
211static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
200 212
201/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ 213/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
202 214
@@ -327,6 +339,7 @@ struct rcu_torture_ops {
327 int (*completed)(void); 339 int (*completed)(void);
328 void (*deferred_free)(struct rcu_torture *p); 340 void (*deferred_free)(struct rcu_torture *p);
329 void (*sync)(void); 341 void (*sync)(void);
342 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
330 void (*cb_barrier)(void); 343 void (*cb_barrier)(void);
331 void (*fqs)(void); 344 void (*fqs)(void);
332 int (*stats)(char *page); 345 int (*stats)(char *page);
@@ -417,6 +430,7 @@ static struct rcu_torture_ops rcu_ops = {
417 .completed = rcu_torture_completed, 430 .completed = rcu_torture_completed,
418 .deferred_free = rcu_torture_deferred_free, 431 .deferred_free = rcu_torture_deferred_free,
419 .sync = synchronize_rcu, 432 .sync = synchronize_rcu,
433 .call = call_rcu,
420 .cb_barrier = rcu_barrier, 434 .cb_barrier = rcu_barrier,
421 .fqs = rcu_force_quiescent_state, 435 .fqs = rcu_force_quiescent_state,
422 .stats = NULL, 436 .stats = NULL,
@@ -460,6 +474,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
460 .completed = rcu_torture_completed, 474 .completed = rcu_torture_completed,
461 .deferred_free = rcu_sync_torture_deferred_free, 475 .deferred_free = rcu_sync_torture_deferred_free,
462 .sync = synchronize_rcu, 476 .sync = synchronize_rcu,
477 .call = NULL,
463 .cb_barrier = NULL, 478 .cb_barrier = NULL,
464 .fqs = rcu_force_quiescent_state, 479 .fqs = rcu_force_quiescent_state,
465 .stats = NULL, 480 .stats = NULL,
@@ -477,6 +492,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
477 .completed = rcu_no_completed, 492 .completed = rcu_no_completed,
478 .deferred_free = rcu_sync_torture_deferred_free, 493 .deferred_free = rcu_sync_torture_deferred_free,
479 .sync = synchronize_rcu_expedited, 494 .sync = synchronize_rcu_expedited,
495 .call = NULL,
480 .cb_barrier = NULL, 496 .cb_barrier = NULL,
481 .fqs = rcu_force_quiescent_state, 497 .fqs = rcu_force_quiescent_state,
482 .stats = NULL, 498 .stats = NULL,
@@ -519,6 +535,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
519 .completed = rcu_bh_torture_completed, 535 .completed = rcu_bh_torture_completed,
520 .deferred_free = rcu_bh_torture_deferred_free, 536 .deferred_free = rcu_bh_torture_deferred_free,
521 .sync = synchronize_rcu_bh, 537 .sync = synchronize_rcu_bh,
538 .call = call_rcu_bh,
522 .cb_barrier = rcu_barrier_bh, 539 .cb_barrier = rcu_barrier_bh,
523 .fqs = rcu_bh_force_quiescent_state, 540 .fqs = rcu_bh_force_quiescent_state,
524 .stats = NULL, 541 .stats = NULL,
@@ -535,6 +552,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
535 .completed = rcu_bh_torture_completed, 552 .completed = rcu_bh_torture_completed,
536 .deferred_free = rcu_sync_torture_deferred_free, 553 .deferred_free = rcu_sync_torture_deferred_free,
537 .sync = synchronize_rcu_bh, 554 .sync = synchronize_rcu_bh,
555 .call = NULL,
538 .cb_barrier = NULL, 556 .cb_barrier = NULL,
539 .fqs = rcu_bh_force_quiescent_state, 557 .fqs = rcu_bh_force_quiescent_state,
540 .stats = NULL, 558 .stats = NULL,
@@ -551,6 +569,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = {
551 .completed = rcu_bh_torture_completed, 569 .completed = rcu_bh_torture_completed,
552 .deferred_free = rcu_sync_torture_deferred_free, 570 .deferred_free = rcu_sync_torture_deferred_free,
553 .sync = synchronize_rcu_bh_expedited, 571 .sync = synchronize_rcu_bh_expedited,
572 .call = NULL,
554 .cb_barrier = NULL, 573 .cb_barrier = NULL,
555 .fqs = rcu_bh_force_quiescent_state, 574 .fqs = rcu_bh_force_quiescent_state,
556 .stats = NULL, 575 .stats = NULL,
@@ -606,6 +625,11 @@ static int srcu_torture_completed(void)
606 return srcu_batches_completed(&srcu_ctl); 625 return srcu_batches_completed(&srcu_ctl);
607} 626}
608 627
628static void srcu_torture_deferred_free(struct rcu_torture *rp)
629{
630 call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb);
631}
632
609static void srcu_torture_synchronize(void) 633static void srcu_torture_synchronize(void)
610{ 634{
611 synchronize_srcu(&srcu_ctl); 635 synchronize_srcu(&srcu_ctl);
@@ -620,7 +644,7 @@ static int srcu_torture_stats(char *page)
620 cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", 644 cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):",
621 torture_type, TORTURE_FLAG, idx); 645 torture_type, TORTURE_FLAG, idx);
622 for_each_possible_cpu(cpu) { 646 for_each_possible_cpu(cpu) {
623 cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, 647 cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu,
624 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], 648 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
625 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); 649 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
626 } 650 }
@@ -635,13 +659,29 @@ static struct rcu_torture_ops srcu_ops = {
635 .read_delay = srcu_read_delay, 659 .read_delay = srcu_read_delay,
636 .readunlock = srcu_torture_read_unlock, 660 .readunlock = srcu_torture_read_unlock,
637 .completed = srcu_torture_completed, 661 .completed = srcu_torture_completed,
638 .deferred_free = rcu_sync_torture_deferred_free, 662 .deferred_free = srcu_torture_deferred_free,
639 .sync = srcu_torture_synchronize, 663 .sync = srcu_torture_synchronize,
664 .call = NULL,
640 .cb_barrier = NULL, 665 .cb_barrier = NULL,
641 .stats = srcu_torture_stats, 666 .stats = srcu_torture_stats,
642 .name = "srcu" 667 .name = "srcu"
643}; 668};
644 669
670static struct rcu_torture_ops srcu_sync_ops = {
671 .init = srcu_torture_init,
672 .cleanup = srcu_torture_cleanup,
673 .readlock = srcu_torture_read_lock,
674 .read_delay = srcu_read_delay,
675 .readunlock = srcu_torture_read_unlock,
676 .completed = srcu_torture_completed,
677 .deferred_free = rcu_sync_torture_deferred_free,
678 .sync = srcu_torture_synchronize,
679 .call = NULL,
680 .cb_barrier = NULL,
681 .stats = srcu_torture_stats,
682 .name = "srcu_sync"
683};
684
645static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) 685static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
646{ 686{
647 return srcu_read_lock_raw(&srcu_ctl); 687 return srcu_read_lock_raw(&srcu_ctl);
@@ -659,13 +699,29 @@ static struct rcu_torture_ops srcu_raw_ops = {
659 .read_delay = srcu_read_delay, 699 .read_delay = srcu_read_delay,
660 .readunlock = srcu_torture_read_unlock_raw, 700 .readunlock = srcu_torture_read_unlock_raw,
661 .completed = srcu_torture_completed, 701 .completed = srcu_torture_completed,
662 .deferred_free = rcu_sync_torture_deferred_free, 702 .deferred_free = srcu_torture_deferred_free,
663 .sync = srcu_torture_synchronize, 703 .sync = srcu_torture_synchronize,
704 .call = NULL,
664 .cb_barrier = NULL, 705 .cb_barrier = NULL,
665 .stats = srcu_torture_stats, 706 .stats = srcu_torture_stats,
666 .name = "srcu_raw" 707 .name = "srcu_raw"
667}; 708};
668 709
710static struct rcu_torture_ops srcu_raw_sync_ops = {
711 .init = srcu_torture_init,
712 .cleanup = srcu_torture_cleanup,
713 .readlock = srcu_torture_read_lock_raw,
714 .read_delay = srcu_read_delay,
715 .readunlock = srcu_torture_read_unlock_raw,
716 .completed = srcu_torture_completed,
717 .deferred_free = rcu_sync_torture_deferred_free,
718 .sync = srcu_torture_synchronize,
719 .call = NULL,
720 .cb_barrier = NULL,
721 .stats = srcu_torture_stats,
722 .name = "srcu_raw_sync"
723};
724
669static void srcu_torture_synchronize_expedited(void) 725static void srcu_torture_synchronize_expedited(void)
670{ 726{
671 synchronize_srcu_expedited(&srcu_ctl); 727 synchronize_srcu_expedited(&srcu_ctl);
@@ -680,6 +736,7 @@ static struct rcu_torture_ops srcu_expedited_ops = {
680 .completed = srcu_torture_completed, 736 .completed = srcu_torture_completed,
681 .deferred_free = rcu_sync_torture_deferred_free, 737 .deferred_free = rcu_sync_torture_deferred_free,
682 .sync = srcu_torture_synchronize_expedited, 738 .sync = srcu_torture_synchronize_expedited,
739 .call = NULL,
683 .cb_barrier = NULL, 740 .cb_barrier = NULL,
684 .stats = srcu_torture_stats, 741 .stats = srcu_torture_stats,
685 .name = "srcu_expedited" 742 .name = "srcu_expedited"
@@ -1129,7 +1186,8 @@ rcu_torture_printk(char *page)
1129 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " 1186 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1130 "rtmbe: %d rtbke: %ld rtbre: %ld " 1187 "rtmbe: %d rtbke: %ld rtbre: %ld "
1131 "rtbf: %ld rtb: %ld nt: %ld " 1188 "rtbf: %ld rtb: %ld nt: %ld "
1132 "onoff: %ld/%ld:%ld/%ld", 1189 "onoff: %ld/%ld:%ld/%ld "
1190 "barrier: %ld/%ld:%ld",
1133 rcu_torture_current, 1191 rcu_torture_current,
1134 rcu_torture_current_version, 1192 rcu_torture_current_version,
1135 list_empty(&rcu_torture_freelist), 1193 list_empty(&rcu_torture_freelist),
@@ -1145,14 +1203,17 @@ rcu_torture_printk(char *page)
1145 n_online_successes, 1203 n_online_successes,
1146 n_online_attempts, 1204 n_online_attempts,
1147 n_offline_successes, 1205 n_offline_successes,
1148 n_offline_attempts); 1206 n_offline_attempts,
1207 n_barrier_successes,
1208 n_barrier_attempts,
1209 n_rcu_torture_barrier_error);
1210 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
1149 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1211 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1212 n_rcu_torture_barrier_error != 0 ||
1150 n_rcu_torture_boost_ktrerror != 0 || 1213 n_rcu_torture_boost_ktrerror != 0 ||
1151 n_rcu_torture_boost_rterror != 0 || 1214 n_rcu_torture_boost_rterror != 0 ||
1152 n_rcu_torture_boost_failure != 0) 1215 n_rcu_torture_boost_failure != 0 ||
1153 cnt += sprintf(&page[cnt], " !!!"); 1216 i > 1) {
1154 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
1155 if (i > 1) {
1156 cnt += sprintf(&page[cnt], "!!! "); 1217 cnt += sprintf(&page[cnt], "!!! ");
1157 atomic_inc(&n_rcu_torture_error); 1218 atomic_inc(&n_rcu_torture_error);
1158 WARN_ON_ONCE(1); 1219 WARN_ON_ONCE(1);
@@ -1337,6 +1398,7 @@ static void rcutorture_booster_cleanup(int cpu)
1337 1398
1338 /* This must be outside of the mutex, otherwise deadlock! */ 1399 /* This must be outside of the mutex, otherwise deadlock! */
1339 kthread_stop(t); 1400 kthread_stop(t);
1401 boost_tasks[cpu] = NULL;
1340} 1402}
1341 1403
1342static int rcutorture_booster_init(int cpu) 1404static int rcutorture_booster_init(int cpu)
@@ -1484,13 +1546,15 @@ static void rcu_torture_onoff_cleanup(void)
1484 return; 1546 return;
1485 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); 1547 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
1486 kthread_stop(onoff_task); 1548 kthread_stop(onoff_task);
1549 onoff_task = NULL;
1487} 1550}
1488 1551
1489#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1552#else /* #ifdef CONFIG_HOTPLUG_CPU */
1490 1553
1491static void 1554static int
1492rcu_torture_onoff_init(void) 1555rcu_torture_onoff_init(void)
1493{ 1556{
1557 return 0;
1494} 1558}
1495 1559
1496static void rcu_torture_onoff_cleanup(void) 1560static void rcu_torture_onoff_cleanup(void)
@@ -1554,6 +1618,152 @@ static void rcu_torture_stall_cleanup(void)
1554 return; 1618 return;
1555 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); 1619 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task.");
1556 kthread_stop(stall_task); 1620 kthread_stop(stall_task);
1621 stall_task = NULL;
1622}
1623
1624/* Callback function for RCU barrier testing. */
1625void rcu_torture_barrier_cbf(struct rcu_head *rcu)
1626{
1627 atomic_inc(&barrier_cbs_invoked);
1628}
1629
1630/* kthread function to register callbacks used to test RCU barriers. */
1631static int rcu_torture_barrier_cbs(void *arg)
1632{
1633 long myid = (long)arg;
1634 struct rcu_head rcu;
1635
1636 init_rcu_head_on_stack(&rcu);
1637 VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started");
1638 set_user_nice(current, 19);
1639 do {
1640 wait_event(barrier_cbs_wq[myid],
1641 atomic_read(&barrier_cbs_count) == n_barrier_cbs ||
1642 kthread_should_stop() ||
1643 fullstop != FULLSTOP_DONTSTOP);
1644 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1645 break;
1646 cur_ops->call(&rcu, rcu_torture_barrier_cbf);
1647 if (atomic_dec_and_test(&barrier_cbs_count))
1648 wake_up(&barrier_wq);
1649 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1650 VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping");
1651 rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
1652 while (!kthread_should_stop())
1653 schedule_timeout_interruptible(1);
1654 cur_ops->cb_barrier();
1655 destroy_rcu_head_on_stack(&rcu);
1656 return 0;
1657}
1658
1659/* kthread function to drive and coordinate RCU barrier testing. */
1660static int rcu_torture_barrier(void *arg)
1661{
1662 int i;
1663
1664 VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting");
1665 do {
1666 atomic_set(&barrier_cbs_invoked, 0);
1667 atomic_set(&barrier_cbs_count, n_barrier_cbs);
1668 /* wake_up() path contains the required barriers. */
1669 for (i = 0; i < n_barrier_cbs; i++)
1670 wake_up(&barrier_cbs_wq[i]);
1671 wait_event(barrier_wq,
1672 atomic_read(&barrier_cbs_count) == 0 ||
1673 kthread_should_stop() ||
1674 fullstop != FULLSTOP_DONTSTOP);
1675 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1676 break;
1677 n_barrier_attempts++;
1678 cur_ops->cb_barrier();
1679 if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
1680 n_rcu_torture_barrier_error++;
1681 WARN_ON_ONCE(1);
1682 }
1683 n_barrier_successes++;
1684 schedule_timeout_interruptible(HZ / 10);
1685 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1686 VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping");
1687 rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
1687 rcutorture_shutdown_absorb("rcu_torture_barrier");
1688 while (!kthread_should_stop())
1689 schedule_timeout_interruptible(1);
1690 return 0;
1691}
1692
1693/* Initialize RCU barrier testing. */
1694static int rcu_torture_barrier_init(void)
1695{
1696 int i;
1697 int ret;
1698
1699 if (n_barrier_cbs == 0)
1700 return 0;
1701 if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
1702 printk(KERN_ALERT "%s" TORTURE_FLAG
1703 " Call or barrier ops missing for %s,\n",
1704 torture_type, cur_ops->name);
1705 printk(KERN_ALERT "%s" TORTURE_FLAG
1706 " RCU barrier testing omitted from run.\n",
1707 torture_type);
1708 return 0;
1709 }
1710 atomic_set(&barrier_cbs_count, 0);
1711 atomic_set(&barrier_cbs_invoked, 0);
1712 barrier_cbs_tasks =
1713 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]),
1714 GFP_KERNEL);
1715 barrier_cbs_wq =
1716 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
1717 GFP_KERNEL);
1718 if (barrier_cbs_tasks == NULL || barrier_cbs_wq == NULL)
1719 return -ENOMEM;
1720 for (i = 0; i < n_barrier_cbs; i++) {
1721 init_waitqueue_head(&barrier_cbs_wq[i]);
1722 barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs,
1723 (void *)(long)i,
1724 "rcu_torture_barrier_cbs");
1725 if (IS_ERR(barrier_cbs_tasks[i])) {
1726 ret = PTR_ERR(barrier_cbs_tasks[i]);
1727 VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs");
1728 barrier_cbs_tasks[i] = NULL;
1729 return ret;
1730 }
1731 }
1732 barrier_task = kthread_run(rcu_torture_barrier, NULL,
1733 "rcu_torture_barrier");
1734 if (IS_ERR(barrier_task)) {
1735 ret = PTR_ERR(barrier_task);
1736 VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier");
1737 barrier_task = NULL;
1738 }
1739 return 0;
1740}
1741
1742/* Clean up after RCU barrier testing. */
1743static void rcu_torture_barrier_cleanup(void)
1744{
1745 int i;
1746
1747 if (barrier_task != NULL) {
1748 VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task");
1749 kthread_stop(barrier_task);
1750 barrier_task = NULL;
1751 }
1752 if (barrier_cbs_tasks != NULL) {
1753 for (i = 0; i < n_barrier_cbs; i++) {
1754 if (barrier_cbs_tasks[i] != NULL) {
1755 VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task");
1756 kthread_stop(barrier_cbs_tasks[i]);
1757 barrier_cbs_tasks[i] = NULL;
1758 }
1759 }
1760 kfree(barrier_cbs_tasks);
1761 barrier_cbs_tasks = NULL;
1762 }
1763 if (barrier_cbs_wq != NULL) {
1764 kfree(barrier_cbs_wq);
1765 barrier_cbs_wq = NULL;
1766 }
1557} 1767}
1558 1768
1559static int rcutorture_cpu_notify(struct notifier_block *self, 1769static int rcutorture_cpu_notify(struct notifier_block *self,
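
The new barrier test follows a simple pattern: the driver arms a counter, kicks the n_barrier_cbs helper kthreads, each helper posts one callback, and the driver then runs the barrier and checks that every callback was invoked. A POSIX-threads model of that coordination, with a plain counter standing in for the RCU callbacks, is sketched below; it is illustrative only and uses none of the kernel primitives.

#include <pthread.h>
#include <stdio.h>

#define NCBS 4

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int cbs_pending;                 /* callbacks not yet posted */
static int cbs_invoked;                 /* callbacks actually seen */

static void *barrier_cb_thread(void *arg)
{
        pthread_mutex_lock(&lock);
        cbs_invoked++;                  /* stands in for cur_ops->call() + callback */
        if (--cbs_pending == 0)
                pthread_cond_signal(&cond);     /* like wake_up(&barrier_wq) */
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t tids[NCBS];
        int i;

        cbs_pending = NCBS;
        cbs_invoked = 0;
        for (i = 0; i < NCBS; i++)
                pthread_create(&tids[i], NULL, barrier_cb_thread, NULL);

        pthread_mutex_lock(&lock);
        while (cbs_pending > 0)                 /* like wait_event(barrier_wq, ...) */
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);

        for (i = 0; i < NCBS; i++)
                pthread_join(tids[i], NULL);

        printf("barrier %s: %d/%d callbacks invoked\n",
               cbs_invoked == NCBS ? "ok" : "FAILED", cbs_invoked, NCBS);
        return 0;
}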
@@ -1598,6 +1808,7 @@ rcu_torture_cleanup(void)
1598 fullstop = FULLSTOP_RMMOD; 1808 fullstop = FULLSTOP_RMMOD;
1599 mutex_unlock(&fullstop_mutex); 1809 mutex_unlock(&fullstop_mutex);
1600 unregister_reboot_notifier(&rcutorture_shutdown_nb); 1810 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1811 rcu_torture_barrier_cleanup();
1601 rcu_torture_stall_cleanup(); 1812 rcu_torture_stall_cleanup();
1602 if (stutter_task) { 1813 if (stutter_task) {
1603 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 1814 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
@@ -1665,6 +1876,7 @@ rcu_torture_cleanup(void)
1665 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); 1876 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
1666 kthread_stop(shutdown_task); 1877 kthread_stop(shutdown_task);
1667 } 1878 }
1879 shutdown_task = NULL;
1668 rcu_torture_onoff_cleanup(); 1880 rcu_torture_onoff_cleanup();
1669 1881
1670 /* Wait for all RCU callbacks to fire. */ 1882 /* Wait for all RCU callbacks to fire. */
@@ -1676,7 +1888,7 @@ rcu_torture_cleanup(void)
1676 1888
1677 if (cur_ops->cleanup) 1889 if (cur_ops->cleanup)
1678 cur_ops->cleanup(); 1890 cur_ops->cleanup();
1679 if (atomic_read(&n_rcu_torture_error)) 1891 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
1680 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1892 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1681 else if (n_online_successes != n_online_attempts || 1893 else if (n_online_successes != n_online_attempts ||
1682 n_offline_successes != n_offline_attempts) 1894 n_offline_successes != n_offline_attempts)
@@ -1692,10 +1904,12 @@ rcu_torture_init(void)
1692 int i; 1904 int i;
1693 int cpu; 1905 int cpu;
1694 int firsterr = 0; 1906 int firsterr = 0;
1907 int retval;
1695 static struct rcu_torture_ops *torture_ops[] = 1908 static struct rcu_torture_ops *torture_ops[] =
1696 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1909 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1697 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1910 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1698 &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops, 1911 &srcu_ops, &srcu_sync_ops, &srcu_raw_ops,
1912 &srcu_raw_sync_ops, &srcu_expedited_ops,
1699 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1913 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1700 1914
1701 mutex_lock(&fullstop_mutex); 1915 mutex_lock(&fullstop_mutex);
@@ -1749,6 +1963,7 @@ rcu_torture_init(void)
1749 atomic_set(&n_rcu_torture_free, 0); 1963 atomic_set(&n_rcu_torture_free, 0);
1750 atomic_set(&n_rcu_torture_mberror, 0); 1964 atomic_set(&n_rcu_torture_mberror, 0);
1751 atomic_set(&n_rcu_torture_error, 0); 1965 atomic_set(&n_rcu_torture_error, 0);
1966 n_rcu_torture_barrier_error = 0;
1752 n_rcu_torture_boost_ktrerror = 0; 1967 n_rcu_torture_boost_ktrerror = 0;
1753 n_rcu_torture_boost_rterror = 0; 1968 n_rcu_torture_boost_rterror = 0;
1754 n_rcu_torture_boost_failure = 0; 1969 n_rcu_torture_boost_failure = 0;
@@ -1872,7 +2087,6 @@ rcu_torture_init(void)
1872 test_boost_duration = 2; 2087 test_boost_duration = 2;
1873 if ((test_boost == 1 && cur_ops->can_boost) || 2088 if ((test_boost == 1 && cur_ops->can_boost) ||
1874 test_boost == 2) { 2089 test_boost == 2) {
1875 int retval;
1876 2090
1877 boost_starttime = jiffies + test_boost_interval * HZ; 2091 boost_starttime = jiffies + test_boost_interval * HZ;
1878 register_cpu_notifier(&rcutorture_cpu_nb); 2092 register_cpu_notifier(&rcutorture_cpu_nb);
@@ -1897,9 +2111,22 @@ rcu_torture_init(void)
1897 goto unwind; 2111 goto unwind;
1898 } 2112 }
1899 } 2113 }
1900 rcu_torture_onoff_init(); 2114 i = rcu_torture_onoff_init();
2115 if (i != 0) {
2116 firsterr = i;
2117 goto unwind;
2118 }
1901 register_reboot_notifier(&rcutorture_shutdown_nb); 2119 register_reboot_notifier(&rcutorture_shutdown_nb);
1902 rcu_torture_stall_init(); 2120 i = rcu_torture_stall_init();
2121 if (i != 0) {
2122 firsterr = i;
2123 goto unwind;
2124 }
2125 retval = rcu_torture_barrier_init();
2126 if (retval != 0) {
2127 firsterr = retval;
2128 goto unwind;
2129 }
1903 rcutorture_record_test_transition(); 2130 rcutorture_record_test_transition();
1904 mutex_unlock(&fullstop_mutex); 2131 mutex_unlock(&fullstop_mutex);
1905 return 0; 2132 return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d0c5baf1ab18..0da7b88d92d0 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -75,6 +75,8 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
75 .gpnum = -300, \ 75 .gpnum = -300, \
76 .completed = -300, \ 76 .completed = -300, \
77 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ 77 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
78 .orphan_nxttail = &structname##_state.orphan_nxtlist, \
79 .orphan_donetail = &structname##_state.orphan_donelist, \
78 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ 80 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
79 .n_force_qs = 0, \ 81 .n_force_qs = 0, \
80 .n_force_qs_ngp = 0, \ 82 .n_force_qs_ngp = 0, \
@@ -145,6 +147,13 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
145unsigned long rcutorture_testseq; 147unsigned long rcutorture_testseq;
146unsigned long rcutorture_vernum; 148unsigned long rcutorture_vernum;
147 149
150/* State information for rcu_barrier() and friends. */
151
152static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
153static atomic_t rcu_barrier_cpu_count;
154static DEFINE_MUTEX(rcu_barrier_mutex);
155static struct completion rcu_barrier_completion;
156
148/* 157/*
149 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 158 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
150 * permit this function to be invoked without holding the root rcu_node 159 * permit this function to be invoked without holding the root rcu_node
@@ -192,7 +201,6 @@ void rcu_note_context_switch(int cpu)
192{ 201{
193 trace_rcu_utilization("Start context switch"); 202 trace_rcu_utilization("Start context switch");
194 rcu_sched_qs(cpu); 203 rcu_sched_qs(cpu);
195 rcu_preempt_note_context_switch(cpu);
196 trace_rcu_utilization("End context switch"); 204 trace_rcu_utilization("End context switch");
197} 205}
198EXPORT_SYMBOL_GPL(rcu_note_context_switch); 206EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -1311,95 +1319,133 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1311#ifdef CONFIG_HOTPLUG_CPU 1319#ifdef CONFIG_HOTPLUG_CPU
1312 1320
1313/* 1321/*
1314 * Move a dying CPU's RCU callbacks to online CPU's callback list. 1322 * Send the specified CPU's RCU callbacks to the orphanage. The
1315 * Also record a quiescent state for this CPU for the current grace period. 1323 * specified CPU must be offline, and the caller must hold the
1316 * Synchronization and interrupt disabling are not required because 1324 * ->onofflock.
1317 * this function executes in stop_machine() context. Therefore, cleanup
1318 * operations that might block must be done later from the CPU_DEAD
1319 * notifier.
1320 *
1321 * Note that the outgoing CPU's bit has already been cleared in the
1322 * cpu_online_mask. This allows us to randomly pick a callback
1323 * destination from the bits set in that mask.
1324 */ 1325 */
1325static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 1326static void
1327rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1328 struct rcu_node *rnp, struct rcu_data *rdp)
1326{ 1329{
1327 int i; 1330 int i;
1328 unsigned long mask;
1329 int receive_cpu = cpumask_any(cpu_online_mask);
1330 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1331 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
1332 RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */
1333 1331
1334 /* First, adjust the counts. */ 1332 /*
1333 * Orphan the callbacks. First adjust the counts. This is safe
1334 * because ->onofflock excludes _rcu_barrier()'s adoption of
1335 * the callbacks, thus no memory barrier is required.
1336 */
1335 if (rdp->nxtlist != NULL) { 1337 if (rdp->nxtlist != NULL) {
1336 receive_rdp->qlen_lazy += rdp->qlen_lazy; 1338 rsp->qlen_lazy += rdp->qlen_lazy;
1337 receive_rdp->qlen += rdp->qlen; 1339 rsp->qlen += rdp->qlen;
1340 rdp->n_cbs_orphaned += rdp->qlen;
1338 rdp->qlen_lazy = 0; 1341 rdp->qlen_lazy = 0;
1339 rdp->qlen = 0; 1342 rdp->qlen = 0;
1340 } 1343 }
1341 1344
1342 /* 1345 /*
1343 * Next, move ready-to-invoke callbacks to be invoked on some 1346 * Next, move those callbacks still needing a grace period to
1344 * other CPU. These will not be required to pass through another 1347 * the orphanage, where some other CPU will pick them up.
1345 * grace period: They are done, regardless of CPU. 1348 * Some of the callbacks might have gone partway through a grace
1349 * period, but that is too bad. They get to start over because we
1350 * cannot assume that grace periods are synchronized across CPUs.
1351 * We don't bother updating the ->nxttail[] array yet, instead
1352 * we just reset the whole thing later on.
1346 */ 1353 */
1347 if (rdp->nxtlist != NULL && 1354 if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) {
1348 rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) { 1355 *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL];
1349 struct rcu_head *oldhead; 1356 rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL];
1350 struct rcu_head **oldtail; 1357 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
1351 struct rcu_head **newtail;
1352
1353 oldhead = rdp->nxtlist;
1354 oldtail = receive_rdp->nxttail[RCU_DONE_TAIL];
1355 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1356 *rdp->nxttail[RCU_DONE_TAIL] = *oldtail;
1357 *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead;
1358 newtail = rdp->nxttail[RCU_DONE_TAIL];
1359 for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) {
1360 if (receive_rdp->nxttail[i] == oldtail)
1361 receive_rdp->nxttail[i] = newtail;
1362 if (rdp->nxttail[i] == newtail)
1363 rdp->nxttail[i] = &rdp->nxtlist;
1364 }
1365 } 1358 }
1366 1359
1367 /* 1360 /*
1368 * Finally, put the rest of the callbacks at the end of the list. 1361 * Then move the ready-to-invoke callbacks to the orphanage,
1369 * The ones that made it partway through get to start over: We 1362 * where some other CPU will pick them up. These will not be
1370 * cannot assume that grace periods are synchronized across CPUs. 1363 * required to pass through another grace period: They are done.
1371 * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but
1372 * this does not seem compelling. Not yet, anyway.)
1373 */ 1364 */
1374 if (rdp->nxtlist != NULL) { 1365 if (rdp->nxtlist != NULL) {
1375 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; 1366 *rsp->orphan_donetail = rdp->nxtlist;
1376 receive_rdp->nxttail[RCU_NEXT_TAIL] = 1367 rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
1377 rdp->nxttail[RCU_NEXT_TAIL];
1378 receive_rdp->n_cbs_adopted += rdp->qlen;
1379 rdp->n_cbs_orphaned += rdp->qlen;
1380
1381 rdp->nxtlist = NULL;
1382 for (i = 0; i < RCU_NEXT_SIZE; i++)
1383 rdp->nxttail[i] = &rdp->nxtlist;
1384 } 1368 }
1385 1369
1370 /* Finally, initialize the rcu_data structure's list to empty. */
1371 rdp->nxtlist = NULL;
1372 for (i = 0; i < RCU_NEXT_SIZE; i++)
1373 rdp->nxttail[i] = &rdp->nxtlist;
1374}
1375
1376/*
1377 * Adopt the RCU callbacks from the specified rcu_state structure's
1378 * orphanage. The caller must hold the ->onofflock.
1379 */
1380static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1381{
1382 int i;
1383 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
1384
1386 /* 1385 /*
1387 * Record a quiescent state for the dying CPU. This is safe 1386 * If there is an rcu_barrier() operation in progress, then
1388 * only because we have already cleared out the callbacks. 1387 * only the task doing that operation is permitted to adopt
1389 * (Otherwise, the RCU core might try to schedule the invocation 1388 * callbacks. To do otherwise breaks rcu_barrier() and friends
1390 * of callbacks on this now-offline CPU, which would be bad.) 1389 * by causing them to fail to wait for the callbacks in the
1390 * orphanage.
1391 */ 1391 */
1392 mask = rdp->grpmask; /* rnp->grplo is constant. */ 1392 if (rsp->rcu_barrier_in_progress &&
1393 rsp->rcu_barrier_in_progress != current)
1394 return;
1395
1396 /* Do the accounting first. */
1397 rdp->qlen_lazy += rsp->qlen_lazy;
1398 rdp->qlen += rsp->qlen;
1399 rdp->n_cbs_adopted += rsp->qlen;
1400 rsp->qlen_lazy = 0;
1401 rsp->qlen = 0;
1402
1403 /*
1404 * We do not need a memory barrier here because the only way we
1405 * can get here if there is an rcu_barrier() in flight is if
1406 * we are the task doing the rcu_barrier().
1407 */
1408
1409 /* First adopt the ready-to-invoke callbacks. */
1410 if (rsp->orphan_donelist != NULL) {
1411 *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL];
1412 *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist;
1413 for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--)
1414 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
1415 rdp->nxttail[i] = rsp->orphan_donetail;
1416 rsp->orphan_donelist = NULL;
1417 rsp->orphan_donetail = &rsp->orphan_donelist;
1418 }
1419
1420 /* And then adopt the callbacks that still need a grace period. */
1421 if (rsp->orphan_nxtlist != NULL) {
1422 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist;
1423 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail;
1424 rsp->orphan_nxtlist = NULL;
1425 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
1426 }
1427}
1428
1429/*
1430 * Trace the fact that this CPU is going offline.
1431 */
1432static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1433{
1434 RCU_TRACE(unsigned long mask);
1435 RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
1436 RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
1437
1438 RCU_TRACE(mask = rdp->grpmask);
1393 trace_rcu_grace_period(rsp->name, 1439 trace_rcu_grace_period(rsp->name,
1394 rnp->gpnum + 1 - !!(rnp->qsmask & mask), 1440 rnp->gpnum + 1 - !!(rnp->qsmask & mask),
1395 "cpuofl"); 1441 "cpuofl");
1396 rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum);
1397 /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */
1398} 1442}
1399 1443
1400/* 1444/*
1401 * The CPU has been completely removed, and some other CPU is reporting 1445 * The CPU has been completely removed, and some other CPU is reporting
1402 * this fact from process context. Do the remainder of the cleanup. 1446 * this fact from process context. Do the remainder of the cleanup,
1447 * including orphaning the outgoing CPU's RCU callbacks, and also
1448 * adopting them, if there is no _rcu_barrier() instance running.
1403 * There can only be one CPU hotplug operation at a time, so no other 1449 * There can only be one CPU hotplug operation at a time, so no other
1404 * CPU can be attempting to update rcu_cpu_kthread_task. 1450 * CPU can be attempting to update rcu_cpu_kthread_task.
1405 */ 1451 */
@@ -1409,17 +1455,21 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1409 unsigned long mask; 1455 unsigned long mask;
1410 int need_report = 0; 1456 int need_report = 0;
1411 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1457 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1412 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */ 1458 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
1413 1459
1414 /* Adjust any no-longer-needed kthreads. */ 1460 /* Adjust any no-longer-needed kthreads. */
1415 rcu_stop_cpu_kthread(cpu); 1461 rcu_stop_cpu_kthread(cpu);
1416 rcu_node_kthread_setaffinity(rnp, -1); 1462 rcu_node_kthread_setaffinity(rnp, -1);
1417 1463
1418 /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */ 1464 /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
1419 1465
1420 /* Exclude any attempts to start a new grace period. */ 1466 /* Exclude any attempts to start a new grace period. */
1421 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1467 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1422 1468
1469 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
1470 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
1471 rcu_adopt_orphan_cbs(rsp);
1472
1423 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 1473 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
1424 mask = rdp->grpmask; /* rnp->grplo is constant. */ 1474 mask = rdp->grpmask; /* rnp->grplo is constant. */
1425 do { 1475 do {
@@ -1456,6 +1506,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1456 1506
1457#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1507#else /* #ifdef CONFIG_HOTPLUG_CPU */
1458 1508
1509static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1510{
1511}
1512
1459static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 1513static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1460{ 1514{
1461} 1515}
@@ -1524,9 +1578,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1524 rcu_is_callbacks_kthread()); 1578 rcu_is_callbacks_kthread());
1525 1579
1526 /* Update count, and requeue any remaining callbacks. */ 1580 /* Update count, and requeue any remaining callbacks. */
1527 rdp->qlen_lazy -= count_lazy;
1528 rdp->qlen -= count;
1529 rdp->n_cbs_invoked += count;
1530 if (list != NULL) { 1581 if (list != NULL) {
1531 *tail = rdp->nxtlist; 1582 *tail = rdp->nxtlist;
1532 rdp->nxtlist = list; 1583 rdp->nxtlist = list;
@@ -1536,6 +1587,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1536 else 1587 else
1537 break; 1588 break;
1538 } 1589 }
1590 smp_mb(); /* List handling before counting for rcu_barrier(). */
1591 rdp->qlen_lazy -= count_lazy;
1592 rdp->qlen -= count;
1593 rdp->n_cbs_invoked += count;
1539 1594
1540 /* Reinstate batch limit if we have worked down the excess. */ 1595 /* Reinstate batch limit if we have worked down the excess. */
1541 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) 1596 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
@@ -1823,11 +1878,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1823 rdp = this_cpu_ptr(rsp->rda); 1878 rdp = this_cpu_ptr(rsp->rda);
1824 1879
1825 /* Add the callback to our list. */ 1880 /* Add the callback to our list. */
1826 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1827 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1828 rdp->qlen++; 1881 rdp->qlen++;
1829 if (lazy) 1882 if (lazy)
1830 rdp->qlen_lazy++; 1883 rdp->qlen_lazy++;
1884 else
1885 rcu_idle_count_callbacks_posted();
1886 smp_mb(); /* Count before adding callback for rcu_barrier(). */
1887 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1888 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1831 1889
1832 if (__is_kfree_rcu_offset((unsigned long)func)) 1890 if (__is_kfree_rcu_offset((unsigned long)func))
1833 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, 1891 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
@@ -1893,6 +1951,38 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1893} 1951}
1894EXPORT_SYMBOL_GPL(call_rcu_bh); 1952EXPORT_SYMBOL_GPL(call_rcu_bh);
1895 1953
1954/*
1955 * Because a context switch is a grace period for RCU-sched and RCU-bh,
1956 * any blocking grace-period wait automatically implies a grace period
1957 * if there is only one CPU online at any point time during execution
1958 * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to
1959 * occasionally incorrectly indicate that there are multiple CPUs online
1960 * when there was in fact only one the whole time, as this just adds
1961 * some overhead: RCU still operates correctly.
1962 *
1963 * Of course, sampling num_online_cpus() with preemption enabled can
1964 * give erroneous results if there are concurrent CPU-hotplug operations.
1965 * For example, given a demonic sequence of preemptions in num_online_cpus()
1966 * and CPU-hotplug operations, there could be two or more CPUs online at
1967 * all times, but num_online_cpus() might well return one (or even zero).
1968 *
1969 * However, all such demonic sequences require at least one CPU-offline
1970 * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer
1971 * is only a problem if there is an RCU read-side critical section executing
1972 * throughout. But RCU-sched and RCU-bh read-side critical sections
1973 * disable either preemption or bh, which prevents a CPU from going offline.
1974 * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return
1975 * that there is only one CPU when in fact there was more than one throughout
1976 * is when there were no RCU readers in the system. If there are no
1977 * RCU readers, the grace period by definition can be of zero length,
1978 * regardless of the number of online CPUs.
1979 */
1980static inline int rcu_blocking_is_gp(void)
1981{
1982 might_sleep(); /* Check for RCU read-side critical section. */
1983 return num_online_cpus() <= 1;
1984}
1985
1896/** 1986/**
1897 * synchronize_sched - wait until an rcu-sched grace period has elapsed. 1987 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
1898 * 1988 *
@@ -2166,11 +2256,10 @@ static int rcu_cpu_has_callbacks(int cpu)
2166 rcu_preempt_cpu_has_callbacks(cpu); 2256 rcu_preempt_cpu_has_callbacks(cpu);
2167} 2257}
2168 2258
2169static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 2259/*
2170static atomic_t rcu_barrier_cpu_count; 2260 * RCU callback function for _rcu_barrier(). If we are last, wake
2171static DEFINE_MUTEX(rcu_barrier_mutex); 2261 * up the task executing _rcu_barrier().
2172static struct completion rcu_barrier_completion; 2262 */
2173
2174static void rcu_barrier_callback(struct rcu_head *notused) 2263static void rcu_barrier_callback(struct rcu_head *notused)
2175{ 2264{
2176 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 2265 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -2200,27 +2289,94 @@ static void _rcu_barrier(struct rcu_state *rsp,
2200 void (*call_rcu_func)(struct rcu_head *head, 2289 void (*call_rcu_func)(struct rcu_head *head,
2201 void (*func)(struct rcu_head *head))) 2290 void (*func)(struct rcu_head *head)))
2202{ 2291{
2203 BUG_ON(in_interrupt()); 2292 int cpu;
2293 unsigned long flags;
2294 struct rcu_data *rdp;
2295 struct rcu_head rh;
2296
2297 init_rcu_head_on_stack(&rh);
2298
2204 /* Take mutex to serialize concurrent rcu_barrier() requests. */ 2299 /* Take mutex to serialize concurrent rcu_barrier() requests. */
2205 mutex_lock(&rcu_barrier_mutex); 2300 mutex_lock(&rcu_barrier_mutex);
2206 init_completion(&rcu_barrier_completion); 2301
2302 smp_mb(); /* Prevent any prior operations from leaking in. */
2303
2207 /* 2304 /*
2208 * Initialize rcu_barrier_cpu_count to 1, then invoke 2305 * Initialize the count to one rather than to zero in order to
2209 * rcu_barrier_func() on each CPU, so that each CPU also has 2306 * avoid a too-soon return to zero in case of a short grace period
2210 * incremented rcu_barrier_cpu_count. Only then is it safe to 2307 * (or preemption of this task). Also flag this task as doing
2211 * decrement rcu_barrier_cpu_count -- otherwise the first CPU 2308 * an rcu_barrier(). This will prevent anyone else from adopting
2212 * might complete its grace period before all of the other CPUs 2309 * orphaned callbacks, which could otherwise cause failure if a
2213 * did their increment, causing this function to return too 2310 * CPU went offline and quickly came back online. To see this,
2214 * early. Note that on_each_cpu() disables irqs, which prevents 2311 * consider the following sequence of events:
2215 * any CPUs from coming online or going offline until each online 2312 *
2216 * CPU has queued its RCU-barrier callback. 2313 * 1. We cause CPU 0 to post an rcu_barrier_callback() callback.
2314 * 2. CPU 1 goes offline, orphaning its callbacks.
2315 * 3. CPU 0 adopts CPU 1's orphaned callbacks.
2316 * 4. CPU 1 comes back online.
2317 * 5. We cause CPU 1 to post an rcu_barrier_callback() callback.
2318 * 6. Both rcu_barrier_callback() callbacks are invoked, awakening
2319 * us -- but before CPU 1's orphaned callbacks are invoked!!!
2217 */ 2320 */
2321 init_completion(&rcu_barrier_completion);
2218 atomic_set(&rcu_barrier_cpu_count, 1); 2322 atomic_set(&rcu_barrier_cpu_count, 1);
2219 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); 2323 raw_spin_lock_irqsave(&rsp->onofflock, flags);
2324 rsp->rcu_barrier_in_progress = current;
2325 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2326
2327 /*
2328 * Force every CPU with callbacks to register a new callback
2329 * that will tell us when all the preceding callbacks have
2330 * been invoked. If an offline CPU has callbacks, wait for
2331 * it to either come back online or to finish orphaning those
2332 * callbacks.
2333 */
2334 for_each_possible_cpu(cpu) {
2335 preempt_disable();
2336 rdp = per_cpu_ptr(rsp->rda, cpu);
2337 if (cpu_is_offline(cpu)) {
2338 preempt_enable();
2339 while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen))
2340 schedule_timeout_interruptible(1);
2341 } else if (ACCESS_ONCE(rdp->qlen)) {
2342 smp_call_function_single(cpu, rcu_barrier_func,
2343 (void *)call_rcu_func, 1);
2344 preempt_enable();
2345 } else {
2346 preempt_enable();
2347 }
2348 }
2349
2350 /*
2351 * Now that all online CPUs have rcu_barrier_callback() callbacks
2352 * posted, we can adopt all of the orphaned callbacks and place
2353 * an rcu_barrier_callback() callback after them. When that is done,
2354 * we are guaranteed to have an rcu_barrier_callback() callback
2355 * following every callback that could possibly have been
2356 * registered before _rcu_barrier() was called.
2357 */
2358 raw_spin_lock_irqsave(&rsp->onofflock, flags);
2359 rcu_adopt_orphan_cbs(rsp);
2360 rsp->rcu_barrier_in_progress = NULL;
2361 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2362 atomic_inc(&rcu_barrier_cpu_count);
2363 smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */
2364 call_rcu_func(&rh, rcu_barrier_callback);
2365
2366 /*
2367 * Now that we have an rcu_barrier_callback() callback on each
2368 * CPU, and thus each counted, remove the initial count.
2369 */
2220 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 2370 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
2221 complete(&rcu_barrier_completion); 2371 complete(&rcu_barrier_completion);
2372
2373 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
2222 wait_for_completion(&rcu_barrier_completion); 2374 wait_for_completion(&rcu_barrier_completion);
2375
2376 /* Other rcu_barrier() invocations can now safely proceed. */
2223 mutex_unlock(&rcu_barrier_mutex); 2377 mutex_unlock(&rcu_barrier_mutex);
2378
2379 destroy_rcu_head_on_stack(&rh);
2224} 2380}
2225 2381
2226/** 2382/**
@@ -2417,7 +2573,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2417 2573
2418 for (i = NUM_RCU_LVLS - 1; i > 0; i--) 2574 for (i = NUM_RCU_LVLS - 1; i > 0; i--)
2419 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 2575 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
2420 rsp->levelspread[0] = RCU_FANOUT_LEAF; 2576 rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF;
2421} 2577}
2422#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 2578#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
2423static void __init rcu_init_levelspread(struct rcu_state *rsp) 2579static void __init rcu_init_levelspread(struct rcu_state *rsp)
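The rework above keeps _rcu_barrier()'s original counting trick even though the callbacks are now posted per CPU instead of via on_each_cpu(): the count starts at one so that a short grace period (or preemption of this task) cannot complete the barrier before every callback has been registered, and that initial reference is dropped only once registration is finished. A minimal user-space sketch of the pattern (pthreads and C11 atomics, not kernel code) for illustration:

/* Sketch of the _rcu_barrier() counting pattern in user space. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NWORKERS 4

static atomic_int barrier_count = 1;	/* initial reference held by the waiter */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done = PTHREAD_COND_INITIALIZER;

static void barrier_callback(void)
{
	if (atomic_fetch_sub(&barrier_count, 1) == 1) {	/* dropped the last reference */
		pthread_mutex_lock(&lock);
		pthread_cond_signal(&done);
		pthread_mutex_unlock(&lock);
	}
}

static void *worker(void *arg)
{
	(void)arg;
	barrier_callback();	/* stands in for the per-CPU rcu_barrier_callback() */
	return NULL;
}

int main(void)
{
	pthread_t tid[NWORKERS];
	int i;

	for (i = 0; i < NWORKERS; i++) {
		atomic_fetch_add(&barrier_count, 1);	/* register before posting */
		pthread_create(&tid[i], NULL, worker, NULL);
	}

	barrier_callback();	/* all callbacks registered: drop the initial reference */

	pthread_mutex_lock(&lock);
	while (atomic_load(&barrier_count) > 0)
		pthread_cond_wait(&done, &lock);
	pthread_mutex_unlock(&lock);

	for (i = 0; i < NWORKERS; i++)
		pthread_join(tid[i], NULL);
	printf("all callbacks accounted for\n");
	return 0;
}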
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index cdd1be0a4072..7f5d138dedf5 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -29,18 +29,14 @@
29#include <linux/seqlock.h> 29#include <linux/seqlock.h>
30 30
31/* 31/*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. 32 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
33 * CONFIG_RCU_FANOUT_LEAF.
33 * In theory, it should be possible to add more levels straightforwardly. 34 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this did work well going from three levels to four. 35 * In practice, this did work well going from three levels to four.
35 * Of course, your mileage may vary. 36 * Of course, your mileage may vary.
36 */ 37 */
37#define MAX_RCU_LVLS 4 38#define MAX_RCU_LVLS 4
38#if CONFIG_RCU_FANOUT > 16 39#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF)
39#define RCU_FANOUT_LEAF 16
40#else /* #if CONFIG_RCU_FANOUT > 16 */
41#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
42#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
43#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
44#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) 40#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
45#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) 41#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
46#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) 42#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
@@ -371,6 +367,17 @@ struct rcu_state {
371 367
372 raw_spinlock_t onofflock; /* exclude on/offline and */ 368 raw_spinlock_t onofflock; /* exclude on/offline and */
373 /* starting new GP. */ 369 /* starting new GP. */
370 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
371 /* need a grace period. */
372 struct rcu_head **orphan_nxttail; /* Tail of above. */
373 struct rcu_head *orphan_donelist; /* Orphaned callbacks that */
374 /* are ready to invoke. */
375 struct rcu_head **orphan_donetail; /* Tail of above. */
376 long qlen_lazy; /* Number of lazy callbacks. */
377 long qlen; /* Total number of callbacks. */
378 struct task_struct *rcu_barrier_in_progress;
379 /* Task doing rcu_barrier(), */
380 /* or NULL if no barrier. */
374 raw_spinlock_t fqslock; /* Only one task forcing */ 381 raw_spinlock_t fqslock; /* Only one task forcing */
375 /* quiescent states. */ 382 /* quiescent states. */
376 unsigned long jiffies_force_qs; /* Time at which to invoke */ 383 unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -423,7 +430,6 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
423/* Forward declarations for rcutree_plugin.h */ 430/* Forward declarations for rcutree_plugin.h */
424static void rcu_bootup_announce(void); 431static void rcu_bootup_announce(void);
425long rcu_batches_completed(void); 432long rcu_batches_completed(void);
426static void rcu_preempt_note_context_switch(int cpu);
427static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); 433static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
428#ifdef CONFIG_HOTPLUG_CPU 434#ifdef CONFIG_HOTPLUG_CPU
429static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 435static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
@@ -471,6 +477,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu);
471static void rcu_prepare_for_idle_init(int cpu); 477static void rcu_prepare_for_idle_init(int cpu);
472static void rcu_cleanup_after_idle(int cpu); 478static void rcu_cleanup_after_idle(int cpu);
473static void rcu_prepare_for_idle(int cpu); 479static void rcu_prepare_for_idle(int cpu);
480static void rcu_idle_count_callbacks_posted(void);
474static void print_cpu_stall_info_begin(void); 481static void print_cpu_stall_info_begin(void);
475static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); 482static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
476static void print_cpu_stall_info_end(void); 483static void print_cpu_stall_info_end(void);
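With CONFIG_RCU_FANOUT_LEAF now taken directly from Kconfig instead of being capped at 16 inside rcutree.h, the shape of the rcu_node tree follows from two knobs: the leaf level holds RCU_FANOUT_LEAF CPUs and every level above it multiplies capacity by CONFIG_RCU_FANOUT, exactly as RCU_FANOUT_1..4 do. The following stand-alone C sketch (the 64/16/4096 values are arbitrary stand-ins, not defaults mandated by the patch) shows how many levels a given NR_CPUS ends up needing:

/* Illustration of the rcu_node tree shape, not kernel code. */
#include <stdio.h>

#define RCU_FANOUT	64	/* stand-in for CONFIG_RCU_FANOUT */
#define RCU_FANOUT_LEAF	16	/* stand-in for CONFIG_RCU_FANOUT_LEAF */
#define MAX_RCU_LVLS	4

int main(void)
{
	long capacity[MAX_RCU_LVLS];
	long nr_cpus = 4096;	/* stand-in for NR_CPUS */
	int i;

	capacity[0] = RCU_FANOUT_LEAF;	/* RCU_FANOUT_1 */
	for (i = 1; i < MAX_RCU_LVLS; i++)
		capacity[i] = capacity[i - 1] * RCU_FANOUT;	/* RCU_FANOUT_2..4 */

	for (i = 0; i < MAX_RCU_LVLS; i++) {
		if (nr_cpus <= capacity[i]) {
			printf("NR_CPUS=%ld needs %d rcu_node level(s) (capacity %ld)\n",
			       nr_cpus, i + 1, capacity[i]);
			return 0;
		}
	}
	printf("NR_CPUS=%ld exceeds the %d-level maximum\n", nr_cpus, MAX_RCU_LVLS);
	return 0;
}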
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c023464816be..2411000d9869 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu)
153 * 153 *
154 * Caller must disable preemption. 154 * Caller must disable preemption.
155 */ 155 */
156static void rcu_preempt_note_context_switch(int cpu) 156void rcu_preempt_note_context_switch(void)
157{ 157{
158 struct task_struct *t = current; 158 struct task_struct *t = current;
159 unsigned long flags; 159 unsigned long flags;
@@ -164,7 +164,7 @@ static void rcu_preempt_note_context_switch(int cpu)
164 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 164 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
165 165
166 /* Possibly blocking in an RCU read-side critical section. */ 166 /* Possibly blocking in an RCU read-side critical section. */
167 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); 167 rdp = __this_cpu_ptr(rcu_preempt_state.rda);
168 rnp = rdp->mynode; 168 rnp = rdp->mynode;
169 raw_spin_lock_irqsave(&rnp->lock, flags); 169 raw_spin_lock_irqsave(&rnp->lock, flags);
170 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 170 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -228,7 +228,7 @@ static void rcu_preempt_note_context_switch(int cpu)
228 * means that we continue to block the current grace period. 228 * means that we continue to block the current grace period.
229 */ 229 */
230 local_irq_save(flags); 230 local_irq_save(flags);
231 rcu_preempt_qs(cpu); 231 rcu_preempt_qs(smp_processor_id());
232 local_irq_restore(flags); 232 local_irq_restore(flags);
233} 233}
234 234
@@ -969,22 +969,6 @@ static void __init __rcu_init_preempt(void)
969 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); 969 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
970} 970}
971 971
972/*
973 * Check for a task exiting while in a preemptible-RCU read-side
974 * critical section, clean up if so. No need to issue warnings,
975 * as debug_check_no_locks_held() already does this if lockdep
976 * is enabled.
977 */
978void exit_rcu(void)
979{
980 struct task_struct *t = current;
981
982 if (t->rcu_read_lock_nesting == 0)
983 return;
984 t->rcu_read_lock_nesting = 1;
985 __rcu_read_unlock();
986}
987
988#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 972#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
989 973
990static struct rcu_state *rcu_state = &rcu_sched_state; 974static struct rcu_state *rcu_state = &rcu_sched_state;
@@ -1018,14 +1002,6 @@ void rcu_force_quiescent_state(void)
1018EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 1002EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
1019 1003
1020/* 1004/*
1021 * Because preemptible RCU does not exist, we never have to check for
1022 * CPUs being in quiescent states.
1023 */
1024static void rcu_preempt_note_context_switch(int cpu)
1025{
1026}
1027
1028/*
1029 * Because preemptible RCU does not exist, there are never any preempted 1005 * Because preemptible RCU does not exist, there are never any preempted
1030 * RCU readers. 1006 * RCU readers.
1031 */ 1007 */
@@ -1938,6 +1914,14 @@ static void rcu_prepare_for_idle(int cpu)
1938{ 1914{
1939} 1915}
1940 1916
1917/*
1918 * Don't bother keeping a running count of the number of RCU callbacks
1919 * posted because CONFIG_RCU_FAST_NO_HZ=n.
1920 */
1921static void rcu_idle_count_callbacks_posted(void)
1922{
1923}
1924
1941#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1925#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1942 1926
1943/* 1927/*
@@ -1978,11 +1962,20 @@ static void rcu_prepare_for_idle(int cpu)
1978#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ 1962#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
1979#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ 1963#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
1980 1964
1965/* Loop counter for rcu_prepare_for_idle(). */
1981static DEFINE_PER_CPU(int, rcu_dyntick_drain); 1966static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1967/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */
1982static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); 1968static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1983static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); 1969/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */
1984static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */ 1970static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer);
1985static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ 1971/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */
1972static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires);
1973/* Enable special processing on first attempt to enter dyntick-idle mode. */
1974static DEFINE_PER_CPU(bool, rcu_idle_first_pass);
1975/* Running count of non-lazy callbacks posted, never decremented. */
1976static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted);
1977/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */
1978static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap);
1986 1979
1987/* 1980/*
1988 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no 1981 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
@@ -1995,6 +1988,8 @@ static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */
1995 */ 1988 */
1996int rcu_needs_cpu(int cpu) 1989int rcu_needs_cpu(int cpu)
1997{ 1990{
1991 /* Flag a new idle sojourn to the idle-entry state machine. */
1992 per_cpu(rcu_idle_first_pass, cpu) = 1;
1998 /* If no callbacks, RCU doesn't need the CPU. */ 1993 /* If no callbacks, RCU doesn't need the CPU. */
1999 if (!rcu_cpu_has_callbacks(cpu)) 1994 if (!rcu_cpu_has_callbacks(cpu))
2000 return 0; 1995 return 0;
@@ -2045,16 +2040,34 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
2045} 2040}
2046 2041
2047/* 2042/*
2043 * Handler for smp_call_function_single(). The only point of this
2044 * handler is to wake the CPU up, so the handler does only tracing.
2045 */
2046void rcu_idle_demigrate(void *unused)
2047{
2048 trace_rcu_prep_idle("Demigrate");
2049}
2050
2051/*
2048 * Timer handler used to force CPU to start pushing its remaining RCU 2052 * Timer handler used to force CPU to start pushing its remaining RCU
2049 * callbacks in the case where it entered dyntick-idle mode with callbacks 2053 * callbacks in the case where it entered dyntick-idle mode with callbacks
2050 * pending. The handler doesn't really need to do anything because the 2054 * pending. The handler doesn't really need to do anything because the
2051 * real work is done upon re-entry to idle, or by the next scheduling-clock 2055 * real work is done upon re-entry to idle, or by the next scheduling-clock
2052 * interrupt should idle not be re-entered. 2056 * interrupt should idle not be re-entered.
2057 *
2058 * One special case: the timer gets migrated without awakening the CPU
2059 * on which the timer was scheduled on. In this case, we must wake up
2060 * that CPU. We do so with smp_call_function_single().
2053 */ 2061 */
2054static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) 2062static void rcu_idle_gp_timer_func(unsigned long cpu_in)
2055{ 2063{
2064 int cpu = (int)cpu_in;
2065
2056 trace_rcu_prep_idle("Timer"); 2066 trace_rcu_prep_idle("Timer");
2057 return HRTIMER_NORESTART; 2067 if (cpu != smp_processor_id())
2068 smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
2069 else
2070 WARN_ON_ONCE(1); /* Getting here can hang the system... */
2058} 2071}
2059 2072
2060/* 2073/*
@@ -2062,19 +2075,11 @@ static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
2062 */ 2075 */
2063static void rcu_prepare_for_idle_init(int cpu) 2076static void rcu_prepare_for_idle_init(int cpu)
2064{ 2077{
2065 static int firsttime = 1; 2078 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
2066 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); 2079 setup_timer(&per_cpu(rcu_idle_gp_timer, cpu),
2067 2080 rcu_idle_gp_timer_func, cpu);
2068 hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 2081 per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1;
2069 hrtp->function = rcu_idle_gp_timer_func; 2082 per_cpu(rcu_idle_first_pass, cpu) = 1;
2070 if (firsttime) {
2071 unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
2072
2073 rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
2074 upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY);
2075 rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000);
2076 firsttime = 0;
2077 }
2078} 2083}
2079 2084
2080/* 2085/*
@@ -2084,7 +2089,8 @@ static void rcu_prepare_for_idle_init(int cpu)
2084 */ 2089 */
2085static void rcu_cleanup_after_idle(int cpu) 2090static void rcu_cleanup_after_idle(int cpu)
2086{ 2091{
2087 hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); 2092 del_timer(&per_cpu(rcu_idle_gp_timer, cpu));
2093 trace_rcu_prep_idle("Cleanup after idle");
2088} 2094}
2089 2095
2090/* 2096/*
@@ -2108,6 +2114,29 @@ static void rcu_cleanup_after_idle(int cpu)
2108 */ 2114 */
2109static void rcu_prepare_for_idle(int cpu) 2115static void rcu_prepare_for_idle(int cpu)
2110{ 2116{
2117 struct timer_list *tp;
2118
2119 /*
2120 * If this is an idle re-entry, for example, due to use of
2121 * RCU_NONIDLE() or the new idle-loop tracing API within the idle
2122 * loop, then don't take any state-machine actions, unless the
2123 * momentary exit from idle queued additional non-lazy callbacks.
2124 * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks
2125 * pending.
2126 */
2127 if (!per_cpu(rcu_idle_first_pass, cpu) &&
2128 (per_cpu(rcu_nonlazy_posted, cpu) ==
2129 per_cpu(rcu_nonlazy_posted_snap, cpu))) {
2130 if (rcu_cpu_has_callbacks(cpu)) {
2131 tp = &per_cpu(rcu_idle_gp_timer, cpu);
2132 mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu));
2133 }
2134 return;
2135 }
2136 per_cpu(rcu_idle_first_pass, cpu) = 0;
2137 per_cpu(rcu_nonlazy_posted_snap, cpu) =
2138 per_cpu(rcu_nonlazy_posted, cpu) - 1;
2139
2111 /* 2140 /*
2112 * If there are no callbacks on this CPU, enter dyntick-idle mode. 2141 * If there are no callbacks on this CPU, enter dyntick-idle mode.
2113 * Also reset state to avoid prejudicing later attempts. 2142 * Also reset state to avoid prejudicing later attempts.
@@ -2140,11 +2169,15 @@ static void rcu_prepare_for_idle(int cpu)
2140 per_cpu(rcu_dyntick_drain, cpu) = 0; 2169 per_cpu(rcu_dyntick_drain, cpu) = 0;
2141 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2170 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
2142 if (rcu_cpu_has_nonlazy_callbacks(cpu)) 2171 if (rcu_cpu_has_nonlazy_callbacks(cpu))
2143 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), 2172 per_cpu(rcu_idle_gp_timer_expires, cpu) =
2144 rcu_idle_gp_wait, HRTIMER_MODE_REL); 2173 jiffies + RCU_IDLE_GP_DELAY;
2145 else 2174 else
2146 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), 2175 per_cpu(rcu_idle_gp_timer_expires, cpu) =
2147 rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL); 2176 jiffies + RCU_IDLE_LAZY_GP_DELAY;
2177 tp = &per_cpu(rcu_idle_gp_timer, cpu);
2178 mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu));
2179 per_cpu(rcu_nonlazy_posted_snap, cpu) =
2180 per_cpu(rcu_nonlazy_posted, cpu);
2148 return; /* Nothing more to do immediately. */ 2181 return; /* Nothing more to do immediately. */
2149 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2182 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
2150 /* We have hit the limit, so time to give up. */ 2183 /* We have hit the limit, so time to give up. */
@@ -2184,6 +2217,19 @@ static void rcu_prepare_for_idle(int cpu)
2184 trace_rcu_prep_idle("Callbacks drained"); 2217 trace_rcu_prep_idle("Callbacks drained");
2185} 2218}
2186 2219
2220/*
2221 * Keep a running count of the number of non-lazy callbacks posted
2222 * on this CPU. This running counter (which is never decremented) allows
2223 * rcu_prepare_for_idle() to detect when something out of the idle loop
2224 * posts a callback, even if an equal number of callbacks are invoked.
2225 * Of course, callbacks should only be posted from within a trace event
2226 * designed to be called from idle or from within RCU_NONIDLE().
2227 */
2228static void rcu_idle_count_callbacks_posted(void)
2229{
2230 __this_cpu_add(rcu_nonlazy_posted, 1);
2231}
2232
2187#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2233#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
2188 2234
2189#ifdef CONFIG_RCU_CPU_STALL_INFO 2235#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -2192,14 +2238,12 @@ static void rcu_prepare_for_idle(int cpu)
2192 2238
2193static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 2239static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2194{ 2240{
2195 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); 2241 struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu);
2196 2242
2197 sprintf(cp, "drain=%d %c timer=%lld", 2243 sprintf(cp, "drain=%d %c timer=%lu",
2198 per_cpu(rcu_dyntick_drain, cpu), 2244 per_cpu(rcu_dyntick_drain, cpu),
2199 per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', 2245 per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.',
2200 hrtimer_active(hrtp) 2246 timer_pending(tltp) ? tltp->expires - jiffies : -1);
2201 ? ktime_to_us(hrtimer_get_remaining(hrtp))
2202 : -1);
2203} 2247}
2204 2248
2205#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 2249#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
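The switch from hrtimers to an ordinary timer_list is accompanied by two pieces of per-CPU state: rcu_idle_first_pass flags a fresh idle sojourn, while rcu_nonlazy_posted and rcu_nonlazy_posted_snap let rcu_prepare_for_idle() notice that new non-lazy callbacks arrived during a momentary exit from idle even if an equal number were invoked in the meantime. A small user-space sketch of that never-decremented-counter-plus-snapshot idea (not kernel code):

/* Sketch of the running-count-plus-snapshot trick used above. */
#include <stdbool.h>
#include <stdio.h>

static unsigned long nonlazy_posted;		/* incremented, never decremented */
static unsigned long nonlazy_posted_snap;	/* value at the last real idle entry */

static void post_nonlazy_callback(void)
{
	nonlazy_posted++;
}

static bool idle_reentry_needs_work(void)
{
	/* Re-entry with no new postings: nothing for the state machine to do. */
	return nonlazy_posted != nonlazy_posted_snap;
}

static void enter_idle(void)
{
	nonlazy_posted_snap = nonlazy_posted;	/* start a fresh idle sojourn */
}

int main(void)
{
	enter_idle();
	printf("re-entry needs work: %d\n", idle_reentry_needs_work());	/* 0 */
	post_nonlazy_callback();
	printf("re-entry needs work: %d\n", idle_reentry_needs_work());	/* 1 */
	return 0;
}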
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index ed459edeff43..d4bc16ddd1d4 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -271,13 +271,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
271 271
272 gpnum = rsp->gpnum; 272 gpnum = rsp->gpnum;
273 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 273 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
274 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", 274 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
275 rsp->completed, gpnum, rsp->fqs_state, 275 rsp->completed, gpnum, rsp->fqs_state,
276 (long)(rsp->jiffies_force_qs - jiffies), 276 (long)(rsp->jiffies_force_qs - jiffies),
277 (int)(jiffies & 0xffff), 277 (int)(jiffies & 0xffff),
278 rsp->n_force_qs, rsp->n_force_qs_ngp, 278 rsp->n_force_qs, rsp->n_force_qs_ngp,
279 rsp->n_force_qs - rsp->n_force_qs_ngp, 279 rsp->n_force_qs - rsp->n_force_qs_ngp,
280 rsp->n_force_qs_lh); 280 rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen);
281 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 281 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
282 if (rnp->level != level) { 282 if (rnp->level != level) {
283 seq_puts(m, "\n"); 283 seq_puts(m, "\n");
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index d508363858b3..bebe2b170d49 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -22,75 +22,70 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
22 counter->parent = parent; 22 counter->parent = parent;
23} 23}
24 24
25int res_counter_charge_locked(struct res_counter *counter, unsigned long val) 25int res_counter_charge_locked(struct res_counter *counter, unsigned long val,
26 bool force)
26{ 27{
28 int ret = 0;
29
27 if (counter->usage + val > counter->limit) { 30 if (counter->usage + val > counter->limit) {
28 counter->failcnt++; 31 counter->failcnt++;
29 return -ENOMEM; 32 ret = -ENOMEM;
33 if (!force)
34 return ret;
30 } 35 }
31 36
32 counter->usage += val; 37 counter->usage += val;
33 if (counter->usage > counter->max_usage) 38 if (counter->usage > counter->max_usage)
34 counter->max_usage = counter->usage; 39 counter->max_usage = counter->usage;
35 return 0; 40 return ret;
36} 41}
37 42
38int res_counter_charge(struct res_counter *counter, unsigned long val, 43static int __res_counter_charge(struct res_counter *counter, unsigned long val,
39 struct res_counter **limit_fail_at) 44 struct res_counter **limit_fail_at, bool force)
40{ 45{
41 int ret; 46 int ret, r;
42 unsigned long flags; 47 unsigned long flags;
43 struct res_counter *c, *u; 48 struct res_counter *c, *u;
44 49
50 r = ret = 0;
45 *limit_fail_at = NULL; 51 *limit_fail_at = NULL;
46 local_irq_save(flags); 52 local_irq_save(flags);
47 for (c = counter; c != NULL; c = c->parent) { 53 for (c = counter; c != NULL; c = c->parent) {
48 spin_lock(&c->lock); 54 spin_lock(&c->lock);
49 ret = res_counter_charge_locked(c, val); 55 r = res_counter_charge_locked(c, val, force);
50 spin_unlock(&c->lock); 56 spin_unlock(&c->lock);
51 if (ret < 0) { 57 if (r < 0 && !ret) {
58 ret = r;
52 *limit_fail_at = c; 59 *limit_fail_at = c;
53 goto undo; 60 if (!force)
61 break;
54 } 62 }
55 } 63 }
56 ret = 0; 64
57 goto done; 65 if (ret < 0 && !force) {
58undo: 66 for (u = counter; u != c; u = u->parent) {
59 for (u = counter; u != c; u = u->parent) { 67 spin_lock(&u->lock);
60 spin_lock(&u->lock); 68 res_counter_uncharge_locked(u, val);
61 res_counter_uncharge_locked(u, val); 69 spin_unlock(&u->lock);
62 spin_unlock(&u->lock); 70 }
63 } 71 }
64done:
65 local_irq_restore(flags); 72 local_irq_restore(flags);
73
66 return ret; 74 return ret;
67} 75}
68 76
77int res_counter_charge(struct res_counter *counter, unsigned long val,
78 struct res_counter **limit_fail_at)
79{
80 return __res_counter_charge(counter, val, limit_fail_at, false);
81}
82
69int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, 83int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
70 struct res_counter **limit_fail_at) 84 struct res_counter **limit_fail_at)
71{ 85{
72 int ret, r; 86 return __res_counter_charge(counter, val, limit_fail_at, true);
73 unsigned long flags;
74 struct res_counter *c;
75
76 r = ret = 0;
77 *limit_fail_at = NULL;
78 local_irq_save(flags);
79 for (c = counter; c != NULL; c = c->parent) {
80 spin_lock(&c->lock);
81 r = res_counter_charge_locked(c, val);
82 if (r)
83 c->usage += val;
84 spin_unlock(&c->lock);
85 if (r < 0 && ret == 0) {
86 *limit_fail_at = c;
87 ret = r;
88 }
89 }
90 local_irq_restore(flags);
91
92 return ret;
93} 87}
88
94void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) 89void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
95{ 90{
96 if (WARN_ON(counter->usage < val)) 91 if (WARN_ON(counter->usage < val))
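The res_counter change folds the old charge and charge_nofail paths into a single walk up the hierarchy that either rolls back on the first failure or, when forced, records the failure but charges anyway. A user-space sketch of that control flow (locking is elided and -ENOMEM is replaced by -1; this is an illustration, not the kernel implementation):

/* Sketch of __res_counter_charge()'s charge/rollback walk. */
#include <stdbool.h>
#include <stdio.h>

struct counter {
	unsigned long usage, limit;
	unsigned long failcnt;
	struct counter *parent;
};

static int charge_locked(struct counter *c, unsigned long val, bool force)
{
	int ret = 0;

	if (c->usage + val > c->limit) {
		c->failcnt++;
		ret = -1;		/* stands in for -ENOMEM */
		if (!force)
			return ret;
	}
	c->usage += val;
	return ret;
}

static int charge(struct counter *counter, unsigned long val,
		  struct counter **fail_at, bool force)
{
	struct counter *c, *u;
	int ret = 0, r;

	*fail_at = NULL;
	for (c = counter; c; c = c->parent) {
		r = charge_locked(c, val, force);
		if (r < 0 && !ret) {
			ret = r;
			*fail_at = c;
			if (!force)
				break;
		}
	}
	if (ret < 0 && !force)		/* undo the partial charges */
		for (u = counter; u != c; u = u->parent)
			u->usage -= val;
	return ret;
}

int main(void)
{
	struct counter root = { .limit = 100 };
	struct counter child = { .limit = 1000, .parent = &root };
	struct counter *fail;

	printf("charge 150: %d\n", charge(&child, 150, &fail, false));	/* fails at root */
	printf("forced    : %d\n", charge(&child, 150, &fail, true));	/* overcharges */
	printf("root usage=%lu failcnt=%lu\n", root.usage, root.failcnt);
	return 0;
}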
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a7dd35102a3..173ea52f3af0 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,5 +16,3 @@ obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o 17obj-$(CONFIG_SCHEDSTATS) += stats.o
18obj-$(CONFIG_SCHED_DEBUG) += debug.o 18obj-$(CONFIG_SCHED_DEBUG) += debug.o
19
20
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ab9745f7e115..d833cc94eedc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -83,6 +83,7 @@
83 83
84#include "sched.h" 84#include "sched.h"
85#include "../workqueue_sched.h" 85#include "../workqueue_sched.h"
86#include "../smpboot.h"
86 87
87#define CREATE_TRACE_POINTS 88#define CREATE_TRACE_POINTS
88#include <trace/events/sched.h> 89#include <trace/events/sched.h>
@@ -1911,7 +1912,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
1911 struct task_struct *next) 1912 struct task_struct *next)
1912{ 1913{
1913 sched_info_switch(prev, next); 1914 sched_info_switch(prev, next);
1914 perf_event_task_sched_out(prev, next); 1915 perf_event_task_sched(prev, next);
1915 fire_sched_out_preempt_notifiers(prev, next); 1916 fire_sched_out_preempt_notifiers(prev, next);
1916 prepare_lock_switch(rq, next); 1917 prepare_lock_switch(rq, next);
1917 prepare_arch_switch(next); 1918 prepare_arch_switch(next);
@@ -1954,13 +1955,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1954 */ 1955 */
1955 prev_state = prev->state; 1956 prev_state = prev->state;
1956 finish_arch_switch(prev); 1957 finish_arch_switch(prev);
1957#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1958 local_irq_disable();
1959#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1960 perf_event_task_sched_in(prev, current);
1961#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1962 local_irq_enable();
1963#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1964 finish_lock_switch(rq, prev); 1958 finish_lock_switch(rq, prev);
1965 finish_arch_post_lock_switch(); 1959 finish_arch_post_lock_switch();
1966 1960
@@ -2081,6 +2075,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2081#endif 2075#endif
2082 2076
2083 /* Here we just switch the register state and the stack. */ 2077 /* Here we just switch the register state and the stack. */
2078 rcu_switch_from(prev);
2084 switch_to(prev, next, prev); 2079 switch_to(prev, next, prev);
2085 2080
2086 barrier(); 2081 barrier();
@@ -7077,6 +7072,7 @@ void __init sched_init(void)
7077 /* May be allocated at isolcpus cmdline parse time */ 7072 /* May be allocated at isolcpus cmdline parse time */
7078 if (cpu_isolated_map == NULL) 7073 if (cpu_isolated_map == NULL)
7079 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7074 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
7075 idle_thread_set_boot_cpu();
7080#endif 7076#endif
7081 init_sched_fair_class(); 7077 init_sched_fair_class();
7082 7078
@@ -7998,13 +7994,9 @@ static struct cftype cpu_files[] = {
7998 .write_u64 = cpu_rt_period_write_uint, 7994 .write_u64 = cpu_rt_period_write_uint,
7999 }, 7995 },
8000#endif 7996#endif
7997 { } /* terminate */
8001}; 7998};
8002 7999
8003static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
8004{
8005 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
8006}
8007
8008struct cgroup_subsys cpu_cgroup_subsys = { 8000struct cgroup_subsys cpu_cgroup_subsys = {
8009 .name = "cpu", 8001 .name = "cpu",
8010 .create = cpu_cgroup_create, 8002 .create = cpu_cgroup_create,
@@ -8012,8 +8004,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8012 .can_attach = cpu_cgroup_can_attach, 8004 .can_attach = cpu_cgroup_can_attach,
8013 .attach = cpu_cgroup_attach, 8005 .attach = cpu_cgroup_attach,
8014 .exit = cpu_cgroup_exit, 8006 .exit = cpu_cgroup_exit,
8015 .populate = cpu_cgroup_populate,
8016 .subsys_id = cpu_cgroup_subsys_id, 8007 .subsys_id = cpu_cgroup_subsys_id,
8008 .base_cftypes = cpu_files,
8017 .early_init = 1, 8009 .early_init = 1,
8018}; 8010};
8019 8011
@@ -8198,13 +8190,9 @@ static struct cftype files[] = {
8198 .name = "stat", 8190 .name = "stat",
8199 .read_map = cpuacct_stats_show, 8191 .read_map = cpuacct_stats_show,
8200 }, 8192 },
8193 { } /* terminate */
8201}; 8194};
8202 8195
8203static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
8204{
8205 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
8206}
8207
8208/* 8196/*
8209 * charge this task's execution time to its accounting group. 8197 * charge this task's execution time to its accounting group.
8210 * 8198 *
@@ -8236,7 +8224,7 @@ struct cgroup_subsys cpuacct_subsys = {
8236 .name = "cpuacct", 8224 .name = "cpuacct",
8237 .create = cpuacct_create, 8225 .create = cpuacct_create,
8238 .destroy = cpuacct_destroy, 8226 .destroy = cpuacct_destroy,
8239 .populate = cpuacct_populate,
8240 .subsys_id = cpuacct_subsys_id, 8227 .subsys_id = cpuacct_subsys_id,
8228 .base_cftypes = files,
8241}; 8229};
8242#endif /* CONFIG_CGROUP_CPUACCT */ 8230#endif /* CONFIG_CGROUP_CPUACCT */
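Both cpu and cpuacct drop their ->populate() callbacks in favour of the new base_cftypes convention: the cftype array gains an empty sentinel entry and the cgroup core walks it until that terminator, so no explicit ARRAY_SIZE() needs to be passed around. A tiny user-space illustration of the sentinel-terminated-table pattern (struct fields trimmed; this is not the cgroup API itself):

/* Sentinel-terminated control-file table, walked until the empty entry. */
#include <stdio.h>

struct cftype {
	const char *name;
	/* read/write handlers omitted in this sketch */
};

static struct cftype cpu_files[] = {
	{ .name = "shares" },
	{ .name = "cfs_quota_us" },
	{ .name = "cfs_period_us" },
	{ }	/* terminate */
};

static void add_files(const struct cftype *cft)
{
	for (; cft->name; cft++)
		printf("registering file %s\n", cft->name);
}

int main(void)
{
	add_files(cpu_files);	/* what base_cftypes registration amounts to */
	return 0;
}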
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e8d76c5895ea..ee376beedaf9 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -3,16 +3,357 @@
3 * 3 *
4 * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> 4 * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
5 * 5 *
6 * This defines a simple but solid secure-computing mode. 6 * Copyright (C) 2012 Google, Inc.
7 * Will Drewry <wad@chromium.org>
8 *
9 * This defines a simple but solid secure-computing facility.
10 *
11 * Mode 1 uses a fixed list of allowed system calls.
12 * Mode 2 allows user-defined system call filters in the form
13 * of Berkeley Packet Filters/Linux Socket Filters.
7 */ 14 */
8 15
16#include <linux/atomic.h>
9#include <linux/audit.h> 17#include <linux/audit.h>
10#include <linux/seccomp.h>
11#include <linux/sched.h>
12#include <linux/compat.h> 18#include <linux/compat.h>
19#include <linux/sched.h>
20#include <linux/seccomp.h>
13 21
14/* #define SECCOMP_DEBUG 1 */ 22/* #define SECCOMP_DEBUG 1 */
15#define NR_SECCOMP_MODES 1 23
24#ifdef CONFIG_SECCOMP_FILTER
25#include <asm/syscall.h>
26#include <linux/filter.h>
27#include <linux/ptrace.h>
28#include <linux/security.h>
29#include <linux/slab.h>
30#include <linux/tracehook.h>
31#include <linux/uaccess.h>
32
33/**
34 * struct seccomp_filter - container for seccomp BPF programs
35 *
36 * @usage: reference count to manage the object lifetime.
37 * get/put helpers should be used when accessing an instance
38 * outside of a lifetime-guarded section. In general, this
39 * is only needed for handling filters shared across tasks.
40 * @prev: points to a previously installed, or inherited, filter
41 * @len: the number of instructions in the program
42 * @insns: the BPF program instructions to evaluate
43 *
44 * seccomp_filter objects are organized in a tree linked via the @prev
45 * pointer. For any task, it appears to be a singly-linked list starting
46 * with current->seccomp.filter, the most recently attached or inherited filter.
47 * However, multiple filters may share a @prev node, by way of fork(), which
48 * results in a unidirectional tree existing in memory. This is similar to
49 * how namespaces work.
50 *
51 * seccomp_filter objects should never be modified after being attached
52 * to a task_struct (other than @usage).
53 */
54struct seccomp_filter {
55 atomic_t usage;
56 struct seccomp_filter *prev;
57 unsigned short len; /* Instruction count */
58 struct sock_filter insns[];
59};
60
61/* Limit any path through the tree to 256KB worth of instructions. */
62#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
63
64/**
65 * get_u32 - returns a u32 offset into data
 66 * @data: an unsigned 64-bit value
67 * @index: 0 or 1 to return the first or second 32-bits
68 *
69 * This inline exists to hide the length of unsigned long. If a 32-bit
70 * unsigned long is passed in, it will be extended and the top 32-bits will be
71 * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
72 * properly returned.
73 *
74 * Endianness is explicitly ignored and left for BPF program authors to manage
75 * as per the specific architecture.
76 */
77static inline u32 get_u32(u64 data, int index)
78{
79 return ((u32 *)&data)[index];
80}
81
82/* Helper for bpf_load below. */
83#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
84/**
 85 * seccomp_bpf_load: checks and returns the data at the requested offset
86 * @off: offset into struct seccomp_data to load from
87 *
88 * Returns the requested 32-bits of data.
89 * seccomp_check_filter() should assure that @off is 32-bit aligned
90 * and not out of bounds. Failure to do so is a BUG.
91 */
92u32 seccomp_bpf_load(int off)
93{
94 struct pt_regs *regs = task_pt_regs(current);
95 if (off == BPF_DATA(nr))
96 return syscall_get_nr(current, regs);
97 if (off == BPF_DATA(arch))
98 return syscall_get_arch(current, regs);
99 if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
100 unsigned long value;
101 int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
102 int index = !!(off % sizeof(u64));
103 syscall_get_arguments(current, regs, arg, 1, &value);
104 return get_u32(value, index);
105 }
106 if (off == BPF_DATA(instruction_pointer))
107 return get_u32(KSTK_EIP(current), 0);
108 if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
109 return get_u32(KSTK_EIP(current), 1);
110 /* seccomp_check_filter should make this impossible. */
111 BUG();
112}
113
114/**
115 * seccomp_check_filter - verify seccomp filter code
116 * @filter: filter to verify
117 * @flen: length of filter
118 *
119 * Takes a previously checked filter (by sk_chk_filter) and
120 * redirects all filter code that loads struct sk_buff data
121 * and related data through seccomp_bpf_load. It also
122 * enforces length and alignment checking of those loads.
123 *
124 * Returns 0 if the rule set is legal or -EINVAL if not.
125 */
126static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
127{
128 int pc;
129 for (pc = 0; pc < flen; pc++) {
130 struct sock_filter *ftest = &filter[pc];
131 u16 code = ftest->code;
132 u32 k = ftest->k;
133
134 switch (code) {
135 case BPF_S_LD_W_ABS:
136 ftest->code = BPF_S_ANC_SECCOMP_LD_W;
137 /* 32-bit aligned and not out of bounds. */
138 if (k >= sizeof(struct seccomp_data) || k & 3)
139 return -EINVAL;
140 continue;
141 case BPF_S_LD_W_LEN:
142 ftest->code = BPF_S_LD_IMM;
143 ftest->k = sizeof(struct seccomp_data);
144 continue;
145 case BPF_S_LDX_W_LEN:
146 ftest->code = BPF_S_LDX_IMM;
147 ftest->k = sizeof(struct seccomp_data);
148 continue;
149 /* Explicitly include allowed calls. */
150 case BPF_S_RET_K:
151 case BPF_S_RET_A:
152 case BPF_S_ALU_ADD_K:
153 case BPF_S_ALU_ADD_X:
154 case BPF_S_ALU_SUB_K:
155 case BPF_S_ALU_SUB_X:
156 case BPF_S_ALU_MUL_K:
157 case BPF_S_ALU_MUL_X:
158 case BPF_S_ALU_DIV_X:
159 case BPF_S_ALU_AND_K:
160 case BPF_S_ALU_AND_X:
161 case BPF_S_ALU_OR_K:
162 case BPF_S_ALU_OR_X:
163 case BPF_S_ALU_LSH_K:
164 case BPF_S_ALU_LSH_X:
165 case BPF_S_ALU_RSH_K:
166 case BPF_S_ALU_RSH_X:
167 case BPF_S_ALU_NEG:
168 case BPF_S_LD_IMM:
169 case BPF_S_LDX_IMM:
170 case BPF_S_MISC_TAX:
171 case BPF_S_MISC_TXA:
172 case BPF_S_ALU_DIV_K:
173 case BPF_S_LD_MEM:
174 case BPF_S_LDX_MEM:
175 case BPF_S_ST:
176 case BPF_S_STX:
177 case BPF_S_JMP_JA:
178 case BPF_S_JMP_JEQ_K:
179 case BPF_S_JMP_JEQ_X:
180 case BPF_S_JMP_JGE_K:
181 case BPF_S_JMP_JGE_X:
182 case BPF_S_JMP_JGT_K:
183 case BPF_S_JMP_JGT_X:
184 case BPF_S_JMP_JSET_K:
185 case BPF_S_JMP_JSET_X:
186 continue;
187 default:
188 return -EINVAL;
189 }
190 }
191 return 0;
192}
193
194/**
195 * seccomp_run_filters - evaluates all seccomp filters against @syscall
196 * @syscall: number of the current system call
197 *
198 * Returns valid seccomp BPF response codes.
199 */
200static u32 seccomp_run_filters(int syscall)
201{
202 struct seccomp_filter *f;
203 u32 ret = SECCOMP_RET_ALLOW;
204
205 /* Ensure unexpected behavior doesn't result in failing open. */
206 if (WARN_ON(current->seccomp.filter == NULL))
207 return SECCOMP_RET_KILL;
208
209 /*
210 * All filters in the list are evaluated and the lowest BPF return
211 * value always takes priority (ignoring the DATA).
212 */
213 for (f = current->seccomp.filter; f; f = f->prev) {
214 u32 cur_ret = sk_run_filter(NULL, f->insns);
215 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
216 ret = cur_ret;
217 }
218 return ret;
219}
220
221/**
222 * seccomp_attach_filter: Attaches a seccomp filter to current.
223 * @fprog: BPF program to install
224 *
225 * Returns 0 on success or an errno on failure.
226 */
227static long seccomp_attach_filter(struct sock_fprog *fprog)
228{
229 struct seccomp_filter *filter;
230 unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
231 unsigned long total_insns = fprog->len;
232 long ret;
233
234 if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
235 return -EINVAL;
236
237 for (filter = current->seccomp.filter; filter; filter = filter->prev)
238 total_insns += filter->len + 4; /* include a 4 instr penalty */
239 if (total_insns > MAX_INSNS_PER_PATH)
240 return -ENOMEM;
241
242 /*
243 * Installing a seccomp filter requires that the task have
244 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
245 * This avoids scenarios where unprivileged tasks can affect the
246 * behavior of privileged children.
247 */
248 if (!current->no_new_privs &&
249 security_capable_noaudit(current_cred(), current_user_ns(),
250 CAP_SYS_ADMIN) != 0)
251 return -EACCES;
252
253 /* Allocate a new seccomp_filter */
254 filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
255 GFP_KERNEL|__GFP_NOWARN);
256 if (!filter)
257 return -ENOMEM;
258 atomic_set(&filter->usage, 1);
259 filter->len = fprog->len;
260
261 /* Copy the instructions from fprog. */
262 ret = -EFAULT;
263 if (copy_from_user(filter->insns, fprog->filter, fp_size))
264 goto fail;
265
266 /* Check and rewrite the fprog via the skb checker */
267 ret = sk_chk_filter(filter->insns, filter->len);
268 if (ret)
269 goto fail;
270
271 /* Check and rewrite the fprog for seccomp use */
272 ret = seccomp_check_filter(filter->insns, filter->len);
273 if (ret)
274 goto fail;
275
276 /*
277 * If there is an existing filter, make it the prev and don't drop its
278 * task reference.
279 */
280 filter->prev = current->seccomp.filter;
281 current->seccomp.filter = filter;
282 return 0;
283fail:
284 kfree(filter);
285 return ret;
286}
287
288/**
289 * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
290 * @user_filter: pointer to the user data containing a sock_fprog.
291 *
292 * Returns 0 on success and non-zero otherwise.
293 */
294long seccomp_attach_user_filter(char __user *user_filter)
295{
296 struct sock_fprog fprog;
297 long ret = -EFAULT;
298
299#ifdef CONFIG_COMPAT
300 if (is_compat_task()) {
301 struct compat_sock_fprog fprog32;
302 if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
303 goto out;
304 fprog.len = fprog32.len;
305 fprog.filter = compat_ptr(fprog32.filter);
306 } else /* falls through to the if below. */
307#endif
308 if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
309 goto out;
310 ret = seccomp_attach_filter(&fprog);
311out:
312 return ret;
313}
314
315/* get_seccomp_filter - increments the reference count of the filter on @tsk */
316void get_seccomp_filter(struct task_struct *tsk)
317{
318 struct seccomp_filter *orig = tsk->seccomp.filter;
319 if (!orig)
320 return;
321 /* Reference count is bounded by the number of total processes. */
322 atomic_inc(&orig->usage);
323}
324
325/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
326void put_seccomp_filter(struct task_struct *tsk)
327{
328 struct seccomp_filter *orig = tsk->seccomp.filter;
329 /* Clean up single-reference branches iteratively. */
330 while (orig && atomic_dec_and_test(&orig->usage)) {
331 struct seccomp_filter *freeme = orig;
332 orig = orig->prev;
333 kfree(freeme);
334 }
335}
336
337/**
338 * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
339 * @syscall: syscall number to send to userland
340 * @reason: filter-supplied reason code to send to userland (via si_errno)
341 *
342 * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
343 */
344static void seccomp_send_sigsys(int syscall, int reason)
345{
346 struct siginfo info;
347 memset(&info, 0, sizeof(info));
348 info.si_signo = SIGSYS;
349 info.si_code = SYS_SECCOMP;
350 info.si_call_addr = (void __user *)KSTK_EIP(current);
351 info.si_errno = reason;
352 info.si_arch = syscall_get_arch(current, task_pt_regs(current));
353 info.si_syscall = syscall;
354 force_sig_info(SIGSYS, &info, current);
355}
356#endif /* CONFIG_SECCOMP_FILTER */
16 357
17/* 358/*
18 * Secure computing mode 1 allows only read/write/exit/sigreturn. 359 * Secure computing mode 1 allows only read/write/exit/sigreturn.
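When several filters are stacked via repeated prctl() calls, seccomp_run_filters() above evaluates all of them and keeps the result whose action field is numerically lowest, so the most restrictive verdict wins regardless of attachment order; the 16 data bits ride along with the winning result. A small user-space model of that rule (the constants mirror the SECCOMP_RET_* values introduced by this series; this is not the kernel code):

/* Model of the "lowest action wins" rule in seccomp_run_filters(). */
#include <stdio.h>

#define RET_KILL	0x00000000u
#define RET_TRAP	0x00030000u
#define RET_ERRNO	0x00050000u
#define RET_TRACE	0x7ff00000u
#define RET_ALLOW	0x7fff0000u
#define RET_ACTION	0x7fff0000u	/* mask selecting the action */
#define RET_DATA	0x0000ffffu	/* mask selecting the 16 data bits */

static unsigned int run_filters(const unsigned int *results, int n)
{
	unsigned int ret = RET_ALLOW;
	int i;

	for (i = 0; i < n; i++)
		if ((results[i] & RET_ACTION) < (ret & RET_ACTION))
			ret = results[i];
	return ret;
}

int main(void)
{
	/* One filter says "errno 13", a later one says "allow". */
	unsigned int stacked[] = { RET_ALLOW, RET_ERRNO | 13 };
	unsigned int ret = run_filters(stacked, 2);

	printf("action=%#x data=%u\n", ret & RET_ACTION, ret & RET_DATA);
	return 0;
}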
@@ -31,13 +372,15 @@ static int mode1_syscalls_32[] = {
31}; 372};
32#endif 373#endif
33 374
34void __secure_computing(int this_syscall) 375int __secure_computing(int this_syscall)
35{ 376{
36 int mode = current->seccomp.mode; 377 int mode = current->seccomp.mode;
37 int * syscall; 378 int exit_sig = 0;
379 int *syscall;
380 u32 ret;
38 381
39 switch (mode) { 382 switch (mode) {
40 case 1: 383 case SECCOMP_MODE_STRICT:
41 syscall = mode1_syscalls; 384 syscall = mode1_syscalls;
42#ifdef CONFIG_COMPAT 385#ifdef CONFIG_COMPAT
43 if (is_compat_task()) 386 if (is_compat_task())
@@ -45,9 +388,54 @@ void __secure_computing(int this_syscall)
45#endif 388#endif
46 do { 389 do {
47 if (*syscall == this_syscall) 390 if (*syscall == this_syscall)
48 return; 391 return 0;
49 } while (*++syscall); 392 } while (*++syscall);
393 exit_sig = SIGKILL;
394 ret = SECCOMP_RET_KILL;
395 break;
396#ifdef CONFIG_SECCOMP_FILTER
397 case SECCOMP_MODE_FILTER: {
398 int data;
399 ret = seccomp_run_filters(this_syscall);
400 data = ret & SECCOMP_RET_DATA;
401 ret &= SECCOMP_RET_ACTION;
402 switch (ret) {
403 case SECCOMP_RET_ERRNO:
404 /* Set the low-order 16 bits as an errno. */
405 syscall_set_return_value(current, task_pt_regs(current),
406 -data, 0);
407 goto skip;
408 case SECCOMP_RET_TRAP:
409 /* Show the handler the original registers. */
410 syscall_rollback(current, task_pt_regs(current));
411 /* Let the filter pass back 16 bits of data. */
412 seccomp_send_sigsys(this_syscall, data);
413 goto skip;
414 case SECCOMP_RET_TRACE:
415 /* Skip these calls if there is no tracer. */
416 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP))
417 goto skip;
418 /* Allow the BPF to provide the event message */
419 ptrace_event(PTRACE_EVENT_SECCOMP, data);
420 /*
421 * The delivery of a fatal signal during event
422 * notification may silently skip tracer notification.
423 * Terminating the task now avoids executing a system
424 * call that may not be intended.
425 */
426 if (fatal_signal_pending(current))
427 break;
428 return 0;
429 case SECCOMP_RET_ALLOW:
430 return 0;
431 case SECCOMP_RET_KILL:
432 default:
433 break;
434 }
435 exit_sig = SIGSYS;
50 break; 436 break;
437 }
438#endif
51 default: 439 default:
52 BUG(); 440 BUG();
53 } 441 }
@@ -55,8 +443,13 @@ void __secure_computing(int this_syscall)
55#ifdef SECCOMP_DEBUG 443#ifdef SECCOMP_DEBUG
56 dump_stack(); 444 dump_stack();
57#endif 445#endif
58 audit_seccomp(this_syscall); 446 audit_seccomp(this_syscall, exit_sig, ret);
59 do_exit(SIGKILL); 447 do_exit(exit_sig);
448#ifdef CONFIG_SECCOMP_FILTER
449skip:
450 audit_seccomp(this_syscall, exit_sig, ret);
451#endif
452 return -1;
60} 453}
61 454
62long prctl_get_seccomp(void) 455long prctl_get_seccomp(void)
@@ -64,25 +457,48 @@ long prctl_get_seccomp(void)
64 return current->seccomp.mode; 457 return current->seccomp.mode;
65} 458}
66 459
67long prctl_set_seccomp(unsigned long seccomp_mode) 460/**
461 * prctl_set_seccomp: configures current->seccomp.mode
462 * @seccomp_mode: requested mode to use
463 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
464 *
465 * This function may be called repeatedly with a @seccomp_mode of
466 * SECCOMP_MODE_FILTER to install additional filters. Every filter
467 * successfully installed will be evaluated (in reverse order) for each system
468 * call the task makes.
469 *
470 * Once current->seccomp.mode is non-zero, it may not be changed.
471 *
472 * Returns 0 on success or -EINVAL on failure.
473 */
474long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
68{ 475{
69 long ret; 476 long ret = -EINVAL;
70 477
71 /* can set it only once to be even more secure */ 478 if (current->seccomp.mode &&
72 ret = -EPERM; 479 current->seccomp.mode != seccomp_mode)
73 if (unlikely(current->seccomp.mode))
74 goto out; 480 goto out;
75 481
76 ret = -EINVAL; 482 switch (seccomp_mode) {
77 if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { 483 case SECCOMP_MODE_STRICT:
78 current->seccomp.mode = seccomp_mode; 484 ret = 0;
79 set_thread_flag(TIF_SECCOMP);
80#ifdef TIF_NOTSC 485#ifdef TIF_NOTSC
81 disable_TSC(); 486 disable_TSC();
82#endif 487#endif
83 ret = 0; 488 break;
489#ifdef CONFIG_SECCOMP_FILTER
490 case SECCOMP_MODE_FILTER:
491 ret = seccomp_attach_user_filter(filter);
492 if (ret)
493 goto out;
494 break;
495#endif
496 default:
497 goto out;
84 } 498 }
85 499
86 out: 500 current->seccomp.mode = seccomp_mode;
501 set_thread_flag(TIF_SECCOMP);
502out:
87 return ret; 503 return ret;
88} 504}
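For reference, this is roughly what a userland consumer of the new interface looks like: build a classic-BPF program over struct seccomp_data, set no_new_privs (required here when the task lacks CAP_SYS_ADMIN), and install it with PR_SET_SECCOMP/SECCOMP_MODE_FILTER. The choice of __NR_uname as the denied syscall and the PR_SET_NO_NEW_PRIVS fallback value are illustrative assumptions, not part of the patch:

/* User-space sketch: install a mode-2 filter that denies uname() with EPERM. */
#include <errno.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <sys/utsname.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

#ifndef PR_SET_NO_NEW_PRIVS
#define PR_SET_NO_NEW_PRIVS 38	/* assumed value from this patch series */
#endif

int main(void)
{
	struct utsname un;
	struct sock_filter filter[] = {
		/* Load the syscall number: offset 0 of struct seccomp_data. */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		/* Deny uname with EPERM, allow everything else. */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_uname, 0, 1),
		BPF_STMT(BPF_RET | BPF_K,
			 SECCOMP_RET_ERRNO | (EPERM & SECCOMP_RET_DATA)),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(filter) / sizeof(filter[0]),
		.filter = filter,
	};

	/* Without CAP_SYS_ADMIN, seccomp_attach_filter() requires no_new_privs. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		perror("PR_SET_NO_NEW_PRIVS");
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
		perror("PR_SET_SECCOMP");

	if (uname(&un) < 0)
		printf("uname blocked: %s\n", strerror(errno));	/* EPERM */
	else
		printf("uname allowed: %s\n", un.sysname);
	return 0;
}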
diff --git a/kernel/signal.c b/kernel/signal.c
index 17afcaf582d0..1a006b5d9d9d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -160,7 +160,7 @@ void recalc_sigpending(void)
160 160
161#define SYNCHRONOUS_MASK \ 161#define SYNCHRONOUS_MASK \
162 (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ 162 (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
163 sigmask(SIGTRAP) | sigmask(SIGFPE)) 163 sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS))
164 164
165int next_signal(struct sigpending *pending, sigset_t *mask) 165int next_signal(struct sigpending *pending, sigset_t *mask)
166{ 166{
@@ -2706,6 +2706,13 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2706 err |= __put_user(from->si_uid, &to->si_uid); 2706 err |= __put_user(from->si_uid, &to->si_uid);
2707 err |= __put_user(from->si_ptr, &to->si_ptr); 2707 err |= __put_user(from->si_ptr, &to->si_ptr);
2708 break; 2708 break;
2709#ifdef __ARCH_SIGSYS
2710 case __SI_SYS:
2711 err |= __put_user(from->si_call_addr, &to->si_call_addr);
2712 err |= __put_user(from->si_syscall, &to->si_syscall);
2713 err |= __put_user(from->si_arch, &to->si_arch);
2714 break;
2715#endif
2709 default: /* this is just in case for now ... */ 2716 default: /* this is just in case for now ... */
2710 err |= __put_user(from->si_pid, &to->si_pid); 2717 err |= __put_user(from->si_pid, &to->si_pid);
2711 err |= __put_user(from->si_uid, &to->si_uid); 2718 err |= __put_user(from->si_uid, &to->si_uid);
diff --git a/kernel/smp.c b/kernel/smp.c
index 2f8b10ecf759..d0ae5b24875e 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,8 @@
13#include <linux/smp.h> 13#include <linux/smp.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15 15
16#include "smpboot.h"
17
16#ifdef CONFIG_USE_GENERIC_SMP_HELPERS 18#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
17static struct { 19static struct {
18 struct list_head queue; 20 struct list_head queue;
@@ -669,6 +671,8 @@ void __init smp_init(void)
669{ 671{
670 unsigned int cpu; 672 unsigned int cpu;
671 673
674 idle_threads_init();
675
672 /* FIXME: This should be done in userspace --RR */ 676 /* FIXME: This should be done in userspace --RR */
673 for_each_present_cpu(cpu) { 677 for_each_present_cpu(cpu) {
674 if (num_online_cpus() >= setup_max_cpus) 678 if (num_online_cpus() >= setup_max_cpus)
@@ -791,3 +795,26 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
791 } 795 }
792} 796}
793EXPORT_SYMBOL(on_each_cpu_cond); 797EXPORT_SYMBOL(on_each_cpu_cond);
798
799static void do_nothing(void *unused)
800{
801}
802
803/**
804 * kick_all_cpus_sync - Force all cpus out of idle
805 *
806 * Used to synchronize the update of the pm_idle function pointer. It's
807 * called after the pointer is updated and returns after the dummy
808 * callback function has been executed on all cpus. The execution of
809 * the function can only happen on the remote cpus after they have
810 * left the idle function which had been called via pm_idle function
811 * pointer. So it's guaranteed that nothing uses the previous pointer
812 * anymore.
813 */
814void kick_all_cpus_sync(void)
815{
816 /* Make sure the change is visible before we kick the cpus */
817 smp_mb();
818 smp_call_function(do_nothing, NULL, 1);
819}
820EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
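A hypothetical caller of the new helper, sketched in kernel style (pm_idle is the example the comment itself gives; this fragment is illustrative and not taken from the patch): publish the replacement pointer first, then use kick_all_cpus_sync() to guarantee that no CPU is still executing through the old one before tearing it down.

/* Illustrative kernel-style sketch, not from this patch. */
#include <linux/smp.h>

extern void (*pm_idle)(void);	/* arch-provided idle hook (assumed) */
void new_idle(void);		/* hypothetical replacement handler */

static void switch_idle_handler(void)
{
	pm_idle = new_idle;	/* publish the replacement handler */
	kick_all_cpus_sync();	/* no CPU is still inside the old handler */
	/* the old handler and anything it referenced may now be freed */
}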
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
new file mode 100644
index 000000000000..e1a797e028a3
--- /dev/null
+++ b/kernel/smpboot.c
@@ -0,0 +1,62 @@
1/*
2 * Common SMP CPU bringup/teardown functions
3 */
4#include <linux/err.h>
5#include <linux/smp.h>
6#include <linux/init.h>
7#include <linux/sched.h>
8#include <linux/percpu.h>
9
10#include "smpboot.h"
11
12#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
13/*
14 * For the hotplug case we keep the task structs around and reuse
15 * them.
16 */
17static DEFINE_PER_CPU(struct task_struct *, idle_threads);
18
19struct task_struct * __cpuinit idle_thread_get(unsigned int cpu)
20{
21 struct task_struct *tsk = per_cpu(idle_threads, cpu);
22
23 if (!tsk)
24 return ERR_PTR(-ENOMEM);
25 init_idle(tsk, cpu);
26 return tsk;
27}
28
29void __init idle_thread_set_boot_cpu(void)
30{
31 per_cpu(idle_threads, smp_processor_id()) = current;
32}
33
34static inline void idle_init(unsigned int cpu)
35{
36 struct task_struct *tsk = per_cpu(idle_threads, cpu);
37
38 if (!tsk) {
39 tsk = fork_idle(cpu);
40 if (IS_ERR(tsk))
41 pr_err("SMP: fork_idle() failed for CPU %u\n", cpu);
42 else
43 per_cpu(idle_threads, cpu) = tsk;
44 }
45}
46
47/**
48 * idle_threads_init - Initialize the idle threads for all non-boot CPUs
49 * (the boot CPU's idle thread is registered via idle_thread_set_boot_cpu())
50 *
51 * Creates the thread if it does not exist.
52 */
53void __init idle_threads_init(void)
54{
55 unsigned int cpu;
56
57 for_each_possible_cpu(cpu) {
58 if (cpu != smp_processor_id())
59 idle_init(cpu);
60 }
61}
62#endif
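The point of the new smpboot.c is that an idle task is forked once per CPU and then cached, so repeated CPU hotplug cycles reuse the existing task_struct instead of allocating a new one each time. A trivial user-space model of that create-once/reuse pattern (not the kernel code):

/* Model of the per-CPU idle-thread cache. */
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4

struct task { int cpu; };

static struct task *idle_threads[NR_CPUS];

static struct task *fork_idle(int cpu)	/* stand-in for the kernel helper */
{
	struct task *t = malloc(sizeof(*t));

	if (t)
		t->cpu = cpu;
	return t;
}

static void idle_init(int cpu)
{
	if (!idle_threads[cpu]) {		/* create once ... */
		idle_threads[cpu] = fork_idle(cpu);
		if (!idle_threads[cpu])
			fprintf(stderr, "fork_idle() failed for CPU %d\n", cpu);
	}
}

static struct task *idle_thread_get(int cpu)
{
	return idle_threads[cpu];		/* ... reuse on every later onlining */
}

int main(void)
{
	int cpu;

	for (cpu = 1; cpu < NR_CPUS; cpu++)	/* boot CPU (0) already has one */
		idle_init(cpu);
	printf("CPU 2 idle task cached at %p\n", (void *)idle_thread_get(2));
	idle_init(2);				/* second call is a no-op */
	printf("still cached at %p after re-init\n", (void *)idle_thread_get(2));
	return 0;
}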
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
new file mode 100644
index 000000000000..80c0acfb8472
--- /dev/null
+++ b/kernel/smpboot.h
@@ -0,0 +1,18 @@
1#ifndef SMPBOOT_H
2#define SMPBOOT_H
3
4struct task_struct;
5
6int smpboot_prepare(unsigned int cpu);
7
8#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
9struct task_struct *idle_thread_get(unsigned int cpu);
10void idle_thread_set_boot_cpu(void);
11void idle_threads_init(void);
12#else
13static inline struct task_struct *idle_thread_get(unsigned int cpu) { return NULL; }
14static inline void idle_thread_set_boot_cpu(void) { }
15static inline void idle_threads_init(void) { }
16#endif
17
18#endif
diff --git a/kernel/srcu.c b/kernel/srcu.c
index ba35f3a4a1f4..2095be3318d5 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -34,10 +34,77 @@
34#include <linux/delay.h> 34#include <linux/delay.h>
35#include <linux/srcu.h> 35#include <linux/srcu.h>
36 36
37/*
38 * Initialize an rcu_batch structure to empty.
39 */
40static inline void rcu_batch_init(struct rcu_batch *b)
41{
42 b->head = NULL;
43 b->tail = &b->head;
44}
45
46/*
47 * Enqueue a callback onto the tail of the specified rcu_batch structure.
48 */
49static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head)
50{
51 *b->tail = head;
52 b->tail = &head->next;
53}
54
55/*
56 * Is the specified rcu_batch structure empty?
57 */
58static inline bool rcu_batch_empty(struct rcu_batch *b)
59{
60 return b->tail == &b->head;
61}
62
63/*
64 * Remove the callback at the head of the specified rcu_batch structure
65 * and return a pointer to it, or return NULL if the structure is empty.
66 */
67static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b)
68{
69 struct rcu_head *head;
70
71 if (rcu_batch_empty(b))
72 return NULL;
73
74 head = b->head;
75 b->head = head->next;
76 if (b->tail == &head->next)
77 rcu_batch_init(b);
78
79 return head;
80}
81
82/*
83 * Move all callbacks from the rcu_batch structure specified by "from" to
84 * the structure specified by "to".
85 */
86static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
87{
88 if (!rcu_batch_empty(from)) {
89 *to->tail = from->head;
90 to->tail = from->tail;
91 rcu_batch_init(from);
92 }
93}
94
95/* single-thread state-machine */
96static void process_srcu(struct work_struct *work);
97
37static int init_srcu_struct_fields(struct srcu_struct *sp) 98static int init_srcu_struct_fields(struct srcu_struct *sp)
38{ 99{
39 sp->completed = 0; 100 sp->completed = 0;
40 mutex_init(&sp->mutex); 101 spin_lock_init(&sp->queue_lock);
102 sp->running = false;
103 rcu_batch_init(&sp->batch_queue);
104 rcu_batch_init(&sp->batch_check0);
105 rcu_batch_init(&sp->batch_check1);
106 rcu_batch_init(&sp->batch_done);
107 INIT_DELAYED_WORK(&sp->work, process_srcu);
41 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); 108 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
42 return sp->per_cpu_ref ? 0 : -ENOMEM; 109 return sp->per_cpu_ref ? 0 : -ENOMEM;
43} 110}
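The rcu_batch helpers added above implement a singly linked list with a tail pointer, giving O(1) enqueue, dequeue, and whole-batch splicing as callbacks advance through the queue/check0/check1/done stages of the new SRCU state machine. A user-space rendering of the same helpers (next-pointer handling slightly simplified; not the kernel code):

/* Tail-pointer batch list, modelled after the rcu_batch helpers. */
#include <stdbool.h>
#include <stdio.h>

struct head {
	struct head *next;
	int id;
};

struct batch {
	struct head *head;
	struct head **tail;
};

static void batch_init(struct batch *b)
{
	b->head = NULL;
	b->tail = &b->head;
}

static bool batch_empty(struct batch *b)
{
	return b->tail == &b->head;
}

static void batch_queue(struct batch *b, struct head *h)
{
	h->next = NULL;		/* the kernel leaves this to the caller */
	*b->tail = h;
	b->tail = &h->next;
}

static struct head *batch_dequeue(struct batch *b)
{
	struct head *h;

	if (batch_empty(b))
		return NULL;
	h = b->head;
	b->head = h->next;
	if (b->tail == &h->next)	/* removed the last element */
		batch_init(b);
	return h;
}

static void batch_move(struct batch *to, struct batch *from)
{
	if (!batch_empty(from)) {
		*to->tail = from->head;
		to->tail = from->tail;
		batch_init(from);
	}
}

int main(void)
{
	struct batch queue, done;
	struct head a = { .id = 1 }, b = { .id = 2 };
	struct head *h;

	batch_init(&queue);
	batch_init(&done);
	batch_queue(&queue, &a);
	batch_queue(&queue, &b);
	batch_move(&done, &queue);		/* advance a whole batch at once */
	while ((h = batch_dequeue(&done)))
		printf("invoke callback %d\n", h->id);
	return 0;
}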
@@ -73,21 +140,116 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
73#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 140#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
74 141
75/* 142/*
76 * srcu_readers_active_idx -- returns approximate number of readers 143 * Returns approximate total of the readers' ->seq[] values for the
77 * active on the specified rank of per-CPU counters. 144 * rank of per-CPU counters specified by idx.
78 */ 145 */
146static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
147{
148 int cpu;
149 unsigned long sum = 0;
150 unsigned long t;
79 151
80static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) 152 for_each_possible_cpu(cpu) {
153 t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
154 sum += t;
155 }
156 return sum;
157}
158
159/*
160 * Returns approximate number of readers active on the specified rank
161 * of the per-CPU ->c[] counters.
162 */
163static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
81{ 164{
82 int cpu; 165 int cpu;
83 int sum; 166 unsigned long sum = 0;
167 unsigned long t;
84 168
85 sum = 0; 169 for_each_possible_cpu(cpu) {
86 for_each_possible_cpu(cpu) 170 t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
87 sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]; 171 sum += t;
172 }
88 return sum; 173 return sum;
89} 174}
90 175
176/*
177 * Return true if the number of pre-existing readers is determined to
178 * be stably zero. An example unstable zero can occur if the call
179 * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
180 * but due to task migration, sees the corresponding __srcu_read_unlock()
181 * decrement. This can happen because srcu_readers_active_idx() takes
182 * time to sum the array, and might in fact be interrupted or preempted
183 * partway through the summation.
184 */
185static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
186{
187 unsigned long seq;
188
189 seq = srcu_readers_seq_idx(sp, idx);
190
191 /*
192 * The following smp_mb() A pairs with the smp_mb() B located in
193 * __srcu_read_lock(). This pairing ensures that if an
194 * __srcu_read_lock() increments its counter after the summation
195 * in srcu_readers_active_idx(), then the corresponding SRCU read-side
196 * critical section will see any changes made prior to the start
197 * of the current SRCU grace period.
198 *
199 * Also, if the above call to srcu_readers_seq_idx() saw the
200 * increment of ->seq[], then the call to srcu_readers_active_idx()
201 * must see the increment of ->c[].
202 */
203 smp_mb(); /* A */
204
205 /*
206 * Note that srcu_readers_active_idx() can incorrectly return
207 * zero even though there is a pre-existing reader throughout.
208 * To see this, suppose that task A is in a very long SRCU
209 * read-side critical section that started on CPU 0, and that
210 * no other reader exists, so that the sum of the counters
211 * is equal to one. Then suppose that task B starts executing
212 * srcu_readers_active_idx(), summing up to CPU 1, and then that
213 * task C starts reading on CPU 0, so that its increment is not
214 * summed, but finishes reading on CPU 2, so that its decrement
215 * -is- summed. Then when task B completes its sum, it will
216 * incorrectly get zero, despite the fact that task A has been
217 * in its SRCU read-side critical section the whole time.
218 *
219 * We therefore do a validation step should srcu_readers_active_idx()
220 * return zero.
221 */
222 if (srcu_readers_active_idx(sp, idx) != 0)
223 return false;
224
225 /*
226 * The remainder of this function is the validation step.
227 * The following smp_mb() D pairs with the smp_mb() C in
228 * __srcu_read_unlock(). If the __srcu_read_unlock() was seen
229 * by srcu_readers_active_idx() above, then any destructive
230 * operation performed after the grace period will happen after
231 * the corresponding SRCU read-side critical section.
232 *
233 * Note that there can be at most NR_CPUS worth of readers using
234 * the old index, which is not enough to overflow even a 32-bit
235 * integer. (Yes, this does mean that systems having more than
236 * a billion or so CPUs need to be 64-bit systems.) Therefore,
237 * the sum of the ->seq[] counters cannot possibly overflow.
238 * Therefore, the only way that the return values of the two
239 * calls to srcu_readers_seq_idx() can be equal is if there were
240 * no increments of the corresponding rank of ->seq[] counts
241 * in the interim. But the missed-increment scenario laid out
242 * above includes an increment of the ->seq[] counter by
243 * the corresponding __srcu_read_lock(). Therefore, if this
244 * scenario occurs, the return values from the two calls to
245 * srcu_readers_seq_idx() will differ, and thus the validation
246 * step below suffices.
247 */
248 smp_mb(); /* D */
249
250 return srcu_readers_seq_idx(sp, idx) == seq;
251}
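
To make the missed-increment scenario described above concrete, here is a deterministic user-space sketch (plain ints in place of the real per-CPU variables and ACCESS_ONCE() accesses, with the interleaving spelled out by hand). It shows how a partial sum can report zero readers while task A is still inside its read-side critical section; the ->seq[] re-check above is what catches this case.

#include <stdio.h>

#define NR_CPUS 3

static int c[NR_CPUS];	/* models the per-CPU ->c[idx] active-reader counts */

int main(void)
{
	int sum = 0;

	/* Task A: enters a long SRCU read-side critical section on CPU 0. */
	c[0]++;			/* true number of readers is now 1 */

	/* Task B: starts summing and gets as far as CPU 0 and CPU 1. */
	sum += c[0];		/* sees A's increment: sum == 1 */
	sum += c[1];		/* sum still 1 */

	/* Task C: locks on CPU 0 (missed by B), migrates, unlocks on CPU 2. */
	c[0]++;			/* B already passed CPU 0, so this is not seen... */
	c[2]--;			/* ...but the matching decrement *will* be seen */

	/* Task B: finishes the sum with CPU 2. */
	sum += c[2];		/* sum == 1 + 0 + (-1) == 0 */

	printf("summed readers = %d (but task A is still reading!)\n", sum);
	return 0;
}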
252
91/** 253/**
92 * srcu_readers_active - returns approximate number of readers. 254 * srcu_readers_active - returns approximate number of readers.
93 * @sp: which srcu_struct to count active readers (holding srcu_read_lock). 255 * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
@@ -98,7 +260,14 @@ static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
98 */ 260 */
99static int srcu_readers_active(struct srcu_struct *sp) 261static int srcu_readers_active(struct srcu_struct *sp)
100{ 262{
101 return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); 263 int cpu;
264 unsigned long sum = 0;
265
266 for_each_possible_cpu(cpu) {
267 sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
268 sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
269 }
270 return sum;
102} 271}
103 272
104/** 273/**
@@ -131,10 +300,11 @@ int __srcu_read_lock(struct srcu_struct *sp)
131 int idx; 300 int idx;
132 301
133 preempt_disable(); 302 preempt_disable();
134 idx = sp->completed & 0x1; 303 idx = rcu_dereference_index_check(sp->completed,
135 barrier(); /* ensure compiler looks -once- at sp->completed. */ 304 rcu_read_lock_sched_held()) & 0x1;
136 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++; 305 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
137 srcu_barrier(); /* ensure compiler won't misorder critical section. */ 306 smp_mb(); /* B */ /* Avoid leaking the critical section. */
307 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
138 preempt_enable(); 308 preempt_enable();
139 return idx; 309 return idx;
140} 310}
@@ -149,8 +319,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
149void __srcu_read_unlock(struct srcu_struct *sp, int idx) 319void __srcu_read_unlock(struct srcu_struct *sp, int idx)
150{ 320{
151 preempt_disable(); 321 preempt_disable();
152 srcu_barrier(); /* ensure compiler won't misorder critical section. */ 322 smp_mb(); /* C */ /* Avoid leaking the critical section. */
153 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; 323 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1;
154 preempt_enable(); 324 preempt_enable();
155} 325}
156EXPORT_SYMBOL_GPL(__srcu_read_unlock); 326EXPORT_SYMBOL_GPL(__srcu_read_unlock);
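
As a reminder of how these primitives are consumed, here is a hedged reader-side sketch (my_srcu, my_ptr and my_read() are invented names for illustration); callers normally go through srcu_read_lock()/srcu_read_unlock() and srcu_dereference() rather than the double-underscore functions patched above.

#include <linux/srcu.h>
#include <linux/rcupdate.h>

/* Illustrative only: assume these are defined and published elsewhere. */
static struct srcu_struct my_srcu;		/* init_srcu_struct(&my_srcu) at setup */
struct my_data { int value; };
static struct my_data __rcu *my_ptr;

static int my_read(void)
{
	struct my_data *p;
	int idx, val = -1;

	idx = srcu_read_lock(&my_srcu);		/* returns the index to pass to unlock */
	p = srcu_dereference(my_ptr, &my_srcu);	/* safe to dereference until unlock */
	if (p)
		val = p->value;
	srcu_read_unlock(&my_srcu, idx);	/* SRCU readers may sleep in between, unlike RCU */
	return val;
}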
@@ -163,106 +333,119 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
163 * we repeatedly block for 1-millisecond time periods. This approach 333 * we repeatedly block for 1-millisecond time periods. This approach
164 * has done well in testing, so there is no need for a config parameter. 334 * has done well in testing, so there is no need for a config parameter.
165 */ 335 */
166#define SYNCHRONIZE_SRCU_READER_DELAY 10 336#define SRCU_RETRY_CHECK_DELAY 5
337#define SYNCHRONIZE_SRCU_TRYCOUNT 2
338#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12
167 339
168/* 340/*
169 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 341 * Wait until all pre-existing readers complete. Such readers
 342 * will have used the index specified by "idx".
 343 * The caller must ensure that ->completed is not changed while checking,
 344 * and that idx == (->completed & 1) ^ 1.
170 */ 345 */
171static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) 346static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
172{ 347{
173 int idx; 348 for (;;) {
174 349 if (srcu_readers_active_idx_check(sp, idx))
175 rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && 350 return true;
176 !lock_is_held(&rcu_bh_lock_map) && 351 if (--trycount <= 0)
177 !lock_is_held(&rcu_lock_map) && 352 return false;
178 !lock_is_held(&rcu_sched_lock_map), 353 udelay(SRCU_RETRY_CHECK_DELAY);
179 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); 354 }
180 355}
181 idx = sp->completed;
182 mutex_lock(&sp->mutex);
183 356
184 /* 357/*
185 * Check to see if someone else did the work for us while we were 358 * Increment the ->completed counter so that future SRCU readers will
186 * waiting to acquire the lock. We need -two- advances of 359 * use the other rank of the ->c[] and ->seq[] arrays. This allows
187 * the counter, not just one. If there was but one, we might have 360 * us to wait for pre-existing readers in a starvation-free manner.
188 * shown up -after- our helper's first synchronize_sched(), thus 361 */
189 * having failed to prevent CPU-reordering races with concurrent 362static void srcu_flip(struct srcu_struct *sp)
190 * srcu_read_unlock()s on other CPUs (see comment below). So we 363{
191 * either (1) wait for two or (2) supply the second ourselves. 364 sp->completed++;
192 */ 365}
193 366
194 if ((sp->completed - idx) >= 2) { 367/*
195 mutex_unlock(&sp->mutex); 368 * Enqueue an SRCU callback on the specified srcu_struct structure,
196 return; 369 * initiating grace-period processing if it is not already running.
370 */
371void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
372 void (*func)(struct rcu_head *head))
373{
374 unsigned long flags;
375
376 head->next = NULL;
377 head->func = func;
378 spin_lock_irqsave(&sp->queue_lock, flags);
379 rcu_batch_queue(&sp->batch_queue, head);
380 if (!sp->running) {
381 sp->running = true;
382 queue_delayed_work(system_nrt_wq, &sp->work, 0);
197 } 383 }
384 spin_unlock_irqrestore(&sp->queue_lock, flags);
385}
386EXPORT_SYMBOL_GPL(call_srcu);
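
A minimal sketch of how the new call_srcu() interface is expected to be used (the names below are hypothetical): the rcu_head is embedded in the protected object, and the callback reclaims the object once a grace period has elapsed. Note that callbacks are invoked from workqueue context with bottom halves disabled (see srcu_invoke_callbacks() below), so they must not sleep.

#include <linux/srcu.h>
#include <linux/slab.h>

/* Illustrative only. */
static struct srcu_struct my_srcu;

struct my_data {
	int value;
	struct rcu_head rcu;	/* embedded so container_of() can recover the object */
};

static void my_data_free_cb(struct rcu_head *head)
{
	struct my_data *p = container_of(head, struct my_data, rcu);

	kfree(p);		/* no SRCU reader can still hold a reference */
}

static void my_data_retire(struct my_data *old)
{
	/* Returns immediately; reclamation happens after a grace period. */
	call_srcu(&my_srcu, &old->rcu, my_data_free_cb);
}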
198 387
199 sync_func(); /* Force memory barrier on all CPUs. */ 388struct rcu_synchronize {
389 struct rcu_head head;
390 struct completion completion;
391};
200 392
201 /* 393/*
202 * The preceding synchronize_sched() ensures that any CPU that 394 * Awaken the corresponding synchronize_srcu() instance now that a
203 * sees the new value of sp->completed will also see any preceding 395 * grace period has elapsed.
204 * changes to data structures made by this CPU. This prevents 396 */
205 * some other CPU from reordering the accesses in its SRCU 397static void wakeme_after_rcu(struct rcu_head *head)
206 * read-side critical section to precede the corresponding 398{
207 * srcu_read_lock() -- ensuring that such references will in 399 struct rcu_synchronize *rcu;
208 * fact be protected.
209 *
210 * So it is now safe to do the flip.
211 */
212 400
213 idx = sp->completed & 0x1; 401 rcu = container_of(head, struct rcu_synchronize, head);
214 sp->completed++; 402 complete(&rcu->completion);
403}
215 404
216 sync_func(); /* Force memory barrier on all CPUs. */ 405static void srcu_advance_batches(struct srcu_struct *sp, int trycount);
406static void srcu_reschedule(struct srcu_struct *sp);
217 407
218 /* 408/*
219 * At this point, because of the preceding synchronize_sched(), 409 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
220 * all srcu_read_lock() calls using the old counters have completed. 410 */
221 * Their corresponding critical sections might well be still 411static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
222 * executing, but the srcu_read_lock() primitives themselves 412{
223 * will have finished executing. We initially give readers 413 struct rcu_synchronize rcu;
224 * an arbitrarily chosen 10 microseconds to get out of their 414 struct rcu_head *head = &rcu.head;
225 * SRCU read-side critical sections, then loop waiting 1/HZ 415 bool done = false;
226 * seconds per iteration. The 10-microsecond value has done
227 * very well in testing.
228 */
229
230 if (srcu_readers_active_idx(sp, idx))
231 udelay(SYNCHRONIZE_SRCU_READER_DELAY);
232 while (srcu_readers_active_idx(sp, idx))
233 schedule_timeout_interruptible(1);
234 416
235 sync_func(); /* Force memory barrier on all CPUs. */ 417 rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
418 !lock_is_held(&rcu_bh_lock_map) &&
419 !lock_is_held(&rcu_lock_map) &&
420 !lock_is_held(&rcu_sched_lock_map),
421 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
236 422
237 /* 423 init_completion(&rcu.completion);
238 * The preceding synchronize_sched() forces all srcu_read_unlock() 424
239 * primitives that were executing concurrently with the preceding 425 head->next = NULL;
240 * for_each_possible_cpu() loop to have completed by this point. 426 head->func = wakeme_after_rcu;
241 * More importantly, it also forces the corresponding SRCU read-side 427 spin_lock_irq(&sp->queue_lock);
242 * critical sections to have also completed, and the corresponding 428 if (!sp->running) {
243 * references to SRCU-protected data items to be dropped. 429 /* steal the processing owner */
244 * 430 sp->running = true;
245 * Note: 431 rcu_batch_queue(&sp->batch_check0, head);
246 * 432 spin_unlock_irq(&sp->queue_lock);
247 * Despite what you might think at first glance, the 433
248 * preceding synchronize_sched() -must- be within the 434 srcu_advance_batches(sp, trycount);
249 * critical section ended by the following mutex_unlock(). 435 if (!rcu_batch_empty(&sp->batch_done)) {
250 * Otherwise, a task taking the early exit can race 436 BUG_ON(sp->batch_done.head != head);
251 * with a srcu_read_unlock(), which might have executed 437 rcu_batch_dequeue(&sp->batch_done);
252 * just before the preceding srcu_readers_active() check, 438 done = true;
253 * and whose CPU might have reordered the srcu_read_unlock() 439 }
254 * with the preceding critical section. In this case, there 440 /* give the processing owner to work_struct */
255 * is nothing preventing the synchronize_sched() task that is 441 srcu_reschedule(sp);
256 * taking the early exit from freeing a data structure that 442 } else {
257 * is still being referenced (out of order) by the task 443 rcu_batch_queue(&sp->batch_queue, head);
258 * doing the srcu_read_unlock(). 444 spin_unlock_irq(&sp->queue_lock);
259 * 445 }
260 * Alternatively, the comparison with "2" on the early exit
261 * could be changed to "3", but this increases synchronize_srcu()
262 * latency for bulk loads. So the current code is preferred.
263 */
264 446
265 mutex_unlock(&sp->mutex); 447 if (!done)
448 wait_for_completion(&rcu.completion);
266} 449}
267 450
268/** 451/**
@@ -281,7 +464,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
281 */ 464 */
282void synchronize_srcu(struct srcu_struct *sp) 465void synchronize_srcu(struct srcu_struct *sp)
283{ 466{
284 __synchronize_srcu(sp, synchronize_sched); 467 __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT);
285} 468}
286EXPORT_SYMBOL_GPL(synchronize_srcu); 469EXPORT_SYMBOL_GPL(synchronize_srcu);
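
For completeness, the blocking counterpart: a hedged updater-side sketch (hypothetical names again) that unpublishes an object and waits for all pre-existing SRCU readers before freeing it, which is exactly the path that now funnels through __synchronize_srcu() above.

#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/mutex.h>

/* Illustrative only. */
static struct srcu_struct my_srcu;
struct my_data { int value; };
static struct my_data __rcu *my_ptr;
static DEFINE_MUTEX(my_lock);

static void my_data_replace(struct my_data *new)
{
	struct my_data *old;

	mutex_lock(&my_lock);
	old = rcu_dereference_protected(my_ptr, lockdep_is_held(&my_lock));
	rcu_assign_pointer(my_ptr, new);	/* publish the new version */
	mutex_unlock(&my_lock);

	synchronize_srcu(&my_srcu);		/* wait out all pre-existing readers */
	kfree(old);				/* nobody can still see "old" now */
}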
287 470
@@ -289,18 +472,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
289 * synchronize_srcu_expedited - Brute-force SRCU grace period 472 * synchronize_srcu_expedited - Brute-force SRCU grace period
290 * @sp: srcu_struct with which to synchronize. 473 * @sp: srcu_struct with which to synchronize.
291 * 474 *
292 * Wait for an SRCU grace period to elapse, but use a "big hammer" 475 * Wait for an SRCU grace period to elapse, but be more aggressive about
293 * approach to force the grace period to end quickly. This consumes 476 * spinning rather than blocking when waiting.
294 * significant time on all CPUs and is unfriendly to real-time workloads,
295 * so is thus not recommended for any sort of common-case code. In fact,
296 * if you are using synchronize_srcu_expedited() in a loop, please
297 * restructure your code to batch your updates, and then use a single
298 * synchronize_srcu() instead.
299 * 477 *
300 * Note that it is illegal to call this function while holding any lock 478 * Note that it is illegal to call this function while holding any lock
301 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal 479 * that is acquired by a CPU-hotplug notifier. It is also illegal to call
302 * to call this function from a CPU-hotplug notifier. Failing to observe
303 * these restriction will result in deadlock. It is also illegal to call
304 * synchronize_srcu_expedited() from the corresponding SRCU read-side 480 * synchronize_srcu_expedited() from the corresponding SRCU read-side
305 * critical section; doing so will result in deadlock. However, it is 481 * critical section; doing so will result in deadlock. However, it is
306 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct 482 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct
@@ -309,20 +485,166 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
309 */ 485 */
310void synchronize_srcu_expedited(struct srcu_struct *sp) 486void synchronize_srcu_expedited(struct srcu_struct *sp)
311{ 487{
312 __synchronize_srcu(sp, synchronize_sched_expedited); 488 __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT);
313} 489}
314EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); 490EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
315 491
316/** 492/**
493 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
494 */
495void srcu_barrier(struct srcu_struct *sp)
496{
497 synchronize_srcu(sp);
498}
499EXPORT_SYMBOL_GPL(srcu_barrier);
500
501/**
317 * srcu_batches_completed - return batches completed. 502 * srcu_batches_completed - return batches completed.
318 * @sp: srcu_struct on which to report batch completion. 503 * @sp: srcu_struct on which to report batch completion.
319 * 504 *
320 * Report the number of batches, correlated with, but not necessarily 505 * Report the number of batches, correlated with, but not necessarily
321 * precisely the same as, the number of grace periods that have elapsed. 506 * precisely the same as, the number of grace periods that have elapsed.
322 */ 507 */
323
324long srcu_batches_completed(struct srcu_struct *sp) 508long srcu_batches_completed(struct srcu_struct *sp)
325{ 509{
326 return sp->completed; 510 return sp->completed;
327} 511}
328EXPORT_SYMBOL_GPL(srcu_batches_completed); 512EXPORT_SYMBOL_GPL(srcu_batches_completed);
513
514#define SRCU_CALLBACK_BATCH 10
515#define SRCU_INTERVAL 1
516
517/*
518 * Move any new SRCU callbacks to the first stage of the SRCU grace
519 * period pipeline.
520 */
521static void srcu_collect_new(struct srcu_struct *sp)
522{
523 if (!rcu_batch_empty(&sp->batch_queue)) {
524 spin_lock_irq(&sp->queue_lock);
525 rcu_batch_move(&sp->batch_check0, &sp->batch_queue);
526 spin_unlock_irq(&sp->queue_lock);
527 }
528}
529
530/*
531 * Core SRCU state machine. Advance callbacks from ->batch_check0 to
532 * ->batch_check1 and then to ->batch_done as readers drain.
533 */
534static void srcu_advance_batches(struct srcu_struct *sp, int trycount)
535{
536 int idx = 1 ^ (sp->completed & 1);
537
538 /*
539 * Because readers might be delayed for an extended period after
540 * fetching ->completed for their index, at any point in time there
541 * might well be readers using both idx=0 and idx=1. We therefore
542 * need to wait for readers to clear from both index values before
543 * invoking a callback.
544 */
545
546 if (rcu_batch_empty(&sp->batch_check0) &&
547 rcu_batch_empty(&sp->batch_check1))
548 return; /* no callbacks need to be advanced */
549
550 if (!try_check_zero(sp, idx, trycount))
551 return; /* failed to advance, will try after SRCU_INTERVAL */
552
553 /*
 554 * The callbacks in ->batch_check1 have already had their first
 555 * zero check and counter flip, back when they were enqueued on
 556 * ->batch_check0 in a previous invocation of srcu_advance_batches().
 557 * (Presumably try_check_zero() returned false during that
 558 * invocation, leaving the callbacks stranded on ->batch_check1.)
559 * They are therefore ready to invoke, so move them to ->batch_done.
560 */
561 rcu_batch_move(&sp->batch_done, &sp->batch_check1);
562
563 if (rcu_batch_empty(&sp->batch_check0))
564 return; /* no callbacks need to be advanced */
565 srcu_flip(sp);
566
567 /*
 568 * The callbacks in ->batch_check0 have just finished their
 569 * first zero check and the counter flip, so move them to
 570 * ->batch_check1 for a future check on the other idx.
571 */
572 rcu_batch_move(&sp->batch_check1, &sp->batch_check0);
573
574 /*
575 * SRCU read-side critical sections are normally short, so check
576 * at least twice in quick succession after a flip.
577 */
578 trycount = trycount < 2 ? 2 : trycount;
579 if (!try_check_zero(sp, idx^1, trycount))
580 return; /* failed to advance, will try after SRCU_INTERVAL */
581
582 /*
583 * The callbacks in ->batch_check1 have now waited for all
584 * pre-existing readers using both idx values. They are therefore
585 * ready to invoke, so move them to ->batch_done.
586 */
587 rcu_batch_move(&sp->batch_done, &sp->batch_check1);
588}
589
590/*
591 * Invoke a limited number of SRCU callbacks that have passed through
592 * their grace period. If there are more to do, SRCU will reschedule
593 * the workqueue.
594 */
595static void srcu_invoke_callbacks(struct srcu_struct *sp)
596{
597 int i;
598 struct rcu_head *head;
599
600 for (i = 0; i < SRCU_CALLBACK_BATCH; i++) {
601 head = rcu_batch_dequeue(&sp->batch_done);
602 if (!head)
603 break;
604 local_bh_disable();
605 head->func(head);
606 local_bh_enable();
607 }
608}
609
610/*
611 * Finished one round of SRCU grace period. Start another if there are
612 * more SRCU callbacks queued, otherwise put SRCU into not-running state.
613 */
614static void srcu_reschedule(struct srcu_struct *sp)
615{
616 bool pending = true;
617
618 if (rcu_batch_empty(&sp->batch_done) &&
619 rcu_batch_empty(&sp->batch_check1) &&
620 rcu_batch_empty(&sp->batch_check0) &&
621 rcu_batch_empty(&sp->batch_queue)) {
622 spin_lock_irq(&sp->queue_lock);
623 if (rcu_batch_empty(&sp->batch_done) &&
624 rcu_batch_empty(&sp->batch_check1) &&
625 rcu_batch_empty(&sp->batch_check0) &&
626 rcu_batch_empty(&sp->batch_queue)) {
627 sp->running = false;
628 pending = false;
629 }
630 spin_unlock_irq(&sp->queue_lock);
631 }
632
633 if (pending)
634 queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL);
635}
636
637/*
638 * This is the work-queue function that handles SRCU grace periods.
639 */
640static void process_srcu(struct work_struct *work)
641{
642 struct srcu_struct *sp;
643
644 sp = container_of(work, struct srcu_struct, work.work);
645
646 srcu_collect_new(sp);
647 srcu_advance_batches(sp, 1);
648 srcu_invoke_callbacks(sp);
649 srcu_reschedule(sp);
650}
diff --git a/kernel/sys.c b/kernel/sys.c
index e7006eb6c1e4..ba0ae8eea6fb 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1908,7 +1908,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1908 error = prctl_get_seccomp(); 1908 error = prctl_get_seccomp();
1909 break; 1909 break;
1910 case PR_SET_SECCOMP: 1910 case PR_SET_SECCOMP:
1911 error = prctl_set_seccomp(arg2); 1911 error = prctl_set_seccomp(arg2, (char __user *)arg3);
1912 break; 1912 break;
1913 case PR_GET_TSC: 1913 case PR_GET_TSC:
1914 error = GET_TSC_CTL(arg2); 1914 error = GET_TSC_CTL(arg2);
@@ -1979,6 +1979,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1979 error = put_user(me->signal->is_child_subreaper, 1979 error = put_user(me->signal->is_child_subreaper,
1980 (int __user *) arg2); 1980 (int __user *) arg2);
1981 break; 1981 break;
1982 case PR_SET_NO_NEW_PRIVS:
1983 if (arg2 != 1 || arg3 || arg4 || arg5)
1984 return -EINVAL;
1985
1986 current->no_new_privs = 1;
1987 break;
1988 case PR_GET_NO_NEW_PRIVS:
1989 if (arg2 || arg3 || arg4 || arg5)
1990 return -EINVAL;
1991 return current->no_new_privs ? 1 : 0;
1982 default: 1992 default:
1983 error = -EINVAL; 1993 error = -EINVAL;
1984 break; 1994 break;
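
From user space, the pair of prctl() options added above would be exercised roughly as follows (a hedged sketch; the PR_* values match the constants merged into <linux/prctl.h>, and are provided here in case the installed headers predate this change):

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_NO_NEW_PRIVS
#define PR_SET_NO_NEW_PRIVS 38
#define PR_GET_NO_NEW_PRIVS 39
#endif

int main(void)
{
	/* All unused arguments must be zero, and arg2 must be 1, or -EINVAL. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		perror("PR_SET_NO_NEW_PRIVS");

	/* Once set, the flag sticks for this task and its descendants. */
	printf("no_new_privs = %d\n", prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0));
	return 0;
}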
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 8a538c55fc7b..aa27d391bfc8 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -59,7 +59,7 @@ static DEFINE_SPINLOCK(rtcdev_lock);
59 * If one has not already been chosen, it checks to see if a 59 * If one has not already been chosen, it checks to see if a
60 * functional rtc device is available. 60 * functional rtc device is available.
61 */ 61 */
62static struct rtc_device *alarmtimer_get_rtcdev(void) 62struct rtc_device *alarmtimer_get_rtcdev(void)
63{ 63{
64 unsigned long flags; 64 unsigned long flags;
65 struct rtc_device *ret; 65 struct rtc_device *ret;
@@ -115,7 +115,7 @@ static void alarmtimer_rtc_interface_remove(void)
115 class_interface_unregister(&alarmtimer_rtc_interface); 115 class_interface_unregister(&alarmtimer_rtc_interface);
116} 116}
117#else 117#else
118static inline struct rtc_device *alarmtimer_get_rtcdev(void) 118struct rtc_device *alarmtimer_get_rtcdev(void)
119{ 119{
120 return NULL; 120 return NULL;
121} 121}
diff --git a/kernel/timer.c b/kernel/timer.c
index a297ffcf888e..09de9a941cd7 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -861,7 +861,13 @@ EXPORT_SYMBOL(mod_timer);
861 * 861 *
862 * mod_timer_pinned() is a way to update the expire field of an 862 * mod_timer_pinned() is a way to update the expire field of an
863 * active timer (if the timer is inactive it will be activated) 863 * active timer (if the timer is inactive it will be activated)
864 * and not allow the timer to be migrated to a different CPU. 864 * and to ensure that the timer is scheduled on the current CPU.
865 *
866 * Note that this does not prevent the timer from being migrated
867 * when the current CPU goes offline. If this is a problem for
868 * you, use CPU-hotplug notifiers to handle it correctly, for
869 * example, cancelling the timer when the corresponding CPU goes
870 * offline.
865 * 871 *
866 * mod_timer_pinned(timer, expires) is equivalent to: 872 * mod_timer_pinned(timer, expires) is equivalent to:
867 * 873 *
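
A hedged sketch of the usage the updated comment describes (my_timer and my_timer_fn are invented for illustration): the timer is (re)armed on the CPU that calls mod_timer_pinned(), and, per the note added above, any CPU-offline handling remains the caller's job.

#include <linux/timer.h>
#include <linux/jiffies.h>

/* Illustrative only. */
static struct timer_list my_timer;

static void my_timer_fn(unsigned long data)
{
	/* Runs on the CPU the timer was pinned to (barring CPU hotplug). */
}

static void my_start_timer(void)
{
	setup_timer(&my_timer, my_timer_fn, 0);
	/* Expire in 100ms, and keep the timer on the current CPU. */
	mod_timer_pinned(&my_timer, jiffies + msecs_to_jiffies(100));
}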
@@ -1102,7 +1108,9 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1102 * warnings as well as problems when looking into 1108 * warnings as well as problems when looking into
1103 * timer->lockdep_map, make a copy and use that here. 1109 * timer->lockdep_map, make a copy and use that here.
1104 */ 1110 */
1105 struct lockdep_map lockdep_map = timer->lockdep_map; 1111 struct lockdep_map lockdep_map;
1112
1113 lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
1106#endif 1114#endif
1107 /* 1115 /*
1108 * Couple the lock chain with the lock chain at 1116 * Couple the lock chain with the lock chain at
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a1d2849f2473..d81a1a532994 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -141,7 +141,6 @@ if FTRACE
141config FUNCTION_TRACER 141config FUNCTION_TRACER
142 bool "Kernel Function Tracer" 142 bool "Kernel Function Tracer"
143 depends on HAVE_FUNCTION_TRACER 143 depends on HAVE_FUNCTION_TRACER
144 select FRAME_POINTER if !ARM_UNWIND && !PPC && !S390 && !MICROBLAZE
145 select KALLSYMS 144 select KALLSYMS
146 select GENERIC_TRACER 145 select GENERIC_TRACER
147 select CONTEXT_SWITCH_TRACER 146 select CONTEXT_SWITCH_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 5f39a07fe5ea..b3afe0e76f79 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,7 +41,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o
41obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o 41obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
45obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 44obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
46ifeq ($(CONFIG_BLOCK),y) 45ifeq ($(CONFIG_BLOCK),y)
47obj-$(CONFIG_EVENT_TRACING) += blktrace.o 46obj-$(CONFIG_EVENT_TRACING) += blktrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0fa92f677c92..a008663d86c8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1383,44 +1383,73 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1383 1383
1384static int ftrace_cmp_recs(const void *a, const void *b) 1384static int ftrace_cmp_recs(const void *a, const void *b)
1385{ 1385{
1386 const struct dyn_ftrace *reca = a; 1386 const struct dyn_ftrace *key = a;
1387 const struct dyn_ftrace *recb = b; 1387 const struct dyn_ftrace *rec = b;
1388 1388
1389 if (reca->ip > recb->ip) 1389 if (key->flags < rec->ip)
1390 return 1;
1391 if (reca->ip < recb->ip)
1392 return -1; 1390 return -1;
1391 if (key->ip >= rec->ip + MCOUNT_INSN_SIZE)
1392 return 1;
1393 return 0; 1393 return 0;
1394} 1394}
1395 1395
1396/** 1396static unsigned long ftrace_location_range(unsigned long start, unsigned long end)
1397 * ftrace_location - return true if the ip giving is a traced location
1398 * @ip: the instruction pointer to check
1399 *
1400 * Returns 1 if @ip given is a pointer to a ftrace location.
1401 * That is, the instruction that is either a NOP or call to
1402 * the function tracer. It checks the ftrace internal tables to
1403 * determine if the address belongs or not.
1404 */
1405int ftrace_location(unsigned long ip)
1406{ 1397{
1407 struct ftrace_page *pg; 1398 struct ftrace_page *pg;
1408 struct dyn_ftrace *rec; 1399 struct dyn_ftrace *rec;
1409 struct dyn_ftrace key; 1400 struct dyn_ftrace key;
1410 1401
1411 key.ip = ip; 1402 key.ip = start;
1403 key.flags = end; /* overload flags, as it is unsigned long */
1412 1404
1413 for (pg = ftrace_pages_start; pg; pg = pg->next) { 1405 for (pg = ftrace_pages_start; pg; pg = pg->next) {
1406 if (end < pg->records[0].ip ||
1407 start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
1408 continue;
1414 rec = bsearch(&key, pg->records, pg->index, 1409 rec = bsearch(&key, pg->records, pg->index,
1415 sizeof(struct dyn_ftrace), 1410 sizeof(struct dyn_ftrace),
1416 ftrace_cmp_recs); 1411 ftrace_cmp_recs);
1417 if (rec) 1412 if (rec)
1418 return 1; 1413 return rec->ip;
1419 } 1414 }
1420 1415
1421 return 0; 1416 return 0;
1422} 1417}
1423 1418
1419/**
 1420 * ftrace_location - return the ftrace address of a traced location
 1421 * @ip: the instruction pointer to check
 1422 *
 1423 * Returns rec->ip if the given @ip points to an ftrace location,
 1424 * that is, to the instruction that is either a NOP or a call to
 1425 * the function tracer. It checks the ftrace internal tables to
 1426 * determine whether the address belongs to one; returns 0 otherwise.
1427 */
1428unsigned long ftrace_location(unsigned long ip)
1429{
1430 return ftrace_location_range(ip, ip);
1431}
1432
1433/**
1434 * ftrace_text_reserved - return true if range contains an ftrace location
1435 * @start: start of range to search
1436 * @end: end of range to search (inclusive). @end points to the last byte to check.
1437 *
 1438 * Returns 1 if the range from @start to @end contains an ftrace
 1439 * location, that is, an instruction that is either a NOP or a call
 1440 * to the function tracer. It checks the ftrace internal tables to
 1441 * determine whether any address in the range belongs to one.
1442 */
1443int ftrace_text_reserved(void *start, void *end)
1444{
1445 unsigned long ret;
1446
1447 ret = ftrace_location_range((unsigned long)start,
1448 (unsigned long)end);
1449
1450 return (int)!!ret;
1451}
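
The comparator above overloads key->flags to carry the end of the range, so a single bsearch() can answer "does any record overlap [start, end]?". Here is a self-contained user-space sketch of the same trick (record layout, addresses and the instruction size are simplified and purely illustrative):

#include <stdio.h>
#include <stdlib.h>

#define INSN_SIZE 4UL	/* stand-in for MCOUNT_INSN_SIZE */

struct rec {
	unsigned long ip;	/* for the key, ip = start of the range */
	unsigned long flags;	/* for the key, flags = end of the range (inclusive) */
};

/* Returns 0 when the key range [ip, flags] overlaps [rec->ip, rec->ip + INSN_SIZE). */
static int cmp_recs(const void *a, const void *b)
{
	const struct rec *key = a;
	const struct rec *rec = b;

	if (key->flags < rec->ip)
		return -1;
	if (key->ip >= rec->ip + INSN_SIZE)
		return 1;
	return 0;
}

int main(void)
{
	struct rec recs[] = { { 0x1000 }, { 0x1010 }, { 0x1020 } };	/* sorted by ip */
	struct rec key = { .ip = 0x100c, .flags = 0x1011 };		/* range to test */
	struct rec *hit;

	hit = bsearch(&key, recs, 3, sizeof(recs[0]), cmp_recs);
	printf("range %#lx-%#lx %s\n", key.ip, key.flags,
	       hit ? "overlaps a record" : "is free");
	return 0;
}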
1452
1424static void __ftrace_hash_rec_update(struct ftrace_ops *ops, 1453static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1425 int filter_hash, 1454 int filter_hash,
1426 bool inc) 1455 bool inc)
@@ -1520,35 +1549,6 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
1520 __ftrace_hash_rec_update(ops, filter_hash, 1); 1549 __ftrace_hash_rec_update(ops, filter_hash, 1);
1521} 1550}
1522 1551
1523static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
1524{
1525 if (ftrace_pages->index == ftrace_pages->size) {
1526 /* We should have allocated enough */
1527 if (WARN_ON(!ftrace_pages->next))
1528 return NULL;
1529 ftrace_pages = ftrace_pages->next;
1530 }
1531
1532 return &ftrace_pages->records[ftrace_pages->index++];
1533}
1534
1535static struct dyn_ftrace *
1536ftrace_record_ip(unsigned long ip)
1537{
1538 struct dyn_ftrace *rec;
1539
1540 if (ftrace_disabled)
1541 return NULL;
1542
1543 rec = ftrace_alloc_dyn_node(ip);
1544 if (!rec)
1545 return NULL;
1546
1547 rec->ip = ip;
1548
1549 return rec;
1550}
1551
1552static void print_ip_ins(const char *fmt, unsigned char *p) 1552static void print_ip_ins(const char *fmt, unsigned char *p)
1553{ 1553{
1554 int i; 1554 int i;
@@ -1598,21 +1598,6 @@ void ftrace_bug(int failed, unsigned long ip)
1598 } 1598 }
1599} 1599}
1600 1600
1601
1602/* Return 1 if the address range is reserved for ftrace */
1603int ftrace_text_reserved(void *start, void *end)
1604{
1605 struct dyn_ftrace *rec;
1606 struct ftrace_page *pg;
1607
1608 do_for_each_ftrace_rec(pg, rec) {
1609 if (rec->ip <= (unsigned long)end &&
1610 rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start)
1611 return 1;
1612 } while_for_each_ftrace_rec();
1613 return 0;
1614}
1615
1616static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) 1601static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1617{ 1602{
1618 unsigned long flag = 0UL; 1603 unsigned long flag = 0UL;
@@ -1698,7 +1683,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1698 return -1; /* unknow ftrace bug */ 1683 return -1; /* unknow ftrace bug */
1699} 1684}
1700 1685
1701static void ftrace_replace_code(int update) 1686void __weak ftrace_replace_code(int enable)
1702{ 1687{
1703 struct dyn_ftrace *rec; 1688 struct dyn_ftrace *rec;
1704 struct ftrace_page *pg; 1689 struct ftrace_page *pg;
@@ -1708,7 +1693,7 @@ static void ftrace_replace_code(int update)
1708 return; 1693 return;
1709 1694
1710 do_for_each_ftrace_rec(pg, rec) { 1695 do_for_each_ftrace_rec(pg, rec) {
1711 failed = __ftrace_replace_code(rec, update); 1696 failed = __ftrace_replace_code(rec, enable);
1712 if (failed) { 1697 if (failed) {
1713 ftrace_bug(failed, rec->ip); 1698 ftrace_bug(failed, rec->ip);
1714 /* Stop processing */ 1699 /* Stop processing */
@@ -1826,22 +1811,27 @@ int __weak ftrace_arch_code_modify_post_process(void)
1826 return 0; 1811 return 0;
1827} 1812}
1828 1813
1829static int __ftrace_modify_code(void *data) 1814void ftrace_modify_all_code(int command)
1830{ 1815{
1831 int *command = data; 1816 if (command & FTRACE_UPDATE_CALLS)
1832
1833 if (*command & FTRACE_UPDATE_CALLS)
1834 ftrace_replace_code(1); 1817 ftrace_replace_code(1);
1835 else if (*command & FTRACE_DISABLE_CALLS) 1818 else if (command & FTRACE_DISABLE_CALLS)
1836 ftrace_replace_code(0); 1819 ftrace_replace_code(0);
1837 1820
1838 if (*command & FTRACE_UPDATE_TRACE_FUNC) 1821 if (command & FTRACE_UPDATE_TRACE_FUNC)
1839 ftrace_update_ftrace_func(ftrace_trace_function); 1822 ftrace_update_ftrace_func(ftrace_trace_function);
1840 1823
1841 if (*command & FTRACE_START_FUNC_RET) 1824 if (command & FTRACE_START_FUNC_RET)
1842 ftrace_enable_ftrace_graph_caller(); 1825 ftrace_enable_ftrace_graph_caller();
1843 else if (*command & FTRACE_STOP_FUNC_RET) 1826 else if (command & FTRACE_STOP_FUNC_RET)
1844 ftrace_disable_ftrace_graph_caller(); 1827 ftrace_disable_ftrace_graph_caller();
1828}
1829
1830static int __ftrace_modify_code(void *data)
1831{
1832 int *command = data;
1833
1834 ftrace_modify_all_code(*command);
1845 1835
1846 return 0; 1836 return 0;
1847} 1837}
@@ -2469,57 +2459,35 @@ static int
2469ftrace_avail_open(struct inode *inode, struct file *file) 2459ftrace_avail_open(struct inode *inode, struct file *file)
2470{ 2460{
2471 struct ftrace_iterator *iter; 2461 struct ftrace_iterator *iter;
2472 int ret;
2473 2462
2474 if (unlikely(ftrace_disabled)) 2463 if (unlikely(ftrace_disabled))
2475 return -ENODEV; 2464 return -ENODEV;
2476 2465
2477 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2466 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
2478 if (!iter) 2467 if (iter) {
2479 return -ENOMEM; 2468 iter->pg = ftrace_pages_start;
2480 2469 iter->ops = &global_ops;
2481 iter->pg = ftrace_pages_start;
2482 iter->ops = &global_ops;
2483
2484 ret = seq_open(file, &show_ftrace_seq_ops);
2485 if (!ret) {
2486 struct seq_file *m = file->private_data;
2487
2488 m->private = iter;
2489 } else {
2490 kfree(iter);
2491 } 2470 }
2492 2471
2493 return ret; 2472 return iter ? 0 : -ENOMEM;
2494} 2473}
2495 2474
2496static int 2475static int
2497ftrace_enabled_open(struct inode *inode, struct file *file) 2476ftrace_enabled_open(struct inode *inode, struct file *file)
2498{ 2477{
2499 struct ftrace_iterator *iter; 2478 struct ftrace_iterator *iter;
2500 int ret;
2501 2479
2502 if (unlikely(ftrace_disabled)) 2480 if (unlikely(ftrace_disabled))
2503 return -ENODEV; 2481 return -ENODEV;
2504 2482
2505 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2483 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
2506 if (!iter) 2484 if (iter) {
2507 return -ENOMEM; 2485 iter->pg = ftrace_pages_start;
2508 2486 iter->flags = FTRACE_ITER_ENABLED;
2509 iter->pg = ftrace_pages_start; 2487 iter->ops = &global_ops;
2510 iter->flags = FTRACE_ITER_ENABLED;
2511 iter->ops = &global_ops;
2512
2513 ret = seq_open(file, &show_ftrace_seq_ops);
2514 if (!ret) {
2515 struct seq_file *m = file->private_data;
2516
2517 m->private = iter;
2518 } else {
2519 kfree(iter);
2520 } 2488 }
2521 2489
2522 return ret; 2490 return iter ? 0 : -ENOMEM;
2523} 2491}
2524 2492
2525static void ftrace_filter_reset(struct ftrace_hash *hash) 2493static void ftrace_filter_reset(struct ftrace_hash *hash)
@@ -3688,22 +3656,36 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
3688 return 0; 3656 return 0;
3689} 3657}
3690 3658
3691static void ftrace_swap_recs(void *a, void *b, int size) 3659static int ftrace_cmp_ips(const void *a, const void *b)
3660{
3661 const unsigned long *ipa = a;
3662 const unsigned long *ipb = b;
3663
3664 if (*ipa > *ipb)
3665 return 1;
3666 if (*ipa < *ipb)
3667 return -1;
3668 return 0;
3669}
3670
3671static void ftrace_swap_ips(void *a, void *b, int size)
3692{ 3672{
3693 struct dyn_ftrace *reca = a; 3673 unsigned long *ipa = a;
3694 struct dyn_ftrace *recb = b; 3674 unsigned long *ipb = b;
3695 struct dyn_ftrace t; 3675 unsigned long t;
3696 3676
3697 t = *reca; 3677 t = *ipa;
3698 *reca = *recb; 3678 *ipa = *ipb;
3699 *recb = t; 3679 *ipb = t;
3700} 3680}
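
The cmp/swap pair above feeds the kernel's lib/sort.c sort(); below is a hedged sketch of the same call pattern on a private array (the array contents are invented for illustration):

#include <linux/sort.h>
#include <linux/kernel.h>

static int cmp_ulong(const void *a, const void *b)
{
	const unsigned long *x = a, *y = b;

	return (*x > *y) - (*x < *y);	/* -1, 0 or 1 without overflow */
}

static void swap_ulong(void *a, void *b, int size)
{
	unsigned long t = *(unsigned long *)a;

	*(unsigned long *)a = *(unsigned long *)b;
	*(unsigned long *)b = t;
}

static void sort_example(void)
{
	unsigned long ips[] = { 0xc0103000, 0xc0101000, 0xc0102000 };

	/* Passing NULL for the swap function makes sort() fall back to a generic swap. */
	sort(ips, ARRAY_SIZE(ips), sizeof(ips[0]), cmp_ulong, swap_ulong);
}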
3701 3681
3702static int ftrace_process_locs(struct module *mod, 3682static int ftrace_process_locs(struct module *mod,
3703 unsigned long *start, 3683 unsigned long *start,
3704 unsigned long *end) 3684 unsigned long *end)
3705{ 3685{
3686 struct ftrace_page *start_pg;
3706 struct ftrace_page *pg; 3687 struct ftrace_page *pg;
3688 struct dyn_ftrace *rec;
3707 unsigned long count; 3689 unsigned long count;
3708 unsigned long *p; 3690 unsigned long *p;
3709 unsigned long addr; 3691 unsigned long addr;
@@ -3715,8 +3697,11 @@ static int ftrace_process_locs(struct module *mod,
3715 if (!count) 3697 if (!count)
3716 return 0; 3698 return 0;
3717 3699
3718 pg = ftrace_allocate_pages(count); 3700 sort(start, count, sizeof(*start),
3719 if (!pg) 3701 ftrace_cmp_ips, ftrace_swap_ips);
3702
3703 start_pg = ftrace_allocate_pages(count);
3704 if (!start_pg)
3720 return -ENOMEM; 3705 return -ENOMEM;
3721 3706
3722 mutex_lock(&ftrace_lock); 3707 mutex_lock(&ftrace_lock);
@@ -3729,7 +3714,7 @@ static int ftrace_process_locs(struct module *mod,
3729 if (!mod) { 3714 if (!mod) {
3730 WARN_ON(ftrace_pages || ftrace_pages_start); 3715 WARN_ON(ftrace_pages || ftrace_pages_start);
3731 /* First initialization */ 3716 /* First initialization */
3732 ftrace_pages = ftrace_pages_start = pg; 3717 ftrace_pages = ftrace_pages_start = start_pg;
3733 } else { 3718 } else {
3734 if (!ftrace_pages) 3719 if (!ftrace_pages)
3735 goto out; 3720 goto out;
@@ -3740,11 +3725,11 @@ static int ftrace_process_locs(struct module *mod,
3740 ftrace_pages = ftrace_pages->next; 3725 ftrace_pages = ftrace_pages->next;
3741 } 3726 }
3742 3727
3743 ftrace_pages->next = pg; 3728 ftrace_pages->next = start_pg;
3744 ftrace_pages = pg;
3745 } 3729 }
3746 3730
3747 p = start; 3731 p = start;
3732 pg = start_pg;
3748 while (p < end) { 3733 while (p < end) {
3749 addr = ftrace_call_adjust(*p++); 3734 addr = ftrace_call_adjust(*p++);
3750 /* 3735 /*
@@ -3755,17 +3740,26 @@ static int ftrace_process_locs(struct module *mod,
3755 */ 3740 */
3756 if (!addr) 3741 if (!addr)
3757 continue; 3742 continue;
3758 if (!ftrace_record_ip(addr)) 3743
3759 break; 3744 if (pg->index == pg->size) {
3745 /* We should have allocated enough */
3746 if (WARN_ON(!pg->next))
3747 break;
3748 pg = pg->next;
3749 }
3750
3751 rec = &pg->records[pg->index++];
3752 rec->ip = addr;
3760 } 3753 }
3761 3754
3762 /* These new locations need to be initialized */ 3755 /* We should have used all pages */
3763 ftrace_new_pgs = pg; 3756 WARN_ON(pg->next);
3757
3758 /* Assign the last page to ftrace_pages */
3759 ftrace_pages = pg;
3764 3760
3765 /* Make each individual set of pages sorted by ips */ 3761 /* These new locations need to be initialized */
3766 for (; pg; pg = pg->next) 3762 ftrace_new_pgs = start_pg;
3767 sort(pg->records, pg->index, sizeof(struct dyn_ftrace),
3768 ftrace_cmp_recs, ftrace_swap_recs);
3769 3763
3770 /* 3764 /*
3771 * We only need to disable interrupts on start up 3765 * We only need to disable interrupts on start up
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index cf8d11e91efd..6420cda62336 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -23,6 +23,8 @@
23#include <asm/local.h> 23#include <asm/local.h>
24#include "trace.h" 24#include "trace.h"
25 25
26static void update_pages_handler(struct work_struct *work);
27
26/* 28/*
27 * The ring buffer header is special. We must manually up keep it. 29 * The ring buffer header is special. We must manually up keep it.
28 */ 30 */
@@ -449,6 +451,7 @@ struct ring_buffer_per_cpu {
449 raw_spinlock_t reader_lock; /* serialize readers */ 451 raw_spinlock_t reader_lock; /* serialize readers */
450 arch_spinlock_t lock; 452 arch_spinlock_t lock;
451 struct lock_class_key lock_key; 453 struct lock_class_key lock_key;
454 unsigned int nr_pages;
452 struct list_head *pages; 455 struct list_head *pages;
453 struct buffer_page *head_page; /* read from head */ 456 struct buffer_page *head_page; /* read from head */
454 struct buffer_page *tail_page; /* write to tail */ 457 struct buffer_page *tail_page; /* write to tail */
@@ -466,13 +469,18 @@ struct ring_buffer_per_cpu {
466 unsigned long read_bytes; 469 unsigned long read_bytes;
467 u64 write_stamp; 470 u64 write_stamp;
468 u64 read_stamp; 471 u64 read_stamp;
472 /* ring buffer pages to update, > 0 to add, < 0 to remove */
473 int nr_pages_to_update;
474 struct list_head new_pages; /* new pages to add */
475 struct work_struct update_pages_work;
476 struct completion update_done;
469}; 477};
470 478
471struct ring_buffer { 479struct ring_buffer {
472 unsigned pages;
473 unsigned flags; 480 unsigned flags;
474 int cpus; 481 int cpus;
475 atomic_t record_disabled; 482 atomic_t record_disabled;
483 atomic_t resize_disabled;
476 cpumask_var_t cpumask; 484 cpumask_var_t cpumask;
477 485
478 struct lock_class_key *reader_lock_key; 486 struct lock_class_key *reader_lock_key;
@@ -937,6 +945,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
937 struct list_head *head = cpu_buffer->pages; 945 struct list_head *head = cpu_buffer->pages;
938 struct buffer_page *bpage, *tmp; 946 struct buffer_page *bpage, *tmp;
939 947
948 /* Reset the head page if it exists */
949 if (cpu_buffer->head_page)
950 rb_set_head_page(cpu_buffer);
951
940 rb_head_page_deactivate(cpu_buffer); 952 rb_head_page_deactivate(cpu_buffer);
941 953
942 if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) 954 if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
@@ -963,14 +975,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
963 return 0; 975 return 0;
964} 976}
965 977
966static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, 978static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu)
967 unsigned nr_pages)
968{ 979{
980 int i;
969 struct buffer_page *bpage, *tmp; 981 struct buffer_page *bpage, *tmp;
970 LIST_HEAD(pages);
971 unsigned i;
972
973 WARN_ON(!nr_pages);
974 982
975 for (i = 0; i < nr_pages; i++) { 983 for (i = 0; i < nr_pages; i++) {
976 struct page *page; 984 struct page *page;
@@ -981,15 +989,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
981 */ 989 */
982 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 990 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
983 GFP_KERNEL | __GFP_NORETRY, 991 GFP_KERNEL | __GFP_NORETRY,
984 cpu_to_node(cpu_buffer->cpu)); 992 cpu_to_node(cpu));
985 if (!bpage) 993 if (!bpage)
986 goto free_pages; 994 goto free_pages;
987 995
988 rb_check_bpage(cpu_buffer, bpage); 996 list_add(&bpage->list, pages);
989 997
990 list_add(&bpage->list, &pages); 998 page = alloc_pages_node(cpu_to_node(cpu),
991
992 page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
993 GFP_KERNEL | __GFP_NORETRY, 0); 999 GFP_KERNEL | __GFP_NORETRY, 0);
994 if (!page) 1000 if (!page)
995 goto free_pages; 1001 goto free_pages;
@@ -997,6 +1003,27 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
997 rb_init_page(bpage->page); 1003 rb_init_page(bpage->page);
998 } 1004 }
999 1005
1006 return 0;
1007
1008free_pages:
1009 list_for_each_entry_safe(bpage, tmp, pages, list) {
1010 list_del_init(&bpage->list);
1011 free_buffer_page(bpage);
1012 }
1013
1014 return -ENOMEM;
1015}
1016
1017static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
1018 unsigned nr_pages)
1019{
1020 LIST_HEAD(pages);
1021
1022 WARN_ON(!nr_pages);
1023
1024 if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu))
1025 return -ENOMEM;
1026
1000 /* 1027 /*
1001 * The ring buffer page list is a circular list that does not 1028 * The ring buffer page list is a circular list that does not
1002 * start and end with a list head. All page list items point to 1029 * start and end with a list head. All page list items point to
@@ -1005,20 +1032,15 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
1005 cpu_buffer->pages = pages.next; 1032 cpu_buffer->pages = pages.next;
1006 list_del(&pages); 1033 list_del(&pages);
1007 1034
1035 cpu_buffer->nr_pages = nr_pages;
1036
1008 rb_check_pages(cpu_buffer); 1037 rb_check_pages(cpu_buffer);
1009 1038
1010 return 0; 1039 return 0;
1011
1012 free_pages:
1013 list_for_each_entry_safe(bpage, tmp, &pages, list) {
1014 list_del_init(&bpage->list);
1015 free_buffer_page(bpage);
1016 }
1017 return -ENOMEM;
1018} 1040}
1019 1041
1020static struct ring_buffer_per_cpu * 1042static struct ring_buffer_per_cpu *
1021rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) 1043rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
1022{ 1044{
1023 struct ring_buffer_per_cpu *cpu_buffer; 1045 struct ring_buffer_per_cpu *cpu_buffer;
1024 struct buffer_page *bpage; 1046 struct buffer_page *bpage;
@@ -1035,6 +1057,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1035 raw_spin_lock_init(&cpu_buffer->reader_lock); 1057 raw_spin_lock_init(&cpu_buffer->reader_lock);
1036 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 1058 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
1037 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 1059 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1060 INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);
1061 init_completion(&cpu_buffer->update_done);
1038 1062
1039 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1063 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1040 GFP_KERNEL, cpu_to_node(cpu)); 1064 GFP_KERNEL, cpu_to_node(cpu));
@@ -1052,7 +1076,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1052 1076
1053 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 1077 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
1054 1078
1055 ret = rb_allocate_pages(cpu_buffer, buffer->pages); 1079 ret = rb_allocate_pages(cpu_buffer, nr_pages);
1056 if (ret < 0) 1080 if (ret < 0)
1057 goto fail_free_reader; 1081 goto fail_free_reader;
1058 1082
@@ -1113,7 +1137,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1113{ 1137{
1114 struct ring_buffer *buffer; 1138 struct ring_buffer *buffer;
1115 int bsize; 1139 int bsize;
1116 int cpu; 1140 int cpu, nr_pages;
1117 1141
1118 /* keep it in its own cache line */ 1142 /* keep it in its own cache line */
1119 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), 1143 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
@@ -1124,14 +1148,14 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1124 if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) 1148 if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
1125 goto fail_free_buffer; 1149 goto fail_free_buffer;
1126 1150
1127 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 1151 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
1128 buffer->flags = flags; 1152 buffer->flags = flags;
1129 buffer->clock = trace_clock_local; 1153 buffer->clock = trace_clock_local;
1130 buffer->reader_lock_key = key; 1154 buffer->reader_lock_key = key;
1131 1155
1132 /* need at least two pages */ 1156 /* need at least two pages */
1133 if (buffer->pages < 2) 1157 if (nr_pages < 2)
1134 buffer->pages = 2; 1158 nr_pages = 2;
1135 1159
1136 /* 1160 /*
1137 * In case of non-hotplug cpu, if the ring-buffer is allocated 1161 * In case of non-hotplug cpu, if the ring-buffer is allocated
@@ -1154,7 +1178,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1154 1178
1155 for_each_buffer_cpu(buffer, cpu) { 1179 for_each_buffer_cpu(buffer, cpu) {
1156 buffer->buffers[cpu] = 1180 buffer->buffers[cpu] =
1157 rb_allocate_cpu_buffer(buffer, cpu); 1181 rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
1158 if (!buffer->buffers[cpu]) 1182 if (!buffer->buffers[cpu])
1159 goto fail_free_buffers; 1183 goto fail_free_buffers;
1160 } 1184 }
@@ -1222,58 +1246,222 @@ void ring_buffer_set_clock(struct ring_buffer *buffer,
1222 1246
1223static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); 1247static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
1224 1248
1225static void 1249static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1226rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1227{ 1250{
1228 struct buffer_page *bpage; 1251 return local_read(&bpage->entries) & RB_WRITE_MASK;
1229 struct list_head *p; 1252}
1230 unsigned i; 1253
1254static inline unsigned long rb_page_write(struct buffer_page *bpage)
1255{
1256 return local_read(&bpage->write) & RB_WRITE_MASK;
1257}
1258
1259static int
1260rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
1261{
1262 struct list_head *tail_page, *to_remove, *next_page;
1263 struct buffer_page *to_remove_page, *tmp_iter_page;
1264 struct buffer_page *last_page, *first_page;
1265 unsigned int nr_removed;
1266 unsigned long head_bit;
1267 int page_entries;
1268
1269 head_bit = 0;
1231 1270
1232 raw_spin_lock_irq(&cpu_buffer->reader_lock); 1271 raw_spin_lock_irq(&cpu_buffer->reader_lock);
1233 rb_head_page_deactivate(cpu_buffer); 1272 atomic_inc(&cpu_buffer->record_disabled);
1273 /*
1274 * We don't race with the readers since we have acquired the reader
1275 * lock. We also don't race with writers after disabling recording.
1276 * This makes it easy to figure out the first and the last page to be
1277 * removed from the list. We unlink all the pages in between including
1278 * the first and last pages. This is done in a busy loop so that we
1279 * lose the least number of traces.
1280 * The pages are freed after we restart recording and unlock readers.
1281 */
1282 tail_page = &cpu_buffer->tail_page->list;
1234 1283
1235 for (i = 0; i < nr_pages; i++) { 1284 /*
1236 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) 1285 * tail page might be on reader page, we remove the next page
1237 goto out; 1286 * from the ring buffer
1238 p = cpu_buffer->pages->next; 1287 */
1239 bpage = list_entry(p, struct buffer_page, list); 1288 if (cpu_buffer->tail_page == cpu_buffer->reader_page)
1240 list_del_init(&bpage->list); 1289 tail_page = rb_list_head(tail_page->next);
1241 free_buffer_page(bpage); 1290 to_remove = tail_page;
1291
1292 /* start of pages to remove */
1293 first_page = list_entry(rb_list_head(to_remove->next),
1294 struct buffer_page, list);
1295
1296 for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) {
1297 to_remove = rb_list_head(to_remove)->next;
1298 head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD;
1242 } 1299 }
1243 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1244 goto out;
1245 1300
1246 rb_reset_cpu(cpu_buffer); 1301 next_page = rb_list_head(to_remove)->next;
1247 rb_check_pages(cpu_buffer);
1248 1302
1249out: 1303 /*
1304 * Now we remove all pages between tail_page and next_page.
1305 * Make sure that we have head_bit value preserved for the
1306 * next page
1307 */
1308 tail_page->next = (struct list_head *)((unsigned long)next_page |
1309 head_bit);
1310 next_page = rb_list_head(next_page);
1311 next_page->prev = tail_page;
1312
1313 /* make sure pages points to a valid page in the ring buffer */
1314 cpu_buffer->pages = next_page;
1315
1316 /* update head page */
1317 if (head_bit)
1318 cpu_buffer->head_page = list_entry(next_page,
1319 struct buffer_page, list);
1320
1321 /*
1322 * change read pointer to make sure any read iterators reset
1323 * themselves
1324 */
1325 cpu_buffer->read = 0;
1326
1327 /* pages are removed, resume tracing and then free the pages */
1328 atomic_dec(&cpu_buffer->record_disabled);
1250 raw_spin_unlock_irq(&cpu_buffer->reader_lock); 1329 raw_spin_unlock_irq(&cpu_buffer->reader_lock);
1330
1331 RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages));
1332
1333 /* last buffer page to remove */
1334 last_page = list_entry(rb_list_head(to_remove), struct buffer_page,
1335 list);
1336 tmp_iter_page = first_page;
1337
1338 do {
1339 to_remove_page = tmp_iter_page;
1340 rb_inc_page(cpu_buffer, &tmp_iter_page);
1341
1342 /* update the counters */
1343 page_entries = rb_page_entries(to_remove_page);
1344 if (page_entries) {
1345 /*
1346 * If something was added to this page, it was full
1347 * since it is not the tail page. So we deduct the
1348 * bytes consumed in ring buffer from here.
1349 * No need to update overruns, since this page is
1350 * deleted from ring buffer and its entries are
1351 * already accounted for.
1352 */
1353 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
1354 }
1355
1356 /*
1357 * We have already removed references to this list item, just
1358 * free up the buffer_page and its page
1359 */
1360 free_buffer_page(to_remove_page);
1361 nr_removed--;
1362
1363 } while (to_remove_page != last_page);
1364
1365 RB_WARN_ON(cpu_buffer, nr_removed);
1366
1367 return nr_removed == 0;
1251} 1368}
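
rb_remove_pages() has to preserve the HEAD marker, which the ring buffer keeps in the low bits of the ->next pointer of the page preceding the head page. The following user-space sketch illustrates just that tagging idea (the flag value and helpers are invented for illustration; the real ring buffer additionally updates these pointers with cmpxchg):

#include <stdio.h>

#define PAGE_HEAD 1UL	/* stand-in for RB_PAGE_HEAD, kept in the low pointer bits */

struct node { struct node *next; };

/* Strip any flag bits so the pointer can actually be dereferenced. */
static struct node *list_head_ptr(struct node *p)
{
	return (struct node *)((unsigned long)p & ~PAGE_HEAD);
}

static int has_head_bit(struct node *p)
{
	return (unsigned long)p & PAGE_HEAD;
}

int main(void)
{
	struct node a, b;

	/* Link a -> b and mark b as the "head" page by tagging a's next pointer. */
	a.next = (struct node *)((unsigned long)&b | PAGE_HEAD);
	b.next = &a;

	printf("a->next %s the head bit and points to %p\n",
	       has_head_bit(a.next) ? "carries" : "lacks",
	       (void *)list_head_ptr(a.next));
	return 0;
}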
1252 1369
1253static void 1370static int
1254rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, 1371rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
1255 struct list_head *pages, unsigned nr_pages)
1256{ 1372{
1257 struct buffer_page *bpage; 1373 struct list_head *pages = &cpu_buffer->new_pages;
1258 struct list_head *p; 1374 int retries, success;
1259 unsigned i;
1260 1375
1261 raw_spin_lock_irq(&cpu_buffer->reader_lock); 1376 raw_spin_lock_irq(&cpu_buffer->reader_lock);
1262 rb_head_page_deactivate(cpu_buffer); 1377 /*
1378 * We are holding the reader lock, so the reader page won't be swapped
1379 * in the ring buffer. Now we are racing with the writer trying to
1380 * move head page and the tail page.
1381 * We are going to adapt the reader page update process where:
1382 * 1. We first splice the start and end of list of new pages between
1383 * the head page and its previous page.
1384 * 2. We cmpxchg the prev_page->next to point from head page to the
1385 * start of new pages list.
1386 * 3. Finally, we update the head->prev to the end of new list.
1387 *
1388 * We will try this process 10 times, to make sure that we don't keep
1389 * spinning.
1390 */
1391 retries = 10;
1392 success = 0;
1393 while (retries--) {
1394 struct list_head *head_page, *prev_page, *r;
1395 struct list_head *last_page, *first_page;
1396 struct list_head *head_page_with_bit;
1263 1397
1264 for (i = 0; i < nr_pages; i++) { 1398 head_page = &rb_set_head_page(cpu_buffer)->list;
1265 if (RB_WARN_ON(cpu_buffer, list_empty(pages))) 1399 prev_page = head_page->prev;
1266 goto out; 1400
1267 p = pages->next; 1401 first_page = pages->next;
1268 bpage = list_entry(p, struct buffer_page, list); 1402 last_page = pages->prev;
1269 list_del_init(&bpage->list); 1403
1270 list_add_tail(&bpage->list, cpu_buffer->pages); 1404 head_page_with_bit = (struct list_head *)
1405 ((unsigned long)head_page | RB_PAGE_HEAD);
1406
1407 last_page->next = head_page_with_bit;
1408 first_page->prev = prev_page;
1409
1410 r = cmpxchg(&prev_page->next, head_page_with_bit, first_page);
1411
1412 if (r == head_page_with_bit) {
1413 /*
 1414 * We replaced the head link with our new list; now we
 1415 * just have to update the head page's prev pointer to
 1416 * point to the end of the new list.
1417 */
1418 head_page->prev = last_page;
1419 success = 1;
1420 break;
1421 }
1271 } 1422 }
1272 rb_reset_cpu(cpu_buffer);
1273 rb_check_pages(cpu_buffer);
1274 1423
1275out: 1424 if (success)
1425 INIT_LIST_HEAD(pages);
1426 /*
1427 * If we weren't successful in adding in new pages, warn and stop
1428 * tracing
1429 */
1430 RB_WARN_ON(cpu_buffer, !success);
1276 raw_spin_unlock_irq(&cpu_buffer->reader_lock); 1431 raw_spin_unlock_irq(&cpu_buffer->reader_lock);
1432
1433 /* free pages if they weren't inserted */
1434 if (!success) {
1435 struct buffer_page *bpage, *tmp;
1436 list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages,
1437 list) {
1438 list_del_init(&bpage->list);
1439 free_buffer_page(bpage);
1440 }
1441 }
1442 return success;
1443}
1444
1445static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer)
1446{
1447 int success;
1448
1449 if (cpu_buffer->nr_pages_to_update > 0)
1450 success = rb_insert_pages(cpu_buffer);
1451 else
1452 success = rb_remove_pages(cpu_buffer,
1453 -cpu_buffer->nr_pages_to_update);
1454
1455 if (success)
1456 cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update;
1457}
1458
1459static void update_pages_handler(struct work_struct *work)
1460{
1461 struct ring_buffer_per_cpu *cpu_buffer = container_of(work,
1462 struct ring_buffer_per_cpu, update_pages_work);
1463 rb_update_pages(cpu_buffer);
1464 complete(&cpu_buffer->update_done);
1277} 1465}
1278 1466
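A minimal user-space sketch of the splice-before-head step described in the comment above. All names here are hypothetical; the real rb_insert_pages() additionally tags the head pointer with RB_PAGE_HEAD in its low bit and retries up to ten times while the writer may still be moving the head page.

#include <stdatomic.h>
#include <stdio.h>

struct node {
	_Atomic(struct node *) next;
	struct node *prev;
};

/* Link the chain [first..last] in front of head; returns 1 on success. */
static int splice_before_head(struct node *head, struct node *first,
			      struct node *last)
{
	struct node *prev = head->prev;
	struct node *expected = head;

	/* 1. Point the new chain at its future neighbours before publishing. */
	last->next = head;
	first->prev = prev;

	/* 2. Publish: atomically swing prev->next from head to the new chain.
	 *    If a concurrent writer already moved the head, the compare fails
	 *    and the caller retries, as rb_insert_pages() does. */
	if (!atomic_compare_exchange_strong(&prev->next, &expected, first))
		return 0;

	/* 3. Only now fix up the back link (step 3 in the comment above). */
	head->prev = last;
	return 1;
}

int main(void)
{
	struct node a, b, n1, n2;

	/* a <-> b circular list; splice the chain n1..n2 in front of b. */
	a.next = &b;  a.prev = &b;
	b.next = &a;  b.prev = &a;
	n1.next = &n2; n2.prev = &n1;	/* internal links of the new chain */
	n1.prev = NULL; n2.next = NULL;	/* overwritten by the splice */

	printf("spliced: %d\n", splice_before_head(&b, &n1, &n2));
	return 0;
}

The key property is that other threads only ever observe either the old prev->next or the fully prepared new chain, never a half-linked state.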
1279/** 1467/**
@@ -1283,16 +1471,14 @@ out:
1283 * 1471 *
1284 * Minimum size is 2 * BUF_PAGE_SIZE. 1472 * Minimum size is 2 * BUF_PAGE_SIZE.
1285 * 1473 *
1286 * Returns -1 on failure. 1474 * Returns 0 on success and < 0 on failure.
1287 */ 1475 */
1288int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) 1476int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1477 int cpu_id)
1289{ 1478{
1290 struct ring_buffer_per_cpu *cpu_buffer; 1479 struct ring_buffer_per_cpu *cpu_buffer;
1291 unsigned nr_pages, rm_pages, new_pages; 1480 unsigned nr_pages;
1292 struct buffer_page *bpage, *tmp; 1481 int cpu, err = 0;
1293 unsigned long buffer_size;
1294 LIST_HEAD(pages);
1295 int i, cpu;
1296 1482
1297 /* 1483 /*
1298 * Always succeed at resizing a non-existent buffer: 1484 * Always succeed at resizing a non-existent buffer:
@@ -1302,113 +1488,154 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1302 1488
1303 size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 1489 size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
1304 size *= BUF_PAGE_SIZE; 1490 size *= BUF_PAGE_SIZE;
1305 buffer_size = buffer->pages * BUF_PAGE_SIZE;
1306 1491
1307 /* we need a minimum of two pages */ 1492 /* we need a minimum of two pages */
1308 if (size < BUF_PAGE_SIZE * 2) 1493 if (size < BUF_PAGE_SIZE * 2)
1309 size = BUF_PAGE_SIZE * 2; 1494 size = BUF_PAGE_SIZE * 2;
1310 1495
1311 if (size == buffer_size) 1496 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
1312 return size;
1313
1314 atomic_inc(&buffer->record_disabled);
1315 1497
1316 /* Make sure all writers are done with this buffer. */ 1498 /*
1317 synchronize_sched(); 1499 * Don't succeed if resizing is disabled, as a reader might be
1500 * manipulating the ring buffer and is expecting a sane state while
1501 * this is true.
1502 */
1503 if (atomic_read(&buffer->resize_disabled))
1504 return -EBUSY;
1318 1505
1506 /* prevent another thread from changing buffer sizes */
1319 mutex_lock(&buffer->mutex); 1507 mutex_lock(&buffer->mutex);
1320 get_online_cpus();
1321
1322 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
1323 1508
1324 if (size < buffer_size) { 1509 if (cpu_id == RING_BUFFER_ALL_CPUS) {
1510 /* calculate the pages to update */
1511 for_each_buffer_cpu(buffer, cpu) {
1512 cpu_buffer = buffer->buffers[cpu];
1325 1513
1326 /* easy case, just free pages */ 1514 cpu_buffer->nr_pages_to_update = nr_pages -
1327 if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) 1515 cpu_buffer->nr_pages;
1328 goto out_fail; 1516 /*
1517 * nothing more to do for removing pages or no update
1518 */
1519 if (cpu_buffer->nr_pages_to_update <= 0)
1520 continue;
1521 /*
1522 * to add pages, make sure all new pages can be
1523 * allocated without receiving ENOMEM
1524 */
1525 INIT_LIST_HEAD(&cpu_buffer->new_pages);
1526 if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update,
1527 &cpu_buffer->new_pages, cpu)) {
1528 /* not enough memory for new pages */
1529 err = -ENOMEM;
1530 goto out_err;
1531 }
1532 }
1329 1533
1330 rm_pages = buffer->pages - nr_pages; 1534 get_online_cpus();
1535 /*
1536 * Fire off all the required work handlers
1537 * We can't schedule on offline CPUs, but it's not necessary
1538 * since we can change their buffer sizes without any race.
1539 */
1540 for_each_buffer_cpu(buffer, cpu) {
1541 cpu_buffer = buffer->buffers[cpu];
1542 if (!cpu_buffer->nr_pages_to_update)
1543 continue;
1544
1545 if (cpu_online(cpu))
1546 schedule_work_on(cpu,
1547 &cpu_buffer->update_pages_work);
1548 else
1549 rb_update_pages(cpu_buffer);
1550 }
1331 1551
1552 /* wait for all the updates to complete */
1332 for_each_buffer_cpu(buffer, cpu) { 1553 for_each_buffer_cpu(buffer, cpu) {
1333 cpu_buffer = buffer->buffers[cpu]; 1554 cpu_buffer = buffer->buffers[cpu];
1334 rb_remove_pages(cpu_buffer, rm_pages); 1555 if (!cpu_buffer->nr_pages_to_update)
1556 continue;
1557
1558 if (cpu_online(cpu))
1559 wait_for_completion(&cpu_buffer->update_done);
1560 cpu_buffer->nr_pages_to_update = 0;
1335 } 1561 }
1336 goto out;
1337 }
1338 1562
1339 /* 1563 put_online_cpus();
1340 * This is a bit more difficult. We only want to add pages 1564 } else {
1341 * when we can allocate enough for all CPUs. We do this 1565 cpu_buffer = buffer->buffers[cpu_id];
1342 * by allocating all the pages and storing them on a local
1343 * link list. If we succeed in our allocation, then we
1344 * add these pages to the cpu_buffers. Otherwise we just free
1345 * them all and return -ENOMEM;
1346 */
1347 if (RB_WARN_ON(buffer, nr_pages <= buffer->pages))
1348 goto out_fail;
1349 1566
1350 new_pages = nr_pages - buffer->pages; 1567 if (nr_pages == cpu_buffer->nr_pages)
1568 goto out;
1351 1569
1352 for_each_buffer_cpu(buffer, cpu) { 1570 cpu_buffer->nr_pages_to_update = nr_pages -
1353 for (i = 0; i < new_pages; i++) { 1571 cpu_buffer->nr_pages;
1354 struct page *page; 1572
1355 /* 1573 INIT_LIST_HEAD(&cpu_buffer->new_pages);
1356 * __GFP_NORETRY flag makes sure that the allocation 1574 if (cpu_buffer->nr_pages_to_update > 0 &&
1357 * fails gracefully without invoking oom-killer and 1575 __rb_allocate_pages(cpu_buffer->nr_pages_to_update,
1358 * the system is not destabilized. 1576 &cpu_buffer->new_pages, cpu_id)) {
1359 */ 1577 err = -ENOMEM;
1360 bpage = kzalloc_node(ALIGN(sizeof(*bpage), 1578 goto out_err;
1361 cache_line_size()),
1362 GFP_KERNEL | __GFP_NORETRY,
1363 cpu_to_node(cpu));
1364 if (!bpage)
1365 goto free_pages;
1366 list_add(&bpage->list, &pages);
1367 page = alloc_pages_node(cpu_to_node(cpu),
1368 GFP_KERNEL | __GFP_NORETRY, 0);
1369 if (!page)
1370 goto free_pages;
1371 bpage->page = page_address(page);
1372 rb_init_page(bpage->page);
1373 } 1579 }
1374 }
1375 1580
1376 for_each_buffer_cpu(buffer, cpu) { 1581 get_online_cpus();
1377 cpu_buffer = buffer->buffers[cpu];
1378 rb_insert_pages(cpu_buffer, &pages, new_pages);
1379 }
1380 1582
1381 if (RB_WARN_ON(buffer, !list_empty(&pages))) 1583 if (cpu_online(cpu_id)) {
1382 goto out_fail; 1584 schedule_work_on(cpu_id,
1585 &cpu_buffer->update_pages_work);
1586 wait_for_completion(&cpu_buffer->update_done);
1587 } else
1588 rb_update_pages(cpu_buffer);
1589
1590 cpu_buffer->nr_pages_to_update = 0;
1591 put_online_cpus();
1592 }
1383 1593
1384 out: 1594 out:
1385 buffer->pages = nr_pages; 1595 /*
1386 put_online_cpus(); 1596 * The ring buffer resize can happen with the ring buffer
1597 * enabled, so that the update disturbs the tracing as little
1598 * as possible. But if the buffer is disabled, we do not need
1599 * to worry about that, and we can take the time to verify
1600 * that the buffer is not corrupt.
1601 */
1602 if (atomic_read(&buffer->record_disabled)) {
1603 atomic_inc(&buffer->record_disabled);
1604 /*
1605 * Even though the buffer was disabled, we must make sure
1606 * that it is truly disabled before calling rb_check_pages.
1607 * There could have been a race between checking
1608 * record_disable and incrementing it.
1609 */
1610 synchronize_sched();
1611 for_each_buffer_cpu(buffer, cpu) {
1612 cpu_buffer = buffer->buffers[cpu];
1613 rb_check_pages(cpu_buffer);
1614 }
1615 atomic_dec(&buffer->record_disabled);
1616 }
1617
1387 mutex_unlock(&buffer->mutex); 1618 mutex_unlock(&buffer->mutex);
1619 return size;
1388 1620
1389 atomic_dec(&buffer->record_disabled); 1621 out_err:
1622 for_each_buffer_cpu(buffer, cpu) {
1623 struct buffer_page *bpage, *tmp;
1390 1624
1391 return size; 1625 cpu_buffer = buffer->buffers[cpu];
1626 cpu_buffer->nr_pages_to_update = 0;
1392 1627
1393 free_pages: 1628 if (list_empty(&cpu_buffer->new_pages))
1394 list_for_each_entry_safe(bpage, tmp, &pages, list) { 1629 continue;
1395 list_del_init(&bpage->list);
1396 free_buffer_page(bpage);
1397 }
1398 put_online_cpus();
1399 mutex_unlock(&buffer->mutex);
1400 atomic_dec(&buffer->record_disabled);
1401 return -ENOMEM;
1402 1630
1403 /* 1631 list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages,
1404 * Something went totally wrong, and we are too paranoid 1632 list) {
1405 * to even clean up the mess. 1633 list_del_init(&bpage->list);
1406 */ 1634 free_buffer_page(bpage);
1407 out_fail: 1635 }
1408 put_online_cpus(); 1636 }
1409 mutex_unlock(&buffer->mutex); 1637 mutex_unlock(&buffer->mutex);
1410 atomic_dec(&buffer->record_disabled); 1638 return err;
1411 return -1;
1412} 1639}
1413EXPORT_SYMBOL_GPL(ring_buffer_resize); 1640EXPORT_SYMBOL_GPL(ring_buffer_resize);
1414 1641
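The per-CPU update above follows a common fan-out pattern: queue a work item on each online CPU so the resize runs locally, update offline CPUs directly (nothing can race there), then wait for every completion. A condensed sketch of that pattern with hypothetical names, using the 3.x-era get_online_cpus()/put_online_cpus() hotplug API:

#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/completion.h>
#include <linux/percpu.h>
#include <linux/cpu.h>

struct cpu_update {
	struct work_struct	work;
	struct completion	done;
	int			cpu;
};

static DEFINE_PER_CPU(struct cpu_update, cpu_updates);

static void do_update(int cpu)
{
	/* the actual per-cpu buffer manipulation would go here */
}

static void update_handler(struct work_struct *work)
{
	struct cpu_update *u = container_of(work, struct cpu_update, work);

	do_update(u->cpu);
	complete(&u->done);
}

static void update_all_cpus(void)
{
	int cpu;

	get_online_cpus();		/* keep the online mask stable */

	for_each_possible_cpu(cpu) {
		struct cpu_update *u = &per_cpu(cpu_updates, cpu);

		u->cpu = cpu;
		INIT_WORK(&u->work, update_handler);
		init_completion(&u->done);

		if (cpu_online(cpu))
			schedule_work_on(cpu, &u->work);
		else
			do_update(cpu);	/* offline CPU: no writer to race with */
	}

	for_each_possible_cpu(cpu)
		if (cpu_online(cpu))
			wait_for_completion(&per_cpu(cpu_updates, cpu).done);

	put_online_cpus();
}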
@@ -1447,21 +1674,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
1447 return __rb_page_index(iter->head_page, iter->head); 1674 return __rb_page_index(iter->head_page, iter->head);
1448} 1675}
1449 1676
1450static inline unsigned long rb_page_write(struct buffer_page *bpage)
1451{
1452 return local_read(&bpage->write) & RB_WRITE_MASK;
1453}
1454
1455static inline unsigned rb_page_commit(struct buffer_page *bpage) 1677static inline unsigned rb_page_commit(struct buffer_page *bpage)
1456{ 1678{
1457 return local_read(&bpage->page->commit); 1679 return local_read(&bpage->page->commit);
1458} 1680}
1459 1681
1460static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1461{
1462 return local_read(&bpage->entries) & RB_WRITE_MASK;
1463}
1464
1465/* Size is determined by what has been committed */ 1682/* Size is determined by what has been committed */
1466static inline unsigned rb_page_size(struct buffer_page *bpage) 1683static inline unsigned rb_page_size(struct buffer_page *bpage)
1467{ 1684{
@@ -1510,7 +1727,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1510 * assign the commit to the tail. 1727 * assign the commit to the tail.
1511 */ 1728 */
1512 again: 1729 again:
1513 max_count = cpu_buffer->buffer->pages * 100; 1730 max_count = cpu_buffer->nr_pages * 100;
1514 1731
1515 while (cpu_buffer->commit_page != cpu_buffer->tail_page) { 1732 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
1516 if (RB_WARN_ON(cpu_buffer, !(--max_count))) 1733 if (RB_WARN_ON(cpu_buffer, !(--max_count)))
@@ -3486,6 +3703,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
3486 3703
3487 iter->cpu_buffer = cpu_buffer; 3704 iter->cpu_buffer = cpu_buffer;
3488 3705
3706 atomic_inc(&buffer->resize_disabled);
3489 atomic_inc(&cpu_buffer->record_disabled); 3707 atomic_inc(&cpu_buffer->record_disabled);
3490 3708
3491 return iter; 3709 return iter;
@@ -3548,7 +3766,14 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter)
3548{ 3766{
3549 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3767 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3550 3768
3769 /*
3770 * Ring buffer is disabled from recording, here's a good place
3771 * to check the integrity of the ring buffer.
3772 */
3773 rb_check_pages(cpu_buffer);
3774
3551 atomic_dec(&cpu_buffer->record_disabled); 3775 atomic_dec(&cpu_buffer->record_disabled);
3776 atomic_dec(&cpu_buffer->buffer->resize_disabled);
3552 kfree(iter); 3777 kfree(iter);
3553} 3778}
3554EXPORT_SYMBOL_GPL(ring_buffer_read_finish); 3779EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
@@ -3588,9 +3813,18 @@ EXPORT_SYMBOL_GPL(ring_buffer_read);
3588 * ring_buffer_size - return the size of the ring buffer (in bytes) 3813 * ring_buffer_size - return the size of the ring buffer (in bytes)
3589 * @buffer: The ring buffer. 3814 * @buffer: The ring buffer.
3590 */ 3815 */
3591unsigned long ring_buffer_size(struct ring_buffer *buffer) 3816unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu)
3592{ 3817{
3593 return BUF_PAGE_SIZE * buffer->pages; 3818 /*
3819 * Earlier, this method returned
3820 * BUF_PAGE_SIZE * buffer->nr_pages
3821 * Since the nr_pages field is now removed, we have converted this to
3822 * return the per cpu buffer value.
3823 */
3824 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3825 return 0;
3826
3827 return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;
3594} 3828}
3595EXPORT_SYMBOL_GPL(ring_buffer_size); 3829EXPORT_SYMBOL_GPL(ring_buffer_size);
3596 3830
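Since ring_buffer_size() now reports a single CPU, a caller that wants the old whole-buffer figure has to sum the per-CPU values itself, much as tracing_total_entries_read() does further down in trace.c. A hypothetical helper:

#include <linux/ring_buffer.h>
#include <linux/cpumask.h>

static unsigned long total_ring_buffer_size(struct ring_buffer *buffer)
{
	unsigned long total = 0;
	int cpu;

	/* ring_buffer_size() returns 0 for CPUs the buffer does not cover */
	for_each_online_cpu(cpu)
		total += ring_buffer_size(buffer, cpu);

	return total;
}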
@@ -3611,6 +3845,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3611 cpu_buffer->commit_page = cpu_buffer->head_page; 3845 cpu_buffer->commit_page = cpu_buffer->head_page;
3612 3846
3613 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 3847 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
3848 INIT_LIST_HEAD(&cpu_buffer->new_pages);
3614 local_set(&cpu_buffer->reader_page->write, 0); 3849 local_set(&cpu_buffer->reader_page->write, 0);
3615 local_set(&cpu_buffer->reader_page->entries, 0); 3850 local_set(&cpu_buffer->reader_page->entries, 0);
3616 local_set(&cpu_buffer->reader_page->page->commit, 0); 3851 local_set(&cpu_buffer->reader_page->page->commit, 0);
@@ -3647,8 +3882,12 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3647 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 3882 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3648 return; 3883 return;
3649 3884
3885 atomic_inc(&buffer->resize_disabled);
3650 atomic_inc(&cpu_buffer->record_disabled); 3886 atomic_inc(&cpu_buffer->record_disabled);
3651 3887
3888 /* Make sure all commits have finished */
3889 synchronize_sched();
3890
3652 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3891 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3653 3892
3654 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) 3893 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
@@ -3664,6 +3903,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3664 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3903 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3665 3904
3666 atomic_dec(&cpu_buffer->record_disabled); 3905 atomic_dec(&cpu_buffer->record_disabled);
3906 atomic_dec(&buffer->resize_disabled);
3667} 3907}
3668EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); 3908EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
3669 3909
@@ -3765,8 +4005,11 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
3765 !cpumask_test_cpu(cpu, buffer_b->cpumask)) 4005 !cpumask_test_cpu(cpu, buffer_b->cpumask))
3766 goto out; 4006 goto out;
3767 4007
4008 cpu_buffer_a = buffer_a->buffers[cpu];
4009 cpu_buffer_b = buffer_b->buffers[cpu];
4010
3768 /* At least make sure the two buffers are somewhat the same */ 4011 /* At least make sure the two buffers are somewhat the same */
3769 if (buffer_a->pages != buffer_b->pages) 4012 if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
3770 goto out; 4013 goto out;
3771 4014
3772 ret = -EAGAIN; 4015 ret = -EAGAIN;
@@ -3780,9 +4023,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
3780 if (atomic_read(&buffer_b->record_disabled)) 4023 if (atomic_read(&buffer_b->record_disabled))
3781 goto out; 4024 goto out;
3782 4025
3783 cpu_buffer_a = buffer_a->buffers[cpu];
3784 cpu_buffer_b = buffer_b->buffers[cpu];
3785
3786 if (atomic_read(&cpu_buffer_a->record_disabled)) 4026 if (atomic_read(&cpu_buffer_a->record_disabled))
3787 goto out; 4027 goto out;
3788 4028
@@ -4071,6 +4311,8 @@ static int rb_cpu_notify(struct notifier_block *self,
4071 struct ring_buffer *buffer = 4311 struct ring_buffer *buffer =
4072 container_of(self, struct ring_buffer, cpu_notify); 4312 container_of(self, struct ring_buffer, cpu_notify);
4073 long cpu = (long)hcpu; 4313 long cpu = (long)hcpu;
4314 int cpu_i, nr_pages_same;
4315 unsigned int nr_pages;
4074 4316
4075 switch (action) { 4317 switch (action) {
4076 case CPU_UP_PREPARE: 4318 case CPU_UP_PREPARE:
@@ -4078,8 +4320,23 @@ static int rb_cpu_notify(struct notifier_block *self,
4078 if (cpumask_test_cpu(cpu, buffer->cpumask)) 4320 if (cpumask_test_cpu(cpu, buffer->cpumask))
4079 return NOTIFY_OK; 4321 return NOTIFY_OK;
4080 4322
4323 nr_pages = 0;
4324 nr_pages_same = 1;
4325 /* check if all cpu sizes are same */
4326 for_each_buffer_cpu(buffer, cpu_i) {
4327 /* fill in the size from first enabled cpu */
4328 if (nr_pages == 0)
4329 nr_pages = buffer->buffers[cpu_i]->nr_pages;
4330 if (nr_pages != buffer->buffers[cpu_i]->nr_pages) {
4331 nr_pages_same = 0;
4332 break;
4333 }
4334 }
4335 /* allocate minimum pages, user can later expand it */
4336 if (!nr_pages_same)
4337 nr_pages = 2;
4081 buffer->buffers[cpu] = 4338 buffer->buffers[cpu] =
4082 rb_allocate_cpu_buffer(buffer, cpu); 4339 rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
4083 if (!buffer->buffers[cpu]) { 4340 if (!buffer->buffers[cpu]) {
4084 WARN(1, "failed to allocate ring buffer on CPU %ld\n", 4341 WARN(1, "failed to allocate ring buffer on CPU %ld\n",
4085 cpu); 4342 cpu);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2a22255c1010..68032c6177db 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -87,18 +87,6 @@ static int tracing_disabled = 1;
87 87
88DEFINE_PER_CPU(int, ftrace_cpu_disabled); 88DEFINE_PER_CPU(int, ftrace_cpu_disabled);
89 89
90static inline void ftrace_disable_cpu(void)
91{
92 preempt_disable();
93 __this_cpu_inc(ftrace_cpu_disabled);
94}
95
96static inline void ftrace_enable_cpu(void)
97{
98 __this_cpu_dec(ftrace_cpu_disabled);
99 preempt_enable();
100}
101
102cpumask_var_t __read_mostly tracing_buffer_mask; 90cpumask_var_t __read_mostly tracing_buffer_mask;
103 91
104/* 92/*
@@ -629,7 +617,6 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
629static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) 617static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
630{ 618{
631 int len; 619 int len;
632 void *ret;
633 620
634 if (s->len <= s->readpos) 621 if (s->len <= s->readpos)
635 return -EBUSY; 622 return -EBUSY;
@@ -637,9 +624,7 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
637 len = s->len - s->readpos; 624 len = s->len - s->readpos;
638 if (cnt > len) 625 if (cnt > len)
639 cnt = len; 626 cnt = len;
640 ret = memcpy(buf, s->buffer + s->readpos, cnt); 627 memcpy(buf, s->buffer + s->readpos, cnt);
641 if (!ret)
642 return -EFAULT;
643 628
644 s->readpos += cnt; 629 s->readpos += cnt;
645 return cnt; 630 return cnt;
@@ -751,8 +736,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
751 736
752 arch_spin_lock(&ftrace_max_lock); 737 arch_spin_lock(&ftrace_max_lock);
753 738
754 ftrace_disable_cpu();
755
756 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); 739 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
757 740
758 if (ret == -EBUSY) { 741 if (ret == -EBUSY) {
@@ -766,8 +749,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
766 "Failed to swap buffers due to commit in progress\n"); 749 "Failed to swap buffers due to commit in progress\n");
767 } 750 }
768 751
769 ftrace_enable_cpu();
770
771 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); 752 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
772 753
773 __update_max_tr(tr, tsk, cpu); 754 __update_max_tr(tr, tsk, cpu);
@@ -782,8 +763,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
782 * Register a new plugin tracer. 763 * Register a new plugin tracer.
783 */ 764 */
784int register_tracer(struct tracer *type) 765int register_tracer(struct tracer *type)
785__releases(kernel_lock)
786__acquires(kernel_lock)
787{ 766{
788 struct tracer *t; 767 struct tracer *t;
789 int ret = 0; 768 int ret = 0;
@@ -841,7 +820,8 @@ __acquires(kernel_lock)
841 820
842 /* If we expanded the buffers, make sure the max is expanded too */ 821 /* If we expanded the buffers, make sure the max is expanded too */
843 if (ring_buffer_expanded && type->use_max_tr) 822 if (ring_buffer_expanded && type->use_max_tr)
844 ring_buffer_resize(max_tr.buffer, trace_buf_size); 823 ring_buffer_resize(max_tr.buffer, trace_buf_size,
824 RING_BUFFER_ALL_CPUS);
845 825
846 /* the test is responsible for initializing and enabling */ 826 /* the test is responsible for initializing and enabling */
847 pr_info("Testing tracer %s: ", type->name); 827 pr_info("Testing tracer %s: ", type->name);
@@ -857,7 +837,8 @@ __acquires(kernel_lock)
857 837
858 /* Shrink the max buffer again */ 838 /* Shrink the max buffer again */
859 if (ring_buffer_expanded && type->use_max_tr) 839 if (ring_buffer_expanded && type->use_max_tr)
860 ring_buffer_resize(max_tr.buffer, 1); 840 ring_buffer_resize(max_tr.buffer, 1,
841 RING_BUFFER_ALL_CPUS);
861 842
862 printk(KERN_CONT "PASSED\n"); 843 printk(KERN_CONT "PASSED\n");
863 } 844 }
@@ -917,13 +898,6 @@ out:
917 mutex_unlock(&trace_types_lock); 898 mutex_unlock(&trace_types_lock);
918} 899}
919 900
920static void __tracing_reset(struct ring_buffer *buffer, int cpu)
921{
922 ftrace_disable_cpu();
923 ring_buffer_reset_cpu(buffer, cpu);
924 ftrace_enable_cpu();
925}
926
927void tracing_reset(struct trace_array *tr, int cpu) 901void tracing_reset(struct trace_array *tr, int cpu)
928{ 902{
929 struct ring_buffer *buffer = tr->buffer; 903 struct ring_buffer *buffer = tr->buffer;
@@ -932,7 +906,7 @@ void tracing_reset(struct trace_array *tr, int cpu)
932 906
933 /* Make sure all commits have finished */ 907 /* Make sure all commits have finished */
934 synchronize_sched(); 908 synchronize_sched();
935 __tracing_reset(buffer, cpu); 909 ring_buffer_reset_cpu(buffer, cpu);
936 910
937 ring_buffer_record_enable(buffer); 911 ring_buffer_record_enable(buffer);
938} 912}
@@ -950,7 +924,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
950 tr->time_start = ftrace_now(tr->cpu); 924 tr->time_start = ftrace_now(tr->cpu);
951 925
952 for_each_online_cpu(cpu) 926 for_each_online_cpu(cpu)
953 __tracing_reset(buffer, cpu); 927 ring_buffer_reset_cpu(buffer, cpu);
954 928
955 ring_buffer_record_enable(buffer); 929 ring_buffer_record_enable(buffer);
956} 930}
@@ -1498,25 +1472,119 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
1498 1472
1499#endif /* CONFIG_STACKTRACE */ 1473#endif /* CONFIG_STACKTRACE */
1500 1474
1475/* created for use with alloc_percpu */
1476struct trace_buffer_struct {
1477 char buffer[TRACE_BUF_SIZE];
1478};
1479
1480static struct trace_buffer_struct *trace_percpu_buffer;
1481static struct trace_buffer_struct *trace_percpu_sirq_buffer;
1482static struct trace_buffer_struct *trace_percpu_irq_buffer;
1483static struct trace_buffer_struct *trace_percpu_nmi_buffer;
1484
1485/*
1486 * The buffer used is dependent on the context. There is a per cpu
 1487 * buffer for normal context, softirq context, hard irq context and
 1488 * for NMI context. This allows for lockless recording.
1489 *
1490 * Note, if the buffers failed to be allocated, then this returns NULL
1491 */
1492static char *get_trace_buf(void)
1493{
1494 struct trace_buffer_struct *percpu_buffer;
1495 struct trace_buffer_struct *buffer;
1496
1497 /*
1498 * If we have allocated per cpu buffers, then we do not
1499 * need to do any locking.
1500 */
1501 if (in_nmi())
1502 percpu_buffer = trace_percpu_nmi_buffer;
1503 else if (in_irq())
1504 percpu_buffer = trace_percpu_irq_buffer;
1505 else if (in_softirq())
1506 percpu_buffer = trace_percpu_sirq_buffer;
1507 else
1508 percpu_buffer = trace_percpu_buffer;
1509
1510 if (!percpu_buffer)
1511 return NULL;
1512
1513 buffer = per_cpu_ptr(percpu_buffer, smp_processor_id());
1514
1515 return buffer->buffer;
1516}
1517
1518static int alloc_percpu_trace_buffer(void)
1519{
1520 struct trace_buffer_struct *buffers;
1521 struct trace_buffer_struct *sirq_buffers;
1522 struct trace_buffer_struct *irq_buffers;
1523 struct trace_buffer_struct *nmi_buffers;
1524
1525 buffers = alloc_percpu(struct trace_buffer_struct);
1526 if (!buffers)
1527 goto err_warn;
1528
1529 sirq_buffers = alloc_percpu(struct trace_buffer_struct);
1530 if (!sirq_buffers)
1531 goto err_sirq;
1532
1533 irq_buffers = alloc_percpu(struct trace_buffer_struct);
1534 if (!irq_buffers)
1535 goto err_irq;
1536
1537 nmi_buffers = alloc_percpu(struct trace_buffer_struct);
1538 if (!nmi_buffers)
1539 goto err_nmi;
1540
1541 trace_percpu_buffer = buffers;
1542 trace_percpu_sirq_buffer = sirq_buffers;
1543 trace_percpu_irq_buffer = irq_buffers;
1544 trace_percpu_nmi_buffer = nmi_buffers;
1545
1546 return 0;
1547
1548 err_nmi:
1549 free_percpu(irq_buffers);
1550 err_irq:
1551 free_percpu(sirq_buffers);
1552 err_sirq:
1553 free_percpu(buffers);
1554 err_warn:
1555 WARN(1, "Could not allocate percpu trace_printk buffer");
1556 return -ENOMEM;
1557}
1558
1559void trace_printk_init_buffers(void)
1560{
1561 static int buffers_allocated;
1562
1563 if (buffers_allocated)
1564 return;
1565
1566 if (alloc_percpu_trace_buffer())
1567 return;
1568
1569 pr_info("ftrace: Allocated trace_printk buffers\n");
1570
1571 buffers_allocated = 1;
1572}
1573
1501/** 1574/**
1502 * trace_vbprintk - write binary msg to tracing buffer 1575 * trace_vbprintk - write binary msg to tracing buffer
1503 * 1576 *
1504 */ 1577 */
1505int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) 1578int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1506{ 1579{
1507 static arch_spinlock_t trace_buf_lock =
1508 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1509 static u32 trace_buf[TRACE_BUF_SIZE];
1510
1511 struct ftrace_event_call *call = &event_bprint; 1580 struct ftrace_event_call *call = &event_bprint;
1512 struct ring_buffer_event *event; 1581 struct ring_buffer_event *event;
1513 struct ring_buffer *buffer; 1582 struct ring_buffer *buffer;
1514 struct trace_array *tr = &global_trace; 1583 struct trace_array *tr = &global_trace;
1515 struct trace_array_cpu *data;
1516 struct bprint_entry *entry; 1584 struct bprint_entry *entry;
1517 unsigned long flags; 1585 unsigned long flags;
1518 int disable; 1586 char *tbuffer;
1519 int cpu, len = 0, size, pc; 1587 int len = 0, size, pc;
1520 1588
1521 if (unlikely(tracing_selftest_running || tracing_disabled)) 1589 if (unlikely(tracing_selftest_running || tracing_disabled))
1522 return 0; 1590 return 0;
@@ -1526,43 +1594,36 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1526 1594
1527 pc = preempt_count(); 1595 pc = preempt_count();
1528 preempt_disable_notrace(); 1596 preempt_disable_notrace();
1529 cpu = raw_smp_processor_id();
1530 data = tr->data[cpu];
1531 1597
1532 disable = atomic_inc_return(&data->disabled); 1598 tbuffer = get_trace_buf();
1533 if (unlikely(disable != 1)) 1599 if (!tbuffer) {
1600 len = 0;
1534 goto out; 1601 goto out;
1602 }
1535 1603
1536 /* Lockdep uses trace_printk for lock tracing */ 1604 len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args);
1537 local_irq_save(flags);
1538 arch_spin_lock(&trace_buf_lock);
1539 len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1540 1605
1541 if (len > TRACE_BUF_SIZE || len < 0) 1606 if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0)
1542 goto out_unlock; 1607 goto out;
1543 1608
1609 local_save_flags(flags);
1544 size = sizeof(*entry) + sizeof(u32) * len; 1610 size = sizeof(*entry) + sizeof(u32) * len;
1545 buffer = tr->buffer; 1611 buffer = tr->buffer;
1546 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, 1612 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
1547 flags, pc); 1613 flags, pc);
1548 if (!event) 1614 if (!event)
1549 goto out_unlock; 1615 goto out;
1550 entry = ring_buffer_event_data(event); 1616 entry = ring_buffer_event_data(event);
1551 entry->ip = ip; 1617 entry->ip = ip;
1552 entry->fmt = fmt; 1618 entry->fmt = fmt;
1553 1619
1554 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1620 memcpy(entry->buf, tbuffer, sizeof(u32) * len);
1555 if (!filter_check_discard(call, entry, buffer, event)) { 1621 if (!filter_check_discard(call, entry, buffer, event)) {
1556 ring_buffer_unlock_commit(buffer, event); 1622 ring_buffer_unlock_commit(buffer, event);
1557 ftrace_trace_stack(buffer, flags, 6, pc); 1623 ftrace_trace_stack(buffer, flags, 6, pc);
1558 } 1624 }
1559 1625
1560out_unlock:
1561 arch_spin_unlock(&trace_buf_lock);
1562 local_irq_restore(flags);
1563
1564out: 1626out:
1565 atomic_dec_return(&data->disabled);
1566 preempt_enable_notrace(); 1627 preempt_enable_notrace();
1567 unpause_graph_tracing(); 1628 unpause_graph_tracing();
1568 1629
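trace_vbprintk() leans on the binary-printf pair from lib/vsprintf.c (built when CONFIG_BINARY_PRINTF is set): vbin_printf() packs the varargs into a u32 array at trace time, and bstr_printf() expands them against the same format string only when the event is read. A rough sketch of that pairing as a hypothetical helper; note that the buffer size and the return value are counted in u32 words, which is why the bound check above uses TRACE_BUF_SIZE/sizeof(int):

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>	/* vbin_printf(), bstr_printf() */

static int pack_and_render(char *out, size_t outlen, const char *fmt, ...)
{
	u32 bin[64];
	va_list ap;
	int words;

	va_start(ap, fmt);
	words = vbin_printf(bin, ARRAY_SIZE(bin), fmt, ap);	/* size in words */
	va_end(ap);

	if (words < 0 || words > (int)ARRAY_SIZE(bin))
		return -E2BIG;

	/* later, e.g. at read time: expand the packed args with the same fmt */
	return bstr_printf(out, outlen, fmt, bin);
}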
@@ -1588,58 +1649,53 @@ int trace_array_printk(struct trace_array *tr,
1588int trace_array_vprintk(struct trace_array *tr, 1649int trace_array_vprintk(struct trace_array *tr,
1589 unsigned long ip, const char *fmt, va_list args) 1650 unsigned long ip, const char *fmt, va_list args)
1590{ 1651{
1591 static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED;
1592 static char trace_buf[TRACE_BUF_SIZE];
1593
1594 struct ftrace_event_call *call = &event_print; 1652 struct ftrace_event_call *call = &event_print;
1595 struct ring_buffer_event *event; 1653 struct ring_buffer_event *event;
1596 struct ring_buffer *buffer; 1654 struct ring_buffer *buffer;
1597 struct trace_array_cpu *data; 1655 int len = 0, size, pc;
1598 int cpu, len = 0, size, pc;
1599 struct print_entry *entry; 1656 struct print_entry *entry;
1600 unsigned long irq_flags; 1657 unsigned long flags;
1601 int disable; 1658 char *tbuffer;
1602 1659
1603 if (tracing_disabled || tracing_selftest_running) 1660 if (tracing_disabled || tracing_selftest_running)
1604 return 0; 1661 return 0;
1605 1662
1663 /* Don't pollute graph traces with trace_vprintk internals */
1664 pause_graph_tracing();
1665
1606 pc = preempt_count(); 1666 pc = preempt_count();
1607 preempt_disable_notrace(); 1667 preempt_disable_notrace();
1608 cpu = raw_smp_processor_id();
1609 data = tr->data[cpu];
1610 1668
1611 disable = atomic_inc_return(&data->disabled); 1669
1612 if (unlikely(disable != 1)) 1670 tbuffer = get_trace_buf();
1671 if (!tbuffer) {
1672 len = 0;
1613 goto out; 1673 goto out;
1674 }
1614 1675
1615 pause_graph_tracing(); 1676 len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
1616 raw_local_irq_save(irq_flags); 1677 if (len > TRACE_BUF_SIZE)
1617 arch_spin_lock(&trace_buf_lock); 1678 goto out;
1618 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1619 1679
1680 local_save_flags(flags);
1620 size = sizeof(*entry) + len + 1; 1681 size = sizeof(*entry) + len + 1;
1621 buffer = tr->buffer; 1682 buffer = tr->buffer;
1622 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, 1683 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
1623 irq_flags, pc); 1684 flags, pc);
1624 if (!event) 1685 if (!event)
1625 goto out_unlock; 1686 goto out;
1626 entry = ring_buffer_event_data(event); 1687 entry = ring_buffer_event_data(event);
1627 entry->ip = ip; 1688 entry->ip = ip;
1628 1689
1629 memcpy(&entry->buf, trace_buf, len); 1690 memcpy(&entry->buf, tbuffer, len);
1630 entry->buf[len] = '\0'; 1691 entry->buf[len] = '\0';
1631 if (!filter_check_discard(call, entry, buffer, event)) { 1692 if (!filter_check_discard(call, entry, buffer, event)) {
1632 ring_buffer_unlock_commit(buffer, event); 1693 ring_buffer_unlock_commit(buffer, event);
1633 ftrace_trace_stack(buffer, irq_flags, 6, pc); 1694 ftrace_trace_stack(buffer, flags, 6, pc);
1634 } 1695 }
1635
1636 out_unlock:
1637 arch_spin_unlock(&trace_buf_lock);
1638 raw_local_irq_restore(irq_flags);
1639 unpause_graph_tracing();
1640 out: 1696 out:
1641 atomic_dec_return(&data->disabled);
1642 preempt_enable_notrace(); 1697 preempt_enable_notrace();
1698 unpause_graph_tracing();
1643 1699
1644 return len; 1700 return len;
1645} 1701}
@@ -1652,14 +1708,9 @@ EXPORT_SYMBOL_GPL(trace_vprintk);
1652 1708
1653static void trace_iterator_increment(struct trace_iterator *iter) 1709static void trace_iterator_increment(struct trace_iterator *iter)
1654{ 1710{
1655 /* Don't allow ftrace to trace into the ring buffers */
1656 ftrace_disable_cpu();
1657
1658 iter->idx++; 1711 iter->idx++;
1659 if (iter->buffer_iter[iter->cpu]) 1712 if (iter->buffer_iter[iter->cpu])
1660 ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); 1713 ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
1661
1662 ftrace_enable_cpu();
1663} 1714}
1664 1715
1665static struct trace_entry * 1716static struct trace_entry *
@@ -1669,17 +1720,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1669 struct ring_buffer_event *event; 1720 struct ring_buffer_event *event;
1670 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; 1721 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
1671 1722
1672 /* Don't allow ftrace to trace into the ring buffers */
1673 ftrace_disable_cpu();
1674
1675 if (buf_iter) 1723 if (buf_iter)
1676 event = ring_buffer_iter_peek(buf_iter, ts); 1724 event = ring_buffer_iter_peek(buf_iter, ts);
1677 else 1725 else
1678 event = ring_buffer_peek(iter->tr->buffer, cpu, ts, 1726 event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
1679 lost_events); 1727 lost_events);
1680 1728
1681 ftrace_enable_cpu();
1682
1683 if (event) { 1729 if (event) {
1684 iter->ent_size = ring_buffer_event_length(event); 1730 iter->ent_size = ring_buffer_event_length(event);
1685 return ring_buffer_event_data(event); 1731 return ring_buffer_event_data(event);
@@ -1769,11 +1815,8 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter)
1769 1815
1770static void trace_consume(struct trace_iterator *iter) 1816static void trace_consume(struct trace_iterator *iter)
1771{ 1817{
1772 /* Don't allow ftrace to trace into the ring buffers */
1773 ftrace_disable_cpu();
1774 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, 1818 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
1775 &iter->lost_events); 1819 &iter->lost_events);
1776 ftrace_enable_cpu();
1777} 1820}
1778 1821
1779static void *s_next(struct seq_file *m, void *v, loff_t *pos) 1822static void *s_next(struct seq_file *m, void *v, loff_t *pos)
@@ -1862,16 +1905,12 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1862 iter->cpu = 0; 1905 iter->cpu = 0;
1863 iter->idx = -1; 1906 iter->idx = -1;
1864 1907
1865 ftrace_disable_cpu();
1866
1867 if (cpu_file == TRACE_PIPE_ALL_CPU) { 1908 if (cpu_file == TRACE_PIPE_ALL_CPU) {
1868 for_each_tracing_cpu(cpu) 1909 for_each_tracing_cpu(cpu)
1869 tracing_iter_reset(iter, cpu); 1910 tracing_iter_reset(iter, cpu);
1870 } else 1911 } else
1871 tracing_iter_reset(iter, cpu_file); 1912 tracing_iter_reset(iter, cpu_file);
1872 1913
1873 ftrace_enable_cpu();
1874
1875 iter->leftover = 0; 1914 iter->leftover = 0;
1876 for (p = iter; p && l < *pos; p = s_next(m, p, &l)) 1915 for (p = iter; p && l < *pos; p = s_next(m, p, &l))
1877 ; 1916 ;
@@ -2332,15 +2371,13 @@ static struct trace_iterator *
2332__tracing_open(struct inode *inode, struct file *file) 2371__tracing_open(struct inode *inode, struct file *file)
2333{ 2372{
2334 long cpu_file = (long) inode->i_private; 2373 long cpu_file = (long) inode->i_private;
2335 void *fail_ret = ERR_PTR(-ENOMEM);
2336 struct trace_iterator *iter; 2374 struct trace_iterator *iter;
2337 struct seq_file *m; 2375 int cpu;
2338 int cpu, ret;
2339 2376
2340 if (tracing_disabled) 2377 if (tracing_disabled)
2341 return ERR_PTR(-ENODEV); 2378 return ERR_PTR(-ENODEV);
2342 2379
2343 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2380 iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter));
2344 if (!iter) 2381 if (!iter)
2345 return ERR_PTR(-ENOMEM); 2382 return ERR_PTR(-ENOMEM);
2346 2383
@@ -2397,32 +2434,15 @@ __tracing_open(struct inode *inode, struct file *file)
2397 tracing_iter_reset(iter, cpu); 2434 tracing_iter_reset(iter, cpu);
2398 } 2435 }
2399 2436
2400 ret = seq_open(file, &tracer_seq_ops);
2401 if (ret < 0) {
2402 fail_ret = ERR_PTR(ret);
2403 goto fail_buffer;
2404 }
2405
2406 m = file->private_data;
2407 m->private = iter;
2408
2409 mutex_unlock(&trace_types_lock); 2437 mutex_unlock(&trace_types_lock);
2410 2438
2411 return iter; 2439 return iter;
2412 2440
2413 fail_buffer:
2414 for_each_tracing_cpu(cpu) {
2415 if (iter->buffer_iter[cpu])
2416 ring_buffer_read_finish(iter->buffer_iter[cpu]);
2417 }
2418 free_cpumask_var(iter->started);
2419 tracing_start();
2420 fail: 2441 fail:
2421 mutex_unlock(&trace_types_lock); 2442 mutex_unlock(&trace_types_lock);
2422 kfree(iter->trace); 2443 kfree(iter->trace);
2423 kfree(iter); 2444 seq_release_private(inode, file);
2424 2445 return ERR_PTR(-ENOMEM);
2425 return fail_ret;
2426} 2446}
2427 2447
2428int tracing_open_generic(struct inode *inode, struct file *filp) 2448int tracing_open_generic(struct inode *inode, struct file *filp)
@@ -2458,11 +2478,10 @@ static int tracing_release(struct inode *inode, struct file *file)
2458 tracing_start(); 2478 tracing_start();
2459 mutex_unlock(&trace_types_lock); 2479 mutex_unlock(&trace_types_lock);
2460 2480
2461 seq_release(inode, file);
2462 mutex_destroy(&iter->mutex); 2481 mutex_destroy(&iter->mutex);
2463 free_cpumask_var(iter->started); 2482 free_cpumask_var(iter->started);
2464 kfree(iter->trace); 2483 kfree(iter->trace);
2465 kfree(iter); 2484 seq_release_private(inode, file);
2466 return 0; 2485 return 0;
2467} 2486}
2468 2487
@@ -2648,10 +2667,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2648 if (cpumask_test_cpu(cpu, tracing_cpumask) && 2667 if (cpumask_test_cpu(cpu, tracing_cpumask) &&
2649 !cpumask_test_cpu(cpu, tracing_cpumask_new)) { 2668 !cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2650 atomic_inc(&global_trace.data[cpu]->disabled); 2669 atomic_inc(&global_trace.data[cpu]->disabled);
2670 ring_buffer_record_disable_cpu(global_trace.buffer, cpu);
2651 } 2671 }
2652 if (!cpumask_test_cpu(cpu, tracing_cpumask) && 2672 if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
2653 cpumask_test_cpu(cpu, tracing_cpumask_new)) { 2673 cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2654 atomic_dec(&global_trace.data[cpu]->disabled); 2674 atomic_dec(&global_trace.data[cpu]->disabled);
2675 ring_buffer_record_enable_cpu(global_trace.buffer, cpu);
2655 } 2676 }
2656 } 2677 }
2657 arch_spin_unlock(&ftrace_max_lock); 2678 arch_spin_unlock(&ftrace_max_lock);
@@ -2974,7 +2995,14 @@ int tracer_init(struct tracer *t, struct trace_array *tr)
2974 return t->init(tr); 2995 return t->init(tr);
2975} 2996}
2976 2997
2977static int __tracing_resize_ring_buffer(unsigned long size) 2998static void set_buffer_entries(struct trace_array *tr, unsigned long val)
2999{
3000 int cpu;
3001 for_each_tracing_cpu(cpu)
3002 tr->data[cpu]->entries = val;
3003}
3004
3005static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
2978{ 3006{
2979 int ret; 3007 int ret;
2980 3008
@@ -2985,19 +3013,32 @@ static int __tracing_resize_ring_buffer(unsigned long size)
2985 */ 3013 */
2986 ring_buffer_expanded = 1; 3014 ring_buffer_expanded = 1;
2987 3015
2988 ret = ring_buffer_resize(global_trace.buffer, size); 3016 ret = ring_buffer_resize(global_trace.buffer, size, cpu);
2989 if (ret < 0) 3017 if (ret < 0)
2990 return ret; 3018 return ret;
2991 3019
2992 if (!current_trace->use_max_tr) 3020 if (!current_trace->use_max_tr)
2993 goto out; 3021 goto out;
2994 3022
2995 ret = ring_buffer_resize(max_tr.buffer, size); 3023 ret = ring_buffer_resize(max_tr.buffer, size, cpu);
2996 if (ret < 0) { 3024 if (ret < 0) {
2997 int r; 3025 int r = 0;
3026
3027 if (cpu == RING_BUFFER_ALL_CPUS) {
3028 int i;
3029 for_each_tracing_cpu(i) {
3030 r = ring_buffer_resize(global_trace.buffer,
3031 global_trace.data[i]->entries,
3032 i);
3033 if (r < 0)
3034 break;
3035 }
3036 } else {
3037 r = ring_buffer_resize(global_trace.buffer,
3038 global_trace.data[cpu]->entries,
3039 cpu);
3040 }
2998 3041
2999 r = ring_buffer_resize(global_trace.buffer,
3000 global_trace.entries);
3001 if (r < 0) { 3042 if (r < 0) {
3002 /* 3043 /*
3003 * AARGH! We are left with different 3044 * AARGH! We are left with different
@@ -3019,43 +3060,39 @@ static int __tracing_resize_ring_buffer(unsigned long size)
3019 return ret; 3060 return ret;
3020 } 3061 }
3021 3062
3022 max_tr.entries = size; 3063 if (cpu == RING_BUFFER_ALL_CPUS)
3064 set_buffer_entries(&max_tr, size);
3065 else
3066 max_tr.data[cpu]->entries = size;
3067
3023 out: 3068 out:
3024 global_trace.entries = size; 3069 if (cpu == RING_BUFFER_ALL_CPUS)
3070 set_buffer_entries(&global_trace, size);
3071 else
3072 global_trace.data[cpu]->entries = size;
3025 3073
3026 return ret; 3074 return ret;
3027} 3075}
3028 3076
3029static ssize_t tracing_resize_ring_buffer(unsigned long size) 3077static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id)
3030{ 3078{
3031 int cpu, ret = size; 3079 int ret = size;
3032 3080
3033 mutex_lock(&trace_types_lock); 3081 mutex_lock(&trace_types_lock);
3034 3082
3035 tracing_stop(); 3083 if (cpu_id != RING_BUFFER_ALL_CPUS) {
3036 3084 /* make sure, this cpu is enabled in the mask */
3037 /* disable all cpu buffers */ 3085 if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) {
3038 for_each_tracing_cpu(cpu) { 3086 ret = -EINVAL;
3039 if (global_trace.data[cpu]) 3087 goto out;
3040 atomic_inc(&global_trace.data[cpu]->disabled); 3088 }
3041 if (max_tr.data[cpu])
3042 atomic_inc(&max_tr.data[cpu]->disabled);
3043 } 3089 }
3044 3090
3045 if (size != global_trace.entries) 3091 ret = __tracing_resize_ring_buffer(size, cpu_id);
3046 ret = __tracing_resize_ring_buffer(size);
3047
3048 if (ret < 0) 3092 if (ret < 0)
3049 ret = -ENOMEM; 3093 ret = -ENOMEM;
3050 3094
3051 for_each_tracing_cpu(cpu) { 3095out:
3052 if (global_trace.data[cpu])
3053 atomic_dec(&global_trace.data[cpu]->disabled);
3054 if (max_tr.data[cpu])
3055 atomic_dec(&max_tr.data[cpu]->disabled);
3056 }
3057
3058 tracing_start();
3059 mutex_unlock(&trace_types_lock); 3096 mutex_unlock(&trace_types_lock);
3060 3097
3061 return ret; 3098 return ret;
@@ -3078,7 +3115,8 @@ int tracing_update_buffers(void)
3078 3115
3079 mutex_lock(&trace_types_lock); 3116 mutex_lock(&trace_types_lock);
3080 if (!ring_buffer_expanded) 3117 if (!ring_buffer_expanded)
3081 ret = __tracing_resize_ring_buffer(trace_buf_size); 3118 ret = __tracing_resize_ring_buffer(trace_buf_size,
3119 RING_BUFFER_ALL_CPUS);
3082 mutex_unlock(&trace_types_lock); 3120 mutex_unlock(&trace_types_lock);
3083 3121
3084 return ret; 3122 return ret;
@@ -3102,7 +3140,8 @@ static int tracing_set_tracer(const char *buf)
3102 mutex_lock(&trace_types_lock); 3140 mutex_lock(&trace_types_lock);
3103 3141
3104 if (!ring_buffer_expanded) { 3142 if (!ring_buffer_expanded) {
3105 ret = __tracing_resize_ring_buffer(trace_buf_size); 3143 ret = __tracing_resize_ring_buffer(trace_buf_size,
3144 RING_BUFFER_ALL_CPUS);
3106 if (ret < 0) 3145 if (ret < 0)
3107 goto out; 3146 goto out;
3108 ret = 0; 3147 ret = 0;
@@ -3128,8 +3167,8 @@ static int tracing_set_tracer(const char *buf)
3128 * The max_tr ring buffer has some state (e.g. ring->clock) and 3167 * The max_tr ring buffer has some state (e.g. ring->clock) and
 3129 * we want to preserve it. 3168
3130 */ 3169 */
3131 ring_buffer_resize(max_tr.buffer, 1); 3170 ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);
3132 max_tr.entries = 1; 3171 set_buffer_entries(&max_tr, 1);
3133 } 3172 }
3134 destroy_trace_option_files(topts); 3173 destroy_trace_option_files(topts);
3135 3174
@@ -3137,10 +3176,17 @@ static int tracing_set_tracer(const char *buf)
3137 3176
3138 topts = create_trace_option_files(current_trace); 3177 topts = create_trace_option_files(current_trace);
3139 if (current_trace->use_max_tr) { 3178 if (current_trace->use_max_tr) {
3140 ret = ring_buffer_resize(max_tr.buffer, global_trace.entries); 3179 int cpu;
3141 if (ret < 0) 3180 /* we need to make per cpu buffer sizes equivalent */
3142 goto out; 3181 for_each_tracing_cpu(cpu) {
3143 max_tr.entries = global_trace.entries; 3182 ret = ring_buffer_resize(max_tr.buffer,
3183 global_trace.data[cpu]->entries,
3184 cpu);
3185 if (ret < 0)
3186 goto out;
3187 max_tr.data[cpu]->entries =
3188 global_trace.data[cpu]->entries;
3189 }
3144 } 3190 }
3145 3191
3146 if (t->init) { 3192 if (t->init) {
@@ -3642,30 +3688,82 @@ out_err:
3642 goto out; 3688 goto out;
3643} 3689}
3644 3690
3691struct ftrace_entries_info {
3692 struct trace_array *tr;
3693 int cpu;
3694};
3695
3696static int tracing_entries_open(struct inode *inode, struct file *filp)
3697{
3698 struct ftrace_entries_info *info;
3699
3700 if (tracing_disabled)
3701 return -ENODEV;
3702
3703 info = kzalloc(sizeof(*info), GFP_KERNEL);
3704 if (!info)
3705 return -ENOMEM;
3706
3707 info->tr = &global_trace;
3708 info->cpu = (unsigned long)inode->i_private;
3709
3710 filp->private_data = info;
3711
3712 return 0;
3713}
3714
3645static ssize_t 3715static ssize_t
3646tracing_entries_read(struct file *filp, char __user *ubuf, 3716tracing_entries_read(struct file *filp, char __user *ubuf,
3647 size_t cnt, loff_t *ppos) 3717 size_t cnt, loff_t *ppos)
3648{ 3718{
3649 struct trace_array *tr = filp->private_data; 3719 struct ftrace_entries_info *info = filp->private_data;
3650 char buf[96]; 3720 struct trace_array *tr = info->tr;
3651 int r; 3721 char buf[64];
3722 int r = 0;
3723 ssize_t ret;
3652 3724
3653 mutex_lock(&trace_types_lock); 3725 mutex_lock(&trace_types_lock);
3654 if (!ring_buffer_expanded) 3726
3655 r = sprintf(buf, "%lu (expanded: %lu)\n", 3727 if (info->cpu == RING_BUFFER_ALL_CPUS) {
3656 tr->entries >> 10, 3728 int cpu, buf_size_same;
3657 trace_buf_size >> 10); 3729 unsigned long size;
3658 else 3730
3659 r = sprintf(buf, "%lu\n", tr->entries >> 10); 3731 size = 0;
3732 buf_size_same = 1;
3733 /* check if all cpu sizes are same */
3734 for_each_tracing_cpu(cpu) {
3735 /* fill in the size from first enabled cpu */
3736 if (size == 0)
3737 size = tr->data[cpu]->entries;
3738 if (size != tr->data[cpu]->entries) {
3739 buf_size_same = 0;
3740 break;
3741 }
3742 }
3743
3744 if (buf_size_same) {
3745 if (!ring_buffer_expanded)
3746 r = sprintf(buf, "%lu (expanded: %lu)\n",
3747 size >> 10,
3748 trace_buf_size >> 10);
3749 else
3750 r = sprintf(buf, "%lu\n", size >> 10);
3751 } else
3752 r = sprintf(buf, "X\n");
3753 } else
3754 r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10);
3755
3660 mutex_unlock(&trace_types_lock); 3756 mutex_unlock(&trace_types_lock);
3661 3757
3662 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3758 ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3759 return ret;
3663} 3760}
3664 3761
3665static ssize_t 3762static ssize_t
3666tracing_entries_write(struct file *filp, const char __user *ubuf, 3763tracing_entries_write(struct file *filp, const char __user *ubuf,
3667 size_t cnt, loff_t *ppos) 3764 size_t cnt, loff_t *ppos)
3668{ 3765{
3766 struct ftrace_entries_info *info = filp->private_data;
3669 unsigned long val; 3767 unsigned long val;
3670 int ret; 3768 int ret;
3671 3769
@@ -3680,7 +3778,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3680 /* value is in KB */ 3778 /* value is in KB */
3681 val <<= 10; 3779 val <<= 10;
3682 3780
3683 ret = tracing_resize_ring_buffer(val); 3781 ret = tracing_resize_ring_buffer(val, info->cpu);
3684 if (ret < 0) 3782 if (ret < 0)
3685 return ret; 3783 return ret;
3686 3784
@@ -3689,6 +3787,16 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3689 return cnt; 3787 return cnt;
3690} 3788}
3691 3789
3790static int
3791tracing_entries_release(struct inode *inode, struct file *filp)
3792{
3793 struct ftrace_entries_info *info = filp->private_data;
3794
3795 kfree(info);
3796
3797 return 0;
3798}
3799
3692static ssize_t 3800static ssize_t
3693tracing_total_entries_read(struct file *filp, char __user *ubuf, 3801tracing_total_entries_read(struct file *filp, char __user *ubuf,
3694 size_t cnt, loff_t *ppos) 3802 size_t cnt, loff_t *ppos)
@@ -3700,7 +3808,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
3700 3808
3701 mutex_lock(&trace_types_lock); 3809 mutex_lock(&trace_types_lock);
3702 for_each_tracing_cpu(cpu) { 3810 for_each_tracing_cpu(cpu) {
3703 size += tr->entries >> 10; 3811 size += tr->data[cpu]->entries >> 10;
3704 if (!ring_buffer_expanded) 3812 if (!ring_buffer_expanded)
3705 expanded_size += trace_buf_size >> 10; 3813 expanded_size += trace_buf_size >> 10;
3706 } 3814 }
@@ -3734,7 +3842,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
3734 if (trace_flags & TRACE_ITER_STOP_ON_FREE) 3842 if (trace_flags & TRACE_ITER_STOP_ON_FREE)
3735 tracing_off(); 3843 tracing_off();
3736 /* resize the ring buffer to 0 */ 3844 /* resize the ring buffer to 0 */
3737 tracing_resize_ring_buffer(0); 3845 tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS);
3738 3846
3739 return 0; 3847 return 0;
3740} 3848}
@@ -3749,14 +3857,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3749 struct print_entry *entry; 3857 struct print_entry *entry;
3750 unsigned long irq_flags; 3858 unsigned long irq_flags;
3751 struct page *pages[2]; 3859 struct page *pages[2];
3860 void *map_page[2];
3752 int nr_pages = 1; 3861 int nr_pages = 1;
3753 ssize_t written; 3862 ssize_t written;
3754 void *page1;
3755 void *page2;
3756 int offset; 3863 int offset;
3757 int size; 3864 int size;
3758 int len; 3865 int len;
3759 int ret; 3866 int ret;
3867 int i;
3760 3868
3761 if (tracing_disabled) 3869 if (tracing_disabled)
3762 return -EINVAL; 3870 return -EINVAL;
@@ -3795,9 +3903,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3795 goto out; 3903 goto out;
3796 } 3904 }
3797 3905
3798 page1 = kmap_atomic(pages[0]); 3906 for (i = 0; i < nr_pages; i++)
3799 if (nr_pages == 2) 3907 map_page[i] = kmap_atomic(pages[i]);
3800 page2 = kmap_atomic(pages[1]);
3801 3908
3802 local_save_flags(irq_flags); 3909 local_save_flags(irq_flags);
3803 size = sizeof(*entry) + cnt + 2; /* possible \n added */ 3910 size = sizeof(*entry) + cnt + 2; /* possible \n added */
@@ -3815,10 +3922,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3815 3922
3816 if (nr_pages == 2) { 3923 if (nr_pages == 2) {
3817 len = PAGE_SIZE - offset; 3924 len = PAGE_SIZE - offset;
3818 memcpy(&entry->buf, page1 + offset, len); 3925 memcpy(&entry->buf, map_page[0] + offset, len);
3819 memcpy(&entry->buf[len], page2, cnt - len); 3926 memcpy(&entry->buf[len], map_page[1], cnt - len);
3820 } else 3927 } else
3821 memcpy(&entry->buf, page1 + offset, cnt); 3928 memcpy(&entry->buf, map_page[0] + offset, cnt);
3822 3929
3823 if (entry->buf[cnt - 1] != '\n') { 3930 if (entry->buf[cnt - 1] != '\n') {
3824 entry->buf[cnt] = '\n'; 3931 entry->buf[cnt] = '\n';
@@ -3833,11 +3940,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3833 *fpos += written; 3940 *fpos += written;
3834 3941
3835 out_unlock: 3942 out_unlock:
3836 if (nr_pages == 2) 3943 for (i = 0; i < nr_pages; i++){
3837 kunmap_atomic(page2); 3944 kunmap_atomic(map_page[i]);
3838 kunmap_atomic(page1); 3945 put_page(pages[i]);
3839 while (nr_pages > 0) 3946 }
3840 put_page(pages[--nr_pages]);
3841 out: 3947 out:
3842 return written; 3948 return written;
3843} 3949}
@@ -3933,9 +4039,10 @@ static const struct file_operations tracing_pipe_fops = {
3933}; 4039};
3934 4040
3935static const struct file_operations tracing_entries_fops = { 4041static const struct file_operations tracing_entries_fops = {
3936 .open = tracing_open_generic, 4042 .open = tracing_entries_open,
3937 .read = tracing_entries_read, 4043 .read = tracing_entries_read,
3938 .write = tracing_entries_write, 4044 .write = tracing_entries_write,
4045 .release = tracing_entries_release,
3939 .llseek = generic_file_llseek, 4046 .llseek = generic_file_llseek,
3940}; 4047};
3941 4048
@@ -4367,6 +4474,9 @@ static void tracing_init_debugfs_percpu(long cpu)
4367 struct dentry *d_cpu; 4474 struct dentry *d_cpu;
4368 char cpu_dir[30]; /* 30 characters should be more than enough */ 4475 char cpu_dir[30]; /* 30 characters should be more than enough */
4369 4476
4477 if (!d_percpu)
4478 return;
4479
4370 snprintf(cpu_dir, 30, "cpu%ld", cpu); 4480 snprintf(cpu_dir, 30, "cpu%ld", cpu);
4371 d_cpu = debugfs_create_dir(cpu_dir, d_percpu); 4481 d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
4372 if (!d_cpu) { 4482 if (!d_cpu) {
@@ -4387,6 +4497,9 @@ static void tracing_init_debugfs_percpu(long cpu)
4387 4497
4388 trace_create_file("stats", 0444, d_cpu, 4498 trace_create_file("stats", 0444, d_cpu,
4389 (void *) cpu, &tracing_stats_fops); 4499 (void *) cpu, &tracing_stats_fops);
4500
4501 trace_create_file("buffer_size_kb", 0444, d_cpu,
4502 (void *) cpu, &tracing_entries_fops);
4390} 4503}
4391 4504
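Each per-CPU directory under tracing/per_cpu/ now gets its own buffer_size_kb (created read-only here, 0444), so per-CPU buffer sizes can be inspected individually while the top-level buffer_size_kb resizes all CPUs at once. A small user-space sketch that reads the new files; the debugfs mount point /sys/kernel/debug is an assumption:

#include <stdio.h>

int main(void)
{
	char path[128], line[64];
	int cpu;

	for (cpu = 0; ; cpu++) {
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/kernel/debug/tracing/per_cpu/cpu%d/buffer_size_kb",
			 cpu);
		f = fopen(path, "r");
		if (!f)
			break;		/* no such CPU directory: stop */
		if (fgets(line, sizeof(line), f))
			printf("cpu%d: %s", cpu, line);
		fclose(f);
	}
	return 0;
}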
4392#ifdef CONFIG_FTRACE_SELFTEST 4505#ifdef CONFIG_FTRACE_SELFTEST
@@ -4718,7 +4831,7 @@ static __init int tracer_init_debugfs(void)
4718 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); 4831 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
4719 4832
4720 trace_create_file("buffer_size_kb", 0644, d_tracer, 4833 trace_create_file("buffer_size_kb", 0644, d_tracer,
4721 &global_trace, &tracing_entries_fops); 4834 (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops);
4722 4835
4723 trace_create_file("buffer_total_size_kb", 0444, d_tracer, 4836 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
4724 &global_trace, &tracing_total_entries_fops); 4837 &global_trace, &tracing_total_entries_fops);
@@ -4957,6 +5070,10 @@ __init static int tracer_alloc_buffers(void)
4957 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 5070 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
4958 goto out_free_buffer_mask; 5071 goto out_free_buffer_mask;
4959 5072
5073 /* Only allocate trace_printk buffers if a trace_printk exists */
5074 if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt)
5075 trace_printk_init_buffers();
5076
4960 /* To save memory, keep the ring buffer size to its minimum */ 5077 /* To save memory, keep the ring buffer size to its minimum */
4961 if (ring_buffer_expanded) 5078 if (ring_buffer_expanded)
4962 ring_buf_size = trace_buf_size; 5079 ring_buf_size = trace_buf_size;
@@ -4975,7 +5092,6 @@ __init static int tracer_alloc_buffers(void)
4975 WARN_ON(1); 5092 WARN_ON(1);
4976 goto out_free_cpumask; 5093 goto out_free_cpumask;
4977 } 5094 }
4978 global_trace.entries = ring_buffer_size(global_trace.buffer);
4979 if (global_trace.buffer_disabled) 5095 if (global_trace.buffer_disabled)
4980 tracing_off(); 5096 tracing_off();
4981 5097
@@ -4988,7 +5104,6 @@ __init static int tracer_alloc_buffers(void)
4988 ring_buffer_free(global_trace.buffer); 5104 ring_buffer_free(global_trace.buffer);
4989 goto out_free_cpumask; 5105 goto out_free_cpumask;
4990 } 5106 }
4991 max_tr.entries = 1;
4992#endif 5107#endif
4993 5108
4994 /* Allocate the first page for all buffers */ 5109 /* Allocate the first page for all buffers */
@@ -4997,6 +5112,12 @@ __init static int tracer_alloc_buffers(void)
4997 max_tr.data[i] = &per_cpu(max_tr_data, i); 5112 max_tr.data[i] = &per_cpu(max_tr_data, i);
4998 } 5113 }
4999 5114
5115 set_buffer_entries(&global_trace,
5116 ring_buffer_size(global_trace.buffer, 0));
5117#ifdef CONFIG_TRACER_MAX_TRACE
5118 set_buffer_entries(&max_tr, 1);
5119#endif
5120
5000 trace_init_cmdlines(); 5121 trace_init_cmdlines();
5001 5122
5002 register_tracer(&nop_trace); 5123 register_tracer(&nop_trace);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f95d65da6db8..6c6f7933eede 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -131,6 +131,7 @@ struct trace_array_cpu {
131 atomic_t disabled; 131 atomic_t disabled;
132 void *buffer_page; /* ring buffer spare */ 132 void *buffer_page; /* ring buffer spare */
133 133
134 unsigned long entries;
134 unsigned long saved_latency; 135 unsigned long saved_latency;
135 unsigned long critical_start; 136 unsigned long critical_start;
136 unsigned long critical_end; 137 unsigned long critical_end;
@@ -152,7 +153,6 @@ struct trace_array_cpu {
152 */ 153 */
153struct trace_array { 154struct trace_array {
154 struct ring_buffer *buffer; 155 struct ring_buffer *buffer;
155 unsigned long entries;
156 int cpu; 156 int cpu;
157 int buffer_disabled; 157 int buffer_disabled;
158 cycle_t time_start; 158 cycle_t time_start;
@@ -826,6 +826,8 @@ extern struct list_head ftrace_events;
826extern const char *__start___trace_bprintk_fmt[]; 826extern const char *__start___trace_bprintk_fmt[];
827extern const char *__stop___trace_bprintk_fmt[]; 827extern const char *__stop___trace_bprintk_fmt[];
828 828
829void trace_printk_init_buffers(void);
830
829#undef FTRACE_ENTRY 831#undef FTRACE_ENTRY
830#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ 832#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
831 extern struct ftrace_event_call \ 833 extern struct ftrace_event_call \
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 079a93ae8a9d..29111da1d100 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -294,6 +294,9 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
294 if (!call->name || !call->class || !call->class->reg) 294 if (!call->name || !call->class || !call->class->reg)
295 continue; 295 continue;
296 296
297 if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
298 continue;
299
297 if (match && 300 if (match &&
298 strcmp(match, call->name) != 0 && 301 strcmp(match, call->name) != 0 &&
299 strcmp(match, call->class->system) != 0) 302 strcmp(match, call->class->system) != 0)
@@ -1164,7 +1167,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
1164 return -1; 1167 return -1;
1165 } 1168 }
1166 1169
1167 if (call->class->reg) 1170 if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
1168 trace_create_file("enable", 0644, call->dir, call, 1171 trace_create_file("enable", 0644, call->dir, call,
1169 enable); 1172 enable);
1170 1173
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 3dd15e8bc856..e039906b037d 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -180,6 +180,7 @@ struct ftrace_event_call __used event_##call = { \
180 .event.type = etype, \ 180 .event.type = etype, \
181 .class = &event_class_ftrace_##call, \ 181 .class = &event_class_ftrace_##call, \
182 .print_fmt = print, \ 182 .print_fmt = print, \
183 .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \
183}; \ 184}; \
184struct ftrace_event_call __used \ 185struct ftrace_event_call __used \
185__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; 186__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
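Note: taken together, the trace_events.c and trace_export.c hunks above add TRACE_EVENT_FL_IGNORE_ENABLE so that the ftrace-internal events generated by FTRACE_ENTRY() are neither toggled by wildcard writes to set_event nor given an "enable" file. A hypothetical helper, not part of the patch, that captures the combined test:

/* Illustrative only: true for events a user is allowed to enable/disable. */
static bool event_user_controllable(struct ftrace_event_call *call)
{
	return call->name && call->class && call->class->reg &&
	       !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE);
}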
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 6fd4ffd042f9..a9077c1b4ad3 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -51,6 +51,10 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
51 const char **iter; 51 const char **iter;
52 char *fmt; 52 char *fmt;
53 53
54 /* allocate the trace_printk per cpu buffers */
55 if (start != end)
56 trace_printk_init_buffers();
57
54 mutex_lock(&btrace_mutex); 58 mutex_lock(&btrace_mutex);
55 for (iter = start; iter < end; iter++) { 59 for (iter = start; iter < end; iter++) {
56 struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); 60 struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
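Note: with the trace_printk.c hunk above, the per-cpu trace_printk buffers are allocated only when a loaded module actually carries trace_printk format strings (start != end) instead of unconditionally. A hypothetical module as small as the following would exercise that path on load, since its constant format string lands in the module's trace_printk format section (sketch; module name and contents assumed):

#include <linux/module.h>
#include <linux/kernel.h>

static int __init demo_init(void)
{
	/*
	 * A constant format is recorded in the module's format section,
	 * so loading this module triggers trace_printk_init_buffers().
	 */
	trace_printk("demo module loaded\n");
	return 0;
}

static void __exit demo_exit(void)
{
	trace_printk("demo module unloading\n");
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");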
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
deleted file mode 100644
index 209b379a4721..000000000000
--- a/kernel/trace/trace_workqueue.c
+++ /dev/null
@@ -1,300 +0,0 @@
1/*
2 * Workqueue statistical tracer.
3 *
4 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 *
6 */
7
8
9#include <trace/events/workqueue.h>
10#include <linux/list.h>
11#include <linux/percpu.h>
12#include <linux/slab.h>
13#include <linux/kref.h>
14#include "trace_stat.h"
15#include "trace.h"
16
17
18/* A cpu workqueue thread */
19struct cpu_workqueue_stats {
20 struct list_head list;
21 struct kref kref;
22 int cpu;
23 pid_t pid;
24/* Can be inserted from interrupt or user context, need to be atomic */
25 atomic_t inserted;
26/*
27 * Don't need to be atomic, works are serialized in a single workqueue thread
28 * on a single CPU.
29 */
30 unsigned int executed;
31};
32
33/* List of workqueue threads on one cpu */
34struct workqueue_global_stats {
35 struct list_head list;
36 spinlock_t lock;
37};
38
39/* Don't need a global lock because allocated before the workqueues, and
40 * never freed.
41 */
42static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
43#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
44
45static void cpu_workqueue_stat_free(struct kref *kref)
46{
47 kfree(container_of(kref, struct cpu_workqueue_stats, kref));
48}
49
50/* Insertion of a work */
51static void
52probe_workqueue_insertion(void *ignore,
53 struct task_struct *wq_thread,
54 struct work_struct *work)
55{
56 int cpu = cpumask_first(&wq_thread->cpus_allowed);
57 struct cpu_workqueue_stats *node;
58 unsigned long flags;
59
60 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
61 list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
62 if (node->pid == wq_thread->pid) {
63 atomic_inc(&node->inserted);
64 goto found;
65 }
66 }
67 pr_debug("trace_workqueue: entry not found\n");
68found:
69 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
70}
71
72/* Execution of a work */
73static void
74probe_workqueue_execution(void *ignore,
75 struct task_struct *wq_thread,
76 struct work_struct *work)
77{
78 int cpu = cpumask_first(&wq_thread->cpus_allowed);
79 struct cpu_workqueue_stats *node;
80 unsigned long flags;
81
82 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
83 list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
84 if (node->pid == wq_thread->pid) {
85 node->executed++;
86 goto found;
87 }
88 }
89 pr_debug("trace_workqueue: entry not found\n");
90found:
91 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
92}
93
94/* Creation of a cpu workqueue thread */
95static void probe_workqueue_creation(void *ignore,
96 struct task_struct *wq_thread, int cpu)
97{
98 struct cpu_workqueue_stats *cws;
99 unsigned long flags;
100
101 WARN_ON(cpu < 0);
102
103 /* Workqueues are sometimes created in atomic context */
104 cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC);
105 if (!cws) {
106 pr_warning("trace_workqueue: not enough memory\n");
107 return;
108 }
109 INIT_LIST_HEAD(&cws->list);
110 kref_init(&cws->kref);
111 cws->cpu = cpu;
112 cws->pid = wq_thread->pid;
113
114 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
115 list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
116 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
117}
118
119/* Destruction of a cpu workqueue thread */
120static void
121probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread)
122{
123 /* Workqueue only execute on one cpu */
124 int cpu = cpumask_first(&wq_thread->cpus_allowed);
125 struct cpu_workqueue_stats *node, *next;
126 unsigned long flags;
127
128 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
129 list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
130 list) {
131 if (node->pid == wq_thread->pid) {
132 list_del(&node->list);
133 kref_put(&node->kref, cpu_workqueue_stat_free);
134 goto found;
135 }
136 }
137
138 pr_debug("trace_workqueue: don't find workqueue to destroy\n");
139found:
140 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
141
142}
143
144static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
145{
146 unsigned long flags;
147 struct cpu_workqueue_stats *ret = NULL;
148
149
150 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
151
152 if (!list_empty(&workqueue_cpu_stat(cpu)->list)) {
153 ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
154 struct cpu_workqueue_stats, list);
155 kref_get(&ret->kref);
156 }
157
158 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
159
160 return ret;
161}
162
163static void *workqueue_stat_start(struct tracer_stat *trace)
164{
165 int cpu;
166 void *ret = NULL;
167
168 for_each_possible_cpu(cpu) {
169 ret = workqueue_stat_start_cpu(cpu);
170 if (ret)
171 return ret;
172 }
173 return NULL;
174}
175
176static void *workqueue_stat_next(void *prev, int idx)
177{
178 struct cpu_workqueue_stats *prev_cws = prev;
179 struct cpu_workqueue_stats *ret;
180 int cpu = prev_cws->cpu;
181 unsigned long flags;
182
183 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
184 if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
185 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
186 do {
187 cpu = cpumask_next(cpu, cpu_possible_mask);
188 if (cpu >= nr_cpu_ids)
189 return NULL;
190 } while (!(ret = workqueue_stat_start_cpu(cpu)));
191 return ret;
192 } else {
193 ret = list_entry(prev_cws->list.next,
194 struct cpu_workqueue_stats, list);
195 kref_get(&ret->kref);
196 }
197 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
198
199 return ret;
200}
201
202static int workqueue_stat_show(struct seq_file *s, void *p)
203{
204 struct cpu_workqueue_stats *cws = p;
205 struct pid *pid;
206 struct task_struct *tsk;
207
208 pid = find_get_pid(cws->pid);
209 if (pid) {
210 tsk = get_pid_task(pid, PIDTYPE_PID);
211 if (tsk) {
212 seq_printf(s, "%3d %6d %6u %s\n", cws->cpu,
213 atomic_read(&cws->inserted), cws->executed,
214 tsk->comm);
215 put_task_struct(tsk);
216 }
217 put_pid(pid);
218 }
219
220 return 0;
221}
222
223static void workqueue_stat_release(void *stat)
224{
225 struct cpu_workqueue_stats *node = stat;
226
227 kref_put(&node->kref, cpu_workqueue_stat_free);
228}
229
230static int workqueue_stat_headers(struct seq_file *s)
231{
232 seq_printf(s, "# CPU INSERTED EXECUTED NAME\n");
233 seq_printf(s, "# | | | |\n");
234 return 0;
235}
236
237struct tracer_stat workqueue_stats __read_mostly = {
238 .name = "workqueues",
239 .stat_start = workqueue_stat_start,
240 .stat_next = workqueue_stat_next,
241 .stat_show = workqueue_stat_show,
242 .stat_release = workqueue_stat_release,
243 .stat_headers = workqueue_stat_headers
244};
245
246
247int __init stat_workqueue_init(void)
248{
249 if (register_stat_tracer(&workqueue_stats)) {
250 pr_warning("Unable to register workqueue stat tracer\n");
251 return 1;
252 }
253
254 return 0;
255}
256fs_initcall(stat_workqueue_init);
257
258/*
259 * Workqueues are created very early, just after pre-smp initcalls.
260 * So we must register our tracepoints at this stage.
261 */
262int __init trace_workqueue_early_init(void)
263{
264 int ret, cpu;
265
266 for_each_possible_cpu(cpu) {
267 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
268 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
269 }
270
271 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
272 if (ret)
273 goto out;
274
275 ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL);
276 if (ret)
277 goto no_insertion;
278
279 ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL);
280 if (ret)
281 goto no_execution;
282
283 ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL);
284 if (ret)
285 goto no_creation;
286
287 return 0;
288
289no_creation:
290 unregister_trace_workqueue_creation(probe_workqueue_creation, NULL);
291no_execution:
292 unregister_trace_workqueue_execution(probe_workqueue_execution, NULL);
293no_insertion:
294 unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
295out:
296 pr_warning("trace_workqueue: unable to trace workqueues\n");
297
298 return 1;
299}
300early_initcall(trace_workqueue_early_init);
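Note: the statistical tracer removed above was built around the old per-cpu workqueue threads and their insertion/execution/creation/destruction tracepoints, which no longer exist under the cmwq design. Comparable per-cpu counts can still be gathered from the generic workqueue tracepoints, e.g. the workqueue_queue_work event fired from __queue_work() below. A hedged sketch, with the probe signature assumed from <trace/events/workqueue.h> of this kernel:

#include <linux/init.h>
#include <linux/percpu.h>
#include <trace/events/workqueue.h>

static DEFINE_PER_CPU(unsigned long, wq_queued);

/* Probe arguments mirror trace_workqueue_queue_work(cpu, cwq, work). */
static void probe_queue_work(void *ignore, unsigned int req_cpu,
			     struct cpu_workqueue_struct *cwq,
			     struct work_struct *work)
{
	this_cpu_inc(wq_queued);	/* count works queued on this cpu */
}

static int __init wq_count_init(void)
{
	return register_trace_workqueue_queue_work(probe_queue_work, NULL);
}
early_initcall(wq_count_init);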
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5abf42f63c08..9a3128dc67df 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1032,7 +1032,10 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1032 cwq = get_cwq(gcwq->cpu, wq); 1032 cwq = get_cwq(gcwq->cpu, wq);
1033 trace_workqueue_queue_work(cpu, cwq, work); 1033 trace_workqueue_queue_work(cpu, cwq, work);
1034 1034
1035 BUG_ON(!list_empty(&work->entry)); 1035 if (WARN_ON(!list_empty(&work->entry))) {
1036 spin_unlock_irqrestore(&gcwq->lock, flags);
1037 return;
1038 }
1036 1039
1037 cwq->nr_in_flight[cwq->work_color]++; 1040 cwq->nr_in_flight[cwq->work_color]++;
1038 work_flags = work_color_to_flags(cwq->work_color); 1041 work_flags = work_color_to_flags(cwq->work_color);
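Note: the __queue_work() hunk above demotes a BUG_ON() to a WARN_ON(): queueing a work item whose ->entry is unexpectedly non-empty now emits a warning, drops gcwq->lock and returns instead of killing the machine. The same defensive pattern in isolation (illustrative only, names invented):

#include <linux/bug.h>
#include <linux/list.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);
static LIST_HEAD(demo_list);

static void demo_queue_entry(struct list_head *entry)
{
	unsigned long flags;

	spin_lock_irqsave(&demo_lock, flags);
	/* Broken invariant: warn and back out cleanly instead of BUG(). */
	if (WARN_ON(!list_empty(entry))) {
		spin_unlock_irqrestore(&demo_lock, flags);
		return;
	}
	list_add_tail(entry, &demo_list);
	spin_unlock_irqrestore(&demo_lock, flags);
}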
@@ -1210,8 +1213,13 @@ static void worker_enter_idle(struct worker *worker)
1210 } else 1213 } else
1211 wake_up_all(&gcwq->trustee_wait); 1214 wake_up_all(&gcwq->trustee_wait);
1212 1215
1213 /* sanity check nr_running */ 1216 /*
1214 WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && 1217 * Sanity check nr_running. Because trustee releases gcwq->lock
1218 * between setting %WORKER_ROGUE and zapping nr_running, the
1219 * warning may trigger spuriously. Check iff trustee is idle.
1220 */
1221 WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE &&
1222 gcwq->nr_workers == gcwq->nr_idle &&
1215 atomic_read(get_gcwq_nr_running(gcwq->cpu))); 1223 atomic_read(get_gcwq_nr_running(gcwq->cpu)));
1216} 1224}
1217 1225
@@ -1810,7 +1818,9 @@ __acquires(&gcwq->lock)
1810 * lock freed" warnings as well as problems when looking into 1818 * lock freed" warnings as well as problems when looking into
1811 * work->lockdep_map, make a copy and use that here. 1819 * work->lockdep_map, make a copy and use that here.
1812 */ 1820 */
1813 struct lockdep_map lockdep_map = work->lockdep_map; 1821 struct lockdep_map lockdep_map;
1822
1823 lockdep_copy_map(&lockdep_map, &work->lockdep_map);
1814#endif 1824#endif
1815 /* 1825 /*
1816 * A single work shouldn't be executed concurrently by 1826 * A single work shouldn't be executed concurrently by
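Note: the hunk above replaces a plain structure assignment of work->lockdep_map with lockdep_copy_map(). A raw copy can race with concurrent updates of the source map's class cache, so the helper is assumed to copy the map and then invalidate the copied cache; a sketch of that assumed shape (the authoritative definition lives in <linux/lockdep.h>):

static inline void lockdep_copy_map(struct lockdep_map *to,
				    struct lockdep_map *from)
{
	int i;

	*to = *from;
	/*
	 * The class cache of 'from' may be updated concurrently; clear
	 * the copied cache rather than keep a possibly torn pointer.
	 */
	for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
		to->class_cache[i] = NULL;
}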
@@ -2506,6 +2516,9 @@ bool flush_work(struct work_struct *work)
2506{ 2516{
2507 struct wq_barrier barr; 2517 struct wq_barrier barr;
2508 2518
2519 lock_map_acquire(&work->lockdep_map);
2520 lock_map_release(&work->lockdep_map);
2521
2509 if (start_flush_work(work, &barr, true)) { 2522 if (start_flush_work(work, &barr, true)) {
2510 wait_for_completion(&barr.done); 2523 wait_for_completion(&barr.done);
2511 destroy_work_on_stack(&barr.work); 2524 destroy_work_on_stack(&barr.work);
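Note: the acquire/release pair added to flush_work() above is a pure lockdep annotation. It records that flushing a work item depends on that work's lockdep_map even on paths that never block, so an inversion such as flushing a work while holding a lock its work function also takes is reported without the deadlock ever having to trigger. For example (hypothetical names):

#include <linux/mutex.h>
#include <linux/workqueue.h>

static DEFINE_MUTEX(demo_mutex);

static void demo_fn(struct work_struct *work)
{
	mutex_lock(&demo_mutex);
	/* ... touch data protected by demo_mutex ... */
	mutex_unlock(&demo_mutex);
}
static DECLARE_WORK(demo_work, demo_fn);

static void demo_flush(void)
{
	mutex_lock(&demo_mutex);
	flush_work(&demo_work);	/* lockdep now flags this ordering */
	mutex_unlock(&demo_mutex);
}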