Diffstat (limited to 'kernel')
-rw-r--r--     1  kernel/Makefile
-rw-r--r--   104  kernel/auditsc.c
-rw-r--r--   755  kernel/cgroup.c
-rw-r--r--   514  kernel/cgroup_freezer.c
-rw-r--r--    17  kernel/compat.c
-rw-r--r--    83  kernel/context_tracking.c
-rw-r--r--    13  kernel/cpu.c
-rw-r--r--   122  kernel/cpuset.c
-rw-r--r--   127  kernel/cred.c
-rw-r--r--     8  kernel/events/core.c
-rw-r--r--    12  kernel/events/hw_breakpoint.c
-rw-r--r--    43  kernel/events/uprobes.c
-rw-r--r--    96  kernel/exit.c
-rw-r--r--    80  kernel/fork.c
-rw-r--r--    11  kernel/freezer.c
-rw-r--r--    59  kernel/futex.c
-rw-r--r--     1  kernel/irq/chip.c
-rw-r--r--     4  kernel/irq/irqdomain.c
-rw-r--r--    41  kernel/irq/manage.c
-rw-r--r--     8  kernel/irq/resend.c
-rw-r--r--    23  kernel/ksysfs.c
-rw-r--r--     2  kernel/kthread.c
-rw-r--r--     2  kernel/lockdep_proc.c
-rw-r--r--     4  kernel/modsign_pubkey.c
-rw-r--r--    30  kernel/module.c
-rw-r--r--    14  kernel/module_signing.c
-rw-r--r--     5  kernel/padata.c
-rw-r--r--    19  kernel/pid.c
-rw-r--r--    24  kernel/posix-cpu-timers.c
-rw-r--r--     2  kernel/power/main.c
-rw-r--r--    13  kernel/power/process.c
-rw-r--r--    65  kernel/power/qos.c
-rw-r--r--     2  kernel/power/swap.c
-rw-r--r--    52  kernel/printk.c
-rw-r--r--     7  kernel/profile.c
-rw-r--r--     3  kernel/ptrace.c
-rw-r--r--     2  kernel/rcu.h
-rw-r--r--     3  kernel/rcupdate.c
-rw-r--r--     2  kernel/rcutiny.c
-rw-r--r--     5  kernel/rcutiny_plugin.h
-rw-r--r--    54  kernel/rcutorture.c
-rw-r--r--   347  kernel/rcutree.c
-rw-r--r--    67  kernel/rcutree.h
-rw-r--r--   415  kernel/rcutree_plugin.h
-rw-r--r--   330  kernel/rcutree_trace.c
-rw-r--r--    22  kernel/res_counter.c
-rw-r--r--   136  kernel/sched/core.c
-rw-r--r--   131  kernel/sched/cputime.c
-rw-r--r--    36  kernel/sched/debug.c
-rw-r--r--  1127  kernel/sched/fair.c
-rw-r--r--    16  kernel/sched/features.h
-rw-r--r--    72  kernel/sched/sched.h
-rw-r--r--    13  kernel/seccomp.c
-rw-r--r--    35  kernel/signal.c
-rw-r--r--     6  kernel/softirq.c
-rw-r--r--    16  kernel/srcu.c
-rw-r--r--     6  kernel/sys.c
-rw-r--r--    49  kernel/sysctl.c
-rw-r--r--     2  kernel/time/Makefile
-rw-r--r--     8  kernel/time/jiffies.c
-rw-r--r--     8  kernel/time/tick-common.c
-rw-r--r--     1  kernel/time/tick-internal.h
-rw-r--r--   137  kernel/time/tick-sched.c
-rw-r--r--   193  kernel/time/timecompare.c
-rw-r--r--    64  kernel/time/timekeeping.c
-rw-r--r--     1  kernel/trace/Kconfig
-rw-r--r--    14  kernel/trace/ftrace.c
-rw-r--r--    65  kernel/trace/ring_buffer.c
-rw-r--r--   413  kernel/trace/trace.c
-rw-r--r--    18  kernel/trace/trace.h
-rw-r--r--     4  kernel/trace/trace_branch.c
-rw-r--r--    51  kernel/trace/trace_events.c
-rw-r--r--     4  kernel/trace/trace_events_filter.c
-rw-r--r--     7  kernel/trace/trace_functions.c
-rw-r--r--     6  kernel/trace/trace_functions_graph.c
-rw-r--r--    16  kernel/trace/trace_irqsoff.c
-rw-r--r--    10  kernel/trace/trace_kprobe.c
-rw-r--r--    78  kernel/trace/trace_output.c
-rw-r--r--    14  kernel/trace/trace_probe.c
-rw-r--r--     4  kernel/trace/trace_sched_switch.c
-rw-r--r--    12  kernel/trace/trace_sched_wakeup.c
-rw-r--r--    13  kernel/trace/trace_selftest.c
-rw-r--r--    61  kernel/trace/trace_syscalls.c
-rw-r--r--    12  kernel/trace/trace_uprobe.c
-rw-r--r--     2  kernel/wait.c
-rw-r--r--    16  kernel/watchdog.c
-rw-r--r--    28  kernel/workqueue.c
87 files changed, 4073 insertions(+), 2445 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 86e3285ae7e5..ac0d533eb7de 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -110,6 +110,7 @@ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
 obj-$(CONFIG_PADATA) += padata.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_JUMP_LABEL) += jump_label.o
+obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
 
 $(obj)/configs.o: $(obj)/config_data.h
 
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 2f186ed80c40..e37e6a12c5e3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -200,7 +200,6 @@ struct audit_context {
 	struct list_head names_list;	/* anchor for struct audit_names->list */
 	char * filterkey;	/* key for rule that triggered record */
 	struct path pwd;
-	struct audit_context *previous; /* For nested syscalls */
 	struct audit_aux_data *aux;
 	struct audit_aux_data *aux_pids;
 	struct sockaddr_storage *sockaddr;
@@ -1091,29 +1090,13 @@ int audit_alloc(struct task_struct *tsk)
 
 static inline void audit_free_context(struct audit_context *context)
 {
-	struct audit_context *previous;
-	int count = 0;
-
-	do {
-		previous = context->previous;
-		if (previous || (count && count < 10)) {
-			++count;
-			printk(KERN_ERR "audit(:%d): major=%d name_count=%d:"
-			       " freeing multiple contexts (%d)\n",
-			       context->serial, context->major,
-			       context->name_count, count);
-		}
-		audit_free_names(context);
-		unroll_tree_refs(context, NULL, 0);
-		free_tree_refs(context);
-		audit_free_aux(context);
-		kfree(context->filterkey);
-		kfree(context->sockaddr);
-		kfree(context);
-		context = previous;
-	} while (context);
-	if (count >= 10)
-		printk(KERN_ERR "audit: freed %d contexts\n", count);
+	audit_free_names(context);
+	unroll_tree_refs(context, NULL, 0);
+	free_tree_refs(context);
+	audit_free_aux(context);
+	kfree(context->filterkey);
+	kfree(context->sockaddr);
+	kfree(context);
 }
 
 void audit_log_task_context(struct audit_buffer *ab)
@@ -1159,7 +1142,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
 	cred = current_cred();
 
 	spin_lock_irq(&tsk->sighand->siglock);
-	if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
+	if (tsk->signal && tsk->signal->tty)
 		tty = tsk->signal->tty->name;
 	else
 		tty = "(none)";
@@ -1783,42 +1766,6 @@ void __audit_syscall_entry(int arch, int major,
 	if (!context)
 		return;
 
-	/*
-	 * This happens only on certain architectures that make system
-	 * calls in kernel_thread via the entry.S interface, instead of
-	 * with direct calls.  (If you are porting to a new
-	 * architecture, hitting this condition can indicate that you
-	 * got the _exit/_leave calls backward in entry.S.)
-	 *
-	 * i386     no
-	 * x86_64   no
-	 * ppc64    yes (see arch/powerpc/platforms/iseries/misc.S)
-	 *
-	 * This also happens with vm86 emulation in a non-nested manner
-	 * (entries without exits), so this case must be caught.
-	 */
-	if (context->in_syscall) {
-		struct audit_context *newctx;
-
-#if AUDIT_DEBUG
-		printk(KERN_ERR
-		       "audit(:%d) pid=%d in syscall=%d;"
-		       " entering syscall=%d\n",
-		       context->serial, tsk->pid, context->major, major);
-#endif
-		newctx = audit_alloc_context(context->state);
-		if (newctx) {
-			newctx->previous = context;
-			context = newctx;
-			tsk->audit_context = newctx;
-		} else {
-			/* If we can't alloc a new context, the best we
-			 * can do is to leak memory (any pending putname
-			 * will be lost).  The only other alternative is
-			 * to abandon auditing. */
-			audit_zero_context(context, context->state);
-		}
-	}
 	BUG_ON(context->in_syscall || context->name_count);
 
 	if (!audit_enabled)
@@ -1881,28 +1828,21 @@ void __audit_syscall_exit(int success, long return_code)
 	if (!list_empty(&context->killed_trees))
 		audit_kill_trees(&context->killed_trees);
 
-	if (context->previous) {
-		struct audit_context *new_context = context->previous;
-		context->previous = NULL;
-		audit_free_context(context);
-		tsk->audit_context = new_context;
-	} else {
-		audit_free_names(context);
-		unroll_tree_refs(context, NULL, 0);
-		audit_free_aux(context);
-		context->aux = NULL;
-		context->aux_pids = NULL;
-		context->target_pid = 0;
-		context->target_sid = 0;
-		context->sockaddr_len = 0;
-		context->type = 0;
-		context->fds[0] = -1;
-		if (context->state != AUDIT_RECORD_CONTEXT) {
-			kfree(context->filterkey);
-			context->filterkey = NULL;
-		}
-		tsk->audit_context = context;
+	audit_free_names(context);
+	unroll_tree_refs(context, NULL, 0);
+	audit_free_aux(context);
+	context->aux = NULL;
+	context->aux_pids = NULL;
+	context->target_pid = 0;
+	context->target_sid = 0;
+	context->sockaddr_len = 0;
+	context->type = 0;
+	context->fds[0] = -1;
+	if (context->state != AUDIT_RECORD_CONTEXT) {
+		kfree(context->filterkey);
+		context->filterkey = NULL;
 	}
+	tsk->audit_context = context;
 }
 
 static inline void handle_one(const struct inode *inode)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0dbfba2efa77..4855892798fd 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -138,6 +138,9 @@ struct cgroupfs_root {
 	/* Hierarchy-specific flags */
 	unsigned long flags;
 
+	/* IDs for cgroups in this hierarchy */
+	struct ida cgroup_ida;
+
 	/* The path to use for release notifications. */
 	char release_agent_path[PATH_MAX];
 
@@ -171,8 +174,8 @@ struct css_id {
 	 * The css to which this ID points. This pointer is set to valid value
 	 * after cgroup is populated. If cgroup is removed, this will be NULL.
 	 * This pointer is expected to be RCU-safe because destroy()
-	 * is called after synchronize_rcu(). But for safe use, css_is_removed()
-	 * css_tryget() should be used for avoiding race.
+	 * is called after synchronize_rcu(). But for safe use, css_tryget()
+	 * should be used for avoiding race.
 	 */
 	struct cgroup_subsys_state __rcu *css;
 	/*
@@ -242,6 +245,10 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
  */
 static int need_forkexit_callback __read_mostly;
 
+static int cgroup_destroy_locked(struct cgroup *cgrp);
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+			      struct cftype cfts[], bool is_add);
+
 #ifdef CONFIG_PROVE_LOCKING
 int cgroup_lock_is_held(void)
 {
@@ -294,11 +301,6 @@ static int notify_on_release(const struct cgroup *cgrp)
 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 }
 
-static int clone_children(const struct cgroup *cgrp)
-{
-	return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
-}
-
 /*
  * for_each_subsys() allows you to iterate on each subsystem attached to
  * an active hierarchy
@@ -782,12 +784,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  * The task_lock() exception
  *
  * The need for this exception arises from the action of
- * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
+ * cgroup_attach_task(), which overwrites one task's cgroup pointer with
  * another.  It does so using cgroup_mutex, however there are
  * several performance critical places that need to reference
  * task->cgroup without the expense of grabbing a system global
  * mutex.  Therefore except as noted below, when dereferencing or, as
- * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
+ * in cgroup_attach_task(), modifying a task's cgroup pointer we use
  * task_lock(), which acts on a spinlock (task->alloc_lock) already in
  * the task_struct routinely used for such matters.
  *
@@ -854,30 +856,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
 	return inode;
 }
 
-/*
- * Call subsys's pre_destroy handler.
- * This is called before css refcnt check.
- */
-static int cgroup_call_pre_destroy(struct cgroup *cgrp)
-{
-	struct cgroup_subsys *ss;
-	int ret = 0;
-
-	for_each_subsys(cgrp->root, ss) {
-		if (!ss->pre_destroy)
-			continue;
-
-		ret = ss->pre_destroy(cgrp);
-		if (ret) {
-			/* ->pre_destroy() failure is being deprecated */
-			WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
-			break;
-		}
-	}
-
-	return ret;
-}
-
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 {
 	/* is dentry a directory ? if so, kfree() associated cgroup */
@@ -898,7 +876,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 		 * Release the subsystem state objects.
 		 */
 		for_each_subsys(cgrp->root, ss)
-			ss->destroy(cgrp);
+			ss->css_free(cgrp);
 
 		cgrp->root->number_of_cgroups--;
 		mutex_unlock(&cgroup_mutex);
@@ -917,6 +895,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 
 		simple_xattrs_free(&cgrp->xattrs);
 
+		ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
 		kfree_rcu(cgrp, rcu_head);
 	} else {
 		struct cfent *cfe = __d_cfe(dentry);
@@ -987,7 +966,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
 		if (!test_bit(ss->subsys_id, &subsys_mask))
 			continue;
 		list_for_each_entry(set, &ss->cftsets, node)
-			cgroup_rm_file(cgrp, set->cfts);
+			cgroup_addrm_files(cgrp, NULL, set->cfts, false);
 	}
 	if (base_files) {
 		while (!list_empty(&cgrp->files))
@@ -1015,33 +994,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
 }
 
 /*
- * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
- * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
- * reference to css->refcnt. In general, this refcnt is expected to goes down
- * to zero, soon.
- *
- * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
- */
-static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
-
-static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
-{
-	if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
-		wake_up_all(&cgroup_rmdir_waitq);
-}
-
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
-{
-	css_get(css);
-}
-
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
-{
-	cgroup_wakeup_rmdir_waiter(css->cgroup);
-	css_put(css);
-}
-
-/*
  * Call with cgroup_mutex held. Drops reference counts on modules, including
  * any duplicate ones that parse_cgroupfs_options took. If this function
  * returns an error, no reference counts are touched.
@@ -1150,7 +1102,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 		seq_puts(seq, ",xattr");
 	if (strlen(root->release_agent_path))
 		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
-	if (clone_children(&root->top_cgroup))
+	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
 		seq_puts(seq, ",clone_children");
 	if (strlen(root->name))
 		seq_printf(seq, ",name=%s", root->name);
@@ -1162,7 +1114,7 @@ struct cgroup_sb_opts {
 	unsigned long subsys_mask;
 	unsigned long flags;
 	char *release_agent;
-	bool clone_children;
+	bool cpuset_clone_children;
 	char *name;
 	/* User explicitly requested empty subsystem */
 	bool none;
@@ -1213,7 +1165,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 			continue;
 		}
 		if (!strcmp(token, "clone_children")) {
-			opts->clone_children = true;
+			opts->cpuset_clone_children = true;
 			continue;
 		}
 		if (!strcmp(token, "xattr")) {
@@ -1381,7 +1333,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	if (ret)
 		goto out_unlock;
 
-	/* See feature-removal-schedule.txt */
 	if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)
 		pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
 			   task_tgid_nr(current), current->comm);
@@ -1397,14 +1348,21 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 		goto out_unlock;
 	}
 
+	/*
+	 * Clear out the files of subsystems that should be removed, do
+	 * this before rebind_subsystems, since rebind_subsystems may
+	 * change this hierarchy's subsys_list.
+	 */
+	cgroup_clear_directory(cgrp->dentry, false, removed_mask);
+
 	ret = rebind_subsystems(root, opts.subsys_mask);
 	if (ret) {
+		/* rebind_subsystems failed, re-populate the removed files */
+		cgroup_populate_dir(cgrp, false, removed_mask);
 		drop_parsed_module_refcounts(opts.subsys_mask);
 		goto out_unlock;
 	}
 
-	/* clear out any existing files and repopulate subsystem files */
-	cgroup_clear_directory(cgrp->dentry, false, removed_mask);
 	/* re-populate subsystem files */
 	cgroup_populate_dir(cgrp, false, added_mask);
 
@@ -1432,6 +1390,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->children);
 	INIT_LIST_HEAD(&cgrp->files);
 	INIT_LIST_HEAD(&cgrp->css_sets);
+	INIT_LIST_HEAD(&cgrp->allcg_node);
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
 	mutex_init(&cgrp->pidlist_mutex);
@@ -1450,8 +1409,8 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 	root->number_of_cgroups = 1;
 	cgrp->root = root;
 	cgrp->top_cgroup = cgrp;
-	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 	init_cgroup_housekeeping(cgrp);
+	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 }
 
 static bool init_root_id(struct cgroupfs_root *root)
@@ -1518,12 +1477,13 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
 
 	root->subsys_mask = opts->subsys_mask;
 	root->flags = opts->flags;
+	ida_init(&root->cgroup_ida);
 	if (opts->release_agent)
 		strcpy(root->release_agent_path, opts->release_agent);
 	if (opts->name)
 		strcpy(root->name, opts->name);
-	if (opts->clone_children)
-		set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
+	if (opts->cpuset_clone_children)
+		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags);
 	return root;
 }
 
@@ -1536,6 +1496,7 @@ static void cgroup_drop_root(struct cgroupfs_root *root)
 	spin_lock(&hierarchy_id_lock);
 	ida_remove(&hierarchy_ida, root->hierarchy_id);
 	spin_unlock(&hierarchy_id_lock);
+	ida_destroy(&root->cgroup_ida);
 	kfree(root);
 }
 
@@ -1701,7 +1662,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 
 	free_cg_links(&tmp_cg_links);
 
-	BUG_ON(!list_empty(&root_cgrp->sibling));
 	BUG_ON(!list_empty(&root_cgrp->children));
 	BUG_ON(root->number_of_cgroups != 1);
 
@@ -1750,7 +1710,6 @@ static void cgroup_kill_sb(struct super_block *sb) {
 
 	BUG_ON(root->number_of_cgroups != 1);
 	BUG_ON(!list_empty(&cgrp->children));
-	BUG_ON(!list_empty(&cgrp->sibling));
 
 	mutex_lock(&cgroup_mutex);
 	mutex_lock(&cgroup_root_mutex);
@@ -1808,9 +1767,11 @@ static struct kobject *cgroup_kobj;
  */
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 {
+	struct dentry *dentry = cgrp->dentry;
 	char *start;
-	struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
-						      cgroup_lock_is_held());
+
+	rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
+			   "cgroup_path() called without proper locking");
 
 	if (!dentry || cgrp == dummytop) {
 		/*
@@ -1821,9 +1782,9 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 		return 0;
 	}
 
-	start = buf + buflen;
+	start = buf + buflen - 1;
 
-	*--start = '\0';
+	*start = '\0';
 	for (;;) {
 		int len = dentry->d_name.len;
 
@@ -1834,8 +1795,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 		if (!cgrp)
 			break;
 
-		dentry = rcu_dereference_check(cgrp->dentry,
-					       cgroup_lock_is_held());
+		dentry = cgrp->dentry;
 		if (!cgrp->parent)
 			continue;
 		if (--start < buf)
@@ -1930,9 +1890,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
 /*
  * cgroup_task_migrate - move a task from one cgroup to another.
  *
- * 'guarantee' is set if the caller promises that a new css_set for the task
- * will already exist. If not set, this function might sleep, and can fail with
- * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
+ * Must be called with cgroup_mutex and threadgroup locked.
  */
 static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
 				struct task_struct *tsk, struct css_set *newcg)
@@ -2025,12 +1983,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	}
 
 	synchronize_rcu();
-
-	/*
-	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
-	 * is no longer empty.
-	 */
-	cgroup_wakeup_rmdir_waiter(cgrp);
 out:
 	if (retval) {
 		for_each_subsys(root, ss) {
@@ -2200,7 +2152,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 * step 5: success! and cleanup
 	 */
 	synchronize_rcu();
-	cgroup_wakeup_rmdir_waiter(cgrp);
 	retval = 0;
 out_put_css_set_refs:
 	if (retval) {
@@ -2711,10 +2662,17 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
 
 		/* start off with i_nlink == 2 (for "." entry) */
 		inc_nlink(inode);
+		inc_nlink(dentry->d_parent->d_inode);
 
-		/* start with the directory inode held, so that we can
-		 * populate it without racing with another mkdir */
-		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+		/*
+		 * Control reaches here with cgroup_mutex held.
+		 * @inode->i_mutex should nest outside cgroup_mutex but we
+		 * want to populate it immediately without releasing
+		 * cgroup_mutex.  As @inode isn't visible to anyone else
+		 * yet, trylock will always succeed without affecting
+		 * lockdep checks.
+		 */
+		WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
 	} else if (S_ISREG(mode)) {
 		inode->i_size = 0;
 		inode->i_fop = &cgroup_file_operations;
@@ -2725,32 +2683,6 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
 	return 0;
 }
 
-/*
- * cgroup_create_dir - create a directory for an object.
- * @cgrp: the cgroup we create the directory for. It must have a valid
- *        ->parent field. And we are going to fill its ->dentry field.
- * @dentry: dentry of the new cgroup
- * @mode: mode to set on new directory.
- */
-static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
-			     umode_t mode)
-{
-	struct dentry *parent;
-	int error = 0;
-
-	parent = cgrp->parent->dentry;
-	error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
-	if (!error) {
-		dentry->d_fsdata = cgrp;
-		inc_nlink(parent->d_inode);
-		rcu_assign_pointer(cgrp->dentry, dentry);
-		dget(dentry);
-	}
-	dput(dentry);
-
-	return error;
-}
-
 /**
  * cgroup_file_mode - deduce file mode of a control file
  * @cft: the control file in question
@@ -2791,12 +2723,6 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 
 	simple_xattrs_init(&cft->xattrs);
 
-	/* does @cft->flags tell us to skip creation on @cgrp? */
-	if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
-		return 0;
-	if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
-		return 0;
-
 	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
 		strcpy(name, subsys->name);
 		strcat(name, ".");
@@ -2837,6 +2763,12 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 	int err, ret = 0;
 
 	for (cft = cfts; cft->name[0] != '\0'; cft++) {
+		/* does cft->flags tell us to skip this file on @cgrp? */
+		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
+			continue;
+		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
+			continue;
+
 		if (is_add)
 			err = cgroup_add_file(cgrp, subsys, cft);
 		else
@@ -3044,6 +2976,92 @@ static void cgroup_enable_task_cg_lists(void)
 	write_unlock(&css_set_lock);
 }
 
+/**
+ * cgroup_next_descendant_pre - find the next descendant for pre-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @cgroup: cgroup whose descendants to walk
+ *
+ * To be used by cgroup_for_each_descendant_pre().  Find the next
+ * descendant to visit for pre-order traversal of @cgroup's descendants.
+ */
+struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
+					  struct cgroup *cgroup)
+{
+	struct cgroup *next;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	/* if first iteration, pretend we just visited @cgroup */
+	if (!pos) {
+		if (list_empty(&cgroup->children))
+			return NULL;
+		pos = cgroup;
+	}
+
+	/* visit the first child if exists */
+	next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
+	if (next)
+		return next;
+
+	/* no child, visit my or the closest ancestor's next sibling */
+	do {
+		next = list_entry_rcu(pos->sibling.next, struct cgroup,
+				      sibling);
+		if (&next->sibling != &pos->parent->children)
+			return next;
+
+		pos = pos->parent;
+	} while (pos != cgroup);
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
+
+static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
+{
+	struct cgroup *last;
+
+	do {
+		last = pos;
+		pos = list_first_or_null_rcu(&pos->children, struct cgroup,
+					     sibling);
+	} while (pos);
+
+	return last;
+}
+
+/**
+ * cgroup_next_descendant_post - find the next descendant for post-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @cgroup: cgroup whose descendants to walk
+ *
+ * To be used by cgroup_for_each_descendant_post().  Find the next
+ * descendant to visit for post-order traversal of @cgroup's descendants.
+ */
+struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
+					   struct cgroup *cgroup)
+{
+	struct cgroup *next;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	/* if first iteration, visit the leftmost descendant */
+	if (!pos) {
+		next = cgroup_leftmost_descendant(cgroup);
+		return next != cgroup ? next : NULL;
+	}
+
+	/* if there's an unvisited sibling, visit its leftmost descendant */
+	next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
+	if (&next->sibling != &pos->parent->children)
+		return cgroup_leftmost_descendant(next);
+
+	/* no sibling left, visit parent */
+	next = pos->parent;
+	return next != cgroup ? next : NULL;
+}
+EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
+
 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
 	__acquires(css_set_lock)
 {
@@ -3757,7 +3775,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
 	if (flags & POLLHUP) {
 		__remove_wait_queue(event->wqh, &event->wait);
 		spin_lock(&cgrp->event_list_lock);
-		list_del(&event->list);
+		list_del_init(&event->list);
 		spin_unlock(&cgrp->event_list_lock);
 		/*
 		 * We are in atomic context, but cgroup_event_remove() may
@@ -3894,7 +3912,7 @@ fail:
 static u64 cgroup_clone_children_read(struct cgroup *cgrp,
 				      struct cftype *cft)
 {
-	return clone_children(cgrp);
+	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
 }
 
 static int cgroup_clone_children_write(struct cgroup *cgrp,
@@ -3902,9 +3920,9 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
 				     u64 val)
 {
 	if (val)
-		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
 	else
-		clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
 	return 0;
 }
 
@@ -4017,19 +4035,57 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
 	css->flags = 0;
 	css->id = NULL;
 	if (cgrp == dummytop)
-		set_bit(CSS_ROOT, &css->flags);
+		css->flags |= CSS_ROOT;
 	BUG_ON(cgrp->subsys[ss->subsys_id]);
 	cgrp->subsys[ss->subsys_id] = css;
 
 	/*
-	 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
-	 * which is put on the last css_put().  dput() requires process
-	 * context, which css_put() may be called without.  @css->dput_work
-	 * will be used to invoke dput() asynchronously from css_put().
+	 * css holds an extra ref to @cgrp->dentry which is put on the last
+	 * css_put().  dput() requires process context, which css_put() may
+	 * be called without.  @css->dput_work will be used to invoke
+	 * dput() asynchronously from css_put().
 	 */
 	INIT_WORK(&css->dput_work, css_dput_fn);
-	if (ss->__DEPRECATED_clear_css_refs)
-		set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
+}
+
+/* invoke ->post_create() on a new CSS and mark it online if successful */
+static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	int ret = 0;
+
+	lockdep_assert_held(&cgroup_mutex);
+
+	if (ss->css_online)
+		ret = ss->css_online(cgrp);
+	if (!ret)
+		cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE;
+	return ret;
+}
+
+/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */
+static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
+	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
+{
+	struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+
+	lockdep_assert_held(&cgroup_mutex);
+
+	if (!(css->flags & CSS_ONLINE))
+		return;
+
+	/*
+	 * css_offline() should be called with cgroup_mutex unlocked.  See
+	 * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
+	 * details.  This temporary unlocking should go away once
+	 * cgroup_mutex is unexported from controllers.
+	 */
+	if (ss->css_offline) {
+		mutex_unlock(&cgroup_mutex);
+		ss->css_offline(cgrp);
+		mutex_lock(&cgroup_mutex);
+	}
+
+	cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
 }
 
 /*
@@ -4049,10 +4105,27 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	struct cgroup_subsys *ss;
 	struct super_block *sb = root->sb;
 
+	/* allocate the cgroup and its ID, 0 is reserved for the root */
 	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
 	if (!cgrp)
 		return -ENOMEM;
 
+	cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
+	if (cgrp->id < 0)
+		goto err_free_cgrp;
+
+	/*
+	 * Only live parents can have children.  Note that the liveliness
+	 * check isn't strictly necessary because cgroup_mkdir() and
+	 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
+	 * anyway so that locking is contained inside cgroup proper and we
+	 * don't get nasty surprises if we ever grow another caller.
+	 */
+	if (!cgroup_lock_live_group(parent)) {
+		err = -ENODEV;
+		goto err_free_id;
+	}
+
 	/* Grab a reference on the superblock so the hierarchy doesn't
 	 * get deleted on unmount if there are child cgroups.  This
 	 * can be done outside cgroup_mutex, since the sb can't
@@ -4060,8 +4133,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	 * fs */
 	atomic_inc(&sb->s_active);
 
-	mutex_lock(&cgroup_mutex);
-
 	init_cgroup_housekeeping(cgrp);
 
 	cgrp->parent = parent;
@@ -4071,26 +4142,51 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	if (notify_on_release(parent))
 		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 
-	if (clone_children(parent))
-		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
+		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
 
 	for_each_subsys(root, ss) {
 		struct cgroup_subsys_state *css;
 
-		css = ss->create(cgrp);
+		css = ss->css_alloc(cgrp);
 		if (IS_ERR(css)) {
 			err = PTR_ERR(css);
-			goto err_destroy;
+			goto err_free_all;
 		}
 		init_cgroup_css(css, ss, cgrp);
 		if (ss->use_id) {
 			err = alloc_css_id(ss, parent, cgrp);
 			if (err)
-				goto err_destroy;
+				goto err_free_all;
 		}
-		/* At error, ->destroy() callback has to free assigned ID. */
-		if (clone_children(parent) && ss->post_clone)
-			ss->post_clone(cgrp);
+	}
+
+	/*
+	 * Create directory.  cgroup_create_file() returns with the new
+	 * directory locked on success so that it can be populated without
+	 * dropping cgroup_mutex.
+	 */
+	err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
+	if (err < 0)
+		goto err_free_all;
+	lockdep_assert_held(&dentry->d_inode->i_mutex);
+
+	/* allocation complete, commit to creation */
+	dentry->d_fsdata = cgrp;
+	cgrp->dentry = dentry;
+	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
+	list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
+	root->number_of_cgroups++;
+
+	/* each css holds a ref to the cgroup's dentry */
+	for_each_subsys(root, ss)
+		dget(dentry);
+
+	/* creation succeeded, notify subsystems */
+	for_each_subsys(root, ss) {
+		err = online_css(ss, cgrp);
+		if (err)
+			goto err_destroy;
 
 		if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
 		    parent->parent) {
@@ -4102,50 +4198,34 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 		}
 	}
 
-	list_add(&cgrp->sibling, &cgrp->parent->children);
-	root->number_of_cgroups++;
-
-	err = cgroup_create_dir(cgrp, dentry, mode);
-	if (err < 0)
-		goto err_remove;
-
-	/* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
-	for_each_subsys(root, ss)
-		if (!ss->__DEPRECATED_clear_css_refs)
-			dget(dentry);
-
-	/* The cgroup directory was pre-locked for us */
-	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
-
-	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
-
 	err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
-	/* If err < 0, we have a half-filled directory - oh well ;) */
+	if (err)
+		goto err_destroy;
 
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
 
 	return 0;
 
- err_remove:
-
-	list_del(&cgrp->sibling);
-	root->number_of_cgroups--;
-
- err_destroy:
-
+err_free_all:
 	for_each_subsys(root, ss) {
 		if (cgrp->subsys[ss->subsys_id])
-			ss->destroy(cgrp);
+			ss->css_free(cgrp);
 	}
-
 	mutex_unlock(&cgroup_mutex);
-
 	/* Release the reference count that we took on the superblock */
 	deactivate_super(sb);
-
+err_free_id:
+	ida_simple_remove(&root->cgroup_ida, cgrp->id);
+err_free_cgrp:
 	kfree(cgrp);
 	return err;
+
+err_destroy:
+	cgroup_destroy_locked(cgrp);
+	mutex_unlock(&cgroup_mutex);
+	mutex_unlock(&dentry->d_inode->i_mutex);
+	return err;
 }
 
 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@ -4197,153 +4277,60 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
 	return 0;
 }
 
-/*
- * Atomically mark all (or else none) of the cgroup's CSS objects as
- * CSS_REMOVED. Return true on success, or false if the cgroup has
- * busy subsystems. Call with cgroup_mutex held
- *
- * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
- * not, cgroup removal behaves differently.
- *
- * If clear is set, css refcnt for the subsystem should be zero before
- * cgroup removal can be committed. This is implemented by
- * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
- * called multiple times until all css refcnts reach zero and is allowed to
- * veto removal on any invocation. This behavior is deprecated and will be
- * removed as soon as the existing user (memcg) is updated.
- *
- * If clear is not set, each css holds an extra reference to the cgroup's
- * dentry and cgroup removal proceeds regardless of css refs.
- * ->pre_destroy() will be called at least once and is not allowed to fail.
- * On the last put of each css, whenever that may be, the extra dentry ref
- * is put so that dentry destruction happens only after all css's are
- * released.
- */
-static int cgroup_clear_css_refs(struct cgroup *cgrp)
+static int cgroup_destroy_locked(struct cgroup *cgrp)
+	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
+	struct dentry *d = cgrp->dentry;
+	struct cgroup *parent = cgrp->parent;
+	DEFINE_WAIT(wait);
+	struct cgroup_event *event, *tmp;
 	struct cgroup_subsys *ss;
-	unsigned long flags;
-	bool failed = false;
+	LIST_HEAD(tmp_list);
+
+	lockdep_assert_held(&d->d_inode->i_mutex);
+	lockdep_assert_held(&cgroup_mutex);
 
-	local_irq_save(flags);
+	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children))
+		return -EBUSY;
 
 	/*
-	 * Block new css_tryget() by deactivating refcnt. If all refcnts
-	 * for subsystems w/ clear_css_refs set were 1 at the moment of
-	 * deactivation, we succeeded.
+	 * Block new css_tryget() by deactivating refcnt and mark @cgrp
+	 * removed.  This makes future css_tryget() and child creation
+	 * attempts fail thus maintaining the removal conditions verified
+	 * above.
 	 */
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
 
 		WARN_ON(atomic_read(&css->refcnt) < 0);
 		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
-
-		if (ss->__DEPRECATED_clear_css_refs)
-			failed |= css_refcnt(css) != 1;
-	}
-
-	/*
-	 * If succeeded, set REMOVED and put all the base refs; otherwise,
-	 * restore refcnts to positive values. Either way, all in-progress
-	 * css_tryget() will be released.
-	 */
-	for_each_subsys(cgrp->root, ss) {
-		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-
-		if (!failed) {
-			set_bit(CSS_REMOVED, &css->flags);
-			css_put(css);
-		} else {
-			atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
-		}
 	}
+	set_bit(CGRP_REMOVED, &cgrp->flags);
 
-	local_irq_restore(flags);
-	return !failed;
-}
-
-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
-{
-	struct cgroup *cgrp = dentry->d_fsdata;
-	struct dentry *d;
-	struct cgroup *parent;
-	DEFINE_WAIT(wait);
-	struct cgroup_event *event, *tmp;
-	int ret;
-
-	/* the vfs holds both inode->i_mutex already */
-again:
-	mutex_lock(&cgroup_mutex);
-	if (atomic_read(&cgrp->count) != 0) {
-		mutex_unlock(&cgroup_mutex);
-		return -EBUSY;
-	}
-	if (!list_empty(&cgrp->children)) {
-		mutex_unlock(&cgroup_mutex);
-		return -EBUSY;
-	}
-	mutex_unlock(&cgroup_mutex);
-
-	/*
-	 * In general, subsystem has no css->refcnt after pre_destroy(). But
-	 * in racy cases, subsystem may have to get css->refcnt after
-	 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
-	 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
-	 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
-	 * and subsystem's reference count handling. Please see css_get/put
-	 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
-	 */
-	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+	/* tell subsystems to initate destruction */
+	for_each_subsys(cgrp->root, ss)
+		offline_css(ss, cgrp);
 
 	/*
-	 * Call pre_destroy handlers of subsys. Notify subsystems
-	 * that rmdir() request comes.
+	 * Put all the base refs.  Each css holds an extra reference to the
+	 * cgroup's dentry and cgroup removal proceeds regardless of css
+	 * refs.  On the last put of each css, whenever that may be, the
+	 * extra dentry ref is put so that dentry destruction happens only
+	 * after all css's are released.
 	 */
-	ret = cgroup_call_pre_destroy(cgrp);
-	if (ret) {
-		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
-		return ret;
-	}
-
-	mutex_lock(&cgroup_mutex);
-	parent = cgrp->parent;
-	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
-		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
-		mutex_unlock(&cgroup_mutex);
-		return -EBUSY;
-	}
-	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
-	if (!cgroup_clear_css_refs(cgrp)) {
-		mutex_unlock(&cgroup_mutex);
-		/*
-		 * Because someone may call cgroup_wakeup_rmdir_waiter() before
-		 * prepare_to_wait(), we need to check this flag.
-		 */
-		if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
-			schedule();
-		finish_wait(&cgroup_rmdir_waitq, &wait);
-		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
-		if (signal_pending(current))
-			return -EINTR;
-		goto again;
-	}
-	/* NO css_tryget() can success after here. */
-	finish_wait(&cgroup_rmdir_waitq, &wait);
-	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+	for_each_subsys(cgrp->root, ss)
+		css_put(cgrp->subsys[ss->subsys_id]);
 
 	raw_spin_lock(&release_list_lock);
-	set_bit(CGRP_REMOVED, &cgrp->flags);
 	if (!list_empty(&cgrp->release_list))
 		list_del_init(&cgrp->release_list);
 	raw_spin_unlock(&release_list_lock);
 
 	/* delete this cgroup from parent->children */
-	list_del_init(&cgrp->sibling);
-
+	list_del_rcu(&cgrp->sibling);
 	list_del_init(&cgrp->allcg_node);
 
-	d = dget(cgrp->dentry);
-
+	dget(d);
 	cgroup_d_remove_dir(d);
 	dput(d);
 
@@ -4353,21 +4340,35 @@ again:
 	/*
 	 * Unregister events and notify userspace.
 	 * Notify userspace about cgroup removing only after rmdir of cgroup
-	 * directory to avoid race between userspace and kernelspace
+	 * directory to avoid race between userspace and kernelspace. Use
+	 * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
+	 * cgroup_event_wake() is called with the wait queue head locked,
+	 * remove_wait_queue() cannot be called while holding event_list_lock.
 	 */
 	spin_lock(&cgrp->event_list_lock);
-	list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
-		list_del(&event->list);
+	list_splice_init(&cgrp->event_list, &tmp_list);
+	spin_unlock(&cgrp->event_list_lock);
+	list_for_each_entry_safe(event, tmp, &tmp_list, list) {
+		list_del_init(&event->list);
 		remove_wait_queue(event->wqh, &event->wait);
 		eventfd_signal(event->eventfd, 1);
 		schedule_work(&event->remove);
 	}
-	spin_unlock(&cgrp->event_list_lock);
 
-	mutex_unlock(&cgroup_mutex);
 	return 0;
 }
 
+static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
+{
+	int ret;
+
+	mutex_lock(&cgroup_mutex);
+	ret = cgroup_destroy_locked(dentry->d_fsdata);
+	mutex_unlock(&cgroup_mutex);
+
+	return ret;
+}
+
 static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
 {
 	INIT_LIST_HEAD(&ss->cftsets);
@@ -4388,13 +4389,15 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 
 	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
 
+	mutex_lock(&cgroup_mutex);
+
 	/* init base cftset */
 	cgroup_init_cftsets(ss);
 
 	/* Create the top cgroup state for this subsystem */
 	list_add(&ss->sibling, &rootnode.subsys_list);
 	ss->root = &rootnode;
-	css = ss->create(dummytop);
+	css = ss->css_alloc(dummytop);
 	/* We don't handle early failures gracefully */
 	BUG_ON(IS_ERR(css));
 	init_cgroup_css(css, ss, dummytop);
@@ -4403,7 +4406,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 	 * pointer to this state - since the subsystem is
 	 * newly registered, all tasks and hence the
 	 * init_css_set is in the subsystem's top cgroup. */
-	init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
+	init_css_set.subsys[ss->subsys_id] = css;
 
 	need_forkexit_callback |= ss->fork || ss->exit;
 
@@ -4413,6 +4416,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 	BUG_ON(!list_empty(&init_task.tasks));
 
 	ss->active = 1;
+	BUG_ON(online_css(ss, dummytop));
+
+	mutex_unlock(&cgroup_mutex);
 
 	/* this function shouldn't be used with modular subsystems, since they
 	 * need to register a subsys_id, among other things */
@@ -4430,12 +4436,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
  */
 int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 {
-	int i;
 	struct cgroup_subsys_state *css;
+	int i, ret;
 
 	/* check name and function validity */
 	if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
-	    ss->create == NULL || ss->destroy == NULL)
+	    ss->css_alloc == NULL || ss->css_free == NULL)
 		return -EINVAL;
 
 	/*
@@ -4464,10 +4470,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 	subsys[ss->subsys_id] = ss;
 
 	/*
-	 * no ss->create seems to need anything important in the ss struct, so
-	 * this can happen first (i.e. before the rootnode attachment).
+	 * no ss->css_alloc seems to need anything important in the ss
+	 * struct, so this can happen first (i.e. before the rootnode
+	 * attachment).
 	 */
-	css = ss->create(dummytop);
+	css = ss->css_alloc(dummytop);
 	if (IS_ERR(css)) {
 		/* failure case - need to deassign the subsys[] slot. */
 		subsys[ss->subsys_id] = NULL;
@@ -4482,14 +4489,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 	init_cgroup_css(css, ss, dummytop);
 	/* init_idr must be after init_cgroup_css because it sets css->id. */
 	if (ss->use_id) {
-		int ret = cgroup_init_idr(ss, css);
-		if (ret) {
-			dummytop->subsys[ss->subsys_id] = NULL;
-			ss->destroy(dummytop);
-			subsys[ss->subsys_id] = NULL;
-			mutex_unlock(&cgroup_mutex);
-			return ret;
-		}
+		ret = cgroup_init_idr(ss, css);
+		if (ret)
+			goto err_unload;
 	}
 
 	/*
@@ -4522,10 +4524,19 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4522 write_unlock(&css_set_lock); 4524 write_unlock(&css_set_lock);
4523 4525
4524 ss->active = 1; 4526 ss->active = 1;
4527 ret = online_css(ss, dummytop);
4528 if (ret)
4529 goto err_unload;
4525 4530
4526 /* success! */ 4531 /* success! */
4527 mutex_unlock(&cgroup_mutex); 4532 mutex_unlock(&cgroup_mutex);
4528 return 0; 4533 return 0;
4534
4535err_unload:
4536 mutex_unlock(&cgroup_mutex);
4537 /* @ss can't be mounted here as try_module_get() would fail */
4538 cgroup_unload_subsys(ss);
4539 return ret;
4529} 4540}
4530EXPORT_SYMBOL_GPL(cgroup_load_subsys); 4541EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4531 4542
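
The hunk above collapses the open-coded failure cleanup in cgroup_load_subsys() into a single err_unload label that funnels every late failure through cgroup_unload_subsys(). The sketch below is a simplified userspace model of that "single unwind label" idiom, not the kernel code; setup_* and unload_subsys() are made-up stand-ins for the real registration steps.

#include <stdio.h>

struct subsys {
	int registered;
	int id_initialized;
	int online;
};

static int register_subsys(struct subsys *ss) { ss->registered = 1; return 0; }
static int init_subsys_id(struct subsys *ss)  { ss->id_initialized = 1; return 0; }
static int bring_online(struct subsys *ss)    { (void)ss; return -1; /* pretend this step fails */ }

static void unload_subsys(struct subsys *ss)
{
	/* Mirror of the error path: undo whatever the success path set up. */
	ss->online = 0;
	ss->id_initialized = 0;
	ss->registered = 0;
}

static int load_subsys(struct subsys *ss)
{
	int ret;

	ret = register_subsys(ss);
	if (ret)
		return ret;              /* nothing to undo yet */

	ret = init_subsys_id(ss);
	if (ret)
		goto err_unload;

	ret = bring_online(ss);
	if (ret)
		goto err_unload;

	return 0;                        /* success */

err_unload:
	unload_subsys(ss);               /* one cleanup path for all late failures */
	return ret;
}

int main(void)
{
	struct subsys ss = {0};

	printf("load_subsys() = %d, registered=%d\n", load_subsys(&ss), ss.registered);
	return 0;
}

Funneling every failure into the same teardown keeps the error path from drifting out of sync with the success path as new steps are added.
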
@@ -4552,6 +4563,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4552 BUG_ON(ss->root != &rootnode); 4563 BUG_ON(ss->root != &rootnode);
4553 4564
4554 mutex_lock(&cgroup_mutex); 4565 mutex_lock(&cgroup_mutex);
4566
4567 offline_css(ss, dummytop);
4568 ss->active = 0;
4569
4570 if (ss->use_id) {
4571 idr_remove_all(&ss->idr);
4572 idr_destroy(&ss->idr);
4573 }
4574
4555 /* deassign the subsys_id */ 4575 /* deassign the subsys_id */
4556 subsys[ss->subsys_id] = NULL; 4576 subsys[ss->subsys_id] = NULL;
4557 4577
@@ -4567,7 +4587,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4567 struct css_set *cg = link->cg; 4587 struct css_set *cg = link->cg;
4568 4588
4569 hlist_del(&cg->hlist); 4589 hlist_del(&cg->hlist);
4570 BUG_ON(!cg->subsys[ss->subsys_id]);
4571 cg->subsys[ss->subsys_id] = NULL; 4590 cg->subsys[ss->subsys_id] = NULL;
4572 hhead = css_set_hash(cg->subsys); 4591 hhead = css_set_hash(cg->subsys);
4573 hlist_add_head(&cg->hlist, hhead); 4592 hlist_add_head(&cg->hlist, hhead);
@@ -4575,12 +4594,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4575 write_unlock(&css_set_lock); 4594 write_unlock(&css_set_lock);
4576 4595
4577 /* 4596 /*
4578 * remove subsystem's css from the dummytop and free it - need to free 4597 * remove subsystem's css from the dummytop and free it - need to
4579 * before marking as null because ss->destroy needs the cgrp->subsys 4598 * free before marking as null because ss->css_free needs the
4580 * pointer to find their state. note that this also takes care of 4599 * cgrp->subsys pointer to find their state. note that this also
4581 * freeing the css_id. 4600 * takes care of freeing the css_id.
4582 */ 4601 */
4583 ss->destroy(dummytop); 4602 ss->css_free(dummytop);
4584 dummytop->subsys[ss->subsys_id] = NULL; 4603 dummytop->subsys[ss->subsys_id] = NULL;
4585 4604
4586 mutex_unlock(&cgroup_mutex); 4605 mutex_unlock(&cgroup_mutex);
@@ -4624,8 +4643,8 @@ int __init cgroup_init_early(void)
4624 4643
4625 BUG_ON(!ss->name); 4644 BUG_ON(!ss->name);
4626 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4645 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
4627 BUG_ON(!ss->create); 4646 BUG_ON(!ss->css_alloc);
4628 BUG_ON(!ss->destroy); 4647 BUG_ON(!ss->css_free);
4629 if (ss->subsys_id != i) { 4648 if (ss->subsys_id != i) {
4630 printk(KERN_ERR "cgroup: Subsys %s id == %d\n", 4649 printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
4631 ss->name, ss->subsys_id); 4650 ss->name, ss->subsys_id);
@@ -4832,44 +4851,19 @@ void cgroup_fork(struct task_struct *child)
4832} 4851}
4833 4852
4834/** 4853/**
4835 * cgroup_fork_callbacks - run fork callbacks
4836 * @child: the new task
4837 *
4838 * Called on a new task very soon before adding it to the
4839 * tasklist. No need to take any locks since no-one can
4840 * be operating on this task.
4841 */
4842void cgroup_fork_callbacks(struct task_struct *child)
4843{
4844 if (need_forkexit_callback) {
4845 int i;
4846 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4847 struct cgroup_subsys *ss = subsys[i];
4848
4849 /*
4850 * forkexit callbacks are only supported for
4851 * builtin subsystems.
4852 */
4853 if (!ss || ss->module)
4854 continue;
4855
4856 if (ss->fork)
4857 ss->fork(child);
4858 }
4859 }
4860}
4861
4862/**
4863 * cgroup_post_fork - called on a new task after adding it to the task list 4854 * cgroup_post_fork - called on a new task after adding it to the task list
4864 * @child: the task in question 4855 * @child: the task in question
4865 * 4856 *
4866 * Adds the task to the list running through its css_set if necessary. 4857 * Adds the task to the list running through its css_set if necessary and
4867 * Has to be after the task is visible on the task list in case we race 4858 * call the subsystem fork() callbacks. Has to be after the task is
4868 * with the first call to cgroup_iter_start() - to guarantee that the 4859 * visible on the task list in case we race with the first call to
4869 * new task ends up on its list. 4860 * cgroup_iter_start() - to guarantee that the new task ends up on its
4861 * list.
4870 */ 4862 */
4871void cgroup_post_fork(struct task_struct *child) 4863void cgroup_post_fork(struct task_struct *child)
4872{ 4864{
4865 int i;
4866
4873 /* 4867 /*
4874 * use_task_css_set_links is set to 1 before we walk the tasklist 4868 * use_task_css_set_links is set to 1 before we walk the tasklist
4875 * under the tasklist_lock and we read it here after we added the child 4869 * under the tasklist_lock and we read it here after we added the child
@@ -4889,7 +4883,30 @@ void cgroup_post_fork(struct task_struct *child)
4889 task_unlock(child); 4883 task_unlock(child);
4890 write_unlock(&css_set_lock); 4884 write_unlock(&css_set_lock);
4891 } 4885 }
4886
4887 /*
4888 * Call ss->fork(). This must happen after @child is linked on
4889 * css_set; otherwise, @child might change state between ->fork()
4890 * and addition to css_set.
4891 */
4892 if (need_forkexit_callback) {
4893 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4894 struct cgroup_subsys *ss = subsys[i];
4895
4896 /*
4897 * fork/exit callbacks are supported only for
4898 * builtin subsystems and we don't need further
4899 * synchronization as they never go away.
4900 */
4901 if (!ss || ss->module)
4902 continue;
4903
4904 if (ss->fork)
4905 ss->fork(child);
4906 }
4907 }
4892} 4908}
4909
4893/** 4910/**
4894 * cgroup_exit - detach cgroup from exiting task 4911 * cgroup_exit - detach cgroup from exiting task
4895 * @tsk: pointer to task_struct of exiting process 4912 * @tsk: pointer to task_struct of exiting process
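
The cgroup_post_fork() hunk above moves the per-subsystem ->fork() callbacks so they run only after the child is linked onto its css_set, and only for built-in (non-module) subsystems. A small userspace sketch of that ordering follows; struct my_subsys, link_task() and freezer_fork_cb() are illustrative names, not kernel API.

#include <stdio.h>
#include <stddef.h>

struct task { int id; int linked; };

struct my_subsys {
	const char *name;
	int is_module;                   /* modular subsystems get no fork callback */
	void (*fork)(struct task *t);
};

static void freezer_fork_cb(struct task *t) { printf("freezer sees task %d\n", t->id); }

static struct my_subsys subsystems[] = {
	{ "freezer", 0, freezer_fork_cb },
	{ "some_module", 1, NULL },
};

static void link_task(struct task *t) { t->linked = 1; /* stand-in for css_set linkage */ }

static void post_fork(struct task *t)
{
	size_t i;

	/* Link first: callbacks must observe a task that is already visible
	 * on its css_set, so no state change can slip in between the
	 * callback and the linkage. */
	link_task(t);

	for (i = 0; i < sizeof(subsystems) / sizeof(subsystems[0]); i++) {
		struct my_subsys *ss = &subsystems[i];

		if (ss->is_module || !ss->fork)
			continue;
		ss->fork(t);
	}
}

int main(void)
{
	struct task t = { .id = 42 };

	post_fork(&t);
	return 0;
}
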
@@ -5022,15 +5039,17 @@ static void check_for_release(struct cgroup *cgrp)
5022/* Caller must verify that the css is not for root cgroup */ 5039/* Caller must verify that the css is not for root cgroup */
5023bool __css_tryget(struct cgroup_subsys_state *css) 5040bool __css_tryget(struct cgroup_subsys_state *css)
5024{ 5041{
5025 do { 5042 while (true) {
5026 int v = css_refcnt(css); 5043 int t, v;
5027 5044
5028 if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) 5045 v = css_refcnt(css);
5046 t = atomic_cmpxchg(&css->refcnt, v, v + 1);
5047 if (likely(t == v))
5029 return true; 5048 return true;
5049 else if (t < 0)
5050 return false;
5030 cpu_relax(); 5051 cpu_relax();
5031 } while (!test_bit(CSS_REMOVED, &css->flags)); 5052 }
5032
5033 return false;
5034} 5053}
5035EXPORT_SYMBOL_GPL(__css_tryget); 5054EXPORT_SYMBOL_GPL(__css_tryget);
5036 5055
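
The rewritten __css_tryget() above spins on cmpxchg and treats a negative refcount as "being destroyed" instead of testing a CSS_REMOVED flag. Below is a minimal userspace analogue using C11 atomics, under the same convention that a negative count means the object is going away; the kernel's refcount bias handled by css_refcnt() is deliberately omitted here.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Userspace model of a cmpxchg-based tryget: succeed only while the
 * count is non-negative; a negative count marks the object as dying. */
static bool tryget(atomic_int *refcnt)
{
	int v = atomic_load(refcnt);

	while (true) {
		if (v < 0)
			return false;    /* already being destroyed */
		/* On failure, v is reloaded with the current value. */
		if (atomic_compare_exchange_weak(refcnt, &v, v + 1))
			return true;
	}
}

int main(void)
{
	atomic_int live = 1, dying = -1;

	printf("live:  %d\n", tryget(&live));   /* 1 */
	printf("dying: %d\n", tryget(&dying));  /* 0 */
	return 0;
}
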
@@ -5049,11 +5068,9 @@ void __css_put(struct cgroup_subsys_state *css)
5049 set_bit(CGRP_RELEASABLE, &cgrp->flags); 5068 set_bit(CGRP_RELEASABLE, &cgrp->flags);
5050 check_for_release(cgrp); 5069 check_for_release(cgrp);
5051 } 5070 }
5052 cgroup_wakeup_rmdir_waiter(cgrp);
5053 break; 5071 break;
5054 case 0: 5072 case 0:
5055 if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) 5073 schedule_work(&css->dput_work);
5056 schedule_work(&css->dput_work);
5057 break; 5074 break;
5058 } 5075 }
5059 rcu_read_unlock(); 5076 rcu_read_unlock();
@@ -5439,7 +5456,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5439} 5456}
5440 5457
5441#ifdef CONFIG_CGROUP_DEBUG 5458#ifdef CONFIG_CGROUP_DEBUG
5442static struct cgroup_subsys_state *debug_create(struct cgroup *cont) 5459static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
5443{ 5460{
5444 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5461 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5445 5462
@@ -5449,7 +5466,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
5449 return css; 5466 return css;
5450} 5467}
5451 5468
5452static void debug_destroy(struct cgroup *cont) 5469static void debug_css_free(struct cgroup *cont)
5453{ 5470{
5454 kfree(cont->subsys[debug_subsys_id]); 5471 kfree(cont->subsys[debug_subsys_id]);
5455} 5472}
@@ -5578,8 +5595,8 @@ static struct cftype debug_files[] = {
5578 5595
5579struct cgroup_subsys debug_subsys = { 5596struct cgroup_subsys debug_subsys = {
5580 .name = "debug", 5597 .name = "debug",
5581 .create = debug_create, 5598 .css_alloc = debug_css_alloc,
5582 .destroy = debug_destroy, 5599 .css_free = debug_css_free,
5583 .subsys_id = debug_subsys_id, 5600 .subsys_id = debug_subsys_id,
5584 .base_cftypes = debug_files, 5601 .base_cftypes = debug_files,
5585}; 5602};
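
Taken together, the cgroup.c changes split the old single-step ->create()/->destroy() pair into an allocation stage (css_alloc/css_free) and a commit stage (css_online/css_offline). The following is a hedged sketch of that two-stage lifecycle as a plain C ops table; struct lifecycle_ops and the demo_* functions are illustrative, not the kernel's types.

#include <stdio.h>
#include <stdlib.h>

/* Two-stage object lifecycle: alloc/free only touch memory, while
 * online/offline commit the object to (or retire it from) the visible
 * hierarchy and may fail or do ordering-sensitive work. */
struct state { int online; };

struct lifecycle_ops {
	struct state *(*alloc)(void);
	int  (*online)(struct state *s);
	void (*offline)(struct state *s);
	void (*free)(struct state *s);
};

static struct state *demo_alloc(void)     { return calloc(1, sizeof(struct state)); }
static int  demo_online(struct state *s)  { s->online = 1; return 0; }
static void demo_offline(struct state *s) { s->online = 0; }
static void demo_free(struct state *s)    { free(s); }

static const struct lifecycle_ops demo_ops = {
	.alloc = demo_alloc, .online = demo_online,
	.offline = demo_offline, .free = demo_free,
};

int main(void)
{
	struct state *s = demo_ops.alloc();

	if (!s)
		return 1;
	if (demo_ops.online(s)) {        /* failed commit: only free is needed */
		demo_ops.free(s);
		return 1;
	}
	printf("online=%d\n", s->online);
	demo_ops.offline(s);             /* retire before freeing */
	demo_ops.free(s);
	return 0;
}

Separating allocation from commit is what lets online_css()/offline_css() run under cgroup_mutex while allocation and freeing stay simple and failure-free.
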
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index b1724ce98981..75dda1ea5026 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -22,24 +22,33 @@
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/seq_file.h> 23#include <linux/seq_file.h>
24 24
25enum freezer_state { 25/*
26 CGROUP_THAWED = 0, 26 * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
27 CGROUP_FREEZING, 27 * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared
28 CGROUP_FROZEN, 28 * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING
29 * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of
30 * its ancestors has FREEZING_SELF set.
31 */
32enum freezer_state_flags {
33 CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */
34 CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */
35 CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */
36 CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */
37
38 /* mask for all FREEZING flags */
39 CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
29}; 40};
30 41
31struct freezer { 42struct freezer {
32 struct cgroup_subsys_state css; 43 struct cgroup_subsys_state css;
33 enum freezer_state state; 44 unsigned int state;
34 spinlock_t lock; /* protects _writes_ to state */ 45 spinlock_t lock;
35}; 46};
36 47
37static inline struct freezer *cgroup_freezer( 48static inline struct freezer *cgroup_freezer(struct cgroup *cgroup)
38 struct cgroup *cgroup)
39{ 49{
40 return container_of( 50 return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id),
41 cgroup_subsys_state(cgroup, freezer_subsys_id), 51 struct freezer, css);
42 struct freezer, css);
43} 52}
44 53
45static inline struct freezer *task_freezer(struct task_struct *task) 54static inline struct freezer *task_freezer(struct task_struct *task)
@@ -48,14 +57,21 @@ static inline struct freezer *task_freezer(struct task_struct *task)
48 struct freezer, css); 57 struct freezer, css);
49} 58}
50 59
60static struct freezer *parent_freezer(struct freezer *freezer)
61{
62 struct cgroup *pcg = freezer->css.cgroup->parent;
63
64 if (pcg)
65 return cgroup_freezer(pcg);
66 return NULL;
67}
68
51bool cgroup_freezing(struct task_struct *task) 69bool cgroup_freezing(struct task_struct *task)
52{ 70{
53 enum freezer_state state;
54 bool ret; 71 bool ret;
55 72
56 rcu_read_lock(); 73 rcu_read_lock();
57 state = task_freezer(task)->state; 74 ret = task_freezer(task)->state & CGROUP_FREEZING;
58 ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN;
59 rcu_read_unlock(); 75 rcu_read_unlock();
60 76
61 return ret; 77 return ret;
@@ -65,70 +81,18 @@ bool cgroup_freezing(struct task_struct *task)
65 * cgroups_write_string() limits the size of freezer state strings to 81 * cgroups_write_string() limits the size of freezer state strings to
66 * CGROUP_LOCAL_BUFFER_SIZE 82 * CGROUP_LOCAL_BUFFER_SIZE
67 */ 83 */
68static const char *freezer_state_strs[] = { 84static const char *freezer_state_strs(unsigned int state)
69 "THAWED", 85{
70 "FREEZING", 86 if (state & CGROUP_FROZEN)
71 "FROZEN", 87 return "FROZEN";
88 if (state & CGROUP_FREEZING)
89 return "FREEZING";
90 return "THAWED";
72}; 91};
73 92
74/*
75 * State diagram
76 * Transitions are caused by userspace writes to the freezer.state file.
77 * The values in parenthesis are state labels. The rest are edge labels.
78 *
79 * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
80 * ^ ^ | |
81 * | \_______THAWED_______/ |
82 * \__________________________THAWED____________/
83 */
84
85struct cgroup_subsys freezer_subsys; 93struct cgroup_subsys freezer_subsys;
86 94
87/* Locks taken and their ordering 95static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup)
88 * ------------------------------
89 * cgroup_mutex (AKA cgroup_lock)
90 * freezer->lock
91 * css_set_lock
92 * task->alloc_lock (AKA task_lock)
93 * task->sighand->siglock
94 *
95 * cgroup code forces css_set_lock to be taken before task->alloc_lock
96 *
97 * freezer_create(), freezer_destroy():
98 * cgroup_mutex [ by cgroup core ]
99 *
100 * freezer_can_attach():
101 * cgroup_mutex (held by caller of can_attach)
102 *
103 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
104 * freezer->lock
105 * sighand->siglock (if the cgroup is freezing)
106 *
107 * freezer_read():
108 * cgroup_mutex
109 * freezer->lock
110 * write_lock css_set_lock (cgroup iterator start)
111 * task->alloc_lock
112 * read_lock css_set_lock (cgroup iterator start)
113 *
114 * freezer_write() (freeze):
115 * cgroup_mutex
116 * freezer->lock
117 * write_lock css_set_lock (cgroup iterator start)
118 * task->alloc_lock
119 * read_lock css_set_lock (cgroup iterator start)
120 * sighand->siglock (fake signal delivery inside freeze_task())
121 *
122 * freezer_write() (unfreeze):
123 * cgroup_mutex
124 * freezer->lock
125 * write_lock css_set_lock (cgroup iterator start)
126 * task->alloc_lock
127 * read_lock css_set_lock (cgroup iterator start)
128 * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
129 * sighand->siglock
130 */
131static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
132{ 96{
133 struct freezer *freezer; 97 struct freezer *freezer;
134 98
@@ -137,160 +101,244 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
137 return ERR_PTR(-ENOMEM); 101 return ERR_PTR(-ENOMEM);
138 102
139 spin_lock_init(&freezer->lock); 103 spin_lock_init(&freezer->lock);
140 freezer->state = CGROUP_THAWED;
141 return &freezer->css; 104 return &freezer->css;
142} 105}
143 106
144static void freezer_destroy(struct cgroup *cgroup) 107/**
108 * freezer_css_online - commit creation of a freezer cgroup
109 * @cgroup: cgroup being created
110 *
111 * We're committing to creation of @cgroup. Mark it online and inherit
112 * parent's freezing state while holding both parent's and our
113 * freezer->lock.
114 */
115static int freezer_css_online(struct cgroup *cgroup)
116{
117 struct freezer *freezer = cgroup_freezer(cgroup);
118 struct freezer *parent = parent_freezer(freezer);
119
120 /*
121 * The following double locking and freezing state inheritance
122 * guarantee that @cgroup can never escape ancestors' freezing
123 * states. See cgroup_for_each_descendant_pre() for details.
124 */
125 if (parent)
126 spin_lock_irq(&parent->lock);
127 spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING);
128
129 freezer->state |= CGROUP_FREEZER_ONLINE;
130
131 if (parent && (parent->state & CGROUP_FREEZING)) {
132 freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
133 atomic_inc(&system_freezing_cnt);
134 }
135
136 spin_unlock(&freezer->lock);
137 if (parent)
138 spin_unlock_irq(&parent->lock);
139
140 return 0;
141}
142
143/**
144 * freezer_css_offline - initiate destruction of @cgroup
145 * @cgroup: cgroup being destroyed
146 *
147 * @cgroup is going away. Mark it dead and decrement system_freezing_count
148 * if it was holding one.
149 */
150static void freezer_css_offline(struct cgroup *cgroup)
145{ 151{
146 struct freezer *freezer = cgroup_freezer(cgroup); 152 struct freezer *freezer = cgroup_freezer(cgroup);
147 153
148 if (freezer->state != CGROUP_THAWED) 154 spin_lock_irq(&freezer->lock);
155
156 if (freezer->state & CGROUP_FREEZING)
149 atomic_dec(&system_freezing_cnt); 157 atomic_dec(&system_freezing_cnt);
150 kfree(freezer); 158
159 freezer->state = 0;
160
161 spin_unlock_irq(&freezer->lock);
151} 162}
152 163
153/* task is frozen or will freeze immediately when next it gets woken */ 164static void freezer_css_free(struct cgroup *cgroup)
154static bool is_task_frozen_enough(struct task_struct *task)
155{ 165{
156 return frozen(task) || 166 kfree(cgroup_freezer(cgroup));
157 (task_is_stopped_or_traced(task) && freezing(task));
158} 167}
159 168
160/* 169/*
161 * The call to cgroup_lock() in the freezer.state write method prevents 170 * Tasks can be migrated into a different freezer anytime regardless of its
162 * a write to that file racing against an attach, and hence the 171 * current state. freezer_attach() is responsible for making new tasks
163 * can_attach() result will remain valid until the attach completes. 172 * conform to the current state.
173 *
174 * Freezer state changes and task migration are synchronized via
175 * @freezer->lock. freezer_attach() makes the new tasks conform to the
176 * current state and all following state changes can see the new tasks.
164 */ 177 */
165static int freezer_can_attach(struct cgroup *new_cgroup, 178static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset)
166 struct cgroup_taskset *tset)
167{ 179{
168 struct freezer *freezer; 180 struct freezer *freezer = cgroup_freezer(new_cgrp);
169 struct task_struct *task; 181 struct task_struct *task;
182 bool clear_frozen = false;
183
184 spin_lock_irq(&freezer->lock);
170 185
171 /* 186 /*
172 * Anything frozen can't move or be moved to/from. 187 * Make the new tasks conform to the current state of @new_cgrp.
188 * For simplicity, when migrating any task to a FROZEN cgroup, we
189 * revert it to FREEZING and let update_if_frozen() determine the
190 * correct state later.
191 *
192 * Tasks in @tset are on @new_cgrp but may not conform to its
193 * current state before executing the following - !frozen tasks may
194 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
173 */ 195 */
174 cgroup_taskset_for_each(task, new_cgroup, tset) 196 cgroup_taskset_for_each(task, new_cgrp, tset) {
175 if (cgroup_freezing(task)) 197 if (!(freezer->state & CGROUP_FREEZING)) {
176 return -EBUSY; 198 __thaw_task(task);
199 } else {
200 freeze_task(task);
201 freezer->state &= ~CGROUP_FROZEN;
202 clear_frozen = true;
203 }
204 }
177 205
178 freezer = cgroup_freezer(new_cgroup); 206 spin_unlock_irq(&freezer->lock);
179 if (freezer->state != CGROUP_THAWED)
180 return -EBUSY;
181 207
182 return 0; 208 /*
209 * Propagate FROZEN clearing upwards. We may race with
210 * update_if_frozen(), but as long as both work bottom-up, either
211 * update_if_frozen() sees child's FROZEN cleared or we clear the
212 * parent's FROZEN later. No parent w/ !FROZEN children can be
213 * left FROZEN.
214 */
215 while (clear_frozen && (freezer = parent_freezer(freezer))) {
216 spin_lock_irq(&freezer->lock);
217 freezer->state &= ~CGROUP_FROZEN;
218 clear_frozen = freezer->state & CGROUP_FREEZING;
219 spin_unlock_irq(&freezer->lock);
220 }
183} 221}
184 222
185static void freezer_fork(struct task_struct *task) 223static void freezer_fork(struct task_struct *task)
186{ 224{
187 struct freezer *freezer; 225 struct freezer *freezer;
188 226
189 /*
190 * No lock is needed, since the task isn't on tasklist yet,
191 * so it can't be moved to another cgroup, which means the
192 * freezer won't be removed and will be valid during this
193 * function call. Nevertheless, apply RCU read-side critical
194 * section to suppress RCU lockdep false positives.
195 */
196 rcu_read_lock(); 227 rcu_read_lock();
197 freezer = task_freezer(task); 228 freezer = task_freezer(task);
198 rcu_read_unlock();
199 229
200 /* 230 /*
201 * The root cgroup is non-freezable, so we can skip the 231 * The root cgroup is non-freezable, so we can skip the
202 * following check. 232 * following check.
203 */ 233 */
204 if (!freezer->css.cgroup->parent) 234 if (!freezer->css.cgroup->parent)
205 return; 235 goto out;
206 236
207 spin_lock_irq(&freezer->lock); 237 spin_lock_irq(&freezer->lock);
208 BUG_ON(freezer->state == CGROUP_FROZEN); 238 if (freezer->state & CGROUP_FREEZING)
209
210 /* Locking avoids race with FREEZING -> THAWED transitions. */
211 if (freezer->state == CGROUP_FREEZING)
212 freeze_task(task); 239 freeze_task(task);
213 spin_unlock_irq(&freezer->lock); 240 spin_unlock_irq(&freezer->lock);
241out:
242 rcu_read_unlock();
214} 243}
215 244
216/* 245/**
217 * caller must hold freezer->lock 246 * update_if_frozen - update whether a cgroup finished freezing
247 * @cgroup: cgroup of interest
248 *
249 * Once FREEZING is initiated, transition to FROZEN is lazily updated by
250 * calling this function. If the current state is FREEZING but not FROZEN,
251 * this function checks whether all tasks of this cgroup and the descendant
252 * cgroups finished freezing and, if so, sets FROZEN.
253 *
254 * The caller is responsible for grabbing RCU read lock and calling
255 * update_if_frozen() on all descendants prior to invoking this function.
256 *
257 * Task states and freezer state might disagree while tasks are being
258 * migrated into or out of @cgroup, so we can't verify task states against
259 * @freezer state here. See freezer_attach() for details.
218 */ 260 */
219static void update_if_frozen(struct cgroup *cgroup, 261static void update_if_frozen(struct cgroup *cgroup)
220 struct freezer *freezer)
221{ 262{
263 struct freezer *freezer = cgroup_freezer(cgroup);
264 struct cgroup *pos;
222 struct cgroup_iter it; 265 struct cgroup_iter it;
223 struct task_struct *task; 266 struct task_struct *task;
224 unsigned int nfrozen = 0, ntotal = 0;
225 enum freezer_state old_state = freezer->state;
226 267
227 cgroup_iter_start(cgroup, &it); 268 WARN_ON_ONCE(!rcu_read_lock_held());
228 while ((task = cgroup_iter_next(cgroup, &it))) { 269
229 ntotal++; 270 spin_lock_irq(&freezer->lock);
230 if (freezing(task) && is_task_frozen_enough(task)) 271
231 nfrozen++; 272 if (!(freezer->state & CGROUP_FREEZING) ||
273 (freezer->state & CGROUP_FROZEN))
274 goto out_unlock;
275
276 /* are all (live) children frozen? */
277 cgroup_for_each_child(pos, cgroup) {
278 struct freezer *child = cgroup_freezer(pos);
279
280 if ((child->state & CGROUP_FREEZER_ONLINE) &&
281 !(child->state & CGROUP_FROZEN))
282 goto out_unlock;
232 } 283 }
233 284
234 if (old_state == CGROUP_THAWED) { 285 /* are all tasks frozen? */
235 BUG_ON(nfrozen > 0); 286 cgroup_iter_start(cgroup, &it);
236 } else if (old_state == CGROUP_FREEZING) { 287
237 if (nfrozen == ntotal) 288 while ((task = cgroup_iter_next(cgroup, &it))) {
238 freezer->state = CGROUP_FROZEN; 289 if (freezing(task)) {
239 } else { /* old_state == CGROUP_FROZEN */ 290 /*
240 BUG_ON(nfrozen != ntotal); 291 * freezer_should_skip() indicates that the task
292 * should be skipped when determining freezing
293 * completion. Consider it frozen in addition to
294 * the usual frozen condition.
295 */
296 if (!frozen(task) && !freezer_should_skip(task))
297 goto out_iter_end;
298 }
241 } 299 }
242 300
301 freezer->state |= CGROUP_FROZEN;
302out_iter_end:
243 cgroup_iter_end(cgroup, &it); 303 cgroup_iter_end(cgroup, &it);
304out_unlock:
305 spin_unlock_irq(&freezer->lock);
244} 306}
245 307
246static int freezer_read(struct cgroup *cgroup, struct cftype *cft, 308static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
247 struct seq_file *m) 309 struct seq_file *m)
248{ 310{
249 struct freezer *freezer; 311 struct cgroup *pos;
250 enum freezer_state state;
251 312
252 if (!cgroup_lock_live_group(cgroup)) 313 rcu_read_lock();
253 return -ENODEV;
254 314
255 freezer = cgroup_freezer(cgroup); 315 /* update states bottom-up */
256 spin_lock_irq(&freezer->lock); 316 cgroup_for_each_descendant_post(pos, cgroup)
257 state = freezer->state; 317 update_if_frozen(pos);
258 if (state == CGROUP_FREEZING) { 318 update_if_frozen(cgroup);
259 /* We change from FREEZING to FROZEN lazily if the cgroup was 319
260 * only partially frozen when we exitted write. */ 320 rcu_read_unlock();
261 update_if_frozen(cgroup, freezer);
262 state = freezer->state;
263 }
264 spin_unlock_irq(&freezer->lock);
265 cgroup_unlock();
266 321
267 seq_puts(m, freezer_state_strs[state]); 322 seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state));
268 seq_putc(m, '\n'); 323 seq_putc(m, '\n');
269 return 0; 324 return 0;
270} 325}
271 326
272static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) 327static void freeze_cgroup(struct freezer *freezer)
273{ 328{
329 struct cgroup *cgroup = freezer->css.cgroup;
274 struct cgroup_iter it; 330 struct cgroup_iter it;
275 struct task_struct *task; 331 struct task_struct *task;
276 unsigned int num_cant_freeze_now = 0;
277 332
278 cgroup_iter_start(cgroup, &it); 333 cgroup_iter_start(cgroup, &it);
279 while ((task = cgroup_iter_next(cgroup, &it))) { 334 while ((task = cgroup_iter_next(cgroup, &it)))
280 if (!freeze_task(task)) 335 freeze_task(task);
281 continue;
282 if (is_task_frozen_enough(task))
283 continue;
284 if (!freezing(task) && !freezer_should_skip(task))
285 num_cant_freeze_now++;
286 }
287 cgroup_iter_end(cgroup, &it); 336 cgroup_iter_end(cgroup, &it);
288
289 return num_cant_freeze_now ? -EBUSY : 0;
290} 337}
291 338
292static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) 339static void unfreeze_cgroup(struct freezer *freezer)
293{ 340{
341 struct cgroup *cgroup = freezer->css.cgroup;
294 struct cgroup_iter it; 342 struct cgroup_iter it;
295 struct task_struct *task; 343 struct task_struct *task;
296 344
@@ -300,59 +348,111 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
300 cgroup_iter_end(cgroup, &it); 348 cgroup_iter_end(cgroup, &it);
301} 349}
302 350
303static int freezer_change_state(struct cgroup *cgroup, 351/**
304 enum freezer_state goal_state) 352 * freezer_apply_state - apply state change to a single cgroup_freezer
353 * @freezer: freezer to apply state change to
354 * @freeze: whether to freeze or unfreeze
355 * @state: CGROUP_FREEZING_* flag to set or clear
356 *
357 * Set or clear @state on @cgroup according to @freeze, and perform
358 * freezing or thawing as necessary.
359 */
360static void freezer_apply_state(struct freezer *freezer, bool freeze,
361 unsigned int state)
305{ 362{
306 struct freezer *freezer; 363 /* also synchronizes against task migration, see freezer_attach() */
307 int retval = 0; 364 lockdep_assert_held(&freezer->lock);
308
309 freezer = cgroup_freezer(cgroup);
310 365
311 spin_lock_irq(&freezer->lock); 366 if (!(freezer->state & CGROUP_FREEZER_ONLINE))
367 return;
312 368
313 update_if_frozen(cgroup, freezer); 369 if (freeze) {
314 370 if (!(freezer->state & CGROUP_FREEZING))
315 switch (goal_state) {
316 case CGROUP_THAWED:
317 if (freezer->state != CGROUP_THAWED)
318 atomic_dec(&system_freezing_cnt);
319 freezer->state = CGROUP_THAWED;
320 unfreeze_cgroup(cgroup, freezer);
321 break;
322 case CGROUP_FROZEN:
323 if (freezer->state == CGROUP_THAWED)
324 atomic_inc(&system_freezing_cnt); 371 atomic_inc(&system_freezing_cnt);
325 freezer->state = CGROUP_FREEZING; 372 freezer->state |= state;
326 retval = try_to_freeze_cgroup(cgroup, freezer); 373 freeze_cgroup(freezer);
327 break; 374 } else {
328 default: 375 bool was_freezing = freezer->state & CGROUP_FREEZING;
329 BUG(); 376
377 freezer->state &= ~state;
378
379 if (!(freezer->state & CGROUP_FREEZING)) {
380 if (was_freezing)
381 atomic_dec(&system_freezing_cnt);
382 freezer->state &= ~CGROUP_FROZEN;
383 unfreeze_cgroup(freezer);
384 }
330 } 385 }
386}
331 387
388/**
389 * freezer_change_state - change the freezing state of a cgroup_freezer
390 * @freezer: freezer of interest
391 * @freeze: whether to freeze or thaw
392 *
393 * Freeze or thaw @freezer according to @freeze. The operations are
394 * recursive - all descendants of @freezer will be affected.
395 */
396static void freezer_change_state(struct freezer *freezer, bool freeze)
397{
398 struct cgroup *pos;
399
400 /* update @freezer */
401 spin_lock_irq(&freezer->lock);
402 freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF);
332 spin_unlock_irq(&freezer->lock); 403 spin_unlock_irq(&freezer->lock);
333 404
334 return retval; 405 /*
406 * Update all its descendants in pre-order traversal. Each
407 * descendant will try to inherit its parent's FREEZING state as
408 * CGROUP_FREEZING_PARENT.
409 */
410 rcu_read_lock();
411 cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) {
412 struct freezer *pos_f = cgroup_freezer(pos);
413 struct freezer *parent = parent_freezer(pos_f);
414
415 /*
416 * Our update to @parent->state is already visible which is
417 * all we need. No need to lock @parent. For more info on
418 * synchronization, see freezer_post_create().
419 */
420 spin_lock_irq(&pos_f->lock);
421 freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING,
422 CGROUP_FREEZING_PARENT);
423 spin_unlock_irq(&pos_f->lock);
424 }
425 rcu_read_unlock();
335} 426}
336 427
337static int freezer_write(struct cgroup *cgroup, 428static int freezer_write(struct cgroup *cgroup, struct cftype *cft,
338 struct cftype *cft,
339 const char *buffer) 429 const char *buffer)
340{ 430{
341 int retval; 431 bool freeze;
342 enum freezer_state goal_state;
343 432
344 if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) 433 if (strcmp(buffer, freezer_state_strs(0)) == 0)
345 goal_state = CGROUP_THAWED; 434 freeze = false;
346 else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) 435 else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0)
347 goal_state = CGROUP_FROZEN; 436 freeze = true;
348 else 437 else
349 return -EINVAL; 438 return -EINVAL;
350 439
351 if (!cgroup_lock_live_group(cgroup)) 440 freezer_change_state(cgroup_freezer(cgroup), freeze);
352 return -ENODEV; 441 return 0;
353 retval = freezer_change_state(cgroup, goal_state); 442}
354 cgroup_unlock(); 443
355 return retval; 444static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft)
445{
446 struct freezer *freezer = cgroup_freezer(cgroup);
447
448 return (bool)(freezer->state & CGROUP_FREEZING_SELF);
449}
450
451static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft)
452{
453 struct freezer *freezer = cgroup_freezer(cgroup);
454
455 return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
356} 456}
357 457
358static struct cftype files[] = { 458static struct cftype files[] = {
@@ -362,23 +462,27 @@ static struct cftype files[] = {
362 .read_seq_string = freezer_read, 462 .read_seq_string = freezer_read,
363 .write_string = freezer_write, 463 .write_string = freezer_write,
364 }, 464 },
465 {
466 .name = "self_freezing",
467 .flags = CFTYPE_NOT_ON_ROOT,
468 .read_u64 = freezer_self_freezing_read,
469 },
470 {
471 .name = "parent_freezing",
472 .flags = CFTYPE_NOT_ON_ROOT,
473 .read_u64 = freezer_parent_freezing_read,
474 },
365 { } /* terminate */ 475 { } /* terminate */
366}; 476};
367 477
368struct cgroup_subsys freezer_subsys = { 478struct cgroup_subsys freezer_subsys = {
369 .name = "freezer", 479 .name = "freezer",
370 .create = freezer_create, 480 .css_alloc = freezer_css_alloc,
371 .destroy = freezer_destroy, 481 .css_online = freezer_css_online,
482 .css_offline = freezer_css_offline,
483 .css_free = freezer_css_free,
372 .subsys_id = freezer_subsys_id, 484 .subsys_id = freezer_subsys_id,
373 .can_attach = freezer_can_attach, 485 .attach = freezer_attach,
374 .fork = freezer_fork, 486 .fork = freezer_fork,
375 .base_cftypes = files, 487 .base_cftypes = files,
376
377 /*
378 * freezer subsys doesn't handle hierarchy at all. Frozen state
379 * should be inherited through the hierarchy - if a parent is
380 * frozen, all its children should be frozen. Fix it and remove
381 * the following.
382 */
383 .broken_hierarchy = true,
384}; 488};
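
The freezer rewrite above replaces the three-value state enum with a bitmask (FREEZING_SELF, FREEZING_PARENT, FROZEN) so a cgroup can record both why it is freezing and whether freezing has completed, with freezer_apply_state() doing the per-bit bookkeeping. The code below is a simplified userspace model of that bit manipulation, including a system_freezing_cnt-style counter; it omits the actual freezing/thawing of tasks and uses made-up names (apply_state, F_*).

#include <stdbool.h>
#include <stdio.h>

enum {
	F_ONLINE          = 1 << 0,
	F_FREEZING_SELF   = 1 << 1,
	F_FREEZING_PARENT = 1 << 2,
	F_FROZEN          = 1 << 3,
	F_FREEZING        = F_FREEZING_SELF | F_FREEZING_PARENT,
};

static int freezing_cnt;                 /* models system_freezing_cnt */

/* Set or clear one FREEZING_* bit and keep the derived state
 * (FROZEN flag, global counter) consistent. */
static void apply_state(unsigned int *state, bool freeze, unsigned int bit)
{
	if (!(*state & F_ONLINE))
		return;

	if (freeze) {
		if (!(*state & F_FREEZING))
			freezing_cnt++;
		*state |= bit;
	} else {
		bool was_freezing = *state & F_FREEZING;

		*state &= ~bit;
		if (!(*state & F_FREEZING)) {
			if (was_freezing)
				freezing_cnt--;
			*state &= ~F_FROZEN;     /* can't stay frozen while thawed */
		}
	}
}

int main(void)
{
	unsigned int cg = F_ONLINE;

	apply_state(&cg, true, F_FREEZING_SELF);    /* "FROZEN" written to the cgroup */
	apply_state(&cg, true, F_FREEZING_PARENT);  /* an ancestor starts freezing too */
	apply_state(&cg, false, F_FREEZING_SELF);   /* thawed locally, still freezing via parent */
	printf("state=0x%x freezing_cnt=%d\n", cg, freezing_cnt);  /* 0x5, 1 */
	return 0;
}

Because FREEZING_SELF and FREEZING_PARENT are independent bits, thawing a cgroup whose ancestor is still frozen leaves it FREEZING, which is exactly the hierarchy behaviour the old broken_hierarchy flag was papering over.
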
diff --git a/kernel/compat.c b/kernel/compat.c
index c28a306ae05c..f6150e92dfc9 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -1215,6 +1215,23 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
1215 return 0; 1215 return 0;
1216} 1216}
1217 1217
1218#ifdef __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL
1219asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
1220 struct compat_timespec __user *interval)
1221{
1222 struct timespec t;
1223 int ret;
1224 mm_segment_t old_fs = get_fs();
1225
1226 set_fs(KERNEL_DS);
1227 ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
1228 set_fs(old_fs);
1229 if (put_compat_timespec(&t, interval))
1230 return -EFAULT;
1231 return ret;
1232}
1233#endif /* __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL */
1234
1218/* 1235/*
1219 * Allocate user-space memory for the duration of a single system call, 1236 * Allocate user-space memory for the duration of a single system call,
1220 * in order to marshall parameters inside a compat thunk. 1237 * in order to marshall parameters inside a compat thunk.
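
The compat.c hunk adds a 32-bit wrapper that calls the native sched_rr_get_interval() with a kernel-space timespec and then copies the result out in the compat (32-bit) layout. The conversion itself is a field-by-field narrowing; the userspace illustration below only shows that layout difference, and struct compat_ts is a made-up name, not the kernel's compat_timespec handling.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* 32-bit layout of a timespec, as a compat wrapper would hand back. */
struct compat_ts {
	int32_t tv_sec;
	int32_t tv_nsec;
};

/* Narrow a native timespec into the compat layout.  A real kernel also
 * has to copy this to user memory and handle faults. */
static struct compat_ts to_compat(const struct timespec *ts)
{
	struct compat_ts cts = {
		.tv_sec  = (int32_t)ts->tv_sec,
		.tv_nsec = (int32_t)ts->tv_nsec,
	};
	return cts;
}

int main(void)
{
	struct timespec native = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 };
	struct compat_ts cts = to_compat(&native);

	printf("interval: %d.%09d s\n", (int)cts.tv_sec, (int)cts.tv_nsec);
	return 0;
}
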
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
new file mode 100644
index 000000000000..e0e07fd55508
--- /dev/null
+++ b/kernel/context_tracking.c
@@ -0,0 +1,83 @@
1#include <linux/context_tracking.h>
2#include <linux/rcupdate.h>
3#include <linux/sched.h>
4#include <linux/percpu.h>
5#include <linux/hardirq.h>
6
7struct context_tracking {
8 /*
9 * When active is false, hooks are not set to
10 * minimize overhead: TIF flags are cleared
11 * and calls to user_enter/exit are ignored. This
12 * may be further optimized using static keys.
13 */
14 bool active;
15 enum {
16 IN_KERNEL = 0,
17 IN_USER,
18 } state;
19};
20
21static DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
22#ifdef CONFIG_CONTEXT_TRACKING_FORCE
23 .active = true,
24#endif
25};
26
27void user_enter(void)
28{
29 unsigned long flags;
30
31 /*
32 * Some contexts may involve an exception occurring in an irq,
33 * leading to that nesting:
34 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
35 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
36 * helpers are enough to protect RCU uses inside the exception. So
37 * just return immediately if we detect we are in an IRQ.
38 */
39 if (in_interrupt())
40 return;
41
42 WARN_ON_ONCE(!current->mm);
43
44 local_irq_save(flags);
45 if (__this_cpu_read(context_tracking.active) &&
46 __this_cpu_read(context_tracking.state) != IN_USER) {
47 __this_cpu_write(context_tracking.state, IN_USER);
48 rcu_user_enter();
49 }
50 local_irq_restore(flags);
51}
52
53void user_exit(void)
54{
55 unsigned long flags;
56
57 /*
58 * Some contexts may involve an exception occurring in an irq,
59 * leading to that nesting:
60 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
61 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
62 * helpers are enough to protect RCU uses inside the exception. So
63 * just return immediately if we detect we are in an IRQ.
64 */
65 if (in_interrupt())
66 return;
67
68 local_irq_save(flags);
69 if (__this_cpu_read(context_tracking.state) == IN_USER) {
70 __this_cpu_write(context_tracking.state, IN_KERNEL);
71 rcu_user_exit();
72 }
73 local_irq_restore(flags);
74}
75
76void context_tracking_task_switch(struct task_struct *prev,
77 struct task_struct *next)
78{
79 if (__this_cpu_read(context_tracking.active)) {
80 clear_tsk_thread_flag(prev, TIF_NOHZ);
81 set_tsk_thread_flag(next, TIF_NOHZ);
82 }
83}
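
The new context_tracking.c keeps a small per-CPU state machine (IN_KERNEL/IN_USER) and crosses RCU's user boundary only on a real kernel/user transition, ignoring calls made from interrupt context. Below is a toy single-CPU userspace model of that state machine; the in_irq flag and rcu_user_boundary() stand in for in_interrupt() and rcu_user_enter()/rcu_user_exit(), and per-CPU storage is collapsed to one global.

#include <stdbool.h>
#include <stdio.h>

enum ctx_state { IN_KERNEL, IN_USER };

struct tracking {
	bool active;             /* tracking enabled on this "CPU" */
	enum ctx_state state;
};

static struct tracking ct = { .active = true, .state = IN_KERNEL };
static bool in_irq;               /* models in_interrupt() */

static void rcu_user_boundary(const char *dir) { printf("rcu: %s user mode\n", dir); }

static void model_user_enter(void)
{
	if (in_irq)               /* exceptions nested in IRQs are already covered */
		return;
	if (ct.active && ct.state != IN_USER) {
		ct.state = IN_USER;
		rcu_user_boundary("enter");
	}
}

static void model_user_exit(void)
{
	if (in_irq)
		return;
	if (ct.state == IN_USER) {
		ct.state = IN_KERNEL;
		rcu_user_boundary("exit");
	}
}

int main(void)
{
	model_user_enter();       /* returning to userspace: one transition */
	model_user_enter();       /* repeated call: no double transition */
	in_irq = true;
	model_user_exit();        /* from IRQ context: ignored */
	in_irq = false;
	model_user_exit();        /* syscall entry: back to kernel */
	return 0;
}
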
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 42bd331ee0ab..3046a503242c 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -348,11 +348,13 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
348 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 348 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
349 struct task_struct *idle; 349 struct task_struct *idle;
350 350
351 if (cpu_online(cpu) || !cpu_present(cpu))
352 return -EINVAL;
353
354 cpu_hotplug_begin(); 351 cpu_hotplug_begin();
355 352
353 if (cpu_online(cpu) || !cpu_present(cpu)) {
354 ret = -EINVAL;
355 goto out;
356 }
357
356 idle = idle_thread_get(cpu); 358 idle = idle_thread_get(cpu);
357 if (IS_ERR(idle)) { 359 if (IS_ERR(idle)) {
358 ret = PTR_ERR(idle); 360 ret = PTR_ERR(idle);
@@ -601,6 +603,11 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,
601 603
602static int __init cpu_hotplug_pm_sync_init(void) 604static int __init cpu_hotplug_pm_sync_init(void)
603{ 605{
606 /*
607 * cpu_hotplug_pm_callback has higher priority than x86
608 * bsp_pm_callback which depends on cpu_hotplug_pm_callback
609 * to disable cpu hotplug to avoid cpu hotplug race.
610 */
604 pm_notifier(cpu_hotplug_pm_callback, 0); 611 pm_notifier(cpu_hotplug_pm_callback, 0);
605 return 0; 612 return 0;
606} 613}
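
The cpu.c hunk moves the cpu_online()/cpu_present() check inside cpu_hotplug_begin(), so the decision is made while holding the hotplug lock rather than before taking it. That is the usual check-then-act fix; the pthread-based sketch below models it with a plain mutex and an illustrative cpu_online_flag, not the kernel's hotplug machinery.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;
static int cpu_online_flag;              /* models cpu_online(cpu) */

/* Checking the flag before taking the lock would let another thread
 * change it in between; checking it under the lock closes that window. */
static int bring_up(void)
{
	int ret = 0;

	pthread_mutex_lock(&hotplug_lock);

	if (cpu_online_flag) {           /* checked under the lock */
		ret = -1;                /* -EINVAL in the kernel */
		goto out;
	}
	cpu_online_flag = 1;             /* the actual bring-up work */
out:
	pthread_mutex_unlock(&hotplug_lock);
	return ret;
}

int main(void)
{
	printf("first  bring_up: %d\n", bring_up());   /* 0 */
	printf("second bring_up: %d\n", bring_up());   /* -1 */
	return 0;
}
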
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f33c7153b6d7..7bb63eea6eb8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -302,10 +302,10 @@ static void guarantee_online_cpus(const struct cpuset *cs,
302 * are online, with memory. If none are online with memory, walk 302 * are online, with memory. If none are online with memory, walk
303 * up the cpuset hierarchy until we find one that does have some 303 * up the cpuset hierarchy until we find one that does have some
304 * online mems. If we get all the way to the top and still haven't 304 * online mems. If we get all the way to the top and still haven't
305 * found any online mems, return node_states[N_HIGH_MEMORY]. 305 * found any online mems, return node_states[N_MEMORY].
306 * 306 *
307 * One way or another, we guarantee to return some non-empty subset 307 * One way or another, we guarantee to return some non-empty subset
308 * of node_states[N_HIGH_MEMORY]. 308 * of node_states[N_MEMORY].
309 * 309 *
310 * Call with callback_mutex held. 310 * Call with callback_mutex held.
311 */ 311 */
@@ -313,14 +313,14 @@ static void guarantee_online_cpus(const struct cpuset *cs,
313static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 313static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
314{ 314{
315 while (cs && !nodes_intersects(cs->mems_allowed, 315 while (cs && !nodes_intersects(cs->mems_allowed,
316 node_states[N_HIGH_MEMORY])) 316 node_states[N_MEMORY]))
317 cs = cs->parent; 317 cs = cs->parent;
318 if (cs) 318 if (cs)
319 nodes_and(*pmask, cs->mems_allowed, 319 nodes_and(*pmask, cs->mems_allowed,
320 node_states[N_HIGH_MEMORY]); 320 node_states[N_MEMORY]);
321 else 321 else
322 *pmask = node_states[N_HIGH_MEMORY]; 322 *pmask = node_states[N_MEMORY];
323 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); 323 BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY]));
324} 324}
325 325
326/* 326/*
@@ -1100,7 +1100,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1100 return -ENOMEM; 1100 return -ENOMEM;
1101 1101
1102 /* 1102 /*
1103 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; 1103 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
1104 * it's read-only 1104 * it's read-only
1105 */ 1105 */
1106 if (cs == &top_cpuset) { 1106 if (cs == &top_cpuset) {
@@ -1122,7 +1122,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1122 goto done; 1122 goto done;
1123 1123
1124 if (!nodes_subset(trialcs->mems_allowed, 1124 if (!nodes_subset(trialcs->mems_allowed,
1125 node_states[N_HIGH_MEMORY])) { 1125 node_states[N_MEMORY])) {
1126 retval = -EINVAL; 1126 retval = -EINVAL;
1127 goto done; 1127 goto done;
1128 } 1128 }
@@ -1784,56 +1784,20 @@ static struct cftype files[] = {
1784}; 1784};
1785 1785
1786/* 1786/*
1787 * post_clone() is called during cgroup_create() when the 1787 * cpuset_css_alloc - allocate a cpuset css
1788 * clone_children mount argument was specified. The cgroup
1789 * can not yet have any tasks.
1790 *
1791 * Currently we refuse to set up the cgroup - thereby
1792 * refusing the task to be entered, and as a result refusing
1793 * the sys_unshare() or clone() which initiated it - if any
1794 * sibling cpusets have exclusive cpus or mem.
1795 *
1796 * If this becomes a problem for some users who wish to
1797 * allow that scenario, then cpuset_post_clone() could be
1798 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1799 * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
1800 * held.
1801 */
1802static void cpuset_post_clone(struct cgroup *cgroup)
1803{
1804 struct cgroup *parent, *child;
1805 struct cpuset *cs, *parent_cs;
1806
1807 parent = cgroup->parent;
1808 list_for_each_entry(child, &parent->children, sibling) {
1809 cs = cgroup_cs(child);
1810 if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
1811 return;
1812 }
1813 cs = cgroup_cs(cgroup);
1814 parent_cs = cgroup_cs(parent);
1815
1816 mutex_lock(&callback_mutex);
1817 cs->mems_allowed = parent_cs->mems_allowed;
1818 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1819 mutex_unlock(&callback_mutex);
1820 return;
1821}
1822
1823/*
1824 * cpuset_create - create a cpuset
1825 * cont: control group that the new cpuset will be part of 1788 * cont: control group that the new cpuset will be part of
1826 */ 1789 */
1827 1790
1828static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) 1791static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1829{ 1792{
1830 struct cpuset *cs; 1793 struct cgroup *parent_cg = cont->parent;
1831 struct cpuset *parent; 1794 struct cgroup *tmp_cg;
1795 struct cpuset *parent, *cs;
1832 1796
1833 if (!cont->parent) { 1797 if (!parent_cg)
1834 return &top_cpuset.css; 1798 return &top_cpuset.css;
1835 } 1799 parent = cgroup_cs(parent_cg);
1836 parent = cgroup_cs(cont->parent); 1800
1837 cs = kmalloc(sizeof(*cs), GFP_KERNEL); 1801 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1838 if (!cs) 1802 if (!cs)
1839 return ERR_PTR(-ENOMEM); 1803 return ERR_PTR(-ENOMEM);
@@ -1855,7 +1819,36 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
1855 1819
1856 cs->parent = parent; 1820 cs->parent = parent;
1857 number_of_cpusets++; 1821 number_of_cpusets++;
1858 return &cs->css ; 1822
1823 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags))
1824 goto skip_clone;
1825
1826 /*
1827 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
1828 * set. This flag handling is implemented in cgroup core for
1829 * historical reasons - the flag may be specified during mount.
1830 *
1831 * Currently, if any sibling cpusets have exclusive cpus or mem, we
1832 * refuse to clone the configuration - thereby refusing the task to
1833 * be entered, and as a result refusing the sys_unshare() or
1834 * clone() which initiated it. If this becomes a problem for some
1835 * users who wish to allow that scenario, then this could be
1836 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1837 * (and likewise for mems) to the new cgroup.
1838 */
1839 list_for_each_entry(tmp_cg, &parent_cg->children, sibling) {
1840 struct cpuset *tmp_cs = cgroup_cs(tmp_cg);
1841
1842 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs))
1843 goto skip_clone;
1844 }
1845
1846 mutex_lock(&callback_mutex);
1847 cs->mems_allowed = parent->mems_allowed;
1848 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
1849 mutex_unlock(&callback_mutex);
1850skip_clone:
1851 return &cs->css;
1859} 1852}
1860 1853
1861/* 1854/*
@@ -1864,7 +1857,7 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
1864 * will call async_rebuild_sched_domains(). 1857 * will call async_rebuild_sched_domains().
1865 */ 1858 */
1866 1859
1867static void cpuset_destroy(struct cgroup *cont) 1860static void cpuset_css_free(struct cgroup *cont)
1868{ 1861{
1869 struct cpuset *cs = cgroup_cs(cont); 1862 struct cpuset *cs = cgroup_cs(cont);
1870 1863
@@ -1878,11 +1871,10 @@ static void cpuset_destroy(struct cgroup *cont)
1878 1871
1879struct cgroup_subsys cpuset_subsys = { 1872struct cgroup_subsys cpuset_subsys = {
1880 .name = "cpuset", 1873 .name = "cpuset",
1881 .create = cpuset_create, 1874 .css_alloc = cpuset_css_alloc,
1882 .destroy = cpuset_destroy, 1875 .css_free = cpuset_css_free,
1883 .can_attach = cpuset_can_attach, 1876 .can_attach = cpuset_can_attach,
1884 .attach = cpuset_attach, 1877 .attach = cpuset_attach,
1885 .post_clone = cpuset_post_clone,
1886 .subsys_id = cpuset_subsys_id, 1878 .subsys_id = cpuset_subsys_id,
1887 .base_cftypes = files, 1879 .base_cftypes = files,
1888 .early_init = 1, 1880 .early_init = 1,
@@ -2034,7 +2026,7 @@ static struct cpuset *cpuset_next(struct list_head *queue)
2034 * before dropping down to the next. It always processes a node before 2026 * before dropping down to the next. It always processes a node before
2035 * any of its children. 2027 * any of its children.
2036 * 2028 *
2037 * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY 2029 * In the case of memory hot-unplug, it will remove nodes from N_MEMORY
2038 * if all present pages from a node are offlined. 2030 * if all present pages from a node are offlined.
2039 */ 2031 */
2040static void 2032static void
@@ -2073,7 +2065,7 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
2073 2065
2074 /* Continue past cpusets with all mems online */ 2066 /* Continue past cpusets with all mems online */
2075 if (nodes_subset(cp->mems_allowed, 2067 if (nodes_subset(cp->mems_allowed,
2076 node_states[N_HIGH_MEMORY])) 2068 node_states[N_MEMORY]))
2077 continue; 2069 continue;
2078 2070
2079 oldmems = cp->mems_allowed; 2071 oldmems = cp->mems_allowed;
@@ -2081,7 +2073,7 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
2081 /* Remove offline mems from this cpuset. */ 2073 /* Remove offline mems from this cpuset. */
2082 mutex_lock(&callback_mutex); 2074 mutex_lock(&callback_mutex);
2083 nodes_and(cp->mems_allowed, cp->mems_allowed, 2075 nodes_and(cp->mems_allowed, cp->mems_allowed,
2084 node_states[N_HIGH_MEMORY]); 2076 node_states[N_MEMORY]);
2085 mutex_unlock(&callback_mutex); 2077 mutex_unlock(&callback_mutex);
2086 2078
2087 /* Move tasks from the empty cpuset to a parent */ 2079 /* Move tasks from the empty cpuset to a parent */
@@ -2134,8 +2126,8 @@ void cpuset_update_active_cpus(bool cpu_online)
2134 2126
2135#ifdef CONFIG_MEMORY_HOTPLUG 2127#ifdef CONFIG_MEMORY_HOTPLUG
2136/* 2128/*
2137 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. 2129 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
2138 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. 2130 * Call this routine anytime after node_states[N_MEMORY] changes.
2139 * See cpuset_update_active_cpus() for CPU hotplug handling. 2131 * See cpuset_update_active_cpus() for CPU hotplug handling.
2140 */ 2132 */
2141static int cpuset_track_online_nodes(struct notifier_block *self, 2133static int cpuset_track_online_nodes(struct notifier_block *self,
@@ -2148,7 +2140,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2148 case MEM_ONLINE: 2140 case MEM_ONLINE:
2149 oldmems = top_cpuset.mems_allowed; 2141 oldmems = top_cpuset.mems_allowed;
2150 mutex_lock(&callback_mutex); 2142 mutex_lock(&callback_mutex);
2151 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2143 top_cpuset.mems_allowed = node_states[N_MEMORY];
2152 mutex_unlock(&callback_mutex); 2144 mutex_unlock(&callback_mutex);
2153 update_tasks_nodemask(&top_cpuset, &oldmems, NULL); 2145 update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
2154 break; 2146 break;
@@ -2177,7 +2169,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2177void __init cpuset_init_smp(void) 2169void __init cpuset_init_smp(void)
2178{ 2170{
2179 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2171 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2180 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2172 top_cpuset.mems_allowed = node_states[N_MEMORY];
2181 2173
2182 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2174 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2183 2175
@@ -2245,7 +2237,7 @@ void cpuset_init_current_mems_allowed(void)
2245 * 2237 *
2246 * Description: Returns the nodemask_t mems_allowed of the cpuset 2238 * Description: Returns the nodemask_t mems_allowed of the cpuset
2247 * attached to the specified @tsk. Guaranteed to return some non-empty 2239 * attached to the specified @tsk. Guaranteed to return some non-empty
2248 * subset of node_states[N_HIGH_MEMORY], even if this means going outside the 2240 * subset of node_states[N_MEMORY], even if this means going outside the
2249 * tasks cpuset. 2241 * tasks cpuset.
2250 **/ 2242 **/
2251 2243
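
The cpuset_css_alloc() hunk folds the old post_clone() callback into allocation: when the clone-children flag is set, the new cpuset copies its parent's masks unless any sibling is CPU- or memory-exclusive. Below is a small userspace model of that sibling check, using plain bitmasks in place of cpumask/nodemask; struct cs and maybe_clone() are illustrative names.

#include <stdbool.h>
#include <stdio.h>

struct cs {
	unsigned int cpus, mems;         /* stand-ins for cpumask/nodemask */
	bool cpu_exclusive, mem_exclusive;
};

/* Copy the parent's masks into the child unless a sibling claims
 * exclusive cpus or memory, in which case leave the child empty. */
static void maybe_clone(struct cs *child, const struct cs *parent,
			const struct cs *siblings, int nr_siblings)
{
	int i;

	for (i = 0; i < nr_siblings; i++)
		if (siblings[i].cpu_exclusive || siblings[i].mem_exclusive)
			return;          /* skip cloning */

	child->cpus = parent->cpus;
	child->mems = parent->mems;
}

int main(void)
{
	struct cs parent = { .cpus = 0xf, .mems = 0x3 };
	struct cs siblings[1] = { { .cpus = 0x1, .cpu_exclusive = true } };
	struct cs child = {0};

	maybe_clone(&child, &parent, siblings, 1);
	printf("child cpus=0x%x mems=0x%x\n", child.cpus, child.mems);  /* stays 0x0 */
	return 0;
}
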
diff --git a/kernel/cred.c b/kernel/cred.c
index 709d521903f6..e0573a43c7df 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -30,17 +30,6 @@
30static struct kmem_cache *cred_jar; 30static struct kmem_cache *cred_jar;
31 31
32/* 32/*
33 * The common credentials for the initial task's thread group
34 */
35#ifdef CONFIG_KEYS
36static struct thread_group_cred init_tgcred = {
37 .usage = ATOMIC_INIT(2),
38 .tgid = 0,
39 .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
40};
41#endif
42
43/*
44 * The initial credentials for the initial task 33 * The initial credentials for the initial task
45 */ 34 */
46struct cred init_cred = { 35struct cred init_cred = {
@@ -65,9 +54,6 @@ struct cred init_cred = {
65 .user = INIT_USER, 54 .user = INIT_USER,
66 .user_ns = &init_user_ns, 55 .user_ns = &init_user_ns,
67 .group_info = &init_groups, 56 .group_info = &init_groups,
68#ifdef CONFIG_KEYS
69 .tgcred = &init_tgcred,
70#endif
71}; 57};
72 58
73static inline void set_cred_subscribers(struct cred *cred, int n) 59static inline void set_cred_subscribers(struct cred *cred, int n)
@@ -96,36 +82,6 @@ static inline void alter_cred_subscribers(const struct cred *_cred, int n)
96} 82}
97 83
98/* 84/*
99 * Dispose of the shared task group credentials
100 */
101#ifdef CONFIG_KEYS
102static void release_tgcred_rcu(struct rcu_head *rcu)
103{
104 struct thread_group_cred *tgcred =
105 container_of(rcu, struct thread_group_cred, rcu);
106
107 BUG_ON(atomic_read(&tgcred->usage) != 0);
108
109 key_put(tgcred->session_keyring);
110 key_put(tgcred->process_keyring);
111 kfree(tgcred);
112}
113#endif
114
115/*
116 * Release a set of thread group credentials.
117 */
118static void release_tgcred(struct cred *cred)
119{
120#ifdef CONFIG_KEYS
121 struct thread_group_cred *tgcred = cred->tgcred;
122
123 if (atomic_dec_and_test(&tgcred->usage))
124 call_rcu(&tgcred->rcu, release_tgcred_rcu);
125#endif
126}
127
128/*
129 * The RCU callback to actually dispose of a set of credentials 85 * The RCU callback to actually dispose of a set of credentials
130 */ 86 */
131static void put_cred_rcu(struct rcu_head *rcu) 87static void put_cred_rcu(struct rcu_head *rcu)
@@ -150,9 +106,10 @@ static void put_cred_rcu(struct rcu_head *rcu)
150#endif 106#endif
151 107
152 security_cred_free(cred); 108 security_cred_free(cred);
109 key_put(cred->session_keyring);
110 key_put(cred->process_keyring);
153 key_put(cred->thread_keyring); 111 key_put(cred->thread_keyring);
154 key_put(cred->request_key_auth); 112 key_put(cred->request_key_auth);
155 release_tgcred(cred);
156 if (cred->group_info) 113 if (cred->group_info)
157 put_group_info(cred->group_info); 114 put_group_info(cred->group_info);
158 free_uid(cred->user); 115 free_uid(cred->user);
@@ -246,15 +203,6 @@ struct cred *cred_alloc_blank(void)
246 if (!new) 203 if (!new)
247 return NULL; 204 return NULL;
248 205
249#ifdef CONFIG_KEYS
250 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
251 if (!new->tgcred) {
252 kmem_cache_free(cred_jar, new);
253 return NULL;
254 }
255 atomic_set(&new->tgcred->usage, 1);
256#endif
257
258 atomic_set(&new->usage, 1); 206 atomic_set(&new->usage, 1);
259#ifdef CONFIG_DEBUG_CREDENTIALS 207#ifdef CONFIG_DEBUG_CREDENTIALS
260 new->magic = CRED_MAGIC; 208 new->magic = CRED_MAGIC;
@@ -308,9 +256,10 @@ struct cred *prepare_creds(void)
308 get_user_ns(new->user_ns); 256 get_user_ns(new->user_ns);
309 257
310#ifdef CONFIG_KEYS 258#ifdef CONFIG_KEYS
259 key_get(new->session_keyring);
260 key_get(new->process_keyring);
311 key_get(new->thread_keyring); 261 key_get(new->thread_keyring);
312 key_get(new->request_key_auth); 262 key_get(new->request_key_auth);
313 atomic_inc(&new->tgcred->usage);
314#endif 263#endif
315 264
316#ifdef CONFIG_SECURITY 265#ifdef CONFIG_SECURITY
@@ -334,39 +283,20 @@ EXPORT_SYMBOL(prepare_creds);
334 */ 283 */
335struct cred *prepare_exec_creds(void) 284struct cred *prepare_exec_creds(void)
336{ 285{
337 struct thread_group_cred *tgcred = NULL;
338 struct cred *new; 286 struct cred *new;
339 287
340#ifdef CONFIG_KEYS
341 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
342 if (!tgcred)
343 return NULL;
344#endif
345
346 new = prepare_creds(); 288 new = prepare_creds();
347 if (!new) { 289 if (!new)
348 kfree(tgcred);
349 return new; 290 return new;
350 }
351 291
352#ifdef CONFIG_KEYS 292#ifdef CONFIG_KEYS
353 /* newly exec'd tasks don't get a thread keyring */ 293 /* newly exec'd tasks don't get a thread keyring */
354 key_put(new->thread_keyring); 294 key_put(new->thread_keyring);
355 new->thread_keyring = NULL; 295 new->thread_keyring = NULL;
356 296
357 /* create a new per-thread-group creds for all this set of threads to
358 * share */
359 memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred));
360
361 atomic_set(&tgcred->usage, 1);
362 spin_lock_init(&tgcred->lock);
363
364 /* inherit the session keyring; new process keyring */ 297 /* inherit the session keyring; new process keyring */
365 key_get(tgcred->session_keyring); 298 key_put(new->process_keyring);
366 tgcred->process_keyring = NULL; 299 new->process_keyring = NULL;
367
368 release_tgcred(new);
369 new->tgcred = tgcred;
370#endif 300#endif
371 301
372 return new; 302 return new;
@@ -383,9 +313,6 @@ struct cred *prepare_exec_creds(void)
383 */ 313 */
384int copy_creds(struct task_struct *p, unsigned long clone_flags) 314int copy_creds(struct task_struct *p, unsigned long clone_flags)
385{ 315{
386#ifdef CONFIG_KEYS
387 struct thread_group_cred *tgcred;
388#endif
389 struct cred *new; 316 struct cred *new;
390 int ret; 317 int ret;
391 318
@@ -425,22 +352,12 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
425 install_thread_keyring_to_cred(new); 352 install_thread_keyring_to_cred(new);
426 } 353 }
427 354
428 /* we share the process and session keyrings between all the threads in 355 /* The process keyring is only shared between the threads in a process;
429 * a process - this is slightly icky as we violate COW credentials a 356 * anything outside of those threads doesn't inherit.
430 * bit */ 357 */
431 if (!(clone_flags & CLONE_THREAD)) { 358 if (!(clone_flags & CLONE_THREAD)) {
432 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); 359 key_put(new->process_keyring);
433 if (!tgcred) { 360 new->process_keyring = NULL;
434 ret = -ENOMEM;
435 goto error_put;
436 }
437 atomic_set(&tgcred->usage, 1);
438 spin_lock_init(&tgcred->lock);
439 tgcred->process_keyring = NULL;
440 tgcred->session_keyring = key_get(new->tgcred->session_keyring);
441
442 release_tgcred(new);
443 new->tgcred = tgcred;
444 } 361 }
445#endif 362#endif
446 363
@@ -668,9 +585,6 @@ void __init cred_init(void)
668 */ 585 */
669struct cred *prepare_kernel_cred(struct task_struct *daemon) 586struct cred *prepare_kernel_cred(struct task_struct *daemon)
670{ 587{
671#ifdef CONFIG_KEYS
672 struct thread_group_cred *tgcred;
673#endif
674 const struct cred *old; 588 const struct cred *old;
675 struct cred *new; 589 struct cred *new;
676 590
@@ -678,14 +592,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
678 if (!new) 592 if (!new)
679 return NULL; 593 return NULL;
680 594
681#ifdef CONFIG_KEYS
682 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
683 if (!tgcred) {
684 kmem_cache_free(cred_jar, new);
685 return NULL;
686 }
687#endif
688
689 kdebug("prepare_kernel_cred() alloc %p", new); 595 kdebug("prepare_kernel_cred() alloc %p", new);
690 596
691 if (daemon) 597 if (daemon)
@@ -703,13 +609,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
703 get_group_info(new->group_info); 609 get_group_info(new->group_info);
704 610
705#ifdef CONFIG_KEYS 611#ifdef CONFIG_KEYS
706 atomic_set(&tgcred->usage, 1); 612 new->session_keyring = NULL;
707 spin_lock_init(&tgcred->lock); 613 new->process_keyring = NULL;
708 tgcred->process_keyring = NULL;
709 tgcred->session_keyring = NULL;
710 new->tgcred = tgcred;
711 new->request_key_auth = NULL;
712 new->thread_keyring = NULL; 614 new->thread_keyring = NULL;
615 new->request_key_auth = NULL;
713 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; 616 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
714#endif 617#endif
715 618
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 738f3564e83b..301079d06f24 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7434,7 +7434,7 @@ unlock:
7434device_initcall(perf_event_sysfs_init); 7434device_initcall(perf_event_sysfs_init);
7435 7435
7436#ifdef CONFIG_CGROUP_PERF 7436#ifdef CONFIG_CGROUP_PERF
7437static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) 7437static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
7438{ 7438{
7439 struct perf_cgroup *jc; 7439 struct perf_cgroup *jc;
7440 7440
@@ -7451,7 +7451,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
7451 return &jc->css; 7451 return &jc->css;
7452} 7452}
7453 7453
7454static void perf_cgroup_destroy(struct cgroup *cont) 7454static void perf_cgroup_css_free(struct cgroup *cont)
7455{ 7455{
7456 struct perf_cgroup *jc; 7456 struct perf_cgroup *jc;
7457 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), 7457 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
@@ -7492,8 +7492,8 @@ static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7492struct cgroup_subsys perf_subsys = { 7492struct cgroup_subsys perf_subsys = {
7493 .name = "perf_event", 7493 .name = "perf_event",
7494 .subsys_id = perf_subsys_id, 7494 .subsys_id = perf_subsys_id,
7495 .create = perf_cgroup_create, 7495 .css_alloc = perf_cgroup_css_alloc,
7496 .destroy = perf_cgroup_destroy, 7496 .css_free = perf_cgroup_css_free,
7497 .exit = perf_cgroup_exit, 7497 .exit = perf_cgroup_exit,
7498 .attach = perf_cgroup_attach, 7498 .attach = perf_cgroup_attach,
7499 7499
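
The perf hunks above track a tree-wide cgroup interface rename: the subsystem callbacks formerly called .create/.destroy become .css_alloc/.css_free. A minimal sketch of a subsystem wired up with the renamed hooks, assuming the 3.7-era struct cgroup_subsys layout shown above; "example_subsys" and example_subsys_id are hypothetical names used only for illustration:

#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/err.h>

/* Hypothetical per-cgroup state for the example subsystem. */
struct example_css {
        struct cgroup_subsys_state css;
};

static struct cgroup_subsys_state *example_css_alloc(struct cgroup *cgrp)
{
        struct example_css *ec = kzalloc(sizeof(*ec), GFP_KERNEL);

        return ec ? &ec->css : ERR_PTR(-ENOMEM);
}

static void example_css_free(struct cgroup *cgrp)
{
        kfree(container_of(cgroup_subsys_state(cgrp, example_subsys_id),
                           struct example_css, css));
}

struct cgroup_subsys example_subsys = {
        .name      = "example",
        .subsys_id = example_subsys_id,   /* hypothetical id */
        .css_alloc = example_css_alloc,   /* was .create */
        .css_free  = example_css_free,    /* was .destroy */
};
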
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 9a7b487c6fe2..fe8a916507ed 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -111,14 +111,16 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
111 * Count the number of breakpoints of the same type and same task. 111 * Count the number of breakpoints of the same type and same task.
112 * The given event must be not on the list. 112 * The given event must be not on the list.
113 */ 113 */
114static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) 114static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
115{ 115{
116 struct task_struct *tsk = bp->hw.bp_target; 116 struct task_struct *tsk = bp->hw.bp_target;
117 struct perf_event *iter; 117 struct perf_event *iter;
118 int count = 0; 118 int count = 0;
119 119
120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) { 120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
121 if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type) 121 if (iter->hw.bp_target == tsk &&
122 find_slot_idx(iter) == type &&
123 cpu == iter->cpu)
122 count += hw_breakpoint_weight(iter); 124 count += hw_breakpoint_weight(iter);
123 } 125 }
124 126
@@ -141,7 +143,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
141 if (!tsk) 143 if (!tsk)
142 slots->pinned += max_task_bp_pinned(cpu, type); 144 slots->pinned += max_task_bp_pinned(cpu, type);
143 else 145 else
144 slots->pinned += task_bp_pinned(bp, type); 146 slots->pinned += task_bp_pinned(cpu, bp, type);
145 slots->flexible = per_cpu(nr_bp_flexible[type], cpu); 147 slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
146 148
147 return; 149 return;
@@ -154,7 +156,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
154 if (!tsk) 156 if (!tsk)
155 nr += max_task_bp_pinned(cpu, type); 157 nr += max_task_bp_pinned(cpu, type);
156 else 158 else
157 nr += task_bp_pinned(bp, type); 159 nr += task_bp_pinned(cpu, bp, type);
158 160
159 if (nr > slots->pinned) 161 if (nr > slots->pinned)
160 slots->pinned = nr; 162 slots->pinned = nr;
@@ -188,7 +190,7 @@ static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable,
188 int old_idx = 0; 190 int old_idx = 0;
189 int idx = 0; 191 int idx = 0;
190 192
191 old_count = task_bp_pinned(bp, type); 193 old_count = task_bp_pinned(cpu, bp, type);
192 old_idx = old_count - 1; 194 old_idx = old_count - 1;
193 idx = old_idx + weight; 195 idx = old_idx + weight;
194 196
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 5cc4e7e42e68..dea7acfbb071 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -33,6 +33,7 @@
33#include <linux/ptrace.h> /* user_enable_single_step */ 33#include <linux/ptrace.h> /* user_enable_single_step */
34#include <linux/kdebug.h> /* notifier mechanism */ 34#include <linux/kdebug.h> /* notifier mechanism */
35#include "../../mm/internal.h" /* munlock_vma_page */ 35#include "../../mm/internal.h" /* munlock_vma_page */
36#include <linux/percpu-rwsem.h>
36 37
37#include <linux/uprobes.h> 38#include <linux/uprobes.h>
38 39
@@ -71,6 +72,8 @@ static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
71static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; 72static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
72#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) 73#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
73 74
75static struct percpu_rw_semaphore dup_mmap_sem;
76
74/* 77/*
75 * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe 78 * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
76 * events active at this time. Probably a fine grained per inode count is 79 * events active at this time. Probably a fine grained per inode count is
@@ -766,10 +769,13 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
766 struct map_info *info; 769 struct map_info *info;
767 int err = 0; 770 int err = 0;
768 771
772 percpu_down_write(&dup_mmap_sem);
769 info = build_map_info(uprobe->inode->i_mapping, 773 info = build_map_info(uprobe->inode->i_mapping,
770 uprobe->offset, is_register); 774 uprobe->offset, is_register);
771 if (IS_ERR(info)) 775 if (IS_ERR(info)) {
772 return PTR_ERR(info); 776 err = PTR_ERR(info);
777 goto out;
778 }
773 779
774 while (info) { 780 while (info) {
775 struct mm_struct *mm = info->mm; 781 struct mm_struct *mm = info->mm;
@@ -799,7 +805,8 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
799 mmput(mm); 805 mmput(mm);
800 info = free_map_info(info); 806 info = free_map_info(info);
801 } 807 }
802 808 out:
809 percpu_up_write(&dup_mmap_sem);
803 return err; 810 return err;
804} 811}
805 812
@@ -1131,6 +1138,16 @@ void uprobe_clear_state(struct mm_struct *mm)
1131 kfree(area); 1138 kfree(area);
1132} 1139}
1133 1140
1141void uprobe_start_dup_mmap(void)
1142{
1143 percpu_down_read(&dup_mmap_sem);
1144}
1145
1146void uprobe_end_dup_mmap(void)
1147{
1148 percpu_up_read(&dup_mmap_sem);
1149}
1150
1134void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) 1151void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
1135{ 1152{
1136 newmm->uprobes_state.xol_area = NULL; 1153 newmm->uprobes_state.xol_area = NULL;
@@ -1199,6 +1216,11 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot
1199 vaddr = kmap_atomic(area->page); 1216 vaddr = kmap_atomic(area->page);
1200 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); 1217 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
1201 kunmap_atomic(vaddr); 1218 kunmap_atomic(vaddr);
1219 /*
1220 * We probably need flush_icache_user_range() but it needs vma.
1221 * This should work on supported architectures too.
1222 */
1223 flush_dcache_page(area->page);
1202 1224
1203 return current->utask->xol_vaddr; 1225 return current->utask->xol_vaddr;
1204} 1226}
@@ -1430,16 +1452,6 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1430 return uprobe; 1452 return uprobe;
1431} 1453}
1432 1454
1433void __weak arch_uprobe_enable_step(struct arch_uprobe *arch)
1434{
1435 user_enable_single_step(current);
1436}
1437
1438void __weak arch_uprobe_disable_step(struct arch_uprobe *arch)
1439{
1440 user_disable_single_step(current);
1441}
1442
1443/* 1455/*
1444 * Run handler and ask thread to singlestep. 1456 * Run handler and ask thread to singlestep.
1445 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. 1457 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -1493,7 +1505,6 @@ static void handle_swbp(struct pt_regs *regs)
1493 goto out; 1505 goto out;
1494 1506
1495 if (!pre_ssout(uprobe, regs, bp_vaddr)) { 1507 if (!pre_ssout(uprobe, regs, bp_vaddr)) {
1496 arch_uprobe_enable_step(&uprobe->arch);
1497 utask->active_uprobe = uprobe; 1508 utask->active_uprobe = uprobe;
1498 utask->state = UTASK_SSTEP; 1509 utask->state = UTASK_SSTEP;
1499 return; 1510 return;
@@ -1525,7 +1536,6 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
1525 else 1536 else
1526 WARN_ON_ONCE(1); 1537 WARN_ON_ONCE(1);
1527 1538
1528 arch_uprobe_disable_step(&uprobe->arch);
1529 put_uprobe(uprobe); 1539 put_uprobe(uprobe);
1530 utask->active_uprobe = NULL; 1540 utask->active_uprobe = NULL;
1531 utask->state = UTASK_RUNNING; 1541 utask->state = UTASK_RUNNING;
@@ -1604,6 +1614,9 @@ static int __init init_uprobes(void)
1604 mutex_init(&uprobes_mmap_mutex[i]); 1614 mutex_init(&uprobes_mmap_mutex[i]);
1605 } 1615 }
1606 1616
1617 if (percpu_init_rwsem(&dup_mmap_sem))
1618 return -ENOMEM;
1619
1607 return register_die_notifier(&uprobe_exception_nb); 1620 return register_die_notifier(&uprobe_exception_nb);
1608} 1621}
1609module_init(init_uprobes); 1622module_init(init_uprobes);
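
The uprobes change serializes probe registration against dup_mmap(): forks take the read side of a per-CPU rw-semaphore around the whole mmap copy (see the kernel/fork.c hunks below), while register_for_each_vma() takes the write side so it cannot race with an mm being duplicated. A minimal sketch of that pairing, assuming only the percpu-rwsem API already used in the hunks above:

#include <linux/percpu-rwsem.h>

static struct percpu_rw_semaphore dup_mmap_sem;

/* Reader side: many forks may duplicate mmaps concurrently. */
static void example_dup_mmap(void)
{
        percpu_down_read(&dup_mmap_sem);
        /* ... copy the mm, including uprobe_dup_mmap() ... */
        percpu_up_read(&dup_mmap_sem);
}

/* Writer side: (un)registration excludes every concurrent fork. */
static void example_register(void)
{
        percpu_down_write(&dup_mmap_sem);
        /* ... walk each mm that maps the probed inode ... */
        percpu_up_write(&dup_mmap_sem);
}

/* percpu_init_rwsem(&dup_mmap_sem) must run once, e.g. from an initcall,
 * exactly as init_uprobes() does above. */
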
diff --git a/kernel/exit.c b/kernel/exit.c
index d7fe58db4527..b4df21937216 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -310,43 +310,6 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
310 } 310 }
311} 311}
312 312
313/**
314 * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
315 *
316 * If a kernel thread is launched as a result of a system call, or if
317 * it ever exits, it should generally reparent itself to kthreadd so it
318 * isn't in the way of other processes and is correctly cleaned up on exit.
319 *
320 * The various task state such as scheduling policy and priority may have
321 * been inherited from a user process, so we reset them to sane values here.
322 *
323 * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
324 */
325static void reparent_to_kthreadd(void)
326{
327 write_lock_irq(&tasklist_lock);
328
329 ptrace_unlink(current);
330 /* Reparent to init */
331 current->real_parent = current->parent = kthreadd_task;
332 list_move_tail(&current->sibling, &current->real_parent->children);
333
334 /* Set the exit signal to SIGCHLD so we signal init on exit */
335 current->exit_signal = SIGCHLD;
336
337 if (task_nice(current) < 0)
338 set_user_nice(current, 0);
339 /* cpus_allowed? */
340 /* rt_priority? */
341 /* signals? */
342 memcpy(current->signal->rlim, init_task.signal->rlim,
343 sizeof(current->signal->rlim));
344
345 atomic_inc(&init_cred.usage);
346 commit_creds(&init_cred);
347 write_unlock_irq(&tasklist_lock);
348}
349
350void __set_special_pids(struct pid *pid) 313void __set_special_pids(struct pid *pid)
351{ 314{
352 struct task_struct *curr = current->group_leader; 315 struct task_struct *curr = current->group_leader;
@@ -358,13 +321,6 @@ void __set_special_pids(struct pid *pid)
358 change_pid(curr, PIDTYPE_PGID, pid); 321 change_pid(curr, PIDTYPE_PGID, pid);
359} 322}
360 323
361static void set_special_pids(struct pid *pid)
362{
363 write_lock_irq(&tasklist_lock);
364 __set_special_pids(pid);
365 write_unlock_irq(&tasklist_lock);
366}
367
368/* 324/*
369 * Let kernel threads use this to say that they allow a certain signal. 325 * Let kernel threads use this to say that they allow a certain signal.
370 * Must not be used if kthread was cloned with CLONE_SIGHAND. 326 * Must not be used if kthread was cloned with CLONE_SIGHAND.
@@ -404,54 +360,6 @@ int disallow_signal(int sig)
404 360
405EXPORT_SYMBOL(disallow_signal); 361EXPORT_SYMBOL(disallow_signal);
406 362
407/*
408 * Put all the gunge required to become a kernel thread without
409 * attached user resources in one place where it belongs.
410 */
411
412void daemonize(const char *name, ...)
413{
414 va_list args;
415 sigset_t blocked;
416
417 va_start(args, name);
418 vsnprintf(current->comm, sizeof(current->comm), name, args);
419 va_end(args);
420
421 /*
422 * If we were started as result of loading a module, close all of the
423 * user space pages. We don't need them, and if we didn't close them
424 * they would be locked into memory.
425 */
426 exit_mm(current);
427 /*
428 * We don't want to get frozen, in case system-wide hibernation
429 * or suspend transition begins right now.
430 */
431 current->flags |= (PF_NOFREEZE | PF_KTHREAD);
432
433 if (current->nsproxy != &init_nsproxy) {
434 get_nsproxy(&init_nsproxy);
435 switch_task_namespaces(current, &init_nsproxy);
436 }
437 set_special_pids(&init_struct_pid);
438 proc_clear_tty(current);
439
440 /* Block and flush all signals */
441 sigfillset(&blocked);
442 sigprocmask(SIG_BLOCK, &blocked, NULL);
443 flush_signals(current);
444
445 /* Become as one with the init task */
446
447 daemonize_fs_struct();
448 daemonize_descriptors();
449
450 reparent_to_kthreadd();
451}
452
453EXPORT_SYMBOL(daemonize);
454
455#ifdef CONFIG_MM_OWNER 363#ifdef CONFIG_MM_OWNER
456/* 364/*
457 * A task is exiting. If it owned this mm, find a new owner for the mm. 365 * A task is exiting. If it owned this mm, find a new owner for the mm.
@@ -1174,11 +1082,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1174 * as other threads in the parent group can be right 1082 * as other threads in the parent group can be right
1175 * here reaping other children at the same time. 1083 * here reaping other children at the same time.
1176 * 1084 *
1177 * We use thread_group_times() to get times for the thread 1085 * We use thread_group_cputime_adjusted() to get times for the thread
1178 * group, which consolidates times for all threads in the 1086 * group, which consolidates times for all threads in the
1179 * group including the group leader. 1087 * group including the group leader.
1180 */ 1088 */
1181 thread_group_times(p, &tgutime, &tgstime); 1089 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1182 spin_lock_irq(&p->real_parent->sighand->siglock); 1090 spin_lock_irq(&p->real_parent->sighand->siglock);
1183 psig = p->real_parent->signal; 1091 psig = p->real_parent->signal;
1184 sig = p->signal; 1092 sig = p->signal;
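
With daemonize() and reparent_to_kthreadd() removed, kernel threads are expected to come from the kthread infrastructure, which never inherits user-space state in the first place. A minimal sketch of the replacement pattern, assuming the long-standing kthread_run()/kthread_should_stop() API; the thread function and name below are illustrative only:

#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *example_task;

static int example_thread_fn(void *data)
{
        /* Runs as a proper kernel thread: no mm, parented to kthreadd. */
        while (!kthread_should_stop())
                msleep(1000);
        return 0;
}

static int __init example_init(void)
{
        example_task = kthread_run(example_thread_fn, NULL, "example");
        return IS_ERR(example_task) ? PTR_ERR(example_task) : 0;
}
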
diff --git a/kernel/fork.c b/kernel/fork.c
index 38e53b87402c..c36c4e301efe 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -352,6 +352,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
352 unsigned long charge; 352 unsigned long charge;
353 struct mempolicy *pol; 353 struct mempolicy *pol;
354 354
355 uprobe_start_dup_mmap();
355 down_write(&oldmm->mmap_sem); 356 down_write(&oldmm->mmap_sem);
356 flush_cache_dup_mm(oldmm); 357 flush_cache_dup_mm(oldmm);
357 uprobe_dup_mmap(oldmm, mm); 358 uprobe_dup_mmap(oldmm, mm);
@@ -469,6 +470,7 @@ out:
469 up_write(&mm->mmap_sem); 470 up_write(&mm->mmap_sem);
470 flush_tlb_mm(oldmm); 471 flush_tlb_mm(oldmm);
471 up_write(&oldmm->mmap_sem); 472 up_write(&oldmm->mmap_sem);
473 uprobe_end_dup_mmap();
472 return retval; 474 return retval;
473fail_nomem_anon_vma_fork: 475fail_nomem_anon_vma_fork:
474 mpol_put(pol); 476 mpol_put(pol);
@@ -821,6 +823,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
821#ifdef CONFIG_TRANSPARENT_HUGEPAGE 823#ifdef CONFIG_TRANSPARENT_HUGEPAGE
822 mm->pmd_huge_pte = NULL; 824 mm->pmd_huge_pte = NULL;
823#endif 825#endif
826#ifdef CONFIG_NUMA_BALANCING
827 mm->first_nid = NUMA_PTE_SCAN_INIT;
828#endif
824 if (!mm_init(mm, tsk)) 829 if (!mm_init(mm, tsk))
825 goto fail_nomem; 830 goto fail_nomem;
826 831
@@ -1125,7 +1130,6 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
1125 */ 1130 */
1126static struct task_struct *copy_process(unsigned long clone_flags, 1131static struct task_struct *copy_process(unsigned long clone_flags,
1127 unsigned long stack_start, 1132 unsigned long stack_start,
1128 struct pt_regs *regs,
1129 unsigned long stack_size, 1133 unsigned long stack_size,
1130 int __user *child_tidptr, 1134 int __user *child_tidptr,
1131 struct pid *pid, 1135 struct pid *pid,
@@ -1133,7 +1137,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1133{ 1137{
1134 int retval; 1138 int retval;
1135 struct task_struct *p; 1139 struct task_struct *p;
1136 int cgroup_callbacks_done = 0;
1137 1140
1138 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) 1141 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1139 return ERR_PTR(-EINVAL); 1142 return ERR_PTR(-EINVAL);
@@ -1220,7 +1223,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1220 p->utime = p->stime = p->gtime = 0; 1223 p->utime = p->stime = p->gtime = 0;
1221 p->utimescaled = p->stimescaled = 0; 1224 p->utimescaled = p->stimescaled = 0;
1222#ifndef CONFIG_VIRT_CPU_ACCOUNTING 1225#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1223 p->prev_utime = p->prev_stime = 0; 1226 p->prev_cputime.utime = p->prev_cputime.stime = 0;
1224#endif 1227#endif
1225#if defined(SPLIT_RSS_COUNTING) 1228#if defined(SPLIT_RSS_COUNTING)
1226 memset(&p->rss_stat, 0, sizeof(p->rss_stat)); 1229 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
@@ -1318,7 +1321,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1318 retval = copy_io(clone_flags, p); 1321 retval = copy_io(clone_flags, p);
1319 if (retval) 1322 if (retval)
1320 goto bad_fork_cleanup_namespaces; 1323 goto bad_fork_cleanup_namespaces;
1321 retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); 1324 retval = copy_thread(clone_flags, stack_start, stack_size, p);
1322 if (retval) 1325 if (retval)
1323 goto bad_fork_cleanup_io; 1326 goto bad_fork_cleanup_io;
1324 1327
@@ -1391,12 +1394,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1391 INIT_LIST_HEAD(&p->thread_group); 1394 INIT_LIST_HEAD(&p->thread_group);
1392 p->task_works = NULL; 1395 p->task_works = NULL;
1393 1396
1394 /* Now that the task is set up, run cgroup callbacks if
1395 * necessary. We need to run them before the task is visible
1396 * on the tasklist. */
1397 cgroup_fork_callbacks(p);
1398 cgroup_callbacks_done = 1;
1399
1400 /* Need tasklist lock for parent etc handling! */ 1397 /* Need tasklist lock for parent etc handling! */
1401 write_lock_irq(&tasklist_lock); 1398 write_lock_irq(&tasklist_lock);
1402 1399
@@ -1501,7 +1498,7 @@ bad_fork_cleanup_cgroup:
1501#endif 1498#endif
1502 if (clone_flags & CLONE_THREAD) 1499 if (clone_flags & CLONE_THREAD)
1503 threadgroup_change_end(current); 1500 threadgroup_change_end(current);
1504 cgroup_exit(p, cgroup_callbacks_done); 1501 cgroup_exit(p, 0);
1505 delayacct_tsk_free(p); 1502 delayacct_tsk_free(p);
1506 module_put(task_thread_info(p)->exec_domain->module); 1503 module_put(task_thread_info(p)->exec_domain->module);
1507bad_fork_cleanup_count: 1504bad_fork_cleanup_count:
@@ -1513,12 +1510,6 @@ fork_out:
1513 return ERR_PTR(retval); 1510 return ERR_PTR(retval);
1514} 1511}
1515 1512
1516noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
1517{
1518 memset(regs, 0, sizeof(struct pt_regs));
1519 return regs;
1520}
1521
1522static inline void init_idle_pids(struct pid_link *links) 1513static inline void init_idle_pids(struct pid_link *links)
1523{ 1514{
1524 enum pid_type type; 1515 enum pid_type type;
@@ -1532,10 +1523,7 @@ static inline void init_idle_pids(struct pid_link *links)
1532struct task_struct * __cpuinit fork_idle(int cpu) 1523struct task_struct * __cpuinit fork_idle(int cpu)
1533{ 1524{
1534 struct task_struct *task; 1525 struct task_struct *task;
1535 struct pt_regs regs; 1526 task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
1536
1537 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
1538 &init_struct_pid, 0);
1539 if (!IS_ERR(task)) { 1527 if (!IS_ERR(task)) {
1540 init_idle_pids(task->pids); 1528 init_idle_pids(task->pids);
1541 init_idle(task, cpu); 1529 init_idle(task, cpu);
@@ -1552,7 +1540,6 @@ struct task_struct * __cpuinit fork_idle(int cpu)
1552 */ 1540 */
1553long do_fork(unsigned long clone_flags, 1541long do_fork(unsigned long clone_flags,
1554 unsigned long stack_start, 1542 unsigned long stack_start,
1555 struct pt_regs *regs,
1556 unsigned long stack_size, 1543 unsigned long stack_size,
1557 int __user *parent_tidptr, 1544 int __user *parent_tidptr,
1558 int __user *child_tidptr) 1545 int __user *child_tidptr)
@@ -1576,7 +1563,7 @@ long do_fork(unsigned long clone_flags,
1576 * requested, no event is reported; otherwise, report if the event 1563 * requested, no event is reported; otherwise, report if the event
1577 * for the type of forking is enabled. 1564 * for the type of forking is enabled.
1578 */ 1565 */
1579 if (!(clone_flags & CLONE_UNTRACED) && likely(user_mode(regs))) { 1566 if (!(clone_flags & CLONE_UNTRACED)) {
1580 if (clone_flags & CLONE_VFORK) 1567 if (clone_flags & CLONE_VFORK)
1581 trace = PTRACE_EVENT_VFORK; 1568 trace = PTRACE_EVENT_VFORK;
1582 else if ((clone_flags & CSIGNAL) != SIGCHLD) 1569 else if ((clone_flags & CSIGNAL) != SIGCHLD)
@@ -1588,7 +1575,7 @@ long do_fork(unsigned long clone_flags,
1588 trace = 0; 1575 trace = 0;
1589 } 1576 }
1590 1577
1591 p = copy_process(clone_flags, stack_start, regs, stack_size, 1578 p = copy_process(clone_flags, stack_start, stack_size,
1592 child_tidptr, NULL, trace); 1579 child_tidptr, NULL, trace);
1593 /* 1580 /*
1594 * Do this prior waking up the new thread - the thread pointer 1581 * Do this prior waking up the new thread - the thread pointer
@@ -1632,11 +1619,54 @@ long do_fork(unsigned long clone_flags,
1632 */ 1619 */
1633pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) 1620pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
1634{ 1621{
1635 return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, NULL, 1622 return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
1636 (unsigned long)arg, NULL, NULL); 1623 (unsigned long)arg, NULL, NULL);
1637} 1624}
1638#endif 1625#endif
1639 1626
1627#ifdef __ARCH_WANT_SYS_FORK
1628SYSCALL_DEFINE0(fork)
1629{
1630#ifdef CONFIG_MMU
1631 return do_fork(SIGCHLD, 0, 0, NULL, NULL);
1632#else
1633 /* can not support in nommu mode */
1634 return(-EINVAL);
1635#endif
1636}
1637#endif
1638
1639#ifdef __ARCH_WANT_SYS_VFORK
1640SYSCALL_DEFINE0(vfork)
1641{
1642 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
1643 0, NULL, NULL);
1644}
1645#endif
1646
1647#ifdef __ARCH_WANT_SYS_CLONE
1648#ifdef CONFIG_CLONE_BACKWARDS
1649SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
1650 int __user *, parent_tidptr,
1651 int, tls_val,
1652 int __user *, child_tidptr)
1653#elif defined(CONFIG_CLONE_BACKWARDS2)
1654SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
1655 int __user *, parent_tidptr,
1656 int __user *, child_tidptr,
1657 int, tls_val)
1658#else
1659SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
1660 int __user *, parent_tidptr,
1661 int __user *, child_tidptr,
1662 int, tls_val)
1663#endif
1664{
1665 return do_fork(clone_flags, newsp, 0,
1666 parent_tidptr, child_tidptr);
1667}
1668#endif
1669
1640#ifndef ARCH_MIN_MMSTRUCT_ALIGN 1670#ifndef ARCH_MIN_MMSTRUCT_ALIGN
1641#define ARCH_MIN_MMSTRUCT_ALIGN 0 1671#define ARCH_MIN_MMSTRUCT_ALIGN 0
1642#endif 1672#endif
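
The fork.c rework drops pt_regs from do_fork()/copy_process() and adds generic sys_fork/sys_vfork/sys_clone implementations that architectures opt into, choosing an argument order via the CLONE_BACKWARDS options. A rough sketch of what the opt-in looks like for a hypothetical architecture, assuming the conventions used by this series (the file paths and Kconfig select are assumptions, not taken from this diff):

/* arch/example/include/asm/unistd.h -- hypothetical architecture opt-in */
#define __ARCH_WANT_SYS_FORK
#define __ARCH_WANT_SYS_VFORK
#define __ARCH_WANT_SYS_CLONE
/*
 * arch/example/Kconfig would additionally "select CLONE_BACKWARDS" (or
 * CLONE_BACKWARDS2) if the port's established clone() ABI uses one of
 * the legacy argument orders handled by the #ifdefs above.
 */

#include <linux/sched.h>

static int example_fn(void *unused)
{
        return 0;
}

/* Kernel-side callers are unchanged apart from the simpler prototype:
 * a plain C function and argument, no pt_regs anywhere. */
static void example_spawn(void)
{
        kernel_thread(example_fn, NULL, CLONE_FS | CLONE_FILES);
}
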
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 11f82a4d4eae..c38893b0efba 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -116,17 +116,10 @@ bool freeze_task(struct task_struct *p)
116 return false; 116 return false;
117 } 117 }
118 118
119 if (!(p->flags & PF_KTHREAD)) { 119 if (!(p->flags & PF_KTHREAD))
120 fake_signal_wake_up(p); 120 fake_signal_wake_up(p);
121 /* 121 else
122 * fake_signal_wake_up() goes through p's scheduler
123 * lock and guarantees that TASK_STOPPED/TRACED ->
124 * TASK_RUNNING transition can't race with task state
125 * testing in try_to_freeze_tasks().
126 */
127 } else {
128 wake_up_state(p, TASK_INTERRUPTIBLE); 122 wake_up_state(p, TASK_INTERRUPTIBLE);
129 }
130 123
131 spin_unlock_irqrestore(&freezer_lock, flags); 124 spin_unlock_irqrestore(&freezer_lock, flags);
132 return true; 125 return true;
diff --git a/kernel/futex.c b/kernel/futex.c
index 3717e7b306e0..19eb089ca003 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -716,7 +716,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
716 struct futex_pi_state **ps, 716 struct futex_pi_state **ps,
717 struct task_struct *task, int set_waiters) 717 struct task_struct *task, int set_waiters)
718{ 718{
719 int lock_taken, ret, ownerdied = 0; 719 int lock_taken, ret, force_take = 0;
720 u32 uval, newval, curval, vpid = task_pid_vnr(task); 720 u32 uval, newval, curval, vpid = task_pid_vnr(task);
721 721
722retry: 722retry:
@@ -755,17 +755,15 @@ retry:
755 newval = curval | FUTEX_WAITERS; 755 newval = curval | FUTEX_WAITERS;
756 756
757 /* 757 /*
758 * There are two cases, where a futex might have no owner (the 758 * Should we force take the futex? See below.
759 * owner TID is 0): OWNER_DIED. We take over the futex in this
760 * case. We also do an unconditional take over, when the owner
761 * of the futex died.
762 *
763 * This is safe as we are protected by the hash bucket lock !
764 */ 759 */
765 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { 760 if (unlikely(force_take)) {
766 /* Keep the OWNER_DIED bit */ 761 /*
762 * Keep the OWNER_DIED and the WAITERS bit and set the
763 * new TID value.
764 */
767 newval = (curval & ~FUTEX_TID_MASK) | vpid; 765 newval = (curval & ~FUTEX_TID_MASK) | vpid;
768 ownerdied = 0; 766 force_take = 0;
769 lock_taken = 1; 767 lock_taken = 1;
770 } 768 }
771 769
@@ -775,7 +773,7 @@ retry:
775 goto retry; 773 goto retry;
776 774
777 /* 775 /*
778 * We took the lock due to owner died take over. 776 * We took the lock due to forced take over.
779 */ 777 */
780 if (unlikely(lock_taken)) 778 if (unlikely(lock_taken))
781 return 1; 779 return 1;
@@ -790,20 +788,25 @@ retry:
790 switch (ret) { 788 switch (ret) {
791 case -ESRCH: 789 case -ESRCH:
792 /* 790 /*
793 * No owner found for this futex. Check if the 791 * We failed to find an owner for this
794 * OWNER_DIED bit is set to figure out whether 792 * futex. So we have no pi_state to block
795 * this is a robust futex or not. 793 * on. This can happen in two cases:
794 *
795 * 1) The owner died
796 * 2) A stale FUTEX_WAITERS bit
797 *
798 * Re-read the futex value.
796 */ 799 */
797 if (get_futex_value_locked(&curval, uaddr)) 800 if (get_futex_value_locked(&curval, uaddr))
798 return -EFAULT; 801 return -EFAULT;
799 802
800 /* 803 /*
801 * We simply start over in case of a robust 804 * If the owner died or we have a stale
802 * futex. The code above will take the futex 805 * WAITERS bit the owner TID in the user space
803 * and return happy. 806 * futex is 0.
804 */ 807 */
805 if (curval & FUTEX_OWNER_DIED) { 808 if (!(curval & FUTEX_TID_MASK)) {
806 ownerdied = 1; 809 force_take = 1;
807 goto retry; 810 goto retry;
808 } 811 }
809 default: 812 default:
@@ -840,6 +843,9 @@ static void wake_futex(struct futex_q *q)
840{ 843{
841 struct task_struct *p = q->task; 844 struct task_struct *p = q->task;
842 845
846 if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
847 return;
848
843 /* 849 /*
844 * We set q->lock_ptr = NULL _before_ we wake up the task. If 850 * We set q->lock_ptr = NULL _before_ we wake up the task. If
845 * a non-futex wake up happens on another CPU then the task 851 * a non-futex wake up happens on another CPU then the task
@@ -1075,6 +1081,10 @@ retry_private:
1075 1081
1076 plist_for_each_entry_safe(this, next, head, list) { 1082 plist_for_each_entry_safe(this, next, head, list) {
1077 if (match_futex (&this->key, &key1)) { 1083 if (match_futex (&this->key, &key1)) {
1084 if (this->pi_state || this->rt_waiter) {
1085 ret = -EINVAL;
1086 goto out_unlock;
1087 }
1078 wake_futex(this); 1088 wake_futex(this);
1079 if (++ret >= nr_wake) 1089 if (++ret >= nr_wake)
1080 break; 1090 break;
@@ -1087,6 +1097,10 @@ retry_private:
1087 op_ret = 0; 1097 op_ret = 0;
1088 plist_for_each_entry_safe(this, next, head, list) { 1098 plist_for_each_entry_safe(this, next, head, list) {
1089 if (match_futex (&this->key, &key2)) { 1099 if (match_futex (&this->key, &key2)) {
1100 if (this->pi_state || this->rt_waiter) {
1101 ret = -EINVAL;
1102 goto out_unlock;
1103 }
1090 wake_futex(this); 1104 wake_futex(this);
1091 if (++op_ret >= nr_wake2) 1105 if (++op_ret >= nr_wake2)
1092 break; 1106 break;
@@ -1095,6 +1109,7 @@ retry_private:
1095 ret += op_ret; 1109 ret += op_ret;
1096 } 1110 }
1097 1111
1112out_unlock:
1098 double_unlock_hb(hb1, hb2); 1113 double_unlock_hb(hb1, hb2);
1099out_put_keys: 1114out_put_keys:
1100 put_futex_key(&key2); 1115 put_futex_key(&key2);
@@ -1384,9 +1399,13 @@ retry_private:
1384 /* 1399 /*
1385 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always 1400 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1386 * be paired with each other and no other futex ops. 1401 * be paired with each other and no other futex ops.
1402 *
1403 * We should never be requeueing a futex_q with a pi_state,
1404 * which is awaiting a futex_unlock_pi().
1387 */ 1405 */
1388 if ((requeue_pi && !this->rt_waiter) || 1406 if ((requeue_pi && !this->rt_waiter) ||
1389 (!requeue_pi && this->rt_waiter)) { 1407 (!requeue_pi && this->rt_waiter) ||
1408 this->pi_state) {
1390 ret = -EINVAL; 1409 ret = -EINVAL;
1391 break; 1410 break;
1392 } 1411 }
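
The futex changes hinge on what the user-space futex word encodes for PI futexes: the owner's TID in the low bits plus the OWNER_DIED and WAITERS flag bits. A zero TID field (owner died, or a stale WAITERS bit) is what now triggers the forced take-over, and queues carrying pi_state or an rt_waiter must never be woken through the plain wake paths. A minimal user-space sketch of that word layout, assuming the uapi constants from <linux/futex.h>:

#include <linux/futex.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t val = FUTEX_WAITERS | 1234;    /* waiters flag + owner TID */

        printf("owner tid : %u\n", val & FUTEX_TID_MASK);
        printf("waiters   : %s\n", (val & FUTEX_WAITERS) ? "yes" : "no");
        printf("owner died: %s\n", (val & FUTEX_OWNER_DIED) ? "yes" : "no");

        /* A TID field of 0 is what makes futex_lock_pi_atomic() above set
         * force_take and take the futex over. */
        return 0;
}
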
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 57d86d07221e..3aca9f29d30e 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -272,6 +272,7 @@ void handle_nested_irq(unsigned int irq)
272 272
273 raw_spin_lock_irq(&desc->lock); 273 raw_spin_lock_irq(&desc->lock);
274 274
275 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
275 kstat_incr_irqs_this_cpu(irq, desc); 276 kstat_incr_irqs_this_cpu(irq, desc);
276 277
277 action = desc->action; 278 action = desc->action;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 4e69e24d3d7d..96f3a1d9c379 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -177,8 +177,8 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
177 irq_base = irq_alloc_descs(first_irq, first_irq, size, 177 irq_base = irq_alloc_descs(first_irq, first_irq, size,
178 of_node_to_nid(of_node)); 178 of_node_to_nid(of_node));
179 if (irq_base < 0) { 179 if (irq_base < 0) {
180 WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", 180 pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
181 first_irq); 181 first_irq);
182 irq_base = first_irq; 182 irq_base = first_irq;
183 } 183 }
184 } else 184 } else
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 4c69326aa773..35c70c9e24d8 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -616,6 +616,22 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
616 return ret; 616 return ret;
617} 617}
618 618
619#ifdef CONFIG_HARDIRQS_SW_RESEND
620int irq_set_parent(int irq, int parent_irq)
621{
622 unsigned long flags;
623 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
624
625 if (!desc)
626 return -EINVAL;
627
628 desc->parent_irq = parent_irq;
629
630 irq_put_desc_unlock(desc, flags);
631 return 0;
632}
633#endif
634
619/* 635/*
620 * Default primary interrupt handler for threaded interrupts. Is 636 * Default primary interrupt handler for threaded interrupts. Is
621 * assigned as primary handler when request_threaded_irq is called 637 * assigned as primary handler when request_threaded_irq is called
@@ -716,6 +732,7 @@ static void
716irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) 732irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
717{ 733{
718 cpumask_var_t mask; 734 cpumask_var_t mask;
735 bool valid = true;
719 736
720 if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) 737 if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
721 return; 738 return;
@@ -730,10 +747,18 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
730 } 747 }
731 748
732 raw_spin_lock_irq(&desc->lock); 749 raw_spin_lock_irq(&desc->lock);
733 cpumask_copy(mask, desc->irq_data.affinity); 750 /*
751 * This code is triggered unconditionally. Check the affinity
752 * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
753 */
754 if (desc->irq_data.affinity)
755 cpumask_copy(mask, desc->irq_data.affinity);
756 else
757 valid = false;
734 raw_spin_unlock_irq(&desc->lock); 758 raw_spin_unlock_irq(&desc->lock);
735 759
736 set_cpus_allowed_ptr(current, mask); 760 if (valid)
761 set_cpus_allowed_ptr(current, mask);
737 free_cpumask_var(mask); 762 free_cpumask_var(mask);
738} 763}
739#else 764#else
@@ -833,6 +858,8 @@ static int irq_thread(void *data)
833 init_task_work(&on_exit_work, irq_thread_dtor); 858 init_task_work(&on_exit_work, irq_thread_dtor);
834 task_work_add(current, &on_exit_work, false); 859 task_work_add(current, &on_exit_work, false);
835 860
861 irq_thread_check_affinity(desc, action);
862
836 while (!irq_wait_for_interrupt(action)) { 863 while (!irq_wait_for_interrupt(action)) {
837 irqreturn_t action_ret; 864 irqreturn_t action_ret;
838 865
@@ -936,6 +963,16 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
936 */ 963 */
937 get_task_struct(t); 964 get_task_struct(t);
938 new->thread = t; 965 new->thread = t;
966 /*
967 * Tell the thread to set its affinity. This is
968 * important for shared interrupt handlers as we do
969 * not invoke setup_affinity() for the secondary
970 * handlers as everything is already set up. Even for
971 * interrupts marked with IRQF_NO_BALANCE this is
972 * correct as we want the thread to move to the cpu(s)
973 * on which the requesting code placed the interrupt.
974 */
975 set_bit(IRQTF_AFFINITY, &new->thread_flags);
939 } 976 }
940 977
941 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { 978 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
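
irq_set_parent() records a parent interrupt for a nested-threaded child so that, as the resend.c hunk below shows, a software resend can retrigger the parent whose thread demultiplexes the child. A minimal sketch of a demux driver using it, assuming CONFIG_HARDIRQS_SW_RESEND and the existing nested-IRQ helpers; the irq numbers and handler wiring are illustrative:

#include <linux/irq.h>
#include <linux/interrupt.h>

/* Setup: child_irq is serviced from the thread of parent_irq. */
static void example_setup_child(int parent_irq, int child_irq)
{
        irq_set_chip_and_handler(child_irq, &dummy_irq_chip, handle_simple_irq);
        irq_set_nested_thread(child_irq, true);
        irq_set_parent(child_irq, parent_irq);  /* added by this series */
}

/* The parent's threaded handler demultiplexes into the nested child. */
static irqreturn_t example_parent_thread(int irq, void *data)
{
        int child_irq = *(int *)data;

        handle_nested_irq(child_irq);
        return IRQ_HANDLED;
}
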
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 6454db7b6a4d..9065107f083e 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -74,6 +74,14 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
74 if (!desc->irq_data.chip->irq_retrigger || 74 if (!desc->irq_data.chip->irq_retrigger ||
75 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { 75 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
76#ifdef CONFIG_HARDIRQS_SW_RESEND 76#ifdef CONFIG_HARDIRQS_SW_RESEND
77 /*
78 * If the interrupt has a parent irq and runs
79 * in the thread context of the parent irq,
80 * retrigger the parent.
81 */
82 if (desc->parent_irq &&
83 irq_settings_is_nested_thread(desc))
84 irq = desc->parent_irq;
77 /* Set it pending and activate the softirq: */ 85 /* Set it pending and activate the softirq: */
78 set_bit(irq, irqs_resend); 86 set_bit(irq, irqs_resend);
79 tasklet_schedule(&resend_tasklet); 87 tasklet_schedule(&resend_tasklet);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 4e316e1acf58..6ada93c23a9a 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -26,7 +26,6 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
26static struct kobj_attribute _name##_attr = \ 26static struct kobj_attribute _name##_attr = \
27 __ATTR(_name, 0644, _name##_show, _name##_store) 27 __ATTR(_name, 0644, _name##_show, _name##_store)
28 28
29#if defined(CONFIG_HOTPLUG)
30/* current uevent sequence number */ 29/* current uevent sequence number */
31static ssize_t uevent_seqnum_show(struct kobject *kobj, 30static ssize_t uevent_seqnum_show(struct kobject *kobj,
32 struct kobj_attribute *attr, char *buf) 31 struct kobj_attribute *attr, char *buf)
@@ -54,7 +53,7 @@ static ssize_t uevent_helper_store(struct kobject *kobj,
54 return count; 53 return count;
55} 54}
56KERNEL_ATTR_RW(uevent_helper); 55KERNEL_ATTR_RW(uevent_helper);
57#endif 56
58 57
59#ifdef CONFIG_PROFILING 58#ifdef CONFIG_PROFILING
60static ssize_t profiling_show(struct kobject *kobj, 59static ssize_t profiling_show(struct kobject *kobj,
@@ -141,6 +140,23 @@ static ssize_t fscaps_show(struct kobject *kobj,
141} 140}
142KERNEL_ATTR_RO(fscaps); 141KERNEL_ATTR_RO(fscaps);
143 142
143int rcu_expedited;
144static ssize_t rcu_expedited_show(struct kobject *kobj,
145 struct kobj_attribute *attr, char *buf)
146{
147 return sprintf(buf, "%d\n", rcu_expedited);
148}
149static ssize_t rcu_expedited_store(struct kobject *kobj,
150 struct kobj_attribute *attr,
151 const char *buf, size_t count)
152{
153 if (kstrtoint(buf, 0, &rcu_expedited))
154 return -EINVAL;
155
156 return count;
157}
158KERNEL_ATTR_RW(rcu_expedited);
159
144/* 160/*
145 * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 161 * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
146 */ 162 */
@@ -169,10 +185,8 @@ EXPORT_SYMBOL_GPL(kernel_kobj);
169 185
170static struct attribute * kernel_attrs[] = { 186static struct attribute * kernel_attrs[] = {
171 &fscaps_attr.attr, 187 &fscaps_attr.attr,
172#if defined(CONFIG_HOTPLUG)
173 &uevent_seqnum_attr.attr, 188 &uevent_seqnum_attr.attr,
174 &uevent_helper_attr.attr, 189 &uevent_helper_attr.attr,
175#endif
176#ifdef CONFIG_PROFILING 190#ifdef CONFIG_PROFILING
177 &profiling_attr.attr, 191 &profiling_attr.attr,
178#endif 192#endif
@@ -182,6 +196,7 @@ static struct attribute * kernel_attrs[] = {
182 &kexec_crash_size_attr.attr, 196 &kexec_crash_size_attr.attr,
183 &vmcoreinfo_attr.attr, 197 &vmcoreinfo_attr.attr,
184#endif 198#endif
199 &rcu_expedited_attr.attr,
185 NULL 200 NULL
186}; 201};
187 202
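
The new /sys/kernel/rcu_expedited attribute exposes the rcu_expedited knob that the RCU changes elsewhere in this series consult to prefer expedited grace periods. A minimal user-space sketch of flipping it, assuming the sysfs path created by this hunk:

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/sys/kernel/rcu_expedited", O_WRONLY);

        if (fd < 0)
                return 1;
        if (write(fd, "1", 1) != 1) {   /* writing "0" restores the default */
                close(fd);
                return 1;
        }
        close(fd);
        return 0;
}
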
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 29fb60caecb5..691dc2ef9baf 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -428,7 +428,7 @@ int kthreadd(void *unused)
428 set_task_comm(tsk, "kthreadd"); 428 set_task_comm(tsk, "kthreadd");
429 ignore_signals(tsk); 429 ignore_signals(tsk);
430 set_cpus_allowed_ptr(tsk, cpu_all_mask); 430 set_cpus_allowed_ptr(tsk, cpu_all_mask);
431 set_mems_allowed(node_states[N_HIGH_MEMORY]); 431 set_mems_allowed(node_states[N_MEMORY]);
432 432
433 current->flags |= PF_NOFREEZE; 433 current->flags |= PF_NOFREEZE;
434 434
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 91c32a0b612c..b2c71c5873e4 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -39,7 +39,7 @@ static void l_stop(struct seq_file *m, void *v)
39 39
40static void print_name(struct seq_file *m, struct lock_class *class) 40static void print_name(struct seq_file *m, struct lock_class *class)
41{ 41{
42 char str[128]; 42 char str[KSYM_NAME_LEN];
43 const char *name = class->name; 43 const char *name = class->name;
44 44
45 if (!name) { 45 if (!name) {
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c
index 4646eb2c3820..767e559dfb10 100644
--- a/kernel/modsign_pubkey.c
+++ b/kernel/modsign_pubkey.c
@@ -21,10 +21,10 @@ struct key *modsign_keyring;
21extern __initdata const u8 modsign_certificate_list[]; 21extern __initdata const u8 modsign_certificate_list[];
22extern __initdata const u8 modsign_certificate_list_end[]; 22extern __initdata const u8 modsign_certificate_list_end[];
23asm(".section .init.data,\"aw\"\n" 23asm(".section .init.data,\"aw\"\n"
24 "modsign_certificate_list:\n" 24 SYMBOL_PREFIX "modsign_certificate_list:\n"
25 ".incbin \"signing_key.x509\"\n" 25 ".incbin \"signing_key.x509\"\n"
26 ".incbin \"extra_certificates\"\n" 26 ".incbin \"extra_certificates\"\n"
27 "modsign_certificate_list_end:" 27 SYMBOL_PREFIX "modsign_certificate_list_end:"
28 ); 28 );
29 29
30/* 30/*
diff --git a/kernel/module.c b/kernel/module.c
index 6085f5ef88ea..808bd62e1723 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -372,9 +372,6 @@ static bool check_symbol(const struct symsearch *syms,
372 printk(KERN_WARNING "Symbol %s is being used " 372 printk(KERN_WARNING "Symbol %s is being used "
373 "by a non-GPL module, which will not " 373 "by a non-GPL module, which will not "
374 "be allowed in the future\n", fsa->name); 374 "be allowed in the future\n", fsa->name);
375 printk(KERN_WARNING "Please see the file "
376 "Documentation/feature-removal-schedule.txt "
377 "in the kernel source tree for more details.\n");
378 } 375 }
379 } 376 }
380 377
@@ -2293,12 +2290,17 @@ static void layout_symtab(struct module *mod, struct load_info *info)
2293 src = (void *)info->hdr + symsect->sh_offset; 2290 src = (void *)info->hdr + symsect->sh_offset;
2294 nsrc = symsect->sh_size / sizeof(*src); 2291 nsrc = symsect->sh_size / sizeof(*src);
2295 2292
2293 /* strtab always starts with a nul, so offset 0 is the empty string. */
2294 strtab_size = 1;
2295
2296 /* Compute total space required for the core symbols' strtab. */ 2296 /* Compute total space required for the core symbols' strtab. */
2297 for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src) 2297 for (ndst = i = 0; i < nsrc; i++) {
2298 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { 2298 if (i == 0 ||
2299 strtab_size += strlen(&info->strtab[src->st_name]) + 1; 2299 is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
2300 strtab_size += strlen(&info->strtab[src[i].st_name])+1;
2300 ndst++; 2301 ndst++;
2301 } 2302 }
2303 }
2302 2304
2303 /* Append room for core symbols at end of core part. */ 2305 /* Append room for core symbols at end of core part. */
2304 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); 2306 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
@@ -2332,15 +2334,15 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
2332 mod->core_symtab = dst = mod->module_core + info->symoffs; 2334 mod->core_symtab = dst = mod->module_core + info->symoffs;
2333 mod->core_strtab = s = mod->module_core + info->stroffs; 2335 mod->core_strtab = s = mod->module_core + info->stroffs;
2334 src = mod->symtab; 2336 src = mod->symtab;
2335 *dst = *src;
2336 *s++ = 0; 2337 *s++ = 0;
2337 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { 2338 for (ndst = i = 0; i < mod->num_symtab; i++) {
2338 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) 2339 if (i == 0 ||
2339 continue; 2340 is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
2340 2341 dst[ndst] = src[i];
2341 dst[ndst] = *src; 2342 dst[ndst++].st_name = s - mod->core_strtab;
2342 dst[ndst++].st_name = s - mod->core_strtab; 2343 s += strlcpy(s, &mod->strtab[src[i].st_name],
2343 s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1; 2344 KSYM_NAME_LEN) + 1;
2345 }
2344 } 2346 }
2345 mod->core_num_syms = ndst; 2347 mod->core_num_syms = ndst;
2346} 2348}
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index ea1b1df5dbb0..f2970bddc5ea 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -27,13 +27,13 @@
27 * - Information block 27 * - Information block
28 */ 28 */
29struct module_signature { 29struct module_signature {
30 enum pkey_algo algo : 8; /* Public-key crypto algorithm */ 30 u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */
31 enum pkey_hash_algo hash : 8; /* Digest algorithm */ 31 u8 hash; /* Digest algorithm [enum pkey_hash_algo] */
32 enum pkey_id_type id_type : 8; /* Key identifier type */ 32 u8 id_type; /* Key identifier type [enum pkey_id_type] */
33 u8 signer_len; /* Length of signer's name */ 33 u8 signer_len; /* Length of signer's name */
34 u8 key_id_len; /* Length of key identifier */ 34 u8 key_id_len; /* Length of key identifier */
35 u8 __pad[3]; 35 u8 __pad[3];
36 __be32 sig_len; /* Length of signature data */ 36 __be32 sig_len; /* Length of signature data */
37}; 37};
38 38
39/* 39/*
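
Replacing the enum bitfields in struct module_signature with plain u8 fields pins down the on-disk layout: how a compiler packs an 8-bit enum bitfield (and whether it is signed) is implementation-defined, while u8 is unambiguous. A small user-space mirror of the fixed layout with a compile-time size check; the 12-byte figure is an assumption derived from the fields shown above, not stated in this diff:

#include <stdint.h>

struct module_signature_mirror {
        uint8_t  algo;          /* pkey algorithm, now a plain byte */
        uint8_t  hash;          /* digest algorithm */
        uint8_t  id_type;       /* key identifier type */
        uint8_t  signer_len;
        uint8_t  key_id_len;
        uint8_t  pad[3];
        uint32_t sig_len;       /* big-endian length of the signature data */
};

/* 5 bytes + 3 bytes padding + 4 bytes: the size no longer depends on how
 * a given compiler lays out enum-typed bitfields. */
_Static_assert(sizeof(struct module_signature_mirror) == 12,
               "signature info block expected to stay 12 bytes");
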
diff --git a/kernel/padata.c b/kernel/padata.c
index 89fe3d1b9efb..072f4ee4eb89 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -171,7 +171,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
171{ 171{
172 int cpu, num_cpus; 172 int cpu, num_cpus;
173 unsigned int next_nr, next_index; 173 unsigned int next_nr, next_index;
174 struct padata_parallel_queue *queue, *next_queue; 174 struct padata_parallel_queue *next_queue;
175 struct padata_priv *padata; 175 struct padata_priv *padata;
176 struct padata_list *reorder; 176 struct padata_list *reorder;
177 177
@@ -204,8 +204,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
204 goto out; 204 goto out;
205 } 205 }
206 206
207 queue = per_cpu_ptr(pd->pqueue, smp_processor_id()); 207 if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) {
208 if (queue->cpu_index == next_queue->cpu_index) {
209 padata = ERR_PTR(-ENODATA); 208 padata = ERR_PTR(-ENODATA);
210 goto out; 209 goto out;
211 } 210 }
diff --git a/kernel/pid.c b/kernel/pid.c
index 3026ddae0a34..36aa02ff17d6 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * Generic pidhash and scalable, time-bounded PID allocator 2 * Generic pidhash and scalable, time-bounded PID allocator
3 * 3 *
4 * (C) 2002-2003 William Irwin, IBM 4 * (C) 2002-2003 Nadia Yvette Chambers, IBM
5 * (C) 2004 William Irwin, Oracle 5 * (C) 2004 Nadia Yvette Chambers, Oracle
6 * (C) 2002-2004 Ingo Molnar, Red Hat 6 * (C) 2002-2004 Ingo Molnar, Red Hat
7 * 7 *
8 * pid-structures are backing objects for tasks sharing a given ID to chain 8 * pid-structures are backing objects for tasks sharing a given ID to chain
@@ -84,21 +84,6 @@ struct pid_namespace init_pid_ns = {
84}; 84};
85EXPORT_SYMBOL_GPL(init_pid_ns); 85EXPORT_SYMBOL_GPL(init_pid_ns);
86 86
87int is_container_init(struct task_struct *tsk)
88{
89 int ret = 0;
90 struct pid *pid;
91
92 rcu_read_lock();
93 pid = task_pid(tsk);
94 if (pid != NULL && pid->numbers[pid->level].nr == 1)
95 ret = 1;
96 rcu_read_unlock();
97
98 return ret;
99}
100EXPORT_SYMBOL(is_container_init);
101
102/* 87/*
103 * Note: disable interrupts while the pidmap_lock is held as an 88 * Note: disable interrupts while the pidmap_lock is held as an
104 * interrupt might come in and do read_lock(&tasklist_lock). 89 * interrupt might come in and do read_lock(&tasklist_lock).
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 125cb67daa21..d73840271dce 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -217,30 +217,6 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
217 return 0; 217 return 0;
218} 218}
219 219
220void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
221{
222 struct signal_struct *sig = tsk->signal;
223 struct task_struct *t;
224
225 times->utime = sig->utime;
226 times->stime = sig->stime;
227 times->sum_exec_runtime = sig->sum_sched_runtime;
228
229 rcu_read_lock();
230 /* make sure we can trust tsk->thread_group list */
231 if (!likely(pid_alive(tsk)))
232 goto out;
233
234 t = tsk;
235 do {
236 times->utime += t->utime;
237 times->stime += t->stime;
238 times->sum_exec_runtime += task_sched_runtime(t);
239 } while_each_thread(tsk, t);
240out:
241 rcu_read_unlock();
242}
243
244static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) 220static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
245{ 221{
246 if (b->utime > a->utime) 222 if (b->utime > a->utime)
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f458238109cc..1c16f9167de1 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -59,7 +59,7 @@ static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
59{ 59{
60 unsigned long val; 60 unsigned long val;
61 61
62 if (strict_strtoul(buf, 10, &val)) 62 if (kstrtoul(buf, 10, &val))
63 return -EINVAL; 63 return -EINVAL;
64 64
65 if (val > 1) 65 if (val > 1)
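
This hunk (and the power/qos.c one below) converts strict_strtoul() callers to kstrtoul(), which keeps the same contract: 0 on success, a negative errno on failure. A minimal sketch of the idiom, assuming only <linux/kernel.h>:

#include <linux/kernel.h>

static int example_parse(const char *buf)
{
        unsigned long val;
        int err = kstrtoul(buf, 10, &val);      /* base 10, as above */

        if (err)
                return err;             /* typically -EINVAL or -ERANGE */
        return val > 1 ? -EINVAL : 0;
}
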
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 87da817f9e13..d5a258b60c6f 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -48,18 +48,7 @@ static int try_to_freeze_tasks(bool user_only)
48 if (p == current || !freeze_task(p)) 48 if (p == current || !freeze_task(p))
49 continue; 49 continue;
50 50
51 /* 51 if (!freezer_should_skip(p))
52 * Now that we've done set_freeze_flag, don't
53 * perturb a task in TASK_STOPPED or TASK_TRACED.
54 * It is "frozen enough". If the task does wake
55 * up, it will immediately call try_to_freeze.
56 *
57 * Because freeze_task() goes through p's scheduler lock, it's
58 * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING
59 * transition can't race with task state testing here.
60 */
61 if (!task_is_stopped_or_traced(p) &&
62 !freezer_should_skip(p))
63 todo++; 52 todo++;
64 } while_each_thread(g, p); 53 } while_each_thread(g, p);
65 read_unlock(&tasklist_lock); 54 read_unlock(&tasklist_lock);
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 846bd42c7ed1..9322ff7eaad6 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -213,6 +213,69 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
213} 213}
214 214
215/** 215/**
216 * pm_qos_flags_remove_req - Remove device PM QoS flags request.
217 * @pqf: Device PM QoS flags set to remove the request from.
218 * @req: Request to remove from the set.
219 */
220static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf,
221 struct pm_qos_flags_request *req)
222{
223 s32 val = 0;
224
225 list_del(&req->node);
226 list_for_each_entry(req, &pqf->list, node)
227 val |= req->flags;
228
229 pqf->effective_flags = val;
230}
231
232/**
233 * pm_qos_update_flags - Update a set of PM QoS flags.
234 * @pqf: Set of flags to update.
235 * @req: Request to add to the set, to modify, or to remove from the set.
236 * @action: Action to take on the set.
237 * @val: Value of the request to add or modify.
238 *
239 * Update the given set of PM QoS flags and call notifiers if the aggregate
240 * value has changed. Returns 1 if the aggregate constraint value has changed,
241 * 0 otherwise.
242 */
243bool pm_qos_update_flags(struct pm_qos_flags *pqf,
244 struct pm_qos_flags_request *req,
245 enum pm_qos_req_action action, s32 val)
246{
247 unsigned long irqflags;
248 s32 prev_value, curr_value;
249
250 spin_lock_irqsave(&pm_qos_lock, irqflags);
251
252 prev_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags;
253
254 switch (action) {
255 case PM_QOS_REMOVE_REQ:
256 pm_qos_flags_remove_req(pqf, req);
257 break;
258 case PM_QOS_UPDATE_REQ:
259 pm_qos_flags_remove_req(pqf, req);
260 case PM_QOS_ADD_REQ:
261 req->flags = val;
262 INIT_LIST_HEAD(&req->node);
263 list_add_tail(&req->node, &pqf->list);
264 pqf->effective_flags |= val;
265 break;
266 default:
267 /* no action */
268 ;
269 }
270
271 curr_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags;
272
273 spin_unlock_irqrestore(&pm_qos_lock, irqflags);
274
275 return prev_value != curr_value;
276}
277
278/**
216 * pm_qos_request - returns current system wide qos expectation 279 * pm_qos_request - returns current system wide qos expectation
217 * @pm_qos_class: identification of which qos value is requested 280 * @pm_qos_class: identification of which qos value is requested
218 * 281 *
@@ -500,7 +563,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
500 } else { 563 } else {
501 ascii_value[count] = '\0'; 564 ascii_value[count] = '\0';
502 } 565 }
503 ret = strict_strtoul(ascii_value, 16, &ulval); 566 ret = kstrtoul(ascii_value, 16, &ulval);
504 if (ret) { 567 if (ret) {
505 pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); 568 pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
506 return -EINVAL; 569 return -EINVAL;
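
pm_qos_update_flags() is the flags-set counterpart of pm_qos_update_target(): the effective value is the OR of all requests in the set, and the return value reports whether that aggregate changed. A minimal sketch of a caller, assuming a pm_qos_flags set whose list is initialised elsewhere; EXAMPLE_FLAG is a hypothetical bit standing in for a real device PM QoS flag:

#include <linux/pm_qos.h>

#define EXAMPLE_FLAG    0x01    /* hypothetical flag bit */

static struct pm_qos_flags example_pqf;         /* .list initialised elsewhere */
static struct pm_qos_flags_request example_req; /* add once, then update/remove */

/* Returns true when the ORed effective_flags value actually changed. */
static bool example_set_flag(bool set)
{
        if (set)
                return pm_qos_update_flags(&example_pqf, &example_req,
                                           PM_QOS_ADD_REQ, EXAMPLE_FLAG);
        return pm_qos_update_flags(&example_pqf, &example_req,
                                   PM_QOS_REMOVE_REQ, 0);
}
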
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 3c9d764eb0d8..7c33ed200410 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -126,7 +126,7 @@ static int swsusp_extents_insert(unsigned long swap_offset)
126 126
127 /* Figure out where to put the new node */ 127 /* Figure out where to put the new node */
128 while (*new) { 128 while (*new) {
129 ext = container_of(*new, struct swsusp_extent, node); 129 ext = rb_entry(*new, struct swsusp_extent, node);
130 parent = *new; 130 parent = *new;
131 if (swap_offset < ext->start) { 131 if (swap_offset < ext->start) {
132 /* Try to merge */ 132 /* Try to merge */
diff --git a/kernel/printk.c b/kernel/printk.c
index 2d607f4d1797..19c0d7bcf24a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -87,6 +87,12 @@ static DEFINE_SEMAPHORE(console_sem);
87struct console *console_drivers; 87struct console *console_drivers;
88EXPORT_SYMBOL_GPL(console_drivers); 88EXPORT_SYMBOL_GPL(console_drivers);
89 89
90#ifdef CONFIG_LOCKDEP
91static struct lockdep_map console_lock_dep_map = {
92 .name = "console_lock"
93};
94#endif
95
90/* 96/*
91 * This is used for debugging the mess that is the VT code by 97 * This is used for debugging the mess that is the VT code by
92 * keeping track if we have the console semaphore held. It's 98 * keeping track if we have the console semaphore held. It's
@@ -741,6 +747,21 @@ void __init setup_log_buf(int early)
741 free, (free * 100) / __LOG_BUF_LEN); 747 free, (free * 100) / __LOG_BUF_LEN);
742} 748}
743 749
750static bool __read_mostly ignore_loglevel;
751
752static int __init ignore_loglevel_setup(char *str)
753{
754 ignore_loglevel = 1;
755 printk(KERN_INFO "debug: ignoring loglevel setting.\n");
756
757 return 0;
758}
759
760early_param("ignore_loglevel", ignore_loglevel_setup);
761module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
762MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
763 "print all kernel messages to the console.");
764
744#ifdef CONFIG_BOOT_PRINTK_DELAY 765#ifdef CONFIG_BOOT_PRINTK_DELAY
745 766
746static int boot_delay; /* msecs delay after each printk during bootup */ 767static int boot_delay; /* msecs delay after each printk during bootup */
@@ -764,13 +785,15 @@ static int __init boot_delay_setup(char *str)
764} 785}
765__setup("boot_delay=", boot_delay_setup); 786__setup("boot_delay=", boot_delay_setup);
766 787
767static void boot_delay_msec(void) 788static void boot_delay_msec(int level)
768{ 789{
769 unsigned long long k; 790 unsigned long long k;
770 unsigned long timeout; 791 unsigned long timeout;
771 792
772 if (boot_delay == 0 || system_state != SYSTEM_BOOTING) 793 if ((boot_delay == 0 || system_state != SYSTEM_BOOTING)
794 || (level >= console_loglevel && !ignore_loglevel)) {
773 return; 795 return;
796 }
774 797
775 k = (unsigned long long)loops_per_msec * boot_delay; 798 k = (unsigned long long)loops_per_msec * boot_delay;
776 799
@@ -789,7 +812,7 @@ static void boot_delay_msec(void)
789 } 812 }
790} 813}
791#else 814#else
792static inline void boot_delay_msec(void) 815static inline void boot_delay_msec(int level)
793{ 816{
794} 817}
795#endif 818#endif
@@ -1232,21 +1255,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
1232 return do_syslog(type, buf, len, SYSLOG_FROM_CALL); 1255 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
1233} 1256}
1234 1257
1235static bool __read_mostly ignore_loglevel;
1236
1237static int __init ignore_loglevel_setup(char *str)
1238{
1239 ignore_loglevel = 1;
1240 printk(KERN_INFO "debug: ignoring loglevel setting.\n");
1241
1242 return 0;
1243}
1244
1245early_param("ignore_loglevel", ignore_loglevel_setup);
1246module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
1247MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
1248 "print all kernel messages to the console.");
1249
1250/* 1258/*
1251 * Call the console drivers, asking them to write out 1259 * Call the console drivers, asking them to write out
1252 * log_buf[start] to log_buf[end - 1]. 1260 * log_buf[start] to log_buf[end - 1].
@@ -1492,7 +1500,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1492 int this_cpu; 1500 int this_cpu;
1493 int printed_len = 0; 1501 int printed_len = 0;
1494 1502
1495 boot_delay_msec(); 1503 boot_delay_msec(level);
1496 printk_delay(); 1504 printk_delay();
1497 1505
1498 /* This stops the holder of console_sem just where we want him */ 1506 /* This stops the holder of console_sem just where we want him */
@@ -1908,12 +1916,14 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
1908 */ 1916 */
1909void console_lock(void) 1917void console_lock(void)
1910{ 1918{
1911 BUG_ON(in_interrupt()); 1919 might_sleep();
1920
1912 down(&console_sem); 1921 down(&console_sem);
1913 if (console_suspended) 1922 if (console_suspended)
1914 return; 1923 return;
1915 console_locked = 1; 1924 console_locked = 1;
1916 console_may_schedule = 1; 1925 console_may_schedule = 1;
1926 mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);
1917} 1927}
1918EXPORT_SYMBOL(console_lock); 1928EXPORT_SYMBOL(console_lock);
1919 1929
@@ -1935,6 +1945,7 @@ int console_trylock(void)
1935 } 1945 }
1936 console_locked = 1; 1946 console_locked = 1;
1937 console_may_schedule = 0; 1947 console_may_schedule = 0;
1948 mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_);
1938 return 1; 1949 return 1;
1939} 1950}
1940EXPORT_SYMBOL(console_trylock); 1951EXPORT_SYMBOL(console_trylock);
@@ -2095,6 +2106,7 @@ skip:
2095 local_irq_restore(flags); 2106 local_irq_restore(flags);
2096 } 2107 }
2097 console_locked = 0; 2108 console_locked = 0;
2109 mutex_release(&console_lock_dep_map, 1, _RET_IP_);
2098 2110
2099 /* Release the exclusive_console once it is used */ 2111 /* Release the exclusive_console once it is used */
2100 if (unlikely(exclusive_console)) 2112 if (unlikely(exclusive_console))
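The console_lock()/console_trylock()/console_unlock() hunks above replace the BUG_ON(in_interrupt()) check with might_sleep() and teach lockdep to treat the console semaphore like a mutex. The annotation pattern, reduced to a self-contained sketch with assumed demo_* names:

#include <linux/kernel.h>
#include <linux/lockdep.h>
#include <linux/semaphore.h>

static DEFINE_SEMAPHORE(demo_sem);

#ifdef CONFIG_LOCKDEP
static struct lockdep_map demo_lock_dep_map = {
	.name = "demo_lock"
};
#endif

static void demo_lock(void)
{
	might_sleep();			/* acquiring may block */
	down(&demo_sem);
	/* Tell lockdep we now hold the "lock" (trylock argument = 0). */
	mutex_acquire(&demo_lock_dep_map, 0, 0, _RET_IP_);
}

static void demo_unlock(void)
{
	mutex_release(&demo_lock_dep_map, 1, _RET_IP_);
	up(&demo_sem);
}

With the annotation in place, lockdep can report deadlocks and sleep-in-atomic misuse involving the semaphore, which a bare down()/up() pair would hide.
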
diff --git a/kernel/profile.c b/kernel/profile.c
index 76b8e77773ee..1f391819c42f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -8,9 +8,10 @@
8 * Scheduler profiling support, Arjan van de Ven and Ingo Molnar, 8 * Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
9 * Red Hat, July 2004 9 * Red Hat, July 2004
10 * Consolidation of architecture support code for profiling, 10 * Consolidation of architecture support code for profiling,
11 * William Irwin, Oracle, July 2004 11 * Nadia Yvette Chambers, Oracle, July 2004
12 * Amortized hit count accounting via per-cpu open-addressed hashtables 12 * Amortized hit count accounting via per-cpu open-addressed hashtables
13 * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 13 * to resolve timer interrupt livelocks, Nadia Yvette Chambers,
14 * Oracle, 2004
14 */ 15 */
15 16
16#include <linux/export.h> 17#include <linux/export.h>
@@ -256,7 +257,7 @@ EXPORT_SYMBOL_GPL(unregister_timer_hook);
256 * pagetable hash functions, but uses a full hashtable full of finite 257 * pagetable hash functions, but uses a full hashtable full of finite
257 * collision chains, not just pairs of them. 258 * collision chains, not just pairs of them.
258 * 259 *
259 * -- wli 260 * -- nyc
260 */ 261 */
261static void __profile_flip_buffers(void *unused) 262static void __profile_flip_buffers(void *unused)
262{ 263{
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 7b09b88862cc..1599157336a6 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -463,6 +463,9 @@ void exit_ptrace(struct task_struct *tracer)
463 return; 463 return;
464 464
465 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { 465 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
466 if (unlikely(p->ptrace & PT_EXITKILL))
467 send_sig_info(SIGKILL, SEND_SIG_FORCED, p);
468
466 if (__ptrace_detach(tracer, p)) 469 if (__ptrace_detach(tracer, p))
467 list_add(&p->ptrace_entry, &ptrace_dead); 470 list_add(&p->ptrace_entry, &ptrace_dead);
468 } 471 }
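The hunk above has exit_ptrace() send SIGKILL to any tracee whose tracer set PT_EXITKILL before detaching it, so debugged processes do not outlive a crashed debugger. User space requests this with the matching ptrace option; a hedged sketch of a tracer using it, assuming the PTRACE_O_EXITKILL option introduced alongside this kernel change (value 0x00100000):

#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/wait.h>

#ifndef PTRACE_O_EXITKILL
#define PTRACE_O_EXITKILL 0x00100000	/* assumed: may be missing from older libc headers */
#endif

/* Attach to pid and ask the kernel to kill it if this tracer dies. */
static int demo_attach_exitkill(pid_t pid)
{
	if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1)
		return -1;
	if (waitpid(pid, NULL, 0) == -1)	/* wait for the attach stop */
		return -1;
	return ptrace(PTRACE_SETOPTIONS, pid, NULL,
		      (void *)PTRACE_O_EXITKILL);
}
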
diff --git a/kernel/rcu.h b/kernel/rcu.h
index 8ba99cdc6515..20dfba576c2b 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -109,4 +109,6 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head)
109 } 109 }
110} 110}
111 111
112extern int rcu_expedited;
113
112#endif /* __LINUX_RCU_H */ 114#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 29ca1c6da594..a2cf76177b44 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -46,12 +46,15 @@
46#include <linux/export.h> 46#include <linux/export.h>
47#include <linux/hardirq.h> 47#include <linux/hardirq.h>
48#include <linux/delay.h> 48#include <linux/delay.h>
49#include <linux/module.h>
49 50
50#define CREATE_TRACE_POINTS 51#define CREATE_TRACE_POINTS
51#include <trace/events/rcu.h> 52#include <trace/events/rcu.h>
52 53
53#include "rcu.h" 54#include "rcu.h"
54 55
56module_param(rcu_expedited, int, 0);
57
55#ifdef CONFIG_PREEMPT_RCU 58#ifdef CONFIG_PREEMPT_RCU
56 59
57/* 60/*
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index e4c6a598d6f7..e7dce58f9c2a 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -195,7 +195,7 @@ EXPORT_SYMBOL(rcu_is_cpu_idle);
195 */ 195 */
196int rcu_is_cpu_rrupt_from_idle(void) 196int rcu_is_cpu_rrupt_from_idle(void)
197{ 197{
198 return rcu_dynticks_nesting <= 0; 198 return rcu_dynticks_nesting <= 1;
199} 199}
200 200
201/* 201/*
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 3d0190282204..f85016a2309b 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -706,7 +706,10 @@ void synchronize_rcu(void)
706 return; 706 return;
707 707
708 /* Once we get past the fastpath checks, same code as rcu_barrier(). */ 708 /* Once we get past the fastpath checks, same code as rcu_barrier(). */
709 rcu_barrier(); 709 if (rcu_expedited)
710 synchronize_rcu_expedited();
711 else
712 rcu_barrier();
710} 713}
711EXPORT_SYMBOL_GPL(synchronize_rcu); 714EXPORT_SYMBOL_GPL(synchronize_rcu);
712 715
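The rcu.h, rcupdate.c, and rcutiny_plugin.h hunks above add one global rcu_expedited knob: declared extern in the shared header, defined and registered as a module parameter in rcupdate.c, and consulted at the synchronization call sites. The shape of that pattern as a sketch with assumed demo_ names:

#include <linux/module.h>
#include <linux/rcupdate.h>

/* In a shared private header: extern int demo_expedited; */

int demo_expedited;			/* 0 = normal grace periods */
module_param(demo_expedited, int, 0);	/* boot-time only at permission 0,
					 * typically <modname>.demo_expedited=1 */

void demo_synchronize(void)
{
	if (demo_expedited)
		synchronize_rcu_expedited();	/* low latency, more disruptive */
	else
		synchronize_rcu();		/* ordinary grace period */
}
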
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index aaa7b9f3532a..31dea01c85fd 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -339,7 +339,6 @@ rcu_stutter_wait(char *title)
339 339
340struct rcu_torture_ops { 340struct rcu_torture_ops {
341 void (*init)(void); 341 void (*init)(void);
342 void (*cleanup)(void);
343 int (*readlock)(void); 342 int (*readlock)(void);
344 void (*read_delay)(struct rcu_random_state *rrsp); 343 void (*read_delay)(struct rcu_random_state *rrsp);
345 void (*readunlock)(int idx); 344 void (*readunlock)(int idx);
@@ -431,7 +430,6 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
431 430
432static struct rcu_torture_ops rcu_ops = { 431static struct rcu_torture_ops rcu_ops = {
433 .init = NULL, 432 .init = NULL,
434 .cleanup = NULL,
435 .readlock = rcu_torture_read_lock, 433 .readlock = rcu_torture_read_lock,
436 .read_delay = rcu_read_delay, 434 .read_delay = rcu_read_delay,
437 .readunlock = rcu_torture_read_unlock, 435 .readunlock = rcu_torture_read_unlock,
@@ -475,7 +473,6 @@ static void rcu_sync_torture_init(void)
475 473
476static struct rcu_torture_ops rcu_sync_ops = { 474static struct rcu_torture_ops rcu_sync_ops = {
477 .init = rcu_sync_torture_init, 475 .init = rcu_sync_torture_init,
478 .cleanup = NULL,
479 .readlock = rcu_torture_read_lock, 476 .readlock = rcu_torture_read_lock,
480 .read_delay = rcu_read_delay, 477 .read_delay = rcu_read_delay,
481 .readunlock = rcu_torture_read_unlock, 478 .readunlock = rcu_torture_read_unlock,
@@ -493,7 +490,6 @@ static struct rcu_torture_ops rcu_sync_ops = {
493 490
494static struct rcu_torture_ops rcu_expedited_ops = { 491static struct rcu_torture_ops rcu_expedited_ops = {
495 .init = rcu_sync_torture_init, 492 .init = rcu_sync_torture_init,
496 .cleanup = NULL,
497 .readlock = rcu_torture_read_lock, 493 .readlock = rcu_torture_read_lock,
498 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 494 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
499 .readunlock = rcu_torture_read_unlock, 495 .readunlock = rcu_torture_read_unlock,
@@ -536,7 +532,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
536 532
537static struct rcu_torture_ops rcu_bh_ops = { 533static struct rcu_torture_ops rcu_bh_ops = {
538 .init = NULL, 534 .init = NULL,
539 .cleanup = NULL,
540 .readlock = rcu_bh_torture_read_lock, 535 .readlock = rcu_bh_torture_read_lock,
541 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 536 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
542 .readunlock = rcu_bh_torture_read_unlock, 537 .readunlock = rcu_bh_torture_read_unlock,
@@ -553,7 +548,6 @@ static struct rcu_torture_ops rcu_bh_ops = {
553 548
554static struct rcu_torture_ops rcu_bh_sync_ops = { 549static struct rcu_torture_ops rcu_bh_sync_ops = {
555 .init = rcu_sync_torture_init, 550 .init = rcu_sync_torture_init,
556 .cleanup = NULL,
557 .readlock = rcu_bh_torture_read_lock, 551 .readlock = rcu_bh_torture_read_lock,
558 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 552 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
559 .readunlock = rcu_bh_torture_read_unlock, 553 .readunlock = rcu_bh_torture_read_unlock,
@@ -570,7 +564,6 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
570 564
571static struct rcu_torture_ops rcu_bh_expedited_ops = { 565static struct rcu_torture_ops rcu_bh_expedited_ops = {
572 .init = rcu_sync_torture_init, 566 .init = rcu_sync_torture_init,
573 .cleanup = NULL,
574 .readlock = rcu_bh_torture_read_lock, 567 .readlock = rcu_bh_torture_read_lock,
575 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 568 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
576 .readunlock = rcu_bh_torture_read_unlock, 569 .readunlock = rcu_bh_torture_read_unlock,
@@ -589,19 +582,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = {
589 * Definitions for srcu torture testing. 582 * Definitions for srcu torture testing.
590 */ 583 */
591 584
592static struct srcu_struct srcu_ctl; 585DEFINE_STATIC_SRCU(srcu_ctl);
593
594static void srcu_torture_init(void)
595{
596 init_srcu_struct(&srcu_ctl);
597 rcu_sync_torture_init();
598}
599
600static void srcu_torture_cleanup(void)
601{
602 synchronize_srcu(&srcu_ctl);
603 cleanup_srcu_struct(&srcu_ctl);
604}
605 586
606static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) 587static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
607{ 588{
@@ -672,8 +653,7 @@ static int srcu_torture_stats(char *page)
672} 653}
673 654
674static struct rcu_torture_ops srcu_ops = { 655static struct rcu_torture_ops srcu_ops = {
675 .init = srcu_torture_init, 656 .init = rcu_sync_torture_init,
676 .cleanup = srcu_torture_cleanup,
677 .readlock = srcu_torture_read_lock, 657 .readlock = srcu_torture_read_lock,
678 .read_delay = srcu_read_delay, 658 .read_delay = srcu_read_delay,
679 .readunlock = srcu_torture_read_unlock, 659 .readunlock = srcu_torture_read_unlock,
@@ -687,8 +667,7 @@ static struct rcu_torture_ops srcu_ops = {
687}; 667};
688 668
689static struct rcu_torture_ops srcu_sync_ops = { 669static struct rcu_torture_ops srcu_sync_ops = {
690 .init = srcu_torture_init, 670 .init = rcu_sync_torture_init,
691 .cleanup = srcu_torture_cleanup,
692 .readlock = srcu_torture_read_lock, 671 .readlock = srcu_torture_read_lock,
693 .read_delay = srcu_read_delay, 672 .read_delay = srcu_read_delay,
694 .readunlock = srcu_torture_read_unlock, 673 .readunlock = srcu_torture_read_unlock,
@@ -712,8 +691,7 @@ static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
712} 691}
713 692
714static struct rcu_torture_ops srcu_raw_ops = { 693static struct rcu_torture_ops srcu_raw_ops = {
715 .init = srcu_torture_init, 694 .init = rcu_sync_torture_init,
716 .cleanup = srcu_torture_cleanup,
717 .readlock = srcu_torture_read_lock_raw, 695 .readlock = srcu_torture_read_lock_raw,
718 .read_delay = srcu_read_delay, 696 .read_delay = srcu_read_delay,
719 .readunlock = srcu_torture_read_unlock_raw, 697 .readunlock = srcu_torture_read_unlock_raw,
@@ -727,8 +705,7 @@ static struct rcu_torture_ops srcu_raw_ops = {
727}; 705};
728 706
729static struct rcu_torture_ops srcu_raw_sync_ops = { 707static struct rcu_torture_ops srcu_raw_sync_ops = {
730 .init = srcu_torture_init, 708 .init = rcu_sync_torture_init,
731 .cleanup = srcu_torture_cleanup,
732 .readlock = srcu_torture_read_lock_raw, 709 .readlock = srcu_torture_read_lock_raw,
733 .read_delay = srcu_read_delay, 710 .read_delay = srcu_read_delay,
734 .readunlock = srcu_torture_read_unlock_raw, 711 .readunlock = srcu_torture_read_unlock_raw,
@@ -747,8 +724,7 @@ static void srcu_torture_synchronize_expedited(void)
747} 724}
748 725
749static struct rcu_torture_ops srcu_expedited_ops = { 726static struct rcu_torture_ops srcu_expedited_ops = {
750 .init = srcu_torture_init, 727 .init = rcu_sync_torture_init,
751 .cleanup = srcu_torture_cleanup,
752 .readlock = srcu_torture_read_lock, 728 .readlock = srcu_torture_read_lock,
753 .read_delay = srcu_read_delay, 729 .read_delay = srcu_read_delay,
754 .readunlock = srcu_torture_read_unlock, 730 .readunlock = srcu_torture_read_unlock,
@@ -783,7 +759,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
783 759
784static struct rcu_torture_ops sched_ops = { 760static struct rcu_torture_ops sched_ops = {
785 .init = rcu_sync_torture_init, 761 .init = rcu_sync_torture_init,
786 .cleanup = NULL,
787 .readlock = sched_torture_read_lock, 762 .readlock = sched_torture_read_lock,
788 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 763 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
789 .readunlock = sched_torture_read_unlock, 764 .readunlock = sched_torture_read_unlock,
@@ -799,7 +774,6 @@ static struct rcu_torture_ops sched_ops = {
799 774
800static struct rcu_torture_ops sched_sync_ops = { 775static struct rcu_torture_ops sched_sync_ops = {
801 .init = rcu_sync_torture_init, 776 .init = rcu_sync_torture_init,
802 .cleanup = NULL,
803 .readlock = sched_torture_read_lock, 777 .readlock = sched_torture_read_lock,
804 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 778 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
805 .readunlock = sched_torture_read_unlock, 779 .readunlock = sched_torture_read_unlock,
@@ -814,7 +788,6 @@ static struct rcu_torture_ops sched_sync_ops = {
814 788
815static struct rcu_torture_ops sched_expedited_ops = { 789static struct rcu_torture_ops sched_expedited_ops = {
816 .init = rcu_sync_torture_init, 790 .init = rcu_sync_torture_init,
817 .cleanup = NULL,
818 .readlock = sched_torture_read_lock, 791 .readlock = sched_torture_read_lock,
819 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 792 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
820 .readunlock = sched_torture_read_unlock, 793 .readunlock = sched_torture_read_unlock,
@@ -1396,12 +1369,16 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1396 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1369 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1397 "test_boost=%d/%d test_boost_interval=%d " 1370 "test_boost=%d/%d test_boost_interval=%d "
1398 "test_boost_duration=%d shutdown_secs=%d " 1371 "test_boost_duration=%d shutdown_secs=%d "
1372 "stall_cpu=%d stall_cpu_holdoff=%d "
1373 "n_barrier_cbs=%d "
1399 "onoff_interval=%d onoff_holdoff=%d\n", 1374 "onoff_interval=%d onoff_holdoff=%d\n",
1400 torture_type, tag, nrealreaders, nfakewriters, 1375 torture_type, tag, nrealreaders, nfakewriters,
1401 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1376 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1402 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, 1377 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1403 test_boost, cur_ops->can_boost, 1378 test_boost, cur_ops->can_boost,
1404 test_boost_interval, test_boost_duration, shutdown_secs, 1379 test_boost_interval, test_boost_duration, shutdown_secs,
1380 stall_cpu, stall_cpu_holdoff,
1381 n_barrier_cbs,
1405 onoff_interval, onoff_holdoff); 1382 onoff_interval, onoff_holdoff);
1406} 1383}
1407 1384
@@ -1502,6 +1479,7 @@ rcu_torture_onoff(void *arg)
1502 unsigned long delta; 1479 unsigned long delta;
1503 int maxcpu = -1; 1480 int maxcpu = -1;
1504 DEFINE_RCU_RANDOM(rand); 1481 DEFINE_RCU_RANDOM(rand);
1482 int ret;
1505 unsigned long starttime; 1483 unsigned long starttime;
1506 1484
1507 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); 1485 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
@@ -1522,7 +1500,13 @@ rcu_torture_onoff(void *arg)
1522 torture_type, cpu); 1500 torture_type, cpu);
1523 starttime = jiffies; 1501 starttime = jiffies;
1524 n_offline_attempts++; 1502 n_offline_attempts++;
1525 if (cpu_down(cpu) == 0) { 1503 ret = cpu_down(cpu);
1504 if (ret) {
1505 if (verbose)
1506 pr_alert("%s" TORTURE_FLAG
1507 "rcu_torture_onoff task: offline %d failed: errno %d\n",
1508 torture_type, cpu, ret);
1509 } else {
1526 if (verbose) 1510 if (verbose)
1527 pr_alert("%s" TORTURE_FLAG 1511 pr_alert("%s" TORTURE_FLAG
1528 "rcu_torture_onoff task: offlined %d\n", 1512 "rcu_torture_onoff task: offlined %d\n",
@@ -1936,8 +1920,6 @@ rcu_torture_cleanup(void)
1936 1920
1937 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 1921 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
1938 1922
1939 if (cur_ops->cleanup)
1940 cur_ops->cleanup();
1941 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) 1923 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
1942 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1924 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1943 else if (n_online_successes != n_online_attempts || 1925 else if (n_online_successes != n_online_attempts ||
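Most of the churn above falls out of one change: srcu_ctl is now created with DEFINE_STATIC_SRCU(), which is fully initialized at build time, so the per-ops ->cleanup hook and the init/cleanup helpers have nothing left to do. A sketch contrasting the two styles (demo_ names assumed):

#include <linux/srcu.h>

/* New style: static definition, nothing to initialize or tear down. */
DEFINE_STATIC_SRCU(demo_srcu);

/* Old style: runtime init paired with cleanup. */
static struct srcu_struct demo_srcu_dyn;

static int demo_old_init(void)
{
	return init_srcu_struct(&demo_srcu_dyn);
}

static void demo_old_cleanup(void)
{
	synchronize_srcu(&demo_srcu_dyn);	/* flush readers first */
	cleanup_srcu_struct(&demo_srcu_dyn);
}

/* Readers look the same either way. */
static void demo_reader(void)
{
	int idx = srcu_read_lock(&demo_srcu);
	/* ... access SRCU-protected data ... */
	srcu_read_unlock(&demo_srcu, idx);
}
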
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 74df86bd9204..e441b77b614e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -68,9 +68,9 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
68 .level = { &sname##_state.node[0] }, \ 68 .level = { &sname##_state.node[0] }, \
69 .call = cr, \ 69 .call = cr, \
70 .fqs_state = RCU_GP_IDLE, \ 70 .fqs_state = RCU_GP_IDLE, \
71 .gpnum = -300, \ 71 .gpnum = 0UL - 300UL, \
72 .completed = -300, \ 72 .completed = 0UL - 300UL, \
73 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \ 73 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
74 .orphan_nxttail = &sname##_state.orphan_nxtlist, \ 74 .orphan_nxttail = &sname##_state.orphan_nxtlist, \
75 .orphan_donetail = &sname##_state.orphan_donelist, \ 75 .orphan_donetail = &sname##_state.orphan_donelist, \
76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
@@ -207,18 +207,15 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
207DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 207DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
208 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 208 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
209 .dynticks = ATOMIC_INIT(1), 209 .dynticks = ATOMIC_INIT(1),
210#if defined(CONFIG_RCU_USER_QS) && !defined(CONFIG_RCU_USER_QS_FORCE)
211 .ignore_user_qs = true,
212#endif
213}; 210};
214 211
215static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 212static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
216static int qhimark = 10000; /* If this many pending, ignore blimit. */ 213static long qhimark = 10000; /* If this many pending, ignore blimit. */
217static int qlowmark = 100; /* Once only this many pending, use blimit. */ 214static long qlowmark = 100; /* Once only this many pending, use blimit. */
218 215
219module_param(blimit, int, 0444); 216module_param(blimit, long, 0444);
220module_param(qhimark, int, 0444); 217module_param(qhimark, long, 0444);
221module_param(qlowmark, int, 0444); 218module_param(qlowmark, long, 0444);
222 219
223int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ 220int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
224int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; 221int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
@@ -303,7 +300,8 @@ EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
303static int 300static int
304cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) 301cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
305{ 302{
306 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]; 303 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] &&
304 rdp->nxttail[RCU_DONE_TAIL] != NULL;
307} 305}
308 306
309/* 307/*
@@ -312,8 +310,11 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
312static int 310static int
313cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 311cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
314{ 312{
315 return *rdp->nxttail[RCU_DONE_TAIL + 313 struct rcu_head **ntp;
316 ACCESS_ONCE(rsp->completed) != rdp->completed] && 314
315 ntp = rdp->nxttail[RCU_DONE_TAIL +
316 (ACCESS_ONCE(rsp->completed) != rdp->completed)];
317 return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp &&
317 !rcu_gp_in_progress(rsp); 318 !rcu_gp_in_progress(rsp);
318} 319}
319 320
@@ -416,29 +417,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
416 */ 417 */
417void rcu_user_enter(void) 418void rcu_user_enter(void)
418{ 419{
419 unsigned long flags; 420 rcu_eqs_enter(1);
420 struct rcu_dynticks *rdtp;
421
422 /*
423 * Some contexts may involve an exception occuring in an irq,
424 * leading to that nesting:
425 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
426 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
427 * helpers are enough to protect RCU uses inside the exception. So
428 * just return immediately if we detect we are in an IRQ.
429 */
430 if (in_interrupt())
431 return;
432
433 WARN_ON_ONCE(!current->mm);
434
435 local_irq_save(flags);
436 rdtp = &__get_cpu_var(rcu_dynticks);
437 if (!rdtp->ignore_user_qs && !rdtp->in_user) {
438 rdtp->in_user = true;
439 rcu_eqs_enter(true);
440 }
441 local_irq_restore(flags);
442} 421}
443 422
444/** 423/**
@@ -575,27 +554,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit);
575 */ 554 */
576void rcu_user_exit(void) 555void rcu_user_exit(void)
577{ 556{
578 unsigned long flags; 557 rcu_eqs_exit(1);
579 struct rcu_dynticks *rdtp;
580
581 /*
582 * Some contexts may involve an exception occuring in an irq,
583 * leading to that nesting:
584 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
585 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
586 * helpers are enough to protect RCU uses inside the exception. So
587 * just return immediately if we detect we are in an IRQ.
588 */
589 if (in_interrupt())
590 return;
591
592 local_irq_save(flags);
593 rdtp = &__get_cpu_var(rcu_dynticks);
594 if (rdtp->in_user) {
595 rdtp->in_user = false;
596 rcu_eqs_exit(true);
597 }
598 local_irq_restore(flags);
599} 558}
600 559
601/** 560/**
@@ -718,21 +677,6 @@ int rcu_is_cpu_idle(void)
718} 677}
719EXPORT_SYMBOL(rcu_is_cpu_idle); 678EXPORT_SYMBOL(rcu_is_cpu_idle);
720 679
721#ifdef CONFIG_RCU_USER_QS
722void rcu_user_hooks_switch(struct task_struct *prev,
723 struct task_struct *next)
724{
725 struct rcu_dynticks *rdtp;
726
727 /* Interrupts are disabled in context switch */
728 rdtp = &__get_cpu_var(rcu_dynticks);
729 if (!rdtp->ignore_user_qs) {
730 clear_tsk_thread_flag(prev, TIF_NOHZ);
731 set_tsk_thread_flag(next, TIF_NOHZ);
732 }
733}
734#endif /* #ifdef CONFIG_RCU_USER_QS */
735
736#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) 680#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
737 681
738/* 682/*
@@ -873,6 +817,29 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
873 rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); 817 rsp->jiffies_stall = jiffies + jiffies_till_stall_check();
874} 818}
875 819
820/*
821 * Dump stacks of all tasks running on stalled CPUs. This is a fallback
822 * for architectures that do not implement trigger_all_cpu_backtrace().
823 * The NMI-triggered stack traces are more accurate because they are
824 * printed by the target CPU.
825 */
826static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
827{
828 int cpu;
829 unsigned long flags;
830 struct rcu_node *rnp;
831
832 rcu_for_each_leaf_node(rsp, rnp) {
833 raw_spin_lock_irqsave(&rnp->lock, flags);
834 if (rnp->qsmask != 0) {
835 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
836 if (rnp->qsmask & (1UL << cpu))
837 dump_cpu_task(rnp->grplo + cpu);
838 }
839 raw_spin_unlock_irqrestore(&rnp->lock, flags);
840 }
841}
842
876static void print_other_cpu_stall(struct rcu_state *rsp) 843static void print_other_cpu_stall(struct rcu_state *rsp)
877{ 844{
878 int cpu; 845 int cpu;
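rcu_dump_cpu_stacks(), added above, decodes each leaf rcu_node's ->qsmask: bit n stands for CPU rnp->grplo + n, and a set bit names a CPU that has not yet reported a quiescent state. The same decoding as a tiny helper, for illustration (demo_ name assumed):

/* Caller holds rnp->lock, as in the hunk above. */
static void demo_for_each_stalled_cpu(struct rcu_node *rnp, void (*fn)(int))
{
	int bit;

	for (bit = 0; bit <= rnp->grphi - rnp->grplo; bit++)
		if (rnp->qsmask & (1UL << bit))
			fn(rnp->grplo + bit);	/* e.g. dump_cpu_task() */
}
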
@@ -880,6 +847,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
880 unsigned long flags; 847 unsigned long flags;
881 int ndetected = 0; 848 int ndetected = 0;
882 struct rcu_node *rnp = rcu_get_root(rsp); 849 struct rcu_node *rnp = rcu_get_root(rsp);
850 long totqlen = 0;
883 851
884 /* Only let one CPU complain about others per time interval. */ 852 /* Only let one CPU complain about others per time interval. */
885 853
@@ -924,12 +892,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
924 raw_spin_unlock_irqrestore(&rnp->lock, flags); 892 raw_spin_unlock_irqrestore(&rnp->lock, flags);
925 893
926 print_cpu_stall_info_end(); 894 print_cpu_stall_info_end();
927 printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n", 895 for_each_possible_cpu(cpu)
928 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 896 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
897 pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n",
898 smp_processor_id(), (long)(jiffies - rsp->gp_start),
899 rsp->gpnum, rsp->completed, totqlen);
929 if (ndetected == 0) 900 if (ndetected == 0)
930 printk(KERN_ERR "INFO: Stall ended before state dump start\n"); 901 printk(KERN_ERR "INFO: Stall ended before state dump start\n");
931 else if (!trigger_all_cpu_backtrace()) 902 else if (!trigger_all_cpu_backtrace())
932 dump_stack(); 903 rcu_dump_cpu_stacks(rsp);
933 904
934 /* Complain about tasks blocking the grace period. */ 905 /* Complain about tasks blocking the grace period. */
935 906
@@ -940,8 +911,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
940 911
941static void print_cpu_stall(struct rcu_state *rsp) 912static void print_cpu_stall(struct rcu_state *rsp)
942{ 913{
914 int cpu;
943 unsigned long flags; 915 unsigned long flags;
944 struct rcu_node *rnp = rcu_get_root(rsp); 916 struct rcu_node *rnp = rcu_get_root(rsp);
917 long totqlen = 0;
945 918
946 /* 919 /*
947 * OK, time to rat on ourselves... 920 * OK, time to rat on ourselves...
@@ -952,7 +925,10 @@ static void print_cpu_stall(struct rcu_state *rsp)
952 print_cpu_stall_info_begin(); 925 print_cpu_stall_info_begin();
953 print_cpu_stall_info(rsp, smp_processor_id()); 926 print_cpu_stall_info(rsp, smp_processor_id());
954 print_cpu_stall_info_end(); 927 print_cpu_stall_info_end();
955 printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start); 928 for_each_possible_cpu(cpu)
929 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
930 pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n",
931 jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen);
956 if (!trigger_all_cpu_backtrace()) 932 if (!trigger_all_cpu_backtrace())
957 dump_stack(); 933 dump_stack();
958 934
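Both stall printers above now append a callback-queue total to their message. The tally is a plain walk over every possible CPU's rcu_data, shown here in isolation (not the kernel's exact code):

static long demo_total_qlen(struct rcu_state *rsp)
{
	int cpu;
	long totqlen = 0;

	/* "possible", not "online": offline CPUs can still own callbacks. */
	for_each_possible_cpu(cpu)
		totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
	return totqlen;
}
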
@@ -1091,6 +1067,7 @@ static void init_callback_list(struct rcu_data *rdp)
1091 rdp->nxtlist = NULL; 1067 rdp->nxtlist = NULL;
1092 for (i = 0; i < RCU_NEXT_SIZE; i++) 1068 for (i = 0; i < RCU_NEXT_SIZE; i++)
1093 rdp->nxttail[i] = &rdp->nxtlist; 1069 rdp->nxttail[i] = &rdp->nxtlist;
1070 init_nocb_callback_list(rdp);
1094} 1071}
1095 1072
1096/* 1073/*
@@ -1404,15 +1381,37 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
1404 !cpu_needs_another_gp(rsp, rdp)) { 1381 !cpu_needs_another_gp(rsp, rdp)) {
1405 /* 1382 /*
1406 * Either we have not yet spawned the grace-period 1383 * Either we have not yet spawned the grace-period
1407 * task or this CPU does not need another grace period. 1384 * task, this CPU does not need another grace period,
1385 * or a grace period is already in progress.
1408 * Either way, don't start a new grace period. 1386 * Either way, don't start a new grace period.
1409 */ 1387 */
1410 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1388 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1411 return; 1389 return;
1412 } 1390 }
1413 1391
1392 /*
1393 * Because there is no grace period in progress right now,
1394 * any callbacks we have up to this point will be satisfied
1395 * by the next grace period. So promote all callbacks to be
1396 * handled after the end of the next grace period. If the
1397 * CPU is not yet aware of the end of the previous grace period,
1398 * we need to allow for the callback advancement that will
1399 * occur when it does become aware. Deadlock prevents us from
1400 * making it aware at this point: We cannot acquire a leaf
1401 * rcu_node ->lock while holding the root rcu_node ->lock.
1402 */
1403 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1404 if (rdp->completed == rsp->completed)
1405 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1406
1414 rsp->gp_flags = RCU_GP_FLAG_INIT; 1407 rsp->gp_flags = RCU_GP_FLAG_INIT;
1415 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1408 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
1409
1410 /* Ensure that CPU is aware of completion of last grace period. */
1411 rcu_process_gp_end(rsp, rdp);
1412 local_irq_restore(flags);
1413
1414 /* Wake up rcu_gp_kthread() to start the grace period. */
1416 wake_up(&rsp->gp_wq); 1415 wake_up(&rsp->gp_wq);
1417} 1416}
1418 1417
@@ -1573,16 +1572,20 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1573/* 1572/*
1574 * Send the specified CPU's RCU callbacks to the orphanage. The 1573 * Send the specified CPU's RCU callbacks to the orphanage. The
1575 * specified CPU must be offline, and the caller must hold the 1574 * specified CPU must be offline, and the caller must hold the
1576 * ->onofflock. 1575 * ->orphan_lock.
1577 */ 1576 */
1578static void 1577static void
1579rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, 1578rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1580 struct rcu_node *rnp, struct rcu_data *rdp) 1579 struct rcu_node *rnp, struct rcu_data *rdp)
1581{ 1580{
1581 /* No-CBs CPUs do not have orphanable callbacks. */
1582 if (is_nocb_cpu(rdp->cpu))
1583 return;
1584
1582 /* 1585 /*
1583 * Orphan the callbacks. First adjust the counts. This is safe 1586 * Orphan the callbacks. First adjust the counts. This is safe
1584 * because ->onofflock excludes _rcu_barrier()'s adoption of 1587 * because _rcu_barrier() excludes CPU-hotplug operations, so it
1585 * the callbacks, thus no memory barrier is required. 1588 * cannot be running now. Thus no memory barrier is required.
1586 */ 1589 */
1587 if (rdp->nxtlist != NULL) { 1590 if (rdp->nxtlist != NULL) {
1588 rsp->qlen_lazy += rdp->qlen_lazy; 1591 rsp->qlen_lazy += rdp->qlen_lazy;
@@ -1623,13 +1626,17 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1623 1626
1624/* 1627/*
1625 * Adopt the RCU callbacks from the specified rcu_state structure's 1628 * Adopt the RCU callbacks from the specified rcu_state structure's
1626 * orphanage. The caller must hold the ->onofflock. 1629 * orphanage. The caller must hold the ->orphan_lock.
1627 */ 1630 */
1628static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) 1631static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1629{ 1632{
1630 int i; 1633 int i;
1631 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 1634 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
1632 1635
1636 /* No-CBs CPUs are handled specially. */
1637 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp))
1638 return;
1639
1633 /* Do the accounting first. */ 1640 /* Do the accounting first. */
1634 rdp->qlen_lazy += rsp->qlen_lazy; 1641 rdp->qlen_lazy += rsp->qlen_lazy;
1635 rdp->qlen += rsp->qlen; 1642 rdp->qlen += rsp->qlen;
@@ -1702,7 +1709,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1702 1709
1703 /* Exclude any attempts to start a new grace period. */ 1710 /* Exclude any attempts to start a new grace period. */
1704 mutex_lock(&rsp->onoff_mutex); 1711 mutex_lock(&rsp->onoff_mutex);
1705 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1712 raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
1706 1713
1707 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ 1714 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
1708 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); 1715 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
@@ -1729,10 +1736,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1729 /* 1736 /*
1730 * We still hold the leaf rcu_node structure lock here, and 1737 * We still hold the leaf rcu_node structure lock here, and
1731 * irqs are still disabled. The reason for this subterfuge is 1738 * irqs are still disabled. The reason for this subterfuge is
1732 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock 1739 * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock
1733 * held leads to deadlock. 1740 * held leads to deadlock.
1734 */ 1741 */
1735 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1742 raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */
1736 rnp = rdp->mynode; 1743 rnp = rdp->mynode;
1737 if (need_report & RCU_OFL_TASKS_NORM_GP) 1744 if (need_report & RCU_OFL_TASKS_NORM_GP)
1738 rcu_report_unblock_qs_rnp(rnp, flags); 1745 rcu_report_unblock_qs_rnp(rnp, flags);
@@ -1769,7 +1776,8 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1769{ 1776{
1770 unsigned long flags; 1777 unsigned long flags;
1771 struct rcu_head *next, *list, **tail; 1778 struct rcu_head *next, *list, **tail;
1772 int bl, count, count_lazy, i; 1779 long bl, count, count_lazy;
1780 int i;
1773 1781
1774 /* If no callbacks are ready, just return.*/ 1782 /* If no callbacks are ready, just return.*/
1775 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1783 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
@@ -2107,9 +2115,15 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2107 } 2115 }
2108} 2116}
2109 2117
2118/*
2119 * Helper function for call_rcu() and friends. The cpu argument will
2120 * normally be -1, indicating "currently running CPU". It may specify
2121 * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier()
2122 * is expected to specify a CPU.
2123 */
2110static void 2124static void
2111__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 2125__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2112 struct rcu_state *rsp, bool lazy) 2126 struct rcu_state *rsp, int cpu, bool lazy)
2113{ 2127{
2114 unsigned long flags; 2128 unsigned long flags;
2115 struct rcu_data *rdp; 2129 struct rcu_data *rdp;
@@ -2129,9 +2143,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2129 rdp = this_cpu_ptr(rsp->rda); 2143 rdp = this_cpu_ptr(rsp->rda);
2130 2144
2131 /* Add the callback to our list. */ 2145 /* Add the callback to our list. */
2132 if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL)) { 2146 if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) {
2147 int offline;
2148
2149 if (cpu != -1)
2150 rdp = per_cpu_ptr(rsp->rda, cpu);
2151 offline = !__call_rcu_nocb(rdp, head, lazy);
2152 WARN_ON_ONCE(offline);
2133 /* _call_rcu() is illegal on offline CPU; leak the callback. */ 2153 /* _call_rcu() is illegal on offline CPU; leak the callback. */
2134 WARN_ON_ONCE(1);
2135 local_irq_restore(flags); 2154 local_irq_restore(flags);
2136 return; 2155 return;
2137 } 2156 }
@@ -2160,7 +2179,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2160 */ 2179 */
2161void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 2180void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
2162{ 2181{
2163 __call_rcu(head, func, &rcu_sched_state, 0); 2182 __call_rcu(head, func, &rcu_sched_state, -1, 0);
2164} 2183}
2165EXPORT_SYMBOL_GPL(call_rcu_sched); 2184EXPORT_SYMBOL_GPL(call_rcu_sched);
2166 2185
@@ -2169,7 +2188,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
2169 */ 2188 */
2170void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 2189void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
2171{ 2190{
2172 __call_rcu(head, func, &rcu_bh_state, 0); 2191 __call_rcu(head, func, &rcu_bh_state, -1, 0);
2173} 2192}
2174EXPORT_SYMBOL_GPL(call_rcu_bh); 2193EXPORT_SYMBOL_GPL(call_rcu_bh);
2175 2194
@@ -2205,10 +2224,28 @@ static inline int rcu_blocking_is_gp(void)
2205 * rcu_read_lock_sched(). 2224 * rcu_read_lock_sched().
2206 * 2225 *
2207 * This means that all preempt_disable code sequences, including NMI and 2226 * This means that all preempt_disable code sequences, including NMI and
2208 * hardware-interrupt handlers, in progress on entry will have completed 2227 * non-threaded hardware-interrupt handlers, in progress on entry will
2209 * before this primitive returns. However, this does not guarantee that 2228 * have completed before this primitive returns. However, this does not
2210 * softirq handlers will have completed, since in some kernels, these 2229 * guarantee that softirq handlers will have completed, since in some
2211 * handlers can run in process context, and can block. 2230 * kernels, these handlers can run in process context, and can block.
2231 *
2232 * Note that this guarantee implies further memory-ordering guarantees.
2233 * On systems with more than one CPU, when synchronize_sched() returns,
2234 * each CPU is guaranteed to have executed a full memory barrier since the
2235 * end of its last RCU-sched read-side critical section whose beginning
2236 * preceded the call to synchronize_sched(). In addition, each CPU having
2237 * an RCU read-side critical section that extends beyond the return from
2238 * synchronize_sched() is guaranteed to have executed a full memory barrier
2239 * after the beginning of synchronize_sched() and before the beginning of
2240 * that RCU read-side critical section. Note that these guarantees include
2241 * CPUs that are offline, idle, or executing in user mode, as well as CPUs
2242 * that are executing in the kernel.
2243 *
2244 * Furthermore, if CPU A invoked synchronize_sched(), which returned
2245 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
2246 * to have executed a full memory barrier during the execution of
2247 * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but
2248 * again only if the system has more than one CPU).
2212 * 2249 *
2213 * This primitive provides the guarantees made by the (now removed) 2250 * This primitive provides the guarantees made by the (now removed)
2214 * synchronize_kernel() API. In contrast, synchronize_rcu() only 2251 * synchronize_kernel() API. In contrast, synchronize_rcu() only
@@ -2224,7 +2261,10 @@ void synchronize_sched(void)
2224 "Illegal synchronize_sched() in RCU-sched read-side critical section"); 2261 "Illegal synchronize_sched() in RCU-sched read-side critical section");
2225 if (rcu_blocking_is_gp()) 2262 if (rcu_blocking_is_gp())
2226 return; 2263 return;
2227 wait_rcu_gp(call_rcu_sched); 2264 if (rcu_expedited)
2265 synchronize_sched_expedited();
2266 else
2267 wait_rcu_gp(call_rcu_sched);
2228} 2268}
2229EXPORT_SYMBOL_GPL(synchronize_sched); 2269EXPORT_SYMBOL_GPL(synchronize_sched);
2230 2270
@@ -2236,6 +2276,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
2236 * read-side critical sections have completed. RCU read-side critical 2276 * read-side critical sections have completed. RCU read-side critical
2237 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), 2277 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
2238 * and may be nested. 2278 * and may be nested.
2279 *
2280 * See the description of synchronize_sched() for more detailed information
2281 * on memory ordering guarantees.
2239 */ 2282 */
2240void synchronize_rcu_bh(void) 2283void synchronize_rcu_bh(void)
2241{ 2284{
@@ -2245,13 +2288,13 @@ void synchronize_rcu_bh(void)
2245 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); 2288 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
2246 if (rcu_blocking_is_gp()) 2289 if (rcu_blocking_is_gp())
2247 return; 2290 return;
2248 wait_rcu_gp(call_rcu_bh); 2291 if (rcu_expedited)
2292 synchronize_rcu_bh_expedited();
2293 else
2294 wait_rcu_gp(call_rcu_bh);
2249} 2295}
2250EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 2296EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
2251 2297
2252static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
2253static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
2254
2255static int synchronize_sched_expedited_cpu_stop(void *data) 2298static int synchronize_sched_expedited_cpu_stop(void *data)
2256{ 2299{
2257 /* 2300 /*
@@ -2308,10 +2351,32 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
2308 */ 2351 */
2309void synchronize_sched_expedited(void) 2352void synchronize_sched_expedited(void)
2310{ 2353{
2311 int firstsnap, s, snap, trycount = 0; 2354 long firstsnap, s, snap;
2355 int trycount = 0;
2356 struct rcu_state *rsp = &rcu_sched_state;
2357
2358 /*
2359 * If we are in danger of counter wrap, just do synchronize_sched().
2360 * By allowing sync_sched_expedited_started to advance no more than
2361 * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
2362 * that more than 3.5 billion CPUs would be required to force a
2363 * counter wrap on a 32-bit system. Quite a few more CPUs would of
2364 * course be required on a 64-bit system.
2365 */
2366 if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
2367 (ulong)atomic_long_read(&rsp->expedited_done) +
2368 ULONG_MAX / 8)) {
2369 synchronize_sched();
2370 atomic_long_inc(&rsp->expedited_wrap);
2371 return;
2372 }
2312 2373
2313 /* Note that atomic_inc_return() implies full memory barrier. */ 2374 /*
2314 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); 2375 * Take a ticket. Note that atomic_inc_return() implies a
2376 * full memory barrier.
2377 */
2378 snap = atomic_long_inc_return(&rsp->expedited_start);
2379 firstsnap = snap;
2315 get_online_cpus(); 2380 get_online_cpus();
2316 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); 2381 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
2317 2382
@@ -2323,48 +2388,65 @@ void synchronize_sched_expedited(void)
2323 synchronize_sched_expedited_cpu_stop, 2388 synchronize_sched_expedited_cpu_stop,
2324 NULL) == -EAGAIN) { 2389 NULL) == -EAGAIN) {
2325 put_online_cpus(); 2390 put_online_cpus();
2391 atomic_long_inc(&rsp->expedited_tryfail);
2392
2393 /* Check to see if someone else did our work for us. */
2394 s = atomic_long_read(&rsp->expedited_done);
2395 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2396 /* ensure test happens before caller kfree */
2397 smp_mb__before_atomic_inc(); /* ^^^ */
2398 atomic_long_inc(&rsp->expedited_workdone1);
2399 return;
2400 }
2326 2401
2327 /* No joy, try again later. Or just synchronize_sched(). */ 2402 /* No joy, try again later. Or just synchronize_sched(). */
2328 if (trycount++ < 10) { 2403 if (trycount++ < 10) {
2329 udelay(trycount * num_online_cpus()); 2404 udelay(trycount * num_online_cpus());
2330 } else { 2405 } else {
2331 synchronize_sched(); 2406 wait_rcu_gp(call_rcu_sched);
2407 atomic_long_inc(&rsp->expedited_normal);
2332 return; 2408 return;
2333 } 2409 }
2334 2410
2335 /* Check to see if someone else did our work for us. */ 2411 /* Recheck to see if someone else did our work for us. */
2336 s = atomic_read(&sync_sched_expedited_done); 2412 s = atomic_long_read(&rsp->expedited_done);
2337 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { 2413 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2338 smp_mb(); /* ensure test happens before caller kfree */ 2414 /* ensure test happens before caller kfree */
2415 smp_mb__before_atomic_inc(); /* ^^^ */
2416 atomic_long_inc(&rsp->expedited_workdone2);
2339 return; 2417 return;
2340 } 2418 }
2341 2419
2342 /* 2420 /*
2343 * Refetching sync_sched_expedited_started allows later 2421 * Refetching sync_sched_expedited_started allows later
2344 * callers to piggyback on our grace period. We subtract 2422 * callers to piggyback on our grace period. We retry
2345 * 1 to get the same token that the last incrementer got. 2423 * after they started, so our grace period works for them,
2346 * We retry after they started, so our grace period works 2424 * and they started after our first try, so their grace
2347 * for them, and they started after our first try, so their 2425 * period works for us.
2348 * grace period works for us.
2349 */ 2426 */
2350 get_online_cpus(); 2427 get_online_cpus();
2351 snap = atomic_read(&sync_sched_expedited_started); 2428 snap = atomic_long_read(&rsp->expedited_start);
2352 smp_mb(); /* ensure read is before try_stop_cpus(). */ 2429 smp_mb(); /* ensure read is before try_stop_cpus(). */
2353 } 2430 }
2431 atomic_long_inc(&rsp->expedited_stoppedcpus);
2354 2432
2355 /* 2433 /*
2356 * Everyone up to our most recent fetch is covered by our grace 2434 * Everyone up to our most recent fetch is covered by our grace
2357 * period. Update the counter, but only if our work is still 2435 * period. Update the counter, but only if our work is still
2358 * relevant -- which it won't be if someone who started later 2436 * relevant -- which it won't be if someone who started later
2359 * than we did beat us to the punch. 2437 * than we did already did their update.
2360 */ 2438 */
2361 do { 2439 do {
2362 s = atomic_read(&sync_sched_expedited_done); 2440 atomic_long_inc(&rsp->expedited_done_tries);
2363 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { 2441 s = atomic_long_read(&rsp->expedited_done);
2364 smp_mb(); /* ensure test happens before caller kfree */ 2442 if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
2443 /* ensure test happens before caller kfree */
2444 smp_mb__before_atomic_inc(); /* ^^^ */
2445 atomic_long_inc(&rsp->expedited_done_lost);
2365 break; 2446 break;
2366 } 2447 }
2367 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); 2448 } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
2449 atomic_long_inc(&rsp->expedited_done_exit);
2368 2450
2369 put_online_cpus(); 2451 put_online_cpus();
2370} 2452}
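The rewrite above moves the expedited counters into rcu_state and widens them to atomic_long_t, but the idea is unchanged: take a start ticket, and skip the expensive work whenever the done counter has already passed a ticket taken after yours. A deliberately simplified sketch of that ticket scheme, omitting the memory barriers, the retry-after-EAGAIN path, and the counter-wrap escape that the real code adds:

static atomic_long_t demo_start;	/* tickets handed out */
static atomic_long_t demo_done;		/* highest ticket known to be complete */

static void demo_expedited(void (*force_gp)(void))
{
	long snap = atomic_long_inc_return(&demo_start);	/* take a ticket */
	long s;

	/* If a pass that began after our ticket has finished, it covers us. */
	if (ULONG_CMP_GE((ulong)atomic_long_read(&demo_done), (ulong)snap))
		return;

	force_gp();		/* the expensive part, e.g. a stop_cpus() sweep */

	/* Publish completion so later arrivals can piggyback, without ever
	 * moving demo_done backwards. */
	do {
		s = atomic_long_read(&demo_done);
		if (ULONG_CMP_GE((ulong)s, (ulong)snap))
			break;		/* someone newer already published */
	} while (atomic_long_cmpxchg(&demo_done, s, snap) != s);
}
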
@@ -2558,9 +2640,17 @@ static void _rcu_barrier(struct rcu_state *rsp)
2558 * When that callback is invoked, we will know that all of the 2640 * When that callback is invoked, we will know that all of the
2559 * corresponding CPU's preceding callbacks have been invoked. 2641 * corresponding CPU's preceding callbacks have been invoked.
2560 */ 2642 */
2561 for_each_online_cpu(cpu) { 2643 for_each_possible_cpu(cpu) {
2644 if (!cpu_online(cpu) && !is_nocb_cpu(cpu))
2645 continue;
2562 rdp = per_cpu_ptr(rsp->rda, cpu); 2646 rdp = per_cpu_ptr(rsp->rda, cpu);
2563 if (ACCESS_ONCE(rdp->qlen)) { 2647 if (is_nocb_cpu(cpu)) {
2648 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
2649 rsp->n_barrier_done);
2650 atomic_inc(&rsp->barrier_cpu_count);
2651 __call_rcu(&rdp->barrier_head, rcu_barrier_callback,
2652 rsp, cpu, 0);
2653 } else if (ACCESS_ONCE(rdp->qlen)) {
2564 _rcu_barrier_trace(rsp, "OnlineQ", cpu, 2654 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
2565 rsp->n_barrier_done); 2655 rsp->n_barrier_done);
2566 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); 2656 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
@@ -2634,6 +2724,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
2634#endif 2724#endif
2635 rdp->cpu = cpu; 2725 rdp->cpu = cpu;
2636 rdp->rsp = rsp; 2726 rdp->rsp = rsp;
2727 rcu_boot_init_nocb_percpu_data(rdp);
2637 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2728 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2638} 2729}
2639 2730
@@ -2715,6 +2806,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2715 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 2806 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2716 struct rcu_node *rnp = rdp->mynode; 2807 struct rcu_node *rnp = rdp->mynode;
2717 struct rcu_state *rsp; 2808 struct rcu_state *rsp;
2809 int ret = NOTIFY_OK;
2718 2810
2719 trace_rcu_utilization("Start CPU hotplug"); 2811 trace_rcu_utilization("Start CPU hotplug");
2720 switch (action) { 2812 switch (action) {
@@ -2728,7 +2820,10 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2728 rcu_boost_kthread_setaffinity(rnp, -1); 2820 rcu_boost_kthread_setaffinity(rnp, -1);
2729 break; 2821 break;
2730 case CPU_DOWN_PREPARE: 2822 case CPU_DOWN_PREPARE:
2731 rcu_boost_kthread_setaffinity(rnp, cpu); 2823 if (nocb_cpu_expendable(cpu))
2824 rcu_boost_kthread_setaffinity(rnp, cpu);
2825 else
2826 ret = NOTIFY_BAD;
2732 break; 2827 break;
2733 case CPU_DYING: 2828 case CPU_DYING:
2734 case CPU_DYING_FROZEN: 2829 case CPU_DYING_FROZEN:
@@ -2752,7 +2847,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2752 break; 2847 break;
2753 } 2848 }
2754 trace_rcu_utilization("End CPU hotplug"); 2849 trace_rcu_utilization("End CPU hotplug");
2755 return NOTIFY_OK; 2850 return ret;
2756} 2851}
2757 2852
2758/* 2853/*
@@ -2772,6 +2867,7 @@ static int __init rcu_spawn_gp_kthread(void)
2772 raw_spin_lock_irqsave(&rnp->lock, flags); 2867 raw_spin_lock_irqsave(&rnp->lock, flags);
2773 rsp->gp_kthread = t; 2868 rsp->gp_kthread = t;
2774 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2869 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2870 rcu_spawn_nocb_kthreads(rsp);
2775 } 2871 }
2776 return 0; 2872 return 0;
2777} 2873}
@@ -2967,6 +3063,7 @@ void __init rcu_init(void)
2967 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 3063 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
2968 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 3064 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
2969 __rcu_init_preempt(); 3065 __rcu_init_preempt();
3066 rcu_init_nocb();
2970 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 3067 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
2971 3068
2972 /* 3069 /*
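One behavioural change in this file that is easy to miss: rcu_cpu_notify() can now refuse a CPU_DOWN_PREPARE when nocb_cpu_expendable() says that offlining the CPU would leave nobody to process callbacks, and it does so simply by returning NOTIFY_BAD instead of NOTIFY_OK. A sketch of that veto pattern, with demo_cpu_expendable() standing in as an assumed policy hook:

#include <linux/cpu.h>
#include <linux/notifier.h>

static bool demo_cpu_expendable(int cpu);	/* assumed policy hook */

static int demo_cpu_notify(struct notifier_block *self,
			   unsigned long action, void *hcpu)
{
	int cpu = (long)hcpu;

	/* NOTIFY_BAD from CPU_DOWN_PREPARE aborts the offline attempt,
	 * so cpu_down() fails and the CPU stays online. */
	if (action == CPU_DOWN_PREPARE && !demo_cpu_expendable(cpu))
		return NOTIFY_BAD;
	return NOTIFY_OK;
}
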
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index a240f032848e..4b69291b093d 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -287,6 +287,7 @@ struct rcu_data {
287 long qlen_last_fqs_check; 287 long qlen_last_fqs_check;
288 /* qlen at last check for QS forcing */ 288 /* qlen at last check for QS forcing */
289 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 289 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
290 unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */
290 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ 291 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
291 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ 292 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
292 unsigned long n_force_qs_snap; 293 unsigned long n_force_qs_snap;
@@ -317,6 +318,18 @@ struct rcu_data {
317 struct rcu_head oom_head; 318 struct rcu_head oom_head;
318#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 319#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
319 320
321 /* 7) Callback offloading. */
322#ifdef CONFIG_RCU_NOCB_CPU
323 struct rcu_head *nocb_head; /* CBs waiting for kthread. */
324 struct rcu_head **nocb_tail;
325 atomic_long_t nocb_q_count; /* # CBs waiting for kthread */
326 atomic_long_t nocb_q_count_lazy; /* (approximate). */
327 int nocb_p_count; /* # CBs being invoked by kthread */
328 int nocb_p_count_lazy; /* (approximate). */
329 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
330 struct task_struct *nocb_kthread;
331#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
332
320 int cpu; 333 int cpu;
321 struct rcu_state *rsp; 334 struct rcu_state *rsp;
322}; 335};
@@ -369,6 +382,12 @@ struct rcu_state {
369 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ 382 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
370 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ 383 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
371 void (*func)(struct rcu_head *head)); 384 void (*func)(struct rcu_head *head));
385#ifdef CONFIG_RCU_NOCB_CPU
386 void (*call_remote)(struct rcu_head *head,
387 void (*func)(struct rcu_head *head));
388 /* call_rcu() flavor, but for */
389 /* placing on remote CPU. */
390#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
372 391
373 /* The following fields are guarded by the root rcu_node's lock. */ 392 /* The following fields are guarded by the root rcu_node's lock. */
374 393
@@ -383,9 +402,8 @@ struct rcu_state {
383 402
384 /* End of fields guarded by root rcu_node's lock. */ 403 /* End of fields guarded by root rcu_node's lock. */
385 404
386 raw_spinlock_t onofflock ____cacheline_internodealigned_in_smp; 405 raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
387 /* exclude on/offline and */ 406 /* Protect following fields. */
388 /* starting new GP. */
389 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ 407 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
390 /* need a grace period. */ 408 /* need a grace period. */
391 struct rcu_head **orphan_nxttail; /* Tail of above. */ 409 struct rcu_head **orphan_nxttail; /* Tail of above. */
@@ -394,7 +412,7 @@ struct rcu_state {
394 struct rcu_head **orphan_donetail; /* Tail of above. */ 412 struct rcu_head **orphan_donetail; /* Tail of above. */
395 long qlen_lazy; /* Number of lazy callbacks. */ 413 long qlen_lazy; /* Number of lazy callbacks. */
396 long qlen; /* Total number of callbacks. */ 414 long qlen; /* Total number of callbacks. */
397 /* End of fields guarded by onofflock. */ 415 /* End of fields guarded by orphan_lock. */
398 416
399 struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ 417 struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */
400 418
@@ -405,6 +423,18 @@ struct rcu_state {
405 /* _rcu_barrier(). */ 423 /* _rcu_barrier(). */
406 /* End of fields guarded by barrier_mutex. */ 424 /* End of fields guarded by barrier_mutex. */
407 425
426 atomic_long_t expedited_start; /* Starting ticket. */
427 atomic_long_t expedited_done; /* Done ticket. */
428 atomic_long_t expedited_wrap; /* # near-wrap incidents. */
429 atomic_long_t expedited_tryfail; /* # acquisition failures. */
430 atomic_long_t expedited_workdone1; /* # done by others #1. */
431 atomic_long_t expedited_workdone2; /* # done by others #2. */
432 atomic_long_t expedited_normal; /* # fallbacks to normal. */
433 atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */
434 atomic_long_t expedited_done_tries; /* # tries to update _done. */
435 atomic_long_t expedited_done_lost; /* # times beaten to _done. */
436 atomic_long_t expedited_done_exit; /* # times exited _done loop. */
437
408 unsigned long jiffies_force_qs; /* Time at which to invoke */ 438 unsigned long jiffies_force_qs; /* Time at which to invoke */
409 /* force_quiescent_state(). */ 439 /* force_quiescent_state(). */
410 unsigned long n_force_qs; /* Number of calls to */ 440 unsigned long n_force_qs; /* Number of calls to */
@@ -428,6 +458,8 @@ struct rcu_state {
428#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ 458#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
429 459
430extern struct list_head rcu_struct_flavors; 460extern struct list_head rcu_struct_flavors;
461
462/* Sequence through rcu_state structures for each RCU flavor. */
431#define for_each_rcu_flavor(rsp) \ 463#define for_each_rcu_flavor(rsp) \
432 list_for_each_entry((rsp), &rcu_struct_flavors, flavors) 464 list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
433 465
@@ -504,5 +536,32 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
504static void print_cpu_stall_info_end(void); 536static void print_cpu_stall_info_end(void);
505static void zero_cpu_stall_ticks(struct rcu_data *rdp); 537static void zero_cpu_stall_ticks(struct rcu_data *rdp);
506static void increment_cpu_stall_ticks(void); 538static void increment_cpu_stall_ticks(void);
539static bool is_nocb_cpu(int cpu);
540static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
541 bool lazy);
542static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
543 struct rcu_data *rdp);
544static bool nocb_cpu_expendable(int cpu);
545static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
546static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
547static void init_nocb_callback_list(struct rcu_data *rdp);
548static void __init rcu_init_nocb(void);
507 549
508#endif /* #ifndef RCU_TREE_NONCORE */ 550#endif /* #ifndef RCU_TREE_NONCORE */
551
552#ifdef CONFIG_RCU_TRACE
553#ifdef CONFIG_RCU_NOCB_CPU
554/* Sum up queue lengths for tracing. */
555static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
556{
557 *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count;
558 *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy;
559}
560#else /* #ifdef CONFIG_RCU_NOCB_CPU */
561static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
562{
563 *ql = 0;
564 *qll = 0;
565}
566#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
567#endif /* #ifdef CONFIG_RCU_TRACE */
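
The new atomic_long_t expedited_* counters above back a ticket scheme used by synchronize_sched_expedited() in rcutree.c: a caller snapshots expedited_start, and once expedited_done has advanced past that snapshot it knows another CPU already provided the grace period it needs (the workdone1/workdone2 counters count exactly those cases). A minimal sketch of that wrap-safe check, assuming kernel headers (ULONG_CMP_GE from rcupdate.h) and using a hypothetical helper name:

	/* Sketch only: has the done ticket caught up with our snapshot? */
	static inline bool exp_done_since_snapshot(atomic_long_t *done, long snap)
	{
		return ULONG_CMP_GE((unsigned long)atomic_long_read(done),
				    (unsigned long)snap);
	}
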
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index f92115488187..f6e5ec2932b4 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@
25 */ 25 */
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/gfp.h>
28#include <linux/oom.h> 29#include <linux/oom.h>
29#include <linux/smpboot.h> 30#include <linux/smpboot.h>
30 31
@@ -36,6 +37,14 @@
36#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO 37#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
37#endif 38#endif
38 39
40#ifdef CONFIG_RCU_NOCB_CPU
41static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
42static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
43static bool rcu_nocb_poll; /* Offload kthreads are to poll. */
44module_param(rcu_nocb_poll, bool, 0444);
45static char __initdata nocb_buf[NR_CPUS * 5];
46#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
47
39/* 48/*
40 * Check the RCU kernel configuration parameters and print informative 49 * Check the RCU kernel configuration parameters and print informative
41 * messages about anything out of the ordinary. If you like #ifdef, you 50 * messages about anything out of the ordinary. If you like #ifdef, you
@@ -76,6 +85,18 @@ static void __init rcu_bootup_announce_oddness(void)
76 printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); 85 printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
77 if (nr_cpu_ids != NR_CPUS) 86 if (nr_cpu_ids != NR_CPUS)
78 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 87 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
88#ifdef CONFIG_RCU_NOCB_CPU
89 if (have_rcu_nocb_mask) {
90 if (cpumask_test_cpu(0, rcu_nocb_mask)) {
91 cpumask_clear_cpu(0, rcu_nocb_mask);
92 pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n");
93 }
94 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
95 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf);
96 if (rcu_nocb_poll)
97 pr_info("\tExperimental polled no-CBs CPUs.\n");
98 }
99#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
79} 100}
80 101
81#ifdef CONFIG_TREE_PREEMPT_RCU 102#ifdef CONFIG_TREE_PREEMPT_RCU
@@ -642,7 +663,7 @@ static void rcu_preempt_do_callbacks(void)
642 */ 663 */
643void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 664void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
644{ 665{
645 __call_rcu(head, func, &rcu_preempt_state, 0); 666 __call_rcu(head, func, &rcu_preempt_state, -1, 0);
646} 667}
647EXPORT_SYMBOL_GPL(call_rcu); 668EXPORT_SYMBOL_GPL(call_rcu);
648 669
@@ -656,7 +677,7 @@ EXPORT_SYMBOL_GPL(call_rcu);
656void kfree_call_rcu(struct rcu_head *head, 677void kfree_call_rcu(struct rcu_head *head,
657 void (*func)(struct rcu_head *rcu)) 678 void (*func)(struct rcu_head *rcu))
658{ 679{
659 __call_rcu(head, func, &rcu_preempt_state, 1); 680 __call_rcu(head, func, &rcu_preempt_state, -1, 1);
660} 681}
661EXPORT_SYMBOL_GPL(kfree_call_rcu); 682EXPORT_SYMBOL_GPL(kfree_call_rcu);
662 683
@@ -670,6 +691,9 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu);
670 * concurrently with new RCU read-side critical sections that began while 691 * concurrently with new RCU read-side critical sections that began while
671 * synchronize_rcu() was waiting. RCU read-side critical sections are 692 * synchronize_rcu() was waiting. RCU read-side critical sections are
672 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. 693 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
694 *
695 * See the description of synchronize_sched() for more detailed information
696 * on memory ordering guarantees.
673 */ 697 */
674void synchronize_rcu(void) 698void synchronize_rcu(void)
675{ 699{
@@ -679,7 +703,10 @@ void synchronize_rcu(void)
679 "Illegal synchronize_rcu() in RCU read-side critical section"); 703 "Illegal synchronize_rcu() in RCU read-side critical section");
680 if (!rcu_scheduler_active) 704 if (!rcu_scheduler_active)
681 return; 705 return;
682 wait_rcu_gp(call_rcu); 706 if (rcu_expedited)
707 synchronize_rcu_expedited();
708 else
709 wait_rcu_gp(call_rcu);
683} 710}
684EXPORT_SYMBOL_GPL(synchronize_rcu); 711EXPORT_SYMBOL_GPL(synchronize_rcu);
685 712
@@ -757,7 +784,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
757 * grace period for the specified rcu_node structure. If there are no such 784 * grace period for the specified rcu_node structure. If there are no such
758 * tasks, report it up the rcu_node hierarchy. 785 * tasks, report it up the rcu_node hierarchy.
759 * 786 *
760 * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock. 787 * Caller must hold sync_rcu_preempt_exp_mutex and must exclude
788 * CPU hotplug operations.
761 */ 789 */
762static void 790static void
763sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) 791sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
@@ -831,7 +859,7 @@ void synchronize_rcu_expedited(void)
831 udelay(trycount * num_online_cpus()); 859 udelay(trycount * num_online_cpus());
832 } else { 860 } else {
833 put_online_cpus(); 861 put_online_cpus();
834 synchronize_rcu(); 862 wait_rcu_gp(call_rcu);
835 return; 863 return;
836 } 864 }
837 } 865 }
@@ -875,6 +903,11 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
875 903
876/** 904/**
877 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. 905 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
906 *
907 * Note that this primitive does not necessarily wait for an RCU grace period
908 * to complete. For example, if there are no RCU callbacks queued anywhere
909 * in the system, then rcu_barrier() is within its rights to return
910 * immediately, without waiting for anything, much less an RCU grace period.
878 */ 911 */
879void rcu_barrier(void) 912void rcu_barrier(void)
880{ 913{
@@ -1013,7 +1046,7 @@ static void rcu_preempt_check_callbacks(int cpu)
1013void kfree_call_rcu(struct rcu_head *head, 1046void kfree_call_rcu(struct rcu_head *head,
1014 void (*func)(struct rcu_head *rcu)) 1047 void (*func)(struct rcu_head *rcu))
1015{ 1048{
1016 __call_rcu(head, func, &rcu_sched_state, 1); 1049 __call_rcu(head, func, &rcu_sched_state, -1, 1);
1017} 1050}
1018EXPORT_SYMBOL_GPL(kfree_call_rcu); 1051EXPORT_SYMBOL_GPL(kfree_call_rcu);
1019 1052
@@ -2092,3 +2125,373 @@ static void increment_cpu_stall_ticks(void)
2092} 2125}
2093 2126
2094#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ 2127#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
2128
2129#ifdef CONFIG_RCU_NOCB_CPU
2130
2131/*
2132 * Offload callback processing from the boot-time-specified set of CPUs
2133 * specified by rcu_nocb_mask. For each CPU in the set, there is a
2134 * kthread created that pulls the callbacks from the corresponding CPU,
2135 * waits for a grace period to elapse, and invokes the callbacks.
2136 * The no-CBs CPUs do a wake_up() on their kthread when they insert
2137 * a callback into any empty list, unless the rcu_nocb_poll boot parameter
2138 * has been specified, in which case each kthread actively polls its
2139 * CPU. (Which isn't so great for energy efficiency, but which does
2140 * reduce RCU's overhead on that CPU.)
2141 *
2142 * This is intended to be used in conjunction with Frederic Weisbecker's
2143 * adaptive-idle work, which would seriously reduce OS jitter on CPUs
2144 * running CPU-bound user-mode computations.
2145 *
2146 * Offloading of callback processing could also in theory be used as
2147 * an energy-efficiency measure because CPUs with no RCU callbacks
2148 * queued are more aggressive about entering dyntick-idle mode.
2149 */
2150
2151
2152/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */
2153static int __init rcu_nocb_setup(char *str)
2154{
2155 alloc_bootmem_cpumask_var(&rcu_nocb_mask);
2156 have_rcu_nocb_mask = true;
2157 cpulist_parse(str, rcu_nocb_mask);
2158 return 1;
2159}
2160__setup("rcu_nocbs=", rcu_nocb_setup);
2161
2162/* Is the specified CPU a no-CBs CPU? */
2163static bool is_nocb_cpu(int cpu)
2164{
2165 if (have_rcu_nocb_mask)
2166 return cpumask_test_cpu(cpu, rcu_nocb_mask);
2167 return false;
2168}
2169
2170/*
2171 * Enqueue the specified string of rcu_head structures onto the specified
2172 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the
2173 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
2174 * counts are supplied by rhcount and rhcount_lazy.
2175 *
2176 * If warranted, also wake up the kthread servicing this CPU's queues.
2177 */
2178static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2179 struct rcu_head *rhp,
2180 struct rcu_head **rhtp,
2181 int rhcount, int rhcount_lazy)
2182{
2183 int len;
2184 struct rcu_head **old_rhpp;
2185 struct task_struct *t;
2186
2187 /* Enqueue the callback on the nocb list and update counts. */
2188 old_rhpp = xchg(&rdp->nocb_tail, rhtp);
2189 ACCESS_ONCE(*old_rhpp) = rhp;
2190 atomic_long_add(rhcount, &rdp->nocb_q_count);
2191 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
2192
2193 /* If we are not being polled and there is a kthread, awaken it ... */
2194 t = ACCESS_ONCE(rdp->nocb_kthread);
2195 if (rcu_nocb_poll | !t)
2196 return;
2197 len = atomic_long_read(&rdp->nocb_q_count);
2198 if (old_rhpp == &rdp->nocb_head) {
2199 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */
2200 rdp->qlen_last_fqs_check = 0;
2201 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2202 wake_up_process(t); /* ... or if many callbacks queued. */
2203 rdp->qlen_last_fqs_check = LONG_MAX / 2;
2204 }
2205 return;
2206}
2207
2208/*
2209 * This is a helper for __call_rcu(), which invokes this when the normal
2210 * callback queue is inoperable. If this is not a no-CBs CPU, this
2211 * function returns failure back to __call_rcu(), which can complain
2212 * appropriately.
2213 *
2214 * Otherwise, this function queues the callback where the corresponding
2215 * "rcuo" kthread can find it.
2216 */
2217static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2218 bool lazy)
2219{
2220
2221 if (!is_nocb_cpu(rdp->cpu))
2222 return 0;
2223 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
2224 return 1;
2225}
2226
2227/*
2228 * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
2229 * not a no-CBs CPU.
2230 */
2231static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2232 struct rcu_data *rdp)
2233{
2234 long ql = rsp->qlen;
2235 long qll = rsp->qlen_lazy;
2236
2237 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
2238 if (!is_nocb_cpu(smp_processor_id()))
2239 return 0;
2240 rsp->qlen = 0;
2241 rsp->qlen_lazy = 0;
2242
2243 /* First, enqueue the donelist, if any. This preserves CB ordering. */
2244 if (rsp->orphan_donelist != NULL) {
2245 __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
2246 rsp->orphan_donetail, ql, qll);
2247 ql = qll = 0;
2248 rsp->orphan_donelist = NULL;
2249 rsp->orphan_donetail = &rsp->orphan_donelist;
2250 }
2251 if (rsp->orphan_nxtlist != NULL) {
2252 __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
2253 rsp->orphan_nxttail, ql, qll);
2254 ql = qll = 0;
2255 rsp->orphan_nxtlist = NULL;
2256 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
2257 }
2258 return 1;
2259}
2260
2261/*
2262 * There must be at least one non-no-CBs CPU in operation at any given
2263 * time, because no-CBs CPUs are not capable of initiating grace periods
2264 * independently. This function therefore complains if the specified
2265 * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to
2266 * avoid offlining the last such CPU. (Recursion is a wonderful thing,
2267 * but you have to have a base case!)
2268 */
2269static bool nocb_cpu_expendable(int cpu)
2270{
2271 cpumask_var_t non_nocb_cpus;
2272 int ret;
2273
2274 /*
2275 * If there are no no-CBs CPUs or if this CPU is not a no-CBs CPU,
2276 * then offlining this CPU is harmless. Let it happen.
2277 */
2278 if (!have_rcu_nocb_mask || is_nocb_cpu(cpu))
2279 return 1;
2280
2281 /* If no memory, play it safe and keep the CPU around. */
2282 if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO))
2283 return 0;
2284 cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask);
2285 cpumask_clear_cpu(cpu, non_nocb_cpus);
2286 ret = !cpumask_empty(non_nocb_cpus);
2287 free_cpumask_var(non_nocb_cpus);
2288 return ret;
2289}
2290
2291/*
2292 * Helper structure for remote registry of RCU callbacks.
2293 * This is needed for when a no-CBs CPU needs to start a grace period.
2294 * If it just invokes call_rcu(), the resulting callback will be queued,
2295 * which can result in deadlock.
2296 */
2297struct rcu_head_remote {
2298 struct rcu_head *rhp;
2299 call_rcu_func_t *crf;
2300 void (*func)(struct rcu_head *rhp);
2301};
2302
2303/*
2304 * Register a callback as specified by the rcu_head_remote struct.
2305 * This function is intended to be invoked via smp_call_function_single().
2306 */
2307static void call_rcu_local(void *arg)
2308{
2309 struct rcu_head_remote *rhrp =
2310 container_of(arg, struct rcu_head_remote, rhp);
2311
2312 rhrp->crf(rhrp->rhp, rhrp->func);
2313}
2314
2315/*
2316 * Set up an rcu_head_remote structure and then invoke call_rcu_local()
2317 * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via
2318 * smp_call_function_single().
2319 */
2320static void invoke_crf_remote(struct rcu_head *rhp,
2321 void (*func)(struct rcu_head *rhp),
2322 call_rcu_func_t crf)
2323{
2324 struct rcu_head_remote rhr;
2325
2326 rhr.rhp = rhp;
2327 rhr.crf = crf;
2328 rhr.func = func;
2329 smp_call_function_single(0, call_rcu_local, &rhr, 1);
2330}
2331
2332/*
2333 * Helper functions to be passed to wait_rcu_gp(), each of which
2334 * invokes invoke_crf_remote() to register a callback appropriately.
2335 */
2336static void __maybe_unused
2337call_rcu_preempt_remote(struct rcu_head *rhp,
2338 void (*func)(struct rcu_head *rhp))
2339{
2340 invoke_crf_remote(rhp, func, call_rcu);
2341}
2342static void call_rcu_bh_remote(struct rcu_head *rhp,
2343 void (*func)(struct rcu_head *rhp))
2344{
2345 invoke_crf_remote(rhp, func, call_rcu_bh);
2346}
2347static void call_rcu_sched_remote(struct rcu_head *rhp,
2348 void (*func)(struct rcu_head *rhp))
2349{
2350 invoke_crf_remote(rhp, func, call_rcu_sched);
2351}
2352
2353/*
2354 * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
2355 * callbacks queued by the corresponding no-CBs CPU.
2356 */
2357static int rcu_nocb_kthread(void *arg)
2358{
2359 int c, cl;
2360 struct rcu_head *list;
2361 struct rcu_head *next;
2362 struct rcu_head **tail;
2363 struct rcu_data *rdp = arg;
2364
2365 /* Each pass through this loop invokes one batch of callbacks */
2366 for (;;) {
2367 /* If not polling, wait for next batch of callbacks. */
2368 if (!rcu_nocb_poll)
2369 wait_event(rdp->nocb_wq, rdp->nocb_head);
2370 list = ACCESS_ONCE(rdp->nocb_head);
2371 if (!list) {
2372 schedule_timeout_interruptible(1);
2373 continue;
2374 }
2375
2376 /*
2377 * Extract queued callbacks, update counts, and wait
2378 * for a grace period to elapse.
2379 */
2380 ACCESS_ONCE(rdp->nocb_head) = NULL;
2381 tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
2382 c = atomic_long_xchg(&rdp->nocb_q_count, 0);
2383 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2384 ACCESS_ONCE(rdp->nocb_p_count) += c;
2385 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
2386 wait_rcu_gp(rdp->rsp->call_remote);
2387
2388 /* Each pass through the following loop invokes a callback. */
2389 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
2390 c = cl = 0;
2391 while (list) {
2392 next = list->next;
2393 /* Wait for enqueuing to complete, if needed. */
2394 while (next == NULL && &list->next != tail) {
2395 schedule_timeout_interruptible(1);
2396 next = list->next;
2397 }
2398 debug_rcu_head_unqueue(list);
2399 local_bh_disable();
2400 if (__rcu_reclaim(rdp->rsp->name, list))
2401 cl++;
2402 c++;
2403 local_bh_enable();
2404 list = next;
2405 }
2406 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
2407 ACCESS_ONCE(rdp->nocb_p_count) -= c;
2408 ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl;
2409 rdp->n_nocbs_invoked += c;
2410 }
2411 return 0;
2412}
2413
2414/* Initialize per-rcu_data variables for no-CBs CPUs. */
2415static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2416{
2417 rdp->nocb_tail = &rdp->nocb_head;
2418 init_waitqueue_head(&rdp->nocb_wq);
2419}
2420
2421/* Create a kthread for each RCU flavor for each no-CBs CPU. */
2422static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2423{
2424 int cpu;
2425 struct rcu_data *rdp;
2426 struct task_struct *t;
2427
2428 if (rcu_nocb_mask == NULL)
2429 return;
2430 for_each_cpu(cpu, rcu_nocb_mask) {
2431 rdp = per_cpu_ptr(rsp->rda, cpu);
2432 t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu);
2433 BUG_ON(IS_ERR(t));
2434 ACCESS_ONCE(rdp->nocb_kthread) = t;
2435 }
2436}
2437
2438/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
2439static void init_nocb_callback_list(struct rcu_data *rdp)
2440{
2441 if (rcu_nocb_mask == NULL ||
2442 !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
2443 return;
2444 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2445}
2446
2447/* Initialize the ->call_remote fields in the rcu_state structures. */
2448static void __init rcu_init_nocb(void)
2449{
2450#ifdef CONFIG_PREEMPT_RCU
2451 rcu_preempt_state.call_remote = call_rcu_preempt_remote;
2452#endif /* #ifdef CONFIG_PREEMPT_RCU */
2453 rcu_bh_state.call_remote = call_rcu_bh_remote;
2454 rcu_sched_state.call_remote = call_rcu_sched_remote;
2455}
2456
2457#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2458
2459static bool is_nocb_cpu(int cpu)
2460{
2461 return false;
2462}
2463
2464static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2465 bool lazy)
2466{
2467 return 0;
2468}
2469
2470static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2471 struct rcu_data *rdp)
2472{
2473 return 0;
2474}
2475
2476static bool nocb_cpu_expendable(int cpu)
2477{
2478 return 1;
2479}
2480
2481static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2482{
2483}
2484
2485static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2486{
2487}
2488
2489static void init_nocb_callback_list(struct rcu_data *rdp)
2490{
2491}
2492
2493static void __init rcu_init_nocb(void)
2494{
2495}
2496
2497#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
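
As a usage illustration of the offloading support added above, a no-CBs configuration is requested on the kernel command line; the CPU list below is hypothetical, and because rcu_nocb_poll is a read-only module parameter of the built-in RCU tree code it is typically spelled with the rcutree. prefix:

	rcu_nocbs=1-7 rcutree.rcu_nocb_poll

CPU 0 is never offloaded (rcu_bootup_announce_oddness() clears it from rcu_nocb_mask), and rcu_spawn_nocb_kthreads() then creates an "rcuo<cpu>" kthread per RCU flavor for each remaining CPU in the mask, while init_nocb_callback_list() NULLs nxttail[RCU_NEXT_TAIL] so that __call_rcu() diverts callbacks to __call_rcu_nocb() instead of the normal per-CPU lists.
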
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 693513bc50e6..0d095dcaa670 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,29 +46,58 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49static int show_rcubarrier(struct seq_file *m, void *unused) 49#define ulong2long(a) (*(long *)(&(a)))
50
51static int r_open(struct inode *inode, struct file *file,
52 const struct seq_operations *op)
50{ 53{
51 struct rcu_state *rsp; 54 int ret = seq_open(file, op);
55 if (!ret) {
56 struct seq_file *m = (struct seq_file *)file->private_data;
57 m->private = inode->i_private;
58 }
59 return ret;
60}
61
62static void *r_start(struct seq_file *m, loff_t *pos)
63{
64 struct rcu_state *rsp = (struct rcu_state *)m->private;
65 *pos = cpumask_next(*pos - 1, cpu_possible_mask);
66 if ((*pos) < nr_cpu_ids)
67 return per_cpu_ptr(rsp->rda, *pos);
68 return NULL;
69}
52 70
53 for_each_rcu_flavor(rsp) 71static void *r_next(struct seq_file *m, void *v, loff_t *pos)
54 seq_printf(m, "%s: bcc: %d nbd: %lu\n", 72{
55 rsp->name, 73 (*pos)++;
56 atomic_read(&rsp->barrier_cpu_count), 74 return r_start(m, pos);
57 rsp->n_barrier_done); 75}
76
77static void r_stop(struct seq_file *m, void *v)
78{
79}
80
81static int show_rcubarrier(struct seq_file *m, void *v)
82{
83 struct rcu_state *rsp = (struct rcu_state *)m->private;
84 seq_printf(m, "bcc: %d nbd: %lu\n",
85 atomic_read(&rsp->barrier_cpu_count),
86 rsp->n_barrier_done);
58 return 0; 87 return 0;
59} 88}
60 89
61static int rcubarrier_open(struct inode *inode, struct file *file) 90static int rcubarrier_open(struct inode *inode, struct file *file)
62{ 91{
63 return single_open(file, show_rcubarrier, NULL); 92 return single_open(file, show_rcubarrier, inode->i_private);
64} 93}
65 94
66static const struct file_operations rcubarrier_fops = { 95static const struct file_operations rcubarrier_fops = {
67 .owner = THIS_MODULE, 96 .owner = THIS_MODULE,
68 .open = rcubarrier_open, 97 .open = rcubarrier_open,
69 .read = seq_read, 98 .read = seq_read,
70 .llseek = seq_lseek, 99 .llseek = no_llseek,
71 .release = single_release, 100 .release = seq_release,
72}; 101};
73 102
74#ifdef CONFIG_RCU_BOOST 103#ifdef CONFIG_RCU_BOOST
@@ -84,12 +113,14 @@ static char convert_kthread_status(unsigned int kthread_status)
84 113
85static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 114static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
86{ 115{
116 long ql, qll;
117
87 if (!rdp->beenonline) 118 if (!rdp->beenonline)
88 return; 119 return;
89 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d qp=%d", 120 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d",
90 rdp->cpu, 121 rdp->cpu,
91 cpu_is_offline(rdp->cpu) ? '!' : ' ', 122 cpu_is_offline(rdp->cpu) ? '!' : ' ',
92 rdp->completed, rdp->gpnum, 123 ulong2long(rdp->completed), ulong2long(rdp->gpnum),
93 rdp->passed_quiesce, rdp->qs_pending); 124 rdp->passed_quiesce, rdp->qs_pending);
94 seq_printf(m, " dt=%d/%llx/%d df=%lu", 125 seq_printf(m, " dt=%d/%llx/%d df=%lu",
95 atomic_read(&rdp->dynticks->dynticks), 126 atomic_read(&rdp->dynticks->dynticks),
@@ -97,8 +128,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
97 rdp->dynticks->dynticks_nmi_nesting, 128 rdp->dynticks->dynticks_nmi_nesting,
98 rdp->dynticks_fqs); 129 rdp->dynticks_fqs);
99 seq_printf(m, " of=%lu", rdp->offline_fqs); 130 seq_printf(m, " of=%lu", rdp->offline_fqs);
131 rcu_nocb_q_lengths(rdp, &ql, &qll);
132 qll += rdp->qlen_lazy;
133 ql += rdp->qlen;
100 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", 134 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
101 rdp->qlen_lazy, rdp->qlen, 135 qll, ql,
102 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 136 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
103 rdp->nxttail[RCU_NEXT_TAIL]], 137 rdp->nxttail[RCU_NEXT_TAIL]],
104 ".R"[rdp->nxttail[RCU_WAIT_TAIL] != 138 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
@@ -114,101 +148,67 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
114 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); 148 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
115#endif /* #ifdef CONFIG_RCU_BOOST */ 149#endif /* #ifdef CONFIG_RCU_BOOST */
116 seq_printf(m, " b=%ld", rdp->blimit); 150 seq_printf(m, " b=%ld", rdp->blimit);
117 seq_printf(m, " ci=%lu co=%lu ca=%lu\n", 151 seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n",
118 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 152 rdp->n_cbs_invoked, rdp->n_nocbs_invoked,
153 rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
119} 154}
120 155
121static int show_rcudata(struct seq_file *m, void *unused) 156static int show_rcudata(struct seq_file *m, void *v)
122{ 157{
123 int cpu; 158 print_one_rcu_data(m, (struct rcu_data *)v);
124 struct rcu_state *rsp;
125
126 for_each_rcu_flavor(rsp) {
127 seq_printf(m, "%s:\n", rsp->name);
128 for_each_possible_cpu(cpu)
129 print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu));
130 }
131 return 0; 159 return 0;
132} 160}
133 161
162static const struct seq_operations rcudate_op = {
163 .start = r_start,
164 .next = r_next,
165 .stop = r_stop,
166 .show = show_rcudata,
167};
168
134static int rcudata_open(struct inode *inode, struct file *file) 169static int rcudata_open(struct inode *inode, struct file *file)
135{ 170{
136 return single_open(file, show_rcudata, NULL); 171 return r_open(inode, file, &rcudate_op);
137} 172}
138 173
139static const struct file_operations rcudata_fops = { 174static const struct file_operations rcudata_fops = {
140 .owner = THIS_MODULE, 175 .owner = THIS_MODULE,
141 .open = rcudata_open, 176 .open = rcudata_open,
142 .read = seq_read, 177 .read = seq_read,
143 .llseek = seq_lseek, 178 .llseek = no_llseek,
144 .release = single_release, 179 .release = seq_release,
145}; 180};
146 181
147static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) 182static int show_rcuexp(struct seq_file *m, void *v)
148{
149 if (!rdp->beenonline)
150 return;
151 seq_printf(m, "%d,%s,%lu,%lu,%d,%d",
152 rdp->cpu,
153 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
154 rdp->completed, rdp->gpnum,
155 rdp->passed_quiesce, rdp->qs_pending);
156 seq_printf(m, ",%d,%llx,%d,%lu",
157 atomic_read(&rdp->dynticks->dynticks),
158 rdp->dynticks->dynticks_nesting,
159 rdp->dynticks->dynticks_nmi_nesting,
160 rdp->dynticks_fqs);
161 seq_printf(m, ",%lu", rdp->offline_fqs);
162 seq_printf(m, ",%ld,%ld,\"%c%c%c%c\"", rdp->qlen_lazy, rdp->qlen,
163 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
164 rdp->nxttail[RCU_NEXT_TAIL]],
165 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
166 rdp->nxttail[RCU_NEXT_READY_TAIL]],
167 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
168 rdp->nxttail[RCU_WAIT_TAIL]],
169 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
170#ifdef CONFIG_RCU_BOOST
171 seq_printf(m, ",%d,\"%c\"",
172 per_cpu(rcu_cpu_has_work, rdp->cpu),
173 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
174 rdp->cpu)));
175#endif /* #ifdef CONFIG_RCU_BOOST */
176 seq_printf(m, ",%ld", rdp->blimit);
177 seq_printf(m, ",%lu,%lu,%lu\n",
178 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
179}
180
181static int show_rcudata_csv(struct seq_file *m, void *unused)
182{ 183{
183 int cpu; 184 struct rcu_state *rsp = (struct rcu_state *)m->private;
184 struct rcu_state *rsp; 185
185 186 seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n",
186 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pq\","); 187 atomic_long_read(&rsp->expedited_start),
187 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 188 atomic_long_read(&rsp->expedited_done),
188 seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); 189 atomic_long_read(&rsp->expedited_wrap),
189#ifdef CONFIG_RCU_BOOST 190 atomic_long_read(&rsp->expedited_tryfail),
190 seq_puts(m, "\"kt\",\"ktl\""); 191 atomic_long_read(&rsp->expedited_workdone1),
191#endif /* #ifdef CONFIG_RCU_BOOST */ 192 atomic_long_read(&rsp->expedited_workdone2),
192 seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); 193 atomic_long_read(&rsp->expedited_normal),
193 for_each_rcu_flavor(rsp) { 194 atomic_long_read(&rsp->expedited_stoppedcpus),
194 seq_printf(m, "\"%s:\"\n", rsp->name); 195 atomic_long_read(&rsp->expedited_done_tries),
195 for_each_possible_cpu(cpu) 196 atomic_long_read(&rsp->expedited_done_lost),
196 print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu)); 197 atomic_long_read(&rsp->expedited_done_exit));
197 }
198 return 0; 198 return 0;
199} 199}
200 200
201static int rcudata_csv_open(struct inode *inode, struct file *file) 201static int rcuexp_open(struct inode *inode, struct file *file)
202{ 202{
203 return single_open(file, show_rcudata_csv, NULL); 203 return single_open(file, show_rcuexp, inode->i_private);
204} 204}
205 205
206static const struct file_operations rcudata_csv_fops = { 206static const struct file_operations rcuexp_fops = {
207 .owner = THIS_MODULE, 207 .owner = THIS_MODULE,
208 .open = rcudata_csv_open, 208 .open = rcuexp_open,
209 .read = seq_read, 209 .read = seq_read,
210 .llseek = seq_lseek, 210 .llseek = no_llseek,
211 .release = single_release, 211 .release = seq_release,
212}; 212};
213 213
214#ifdef CONFIG_RCU_BOOST 214#ifdef CONFIG_RCU_BOOST
@@ -254,27 +254,11 @@ static const struct file_operations rcu_node_boost_fops = {
254 .owner = THIS_MODULE, 254 .owner = THIS_MODULE,
255 .open = rcu_node_boost_open, 255 .open = rcu_node_boost_open,
256 .read = seq_read, 256 .read = seq_read,
257 .llseek = seq_lseek, 257 .llseek = no_llseek,
258 .release = single_release, 258 .release = single_release,
259}; 259};
260 260
261/* 261#endif /* #ifdef CONFIG_RCU_BOOST */
262 * Create the rcuboost debugfs entry. Standard error return.
263 */
264static int rcu_boost_trace_create_file(struct dentry *rcudir)
265{
266 return !debugfs_create_file("rcuboost", 0444, rcudir, NULL,
267 &rcu_node_boost_fops);
268}
269
270#else /* #ifdef CONFIG_RCU_BOOST */
271
272static int rcu_boost_trace_create_file(struct dentry *rcudir)
273{
274 return 0; /* There cannot be an error if we didn't create it! */
275}
276
277#endif /* #else #ifdef CONFIG_RCU_BOOST */
278 262
279static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 263static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
280{ 264{
@@ -283,8 +267,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
283 struct rcu_node *rnp; 267 struct rcu_node *rnp;
284 268
285 gpnum = rsp->gpnum; 269 gpnum = rsp->gpnum;
286 seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ", 270 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ",
287 rsp->name, rsp->completed, gpnum, rsp->fqs_state, 271 ulong2long(rsp->completed), ulong2long(gpnum),
272 rsp->fqs_state,
288 (long)(rsp->jiffies_force_qs - jiffies), 273 (long)(rsp->jiffies_force_qs - jiffies),
289 (int)(jiffies & 0xffff)); 274 (int)(jiffies & 0xffff));
290 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", 275 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
@@ -306,26 +291,24 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
306 seq_puts(m, "\n"); 291 seq_puts(m, "\n");
307} 292}
308 293
309static int show_rcuhier(struct seq_file *m, void *unused) 294static int show_rcuhier(struct seq_file *m, void *v)
310{ 295{
311 struct rcu_state *rsp; 296 struct rcu_state *rsp = (struct rcu_state *)m->private;
312 297 print_one_rcu_state(m, rsp);
313 for_each_rcu_flavor(rsp)
314 print_one_rcu_state(m, rsp);
315 return 0; 298 return 0;
316} 299}
317 300
318static int rcuhier_open(struct inode *inode, struct file *file) 301static int rcuhier_open(struct inode *inode, struct file *file)
319{ 302{
320 return single_open(file, show_rcuhier, NULL); 303 return single_open(file, show_rcuhier, inode->i_private);
321} 304}
322 305
323static const struct file_operations rcuhier_fops = { 306static const struct file_operations rcuhier_fops = {
324 .owner = THIS_MODULE, 307 .owner = THIS_MODULE,
325 .open = rcuhier_open, 308 .open = rcuhier_open,
326 .read = seq_read, 309 .read = seq_read,
327 .llseek = seq_lseek, 310 .llseek = no_llseek,
328 .release = single_release, 311 .release = seq_release,
329}; 312};
330 313
331static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) 314static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
@@ -338,42 +321,42 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
338 struct rcu_node *rnp = &rsp->node[0]; 321 struct rcu_node *rnp = &rsp->node[0];
339 322
340 raw_spin_lock_irqsave(&rnp->lock, flags); 323 raw_spin_lock_irqsave(&rnp->lock, flags);
341 completed = rsp->completed; 324 completed = ACCESS_ONCE(rsp->completed);
342 gpnum = rsp->gpnum; 325 gpnum = ACCESS_ONCE(rsp->gpnum);
343 if (rsp->completed == rsp->gpnum) 326 if (completed == gpnum)
344 gpage = 0; 327 gpage = 0;
345 else 328 else
346 gpage = jiffies - rsp->gp_start; 329 gpage = jiffies - rsp->gp_start;
347 gpmax = rsp->gp_max; 330 gpmax = rsp->gp_max;
348 raw_spin_unlock_irqrestore(&rnp->lock, flags); 331 raw_spin_unlock_irqrestore(&rnp->lock, flags);
349 seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n", 332 seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n",
350 rsp->name, completed, gpnum, gpage, gpmax); 333 ulong2long(completed), ulong2long(gpnum), gpage, gpmax);
351} 334}
352 335
353static int show_rcugp(struct seq_file *m, void *unused) 336static int show_rcugp(struct seq_file *m, void *v)
354{ 337{
355 struct rcu_state *rsp; 338 struct rcu_state *rsp = (struct rcu_state *)m->private;
356 339 show_one_rcugp(m, rsp);
357 for_each_rcu_flavor(rsp)
358 show_one_rcugp(m, rsp);
359 return 0; 340 return 0;
360} 341}
361 342
362static int rcugp_open(struct inode *inode, struct file *file) 343static int rcugp_open(struct inode *inode, struct file *file)
363{ 344{
364 return single_open(file, show_rcugp, NULL); 345 return single_open(file, show_rcugp, inode->i_private);
365} 346}
366 347
367static const struct file_operations rcugp_fops = { 348static const struct file_operations rcugp_fops = {
368 .owner = THIS_MODULE, 349 .owner = THIS_MODULE,
369 .open = rcugp_open, 350 .open = rcugp_open,
370 .read = seq_read, 351 .read = seq_read,
371 .llseek = seq_lseek, 352 .llseek = no_llseek,
372 .release = single_release, 353 .release = seq_release,
373}; 354};
374 355
375static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 356static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
376{ 357{
358 if (!rdp->beenonline)
359 return;
377 seq_printf(m, "%3d%cnp=%ld ", 360 seq_printf(m, "%3d%cnp=%ld ",
378 rdp->cpu, 361 rdp->cpu,
379 cpu_is_offline(rdp->cpu) ? '!' : ' ', 362 cpu_is_offline(rdp->cpu) ? '!' : ' ',
@@ -389,34 +372,30 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
389 rdp->n_rp_need_nothing); 372 rdp->n_rp_need_nothing);
390} 373}
391 374
392static int show_rcu_pending(struct seq_file *m, void *unused) 375static int show_rcu_pending(struct seq_file *m, void *v)
393{ 376{
394 int cpu; 377 print_one_rcu_pending(m, (struct rcu_data *)v);
395 struct rcu_data *rdp;
396 struct rcu_state *rsp;
397
398 for_each_rcu_flavor(rsp) {
399 seq_printf(m, "%s:\n", rsp->name);
400 for_each_possible_cpu(cpu) {
401 rdp = per_cpu_ptr(rsp->rda, cpu);
402 if (rdp->beenonline)
403 print_one_rcu_pending(m, rdp);
404 }
405 }
406 return 0; 378 return 0;
407} 379}
408 380
381static const struct seq_operations rcu_pending_op = {
382 .start = r_start,
383 .next = r_next,
384 .stop = r_stop,
385 .show = show_rcu_pending,
386};
387
409static int rcu_pending_open(struct inode *inode, struct file *file) 388static int rcu_pending_open(struct inode *inode, struct file *file)
410{ 389{
411 return single_open(file, show_rcu_pending, NULL); 390 return r_open(inode, file, &rcu_pending_op);
412} 391}
413 392
414static const struct file_operations rcu_pending_fops = { 393static const struct file_operations rcu_pending_fops = {
415 .owner = THIS_MODULE, 394 .owner = THIS_MODULE,
416 .open = rcu_pending_open, 395 .open = rcu_pending_open,
417 .read = seq_read, 396 .read = seq_read,
418 .llseek = seq_lseek, 397 .llseek = no_llseek,
419 .release = single_release, 398 .release = seq_release,
420}; 399};
421 400
422static int show_rcutorture(struct seq_file *m, void *unused) 401static int show_rcutorture(struct seq_file *m, void *unused)
@@ -446,43 +425,58 @@ static struct dentry *rcudir;
446 425
447static int __init rcutree_trace_init(void) 426static int __init rcutree_trace_init(void)
448{ 427{
428 struct rcu_state *rsp;
449 struct dentry *retval; 429 struct dentry *retval;
430 struct dentry *rspdir;
450 431
451 rcudir = debugfs_create_dir("rcu", NULL); 432 rcudir = debugfs_create_dir("rcu", NULL);
452 if (!rcudir) 433 if (!rcudir)
453 goto free_out; 434 goto free_out;
454 435
455 retval = debugfs_create_file("rcubarrier", 0444, rcudir, 436 for_each_rcu_flavor(rsp) {
456 NULL, &rcubarrier_fops); 437 rspdir = debugfs_create_dir(rsp->name, rcudir);
457 if (!retval) 438 if (!rspdir)
458 goto free_out; 439 goto free_out;
459 440
460 retval = debugfs_create_file("rcudata", 0444, rcudir, 441 retval = debugfs_create_file("rcudata", 0444,
461 NULL, &rcudata_fops); 442 rspdir, rsp, &rcudata_fops);
462 if (!retval) 443 if (!retval)
463 goto free_out; 444 goto free_out;
464 445
465 retval = debugfs_create_file("rcudata.csv", 0444, rcudir, 446 retval = debugfs_create_file("rcuexp", 0444,
466 NULL, &rcudata_csv_fops); 447 rspdir, rsp, &rcuexp_fops);
467 if (!retval) 448 if (!retval)
468 goto free_out; 449 goto free_out;
469 450
470 if (rcu_boost_trace_create_file(rcudir)) 451 retval = debugfs_create_file("rcu_pending", 0444,
471 goto free_out; 452 rspdir, rsp, &rcu_pending_fops);
453 if (!retval)
454 goto free_out;
455
456 retval = debugfs_create_file("rcubarrier", 0444,
457 rspdir, rsp, &rcubarrier_fops);
458 if (!retval)
459 goto free_out;
472 460
473 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); 461#ifdef CONFIG_RCU_BOOST
474 if (!retval) 462 if (rsp == &rcu_preempt_state) {
475 goto free_out; 463 retval = debugfs_create_file("rcuboost", 0444,
464 rspdir, NULL, &rcu_node_boost_fops);
465 if (!retval)
466 goto free_out;
467 }
468#endif
476 469
477 retval = debugfs_create_file("rcuhier", 0444, rcudir, 470 retval = debugfs_create_file("rcugp", 0444,
478 NULL, &rcuhier_fops); 471 rspdir, rsp, &rcugp_fops);
479 if (!retval) 472 if (!retval)
480 goto free_out; 473 goto free_out;
481 474
482 retval = debugfs_create_file("rcu_pending", 0444, rcudir, 475 retval = debugfs_create_file("rcuhier", 0444,
483 NULL, &rcu_pending_fops); 476 rspdir, rsp, &rcuhier_fops);
484 if (!retval) 477 if (!retval)
485 goto free_out; 478 goto free_out;
479 }
486 480
487 retval = debugfs_create_file("rcutorture", 0444, rcudir, 481 retval = debugfs_create_file("rcutorture", 0444, rcudir,
488 NULL, &rcutorture_fops); 482 NULL, &rcutorture_fops);
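
With rcutree_trace_init() now creating a debugfs directory per RCU flavor and passing the rcu_state pointer through inode->i_private, the tracing files move from one flat directory to a per-flavor layout roughly like the following (flavor names depend on the configuration, rcu_preempt existing only with CONFIG_TREE_PREEMPT_RCU, and debugfs assumed mounted at /sys/kernel/debug):

	/sys/kernel/debug/rcu/rcu_sched/rcudata
	/sys/kernel/debug/rcu/rcu_sched/rcuexp
	/sys/kernel/debug/rcu/rcu_sched/rcugp
	/sys/kernel/debug/rcu/rcu_preempt/rcubarrier
	/sys/kernel/debug/rcu/rcutorture

Only rcutorture stays at the top level; the per-CPU files (rcudata and rcu_pending) are now backed by seq_operations that walk cpu_possible_mask via r_start()/r_next() rather than a single_open() dump of every flavor, and the rcudata.csv file goes away along with print_one_rcu_data_csv().
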
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index ad581aa2369a..3920d593e63c 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -192,25 +192,3 @@ int res_counter_memparse_write_strategy(const char *buf,
192 *res = PAGE_ALIGN(*res); 192 *res = PAGE_ALIGN(*res);
193 return 0; 193 return 0;
194} 194}
195
196int res_counter_write(struct res_counter *counter, int member,
197 const char *buf, write_strategy_fn write_strategy)
198{
199 char *end;
200 unsigned long flags;
201 unsigned long long tmp, *val;
202
203 if (write_strategy) {
204 if (write_strategy(buf, &tmp))
205 return -EINVAL;
206 } else {
207 tmp = simple_strtoull(buf, &end, 10);
208 if (*end != '\0')
209 return -EINVAL;
210 }
211 spin_lock_irqsave(&counter->lock, flags);
212 val = res_counter_member(counter, member);
213 *val = tmp;
214 spin_unlock_irqrestore(&counter->lock, flags);
215 return 0;
216}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2f5eb1838b3e..257002c13bb0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -72,6 +72,7 @@
72#include <linux/slab.h> 72#include <linux/slab.h>
73#include <linux/init_task.h> 73#include <linux/init_task.h>
74#include <linux/binfmts.h> 74#include <linux/binfmts.h>
75#include <linux/context_tracking.h>
75 76
76#include <asm/switch_to.h> 77#include <asm/switch_to.h>
77#include <asm/tlb.h> 78#include <asm/tlb.h>
@@ -192,23 +193,10 @@ static void sched_feat_disable(int i) { };
192static void sched_feat_enable(int i) { }; 193static void sched_feat_enable(int i) { };
193#endif /* HAVE_JUMP_LABEL */ 194#endif /* HAVE_JUMP_LABEL */
194 195
195static ssize_t 196static int sched_feat_set(char *cmp)
196sched_feat_write(struct file *filp, const char __user *ubuf,
197 size_t cnt, loff_t *ppos)
198{ 197{
199 char buf[64];
200 char *cmp;
201 int neg = 0;
202 int i; 198 int i;
203 199 int neg = 0;
204 if (cnt > 63)
205 cnt = 63;
206
207 if (copy_from_user(&buf, ubuf, cnt))
208 return -EFAULT;
209
210 buf[cnt] = 0;
211 cmp = strstrip(buf);
212 200
213 if (strncmp(cmp, "NO_", 3) == 0) { 201 if (strncmp(cmp, "NO_", 3) == 0) {
214 neg = 1; 202 neg = 1;
@@ -228,6 +216,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
228 } 216 }
229 } 217 }
230 218
219 return i;
220}
221
222static ssize_t
223sched_feat_write(struct file *filp, const char __user *ubuf,
224 size_t cnt, loff_t *ppos)
225{
226 char buf[64];
227 char *cmp;
228 int i;
229
230 if (cnt > 63)
231 cnt = 63;
232
233 if (copy_from_user(&buf, ubuf, cnt))
234 return -EFAULT;
235
236 buf[cnt] = 0;
237 cmp = strstrip(buf);
238
239 i = sched_feat_set(cmp);
231 if (i == __SCHED_FEAT_NR) 240 if (i == __SCHED_FEAT_NR)
232 return -EINVAL; 241 return -EINVAL;
233 242
@@ -922,6 +931,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
922 rq->skip_clock_update = 1; 931 rq->skip_clock_update = 1;
923} 932}
924 933
934static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
935
936void register_task_migration_notifier(struct notifier_block *n)
937{
938 atomic_notifier_chain_register(&task_migration_notifier, n);
939}
940
925#ifdef CONFIG_SMP 941#ifdef CONFIG_SMP
926void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 942void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
927{ 943{
@@ -952,8 +968,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
952 trace_sched_migrate_task(p, new_cpu); 968 trace_sched_migrate_task(p, new_cpu);
953 969
954 if (task_cpu(p) != new_cpu) { 970 if (task_cpu(p) != new_cpu) {
971 struct task_migration_notifier tmn;
972
973 if (p->sched_class->migrate_task_rq)
974 p->sched_class->migrate_task_rq(p, new_cpu);
955 p->se.nr_migrations++; 975 p->se.nr_migrations++;
956 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); 976 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
977
978 tmn.task = p;
979 tmn.from_cpu = task_cpu(p);
980 tmn.to_cpu = new_cpu;
981
982 atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
957 } 983 }
958 984
959 __set_task_cpu(p, new_cpu); 985 __set_task_cpu(p, new_cpu);
@@ -1524,6 +1550,15 @@ static void __sched_fork(struct task_struct *p)
1524 p->se.vruntime = 0; 1550 p->se.vruntime = 0;
1525 INIT_LIST_HEAD(&p->se.group_node); 1551 INIT_LIST_HEAD(&p->se.group_node);
1526 1552
1553/*
1554 * Load tracking only depends on SMP; the FAIR_GROUP_SCHED dependency below
1555 * may be removed once load tracking is useful beyond shares distribution
1556 * (e.g. load balancing).
1557 */
1558#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1559 p->se.avg.runnable_avg_period = 0;
1560 p->se.avg.runnable_avg_sum = 0;
1561#endif
1527#ifdef CONFIG_SCHEDSTATS 1562#ifdef CONFIG_SCHEDSTATS
1528 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1563 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1529#endif 1564#endif
@@ -1533,8 +1568,41 @@ static void __sched_fork(struct task_struct *p)
1533#ifdef CONFIG_PREEMPT_NOTIFIERS 1568#ifdef CONFIG_PREEMPT_NOTIFIERS
1534 INIT_HLIST_HEAD(&p->preempt_notifiers); 1569 INIT_HLIST_HEAD(&p->preempt_notifiers);
1535#endif 1570#endif
1571
1572#ifdef CONFIG_NUMA_BALANCING
1573 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1574 p->mm->numa_next_scan = jiffies;
1575 p->mm->numa_next_reset = jiffies;
1576 p->mm->numa_scan_seq = 0;
1577 }
1578
1579 p->node_stamp = 0ULL;
1580 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1581 p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
1582 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1583 p->numa_work.next = &p->numa_work;
1584#endif /* CONFIG_NUMA_BALANCING */
1536} 1585}
1537 1586
1587#ifdef CONFIG_NUMA_BALANCING
1588#ifdef CONFIG_SCHED_DEBUG
1589void set_numabalancing_state(bool enabled)
1590{
1591 if (enabled)
1592 sched_feat_set("NUMA");
1593 else
1594 sched_feat_set("NO_NUMA");
1595}
1596#else
1597__read_mostly bool numabalancing_enabled;
1598
1599void set_numabalancing_state(bool enabled)
1600{
1601 numabalancing_enabled = enabled;
1602}
1603#endif /* CONFIG_SCHED_DEBUG */
1604#endif /* CONFIG_NUMA_BALANCING */
1605
1538/* 1606/*
1539 * fork()/clone()-time setup: 1607 * fork()/clone()-time setup:
1540 */ 1608 */
@@ -1886,8 +1954,8 @@ context_switch(struct rq *rq, struct task_struct *prev,
1886 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 1954 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1887#endif 1955#endif
1888 1956
1957 context_tracking_task_switch(prev, next);
1889 /* Here we just switch the register state and the stack. */ 1958 /* Here we just switch the register state and the stack. */
1890 rcu_switch(prev, next);
1891 switch_to(prev, next, prev); 1959 switch_to(prev, next, prev);
1892 1960
1893 barrier(); 1961 barrier();
@@ -2911,7 +2979,7 @@ asmlinkage void __sched schedule(void)
2911} 2979}
2912EXPORT_SYMBOL(schedule); 2980EXPORT_SYMBOL(schedule);
2913 2981
2914#ifdef CONFIG_RCU_USER_QS 2982#ifdef CONFIG_CONTEXT_TRACKING
2915asmlinkage void __sched schedule_user(void) 2983asmlinkage void __sched schedule_user(void)
2916{ 2984{
2917 /* 2985 /*
@@ -2920,9 +2988,9 @@ asmlinkage void __sched schedule_user(void)
2920 * we haven't yet exited the RCU idle mode. Do it here manually until 2988 * we haven't yet exited the RCU idle mode. Do it here manually until
2921 * we find a better solution. 2989 * we find a better solution.
2922 */ 2990 */
2923 rcu_user_exit(); 2991 user_exit();
2924 schedule(); 2992 schedule();
2925 rcu_user_enter(); 2993 user_enter();
2926} 2994}
2927#endif 2995#endif
2928 2996
@@ -3027,7 +3095,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
3027 /* Catch callers which need to be fixed */ 3095 /* Catch callers which need to be fixed */
3028 BUG_ON(ti->preempt_count || !irqs_disabled()); 3096 BUG_ON(ti->preempt_count || !irqs_disabled());
3029 3097
3030 rcu_user_exit(); 3098 user_exit();
3031 do { 3099 do {
3032 add_preempt_count(PREEMPT_ACTIVE); 3100 add_preempt_count(PREEMPT_ACTIVE);
3033 local_irq_enable(); 3101 local_irq_enable();
@@ -4480,6 +4548,7 @@ static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
4480void sched_show_task(struct task_struct *p) 4548void sched_show_task(struct task_struct *p)
4481{ 4549{
4482 unsigned long free = 0; 4550 unsigned long free = 0;
4551 int ppid;
4483 unsigned state; 4552 unsigned state;
4484 4553
4485 state = p->state ? __ffs(p->state) + 1 : 0; 4554 state = p->state ? __ffs(p->state) + 1 : 0;
@@ -4499,8 +4568,11 @@ void sched_show_task(struct task_struct *p)
4499#ifdef CONFIG_DEBUG_STACK_USAGE 4568#ifdef CONFIG_DEBUG_STACK_USAGE
4500 free = stack_not_used(p); 4569 free = stack_not_used(p);
4501#endif 4570#endif
4571 rcu_read_lock();
4572 ppid = task_pid_nr(rcu_dereference(p->real_parent));
4573 rcu_read_unlock();
4502 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4574 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4503 task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), 4575 task_pid_nr(p), ppid,
4504 (unsigned long)task_thread_info(p)->flags); 4576 (unsigned long)task_thread_info(p)->flags);
4505 4577
4506 show_stack(p, NULL); 4578 show_stack(p, NULL);
@@ -7474,7 +7546,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7474 struct task_group, css); 7546 struct task_group, css);
7475} 7547}
7476 7548
7477static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) 7549static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7478{ 7550{
7479 struct task_group *tg, *parent; 7551 struct task_group *tg, *parent;
7480 7552
@@ -7491,7 +7563,7 @@ static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
7491 return &tg->css; 7563 return &tg->css;
7492} 7564}
7493 7565
7494static void cpu_cgroup_destroy(struct cgroup *cgrp) 7566static void cpu_cgroup_css_free(struct cgroup *cgrp)
7495{ 7567{
7496 struct task_group *tg = cgroup_tg(cgrp); 7568 struct task_group *tg = cgroup_tg(cgrp);
7497 7569
@@ -7851,8 +7923,8 @@ static struct cftype cpu_files[] = {
7851 7923
7852struct cgroup_subsys cpu_cgroup_subsys = { 7924struct cgroup_subsys cpu_cgroup_subsys = {
7853 .name = "cpu", 7925 .name = "cpu",
7854 .create = cpu_cgroup_create, 7926 .css_alloc = cpu_cgroup_css_alloc,
7855 .destroy = cpu_cgroup_destroy, 7927 .css_free = cpu_cgroup_css_free,
7856 .can_attach = cpu_cgroup_can_attach, 7928 .can_attach = cpu_cgroup_can_attach,
7857 .attach = cpu_cgroup_attach, 7929 .attach = cpu_cgroup_attach,
7858 .exit = cpu_cgroup_exit, 7930 .exit = cpu_cgroup_exit,
@@ -7875,7 +7947,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7875struct cpuacct root_cpuacct; 7947struct cpuacct root_cpuacct;
7876 7948
7877/* create a new cpu accounting group */ 7949/* create a new cpu accounting group */
7878static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) 7950static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
7879{ 7951{
7880 struct cpuacct *ca; 7952 struct cpuacct *ca;
7881 7953
@@ -7905,7 +7977,7 @@ out:
7905} 7977}
7906 7978
7907/* destroy an existing cpu accounting group */ 7979/* destroy an existing cpu accounting group */
7908static void cpuacct_destroy(struct cgroup *cgrp) 7980static void cpuacct_css_free(struct cgroup *cgrp)
7909{ 7981{
7910 struct cpuacct *ca = cgroup_ca(cgrp); 7982 struct cpuacct *ca = cgroup_ca(cgrp);
7911 7983
@@ -8076,9 +8148,15 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8076 8148
8077struct cgroup_subsys cpuacct_subsys = { 8149struct cgroup_subsys cpuacct_subsys = {
8078 .name = "cpuacct", 8150 .name = "cpuacct",
8079 .create = cpuacct_create, 8151 .css_alloc = cpuacct_css_alloc,
8080 .destroy = cpuacct_destroy, 8152 .css_free = cpuacct_css_free,
8081 .subsys_id = cpuacct_subsys_id, 8153 .subsys_id = cpuacct_subsys_id,
8082 .base_cftypes = files, 8154 .base_cftypes = files,
8083}; 8155};
8084#endif /* CONFIG_CGROUP_CPUACCT */ 8156#endif /* CONFIG_CGROUP_CPUACCT */
8157
8158void dump_cpu_task(int cpu)
8159{
8160 pr_info("Task dump for CPU %d:\n", cpu);
8161 sched_show_task(cpu_curr(cpu));
8162}
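
The task_migration_notifier chain added around set_task_cpu() above is consumed with an ordinary notifier block. A minimal sketch of a hypothetical client that only logs each migration (the struct task_migration_notifier payload carries the task, from_cpu and to_cpu fields filled in above):

	/* Sketch only: hypothetical consumer of the migration notifier chain. */
	static int log_migration(struct notifier_block *nb,
				 unsigned long action, void *data)
	{
		struct task_migration_notifier *tmn = data;

		pr_debug("pid %d migrated: CPU %d -> CPU %d\n",
			 task_pid_nr(tmn->task), tmn->from_cpu, tmn->to_cpu);
		return NOTIFY_OK;
	}

	static struct notifier_block log_migration_nb = {
		.notifier_call = log_migration,
	};

	/* At init time: register_task_migration_notifier(&log_migration_nb); */
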
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 81b763ba58a6..293b202fcf79 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -43,7 +43,7 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq);
43 * Called before incrementing preempt_count on {soft,}irq_enter 43 * Called before incrementing preempt_count on {soft,}irq_enter
44 * and before decrementing preempt_count on {soft,}irq_exit. 44 * and before decrementing preempt_count on {soft,}irq_exit.
45 */ 45 */
46void vtime_account(struct task_struct *curr) 46void irqtime_account_irq(struct task_struct *curr)
47{ 47{
48 unsigned long flags; 48 unsigned long flags;
49 s64 delta; 49 s64 delta;
@@ -73,7 +73,7 @@ void vtime_account(struct task_struct *curr)
73 irq_time_write_end(); 73 irq_time_write_end();
74 local_irq_restore(flags); 74 local_irq_restore(flags);
75} 75}
76EXPORT_SYMBOL_GPL(vtime_account); 76EXPORT_SYMBOL_GPL(irqtime_account_irq);
77 77
78static int irqtime_account_hi_update(void) 78static int irqtime_account_hi_update(void)
79{ 79{
@@ -288,6 +288,34 @@ static __always_inline bool steal_account_process_tick(void)
288 return false; 288 return false;
289} 289}
290 290
291/*
292 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
293 * tasks (sum on group iteration) belonging to @tsk's group.
294 */
295void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
296{
297 struct signal_struct *sig = tsk->signal;
298 struct task_struct *t;
299
300 times->utime = sig->utime;
301 times->stime = sig->stime;
302 times->sum_exec_runtime = sig->sum_sched_runtime;
303
304 rcu_read_lock();
305 /* make sure we can trust tsk->thread_group list */
306 if (!likely(pid_alive(tsk)))
307 goto out;
308
309 t = tsk;
310 do {
311 times->utime += t->utime;
312 times->stime += t->stime;
313 times->sum_exec_runtime += task_sched_runtime(t);
314 } while_each_thread(tsk, t);
315out:
316 rcu_read_unlock();
317}
318
291#ifndef CONFIG_VIRT_CPU_ACCOUNTING 319#ifndef CONFIG_VIRT_CPU_ACCOUNTING
292 320
293#ifdef CONFIG_IRQ_TIME_ACCOUNTING 321#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -417,13 +445,13 @@ void account_idle_ticks(unsigned long ticks)
417 * Use precise platform statistics if available: 445 * Use precise platform statistics if available:
418 */ 446 */
419#ifdef CONFIG_VIRT_CPU_ACCOUNTING 447#ifdef CONFIG_VIRT_CPU_ACCOUNTING
420void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 448void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
421{ 449{
422 *ut = p->utime; 450 *ut = p->utime;
423 *st = p->stime; 451 *st = p->stime;
424} 452}
425 453
426void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 454void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
427{ 455{
428 struct task_cputime cputime; 456 struct task_cputime cputime;
429 457
@@ -433,6 +461,29 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
433 *st = cputime.stime; 461 *st = cputime.stime;
434} 462}
435 463
464void vtime_account_system_irqsafe(struct task_struct *tsk)
465{
466 unsigned long flags;
467
468 local_irq_save(flags);
469 vtime_account_system(tsk);
470 local_irq_restore(flags);
471}
472EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
473
474#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
475void vtime_task_switch(struct task_struct *prev)
476{
477 if (is_idle_task(prev))
478 vtime_account_idle(prev);
479 else
480 vtime_account_system(prev);
481
482 vtime_account_user(prev);
483 arch_vtime_task_switch(prev);
484}
485#endif
486
436/* 487/*
437 * Archs that account the whole time spent in the idle task 488 * Archs that account the whole time spent in the idle task
438 * (outside irq) as idle time can rely on this and just implement 489 * (outside irq) as idle time can rely on this and just implement
@@ -444,16 +495,10 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
444#ifndef __ARCH_HAS_VTIME_ACCOUNT 495#ifndef __ARCH_HAS_VTIME_ACCOUNT
445void vtime_account(struct task_struct *tsk) 496void vtime_account(struct task_struct *tsk)
446{ 497{
447 unsigned long flags;
448
449 local_irq_save(flags);
450
451 if (in_interrupt() || !is_idle_task(tsk)) 498 if (in_interrupt() || !is_idle_task(tsk))
452 vtime_account_system(tsk); 499 vtime_account_system(tsk);
453 else 500 else
454 vtime_account_idle(tsk); 501 vtime_account_idle(tsk);
455
456 local_irq_restore(flags);
457} 502}
458EXPORT_SYMBOL_GPL(vtime_account); 503EXPORT_SYMBOL_GPL(vtime_account);
459#endif /* __ARCH_HAS_VTIME_ACCOUNT */ 504#endif /* __ARCH_HAS_VTIME_ACCOUNT */
@@ -478,14 +523,30 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
478 return (__force cputime_t) temp; 523 return (__force cputime_t) temp;
479} 524}
480 525
481void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 526/*
527 * Adjust the imprecise tick-based cputime against the scheduler's
528 * precise runtime accounting.
529 */
530static void cputime_adjust(struct task_cputime *curr,
531 struct cputime *prev,
532 cputime_t *ut, cputime_t *st)
482{ 533{
483 cputime_t rtime, utime = p->utime, total = utime + p->stime; 534 cputime_t rtime, utime, total;
535
536 utime = curr->utime;
537 total = utime + curr->stime;
484 538
485 /* 539 /*
486 * Use CFS's precise accounting: 540 * Tick based cputime accounting depends on random scheduling
541 * timeslices of a task to be interrupted or not by the timer.
542 * Depending on these circumstances, the number of these interrupts
543 * may be over or under-optimistic, matching the real user and system
544 * cputime with a variable precision.
545 *
546 * Fix this by scaling these tick based values against the total
547 * runtime accounted by the CFS scheduler.
487 */ 548 */
488 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 549 rtime = nsecs_to_cputime(curr->sum_exec_runtime);
489 550
490 if (total) 551 if (total)
491 utime = scale_utime(utime, rtime, total); 552 utime = scale_utime(utime, rtime, total);
@@ -493,38 +554,36 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
493 utime = rtime; 554 utime = rtime;
494 555
495 /* 556 /*
496 * Compare with previous values, to keep monotonicity: 557 * If the tick based count grows faster than the scheduler one,
558 * the result of the scaling may go backward.
559 * Let's enforce monotonicity.
497 */ 560 */
498 p->prev_utime = max(p->prev_utime, utime); 561 prev->utime = max(prev->utime, utime);
499 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); 562 prev->stime = max(prev->stime, rtime - prev->utime);
500 563
501 *ut = p->prev_utime; 564 *ut = prev->utime;
502 *st = p->prev_stime; 565 *st = prev->stime;
566}
567
568void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
569{
570 struct task_cputime cputime = {
571 .utime = p->utime,
572 .stime = p->stime,
573 .sum_exec_runtime = p->se.sum_exec_runtime,
574 };
575
576 cputime_adjust(&cputime, &p->prev_cputime, ut, st);
503} 577}
504 578
505/* 579/*
506 * Must be called with siglock held. 580 * Must be called with siglock held.
507 */ 581 */
508void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 582void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
509{ 583{
510 struct signal_struct *sig = p->signal;
511 struct task_cputime cputime; 584 struct task_cputime cputime;
512 cputime_t rtime, utime, total;
513 585
514 thread_group_cputime(p, &cputime); 586 thread_group_cputime(p, &cputime);
515 587 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
516 total = cputime.utime + cputime.stime;
517 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
518
519 if (total)
520 utime = scale_utime(cputime.utime, rtime, total);
521 else
522 utime = rtime;
523
524 sig->prev_utime = max(sig->prev_utime, utime);
525 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
526
527 *ut = sig->prev_utime;
528 *st = sig->prev_stime;
529} 588}
530#endif 589#endif
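The cputime_adjust() logic introduced above is easy to exercise outside the kernel. Below is a minimal user-space sketch of the same scaling and monotonicity rules, using plain integers instead of cputime_t; all names (toy_prev, toy_cputime_adjust) are illustrative, not kernel API.

#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-in for the kernel's prev_cputime bookkeeping. */
struct toy_prev { uint64_t utime, stime; };

static uint64_t max_u64(uint64_t a, uint64_t b) { return a > b ? a : b; }

/* Scale tick-based utime/stime so they sum to the precisely accounted rtime,
 * then enforce monotonicity against the previously reported values. */
static void toy_cputime_adjust(uint64_t utime, uint64_t stime, uint64_t rtime,
                               struct toy_prev *prev,
                               uint64_t *ut, uint64_t *st)
{
        uint64_t total = utime + stime;

        if (total)
                utime = (utime * rtime) / total;   /* cf. scale_utime() */
        else
                utime = rtime;

        prev->utime = max_u64(prev->utime, utime);
        prev->stime = max_u64(prev->stime, rtime - prev->utime);

        *ut = prev->utime;
        *st = prev->stime;
}

int main(void)
{
        struct toy_prev prev = { 0, 0 };
        uint64_t ut, st;

        /* 40 user ticks, 10 system ticks, but 60 units of real runtime. */
        toy_cputime_adjust(40, 10, 60, &prev, &ut, &st);
        printf("ut=%llu st=%llu\n", (unsigned long long)ut, (unsigned long long)st);

        /* A later sample whose scaled utime would go backward is clamped. */
        toy_cputime_adjust(41, 30, 70, &prev, &ut, &st);
        printf("ut=%llu st=%llu\n", (unsigned long long)ut, (unsigned long long)st);
        return 0;
}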
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 6f79596e0ea9..2cd3c1b4e582 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec)
61static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) 61static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
62{ 62{
63 struct sched_entity *se = tg->se[cpu]; 63 struct sched_entity *se = tg->se[cpu];
64 if (!se)
65 return;
66 64
67#define P(F) \ 65#define P(F) \
68 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) 66 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
69#define PN(F) \ 67#define PN(F) \
70 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) 68 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
71 69
70 if (!se) {
71 struct sched_avg *avg = &cpu_rq(cpu)->avg;
72 P(avg->runnable_avg_sum);
73 P(avg->runnable_avg_period);
74 return;
75 }
76
77
72 PN(se->exec_start); 78 PN(se->exec_start);
73 PN(se->vruntime); 79 PN(se->vruntime);
74 PN(se->sum_exec_runtime); 80 PN(se->sum_exec_runtime);
@@ -85,6 +91,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
85 P(se->statistics.wait_count); 91 P(se->statistics.wait_count);
86#endif 92#endif
87 P(se->load.weight); 93 P(se->load.weight);
94#ifdef CONFIG_SMP
95 P(se->avg.runnable_avg_sum);
96 P(se->avg.runnable_avg_period);
97 P(se->avg.load_avg_contrib);
98 P(se->avg.decay_count);
99#endif
88#undef PN 100#undef PN
89#undef P 101#undef P
90} 102}
@@ -206,14 +218,18 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 218 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207#ifdef CONFIG_FAIR_GROUP_SCHED 219#ifdef CONFIG_FAIR_GROUP_SCHED
208#ifdef CONFIG_SMP 220#ifdef CONFIG_SMP
209 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", 221 SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg",
210 SPLIT_NS(cfs_rq->load_avg)); 222 cfs_rq->runnable_load_avg);
211 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", 223 SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg",
212 SPLIT_NS(cfs_rq->load_period)); 224 cfs_rq->blocked_load_avg);
213 SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", 225 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
214 cfs_rq->load_contribution); 226 atomic64_read(&cfs_rq->tg->load_avg));
215 SEQ_printf(m, " .%-30s: %d\n", "load_tg", 227 SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
216 atomic_read(&cfs_rq->tg->load_weight)); 228 cfs_rq->tg_load_contrib);
229 SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
230 cfs_rq->tg_runnable_contrib);
231 SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
232 atomic_read(&cfs_rq->tg->runnable_avg));
217#endif 233#endif
218 234
219 print_cfs_group_stats(m, cpu, cfs_rq->tg); 235 print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a14b990..4603d6cb9e25 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,9 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/profile.h> 27#include <linux/profile.h>
28#include <linux/interrupt.h> 28#include <linux/interrupt.h>
29#include <linux/mempolicy.h>
30#include <linux/migrate.h>
31#include <linux/task_work.h>
29 32
30#include <trace/events/sched.h> 33#include <trace/events/sched.h>
31 34
@@ -259,6 +262,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
259 return grp->my_q; 262 return grp->my_q;
260} 263}
261 264
265static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
266 int force_update);
267
262static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 268static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
263{ 269{
264 if (!cfs_rq->on_list) { 270 if (!cfs_rq->on_list) {
@@ -278,6 +284,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
278 } 284 }
279 285
280 cfs_rq->on_list = 1; 286 cfs_rq->on_list = 1;
287 /* We should have no load, but we need to update last_decay. */
288 update_cfs_rq_blocked_load(cfs_rq, 0);
281 } 289 }
282} 290}
283 291
@@ -653,9 +661,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
653 return calc_delta_fair(sched_slice(cfs_rq, se), se); 661 return calc_delta_fair(sched_slice(cfs_rq, se), se);
654} 662}
655 663
656static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
657static void update_cfs_shares(struct cfs_rq *cfs_rq);
658
659/* 664/*
660 * Update the current task's runtime statistics. Skip current tasks that 665 * Update the current task's runtime statistics. Skip current tasks that
661 * are not in our scheduling class. 666 * are not in our scheduling class.
@@ -675,10 +680,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
675 680
676 curr->vruntime += delta_exec_weighted; 681 curr->vruntime += delta_exec_weighted;
677 update_min_vruntime(cfs_rq); 682 update_min_vruntime(cfs_rq);
678
679#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
680 cfs_rq->load_unacc_exec_time += delta_exec;
681#endif
682} 683}
683 684
684static void update_curr(struct cfs_rq *cfs_rq) 685static void update_curr(struct cfs_rq *cfs_rq)
@@ -776,6 +777,227 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
776 * Scheduling class queueing methods: 777 * Scheduling class queueing methods:
777 */ 778 */
778 779
780#ifdef CONFIG_NUMA_BALANCING
781/*
782 * numa task sample period in ms
783 */
784unsigned int sysctl_numa_balancing_scan_period_min = 100;
785unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
786unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
787
788/* Portion of address space to scan in MB */
789unsigned int sysctl_numa_balancing_scan_size = 256;
790
791/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
792unsigned int sysctl_numa_balancing_scan_delay = 1000;
793
794static void task_numa_placement(struct task_struct *p)
795{
796 int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
797
798 if (p->numa_scan_seq == seq)
799 return;
800 p->numa_scan_seq = seq;
801
802 /* FIXME: Scheduling placement policy hints go here */
803}
804
805/*
806 * Got a PROT_NONE fault for a page on @node.
807 */
808void task_numa_fault(int node, int pages, bool migrated)
809{
810 struct task_struct *p = current;
811
812 if (!sched_feat_numa(NUMA))
813 return;
814
815 /* FIXME: Allocate task-specific structure for placement policy here */
816
817 /*
818 * If pages are properly placed (did not migrate) then scan slower.
819 * This is reset periodically in case of phase changes
820 */
821 if (!migrated)
822 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
823 p->numa_scan_period + jiffies_to_msecs(10));
824
825 task_numa_placement(p);
826}
827
828static void reset_ptenuma_scan(struct task_struct *p)
829{
830 ACCESS_ONCE(p->mm->numa_scan_seq)++;
831 p->mm->numa_scan_offset = 0;
832}
833
834/*
835 * The expensive part of numa migration is done from task_work context.
836 * Triggered from task_tick_numa().
837 */
838void task_numa_work(struct callback_head *work)
839{
840 unsigned long migrate, next_scan, now = jiffies;
841 struct task_struct *p = current;
842 struct mm_struct *mm = p->mm;
843 struct vm_area_struct *vma;
844 unsigned long start, end;
845 long pages;
846
847 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
848
849 work->next = work; /* protect against double add */
850 /*
851 * Who cares about NUMA placement when they're dying.
852 *
853 * NOTE: make sure not to dereference p->mm before this check,
854 * exit_task_work() happens _after_ exit_mm() so we could be called
855 * without p->mm even though we still had it when we enqueued this
856 * work.
857 */
858 if (p->flags & PF_EXITING)
859 return;
860
861 /*
862 * We do not care about task placement until a task runs on a node
863 * other than the first one used by the address space. This is
864 * largely because migrations are driven by what CPU the task
865 * is running on. If it's never scheduled on another node, it'll
866 * not migrate so why bother trapping the fault.
867 */
868 if (mm->first_nid == NUMA_PTE_SCAN_INIT)
869 mm->first_nid = numa_node_id();
870 if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
871 /* Are we running on a new node yet? */
872 if (numa_node_id() == mm->first_nid &&
873 !sched_feat_numa(NUMA_FORCE))
874 return;
875
876 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
877 }
878
879 /*
880 * Reset the scan period if enough time has gone by. Objective is that
881 * scanning will be reduced if pages are properly placed. As tasks
882 * can enter different phases this needs to be re-examined. Lacking
883 * proper tracking of reference behaviour, this blunt hammer is used.
884 */
885 migrate = mm->numa_next_reset;
886 if (time_after(now, migrate)) {
887 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
888 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
889 xchg(&mm->numa_next_reset, next_scan);
890 }
891
892 /*
 893 * Enforce maximal scan/migration frequency.
894 */
895 migrate = mm->numa_next_scan;
896 if (time_before(now, migrate))
897 return;
898
899 if (p->numa_scan_period == 0)
900 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
901
902 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
903 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
904 return;
905
906 /*
907 * Do not set pte_numa if the current running node is rate-limited.
908 * This loses statistics on the fault but if we are unwilling to
909 * migrate to this node, it is less likely we can do useful work
910 */
911 if (migrate_ratelimited(numa_node_id()))
912 return;
913
914 start = mm->numa_scan_offset;
915 pages = sysctl_numa_balancing_scan_size;
916 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
917 if (!pages)
918 return;
919
920 down_read(&mm->mmap_sem);
921 vma = find_vma(mm, start);
922 if (!vma) {
923 reset_ptenuma_scan(p);
924 start = 0;
925 vma = mm->mmap;
926 }
927 for (; vma; vma = vma->vm_next) {
928 if (!vma_migratable(vma))
929 continue;
930
931 /* Skip small VMAs. They are not likely to be of relevance */
932 if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
933 continue;
934
935 do {
936 start = max(start, vma->vm_start);
937 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
938 end = min(end, vma->vm_end);
939 pages -= change_prot_numa(vma, start, end);
940
941 start = end;
942 if (pages <= 0)
943 goto out;
944 } while (end != vma->vm_end);
945 }
946
947out:
948 /*
949 * It is possible to reach the end of the VMA list but the last few VMAs are
 950 * not guaranteed to be vma_migratable. If they are not, we would find the
 951 * !migratable VMA on the next scan but not reset the scanner to the start,
952 * so check it now.
953 */
954 if (vma)
955 mm->numa_scan_offset = start;
956 else
957 reset_ptenuma_scan(p);
958 up_read(&mm->mmap_sem);
959}
960
961/*
 962 * Drive the periodic memory faults.
963 */
964void task_tick_numa(struct rq *rq, struct task_struct *curr)
965{
966 struct callback_head *work = &curr->numa_work;
967 u64 period, now;
968
969 /*
970 * We don't care about NUMA placement if we don't have memory.
971 */
972 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
973 return;
974
975 /*
976 * Using runtime rather than walltime has the dual advantage that
977 * we (mostly) drive the selection from busy threads and that the
978 * task needs to have done some actual work before we bother with
979 * NUMA placement.
980 */
981 now = curr->se.sum_exec_runtime;
982 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
983
984 if (now - curr->node_stamp > period) {
985 if (!curr->node_stamp)
986 curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
987 curr->node_stamp = now;
988
989 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
990 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
991 task_work_add(curr, work, true);
992 }
993 }
994}
995#else
996static void task_tick_numa(struct rq *rq, struct task_struct *curr)
997{
998}
999#endif /* CONFIG_NUMA_BALANCING */
1000
779static void 1001static void
780account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 1002account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
781{ 1003{
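The scan throttling in task_numa_work()/task_numa_fault() above is mostly arithmetic on the sysctl knobs. A rough user-space sketch of how the scan window is sized and how the per-task period backs off when faults show pages already well placed; the constants and helper names below are illustrative only, and the 10ms backoff step stands in for jiffies_to_msecs(10).

#include <stdio.h>

#define PAGE_SHIFT 12                           /* assume 4K pages for the sketch */

static unsigned int scan_period_min = 100;      /* ms */
static unsigned int scan_period_max = 100 * 50; /* ms */
static unsigned int scan_size_mb = 256;

/* Number of pages one scan pass will cover: scan_size in MB -> pages. */
static long scan_pages(void)
{
        long pages = scan_size_mb;

        pages <<= 20 - PAGE_SHIFT;
        return pages;
}

/* Back off the per-task scan period when a fault did not migrate a page,
 * i.e. the page was already on the right node. */
static unsigned int backoff(unsigned int period_ms, int migrated)
{
        if (!migrated) {
                period_ms += 10;                /* ~jiffies_to_msecs(10) */
                if (period_ms > scan_period_max)
                        period_ms = scan_period_max;
        }
        return period_ms;
}

int main(void)
{
        unsigned int period = scan_period_min;
        int i;

        printf("pages per scan pass: %ld\n", scan_pages());
        for (i = 0; i < 5; i++)
                period = backoff(period, 0 /* not migrated */);
        printf("period after 5 well-placed faults: %u ms\n", period);
        return 0;
}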
@@ -801,72 +1023,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
801} 1023}
802 1024
803#ifdef CONFIG_FAIR_GROUP_SCHED 1025#ifdef CONFIG_FAIR_GROUP_SCHED
804/* we need this in update_cfs_load and load-balance functions below */
805static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
806# ifdef CONFIG_SMP 1026# ifdef CONFIG_SMP
807static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
808 int global_update)
809{
810 struct task_group *tg = cfs_rq->tg;
811 long load_avg;
812
813 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
814 load_avg -= cfs_rq->load_contribution;
815
816 if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
817 atomic_add(load_avg, &tg->load_weight);
818 cfs_rq->load_contribution += load_avg;
819 }
820}
821
822static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
823{
824 u64 period = sysctl_sched_shares_window;
825 u64 now, delta;
826 unsigned long load = cfs_rq->load.weight;
827
828 if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
829 return;
830
831 now = rq_of(cfs_rq)->clock_task;
832 delta = now - cfs_rq->load_stamp;
833
834 /* truncate load history at 4 idle periods */
835 if (cfs_rq->load_stamp > cfs_rq->load_last &&
836 now - cfs_rq->load_last > 4 * period) {
837 cfs_rq->load_period = 0;
838 cfs_rq->load_avg = 0;
839 delta = period - 1;
840 }
841
842 cfs_rq->load_stamp = now;
843 cfs_rq->load_unacc_exec_time = 0;
844 cfs_rq->load_period += delta;
845 if (load) {
846 cfs_rq->load_last = now;
847 cfs_rq->load_avg += delta * load;
848 }
849
850 /* consider updating load contribution on each fold or truncate */
851 if (global_update || cfs_rq->load_period > period
852 || !cfs_rq->load_period)
853 update_cfs_rq_load_contribution(cfs_rq, global_update);
854
855 while (cfs_rq->load_period > period) {
856 /*
857 * Inline assembly required to prevent the compiler
858 * optimising this loop into a divmod call.
859 * See __iter_div_u64_rem() for another example of this.
860 */
861 asm("" : "+rm" (cfs_rq->load_period));
862 cfs_rq->load_period /= 2;
863 cfs_rq->load_avg /= 2;
864 }
865
866 if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
867 list_del_leaf_cfs_rq(cfs_rq);
868}
869
870static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) 1027static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
871{ 1028{
872 long tg_weight; 1029 long tg_weight;
@@ -876,8 +1033,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
876 * to gain a more accurate current total weight. See 1033 * to gain a more accurate current total weight. See
877 * update_cfs_rq_load_contribution(). 1034 * update_cfs_rq_load_contribution().
878 */ 1035 */
879 tg_weight = atomic_read(&tg->load_weight); 1036 tg_weight = atomic64_read(&tg->load_avg);
880 tg_weight -= cfs_rq->load_contribution; 1037 tg_weight -= cfs_rq->tg_load_contrib;
881 tg_weight += cfs_rq->load.weight; 1038 tg_weight += cfs_rq->load.weight;
882 1039
883 return tg_weight; 1040 return tg_weight;
@@ -901,27 +1058,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
901 1058
902 return shares; 1059 return shares;
903} 1060}
904
905static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
906{
907 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
908 update_cfs_load(cfs_rq, 0);
909 update_cfs_shares(cfs_rq);
910 }
911}
912# else /* CONFIG_SMP */ 1061# else /* CONFIG_SMP */
913static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
914{
915}
916
917static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) 1062static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
918{ 1063{
919 return tg->shares; 1064 return tg->shares;
920} 1065}
921
922static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
923{
924}
925# endif /* CONFIG_SMP */ 1066# endif /* CONFIG_SMP */
926static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 1067static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
927 unsigned long weight) 1068 unsigned long weight)
@@ -939,6 +1080,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
939 account_entity_enqueue(cfs_rq, se); 1080 account_entity_enqueue(cfs_rq, se);
940} 1081}
941 1082
1083static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
1084
942static void update_cfs_shares(struct cfs_rq *cfs_rq) 1085static void update_cfs_shares(struct cfs_rq *cfs_rq)
943{ 1086{
944 struct task_group *tg; 1087 struct task_group *tg;
@@ -958,18 +1101,477 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
958 reweight_entity(cfs_rq_of(se), se, shares); 1101 reweight_entity(cfs_rq_of(se), se, shares);
959} 1102}
960#else /* CONFIG_FAIR_GROUP_SCHED */ 1103#else /* CONFIG_FAIR_GROUP_SCHED */
961static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) 1104static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
962{ 1105{
963} 1106}
1107#endif /* CONFIG_FAIR_GROUP_SCHED */
964 1108
965static inline void update_cfs_shares(struct cfs_rq *cfs_rq) 1109/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
1110#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1111/*
1112 * We choose a half-life close to 1 scheduling period.
1113 * Note: The tables below are dependent on this value.
1114 */
1115#define LOAD_AVG_PERIOD 32
1116#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
1117#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
1118
1119/* Precomputed fixed inverse multipliers for multiplication by y^n */
1120static const u32 runnable_avg_yN_inv[] = {
1121 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
1122 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
1123 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
1124 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
1125 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
1126 0x85aac367, 0x82cd8698,
1127};
1128
1129/*
1130 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
1131 * over-estimates when re-combining.
1132 */
1133static const u32 runnable_avg_yN_sum[] = {
1134 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
1135 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
1136 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
1137};
1138
1139/*
1140 * Approximate:
1141 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
1142 */
1143static __always_inline u64 decay_load(u64 val, u64 n)
1144{
1145 unsigned int local_n;
1146
1147 if (!n)
1148 return val;
1149 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
1150 return 0;
1151
1152 /* after bounds checking we can collapse to 32-bit */
1153 local_n = n;
1154
1155 /*
1156 * As y^PERIOD = 1/2, we can combine
1157 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
1158 * with a look-up table which covers y^n (n < PERIOD)
1159 *
1160 * to achieve a constant time decay_load.
1161 */
1162 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
1163 val >>= local_n / LOAD_AVG_PERIOD;
1164 local_n %= LOAD_AVG_PERIOD;
1165 }
1166
1167 val *= runnable_avg_yN_inv[local_n];
1168 /* We don't use SRR here since we always want to round down. */
1169 return val >> 32;
1170}
1171
1172/*
1173 * For updates fully spanning n periods, the contribution to runnable
1174 * average will be: \Sum 1024*y^n
1175 *
1176 * We can compute this reasonably efficiently by combining:
1177 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
1178 */
1179static u32 __compute_runnable_contrib(u64 n)
966{ 1180{
1181 u32 contrib = 0;
1182
1183 if (likely(n <= LOAD_AVG_PERIOD))
1184 return runnable_avg_yN_sum[n];
1185 else if (unlikely(n >= LOAD_AVG_MAX_N))
1186 return LOAD_AVG_MAX;
1187
1188 /* Compute \Sum 1024*y^n by combining precomputed values for y^i and \Sum y^j */
1189 do {
1190 contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
1191 contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
1192
1193 n -= LOAD_AVG_PERIOD;
1194 } while (n > LOAD_AVG_PERIOD);
1195
1196 contrib = decay_load(contrib, n);
1197 return contrib + runnable_avg_yN_sum[n];
1198}
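The magic constants consumed by decay_load() and __compute_runnable_contrib() all follow from y^32 = 0.5. A small floating-point sketch (not kernel code) that regenerates the fixed-point inverse multipliers and partial sums; values may differ by one in the last place from runnable_avg_yN_inv[] / runnable_avg_yN_sum[] because of rounding.

#include <stdio.h>
#include <stdint.h>
#include <math.h>

#define LOAD_AVG_PERIOD 32

int main(void)
{
        double y = pow(0.5, 1.0 / LOAD_AVG_PERIOD);     /* so that y^32 == 0.5 */
        double sum = 0.0;
        int n;

        /* Fixed-point inverse multipliers 2^32 * y^n, cf. runnable_avg_yN_inv[]. */
        for (n = 0; n < LOAD_AVG_PERIOD; n++) {
                double v = pow(y, n) * 4294967296.0;
                uint32_t inv = (v >= 4294967295.0) ? 0xffffffffu : (uint32_t)v;

                printf("y^%-2d inv = 0x%08x\n", n, (unsigned)inv);
        }

        /* Partial sums \Sum 1024*y^k for 1 <= k <= n, cf. runnable_avg_yN_sum[]. */
        for (n = 1; n <= LOAD_AVG_PERIOD; n++) {
                sum += 1024.0 * pow(y, n);
                printf("sum(1..%2d) = %d\n", n, (int)sum);
        }
        return 0;
}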
1199
1200/*
1201 * We can represent the historical contribution to runnable average as the
1202 * coefficients of a geometric series. To do this we sub-divide our runnable
1203 * history into segments of approximately 1ms (1024us); label the segment that
1204 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
1205 *
1206 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
1207 * p0 p1 p2
1208 * (now) (~1ms ago) (~2ms ago)
1209 *
1210 * Let u_i denote the fraction of p_i that the entity was runnable.
1211 *
1212 * We then designate the fractions u_i as our co-efficients, yielding the
1213 * following representation of historical load:
1214 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
1215 *
1216 * We choose y based on the width of a reasonable scheduling period, fixing:
1217 * y^32 = 0.5
1218 *
1219 * This means that the contribution to load ~32ms ago (u_32) will be weighted
1220 * approximately half as much as the contribution to load within the last ms
1221 * (u_0).
1222 *
1223 * When a period "rolls over" and we have new u_0`, multiplying the previous
1224 * sum again by y is sufficient to update:
1225 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
1226 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
1227 */
1228static __always_inline int __update_entity_runnable_avg(u64 now,
1229 struct sched_avg *sa,
1230 int runnable)
1231{
1232 u64 delta, periods;
1233 u32 runnable_contrib;
1234 int delta_w, decayed = 0;
1235
1236 delta = now - sa->last_runnable_update;
1237 /*
1238 * This should only happen when time goes backwards, which it
1239 * unfortunately does during sched clock init when we swap over to TSC.
1240 */
1241 if ((s64)delta < 0) {
1242 sa->last_runnable_update = now;
1243 return 0;
1244 }
1245
1246 /*
1247 * Use 1024ns as the unit of measurement since it's a reasonable
1248 * approximation of 1us and fast to compute.
1249 */
1250 delta >>= 10;
1251 if (!delta)
1252 return 0;
1253 sa->last_runnable_update = now;
1254
1255 /* delta_w is the amount already accumulated against our next period */
1256 delta_w = sa->runnable_avg_period % 1024;
1257 if (delta + delta_w >= 1024) {
1258 /* period roll-over */
1259 decayed = 1;
1260
1261 /*
1262 * Now that we know we're crossing a period boundary, figure
1263 * out how much from delta we need to complete the current
1264 * period and accrue it.
1265 */
1266 delta_w = 1024 - delta_w;
1267 if (runnable)
1268 sa->runnable_avg_sum += delta_w;
1269 sa->runnable_avg_period += delta_w;
1270
1271 delta -= delta_w;
1272
1273 /* Figure out how many additional periods this update spans */
1274 periods = delta / 1024;
1275 delta %= 1024;
1276
1277 sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
1278 periods + 1);
1279 sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
1280 periods + 1);
1281
1282 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
1283 runnable_contrib = __compute_runnable_contrib(periods);
1284 if (runnable)
1285 sa->runnable_avg_sum += runnable_contrib;
1286 sa->runnable_avg_period += runnable_contrib;
1287 }
1288
1289 /* Remainder of delta accrued against u_0` */
1290 if (runnable)
1291 sa->runnable_avg_sum += delta;
1292 sa->runnable_avg_period += delta;
1293
1294 return decayed;
967} 1295}
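For whole periods, the accumulation in __update_entity_runnable_avg() reduces to a decayed sum over 1024us buckets. A minimal user-space model of just that closed-period behaviour, ignoring the partial-period delta_w handling and using plain floating point in place of decay_load(); the constants echo LOAD_AVG_MAX and LOAD_AVG_MAX_N above.

#include <stdio.h>
#include <math.h>

int main(void)
{
        const double y = pow(0.5, 1.0 / 32);    /* y^32 == 0.5 */
        double runnable_sum = 0.0, period_sum = 0.0;
        int i;

        /* 345 fully-runnable 1024us periods: the decayed sum converges to
         * 1024/(1-y), i.e. roughly LOAD_AVG_MAX (the kernel's fixed-point
         * value 47742 is slightly lower due to truncation). */
        for (i = 0; i < 345; i++) {
                runnable_sum = runnable_sum * y + 1024.0;
                period_sum   = period_sum * y + 1024.0;
        }
        printf("busy: sum=%.0f util=%.2f\n",
               runnable_sum, runnable_sum / period_sum);

        /* 32 idle periods: the runnable share decays to about one half. */
        for (i = 0; i < 32; i++) {
                runnable_sum = runnable_sum * y;
                period_sum   = period_sum * y + 1024.0;
        }
        printf("after 32 idle periods: util=%.2f\n",
               runnable_sum / period_sum);
        return 0;
}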
968 1296
969static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) 1297/* Synchronize an entity's decay with its parenting cfs_rq.*/
1298static inline u64 __synchronize_entity_decay(struct sched_entity *se)
970{ 1299{
1300 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1301 u64 decays = atomic64_read(&cfs_rq->decay_counter);
1302
1303 decays -= se->avg.decay_count;
1304 if (!decays)
1305 return 0;
1306
1307 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
1308 se->avg.decay_count = 0;
1309
1310 return decays;
1311}
1312
1313#ifdef CONFIG_FAIR_GROUP_SCHED
1314static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1315 int force_update)
1316{
1317 struct task_group *tg = cfs_rq->tg;
1318 s64 tg_contrib;
1319
1320 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1321 tg_contrib -= cfs_rq->tg_load_contrib;
1322
1323 if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1324 atomic64_add(tg_contrib, &tg->load_avg);
1325 cfs_rq->tg_load_contrib += tg_contrib;
1326 }
1327}
1328
1329/*
1330 * Aggregate cfs_rq runnable averages into an equivalent task_group
1331 * representation for computing load contributions.
1332 */
1333static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1334 struct cfs_rq *cfs_rq)
1335{
1336 struct task_group *tg = cfs_rq->tg;
1337 long contrib;
1338
1339 /* The fraction of a cpu used by this cfs_rq */
1340 contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
1341 sa->runnable_avg_period + 1);
1342 contrib -= cfs_rq->tg_runnable_contrib;
1343
1344 if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
1345 atomic_add(contrib, &tg->runnable_avg);
1346 cfs_rq->tg_runnable_contrib += contrib;
1347 }
1348}
1349
1350static inline void __update_group_entity_contrib(struct sched_entity *se)
1351{
1352 struct cfs_rq *cfs_rq = group_cfs_rq(se);
1353 struct task_group *tg = cfs_rq->tg;
1354 int runnable_avg;
1355
1356 u64 contrib;
1357
1358 contrib = cfs_rq->tg_load_contrib * tg->shares;
1359 se->avg.load_avg_contrib = div64_u64(contrib,
1360 atomic64_read(&tg->load_avg) + 1);
1361
1362 /*
1363 * For group entities we need to compute a correction term in the case
1364 * that they are consuming <1 cpu so that we would contribute the same
1365 * load as a task of equal weight.
1366 *
1367 * Explicitly co-ordinating this measurement would be expensive, but
1368 * fortunately the sum of each cpu's contribution forms a usable
1369 * lower-bound on the true value.
1370 *
1371 * Consider the aggregate of 2 contributions. Either they are disjoint
1372 * (and the sum represents the true value) or they overlap and we are
1373 * understating by the aggregate of their overlap.
1374 *
1375 * Extending this to N cpus, for a given overlap, the maximum amount we
1376 * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
1377 * cpus that overlap for this interval and w_i is the interval width.
1378 *
1379 * On a small machine the first term is well-bounded, which bounds the
1380 * total error since w_i is a subset of the period. Whereas on a
1381 * larger machine, while this first term can be larger, if w_i is of
1382 * consequential size we are guaranteed to see n_i*w_i quickly converge
1383 * to our upper bound of 1 cpu.
1384 */
1385 runnable_avg = atomic_read(&tg->runnable_avg);
1386 if (runnable_avg < NICE_0_LOAD) {
1387 se->avg.load_avg_contrib *= runnable_avg;
1388 se->avg.load_avg_contrib >>= NICE_0_SHIFT;
1389 }
1390}
1391#else
1392static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1393 int force_update) {}
1394static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1395 struct cfs_rq *cfs_rq) {}
1396static inline void __update_group_entity_contrib(struct sched_entity *se) {}
1397#endif
1398
1399static inline void __update_task_entity_contrib(struct sched_entity *se)
1400{
1401 u32 contrib;
1402
1403 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
1404 contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
1405 contrib /= (se->avg.runnable_avg_period + 1);
1406 se->avg.load_avg_contrib = scale_load(contrib);
971} 1407}
972#endif /* CONFIG_FAIR_GROUP_SCHED */ 1408
1409/* Compute the current contribution to load_avg by se, return any delta */
1410static long __update_entity_load_avg_contrib(struct sched_entity *se)
1411{
1412 long old_contrib = se->avg.load_avg_contrib;
1413
1414 if (entity_is_task(se)) {
1415 __update_task_entity_contrib(se);
1416 } else {
1417 __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
1418 __update_group_entity_contrib(se);
1419 }
1420
1421 return se->avg.load_avg_contrib - old_contrib;
1422}
1423
1424static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
1425 long load_contrib)
1426{
1427 if (likely(load_contrib < cfs_rq->blocked_load_avg))
1428 cfs_rq->blocked_load_avg -= load_contrib;
1429 else
1430 cfs_rq->blocked_load_avg = 0;
1431}
1432
1433static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
1434
1435/* Update a sched_entity's runnable average */
1436static inline void update_entity_load_avg(struct sched_entity *se,
1437 int update_cfs_rq)
1438{
1439 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1440 long contrib_delta;
1441 u64 now;
1442
1443 /*
1444 * For a group entity we need to use the clock of its owned cfs_rq in
1445 * case it is the parent of a throttled hierarchy.
1446 */
1447 if (entity_is_task(se))
1448 now = cfs_rq_clock_task(cfs_rq);
1449 else
1450 now = cfs_rq_clock_task(group_cfs_rq(se));
1451
1452 if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
1453 return;
1454
1455 contrib_delta = __update_entity_load_avg_contrib(se);
1456
1457 if (!update_cfs_rq)
1458 return;
1459
1460 if (se->on_rq)
1461 cfs_rq->runnable_load_avg += contrib_delta;
1462 else
1463 subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
1464}
1465
1466/*
1467 * Decay the load contributed by all blocked children and account this so that
1468 * their contribution may appropriately discounted when they wake up.
1469 */
1470static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
1471{
1472 u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
1473 u64 decays;
1474
1475 decays = now - cfs_rq->last_decay;
1476 if (!decays && !force_update)
1477 return;
1478
1479 if (atomic64_read(&cfs_rq->removed_load)) {
1480 u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
1481 subtract_blocked_load_contrib(cfs_rq, removed_load);
1482 }
1483
1484 if (decays) {
1485 cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
1486 decays);
1487 atomic64_add(decays, &cfs_rq->decay_counter);
1488 cfs_rq->last_decay = now;
1489 }
1490
1491 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
1492}
1493
1494static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
1495{
1496 __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
1497 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
1498}
1499
1500/* Add the load generated by se into cfs_rq's child load-average */
1501static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1502 struct sched_entity *se,
1503 int wakeup)
1504{
1505 /*
1506 * We track migrations using entity decay_count <= 0; on a wake-up
1507 * migration we use a negative decay count to track the remote decays
1508 * accumulated while sleeping.
1509 */
1510 if (unlikely(se->avg.decay_count <= 0)) {
1511 se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
1512 if (se->avg.decay_count) {
1513 /*
1514 * In a wake-up migration we have to approximate the
1515 * time sleeping. This is because we can't synchronize
1516 * clock_task between the two cpus, and it is not
1517 * guaranteed to be read-safe. Instead, we can
1518 * approximate this using our carried decays, which are
1519 * explicitly atomically readable.
1520 */
1521 se->avg.last_runnable_update -= (-se->avg.decay_count)
1522 << 20;
1523 update_entity_load_avg(se, 0);
1524 /* Indicate that we're now synchronized and on-rq */
1525 se->avg.decay_count = 0;
1526 }
1527 wakeup = 0;
1528 } else {
1529 __synchronize_entity_decay(se);
1530 }
1531
1532 /* migrated tasks did not contribute to our blocked load */
1533 if (wakeup) {
1534 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
1535 update_entity_load_avg(se, 0);
1536 }
1537
1538 cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
1539 /* we force update consideration on load-balancer moves */
1540 update_cfs_rq_blocked_load(cfs_rq, !wakeup);
1541}
1542
1543/*
1544 * Remove se's load from this cfs_rq child load-average; if the entity is
1545 * transitioning to a blocked state we track its projected decay using
1546 * blocked_load_avg.
1547 */
1548static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1549 struct sched_entity *se,
1550 int sleep)
1551{
1552 update_entity_load_avg(se, 1);
1553 /* we force update consideration on load-balancer moves */
1554 update_cfs_rq_blocked_load(cfs_rq, !sleep);
1555
1556 cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
1557 if (sleep) {
1558 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
1559 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
1560 } /* migrations, e.g. sleep=0 leave decay_count == 0 */
1561}
1562#else
1563static inline void update_entity_load_avg(struct sched_entity *se,
1564 int update_cfs_rq) {}
1565static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
1566static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1567 struct sched_entity *se,
1568 int wakeup) {}
1569static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1570 struct sched_entity *se,
1571 int sleep) {}
1572static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
1573 int force_update) {}
1574#endif
973 1575
974static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 1576static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
975{ 1577{
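Taken together, enqueue_entity_load_avg()/dequeue_entity_load_avg() above move a contribution between a runnable bucket and a decaying blocked bucket. A compact user-space model of that bookkeeping; the per-period decay factor, structure and function names are illustrative only, and floating point replaces the fixed-point decay_load()/decay_counter machinery.

#include <stdio.h>
#include <math.h>

/* Toy cfs_rq: runnable contributions plus a decaying blocked-load bucket. */
struct toy_cfs_rq {
        double runnable_load_avg;
        double blocked_load_avg;
};

static const double Y = 0.97857206;     /* ~0.5^(1/32), one period of decay */

/* Decay the blocked bucket by 'periods' full periods,
 * cf. update_cfs_rq_blocked_load(). */
static void toy_decay_blocked(struct toy_cfs_rq *cfs_rq, int periods)
{
        cfs_rq->blocked_load_avg *= pow(Y, periods);
}

/* Entity goes to sleep: its contribution leaves the runnable sum and is
 * tracked as blocked load, cf. dequeue_entity_load_avg(..., sleep=1). */
static void toy_dequeue_sleep(struct toy_cfs_rq *cfs_rq, double contrib)
{
        cfs_rq->runnable_load_avg -= contrib;
        cfs_rq->blocked_load_avg += contrib;
}

/* Entity wakes up: whatever is left of its (decayed) contribution moves
 * back, cf. enqueue_entity_load_avg(..., wakeup=1). */
static void toy_enqueue_wakeup(struct toy_cfs_rq *cfs_rq, double *contrib,
                               int periods_slept)
{
        *contrib *= pow(Y, periods_slept);      /* __synchronize_entity_decay() */
        if (*contrib < cfs_rq->blocked_load_avg)
                cfs_rq->blocked_load_avg -= *contrib;
        else
                cfs_rq->blocked_load_avg = 0.0;
        cfs_rq->runnable_load_avg += *contrib;
}

int main(void)
{
        struct toy_cfs_rq rq = { 1024.0, 0.0 };
        double contrib = 1024.0;

        toy_dequeue_sleep(&rq, contrib);
        toy_decay_blocked(&rq, 32);             /* sleep for ~32 periods */
        toy_enqueue_wakeup(&rq, &contrib, 32);

        printf("runnable=%.0f blocked=%.0f\n",
               rq.runnable_load_avg, rq.blocked_load_avg);
        return 0;
}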
@@ -1096,7 +1698,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1096 * Update run-time statistics of the 'current'. 1698 * Update run-time statistics of the 'current'.
1097 */ 1699 */
1098 update_curr(cfs_rq); 1700 update_curr(cfs_rq);
1099 update_cfs_load(cfs_rq, 0); 1701 enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
1100 account_entity_enqueue(cfs_rq, se); 1702 account_entity_enqueue(cfs_rq, se);
1101 update_cfs_shares(cfs_rq); 1703 update_cfs_shares(cfs_rq);
1102 1704
@@ -1171,6 +1773,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1171 * Update run-time statistics of the 'current'. 1773 * Update run-time statistics of the 'current'.
1172 */ 1774 */
1173 update_curr(cfs_rq); 1775 update_curr(cfs_rq);
1776 dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
1174 1777
1175 update_stats_dequeue(cfs_rq, se); 1778 update_stats_dequeue(cfs_rq, se);
1176 if (flags & DEQUEUE_SLEEP) { 1779 if (flags & DEQUEUE_SLEEP) {
@@ -1191,7 +1794,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1191 if (se != cfs_rq->curr) 1794 if (se != cfs_rq->curr)
1192 __dequeue_entity(cfs_rq, se); 1795 __dequeue_entity(cfs_rq, se);
1193 se->on_rq = 0; 1796 se->on_rq = 0;
1194 update_cfs_load(cfs_rq, 0);
1195 account_entity_dequeue(cfs_rq, se); 1797 account_entity_dequeue(cfs_rq, se);
1196 1798
1197 /* 1799 /*
@@ -1340,6 +1942,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1340 update_stats_wait_start(cfs_rq, prev); 1942 update_stats_wait_start(cfs_rq, prev);
1341 /* Put 'current' back into the tree. */ 1943 /* Put 'current' back into the tree. */
1342 __enqueue_entity(cfs_rq, prev); 1944 __enqueue_entity(cfs_rq, prev);
1945 /* in !on_rq case, update occurred at dequeue */
1946 update_entity_load_avg(prev, 1);
1343 } 1947 }
1344 cfs_rq->curr = NULL; 1948 cfs_rq->curr = NULL;
1345} 1949}
@@ -1353,9 +1957,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1353 update_curr(cfs_rq); 1957 update_curr(cfs_rq);
1354 1958
1355 /* 1959 /*
1356 * Update share accounting for long-running entities. 1960 * Ensure that runnable average is periodically updated.
1357 */ 1961 */
1358 update_entity_shares_tick(cfs_rq); 1962 update_entity_load_avg(curr, 1);
1963 update_cfs_rq_blocked_load(cfs_rq, 1);
1359 1964
1360#ifdef CONFIG_SCHED_HRTICK 1965#ifdef CONFIG_SCHED_HRTICK
1361 /* 1966 /*
@@ -1448,6 +2053,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
1448 return &tg->cfs_bandwidth; 2053 return &tg->cfs_bandwidth;
1449} 2054}
1450 2055
2056/* rq->clock_task normalized against any time this cfs_rq has spent throttled */
2057static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2058{
2059 if (unlikely(cfs_rq->throttle_count))
2060 return cfs_rq->throttled_clock_task;
2061
2062 return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
2063}
2064
1451/* returns 0 on failure to allocate runtime */ 2065/* returns 0 on failure to allocate runtime */
1452static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) 2066static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1453{ 2067{
@@ -1592,14 +2206,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
1592 cfs_rq->throttle_count--; 2206 cfs_rq->throttle_count--;
1593#ifdef CONFIG_SMP 2207#ifdef CONFIG_SMP
1594 if (!cfs_rq->throttle_count) { 2208 if (!cfs_rq->throttle_count) {
1595 u64 delta = rq->clock_task - cfs_rq->load_stamp; 2209 /* adjust cfs_rq_clock_task() */
1596 2210 cfs_rq->throttled_clock_task_time += rq->clock_task -
1597 /* leaving throttled state, advance shares averaging windows */ 2211 cfs_rq->throttled_clock_task;
1598 cfs_rq->load_stamp += delta;
1599 cfs_rq->load_last += delta;
1600
1601 /* update entity weight now that we are on_rq again */
1602 update_cfs_shares(cfs_rq);
1603 } 2212 }
1604#endif 2213#endif
1605 2214
@@ -1611,9 +2220,9 @@ static int tg_throttle_down(struct task_group *tg, void *data)
1611 struct rq *rq = data; 2220 struct rq *rq = data;
1612 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; 2221 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1613 2222
1614 /* group is entering throttled state, record last load */ 2223 /* group is entering throttled state, stop time */
1615 if (!cfs_rq->throttle_count) 2224 if (!cfs_rq->throttle_count)
1616 update_cfs_load(cfs_rq, 0); 2225 cfs_rq->throttled_clock_task = rq->clock_task;
1617 cfs_rq->throttle_count++; 2226 cfs_rq->throttle_count++;
1618 2227
1619 return 0; 2228 return 0;
@@ -1628,7 +2237,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1628 2237
1629 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; 2238 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1630 2239
1631 /* account load preceding throttle */ 2240 /* freeze hierarchy runnable averages while throttled */
1632 rcu_read_lock(); 2241 rcu_read_lock();
1633 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); 2242 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
1634 rcu_read_unlock(); 2243 rcu_read_unlock();
@@ -1652,7 +2261,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1652 rq->nr_running -= task_delta; 2261 rq->nr_running -= task_delta;
1653 2262
1654 cfs_rq->throttled = 1; 2263 cfs_rq->throttled = 1;
1655 cfs_rq->throttled_timestamp = rq->clock; 2264 cfs_rq->throttled_clock = rq->clock;
1656 raw_spin_lock(&cfs_b->lock); 2265 raw_spin_lock(&cfs_b->lock);
1657 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 2266 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
1658 raw_spin_unlock(&cfs_b->lock); 2267 raw_spin_unlock(&cfs_b->lock);
@@ -1670,10 +2279,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1670 2279
1671 cfs_rq->throttled = 0; 2280 cfs_rq->throttled = 0;
1672 raw_spin_lock(&cfs_b->lock); 2281 raw_spin_lock(&cfs_b->lock);
1673 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; 2282 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
1674 list_del_rcu(&cfs_rq->throttled_list); 2283 list_del_rcu(&cfs_rq->throttled_list);
1675 raw_spin_unlock(&cfs_b->lock); 2284 raw_spin_unlock(&cfs_b->lock);
1676 cfs_rq->throttled_timestamp = 0;
1677 2285
1678 update_rq_clock(rq); 2286 update_rq_clock(rq);
1679 /* update hierarchical throttle state */ 2287 /* update hierarchical throttle state */
@@ -2073,8 +2681,13 @@ static void unthrottle_offline_cfs_rqs(struct rq *rq)
2073} 2681}
2074 2682
2075#else /* CONFIG_CFS_BANDWIDTH */ 2683#else /* CONFIG_CFS_BANDWIDTH */
2076static __always_inline 2684static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2077void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} 2685{
2686 return rq_of(cfs_rq)->clock_task;
2687}
2688
2689static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2690 unsigned long delta_exec) {}
2078static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2691static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2079static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 2692static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2080static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2693static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -2207,12 +2820,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2207 if (cfs_rq_throttled(cfs_rq)) 2820 if (cfs_rq_throttled(cfs_rq))
2208 break; 2821 break;
2209 2822
2210 update_cfs_load(cfs_rq, 0);
2211 update_cfs_shares(cfs_rq); 2823 update_cfs_shares(cfs_rq);
2824 update_entity_load_avg(se, 1);
2212 } 2825 }
2213 2826
2214 if (!se) 2827 if (!se) {
2828 update_rq_runnable_avg(rq, rq->nr_running);
2215 inc_nr_running(rq); 2829 inc_nr_running(rq);
2830 }
2216 hrtick_update(rq); 2831 hrtick_update(rq);
2217} 2832}
2218 2833
@@ -2266,12 +2881,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2266 if (cfs_rq_throttled(cfs_rq)) 2881 if (cfs_rq_throttled(cfs_rq))
2267 break; 2882 break;
2268 2883
2269 update_cfs_load(cfs_rq, 0);
2270 update_cfs_shares(cfs_rq); 2884 update_cfs_shares(cfs_rq);
2885 update_entity_load_avg(se, 1);
2271 } 2886 }
2272 2887
2273 if (!se) 2888 if (!se) {
2274 dec_nr_running(rq); 2889 dec_nr_running(rq);
2890 update_rq_runnable_avg(rq, 1);
2891 }
2275 hrtick_update(rq); 2892 hrtick_update(rq);
2276} 2893}
2277 2894
@@ -2781,6 +3398,37 @@ unlock:
2781 3398
2782 return new_cpu; 3399 return new_cpu;
2783} 3400}
3401
3402/*
3403 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
3404 * removed when useful for applications beyond shares distribution (e.g.
3405 * load-balance).
3406 */
3407#ifdef CONFIG_FAIR_GROUP_SCHED
3408/*
3409 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
3410 * cfs_rq_of(p) references at time of call are still valid and identify the
3411 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
3412 * other assumptions, including the state of rq->lock, should be made.
3413 */
3414static void
3415migrate_task_rq_fair(struct task_struct *p, int next_cpu)
3416{
3417 struct sched_entity *se = &p->se;
3418 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3419
3420 /*
3421 * Load tracking: accumulate removed load so that it can be processed
3422 * when we next update owning cfs_rq under rq->lock. Tasks contribute
3423 * to blocked load iff they have a positive decay-count. It can never
3424 * be negative here since on-rq tasks have decay-count == 0.
3425 */
3426 if (se->avg.decay_count) {
3427 se->avg.decay_count = -__synchronize_entity_decay(se);
3428 atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
3429 }
3430}
3431#endif
2784#endif /* CONFIG_SMP */ 3432#endif /* CONFIG_SMP */
2785 3433
2786static unsigned long 3434static unsigned long
@@ -2907,7 +3555,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
2907 * Batch and idle tasks do not preempt non-idle tasks (their preemption 3555 * Batch and idle tasks do not preempt non-idle tasks (their preemption
2908 * is driven by the tick): 3556 * is driven by the tick):
2909 */ 3557 */
2910 if (unlikely(p->policy != SCHED_NORMAL)) 3558 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
2911 return; 3559 return;
2912 3560
2913 find_matching_se(&se, &pse); 3561 find_matching_se(&se, &pse);
@@ -3033,8 +3681,122 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
3033 3681
3034#ifdef CONFIG_SMP 3682#ifdef CONFIG_SMP
3035/************************************************** 3683/**************************************************
3036 * Fair scheduling class load-balancing methods: 3684 * Fair scheduling class load-balancing methods.
3037 */ 3685 *
3686 * BASICS
3687 *
3688 * The purpose of load-balancing is to achieve the same basic fairness the
3689 * per-cpu scheduler provides, namely provide a proportional amount of compute
3690 * time to each task. This is expressed in the following equation:
3691 *
3692 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
3693 *
3694 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
3695 * W_i,0 is defined as:
3696 *
3697 * W_i,0 = \Sum_j w_i,j (2)
3698 *
3699 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
3700 * is derived from the nice value as per prio_to_weight[].
3701 *
3702 * The weight average is an exponential decay average of the instantaneous
3703 * weight:
3704 *
3705 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
3706 *
3707 * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
3708 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
3709 * can also include other factors [XXX].
3710 *
3711 * To achieve this balance we define a measure of imbalance which follows
3712 * directly from (1):
3713 *
3714 * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
3715 *
3716 * We then move tasks around to minimize the imbalance. In the continuous
3717 * function space it is obvious this converges, in the discrete case we get
3718 * a few fun cases generally called infeasible weight scenarios.
3719 *
3720 * [XXX expand on:
3721 * - infeasible weights;
3722 * - local vs global optima in the discrete case. ]
3723 *
3724 *
3725 * SCHED DOMAINS
3726 *
3727 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
3728 * for all i,j solution, we create a tree of cpus that follows the hardware
3729 * topology where each level pairs two lower groups (or better). This results
3730 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
3731 * tree to only the first of the previous level and we decrease the frequency
3732 * of load-balance at each level inversely proportional to the number of cpus in
3733 * the groups.
3734 *
3735 * This yields:
3736 *
3737 * log_2 n 1 n
3738 * \Sum { --- * --- * 2^i } = O(n) (5)
3739 * i = 0 2^i 2^i
3740 * `- size of each group
3741 * | | `- number of cpus doing load-balance
3742 * | `- freq
3743 * `- sum over all levels
3744 *
3745 * Coupled with a limit on how many tasks we can migrate every balance pass,
3746 * this makes (5) the runtime complexity of the balancer.
3747 *
3748 * An important property here is that each CPU is still (indirectly) connected
3749 * to every other cpu in at most O(log n) steps:
3750 *
3751 * The adjacency matrix of the resulting graph is given by:
3752 *
3753 * log_2 n
3754 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
3755 * k = 0
3756 *
3757 * And you'll find that:
3758 *
3759 * A^(log_2 n)_i,j != 0 for all i,j (7)
3760 *
3761 * Showing there's indeed a path between every cpu in at most O(log n) steps.
3762 * The task movement gives a factor of O(m), giving a convergence complexity
3763 * of:
3764 *
3765 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
3766 *
3767 *
3768 * WORK CONSERVING
3769 *
3770 * In order to avoid CPUs going idle while there's still work to do, new idle
3771 * balancing is more aggressive and has the newly idle cpu iterate up the domain
3772 * tree itself instead of relying on other CPUs to bring it work.
3773 *
3774 * This adds some complexity to both (5) and (8) but it reduces the total idle
3775 * time.
3776 *
3777 * [XXX more?]
3778 *
3779 *
3780 * CGROUPS
3781 *
3782 * Cgroups make a horror show out of (2); instead of a simple sum we get:
3783 *
3784 * s_k,i
3785 * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
3786 * S_k
3787 *
3788 * Where
3789 *
3790 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
3791 *
3792 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
3793 *
3794 * The big problem is S_k, it's a global sum needed to compute a local (W_i)
3795 * property.
3796 *
3797 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
3798 * rewrite all of this once again.]
3799 */
3038 3800
3039static unsigned long __read_mostly max_load_balance_interval = HZ/10; 3801static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3040 3802
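The imbalance measure in equation (4) of the comment block above is easy to play with numerically. A toy computation for two CPUs, using made-up weights and cpu-power values; every number here is illustrative.

#include <stdio.h>

/* Evaluate imb_i,j = max{avg(W/P), W_i/P_i} - min{avg(W/P), W_j/P_j}
 * for a toy two-cpu system, cf. equation (4). */
int main(void)
{
        /* Instantaneous weights W_i,0 (sums of per-task nice weights) and
         * cpu powers P_i; the numbers are made up for illustration. */
        double W[2] = { 3072.0, 1024.0 };
        double P[2] = { 1024.0, 1024.0 };

        double wp0 = W[0] / P[0];
        double wp1 = W[1] / P[1];
        double avg = (wp0 + wp1) / 2.0;

        double hi = wp0 > avg ? wp0 : avg;
        double lo = wp1 < avg ? wp1 : avg;
        double imb = hi - lo;

        printf("W0/P0=%.2f W1/P1=%.2f avg=%.2f imbalance=%.2f\n",
               wp0, wp1, avg, imb);

        /* Moving one nice-0 task (weight 1024) from cpu0 to cpu1 would make
         * W0/P0 == W1/P1 == avg and drop the imbalance to zero. */
        return 0;
}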
@@ -3300,52 +4062,58 @@ next:
3300/* 4062/*
3301 * update tg->load_weight by folding this cpu's load_avg 4063 * update tg->load_weight by folding this cpu's load_avg
3302 */ 4064 */
3303static int update_shares_cpu(struct task_group *tg, int cpu) 4065static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
3304{ 4066{
3305 struct cfs_rq *cfs_rq; 4067 struct sched_entity *se = tg->se[cpu];
3306 unsigned long flags; 4068 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
3307 struct rq *rq;
3308
3309 if (!tg->se[cpu])
3310 return 0;
3311
3312 rq = cpu_rq(cpu);
3313 cfs_rq = tg->cfs_rq[cpu];
3314
3315 raw_spin_lock_irqsave(&rq->lock, flags);
3316
3317 update_rq_clock(rq);
3318 update_cfs_load(cfs_rq, 1);
3319 4069
3320 /* 4070 /* throttled entities do not contribute to load */
3321 * We need to update shares after updating tg->load_weight in 4071 if (throttled_hierarchy(cfs_rq))
3322 * order to adjust the weight of groups with long running tasks. 4072 return;
3323 */
3324 update_cfs_shares(cfs_rq);
3325 4073
3326 raw_spin_unlock_irqrestore(&rq->lock, flags); 4074 update_cfs_rq_blocked_load(cfs_rq, 1);
3327 4075
3328 return 0; 4076 if (se) {
4077 update_entity_load_avg(se, 1);
4078 /*
4079 * We pivot on our runnable average having decayed to zero for
4080 * list removal. This generally implies that all our children
4081 * have also been removed (modulo rounding error or bandwidth
4082 * control); however, such cases are rare and we can fix these
4083 * at enqueue.
4084 *
4085 * TODO: fix up out-of-order children on enqueue.
4086 */
4087 if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
4088 list_del_leaf_cfs_rq(cfs_rq);
4089 } else {
4090 struct rq *rq = rq_of(cfs_rq);
4091 update_rq_runnable_avg(rq, rq->nr_running);
4092 }
3329} 4093}
3330 4094
3331static void update_shares(int cpu) 4095static void update_blocked_averages(int cpu)
3332{ 4096{
3333 struct cfs_rq *cfs_rq;
3334 struct rq *rq = cpu_rq(cpu); 4097 struct rq *rq = cpu_rq(cpu);
4098 struct cfs_rq *cfs_rq;
4099 unsigned long flags;
3335 4100
3336 rcu_read_lock(); 4101 raw_spin_lock_irqsave(&rq->lock, flags);
4102 update_rq_clock(rq);
3337 /* 4103 /*
3338 * Iterates the task_group tree in a bottom up fashion, see 4104 * Iterates the task_group tree in a bottom up fashion, see
3339 * list_add_leaf_cfs_rq() for details. 4105 * list_add_leaf_cfs_rq() for details.
3340 */ 4106 */
3341 for_each_leaf_cfs_rq(rq, cfs_rq) { 4107 for_each_leaf_cfs_rq(rq, cfs_rq) {
3342 /* throttled entities do not contribute to load */ 4108 /*
3343 if (throttled_hierarchy(cfs_rq)) 4109 * Note: We may want to consider periodically releasing
3344 continue; 4110 * rq->lock about these updates so that creating many task
3345 4111 * groups does not result in continually extending hold time.
3346 update_shares_cpu(cfs_rq->tg, cpu); 4112 */
4113 __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
3347 } 4114 }
3348 rcu_read_unlock(); 4115
4116 raw_spin_unlock_irqrestore(&rq->lock, flags);
3349} 4117}
3350 4118
3351/* 4119/*
@@ -3397,7 +4165,7 @@ static unsigned long task_h_load(struct task_struct *p)
3397 return load; 4165 return load;
3398} 4166}
3399#else 4167#else
3400static inline void update_shares(int cpu) 4168static inline void update_blocked_averages(int cpu)
3401{ 4169{
3402} 4170}
3403 4171
@@ -4457,12 +5225,14 @@ void idle_balance(int this_cpu, struct rq *this_rq)
4457 if (this_rq->avg_idle < sysctl_sched_migration_cost) 5225 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4458 return; 5226 return;
4459 5227
5228 update_rq_runnable_avg(this_rq, 1);
5229
4460 /* 5230 /*
4461 * Drop the rq->lock, but keep IRQ/preempt disabled. 5231 * Drop the rq->lock, but keep IRQ/preempt disabled.
4462 */ 5232 */
4463 raw_spin_unlock(&this_rq->lock); 5233 raw_spin_unlock(&this_rq->lock);
4464 5234
4465 update_shares(this_cpu); 5235 update_blocked_averages(this_cpu);
4466 rcu_read_lock(); 5236 rcu_read_lock();
4467 for_each_domain(this_cpu, sd) { 5237 for_each_domain(this_cpu, sd) {
4468 unsigned long interval; 5238 unsigned long interval;
@@ -4717,7 +5487,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4717 int update_next_balance = 0; 5487 int update_next_balance = 0;
4718 int need_serialize; 5488 int need_serialize;
4719 5489
4720 update_shares(cpu); 5490 update_blocked_averages(cpu);
4721 5491
4722 rcu_read_lock(); 5492 rcu_read_lock();
4723 for_each_domain(cpu, sd) { 5493 for_each_domain(cpu, sd) {
@@ -4954,6 +5724,11 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
4954 cfs_rq = cfs_rq_of(se); 5724 cfs_rq = cfs_rq_of(se);
4955 entity_tick(cfs_rq, se, queued); 5725 entity_tick(cfs_rq, se, queued);
4956 } 5726 }
5727
5728 if (sched_feat_numa(NUMA))
5729 task_tick_numa(rq, curr);
5730
5731 update_rq_runnable_avg(rq, 1);
4957} 5732}
4958 5733
4959/* 5734/*
@@ -5046,6 +5821,20 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
5046 place_entity(cfs_rq, se, 0); 5821 place_entity(cfs_rq, se, 0);
5047 se->vruntime -= cfs_rq->min_vruntime; 5822 se->vruntime -= cfs_rq->min_vruntime;
5048 } 5823 }
5824
5825#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5826 /*
5827 * Remove our load from contribution when we leave sched_fair
5828 * and ensure we don't carry in an old decay_count if we
5829 * switch back.
5830 */
5831 if (p->se.avg.decay_count) {
5832 struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
5833 __synchronize_entity_decay(&p->se);
5834 subtract_blocked_load_contrib(cfs_rq,
5835 p->se.avg.load_avg_contrib);
5836 }
5837#endif
5049} 5838}
5050 5839
5051/* 5840/*
@@ -5092,11 +5881,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
5092#ifndef CONFIG_64BIT 5881#ifndef CONFIG_64BIT
5093 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 5882 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5094#endif 5883#endif
5884#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5885 atomic64_set(&cfs_rq->decay_counter, 1);
5886 atomic64_set(&cfs_rq->removed_load, 0);
5887#endif
5095} 5888}
5096 5889
5097#ifdef CONFIG_FAIR_GROUP_SCHED 5890#ifdef CONFIG_FAIR_GROUP_SCHED
5098static void task_move_group_fair(struct task_struct *p, int on_rq) 5891static void task_move_group_fair(struct task_struct *p, int on_rq)
5099{ 5892{
5893 struct cfs_rq *cfs_rq;
5100 /* 5894 /*
5101 * If the task was not on the rq at the time of this cgroup movement 5895 * If the task was not on the rq at the time of this cgroup movement
5102 * it must have been asleep, sleeping tasks keep their ->vruntime 5896 * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -5128,8 +5922,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
5128 if (!on_rq) 5922 if (!on_rq)
5129 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; 5923 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
5130 set_task_rq(p, task_cpu(p)); 5924 set_task_rq(p, task_cpu(p));
5131 if (!on_rq) 5925 if (!on_rq) {
5132 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; 5926 cfs_rq = cfs_rq_of(&p->se);
5927 p->se.vruntime += cfs_rq->min_vruntime;
5928#ifdef CONFIG_SMP
5929 /*
5930 * migrate_task_rq_fair() will have removed our previous
5931 * contribution, but we must synchronize for ongoing future
5932 * decay.
5933 */
5934 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
5935 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
5936#endif
5937 }
5133} 5938}
5134 5939
5135void free_fair_sched_group(struct task_group *tg) 5940void free_fair_sched_group(struct task_group *tg)
@@ -5214,10 +6019,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
5214 6019
5215 cfs_rq->tg = tg; 6020 cfs_rq->tg = tg;
5216 cfs_rq->rq = rq; 6021 cfs_rq->rq = rq;
5217#ifdef CONFIG_SMP
5218 /* allow initial update_cfs_load() to truncate */
5219 cfs_rq->load_stamp = 1;
5220#endif
5221 init_cfs_rq_runtime(cfs_rq); 6022 init_cfs_rq_runtime(cfs_rq);
5222 6023
5223 tg->cfs_rq[cpu] = cfs_rq; 6024 tg->cfs_rq[cpu] = cfs_rq;
@@ -5319,7 +6120,9 @@ const struct sched_class fair_sched_class = {
5319 6120
5320#ifdef CONFIG_SMP 6121#ifdef CONFIG_SMP
5321 .select_task_rq = select_task_rq_fair, 6122 .select_task_rq = select_task_rq_fair,
5322 6123#ifdef CONFIG_FAIR_GROUP_SCHED
6124 .migrate_task_rq = migrate_task_rq_fair,
6125#endif
5323 .rq_online = rq_online_fair, 6126 .rq_online = rq_online_fair,
5324 .rq_offline = rq_offline_fair, 6127 .rq_offline = rq_offline_fair,
5325 6128
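
Note on the fair.c hunks above: the load-balance paths move from update_shares() to update_blocked_averages(), and switched_from_fair()/task_move_group_fair() now reconcile an entity's load contribution against the cfs_rq's decay counter. The stand-alone C sketch below is not kernel code; the decay factor and the toy structures are assumptions chosen only to show the idea that blocked load decays geometrically per period, and an entity absent for N periods catches up by applying the same decay N times.

#include <stdio.h>
#include <stdint.h>

/* Illustrative per-period decay factor as a fixed-point fraction
 * (~0.978 * 1024); the real scheduler uses precomputed tables. */
#define DECAY_SHIFT  10
#define DECAY_FACTOR 1002

/* Decay a load value over n elapsed periods: load * y^n. */
static uint64_t decay_load(uint64_t load, unsigned int n)
{
	while (n--)
		load = (load * DECAY_FACTOR) >> DECAY_SHIFT;
	return load;
}

struct toy_cfs_rq {
	uint64_t decay_counter;     /* periods elapsed on this runqueue */
	uint64_t blocked_load_avg;  /* decayed load of sleeping entities */
};

struct toy_entity {
	uint64_t load_avg_contrib;
	uint64_t decay_count;       /* decay_counter snapshot at dequeue */
};

/* Catch a sleeping entity up with the runqueue's decay, conceptually
 * what __synchronize_entity_decay() does in the patch above. */
static void synchronize_entity_decay(struct toy_entity *se, struct toy_cfs_rq *rq)
{
	uint64_t missed = rq->decay_counter - se->decay_count;

	se->load_avg_contrib = decay_load(se->load_avg_contrib, missed);
	se->decay_count = rq->decay_counter;
}

int main(void)
{
	struct toy_cfs_rq rq = { .decay_counter = 16, .blocked_load_avg = 0 };
	struct toy_entity se = { .load_avg_contrib = 1024, .decay_count = 4 };

	synchronize_entity_decay(&se, &rq);
	rq.blocked_load_avg += se.load_avg_contrib;
	printf("contrib after 12 missed periods: %llu\n",
	       (unsigned long long)se.load_avg_contrib);
	return 0;
}
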
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index eebefcad7027..1ad1d2b5395f 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -32,6 +32,11 @@ SCHED_FEAT(LAST_BUDDY, true)
32SCHED_FEAT(CACHE_HOT_BUDDY, true) 32SCHED_FEAT(CACHE_HOT_BUDDY, true)
33 33
34/* 34/*
35 * Allow wakeup-time preemption of the current task:
36 */
37SCHED_FEAT(WAKEUP_PREEMPTION, true)
38
39/*
35 * Use arch dependent cpu power functions 40 * Use arch dependent cpu power functions
36 */ 41 */
37SCHED_FEAT(ARCH_POWER, true) 42SCHED_FEAT(ARCH_POWER, true)
@@ -61,3 +66,14 @@ SCHED_FEAT(TTWU_QUEUE, true)
61SCHED_FEAT(FORCE_SD_OVERLAP, false) 66SCHED_FEAT(FORCE_SD_OVERLAP, false)
62SCHED_FEAT(RT_RUNTIME_SHARE, true) 67SCHED_FEAT(RT_RUNTIME_SHARE, true)
63SCHED_FEAT(LB_MIN, false) 68SCHED_FEAT(LB_MIN, false)
69
70/*
71 * Apply the automatic NUMA scheduling policy. Enabled automatically
72 * at runtime if running on a NUMA machine. Can be controlled via
73 * numa_balancing=. Allow PTE scanning to be forced on UMA machines
74 * for debugging the core machinery.
75 */
76#ifdef CONFIG_NUMA_BALANCING
77SCHED_FEAT(NUMA, false)
78SCHED_FEAT(NUMA_FORCE, false)
79#endif
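
Note: WAKEUP_PREEMPTION and the NUMA/NUMA_FORCE entries are ordinary SCHED_FEAT() lines, i.e. bits in a feature bitmask that sched_feat()/sched_feat_numa() test at runtime. A minimal stand-alone sketch of that X-macro pattern follows; the feature names and defaults are illustrative, not the kernel's headers.

#include <stdio.h>

/* Each line of the "feature list" is F(name, default_enabled). */
#define FEATURE_LIST(F)        \
	F(CACHE_HOT_BUDDY, 1)  \
	F(WAKEUP_PREEMPTION, 1)\
	F(NUMA, 0)

/* First expansion: an enum giving every feature a bit index. */
#define MAKE_ENUM(name, on) FEAT_##name,
enum { FEATURE_LIST(MAKE_ENUM) FEAT_NR };

/* Second expansion: the default bitmask. */
#define MAKE_MASK(name, on) ((on) << FEAT_##name) |
static unsigned long features = FEATURE_LIST(MAKE_MASK) 0;

#define feat_enabled(name) (!!(features & (1UL << FEAT_##name)))

int main(void)
{
	printf("WAKEUP_PREEMPTION=%d NUMA=%d\n",
	       feat_enabled(WAKEUP_PREEMPTION), feat_enabled(NUMA));
	features |= 1UL << FEAT_NUMA;  /* e.g. enabled at runtime on a NUMA box */
	printf("NUMA after runtime enable=%d\n", feat_enabled(NUMA));
	return 0;
}
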
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7a7db09cfabc..fc886441436a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -112,6 +112,8 @@ struct task_group {
112 unsigned long shares; 112 unsigned long shares;
113 113
114 atomic_t load_weight; 114 atomic_t load_weight;
115 atomic64_t load_avg;
116 atomic_t runnable_avg;
115#endif 117#endif
116 118
117#ifdef CONFIG_RT_GROUP_SCHED 119#ifdef CONFIG_RT_GROUP_SCHED
@@ -222,22 +224,29 @@ struct cfs_rq {
222 unsigned int nr_spread_over; 224 unsigned int nr_spread_over;
223#endif 225#endif
224 226
227#ifdef CONFIG_SMP
228/*
229 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
230 * removed when useful for applications beyond shares distribution (e.g.
231 * load-balance).
232 */
225#ifdef CONFIG_FAIR_GROUP_SCHED 233#ifdef CONFIG_FAIR_GROUP_SCHED
226 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
227
228 /* 234 /*
229 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 235 * CFS Load tracking
230 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 236 * Under CFS, load is tracked on a per-entity basis and aggregated up.
231 * (like users, containers etc.) 237 * This allows for the description of both thread and group usage (in
232 * 238 * the FAIR_GROUP_SCHED case).
233 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
234 * list is used during load balance.
235 */ 239 */
236 int on_list; 240 u64 runnable_load_avg, blocked_load_avg;
237 struct list_head leaf_cfs_rq_list; 241 atomic64_t decay_counter, removed_load;
238 struct task_group *tg; /* group that "owns" this runqueue */ 242 u64 last_decay;
243#endif /* CONFIG_FAIR_GROUP_SCHED */
244/* These always depend on CONFIG_FAIR_GROUP_SCHED */
245#ifdef CONFIG_FAIR_GROUP_SCHED
246 u32 tg_runnable_contrib;
247 u64 tg_load_contrib;
248#endif /* CONFIG_FAIR_GROUP_SCHED */
239 249
240#ifdef CONFIG_SMP
241 /* 250 /*
242 * h_load = weight * f(tg) 251 * h_load = weight * f(tg)
243 * 252 *
@@ -245,26 +254,30 @@ struct cfs_rq {
245 * this group. 254 * this group.
246 */ 255 */
247 unsigned long h_load; 256 unsigned long h_load;
257#endif /* CONFIG_SMP */
258
259#ifdef CONFIG_FAIR_GROUP_SCHED
260 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
248 261
249 /* 262 /*
250 * Maintaining per-cpu shares distribution for group scheduling 263 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
264 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
265 * (like users, containers etc.)
251 * 266 *
252 * load_stamp is the last time we updated the load average 267 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
253 * load_last is the last time we updated the load average and saw load 268 * list is used during load balance.
254 * load_unacc_exec_time is currently unaccounted execution time
255 */ 269 */
256 u64 load_avg; 270 int on_list;
257 u64 load_period; 271 struct list_head leaf_cfs_rq_list;
258 u64 load_stamp, load_last, load_unacc_exec_time; 272 struct task_group *tg; /* group that "owns" this runqueue */
259 273
260 unsigned long load_contribution;
261#endif /* CONFIG_SMP */
262#ifdef CONFIG_CFS_BANDWIDTH 274#ifdef CONFIG_CFS_BANDWIDTH
263 int runtime_enabled; 275 int runtime_enabled;
264 u64 runtime_expires; 276 u64 runtime_expires;
265 s64 runtime_remaining; 277 s64 runtime_remaining;
266 278
267 u64 throttled_timestamp; 279 u64 throttled_clock, throttled_clock_task;
280 u64 throttled_clock_task_time;
268 int throttled, throttle_count; 281 int throttled, throttle_count;
269 struct list_head throttled_list; 282 struct list_head throttled_list;
270#endif /* CONFIG_CFS_BANDWIDTH */ 283#endif /* CONFIG_CFS_BANDWIDTH */
@@ -467,6 +480,8 @@ struct rq {
467#ifdef CONFIG_SMP 480#ifdef CONFIG_SMP
468 struct llist_head wake_list; 481 struct llist_head wake_list;
469#endif 482#endif
483
484 struct sched_avg avg;
470}; 485};
471 486
472static inline int cpu_of(struct rq *rq) 487static inline int cpu_of(struct rq *rq)
@@ -648,6 +663,18 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
648#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) 663#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
649#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ 664#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
650 665
666#ifdef CONFIG_NUMA_BALANCING
667#define sched_feat_numa(x) sched_feat(x)
668#ifdef CONFIG_SCHED_DEBUG
669#define numabalancing_enabled sched_feat_numa(NUMA)
670#else
671extern bool numabalancing_enabled;
672#endif /* CONFIG_SCHED_DEBUG */
673#else
674#define sched_feat_numa(x) (0)
675#define numabalancing_enabled (0)
676#endif /* CONFIG_NUMA_BALANCING */
677
651static inline u64 global_rt_period(void) 678static inline u64 global_rt_period(void)
652{ 679{
653 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; 680 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
@@ -1212,4 +1239,3 @@ static inline u64 irq_time_read(int cpu)
1212} 1239}
1213#endif /* CONFIG_64BIT */ 1240#endif /* CONFIG_64BIT */
1214#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 1241#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1215
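
Note: sched.h also adds the sched_feat_numa()/numabalancing_enabled plumbing; with CONFIG_NUMA_BALANCING disabled both collapse to the constant 0, so NUMA-specific branches are compiled out. A small stand-alone sketch of that config-gating idiom (the CONFIG_TOY_* symbol below is hypothetical, not a real Kconfig option):

#include <stdio.h>

/* Uncomment this hypothetical config symbol to enable the branch. */
/* #define CONFIG_TOY_NUMA_BALANCING 1 */

#ifdef CONFIG_TOY_NUMA_BALANCING
static int numa_enabled = 1;
#define sched_feat_numa()   (numa_enabled)
#else
#define sched_feat_numa()   (0)   /* constant: the branch below is dead code */
#endif

static void task_tick(void)
{
	if (sched_feat_numa())    /* folds to `if (0)` when disabled */
		printf("running NUMA placement scan\n");
	printf("normal tick work\n");
}

int main(void)
{
	task_tick();
	return 0;
}
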
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index ee376beedaf9..5af44b593770 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -396,25 +396,29 @@ int __secure_computing(int this_syscall)
396#ifdef CONFIG_SECCOMP_FILTER 396#ifdef CONFIG_SECCOMP_FILTER
397 case SECCOMP_MODE_FILTER: { 397 case SECCOMP_MODE_FILTER: {
398 int data; 398 int data;
399 struct pt_regs *regs = task_pt_regs(current);
399 ret = seccomp_run_filters(this_syscall); 400 ret = seccomp_run_filters(this_syscall);
400 data = ret & SECCOMP_RET_DATA; 401 data = ret & SECCOMP_RET_DATA;
401 ret &= SECCOMP_RET_ACTION; 402 ret &= SECCOMP_RET_ACTION;
402 switch (ret) { 403 switch (ret) {
403 case SECCOMP_RET_ERRNO: 404 case SECCOMP_RET_ERRNO:
 404 /* Set the low-order 16 bits as an errno. */ 405 /* Set the low-order 16 bits as an errno. */
405 syscall_set_return_value(current, task_pt_regs(current), 406 syscall_set_return_value(current, regs,
406 -data, 0); 407 -data, 0);
407 goto skip; 408 goto skip;
408 case SECCOMP_RET_TRAP: 409 case SECCOMP_RET_TRAP:
409 /* Show the handler the original registers. */ 410 /* Show the handler the original registers. */
410 syscall_rollback(current, task_pt_regs(current)); 411 syscall_rollback(current, regs);
411 /* Let the filter pass back 16 bits of data. */ 412 /* Let the filter pass back 16 bits of data. */
412 seccomp_send_sigsys(this_syscall, data); 413 seccomp_send_sigsys(this_syscall, data);
413 goto skip; 414 goto skip;
414 case SECCOMP_RET_TRACE: 415 case SECCOMP_RET_TRACE:
415 /* Skip these calls if there is no tracer. */ 416 /* Skip these calls if there is no tracer. */
416 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) 417 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
418 syscall_set_return_value(current, regs,
419 -ENOSYS, 0);
417 goto skip; 420 goto skip;
421 }
418 /* Allow the BPF to provide the event message */ 422 /* Allow the BPF to provide the event message */
419 ptrace_event(PTRACE_EVENT_SECCOMP, data); 423 ptrace_event(PTRACE_EVENT_SECCOMP, data);
420 /* 424 /*
@@ -425,6 +429,9 @@ int __secure_computing(int this_syscall)
425 */ 429 */
426 if (fatal_signal_pending(current)) 430 if (fatal_signal_pending(current))
427 break; 431 break;
432 if (syscall_get_nr(current, regs) < 0)
433 goto skip; /* Explicit request to skip. */
434
428 return 0; 435 return 0;
429 case SECCOMP_RET_ALLOW: 436 case SECCOMP_RET_ALLOW:
430 return 0; 437 return 0;
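
Note on the seccomp hunk: task_pt_regs(current) is now cached in a local, and SECCOMP_RET_TRACE with no tracer attached fails the syscall with -ENOSYS instead of silently skipping it. The filter's 32-bit return value packs an action in the high bits and 16 bits of data in the low bits. The stand-alone sketch below shows that unpacking; the mask and action values are placeholders for illustration, not the real uapi constants.

#include <stdio.h>
#include <stdint.h>

/* Illustrative layout: high bits = action, low 16 bits = data. */
#define RET_ACTION_MASK 0xffff0000u
#define RET_DATA_MASK   0x0000ffffu

#define RET_ERRNO 0x00050000u   /* placeholder value for the sketch */
#define RET_TRACE 0x7ff00000u   /* placeholder value for the sketch */

static void handle_filter_result(uint32_t ret, int tracer_attached)
{
	uint32_t data   = ret & RET_DATA_MASK;
	uint32_t action = ret & RET_ACTION_MASK;

	switch (action) {
	case RET_ERRNO:
		printf("fail syscall with -%u\n", data);
		break;
	case RET_TRACE:
		if (!tracer_attached) {
			/* mirrors the new behaviour: report -ENOSYS, don't run it */
			printf("no tracer: fail syscall with -ENOSYS\n");
			break;
		}
		printf("notify tracer with data %u\n", data);
		break;
	default:
		printf("allow syscall\n");
	}
}

int main(void)
{
	handle_filter_result(RET_ERRNO | 13, 0);  /* -> -13 */
	handle_filter_result(RET_TRACE | 1, 0);   /* -> -ENOSYS, no tracer */
	return 0;
}
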
diff --git a/kernel/signal.c b/kernel/signal.c
index b2445d86f226..580a91e63471 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1159,8 +1159,9 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
1159 return __send_signal(sig, info, t, group, from_ancestor_ns); 1159 return __send_signal(sig, info, t, group, from_ancestor_ns);
1160} 1160}
1161 1161
1162static void print_fatal_signal(struct pt_regs *regs, int signr) 1162static void print_fatal_signal(int signr)
1163{ 1163{
1164 struct pt_regs *regs = signal_pt_regs();
1164 printk("%s/%d: potentially unexpected fatal signal %d.\n", 1165 printk("%s/%d: potentially unexpected fatal signal %d.\n",
1165 current->comm, task_pid_nr(current), signr); 1166 current->comm, task_pid_nr(current), signr);
1166 1167
@@ -1908,7 +1909,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1908 preempt_disable(); 1909 preempt_disable();
1909 read_unlock(&tasklist_lock); 1910 read_unlock(&tasklist_lock);
1910 preempt_enable_no_resched(); 1911 preempt_enable_no_resched();
1911 schedule(); 1912 freezable_schedule();
1912 } else { 1913 } else {
1913 /* 1914 /*
1914 * By the time we got the lock, our tracer went away. 1915 * By the time we got the lock, our tracer went away.
@@ -1930,13 +1931,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1930 } 1931 }
1931 1932
1932 /* 1933 /*
1933 * While in TASK_TRACED, we were considered "frozen enough".
1934 * Now that we woke up, it's crucial if we're supposed to be
1935 * frozen that we freeze now before running anything substantial.
1936 */
1937 try_to_freeze();
1938
1939 /*
1940 * We are back. Now reacquire the siglock before touching 1934 * We are back. Now reacquire the siglock before touching
1941 * last_siginfo, so that we are sure to have synchronized with 1935 * last_siginfo, so that we are sure to have synchronized with
1942 * any signal-sending on another CPU that wants to examine it. 1936 * any signal-sending on another CPU that wants to examine it.
@@ -2092,7 +2086,7 @@ static bool do_signal_stop(int signr)
2092 } 2086 }
2093 2087
2094 /* Now we don't run again until woken by SIGCONT or SIGKILL */ 2088 /* Now we don't run again until woken by SIGCONT or SIGKILL */
2095 schedule(); 2089 freezable_schedule();
2096 return true; 2090 return true;
2097 } else { 2091 } else {
2098 /* 2092 /*
@@ -2138,10 +2132,9 @@ static void do_jobctl_trap(void)
2138 } 2132 }
2139} 2133}
2140 2134
2141static int ptrace_signal(int signr, siginfo_t *info, 2135static int ptrace_signal(int signr, siginfo_t *info)
2142 struct pt_regs *regs, void *cookie)
2143{ 2136{
2144 ptrace_signal_deliver(regs, cookie); 2137 ptrace_signal_deliver();
2145 /* 2138 /*
2146 * We do not check sig_kernel_stop(signr) but set this marker 2139 * We do not check sig_kernel_stop(signr) but set this marker
2147 * unconditionally because we do not know whether debugger will 2140 * unconditionally because we do not know whether debugger will
@@ -2200,15 +2193,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
2200 if (unlikely(uprobe_deny_signal())) 2193 if (unlikely(uprobe_deny_signal()))
2201 return 0; 2194 return 0;
2202 2195
2203relock:
2204 /* 2196 /*
2205 * We'll jump back here after any time we were stopped in TASK_STOPPED. 2197 * Do this once, we can't return to user-mode if freezing() == T.
2206 * While in TASK_STOPPED, we were considered "frozen enough". 2198 * do_signal_stop() and ptrace_stop() do freezable_schedule() and
2207 * Now that we woke up, it's crucial if we're supposed to be 2199 * thus do not need another check after return.
2208 * frozen that we freeze now before running anything substantial.
2209 */ 2200 */
2210 try_to_freeze(); 2201 try_to_freeze();
2211 2202
2203relock:
2212 spin_lock_irq(&sighand->siglock); 2204 spin_lock_irq(&sighand->siglock);
2213 /* 2205 /*
2214 * Every stopped thread goes here after wakeup. Check to see if 2206 * Every stopped thread goes here after wakeup. Check to see if
@@ -2265,8 +2257,7 @@ relock:
2265 break; /* will return 0 */ 2257 break; /* will return 0 */
2266 2258
2267 if (unlikely(current->ptrace) && signr != SIGKILL) { 2259 if (unlikely(current->ptrace) && signr != SIGKILL) {
2268 signr = ptrace_signal(signr, info, 2260 signr = ptrace_signal(signr, info);
2269 regs, cookie);
2270 if (!signr) 2261 if (!signr)
2271 continue; 2262 continue;
2272 } 2263 }
@@ -2351,7 +2342,7 @@ relock:
2351 2342
2352 if (sig_kernel_coredump(signr)) { 2343 if (sig_kernel_coredump(signr)) {
2353 if (print_fatal_signals) 2344 if (print_fatal_signals)
2354 print_fatal_signal(regs, info->si_signo); 2345 print_fatal_signal(info->si_signo);
2355 /* 2346 /*
2356 * If it was able to dump core, this kills all 2347 * If it was able to dump core, this kills all
2357 * other threads in the group and synchronizes with 2348 * other threads in the group and synchronizes with
@@ -2360,7 +2351,7 @@ relock:
2360 * first and our do_group_exit call below will use 2351 * first and our do_group_exit call below will use
2361 * that value and ignore the one we pass it. 2352 * that value and ignore the one we pass it.
2362 */ 2353 */
2363 do_coredump(info, regs); 2354 do_coredump(info);
2364 } 2355 }
2365 2356
2366 /* 2357 /*
diff --git a/kernel/softirq.c b/kernel/softirq.c
index cc96bdc0c2c9..ed567babe789 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void)
221 current->flags &= ~PF_MEMALLOC; 221 current->flags &= ~PF_MEMALLOC;
222 222
223 pending = local_softirq_pending(); 223 pending = local_softirq_pending();
224 vtime_account(current); 224 vtime_account_irq_enter(current);
225 225
226 __local_bh_disable((unsigned long)__builtin_return_address(0), 226 __local_bh_disable((unsigned long)__builtin_return_address(0),
227 SOFTIRQ_OFFSET); 227 SOFTIRQ_OFFSET);
@@ -272,7 +272,7 @@ restart:
272 272
273 lockdep_softirq_exit(); 273 lockdep_softirq_exit();
274 274
275 vtime_account(current); 275 vtime_account_irq_exit(current);
276 __local_bh_enable(SOFTIRQ_OFFSET); 276 __local_bh_enable(SOFTIRQ_OFFSET);
277 tsk_restore_flags(current, old_flags, PF_MEMALLOC); 277 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
278} 278}
@@ -341,7 +341,7 @@ static inline void invoke_softirq(void)
341 */ 341 */
342void irq_exit(void) 342void irq_exit(void)
343{ 343{
344 vtime_account(current); 344 vtime_account_irq_exit(current);
345 trace_hardirq_exit(); 345 trace_hardirq_exit();
346 sub_preempt_count(IRQ_EXIT_OFFSET); 346 sub_preempt_count(IRQ_EXIT_OFFSET);
347 if (!in_interrupt() && local_softirq_pending()) 347 if (!in_interrupt() && local_softirq_pending())
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 97c465ebd844..2b859828cdc3 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -16,8 +16,10 @@
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 * 17 *
18 * Copyright (C) IBM Corporation, 2006 18 * Copyright (C) IBM Corporation, 2006
19 * Copyright (C) Fujitsu, 2012
19 * 20 *
20 * Author: Paul McKenney <paulmck@us.ibm.com> 21 * Author: Paul McKenney <paulmck@us.ibm.com>
22 * Lai Jiangshan <laijs@cn.fujitsu.com>
21 * 23 *
22 * For detailed explanation of Read-Copy Update mechanism see - 24 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU/ *.txt 25 * Documentation/RCU/ *.txt
@@ -34,6 +36,10 @@
34#include <linux/delay.h> 36#include <linux/delay.h>
35#include <linux/srcu.h> 37#include <linux/srcu.h>
36 38
39#include <trace/events/rcu.h>
40
41#include "rcu.h"
42
37/* 43/*
38 * Initialize an rcu_batch structure to empty. 44 * Initialize an rcu_batch structure to empty.
39 */ 45 */
@@ -92,9 +98,6 @@ static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
92 } 98 }
93} 99}
94 100
95/* single-thread state-machine */
96static void process_srcu(struct work_struct *work);
97
98static int init_srcu_struct_fields(struct srcu_struct *sp) 101static int init_srcu_struct_fields(struct srcu_struct *sp)
99{ 102{
100 sp->completed = 0; 103 sp->completed = 0;
@@ -464,7 +467,9 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
464 */ 467 */
465void synchronize_srcu(struct srcu_struct *sp) 468void synchronize_srcu(struct srcu_struct *sp)
466{ 469{
467 __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT); 470 __synchronize_srcu(sp, rcu_expedited
471 ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
472 : SYNCHRONIZE_SRCU_TRYCOUNT);
468} 473}
469EXPORT_SYMBOL_GPL(synchronize_srcu); 474EXPORT_SYMBOL_GPL(synchronize_srcu);
470 475
@@ -637,7 +642,7 @@ static void srcu_reschedule(struct srcu_struct *sp)
637/* 642/*
638 * This is the work-queue function that handles SRCU grace periods. 643 * This is the work-queue function that handles SRCU grace periods.
639 */ 644 */
640static void process_srcu(struct work_struct *work) 645void process_srcu(struct work_struct *work)
641{ 646{
642 struct srcu_struct *sp; 647 struct srcu_struct *sp;
643 648
@@ -648,3 +653,4 @@ static void process_srcu(struct work_struct *work)
648 srcu_invoke_callbacks(sp); 653 srcu_invoke_callbacks(sp);
649 srcu_reschedule(sp); 654 srcu_reschedule(sp);
650} 655}
656EXPORT_SYMBOL_GPL(process_srcu);
diff --git a/kernel/sys.c b/kernel/sys.c
index e6e0ece5f6a0..265b37690421 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1046,7 +1046,7 @@ void do_sys_times(struct tms *tms)
1046 cputime_t tgutime, tgstime, cutime, cstime; 1046 cputime_t tgutime, tgstime, cutime, cstime;
1047 1047
1048 spin_lock_irq(&current->sighand->siglock); 1048 spin_lock_irq(&current->sighand->siglock);
1049 thread_group_times(current, &tgutime, &tgstime); 1049 thread_group_cputime_adjusted(current, &tgutime, &tgstime);
1050 cutime = current->signal->cutime; 1050 cutime = current->signal->cutime;
1051 cstime = current->signal->cstime; 1051 cstime = current->signal->cstime;
1052 spin_unlock_irq(&current->sighand->siglock); 1052 spin_unlock_irq(&current->sighand->siglock);
@@ -1704,7 +1704,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1704 utime = stime = 0; 1704 utime = stime = 0;
1705 1705
1706 if (who == RUSAGE_THREAD) { 1706 if (who == RUSAGE_THREAD) {
1707 task_times(current, &utime, &stime); 1707 task_cputime_adjusted(current, &utime, &stime);
1708 accumulate_thread_rusage(p, r); 1708 accumulate_thread_rusage(p, r);
1709 maxrss = p->signal->maxrss; 1709 maxrss = p->signal->maxrss;
1710 goto out; 1710 goto out;
@@ -1730,7 +1730,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1730 break; 1730 break;
1731 1731
1732 case RUSAGE_SELF: 1732 case RUSAGE_SELF:
1733 thread_group_times(p, &tgutime, &tgstime); 1733 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1734 utime += tgutime; 1734 utime += tgutime;
1735 stime += tgstime; 1735 stime += tgstime;
1736 r->ru_nvcsw += p->signal->nvcsw; 1736 r->ru_nvcsw += p->signal->nvcsw;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 26f65eaa01f9..c88878db491e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
256static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ 256static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
257static int min_wakeup_granularity_ns; /* 0 usecs */ 257static int min_wakeup_granularity_ns; /* 0 usecs */
258static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 258static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
259#ifdef CONFIG_SMP
259static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; 260static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
260static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; 261static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
261#endif 262#endif /* CONFIG_SMP */
263#endif /* CONFIG_SCHED_DEBUG */
262 264
263#ifdef CONFIG_COMPACTION 265#ifdef CONFIG_COMPACTION
264static int min_extfrag_threshold; 266static int min_extfrag_threshold;
@@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = {
301 .extra1 = &min_wakeup_granularity_ns, 303 .extra1 = &min_wakeup_granularity_ns,
302 .extra2 = &max_wakeup_granularity_ns, 304 .extra2 = &max_wakeup_granularity_ns,
303 }, 305 },
306#ifdef CONFIG_SMP
304 { 307 {
305 .procname = "sched_tunable_scaling", 308 .procname = "sched_tunable_scaling",
306 .data = &sysctl_sched_tunable_scaling, 309 .data = &sysctl_sched_tunable_scaling,
@@ -347,7 +350,45 @@ static struct ctl_table kern_table[] = {
347 .extra1 = &zero, 350 .extra1 = &zero,
348 .extra2 = &one, 351 .extra2 = &one,
349 }, 352 },
350#endif 353#endif /* CONFIG_SMP */
354#ifdef CONFIG_NUMA_BALANCING
355 {
356 .procname = "numa_balancing_scan_delay_ms",
357 .data = &sysctl_numa_balancing_scan_delay,
358 .maxlen = sizeof(unsigned int),
359 .mode = 0644,
360 .proc_handler = proc_dointvec,
361 },
362 {
363 .procname = "numa_balancing_scan_period_min_ms",
364 .data = &sysctl_numa_balancing_scan_period_min,
365 .maxlen = sizeof(unsigned int),
366 .mode = 0644,
367 .proc_handler = proc_dointvec,
368 },
369 {
370 .procname = "numa_balancing_scan_period_reset",
371 .data = &sysctl_numa_balancing_scan_period_reset,
372 .maxlen = sizeof(unsigned int),
373 .mode = 0644,
374 .proc_handler = proc_dointvec,
375 },
376 {
377 .procname = "numa_balancing_scan_period_max_ms",
378 .data = &sysctl_numa_balancing_scan_period_max,
379 .maxlen = sizeof(unsigned int),
380 .mode = 0644,
381 .proc_handler = proc_dointvec,
382 },
383 {
384 .procname = "numa_balancing_scan_size_mb",
385 .data = &sysctl_numa_balancing_scan_size,
386 .maxlen = sizeof(unsigned int),
387 .mode = 0644,
388 .proc_handler = proc_dointvec,
389 },
390#endif /* CONFIG_NUMA_BALANCING */
391#endif /* CONFIG_SCHED_DEBUG */
351 { 392 {
352 .procname = "sched_rt_period_us", 393 .procname = "sched_rt_period_us",
353 .data = &sysctl_sched_rt_period, 394 .data = &sysctl_sched_rt_period,
@@ -565,7 +606,7 @@ static struct ctl_table kern_table[] = {
565 .extra2 = &one, 606 .extra2 = &one,
566 }, 607 },
567#endif 608#endif
568#ifdef CONFIG_HOTPLUG 609
569 { 610 {
570 .procname = "hotplug", 611 .procname = "hotplug",
571 .data = &uevent_helper, 612 .data = &uevent_helper,
@@ -573,7 +614,7 @@ static struct ctl_table kern_table[] = {
573 .mode = 0644, 614 .mode = 0644,
574 .proc_handler = proc_dostring, 615 .proc_handler = proc_dostring,
575 }, 616 },
576#endif 617
577#ifdef CONFIG_CHR_DEV_SG 618#ifdef CONFIG_CHR_DEV_SG
578 { 619 {
579 .procname = "sg-big-buff", 620 .procname = "sg-big-buff",
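
Note: the new numa_balancing_* knobs are plain kern_table rows, each binding a /proc/sys name to an unsigned int and the generic proc_dointvec handler. A stand-alone sketch of that table-driven pattern (the struct, handler, and variable names here are illustrative only):

#include <stdio.h>
#include <stdlib.h>

static unsigned int scan_delay_ms = 1000;
static unsigned int scan_size_mb  = 256;

struct toy_ctl {
	const char   *name;
	unsigned int *data;
};

static const struct toy_ctl table[] = {
	{ "numa_balancing_scan_delay_ms", &scan_delay_ms },
	{ "numa_balancing_scan_size_mb",  &scan_size_mb  },
};

/* Crude stand-in for proc_dointvec: write if a value is given, else read. */
static int toy_handler(const struct toy_ctl *ctl, const char *value)
{
	if (value)
		*ctl->data = (unsigned int)strtoul(value, NULL, 10);
	printf("%s = %u\n", ctl->name, *ctl->data);
	return 0;
}

int main(void)
{
	for (size_t i = 0; i < sizeof(table) / sizeof(table[0]); i++)
		toy_handler(&table[i], NULL);
	/* like: echo 2000 > /proc/sys/kernel/numa_balancing_scan_delay_ms */
	toy_handler(&table[0], "2000");
	return 0;
}
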
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index e2fd74b8e8c2..ff7d9d2ab504 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,4 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
2obj-y += timeconv.o posix-clock.o alarmtimer.o 2obj-y += timeconv.o posix-clock.o alarmtimer.o
3 3
4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 6629bf7b5285..7a925ba456fb 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -58,7 +58,7 @@ static cycle_t jiffies_read(struct clocksource *cs)
58 return (cycle_t) jiffies; 58 return (cycle_t) jiffies;
59} 59}
60 60
61struct clocksource clocksource_jiffies = { 61static struct clocksource clocksource_jiffies = {
62 .name = "jiffies", 62 .name = "jiffies",
63 .rating = 1, /* lowest valid rating*/ 63 .rating = 1, /* lowest valid rating*/
64 .read = jiffies_read, 64 .read = jiffies_read,
@@ -67,6 +67,8 @@ struct clocksource clocksource_jiffies = {
67 .shift = JIFFIES_SHIFT, 67 .shift = JIFFIES_SHIFT,
68}; 68};
69 69
70__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
71
70#if (BITS_PER_LONG < 64) 72#if (BITS_PER_LONG < 64)
71u64 get_jiffies_64(void) 73u64 get_jiffies_64(void)
72{ 74{
@@ -74,9 +76,9 @@ u64 get_jiffies_64(void)
74 u64 ret; 76 u64 ret;
75 77
76 do { 78 do {
77 seq = read_seqbegin(&xtime_lock); 79 seq = read_seqbegin(&jiffies_lock);
78 ret = jiffies_64; 80 ret = jiffies_64;
79 } while (read_seqretry(&xtime_lock, seq)); 81 } while (read_seqretry(&jiffies_lock, seq));
80 return ret; 82 return ret;
81} 83}
82EXPORT_SYMBOL(get_jiffies_64); 84EXPORT_SYMBOL(get_jiffies_64);
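
Note: jiffies_lock takes over from xtime_lock as the seqlock guarding the 64-bit jiffies value; readers such as get_jiffies_64() retry until they observe an even, unchanged sequence number. A minimal single-file sketch of that reader/writer protocol (simplified, single-threaded, no SMP memory-ordering barriers):

#include <stdio.h>
#include <stdint.h>

static unsigned int seq;        /* even = stable, odd = write in progress */
static uint64_t jiffies_64 = 0xffffffff00000000ull;

static void write_tick(void)
{
	seq++;                  /* mark write in progress (odd) */
	jiffies_64++;
	seq++;                  /* publish (even again) */
}

static uint64_t get_jiffies_64_sketch(void)
{
	unsigned int s;
	uint64_t ret;

	do {
		s = seq;                        /* read_seqbegin() */
		ret = jiffies_64;
	} while ((s & 1) || s != seq);          /* read_seqretry(): retry if torn */

	return ret;
}

int main(void)
{
	write_tick();
	printf("jiffies_64 = %llu\n", (unsigned long long)get_jiffies_64_sketch());
	return 0;
}
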
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index da6c9ecad4e4..b1600a6973f4 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -63,13 +63,13 @@ int tick_is_oneshot_available(void)
63static void tick_periodic(int cpu) 63static void tick_periodic(int cpu)
64{ 64{
65 if (tick_do_timer_cpu == cpu) { 65 if (tick_do_timer_cpu == cpu) {
66 write_seqlock(&xtime_lock); 66 write_seqlock(&jiffies_lock);
67 67
68 /* Keep track of the next tick event */ 68 /* Keep track of the next tick event */
69 tick_next_period = ktime_add(tick_next_period, tick_period); 69 tick_next_period = ktime_add(tick_next_period, tick_period);
70 70
71 do_timer(1); 71 do_timer(1);
72 write_sequnlock(&xtime_lock); 72 write_sequnlock(&jiffies_lock);
73 } 73 }
74 74
75 update_process_times(user_mode(get_irq_regs())); 75 update_process_times(user_mode(get_irq_regs()));
@@ -130,9 +130,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
130 ktime_t next; 130 ktime_t next;
131 131
132 do { 132 do {
133 seq = read_seqbegin(&xtime_lock); 133 seq = read_seqbegin(&jiffies_lock);
134 next = tick_next_period; 134 next = tick_next_period;
135 } while (read_seqretry(&xtime_lock, seq)); 135 } while (read_seqretry(&jiffies_lock, seq));
136 136
137 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 137 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
138 138
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 4e265b901fed..cf3e59ed6dc0 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -141,4 +141,3 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
141#endif 141#endif
142 142
143extern void do_timer(unsigned long ticks); 143extern void do_timer(unsigned long ticks);
144extern seqlock_t xtime_lock;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a40260885265..d58e552d9fd1 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -31,7 +31,7 @@
31static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); 31static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
32 32
33/* 33/*
34 * The time, when the last jiffy update happened. Protected by xtime_lock. 34 * The time, when the last jiffy update happened. Protected by jiffies_lock.
35 */ 35 */
36static ktime_t last_jiffies_update; 36static ktime_t last_jiffies_update;
37 37
@@ -49,14 +49,14 @@ static void tick_do_update_jiffies64(ktime_t now)
49 ktime_t delta; 49 ktime_t delta;
50 50
51 /* 51 /*
52 * Do a quick check without holding xtime_lock: 52 * Do a quick check without holding jiffies_lock:
53 */ 53 */
54 delta = ktime_sub(now, last_jiffies_update); 54 delta = ktime_sub(now, last_jiffies_update);
55 if (delta.tv64 < tick_period.tv64) 55 if (delta.tv64 < tick_period.tv64)
56 return; 56 return;
57 57
 58 /* Reevaluate with xtime_lock held */ 58 /* Reevaluate with jiffies_lock held */
59 write_seqlock(&xtime_lock); 59 write_seqlock(&jiffies_lock);
60 60
61 delta = ktime_sub(now, last_jiffies_update); 61 delta = ktime_sub(now, last_jiffies_update);
62 if (delta.tv64 >= tick_period.tv64) { 62 if (delta.tv64 >= tick_period.tv64) {
@@ -79,7 +79,7 @@ static void tick_do_update_jiffies64(ktime_t now)
79 /* Keep the tick_next_period variable up to date */ 79 /* Keep the tick_next_period variable up to date */
80 tick_next_period = ktime_add(last_jiffies_update, tick_period); 80 tick_next_period = ktime_add(last_jiffies_update, tick_period);
81 } 81 }
82 write_sequnlock(&xtime_lock); 82 write_sequnlock(&jiffies_lock);
83} 83}
84 84
85/* 85/*
@@ -89,15 +89,58 @@ static ktime_t tick_init_jiffy_update(void)
89{ 89{
90 ktime_t period; 90 ktime_t period;
91 91
92 write_seqlock(&xtime_lock); 92 write_seqlock(&jiffies_lock);
93 /* Did we start the jiffies update yet ? */ 93 /* Did we start the jiffies update yet ? */
94 if (last_jiffies_update.tv64 == 0) 94 if (last_jiffies_update.tv64 == 0)
95 last_jiffies_update = tick_next_period; 95 last_jiffies_update = tick_next_period;
96 period = last_jiffies_update; 96 period = last_jiffies_update;
97 write_sequnlock(&xtime_lock); 97 write_sequnlock(&jiffies_lock);
98 return period; 98 return period;
99} 99}
100 100
101
102static void tick_sched_do_timer(ktime_t now)
103{
104 int cpu = smp_processor_id();
105
106#ifdef CONFIG_NO_HZ
107 /*
108 * Check if the do_timer duty was dropped. We don't care about
109 * concurrency: This happens only when the cpu in charge went
 110 * into a long sleep. If two cpus happen to assign themselves to
111 * this duty, then the jiffies update is still serialized by
112 * jiffies_lock.
113 */
114 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
115 tick_do_timer_cpu = cpu;
116#endif
117
118 /* Check, if the jiffies need an update */
119 if (tick_do_timer_cpu == cpu)
120 tick_do_update_jiffies64(now);
121}
122
123static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
124{
125#ifdef CONFIG_NO_HZ
126 /*
127 * When we are idle and the tick is stopped, we have to touch
128 * the watchdog as we might not schedule for a really long
129 * time. This happens on complete idle SMP systems while
130 * waiting on the login prompt. We also increment the "start of
131 * idle" jiffy stamp so the idle accounting adjustment we do
 132 * when we go busy again does not account too many ticks.
133 */
134 if (ts->tick_stopped) {
135 touch_softlockup_watchdog();
136 if (is_idle_task(current))
137 ts->idle_jiffies++;
138 }
139#endif
140 update_process_times(user_mode(regs));
141 profile_tick(CPU_PROFILING);
142}
143
101/* 144/*
102 * NOHZ - aka dynamic tick functionality 145 * NOHZ - aka dynamic tick functionality
103 */ 146 */
@@ -282,11 +325,11 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
282 325
283 /* Read jiffies and the time when jiffies were updated last */ 326 /* Read jiffies and the time when jiffies were updated last */
284 do { 327 do {
285 seq = read_seqbegin(&xtime_lock); 328 seq = read_seqbegin(&jiffies_lock);
286 last_update = last_jiffies_update; 329 last_update = last_jiffies_update;
287 last_jiffies = jiffies; 330 last_jiffies = jiffies;
288 time_delta = timekeeping_max_deferment(); 331 time_delta = timekeeping_max_deferment();
289 } while (read_seqretry(&xtime_lock, seq)); 332 } while (read_seqretry(&jiffies_lock, seq));
290 333
291 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || 334 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) ||
292 arch_needs_cpu(cpu)) { 335 arch_needs_cpu(cpu)) {
@@ -526,6 +569,8 @@ void tick_nohz_irq_exit(void)
526 if (!ts->inidle) 569 if (!ts->inidle)
527 return; 570 return;
528 571
 572 /* Cancel the timer because the CPU has already woken up from the C-states */
573 menu_hrtimer_cancel();
529 __tick_nohz_idle_enter(ts); 574 __tick_nohz_idle_enter(ts);
530} 575}
531 576
@@ -621,6 +666,8 @@ void tick_nohz_idle_exit(void)
621 666
622 ts->inidle = 0; 667 ts->inidle = 0;
623 668
 669 /* Cancel the timer because the CPU has already woken up from the C-states */
670 menu_hrtimer_cancel();
624 if (ts->idle_active || ts->tick_stopped) 671 if (ts->idle_active || ts->tick_stopped)
625 now = ktime_get(); 672 now = ktime_get();
626 673
@@ -648,40 +695,12 @@ static void tick_nohz_handler(struct clock_event_device *dev)
648{ 695{
649 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 696 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
650 struct pt_regs *regs = get_irq_regs(); 697 struct pt_regs *regs = get_irq_regs();
651 int cpu = smp_processor_id();
652 ktime_t now = ktime_get(); 698 ktime_t now = ktime_get();
653 699
654 dev->next_event.tv64 = KTIME_MAX; 700 dev->next_event.tv64 = KTIME_MAX;
655 701
656 /* 702 tick_sched_do_timer(now);
657 * Check if the do_timer duty was dropped. We don't care about 703 tick_sched_handle(ts, regs);
658 * concurrency: This happens only when the cpu in charge went
659 * into a long sleep. If two cpus happen to assign themself to
660 * this duty, then the jiffies update is still serialized by
661 * xtime_lock.
662 */
663 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
664 tick_do_timer_cpu = cpu;
665
666 /* Check, if the jiffies need an update */
667 if (tick_do_timer_cpu == cpu)
668 tick_do_update_jiffies64(now);
669
670 /*
671 * When we are idle and the tick is stopped, we have to touch
672 * the watchdog as we might not schedule for a really long
673 * time. This happens on complete idle SMP systems while
674 * waiting on the login prompt. We also increment the "start
675 * of idle" jiffy stamp so the idle accounting adjustment we
676 * do when we go busy again does not account too much ticks.
677 */
678 if (ts->tick_stopped) {
679 touch_softlockup_watchdog();
680 ts->idle_jiffies++;
681 }
682
683 update_process_times(user_mode(regs));
684 profile_tick(CPU_PROFILING);
685 704
686 while (tick_nohz_reprogram(ts, now)) { 705 while (tick_nohz_reprogram(ts, now)) {
687 now = ktime_get(); 706 now = ktime_get();
@@ -794,7 +813,7 @@ void tick_check_idle(int cpu)
794#ifdef CONFIG_HIGH_RES_TIMERS 813#ifdef CONFIG_HIGH_RES_TIMERS
795/* 814/*
796 * We rearm the timer until we get disabled by the idle code. 815 * We rearm the timer until we get disabled by the idle code.
797 * Called with interrupts disabled and timer->base->cpu_base->lock held. 816 * Called with interrupts disabled.
798 */ 817 */
799static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) 818static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
800{ 819{
@@ -802,45 +821,15 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
802 container_of(timer, struct tick_sched, sched_timer); 821 container_of(timer, struct tick_sched, sched_timer);
803 struct pt_regs *regs = get_irq_regs(); 822 struct pt_regs *regs = get_irq_regs();
804 ktime_t now = ktime_get(); 823 ktime_t now = ktime_get();
805 int cpu = smp_processor_id();
806 824
807#ifdef CONFIG_NO_HZ 825 tick_sched_do_timer(now);
808 /*
809 * Check if the do_timer duty was dropped. We don't care about
810 * concurrency: This happens only when the cpu in charge went
811 * into a long sleep. If two cpus happen to assign themself to
812 * this duty, then the jiffies update is still serialized by
813 * xtime_lock.
814 */
815 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
816 tick_do_timer_cpu = cpu;
817#endif
818
819 /* Check, if the jiffies need an update */
820 if (tick_do_timer_cpu == cpu)
821 tick_do_update_jiffies64(now);
822 826
823 /* 827 /*
824 * Do not call, when we are not in irq context and have 828 * Do not call, when we are not in irq context and have
825 * no valid regs pointer 829 * no valid regs pointer
826 */ 830 */
827 if (regs) { 831 if (regs)
828 /* 832 tick_sched_handle(ts, regs);
829 * When we are idle and the tick is stopped, we have to touch
830 * the watchdog as we might not schedule for a really long
831 * time. This happens on complete idle SMP systems while
832 * waiting on the login prompt. We also increment the "start of
833 * idle" jiffy stamp so the idle accounting adjustment we do
834 * when we go busy again does not account too much ticks.
835 */
836 if (ts->tick_stopped) {
837 touch_softlockup_watchdog();
838 if (is_idle_task(current))
839 ts->idle_jiffies++;
840 }
841 update_process_times(user_mode(regs));
842 profile_tick(CPU_PROFILING);
843 }
844 833
845 hrtimer_forward(timer, now, tick_period); 834 hrtimer_forward(timer, now, tick_period);
846 835
@@ -874,7 +863,7 @@ void tick_setup_sched_timer(void)
874 /* Get the next period (per cpu) */ 863 /* Get the next period (per cpu) */
875 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 864 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
876 865
877 /* Offset the tick to avert xtime_lock contention. */ 866 /* Offset the tick to avert jiffies_lock contention. */
878 if (sched_skew_tick) { 867 if (sched_skew_tick) {
879 u64 offset = ktime_to_ns(tick_period) >> 1; 868 u64 offset = ktime_to_ns(tick_period) >> 1;
880 do_div(offset, num_possible_cpus()); 869 do_div(offset, num_possible_cpus());
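
Note: tick_nohz_handler() and tick_sched_timer() now share tick_sched_do_timer()/tick_sched_handle() instead of duplicating the jiffies-duty and watchdog logic, and the last hunk keeps the existing trick of skewing each CPU's tick to spread jiffies_lock contention. A small sketch of that skew arithmetic (values are illustrative; the per-CPU multiplication is not visible in the hunk and is assumed here):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ull

int main(void)
{
	uint64_t tick_period = NSEC_PER_SEC / 250;  /* HZ=250 -> 4 ms tick */
	unsigned int num_cpus = 8;

	/* Spread the per-CPU ticks across half a tick period. */
	uint64_t step = (tick_period >> 1) / num_cpus;

	for (unsigned int cpu = 0; cpu < num_cpus; cpu++)
		printf("cpu%u tick offset: %llu ns\n",
		       cpu, (unsigned long long)(cpu * step));
	return 0;
}
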
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
deleted file mode 100644
index a9ae369925ce..000000000000
--- a/kernel/time/timecompare.c
+++ /dev/null
@@ -1,193 +0,0 @@
1/*
2 * Copyright (C) 2009 Intel Corporation.
3 * Author: Patrick Ohly <patrick.ohly@intel.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20#include <linux/timecompare.h>
21#include <linux/module.h>
22#include <linux/slab.h>
23#include <linux/math64.h>
24#include <linux/kernel.h>
25
26/*
27 * fixed point arithmetic scale factor for skew
28 *
29 * Usually one would measure skew in ppb (parts per billion, 1e9), but
30 * using a factor of 2 simplifies the math.
31 */
32#define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30)
33
34ktime_t timecompare_transform(struct timecompare *sync,
35 u64 source_tstamp)
36{
37 u64 nsec;
38
39 nsec = source_tstamp + sync->offset;
40 nsec += (s64)(source_tstamp - sync->last_update) * sync->skew /
41 TIMECOMPARE_SKEW_RESOLUTION;
42
43 return ns_to_ktime(nsec);
44}
45EXPORT_SYMBOL_GPL(timecompare_transform);
46
47int timecompare_offset(struct timecompare *sync,
48 s64 *offset,
49 u64 *source_tstamp)
50{
51 u64 start_source = 0, end_source = 0;
52 struct {
53 s64 offset;
54 s64 duration_target;
55 } buffer[10], sample, *samples;
56 int counter = 0, i;
57 int used;
58 int index;
59 int num_samples = sync->num_samples;
60
61 if (num_samples > ARRAY_SIZE(buffer)) {
62 samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
63 if (!samples) {
64 samples = buffer;
65 num_samples = ARRAY_SIZE(buffer);
66 }
67 } else {
68 samples = buffer;
69 }
70
71 /* run until we have enough valid samples, but do not try forever */
72 i = 0;
73 counter = 0;
74 while (1) {
75 u64 ts;
76 ktime_t start, end;
77
78 start = sync->target();
79 ts = timecounter_read(sync->source);
80 end = sync->target();
81
82 if (!i)
83 start_source = ts;
84
85 /* ignore negative durations */
86 sample.duration_target = ktime_to_ns(ktime_sub(end, start));
87 if (sample.duration_target >= 0) {
88 /*
 89 * assume symmetric delay to and from source:
90 * average target time corresponds to measured
91 * source time
92 */
93 sample.offset =
94 (ktime_to_ns(end) + ktime_to_ns(start)) / 2 -
95 ts;
96
97 /* simple insertion sort based on duration */
98 index = counter - 1;
99 while (index >= 0) {
100 if (samples[index].duration_target <
101 sample.duration_target)
102 break;
103 samples[index + 1] = samples[index];
104 index--;
105 }
106 samples[index + 1] = sample;
107 counter++;
108 }
109
110 i++;
111 if (counter >= num_samples || i >= 100000) {
112 end_source = ts;
113 break;
114 }
115 }
116
117 *source_tstamp = (end_source + start_source) / 2;
118
119 /* remove outliers by only using 75% of the samples */
120 used = counter * 3 / 4;
121 if (!used)
122 used = counter;
123 if (used) {
124 /* calculate average */
125 s64 off = 0;
126 for (index = 0; index < used; index++)
127 off += samples[index].offset;
128 *offset = div_s64(off, used);
129 }
130
131 if (samples && samples != buffer)
132 kfree(samples);
133
134 return used;
135}
136EXPORT_SYMBOL_GPL(timecompare_offset);
137
138void __timecompare_update(struct timecompare *sync,
139 u64 source_tstamp)
140{
141 s64 offset;
142 u64 average_time;
143
144 if (!timecompare_offset(sync, &offset, &average_time))
145 return;
146
147 if (!sync->last_update) {
148 sync->last_update = average_time;
149 sync->offset = offset;
150 sync->skew = 0;
151 } else {
152 s64 delta_nsec = average_time - sync->last_update;
153
154 /* avoid division by negative or small deltas */
155 if (delta_nsec >= 10000) {
156 s64 delta_offset_nsec = offset - sync->offset;
157 s64 skew; /* delta_offset_nsec *
158 TIMECOMPARE_SKEW_RESOLUTION /
159 delta_nsec */
160 u64 divisor;
161
162 /* div_s64() is limited to 32 bit divisor */
163 skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION;
164 divisor = delta_nsec;
165 while (unlikely(divisor >= ((s64)1) << 32)) {
166 /* divide both by 2; beware, right shift
167 of negative value has undefined
168 behavior and can only be used for
169 the positive divisor */
170 skew = div_s64(skew, 2);
171 divisor >>= 1;
172 }
173 skew = div_s64(skew, divisor);
174
175 /*
176 * Calculate new overall skew as 4/16 the
177 * old value and 12/16 the new one. This is
178 * a rather arbitrary tradeoff between
179 * only using the latest measurement (0/16 and
180 * 16/16) and even more weight on past measurements.
181 */
182#define TIMECOMPARE_NEW_SKEW_PER_16 12
183 sync->skew =
184 div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) *
185 sync->skew +
186 TIMECOMPARE_NEW_SKEW_PER_16 * skew,
187 16);
188 sync->last_update = average_time;
189 sync->offset = offset;
190 }
191 }
192}
193EXPORT_SYMBOL_GPL(__timecompare_update);
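
Note: kernel/time/timecompare.c is removed outright. For reference, its offset estimator sorted samples by round-trip duration and averaged only the fastest 75% to shed outliers; the compact stand-alone rewrite below reproduces just that filtering step (not the deleted kernel code itself).

#include <stdio.h>
#include <stdlib.h>

struct sample {
	long long offset;          /* estimated clock offset, ns */
	long long duration;        /* round-trip duration, ns */
};

static int by_duration(const void *a, const void *b)
{
	const struct sample *x = a, *y = b;
	return (x->duration > y->duration) - (x->duration < y->duration);
}

/* Average the offsets of the fastest 75% of samples (outlier rejection). */
static long long trimmed_offset(struct sample *s, int n)
{
	int used = n * 3 / 4;
	long long sum = 0;

	if (!used)
		used = n;
	qsort(s, n, sizeof(*s), by_duration);
	for (int i = 0; i < used; i++)
		sum += s[i].offset;
	return sum / used;
}

int main(void)
{
	struct sample s[] = {
		{ 100, 2000 }, { 105, 2100 }, { 98, 1900 }, { 900, 90000 },
	};
	printf("offset estimate: %lld ns\n",
	       trimmed_offset(s, sizeof(s) / sizeof(s[0])));
	return 0;
}
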
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e424970bb562..cbc6acb0db3f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -21,16 +21,11 @@
21#include <linux/time.h> 21#include <linux/time.h>
22#include <linux/tick.h> 22#include <linux/tick.h>
23#include <linux/stop_machine.h> 23#include <linux/stop_machine.h>
24#include <linux/pvclock_gtod.h>
24 25
25 26
26static struct timekeeper timekeeper; 27static struct timekeeper timekeeper;
27 28
28/*
29 * This read-write spinlock protects us from races in SMP while
30 * playing with xtime.
31 */
32__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
33
34/* flag for if timekeeping is suspended */ 29/* flag for if timekeeping is suspended */
35int __read_mostly timekeeping_suspended; 30int __read_mostly timekeeping_suspended;
36 31
@@ -180,6 +175,54 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
180 return nsec + arch_gettimeoffset(); 175 return nsec + arch_gettimeoffset();
181} 176}
182 177
178static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
179
180static void update_pvclock_gtod(struct timekeeper *tk)
181{
182 raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk);
183}
184
185/**
186 * pvclock_gtod_register_notifier - register a pvclock timedata update listener
187 *
188 * Must hold write on timekeeper.lock
189 */
190int pvclock_gtod_register_notifier(struct notifier_block *nb)
191{
192 struct timekeeper *tk = &timekeeper;
193 unsigned long flags;
194 int ret;
195
196 write_seqlock_irqsave(&tk->lock, flags);
197 ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
198 /* update timekeeping data */
199 update_pvclock_gtod(tk);
200 write_sequnlock_irqrestore(&tk->lock, flags);
201
202 return ret;
203}
204EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);
205
206/**
207 * pvclock_gtod_unregister_notifier - unregister a pvclock
208 * timedata update listener
209 *
210 * Must hold write on timekeeper.lock
211 */
212int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
213{
214 struct timekeeper *tk = &timekeeper;
215 unsigned long flags;
216 int ret;
217
218 write_seqlock_irqsave(&tk->lock, flags);
219 ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
220 write_sequnlock_irqrestore(&tk->lock, flags);
221
222 return ret;
223}
224EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
225
183/* must hold write on timekeeper.lock */ 226/* must hold write on timekeeper.lock */
184static void timekeeping_update(struct timekeeper *tk, bool clearntp) 227static void timekeeping_update(struct timekeeper *tk, bool clearntp)
185{ 228{
@@ -188,6 +231,7 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp)
188 ntp_clear(); 231 ntp_clear();
189 } 232 }
190 update_vsyscall(tk); 233 update_vsyscall(tk);
234 update_pvclock_gtod(tk);
191} 235}
192 236
193/** 237/**
@@ -1299,9 +1343,7 @@ struct timespec get_monotonic_coarse(void)
1299} 1343}
1300 1344
1301/* 1345/*
1302 * The 64-bit jiffies value is not atomic - you MUST NOT read it 1346 * Must hold jiffies_lock
1303 * without sampling the sequence number in xtime_lock.
1304 * jiffies is defined in the linker script...
1305 */ 1347 */
1306void do_timer(unsigned long ticks) 1348void do_timer(unsigned long ticks)
1307{ 1349{
@@ -1389,7 +1431,7 @@ EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1389 */ 1431 */
1390void xtime_update(unsigned long ticks) 1432void xtime_update(unsigned long ticks)
1391{ 1433{
1392 write_seqlock(&xtime_lock); 1434 write_seqlock(&jiffies_lock);
1393 do_timer(ticks); 1435 do_timer(ticks);
1394 write_sequnlock(&xtime_lock); 1436 write_sequnlock(&jiffies_lock);
1395} 1437}
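
Note: the pvclock_gtod notifier gives paravirt clock code a hook that fires on every timekeeping update; it is a raw notifier chain invoked under the timekeeper's write seqlock. A stand-alone sketch of the register-then-call pattern (simplified, no locking, names are illustrative):

#include <stdio.h>

struct notifier {
	void (*fn)(void *data);
	struct notifier *next;
};

static struct notifier *gtod_chain;

static void register_notifier(struct notifier *nb)
{
	nb->next = gtod_chain;
	gtod_chain = nb;
	nb->fn(NULL);              /* mirror the immediate update on register */
}

static void timekeeping_update_sketch(void *tk)
{
	for (struct notifier *nb = gtod_chain; nb; nb = nb->next)
		nb->fn(tk);        /* update_pvclock_gtod() analogue */
}

static void pvclock_listener(void *tk)
{
	(void)tk;
	printf("pvclock listener: timekeeping data changed\n");
}

int main(void)
{
	struct notifier nb = { .fn = pvclock_listener };

	register_notifier(&nb);
	timekeeping_update_sketch(NULL);
	return 0;
}
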
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 4cea4f41c1d9..5d89335a485f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -119,6 +119,7 @@ config TRACING
119 select BINARY_PRINTF 119 select BINARY_PRINTF
120 select EVENT_TRACING 120 select EVENT_TRACING
121 select TRACE_CLOCK 121 select TRACE_CLOCK
122 select IRQ_WORK
122 123
123config GENERIC_TRACER 124config GENERIC_TRACER
124 bool 125 bool
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9dcf15d38380..3ffe4c5ad3f3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -10,7 +10,7 @@
10 * Based on code in the latency_tracer, that is: 10 * Based on code in the latency_tracer, that is:
11 * 11 *
12 * Copyright (C) 2004-2006 Ingo Molnar 12 * Copyright (C) 2004-2006 Ingo Molnar
13 * Copyright (C) 2004 William Lee Irwin III 13 * Copyright (C) 2004 Nadia Yvette Chambers
14 */ 14 */
15 15
16#include <linux/stop_machine.h> 16#include <linux/stop_machine.h>
@@ -2437,7 +2437,7 @@ static void reset_iter_read(struct ftrace_iterator *iter)
2437{ 2437{
2438 iter->pos = 0; 2438 iter->pos = 0;
2439 iter->func_pos = 0; 2439 iter->func_pos = 0;
2440 iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH); 2440 iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH);
2441} 2441}
2442 2442
2443static void *t_start(struct seq_file *m, loff_t *pos) 2443static void *t_start(struct seq_file *m, loff_t *pos)
@@ -2675,12 +2675,12 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
2675} 2675}
2676 2676
2677loff_t 2677loff_t
2678ftrace_regex_lseek(struct file *file, loff_t offset, int origin) 2678ftrace_regex_lseek(struct file *file, loff_t offset, int whence)
2679{ 2679{
2680 loff_t ret; 2680 loff_t ret;
2681 2681
2682 if (file->f_mode & FMODE_READ) 2682 if (file->f_mode & FMODE_READ)
2683 ret = seq_lseek(file, offset, origin); 2683 ret = seq_lseek(file, offset, whence);
2684 else 2684 else
2685 file->f_pos = ret = 1; 2685 file->f_pos = ret = 1;
2686 2686
@@ -2868,7 +2868,7 @@ static int __init ftrace_mod_cmd_init(void)
2868{ 2868{
2869 return register_ftrace_command(&ftrace_mod_cmd); 2869 return register_ftrace_command(&ftrace_mod_cmd);
2870} 2870}
2871device_initcall(ftrace_mod_cmd_init); 2871core_initcall(ftrace_mod_cmd_init);
2872 2872
2873static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, 2873static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
2874 struct ftrace_ops *op, struct pt_regs *pt_regs) 2874 struct ftrace_ops *op, struct pt_regs *pt_regs)
@@ -4055,7 +4055,7 @@ static int __init ftrace_nodyn_init(void)
4055 ftrace_enabled = 1; 4055 ftrace_enabled = 1;
4056 return 0; 4056 return 0;
4057} 4057}
4058device_initcall(ftrace_nodyn_init); 4058core_initcall(ftrace_nodyn_init);
4059 4059
4060static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 4060static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
4061static inline void ftrace_startup_enable(int command) { } 4061static inline void ftrace_startup_enable(int command) { }
@@ -4381,7 +4381,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
4381 if (strlen(tmp) == 0) 4381 if (strlen(tmp) == 0)
4382 return 1; 4382 return 1;
4383 4383
4384 ret = strict_strtol(tmp, 10, &val); 4384 ret = kstrtol(tmp, 10, &val);
4385 if (ret < 0) 4385 if (ret < 0)
4386 return ret; 4386 return ret;
4387 4387
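
Note: the subtle fix in reset_iter_read() above is the mask change from ~(A & B) to ~(A | B); clearing two flag bits requires OR inside the complement, since ~(A & B) only clears bits common to both masks (usually none). A two-line demonstration with illustrative flag values:

#include <stdio.h>

#define FLAG_PRINTALL 0x1
#define FLAG_HASH     0x2

int main(void)
{
	unsigned int flags = FLAG_PRINTALL | FLAG_HASH | 0x8;

	unsigned int buggy = flags & ~(FLAG_PRINTALL & FLAG_HASH); /* clears nothing */
	unsigned int fixed = flags & ~(FLAG_PRINTALL | FLAG_HASH); /* clears both   */

	printf("buggy: 0x%x  fixed: 0x%x\n", buggy, fixed);        /* 0xb vs 0x8 */
	return 0;
}
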
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b979426d16c6..ce8514feedcd 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -460,9 +460,10 @@ struct ring_buffer_per_cpu {
460 unsigned long lost_events; 460 unsigned long lost_events;
461 unsigned long last_overrun; 461 unsigned long last_overrun;
462 local_t entries_bytes; 462 local_t entries_bytes;
463 local_t commit_overrun;
464 local_t overrun;
465 local_t entries; 463 local_t entries;
464 local_t overrun;
465 local_t commit_overrun;
466 local_t dropped_events;
466 local_t committing; 467 local_t committing;
467 local_t commits; 468 local_t commits;
468 unsigned long read; 469 unsigned long read;
@@ -1396,6 +1397,8 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
1396 struct list_head *head_page_with_bit; 1397 struct list_head *head_page_with_bit;
1397 1398
1398 head_page = &rb_set_head_page(cpu_buffer)->list; 1399 head_page = &rb_set_head_page(cpu_buffer)->list;
1400 if (!head_page)
1401 break;
1399 prev_page = head_page->prev; 1402 prev_page = head_page->prev;
1400 1403
1401 first_page = pages->next; 1404 first_page = pages->next;
@@ -1820,7 +1823,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
1820} 1823}
1821 1824
1822/** 1825/**
1823 * ring_buffer_update_event - update event type and data 1826 * rb_update_event - update event type and data
 1824 * @event: the event to update 1827 * @event: the event to update
1825 * @type: the type of event 1828 * @type: the type of event
1826 * @length: the size of the event field in the ring buffer 1829 * @length: the size of the event field in the ring buffer
@@ -2155,8 +2158,10 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
2155 * If we are not in overwrite mode, 2158 * If we are not in overwrite mode,
2156 * this is easy, just stop here. 2159 * this is easy, just stop here.
2157 */ 2160 */
2158 if (!(buffer->flags & RB_FL_OVERWRITE)) 2161 if (!(buffer->flags & RB_FL_OVERWRITE)) {
2162 local_inc(&cpu_buffer->dropped_events);
2159 goto out_reset; 2163 goto out_reset;
2164 }
2160 2165
2161 ret = rb_handle_head_page(cpu_buffer, 2166 ret = rb_handle_head_page(cpu_buffer,
2162 tail_page, 2167 tail_page,
@@ -2720,8 +2725,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
2720 * and not the length of the event which would hold the header. 2725 * and not the length of the event which would hold the header.
2721 */ 2726 */
2722int ring_buffer_write(struct ring_buffer *buffer, 2727int ring_buffer_write(struct ring_buffer *buffer,
2723 unsigned long length, 2728 unsigned long length,
2724 void *data) 2729 void *data)
2725{ 2730{
2726 struct ring_buffer_per_cpu *cpu_buffer; 2731 struct ring_buffer_per_cpu *cpu_buffer;
2727 struct ring_buffer_event *event; 2732 struct ring_buffer_event *event;
@@ -2929,12 +2934,12 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
2929 * @buffer: The ring buffer 2934 * @buffer: The ring buffer
2930 * @cpu: The per CPU buffer to read from. 2935 * @cpu: The per CPU buffer to read from.
2931 */ 2936 */
2932unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) 2937u64 ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
2933{ 2938{
2934 unsigned long flags; 2939 unsigned long flags;
2935 struct ring_buffer_per_cpu *cpu_buffer; 2940 struct ring_buffer_per_cpu *cpu_buffer;
2936 struct buffer_page *bpage; 2941 struct buffer_page *bpage;
2937 unsigned long ret; 2942 u64 ret = 0;
2938 2943
2939 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2944 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2940 return 0; 2945 return 0;
@@ -2949,7 +2954,8 @@ unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
2949 bpage = cpu_buffer->reader_page; 2954 bpage = cpu_buffer->reader_page;
2950 else 2955 else
2951 bpage = rb_set_head_page(cpu_buffer); 2956 bpage = rb_set_head_page(cpu_buffer);
2952 ret = bpage->page->time_stamp; 2957 if (bpage)
2958 ret = bpage->page->time_stamp;
2953 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2959 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2954 2960
2955 return ret; 2961 return ret;
@@ -2995,7 +3001,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
2995EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); 3001EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
2996 3002
2997/** 3003/**
2998 * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer 3004 * ring_buffer_overrun_cpu - get the number of overruns caused by the ring
3005 * buffer wrapping around (only if RB_FL_OVERWRITE is on).
2999 * @buffer: The ring buffer 3006 * @buffer: The ring buffer
3000 * @cpu: The per CPU buffer to get the number of overruns from 3007 * @cpu: The per CPU buffer to get the number of overruns from
3001 */ 3008 */
@@ -3015,7 +3022,9 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
3015EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); 3022EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
3016 3023
3017/** 3024/**
3018 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits 3025 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by
3026 * commits failing due to the buffer wrapping around while there are uncommitted
3027 * events, such as during an interrupt storm.
3019 * @buffer: The ring buffer 3028 * @buffer: The ring buffer
3020 * @cpu: The per CPU buffer to get the number of overruns from 3029 * @cpu: The per CPU buffer to get the number of overruns from
3021 */ 3030 */
@@ -3036,6 +3045,28 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
3036EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); 3045EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
3037 3046
3038/** 3047/**
3048 * ring_buffer_dropped_events_cpu - get the number of dropped events caused by
3049 * the ring buffer filling up (only if RB_FL_OVERWRITE is off).
3050 * @buffer: The ring buffer
 3051 * @cpu: The per CPU buffer to get the number of dropped events from
3052 */
3053unsigned long
3054ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu)
3055{
3056 struct ring_buffer_per_cpu *cpu_buffer;
3057 unsigned long ret;
3058
3059 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3060 return 0;
3061
3062 cpu_buffer = buffer->buffers[cpu];
3063 ret = local_read(&cpu_buffer->dropped_events);
3064
3065 return ret;
3066}
3067EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu);
3068
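For orientation, the per-CPU counters exported around this hunk split lost data into three cases: "overrun" (overwrite mode wrapped and old events were discarded), "commit_overrun" (the buffer wrapped while nested writers still had uncommitted events) and the new "dropped_events" (overwrite is off and the write was refused). A minimal sketch of how a caller might total them, assuming only the accessors visible in this diff; my_summarize_loss() is a hypothetical helper, not a kernel function:

#include <linux/ring_buffer.h>
#include <linux/cpumask.h>
#include <linux/printk.h>

/* Sketch: add up the three loss counters across all online CPUs. */
static void my_summarize_loss(struct ring_buffer *buffer)
{
        unsigned long overrun = 0, commit_overrun = 0, dropped = 0;
        int cpu;

        for_each_online_cpu(cpu) {
                overrun        += ring_buffer_overrun_cpu(buffer, cpu);
                commit_overrun += ring_buffer_commit_overrun_cpu(buffer, cpu);
                dropped        += ring_buffer_dropped_events_cpu(buffer, cpu);
        }

        pr_info("overrun=%lu commit_overrun=%lu dropped=%lu\n",
                overrun, commit_overrun, dropped);
}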
3069/**
3039 * ring_buffer_entries - get the number of entries in a buffer 3070 * ring_buffer_entries - get the number of entries in a buffer
3040 * @buffer: The ring buffer 3071 * @buffer: The ring buffer
3041 * 3072 *
@@ -3260,6 +3291,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
3260 * Splice the empty reader page into the list around the head. 3291 * Splice the empty reader page into the list around the head.
3261 */ 3292 */
3262 reader = rb_set_head_page(cpu_buffer); 3293 reader = rb_set_head_page(cpu_buffer);
3294 if (!reader)
3295 goto out;
3263 cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); 3296 cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next);
3264 cpu_buffer->reader_page->list.prev = reader->list.prev; 3297 cpu_buffer->reader_page->list.prev = reader->list.prev;
3265 3298
@@ -3778,12 +3811,17 @@ void
3778ring_buffer_read_finish(struct ring_buffer_iter *iter) 3811ring_buffer_read_finish(struct ring_buffer_iter *iter)
3779{ 3812{
3780 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3813 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3814 unsigned long flags;
3781 3815
3782 /* 3816 /*
3783 * Ring buffer is disabled from recording, here's a good place 3817 * Ring buffer is disabled from recording, here's a good place
3784 * to check the integrity of the ring buffer. 3818 * to check the integrity of the ring buffer.
3819 * Must prevent readers from trying to read, as the check
3820 * clears the HEAD page and readers require it.
3785 */ 3821 */
3822 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3786 rb_check_pages(cpu_buffer); 3823 rb_check_pages(cpu_buffer);
3824 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3787 3825
3788 atomic_dec(&cpu_buffer->record_disabled); 3826 atomic_dec(&cpu_buffer->record_disabled);
3789 atomic_dec(&cpu_buffer->buffer->resize_disabled); 3827 atomic_dec(&cpu_buffer->buffer->resize_disabled);
@@ -3864,9 +3902,10 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3864 local_set(&cpu_buffer->reader_page->page->commit, 0); 3902 local_set(&cpu_buffer->reader_page->page->commit, 0);
3865 cpu_buffer->reader_page->read = 0; 3903 cpu_buffer->reader_page->read = 0;
3866 3904
3867 local_set(&cpu_buffer->commit_overrun, 0);
3868 local_set(&cpu_buffer->entries_bytes, 0); 3905 local_set(&cpu_buffer->entries_bytes, 0);
3869 local_set(&cpu_buffer->overrun, 0); 3906 local_set(&cpu_buffer->overrun, 0);
3907 local_set(&cpu_buffer->commit_overrun, 0);
3908 local_set(&cpu_buffer->dropped_events, 0);
3870 local_set(&cpu_buffer->entries, 0); 3909 local_set(&cpu_buffer->entries, 0);
3871 local_set(&cpu_buffer->committing, 0); 3910 local_set(&cpu_buffer->committing, 0);
3872 local_set(&cpu_buffer->commits, 0); 3911 local_set(&cpu_buffer->commits, 0);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 31e4f55773f1..61e081b4ba11 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9,7 +9,7 @@
9 * 9 *
10 * Based on code from the latency_tracer, that is: 10 * Based on code from the latency_tracer, that is:
11 * Copyright (C) 2004-2006 Ingo Molnar 11 * Copyright (C) 2004-2006 Ingo Molnar
12 * Copyright (C) 2004 William Lee Irwin III 12 * Copyright (C) 2004 Nadia Yvette Chambers
13 */ 13 */
14#include <linux/ring_buffer.h> 14#include <linux/ring_buffer.h>
15#include <generated/utsrelease.h> 15#include <generated/utsrelease.h>
@@ -19,6 +19,7 @@
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/notifier.h> 20#include <linux/notifier.h>
21#include <linux/irqflags.h> 21#include <linux/irqflags.h>
22#include <linux/irq_work.h>
22#include <linux/debugfs.h> 23#include <linux/debugfs.h>
23#include <linux/pagemap.h> 24#include <linux/pagemap.h>
24#include <linux/hardirq.h> 25#include <linux/hardirq.h>
@@ -78,6 +79,21 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
78} 79}
79 80
80/* 81/*
82 * To prevent the comm cache from being overwritten when no
83 * tracing is active, only save the comm when a trace event
84 * occurred.
85 */
86static DEFINE_PER_CPU(bool, trace_cmdline_save);
87
88/*
 89 * When a reader is waiting for data, this variable is
90 * set to true.
91 */
92static bool trace_wakeup_needed;
93
94static struct irq_work trace_work_wakeup;
95
96/*
81 * Kill all tracing for good (never come back). 97 * Kill all tracing for good (never come back).
82 * It is initialized to 1 but will turn to zero if the initialization 98 * It is initialized to 1 but will turn to zero if the initialization
83 * of the tracer is successful. But that is the only place that sets 99 * of the tracer is successful. But that is the only place that sets
@@ -139,6 +155,18 @@ static int __init set_ftrace_dump_on_oops(char *str)
139} 155}
140__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 156__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
141 157
158
159static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
160static char *trace_boot_options __initdata;
161
162static int __init set_trace_boot_options(char *str)
163{
164 strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
165 trace_boot_options = trace_boot_options_buf;
166 return 0;
167}
168__setup("trace_options=", set_trace_boot_options);
169
142unsigned long long ns2usecs(cycle_t nsec) 170unsigned long long ns2usecs(cycle_t nsec)
143{ 171{
144 nsec += 500; 172 nsec += 500;
@@ -198,20 +226,9 @@ static struct trace_array max_tr;
198 226
199static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); 227static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
200 228
201/* tracer_enabled is used to toggle activation of a tracer */
202static int tracer_enabled = 1;
203
204/**
205 * tracing_is_enabled - return tracer_enabled status
206 *
207 * This function is used by other tracers to know the status
208 * of the tracer_enabled flag. Tracers may use this function
209 * to know if it should enable their features when starting
210 * up. See irqsoff tracer for an example (start_irqsoff_tracer).
211 */
212int tracing_is_enabled(void) 229int tracing_is_enabled(void)
213{ 230{
214 return tracer_enabled; 231 return tracing_is_on();
215} 232}
216 233
217/* 234/*
@@ -333,12 +350,18 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
333static int trace_stop_count; 350static int trace_stop_count;
334static DEFINE_RAW_SPINLOCK(tracing_start_lock); 351static DEFINE_RAW_SPINLOCK(tracing_start_lock);
335 352
336static void wakeup_work_handler(struct work_struct *work) 353/**
354 * trace_wake_up - wake up tasks waiting for trace input
355 *
 356 * Runs as an irq_work callback to wake up any task that is blocked
 357 * on the trace_wait queue. This is used with trace_poll for tasks
 358 * polling the trace.
359 */
360static void trace_wake_up(struct irq_work *work)
337{ 361{
338 wake_up(&trace_wait); 362 wake_up_all(&trace_wait);
339}
340 363
341static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); 364}
342 365
343/** 366/**
344 * tracing_on - enable tracing buffers 367 * tracing_on - enable tracing buffers
@@ -393,22 +416,6 @@ int tracing_is_on(void)
393} 416}
394EXPORT_SYMBOL_GPL(tracing_is_on); 417EXPORT_SYMBOL_GPL(tracing_is_on);
395 418
396/**
397 * trace_wake_up - wake up tasks waiting for trace input
398 *
399 * Schedules a delayed work to wake up any task that is blocked on the
400 * trace_wait queue. These is used with trace_poll for tasks polling the
401 * trace.
402 */
403void trace_wake_up(void)
404{
405 const unsigned long delay = msecs_to_jiffies(2);
406
407 if (trace_flags & TRACE_ITER_BLOCK)
408 return;
409 schedule_delayed_work(&wakeup_work, delay);
410}
411
412static int __init set_buf_size(char *str) 419static int __init set_buf_size(char *str)
413{ 420{
414 unsigned long buf_size; 421 unsigned long buf_size;
@@ -431,7 +438,7 @@ static int __init set_tracing_thresh(char *str)
431 438
432 if (!str) 439 if (!str)
433 return 0; 440 return 0;
434 ret = strict_strtoul(str, 0, &threshold); 441 ret = kstrtoul(str, 0, &threshold);
435 if (ret < 0) 442 if (ret < 0)
436 return 0; 443 return 0;
437 tracing_thresh = threshold * 1000; 444 tracing_thresh = threshold * 1000;
@@ -477,10 +484,12 @@ static const char *trace_options[] = {
477static struct { 484static struct {
478 u64 (*func)(void); 485 u64 (*func)(void);
479 const char *name; 486 const char *name;
487 int in_ns; /* is this clock in nanoseconds? */
480} trace_clocks[] = { 488} trace_clocks[] = {
481 { trace_clock_local, "local" }, 489 { trace_clock_local, "local", 1 },
482 { trace_clock_global, "global" }, 490 { trace_clock_global, "global", 1 },
483 { trace_clock_counter, "counter" }, 491 { trace_clock_counter, "counter", 0 },
492 ARCH_TRACE_CLOCKS
484}; 493};
485 494
486int trace_clock_id; 495int trace_clock_id;
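The new in_ns field is what the later output paths key off to decide between sec.usec formatting and a raw count (see the __tracing_open(), tracing_open_pipe() and stats changes below). A stripped-down sketch of the same table-plus-flag idea; the struct and helper names are illustrative, not the kernel's:

#include <linux/kernel.h>
#include <linux/math64.h>
#include <linux/time.h>
#include <linux/types.h>

struct my_clock_desc {
        u64 (*read)(void);
        const char *name;
        int in_ns;                      /* 1 if the clock counts nanoseconds */
};

/* Format a timestamp as "sec.usec" for ns clocks, or raw for counters/TSC. */
static int my_print_ts(char *buf, size_t len,
                       const struct my_clock_desc *clk, u64 ts)
{
        if (clk->in_ns) {
                u64 t = div_u64(ts, NSEC_PER_USEC);     /* ns -> us */
                u32 usec_rem = do_div(t, USEC_PER_SEC); /* t is now seconds */

                return scnprintf(buf, len, "%llu.%06u",
                                 (unsigned long long)t, usec_rem);
        }
        return scnprintf(buf, len, "%llu", (unsigned long long)ts);
}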
@@ -757,6 +766,40 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
757} 766}
758#endif /* CONFIG_TRACER_MAX_TRACE */ 767#endif /* CONFIG_TRACER_MAX_TRACE */
759 768
769static void default_wait_pipe(struct trace_iterator *iter)
770{
771 DEFINE_WAIT(wait);
772
773 prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE);
774
775 /*
776 * The events can happen in critical sections where
777 * checking a work queue can cause deadlocks.
 778 * After adding a task to the wait queue, this flag is set
 779 * so that events know to try to wake up the queue
 780 * using irq_work.
781 *
782 * We don't clear it even if the buffer is no longer
783 * empty. The flag only causes the next event to run
 784 * irq_work to do the work queue wake up. The worst
785 * that can happen if we race with !trace_empty() is that
786 * an event will cause an irq_work to try to wake up
787 * an empty queue.
788 *
789 * There's no reason to protect this flag either, as
790 * the work queue and irq_work logic will do the necessary
791 * synchronization for the wake ups. The only thing
792 * that is necessary is that the wake up happens after
 793 * a task has been queued. Spurious wake ups are OK.
794 */
795 trace_wakeup_needed = true;
796
797 if (trace_empty(iter))
798 schedule();
799
800 finish_wait(&trace_wait, &wait);
801}
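A point worth spelling out: the reader side of the new wakeup scheme takes no lock at all. It queues itself on the wait queue first, then raises the "someone is waiting" flag, and tolerates spurious wakeups, so ordering rather than mutual exclusion is the only requirement. A condensed sketch of that reader pattern, assuming a writer shaped like __buffer_unlock_commit() further down (my_* names are illustrative):

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wait);
static bool my_wakeup_needed;

/* Reader: announce interest, then sleep until data arrives (or a
 * spurious wakeup happens, which callers must tolerate). */
static void my_wait_for_data(bool (*empty)(void))
{
        DEFINE_WAIT(wait);

        prepare_to_wait(&my_wait, &wait, TASK_INTERRUPTIBLE);

        /* Set only after queueing ourselves; a writer that races with
         * empty() at worst wakes an empty queue. */
        my_wakeup_needed = true;

        if (empty())
                schedule();

        finish_wait(&my_wait, &wait);
}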
802
760/** 803/**
761 * register_tracer - register a tracer with the ftrace system. 804 * register_tracer - register a tracer with the ftrace system.
762 * @type - the plugin for the tracer 805 * @type - the plugin for the tracer
@@ -875,32 +918,6 @@ int register_tracer(struct tracer *type)
875 return ret; 918 return ret;
876} 919}
877 920
878void unregister_tracer(struct tracer *type)
879{
880 struct tracer **t;
881
882 mutex_lock(&trace_types_lock);
883 for (t = &trace_types; *t; t = &(*t)->next) {
884 if (*t == type)
885 goto found;
886 }
887 pr_info("Tracer %s not registered\n", type->name);
888 goto out;
889
890 found:
891 *t = (*t)->next;
892
893 if (type == current_trace && tracer_enabled) {
894 tracer_enabled = 0;
895 tracing_stop();
896 if (current_trace->stop)
897 current_trace->stop(&global_trace);
898 current_trace = &nop_trace;
899 }
900out:
901 mutex_unlock(&trace_types_lock);
902}
903
904void tracing_reset(struct trace_array *tr, int cpu) 921void tracing_reset(struct trace_array *tr, int cpu)
905{ 922{
906 struct ring_buffer *buffer = tr->buffer; 923 struct ring_buffer *buffer = tr->buffer;
@@ -1131,10 +1148,14 @@ void trace_find_cmdline(int pid, char comm[])
1131 1148
1132void tracing_record_cmdline(struct task_struct *tsk) 1149void tracing_record_cmdline(struct task_struct *tsk)
1133{ 1150{
1134 if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled || 1151 if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on())
1135 !tracing_is_on())
1136 return; 1152 return;
1137 1153
1154 if (!__this_cpu_read(trace_cmdline_save))
1155 return;
1156
1157 __this_cpu_write(trace_cmdline_save, false);
1158
1138 trace_save_cmdline(tsk); 1159 trace_save_cmdline(tsk);
1139} 1160}
1140 1161
@@ -1178,27 +1199,36 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer,
1178 return event; 1199 return event;
1179} 1200}
1180 1201
1202void
1203__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
1204{
1205 __this_cpu_write(trace_cmdline_save, true);
1206 if (trace_wakeup_needed) {
1207 trace_wakeup_needed = false;
 1208 /* irq_work_queue() supplies its own memory barriers */
1209 irq_work_queue(&trace_work_wakeup);
1210 }
1211 ring_buffer_unlock_commit(buffer, event);
1212}
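The writer side cannot call wake_up() directly — a trace event can fire from almost anywhere, including contexts where taking the wait-queue lock could deadlock — so the commit path only queues an irq_work and the actual wakeup runs shortly afterwards in a safe interrupt context. A sketch of that deferral, the counterpart to the reader sketch earlier (shared declarations repeated so the snippet stands alone; my_* names are illustrative):

#include <linux/init.h>
#include <linux/irq_work.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wait);
static bool my_wakeup_needed;
static struct irq_work my_wakeup_work;

static void my_wake_readers(struct irq_work *work)
{
        wake_up_all(&my_wait);
}

static void __init my_wakeup_init(void)
{
        init_irq_work(&my_wakeup_work, my_wake_readers);
}

/* Writer: after committing an event, poke any waiters without
 * sleeping or taking locks in this context. */
static void my_commit_done(void)
{
        if (my_wakeup_needed) {
                my_wakeup_needed = false;
                /* irq_work_queue() provides its own memory barriers. */
                irq_work_queue(&my_wakeup_work);
        }
}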
1213
1181static inline void 1214static inline void
1182__trace_buffer_unlock_commit(struct ring_buffer *buffer, 1215__trace_buffer_unlock_commit(struct ring_buffer *buffer,
1183 struct ring_buffer_event *event, 1216 struct ring_buffer_event *event,
1184 unsigned long flags, int pc, 1217 unsigned long flags, int pc)
1185 int wake)
1186{ 1218{
1187 ring_buffer_unlock_commit(buffer, event); 1219 __buffer_unlock_commit(buffer, event);
1188 1220
1189 ftrace_trace_stack(buffer, flags, 6, pc); 1221 ftrace_trace_stack(buffer, flags, 6, pc);
1190 ftrace_trace_userstack(buffer, flags, pc); 1222 ftrace_trace_userstack(buffer, flags, pc);
1191
1192 if (wake)
1193 trace_wake_up();
1194} 1223}
1195 1224
1196void trace_buffer_unlock_commit(struct ring_buffer *buffer, 1225void trace_buffer_unlock_commit(struct ring_buffer *buffer,
1197 struct ring_buffer_event *event, 1226 struct ring_buffer_event *event,
1198 unsigned long flags, int pc) 1227 unsigned long flags, int pc)
1199{ 1228{
1200 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); 1229 __trace_buffer_unlock_commit(buffer, event, flags, pc);
1201} 1230}
1231EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
1202 1232
1203struct ring_buffer_event * 1233struct ring_buffer_event *
1204trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, 1234trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
@@ -1215,29 +1245,21 @@ void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
1215 struct ring_buffer_event *event, 1245 struct ring_buffer_event *event,
1216 unsigned long flags, int pc) 1246 unsigned long flags, int pc)
1217{ 1247{
1218 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); 1248 __trace_buffer_unlock_commit(buffer, event, flags, pc);
1219} 1249}
1220EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); 1250EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
1221 1251
1222void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, 1252void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer,
1223 struct ring_buffer_event *event, 1253 struct ring_buffer_event *event,
1224 unsigned long flags, int pc) 1254 unsigned long flags, int pc,
1255 struct pt_regs *regs)
1225{ 1256{
1226 __trace_buffer_unlock_commit(buffer, event, flags, pc, 0); 1257 __buffer_unlock_commit(buffer, event);
1227}
1228EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
1229
1230void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer,
1231 struct ring_buffer_event *event,
1232 unsigned long flags, int pc,
1233 struct pt_regs *regs)
1234{
1235 ring_buffer_unlock_commit(buffer, event);
1236 1258
1237 ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); 1259 ftrace_trace_stack_regs(buffer, flags, 0, pc, regs);
1238 ftrace_trace_userstack(buffer, flags, pc); 1260 ftrace_trace_userstack(buffer, flags, pc);
1239} 1261}
1240EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs); 1262EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs);
1241 1263
1242void trace_current_buffer_discard_commit(struct ring_buffer *buffer, 1264void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
1243 struct ring_buffer_event *event) 1265 struct ring_buffer_event *event)
@@ -1269,7 +1291,7 @@ trace_function(struct trace_array *tr,
1269 entry->parent_ip = parent_ip; 1291 entry->parent_ip = parent_ip;
1270 1292
1271 if (!filter_check_discard(call, entry, buffer, event)) 1293 if (!filter_check_discard(call, entry, buffer, event))
1272 ring_buffer_unlock_commit(buffer, event); 1294 __buffer_unlock_commit(buffer, event);
1273} 1295}
1274 1296
1275void 1297void
@@ -1362,7 +1384,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1362 entry->size = trace.nr_entries; 1384 entry->size = trace.nr_entries;
1363 1385
1364 if (!filter_check_discard(call, entry, buffer, event)) 1386 if (!filter_check_discard(call, entry, buffer, event))
1365 ring_buffer_unlock_commit(buffer, event); 1387 __buffer_unlock_commit(buffer, event);
1366 1388
1367 out: 1389 out:
1368 /* Again, don't let gcc optimize things here */ 1390 /* Again, don't let gcc optimize things here */
@@ -1458,7 +1480,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1458 1480
1459 save_stack_trace_user(&trace); 1481 save_stack_trace_user(&trace);
1460 if (!filter_check_discard(call, entry, buffer, event)) 1482 if (!filter_check_discard(call, entry, buffer, event))
1461 ring_buffer_unlock_commit(buffer, event); 1483 __buffer_unlock_commit(buffer, event);
1462 1484
1463 out_drop_count: 1485 out_drop_count:
1464 __this_cpu_dec(user_stack_count); 1486 __this_cpu_dec(user_stack_count);
@@ -1559,10 +1581,10 @@ static int alloc_percpu_trace_buffer(void)
1559 return -ENOMEM; 1581 return -ENOMEM;
1560} 1582}
1561 1583
1584static int buffers_allocated;
1585
1562void trace_printk_init_buffers(void) 1586void trace_printk_init_buffers(void)
1563{ 1587{
1564 static int buffers_allocated;
1565
1566 if (buffers_allocated) 1588 if (buffers_allocated)
1567 return; 1589 return;
1568 1590
@@ -1571,7 +1593,38 @@ void trace_printk_init_buffers(void)
1571 1593
1572 pr_info("ftrace: Allocated trace_printk buffers\n"); 1594 pr_info("ftrace: Allocated trace_printk buffers\n");
1573 1595
1596 /* Expand the buffers to set size */
1597 tracing_update_buffers();
1598
1574 buffers_allocated = 1; 1599 buffers_allocated = 1;
1600
1601 /*
1602 * trace_printk_init_buffers() can be called by modules.
1603 * If that happens, then we need to start cmdline recording
1604 * directly here. If the global_trace.buffer is already
1605 * allocated here, then this was called by module code.
1606 */
1607 if (global_trace.buffer)
1608 tracing_start_cmdline_record();
1609}
1610
1611void trace_printk_start_comm(void)
1612{
1613 /* Start tracing comms if trace printk is set */
1614 if (!buffers_allocated)
1615 return;
1616 tracing_start_cmdline_record();
1617}
1618
1619static void trace_printk_start_stop_comm(int enabled)
1620{
1621 if (!buffers_allocated)
1622 return;
1623
1624 if (enabled)
1625 tracing_start_cmdline_record();
1626 else
1627 tracing_stop_cmdline_record();
1575} 1628}
1576 1629
1577/** 1630/**
@@ -1622,7 +1675,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1622 1675
1623 memcpy(entry->buf, tbuffer, sizeof(u32) * len); 1676 memcpy(entry->buf, tbuffer, sizeof(u32) * len);
1624 if (!filter_check_discard(call, entry, buffer, event)) { 1677 if (!filter_check_discard(call, entry, buffer, event)) {
1625 ring_buffer_unlock_commit(buffer, event); 1678 __buffer_unlock_commit(buffer, event);
1626 ftrace_trace_stack(buffer, flags, 6, pc); 1679 ftrace_trace_stack(buffer, flags, 6, pc);
1627 } 1680 }
1628 1681
@@ -1693,7 +1746,7 @@ int trace_array_vprintk(struct trace_array *tr,
1693 memcpy(&entry->buf, tbuffer, len); 1746 memcpy(&entry->buf, tbuffer, len);
1694 entry->buf[len] = '\0'; 1747 entry->buf[len] = '\0';
1695 if (!filter_check_discard(call, entry, buffer, event)) { 1748 if (!filter_check_discard(call, entry, buffer, event)) {
1696 ring_buffer_unlock_commit(buffer, event); 1749 __buffer_unlock_commit(buffer, event);
1697 ftrace_trace_stack(buffer, flags, 6, pc); 1750 ftrace_trace_stack(buffer, flags, 6, pc);
1698 } 1751 }
1699 out: 1752 out:
@@ -2426,6 +2479,10 @@ __tracing_open(struct inode *inode, struct file *file)
2426 if (ring_buffer_overruns(iter->tr->buffer)) 2479 if (ring_buffer_overruns(iter->tr->buffer))
2427 iter->iter_flags |= TRACE_FILE_ANNOTATE; 2480 iter->iter_flags |= TRACE_FILE_ANNOTATE;
2428 2481
2482 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
2483 if (trace_clocks[trace_clock_id].in_ns)
2484 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
2485
2429 /* stop the trace while dumping */ 2486 /* stop the trace while dumping */
2430 tracing_stop(); 2487 tracing_stop();
2431 2488
@@ -2794,26 +2851,19 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2794 2851
2795 if (mask == TRACE_ITER_OVERWRITE) 2852 if (mask == TRACE_ITER_OVERWRITE)
2796 ring_buffer_change_overwrite(global_trace.buffer, enabled); 2853 ring_buffer_change_overwrite(global_trace.buffer, enabled);
2854
2855 if (mask == TRACE_ITER_PRINTK)
2856 trace_printk_start_stop_comm(enabled);
2797} 2857}
2798 2858
2799static ssize_t 2859static int trace_set_options(char *option)
2800tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2801 size_t cnt, loff_t *ppos)
2802{ 2860{
2803 char buf[64];
2804 char *cmp; 2861 char *cmp;
2805 int neg = 0; 2862 int neg = 0;
2806 int ret; 2863 int ret = 0;
2807 int i; 2864 int i;
2808 2865
2809 if (cnt >= sizeof(buf)) 2866 cmp = strstrip(option);
2810 return -EINVAL;
2811
2812 if (copy_from_user(&buf, ubuf, cnt))
2813 return -EFAULT;
2814
2815 buf[cnt] = 0;
2816 cmp = strstrip(buf);
2817 2867
2818 if (strncmp(cmp, "no", 2) == 0) { 2868 if (strncmp(cmp, "no", 2) == 0) {
2819 neg = 1; 2869 neg = 1;
@@ -2832,10 +2882,25 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2832 mutex_lock(&trace_types_lock); 2882 mutex_lock(&trace_types_lock);
2833 ret = set_tracer_option(current_trace, cmp, neg); 2883 ret = set_tracer_option(current_trace, cmp, neg);
2834 mutex_unlock(&trace_types_lock); 2884 mutex_unlock(&trace_types_lock);
2835 if (ret)
2836 return ret;
2837 } 2885 }
2838 2886
2887 return ret;
2888}
2889
2890static ssize_t
2891tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2892 size_t cnt, loff_t *ppos)
2893{
2894 char buf[64];
2895
2896 if (cnt >= sizeof(buf))
2897 return -EINVAL;
2898
2899 if (copy_from_user(&buf, ubuf, cnt))
2900 return -EFAULT;
2901
2902 trace_set_options(buf);
2903
2839 *ppos += cnt; 2904 *ppos += cnt;
2840 2905
2841 return cnt; 2906 return cnt;
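Pulling the parsing out of the write() handler into trace_set_options() is what lets the same code serve the trace_options= boot parameter registered earlier. The essential shape — strip whitespace, peel off an optional "no" prefix, look the name up in a table, flip a flag — can be sketched as follows (illustrative names and a simplified flag table, not the kernel's):

#include <linux/errno.h>
#include <linux/string.h>

static const char * const my_opt_names[] = { "verbose", "overwrite", "printk", NULL };
static unsigned long my_opt_flags;

static int my_set_option(char *option)
{
        char *cmp = strstrip(option);
        int neg = 0;
        int i;

        if (strncmp(cmp, "no", 2) == 0) {
                neg = 1;
                cmp += 2;
        }

        for (i = 0; my_opt_names[i]; i++) {
                if (strcmp(cmp, my_opt_names[i]) == 0) {
                        if (neg)
                                my_opt_flags &= ~(1UL << i);
                        else
                                my_opt_flags |= 1UL << i;
                        return 0;
                }
        }
        return -EINVAL;         /* unknown option */
}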
@@ -2940,56 +3005,6 @@ static const struct file_operations tracing_saved_cmdlines_fops = {
2940}; 3005};
2941 3006
2942static ssize_t 3007static ssize_t
2943tracing_ctrl_read(struct file *filp, char __user *ubuf,
2944 size_t cnt, loff_t *ppos)
2945{
2946 char buf[64];
2947 int r;
2948
2949 r = sprintf(buf, "%u\n", tracer_enabled);
2950 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2951}
2952
2953static ssize_t
2954tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2955 size_t cnt, loff_t *ppos)
2956{
2957 struct trace_array *tr = filp->private_data;
2958 unsigned long val;
2959 int ret;
2960
2961 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
2962 if (ret)
2963 return ret;
2964
2965 val = !!val;
2966
2967 mutex_lock(&trace_types_lock);
2968 if (tracer_enabled ^ val) {
2969
2970 /* Only need to warn if this is used to change the state */
2971 WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on");
2972
2973 if (val) {
2974 tracer_enabled = 1;
2975 if (current_trace->start)
2976 current_trace->start(tr);
2977 tracing_start();
2978 } else {
2979 tracer_enabled = 0;
2980 tracing_stop();
2981 if (current_trace->stop)
2982 current_trace->stop(tr);
2983 }
2984 }
2985 mutex_unlock(&trace_types_lock);
2986
2987 *ppos += cnt;
2988
2989 return cnt;
2990}
2991
2992static ssize_t
2993tracing_set_trace_read(struct file *filp, char __user *ubuf, 3008tracing_set_trace_read(struct file *filp, char __user *ubuf,
2994 size_t cnt, loff_t *ppos) 3009 size_t cnt, loff_t *ppos)
2995{ 3010{
@@ -3030,6 +3045,10 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3030 */ 3045 */
3031 ring_buffer_expanded = 1; 3046 ring_buffer_expanded = 1;
3032 3047
3048 /* May be called before buffers are initialized */
3049 if (!global_trace.buffer)
3050 return 0;
3051
3033 ret = ring_buffer_resize(global_trace.buffer, size, cpu); 3052 ret = ring_buffer_resize(global_trace.buffer, size, cpu);
3034 if (ret < 0) 3053 if (ret < 0)
3035 return ret; 3054 return ret;
@@ -3325,6 +3344,10 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3325 if (trace_flags & TRACE_ITER_LATENCY_FMT) 3344 if (trace_flags & TRACE_ITER_LATENCY_FMT)
3326 iter->iter_flags |= TRACE_FILE_LAT_FMT; 3345 iter->iter_flags |= TRACE_FILE_LAT_FMT;
3327 3346
3347 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
3348 if (trace_clocks[trace_clock_id].in_ns)
3349 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
3350
3328 iter->cpu_file = cpu_file; 3351 iter->cpu_file = cpu_file;
3329 iter->tr = &global_trace; 3352 iter->tr = &global_trace;
3330 mutex_init(&iter->mutex); 3353 mutex_init(&iter->mutex);
@@ -3385,19 +3408,6 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
3385 } 3408 }
3386} 3409}
3387 3410
3388
3389void default_wait_pipe(struct trace_iterator *iter)
3390{
3391 DEFINE_WAIT(wait);
3392
3393 prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE);
3394
3395 if (trace_empty(iter))
3396 schedule();
3397
3398 finish_wait(&trace_wait, &wait);
3399}
3400
3401/* 3411/*
3402 * This is a make-shift waitqueue. 3412 * This is a make-shift waitqueue.
3403 * A tracer might use this callback on some rare cases: 3413 * A tracer might use this callback on some rare cases:
@@ -3438,7 +3448,7 @@ static int tracing_wait_pipe(struct file *filp)
3438 return -EINTR; 3448 return -EINTR;
3439 3449
3440 /* 3450 /*
3441 * We block until we read something and tracing is disabled. 3451 * We block until we read something and tracing is enabled.
3442 * We still block if tracing is disabled, but we have never 3452 * We still block if tracing is disabled, but we have never
3443 * read anything. This allows a user to cat this file, and 3453 * read anything. This allows a user to cat this file, and
3444 * then enable tracing. But after we have read something, 3454 * then enable tracing. But after we have read something,
@@ -3446,7 +3456,7 @@ static int tracing_wait_pipe(struct file *filp)
3446 * 3456 *
3447 * iter->pos will be 0 if we haven't read anything. 3457 * iter->pos will be 0 if we haven't read anything.
3448 */ 3458 */
3449 if (!tracer_enabled && iter->pos) 3459 if (tracing_is_enabled() && iter->pos)
3450 break; 3460 break;
3451 } 3461 }
3452 3462
@@ -3955,7 +3965,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3955 } else 3965 } else
3956 entry->buf[cnt] = '\0'; 3966 entry->buf[cnt] = '\0';
3957 3967
3958 ring_buffer_unlock_commit(buffer, event); 3968 __buffer_unlock_commit(buffer, event);
3959 3969
3960 written = cnt; 3970 written = cnt;
3961 3971
@@ -4016,6 +4026,14 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4016 if (max_tr.buffer) 4026 if (max_tr.buffer)
4017 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); 4027 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func);
4018 4028
4029 /*
4030 * New clock may not be consistent with the previous clock.
4031 * Reset the buffer so that it doesn't have incomparable timestamps.
4032 */
4033 tracing_reset_online_cpus(&global_trace);
4034 if (max_tr.buffer)
4035 tracing_reset_online_cpus(&max_tr);
4036
4019 mutex_unlock(&trace_types_lock); 4037 mutex_unlock(&trace_types_lock);
4020 4038
4021 *fpos += cnt; 4039 *fpos += cnt;
@@ -4037,13 +4055,6 @@ static const struct file_operations tracing_max_lat_fops = {
4037 .llseek = generic_file_llseek, 4055 .llseek = generic_file_llseek,
4038}; 4056};
4039 4057
4040static const struct file_operations tracing_ctrl_fops = {
4041 .open = tracing_open_generic,
4042 .read = tracing_ctrl_read,
4043 .write = tracing_ctrl_write,
4044 .llseek = generic_file_llseek,
4045};
4046
4047static const struct file_operations set_tracer_fops = { 4058static const struct file_operations set_tracer_fops = {
4048 .open = tracing_open_generic, 4059 .open = tracing_open_generic,
4049 .read = tracing_set_trace_read, 4060 .read = tracing_set_trace_read,
@@ -4377,13 +4388,27 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4377 cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); 4388 cnt = ring_buffer_bytes_cpu(tr->buffer, cpu);
4378 trace_seq_printf(s, "bytes: %ld\n", cnt); 4389 trace_seq_printf(s, "bytes: %ld\n", cnt);
4379 4390
4380 t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); 4391 if (trace_clocks[trace_clock_id].in_ns) {
4381 usec_rem = do_div(t, USEC_PER_SEC); 4392 /* local or global for trace_clock */
4382 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); 4393 t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu));
4394 usec_rem = do_div(t, USEC_PER_SEC);
4395 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n",
4396 t, usec_rem);
4397
4398 t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu));
4399 usec_rem = do_div(t, USEC_PER_SEC);
4400 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
4401 } else {
4402 /* counter or tsc mode for trace_clock */
4403 trace_seq_printf(s, "oldest event ts: %llu\n",
4404 ring_buffer_oldest_event_ts(tr->buffer, cpu));
4405
4406 trace_seq_printf(s, "now ts: %llu\n",
4407 ring_buffer_time_stamp(tr->buffer, cpu));
4408 }
4383 4409
4384 t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); 4410 cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu);
4385 usec_rem = do_div(t, USEC_PER_SEC); 4411 trace_seq_printf(s, "dropped events: %ld\n", cnt);
4386 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
4387 4412
4388 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 4413 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
4389 4414
@@ -4815,9 +4840,6 @@ static __init int tracer_init_debugfs(void)
4815 4840
4816 d_tracer = tracing_init_dentry(); 4841 d_tracer = tracing_init_dentry();
4817 4842
4818 trace_create_file("tracing_enabled", 0644, d_tracer,
4819 &global_trace, &tracing_ctrl_fops);
4820
4821 trace_create_file("trace_options", 0644, d_tracer, 4843 trace_create_file("trace_options", 0644, d_tracer,
4822 NULL, &tracing_iter_fops); 4844 NULL, &tracing_iter_fops);
4823 4845
@@ -5089,6 +5111,7 @@ __init static int tracer_alloc_buffers(void)
5089 5111
5090 /* Only allocate trace_printk buffers if a trace_printk exists */ 5112 /* Only allocate trace_printk buffers if a trace_printk exists */
5091 if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) 5113 if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt)
5114 /* Must be called before global_trace.buffer is allocated */
5092 trace_printk_init_buffers(); 5115 trace_printk_init_buffers();
5093 5116
5094 /* To save memory, keep the ring buffer size to its minimum */ 5117 /* To save memory, keep the ring buffer size to its minimum */
@@ -5136,6 +5159,7 @@ __init static int tracer_alloc_buffers(void)
5136#endif 5159#endif
5137 5160
5138 trace_init_cmdlines(); 5161 trace_init_cmdlines();
5162 init_irq_work(&trace_work_wakeup, trace_wake_up);
5139 5163
5140 register_tracer(&nop_trace); 5164 register_tracer(&nop_trace);
5141 current_trace = &nop_trace; 5165 current_trace = &nop_trace;
@@ -5147,6 +5171,13 @@ __init static int tracer_alloc_buffers(void)
5147 5171
5148 register_die_notifier(&trace_die_notifier); 5172 register_die_notifier(&trace_die_notifier);
5149 5173
5174 while (trace_boot_options) {
5175 char *option;
5176
5177 option = strsep(&trace_boot_options, ",");
5178 trace_set_options(option);
5179 }
5180
5150 return 0; 5181 return 0;
5151 5182
5152out_free_cpumask: 5183out_free_cpumask:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c15f528c1af4..c75d7988902c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -285,8 +285,8 @@ struct tracer {
285 int (*set_flag)(u32 old_flags, u32 bit, int set); 285 int (*set_flag)(u32 old_flags, u32 bit, int set);
286 struct tracer *next; 286 struct tracer *next;
287 struct tracer_flags *flags; 287 struct tracer_flags *flags;
288 int print_max; 288 bool print_max;
289 int use_max_tr; 289 bool use_max_tr;
290}; 290};
291 291
292 292
@@ -327,7 +327,6 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)
327 327
328int tracer_init(struct tracer *t, struct trace_array *tr); 328int tracer_init(struct tracer *t, struct trace_array *tr);
329int tracing_is_enabled(void); 329int tracing_is_enabled(void);
330void trace_wake_up(void);
331void tracing_reset(struct trace_array *tr, int cpu); 330void tracing_reset(struct trace_array *tr, int cpu);
332void tracing_reset_online_cpus(struct trace_array *tr); 331void tracing_reset_online_cpus(struct trace_array *tr);
333void tracing_reset_current(int cpu); 332void tracing_reset_current(int cpu);
@@ -349,9 +348,6 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer,
349 unsigned long len, 348 unsigned long len,
350 unsigned long flags, 349 unsigned long flags,
351 int pc); 350 int pc);
352void trace_buffer_unlock_commit(struct ring_buffer *buffer,
353 struct ring_buffer_event *event,
354 unsigned long flags, int pc);
355 351
356struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, 352struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
357 struct trace_array_cpu *data); 353 struct trace_array_cpu *data);
@@ -359,6 +355,9 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
359struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 355struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
360 int *ent_cpu, u64 *ent_ts); 356 int *ent_cpu, u64 *ent_ts);
361 357
358void __buffer_unlock_commit(struct ring_buffer *buffer,
359 struct ring_buffer_event *event);
360
362int trace_empty(struct trace_iterator *iter); 361int trace_empty(struct trace_iterator *iter);
363 362
364void *trace_find_next_entry_inc(struct trace_iterator *iter); 363void *trace_find_next_entry_inc(struct trace_iterator *iter);
@@ -367,7 +366,6 @@ void trace_init_global_iter(struct trace_iterator *iter);
367 366
368void tracing_iter_reset(struct trace_iterator *iter, int cpu); 367void tracing_iter_reset(struct trace_iterator *iter, int cpu);
369 368
370void default_wait_pipe(struct trace_iterator *iter);
371void poll_wait_pipe(struct trace_iterator *iter); 369void poll_wait_pipe(struct trace_iterator *iter);
372 370
373void ftrace(struct trace_array *tr, 371void ftrace(struct trace_array *tr,
@@ -407,12 +405,7 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr);
407void tracing_stop_sched_switch_record(void); 405void tracing_stop_sched_switch_record(void);
408void tracing_start_sched_switch_record(void); 406void tracing_start_sched_switch_record(void);
409int register_tracer(struct tracer *type); 407int register_tracer(struct tracer *type);
410void unregister_tracer(struct tracer *type);
411int is_tracing_stopped(void); 408int is_tracing_stopped(void);
412enum trace_file_type {
413 TRACE_FILE_LAT_FMT = 1,
414 TRACE_FILE_ANNOTATE = 2,
415};
416 409
417extern cpumask_var_t __read_mostly tracing_buffer_mask; 410extern cpumask_var_t __read_mostly tracing_buffer_mask;
418 411
@@ -841,6 +834,7 @@ extern const char *__start___trace_bprintk_fmt[];
841extern const char *__stop___trace_bprintk_fmt[]; 834extern const char *__stop___trace_bprintk_fmt[];
842 835
843void trace_printk_init_buffers(void); 836void trace_printk_init_buffers(void);
837void trace_printk_start_comm(void);
844 838
845#undef FTRACE_ENTRY 839#undef FTRACE_ENTRY
846#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ 840#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 8d3538b4ea5f..95e96842ed29 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -77,7 +77,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
77 entry->correct = val == expect; 77 entry->correct = val == expect;
78 78
79 if (!filter_check_discard(call, entry, buffer, event)) 79 if (!filter_check_discard(call, entry, buffer, event))
80 ring_buffer_unlock_commit(buffer, event); 80 __buffer_unlock_commit(buffer, event);
81 81
82 out: 82 out:
83 atomic_dec(&tr->data[cpu]->disabled); 83 atomic_dec(&tr->data[cpu]->disabled);
@@ -199,7 +199,7 @@ __init static int init_branch_tracer(void)
199 } 199 }
200 return register_tracer(&branch_trace); 200 return register_tracer(&branch_trace);
201} 201}
202device_initcall(init_branch_tracer); 202core_initcall(init_branch_tracer);
203 203
204#else 204#else
205static inline 205static inline
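The device_initcall() to core_initcall() switches here and in the other built-in tracers are purely an ordering change: core_initcall is level 1 of the initcall sequence while device_initcall is level 6, so the tracers now register much earlier during boot. A minimal illustration of the two levels (hypothetical functions, not from this patch):

#include <linux/init.h>
#include <linux/printk.h>

static int __init my_early_init(void)
{
        pr_info("core_initcall: runs at initcall level 1\n");
        return 0;
}
core_initcall(my_early_init);

static int __init my_late_init(void)
{
        pr_info("device_initcall: runs at initcall level 6\n");
        return 0;
}
device_initcall(my_late_init);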
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index d608d09d08c0..880073d0b946 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -491,19 +491,6 @@ static void t_stop(struct seq_file *m, void *p)
491 mutex_unlock(&event_mutex); 491 mutex_unlock(&event_mutex);
492} 492}
493 493
494static int
495ftrace_event_seq_open(struct inode *inode, struct file *file)
496{
497 const struct seq_operations *seq_ops;
498
499 if ((file->f_mode & FMODE_WRITE) &&
500 (file->f_flags & O_TRUNC))
501 ftrace_clear_events();
502
503 seq_ops = inode->i_private;
504 return seq_open(file, seq_ops);
505}
506
507static ssize_t 494static ssize_t
508event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, 495event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
509 loff_t *ppos) 496 loff_t *ppos)
@@ -980,6 +967,9 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
980 return r; 967 return r;
981} 968}
982 969
970static int ftrace_event_avail_open(struct inode *inode, struct file *file);
971static int ftrace_event_set_open(struct inode *inode, struct file *file);
972
983static const struct seq_operations show_event_seq_ops = { 973static const struct seq_operations show_event_seq_ops = {
984 .start = t_start, 974 .start = t_start,
985 .next = t_next, 975 .next = t_next,
@@ -995,14 +985,14 @@ static const struct seq_operations show_set_event_seq_ops = {
995}; 985};
996 986
997static const struct file_operations ftrace_avail_fops = { 987static const struct file_operations ftrace_avail_fops = {
998 .open = ftrace_event_seq_open, 988 .open = ftrace_event_avail_open,
999 .read = seq_read, 989 .read = seq_read,
1000 .llseek = seq_lseek, 990 .llseek = seq_lseek,
1001 .release = seq_release, 991 .release = seq_release,
1002}; 992};
1003 993
1004static const struct file_operations ftrace_set_event_fops = { 994static const struct file_operations ftrace_set_event_fops = {
1005 .open = ftrace_event_seq_open, 995 .open = ftrace_event_set_open,
1006 .read = seq_read, 996 .read = seq_read,
1007 .write = ftrace_event_write, 997 .write = ftrace_event_write,
1008 .llseek = seq_lseek, 998 .llseek = seq_lseek,
@@ -1078,6 +1068,26 @@ static struct dentry *event_trace_events_dir(void)
1078 return d_events; 1068 return d_events;
1079} 1069}
1080 1070
1071static int
1072ftrace_event_avail_open(struct inode *inode, struct file *file)
1073{
1074 const struct seq_operations *seq_ops = &show_event_seq_ops;
1075
1076 return seq_open(file, seq_ops);
1077}
1078
1079static int
1080ftrace_event_set_open(struct inode *inode, struct file *file)
1081{
1082 const struct seq_operations *seq_ops = &show_set_event_seq_ops;
1083
1084 if ((file->f_mode & FMODE_WRITE) &&
1085 (file->f_flags & O_TRUNC))
1086 ftrace_clear_events();
1087
1088 return seq_open(file, seq_ops);
1089}
1090
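Splitting the single open routine into ftrace_event_avail_open() and ftrace_event_set_open() removes the need to smuggle the seq_operations pointer through inode->i_private, and confines the destructive O_TRUNC behaviour (a shell redirect such as "echo > set_event" opens with O_TRUNC, which is treated as "clear all events") to the writable file. A rough sketch of the same split for a generic seq_file pair; everything prefixed my_ is illustrative:

#include <linux/fs.h>
#include <linux/seq_file.h>

static void my_clear_all(void)
{
        /* subsystem-specific "reset everything" goes here */
}

static void *my_seq_start(struct seq_file *m, loff_t *pos)
{
        return *pos == 0 ? SEQ_START_TOKEN : NULL;
}

static void *my_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
        (*pos)++;
        return NULL;
}

static void my_seq_stop(struct seq_file *m, void *v)
{
}

static int my_seq_show(struct seq_file *m, void *v)
{
        seq_puts(m, "example entry\n");
        return 0;
}

static const struct seq_operations my_seq_ops = {
        .start = my_seq_start,
        .next  = my_seq_next,
        .stop  = my_seq_stop,
        .show  = my_seq_show,
};

/* Read-only file: just attach the iterator. */
static int my_avail_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &my_seq_ops);
}

/* Writable file: an O_TRUNC open is treated as a reset request. */
static int my_set_open(struct inode *inode, struct file *file)
{
        if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC))
                my_clear_all();

        return seq_open(file, &my_seq_ops);
}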
1081static struct dentry * 1091static struct dentry *
1082event_subsystem_dir(const char *name, struct dentry *d_events) 1092event_subsystem_dir(const char *name, struct dentry *d_events)
1083{ 1093{
@@ -1489,6 +1499,9 @@ static __init int event_trace_enable(void)
1489 if (ret) 1499 if (ret)
1490 pr_warn("Failed to enable trace event: %s\n", token); 1500 pr_warn("Failed to enable trace event: %s\n", token);
1491 } 1501 }
1502
1503 trace_printk_start_comm();
1504
1492 return 0; 1505 return 0;
1493} 1506}
1494 1507
@@ -1505,15 +1518,13 @@ static __init int event_trace_init(void)
1505 return 0; 1518 return 0;
1506 1519
1507 entry = debugfs_create_file("available_events", 0444, d_tracer, 1520 entry = debugfs_create_file("available_events", 0444, d_tracer,
1508 (void *)&show_event_seq_ops, 1521 NULL, &ftrace_avail_fops);
1509 &ftrace_avail_fops);
1510 if (!entry) 1522 if (!entry)
1511 pr_warning("Could not create debugfs " 1523 pr_warning("Could not create debugfs "
1512 "'available_events' entry\n"); 1524 "'available_events' entry\n");
1513 1525
1514 entry = debugfs_create_file("set_event", 0644, d_tracer, 1526 entry = debugfs_create_file("set_event", 0644, d_tracer,
1515 (void *)&show_set_event_seq_ops, 1527 NULL, &ftrace_set_event_fops);
1516 &ftrace_set_event_fops);
1517 if (!entry) 1528 if (!entry)
1518 pr_warning("Could not create debugfs " 1529 pr_warning("Could not create debugfs "
1519 "'set_event' entry\n"); 1530 "'set_event' entry\n");
@@ -1749,7 +1760,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
1749 entry->ip = ip; 1760 entry->ip = ip;
1750 entry->parent_ip = parent_ip; 1761 entry->parent_ip = parent_ip;
1751 1762
1752 trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); 1763 trace_buffer_unlock_commit(buffer, event, flags, pc);
1753 1764
1754 out: 1765 out:
1755 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); 1766 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index c154797a7ff7..e5b0ca8b8d4d 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1000,9 +1000,9 @@ static int init_pred(struct filter_parse_state *ps,
1000 } 1000 }
1001 } else { 1001 } else {
1002 if (field->is_signed) 1002 if (field->is_signed)
1003 ret = strict_strtoll(pred->regex.pattern, 0, &val); 1003 ret = kstrtoll(pred->regex.pattern, 0, &val);
1004 else 1004 else
1005 ret = strict_strtoull(pred->regex.pattern, 0, &val); 1005 ret = kstrtoull(pred->regex.pattern, 0, &val);
1006 if (ret) { 1006 if (ret) {
1007 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); 1007 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
1008 return -EINVAL; 1008 return -EINVAL;
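The strict_strto*() helpers were on their way out; the kstrto*() family used here has the same shape but stricter semantics — the whole string must parse as a number (a trailing newline is tolerated), and 0, -EINVAL or -ERANGE is returned rather than a partial result. A small, signedness-aware usage sketch mirroring the hunk above (my_parse_value is illustrative):

#include <linux/kernel.h>
#include <linux/types.h>

static int my_parse_value(const char *str, bool is_signed,
                          long long *sval, unsigned long long *uval)
{
        int ret;

        if (is_signed)
                ret = kstrtoll(str, 0, sval);   /* base 0 auto-detects 0x/0 prefixes */
        else
                ret = kstrtoull(str, 0, uval);

        return ret;                             /* 0 on success, -EINVAL or -ERANGE otherwise */
}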
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 507a7a9630bf..8e3ad8082ab7 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -7,7 +7,7 @@
7 * Based on code from the latency_tracer, that is: 7 * Based on code from the latency_tracer, that is:
8 * 8 *
9 * Copyright (C) 2004-2006 Ingo Molnar 9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III 10 * Copyright (C) 2004 Nadia Yvette Chambers
11 */ 11 */
12#include <linux/ring_buffer.h> 12#include <linux/ring_buffer.h>
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
@@ -366,7 +366,7 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
366 * We use the callback data field (which is a pointer) 366 * We use the callback data field (which is a pointer)
367 * as our counter. 367 * as our counter.
368 */ 368 */
369 ret = strict_strtoul(number, 0, (unsigned long *)&count); 369 ret = kstrtoul(number, 0, (unsigned long *)&count);
370 if (ret) 370 if (ret)
371 return ret; 371 return ret;
372 372
@@ -411,5 +411,4 @@ static __init int init_function_trace(void)
411 init_func_cmd_traceon(); 411 init_func_cmd_traceon();
412 return register_tracer(&function_trace); 412 return register_tracer(&function_trace);
413} 413}
414device_initcall(init_function_trace); 414core_initcall(init_function_trace);
415
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 99b4378393d5..4edb4b74eb7e 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -223,7 +223,7 @@ int __trace_graph_entry(struct trace_array *tr,
223 entry = ring_buffer_event_data(event); 223 entry = ring_buffer_event_data(event);
224 entry->graph_ent = *trace; 224 entry->graph_ent = *trace;
225 if (!filter_current_check_discard(buffer, call, entry, event)) 225 if (!filter_current_check_discard(buffer, call, entry, event))
226 ring_buffer_unlock_commit(buffer, event); 226 __buffer_unlock_commit(buffer, event);
227 227
228 return 1; 228 return 1;
229} 229}
@@ -327,7 +327,7 @@ void __trace_graph_return(struct trace_array *tr,
327 entry = ring_buffer_event_data(event); 327 entry = ring_buffer_event_data(event);
328 entry->ret = *trace; 328 entry->ret = *trace;
329 if (!filter_current_check_discard(buffer, call, entry, event)) 329 if (!filter_current_check_discard(buffer, call, entry, event))
330 ring_buffer_unlock_commit(buffer, event); 330 __buffer_unlock_commit(buffer, event);
331} 331}
332 332
333void trace_graph_return(struct ftrace_graph_ret *trace) 333void trace_graph_return(struct ftrace_graph_ret *trace)
@@ -1474,4 +1474,4 @@ static __init int init_graph_trace(void)
1474 return register_tracer(&graph_trace); 1474 return register_tracer(&graph_trace);
1475} 1475}
1476 1476
1477device_initcall(init_graph_trace); 1477core_initcall(init_graph_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index d98ee8283b29..713a2cac4881 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -7,7 +7,7 @@
7 * From code in the latency_tracer, that is: 7 * From code in the latency_tracer, that is:
8 * 8 *
9 * Copyright (C) 2004-2006 Ingo Molnar 9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III 10 * Copyright (C) 2004 Nadia Yvette Chambers
11 */ 11 */
12#include <linux/kallsyms.h> 12#include <linux/kallsyms.h>
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
@@ -604,7 +604,7 @@ static struct tracer irqsoff_tracer __read_mostly =
604 .reset = irqsoff_tracer_reset, 604 .reset = irqsoff_tracer_reset,
605 .start = irqsoff_tracer_start, 605 .start = irqsoff_tracer_start,
606 .stop = irqsoff_tracer_stop, 606 .stop = irqsoff_tracer_stop,
607 .print_max = 1, 607 .print_max = true,
608 .print_header = irqsoff_print_header, 608 .print_header = irqsoff_print_header,
609 .print_line = irqsoff_print_line, 609 .print_line = irqsoff_print_line,
610 .flags = &tracer_flags, 610 .flags = &tracer_flags,
@@ -614,7 +614,7 @@ static struct tracer irqsoff_tracer __read_mostly =
614#endif 614#endif
615 .open = irqsoff_trace_open, 615 .open = irqsoff_trace_open,
616 .close = irqsoff_trace_close, 616 .close = irqsoff_trace_close,
617 .use_max_tr = 1, 617 .use_max_tr = true,
618}; 618};
619# define register_irqsoff(trace) register_tracer(&trace) 619# define register_irqsoff(trace) register_tracer(&trace)
620#else 620#else
@@ -637,7 +637,7 @@ static struct tracer preemptoff_tracer __read_mostly =
637 .reset = irqsoff_tracer_reset, 637 .reset = irqsoff_tracer_reset,
638 .start = irqsoff_tracer_start, 638 .start = irqsoff_tracer_start,
639 .stop = irqsoff_tracer_stop, 639 .stop = irqsoff_tracer_stop,
640 .print_max = 1, 640 .print_max = true,
641 .print_header = irqsoff_print_header, 641 .print_header = irqsoff_print_header,
642 .print_line = irqsoff_print_line, 642 .print_line = irqsoff_print_line,
643 .flags = &tracer_flags, 643 .flags = &tracer_flags,
@@ -647,7 +647,7 @@ static struct tracer preemptoff_tracer __read_mostly =
647#endif 647#endif
648 .open = irqsoff_trace_open, 648 .open = irqsoff_trace_open,
649 .close = irqsoff_trace_close, 649 .close = irqsoff_trace_close,
650 .use_max_tr = 1, 650 .use_max_tr = true,
651}; 651};
652# define register_preemptoff(trace) register_tracer(&trace) 652# define register_preemptoff(trace) register_tracer(&trace)
653#else 653#else
@@ -672,7 +672,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
672 .reset = irqsoff_tracer_reset, 672 .reset = irqsoff_tracer_reset,
673 .start = irqsoff_tracer_start, 673 .start = irqsoff_tracer_start,
674 .stop = irqsoff_tracer_stop, 674 .stop = irqsoff_tracer_stop,
675 .print_max = 1, 675 .print_max = true,
676 .print_header = irqsoff_print_header, 676 .print_header = irqsoff_print_header,
677 .print_line = irqsoff_print_line, 677 .print_line = irqsoff_print_line,
678 .flags = &tracer_flags, 678 .flags = &tracer_flags,
@@ -682,7 +682,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
682#endif 682#endif
683 .open = irqsoff_trace_open, 683 .open = irqsoff_trace_open,
684 .close = irqsoff_trace_close, 684 .close = irqsoff_trace_close,
685 .use_max_tr = 1, 685 .use_max_tr = true,
686}; 686};
687 687
688# define register_preemptirqsoff(trace) register_tracer(&trace) 688# define register_preemptirqsoff(trace) register_tracer(&trace)
@@ -698,4 +698,4 @@ __init static int init_irqsoff_tracer(void)
698 698
699 return 0; 699 return 0;
700} 700}
701device_initcall(init_irqsoff_tracer); 701core_initcall(init_irqsoff_tracer);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1a2117043bb1..1865d5f76538 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -444,7 +444,7 @@ static int create_trace_probe(int argc, char **argv)
444 return -EINVAL; 444 return -EINVAL;
445 } 445 }
446 /* an address specified */ 446 /* an address specified */
447 ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr); 447 ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr);
448 if (ret) { 448 if (ret) {
449 pr_info("Failed to parse address.\n"); 449 pr_info("Failed to parse address.\n");
450 return ret; 450 return ret;
@@ -751,8 +751,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
751 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 751 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
752 752
753 if (!filter_current_check_discard(buffer, call, entry, event)) 753 if (!filter_current_check_discard(buffer, call, entry, event))
754 trace_nowake_buffer_unlock_commit_regs(buffer, event, 754 trace_buffer_unlock_commit_regs(buffer, event,
755 irq_flags, pc, regs); 755 irq_flags, pc, regs);
756} 756}
757 757
758/* Kretprobe handler */ 758/* Kretprobe handler */
@@ -784,8 +784,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
784 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 784 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
785 785
786 if (!filter_current_check_discard(buffer, call, entry, event)) 786 if (!filter_current_check_discard(buffer, call, entry, event))
787 trace_nowake_buffer_unlock_commit_regs(buffer, event, 787 trace_buffer_unlock_commit_regs(buffer, event,
788 irq_flags, pc, regs); 788 irq_flags, pc, regs);
789} 789}
790 790
791/* Event entry printers */ 791/* Event entry printers */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 123b189c732c..194d79602dc7 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -610,24 +610,54 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
610 return trace_print_lat_fmt(s, entry); 610 return trace_print_lat_fmt(s, entry);
611} 611}
612 612
613static unsigned long preempt_mark_thresh = 100; 613static unsigned long preempt_mark_thresh_us = 100;
614 614
615static int 615static int
616lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, 616lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
617 unsigned long rel_usecs)
618{ 617{
619 return trace_seq_printf(s, " %4lldus%c: ", abs_usecs, 618 unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE;
620 rel_usecs > preempt_mark_thresh ? '!' : 619 unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS;
621 rel_usecs > 1 ? '+' : ' '); 620 unsigned long long abs_ts = iter->ts - iter->tr->time_start;
621 unsigned long long rel_ts = next_ts - iter->ts;
622 struct trace_seq *s = &iter->seq;
623
624 if (in_ns) {
625 abs_ts = ns2usecs(abs_ts);
626 rel_ts = ns2usecs(rel_ts);
627 }
628
629 if (verbose && in_ns) {
630 unsigned long abs_usec = do_div(abs_ts, USEC_PER_MSEC);
631 unsigned long abs_msec = (unsigned long)abs_ts;
632 unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC);
633 unsigned long rel_msec = (unsigned long)rel_ts;
634
635 return trace_seq_printf(
636 s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ",
637 ns2usecs(iter->ts),
638 abs_msec, abs_usec,
639 rel_msec, rel_usec);
640 } else if (verbose && !in_ns) {
641 return trace_seq_printf(
642 s, "[%016llx] %lld (+%lld): ",
643 iter->ts, abs_ts, rel_ts);
644 } else if (!verbose && in_ns) {
645 return trace_seq_printf(
646 s, " %4lldus%c: ",
647 abs_ts,
648 rel_ts > preempt_mark_thresh_us ? '!' :
649 rel_ts > 1 ? '+' : ' ');
650 } else { /* !verbose && !in_ns */
651 return trace_seq_printf(s, " %4lld: ", abs_ts);
652 }
622} 653}
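The verbose path above leans on do_div() doing double duty: it divides a u64 in place and returns the remainder, which is exactly what splitting a microsecond count into "msec.usec" needs (a plain '/' on a u64 is not portable to 32-bit kernels). A compact sketch of the idiom with an illustrative helper name:

#include <linux/kernel.h>
#include <linux/math64.h>
#include <linux/time.h>
#include <linux/types.h>

/* Render a microsecond count as "msec.usec", e.g. 12345 -> "12.345ms". */
static int my_fmt_usecs(char *buf, size_t len, u64 usecs)
{
        u32 rem = do_div(usecs, USEC_PER_MSEC); /* usecs now holds milliseconds */

        return scnprintf(buf, len, "%llu.%03ums",
                         (unsigned long long)usecs, rem);
}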
623 654
624int trace_print_context(struct trace_iterator *iter) 655int trace_print_context(struct trace_iterator *iter)
625{ 656{
626 struct trace_seq *s = &iter->seq; 657 struct trace_seq *s = &iter->seq;
627 struct trace_entry *entry = iter->ent; 658 struct trace_entry *entry = iter->ent;
628 unsigned long long t = ns2usecs(iter->ts); 659 unsigned long long t;
629 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 660 unsigned long secs, usec_rem;
630 unsigned long secs = (unsigned long)t;
631 char comm[TASK_COMM_LEN]; 661 char comm[TASK_COMM_LEN];
632 int ret; 662 int ret;
633 663
@@ -644,8 +674,13 @@ int trace_print_context(struct trace_iterator *iter)
644 return 0; 674 return 0;
645 } 675 }
646 676
647 return trace_seq_printf(s, " %5lu.%06lu: ", 677 if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
648 secs, usec_rem); 678 t = ns2usecs(iter->ts);
679 usec_rem = do_div(t, USEC_PER_SEC);
680 secs = (unsigned long)t;
681 return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
682 } else
683 return trace_seq_printf(s, " %12llu: ", iter->ts);
649} 684}
650 685
651int trace_print_lat_context(struct trace_iterator *iter) 686int trace_print_lat_context(struct trace_iterator *iter)
@@ -659,36 +694,29 @@ int trace_print_lat_context(struct trace_iterator *iter)
659 *next_entry = trace_find_next_entry(iter, NULL, 694 *next_entry = trace_find_next_entry(iter, NULL,
660 &next_ts); 695 &next_ts);
661 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); 696 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
662 unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
663 unsigned long rel_usecs;
664 697
665 /* Restore the original ent_size */ 698 /* Restore the original ent_size */
666 iter->ent_size = ent_size; 699 iter->ent_size = ent_size;
667 700
668 if (!next_entry) 701 if (!next_entry)
669 next_ts = iter->ts; 702 next_ts = iter->ts;
670 rel_usecs = ns2usecs(next_ts - iter->ts);
671 703
672 if (verbose) { 704 if (verbose) {
673 char comm[TASK_COMM_LEN]; 705 char comm[TASK_COMM_LEN];
674 706
675 trace_find_cmdline(entry->pid, comm); 707 trace_find_cmdline(entry->pid, comm);
676 708
677 ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]" 709 ret = trace_seq_printf(
678 " %ld.%03ldms (+%ld.%03ldms): ", comm, 710 s, "%16s %5d %3d %d %08x %08lx ",
679 entry->pid, iter->cpu, entry->flags, 711 comm, entry->pid, iter->cpu, entry->flags,
680 entry->preempt_count, iter->idx, 712 entry->preempt_count, iter->idx);
681 ns2usecs(iter->ts),
682 abs_usecs / USEC_PER_MSEC,
683 abs_usecs % USEC_PER_MSEC,
684 rel_usecs / USEC_PER_MSEC,
685 rel_usecs % USEC_PER_MSEC);
686 } else { 713 } else {
687 ret = lat_print_generic(s, entry, iter->cpu); 714 ret = lat_print_generic(s, entry, iter->cpu);
688 if (ret)
689 ret = lat_print_timestamp(s, abs_usecs, rel_usecs);
690 } 715 }
691 716
717 if (ret)
718 ret = lat_print_timestamp(iter, next_ts);
719
692 return ret; 720 return ret;
693} 721}
694 722
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index daa9980153af..412e959709b4 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -441,7 +441,7 @@ static const struct fetch_type *find_fetch_type(const char *type)
441 goto fail; 441 goto fail;
442 442
443 type++; 443 type++;
444 if (strict_strtoul(type, 0, &bs)) 444 if (kstrtoul(type, 0, &bs))
445 goto fail; 445 goto fail;
446 446
447 switch (bs) { 447 switch (bs) {
@@ -501,8 +501,8 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset)
501 501
502 tmp = strchr(symbol, '+'); 502 tmp = strchr(symbol, '+');
503 if (tmp) { 503 if (tmp) {
504 /* skip sign because strict_strtol doesn't accept '+' */ 504 /* skip sign because kstrtoul doesn't accept '+' */
505 ret = strict_strtoul(tmp + 1, 0, offset); 505 ret = kstrtoul(tmp + 1, 0, offset);
506 if (ret) 506 if (ret)
507 return ret; 507 return ret;
508 508
@@ -533,7 +533,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
533 else 533 else
534 ret = -EINVAL; 534 ret = -EINVAL;
535 } else if (isdigit(arg[5])) { 535 } else if (isdigit(arg[5])) {
536 ret = strict_strtoul(arg + 5, 10, &param); 536 ret = kstrtoul(arg + 5, 10, &param);
537 if (ret || param > PARAM_MAX_STACK) 537 if (ret || param > PARAM_MAX_STACK)
538 ret = -EINVAL; 538 ret = -EINVAL;
539 else { 539 else {
@@ -579,7 +579,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
579 579
580 case '@': /* memory or symbol */ 580 case '@': /* memory or symbol */
581 if (isdigit(arg[1])) { 581 if (isdigit(arg[1])) {
582 ret = strict_strtoul(arg + 1, 0, &param); 582 ret = kstrtoul(arg + 1, 0, &param);
583 if (ret) 583 if (ret)
584 break; 584 break;
585 585
@@ -597,14 +597,14 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
597 break; 597 break;
598 598
599 case '+': /* deref memory */ 599 case '+': /* deref memory */
600 arg++; /* Skip '+', because strict_strtol() rejects it. */ 600 arg++; /* Skip '+', because kstrtol() rejects it. */
601 case '-': 601 case '-':
602 tmp = strchr(arg, '('); 602 tmp = strchr(arg, '(');
603 if (!tmp) 603 if (!tmp)
604 break; 604 break;
605 605
606 *tmp = '\0'; 606 *tmp = '\0';
607 ret = strict_strtol(arg, 0, &offset); 607 ret = kstrtol(arg, 0, &offset);
608 608
609 if (ret) 609 if (ret)
610 break; 610 break;
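The trace_probe.c changes are a mechanical switch from the older strict_strtoul()/strict_strtol() helpers to kstrtoul()/kstrtol(), which keep the same contract: 0 on success, a negative error code (such as -EINVAL or -ERANGE) on failure, never a partial parse. A hedged sketch of the calling pattern; parse_offset() is a hypothetical helper, not a function from the patch:

    #include <linux/kernel.h>   /* kstrtoul() */

    /* Mirrors traceprobe_split_symbol_offset(): the leading '+' is skipped
     * by hand because, as the updated comment in the patch notes, kstrtoul()
     * in this kernel rejects it. */
    static int parse_offset(const char *str, unsigned long *offset)
    {
            if (*str == '+')
                    str++;

            /* base 0 auto-detects 0x/0 prefixes, like the original code */
            return kstrtoul(str, 0, offset);
    }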
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 7e62c0a18456..3374c792ccd8 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -102,9 +102,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
102 entry->next_cpu = task_cpu(wakee); 102 entry->next_cpu = task_cpu(wakee);
103 103
104 if (!filter_check_discard(call, entry, buffer, event)) 104 if (!filter_check_discard(call, entry, buffer, event))
105 ring_buffer_unlock_commit(buffer, event); 105 trace_buffer_unlock_commit(buffer, event, flags, pc);
106 ftrace_trace_stack(tr->buffer, flags, 6, pc);
107 ftrace_trace_userstack(tr->buffer, flags, pc);
108} 106}
109 107
110static void 108static void
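The sched_switch hunk replaces the open-coded commit-plus-stacktrace sequence with trace_buffer_unlock_commit(), so the wakeup event is committed with the same flags/preempt-count handling as other trace events. Roughly, the helper bundles what the removed lines did; this is a sketch of the intent, not the exact body of the helper in kernel/trace/trace.c:

    /* sketch only: the real helper lives in kernel/trace/trace.c */
    static void buffer_unlock_commit_sketch(struct ring_buffer *buffer,
                                            struct ring_buffer_event *event,
                                            unsigned long flags, int pc)
    {
            ring_buffer_unlock_commit(buffer, event);
            ftrace_trace_stack(buffer, flags, 6, pc);   /* kernel stack, skip 6 as before */
            ftrace_trace_userstack(buffer, flags, pc);  /* user stack */
    }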
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 02170c00c413..9fe45fcefca0 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -7,7 +7,7 @@
7 * Based on code from the latency_tracer, that is: 7 * Based on code from the latency_tracer, that is:
8 * 8 *
9 * Copyright (C) 2004-2006 Ingo Molnar 9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III 10 * Copyright (C) 2004 Nadia Yvette Chambers
11 */ 11 */
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
@@ -589,7 +589,7 @@ static struct tracer wakeup_tracer __read_mostly =
589 .reset = wakeup_tracer_reset, 589 .reset = wakeup_tracer_reset,
590 .start = wakeup_tracer_start, 590 .start = wakeup_tracer_start,
591 .stop = wakeup_tracer_stop, 591 .stop = wakeup_tracer_stop,
592 .print_max = 1, 592 .print_max = true,
593 .print_header = wakeup_print_header, 593 .print_header = wakeup_print_header,
594 .print_line = wakeup_print_line, 594 .print_line = wakeup_print_line,
595 .flags = &tracer_flags, 595 .flags = &tracer_flags,
@@ -599,7 +599,7 @@ static struct tracer wakeup_tracer __read_mostly =
599#endif 599#endif
600 .open = wakeup_trace_open, 600 .open = wakeup_trace_open,
601 .close = wakeup_trace_close, 601 .close = wakeup_trace_close,
602 .use_max_tr = 1, 602 .use_max_tr = true,
603}; 603};
604 604
605static struct tracer wakeup_rt_tracer __read_mostly = 605static struct tracer wakeup_rt_tracer __read_mostly =
@@ -610,7 +610,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
610 .start = wakeup_tracer_start, 610 .start = wakeup_tracer_start,
611 .stop = wakeup_tracer_stop, 611 .stop = wakeup_tracer_stop,
612 .wait_pipe = poll_wait_pipe, 612 .wait_pipe = poll_wait_pipe,
613 .print_max = 1, 613 .print_max = true,
614 .print_header = wakeup_print_header, 614 .print_header = wakeup_print_header,
615 .print_line = wakeup_print_line, 615 .print_line = wakeup_print_line,
616 .flags = &tracer_flags, 616 .flags = &tracer_flags,
@@ -620,7 +620,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
620#endif 620#endif
621 .open = wakeup_trace_open, 621 .open = wakeup_trace_open,
622 .close = wakeup_trace_close, 622 .close = wakeup_trace_close,
623 .use_max_tr = 1, 623 .use_max_tr = true,
624}; 624};
625 625
626__init static int init_wakeup_tracer(void) 626__init static int init_wakeup_tracer(void)
@@ -637,4 +637,4 @@ __init static int init_wakeup_tracer(void)
637 637
638 return 0; 638 return 0;
639} 639}
640device_initcall(init_wakeup_tracer); 640core_initcall(init_wakeup_tracer);
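Besides turning .print_max and .use_max_tr into proper booleans, the wakeup tracer is now registered with core_initcall() instead of device_initcall(), i.e. earlier in boot. For orientation, the usual initcall ordering (levels quoted from memory of include/linux/init.h, so treat the numbers as approximate):

    #include <linux/init.h>

    /*
     * Initcalls run level by level during boot:
     *   core_initcall     (1)  <- the wakeup tracer now registers here
     *   postcore_initcall (2)
     *   arch_initcall     (3)
     *   subsys_initcall   (4)
     *   fs_initcall       (5)
     *   device_initcall   (6)  <- where it used to register
     *   late_initcall     (7)
     */
    static int __init example_tracer_init(void)
    {
            return 0;       /* register_tracer(...) would go here */
    }
    core_initcall(example_tracer_init);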
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 2c00a691a540..47623169a815 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -320,7 +320,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
320 int (*func)(void)) 320 int (*func)(void))
321{ 321{
322 int save_ftrace_enabled = ftrace_enabled; 322 int save_ftrace_enabled = ftrace_enabled;
323 int save_tracer_enabled = tracer_enabled;
324 unsigned long count; 323 unsigned long count;
325 char *func_name; 324 char *func_name;
326 int ret; 325 int ret;
@@ -331,7 +330,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
331 330
332 /* enable tracing, and record the filter function */ 331 /* enable tracing, and record the filter function */
333 ftrace_enabled = 1; 332 ftrace_enabled = 1;
334 tracer_enabled = 1;
335 333
336 /* passed in by parameter to fool gcc from optimizing */ 334 /* passed in by parameter to fool gcc from optimizing */
337 func(); 335 func();
@@ -395,7 +393,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
395 393
396 out: 394 out:
397 ftrace_enabled = save_ftrace_enabled; 395 ftrace_enabled = save_ftrace_enabled;
398 tracer_enabled = save_tracer_enabled;
399 396
400 /* Enable tracing on all functions again */ 397 /* Enable tracing on all functions again */
401 ftrace_set_global_filter(NULL, 0, 1); 398 ftrace_set_global_filter(NULL, 0, 1);
@@ -452,7 +449,6 @@ static int
452trace_selftest_function_recursion(void) 449trace_selftest_function_recursion(void)
453{ 450{
454 int save_ftrace_enabled = ftrace_enabled; 451 int save_ftrace_enabled = ftrace_enabled;
455 int save_tracer_enabled = tracer_enabled;
456 char *func_name; 452 char *func_name;
457 int len; 453 int len;
458 int ret; 454 int ret;
@@ -465,7 +461,6 @@ trace_selftest_function_recursion(void)
465 461
466 /* enable tracing, and record the filter function */ 462 /* enable tracing, and record the filter function */
467 ftrace_enabled = 1; 463 ftrace_enabled = 1;
468 tracer_enabled = 1;
469 464
470 /* Handle PPC64 '.' name */ 465 /* Handle PPC64 '.' name */
471 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); 466 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
@@ -534,7 +529,6 @@ trace_selftest_function_recursion(void)
534 ret = 0; 529 ret = 0;
535out: 530out:
536 ftrace_enabled = save_ftrace_enabled; 531 ftrace_enabled = save_ftrace_enabled;
537 tracer_enabled = save_tracer_enabled;
538 532
539 return ret; 533 return ret;
540} 534}
@@ -569,7 +563,6 @@ static int
569trace_selftest_function_regs(void) 563trace_selftest_function_regs(void)
570{ 564{
571 int save_ftrace_enabled = ftrace_enabled; 565 int save_ftrace_enabled = ftrace_enabled;
572 int save_tracer_enabled = tracer_enabled;
573 char *func_name; 566 char *func_name;
574 int len; 567 int len;
575 int ret; 568 int ret;
@@ -586,7 +579,6 @@ trace_selftest_function_regs(void)
586 579
587 /* enable tracing, and record the filter function */ 580 /* enable tracing, and record the filter function */
588 ftrace_enabled = 1; 581 ftrace_enabled = 1;
589 tracer_enabled = 1;
590 582
591 /* Handle PPC64 '.' name */ 583 /* Handle PPC64 '.' name */
592 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); 584 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
@@ -648,7 +640,6 @@ trace_selftest_function_regs(void)
648 ret = 0; 640 ret = 0;
649out: 641out:
650 ftrace_enabled = save_ftrace_enabled; 642 ftrace_enabled = save_ftrace_enabled;
651 tracer_enabled = save_tracer_enabled;
652 643
653 return ret; 644 return ret;
654} 645}
@@ -662,7 +653,6 @@ int
662trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) 653trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
663{ 654{
664 int save_ftrace_enabled = ftrace_enabled; 655 int save_ftrace_enabled = ftrace_enabled;
665 int save_tracer_enabled = tracer_enabled;
666 unsigned long count; 656 unsigned long count;
667 int ret; 657 int ret;
668 658
@@ -671,7 +661,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
671 661
672 /* start the tracing */ 662 /* start the tracing */
673 ftrace_enabled = 1; 663 ftrace_enabled = 1;
674 tracer_enabled = 1;
675 664
676 ret = tracer_init(trace, tr); 665 ret = tracer_init(trace, tr);
677 if (ret) { 666 if (ret) {
@@ -708,7 +697,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
708 ret = trace_selftest_function_regs(); 697 ret = trace_selftest_function_regs();
709 out: 698 out:
710 ftrace_enabled = save_ftrace_enabled; 699 ftrace_enabled = save_ftrace_enabled;
711 tracer_enabled = save_tracer_enabled;
712 700
713 /* kill ftrace totally if we failed */ 701 /* kill ftrace totally if we failed */
714 if (ret) 702 if (ret)
@@ -1106,6 +1094,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1106 tracing_stop(); 1094 tracing_stop();
1107 /* check both trace buffers */ 1095 /* check both trace buffers */
1108 ret = trace_test_buffer(tr, NULL); 1096 ret = trace_test_buffer(tr, NULL);
1097 printk("ret = %d\n", ret);
1109 if (!ret) 1098 if (!ret)
1110 ret = trace_test_buffer(&max_tr, &count); 1099 ret = trace_test_buffer(&max_tr, &count);
1111 1100
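Each selftest above drops its save/restore of the global tracer_enabled flag, which no longer exists; only ftrace_enabled is toggled for the duration of a test. The remaining pattern, reduced to a sketch (selftest_sketch() is illustrative, not a function in the file):

    static int selftest_sketch(int (*run_test)(void))
    {
            int save_ftrace_enabled = ftrace_enabled;
            int ret;

            ftrace_enabled = 1;     /* function tracing on for the test */
            ret = run_test();
            ftrace_enabled = save_ftrace_enabled;   /* always restored */

            return ret;
    }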
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 2485a7d09b11..7609dd6714c2 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -21,9 +21,6 @@ static int syscall_enter_register(struct ftrace_event_call *event,
21static int syscall_exit_register(struct ftrace_event_call *event, 21static int syscall_exit_register(struct ftrace_event_call *event,
22 enum trace_reg type, void *data); 22 enum trace_reg type, void *data);
23 23
24static int syscall_enter_define_fields(struct ftrace_event_call *call);
25static int syscall_exit_define_fields(struct ftrace_event_call *call);
26
27static struct list_head * 24static struct list_head *
28syscall_get_enter_fields(struct ftrace_event_call *call) 25syscall_get_enter_fields(struct ftrace_event_call *call)
29{ 26{
@@ -32,30 +29,6 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
32 return &entry->enter_fields; 29 return &entry->enter_fields;
33} 30}
34 31
35struct trace_event_functions enter_syscall_print_funcs = {
36 .trace = print_syscall_enter,
37};
38
39struct trace_event_functions exit_syscall_print_funcs = {
40 .trace = print_syscall_exit,
41};
42
43struct ftrace_event_class event_class_syscall_enter = {
44 .system = "syscalls",
45 .reg = syscall_enter_register,
46 .define_fields = syscall_enter_define_fields,
47 .get_fields = syscall_get_enter_fields,
48 .raw_init = init_syscall_trace,
49};
50
51struct ftrace_event_class event_class_syscall_exit = {
52 .system = "syscalls",
53 .reg = syscall_exit_register,
54 .define_fields = syscall_exit_define_fields,
55 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
56 .raw_init = init_syscall_trace,
57};
58
59extern struct syscall_metadata *__start_syscalls_metadata[]; 32extern struct syscall_metadata *__start_syscalls_metadata[];
60extern struct syscall_metadata *__stop_syscalls_metadata[]; 33extern struct syscall_metadata *__stop_syscalls_metadata[];
61 34
@@ -432,7 +405,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
432 mutex_unlock(&syscall_trace_lock); 405 mutex_unlock(&syscall_trace_lock);
433} 406}
434 407
435int init_syscall_trace(struct ftrace_event_call *call) 408static int init_syscall_trace(struct ftrace_event_call *call)
436{ 409{
437 int id; 410 int id;
438 int num; 411 int num;
@@ -457,6 +430,30 @@ int init_syscall_trace(struct ftrace_event_call *call)
457 return id; 430 return id;
458} 431}
459 432
433struct trace_event_functions enter_syscall_print_funcs = {
434 .trace = print_syscall_enter,
435};
436
437struct trace_event_functions exit_syscall_print_funcs = {
438 .trace = print_syscall_exit,
439};
440
441struct ftrace_event_class event_class_syscall_enter = {
442 .system = "syscalls",
443 .reg = syscall_enter_register,
444 .define_fields = syscall_enter_define_fields,
445 .get_fields = syscall_get_enter_fields,
446 .raw_init = init_syscall_trace,
447};
448
449struct ftrace_event_class event_class_syscall_exit = {
450 .system = "syscalls",
451 .reg = syscall_exit_register,
452 .define_fields = syscall_exit_define_fields,
453 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
454 .raw_init = init_syscall_trace,
455};
456
460unsigned long __init __weak arch_syscall_addr(int nr) 457unsigned long __init __weak arch_syscall_addr(int nr)
461{ 458{
462 return (unsigned long)sys_call_table[nr]; 459 return (unsigned long)sys_call_table[nr];
@@ -537,7 +534,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
537 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); 534 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
538} 535}
539 536
540int perf_sysenter_enable(struct ftrace_event_call *call) 537static int perf_sysenter_enable(struct ftrace_event_call *call)
541{ 538{
542 int ret = 0; 539 int ret = 0;
543 int num; 540 int num;
@@ -558,7 +555,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call)
558 return ret; 555 return ret;
559} 556}
560 557
561void perf_sysenter_disable(struct ftrace_event_call *call) 558static void perf_sysenter_disable(struct ftrace_event_call *call)
562{ 559{
563 int num; 560 int num;
564 561
@@ -615,7 +612,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
615 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); 612 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
616} 613}
617 614
618int perf_sysexit_enable(struct ftrace_event_call *call) 615static int perf_sysexit_enable(struct ftrace_event_call *call)
619{ 616{
620 int ret = 0; 617 int ret = 0;
621 int num; 618 int num;
@@ -636,7 +633,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call)
636 return ret; 633 return ret;
637} 634}
638 635
639void perf_sysexit_disable(struct ftrace_event_call *call) 636static void perf_sysexit_disable(struct ftrace_event_call *call)
640{ 637{
641 int num; 638 int num;
642 639
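The trace_syscalls.c hunks move the event-class and print-function definitions below init_syscall_trace() and the perf enable/disable helpers. Defining those functions before the structures that reference them lets them become static and removes the forward declarations of the define_fields callbacks. The same ordering trick in miniature (all names here are hypothetical):

    struct my_call;

    /* defined before it is referenced, so it can be static and needs
     * no forward declaration */
    static int my_raw_init(struct my_call *call)
    {
            return 0;
    }

    struct my_class {
            int (*raw_init)(struct my_call *call);
    };

    struct my_class my_event_class = {
            .raw_init = my_raw_init,
    };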
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 03003cd7dd96..c86e6d4f67fb 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -22,6 +22,7 @@
22#include <linux/uaccess.h> 22#include <linux/uaccess.h>
23#include <linux/uprobes.h> 23#include <linux/uprobes.h>
24#include <linux/namei.h> 24#include <linux/namei.h>
25#include <linux/string.h>
25 26
26#include "trace_probe.h" 27#include "trace_probe.h"
27 28
@@ -189,7 +190,7 @@ static int create_trace_uprobe(int argc, char **argv)
189 if (argv[0][0] == '-') 190 if (argv[0][0] == '-')
190 is_delete = true; 191 is_delete = true;
191 else if (argv[0][0] != 'p') { 192 else if (argv[0][0] != 'p') {
192 pr_info("Probe definition must be started with 'p', 'r' or" " '-'.\n"); 193 pr_info("Probe definition must be started with 'p' or '-'.\n");
193 return -EINVAL; 194 return -EINVAL;
194 } 195 }
195 196
@@ -252,7 +253,7 @@ static int create_trace_uprobe(int argc, char **argv)
252 if (ret) 253 if (ret)
253 goto fail_address_parse; 254 goto fail_address_parse;
254 255
255 ret = strict_strtoul(arg, 0, &offset); 256 ret = kstrtoul(arg, 0, &offset);
256 if (ret) 257 if (ret)
257 goto fail_address_parse; 258 goto fail_address_parse;
258 259
@@ -263,16 +264,15 @@ static int create_trace_uprobe(int argc, char **argv)
263 264
264 /* setup a probe */ 265 /* setup a probe */
265 if (!event) { 266 if (!event) {
266 char *tail = strrchr(filename, '/'); 267 char *tail;
267 char *ptr; 268 char *ptr;
268 269
269 ptr = kstrdup((tail ? tail + 1 : filename), GFP_KERNEL); 270 tail = kstrdup(kbasename(filename), GFP_KERNEL);
270 if (!ptr) { 271 if (!tail) {
271 ret = -ENOMEM; 272 ret = -ENOMEM;
272 goto fail_address_parse; 273 goto fail_address_parse;
273 } 274 }
274 275
275 tail = ptr;
276 ptr = strpbrk(tail, ".-_"); 276 ptr = strpbrk(tail, ".-_");
277 if (ptr) 277 if (ptr)
278 *ptr = '\0'; 278 *ptr = '\0';
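With <linux/string.h> included, the uprobe event-name fallback uses kbasename() instead of the open-coded strrchr(filename, '/') dance, which also removes the extra tail/ptr shuffle. Usage in isolation; event_name_from_path() is an illustrative wrapper, not a function from the patch:

    #include <linux/string.h>   /* kbasename() */
    #include <linux/slab.h>     /* kstrdup()   */

    static char *event_name_from_path(const char *filename)
    {
            /* kbasename() returns the component after the last '/',
             * or the whole string when there is no '/' */
            return kstrdup(kbasename(filename), GFP_KERNEL); /* may be NULL */
    }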
diff --git a/kernel/wait.c b/kernel/wait.c
index 7fdd9eaca2c3..6698e0c04ead 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Generic waiting primitives. 2 * Generic waiting primitives.
3 * 3 *
4 * (C) 2004 William Irwin, Oracle 4 * (C) 2004 Nadia Yvette Chambers, Oracle
5 */ 5 */
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/export.h> 7#include <linux/export.h>
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 9d4c8d5a1f53..997c6a16ec22 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -31,6 +31,7 @@
31int watchdog_enabled = 1; 31int watchdog_enabled = 1;
32int __read_mostly watchdog_thresh = 10; 32int __read_mostly watchdog_thresh = 10;
33static int __read_mostly watchdog_disabled; 33static int __read_mostly watchdog_disabled;
34static u64 __read_mostly sample_period;
34 35
35static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 36static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
36static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); 37static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -116,7 +117,7 @@ static unsigned long get_timestamp(int this_cpu)
116 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ 117 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
117} 118}
118 119
119static unsigned long get_sample_period(void) 120static void set_sample_period(void)
120{ 121{
121 /* 122 /*
122 * convert watchdog_thresh from seconds to ns 123 * convert watchdog_thresh from seconds to ns
@@ -125,7 +126,7 @@ static unsigned long get_sample_period(void)
125 * and hard thresholds) to increment before the 126 * and hard thresholds) to increment before the
126 * hardlockup detector generates a warning 127 * hardlockup detector generates a warning
127 */ 128 */
128 return get_softlockup_thresh() * (NSEC_PER_SEC / 5); 129 sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
129} 130}
130 131
131/* Commands for resetting the watchdog */ 132/* Commands for resetting the watchdog */
@@ -275,7 +276,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
275 wake_up_process(__this_cpu_read(softlockup_watchdog)); 276 wake_up_process(__this_cpu_read(softlockup_watchdog));
276 277
277 /* .. and repeat */ 278 /* .. and repeat */
278 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); 279 hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
279 280
280 if (touch_ts == 0) { 281 if (touch_ts == 0) {
281 if (unlikely(__this_cpu_read(softlockup_touch_sync))) { 282 if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
@@ -356,7 +357,7 @@ static void watchdog_enable(unsigned int cpu)
356 hrtimer->function = watchdog_timer_fn; 357 hrtimer->function = watchdog_timer_fn;
357 358
358 /* done here because hrtimer_start can only pin to smp_processor_id() */ 359 /* done here because hrtimer_start can only pin to smp_processor_id() */
359 hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), 360 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
360 HRTIMER_MODE_REL_PINNED); 361 HRTIMER_MODE_REL_PINNED);
361 362
362 /* initialize timestamp */ 363 /* initialize timestamp */
@@ -368,6 +369,9 @@ static void watchdog_disable(unsigned int cpu)
368{ 369{
369 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 370 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
370 371
372 if (!watchdog_enabled)
373 return;
374
371 watchdog_set_prio(SCHED_NORMAL, 0); 375 watchdog_set_prio(SCHED_NORMAL, 0);
372 hrtimer_cancel(hrtimer); 376 hrtimer_cancel(hrtimer);
373 /* disable the perf event */ 377 /* disable the perf event */
@@ -383,7 +387,7 @@ static int watchdog_should_run(unsigned int cpu)
383/* 387/*
384 * The watchdog thread function - touches the timestamp. 388 * The watchdog thread function - touches the timestamp.
385 * 389 *
386 * It only runs once every get_sample_period() seconds (4 seconds by 390 * It only runs once every sample_period seconds (4 seconds by
387 * default) to reset the softlockup timestamp. If this gets delayed 391 * default) to reset the softlockup timestamp. If this gets delayed
388 * for more than 2*watchdog_thresh seconds then the debug-printout 392 * for more than 2*watchdog_thresh seconds then the debug-printout
389 * triggers in watchdog_timer_fn(). 393 * triggers in watchdog_timer_fn().
@@ -516,6 +520,7 @@ int proc_dowatchdog(struct ctl_table *table, int write,
516 if (ret || !write) 520 if (ret || !write)
517 return ret; 521 return ret;
518 522
523 set_sample_period();
519 if (watchdog_enabled && watchdog_thresh) 524 if (watchdog_enabled && watchdog_thresh)
520 watchdog_enable_all_cpus(); 525 watchdog_enable_all_cpus();
521 else 526 else
@@ -537,6 +542,7 @@ static struct smp_hotplug_thread watchdog_threads = {
537 542
538void __init lockup_detector_init(void) 543void __init lockup_detector_init(void)
539{ 544{
545 set_sample_period();
540 if (smpboot_register_percpu_thread(&watchdog_threads)) { 546 if (smpboot_register_percpu_thread(&watchdog_threads)) {
541 pr_err("Failed to create watchdog threads, disabled\n"); 547 pr_err("Failed to create watchdog threads, disabled\n");
542 watchdog_disabled = -ENODEV; 548 watchdog_disabled = -ENODEV;
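Two things happen in the watchdog hunks: the sample period is computed once by set_sample_period() (at init, and again from proc_dowatchdog() when the threshold changes) rather than on every hrtimer callback, and the arithmetic is widened to u64, since on a 32-bit build the old unsigned long product can overflow for large thresholds. A userspace illustration of that overflow, simulating a 32-bit unsigned long with uint32_t and assuming the usual /proc limit of 60 seconds for watchdog_thresh (so the softlockup threshold is 120):

    #include <stdio.h>
    #include <stdint.h>

    #define NSEC_PER_SEC 1000000000ULL

    int main(void)
    {
            unsigned long thresh = 120;  /* 2 * watchdog_thresh, worst case */

            /* 32-bit unsigned long: the product wraps around */
            uint32_t narrow = (uint32_t)thresh * (uint32_t)(NSEC_PER_SEC / 5);
            /* widened first, as set_sample_period() now does */
            uint64_t wide = (uint64_t)thresh * (NSEC_PER_SEC / 5);

            printf("32-bit product: %u\n64-bit product: %llu\n",
                   (unsigned)narrow, (unsigned long long)wide);
            return 0;
    }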
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 042d221d33cc..fbc6576a83c3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -739,8 +739,10 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
739{ 739{
740 struct worker *worker = kthread_data(task); 740 struct worker *worker = kthread_data(task);
741 741
742 if (!(worker->flags & WORKER_NOT_RUNNING)) 742 if (!(worker->flags & WORKER_NOT_RUNNING)) {
743 WARN_ON_ONCE(worker->pool->gcwq->cpu != cpu);
743 atomic_inc(get_pool_nr_running(worker->pool)); 744 atomic_inc(get_pool_nr_running(worker->pool));
745 }
744} 746}
745 747
746/** 748/**
@@ -1361,8 +1363,19 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1361 1363
1362 WARN_ON_ONCE(timer->function != delayed_work_timer_fn || 1364 WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
1363 timer->data != (unsigned long)dwork); 1365 timer->data != (unsigned long)dwork);
1364 BUG_ON(timer_pending(timer)); 1366 WARN_ON_ONCE(timer_pending(timer));
1365 BUG_ON(!list_empty(&work->entry)); 1367 WARN_ON_ONCE(!list_empty(&work->entry));
1368
1369 /*
1370 * If @delay is 0, queue @dwork->work immediately. This is for
1371 * both optimization and correctness. The earliest @timer can
1372 * expire is on the closest next tick and delayed_work users depend
1373 * on that there's no such delay when @delay is 0.
1374 */
1375 if (!delay) {
1376 __queue_work(cpu, wq, &dwork->work);
1377 return;
1378 }
1366 1379
1367 timer_stats_timer_set_start_info(&dwork->timer); 1380 timer_stats_timer_set_start_info(&dwork->timer);
1368 1381
@@ -1417,9 +1430,6 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1417 bool ret = false; 1430 bool ret = false;
1418 unsigned long flags; 1431 unsigned long flags;
1419 1432
1420 if (!delay)
1421 return queue_work_on(cpu, wq, &dwork->work);
1422
1423 /* read the comment in __queue_work() */ 1433 /* read the comment in __queue_work() */
1424 local_irq_save(flags); 1434 local_irq_save(flags);
1425 1435
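The delayed-work hunks above move the zero-delay shortcut out of queue_delayed_work_on() and into __queue_delayed_work(), after the sanity checks (now WARN_ON_ONCE() rather than BUG_ON()), so every internal caller gets the immediate-queueing behaviour the new comment describes. From a caller's point of view nothing changes; a small usage sketch (my_work_fn and kick_work() are illustrative names):

    #include <linux/workqueue.h>

    static void my_work_fn(struct work_struct *work)
    {
            /* the actual deferred work */
    }

    static DECLARE_DELAYED_WORK(my_dwork, my_work_fn);

    static void kick_work(struct workqueue_struct *wq, unsigned long delay)
    {
            /* delay == 0 now queues immediately inside __queue_delayed_work()
             * instead of being special-cased by the caller-facing wrapper */
            queue_delayed_work(wq, &my_dwork, delay);
    }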
@@ -2407,8 +2417,10 @@ static int rescuer_thread(void *__wq)
2407repeat: 2417repeat:
2408 set_current_state(TASK_INTERRUPTIBLE); 2418 set_current_state(TASK_INTERRUPTIBLE);
2409 2419
2410 if (kthread_should_stop()) 2420 if (kthread_should_stop()) {
2421 __set_current_state(TASK_RUNNING);
2411 return 0; 2422 return 0;
2423 }
2412 2424
2413 /* 2425 /*
2414 * See whether any cpu is asking for help. Unbounded 2426 * See whether any cpu is asking for help. Unbounded
@@ -3475,7 +3487,7 @@ unsigned int work_busy(struct work_struct *work)
3475 unsigned int ret = 0; 3487 unsigned int ret = 0;
3476 3488
3477 if (!gcwq) 3489 if (!gcwq)
3478 return false; 3490 return 0;
3479 3491
3480 spin_lock_irqsave(&gcwq->lock, flags); 3492 spin_lock_irqsave(&gcwq->lock, flags);
3481 3493