about | summary | refs | log | tree | commit | diff | stats
path: root/kernel
diff options
context:
space:
mode:
author: Mauro Carvalho Chehab <mchehab@redhat.com> 2013-01-11 10:28:19 -0500
committer: Mauro Carvalho Chehab <mchehab@redhat.com> 2013-01-11 10:28:19 -0500
commit: 734d1ece37fbf3d2ddfc71bc6c69e0fe35f02542 (patch)
tree: c4805dd7e746b1feb9e09e9849f3245d0b2c0c6b /kernel
parent: 216c82c6aba63eeb49d7654b448e0d47bea255bb (diff)
parent: 9931faca02c604c22335f5a935a501bb2ace6e20 (diff)
Merge tag 'v3.8-rc3' into v4l_for_linus
Linux 3.8-rc3

* tag 'v3.8-rc3': (11110 commits)
  Linux 3.8-rc3
  mm: reinstante dropped pmd_trans_splitting() check
  cred: Remove tgcred pointer from struct cred
  drm/ttm: fix fence locking in ttm_buffer_object_transfer
  ARM: clps711x: Fix bad merge of clockevents setup
  ARM: highbank: save and restore L2 cache and GIC on suspend
  ARM: highbank: add a power request clear
  ARM: highbank: fix secondary boot and hotplug
  ARM: highbank: fix typos with hignbank in power request functions
  ARM: dts: fix highbank cpu mpidr values
  ARM: dts: add device_type prop to cpu nodes on Calxeda platforms
  drm/prime: drop reference on imported dma-buf come from gem
  xen/netfront: improve truesize tracking
  ARM: mx5: Fix MX53 flexcan2 clock
  ARM: OMAP2+: am33xx-hwmod: Fix wrongly terminated am33xx_usbss_mpu_irqs array
  sctp: fix Kconfig bug in default cookie hmac selection
  EDAC: Cleanup device deregistering path
  EDAC: Fix EDAC Kconfig menu
  EDAC: Fix kernel panic on module unloading
  ALSA: hda - add mute LED for HP Pavilion 17 (Realtek codec)
  ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile11
-rw-r--r--kernel/audit_tree.c10
-rw-r--r--kernel/audit_watch.c4
-rw-r--r--kernel/auditsc.c104
-rw-r--r--kernel/cgroup.c757
-rw-r--r--kernel/cgroup_freezer.c514
-rw-r--r--kernel/compat.c17
-rw-r--r--kernel/context_tracking.c83
-rw-r--r--kernel/cpu.c13
-rw-r--r--kernel/cpuset.c122
-rw-r--r--kernel/cred.c154
-rw-r--r--kernel/events/core.c10
-rw-r--r--kernel/events/uprobes.c43
-rw-r--r--kernel/exit.c108
-rw-r--r--kernel/fork.c161
-rw-r--r--kernel/freezer.c11
-rw-r--r--kernel/irq/chip.c1
-rw-r--r--kernel/irq/irqdomain.c4
-rw-r--r--kernel/irq/manage.c43
-rw-r--r--kernel/irq/resend.c8
-rw-r--r--kernel/kcmp.c1
-rw-r--r--kernel/kmod.c6
-rw-r--r--kernel/ksysfs.c23
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/lockdep_proc.c2
-rw-r--r--kernel/modsign_certificate.S19
-rw-r--r--kernel/modsign_pubkey.c21
-rw-r--r--kernel/module.c444
-rw-r--r--kernel/nsproxy.c36
-rw-r--r--kernel/padata.c5
-rw-r--r--kernel/pid.c75
-rw-r--r--kernel/pid_namespace.c117
-rw-r--r--kernel/posix-cpu-timers.c27
-rw-r--r--kernel/power/main.c2
-rw-r--r--kernel/power/process.c13
-rw-r--r--kernel/power/qos.c65
-rw-r--r--kernel/power/swap.c2
-rw-r--r--kernel/printk.c57
-rw-r--r--kernel/profile.c7
-rw-r--r--kernel/ptrace.c13
-rw-r--r--kernel/rcu.h2
-rw-r--r--kernel/rcupdate.c3
-rw-r--r--kernel/rcutiny.c2
-rw-r--r--kernel/rcutiny_plugin.h5
-rw-r--r--kernel/rcutorture.c54
-rw-r--r--kernel/rcutree.c347
-rw-r--r--kernel/rcutree.h67
-rw-r--r--kernel/rcutree_plugin.h415
-rw-r--r--kernel/rcutree_trace.c330
-rw-r--r--kernel/res_counter.c42
-rw-r--r--kernel/sched/auto_group.c4
-rw-r--r--kernel/sched/auto_group.h5
-rw-r--r--kernel/sched/core.c146
-rw-r--r--kernel/sched/cputime.c131
-rw-r--r--kernel/sched/debug.c36
-rw-r--r--kernel/sched/fair.c1130
-rw-r--r--kernel/sched/features.h16
-rw-r--r--kernel/sched/sched.h72
-rw-r--r--kernel/seccomp.c13
-rw-r--r--kernel/signal.c120
-rw-r--r--kernel/softirq.c6
-rw-r--r--kernel/srcu.c16
-rw-r--r--kernel/sys.c6
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/sysctl.c49
-rw-r--r--kernel/sysctl_binary.c2
-rw-r--r--kernel/time/Makefile2
-rw-r--r--kernel/time/jiffies.c8
-rw-r--r--kernel/time/tick-common.c8
-rw-r--r--kernel/time/tick-internal.h1
-rw-r--r--kernel/time/tick-sched.c137
-rw-r--r--kernel/time/timecompare.c193
-rw-r--r--kernel/time/timekeeping.c64
-rw-r--r--kernel/trace/Kconfig1
-rw-r--r--kernel/trace/ftrace.c14
-rw-r--r--kernel/trace/ring_buffer.c65
-rw-r--r--kernel/trace/trace.c473
-rw-r--r--kernel/trace/trace.h18
-rw-r--r--kernel/trace/trace_branch.c4
-rw-r--r--kernel/trace/trace_events.c51
-rw-r--r--kernel/trace/trace_events_filter.c4
-rw-r--r--kernel/trace/trace_functions.c7
-rw-r--r--kernel/trace/trace_functions_graph.c6
-rw-r--r--kernel/trace/trace_irqsoff.c16
-rw-r--r--kernel/trace/trace_kprobe.c10
-rw-r--r--kernel/trace/trace_output.c78
-rw-r--r--kernel/trace/trace_probe.c14
-rw-r--r--kernel/trace/trace_sched_switch.c4
-rw-r--r--kernel/trace/trace_sched_wakeup.c12
-rw-r--r--kernel/trace/trace_selftest.c13
-rw-r--r--kernel/trace/trace_stack.c4
-rw-r--r--kernel/trace/trace_syscalls.c61
-rw-r--r--kernel/trace/trace_uprobe.c12
-rw-r--r--kernel/user.c2
-rw-r--r--kernel/user_namespace.c147
-rw-r--r--kernel/utsname.c34
-rw-r--r--kernel/wait.c2
-rw-r--r--kernel/watchdog.c24
-rw-r--r--kernel/workqueue.c6
99 files changed, 4859 insertions, 2771 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 86e3285ae7e5..6c072b6da239 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -54,7 +54,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
54obj-$(CONFIG_PROVE_LOCKING) += spinlock.o 54obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
55obj-$(CONFIG_UID16) += uid16.o 55obj-$(CONFIG_UID16) += uid16.o
56obj-$(CONFIG_MODULES) += module.o 56obj-$(CONFIG_MODULES) += module.o
57obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o 57obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
58obj-$(CONFIG_KALLSYMS) += kallsyms.o 58obj-$(CONFIG_KALLSYMS) += kallsyms.o
59obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 59obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
60obj-$(CONFIG_KEXEC) += kexec.o 60obj-$(CONFIG_KEXEC) += kexec.o
@@ -110,6 +110,7 @@ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
110obj-$(CONFIG_PADATA) += padata.o 110obj-$(CONFIG_PADATA) += padata.o
111obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 111obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
112obj-$(CONFIG_JUMP_LABEL) += jump_label.o 112obj-$(CONFIG_JUMP_LABEL) += jump_label.o
113obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
113 114
114$(obj)/configs.o: $(obj)/config_data.h 115$(obj)/configs.o: $(obj)/config_data.h
115 116
@@ -136,10 +137,14 @@ ifeq ($(CONFIG_MODULE_SIG),y)
136# 137#
137# Pull the signing certificate and any extra certificates into the kernel 138# Pull the signing certificate and any extra certificates into the kernel
138# 139#
140
141quiet_cmd_touch = TOUCH $@
142 cmd_touch = touch $@
143
139extra_certificates: 144extra_certificates:
140 touch $@ 145 $(call cmd,touch)
141 146
142kernel/modsign_pubkey.o: signing_key.x509 extra_certificates 147kernel/modsign_certificate.o: signing_key.x509 extra_certificates
143 148
144############################################################################### 149###############################################################################
145# 150#
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index ed206fd88cca..e81175ef25f8 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -249,7 +249,7 @@ static void untag_chunk(struct node *p)
249 list_del_rcu(&chunk->hash); 249 list_del_rcu(&chunk->hash);
250 spin_unlock(&hash_lock); 250 spin_unlock(&hash_lock);
251 spin_unlock(&entry->lock); 251 spin_unlock(&entry->lock);
252 fsnotify_destroy_mark(entry); 252 fsnotify_destroy_mark(entry, audit_tree_group);
253 goto out; 253 goto out;
254 } 254 }
255 255
@@ -291,7 +291,7 @@ static void untag_chunk(struct node *p)
291 owner->root = new; 291 owner->root = new;
292 spin_unlock(&hash_lock); 292 spin_unlock(&hash_lock);
293 spin_unlock(&entry->lock); 293 spin_unlock(&entry->lock);
294 fsnotify_destroy_mark(entry); 294 fsnotify_destroy_mark(entry, audit_tree_group);
295 fsnotify_put_mark(&new->mark); /* drop initial reference */ 295 fsnotify_put_mark(&new->mark); /* drop initial reference */
296 goto out; 296 goto out;
297 297
@@ -331,7 +331,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
331 spin_unlock(&hash_lock); 331 spin_unlock(&hash_lock);
332 chunk->dead = 1; 332 chunk->dead = 1;
333 spin_unlock(&entry->lock); 333 spin_unlock(&entry->lock);
334 fsnotify_destroy_mark(entry); 334 fsnotify_destroy_mark(entry, audit_tree_group);
335 fsnotify_put_mark(entry); 335 fsnotify_put_mark(entry);
336 return 0; 336 return 0;
337 } 337 }
@@ -412,7 +412,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
412 spin_unlock(&chunk_entry->lock); 412 spin_unlock(&chunk_entry->lock);
413 spin_unlock(&old_entry->lock); 413 spin_unlock(&old_entry->lock);
414 414
415 fsnotify_destroy_mark(chunk_entry); 415 fsnotify_destroy_mark(chunk_entry, audit_tree_group);
416 416
417 fsnotify_put_mark(chunk_entry); 417 fsnotify_put_mark(chunk_entry);
418 fsnotify_put_mark(old_entry); 418 fsnotify_put_mark(old_entry);
@@ -443,7 +443,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
443 spin_unlock(&hash_lock); 443 spin_unlock(&hash_lock);
444 spin_unlock(&chunk_entry->lock); 444 spin_unlock(&chunk_entry->lock);
445 spin_unlock(&old_entry->lock); 445 spin_unlock(&old_entry->lock);
446 fsnotify_destroy_mark(old_entry); 446 fsnotify_destroy_mark(old_entry, audit_tree_group);
447 fsnotify_put_mark(chunk_entry); /* drop initial reference */ 447 fsnotify_put_mark(chunk_entry); /* drop initial reference */
448 fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ 448 fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
449 return 0; 449 return 0;
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 9a9ae6e3d290..4a599f699adc 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -350,7 +350,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
350 } 350 }
351 mutex_unlock(&audit_filter_mutex); 351 mutex_unlock(&audit_filter_mutex);
352 352
353 fsnotify_destroy_mark(&parent->mark); 353 fsnotify_destroy_mark(&parent->mark, audit_watch_group);
354} 354}
355 355
356/* Get path information necessary for adding watches. */ 356/* Get path information necessary for adding watches. */
@@ -457,7 +457,7 @@ void audit_remove_watch_rule(struct audit_krule *krule)
457 457
458 if (list_empty(&parent->watches)) { 458 if (list_empty(&parent->watches)) {
459 audit_get_parent(parent); 459 audit_get_parent(parent);
460 fsnotify_destroy_mark(&parent->mark); 460 fsnotify_destroy_mark(&parent->mark, audit_watch_group);
461 audit_put_parent(parent); 461 audit_put_parent(parent);
462 } 462 }
463 } 463 }
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 2f186ed80c40..e37e6a12c5e3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -200,7 +200,6 @@ struct audit_context {
200 struct list_head names_list; /* anchor for struct audit_names->list */ 200 struct list_head names_list; /* anchor for struct audit_names->list */
201 char * filterkey; /* key for rule that triggered record */ 201 char * filterkey; /* key for rule that triggered record */
202 struct path pwd; 202 struct path pwd;
203 struct audit_context *previous; /* For nested syscalls */
204 struct audit_aux_data *aux; 203 struct audit_aux_data *aux;
205 struct audit_aux_data *aux_pids; 204 struct audit_aux_data *aux_pids;
206 struct sockaddr_storage *sockaddr; 205 struct sockaddr_storage *sockaddr;
@@ -1091,29 +1090,13 @@ int audit_alloc(struct task_struct *tsk)
1091 1090
1092static inline void audit_free_context(struct audit_context *context) 1091static inline void audit_free_context(struct audit_context *context)
1093{ 1092{
1094 struct audit_context *previous; 1093 audit_free_names(context);
1095 int count = 0; 1094 unroll_tree_refs(context, NULL, 0);
1096 1095 free_tree_refs(context);
1097 do { 1096 audit_free_aux(context);
1098 previous = context->previous; 1097 kfree(context->filterkey);
1099 if (previous || (count && count < 10)) { 1098 kfree(context->sockaddr);
1100 ++count; 1099 kfree(context);
1101 printk(KERN_ERR "audit(:%d): major=%d name_count=%d:"
1102 " freeing multiple contexts (%d)\n",
1103 context->serial, context->major,
1104 context->name_count, count);
1105 }
1106 audit_free_names(context);
1107 unroll_tree_refs(context, NULL, 0);
1108 free_tree_refs(context);
1109 audit_free_aux(context);
1110 kfree(context->filterkey);
1111 kfree(context->sockaddr);
1112 kfree(context);
1113 context = previous;
1114 } while (context);
1115 if (count >= 10)
1116 printk(KERN_ERR "audit: freed %d contexts\n", count);
1117} 1100}
1118 1101
1119void audit_log_task_context(struct audit_buffer *ab) 1102void audit_log_task_context(struct audit_buffer *ab)
@@ -1159,7 +1142,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1159 cred = current_cred(); 1142 cred = current_cred();
1160 1143
1161 spin_lock_irq(&tsk->sighand->siglock); 1144 spin_lock_irq(&tsk->sighand->siglock);
1162 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) 1145 if (tsk->signal && tsk->signal->tty)
1163 tty = tsk->signal->tty->name; 1146 tty = tsk->signal->tty->name;
1164 else 1147 else
1165 tty = "(none)"; 1148 tty = "(none)";
@@ -1783,42 +1766,6 @@ void __audit_syscall_entry(int arch, int major,
1783 if (!context) 1766 if (!context)
1784 return; 1767 return;
1785 1768
1786 /*
1787 * This happens only on certain architectures that make system
1788 * calls in kernel_thread via the entry.S interface, instead of
1789 * with direct calls. (If you are porting to a new
1790 * architecture, hitting this condition can indicate that you
1791 * got the _exit/_leave calls backward in entry.S.)
1792 *
1793 * i386 no
1794 * x86_64 no
1795 * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S)
1796 *
1797 * This also happens with vm86 emulation in a non-nested manner
1798 * (entries without exits), so this case must be caught.
1799 */
1800 if (context->in_syscall) {
1801 struct audit_context *newctx;
1802
1803#if AUDIT_DEBUG
1804 printk(KERN_ERR
1805 "audit(:%d) pid=%d in syscall=%d;"
1806 " entering syscall=%d\n",
1807 context->serial, tsk->pid, context->major, major);
1808#endif
1809 newctx = audit_alloc_context(context->state);
1810 if (newctx) {
1811 newctx->previous = context;
1812 context = newctx;
1813 tsk->audit_context = newctx;
1814 } else {
1815 /* If we can't alloc a new context, the best we
1816 * can do is to leak memory (any pending putname
1817 * will be lost). The only other alternative is
1818 * to abandon auditing. */
1819 audit_zero_context(context, context->state);
1820 }
1821 }
1822 BUG_ON(context->in_syscall || context->name_count); 1769 BUG_ON(context->in_syscall || context->name_count);
1823 1770
1824 if (!audit_enabled) 1771 if (!audit_enabled)
@@ -1881,28 +1828,21 @@ void __audit_syscall_exit(int success, long return_code)
1881 if (!list_empty(&context->killed_trees)) 1828 if (!list_empty(&context->killed_trees))
1882 audit_kill_trees(&context->killed_trees); 1829 audit_kill_trees(&context->killed_trees);
1883 1830
1884 if (context->previous) { 1831 audit_free_names(context);
1885 struct audit_context *new_context = context->previous; 1832 unroll_tree_refs(context, NULL, 0);
1886 context->previous = NULL; 1833 audit_free_aux(context);
1887 audit_free_context(context); 1834 context->aux = NULL;
1888 tsk->audit_context = new_context; 1835 context->aux_pids = NULL;
1889 } else { 1836 context->target_pid = 0;
1890 audit_free_names(context); 1837 context->target_sid = 0;
1891 unroll_tree_refs(context, NULL, 0); 1838 context->sockaddr_len = 0;
1892 audit_free_aux(context); 1839 context->type = 0;
1893 context->aux = NULL; 1840 context->fds[0] = -1;
1894 context->aux_pids = NULL; 1841 if (context->state != AUDIT_RECORD_CONTEXT) {
1895 context->target_pid = 0; 1842 kfree(context->filterkey);
1896 context->target_sid = 0; 1843 context->filterkey = NULL;
1897 context->sockaddr_len = 0;
1898 context->type = 0;
1899 context->fds[0] = -1;
1900 if (context->state != AUDIT_RECORD_CONTEXT) {
1901 kfree(context->filterkey);
1902 context->filterkey = NULL;
1903 }
1904 tsk->audit_context = context;
1905 } 1844 }
1845 tsk->audit_context = context;
1906} 1846}
1907 1847
1908static inline void handle_one(const struct inode *inode) 1848static inline void handle_one(const struct inode *inode)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f24f724620dd..4855892798fd 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -138,6 +138,9 @@ struct cgroupfs_root {
138 /* Hierarchy-specific flags */ 138 /* Hierarchy-specific flags */
139 unsigned long flags; 139 unsigned long flags;
140 140
141 /* IDs for cgroups in this hierarchy */
142 struct ida cgroup_ida;
143
141 /* The path to use for release notifications. */ 144 /* The path to use for release notifications. */
142 char release_agent_path[PATH_MAX]; 145 char release_agent_path[PATH_MAX];
143 146
@@ -171,8 +174,8 @@ struct css_id {
171 * The css to which this ID points. This pointer is set to valid value 174 * The css to which this ID points. This pointer is set to valid value
172 * after cgroup is populated. If cgroup is removed, this will be NULL. 175 * after cgroup is populated. If cgroup is removed, this will be NULL.
173 * This pointer is expected to be RCU-safe because destroy() 176 * This pointer is expected to be RCU-safe because destroy()
174 * is called after synchronize_rcu(). But for safe use, css_is_removed() 177 * is called after synchronize_rcu(). But for safe use, css_tryget()
175 * css_tryget() should be used for avoiding race. 178 * should be used for avoiding race.
176 */ 179 */
177 struct cgroup_subsys_state __rcu *css; 180 struct cgroup_subsys_state __rcu *css;
178 /* 181 /*
@@ -242,6 +245,10 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
242 */ 245 */
243static int need_forkexit_callback __read_mostly; 246static int need_forkexit_callback __read_mostly;
244 247
248static int cgroup_destroy_locked(struct cgroup *cgrp);
249static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
250 struct cftype cfts[], bool is_add);
251
245#ifdef CONFIG_PROVE_LOCKING 252#ifdef CONFIG_PROVE_LOCKING
246int cgroup_lock_is_held(void) 253int cgroup_lock_is_held(void)
247{ 254{
@@ -294,11 +301,6 @@ static int notify_on_release(const struct cgroup *cgrp)
294 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 301 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
295} 302}
296 303
297static int clone_children(const struct cgroup *cgrp)
298{
299 return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
300}
301
302/* 304/*
303 * for_each_subsys() allows you to iterate on each subsystem attached to 305 * for_each_subsys() allows you to iterate on each subsystem attached to
304 * an active hierarchy 306 * an active hierarchy
@@ -782,12 +784,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
782 * The task_lock() exception 784 * The task_lock() exception
783 * 785 *
784 * The need for this exception arises from the action of 786 * The need for this exception arises from the action of
785 * cgroup_attach_task(), which overwrites one tasks cgroup pointer with 787 * cgroup_attach_task(), which overwrites one task's cgroup pointer with
786 * another. It does so using cgroup_mutex, however there are 788 * another. It does so using cgroup_mutex, however there are
787 * several performance critical places that need to reference 789 * several performance critical places that need to reference
788 * task->cgroup without the expense of grabbing a system global 790 * task->cgroup without the expense of grabbing a system global
789 * mutex. Therefore except as noted below, when dereferencing or, as 791 * mutex. Therefore except as noted below, when dereferencing or, as
790 * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use 792 * in cgroup_attach_task(), modifying a task's cgroup pointer we use
791 * task_lock(), which acts on a spinlock (task->alloc_lock) already in 793 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
792 * the task_struct routinely used for such matters. 794 * the task_struct routinely used for such matters.
793 * 795 *
@@ -854,30 +856,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
854 return inode; 856 return inode;
855} 857}
856 858
857/*
858 * Call subsys's pre_destroy handler.
859 * This is called before css refcnt check.
860 */
861static int cgroup_call_pre_destroy(struct cgroup *cgrp)
862{
863 struct cgroup_subsys *ss;
864 int ret = 0;
865
866 for_each_subsys(cgrp->root, ss) {
867 if (!ss->pre_destroy)
868 continue;
869
870 ret = ss->pre_destroy(cgrp);
871 if (ret) {
872 /* ->pre_destroy() failure is being deprecated */
873 WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
874 break;
875 }
876 }
877
878 return ret;
879}
880
881static void cgroup_diput(struct dentry *dentry, struct inode *inode) 859static void cgroup_diput(struct dentry *dentry, struct inode *inode)
882{ 860{
883 /* is dentry a directory ? if so, kfree() associated cgroup */ 861 /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -898,7 +876,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
898 * Release the subsystem state objects. 876 * Release the subsystem state objects.
899 */ 877 */
900 for_each_subsys(cgrp->root, ss) 878 for_each_subsys(cgrp->root, ss)
901 ss->destroy(cgrp); 879 ss->css_free(cgrp);
902 880
903 cgrp->root->number_of_cgroups--; 881 cgrp->root->number_of_cgroups--;
904 mutex_unlock(&cgroup_mutex); 882 mutex_unlock(&cgroup_mutex);
@@ -917,6 +895,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
917 895
918 simple_xattrs_free(&cgrp->xattrs); 896 simple_xattrs_free(&cgrp->xattrs);
919 897
898 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
920 kfree_rcu(cgrp, rcu_head); 899 kfree_rcu(cgrp, rcu_head);
921 } else { 900 } else {
922 struct cfent *cfe = __d_cfe(dentry); 901 struct cfent *cfe = __d_cfe(dentry);
@@ -987,7 +966,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
987 if (!test_bit(ss->subsys_id, &subsys_mask)) 966 if (!test_bit(ss->subsys_id, &subsys_mask))
988 continue; 967 continue;
989 list_for_each_entry(set, &ss->cftsets, node) 968 list_for_each_entry(set, &ss->cftsets, node)
990 cgroup_rm_file(cgrp, set->cfts); 969 cgroup_addrm_files(cgrp, NULL, set->cfts, false);
991 } 970 }
992 if (base_files) { 971 if (base_files) {
993 while (!list_empty(&cgrp->files)) 972 while (!list_empty(&cgrp->files))
@@ -1015,33 +994,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
1015} 994}
1016 995
1017/* 996/*
1018 * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
1019 * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
1020 * reference to css->refcnt. In general, this refcnt is expected to goes down
1021 * to zero, soon.
1022 *
1023 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
1024 */
1025static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
1026
1027static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
1028{
1029 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
1030 wake_up_all(&cgroup_rmdir_waitq);
1031}
1032
1033void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
1034{
1035 css_get(css);
1036}
1037
1038void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
1039{
1040 cgroup_wakeup_rmdir_waiter(css->cgroup);
1041 css_put(css);
1042}
1043
1044/*
1045 * Call with cgroup_mutex held. Drops reference counts on modules, including 997 * Call with cgroup_mutex held. Drops reference counts on modules, including
1046 * any duplicate ones that parse_cgroupfs_options took. If this function 998 * any duplicate ones that parse_cgroupfs_options took. If this function
1047 * returns an error, no reference counts are touched. 999 * returns an error, no reference counts are touched.
@@ -1150,7 +1102,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1150 seq_puts(seq, ",xattr"); 1102 seq_puts(seq, ",xattr");
1151 if (strlen(root->release_agent_path)) 1103 if (strlen(root->release_agent_path))
1152 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1104 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1153 if (clone_children(&root->top_cgroup)) 1105 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
1154 seq_puts(seq, ",clone_children"); 1106 seq_puts(seq, ",clone_children");
1155 if (strlen(root->name)) 1107 if (strlen(root->name))
1156 seq_printf(seq, ",name=%s", root->name); 1108 seq_printf(seq, ",name=%s", root->name);
@@ -1162,7 +1114,7 @@ struct cgroup_sb_opts {
1162 unsigned long subsys_mask; 1114 unsigned long subsys_mask;
1163 unsigned long flags; 1115 unsigned long flags;
1164 char *release_agent; 1116 char *release_agent;
1165 bool clone_children; 1117 bool cpuset_clone_children;
1166 char *name; 1118 char *name;
1167 /* User explicitly requested empty subsystem */ 1119 /* User explicitly requested empty subsystem */
1168 bool none; 1120 bool none;
@@ -1213,7 +1165,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1213 continue; 1165 continue;
1214 } 1166 }
1215 if (!strcmp(token, "clone_children")) { 1167 if (!strcmp(token, "clone_children")) {
1216 opts->clone_children = true; 1168 opts->cpuset_clone_children = true;
1217 continue; 1169 continue;
1218 } 1170 }
1219 if (!strcmp(token, "xattr")) { 1171 if (!strcmp(token, "xattr")) {
@@ -1381,7 +1333,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1381 if (ret) 1333 if (ret)
1382 goto out_unlock; 1334 goto out_unlock;
1383 1335
1384 /* See feature-removal-schedule.txt */
1385 if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) 1336 if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)
1386 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1337 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1387 task_tgid_nr(current), current->comm); 1338 task_tgid_nr(current), current->comm);
@@ -1397,14 +1348,21 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1397 goto out_unlock; 1348 goto out_unlock;
1398 } 1349 }
1399 1350
1351 /*
1352 * Clear out the files of subsystems that should be removed, do
1353 * this before rebind_subsystems, since rebind_subsystems may
1354 * change this hierarchy's subsys_list.
1355 */
1356 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1357
1400 ret = rebind_subsystems(root, opts.subsys_mask); 1358 ret = rebind_subsystems(root, opts.subsys_mask);
1401 if (ret) { 1359 if (ret) {
1360 /* rebind_subsystems failed, re-populate the removed files */
1361 cgroup_populate_dir(cgrp, false, removed_mask);
1402 drop_parsed_module_refcounts(opts.subsys_mask); 1362 drop_parsed_module_refcounts(opts.subsys_mask);
1403 goto out_unlock; 1363 goto out_unlock;
1404 } 1364 }
1405 1365
1406 /* clear out any existing files and repopulate subsystem files */
1407 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1408 /* re-populate subsystem files */ 1366 /* re-populate subsystem files */
1409 cgroup_populate_dir(cgrp, false, added_mask); 1367 cgroup_populate_dir(cgrp, false, added_mask);
1410 1368
@@ -1432,6 +1390,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1432 INIT_LIST_HEAD(&cgrp->children); 1390 INIT_LIST_HEAD(&cgrp->children);
1433 INIT_LIST_HEAD(&cgrp->files); 1391 INIT_LIST_HEAD(&cgrp->files);
1434 INIT_LIST_HEAD(&cgrp->css_sets); 1392 INIT_LIST_HEAD(&cgrp->css_sets);
1393 INIT_LIST_HEAD(&cgrp->allcg_node);
1435 INIT_LIST_HEAD(&cgrp->release_list); 1394 INIT_LIST_HEAD(&cgrp->release_list);
1436 INIT_LIST_HEAD(&cgrp->pidlists); 1395 INIT_LIST_HEAD(&cgrp->pidlists);
1437 mutex_init(&cgrp->pidlist_mutex); 1396 mutex_init(&cgrp->pidlist_mutex);
@@ -1450,8 +1409,8 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1450 root->number_of_cgroups = 1; 1409 root->number_of_cgroups = 1;
1451 cgrp->root = root; 1410 cgrp->root = root;
1452 cgrp->top_cgroup = cgrp; 1411 cgrp->top_cgroup = cgrp;
1453 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1454 init_cgroup_housekeeping(cgrp); 1412 init_cgroup_housekeeping(cgrp);
1413 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1455} 1414}
1456 1415
1457static bool init_root_id(struct cgroupfs_root *root) 1416static bool init_root_id(struct cgroupfs_root *root)
@@ -1518,12 +1477,13 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1518 1477
1519 root->subsys_mask = opts->subsys_mask; 1478 root->subsys_mask = opts->subsys_mask;
1520 root->flags = opts->flags; 1479 root->flags = opts->flags;
1480 ida_init(&root->cgroup_ida);
1521 if (opts->release_agent) 1481 if (opts->release_agent)
1522 strcpy(root->release_agent_path, opts->release_agent); 1482 strcpy(root->release_agent_path, opts->release_agent);
1523 if (opts->name) 1483 if (opts->name)
1524 strcpy(root->name, opts->name); 1484 strcpy(root->name, opts->name);
1525 if (opts->clone_children) 1485 if (opts->cpuset_clone_children)
1526 set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); 1486 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags);
1527 return root; 1487 return root;
1528} 1488}
1529 1489
@@ -1536,6 +1496,7 @@ static void cgroup_drop_root(struct cgroupfs_root *root)
1536 spin_lock(&hierarchy_id_lock); 1496 spin_lock(&hierarchy_id_lock);
1537 ida_remove(&hierarchy_ida, root->hierarchy_id); 1497 ida_remove(&hierarchy_ida, root->hierarchy_id);
1538 spin_unlock(&hierarchy_id_lock); 1498 spin_unlock(&hierarchy_id_lock);
1499 ida_destroy(&root->cgroup_ida);
1539 kfree(root); 1500 kfree(root);
1540} 1501}
1541 1502
@@ -1701,7 +1662,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1701 1662
1702 free_cg_links(&tmp_cg_links); 1663 free_cg_links(&tmp_cg_links);
1703 1664
1704 BUG_ON(!list_empty(&root_cgrp->sibling));
1705 BUG_ON(!list_empty(&root_cgrp->children)); 1665 BUG_ON(!list_empty(&root_cgrp->children));
1706 BUG_ON(root->number_of_cgroups != 1); 1666 BUG_ON(root->number_of_cgroups != 1);
1707 1667
@@ -1750,7 +1710,6 @@ static void cgroup_kill_sb(struct super_block *sb) {
1750 1710
1751 BUG_ON(root->number_of_cgroups != 1); 1711 BUG_ON(root->number_of_cgroups != 1);
1752 BUG_ON(!list_empty(&cgrp->children)); 1712 BUG_ON(!list_empty(&cgrp->children));
1753 BUG_ON(!list_empty(&cgrp->sibling));
1754 1713
1755 mutex_lock(&cgroup_mutex); 1714 mutex_lock(&cgroup_mutex);
1756 mutex_lock(&cgroup_root_mutex); 1715 mutex_lock(&cgroup_root_mutex);
@@ -1808,9 +1767,11 @@ static struct kobject *cgroup_kobj;
1808 */ 1767 */
1809int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1768int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1810{ 1769{
1770 struct dentry *dentry = cgrp->dentry;
1811 char *start; 1771 char *start;
1812 struct dentry *dentry = rcu_dereference_check(cgrp->dentry, 1772
1813 cgroup_lock_is_held()); 1773 rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
1774 "cgroup_path() called without proper locking");
1814 1775
1815 if (!dentry || cgrp == dummytop) { 1776 if (!dentry || cgrp == dummytop) {
1816 /* 1777 /*
@@ -1821,9 +1782,9 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1821 return 0; 1782 return 0;
1822 } 1783 }
1823 1784
1824 start = buf + buflen; 1785 start = buf + buflen - 1;
1825 1786
1826 *--start = '\0'; 1787 *start = '\0';
1827 for (;;) { 1788 for (;;) {
1828 int len = dentry->d_name.len; 1789 int len = dentry->d_name.len;
1829 1790
@@ -1834,8 +1795,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1834 if (!cgrp) 1795 if (!cgrp)
1835 break; 1796 break;
1836 1797
1837 dentry = rcu_dereference_check(cgrp->dentry, 1798 dentry = cgrp->dentry;
1838 cgroup_lock_is_held());
1839 if (!cgrp->parent) 1799 if (!cgrp->parent)
1840 continue; 1800 continue;
1841 if (--start < buf) 1801 if (--start < buf)
@@ -1930,9 +1890,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1930/* 1890/*
1931 * cgroup_task_migrate - move a task from one cgroup to another. 1891 * cgroup_task_migrate - move a task from one cgroup to another.
1932 * 1892 *
1933 * 'guarantee' is set if the caller promises that a new css_set for the task 1893 * Must be called with cgroup_mutex and threadgroup locked.
1934 * will already exist. If not set, this function might sleep, and can fail with
1935 * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
1936 */ 1894 */
1937static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1895static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1938 struct task_struct *tsk, struct css_set *newcg) 1896 struct task_struct *tsk, struct css_set *newcg)
@@ -2025,12 +1983,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
2025 } 1983 }
2026 1984
2027 synchronize_rcu(); 1985 synchronize_rcu();
2028
2029 /*
2030 * wake up rmdir() waiter. the rmdir should fail since the cgroup
2031 * is no longer empty.
2032 */
2033 cgroup_wakeup_rmdir_waiter(cgrp);
2034out: 1986out:
2035 if (retval) { 1987 if (retval) {
2036 for_each_subsys(root, ss) { 1988 for_each_subsys(root, ss) {
@@ -2200,7 +2152,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2200 * step 5: success! and cleanup 2152 * step 5: success! and cleanup
2201 */ 2153 */
2202 synchronize_rcu(); 2154 synchronize_rcu();
2203 cgroup_wakeup_rmdir_waiter(cgrp);
2204 retval = 0; 2155 retval = 0;
2205out_put_css_set_refs: 2156out_put_css_set_refs:
2206 if (retval) { 2157 if (retval) {
@@ -2711,10 +2662,17 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2711 2662
2712 /* start off with i_nlink == 2 (for "." entry) */ 2663 /* start off with i_nlink == 2 (for "." entry) */
2713 inc_nlink(inode); 2664 inc_nlink(inode);
2665 inc_nlink(dentry->d_parent->d_inode);
2714 2666
2715 /* start with the directory inode held, so that we can 2667 /*
2716 * populate it without racing with another mkdir */ 2668 * Control reaches here with cgroup_mutex held.
2717 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 2669 * @inode->i_mutex should nest outside cgroup_mutex but we
2670 * want to populate it immediately without releasing
2671 * cgroup_mutex. As @inode isn't visible to anyone else
2672 * yet, trylock will always succeed without affecting
2673 * lockdep checks.
2674 */
2675 WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
2718 } else if (S_ISREG(mode)) { 2676 } else if (S_ISREG(mode)) {
2719 inode->i_size = 0; 2677 inode->i_size = 0;
2720 inode->i_fop = &cgroup_file_operations; 2678 inode->i_fop = &cgroup_file_operations;
@@ -2725,32 +2683,6 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2725 return 0; 2683 return 0;
2726} 2684}
2727 2685
2728/*
2729 * cgroup_create_dir - create a directory for an object.
2730 * @cgrp: the cgroup we create the directory for. It must have a valid
2731 * ->parent field. And we are going to fill its ->dentry field.
2732 * @dentry: dentry of the new cgroup
2733 * @mode: mode to set on new directory.
2734 */
2735static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2736 umode_t mode)
2737{
2738 struct dentry *parent;
2739 int error = 0;
2740
2741 parent = cgrp->parent->dentry;
2742 error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
2743 if (!error) {
2744 dentry->d_fsdata = cgrp;
2745 inc_nlink(parent->d_inode);
2746 rcu_assign_pointer(cgrp->dentry, dentry);
2747 dget(dentry);
2748 }
2749 dput(dentry);
2750
2751 return error;
2752}
2753
2754/** 2686/**
2755 * cgroup_file_mode - deduce file mode of a control file 2687 * cgroup_file_mode - deduce file mode of a control file
2756 * @cft: the control file in question 2688 * @cft: the control file in question
@@ -2791,12 +2723,6 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2791 2723
2792 simple_xattrs_init(&cft->xattrs); 2724 simple_xattrs_init(&cft->xattrs);
2793 2725
2794 /* does @cft->flags tell us to skip creation on @cgrp? */
2795 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2796 return 0;
2797 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2798 return 0;
2799
2800 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 2726 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
2801 strcpy(name, subsys->name); 2727 strcpy(name, subsys->name);
2802 strcat(name, "."); 2728 strcat(name, ".");
@@ -2837,6 +2763,12 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2837 int err, ret = 0; 2763 int err, ret = 0;
2838 2764
2839 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2765 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2766 /* does cft->flags tell us to skip this file on @cgrp? */
2767 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2768 continue;
2769 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2770 continue;
2771
2840 if (is_add) 2772 if (is_add)
2841 err = cgroup_add_file(cgrp, subsys, cft); 2773 err = cgroup_add_file(cgrp, subsys, cft);
2842 else 2774 else
@@ -3044,6 +2976,92 @@ static void cgroup_enable_task_cg_lists(void)
3044 write_unlock(&css_set_lock); 2976 write_unlock(&css_set_lock);
3045} 2977}
3046 2978
2979/**
2980 * cgroup_next_descendant_pre - find the next descendant for pre-order walk
2981 * @pos: the current position (%NULL to initiate traversal)
2982 * @cgroup: cgroup whose descendants to walk
2983 *
2984 * To be used by cgroup_for_each_descendant_pre(). Find the next
2985 * descendant to visit for pre-order traversal of @cgroup's descendants.
2986 */
2987struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
2988 struct cgroup *cgroup)
2989{
2990 struct cgroup *next;
2991
2992 WARN_ON_ONCE(!rcu_read_lock_held());
2993
2994 /* if first iteration, pretend we just visited @cgroup */
2995 if (!pos) {
2996 if (list_empty(&cgroup->children))
2997 return NULL;
2998 pos = cgroup;
2999 }
3000
3001 /* visit the first child if exists */
3002 next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
3003 if (next)
3004 return next;
3005
3006 /* no child, visit my or the closest ancestor's next sibling */
3007 do {
3008 next = list_entry_rcu(pos->sibling.next, struct cgroup,
3009 sibling);
3010 if (&next->sibling != &pos->parent->children)
3011 return next;
3012
3013 pos = pos->parent;
3014 } while (pos != cgroup);
3015
3016 return NULL;
3017}
3018EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
3019
3020static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
3021{
3022 struct cgroup *last;
3023
3024 do {
3025 last = pos;
3026 pos = list_first_or_null_rcu(&pos->children, struct cgroup,
3027 sibling);
3028 } while (pos);
3029
3030 return last;
3031}
3032
3033/**
3034 * cgroup_next_descendant_post - find the next descendant for post-order walk
3035 * @pos: the current position (%NULL to initiate traversal)
3036 * @cgroup: cgroup whose descendants to walk
3037 *
3038 * To be used by cgroup_for_each_descendant_post(). Find the next
3039 * descendant to visit for post-order traversal of @cgroup's descendants.
3040 */
3041struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
3042 struct cgroup *cgroup)
3043{
3044 struct cgroup *next;
3045
3046 WARN_ON_ONCE(!rcu_read_lock_held());
3047
3048 /* if first iteration, visit the leftmost descendant */
3049 if (!pos) {
3050 next = cgroup_leftmost_descendant(cgroup);
3051 return next != cgroup ? next : NULL;
3052 }
3053
3054 /* if there's an unvisited sibling, visit its leftmost descendant */
3055 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3056 if (&next->sibling != &pos->parent->children)
3057 return cgroup_leftmost_descendant(next);
3058
3059 /* no sibling left, visit parent */
3060 next = pos->parent;
3061 return next != cgroup ? next : NULL;
3062}
3063EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
3064
3047void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 3065void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
3048 __acquires(css_set_lock) 3066 __acquires(css_set_lock)
3049{ 3067{
@@ -3390,7 +3408,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3390{ 3408{
3391 struct cgroup_pidlist *l; 3409 struct cgroup_pidlist *l;
3392 /* don't need task_nsproxy() if we're looking at ourself */ 3410 /* don't need task_nsproxy() if we're looking at ourself */
3393 struct pid_namespace *ns = current->nsproxy->pid_ns; 3411 struct pid_namespace *ns = task_active_pid_ns(current);
3394 3412
3395 /* 3413 /*
3396 * We can't drop the pidlist_mutex before taking the l->mutex in case 3414 * We can't drop the pidlist_mutex before taking the l->mutex in case
@@ -3757,7 +3775,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3757 if (flags & POLLHUP) { 3775 if (flags & POLLHUP) {
3758 __remove_wait_queue(event->wqh, &event->wait); 3776 __remove_wait_queue(event->wqh, &event->wait);
3759 spin_lock(&cgrp->event_list_lock); 3777 spin_lock(&cgrp->event_list_lock);
3760 list_del(&event->list); 3778 list_del_init(&event->list);
3761 spin_unlock(&cgrp->event_list_lock); 3779 spin_unlock(&cgrp->event_list_lock);
3762 /* 3780 /*
3763 * We are in atomic context, but cgroup_event_remove() may 3781 * We are in atomic context, but cgroup_event_remove() may
@@ -3894,7 +3912,7 @@ fail:
3894static u64 cgroup_clone_children_read(struct cgroup *cgrp, 3912static u64 cgroup_clone_children_read(struct cgroup *cgrp,
3895 struct cftype *cft) 3913 struct cftype *cft)
3896{ 3914{
3897 return clone_children(cgrp); 3915 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3898} 3916}
3899 3917
3900static int cgroup_clone_children_write(struct cgroup *cgrp, 3918static int cgroup_clone_children_write(struct cgroup *cgrp,
@@ -3902,9 +3920,9 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
3902 u64 val) 3920 u64 val)
3903{ 3921{
3904 if (val) 3922 if (val)
3905 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 3923 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3906 else 3924 else
3907 clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 3925 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3908 return 0; 3926 return 0;
3909} 3927}
3910 3928
@@ -4017,19 +4035,57 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
4017 css->flags = 0; 4035 css->flags = 0;
4018 css->id = NULL; 4036 css->id = NULL;
4019 if (cgrp == dummytop) 4037 if (cgrp == dummytop)
4020 set_bit(CSS_ROOT, &css->flags); 4038 css->flags |= CSS_ROOT;
4021 BUG_ON(cgrp->subsys[ss->subsys_id]); 4039 BUG_ON(cgrp->subsys[ss->subsys_id]);
4022 cgrp->subsys[ss->subsys_id] = css; 4040 cgrp->subsys[ss->subsys_id] = css;
4023 4041
4024 /* 4042 /*
4025 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry 4043 * css holds an extra ref to @cgrp->dentry which is put on the last
4026 * which is put on the last css_put(). dput() requires process 4044 * css_put(). dput() requires process context, which css_put() may
4027 * context, which css_put() may be called without. @css->dput_work 4045 * be called without. @css->dput_work will be used to invoke
4028 * will be used to invoke dput() asynchronously from css_put(). 4046 * dput() asynchronously from css_put().
4029 */ 4047 */
4030 INIT_WORK(&css->dput_work, css_dput_fn); 4048 INIT_WORK(&css->dput_work, css_dput_fn);
4031 if (ss->__DEPRECATED_clear_css_refs) 4049}
4032 set_bit(CSS_CLEAR_CSS_REFS, &css->flags); 4050
4051/* invoke ->css_online() on a new CSS and mark it online if successful */
4052static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4053{
4054 int ret = 0;
4055
4056 lockdep_assert_held(&cgroup_mutex);
4057
4058 if (ss->css_online)
4059 ret = ss->css_online(cgrp);
4060 if (!ret)
4061 cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE;
4062 return ret;
4063}
4064
4065/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
4066static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4067 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4068{
4069 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4070
4071 lockdep_assert_held(&cgroup_mutex);
4072
4073 if (!(css->flags & CSS_ONLINE))
4074 return;
4075
4076 /*
4077 * css_offline() should be called with cgroup_mutex unlocked. See
4078 * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
4079 * details. This temporary unlocking should go away once
4080 * cgroup_mutex is unexported from controllers.
4081 */
4082 if (ss->css_offline) {
4083 mutex_unlock(&cgroup_mutex);
4084 ss->css_offline(cgrp);
4085 mutex_lock(&cgroup_mutex);
4086 }
4087
4088 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
4033} 4089}
4034 4090
4035/* 4091/*
@@ -4049,10 +4105,27 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4049 struct cgroup_subsys *ss; 4105 struct cgroup_subsys *ss;
4050 struct super_block *sb = root->sb; 4106 struct super_block *sb = root->sb;
4051 4107
4108 /* allocate the cgroup and its ID, 0 is reserved for the root */
4052 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 4109 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
4053 if (!cgrp) 4110 if (!cgrp)
4054 return -ENOMEM; 4111 return -ENOMEM;
4055 4112
4113 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
4114 if (cgrp->id < 0)
4115 goto err_free_cgrp;
4116
4117 /*
4118 * Only live parents can have children. Note that the liveliness
4119 * check isn't strictly necessary because cgroup_mkdir() and
4120 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
4121 * anyway so that locking is contained inside cgroup proper and we
4122 * don't get nasty surprises if we ever grow another caller.
4123 */
4124 if (!cgroup_lock_live_group(parent)) {
4125 err = -ENODEV;
4126 goto err_free_id;
4127 }
4128
4056 /* Grab a reference on the superblock so the hierarchy doesn't 4129 /* Grab a reference on the superblock so the hierarchy doesn't
4057 * get deleted on unmount if there are child cgroups. This 4130 * get deleted on unmount if there are child cgroups. This
4058 * can be done outside cgroup_mutex, since the sb can't 4131 * can be done outside cgroup_mutex, since the sb can't
@@ -4060,8 +4133,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4060 * fs */ 4133 * fs */
4061 atomic_inc(&sb->s_active); 4134 atomic_inc(&sb->s_active);
4062 4135
4063 mutex_lock(&cgroup_mutex);
4064
4065 init_cgroup_housekeeping(cgrp); 4136 init_cgroup_housekeeping(cgrp);
4066 4137
4067 cgrp->parent = parent; 4138 cgrp->parent = parent;
@@ -4071,26 +4142,51 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4071 if (notify_on_release(parent)) 4142 if (notify_on_release(parent))
4072 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 4143 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
4073 4144
4074 if (clone_children(parent)) 4145 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4075 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 4146 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4076 4147
4077 for_each_subsys(root, ss) { 4148 for_each_subsys(root, ss) {
4078 struct cgroup_subsys_state *css; 4149 struct cgroup_subsys_state *css;
4079 4150
4080 css = ss->create(cgrp); 4151 css = ss->css_alloc(cgrp);
4081 if (IS_ERR(css)) { 4152 if (IS_ERR(css)) {
4082 err = PTR_ERR(css); 4153 err = PTR_ERR(css);
4083 goto err_destroy; 4154 goto err_free_all;
4084 } 4155 }
4085 init_cgroup_css(css, ss, cgrp); 4156 init_cgroup_css(css, ss, cgrp);
4086 if (ss->use_id) { 4157 if (ss->use_id) {
4087 err = alloc_css_id(ss, parent, cgrp); 4158 err = alloc_css_id(ss, parent, cgrp);
4088 if (err) 4159 if (err)
4089 goto err_destroy; 4160 goto err_free_all;
4090 } 4161 }
4091 /* At error, ->destroy() callback has to free assigned ID. */ 4162 }
4092 if (clone_children(parent) && ss->post_clone) 4163
4093 ss->post_clone(cgrp); 4164 /*
4165 * Create directory. cgroup_create_file() returns with the new
4166 * directory locked on success so that it can be populated without
4167 * dropping cgroup_mutex.
4168 */
4169 err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
4170 if (err < 0)
4171 goto err_free_all;
4172 lockdep_assert_held(&dentry->d_inode->i_mutex);
4173
4174 /* allocation complete, commit to creation */
4175 dentry->d_fsdata = cgrp;
4176 cgrp->dentry = dentry;
4177 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4178 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4179 root->number_of_cgroups++;
4180
4181 /* each css holds a ref to the cgroup's dentry */
4182 for_each_subsys(root, ss)
4183 dget(dentry);
4184
4185 /* creation succeeded, notify subsystems */
4186 for_each_subsys(root, ss) {
4187 err = online_css(ss, cgrp);
4188 if (err)
4189 goto err_destroy;
4094 4190
4095 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 4191 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4096 parent->parent) { 4192 parent->parent) {
@@ -4102,50 +4198,34 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4102 } 4198 }
4103 } 4199 }
4104 4200
4105 list_add(&cgrp->sibling, &cgrp->parent->children);
4106 root->number_of_cgroups++;
4107
4108 err = cgroup_create_dir(cgrp, dentry, mode);
4109 if (err < 0)
4110 goto err_remove;
4111
4112 /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
4113 for_each_subsys(root, ss)
4114 if (!ss->__DEPRECATED_clear_css_refs)
4115 dget(dentry);
4116
4117 /* The cgroup directory was pre-locked for us */
4118 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
4119
4120 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4121
4122 err = cgroup_populate_dir(cgrp, true, root->subsys_mask); 4201 err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
4123 /* If err < 0, we have a half-filled directory - oh well ;) */ 4202 if (err)
4203 goto err_destroy;
4124 4204
4125 mutex_unlock(&cgroup_mutex); 4205 mutex_unlock(&cgroup_mutex);
4126 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 4206 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
4127 4207
4128 return 0; 4208 return 0;
4129 4209
4130 err_remove: 4210err_free_all:
4131
4132 list_del(&cgrp->sibling);
4133 root->number_of_cgroups--;
4134
4135 err_destroy:
4136
4137 for_each_subsys(root, ss) { 4211 for_each_subsys(root, ss) {
4138 if (cgrp->subsys[ss->subsys_id]) 4212 if (cgrp->subsys[ss->subsys_id])
4139 ss->destroy(cgrp); 4213 ss->css_free(cgrp);
4140 } 4214 }
4141
4142 mutex_unlock(&cgroup_mutex); 4215 mutex_unlock(&cgroup_mutex);
4143
4144 /* Release the reference count that we took on the superblock */ 4216 /* Release the reference count that we took on the superblock */
4145 deactivate_super(sb); 4217 deactivate_super(sb);
4146 4218err_free_id:
4219 ida_simple_remove(&root->cgroup_ida, cgrp->id);
4220err_free_cgrp:
4147 kfree(cgrp); 4221 kfree(cgrp);
4148 return err; 4222 return err;
4223
4224err_destroy:
4225 cgroup_destroy_locked(cgrp);
4226 mutex_unlock(&cgroup_mutex);
4227 mutex_unlock(&dentry->d_inode->i_mutex);
4228 return err;
4149} 4229}
4150 4230
4151static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 4231static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@ -4197,153 +4277,60 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
4197 return 0; 4277 return 0;
4198} 4278}
4199 4279
4200/* 4280static int cgroup_destroy_locked(struct cgroup *cgrp)
4201 * Atomically mark all (or else none) of the cgroup's CSS objects as 4281 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4202 * CSS_REMOVED. Return true on success, or false if the cgroup has
4203 * busy subsystems. Call with cgroup_mutex held
4204 *
4205 * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
4206 * not, cgroup removal behaves differently.
4207 *
4208 * If clear is set, css refcnt for the subsystem should be zero before
4209 * cgroup removal can be committed. This is implemented by
4210 * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
4211 * called multiple times until all css refcnts reach zero and is allowed to
4212 * veto removal on any invocation. This behavior is deprecated and will be
4213 * removed as soon as the existing user (memcg) is updated.
4214 *
4215 * If clear is not set, each css holds an extra reference to the cgroup's
4216 * dentry and cgroup removal proceeds regardless of css refs.
4217 * ->pre_destroy() will be called at least once and is not allowed to fail.
4218 * On the last put of each css, whenever that may be, the extra dentry ref
4219 * is put so that dentry destruction happens only after all css's are
4220 * released.
4221 */
4222static int cgroup_clear_css_refs(struct cgroup *cgrp)
4223{ 4282{
4283 struct dentry *d = cgrp->dentry;
4284 struct cgroup *parent = cgrp->parent;
4285 DEFINE_WAIT(wait);
4286 struct cgroup_event *event, *tmp;
4224 struct cgroup_subsys *ss; 4287 struct cgroup_subsys *ss;
4225 unsigned long flags; 4288 LIST_HEAD(tmp_list);
4226 bool failed = false; 4289
4290 lockdep_assert_held(&d->d_inode->i_mutex);
4291 lockdep_assert_held(&cgroup_mutex);
4227 4292
4228 local_irq_save(flags); 4293 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children))
4294 return -EBUSY;
4229 4295
4230 /* 4296 /*
4231 * Block new css_tryget() by deactivating refcnt. If all refcnts 4297 * Block new css_tryget() by deactivating refcnt and mark @cgrp
4232 * for subsystems w/ clear_css_refs set were 1 at the moment of 4298 * removed. This makes future css_tryget() and child creation
4233 * deactivation, we succeeded. 4299 * attempts fail thus maintaining the removal conditions verified
4300 * above.
4234 */ 4301 */
4235 for_each_subsys(cgrp->root, ss) { 4302 for_each_subsys(cgrp->root, ss) {
4236 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4303 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4237 4304
4238 WARN_ON(atomic_read(&css->refcnt) < 0); 4305 WARN_ON(atomic_read(&css->refcnt) < 0);
4239 atomic_add(CSS_DEACT_BIAS, &css->refcnt); 4306 atomic_add(CSS_DEACT_BIAS, &css->refcnt);
4240
4241 if (ss->__DEPRECATED_clear_css_refs)
4242 failed |= css_refcnt(css) != 1;
4243 }
4244
4245 /*
4246 * If succeeded, set REMOVED and put all the base refs; otherwise,
4247 * restore refcnts to positive values. Either way, all in-progress
4248 * css_tryget() will be released.
4249 */
4250 for_each_subsys(cgrp->root, ss) {
4251 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4252
4253 if (!failed) {
4254 set_bit(CSS_REMOVED, &css->flags);
4255 css_put(css);
4256 } else {
4257 atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
4258 }
4259 } 4307 }
4308 set_bit(CGRP_REMOVED, &cgrp->flags);
4260 4309
4261 local_irq_restore(flags); 4310 /* tell subsystems to initiate destruction */
4262 return !failed; 4311 for_each_subsys(cgrp->root, ss)
4263} 4312 offline_css(ss, cgrp);
4264
4265static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4266{
4267 struct cgroup *cgrp = dentry->d_fsdata;
4268 struct dentry *d;
4269 struct cgroup *parent;
4270 DEFINE_WAIT(wait);
4271 struct cgroup_event *event, *tmp;
4272 int ret;
4273
4274 /* the vfs holds both inode->i_mutex already */
4275again:
4276 mutex_lock(&cgroup_mutex);
4277 if (atomic_read(&cgrp->count) != 0) {
4278 mutex_unlock(&cgroup_mutex);
4279 return -EBUSY;
4280 }
4281 if (!list_empty(&cgrp->children)) {
4282 mutex_unlock(&cgroup_mutex);
4283 return -EBUSY;
4284 }
4285 mutex_unlock(&cgroup_mutex);
4286
4287 /*
4288 * In general, subsystem has no css->refcnt after pre_destroy(). But
4289 * in racy cases, subsystem may have to get css->refcnt after
4290 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
4291 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
4292 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
4293 * and subsystem's reference count handling. Please see css_get/put
4294 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
4295 */
4296 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4297 4313
4298 /* 4314 /*
4299 * Call pre_destroy handlers of subsys. Notify subsystems 4315 * Put all the base refs. Each css holds an extra reference to the
4300 * that rmdir() request comes. 4316 * cgroup's dentry and cgroup removal proceeds regardless of css
4317 * refs. On the last put of each css, whenever that may be, the
4318 * extra dentry ref is put so that dentry destruction happens only
4319 * after all css's are released.
4301 */ 4320 */
4302 ret = cgroup_call_pre_destroy(cgrp); 4321 for_each_subsys(cgrp->root, ss)
4303 if (ret) { 4322 css_put(cgrp->subsys[ss->subsys_id]);
4304 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4305 return ret;
4306 }
4307
4308 mutex_lock(&cgroup_mutex);
4309 parent = cgrp->parent;
4310 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
4311 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4312 mutex_unlock(&cgroup_mutex);
4313 return -EBUSY;
4314 }
4315 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
4316 if (!cgroup_clear_css_refs(cgrp)) {
4317 mutex_unlock(&cgroup_mutex);
4318 /*
4319 * Because someone may call cgroup_wakeup_rmdir_waiter() before
4320 * prepare_to_wait(), we need to check this flag.
4321 */
4322 if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
4323 schedule();
4324 finish_wait(&cgroup_rmdir_waitq, &wait);
4325 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4326 if (signal_pending(current))
4327 return -EINTR;
4328 goto again;
4329 }
4330 /* NO css_tryget() can success after here. */
4331 finish_wait(&cgroup_rmdir_waitq, &wait);
4332 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4333 4323
4334 raw_spin_lock(&release_list_lock); 4324 raw_spin_lock(&release_list_lock);
4335 set_bit(CGRP_REMOVED, &cgrp->flags);
4336 if (!list_empty(&cgrp->release_list)) 4325 if (!list_empty(&cgrp->release_list))
4337 list_del_init(&cgrp->release_list); 4326 list_del_init(&cgrp->release_list);
4338 raw_spin_unlock(&release_list_lock); 4327 raw_spin_unlock(&release_list_lock);
4339 4328
4340 /* delete this cgroup from parent->children */ 4329 /* delete this cgroup from parent->children */
4341 list_del_init(&cgrp->sibling); 4330 list_del_rcu(&cgrp->sibling);
4342
4343 list_del_init(&cgrp->allcg_node); 4331 list_del_init(&cgrp->allcg_node);
4344 4332
4345 d = dget(cgrp->dentry); 4333 dget(d);
4346
4347 cgroup_d_remove_dir(d); 4334 cgroup_d_remove_dir(d);
4348 dput(d); 4335 dput(d);
4349 4336
@@ -4353,21 +4340,35 @@ again:
4353 /* 4340 /*
4354 * Unregister events and notify userspace. 4341 * Unregister events and notify userspace.
4355 * Notify userspace about cgroup removing only after rmdir of cgroup 4342 * Notify userspace about cgroup removing only after rmdir of cgroup
4356 * directory to avoid race between userspace and kernelspace 4343 * directory to avoid race between userspace and kernelspace. Use
4344 * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
4345 * cgroup_event_wake() is called with the wait queue head locked,
4346 * remove_wait_queue() cannot be called while holding event_list_lock.
4357 */ 4347 */
4358 spin_lock(&cgrp->event_list_lock); 4348 spin_lock(&cgrp->event_list_lock);
4359 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { 4349 list_splice_init(&cgrp->event_list, &tmp_list);
4360 list_del(&event->list); 4350 spin_unlock(&cgrp->event_list_lock);
4351 list_for_each_entry_safe(event, tmp, &tmp_list, list) {
4352 list_del_init(&event->list);
4361 remove_wait_queue(event->wqh, &event->wait); 4353 remove_wait_queue(event->wqh, &event->wait);
4362 eventfd_signal(event->eventfd, 1); 4354 eventfd_signal(event->eventfd, 1);
4363 schedule_work(&event->remove); 4355 schedule_work(&event->remove);
4364 } 4356 }
4365 spin_unlock(&cgrp->event_list_lock);
4366 4357
4367 mutex_unlock(&cgroup_mutex);
4368 return 0; 4358 return 0;
4369} 4359}
4370 4360
4361static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4362{
4363 int ret;
4364
4365 mutex_lock(&cgroup_mutex);
4366 ret = cgroup_destroy_locked(dentry->d_fsdata);
4367 mutex_unlock(&cgroup_mutex);
4368
4369 return ret;
4370}
4371
4371static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) 4372static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4372{ 4373{
4373 INIT_LIST_HEAD(&ss->cftsets); 4374 INIT_LIST_HEAD(&ss->cftsets);
@@ -4388,13 +4389,15 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4388 4389
4389 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4390 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4390 4391
4392 mutex_lock(&cgroup_mutex);
4393
4391 /* init base cftset */ 4394 /* init base cftset */
4392 cgroup_init_cftsets(ss); 4395 cgroup_init_cftsets(ss);
4393 4396
4394 /* Create the top cgroup state for this subsystem */ 4397 /* Create the top cgroup state for this subsystem */
4395 list_add(&ss->sibling, &rootnode.subsys_list); 4398 list_add(&ss->sibling, &rootnode.subsys_list);
4396 ss->root = &rootnode; 4399 ss->root = &rootnode;
4397 css = ss->create(dummytop); 4400 css = ss->css_alloc(dummytop);
4398 /* We don't handle early failures gracefully */ 4401 /* We don't handle early failures gracefully */
4399 BUG_ON(IS_ERR(css)); 4402 BUG_ON(IS_ERR(css));
4400 init_cgroup_css(css, ss, dummytop); 4403 init_cgroup_css(css, ss, dummytop);
@@ -4403,7 +4406,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4403 * pointer to this state - since the subsystem is 4406 * pointer to this state - since the subsystem is
4404 * newly registered, all tasks and hence the 4407 * newly registered, all tasks and hence the
4405 * init_css_set is in the subsystem's top cgroup. */ 4408 * init_css_set is in the subsystem's top cgroup. */
4406 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; 4409 init_css_set.subsys[ss->subsys_id] = css;
4407 4410
4408 need_forkexit_callback |= ss->fork || ss->exit; 4411 need_forkexit_callback |= ss->fork || ss->exit;
4409 4412
@@ -4413,6 +4416,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4413 BUG_ON(!list_empty(&init_task.tasks)); 4416 BUG_ON(!list_empty(&init_task.tasks));
4414 4417
4415 ss->active = 1; 4418 ss->active = 1;
4419 BUG_ON(online_css(ss, dummytop));
4420
4421 mutex_unlock(&cgroup_mutex);
4416 4422
4417 /* this function shouldn't be used with modular subsystems, since they 4423 /* this function shouldn't be used with modular subsystems, since they
4418 * need to register a subsys_id, among other things */ 4424 * need to register a subsys_id, among other things */
@@ -4430,12 +4436,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4430 */ 4436 */
4431int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) 4437int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4432{ 4438{
4433 int i;
4434 struct cgroup_subsys_state *css; 4439 struct cgroup_subsys_state *css;
4440 int i, ret;
4435 4441
4436 /* check name and function validity */ 4442 /* check name and function validity */
4437 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || 4443 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
4438 ss->create == NULL || ss->destroy == NULL) 4444 ss->css_alloc == NULL || ss->css_free == NULL)
4439 return -EINVAL; 4445 return -EINVAL;
4440 4446
4441 /* 4447 /*
@@ -4464,10 +4470,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4464 subsys[ss->subsys_id] = ss; 4470 subsys[ss->subsys_id] = ss;
4465 4471
4466 /* 4472 /*
4467 * no ss->create seems to need anything important in the ss struct, so 4473 * no ss->css_alloc seems to need anything important in the ss
4468 * this can happen first (i.e. before the rootnode attachment). 4474 * struct, so this can happen first (i.e. before the rootnode
4475 * attachment).
4469 */ 4476 */
4470 css = ss->create(dummytop); 4477 css = ss->css_alloc(dummytop);
4471 if (IS_ERR(css)) { 4478 if (IS_ERR(css)) {
4472 /* failure case - need to deassign the subsys[] slot. */ 4479 /* failure case - need to deassign the subsys[] slot. */
4473 subsys[ss->subsys_id] = NULL; 4480 subsys[ss->subsys_id] = NULL;
@@ -4482,14 +4489,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4482 init_cgroup_css(css, ss, dummytop); 4489 init_cgroup_css(css, ss, dummytop);
4483 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4490 /* init_idr must be after init_cgroup_css because it sets css->id. */
4484 if (ss->use_id) { 4491 if (ss->use_id) {
4485 int ret = cgroup_init_idr(ss, css); 4492 ret = cgroup_init_idr(ss, css);
4486 if (ret) { 4493 if (ret)
4487 dummytop->subsys[ss->subsys_id] = NULL; 4494 goto err_unload;
4488 ss->destroy(dummytop);
4489 subsys[ss->subsys_id] = NULL;
4490 mutex_unlock(&cgroup_mutex);
4491 return ret;
4492 }
4493 } 4495 }
4494 4496
4495 /* 4497 /*
@@ -4522,10 +4524,19 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4522 write_unlock(&css_set_lock); 4524 write_unlock(&css_set_lock);
4523 4525
4524 ss->active = 1; 4526 ss->active = 1;
4527 ret = online_css(ss, dummytop);
4528 if (ret)
4529 goto err_unload;
4525 4530
4526 /* success! */ 4531 /* success! */
4527 mutex_unlock(&cgroup_mutex); 4532 mutex_unlock(&cgroup_mutex);
4528 return 0; 4533 return 0;
4534
4535err_unload:
4536 mutex_unlock(&cgroup_mutex);
4537 /* @ss can't be mounted here as try_module_get() would fail */
4538 cgroup_unload_subsys(ss);
4539 return ret;
4529} 4540}
4530EXPORT_SYMBOL_GPL(cgroup_load_subsys); 4541EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4531 4542
@@ -4552,6 +4563,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4552 BUG_ON(ss->root != &rootnode); 4563 BUG_ON(ss->root != &rootnode);
4553 4564
4554 mutex_lock(&cgroup_mutex); 4565 mutex_lock(&cgroup_mutex);
4566
4567 offline_css(ss, dummytop);
4568 ss->active = 0;
4569
4570 if (ss->use_id) {
4571 idr_remove_all(&ss->idr);
4572 idr_destroy(&ss->idr);
4573 }
4574
4555 /* deassign the subsys_id */ 4575 /* deassign the subsys_id */
4556 subsys[ss->subsys_id] = NULL; 4576 subsys[ss->subsys_id] = NULL;
4557 4577
@@ -4567,7 +4587,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4567 struct css_set *cg = link->cg; 4587 struct css_set *cg = link->cg;
4568 4588
4569 hlist_del(&cg->hlist); 4589 hlist_del(&cg->hlist);
4570 BUG_ON(!cg->subsys[ss->subsys_id]);
4571 cg->subsys[ss->subsys_id] = NULL; 4590 cg->subsys[ss->subsys_id] = NULL;
4572 hhead = css_set_hash(cg->subsys); 4591 hhead = css_set_hash(cg->subsys);
4573 hlist_add_head(&cg->hlist, hhead); 4592 hlist_add_head(&cg->hlist, hhead);
@@ -4575,12 +4594,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4575 write_unlock(&css_set_lock); 4594 write_unlock(&css_set_lock);
4576 4595
4577 /* 4596 /*
4578 * remove subsystem's css from the dummytop and free it - need to free 4597 * remove subsystem's css from the dummytop and free it - need to
4579 * before marking as null because ss->destroy needs the cgrp->subsys 4598 * free before marking as null because ss->css_free needs the
4580 * pointer to find their state. note that this also takes care of 4599 * cgrp->subsys pointer to find their state. note that this also
4581 * freeing the css_id. 4600 * takes care of freeing the css_id.
4582 */ 4601 */
4583 ss->destroy(dummytop); 4602 ss->css_free(dummytop);
4584 dummytop->subsys[ss->subsys_id] = NULL; 4603 dummytop->subsys[ss->subsys_id] = NULL;
4585 4604
4586 mutex_unlock(&cgroup_mutex); 4605 mutex_unlock(&cgroup_mutex);
@@ -4624,8 +4643,8 @@ int __init cgroup_init_early(void)
4624 4643
4625 BUG_ON(!ss->name); 4644 BUG_ON(!ss->name);
4626 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4645 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
4627 BUG_ON(!ss->create); 4646 BUG_ON(!ss->css_alloc);
4628 BUG_ON(!ss->destroy); 4647 BUG_ON(!ss->css_free);
4629 if (ss->subsys_id != i) { 4648 if (ss->subsys_id != i) {
4630 printk(KERN_ERR "cgroup: Subsys %s id == %d\n", 4649 printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
4631 ss->name, ss->subsys_id); 4650 ss->name, ss->subsys_id);
@@ -4832,44 +4851,19 @@ void cgroup_fork(struct task_struct *child)
4832} 4851}
4833 4852
4834/** 4853/**
4835 * cgroup_fork_callbacks - run fork callbacks
4836 * @child: the new task
4837 *
4838 * Called on a new task very soon before adding it to the
4839 * tasklist. No need to take any locks since no-one can
4840 * be operating on this task.
4841 */
4842void cgroup_fork_callbacks(struct task_struct *child)
4843{
4844 if (need_forkexit_callback) {
4845 int i;
4846 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4847 struct cgroup_subsys *ss = subsys[i];
4848
4849 /*
4850 * forkexit callbacks are only supported for
4851 * builtin subsystems.
4852 */
4853 if (!ss || ss->module)
4854 continue;
4855
4856 if (ss->fork)
4857 ss->fork(child);
4858 }
4859 }
4860}
4861
4862/**
4863 * cgroup_post_fork - called on a new task after adding it to the task list 4854 * cgroup_post_fork - called on a new task after adding it to the task list
4864 * @child: the task in question 4855 * @child: the task in question
4865 * 4856 *
4866 * Adds the task to the list running through its css_set if necessary. 4857 * Adds the task to the list running through its css_set if necessary and
4867 * Has to be after the task is visible on the task list in case we race 4858 * calls the subsystem fork() callbacks. Has to be after the task is
4868 * with the first call to cgroup_iter_start() - to guarantee that the 4859 * visible on the task list in case we race with the first call to
4869 * new task ends up on its list. 4860 * cgroup_iter_start() - to guarantee that the new task ends up on its
4861 * list.
4870 */ 4862 */
4871void cgroup_post_fork(struct task_struct *child) 4863void cgroup_post_fork(struct task_struct *child)
4872{ 4864{
4865 int i;
4866
4873 /* 4867 /*
4874 * use_task_css_set_links is set to 1 before we walk the tasklist 4868 * use_task_css_set_links is set to 1 before we walk the tasklist
4875 * under the tasklist_lock and we read it here after we added the child 4869 * under the tasklist_lock and we read it here after we added the child
@@ -4889,7 +4883,30 @@ void cgroup_post_fork(struct task_struct *child)
4889 task_unlock(child); 4883 task_unlock(child);
4890 write_unlock(&css_set_lock); 4884 write_unlock(&css_set_lock);
4891 } 4885 }
4886
4887 /*
4888 * Call ss->fork(). This must happen after @child is linked on
4889 * css_set; otherwise, @child might change state between ->fork()
4890 * and addition to css_set.
4891 */
4892 if (need_forkexit_callback) {
4893 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4894 struct cgroup_subsys *ss = subsys[i];
4895
4896 /*
4897 * fork/exit callbacks are supported only for
4898 * builtin subsystems and we don't need further
4899 * synchronization as they never go away.
4900 */
4901 if (!ss || ss->module)
4902 continue;
4903
4904 if (ss->fork)
4905 ss->fork(child);
4906 }
4907 }
4892} 4908}
4909
4893/** 4910/**
4894 * cgroup_exit - detach cgroup from exiting task 4911 * cgroup_exit - detach cgroup from exiting task
4895 * @tsk: pointer to task_struct of exiting process 4912 * @tsk: pointer to task_struct of exiting process
@@ -5022,15 +5039,17 @@ static void check_for_release(struct cgroup *cgrp)
5022/* Caller must verify that the css is not for root cgroup */ 5039/* Caller must verify that the css is not for root cgroup */
5023bool __css_tryget(struct cgroup_subsys_state *css) 5040bool __css_tryget(struct cgroup_subsys_state *css)
5024{ 5041{
5025 do { 5042 while (true) {
5026 int v = css_refcnt(css); 5043 int t, v;
5027 5044
5028 if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) 5045 v = css_refcnt(css);
5046 t = atomic_cmpxchg(&css->refcnt, v, v + 1);
5047 if (likely(t == v))
5029 return true; 5048 return true;
5049 else if (t < 0)
5050 return false;
5030 cpu_relax(); 5051 cpu_relax();
5031 } while (!test_bit(CSS_REMOVED, &css->flags)); 5052 }
5032
5033 return false;
5034} 5053}
5035EXPORT_SYMBOL_GPL(__css_tryget); 5054EXPORT_SYMBOL_GPL(__css_tryget);
5036 5055
@@ -5049,11 +5068,9 @@ void __css_put(struct cgroup_subsys_state *css)
5049 set_bit(CGRP_RELEASABLE, &cgrp->flags); 5068 set_bit(CGRP_RELEASABLE, &cgrp->flags);
5050 check_for_release(cgrp); 5069 check_for_release(cgrp);
5051 } 5070 }
5052 cgroup_wakeup_rmdir_waiter(cgrp);
5053 break; 5071 break;
5054 case 0: 5072 case 0:
5055 if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) 5073 schedule_work(&css->dput_work);
5056 schedule_work(&css->dput_work);
5057 break; 5074 break;
5058 } 5075 }
5059 rcu_read_unlock(); 5076 rcu_read_unlock();
@@ -5439,7 +5456,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5439} 5456}
5440 5457
5441#ifdef CONFIG_CGROUP_DEBUG 5458#ifdef CONFIG_CGROUP_DEBUG
5442static struct cgroup_subsys_state *debug_create(struct cgroup *cont) 5459static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
5443{ 5460{
5444 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5461 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5445 5462
@@ -5449,7 +5466,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
5449 return css; 5466 return css;
5450} 5467}
5451 5468
5452static void debug_destroy(struct cgroup *cont) 5469static void debug_css_free(struct cgroup *cont)
5453{ 5470{
5454 kfree(cont->subsys[debug_subsys_id]); 5471 kfree(cont->subsys[debug_subsys_id]);
5455} 5472}
@@ -5578,8 +5595,8 @@ static struct cftype debug_files[] = {
5578 5595
5579struct cgroup_subsys debug_subsys = { 5596struct cgroup_subsys debug_subsys = {
5580 .name = "debug", 5597 .name = "debug",
5581 .create = debug_create, 5598 .css_alloc = debug_css_alloc,
5582 .destroy = debug_destroy, 5599 .css_free = debug_css_free,
5583 .subsys_id = debug_subsys_id, 5600 .subsys_id = debug_subsys_id,
5584 .base_cftypes = debug_files, 5601 .base_cftypes = debug_files,
5585}; 5602};
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index b1724ce98981..75dda1ea5026 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -22,24 +22,33 @@
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/seq_file.h> 23#include <linux/seq_file.h>
24 24
25enum freezer_state { 25/*
26 CGROUP_THAWED = 0, 26 * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
27 CGROUP_FREEZING, 27 * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared
28 CGROUP_FROZEN, 28 * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING
29 * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of
30 * its ancestors has FREEZING_SELF set.
31 */
32enum freezer_state_flags {
33 CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */
34 CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */
35 CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */
36 CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */
37
38 /* mask for all FREEZING flags */
39 CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
29}; 40};
30 41
31struct freezer { 42struct freezer {
32 struct cgroup_subsys_state css; 43 struct cgroup_subsys_state css;
33 enum freezer_state state; 44 unsigned int state;
34 spinlock_t lock; /* protects _writes_ to state */ 45 spinlock_t lock;
35}; 46};
36 47
37static inline struct freezer *cgroup_freezer( 48static inline struct freezer *cgroup_freezer(struct cgroup *cgroup)
38 struct cgroup *cgroup)
39{ 49{
40 return container_of( 50 return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id),
41 cgroup_subsys_state(cgroup, freezer_subsys_id), 51 struct freezer, css);
42 struct freezer, css);
43} 52}
44 53
45static inline struct freezer *task_freezer(struct task_struct *task) 54static inline struct freezer *task_freezer(struct task_struct *task)
@@ -48,14 +57,21 @@ static inline struct freezer *task_freezer(struct task_struct *task)
48 struct freezer, css); 57 struct freezer, css);
49} 58}
50 59
60static struct freezer *parent_freezer(struct freezer *freezer)
61{
62 struct cgroup *pcg = freezer->css.cgroup->parent;
63
64 if (pcg)
65 return cgroup_freezer(pcg);
66 return NULL;
67}
68
51bool cgroup_freezing(struct task_struct *task) 69bool cgroup_freezing(struct task_struct *task)
52{ 70{
53 enum freezer_state state;
54 bool ret; 71 bool ret;
55 72
56 rcu_read_lock(); 73 rcu_read_lock();
57 state = task_freezer(task)->state; 74 ret = task_freezer(task)->state & CGROUP_FREEZING;
58 ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN;
59 rcu_read_unlock(); 75 rcu_read_unlock();
60 76
61 return ret; 77 return ret;
@@ -65,70 +81,18 @@ bool cgroup_freezing(struct task_struct *task)
65 * cgroups_write_string() limits the size of freezer state strings to 81 * cgroups_write_string() limits the size of freezer state strings to
66 * CGROUP_LOCAL_BUFFER_SIZE 82 * CGROUP_LOCAL_BUFFER_SIZE
67 */ 83 */
68static const char *freezer_state_strs[] = { 84static const char *freezer_state_strs(unsigned int state)
69 "THAWED", 85{
70 "FREEZING", 86 if (state & CGROUP_FROZEN)
71 "FROZEN", 87 return "FROZEN";
88 if (state & CGROUP_FREEZING)
89 return "FREEZING";
90 return "THAWED";
72}; 91};
73 92
74/*
75 * State diagram
76 * Transitions are caused by userspace writes to the freezer.state file.
77 * The values in parenthesis are state labels. The rest are edge labels.
78 *
79 * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
80 * ^ ^ | |
81 * | \_______THAWED_______/ |
82 * \__________________________THAWED____________/
83 */
84
85struct cgroup_subsys freezer_subsys; 93struct cgroup_subsys freezer_subsys;
86 94
87/* Locks taken and their ordering 95static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup)
88 * ------------------------------
89 * cgroup_mutex (AKA cgroup_lock)
90 * freezer->lock
91 * css_set_lock
92 * task->alloc_lock (AKA task_lock)
93 * task->sighand->siglock
94 *
95 * cgroup code forces css_set_lock to be taken before task->alloc_lock
96 *
97 * freezer_create(), freezer_destroy():
98 * cgroup_mutex [ by cgroup core ]
99 *
100 * freezer_can_attach():
101 * cgroup_mutex (held by caller of can_attach)
102 *
103 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
104 * freezer->lock
105 * sighand->siglock (if the cgroup is freezing)
106 *
107 * freezer_read():
108 * cgroup_mutex
109 * freezer->lock
110 * write_lock css_set_lock (cgroup iterator start)
111 * task->alloc_lock
112 * read_lock css_set_lock (cgroup iterator start)
113 *
114 * freezer_write() (freeze):
115 * cgroup_mutex
116 * freezer->lock
117 * write_lock css_set_lock (cgroup iterator start)
118 * task->alloc_lock
119 * read_lock css_set_lock (cgroup iterator start)
120 * sighand->siglock (fake signal delivery inside freeze_task())
121 *
122 * freezer_write() (unfreeze):
123 * cgroup_mutex
124 * freezer->lock
125 * write_lock css_set_lock (cgroup iterator start)
126 * task->alloc_lock
127 * read_lock css_set_lock (cgroup iterator start)
128 * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
129 * sighand->siglock
130 */
131static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
132{ 96{
133 struct freezer *freezer; 97 struct freezer *freezer;
134 98
@@ -137,160 +101,244 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
137 return ERR_PTR(-ENOMEM); 101 return ERR_PTR(-ENOMEM);
138 102
139 spin_lock_init(&freezer->lock); 103 spin_lock_init(&freezer->lock);
140 freezer->state = CGROUP_THAWED;
141 return &freezer->css; 104 return &freezer->css;
142} 105}
143 106
144static void freezer_destroy(struct cgroup *cgroup) 107/**
108 * freezer_css_online - commit creation of a freezer cgroup
109 * @cgroup: cgroup being created
110 *
111 * We're committing to creation of @cgroup. Mark it online and inherit
112 * parent's freezing state while holding both parent's and our
113 * freezer->lock.
114 */
115static int freezer_css_online(struct cgroup *cgroup)
116{
117 struct freezer *freezer = cgroup_freezer(cgroup);
118 struct freezer *parent = parent_freezer(freezer);
119
120 /*
121 * The following double locking and freezing state inheritance
122 * guarantee that @cgroup can never escape ancestors' freezing
123 * states. See cgroup_for_each_descendant_pre() for details.
124 */
125 if (parent)
126 spin_lock_irq(&parent->lock);
127 spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING);
128
129 freezer->state |= CGROUP_FREEZER_ONLINE;
130
131 if (parent && (parent->state & CGROUP_FREEZING)) {
132 freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
133 atomic_inc(&system_freezing_cnt);
134 }
135
136 spin_unlock(&freezer->lock);
137 if (parent)
138 spin_unlock_irq(&parent->lock);
139
140 return 0;
141}
142
143/**
144 * freezer_css_offline - initiate destruction of @cgroup
145 * @cgroup: cgroup being destroyed
146 *
147 * @cgroup is going away. Mark it dead and decrement system_freezing_cnt
148 * if it was holding one.
149 */
150static void freezer_css_offline(struct cgroup *cgroup)
145{ 151{
146 struct freezer *freezer = cgroup_freezer(cgroup); 152 struct freezer *freezer = cgroup_freezer(cgroup);
147 153
148 if (freezer->state != CGROUP_THAWED) 154 spin_lock_irq(&freezer->lock);
155
156 if (freezer->state & CGROUP_FREEZING)
149 atomic_dec(&system_freezing_cnt); 157 atomic_dec(&system_freezing_cnt);
150 kfree(freezer); 158
159 freezer->state = 0;
160
161 spin_unlock_irq(&freezer->lock);
151} 162}
152 163
153/* task is frozen or will freeze immediately when next it gets woken */ 164static void freezer_css_free(struct cgroup *cgroup)
154static bool is_task_frozen_enough(struct task_struct *task)
155{ 165{
156 return frozen(task) || 166 kfree(cgroup_freezer(cgroup));
157 (task_is_stopped_or_traced(task) && freezing(task));
158} 167}
159 168
160/* 169/*
161 * The call to cgroup_lock() in the freezer.state write method prevents 170 * Tasks can be migrated into a different freezer anytime regardless of its
162 * a write to that file racing against an attach, and hence the 171 * current state. freezer_attach() is responsible for making new tasks
163 * can_attach() result will remain valid until the attach completes. 172 * conform to the current state.
173 *
174 * Freezer state changes and task migration are synchronized via
175 * @freezer->lock. freezer_attach() makes the new tasks conform to the
176 * current state and all following state changes can see the new tasks.
164 */ 177 */
165static int freezer_can_attach(struct cgroup *new_cgroup, 178static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset)
166 struct cgroup_taskset *tset)
167{ 179{
168 struct freezer *freezer; 180 struct freezer *freezer = cgroup_freezer(new_cgrp);
169 struct task_struct *task; 181 struct task_struct *task;
182 bool clear_frozen = false;
183
184 spin_lock_irq(&freezer->lock);
170 185
171 /* 186 /*
172 * Anything frozen can't move or be moved to/from. 187 * Make the new tasks conform to the current state of @new_cgrp.
188 * For simplicity, when migrating any task to a FROZEN cgroup, we
189 * revert it to FREEZING and let update_if_frozen() determine the
190 * correct state later.
191 *
192 * Tasks in @tset are on @new_cgrp but may not conform to its
193 * current state before executing the following - !frozen tasks may
194 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
173 */ 195 */
174 cgroup_taskset_for_each(task, new_cgroup, tset) 196 cgroup_taskset_for_each(task, new_cgrp, tset) {
175 if (cgroup_freezing(task)) 197 if (!(freezer->state & CGROUP_FREEZING)) {
176 return -EBUSY; 198 __thaw_task(task);
199 } else {
200 freeze_task(task);
201 freezer->state &= ~CGROUP_FROZEN;
202 clear_frozen = true;
203 }
204 }
177 205
178 freezer = cgroup_freezer(new_cgroup); 206 spin_unlock_irq(&freezer->lock);
179 if (freezer->state != CGROUP_THAWED)
180 return -EBUSY;
181 207
182 return 0; 208 /*
209 * Propagate FROZEN clearing upwards. We may race with
210 * update_if_frozen(), but as long as both work bottom-up, either
211 * update_if_frozen() sees child's FROZEN cleared or we clear the
212 * parent's FROZEN later. No parent w/ !FROZEN children can be
213 * left FROZEN.
214 */
215 while (clear_frozen && (freezer = parent_freezer(freezer))) {
216 spin_lock_irq(&freezer->lock);
217 freezer->state &= ~CGROUP_FROZEN;
218 clear_frozen = freezer->state & CGROUP_FREEZING;
219 spin_unlock_irq(&freezer->lock);
220 }
183} 221}
184 222
185static void freezer_fork(struct task_struct *task) 223static void freezer_fork(struct task_struct *task)
186{ 224{
187 struct freezer *freezer; 225 struct freezer *freezer;
188 226
189 /*
190 * No lock is needed, since the task isn't on tasklist yet,
191 * so it can't be moved to another cgroup, which means the
192 * freezer won't be removed and will be valid during this
193 * function call. Nevertheless, apply RCU read-side critical
194 * section to suppress RCU lockdep false positives.
195 */
196 rcu_read_lock(); 227 rcu_read_lock();
197 freezer = task_freezer(task); 228 freezer = task_freezer(task);
198 rcu_read_unlock();
199 229
200 /* 230 /*
201 * The root cgroup is non-freezable, so we can skip the 231 * The root cgroup is non-freezable, so we can skip the
202 * following check. 232 * following check.
203 */ 233 */
204 if (!freezer->css.cgroup->parent) 234 if (!freezer->css.cgroup->parent)
205 return; 235 goto out;
206 236
207 spin_lock_irq(&freezer->lock); 237 spin_lock_irq(&freezer->lock);
208 BUG_ON(freezer->state == CGROUP_FROZEN); 238 if (freezer->state & CGROUP_FREEZING)
209
210 /* Locking avoids race with FREEZING -> THAWED transitions. */
211 if (freezer->state == CGROUP_FREEZING)
212 freeze_task(task); 239 freeze_task(task);
213 spin_unlock_irq(&freezer->lock); 240 spin_unlock_irq(&freezer->lock);
241out:
242 rcu_read_unlock();
214} 243}
215 244
216/* 245/**
217 * caller must hold freezer->lock 246 * update_if_frozen - update whether a cgroup finished freezing
247 * @cgroup: cgroup of interest
248 *
249 * Once FREEZING is initiated, transition to FROZEN is lazily updated by
250 * calling this function. If the current state is FREEZING but not FROZEN,
251 * this function checks whether all tasks of this cgroup and the descendant
252 * cgroups finished freezing and, if so, sets FROZEN.
253 *
254 * The caller is responsible for grabbing RCU read lock and calling
255 * update_if_frozen() on all descendants prior to invoking this function.
256 *
257 * Task states and freezer state might disagree while tasks are being
258 * migrated into or out of @cgroup, so we can't verify task states against
259 * @freezer state here. See freezer_attach() for details.
218 */ 260 */
219static void update_if_frozen(struct cgroup *cgroup, 261static void update_if_frozen(struct cgroup *cgroup)
220 struct freezer *freezer)
221{ 262{
263 struct freezer *freezer = cgroup_freezer(cgroup);
264 struct cgroup *pos;
222 struct cgroup_iter it; 265 struct cgroup_iter it;
223 struct task_struct *task; 266 struct task_struct *task;
224 unsigned int nfrozen = 0, ntotal = 0;
225 enum freezer_state old_state = freezer->state;
226 267
227 cgroup_iter_start(cgroup, &it); 268 WARN_ON_ONCE(!rcu_read_lock_held());
228 while ((task = cgroup_iter_next(cgroup, &it))) { 269
229 ntotal++; 270 spin_lock_irq(&freezer->lock);
230 if (freezing(task) && is_task_frozen_enough(task)) 271
231 nfrozen++; 272 if (!(freezer->state & CGROUP_FREEZING) ||
273 (freezer->state & CGROUP_FROZEN))
274 goto out_unlock;
275
276 /* are all (live) children frozen? */
277 cgroup_for_each_child(pos, cgroup) {
278 struct freezer *child = cgroup_freezer(pos);
279
280 if ((child->state & CGROUP_FREEZER_ONLINE) &&
281 !(child->state & CGROUP_FROZEN))
282 goto out_unlock;
232 } 283 }
233 284
234 if (old_state == CGROUP_THAWED) { 285 /* are all tasks frozen? */
235 BUG_ON(nfrozen > 0); 286 cgroup_iter_start(cgroup, &it);
236 } else if (old_state == CGROUP_FREEZING) { 287
237 if (nfrozen == ntotal) 288 while ((task = cgroup_iter_next(cgroup, &it))) {
238 freezer->state = CGROUP_FROZEN; 289 if (freezing(task)) {
239 } else { /* old_state == CGROUP_FROZEN */ 290 /*
240 BUG_ON(nfrozen != ntotal); 291 * freezer_should_skip() indicates that the task
292 * should be skipped when determining freezing
293 * completion. Consider it frozen in addition to
294 * the usual frozen condition.
295 */
296 if (!frozen(task) && !freezer_should_skip(task))
297 goto out_iter_end;
298 }
241 } 299 }
242 300
301 freezer->state |= CGROUP_FROZEN;
302out_iter_end:
243 cgroup_iter_end(cgroup, &it); 303 cgroup_iter_end(cgroup, &it);
304out_unlock:
305 spin_unlock_irq(&freezer->lock);
244} 306}
245 307
246static int freezer_read(struct cgroup *cgroup, struct cftype *cft, 308static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
247 struct seq_file *m) 309 struct seq_file *m)
248{ 310{
249 struct freezer *freezer; 311 struct cgroup *pos;
250 enum freezer_state state;
251 312
252 if (!cgroup_lock_live_group(cgroup)) 313 rcu_read_lock();
253 return -ENODEV;
254 314
255 freezer = cgroup_freezer(cgroup); 315 /* update states bottom-up */
256 spin_lock_irq(&freezer->lock); 316 cgroup_for_each_descendant_post(pos, cgroup)
257 state = freezer->state; 317 update_if_frozen(pos);
258 if (state == CGROUP_FREEZING) { 318 update_if_frozen(cgroup);
259 /* We change from FREEZING to FROZEN lazily if the cgroup was 319
260 * only partially frozen when we exitted write. */ 320 rcu_read_unlock();
261 update_if_frozen(cgroup, freezer);
262 state = freezer->state;
263 }
264 spin_unlock_irq(&freezer->lock);
265 cgroup_unlock();
266 321
267 seq_puts(m, freezer_state_strs[state]); 322 seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state));
268 seq_putc(m, '\n'); 323 seq_putc(m, '\n');
269 return 0; 324 return 0;
270} 325}
271 326
272static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) 327static void freeze_cgroup(struct freezer *freezer)
273{ 328{
329 struct cgroup *cgroup = freezer->css.cgroup;
274 struct cgroup_iter it; 330 struct cgroup_iter it;
275 struct task_struct *task; 331 struct task_struct *task;
276 unsigned int num_cant_freeze_now = 0;
277 332
278 cgroup_iter_start(cgroup, &it); 333 cgroup_iter_start(cgroup, &it);
279 while ((task = cgroup_iter_next(cgroup, &it))) { 334 while ((task = cgroup_iter_next(cgroup, &it)))
280 if (!freeze_task(task)) 335 freeze_task(task);
281 continue;
282 if (is_task_frozen_enough(task))
283 continue;
284 if (!freezing(task) && !freezer_should_skip(task))
285 num_cant_freeze_now++;
286 }
287 cgroup_iter_end(cgroup, &it); 336 cgroup_iter_end(cgroup, &it);
288
289 return num_cant_freeze_now ? -EBUSY : 0;
290} 337}
291 338
292static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) 339static void unfreeze_cgroup(struct freezer *freezer)
293{ 340{
341 struct cgroup *cgroup = freezer->css.cgroup;
294 struct cgroup_iter it; 342 struct cgroup_iter it;
295 struct task_struct *task; 343 struct task_struct *task;
296 344
@@ -300,59 +348,111 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
300 cgroup_iter_end(cgroup, &it); 348 cgroup_iter_end(cgroup, &it);
301} 349}
302 350
303static int freezer_change_state(struct cgroup *cgroup, 351/**
304 enum freezer_state goal_state) 352 * freezer_apply_state - apply state change to a single cgroup_freezer
353 * @freezer: freezer to apply state change to
354 * @freeze: whether to freeze or unfreeze
355 * @state: CGROUP_FREEZING_* flag to set or clear
356 *
357 * Set or clear @state on @freezer according to @freeze, and perform
358 * freezing or thawing as necessary.
359 */
360static void freezer_apply_state(struct freezer *freezer, bool freeze,
361 unsigned int state)
305{ 362{
306 struct freezer *freezer; 363 /* also synchronizes against task migration, see freezer_attach() */
307 int retval = 0; 364 lockdep_assert_held(&freezer->lock);
308
309 freezer = cgroup_freezer(cgroup);
310 365
311 spin_lock_irq(&freezer->lock); 366 if (!(freezer->state & CGROUP_FREEZER_ONLINE))
367 return;
312 368
313 update_if_frozen(cgroup, freezer); 369 if (freeze) {
314 370 if (!(freezer->state & CGROUP_FREEZING))
315 switch (goal_state) {
316 case CGROUP_THAWED:
317 if (freezer->state != CGROUP_THAWED)
318 atomic_dec(&system_freezing_cnt);
319 freezer->state = CGROUP_THAWED;
320 unfreeze_cgroup(cgroup, freezer);
321 break;
322 case CGROUP_FROZEN:
323 if (freezer->state == CGROUP_THAWED)
324 atomic_inc(&system_freezing_cnt); 371 atomic_inc(&system_freezing_cnt);
325 freezer->state = CGROUP_FREEZING; 372 freezer->state |= state;
326 retval = try_to_freeze_cgroup(cgroup, freezer); 373 freeze_cgroup(freezer);
327 break; 374 } else {
328 default: 375 bool was_freezing = freezer->state & CGROUP_FREEZING;
329 BUG(); 376
377 freezer->state &= ~state;
378
379 if (!(freezer->state & CGROUP_FREEZING)) {
380 if (was_freezing)
381 atomic_dec(&system_freezing_cnt);
382 freezer->state &= ~CGROUP_FROZEN;
383 unfreeze_cgroup(freezer);
384 }
330 } 385 }
386}
331 387
388/**
389 * freezer_change_state - change the freezing state of a cgroup_freezer
390 * @freezer: freezer of interest
391 * @freeze: whether to freeze or thaw
392 *
393 * Freeze or thaw @freezer according to @freeze. The operations are
394 * recursive - all descendants of @freezer will be affected.
395 */
396static void freezer_change_state(struct freezer *freezer, bool freeze)
397{
398 struct cgroup *pos;
399
400 /* update @freezer */
401 spin_lock_irq(&freezer->lock);
402 freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF);
332 spin_unlock_irq(&freezer->lock); 403 spin_unlock_irq(&freezer->lock);
333 404
334 return retval; 405 /*
406 * Update all its descendants in pre-order traversal. Each
407 * descendant will try to inherit its parent's FREEZING state as
408 * CGROUP_FREEZING_PARENT.
409 */
410 rcu_read_lock();
411 cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) {
412 struct freezer *pos_f = cgroup_freezer(pos);
413 struct freezer *parent = parent_freezer(pos_f);
414
415 /*
416 * Our update to @parent->state is already visible which is
417 * all we need. No need to lock @parent. For more info on
418 * synchronization, see freezer_css_online().
419 */
420 spin_lock_irq(&pos_f->lock);
421 freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING,
422 CGROUP_FREEZING_PARENT);
423 spin_unlock_irq(&pos_f->lock);
424 }
425 rcu_read_unlock();
335} 426}
336 427
337static int freezer_write(struct cgroup *cgroup, 428static int freezer_write(struct cgroup *cgroup, struct cftype *cft,
338 struct cftype *cft,
339 const char *buffer) 429 const char *buffer)
340{ 430{
341 int retval; 431 bool freeze;
342 enum freezer_state goal_state;
343 432
344 if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) 433 if (strcmp(buffer, freezer_state_strs(0)) == 0)
345 goal_state = CGROUP_THAWED; 434 freeze = false;
346 else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) 435 else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0)
347 goal_state = CGROUP_FROZEN; 436 freeze = true;
348 else 437 else
349 return -EINVAL; 438 return -EINVAL;
350 439
351 if (!cgroup_lock_live_group(cgroup)) 440 freezer_change_state(cgroup_freezer(cgroup), freeze);
352 return -ENODEV; 441 return 0;
353 retval = freezer_change_state(cgroup, goal_state); 442}
354 cgroup_unlock(); 443
355 return retval; 444static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft)
445{
446 struct freezer *freezer = cgroup_freezer(cgroup);
447
448 return (bool)(freezer->state & CGROUP_FREEZING_SELF);
449}
450
451static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft)
452{
453 struct freezer *freezer = cgroup_freezer(cgroup);
454
455 return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
356} 456}
357 457
358static struct cftype files[] = { 458static struct cftype files[] = {
@@ -362,23 +462,27 @@ static struct cftype files[] = {
362 .read_seq_string = freezer_read, 462 .read_seq_string = freezer_read,
363 .write_string = freezer_write, 463 .write_string = freezer_write,
364 }, 464 },
465 {
466 .name = "self_freezing",
467 .flags = CFTYPE_NOT_ON_ROOT,
468 .read_u64 = freezer_self_freezing_read,
469 },
470 {
471 .name = "parent_freezing",
472 .flags = CFTYPE_NOT_ON_ROOT,
473 .read_u64 = freezer_parent_freezing_read,
474 },
365 { } /* terminate */ 475 { } /* terminate */
366}; 476};
367 477
368struct cgroup_subsys freezer_subsys = { 478struct cgroup_subsys freezer_subsys = {
369 .name = "freezer", 479 .name = "freezer",
370 .create = freezer_create, 480 .css_alloc = freezer_css_alloc,
371 .destroy = freezer_destroy, 481 .css_online = freezer_css_online,
482 .css_offline = freezer_css_offline,
483 .css_free = freezer_css_free,
372 .subsys_id = freezer_subsys_id, 484 .subsys_id = freezer_subsys_id,
373 .can_attach = freezer_can_attach, 485 .attach = freezer_attach,
374 .fork = freezer_fork, 486 .fork = freezer_fork,
375 .base_cftypes = files, 487 .base_cftypes = files,
376
377 /*
378 * freezer subsys doesn't handle hierarchy at all. Frozen state
379 * should be inherited through the hierarchy - if a parent is
380 * frozen, all its children should be frozen. Fix it and remove
381 * the following.
382 */
383 .broken_hierarchy = true,
384}; 488};
diff --git a/kernel/compat.c b/kernel/compat.c
index c28a306ae05c..f6150e92dfc9 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -1215,6 +1215,23 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
1215 return 0; 1215 return 0;
1216} 1216}
1217 1217
1218#ifdef __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL
1219asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
1220 struct compat_timespec __user *interval)
1221{
1222 struct timespec t;
1223 int ret;
1224 mm_segment_t old_fs = get_fs();
1225
1226 set_fs(KERNEL_DS);
1227 ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
1228 set_fs(old_fs);
1229 if (put_compat_timespec(&t, interval))
1230 return -EFAULT;
1231 return ret;
1232}
1233#endif /* __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL */
1234
1218/* 1235/*
1219 * Allocate user-space memory for the duration of a single system call, 1236 * Allocate user-space memory for the duration of a single system call,
1220 * in order to marshall parameters inside a compat thunk. 1237 * in order to marshall parameters inside a compat thunk.
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
new file mode 100644
index 000000000000..e0e07fd55508
--- /dev/null
+++ b/kernel/context_tracking.c
@@ -0,0 +1,83 @@
1#include <linux/context_tracking.h>
2#include <linux/rcupdate.h>
3#include <linux/sched.h>
4#include <linux/percpu.h>
5#include <linux/hardirq.h>
6
7struct context_tracking {
8 /*
9 * When active is false, hooks are not set to
10 * minimize overhead: TIF flags are cleared
11 * and calls to user_enter/exit are ignored. This
12 * may be further optimized using static keys.
13 */
14 bool active;
15 enum {
16 IN_KERNEL = 0,
17 IN_USER,
18 } state;
19};
20
21static DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
22#ifdef CONFIG_CONTEXT_TRACKING_FORCE
23 .active = true,
24#endif
25};
26
27void user_enter(void)
28{
29 unsigned long flags;
30
31 /*
32 * Some contexts may involve an exception occurring in an irq,
33 * leading to that nesting:
34 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
35 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
36 * helpers are enough to protect RCU uses inside the exception. So
37 * just return immediately if we detect we are in an IRQ.
38 */
39 if (in_interrupt())
40 return;
41
42 WARN_ON_ONCE(!current->mm);
43
44 local_irq_save(flags);
45 if (__this_cpu_read(context_tracking.active) &&
46 __this_cpu_read(context_tracking.state) != IN_USER) {
47 __this_cpu_write(context_tracking.state, IN_USER);
48 rcu_user_enter();
49 }
50 local_irq_restore(flags);
51}
52
53void user_exit(void)
54{
55 unsigned long flags;
56
57 /*
58 * Some contexts may involve an exception occurring in an irq,
59 * leading to that nesting:
60 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
61 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
62 * helpers are enough to protect RCU uses inside the exception. So
63 * just return immediately if we detect we are in an IRQ.
64 */
65 if (in_interrupt())
66 return;
67
68 local_irq_save(flags);
69 if (__this_cpu_read(context_tracking.state) == IN_USER) {
70 __this_cpu_write(context_tracking.state, IN_KERNEL);
71 rcu_user_exit();
72 }
73 local_irq_restore(flags);
74}
75
76void context_tracking_task_switch(struct task_struct *prev,
77 struct task_struct *next)
78{
79 if (__this_cpu_read(context_tracking.active)) {
80 clear_tsk_thread_flag(prev, TIF_NOHZ);
81 set_tsk_thread_flag(next, TIF_NOHZ);
82 }
83}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 42bd331ee0ab..3046a503242c 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -348,11 +348,13 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
348 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 348 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
349 struct task_struct *idle; 349 struct task_struct *idle;
350 350
351 if (cpu_online(cpu) || !cpu_present(cpu))
352 return -EINVAL;
353
354 cpu_hotplug_begin(); 351 cpu_hotplug_begin();
355 352
353 if (cpu_online(cpu) || !cpu_present(cpu)) {
354 ret = -EINVAL;
355 goto out;
356 }
357
356 idle = idle_thread_get(cpu); 358 idle = idle_thread_get(cpu);
357 if (IS_ERR(idle)) { 359 if (IS_ERR(idle)) {
358 ret = PTR_ERR(idle); 360 ret = PTR_ERR(idle);
@@ -601,6 +603,11 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,
601 603
602static int __init cpu_hotplug_pm_sync_init(void) 604static int __init cpu_hotplug_pm_sync_init(void)
603{ 605{
606 /*
607 * cpu_hotplug_pm_callback has higher priority than x86
608 * bsp_pm_callback which depends on cpu_hotplug_pm_callback
609 * to disable cpu hotplug to avoid cpu hotplug race.
610 */
604 pm_notifier(cpu_hotplug_pm_callback, 0); 611 pm_notifier(cpu_hotplug_pm_callback, 0);
605 return 0; 612 return 0;
606} 613}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f33c7153b6d7..7bb63eea6eb8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -302,10 +302,10 @@ static void guarantee_online_cpus(const struct cpuset *cs,
302 * are online, with memory. If none are online with memory, walk 302 * are online, with memory. If none are online with memory, walk
303 * up the cpuset hierarchy until we find one that does have some 303 * up the cpuset hierarchy until we find one that does have some
304 * online mems. If we get all the way to the top and still haven't 304 * online mems. If we get all the way to the top and still haven't
305 * found any online mems, return node_states[N_HIGH_MEMORY]. 305 * found any online mems, return node_states[N_MEMORY].
306 * 306 *
307 * One way or another, we guarantee to return some non-empty subset 307 * One way or another, we guarantee to return some non-empty subset
308 * of node_states[N_HIGH_MEMORY]. 308 * of node_states[N_MEMORY].
309 * 309 *
310 * Call with callback_mutex held. 310 * Call with callback_mutex held.
311 */ 311 */
@@ -313,14 +313,14 @@ static void guarantee_online_cpus(const struct cpuset *cs,
313static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 313static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
314{ 314{
315 while (cs && !nodes_intersects(cs->mems_allowed, 315 while (cs && !nodes_intersects(cs->mems_allowed,
316 node_states[N_HIGH_MEMORY])) 316 node_states[N_MEMORY]))
317 cs = cs->parent; 317 cs = cs->parent;
318 if (cs) 318 if (cs)
319 nodes_and(*pmask, cs->mems_allowed, 319 nodes_and(*pmask, cs->mems_allowed,
320 node_states[N_HIGH_MEMORY]); 320 node_states[N_MEMORY]);
321 else 321 else
322 *pmask = node_states[N_HIGH_MEMORY]; 322 *pmask = node_states[N_MEMORY];
323 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); 323 BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY]));
324} 324}
325 325
326/* 326/*
@@ -1100,7 +1100,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1100 return -ENOMEM; 1100 return -ENOMEM;
1101 1101
1102 /* 1102 /*
1103 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; 1103 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
1104 * it's read-only 1104 * it's read-only
1105 */ 1105 */
1106 if (cs == &top_cpuset) { 1106 if (cs == &top_cpuset) {
@@ -1122,7 +1122,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1122 goto done; 1122 goto done;
1123 1123
1124 if (!nodes_subset(trialcs->mems_allowed, 1124 if (!nodes_subset(trialcs->mems_allowed,
1125 node_states[N_HIGH_MEMORY])) { 1125 node_states[N_MEMORY])) {
1126 retval = -EINVAL; 1126 retval = -EINVAL;
1127 goto done; 1127 goto done;
1128 } 1128 }
@@ -1784,56 +1784,20 @@ static struct cftype files[] = {
1784}; 1784};
1785 1785
1786/* 1786/*
1787 * post_clone() is called during cgroup_create() when the 1787 * cpuset_css_alloc - allocate a cpuset css
1788 * clone_children mount argument was specified. The cgroup
1789 * can not yet have any tasks.
1790 *
1791 * Currently we refuse to set up the cgroup - thereby
1792 * refusing the task to be entered, and as a result refusing
1793 * the sys_unshare() or clone() which initiated it - if any
1794 * sibling cpusets have exclusive cpus or mem.
1795 *
1796 * If this becomes a problem for some users who wish to
1797 * allow that scenario, then cpuset_post_clone() could be
1798 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1799 * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
1800 * held.
1801 */
1802static void cpuset_post_clone(struct cgroup *cgroup)
1803{
1804 struct cgroup *parent, *child;
1805 struct cpuset *cs, *parent_cs;
1806
1807 parent = cgroup->parent;
1808 list_for_each_entry(child, &parent->children, sibling) {
1809 cs = cgroup_cs(child);
1810 if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
1811 return;
1812 }
1813 cs = cgroup_cs(cgroup);
1814 parent_cs = cgroup_cs(parent);
1815
1816 mutex_lock(&callback_mutex);
1817 cs->mems_allowed = parent_cs->mems_allowed;
1818 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1819 mutex_unlock(&callback_mutex);
1820 return;
1821}
1822
1823/*
1824 * cpuset_create - create a cpuset
1825 * cont: control group that the new cpuset will be part of 1788 * cont: control group that the new cpuset will be part of
1826 */ 1789 */
1827 1790
1828static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) 1791static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1829{ 1792{
1830 struct cpuset *cs; 1793 struct cgroup *parent_cg = cont->parent;
1831 struct cpuset *parent; 1794 struct cgroup *tmp_cg;
1795 struct cpuset *parent, *cs;
1832 1796
1833 if (!cont->parent) { 1797 if (!parent_cg)
1834 return &top_cpuset.css; 1798 return &top_cpuset.css;
1835 } 1799 parent = cgroup_cs(parent_cg);
1836 parent = cgroup_cs(cont->parent); 1800
1837 cs = kmalloc(sizeof(*cs), GFP_KERNEL); 1801 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1838 if (!cs) 1802 if (!cs)
1839 return ERR_PTR(-ENOMEM); 1803 return ERR_PTR(-ENOMEM);
@@ -1855,7 +1819,36 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
1855 1819
1856 cs->parent = parent; 1820 cs->parent = parent;
1857 number_of_cpusets++; 1821 number_of_cpusets++;
1858 return &cs->css ; 1822
1823 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags))
1824 goto skip_clone;
1825
1826 /*
1827 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
1828 * set. This flag handling is implemented in cgroup core for
1829 * historical reasons - the flag may be specified during mount.
1830 *
1831 * Currently, if any sibling cpusets have exclusive cpus or mem, we
1832 * refuse to clone the configuration - thereby refusing the task to
1833 * be entered, and as a result refusing the sys_unshare() or
1834 * clone() which initiated it. If this becomes a problem for some
1835 * users who wish to allow that scenario, then this could be
1836 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1837 * (and likewise for mems) to the new cgroup.
1838 */
1839 list_for_each_entry(tmp_cg, &parent_cg->children, sibling) {
1840 struct cpuset *tmp_cs = cgroup_cs(tmp_cg);
1841
1842 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs))
1843 goto skip_clone;
1844 }
1845
1846 mutex_lock(&callback_mutex);
1847 cs->mems_allowed = parent->mems_allowed;
1848 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
1849 mutex_unlock(&callback_mutex);
1850skip_clone:
1851 return &cs->css;
1859} 1852}
1860 1853
1861/* 1854/*
@@ -1864,7 +1857,7 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
1864 * will call async_rebuild_sched_domains(). 1857 * will call async_rebuild_sched_domains().
1865 */ 1858 */
1866 1859
1867static void cpuset_destroy(struct cgroup *cont) 1860static void cpuset_css_free(struct cgroup *cont)
1868{ 1861{
1869 struct cpuset *cs = cgroup_cs(cont); 1862 struct cpuset *cs = cgroup_cs(cont);
1870 1863
@@ -1878,11 +1871,10 @@ static void cpuset_destroy(struct cgroup *cont)
1878 1871
1879struct cgroup_subsys cpuset_subsys = { 1872struct cgroup_subsys cpuset_subsys = {
1880 .name = "cpuset", 1873 .name = "cpuset",
1881 .create = cpuset_create, 1874 .css_alloc = cpuset_css_alloc,
1882 .destroy = cpuset_destroy, 1875 .css_free = cpuset_css_free,
1883 .can_attach = cpuset_can_attach, 1876 .can_attach = cpuset_can_attach,
1884 .attach = cpuset_attach, 1877 .attach = cpuset_attach,
1885 .post_clone = cpuset_post_clone,
1886 .subsys_id = cpuset_subsys_id, 1878 .subsys_id = cpuset_subsys_id,
1887 .base_cftypes = files, 1879 .base_cftypes = files,
1888 .early_init = 1, 1880 .early_init = 1,
@@ -2034,7 +2026,7 @@ static struct cpuset *cpuset_next(struct list_head *queue)
2034 * before dropping down to the next. It always processes a node before 2026 * before dropping down to the next. It always processes a node before
2035 * any of its children. 2027 * any of its children.
2036 * 2028 *
2037 * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY 2029 * In the case of memory hot-unplug, it will remove nodes from N_MEMORY
2038 * if all present pages from a node are offlined. 2030 * if all present pages from a node are offlined.
2039 */ 2031 */
2040static void 2032static void
@@ -2073,7 +2065,7 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
2073 2065
2074 /* Continue past cpusets with all mems online */ 2066 /* Continue past cpusets with all mems online */
2075 if (nodes_subset(cp->mems_allowed, 2067 if (nodes_subset(cp->mems_allowed,
2076 node_states[N_HIGH_MEMORY])) 2068 node_states[N_MEMORY]))
2077 continue; 2069 continue;
2078 2070
2079 oldmems = cp->mems_allowed; 2071 oldmems = cp->mems_allowed;
@@ -2081,7 +2073,7 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
2081 /* Remove offline mems from this cpuset. */ 2073 /* Remove offline mems from this cpuset. */
2082 mutex_lock(&callback_mutex); 2074 mutex_lock(&callback_mutex);
2083 nodes_and(cp->mems_allowed, cp->mems_allowed, 2075 nodes_and(cp->mems_allowed, cp->mems_allowed,
2084 node_states[N_HIGH_MEMORY]); 2076 node_states[N_MEMORY]);
2085 mutex_unlock(&callback_mutex); 2077 mutex_unlock(&callback_mutex);
2086 2078
2087 /* Move tasks from the empty cpuset to a parent */ 2079 /* Move tasks from the empty cpuset to a parent */
@@ -2134,8 +2126,8 @@ void cpuset_update_active_cpus(bool cpu_online)
2134 2126
2135#ifdef CONFIG_MEMORY_HOTPLUG 2127#ifdef CONFIG_MEMORY_HOTPLUG
2136/* 2128/*
2137 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. 2129 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
2138 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. 2130 * Call this routine anytime after node_states[N_MEMORY] changes.
2139 * See cpuset_update_active_cpus() for CPU hotplug handling. 2131 * See cpuset_update_active_cpus() for CPU hotplug handling.
2140 */ 2132 */
2141static int cpuset_track_online_nodes(struct notifier_block *self, 2133static int cpuset_track_online_nodes(struct notifier_block *self,
@@ -2148,7 +2140,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2148 case MEM_ONLINE: 2140 case MEM_ONLINE:
2149 oldmems = top_cpuset.mems_allowed; 2141 oldmems = top_cpuset.mems_allowed;
2150 mutex_lock(&callback_mutex); 2142 mutex_lock(&callback_mutex);
2151 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2143 top_cpuset.mems_allowed = node_states[N_MEMORY];
2152 mutex_unlock(&callback_mutex); 2144 mutex_unlock(&callback_mutex);
2153 update_tasks_nodemask(&top_cpuset, &oldmems, NULL); 2145 update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
2154 break; 2146 break;
@@ -2177,7 +2169,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2177void __init cpuset_init_smp(void) 2169void __init cpuset_init_smp(void)
2178{ 2170{
2179 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2171 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2180 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2172 top_cpuset.mems_allowed = node_states[N_MEMORY];
2181 2173
2182 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2174 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2183 2175
@@ -2245,7 +2237,7 @@ void cpuset_init_current_mems_allowed(void)
2245 * 2237 *
2246 * Description: Returns the nodemask_t mems_allowed of the cpuset 2238 * Description: Returns the nodemask_t mems_allowed of the cpuset
2247 * attached to the specified @tsk. Guaranteed to return some non-empty 2239 * attached to the specified @tsk. Guaranteed to return some non-empty
2248 * subset of node_states[N_HIGH_MEMORY], even if this means going outside the 2240 * subset of node_states[N_MEMORY], even if this means going outside the
2249 * tasks cpuset. 2241 * tasks cpuset.
2250 **/ 2242 **/
2251 2243
diff --git a/kernel/cred.c b/kernel/cred.c
index 48cea3da6d05..e0573a43c7df 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -30,17 +30,6 @@
30static struct kmem_cache *cred_jar; 30static struct kmem_cache *cred_jar;
31 31
32/* 32/*
33 * The common credentials for the initial task's thread group
34 */
35#ifdef CONFIG_KEYS
36static struct thread_group_cred init_tgcred = {
37 .usage = ATOMIC_INIT(2),
38 .tgid = 0,
39 .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
40};
41#endif
42
43/*
44 * The initial credentials for the initial task 33 * The initial credentials for the initial task
45 */ 34 */
46struct cred init_cred = { 35struct cred init_cred = {
@@ -65,9 +54,6 @@ struct cred init_cred = {
65 .user = INIT_USER, 54 .user = INIT_USER,
66 .user_ns = &init_user_ns, 55 .user_ns = &init_user_ns,
67 .group_info = &init_groups, 56 .group_info = &init_groups,
68#ifdef CONFIG_KEYS
69 .tgcred = &init_tgcred,
70#endif
71}; 57};
72 58
73static inline void set_cred_subscribers(struct cred *cred, int n) 59static inline void set_cred_subscribers(struct cred *cred, int n)
@@ -96,36 +82,6 @@ static inline void alter_cred_subscribers(const struct cred *_cred, int n)
96} 82}
97 83
98/* 84/*
99 * Dispose of the shared task group credentials
100 */
101#ifdef CONFIG_KEYS
102static void release_tgcred_rcu(struct rcu_head *rcu)
103{
104 struct thread_group_cred *tgcred =
105 container_of(rcu, struct thread_group_cred, rcu);
106
107 BUG_ON(atomic_read(&tgcred->usage) != 0);
108
109 key_put(tgcred->session_keyring);
110 key_put(tgcred->process_keyring);
111 kfree(tgcred);
112}
113#endif
114
115/*
116 * Release a set of thread group credentials.
117 */
118static void release_tgcred(struct cred *cred)
119{
120#ifdef CONFIG_KEYS
121 struct thread_group_cred *tgcred = cred->tgcred;
122
123 if (atomic_dec_and_test(&tgcred->usage))
124 call_rcu(&tgcred->rcu, release_tgcred_rcu);
125#endif
126}
127
128/*
129 * The RCU callback to actually dispose of a set of credentials 85 * The RCU callback to actually dispose of a set of credentials
130 */ 86 */
131static void put_cred_rcu(struct rcu_head *rcu) 87static void put_cred_rcu(struct rcu_head *rcu)
@@ -150,9 +106,10 @@ static void put_cred_rcu(struct rcu_head *rcu)
150#endif 106#endif
151 107
152 security_cred_free(cred); 108 security_cred_free(cred);
109 key_put(cred->session_keyring);
110 key_put(cred->process_keyring);
153 key_put(cred->thread_keyring); 111 key_put(cred->thread_keyring);
154 key_put(cred->request_key_auth); 112 key_put(cred->request_key_auth);
155 release_tgcred(cred);
156 if (cred->group_info) 113 if (cred->group_info)
157 put_group_info(cred->group_info); 114 put_group_info(cred->group_info);
158 free_uid(cred->user); 115 free_uid(cred->user);
@@ -246,15 +203,6 @@ struct cred *cred_alloc_blank(void)
246 if (!new) 203 if (!new)
247 return NULL; 204 return NULL;
248 205
249#ifdef CONFIG_KEYS
250 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
251 if (!new->tgcred) {
252 kmem_cache_free(cred_jar, new);
253 return NULL;
254 }
255 atomic_set(&new->tgcred->usage, 1);
256#endif
257
258 atomic_set(&new->usage, 1); 206 atomic_set(&new->usage, 1);
259#ifdef CONFIG_DEBUG_CREDENTIALS 207#ifdef CONFIG_DEBUG_CREDENTIALS
260 new->magic = CRED_MAGIC; 208 new->magic = CRED_MAGIC;
@@ -308,9 +256,10 @@ struct cred *prepare_creds(void)
308 get_user_ns(new->user_ns); 256 get_user_ns(new->user_ns);
309 257
310#ifdef CONFIG_KEYS 258#ifdef CONFIG_KEYS
259 key_get(new->session_keyring);
260 key_get(new->process_keyring);
311 key_get(new->thread_keyring); 261 key_get(new->thread_keyring);
312 key_get(new->request_key_auth); 262 key_get(new->request_key_auth);
313 atomic_inc(&new->tgcred->usage);
314#endif 263#endif
315 264
316#ifdef CONFIG_SECURITY 265#ifdef CONFIG_SECURITY
@@ -334,39 +283,20 @@ EXPORT_SYMBOL(prepare_creds);
334 */ 283 */
335struct cred *prepare_exec_creds(void) 284struct cred *prepare_exec_creds(void)
336{ 285{
337 struct thread_group_cred *tgcred = NULL;
338 struct cred *new; 286 struct cred *new;
339 287
340#ifdef CONFIG_KEYS
341 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
342 if (!tgcred)
343 return NULL;
344#endif
345
346 new = prepare_creds(); 288 new = prepare_creds();
347 if (!new) { 289 if (!new)
348 kfree(tgcred);
349 return new; 290 return new;
350 }
351 291
352#ifdef CONFIG_KEYS 292#ifdef CONFIG_KEYS
353 /* newly exec'd tasks don't get a thread keyring */ 293 /* newly exec'd tasks don't get a thread keyring */
354 key_put(new->thread_keyring); 294 key_put(new->thread_keyring);
355 new->thread_keyring = NULL; 295 new->thread_keyring = NULL;
356 296
357 /* create a new per-thread-group creds for all this set of threads to
358 * share */
359 memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred));
360
361 atomic_set(&tgcred->usage, 1);
362 spin_lock_init(&tgcred->lock);
363
364 /* inherit the session keyring; new process keyring */ 297 /* inherit the session keyring; new process keyring */
365 key_get(tgcred->session_keyring); 298 key_put(new->process_keyring);
366 tgcred->process_keyring = NULL; 299 new->process_keyring = NULL;
367
368 release_tgcred(new);
369 new->tgcred = tgcred;
370#endif 300#endif
371 301
372 return new; 302 return new;
@@ -383,9 +313,6 @@ struct cred *prepare_exec_creds(void)
383 */ 313 */
384int copy_creds(struct task_struct *p, unsigned long clone_flags) 314int copy_creds(struct task_struct *p, unsigned long clone_flags)
385{ 315{
386#ifdef CONFIG_KEYS
387 struct thread_group_cred *tgcred;
388#endif
389 struct cred *new; 316 struct cred *new;
390 int ret; 317 int ret;
391 318
@@ -425,22 +352,12 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
425 install_thread_keyring_to_cred(new); 352 install_thread_keyring_to_cred(new);
426 } 353 }
427 354
428 /* we share the process and session keyrings between all the threads in 355 /* The process keyring is only shared between the threads in a process;
429 * a process - this is slightly icky as we violate COW credentials a 356 * anything outside of those threads doesn't inherit.
430 * bit */ 357 */
431 if (!(clone_flags & CLONE_THREAD)) { 358 if (!(clone_flags & CLONE_THREAD)) {
432 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); 359 key_put(new->process_keyring);
433 if (!tgcred) { 360 new->process_keyring = NULL;
434 ret = -ENOMEM;
435 goto error_put;
436 }
437 atomic_set(&tgcred->usage, 1);
438 spin_lock_init(&tgcred->lock);
439 tgcred->process_keyring = NULL;
440 tgcred->session_keyring = key_get(new->tgcred->session_keyring);
441
442 release_tgcred(new);
443 new->tgcred = tgcred;
444 } 361 }
445#endif 362#endif
446 363
@@ -455,6 +372,31 @@ error_put:
455 return ret; 372 return ret;
456} 373}
457 374
375static bool cred_cap_issubset(const struct cred *set, const struct cred *subset)
376{
377 const struct user_namespace *set_ns = set->user_ns;
378 const struct user_namespace *subset_ns = subset->user_ns;
379
380 /* If the two credentials are in the same user namespace see if
381 * the capabilities of subset are a subset of set.
382 */
383 if (set_ns == subset_ns)
384 return cap_issubset(subset->cap_permitted, set->cap_permitted);
385
386 /* The credentials are in different user namespaces,
387 * therefore one is a subset of the other only if set is an
388 * ancestor of subset and set->euid is owner of subset or one
389 * of subset's ancestors.
390 */
391 for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) {
392 if ((set_ns == subset_ns->parent) &&
393 uid_eq(subset_ns->owner, set->euid))
394 return true;
395 }
396
397 return false;
398}
399
458/** 400/**
459 * commit_creds - Install new credentials upon the current task 401 * commit_creds - Install new credentials upon the current task
460 * @new: The credentials to be assigned 402 * @new: The credentials to be assigned
@@ -493,7 +435,7 @@ int commit_creds(struct cred *new)
493 !gid_eq(old->egid, new->egid) || 435 !gid_eq(old->egid, new->egid) ||
494 !uid_eq(old->fsuid, new->fsuid) || 436 !uid_eq(old->fsuid, new->fsuid) ||
495 !gid_eq(old->fsgid, new->fsgid) || 437 !gid_eq(old->fsgid, new->fsgid) ||
496 !cap_issubset(new->cap_permitted, old->cap_permitted)) { 438 !cred_cap_issubset(old, new)) {
497 if (task->mm) 439 if (task->mm)
498 set_dumpable(task->mm, suid_dumpable); 440 set_dumpable(task->mm, suid_dumpable);
499 task->pdeath_signal = 0; 441 task->pdeath_signal = 0;
@@ -643,9 +585,6 @@ void __init cred_init(void)
643 */ 585 */
644struct cred *prepare_kernel_cred(struct task_struct *daemon) 586struct cred *prepare_kernel_cred(struct task_struct *daemon)
645{ 587{
646#ifdef CONFIG_KEYS
647 struct thread_group_cred *tgcred;
648#endif
649 const struct cred *old; 588 const struct cred *old;
650 struct cred *new; 589 struct cred *new;
651 590
@@ -653,14 +592,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
653 if (!new) 592 if (!new)
654 return NULL; 593 return NULL;
655 594
656#ifdef CONFIG_KEYS
657 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
658 if (!tgcred) {
659 kmem_cache_free(cred_jar, new);
660 return NULL;
661 }
662#endif
663
664 kdebug("prepare_kernel_cred() alloc %p", new); 595 kdebug("prepare_kernel_cred() alloc %p", new);
665 596
666 if (daemon) 597 if (daemon)
@@ -678,13 +609,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
678 get_group_info(new->group_info); 609 get_group_info(new->group_info);
679 610
680#ifdef CONFIG_KEYS 611#ifdef CONFIG_KEYS
681 atomic_set(&tgcred->usage, 1); 612 new->session_keyring = NULL;
682 spin_lock_init(&tgcred->lock); 613 new->process_keyring = NULL;
683 tgcred->process_keyring = NULL;
684 tgcred->session_keyring = NULL;
685 new->tgcred = tgcred;
686 new->request_key_auth = NULL;
687 new->thread_keyring = NULL; 614 new->thread_keyring = NULL;
615 new->request_key_auth = NULL;
688 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; 616 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
689#endif 617#endif
690 618
diff --git a/kernel/events/core.c b/kernel/events/core.c
index dbccf83c134d..301079d06f24 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6155,7 +6155,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6155 6155
6156 event->parent = parent_event; 6156 event->parent = parent_event;
6157 6157
6158 event->ns = get_pid_ns(current->nsproxy->pid_ns); 6158 event->ns = get_pid_ns(task_active_pid_ns(current));
6159 event->id = atomic64_inc_return(&perf_event_id); 6159 event->id = atomic64_inc_return(&perf_event_id);
6160 6160
6161 event->state = PERF_EVENT_STATE_INACTIVE; 6161 event->state = PERF_EVENT_STATE_INACTIVE;
@@ -7434,7 +7434,7 @@ unlock:
7434device_initcall(perf_event_sysfs_init); 7434device_initcall(perf_event_sysfs_init);
7435 7435
7436#ifdef CONFIG_CGROUP_PERF 7436#ifdef CONFIG_CGROUP_PERF
7437static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) 7437static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
7438{ 7438{
7439 struct perf_cgroup *jc; 7439 struct perf_cgroup *jc;
7440 7440
@@ -7451,7 +7451,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
7451 return &jc->css; 7451 return &jc->css;
7452} 7452}
7453 7453
7454static void perf_cgroup_destroy(struct cgroup *cont) 7454static void perf_cgroup_css_free(struct cgroup *cont)
7455{ 7455{
7456 struct perf_cgroup *jc; 7456 struct perf_cgroup *jc;
7457 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), 7457 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
@@ -7492,8 +7492,8 @@ static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7492struct cgroup_subsys perf_subsys = { 7492struct cgroup_subsys perf_subsys = {
7493 .name = "perf_event", 7493 .name = "perf_event",
7494 .subsys_id = perf_subsys_id, 7494 .subsys_id = perf_subsys_id,
7495 .create = perf_cgroup_create, 7495 .css_alloc = perf_cgroup_css_alloc,
7496 .destroy = perf_cgroup_destroy, 7496 .css_free = perf_cgroup_css_free,
7497 .exit = perf_cgroup_exit, 7497 .exit = perf_cgroup_exit,
7498 .attach = perf_cgroup_attach, 7498 .attach = perf_cgroup_attach,
7499 7499
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 5cc4e7e42e68..dea7acfbb071 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -33,6 +33,7 @@
33#include <linux/ptrace.h> /* user_enable_single_step */ 33#include <linux/ptrace.h> /* user_enable_single_step */
34#include <linux/kdebug.h> /* notifier mechanism */ 34#include <linux/kdebug.h> /* notifier mechanism */
35#include "../../mm/internal.h" /* munlock_vma_page */ 35#include "../../mm/internal.h" /* munlock_vma_page */
36#include <linux/percpu-rwsem.h>
36 37
37#include <linux/uprobes.h> 38#include <linux/uprobes.h>
38 39
@@ -71,6 +72,8 @@ static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
71static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; 72static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
72#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) 73#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
73 74
75static struct percpu_rw_semaphore dup_mmap_sem;
76
74/* 77/*
75 * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe 78 * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
76 * events active at this time. Probably a fine grained per inode count is 79 * events active at this time. Probably a fine grained per inode count is
@@ -766,10 +769,13 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
766 struct map_info *info; 769 struct map_info *info;
767 int err = 0; 770 int err = 0;
768 771
772 percpu_down_write(&dup_mmap_sem);
769 info = build_map_info(uprobe->inode->i_mapping, 773 info = build_map_info(uprobe->inode->i_mapping,
770 uprobe->offset, is_register); 774 uprobe->offset, is_register);
771 if (IS_ERR(info)) 775 if (IS_ERR(info)) {
772 return PTR_ERR(info); 776 err = PTR_ERR(info);
777 goto out;
778 }
773 779
774 while (info) { 780 while (info) {
775 struct mm_struct *mm = info->mm; 781 struct mm_struct *mm = info->mm;
@@ -799,7 +805,8 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
799 mmput(mm); 805 mmput(mm);
800 info = free_map_info(info); 806 info = free_map_info(info);
801 } 807 }
802 808 out:
809 percpu_up_write(&dup_mmap_sem);
803 return err; 810 return err;
804} 811}
805 812
@@ -1131,6 +1138,16 @@ void uprobe_clear_state(struct mm_struct *mm)
1131 kfree(area); 1138 kfree(area);
1132} 1139}
1133 1140
1141void uprobe_start_dup_mmap(void)
1142{
1143 percpu_down_read(&dup_mmap_sem);
1144}
1145
1146void uprobe_end_dup_mmap(void)
1147{
1148 percpu_up_read(&dup_mmap_sem);
1149}
1150
1134void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) 1151void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
1135{ 1152{
1136 newmm->uprobes_state.xol_area = NULL; 1153 newmm->uprobes_state.xol_area = NULL;
@@ -1199,6 +1216,11 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot
1199 vaddr = kmap_atomic(area->page); 1216 vaddr = kmap_atomic(area->page);
1200 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); 1217 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
1201 kunmap_atomic(vaddr); 1218 kunmap_atomic(vaddr);
1219 /*
1220 * We probably need flush_icache_user_range() but it needs vma.
1221 * This should work on supported architectures too.
1222 */
1223 flush_dcache_page(area->page);
1202 1224
1203 return current->utask->xol_vaddr; 1225 return current->utask->xol_vaddr;
1204} 1226}
@@ -1430,16 +1452,6 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1430 return uprobe; 1452 return uprobe;
1431} 1453}
1432 1454
1433void __weak arch_uprobe_enable_step(struct arch_uprobe *arch)
1434{
1435 user_enable_single_step(current);
1436}
1437
1438void __weak arch_uprobe_disable_step(struct arch_uprobe *arch)
1439{
1440 user_disable_single_step(current);
1441}
1442
1443/* 1455/*
1444 * Run handler and ask thread to singlestep. 1456 * Run handler and ask thread to singlestep.
1445 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. 1457 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -1493,7 +1505,6 @@ static void handle_swbp(struct pt_regs *regs)
1493 goto out; 1505 goto out;
1494 1506
1495 if (!pre_ssout(uprobe, regs, bp_vaddr)) { 1507 if (!pre_ssout(uprobe, regs, bp_vaddr)) {
1496 arch_uprobe_enable_step(&uprobe->arch);
1497 utask->active_uprobe = uprobe; 1508 utask->active_uprobe = uprobe;
1498 utask->state = UTASK_SSTEP; 1509 utask->state = UTASK_SSTEP;
1499 return; 1510 return;
@@ -1525,7 +1536,6 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
1525 else 1536 else
1526 WARN_ON_ONCE(1); 1537 WARN_ON_ONCE(1);
1527 1538
1528 arch_uprobe_disable_step(&uprobe->arch);
1529 put_uprobe(uprobe); 1539 put_uprobe(uprobe);
1530 utask->active_uprobe = NULL; 1540 utask->active_uprobe = NULL;
1531 utask->state = UTASK_RUNNING; 1541 utask->state = UTASK_RUNNING;
@@ -1604,6 +1614,9 @@ static int __init init_uprobes(void)
1604 mutex_init(&uprobes_mmap_mutex[i]); 1614 mutex_init(&uprobes_mmap_mutex[i]);
1605 } 1615 }
1606 1616
1617 if (percpu_init_rwsem(&dup_mmap_sem))
1618 return -ENOMEM;
1619
1607 return register_die_notifier(&uprobe_exception_nb); 1620 return register_die_notifier(&uprobe_exception_nb);
1608} 1621}
1609module_init(init_uprobes); 1622module_init(init_uprobes);
diff --git a/kernel/exit.c b/kernel/exit.c
index 346616c0092c..b4df21937216 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,18 +72,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
72 list_del_rcu(&p->tasks); 72 list_del_rcu(&p->tasks);
73 list_del_init(&p->sibling); 73 list_del_init(&p->sibling);
74 __this_cpu_dec(process_counts); 74 __this_cpu_dec(process_counts);
75 /*
76 * If we are the last child process in a pid namespace to be
77 * reaped, notify the reaper sleeping zap_pid_ns_processes().
78 */
79 if (IS_ENABLED(CONFIG_PID_NS)) {
80 struct task_struct *parent = p->real_parent;
81
82 if ((task_active_pid_ns(parent)->child_reaper == parent) &&
83 list_empty(&parent->children) &&
84 (parent->flags & PF_EXITING))
85 wake_up_process(parent);
86 }
87 } 75 }
88 list_del_rcu(&p->thread_group); 76 list_del_rcu(&p->thread_group);
89} 77}
@@ -322,43 +310,6 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
322 } 310 }
323} 311}
324 312
325/**
326 * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
327 *
328 * If a kernel thread is launched as a result of a system call, or if
329 * it ever exits, it should generally reparent itself to kthreadd so it
330 * isn't in the way of other processes and is correctly cleaned up on exit.
331 *
332 * The various task state such as scheduling policy and priority may have
333 * been inherited from a user process, so we reset them to sane values here.
334 *
335 * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
336 */
337static void reparent_to_kthreadd(void)
338{
339 write_lock_irq(&tasklist_lock);
340
341 ptrace_unlink(current);
342 /* Reparent to init */
343 current->real_parent = current->parent = kthreadd_task;
344 list_move_tail(&current->sibling, &current->real_parent->children);
345
346 /* Set the exit signal to SIGCHLD so we signal init on exit */
347 current->exit_signal = SIGCHLD;
348
349 if (task_nice(current) < 0)
350 set_user_nice(current, 0);
351 /* cpus_allowed? */
352 /* rt_priority? */
353 /* signals? */
354 memcpy(current->signal->rlim, init_task.signal->rlim,
355 sizeof(current->signal->rlim));
356
357 atomic_inc(&init_cred.usage);
358 commit_creds(&init_cred);
359 write_unlock_irq(&tasklist_lock);
360}
361
362void __set_special_pids(struct pid *pid) 313void __set_special_pids(struct pid *pid)
363{ 314{
364 struct task_struct *curr = current->group_leader; 315 struct task_struct *curr = current->group_leader;
@@ -370,13 +321,6 @@ void __set_special_pids(struct pid *pid)
370 change_pid(curr, PIDTYPE_PGID, pid); 321 change_pid(curr, PIDTYPE_PGID, pid);
371} 322}
372 323
373static void set_special_pids(struct pid *pid)
374{
375 write_lock_irq(&tasklist_lock);
376 __set_special_pids(pid);
377 write_unlock_irq(&tasklist_lock);
378}
379
380/* 324/*
381 * Let kernel threads use this to say that they allow a certain signal. 325 * Let kernel threads use this to say that they allow a certain signal.
382 * Must not be used if kthread was cloned with CLONE_SIGHAND. 326 * Must not be used if kthread was cloned with CLONE_SIGHAND.
@@ -416,54 +360,6 @@ int disallow_signal(int sig)
416 360
417EXPORT_SYMBOL(disallow_signal); 361EXPORT_SYMBOL(disallow_signal);
418 362
419/*
420 * Put all the gunge required to become a kernel thread without
421 * attached user resources in one place where it belongs.
422 */
423
424void daemonize(const char *name, ...)
425{
426 va_list args;
427 sigset_t blocked;
428
429 va_start(args, name);
430 vsnprintf(current->comm, sizeof(current->comm), name, args);
431 va_end(args);
432
433 /*
434 * If we were started as result of loading a module, close all of the
435 * user space pages. We don't need them, and if we didn't close them
436 * they would be locked into memory.
437 */
438 exit_mm(current);
439 /*
440 * We don't want to get frozen, in case system-wide hibernation
441 * or suspend transition begins right now.
442 */
443 current->flags |= (PF_NOFREEZE | PF_KTHREAD);
444
445 if (current->nsproxy != &init_nsproxy) {
446 get_nsproxy(&init_nsproxy);
447 switch_task_namespaces(current, &init_nsproxy);
448 }
449 set_special_pids(&init_struct_pid);
450 proc_clear_tty(current);
451
452 /* Block and flush all signals */
453 sigfillset(&blocked);
454 sigprocmask(SIG_BLOCK, &blocked, NULL);
455 flush_signals(current);
456
457 /* Become as one with the init task */
458
459 daemonize_fs_struct();
460 daemonize_descriptors();
461
462 reparent_to_kthreadd();
463}
464
465EXPORT_SYMBOL(daemonize);
466
467#ifdef CONFIG_MM_OWNER 363#ifdef CONFIG_MM_OWNER
468/* 364/*
469 * A task is exiting. If it owned this mm, find a new owner for the mm. 365 * A task is exiting. If it owned this mm, find a new owner for the mm.
@@ -1186,11 +1082,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1186 * as other threads in the parent group can be right 1082 * as other threads in the parent group can be right
1187 * here reaping other children at the same time. 1083 * here reaping other children at the same time.
1188 * 1084 *
1189 * We use thread_group_times() to get times for the thread 1085 * We use thread_group_cputime_adjusted() to get times for the thread
1190 * group, which consolidates times for all threads in the 1086 * group, which consolidates times for all threads in the
1191 * group including the group leader. 1087 * group including the group leader.
1192 */ 1088 */
1193 thread_group_times(p, &tgutime, &tgstime); 1089 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1194 spin_lock_irq(&p->real_parent->sighand->siglock); 1090 spin_lock_irq(&p->real_parent->sighand->siglock);
1195 psig = p->real_parent->signal; 1091 psig = p->real_parent->signal;
1196 sig = p->signal; 1092 sig = p->signal;
diff --git a/kernel/fork.c b/kernel/fork.c
index 8b20ab7d3aa2..65ca6d27f24e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -146,7 +146,7 @@ void __weak arch_release_thread_info(struct thread_info *ti)
146static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, 146static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
147 int node) 147 int node)
148{ 148{
149 struct page *page = alloc_pages_node(node, THREADINFO_GFP, 149 struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED,
150 THREAD_SIZE_ORDER); 150 THREAD_SIZE_ORDER);
151 151
152 return page ? page_address(page) : NULL; 152 return page ? page_address(page) : NULL;
@@ -154,7 +154,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
154 154
155static inline void free_thread_info(struct thread_info *ti) 155static inline void free_thread_info(struct thread_info *ti)
156{ 156{
157 free_pages((unsigned long)ti, THREAD_SIZE_ORDER); 157 free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
158} 158}
159# else 159# else
160static struct kmem_cache *thread_info_cache; 160static struct kmem_cache *thread_info_cache;
@@ -352,6 +352,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
352 unsigned long charge; 352 unsigned long charge;
353 struct mempolicy *pol; 353 struct mempolicy *pol;
354 354
355 uprobe_start_dup_mmap();
355 down_write(&oldmm->mmap_sem); 356 down_write(&oldmm->mmap_sem);
356 flush_cache_dup_mm(oldmm); 357 flush_cache_dup_mm(oldmm);
357 uprobe_dup_mmap(oldmm, mm); 358 uprobe_dup_mmap(oldmm, mm);
@@ -469,6 +470,7 @@ out:
469 up_write(&mm->mmap_sem); 470 up_write(&mm->mmap_sem);
470 flush_tlb_mm(oldmm); 471 flush_tlb_mm(oldmm);
471 up_write(&oldmm->mmap_sem); 472 up_write(&oldmm->mmap_sem);
473 uprobe_end_dup_mmap();
472 return retval; 474 return retval;
473fail_nomem_anon_vma_fork: 475fail_nomem_anon_vma_fork:
474 mpol_put(pol); 476 mpol_put(pol);
@@ -821,6 +823,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
821#ifdef CONFIG_TRANSPARENT_HUGEPAGE 823#ifdef CONFIG_TRANSPARENT_HUGEPAGE
822 mm->pmd_huge_pte = NULL; 824 mm->pmd_huge_pte = NULL;
823#endif 825#endif
826#ifdef CONFIG_NUMA_BALANCING
827 mm->first_nid = NUMA_PTE_SCAN_INIT;
828#endif
824 if (!mm_init(mm, tsk)) 829 if (!mm_init(mm, tsk))
825 goto fail_nomem; 830 goto fail_nomem;
826 831
@@ -1039,8 +1044,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1039 atomic_set(&sig->live, 1); 1044 atomic_set(&sig->live, 1);
1040 atomic_set(&sig->sigcnt, 1); 1045 atomic_set(&sig->sigcnt, 1);
1041 init_waitqueue_head(&sig->wait_chldexit); 1046 init_waitqueue_head(&sig->wait_chldexit);
1042 if (clone_flags & CLONE_NEWPID)
1043 sig->flags |= SIGNAL_UNKILLABLE;
1044 sig->curr_target = tsk; 1047 sig->curr_target = tsk;
1045 init_sigpending(&sig->shared_pending); 1048 init_sigpending(&sig->shared_pending);
1046 INIT_LIST_HEAD(&sig->posix_timers); 1049 INIT_LIST_HEAD(&sig->posix_timers);
@@ -1127,7 +1130,6 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
1127 */ 1130 */
1128static struct task_struct *copy_process(unsigned long clone_flags, 1131static struct task_struct *copy_process(unsigned long clone_flags,
1129 unsigned long stack_start, 1132 unsigned long stack_start,
1130 struct pt_regs *regs,
1131 unsigned long stack_size, 1133 unsigned long stack_size,
1132 int __user *child_tidptr, 1134 int __user *child_tidptr,
1133 struct pid *pid, 1135 struct pid *pid,
@@ -1135,7 +1137,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1135{ 1137{
1136 int retval; 1138 int retval;
1137 struct task_struct *p; 1139 struct task_struct *p;
1138 int cgroup_callbacks_done = 0;
1139 1140
1140 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) 1141 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1141 return ERR_PTR(-EINVAL); 1142 return ERR_PTR(-EINVAL);
@@ -1165,6 +1166,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1165 current->signal->flags & SIGNAL_UNKILLABLE) 1166 current->signal->flags & SIGNAL_UNKILLABLE)
1166 return ERR_PTR(-EINVAL); 1167 return ERR_PTR(-EINVAL);
1167 1168
1169 /*
1170 * If the new process will be in a different pid namespace
1171 * don't allow the creation of threads.
1172 */
1173 if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) &&
1174 (task_active_pid_ns(current) != current->nsproxy->pid_ns))
1175 return ERR_PTR(-EINVAL);
1176
1168 retval = security_task_create(clone_flags); 1177 retval = security_task_create(clone_flags);
1169 if (retval) 1178 if (retval)
1170 goto fork_out; 1179 goto fork_out;
@@ -1222,7 +1231,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1222 p->utime = p->stime = p->gtime = 0; 1231 p->utime = p->stime = p->gtime = 0;
1223 p->utimescaled = p->stimescaled = 0; 1232 p->utimescaled = p->stimescaled = 0;
1224#ifndef CONFIG_VIRT_CPU_ACCOUNTING 1233#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1225 p->prev_utime = p->prev_stime = 0; 1234 p->prev_cputime.utime = p->prev_cputime.stime = 0;
1226#endif 1235#endif
1227#if defined(SPLIT_RSS_COUNTING) 1236#if defined(SPLIT_RSS_COUNTING)
1228 memset(&p->rss_stat, 0, sizeof(p->rss_stat)); 1237 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
@@ -1320,7 +1329,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1320 retval = copy_io(clone_flags, p); 1329 retval = copy_io(clone_flags, p);
1321 if (retval) 1330 if (retval)
1322 goto bad_fork_cleanup_namespaces; 1331 goto bad_fork_cleanup_namespaces;
1323 retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); 1332 retval = copy_thread(clone_flags, stack_start, stack_size, p);
1324 if (retval) 1333 if (retval)
1325 goto bad_fork_cleanup_io; 1334 goto bad_fork_cleanup_io;
1326 1335
@@ -1393,12 +1402,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1393 INIT_LIST_HEAD(&p->thread_group); 1402 INIT_LIST_HEAD(&p->thread_group);
1394 p->task_works = NULL; 1403 p->task_works = NULL;
1395 1404
1396 /* Now that the task is set up, run cgroup callbacks if
1397 * necessary. We need to run them before the task is visible
1398 * on the tasklist. */
1399 cgroup_fork_callbacks(p);
1400 cgroup_callbacks_done = 1;
1401
1402 /* Need tasklist lock for parent etc handling! */ 1405 /* Need tasklist lock for parent etc handling! */
1403 write_lock_irq(&tasklist_lock); 1406 write_lock_irq(&tasklist_lock);
1404 1407
@@ -1441,8 +1444,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1441 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); 1444 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
1442 1445
1443 if (thread_group_leader(p)) { 1446 if (thread_group_leader(p)) {
1444 if (is_child_reaper(pid)) 1447 if (is_child_reaper(pid)) {
1445 p->nsproxy->pid_ns->child_reaper = p; 1448 ns_of_pid(pid)->child_reaper = p;
1449 p->signal->flags |= SIGNAL_UNKILLABLE;
1450 }
1446 1451
1447 p->signal->leader_pid = pid; 1452 p->signal->leader_pid = pid;
1448 p->signal->tty = tty_kref_get(current->signal->tty); 1453 p->signal->tty = tty_kref_get(current->signal->tty);
@@ -1476,8 +1481,6 @@ bad_fork_cleanup_io:
1476 if (p->io_context) 1481 if (p->io_context)
1477 exit_io_context(p); 1482 exit_io_context(p);
1478bad_fork_cleanup_namespaces: 1483bad_fork_cleanup_namespaces:
1479 if (unlikely(clone_flags & CLONE_NEWPID))
1480 pid_ns_release_proc(p->nsproxy->pid_ns);
1481 exit_task_namespaces(p); 1484 exit_task_namespaces(p);
1482bad_fork_cleanup_mm: 1485bad_fork_cleanup_mm:
1483 if (p->mm) 1486 if (p->mm)
@@ -1503,7 +1506,7 @@ bad_fork_cleanup_cgroup:
1503#endif 1506#endif
1504 if (clone_flags & CLONE_THREAD) 1507 if (clone_flags & CLONE_THREAD)
1505 threadgroup_change_end(current); 1508 threadgroup_change_end(current);
1506 cgroup_exit(p, cgroup_callbacks_done); 1509 cgroup_exit(p, 0);
1507 delayacct_tsk_free(p); 1510 delayacct_tsk_free(p);
1508 module_put(task_thread_info(p)->exec_domain->module); 1511 module_put(task_thread_info(p)->exec_domain->module);
1509bad_fork_cleanup_count: 1512bad_fork_cleanup_count:
@@ -1515,12 +1518,6 @@ fork_out:
1515 return ERR_PTR(retval); 1518 return ERR_PTR(retval);
1516} 1519}
1517 1520
1518noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
1519{
1520 memset(regs, 0, sizeof(struct pt_regs));
1521 return regs;
1522}
1523
1524static inline void init_idle_pids(struct pid_link *links) 1521static inline void init_idle_pids(struct pid_link *links)
1525{ 1522{
1526 enum pid_type type; 1523 enum pid_type type;
@@ -1534,10 +1531,7 @@ static inline void init_idle_pids(struct pid_link *links)
1534struct task_struct * __cpuinit fork_idle(int cpu) 1531struct task_struct * __cpuinit fork_idle(int cpu)
1535{ 1532{
1536 struct task_struct *task; 1533 struct task_struct *task;
1537 struct pt_regs regs; 1534 task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
1538
1539 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
1540 &init_struct_pid, 0);
1541 if (!IS_ERR(task)) { 1535 if (!IS_ERR(task)) {
1542 init_idle_pids(task->pids); 1536 init_idle_pids(task->pids);
1543 init_idle(task, cpu); 1537 init_idle(task, cpu);
@@ -1554,7 +1548,6 @@ struct task_struct * __cpuinit fork_idle(int cpu)
1554 */ 1548 */
1555long do_fork(unsigned long clone_flags, 1549long do_fork(unsigned long clone_flags,
1556 unsigned long stack_start, 1550 unsigned long stack_start,
1557 struct pt_regs *regs,
1558 unsigned long stack_size, 1551 unsigned long stack_size,
1559 int __user *parent_tidptr, 1552 int __user *parent_tidptr,
1560 int __user *child_tidptr) 1553 int __user *child_tidptr)
@@ -1567,15 +1560,9 @@ long do_fork(unsigned long clone_flags,
1567 * Do some preliminary argument and permissions checking before we 1560 * Do some preliminary argument and permissions checking before we
1568 * actually start allocating stuff 1561 * actually start allocating stuff
1569 */ 1562 */
1570 if (clone_flags & CLONE_NEWUSER) { 1563 if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) {
1571 if (clone_flags & CLONE_THREAD) 1564 if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
1572 return -EINVAL; 1565 return -EINVAL;
1573 /* hopefully this check will go away when userns support is
1574 * complete
1575 */
1576 if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
1577 !capable(CAP_SETGID))
1578 return -EPERM;
1579 } 1566 }
1580 1567
1581 /* 1568 /*
@@ -1584,7 +1571,7 @@ long do_fork(unsigned long clone_flags,
1584 * requested, no event is reported; otherwise, report if the event 1571 * requested, no event is reported; otherwise, report if the event
1585 * for the type of forking is enabled. 1572 * for the type of forking is enabled.
1586 */ 1573 */
1587 if (!(clone_flags & CLONE_UNTRACED) && likely(user_mode(regs))) { 1574 if (!(clone_flags & CLONE_UNTRACED)) {
1588 if (clone_flags & CLONE_VFORK) 1575 if (clone_flags & CLONE_VFORK)
1589 trace = PTRACE_EVENT_VFORK; 1576 trace = PTRACE_EVENT_VFORK;
1590 else if ((clone_flags & CSIGNAL) != SIGCHLD) 1577 else if ((clone_flags & CSIGNAL) != SIGCHLD)
@@ -1596,7 +1583,7 @@ long do_fork(unsigned long clone_flags,
1596 trace = 0; 1583 trace = 0;
1597 } 1584 }
1598 1585
1599 p = copy_process(clone_flags, stack_start, regs, stack_size, 1586 p = copy_process(clone_flags, stack_start, stack_size,
1600 child_tidptr, NULL, trace); 1587 child_tidptr, NULL, trace);
1601 /* 1588 /*
1602 * Do this prior waking up the new thread - the thread pointer 1589 * Do this prior waking up the new thread - the thread pointer
@@ -1634,15 +1621,56 @@ long do_fork(unsigned long clone_flags,
1634 return nr; 1621 return nr;
1635} 1622}
1636 1623
1637#ifdef CONFIG_GENERIC_KERNEL_THREAD
1638/* 1624/*
1639 * Create a kernel thread. 1625 * Create a kernel thread.
1640 */ 1626 */
1641pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) 1627pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
1642{ 1628{
1643 return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, NULL, 1629 return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
1644 (unsigned long)arg, NULL, NULL); 1630 (unsigned long)arg, NULL, NULL);
1645} 1631}
1632
1633#ifdef __ARCH_WANT_SYS_FORK
1634SYSCALL_DEFINE0(fork)
1635{
1636#ifdef CONFIG_MMU
1637 return do_fork(SIGCHLD, 0, 0, NULL, NULL);
1638#else
1639 /* can not support in nommu mode */
1640 return(-EINVAL);
1641#endif
1642}
1643#endif
1644
1645#ifdef __ARCH_WANT_SYS_VFORK
1646SYSCALL_DEFINE0(vfork)
1647{
1648 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
1649 0, NULL, NULL);
1650}
1651#endif
1652
1653#ifdef __ARCH_WANT_SYS_CLONE
1654#ifdef CONFIG_CLONE_BACKWARDS
1655SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
1656 int __user *, parent_tidptr,
1657 int, tls_val,
1658 int __user *, child_tidptr)
1659#elif defined(CONFIG_CLONE_BACKWARDS2)
1660SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
1661 int __user *, parent_tidptr,
1662 int __user *, child_tidptr,
1663 int, tls_val)
1664#else
1665SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
1666 int __user *, parent_tidptr,
1667 int __user *, child_tidptr,
1668 int, tls_val)
1669#endif
1670{
1671 return do_fork(clone_flags, newsp, 0,
1672 parent_tidptr, child_tidptr);
1673}
1646#endif 1674#endif
1647 1675
1648#ifndef ARCH_MIN_MMSTRUCT_ALIGN 1676#ifndef ARCH_MIN_MMSTRUCT_ALIGN
@@ -1694,7 +1722,8 @@ static int check_unshare_flags(unsigned long unshare_flags)
1694{ 1722{
1695 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| 1723 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1696 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| 1724 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1697 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) 1725 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
1726 CLONE_NEWUSER|CLONE_NEWPID))
1698 return -EINVAL; 1727 return -EINVAL;
1699 /* 1728 /*
1700 * Not implemented, but pretend it works if there is nothing to 1729 * Not implemented, but pretend it works if there is nothing to
@@ -1761,19 +1790,40 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1761{ 1790{
1762 struct fs_struct *fs, *new_fs = NULL; 1791 struct fs_struct *fs, *new_fs = NULL;
1763 struct files_struct *fd, *new_fd = NULL; 1792 struct files_struct *fd, *new_fd = NULL;
1793 struct cred *new_cred = NULL;
1764 struct nsproxy *new_nsproxy = NULL; 1794 struct nsproxy *new_nsproxy = NULL;
1765 int do_sysvsem = 0; 1795 int do_sysvsem = 0;
1766 int err; 1796 int err;
1767 1797
1768 err = check_unshare_flags(unshare_flags); 1798 /*
1769 if (err) 1799 * If unsharing a user namespace must also unshare the thread.
1770 goto bad_unshare_out; 1800 */
1771 1801 if (unshare_flags & CLONE_NEWUSER)
1802 unshare_flags |= CLONE_THREAD;
1803 /*
1804 * If unsharing a pid namespace must also unshare the thread.
1805 */
1806 if (unshare_flags & CLONE_NEWPID)
1807 unshare_flags |= CLONE_THREAD;
1808 /*
1809 * If unsharing a thread from a thread group, must also unshare vm.
1810 */
1811 if (unshare_flags & CLONE_THREAD)
1812 unshare_flags |= CLONE_VM;
1813 /*
1814 * If unsharing vm, must also unshare signal handlers.
1815 */
1816 if (unshare_flags & CLONE_VM)
1817 unshare_flags |= CLONE_SIGHAND;
1772 /* 1818 /*
1773 * If unsharing namespace, must also unshare filesystem information. 1819 * If unsharing namespace, must also unshare filesystem information.
1774 */ 1820 */
1775 if (unshare_flags & CLONE_NEWNS) 1821 if (unshare_flags & CLONE_NEWNS)
1776 unshare_flags |= CLONE_FS; 1822 unshare_flags |= CLONE_FS;
1823
1824 err = check_unshare_flags(unshare_flags);
1825 if (err)
1826 goto bad_unshare_out;
1777 /* 1827 /*
1778 * CLONE_NEWIPC must also detach from the undolist: after switching 1828 * CLONE_NEWIPC must also detach from the undolist: after switching
1779 * to a new ipc namespace, the semaphore arrays from the old 1829 * to a new ipc namespace, the semaphore arrays from the old
@@ -1787,11 +1837,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1787 err = unshare_fd(unshare_flags, &new_fd); 1837 err = unshare_fd(unshare_flags, &new_fd);
1788 if (err) 1838 if (err)
1789 goto bad_unshare_cleanup_fs; 1839 goto bad_unshare_cleanup_fs;
1790 err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); 1840 err = unshare_userns(unshare_flags, &new_cred);
1791 if (err) 1841 if (err)
1792 goto bad_unshare_cleanup_fd; 1842 goto bad_unshare_cleanup_fd;
1843 err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
1844 new_cred, new_fs);
1845 if (err)
1846 goto bad_unshare_cleanup_cred;
1793 1847
1794 if (new_fs || new_fd || do_sysvsem || new_nsproxy) { 1848 if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
1795 if (do_sysvsem) { 1849 if (do_sysvsem) {
1796 /* 1850 /*
1797 * CLONE_SYSVSEM is equivalent to sys_exit(). 1851 * CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1824,11 +1878,20 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1824 } 1878 }
1825 1879
1826 task_unlock(current); 1880 task_unlock(current);
1881
1882 if (new_cred) {
1883 /* Install the new user namespace */
1884 commit_creds(new_cred);
1885 new_cred = NULL;
1886 }
1827 } 1887 }
1828 1888
1829 if (new_nsproxy) 1889 if (new_nsproxy)
1830 put_nsproxy(new_nsproxy); 1890 put_nsproxy(new_nsproxy);
1831 1891
1892bad_unshare_cleanup_cred:
1893 if (new_cred)
1894 put_cred(new_cred);
1832bad_unshare_cleanup_fd: 1895bad_unshare_cleanup_fd:
1833 if (new_fd) 1896 if (new_fd)
1834 put_files_struct(new_fd); 1897 put_files_struct(new_fd);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 11f82a4d4eae..c38893b0efba 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -116,17 +116,10 @@ bool freeze_task(struct task_struct *p)
116 return false; 116 return false;
117 } 117 }
118 118
119 if (!(p->flags & PF_KTHREAD)) { 119 if (!(p->flags & PF_KTHREAD))
120 fake_signal_wake_up(p); 120 fake_signal_wake_up(p);
121 /* 121 else
122 * fake_signal_wake_up() goes through p's scheduler
123 * lock and guarantees that TASK_STOPPED/TRACED ->
124 * TASK_RUNNING transition can't race with task state
125 * testing in try_to_freeze_tasks().
126 */
127 } else {
128 wake_up_state(p, TASK_INTERRUPTIBLE); 122 wake_up_state(p, TASK_INTERRUPTIBLE);
129 }
130 123
131 spin_unlock_irqrestore(&freezer_lock, flags); 124 spin_unlock_irqrestore(&freezer_lock, flags);
132 return true; 125 return true;
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 57d86d07221e..3aca9f29d30e 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -272,6 +272,7 @@ void handle_nested_irq(unsigned int irq)
272 272
273 raw_spin_lock_irq(&desc->lock); 273 raw_spin_lock_irq(&desc->lock);
274 274
275 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
275 kstat_incr_irqs_this_cpu(irq, desc); 276 kstat_incr_irqs_this_cpu(irq, desc);
276 277
277 action = desc->action; 278 action = desc->action;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 4e69e24d3d7d..96f3a1d9c379 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -177,8 +177,8 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
177 irq_base = irq_alloc_descs(first_irq, first_irq, size, 177 irq_base = irq_alloc_descs(first_irq, first_irq, size,
178 of_node_to_nid(of_node)); 178 of_node_to_nid(of_node));
179 if (irq_base < 0) { 179 if (irq_base < 0) {
180 WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", 180 pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
181 first_irq); 181 first_irq);
182 irq_base = first_irq; 182 irq_base = first_irq;
183 } 183 }
184 } else 184 } else
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 4c69326aa773..e49a288fa479 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -616,6 +616,22 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
616 return ret; 616 return ret;
617} 617}
618 618
619#ifdef CONFIG_HARDIRQS_SW_RESEND
620int irq_set_parent(int irq, int parent_irq)
621{
622 unsigned long flags;
623 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
624
625 if (!desc)
626 return -EINVAL;
627
628 desc->parent_irq = parent_irq;
629
630 irq_put_desc_unlock(desc, flags);
631 return 0;
632}
633#endif
634
619/* 635/*
620 * Default primary interrupt handler for threaded interrupts. Is 636 * Default primary interrupt handler for threaded interrupts. Is
621 * assigned as primary handler when request_threaded_irq is called 637 * assigned as primary handler when request_threaded_irq is called
@@ -716,6 +732,7 @@ static void
716irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) 732irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
717{ 733{
718 cpumask_var_t mask; 734 cpumask_var_t mask;
735 bool valid = true;
719 736
720 if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) 737 if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
721 return; 738 return;
@@ -730,10 +747,18 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
730 } 747 }
731 748
732 raw_spin_lock_irq(&desc->lock); 749 raw_spin_lock_irq(&desc->lock);
733 cpumask_copy(mask, desc->irq_data.affinity); 750 /*
751 * This code is triggered unconditionally. Check the affinity
752 * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
753 */
754 if (desc->irq_data.affinity)
755 cpumask_copy(mask, desc->irq_data.affinity);
756 else
757 valid = false;
734 raw_spin_unlock_irq(&desc->lock); 758 raw_spin_unlock_irq(&desc->lock);
735 759
736 set_cpus_allowed_ptr(current, mask); 760 if (valid)
761 set_cpus_allowed_ptr(current, mask);
737 free_cpumask_var(mask); 762 free_cpumask_var(mask);
738} 763}
739#else 764#else
@@ -793,7 +818,7 @@ static void irq_thread_dtor(struct callback_head *unused)
793 action = kthread_data(tsk); 818 action = kthread_data(tsk);
794 819
795 pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", 820 pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
796 tsk->comm ? tsk->comm : "", tsk->pid, action->irq); 821 tsk->comm, tsk->pid, action->irq);
797 822
798 823
799 desc = irq_to_desc(action->irq); 824 desc = irq_to_desc(action->irq);
@@ -833,6 +858,8 @@ static int irq_thread(void *data)
833 init_task_work(&on_exit_work, irq_thread_dtor); 858 init_task_work(&on_exit_work, irq_thread_dtor);
834 task_work_add(current, &on_exit_work, false); 859 task_work_add(current, &on_exit_work, false);
835 860
861 irq_thread_check_affinity(desc, action);
862
836 while (!irq_wait_for_interrupt(action)) { 863 while (!irq_wait_for_interrupt(action)) {
837 irqreturn_t action_ret; 864 irqreturn_t action_ret;
838 865
@@ -936,6 +963,16 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
936 */ 963 */
937 get_task_struct(t); 964 get_task_struct(t);
938 new->thread = t; 965 new->thread = t;
966 /*
967 * Tell the thread to set its affinity. This is
968 * important for shared interrupt handlers as we do
969 * not invoke setup_affinity() for the secondary
970 * handlers as everything is already set up. Even for
971 * interrupts marked with IRQF_NO_BALANCE this is
972 * correct as we want the thread to move to the cpu(s)
973 * on which the requesting code placed the interrupt.
974 */
975 set_bit(IRQTF_AFFINITY, &new->thread_flags);
939 } 976 }
940 977
941 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { 978 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 6454db7b6a4d..9065107f083e 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -74,6 +74,14 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
74 if (!desc->irq_data.chip->irq_retrigger || 74 if (!desc->irq_data.chip->irq_retrigger ||
75 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { 75 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
76#ifdef CONFIG_HARDIRQS_SW_RESEND 76#ifdef CONFIG_HARDIRQS_SW_RESEND
77 /*
78 * If the interrupt has a parent irq and runs
79 * in the thread context of the parent irq,
80 * retrigger the parent.
81 */
82 if (desc->parent_irq &&
83 irq_settings_is_nested_thread(desc))
84 irq = desc->parent_irq;
77 /* Set it pending and activate the softirq: */ 85 /* Set it pending and activate the softirq: */
78 set_bit(irq, irqs_resend); 86 set_bit(irq, irqs_resend);
79 tasklet_schedule(&resend_tasklet); 87 tasklet_schedule(&resend_tasklet);
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 30b7b225306c..e30ac0fe61c3 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -4,6 +4,7 @@
4#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/random.h> 5#include <linux/random.h>
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/ptrace.h>
7#include <linux/init.h> 8#include <linux/init.h>
8#include <linux/errno.h> 9#include <linux/errno.h>
9#include <linux/cache.h> 10#include <linux/cache.h>
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 1c317e386831..0023a87e8de6 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -219,9 +219,9 @@ static int ____call_usermodehelper(void *data)
219 219
220 commit_creds(new); 220 commit_creds(new);
221 221
222 retval = kernel_execve(sub_info->path, 222 retval = do_execve(sub_info->path,
223 (const char *const *)sub_info->argv, 223 (const char __user *const __user *)sub_info->argv,
224 (const char *const *)sub_info->envp); 224 (const char __user *const __user *)sub_info->envp);
225 if (!retval) 225 if (!retval)
226 return 0; 226 return 0;
227 227
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 4e316e1acf58..6ada93c23a9a 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -26,7 +26,6 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
26static struct kobj_attribute _name##_attr = \ 26static struct kobj_attribute _name##_attr = \
27 __ATTR(_name, 0644, _name##_show, _name##_store) 27 __ATTR(_name, 0644, _name##_show, _name##_store)
28 28
29#if defined(CONFIG_HOTPLUG)
30/* current uevent sequence number */ 29/* current uevent sequence number */
31static ssize_t uevent_seqnum_show(struct kobject *kobj, 30static ssize_t uevent_seqnum_show(struct kobject *kobj,
32 struct kobj_attribute *attr, char *buf) 31 struct kobj_attribute *attr, char *buf)
@@ -54,7 +53,7 @@ static ssize_t uevent_helper_store(struct kobject *kobj,
54 return count; 53 return count;
55} 54}
56KERNEL_ATTR_RW(uevent_helper); 55KERNEL_ATTR_RW(uevent_helper);
57#endif 56
58 57
59#ifdef CONFIG_PROFILING 58#ifdef CONFIG_PROFILING
60static ssize_t profiling_show(struct kobject *kobj, 59static ssize_t profiling_show(struct kobject *kobj,
@@ -141,6 +140,23 @@ static ssize_t fscaps_show(struct kobject *kobj,
141} 140}
142KERNEL_ATTR_RO(fscaps); 141KERNEL_ATTR_RO(fscaps);
143 142
143int rcu_expedited;
144static ssize_t rcu_expedited_show(struct kobject *kobj,
145 struct kobj_attribute *attr, char *buf)
146{
147 return sprintf(buf, "%d\n", rcu_expedited);
148}
149static ssize_t rcu_expedited_store(struct kobject *kobj,
150 struct kobj_attribute *attr,
151 const char *buf, size_t count)
152{
153 if (kstrtoint(buf, 0, &rcu_expedited))
154 return -EINVAL;
155
156 return count;
157}
158KERNEL_ATTR_RW(rcu_expedited);
159
144/* 160/*
145 * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 161 * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
146 */ 162 */
@@ -169,10 +185,8 @@ EXPORT_SYMBOL_GPL(kernel_kobj);
169 185
170static struct attribute * kernel_attrs[] = { 186static struct attribute * kernel_attrs[] = {
171 &fscaps_attr.attr, 187 &fscaps_attr.attr,
172#if defined(CONFIG_HOTPLUG)
173 &uevent_seqnum_attr.attr, 188 &uevent_seqnum_attr.attr,
174 &uevent_helper_attr.attr, 189 &uevent_helper_attr.attr,
175#endif
176#ifdef CONFIG_PROFILING 190#ifdef CONFIG_PROFILING
177 &profiling_attr.attr, 191 &profiling_attr.attr,
178#endif 192#endif
@@ -182,6 +196,7 @@ static struct attribute * kernel_attrs[] = {
182 &kexec_crash_size_attr.attr, 196 &kexec_crash_size_attr.attr,
183 &vmcoreinfo_attr.attr, 197 &vmcoreinfo_attr.attr,
184#endif 198#endif
199 &rcu_expedited_attr.attr,
185 NULL 200 NULL
186}; 201};
187 202
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 29fb60caecb5..691dc2ef9baf 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -428,7 +428,7 @@ int kthreadd(void *unused)
428 set_task_comm(tsk, "kthreadd"); 428 set_task_comm(tsk, "kthreadd");
429 ignore_signals(tsk); 429 ignore_signals(tsk);
430 set_cpus_allowed_ptr(tsk, cpu_all_mask); 430 set_cpus_allowed_ptr(tsk, cpu_all_mask);
431 set_mems_allowed(node_states[N_HIGH_MEMORY]); 431 set_mems_allowed(node_states[N_MEMORY]);
432 432
433 current->flags |= PF_NOFREEZE; 433 current->flags |= PF_NOFREEZE;
434 434
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 91c32a0b612c..b2c71c5873e4 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -39,7 +39,7 @@ static void l_stop(struct seq_file *m, void *v)
39 39
40static void print_name(struct seq_file *m, struct lock_class *class) 40static void print_name(struct seq_file *m, struct lock_class *class)
41{ 41{
42 char str[128]; 42 char str[KSYM_NAME_LEN];
43 const char *name = class->name; 43 const char *name = class->name;
44 44
45 if (!name) { 45 if (!name) {
diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S
new file mode 100644
index 000000000000..246b4c6e6135
--- /dev/null
+++ b/kernel/modsign_certificate.S
@@ -0,0 +1,19 @@
1/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */
2#ifndef SYMBOL_PREFIX
3#define ASM_SYMBOL(sym) sym
4#else
5#define PASTE2(x,y) x##y
6#define PASTE(x,y) PASTE2(x,y)
7#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym)
8#endif
9
10#define GLOBAL(name) \
11 .globl ASM_SYMBOL(name); \
12 ASM_SYMBOL(name):
13
14 .section ".init.data","aw"
15
16GLOBAL(modsign_certificate_list)
17 .incbin "signing_key.x509"
18 .incbin "extra_certificates"
19GLOBAL(modsign_certificate_list_end)
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c
index 767e559dfb10..2b6e69909c39 100644
--- a/kernel/modsign_pubkey.c
+++ b/kernel/modsign_pubkey.c
@@ -20,12 +20,6 @@ struct key *modsign_keyring;
20 20
21extern __initdata const u8 modsign_certificate_list[]; 21extern __initdata const u8 modsign_certificate_list[];
22extern __initdata const u8 modsign_certificate_list_end[]; 22extern __initdata const u8 modsign_certificate_list_end[];
23asm(".section .init.data,\"aw\"\n"
24 SYMBOL_PREFIX "modsign_certificate_list:\n"
25 ".incbin \"signing_key.x509\"\n"
26 ".incbin \"extra_certificates\"\n"
27 SYMBOL_PREFIX "modsign_certificate_list_end:"
28 );
29 23
30/* 24/*
31 * We need to make sure ccache doesn't cache the .o file as it doesn't notice 25 * We need to make sure ccache doesn't cache the .o file as it doesn't notice
@@ -40,18 +34,15 @@ static __init int module_verify_init(void)
40{ 34{
41 pr_notice("Initialise module verification\n"); 35 pr_notice("Initialise module verification\n");
42 36
43 modsign_keyring = key_alloc(&key_type_keyring, ".module_sign", 37 modsign_keyring = keyring_alloc(".module_sign",
44 KUIDT_INIT(0), KGIDT_INIT(0), 38 KUIDT_INIT(0), KGIDT_INIT(0),
45 current_cred(), 39 current_cred(),
46 (KEY_POS_ALL & ~KEY_POS_SETATTR) | 40 ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
47 KEY_USR_VIEW | KEY_USR_READ, 41 KEY_USR_VIEW | KEY_USR_READ),
48 KEY_ALLOC_NOT_IN_QUOTA); 42 KEY_ALLOC_NOT_IN_QUOTA, NULL);
49 if (IS_ERR(modsign_keyring)) 43 if (IS_ERR(modsign_keyring))
50 panic("Can't allocate module signing keyring\n"); 44 panic("Can't allocate module signing keyring\n");
51 45
52 if (key_instantiate_and_link(modsign_keyring, NULL, 0, NULL, NULL) < 0)
53 panic("Can't instantiate module signing keyring\n");
54
55 return 0; 46 return 0;
56} 47}
57 48
diff --git a/kernel/module.c b/kernel/module.c
index 6e48c3a43599..250092c1d57d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -21,6 +21,7 @@
21#include <linux/ftrace_event.h> 21#include <linux/ftrace_event.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
24#include <linux/file.h>
24#include <linux/fs.h> 25#include <linux/fs.h>
25#include <linux/sysfs.h> 26#include <linux/sysfs.h>
26#include <linux/kernel.h> 27#include <linux/kernel.h>
@@ -28,6 +29,7 @@
28#include <linux/vmalloc.h> 29#include <linux/vmalloc.h>
29#include <linux/elf.h> 30#include <linux/elf.h>
30#include <linux/proc_fs.h> 31#include <linux/proc_fs.h>
32#include <linux/security.h>
31#include <linux/seq_file.h> 33#include <linux/seq_file.h>
32#include <linux/syscalls.h> 34#include <linux/syscalls.h>
33#include <linux/fcntl.h> 35#include <linux/fcntl.h>
@@ -59,6 +61,7 @@
59#include <linux/pfn.h> 61#include <linux/pfn.h>
60#include <linux/bsearch.h> 62#include <linux/bsearch.h>
61#include <linux/fips.h> 63#include <linux/fips.h>
64#include <uapi/linux/module.h>
62#include "module-internal.h" 65#include "module-internal.h"
63 66
64#define CREATE_TRACE_POINTS 67#define CREATE_TRACE_POINTS
@@ -372,9 +375,6 @@ static bool check_symbol(const struct symsearch *syms,
372 printk(KERN_WARNING "Symbol %s is being used " 375 printk(KERN_WARNING "Symbol %s is being used "
373 "by a non-GPL module, which will not " 376 "by a non-GPL module, which will not "
374 "be allowed in the future\n", fsa->name); 377 "be allowed in the future\n", fsa->name);
375 printk(KERN_WARNING "Please see the file "
376 "Documentation/feature-removal-schedule.txt "
377 "in the kernel source tree for more details.\n");
378 } 378 }
379 } 379 }
380 380
@@ -2282,7 +2282,7 @@ static void layout_symtab(struct module *mod, struct load_info *info)
2282 Elf_Shdr *symsect = info->sechdrs + info->index.sym; 2282 Elf_Shdr *symsect = info->sechdrs + info->index.sym;
2283 Elf_Shdr *strsect = info->sechdrs + info->index.str; 2283 Elf_Shdr *strsect = info->sechdrs + info->index.str;
2284 const Elf_Sym *src; 2284 const Elf_Sym *src;
2285 unsigned int i, nsrc, ndst, strtab_size; 2285 unsigned int i, nsrc, ndst, strtab_size = 0;
2286 2286
2287 /* Put symbol section at end of init part of module. */ 2287 /* Put symbol section at end of init part of module. */
2288 symsect->sh_flags |= SHF_ALLOC; 2288 symsect->sh_flags |= SHF_ALLOC;
@@ -2293,9 +2293,6 @@ static void layout_symtab(struct module *mod, struct load_info *info)
2293 src = (void *)info->hdr + symsect->sh_offset; 2293 src = (void *)info->hdr + symsect->sh_offset;
2294 nsrc = symsect->sh_size / sizeof(*src); 2294 nsrc = symsect->sh_size / sizeof(*src);
2295 2295
2296 /* strtab always starts with a nul, so offset 0 is the empty string. */
2297 strtab_size = 1;
2298
2299 /* Compute total space required for the core symbols' strtab. */ 2296 /* Compute total space required for the core symbols' strtab. */
2300 for (ndst = i = 0; i < nsrc; i++) { 2297 for (ndst = i = 0; i < nsrc; i++) {
2301 if (i == 0 || 2298 if (i == 0 ||
@@ -2337,7 +2334,6 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
2337 mod->core_symtab = dst = mod->module_core + info->symoffs; 2334 mod->core_symtab = dst = mod->module_core + info->symoffs;
2338 mod->core_strtab = s = mod->module_core + info->stroffs; 2335 mod->core_strtab = s = mod->module_core + info->stroffs;
2339 src = mod->symtab; 2336 src = mod->symtab;
2340 *s++ = 0;
2341 for (ndst = i = 0; i < mod->num_symtab; i++) { 2337 for (ndst = i = 0; i < mod->num_symtab; i++) {
2342 if (i == 0 || 2338 if (i == 0 ||
2343 is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { 2339 is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
@@ -2378,7 +2374,7 @@ static void dynamic_debug_remove(struct _ddebug *debug)
2378 2374
2379void * __weak module_alloc(unsigned long size) 2375void * __weak module_alloc(unsigned long size)
2380{ 2376{
2381 return size == 0 ? NULL : vmalloc_exec(size); 2377 return vmalloc_exec(size);
2382} 2378}
2383 2379
2384static void *module_alloc_update_bounds(unsigned long size) 2380static void *module_alloc_update_bounds(unsigned long size)
@@ -2425,18 +2421,17 @@ static inline void kmemleak_load_module(const struct module *mod,
2425#endif 2421#endif
2426 2422
2427#ifdef CONFIG_MODULE_SIG 2423#ifdef CONFIG_MODULE_SIG
2428static int module_sig_check(struct load_info *info, 2424static int module_sig_check(struct load_info *info)
2429 const void *mod, unsigned long *_len)
2430{ 2425{
2431 int err = -ENOKEY; 2426 int err = -ENOKEY;
2432 unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; 2427 const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
2433 unsigned long len = *_len; 2428 const void *mod = info->hdr;
2434 2429
2435 if (len > markerlen && 2430 if (info->len > markerlen &&
2436 memcmp(mod + len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { 2431 memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
2437 /* We truncate the module to discard the signature */ 2432 /* We truncate the module to discard the signature */
2438 *_len -= markerlen; 2433 info->len -= markerlen;
2439 err = mod_verify_sig(mod, _len); 2434 err = mod_verify_sig(mod, &info->len);
2440 } 2435 }
2441 2436
2442 if (!err) { 2437 if (!err) {
@@ -2454,59 +2449,107 @@ static int module_sig_check(struct load_info *info,
2454 return err; 2449 return err;
2455} 2450}
2456#else /* !CONFIG_MODULE_SIG */ 2451#else /* !CONFIG_MODULE_SIG */
2457static int module_sig_check(struct load_info *info, 2452static int module_sig_check(struct load_info *info)
2458 void *mod, unsigned long *len)
2459{ 2453{
2460 return 0; 2454 return 0;
2461} 2455}
2462#endif /* !CONFIG_MODULE_SIG */ 2456#endif /* !CONFIG_MODULE_SIG */
2463 2457
2464/* Sets info->hdr, info->len and info->sig_ok. */ 2458/* Sanity checks against invalid binaries, wrong arch, weird elf version. */
2465static int copy_and_check(struct load_info *info, 2459static int elf_header_check(struct load_info *info)
2466 const void __user *umod, unsigned long len, 2460{
2467 const char __user *uargs) 2461 if (info->len < sizeof(*(info->hdr)))
2462 return -ENOEXEC;
2463
2464 if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0
2465 || info->hdr->e_type != ET_REL
2466 || !elf_check_arch(info->hdr)
2467 || info->hdr->e_shentsize != sizeof(Elf_Shdr))
2468 return -ENOEXEC;
2469
2470 if (info->hdr->e_shoff >= info->len
2471 || (info->hdr->e_shnum * sizeof(Elf_Shdr) >
2472 info->len - info->hdr->e_shoff))
2473 return -ENOEXEC;
2474
2475 return 0;
2476}
2477
2478/* Sets info->hdr and info->len. */
2479static int copy_module_from_user(const void __user *umod, unsigned long len,
2480 struct load_info *info)
2468{ 2481{
2469 int err; 2482 int err;
2470 Elf_Ehdr *hdr;
2471 2483
2472 if (len < sizeof(*hdr)) 2484 info->len = len;
2485 if (info->len < sizeof(*(info->hdr)))
2473 return -ENOEXEC; 2486 return -ENOEXEC;
2474 2487
2488 err = security_kernel_module_from_file(NULL);
2489 if (err)
2490 return err;
2491
2475 /* Suck in entire file: we'll want most of it. */ 2492 /* Suck in entire file: we'll want most of it. */
2476 if ((hdr = vmalloc(len)) == NULL) 2493 info->hdr = vmalloc(info->len);
2494 if (!info->hdr)
2477 return -ENOMEM; 2495 return -ENOMEM;
2478 2496
2479 if (copy_from_user(hdr, umod, len) != 0) { 2497 if (copy_from_user(info->hdr, umod, info->len) != 0) {
2480 err = -EFAULT; 2498 vfree(info->hdr);
2481 goto free_hdr; 2499 return -EFAULT;
2482 } 2500 }
2483 2501
2484 err = module_sig_check(info, hdr, &len); 2502 return 0;
2503}
2504
2505/* Sets info->hdr and info->len. */
2506static int copy_module_from_fd(int fd, struct load_info *info)
2507{
2508 struct file *file;
2509 int err;
2510 struct kstat stat;
2511 loff_t pos;
2512 ssize_t bytes = 0;
2513
2514 file = fget(fd);
2515 if (!file)
2516 return -ENOEXEC;
2517
2518 err = security_kernel_module_from_file(file);
2485 if (err) 2519 if (err)
2486 goto free_hdr; 2520 goto out;
2487 2521
2488 /* Sanity checks against insmoding binaries or wrong arch, 2522 err = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat);
2489 weird elf version */ 2523 if (err)
2490 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 2524 goto out;
2491 || hdr->e_type != ET_REL
2492 || !elf_check_arch(hdr)
2493 || hdr->e_shentsize != sizeof(Elf_Shdr)) {
2494 err = -ENOEXEC;
2495 goto free_hdr;
2496 }
2497 2525
2498 if (hdr->e_shoff >= len || 2526 if (stat.size > INT_MAX) {
2499 hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) { 2527 err = -EFBIG;
2500 err = -ENOEXEC; 2528 goto out;
2501 goto free_hdr; 2529 }
2530 info->hdr = vmalloc(stat.size);
2531 if (!info->hdr) {
2532 err = -ENOMEM;
2533 goto out;
2502 } 2534 }
2503 2535
2504 info->hdr = hdr; 2536 pos = 0;
2505 info->len = len; 2537 while (pos < stat.size) {
2506 return 0; 2538 bytes = kernel_read(file, pos, (char *)(info->hdr) + pos,
2539 stat.size - pos);
2540 if (bytes < 0) {
2541 vfree(info->hdr);
2542 err = bytes;
2543 goto out;
2544 }
2545 if (bytes == 0)
2546 break;
2547 pos += bytes;
2548 }
2549 info->len = pos;
2507 2550
2508free_hdr: 2551out:
2509 vfree(hdr); 2552 fput(file);
2510 return err; 2553 return err;
2511} 2554}
2512 2555
@@ -2515,7 +2558,7 @@ static void free_copy(struct load_info *info)
2515 vfree(info->hdr); 2558 vfree(info->hdr);
2516} 2559}
2517 2560
2518static int rewrite_section_headers(struct load_info *info) 2561static int rewrite_section_headers(struct load_info *info, int flags)
2519{ 2562{
2520 unsigned int i; 2563 unsigned int i;
2521 2564
@@ -2543,7 +2586,10 @@ static int rewrite_section_headers(struct load_info *info)
2543 } 2586 }
2544 2587
2545 /* Track but don't keep modinfo and version sections. */ 2588 /* Track but don't keep modinfo and version sections. */
2546 info->index.vers = find_sec(info, "__versions"); 2589 if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
2590 info->index.vers = 0; /* Pretend no __versions section! */
2591 else
2592 info->index.vers = find_sec(info, "__versions");
2547 info->index.info = find_sec(info, ".modinfo"); 2593 info->index.info = find_sec(info, ".modinfo");
2548 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; 2594 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
2549 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; 2595 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -2558,7 +2604,7 @@ static int rewrite_section_headers(struct load_info *info)
2558 * Return the temporary module pointer (we'll replace it with the final 2604 * Return the temporary module pointer (we'll replace it with the final
2559 * one when we move the module sections around). 2605 * one when we move the module sections around).
2560 */ 2606 */
2561static struct module *setup_load_info(struct load_info *info) 2607static struct module *setup_load_info(struct load_info *info, int flags)
2562{ 2608{
2563 unsigned int i; 2609 unsigned int i;
2564 int err; 2610 int err;
@@ -2569,7 +2615,7 @@ static struct module *setup_load_info(struct load_info *info)
2569 info->secstrings = (void *)info->hdr 2615 info->secstrings = (void *)info->hdr
2570 + info->sechdrs[info->hdr->e_shstrndx].sh_offset; 2616 + info->sechdrs[info->hdr->e_shstrndx].sh_offset;
2571 2617
2572 err = rewrite_section_headers(info); 2618 err = rewrite_section_headers(info, flags);
2573 if (err) 2619 if (err)
2574 return ERR_PTR(err); 2620 return ERR_PTR(err);
2575 2621
@@ -2607,11 +2653,14 @@ static struct module *setup_load_info(struct load_info *info)
2607 return mod; 2653 return mod;
2608} 2654}
2609 2655
2610static int check_modinfo(struct module *mod, struct load_info *info) 2656static int check_modinfo(struct module *mod, struct load_info *info, int flags)
2611{ 2657{
2612 const char *modmagic = get_modinfo(info, "vermagic"); 2658 const char *modmagic = get_modinfo(info, "vermagic");
2613 int err; 2659 int err;
2614 2660
2661 if (flags & MODULE_INIT_IGNORE_VERMAGIC)
2662 modmagic = NULL;
2663
2615 /* This is allowed: modprobe --force will invalidate it. */ 2664 /* This is allowed: modprobe --force will invalidate it. */
2616 if (!modmagic) { 2665 if (!modmagic) {
2617 err = try_to_force_load(mod, "bad vermagic"); 2666 err = try_to_force_load(mod, "bad vermagic");
@@ -2741,20 +2790,23 @@ static int move_module(struct module *mod, struct load_info *info)
2741 memset(ptr, 0, mod->core_size); 2790 memset(ptr, 0, mod->core_size);
2742 mod->module_core = ptr; 2791 mod->module_core = ptr;
2743 2792
2744 ptr = module_alloc_update_bounds(mod->init_size); 2793 if (mod->init_size) {
2745 /* 2794 ptr = module_alloc_update_bounds(mod->init_size);
2746 * The pointer to this block is stored in the module structure 2795 /*
2747 * which is inside the block. This block doesn't need to be 2796 * The pointer to this block is stored in the module structure
2748 * scanned as it contains data and code that will be freed 2797 * which is inside the block. This block doesn't need to be
2749 * after the module is initialized. 2798 * scanned as it contains data and code that will be freed
2750 */ 2799 * after the module is initialized.
2751 kmemleak_ignore(ptr); 2800 */
2752 if (!ptr && mod->init_size) { 2801 kmemleak_ignore(ptr);
2753 module_free(mod, mod->module_core); 2802 if (!ptr) {
2754 return -ENOMEM; 2803 module_free(mod, mod->module_core);
2755 } 2804 return -ENOMEM;
2756 memset(ptr, 0, mod->init_size); 2805 }
2757 mod->module_init = ptr; 2806 memset(ptr, 0, mod->init_size);
2807 mod->module_init = ptr;
2808 } else
2809 mod->module_init = NULL;
2758 2810
2759 /* Transfer each section which specifies SHF_ALLOC */ 2811 /* Transfer each section which specifies SHF_ALLOC */
2760 pr_debug("final section addresses:\n"); 2812 pr_debug("final section addresses:\n");
@@ -2847,18 +2899,18 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
2847 return 0; 2899 return 0;
2848} 2900}
2849 2901
2850static struct module *layout_and_allocate(struct load_info *info) 2902static struct module *layout_and_allocate(struct load_info *info, int flags)
2851{ 2903{
2852 /* Module within temporary copy. */ 2904 /* Module within temporary copy. */
2853 struct module *mod; 2905 struct module *mod;
2854 Elf_Shdr *pcpusec; 2906 Elf_Shdr *pcpusec;
2855 int err; 2907 int err;
2856 2908
2857 mod = setup_load_info(info); 2909 mod = setup_load_info(info, flags);
2858 if (IS_ERR(mod)) 2910 if (IS_ERR(mod))
2859 return mod; 2911 return mod;
2860 2912
2861 err = check_modinfo(mod, info); 2913 err = check_modinfo(mod, info, flags);
2862 if (err) 2914 if (err)
2863 return ERR_PTR(err); 2915 return ERR_PTR(err);
2864 2916
@@ -2945,33 +2997,124 @@ static bool finished_loading(const char *name)
2945 return ret; 2997 return ret;
2946} 2998}
2947 2999
3000/* Call module constructors. */
3001static void do_mod_ctors(struct module *mod)
3002{
3003#ifdef CONFIG_CONSTRUCTORS
3004 unsigned long i;
3005
3006 for (i = 0; i < mod->num_ctors; i++)
3007 mod->ctors[i]();
3008#endif
3009}
3010
3011/* This is where the real work happens */
3012static int do_init_module(struct module *mod)
3013{
3014 int ret = 0;
3015
3016 blocking_notifier_call_chain(&module_notify_list,
3017 MODULE_STATE_COMING, mod);
3018
3019 /* Set RO and NX regions for core */
3020 set_section_ro_nx(mod->module_core,
3021 mod->core_text_size,
3022 mod->core_ro_size,
3023 mod->core_size);
3024
3025 /* Set RO and NX regions for init */
3026 set_section_ro_nx(mod->module_init,
3027 mod->init_text_size,
3028 mod->init_ro_size,
3029 mod->init_size);
3030
3031 do_mod_ctors(mod);
3032 /* Start the module */
3033 if (mod->init != NULL)
3034 ret = do_one_initcall(mod->init);
3035 if (ret < 0) {
3036 /* Init routine failed: abort. Try to protect us from
3037 buggy refcounters. */
3038 mod->state = MODULE_STATE_GOING;
3039 synchronize_sched();
3040 module_put(mod);
3041 blocking_notifier_call_chain(&module_notify_list,
3042 MODULE_STATE_GOING, mod);
3043 free_module(mod);
3044 wake_up_all(&module_wq);
3045 return ret;
3046 }
3047 if (ret > 0) {
3048 printk(KERN_WARNING
3049"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
3050"%s: loading module anyway...\n",
3051 __func__, mod->name, ret,
3052 __func__);
3053 dump_stack();
3054 }
3055
3056 /* Now it's a first class citizen! */
3057 mod->state = MODULE_STATE_LIVE;
3058 blocking_notifier_call_chain(&module_notify_list,
3059 MODULE_STATE_LIVE, mod);
3060
3061 /* We need to finish all async code before the module init sequence is done */
3062 async_synchronize_full();
3063
3064 mutex_lock(&module_mutex);
3065 /* Drop initial reference. */
3066 module_put(mod);
3067 trim_init_extable(mod);
3068#ifdef CONFIG_KALLSYMS
3069 mod->num_symtab = mod->core_num_syms;
3070 mod->symtab = mod->core_symtab;
3071 mod->strtab = mod->core_strtab;
3072#endif
3073 unset_module_init_ro_nx(mod);
3074 module_free(mod, mod->module_init);
3075 mod->module_init = NULL;
3076 mod->init_size = 0;
3077 mod->init_ro_size = 0;
3078 mod->init_text_size = 0;
3079 mutex_unlock(&module_mutex);
3080 wake_up_all(&module_wq);
3081
3082 return 0;
3083}
3084
3085static int may_init_module(void)
3086{
3087 if (!capable(CAP_SYS_MODULE) || modules_disabled)
3088 return -EPERM;
3089
3090 return 0;
3091}
3092
2948/* Allocate and load the module: note that size of section 0 is always 3093/* Allocate and load the module: note that size of section 0 is always
2949 zero, and we rely on this for optional sections. */ 3094 zero, and we rely on this for optional sections. */
2950static struct module *load_module(void __user *umod, 3095static int load_module(struct load_info *info, const char __user *uargs,
2951 unsigned long len, 3096 int flags)
2952 const char __user *uargs)
2953{ 3097{
2954 struct load_info info = { NULL, };
2955 struct module *mod, *old; 3098 struct module *mod, *old;
2956 long err; 3099 long err;
2957 3100
2958 pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", 3101 err = module_sig_check(info);
2959 umod, len, uargs); 3102 if (err)
3103 goto free_copy;
2960 3104
2961 /* Copy in the blobs from userspace, check they are vaguely sane. */ 3105 err = elf_header_check(info);
2962 err = copy_and_check(&info, umod, len, uargs);
2963 if (err) 3106 if (err)
2964 return ERR_PTR(err); 3107 goto free_copy;
2965 3108
2966 /* Figure out module layout, and allocate all the memory. */ 3109 /* Figure out module layout, and allocate all the memory. */
2967 mod = layout_and_allocate(&info); 3110 mod = layout_and_allocate(info, flags);
2968 if (IS_ERR(mod)) { 3111 if (IS_ERR(mod)) {
2969 err = PTR_ERR(mod); 3112 err = PTR_ERR(mod);
2970 goto free_copy; 3113 goto free_copy;
2971 } 3114 }
2972 3115
2973#ifdef CONFIG_MODULE_SIG 3116#ifdef CONFIG_MODULE_SIG
2974 mod->sig_ok = info.sig_ok; 3117 mod->sig_ok = info->sig_ok;
2975 if (!mod->sig_ok) 3118 if (!mod->sig_ok)
2976 add_taint_module(mod, TAINT_FORCED_MODULE); 3119 add_taint_module(mod, TAINT_FORCED_MODULE);
2977#endif 3120#endif
@@ -2983,25 +3126,25 @@ static struct module *load_module(void __user *umod,
2983 3126
2984 /* Now we've got everything in the final locations, we can 3127 /* Now we've got everything in the final locations, we can
2985 * find optional sections. */ 3128 * find optional sections. */
2986 find_module_sections(mod, &info); 3129 find_module_sections(mod, info);
2987 3130
2988 err = check_module_license_and_versions(mod); 3131 err = check_module_license_and_versions(mod);
2989 if (err) 3132 if (err)
2990 goto free_unload; 3133 goto free_unload;
2991 3134
2992 /* Set up MODINFO_ATTR fields */ 3135 /* Set up MODINFO_ATTR fields */
2993 setup_modinfo(mod, &info); 3136 setup_modinfo(mod, info);
2994 3137
2995 /* Fix up syms, so that st_value is a pointer to location. */ 3138 /* Fix up syms, so that st_value is a pointer to location. */
2996 err = simplify_symbols(mod, &info); 3139 err = simplify_symbols(mod, info);
2997 if (err < 0) 3140 if (err < 0)
2998 goto free_modinfo; 3141 goto free_modinfo;
2999 3142
3000 err = apply_relocations(mod, &info); 3143 err = apply_relocations(mod, info);
3001 if (err < 0) 3144 if (err < 0)
3002 goto free_modinfo; 3145 goto free_modinfo;
3003 3146
3004 err = post_relocation(mod, &info); 3147 err = post_relocation(mod, info);
3005 if (err < 0) 3148 if (err < 0)
3006 goto free_modinfo; 3149 goto free_modinfo;
3007 3150
@@ -3041,14 +3184,14 @@ again:
3041 } 3184 }
3042 3185
3043 /* This has to be done once we're sure module name is unique. */ 3186 /* This has to be done once we're sure module name is unique. */
3044 dynamic_debug_setup(info.debug, info.num_debug); 3187 dynamic_debug_setup(info->debug, info->num_debug);
3045 3188
3046 /* Find duplicate symbols */ 3189 /* Find duplicate symbols */
3047 err = verify_export_symbols(mod); 3190 err = verify_export_symbols(mod);
3048 if (err < 0) 3191 if (err < 0)
3049 goto ddebug; 3192 goto ddebug;
3050 3193
3051 module_bug_finalize(info.hdr, info.sechdrs, mod); 3194 module_bug_finalize(info->hdr, info->sechdrs, mod);
3052 list_add_rcu(&mod->list, &modules); 3195 list_add_rcu(&mod->list, &modules);
3053 mutex_unlock(&module_mutex); 3196 mutex_unlock(&module_mutex);
3054 3197
@@ -3059,16 +3202,17 @@ again:
3059 goto unlink; 3202 goto unlink;
3060 3203
3061 /* Link in to syfs. */ 3204 /* Link in to syfs. */
3062 err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp); 3205 err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
3063 if (err < 0) 3206 if (err < 0)
3064 goto unlink; 3207 goto unlink;
3065 3208
3066 /* Get rid of temporary copy. */ 3209 /* Get rid of temporary copy. */
3067 free_copy(&info); 3210 free_copy(info);
3068 3211
3069 /* Done! */ 3212 /* Done! */
3070 trace_module_load(mod); 3213 trace_module_load(mod);
3071 return mod; 3214
3215 return do_init_module(mod);
3072 3216
3073 unlink: 3217 unlink:
3074 mutex_lock(&module_mutex); 3218 mutex_lock(&module_mutex);
@@ -3077,7 +3221,7 @@ again:
3077 module_bug_cleanup(mod); 3221 module_bug_cleanup(mod);
3078 wake_up_all(&module_wq); 3222 wake_up_all(&module_wq);
3079 ddebug: 3223 ddebug:
3080 dynamic_debug_remove(info.debug); 3224 dynamic_debug_remove(info->debug);
3081 unlock: 3225 unlock:
3082 mutex_unlock(&module_mutex); 3226 mutex_unlock(&module_mutex);
3083 synchronize_sched(); 3227 synchronize_sched();
@@ -3089,106 +3233,52 @@ again:
3089 free_unload: 3233 free_unload:
3090 module_unload_free(mod); 3234 module_unload_free(mod);
3091 free_module: 3235 free_module:
3092 module_deallocate(mod, &info); 3236 module_deallocate(mod, info);
3093 free_copy: 3237 free_copy:
3094 free_copy(&info); 3238 free_copy(info);
3095 return ERR_PTR(err); 3239 return err;
3096}
3097
3098/* Call module constructors. */
3099static void do_mod_ctors(struct module *mod)
3100{
3101#ifdef CONFIG_CONSTRUCTORS
3102 unsigned long i;
3103
3104 for (i = 0; i < mod->num_ctors; i++)
3105 mod->ctors[i]();
3106#endif
3107} 3240}
3108 3241
3109/* This is where the real work happens */
3110SYSCALL_DEFINE3(init_module, void __user *, umod, 3242SYSCALL_DEFINE3(init_module, void __user *, umod,
3111 unsigned long, len, const char __user *, uargs) 3243 unsigned long, len, const char __user *, uargs)
3112{ 3244{
3113 struct module *mod; 3245 int err;
3114 int ret = 0; 3246 struct load_info info = { };
3115 3247
3116 /* Must have permission */ 3248 err = may_init_module();
3117 if (!capable(CAP_SYS_MODULE) || modules_disabled) 3249 if (err)
3118 return -EPERM; 3250 return err;
3119 3251
3120 /* Do all the hard work */ 3252 pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
3121 mod = load_module(umod, len, uargs); 3253 umod, len, uargs);
3122 if (IS_ERR(mod))
3123 return PTR_ERR(mod);
3124 3254
3125 blocking_notifier_call_chain(&module_notify_list, 3255 err = copy_module_from_user(umod, len, &info);
3126 MODULE_STATE_COMING, mod); 3256 if (err)
3257 return err;
3127 3258
3128 /* Set RO and NX regions for core */ 3259 return load_module(&info, uargs, 0);
3129 set_section_ro_nx(mod->module_core, 3260}
3130 mod->core_text_size,
3131 mod->core_ro_size,
3132 mod->core_size);
3133 3261
3134 /* Set RO and NX regions for init */ 3262SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
3135 set_section_ro_nx(mod->module_init, 3263{
3136 mod->init_text_size, 3264 int err;
3137 mod->init_ro_size, 3265 struct load_info info = { };
3138 mod->init_size);
3139 3266
3140 do_mod_ctors(mod); 3267 err = may_init_module();
3141 /* Start the module */ 3268 if (err)
3142 if (mod->init != NULL) 3269 return err;
3143 ret = do_one_initcall(mod->init);
3144 if (ret < 0) {
3145 /* Init routine failed: abort. Try to protect us from
3146 buggy refcounters. */
3147 mod->state = MODULE_STATE_GOING;
3148 synchronize_sched();
3149 module_put(mod);
3150 blocking_notifier_call_chain(&module_notify_list,
3151 MODULE_STATE_GOING, mod);
3152 free_module(mod);
3153 wake_up_all(&module_wq);
3154 return ret;
3155 }
3156 if (ret > 0) {
3157 printk(KERN_WARNING
3158"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
3159"%s: loading module anyway...\n",
3160 __func__, mod->name, ret,
3161 __func__);
3162 dump_stack();
3163 }
3164 3270
3165 /* Now it's a first class citizen! */ 3271 pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);
3166 mod->state = MODULE_STATE_LIVE;
3167 blocking_notifier_call_chain(&module_notify_list,
3168 MODULE_STATE_LIVE, mod);
3169 3272
3170 /* We need to finish all async code before the module init sequence is done */ 3273 if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
3171 async_synchronize_full(); 3274 |MODULE_INIT_IGNORE_VERMAGIC))
3275 return -EINVAL;
3172 3276
3173 mutex_lock(&module_mutex); 3277 err = copy_module_from_fd(fd, &info);
3174 /* Drop initial reference. */ 3278 if (err)
3175 module_put(mod); 3279 return err;
3176 trim_init_extable(mod);
3177#ifdef CONFIG_KALLSYMS
3178 mod->num_symtab = mod->core_num_syms;
3179 mod->symtab = mod->core_symtab;
3180 mod->strtab = mod->core_strtab;
3181#endif
3182 unset_module_init_ro_nx(mod);
3183 module_free(mod, mod->module_init);
3184 mod->module_init = NULL;
3185 mod->init_size = 0;
3186 mod->init_ro_size = 0;
3187 mod->init_text_size = 0;
3188 mutex_unlock(&module_mutex);
3189 wake_up_all(&module_wq);
3190 3280
3191 return 0; 3281 return load_module(&info, uargs, flags);
3192} 3282}
3193 3283
3194static inline int within(unsigned long addr, void *start, unsigned long size) 3284static inline int within(unsigned long addr, void *start, unsigned long size)
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index b576f7f14bc6..78e2ecb20165 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -57,7 +57,8 @@ static inline struct nsproxy *create_nsproxy(void)
57 * leave it to the caller to do proper locking and attach it to task. 57 * leave it to the caller to do proper locking and attach it to task.
58 */ 58 */
59static struct nsproxy *create_new_namespaces(unsigned long flags, 59static struct nsproxy *create_new_namespaces(unsigned long flags,
60 struct task_struct *tsk, struct fs_struct *new_fs) 60 struct task_struct *tsk, struct user_namespace *user_ns,
61 struct fs_struct *new_fs)
61{ 62{
62 struct nsproxy *new_nsp; 63 struct nsproxy *new_nsp;
63 int err; 64 int err;
@@ -66,31 +67,31 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
66 if (!new_nsp) 67 if (!new_nsp)
67 return ERR_PTR(-ENOMEM); 68 return ERR_PTR(-ENOMEM);
68 69
69 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); 70 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
70 if (IS_ERR(new_nsp->mnt_ns)) { 71 if (IS_ERR(new_nsp->mnt_ns)) {
71 err = PTR_ERR(new_nsp->mnt_ns); 72 err = PTR_ERR(new_nsp->mnt_ns);
72 goto out_ns; 73 goto out_ns;
73 } 74 }
74 75
75 new_nsp->uts_ns = copy_utsname(flags, tsk); 76 new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
76 if (IS_ERR(new_nsp->uts_ns)) { 77 if (IS_ERR(new_nsp->uts_ns)) {
77 err = PTR_ERR(new_nsp->uts_ns); 78 err = PTR_ERR(new_nsp->uts_ns);
78 goto out_uts; 79 goto out_uts;
79 } 80 }
80 81
81 new_nsp->ipc_ns = copy_ipcs(flags, tsk); 82 new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);
82 if (IS_ERR(new_nsp->ipc_ns)) { 83 if (IS_ERR(new_nsp->ipc_ns)) {
83 err = PTR_ERR(new_nsp->ipc_ns); 84 err = PTR_ERR(new_nsp->ipc_ns);
84 goto out_ipc; 85 goto out_ipc;
85 } 86 }
86 87
87 new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); 88 new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns);
88 if (IS_ERR(new_nsp->pid_ns)) { 89 if (IS_ERR(new_nsp->pid_ns)) {
89 err = PTR_ERR(new_nsp->pid_ns); 90 err = PTR_ERR(new_nsp->pid_ns);
90 goto out_pid; 91 goto out_pid;
91 } 92 }
92 93
93 new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); 94 new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
94 if (IS_ERR(new_nsp->net_ns)) { 95 if (IS_ERR(new_nsp->net_ns)) {
95 err = PTR_ERR(new_nsp->net_ns); 96 err = PTR_ERR(new_nsp->net_ns);
96 goto out_net; 97 goto out_net;
@@ -122,6 +123,7 @@ out_ns:
122int copy_namespaces(unsigned long flags, struct task_struct *tsk) 123int copy_namespaces(unsigned long flags, struct task_struct *tsk)
123{ 124{
124 struct nsproxy *old_ns = tsk->nsproxy; 125 struct nsproxy *old_ns = tsk->nsproxy;
126 struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
125 struct nsproxy *new_ns; 127 struct nsproxy *new_ns;
126 int err = 0; 128 int err = 0;
127 129
@@ -134,7 +136,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
134 CLONE_NEWPID | CLONE_NEWNET))) 136 CLONE_NEWPID | CLONE_NEWNET)))
135 return 0; 137 return 0;
136 138
137 if (!capable(CAP_SYS_ADMIN)) { 139 if (!ns_capable(user_ns, CAP_SYS_ADMIN)) {
138 err = -EPERM; 140 err = -EPERM;
139 goto out; 141 goto out;
140 } 142 }
@@ -151,7 +153,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
151 goto out; 153 goto out;
152 } 154 }
153 155
154 new_ns = create_new_namespaces(flags, tsk, tsk->fs); 156 new_ns = create_new_namespaces(flags, tsk,
157 task_cred_xxx(tsk, user_ns), tsk->fs);
155 if (IS_ERR(new_ns)) { 158 if (IS_ERR(new_ns)) {
156 err = PTR_ERR(new_ns); 159 err = PTR_ERR(new_ns);
157 goto out; 160 goto out;
@@ -183,19 +186,21 @@ void free_nsproxy(struct nsproxy *ns)
183 * On success, returns the new nsproxy. 186 * On success, returns the new nsproxy.
184 */ 187 */
185int unshare_nsproxy_namespaces(unsigned long unshare_flags, 188int unshare_nsproxy_namespaces(unsigned long unshare_flags,
186 struct nsproxy **new_nsp, struct fs_struct *new_fs) 189 struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
187{ 190{
191 struct user_namespace *user_ns;
188 int err = 0; 192 int err = 0;
189 193
190 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | 194 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
191 CLONE_NEWNET))) 195 CLONE_NEWNET | CLONE_NEWPID)))
192 return 0; 196 return 0;
193 197
194 if (!capable(CAP_SYS_ADMIN)) 198 user_ns = new_cred ? new_cred->user_ns : current_user_ns();
199 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
195 return -EPERM; 200 return -EPERM;
196 201
197 *new_nsp = create_new_namespaces(unshare_flags, current, 202 *new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
198 new_fs ? new_fs : current->fs); 203 new_fs ? new_fs : current->fs);
199 if (IS_ERR(*new_nsp)) { 204 if (IS_ERR(*new_nsp)) {
200 err = PTR_ERR(*new_nsp); 205 err = PTR_ERR(*new_nsp);
201 goto out; 206 goto out;
@@ -241,9 +246,6 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
241 struct file *file; 246 struct file *file;
242 int err; 247 int err;
243 248
244 if (!capable(CAP_SYS_ADMIN))
245 return -EPERM;
246
247 file = proc_ns_fget(fd); 249 file = proc_ns_fget(fd);
248 if (IS_ERR(file)) 250 if (IS_ERR(file))
249 return PTR_ERR(file); 251 return PTR_ERR(file);
@@ -254,7 +256,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
254 if (nstype && (ops->type != nstype)) 256 if (nstype && (ops->type != nstype))
255 goto out; 257 goto out;
256 258
257 new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); 259 new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
258 if (IS_ERR(new_nsproxy)) { 260 if (IS_ERR(new_nsproxy)) {
259 err = PTR_ERR(new_nsproxy); 261 err = PTR_ERR(new_nsproxy);
260 goto out; 262 goto out;
diff --git a/kernel/padata.c b/kernel/padata.c
index 89fe3d1b9efb..072f4ee4eb89 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -171,7 +171,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
171{ 171{
172 int cpu, num_cpus; 172 int cpu, num_cpus;
173 unsigned int next_nr, next_index; 173 unsigned int next_nr, next_index;
174 struct padata_parallel_queue *queue, *next_queue; 174 struct padata_parallel_queue *next_queue;
175 struct padata_priv *padata; 175 struct padata_priv *padata;
176 struct padata_list *reorder; 176 struct padata_list *reorder;
177 177
@@ -204,8 +204,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
204 goto out; 204 goto out;
205 } 205 }
206 206
207 queue = per_cpu_ptr(pd->pqueue, smp_processor_id()); 207 if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) {
208 if (queue->cpu_index == next_queue->cpu_index) {
209 padata = ERR_PTR(-ENODATA); 208 padata = ERR_PTR(-ENODATA);
210 goto out; 209 goto out;
211 } 210 }
diff --git a/kernel/pid.c b/kernel/pid.c
index aebd4f5aaf41..de9af600006f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * Generic pidhash and scalable, time-bounded PID allocator 2 * Generic pidhash and scalable, time-bounded PID allocator
3 * 3 *
4 * (C) 2002-2003 William Irwin, IBM 4 * (C) 2002-2003 Nadia Yvette Chambers, IBM
5 * (C) 2004 William Irwin, Oracle 5 * (C) 2004 Nadia Yvette Chambers, Oracle
6 * (C) 2002-2004 Ingo Molnar, Red Hat 6 * (C) 2002-2004 Ingo Molnar, Red Hat
7 * 7 *
8 * pid-structures are backing objects for tasks sharing a given ID to chain 8 * pid-structures are backing objects for tasks sharing a given ID to chain
@@ -36,6 +36,7 @@
36#include <linux/pid_namespace.h> 36#include <linux/pid_namespace.h>
37#include <linux/init_task.h> 37#include <linux/init_task.h>
38#include <linux/syscalls.h> 38#include <linux/syscalls.h>
39#include <linux/proc_fs.h>
39 40
40#define pid_hashfn(nr, ns) \ 41#define pid_hashfn(nr, ns) \
41 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) 42 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -78,24 +79,11 @@ struct pid_namespace init_pid_ns = {
78 .last_pid = 0, 79 .last_pid = 0,
79 .level = 0, 80 .level = 0,
80 .child_reaper = &init_task, 81 .child_reaper = &init_task,
82 .user_ns = &init_user_ns,
83 .proc_inum = PROC_PID_INIT_INO,
81}; 84};
82EXPORT_SYMBOL_GPL(init_pid_ns); 85EXPORT_SYMBOL_GPL(init_pid_ns);
83 86
84int is_container_init(struct task_struct *tsk)
85{
86 int ret = 0;
87 struct pid *pid;
88
89 rcu_read_lock();
90 pid = task_pid(tsk);
91 if (pid != NULL && pid->numbers[pid->level].nr == 1)
92 ret = 1;
93 rcu_read_unlock();
94
95 return ret;
96}
97EXPORT_SYMBOL(is_container_init);
98
99/* 87/*
100 * Note: disable interrupts while the pidmap_lock is held as an 88 * Note: disable interrupts while the pidmap_lock is held as an
101 * interrupt might come in and do read_lock(&tasklist_lock). 89 * interrupt might come in and do read_lock(&tasklist_lock).
@@ -269,8 +257,23 @@ void free_pid(struct pid *pid)
269 unsigned long flags; 257 unsigned long flags;
270 258
271 spin_lock_irqsave(&pidmap_lock, flags); 259 spin_lock_irqsave(&pidmap_lock, flags);
272 for (i = 0; i <= pid->level; i++) 260 for (i = 0; i <= pid->level; i++) {
273 hlist_del_rcu(&pid->numbers[i].pid_chain); 261 struct upid *upid = pid->numbers + i;
262 struct pid_namespace *ns = upid->ns;
263 hlist_del_rcu(&upid->pid_chain);
264 switch(--ns->nr_hashed) {
265 case 1:
266 /* When all that is left in the pid namespace
267 * is the reaper wake up the reaper. The reaper
268 * may be sleeping in zap_pid_ns_processes().
269 */
270 wake_up_process(ns->child_reaper);
271 break;
272 case 0:
273 schedule_work(&ns->proc_work);
274 break;
275 }
276 }
274 spin_unlock_irqrestore(&pidmap_lock, flags); 277 spin_unlock_irqrestore(&pidmap_lock, flags);
275 278
276 for (i = 0; i <= pid->level; i++) 279 for (i = 0; i <= pid->level; i++)
@@ -292,6 +295,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
292 goto out; 295 goto out;
293 296
294 tmp = ns; 297 tmp = ns;
298 pid->level = ns->level;
295 for (i = ns->level; i >= 0; i--) { 299 for (i = ns->level; i >= 0; i--) {
296 nr = alloc_pidmap(tmp); 300 nr = alloc_pidmap(tmp);
297 if (nr < 0) 301 if (nr < 0)
@@ -302,22 +306,32 @@ struct pid *alloc_pid(struct pid_namespace *ns)
302 tmp = tmp->parent; 306 tmp = tmp->parent;
303 } 307 }
304 308
309 if (unlikely(is_child_reaper(pid))) {
310 if (pid_ns_prepare_proc(ns))
311 goto out_free;
312 }
313
305 get_pid_ns(ns); 314 get_pid_ns(ns);
306 pid->level = ns->level;
307 atomic_set(&pid->count, 1); 315 atomic_set(&pid->count, 1);
308 for (type = 0; type < PIDTYPE_MAX; ++type) 316 for (type = 0; type < PIDTYPE_MAX; ++type)
309 INIT_HLIST_HEAD(&pid->tasks[type]); 317 INIT_HLIST_HEAD(&pid->tasks[type]);
310 318
311 upid = pid->numbers + ns->level; 319 upid = pid->numbers + ns->level;
312 spin_lock_irq(&pidmap_lock); 320 spin_lock_irq(&pidmap_lock);
313 for ( ; upid >= pid->numbers; --upid) 321 if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
322 goto out_unlock;
323 for ( ; upid >= pid->numbers; --upid) {
314 hlist_add_head_rcu(&upid->pid_chain, 324 hlist_add_head_rcu(&upid->pid_chain,
315 &pid_hash[pid_hashfn(upid->nr, upid->ns)]); 325 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
326 upid->ns->nr_hashed++;
327 }
316 spin_unlock_irq(&pidmap_lock); 328 spin_unlock_irq(&pidmap_lock);
317 329
318out: 330out:
319 return pid; 331 return pid;
320 332
333out_unlock:
334 spin_unlock(&pidmap_lock);
321out_free: 335out_free:
322 while (++i <= ns->level) 336 while (++i <= ns->level)
323 free_pidmap(pid->numbers + i); 337 free_pidmap(pid->numbers + i);
@@ -327,6 +341,13 @@ out_free:
327 goto out; 341 goto out;
328} 342}
329 343
344void disable_pid_allocation(struct pid_namespace *ns)
345{
346 spin_lock_irq(&pidmap_lock);
347 ns->nr_hashed &= ~PIDNS_HASH_ADDING;
348 spin_unlock_irq(&pidmap_lock);
349}
350
330struct pid *find_pid_ns(int nr, struct pid_namespace *ns) 351struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
331{ 352{
332 struct hlist_node *elem; 353 struct hlist_node *elem;
@@ -344,7 +365,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns);
344 365
345struct pid *find_vpid(int nr) 366struct pid *find_vpid(int nr)
346{ 367{
347 return find_pid_ns(nr, current->nsproxy->pid_ns); 368 return find_pid_ns(nr, task_active_pid_ns(current));
348} 369}
349EXPORT_SYMBOL_GPL(find_vpid); 370EXPORT_SYMBOL_GPL(find_vpid);
350 371
@@ -428,7 +449,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
428 449
429struct task_struct *find_task_by_vpid(pid_t vnr) 450struct task_struct *find_task_by_vpid(pid_t vnr)
430{ 451{
431 return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); 452 return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
432} 453}
433 454
434struct pid *get_task_pid(struct task_struct *task, enum pid_type type) 455struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
@@ -483,7 +504,7 @@ EXPORT_SYMBOL_GPL(pid_nr_ns);
483 504
484pid_t pid_vnr(struct pid *pid) 505pid_t pid_vnr(struct pid *pid)
485{ 506{
486 return pid_nr_ns(pid, current->nsproxy->pid_ns); 507 return pid_nr_ns(pid, task_active_pid_ns(current));
487} 508}
488EXPORT_SYMBOL_GPL(pid_vnr); 509EXPORT_SYMBOL_GPL(pid_vnr);
489 510
@@ -494,7 +515,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
494 515
495 rcu_read_lock(); 516 rcu_read_lock();
496 if (!ns) 517 if (!ns)
497 ns = current->nsproxy->pid_ns; 518 ns = task_active_pid_ns(current);
498 if (likely(pid_alive(task))) { 519 if (likely(pid_alive(task))) {
499 if (type != PIDTYPE_PID) 520 if (type != PIDTYPE_PID)
500 task = task->group_leader; 521 task = task->group_leader;
@@ -558,6 +579,9 @@ void __init pidhash_init(void)
558 579
559void __init pidmap_init(void) 580void __init pidmap_init(void)
560{ 581{
582 /* Veryify no one has done anything silly */
583 BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
584
561 /* bump default and minimum pid_max based on number of cpus */ 585 /* bump default and minimum pid_max based on number of cpus */
562 pid_max = min(pid_max_max, max_t(int, pid_max, 586 pid_max = min(pid_max_max, max_t(int, pid_max,
563 PIDS_PER_CPU_DEFAULT * num_possible_cpus())); 587 PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
@@ -569,6 +593,7 @@ void __init pidmap_init(void)
569 /* Reserve PID 0. We never call free_pidmap(0) */ 593 /* Reserve PID 0. We never call free_pidmap(0) */
570 set_bit(0, init_pid_ns.pidmap[0].page); 594 set_bit(0, init_pid_ns.pidmap[0].page);
571 atomic_dec(&init_pid_ns.pidmap[0].nr_free); 595 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
596 init_pid_ns.nr_hashed = PIDNS_HASH_ADDING;
572 597
573 init_pid_ns.pid_cachep = KMEM_CACHE(pid, 598 init_pid_ns.pid_cachep = KMEM_CACHE(pid,
574 SLAB_HWCACHE_ALIGN | SLAB_PANIC); 599 SLAB_HWCACHE_ALIGN | SLAB_PANIC);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 7b07cc0dfb75..c1c3dc1c6023 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -10,6 +10,7 @@
10 10
11#include <linux/pid.h> 11#include <linux/pid.h>
12#include <linux/pid_namespace.h> 12#include <linux/pid_namespace.h>
13#include <linux/user_namespace.h>
13#include <linux/syscalls.h> 14#include <linux/syscalls.h>
14#include <linux/err.h> 15#include <linux/err.h>
15#include <linux/acct.h> 16#include <linux/acct.h>
@@ -71,10 +72,17 @@ err_alloc:
71 return NULL; 72 return NULL;
72} 73}
73 74
75static void proc_cleanup_work(struct work_struct *work)
76{
77 struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
78 pid_ns_release_proc(ns);
79}
80
74/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ 81/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
75#define MAX_PID_NS_LEVEL 32 82#define MAX_PID_NS_LEVEL 32
76 83
77static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) 84static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
85 struct pid_namespace *parent_pid_ns)
78{ 86{
79 struct pid_namespace *ns; 87 struct pid_namespace *ns;
80 unsigned int level = parent_pid_ns->level + 1; 88 unsigned int level = parent_pid_ns->level + 1;
@@ -99,9 +107,16 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
99 if (ns->pid_cachep == NULL) 107 if (ns->pid_cachep == NULL)
100 goto out_free_map; 108 goto out_free_map;
101 109
110 err = proc_alloc_inum(&ns->proc_inum);
111 if (err)
112 goto out_free_map;
113
102 kref_init(&ns->kref); 114 kref_init(&ns->kref);
103 ns->level = level; 115 ns->level = level;
104 ns->parent = get_pid_ns(parent_pid_ns); 116 ns->parent = get_pid_ns(parent_pid_ns);
117 ns->user_ns = get_user_ns(user_ns);
118 ns->nr_hashed = PIDNS_HASH_ADDING;
119 INIT_WORK(&ns->proc_work, proc_cleanup_work);
105 120
106 set_bit(0, ns->pidmap[0].page); 121 set_bit(0, ns->pidmap[0].page);
107 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); 122 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -109,14 +124,8 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
109 for (i = 1; i < PIDMAP_ENTRIES; i++) 124 for (i = 1; i < PIDMAP_ENTRIES; i++)
110 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 125 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
111 126
112 err = pid_ns_prepare_proc(ns);
113 if (err)
114 goto out_put_parent_pid_ns;
115
116 return ns; 127 return ns;
117 128
118out_put_parent_pid_ns:
119 put_pid_ns(parent_pid_ns);
120out_free_map: 129out_free_map:
121 kfree(ns->pidmap[0].page); 130 kfree(ns->pidmap[0].page);
122out_free: 131out_free:
@@ -129,18 +138,21 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
129{ 138{
130 int i; 139 int i;
131 140
141 proc_free_inum(ns->proc_inum);
132 for (i = 0; i < PIDMAP_ENTRIES; i++) 142 for (i = 0; i < PIDMAP_ENTRIES; i++)
133 kfree(ns->pidmap[i].page); 143 kfree(ns->pidmap[i].page);
144 put_user_ns(ns->user_ns);
134 kmem_cache_free(pid_ns_cachep, ns); 145 kmem_cache_free(pid_ns_cachep, ns);
135} 146}
136 147
137struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) 148struct pid_namespace *copy_pid_ns(unsigned long flags,
149 struct user_namespace *user_ns, struct pid_namespace *old_ns)
138{ 150{
139 if (!(flags & CLONE_NEWPID)) 151 if (!(flags & CLONE_NEWPID))
140 return get_pid_ns(old_ns); 152 return get_pid_ns(old_ns);
141 if (flags & (CLONE_THREAD|CLONE_PARENT)) 153 if (task_active_pid_ns(current) != old_ns)
142 return ERR_PTR(-EINVAL); 154 return ERR_PTR(-EINVAL);
143 return create_pid_namespace(old_ns); 155 return create_pid_namespace(user_ns, old_ns);
144} 156}
145 157
146static void free_pid_ns(struct kref *kref) 158static void free_pid_ns(struct kref *kref)
@@ -170,6 +182,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
170 int rc; 182 int rc;
171 struct task_struct *task, *me = current; 183 struct task_struct *task, *me = current;
172 184
185 /* Don't allow any more processes into the pid namespace */
186 disable_pid_allocation(pid_ns);
187
173 /* Ignore SIGCHLD causing any terminated children to autoreap */ 188 /* Ignore SIGCHLD causing any terminated children to autoreap */
174 spin_lock_irq(&me->sighand->siglock); 189 spin_lock_irq(&me->sighand->siglock);
175 me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; 190 me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
@@ -211,22 +226,15 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
211 226
212 /* 227 /*
213 * sys_wait4() above can't reap the TASK_DEAD children. 228 * sys_wait4() above can't reap the TASK_DEAD children.
214 * Make sure they all go away, see __unhash_process(). 229 * Make sure they all go away, see free_pid().
215 */ 230 */
216 for (;;) { 231 for (;;) {
217 bool need_wait = false; 232 set_current_state(TASK_UNINTERRUPTIBLE);
218 233 if (pid_ns->nr_hashed == 1)
219 read_lock(&tasklist_lock);
220 if (!list_empty(&current->children)) {
221 __set_current_state(TASK_UNINTERRUPTIBLE);
222 need_wait = true;
223 }
224 read_unlock(&tasklist_lock);
225
226 if (!need_wait)
227 break; 234 break;
228 schedule(); 235 schedule();
229 } 236 }
237 __set_current_state(TASK_RUNNING);
230 238
231 if (pid_ns->reboot) 239 if (pid_ns->reboot)
232 current->signal->group_exit_code = pid_ns->reboot; 240 current->signal->group_exit_code = pid_ns->reboot;
@@ -239,9 +247,10 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
239static int pid_ns_ctl_handler(struct ctl_table *table, int write, 247static int pid_ns_ctl_handler(struct ctl_table *table, int write,
240 void __user *buffer, size_t *lenp, loff_t *ppos) 248 void __user *buffer, size_t *lenp, loff_t *ppos)
241{ 249{
250 struct pid_namespace *pid_ns = task_active_pid_ns(current);
242 struct ctl_table tmp = *table; 251 struct ctl_table tmp = *table;
243 252
244 if (write && !capable(CAP_SYS_ADMIN)) 253 if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
245 return -EPERM; 254 return -EPERM;
246 255
247 /* 256 /*
@@ -250,7 +259,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
250 * it should synchronize its usage with external means. 259 * it should synchronize its usage with external means.
251 */ 260 */
252 261
253 tmp.data = &current->nsproxy->pid_ns->last_pid; 262 tmp.data = &pid_ns->last_pid;
254 return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); 263 return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
255} 264}
256 265
@@ -299,6 +308,68 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
299 return 0; 308 return 0;
300} 309}
301 310
311static void *pidns_get(struct task_struct *task)
312{
313 struct pid_namespace *ns;
314
315 rcu_read_lock();
316 ns = get_pid_ns(task_active_pid_ns(task));
317 rcu_read_unlock();
318
319 return ns;
320}
321
322static void pidns_put(void *ns)
323{
324 put_pid_ns(ns);
325}
326
327static int pidns_install(struct nsproxy *nsproxy, void *ns)
328{
329 struct pid_namespace *active = task_active_pid_ns(current);
330 struct pid_namespace *ancestor, *new = ns;
331
332 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
333 !nsown_capable(CAP_SYS_ADMIN))
334 return -EPERM;
335
336 /*
337 * Only allow entering the current active pid namespace
338 * or a child of the current active pid namespace.
339 *
340 * This is required for fork to return a usable pid value and
341 * this maintains the property that processes and their
342 * children can not escape their current pid namespace.
343 */
344 if (new->level < active->level)
345 return -EINVAL;
346
347 ancestor = new;
348 while (ancestor->level > active->level)
349 ancestor = ancestor->parent;
350 if (ancestor != active)
351 return -EINVAL;
352
353 put_pid_ns(nsproxy->pid_ns);
354 nsproxy->pid_ns = get_pid_ns(new);
355 return 0;
356}
357
358static unsigned int pidns_inum(void *ns)
359{
360 struct pid_namespace *pid_ns = ns;
361 return pid_ns->proc_inum;
362}
363
364const struct proc_ns_operations pidns_operations = {
365 .name = "pid",
366 .type = CLONE_NEWPID,
367 .get = pidns_get,
368 .put = pidns_put,
369 .install = pidns_install,
370 .inum = pidns_inum,
371};
372
302static __init int pid_namespaces_init(void) 373static __init int pid_namespaces_init(void)
303{ 374{
304 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); 375 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 125cb67daa21..a278cad1d5d6 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -9,6 +9,7 @@
9#include <asm/uaccess.h> 9#include <asm/uaccess.h>
10#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
11#include <trace/events/timer.h> 11#include <trace/events/timer.h>
12#include <linux/random.h>
12 13
13/* 14/*
14 * Called after updating RLIMIT_CPU to run cpu timer and update 15 * Called after updating RLIMIT_CPU to run cpu timer and update
@@ -217,30 +218,6 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
217 return 0; 218 return 0;
218} 219}
219 220
220void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
221{
222 struct signal_struct *sig = tsk->signal;
223 struct task_struct *t;
224
225 times->utime = sig->utime;
226 times->stime = sig->stime;
227 times->sum_exec_runtime = sig->sum_sched_runtime;
228
229 rcu_read_lock();
230 /* make sure we can trust tsk->thread_group list */
231 if (!likely(pid_alive(tsk)))
232 goto out;
233
234 t = tsk;
235 do {
236 times->utime += t->utime;
237 times->stime += t->stime;
238 times->sum_exec_runtime += task_sched_runtime(t);
239 } while_each_thread(tsk, t);
240out:
241 rcu_read_unlock();
242}
243
244static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) 221static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
245{ 222{
246 if (b->utime > a->utime) 223 if (b->utime > a->utime)
@@ -494,6 +471,8 @@ static void cleanup_timers(struct list_head *head,
494 */ 471 */
495void posix_cpu_timers_exit(struct task_struct *tsk) 472void posix_cpu_timers_exit(struct task_struct *tsk)
496{ 473{
474 add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
475 sizeof(unsigned long long));
497 cleanup_timers(tsk->cpu_timers, 476 cleanup_timers(tsk->cpu_timers,
498 tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); 477 tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
499 478
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f458238109cc..1c16f9167de1 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -59,7 +59,7 @@ static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
59{ 59{
60 unsigned long val; 60 unsigned long val;
61 61
62 if (strict_strtoul(buf, 10, &val)) 62 if (kstrtoul(buf, 10, &val))
63 return -EINVAL; 63 return -EINVAL;
64 64
65 if (val > 1) 65 if (val > 1)
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 87da817f9e13..d5a258b60c6f 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -48,18 +48,7 @@ static int try_to_freeze_tasks(bool user_only)
48 if (p == current || !freeze_task(p)) 48 if (p == current || !freeze_task(p))
49 continue; 49 continue;
50 50
51 /* 51 if (!freezer_should_skip(p))
52 * Now that we've done set_freeze_flag, don't
53 * perturb a task in TASK_STOPPED or TASK_TRACED.
54 * It is "frozen enough". If the task does wake
55 * up, it will immediately call try_to_freeze.
56 *
57 * Because freeze_task() goes through p's scheduler lock, it's
58 * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING
59 * transition can't race with task state testing here.
60 */
61 if (!task_is_stopped_or_traced(p) &&
62 !freezer_should_skip(p))
63 todo++; 52 todo++;
64 } while_each_thread(g, p); 53 } while_each_thread(g, p);
65 read_unlock(&tasklist_lock); 54 read_unlock(&tasklist_lock);
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 846bd42c7ed1..9322ff7eaad6 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -213,6 +213,69 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
213} 213}
214 214
215/** 215/**
216 * pm_qos_flags_remove_req - Remove device PM QoS flags request.
217 * @pqf: Device PM QoS flags set to remove the request from.
218 * @req: Request to remove from the set.
219 */
220static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf,
221 struct pm_qos_flags_request *req)
222{
223 s32 val = 0;
224
225 list_del(&req->node); /* unlink first so the walk below no longer sees this request */
226 list_for_each_entry(req, &pqf->list, node) /* @req is reused as the loop cursor from here on */
227 val |= req->flags;
228
229 pqf->effective_flags = val; /* bitwise OR of the flags of every request still on the list */
230}
231
232/**
233 * pm_qos_update_flags - Update a set of PM QoS flags.
234 * @pqf: Set of flags to update.
235 * @req: Request to add to the set, to modify, or to remove from the set.
236 * @action: Action to take on the set.
237 * @val: Value of the request to add or modify.
238 *
239 * Update the given set of PM QoS flags with pm_qos_lock held. Returns true
240 * if the aggregate flags value has changed, false otherwise. No notifier is
241 * called from within this function; presumably the caller acts on the result.
242 */
243bool pm_qos_update_flags(struct pm_qos_flags *pqf,
244 struct pm_qos_flags_request *req,
245 enum pm_qos_req_action action, s32 val)
246{
247 unsigned long irqflags;
248 s32 prev_value, curr_value;
249
250 spin_lock_irqsave(&pm_qos_lock, irqflags);
251
252 prev_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags; /* an empty list counts as "no flags" */
253
254 switch (action) {
255 case PM_QOS_REMOVE_REQ:
256 pm_qos_flags_remove_req(pqf, req);
257 break;
258 case PM_QOS_UPDATE_REQ:
259 pm_qos_flags_remove_req(pqf, req); /* fall through: an update is a remove followed by an add of @val */
260 case PM_QOS_ADD_REQ:
261 req->flags = val;
262 INIT_LIST_HEAD(&req->node);
263 list_add_tail(&req->node, &pqf->list);
264 pqf->effective_flags |= val; /* adding can only set bits, so no full recomputation is needed */
265 break;
266 default:
267 /* unrecognized action: leave the set untouched */
268 ;
269 }
270
271 curr_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags; /* same empty-list rule as prev_value */
272
273 spin_unlock_irqrestore(&pm_qos_lock, irqflags);
274
275 return prev_value != curr_value;
276}
277
278/**
216 * pm_qos_request - returns current system wide qos expectation 279 * pm_qos_request - returns current system wide qos expectation
217 * @pm_qos_class: identification of which qos value is requested 280 * @pm_qos_class: identification of which qos value is requested
218 * 281 *
@@ -500,7 +563,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
500 } else { 563 } else {
501 ascii_value[count] = '\0'; 564 ascii_value[count] = '\0';
502 } 565 }
503 ret = strict_strtoul(ascii_value, 16, &ulval); 566 ret = kstrtoul(ascii_value, 16, &ulval);
504 if (ret) { 567 if (ret) {
505 pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); 568 pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
506 return -EINVAL; 569 return -EINVAL;
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 3c9d764eb0d8..7c33ed200410 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -126,7 +126,7 @@ static int swsusp_extents_insert(unsigned long swap_offset)
126 126
127 /* Figure out where to put the new node */ 127 /* Figure out where to put the new node */
128 while (*new) { 128 while (*new) {
129 ext = container_of(*new, struct swsusp_extent, node); 129 ext = rb_entry(*new, struct swsusp_extent, node);
130 parent = *new; 130 parent = *new;
131 if (swap_offset < ext->start) { 131 if (swap_offset < ext->start) {
132 /* Try to merge */ 132 /* Try to merge */
diff --git a/kernel/printk.c b/kernel/printk.c
index 2d607f4d1797..357f714ddd49 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -87,6 +87,12 @@ static DEFINE_SEMAPHORE(console_sem);
87struct console *console_drivers; 87struct console *console_drivers;
88EXPORT_SYMBOL_GPL(console_drivers); 88EXPORT_SYMBOL_GPL(console_drivers);
89 89
90#ifdef CONFIG_LOCKDEP
91static struct lockdep_map console_lock_dep_map = {
92 .name = "console_lock"
93};
94#endif
95
90/* 96/*
91 * This is used for debugging the mess that is the VT code by 97 * This is used for debugging the mess that is the VT code by
92 * keeping track if we have the console semaphore held. It's 98 * keeping track if we have the console semaphore held. It's
@@ -741,6 +747,21 @@ void __init setup_log_buf(int early)
741 free, (free * 100) / __LOG_BUF_LEN); 747 free, (free * 100) / __LOG_BUF_LEN);
742} 748}
743 749
750static bool __read_mostly ignore_loglevel; /* when set, boot_delay_msec() no longer skips its delay based on console_loglevel */
751
752static int __init ignore_loglevel_setup(char *str) /* handler for the "ignore_loglevel" early parameter; @str is unused */
753{
754 ignore_loglevel = 1;
755 printk(KERN_INFO "debug: ignoring loglevel setting.\n");
756
757 return 0;
758}
759
760early_param("ignore_loglevel", ignore_loglevel_setup); /* boot-time switch */
761module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); /* also exposed as a module parameter with read/owner-write permissions */
762MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to " /* trailing space: adjacent literals are concatenated */
763 "print all kernel messages to the console.");
764
744#ifdef CONFIG_BOOT_PRINTK_DELAY 765#ifdef CONFIG_BOOT_PRINTK_DELAY
745 766
746static int boot_delay; /* msecs delay after each printk during bootup */ 767static int boot_delay; /* msecs delay after each printk during bootup */
@@ -764,13 +785,15 @@ static int __init boot_delay_setup(char *str)
764} 785}
765__setup("boot_delay=", boot_delay_setup); 786__setup("boot_delay=", boot_delay_setup);
766 787
767static void boot_delay_msec(void) 788static void boot_delay_msec(int level)
768{ 789{
769 unsigned long long k; 790 unsigned long long k;
770 unsigned long timeout; 791 unsigned long timeout;
771 792
772 if (boot_delay == 0 || system_state != SYSTEM_BOOTING) 793 if ((boot_delay == 0 || system_state != SYSTEM_BOOTING)
794 || (level >= console_loglevel && !ignore_loglevel)) {
773 return; 795 return;
796 }
774 797
775 k = (unsigned long long)loops_per_msec * boot_delay; 798 k = (unsigned long long)loops_per_msec * boot_delay;
776 799
@@ -789,7 +812,7 @@ static void boot_delay_msec(void)
789 } 812 }
790} 813}
791#else 814#else
792static inline void boot_delay_msec(void) 815static inline void boot_delay_msec(int level)
793{ 816{
794} 817}
795#endif 818#endif
@@ -847,10 +870,11 @@ static size_t print_time(u64 ts, char *buf)
847 if (!printk_time) 870 if (!printk_time)
848 return 0; 871 return 0;
849 872
873 rem_nsec = do_div(ts, 1000000000);
874
850 if (!buf) 875 if (!buf)
851 return 15; 876 return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)ts);
852 877
853 rem_nsec = do_div(ts, 1000000000);
854 return sprintf(buf, "[%5lu.%06lu] ", 878 return sprintf(buf, "[%5lu.%06lu] ",
855 (unsigned long)ts, rem_nsec / 1000); 879 (unsigned long)ts, rem_nsec / 1000);
856} 880}
@@ -1232,21 +1256,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
1232 return do_syslog(type, buf, len, SYSLOG_FROM_CALL); 1256 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
1233} 1257}
1234 1258
1235static bool __read_mostly ignore_loglevel;
1236
1237static int __init ignore_loglevel_setup(char *str)
1238{
1239 ignore_loglevel = 1;
1240 printk(KERN_INFO "debug: ignoring loglevel setting.\n");
1241
1242 return 0;
1243}
1244
1245early_param("ignore_loglevel", ignore_loglevel_setup);
1246module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
1247MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
1248 "print all kernel messages to the console.");
1249
1250/* 1259/*
1251 * Call the console drivers, asking them to write out 1260 * Call the console drivers, asking them to write out
1252 * log_buf[start] to log_buf[end - 1]. 1261 * log_buf[start] to log_buf[end - 1].
@@ -1492,7 +1501,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1492 int this_cpu; 1501 int this_cpu;
1493 int printed_len = 0; 1502 int printed_len = 0;
1494 1503
1495 boot_delay_msec(); 1504 boot_delay_msec(level);
1496 printk_delay(); 1505 printk_delay();
1497 1506
1498 /* This stops the holder of console_sem just where we want him */ 1507 /* This stops the holder of console_sem just where we want him */
@@ -1908,12 +1917,14 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
1908 */ 1917 */
1909void console_lock(void) 1918void console_lock(void)
1910{ 1919{
1911 BUG_ON(in_interrupt()); 1920 might_sleep();
1921
1912 down(&console_sem); 1922 down(&console_sem);
1913 if (console_suspended) 1923 if (console_suspended)
1914 return; 1924 return;
1915 console_locked = 1; 1925 console_locked = 1;
1916 console_may_schedule = 1; 1926 console_may_schedule = 1;
1927 mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);
1917} 1928}
1918EXPORT_SYMBOL(console_lock); 1929EXPORT_SYMBOL(console_lock);
1919 1930
@@ -1935,6 +1946,7 @@ int console_trylock(void)
1935 } 1946 }
1936 console_locked = 1; 1947 console_locked = 1;
1937 console_may_schedule = 0; 1948 console_may_schedule = 0;
1949 mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_);
1938 return 1; 1950 return 1;
1939} 1951}
1940EXPORT_SYMBOL(console_trylock); 1952EXPORT_SYMBOL(console_trylock);
@@ -2095,6 +2107,7 @@ skip:
2095 local_irq_restore(flags); 2107 local_irq_restore(flags);
2096 } 2108 }
2097 console_locked = 0; 2109 console_locked = 0;
2110 mutex_release(&console_lock_dep_map, 1, _RET_IP_);
2098 2111
2099 /* Release the exclusive_console once it is used */ 2112 /* Release the exclusive_console once it is used */
2100 if (unlikely(exclusive_console)) 2113 if (unlikely(exclusive_console))
diff --git a/kernel/profile.c b/kernel/profile.c
index 76b8e77773ee..1f391819c42f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -8,9 +8,10 @@
8 * Scheduler profiling support, Arjan van de Ven and Ingo Molnar, 8 * Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
9 * Red Hat, July 2004 9 * Red Hat, July 2004
10 * Consolidation of architecture support code for profiling, 10 * Consolidation of architecture support code for profiling,
11 * William Irwin, Oracle, July 2004 11 * Nadia Yvette Chambers, Oracle, July 2004
12 * Amortized hit count accounting via per-cpu open-addressed hashtables 12 * Amortized hit count accounting via per-cpu open-addressed hashtables
13 * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 13 * to resolve timer interrupt livelocks, Nadia Yvette Chambers,
14 * Oracle, 2004
14 */ 15 */
15 16
16#include <linux/export.h> 17#include <linux/export.h>
@@ -256,7 +257,7 @@ EXPORT_SYMBOL_GPL(unregister_timer_hook);
256 * pagetable hash functions, but uses a full hashtable full of finite 257 * pagetable hash functions, but uses a full hashtable full of finite
257 * collision chains, not just pairs of them. 258 * collision chains, not just pairs of them.
258 * 259 *
259 * -- wli 260 * -- nyc
260 */ 261 */
261static void __profile_flip_buffers(void *unused) 262static void __profile_flip_buffers(void *unused)
262{ 263{
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1f5e55dda955..1599157336a6 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -215,8 +215,12 @@ ok:
215 smp_rmb(); 215 smp_rmb();
216 if (task->mm) 216 if (task->mm)
217 dumpable = get_dumpable(task->mm); 217 dumpable = get_dumpable(task->mm);
218 if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode)) 218 rcu_read_lock();
219 if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
220 rcu_read_unlock();
219 return -EPERM; 221 return -EPERM;
222 }
223 rcu_read_unlock();
220 224
221 return security_ptrace_access_check(task, mode); 225 return security_ptrace_access_check(task, mode);
222} 226}
@@ -280,8 +284,10 @@ static int ptrace_attach(struct task_struct *task, long request,
280 284
281 if (seize) 285 if (seize)
282 flags |= PT_SEIZED; 286 flags |= PT_SEIZED;
283 if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) 287 rcu_read_lock();
288 if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE))
284 flags |= PT_PTRACE_CAP; 289 flags |= PT_PTRACE_CAP;
290 rcu_read_unlock();
285 task->ptrace = flags; 291 task->ptrace = flags;
286 292
287 __ptrace_link(task, current); 293 __ptrace_link(task, current);
@@ -457,6 +463,9 @@ void exit_ptrace(struct task_struct *tracer)
457 return; 463 return;
458 464
459 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { 465 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
466 if (unlikely(p->ptrace & PT_EXITKILL))
467 send_sig_info(SIGKILL, SEND_SIG_FORCED, p);
468
460 if (__ptrace_detach(tracer, p)) 469 if (__ptrace_detach(tracer, p))
461 list_add(&p->ptrace_entry, &ptrace_dead); 470 list_add(&p->ptrace_entry, &ptrace_dead);
462 } 471 }
diff --git a/kernel/rcu.h b/kernel/rcu.h
index 8ba99cdc6515..20dfba576c2b 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -109,4 +109,6 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head)
109 } 109 }
110} 110}
111 111
112extern int rcu_expedited;
113
112#endif /* __LINUX_RCU_H */ 114#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 29ca1c6da594..a2cf76177b44 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -46,12 +46,15 @@
46#include <linux/export.h> 46#include <linux/export.h>
47#include <linux/hardirq.h> 47#include <linux/hardirq.h>
48#include <linux/delay.h> 48#include <linux/delay.h>
49#include <linux/module.h>
49 50
50#define CREATE_TRACE_POINTS 51#define CREATE_TRACE_POINTS
51#include <trace/events/rcu.h> 52#include <trace/events/rcu.h>
52 53
53#include "rcu.h" 54#include "rcu.h"
54 55
56module_param(rcu_expedited, int, 0);
57
55#ifdef CONFIG_PREEMPT_RCU 58#ifdef CONFIG_PREEMPT_RCU
56 59
57/* 60/*
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index e4c6a598d6f7..e7dce58f9c2a 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -195,7 +195,7 @@ EXPORT_SYMBOL(rcu_is_cpu_idle);
195 */ 195 */
196int rcu_is_cpu_rrupt_from_idle(void) 196int rcu_is_cpu_rrupt_from_idle(void)
197{ 197{
198 return rcu_dynticks_nesting <= 0; 198 return rcu_dynticks_nesting <= 1;
199} 199}
200 200
201/* 201/*
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 3d0190282204..f85016a2309b 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -706,7 +706,10 @@ void synchronize_rcu(void)
706 return; 706 return;
707 707
708 /* Once we get past the fastpath checks, same code as rcu_barrier(). */ 708 /* Once we get past the fastpath checks, same code as rcu_barrier(). */
709 rcu_barrier(); 709 if (rcu_expedited)
710 synchronize_rcu_expedited();
711 else
712 rcu_barrier();
710} 713}
711EXPORT_SYMBOL_GPL(synchronize_rcu); 714EXPORT_SYMBOL_GPL(synchronize_rcu);
712 715
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index aaa7b9f3532a..31dea01c85fd 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -339,7 +339,6 @@ rcu_stutter_wait(char *title)
339 339
340struct rcu_torture_ops { 340struct rcu_torture_ops {
341 void (*init)(void); 341 void (*init)(void);
342 void (*cleanup)(void);
343 int (*readlock)(void); 342 int (*readlock)(void);
344 void (*read_delay)(struct rcu_random_state *rrsp); 343 void (*read_delay)(struct rcu_random_state *rrsp);
345 void (*readunlock)(int idx); 344 void (*readunlock)(int idx);
@@ -431,7 +430,6 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
431 430
432static struct rcu_torture_ops rcu_ops = { 431static struct rcu_torture_ops rcu_ops = {
433 .init = NULL, 432 .init = NULL,
434 .cleanup = NULL,
435 .readlock = rcu_torture_read_lock, 433 .readlock = rcu_torture_read_lock,
436 .read_delay = rcu_read_delay, 434 .read_delay = rcu_read_delay,
437 .readunlock = rcu_torture_read_unlock, 435 .readunlock = rcu_torture_read_unlock,
@@ -475,7 +473,6 @@ static void rcu_sync_torture_init(void)
475 473
476static struct rcu_torture_ops rcu_sync_ops = { 474static struct rcu_torture_ops rcu_sync_ops = {
477 .init = rcu_sync_torture_init, 475 .init = rcu_sync_torture_init,
478 .cleanup = NULL,
479 .readlock = rcu_torture_read_lock, 476 .readlock = rcu_torture_read_lock,
480 .read_delay = rcu_read_delay, 477 .read_delay = rcu_read_delay,
481 .readunlock = rcu_torture_read_unlock, 478 .readunlock = rcu_torture_read_unlock,
@@ -493,7 +490,6 @@ static struct rcu_torture_ops rcu_sync_ops = {
493 490
494static struct rcu_torture_ops rcu_expedited_ops = { 491static struct rcu_torture_ops rcu_expedited_ops = {
495 .init = rcu_sync_torture_init, 492 .init = rcu_sync_torture_init,
496 .cleanup = NULL,
497 .readlock = rcu_torture_read_lock, 493 .readlock = rcu_torture_read_lock,
498 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 494 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
499 .readunlock = rcu_torture_read_unlock, 495 .readunlock = rcu_torture_read_unlock,
@@ -536,7 +532,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
536 532
537static struct rcu_torture_ops rcu_bh_ops = { 533static struct rcu_torture_ops rcu_bh_ops = {
538 .init = NULL, 534 .init = NULL,
539 .cleanup = NULL,
540 .readlock = rcu_bh_torture_read_lock, 535 .readlock = rcu_bh_torture_read_lock,
541 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 536 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
542 .readunlock = rcu_bh_torture_read_unlock, 537 .readunlock = rcu_bh_torture_read_unlock,
@@ -553,7 +548,6 @@ static struct rcu_torture_ops rcu_bh_ops = {
553 548
554static struct rcu_torture_ops rcu_bh_sync_ops = { 549static struct rcu_torture_ops rcu_bh_sync_ops = {
555 .init = rcu_sync_torture_init, 550 .init = rcu_sync_torture_init,
556 .cleanup = NULL,
557 .readlock = rcu_bh_torture_read_lock, 551 .readlock = rcu_bh_torture_read_lock,
558 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 552 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
559 .readunlock = rcu_bh_torture_read_unlock, 553 .readunlock = rcu_bh_torture_read_unlock,
@@ -570,7 +564,6 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
570 564
571static struct rcu_torture_ops rcu_bh_expedited_ops = { 565static struct rcu_torture_ops rcu_bh_expedited_ops = {
572 .init = rcu_sync_torture_init, 566 .init = rcu_sync_torture_init,
573 .cleanup = NULL,
574 .readlock = rcu_bh_torture_read_lock, 567 .readlock = rcu_bh_torture_read_lock,
575 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 568 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
576 .readunlock = rcu_bh_torture_read_unlock, 569 .readunlock = rcu_bh_torture_read_unlock,
@@ -589,19 +582,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = {
589 * Definitions for srcu torture testing. 582 * Definitions for srcu torture testing.
590 */ 583 */
591 584
592static struct srcu_struct srcu_ctl; 585DEFINE_STATIC_SRCU(srcu_ctl);
593
594static void srcu_torture_init(void)
595{
596 init_srcu_struct(&srcu_ctl);
597 rcu_sync_torture_init();
598}
599
600static void srcu_torture_cleanup(void)
601{
602 synchronize_srcu(&srcu_ctl);
603 cleanup_srcu_struct(&srcu_ctl);
604}
605 586
606static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) 587static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
607{ 588{
@@ -672,8 +653,7 @@ static int srcu_torture_stats(char *page)
672} 653}
673 654
674static struct rcu_torture_ops srcu_ops = { 655static struct rcu_torture_ops srcu_ops = {
675 .init = srcu_torture_init, 656 .init = rcu_sync_torture_init,
676 .cleanup = srcu_torture_cleanup,
677 .readlock = srcu_torture_read_lock, 657 .readlock = srcu_torture_read_lock,
678 .read_delay = srcu_read_delay, 658 .read_delay = srcu_read_delay,
679 .readunlock = srcu_torture_read_unlock, 659 .readunlock = srcu_torture_read_unlock,
@@ -687,8 +667,7 @@ static struct rcu_torture_ops srcu_ops = {
687}; 667};
688 668
689static struct rcu_torture_ops srcu_sync_ops = { 669static struct rcu_torture_ops srcu_sync_ops = {
690 .init = srcu_torture_init, 670 .init = rcu_sync_torture_init,
691 .cleanup = srcu_torture_cleanup,
692 .readlock = srcu_torture_read_lock, 671 .readlock = srcu_torture_read_lock,
693 .read_delay = srcu_read_delay, 672 .read_delay = srcu_read_delay,
694 .readunlock = srcu_torture_read_unlock, 673 .readunlock = srcu_torture_read_unlock,
@@ -712,8 +691,7 @@ static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
712} 691}
713 692
714static struct rcu_torture_ops srcu_raw_ops = { 693static struct rcu_torture_ops srcu_raw_ops = {
715 .init = srcu_torture_init, 694 .init = rcu_sync_torture_init,
716 .cleanup = srcu_torture_cleanup,
717 .readlock = srcu_torture_read_lock_raw, 695 .readlock = srcu_torture_read_lock_raw,
718 .read_delay = srcu_read_delay, 696 .read_delay = srcu_read_delay,
719 .readunlock = srcu_torture_read_unlock_raw, 697 .readunlock = srcu_torture_read_unlock_raw,
@@ -727,8 +705,7 @@ static struct rcu_torture_ops srcu_raw_ops = {
727}; 705};
728 706
729static struct rcu_torture_ops srcu_raw_sync_ops = { 707static struct rcu_torture_ops srcu_raw_sync_ops = {
730 .init = srcu_torture_init, 708 .init = rcu_sync_torture_init,
731 .cleanup = srcu_torture_cleanup,
732 .readlock = srcu_torture_read_lock_raw, 709 .readlock = srcu_torture_read_lock_raw,
733 .read_delay = srcu_read_delay, 710 .read_delay = srcu_read_delay,
734 .readunlock = srcu_torture_read_unlock_raw, 711 .readunlock = srcu_torture_read_unlock_raw,
@@ -747,8 +724,7 @@ static void srcu_torture_synchronize_expedited(void)
747} 724}
748 725
749static struct rcu_torture_ops srcu_expedited_ops = { 726static struct rcu_torture_ops srcu_expedited_ops = {
750 .init = srcu_torture_init, 727 .init = rcu_sync_torture_init,
751 .cleanup = srcu_torture_cleanup,
752 .readlock = srcu_torture_read_lock, 728 .readlock = srcu_torture_read_lock,
753 .read_delay = srcu_read_delay, 729 .read_delay = srcu_read_delay,
754 .readunlock = srcu_torture_read_unlock, 730 .readunlock = srcu_torture_read_unlock,
@@ -783,7 +759,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
783 759
784static struct rcu_torture_ops sched_ops = { 760static struct rcu_torture_ops sched_ops = {
785 .init = rcu_sync_torture_init, 761 .init = rcu_sync_torture_init,
786 .cleanup = NULL,
787 .readlock = sched_torture_read_lock, 762 .readlock = sched_torture_read_lock,
788 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 763 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
789 .readunlock = sched_torture_read_unlock, 764 .readunlock = sched_torture_read_unlock,
@@ -799,7 +774,6 @@ static struct rcu_torture_ops sched_ops = {
799 774
800static struct rcu_torture_ops sched_sync_ops = { 775static struct rcu_torture_ops sched_sync_ops = {
801 .init = rcu_sync_torture_init, 776 .init = rcu_sync_torture_init,
802 .cleanup = NULL,
803 .readlock = sched_torture_read_lock, 777 .readlock = sched_torture_read_lock,
804 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 778 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
805 .readunlock = sched_torture_read_unlock, 779 .readunlock = sched_torture_read_unlock,
@@ -814,7 +788,6 @@ static struct rcu_torture_ops sched_sync_ops = {
814 788
815static struct rcu_torture_ops sched_expedited_ops = { 789static struct rcu_torture_ops sched_expedited_ops = {
816 .init = rcu_sync_torture_init, 790 .init = rcu_sync_torture_init,
817 .cleanup = NULL,
818 .readlock = sched_torture_read_lock, 791 .readlock = sched_torture_read_lock,
819 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 792 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
820 .readunlock = sched_torture_read_unlock, 793 .readunlock = sched_torture_read_unlock,
@@ -1396,12 +1369,16 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1396 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1369 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1397 "test_boost=%d/%d test_boost_interval=%d " 1370 "test_boost=%d/%d test_boost_interval=%d "
1398 "test_boost_duration=%d shutdown_secs=%d " 1371 "test_boost_duration=%d shutdown_secs=%d "
1372 "stall_cpu=%d stall_cpu_holdoff=%d "
1373 "n_barrier_cbs=%d "
1399 "onoff_interval=%d onoff_holdoff=%d\n", 1374 "onoff_interval=%d onoff_holdoff=%d\n",
1400 torture_type, tag, nrealreaders, nfakewriters, 1375 torture_type, tag, nrealreaders, nfakewriters,
1401 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1376 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1402 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, 1377 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1403 test_boost, cur_ops->can_boost, 1378 test_boost, cur_ops->can_boost,
1404 test_boost_interval, test_boost_duration, shutdown_secs, 1379 test_boost_interval, test_boost_duration, shutdown_secs,
1380 stall_cpu, stall_cpu_holdoff,
1381 n_barrier_cbs,
1405 onoff_interval, onoff_holdoff); 1382 onoff_interval, onoff_holdoff);
1406} 1383}
1407 1384
@@ -1502,6 +1479,7 @@ rcu_torture_onoff(void *arg)
1502 unsigned long delta; 1479 unsigned long delta;
1503 int maxcpu = -1; 1480 int maxcpu = -1;
1504 DEFINE_RCU_RANDOM(rand); 1481 DEFINE_RCU_RANDOM(rand);
1482 int ret;
1505 unsigned long starttime; 1483 unsigned long starttime;
1506 1484
1507 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); 1485 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
@@ -1522,7 +1500,13 @@ rcu_torture_onoff(void *arg)
1522 torture_type, cpu); 1500 torture_type, cpu);
1523 starttime = jiffies; 1501 starttime = jiffies;
1524 n_offline_attempts++; 1502 n_offline_attempts++;
1525 if (cpu_down(cpu) == 0) { 1503 ret = cpu_down(cpu);
1504 if (ret) {
1505 if (verbose)
1506 pr_alert("%s" TORTURE_FLAG
1507 "rcu_torture_onoff task: offline %d failed: errno %d\n",
1508 torture_type, cpu, ret);
1509 } else {
1526 if (verbose) 1510 if (verbose)
1527 pr_alert("%s" TORTURE_FLAG 1511 pr_alert("%s" TORTURE_FLAG
1528 "rcu_torture_onoff task: offlined %d\n", 1512 "rcu_torture_onoff task: offlined %d\n",
@@ -1936,8 +1920,6 @@ rcu_torture_cleanup(void)
1936 1920
1937 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 1921 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
1938 1922
1939 if (cur_ops->cleanup)
1940 cur_ops->cleanup();
1941 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) 1923 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
1942 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1924 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1943 else if (n_online_successes != n_online_attempts || 1925 else if (n_online_successes != n_online_attempts ||
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 74df86bd9204..e441b77b614e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -68,9 +68,9 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
68 .level = { &sname##_state.node[0] }, \ 68 .level = { &sname##_state.node[0] }, \
69 .call = cr, \ 69 .call = cr, \
70 .fqs_state = RCU_GP_IDLE, \ 70 .fqs_state = RCU_GP_IDLE, \
71 .gpnum = -300, \ 71 .gpnum = 0UL - 300UL, \
72 .completed = -300, \ 72 .completed = 0UL - 300UL, \
73 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \ 73 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
74 .orphan_nxttail = &sname##_state.orphan_nxtlist, \ 74 .orphan_nxttail = &sname##_state.orphan_nxtlist, \
75 .orphan_donetail = &sname##_state.orphan_donelist, \ 75 .orphan_donetail = &sname##_state.orphan_donelist, \
76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
@@ -207,18 +207,15 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
207DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 207DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
208 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 208 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
209 .dynticks = ATOMIC_INIT(1), 209 .dynticks = ATOMIC_INIT(1),
210#if defined(CONFIG_RCU_USER_QS) && !defined(CONFIG_RCU_USER_QS_FORCE)
211 .ignore_user_qs = true,
212#endif
213}; 210};
214 211
215static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 212static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
216static int qhimark = 10000; /* If this many pending, ignore blimit. */ 213static long qhimark = 10000; /* If this many pending, ignore blimit. */
217static int qlowmark = 100; /* Once only this many pending, use blimit. */ 214static long qlowmark = 100; /* Once only this many pending, use blimit. */
218 215
219module_param(blimit, int, 0444); 216module_param(blimit, long, 0444);
220module_param(qhimark, int, 0444); 217module_param(qhimark, long, 0444);
221module_param(qlowmark, int, 0444); 218module_param(qlowmark, long, 0444);
222 219
223int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ 220int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
224int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; 221int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
@@ -303,7 +300,8 @@ EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
303static int 300static int
304cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) 301cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
305{ 302{
306 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]; 303 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] &&
304 rdp->nxttail[RCU_DONE_TAIL] != NULL;
307} 305}
308 306
309/* 307/*
@@ -312,8 +310,11 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
312static int 310static int
313cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 311cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
314{ 312{
315 return *rdp->nxttail[RCU_DONE_TAIL + 313 struct rcu_head **ntp;
316 ACCESS_ONCE(rsp->completed) != rdp->completed] && 314
315 ntp = rdp->nxttail[RCU_DONE_TAIL +
316 (ACCESS_ONCE(rsp->completed) != rdp->completed)];
317 return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp &&
317 !rcu_gp_in_progress(rsp); 318 !rcu_gp_in_progress(rsp);
318} 319}
319 320
@@ -416,29 +417,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
416 */ 417 */
417void rcu_user_enter(void) 418void rcu_user_enter(void)
418{ 419{
419 unsigned long flags; 420 rcu_eqs_enter(1);
420 struct rcu_dynticks *rdtp;
421
422 /*
423 * Some contexts may involve an exception occuring in an irq,
424 * leading to that nesting:
425 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
426 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
427 * helpers are enough to protect RCU uses inside the exception. So
428 * just return immediately if we detect we are in an IRQ.
429 */
430 if (in_interrupt())
431 return;
432
433 WARN_ON_ONCE(!current->mm);
434
435 local_irq_save(flags);
436 rdtp = &__get_cpu_var(rcu_dynticks);
437 if (!rdtp->ignore_user_qs && !rdtp->in_user) {
438 rdtp->in_user = true;
439 rcu_eqs_enter(true);
440 }
441 local_irq_restore(flags);
442} 421}
443 422
444/** 423/**
@@ -575,27 +554,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit);
575 */ 554 */
576void rcu_user_exit(void) 555void rcu_user_exit(void)
577{ 556{
578 unsigned long flags; 557 rcu_eqs_exit(1);
579 struct rcu_dynticks *rdtp;
580
581 /*
582 * Some contexts may involve an exception occuring in an irq,
583 * leading to that nesting:
584 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
585 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
586 * helpers are enough to protect RCU uses inside the exception. So
587 * just return immediately if we detect we are in an IRQ.
588 */
589 if (in_interrupt())
590 return;
591
592 local_irq_save(flags);
593 rdtp = &__get_cpu_var(rcu_dynticks);
594 if (rdtp->in_user) {
595 rdtp->in_user = false;
596 rcu_eqs_exit(true);
597 }
598 local_irq_restore(flags);
599} 558}
600 559
601/** 560/**
@@ -718,21 +677,6 @@ int rcu_is_cpu_idle(void)
718} 677}
719EXPORT_SYMBOL(rcu_is_cpu_idle); 678EXPORT_SYMBOL(rcu_is_cpu_idle);
720 679
721#ifdef CONFIG_RCU_USER_QS
722void rcu_user_hooks_switch(struct task_struct *prev,
723 struct task_struct *next)
724{
725 struct rcu_dynticks *rdtp;
726
727 /* Interrupts are disabled in context switch */
728 rdtp = &__get_cpu_var(rcu_dynticks);
729 if (!rdtp->ignore_user_qs) {
730 clear_tsk_thread_flag(prev, TIF_NOHZ);
731 set_tsk_thread_flag(next, TIF_NOHZ);
732 }
733}
734#endif /* #ifdef CONFIG_RCU_USER_QS */
735
736#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) 680#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
737 681
738/* 682/*
@@ -873,6 +817,29 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
873 rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); 817 rsp->jiffies_stall = jiffies + jiffies_till_stall_check();
874} 818}
875 819
820/*
821 * Dump stacks of all tasks running on stalled CPUs. This is a fallback
822 * for architectures that do not implement trigger_all_cpu_backtrace().
823 * The NMI-triggered stack traces are more accurate because they are
824 * printed by the target CPU.
825 */
826static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
827{
828 int cpu;
829 unsigned long flags;
830 struct rcu_node *rnp;
831
832 rcu_for_each_leaf_node(rsp, rnp) {
833 raw_spin_lock_irqsave(&rnp->lock, flags);
834 if (rnp->qsmask != 0) {
835 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
836 if (rnp->qsmask & (1UL << cpu))
837 dump_cpu_task(rnp->grplo + cpu);
838 }
839 raw_spin_unlock_irqrestore(&rnp->lock, flags);
840 }
841}
842
876static void print_other_cpu_stall(struct rcu_state *rsp) 843static void print_other_cpu_stall(struct rcu_state *rsp)
877{ 844{
878 int cpu; 845 int cpu;
@@ -880,6 +847,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
880 unsigned long flags; 847 unsigned long flags;
881 int ndetected = 0; 848 int ndetected = 0;
882 struct rcu_node *rnp = rcu_get_root(rsp); 849 struct rcu_node *rnp = rcu_get_root(rsp);
850 long totqlen = 0;
883 851
884 /* Only let one CPU complain about others per time interval. */ 852 /* Only let one CPU complain about others per time interval. */
885 853
@@ -924,12 +892,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
924 raw_spin_unlock_irqrestore(&rnp->lock, flags); 892 raw_spin_unlock_irqrestore(&rnp->lock, flags);
925 893
926 print_cpu_stall_info_end(); 894 print_cpu_stall_info_end();
927 printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n", 895 for_each_possible_cpu(cpu)
928 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 896 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
897 pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n",
898 smp_processor_id(), (long)(jiffies - rsp->gp_start),
899 rsp->gpnum, rsp->completed, totqlen);
929 if (ndetected == 0) 900 if (ndetected == 0)
930 printk(KERN_ERR "INFO: Stall ended before state dump start\n"); 901 printk(KERN_ERR "INFO: Stall ended before state dump start\n");
931 else if (!trigger_all_cpu_backtrace()) 902 else if (!trigger_all_cpu_backtrace())
932 dump_stack(); 903 rcu_dump_cpu_stacks(rsp);
933 904
934 /* Complain about tasks blocking the grace period. */ 905 /* Complain about tasks blocking the grace period. */
935 906
@@ -940,8 +911,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
940 911
941static void print_cpu_stall(struct rcu_state *rsp) 912static void print_cpu_stall(struct rcu_state *rsp)
942{ 913{
914 int cpu;
943 unsigned long flags; 915 unsigned long flags;
944 struct rcu_node *rnp = rcu_get_root(rsp); 916 struct rcu_node *rnp = rcu_get_root(rsp);
917 long totqlen = 0;
945 918
946 /* 919 /*
947 * OK, time to rat on ourselves... 920 * OK, time to rat on ourselves...
@@ -952,7 +925,10 @@ static void print_cpu_stall(struct rcu_state *rsp)
952 print_cpu_stall_info_begin(); 925 print_cpu_stall_info_begin();
953 print_cpu_stall_info(rsp, smp_processor_id()); 926 print_cpu_stall_info(rsp, smp_processor_id());
954 print_cpu_stall_info_end(); 927 print_cpu_stall_info_end();
955 printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start); 928 for_each_possible_cpu(cpu)
929 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
930 pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n",
931 jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen);
956 if (!trigger_all_cpu_backtrace()) 932 if (!trigger_all_cpu_backtrace())
957 dump_stack(); 933 dump_stack();
958 934
@@ -1091,6 +1067,7 @@ static void init_callback_list(struct rcu_data *rdp)
1091 rdp->nxtlist = NULL; 1067 rdp->nxtlist = NULL;
1092 for (i = 0; i < RCU_NEXT_SIZE; i++) 1068 for (i = 0; i < RCU_NEXT_SIZE; i++)
1093 rdp->nxttail[i] = &rdp->nxtlist; 1069 rdp->nxttail[i] = &rdp->nxtlist;
1070 init_nocb_callback_list(rdp);
1094} 1071}
1095 1072
1096/* 1073/*
@@ -1404,15 +1381,37 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
1404 !cpu_needs_another_gp(rsp, rdp)) { 1381 !cpu_needs_another_gp(rsp, rdp)) {
1405 /* 1382 /*
1406 * Either we have not yet spawned the grace-period 1383 * Either we have not yet spawned the grace-period
1407 * task or this CPU does not need another grace period. 1384 * task, this CPU does not need another grace period,
1385 * or a grace period is already in progress.
1408 * Either way, don't start a new grace period. 1386 * Either way, don't start a new grace period.
1409 */ 1387 */
1410 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1388 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1411 return; 1389 return;
1412 } 1390 }
1413 1391
1392 /*
1393 * Because there is no grace period in progress right now,
1394 * any callbacks we have up to this point will be satisfied
1395 * by the next grace period. So promote all callbacks to be
1396 * handled after the end of the next grace period. If the
1397 * CPU is not yet aware of the end of the previous grace period,
1398 * we need to allow for the callback advancement that will
1399 * occur when it does become aware. Deadlock prevents us from
1400 * making it aware at this point: We cannot acquire a leaf
1401 * rcu_node ->lock while holding the root rcu_node ->lock.
1402 */
1403 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1404 if (rdp->completed == rsp->completed)
1405 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1406
1414 rsp->gp_flags = RCU_GP_FLAG_INIT; 1407 rsp->gp_flags = RCU_GP_FLAG_INIT;
1415 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1408 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
1409
1410 /* Ensure that CPU is aware of completion of last grace period. */
1411 rcu_process_gp_end(rsp, rdp);
1412 local_irq_restore(flags);
1413
1414 /* Wake up rcu_gp_kthread() to start the grace period. */
1416 wake_up(&rsp->gp_wq); 1415 wake_up(&rsp->gp_wq);
1417} 1416}
1418 1417
@@ -1573,16 +1572,20 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1573/* 1572/*
1574 * Send the specified CPU's RCU callbacks to the orphanage. The 1573 * Send the specified CPU's RCU callbacks to the orphanage. The
1575 * specified CPU must be offline, and the caller must hold the 1574 * specified CPU must be offline, and the caller must hold the
1576 * ->onofflock. 1575 * ->orphan_lock.
1577 */ 1576 */
1578static void 1577static void
1579rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, 1578rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1580 struct rcu_node *rnp, struct rcu_data *rdp) 1579 struct rcu_node *rnp, struct rcu_data *rdp)
1581{ 1580{
1581 /* No-CBs CPUs do not have orphanable callbacks. */
1582 if (is_nocb_cpu(rdp->cpu))
1583 return;
1584
1582 /* 1585 /*
1583 * Orphan the callbacks. First adjust the counts. This is safe 1586 * Orphan the callbacks. First adjust the counts. This is safe
1584 * because ->onofflock excludes _rcu_barrier()'s adoption of 1587 * because _rcu_barrier() excludes CPU-hotplug operations, so it
1585 * the callbacks, thus no memory barrier is required. 1588 * cannot be running now. Thus no memory barrier is required.
1586 */ 1589 */
1587 if (rdp->nxtlist != NULL) { 1590 if (rdp->nxtlist != NULL) {
1588 rsp->qlen_lazy += rdp->qlen_lazy; 1591 rsp->qlen_lazy += rdp->qlen_lazy;
@@ -1623,13 +1626,17 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1623 1626
1624/* 1627/*
1625 * Adopt the RCU callbacks from the specified rcu_state structure's 1628 * Adopt the RCU callbacks from the specified rcu_state structure's
1626 * orphanage. The caller must hold the ->onofflock. 1629 * orphanage. The caller must hold the ->orphan_lock.
1627 */ 1630 */
1628static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) 1631static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1629{ 1632{
1630 int i; 1633 int i;
1631 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 1634 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
1632 1635
1636 /* No-CBs CPUs are handled specially. */
1637 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp))
1638 return;
1639
1633 /* Do the accounting first. */ 1640 /* Do the accounting first. */
1634 rdp->qlen_lazy += rsp->qlen_lazy; 1641 rdp->qlen_lazy += rsp->qlen_lazy;
1635 rdp->qlen += rsp->qlen; 1642 rdp->qlen += rsp->qlen;
@@ -1702,7 +1709,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1702 1709
1703 /* Exclude any attempts to start a new grace period. */ 1710 /* Exclude any attempts to start a new grace period. */
1704 mutex_lock(&rsp->onoff_mutex); 1711 mutex_lock(&rsp->onoff_mutex);
1705 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1712 raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
1706 1713
1707 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ 1714 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
1708 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); 1715 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
@@ -1729,10 +1736,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1729 /* 1736 /*
1730 * We still hold the leaf rcu_node structure lock here, and 1737 * We still hold the leaf rcu_node structure lock here, and
1731 * irqs are still disabled. The reason for this subterfuge is 1738 * irqs are still disabled. The reason for this subterfuge is
1732 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock 1739 * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock
1733 * held leads to deadlock. 1740 * held leads to deadlock.
1734 */ 1741 */
1735 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1742 raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */
1736 rnp = rdp->mynode; 1743 rnp = rdp->mynode;
1737 if (need_report & RCU_OFL_TASKS_NORM_GP) 1744 if (need_report & RCU_OFL_TASKS_NORM_GP)
1738 rcu_report_unblock_qs_rnp(rnp, flags); 1745 rcu_report_unblock_qs_rnp(rnp, flags);
@@ -1769,7 +1776,8 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1769{ 1776{
1770 unsigned long flags; 1777 unsigned long flags;
1771 struct rcu_head *next, *list, **tail; 1778 struct rcu_head *next, *list, **tail;
1772 int bl, count, count_lazy, i; 1779 long bl, count, count_lazy;
1780 int i;
1773 1781
1774 /* If no callbacks are ready, just return.*/ 1782 /* If no callbacks are ready, just return.*/
1775 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1783 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
@@ -2107,9 +2115,15 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2107 } 2115 }
2108} 2116}
2109 2117
2118/*
2119 * Helper function for call_rcu() and friends. The cpu argument will
2120 * normally be -1, indicating "currently running CPU". It may specify
2121 * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier()
2122 * is expected to specify a CPU.
2123 */
2110static void 2124static void
2111__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 2125__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2112 struct rcu_state *rsp, bool lazy) 2126 struct rcu_state *rsp, int cpu, bool lazy)
2113{ 2127{
2114 unsigned long flags; 2128 unsigned long flags;
2115 struct rcu_data *rdp; 2129 struct rcu_data *rdp;
@@ -2129,9 +2143,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2129 rdp = this_cpu_ptr(rsp->rda); 2143 rdp = this_cpu_ptr(rsp->rda);
2130 2144
2131 /* Add the callback to our list. */ 2145 /* Add the callback to our list. */
2132 if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL)) { 2146 if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) {
2147 int offline;
2148
2149 if (cpu != -1)
2150 rdp = per_cpu_ptr(rsp->rda, cpu);
2151 offline = !__call_rcu_nocb(rdp, head, lazy);
2152 WARN_ON_ONCE(offline);
2133 /* _call_rcu() is illegal on offline CPU; leak the callback. */ 2153 /* _call_rcu() is illegal on offline CPU; leak the callback. */
2134 WARN_ON_ONCE(1);
2135 local_irq_restore(flags); 2154 local_irq_restore(flags);
2136 return; 2155 return;
2137 } 2156 }
@@ -2160,7 +2179,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2160 */ 2179 */
2161void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 2180void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
2162{ 2181{
2163 __call_rcu(head, func, &rcu_sched_state, 0); 2182 __call_rcu(head, func, &rcu_sched_state, -1, 0);
2164} 2183}
2165EXPORT_SYMBOL_GPL(call_rcu_sched); 2184EXPORT_SYMBOL_GPL(call_rcu_sched);
2166 2185
@@ -2169,7 +2188,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
2169 */ 2188 */
2170void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 2189void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
2171{ 2190{
2172 __call_rcu(head, func, &rcu_bh_state, 0); 2191 __call_rcu(head, func, &rcu_bh_state, -1, 0);
2173} 2192}
2174EXPORT_SYMBOL_GPL(call_rcu_bh); 2193EXPORT_SYMBOL_GPL(call_rcu_bh);
2175 2194
@@ -2205,10 +2224,28 @@ static inline int rcu_blocking_is_gp(void)
2205 * rcu_read_lock_sched(). 2224 * rcu_read_lock_sched().
2206 * 2225 *
2207 * This means that all preempt_disable code sequences, including NMI and 2226 * This means that all preempt_disable code sequences, including NMI and
2208 * hardware-interrupt handlers, in progress on entry will have completed 2227 * non-threaded hardware-interrupt handlers, in progress on entry will
2209 * before this primitive returns. However, this does not guarantee that 2228 * have completed before this primitive returns. However, this does not
2210 * softirq handlers will have completed, since in some kernels, these 2229 * guarantee that softirq handlers will have completed, since in some
2211 * handlers can run in process context, and can block. 2230 * kernels, these handlers can run in process context, and can block.
2231 *
2232 * Note that this guarantee implies further memory-ordering guarantees.
2233 * On systems with more than one CPU, when synchronize_sched() returns,
2234 * each CPU is guaranteed to have executed a full memory barrier since the
2235 * end of its last RCU-sched read-side critical section whose beginning
2236 * preceded the call to synchronize_sched(). In addition, each CPU having
2237 * an RCU read-side critical section that extends beyond the return from
2238 * synchronize_sched() is guaranteed to have executed a full memory barrier
2239 * after the beginning of synchronize_sched() and before the beginning of
2240 * that RCU read-side critical section. Note that these guarantees include
2241 * CPUs that are offline, idle, or executing in user mode, as well as CPUs
2242 * that are executing in the kernel.
2243 *
2244 * Furthermore, if CPU A invoked synchronize_sched(), which returned
2245 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
2246 * to have executed a full memory barrier during the execution of
2247 * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but
2248 * again only if the system has more than one CPU).
2212 * 2249 *
2213 * This primitive provides the guarantees made by the (now removed) 2250 * This primitive provides the guarantees made by the (now removed)
2214 * synchronize_kernel() API. In contrast, synchronize_rcu() only 2251 * synchronize_kernel() API. In contrast, synchronize_rcu() only
@@ -2224,7 +2261,10 @@ void synchronize_sched(void)
2224 "Illegal synchronize_sched() in RCU-sched read-side critical section"); 2261 "Illegal synchronize_sched() in RCU-sched read-side critical section");
2225 if (rcu_blocking_is_gp()) 2262 if (rcu_blocking_is_gp())
2226 return; 2263 return;
2227 wait_rcu_gp(call_rcu_sched); 2264 if (rcu_expedited)
2265 synchronize_sched_expedited();
2266 else
2267 wait_rcu_gp(call_rcu_sched);
2228} 2268}
2229EXPORT_SYMBOL_GPL(synchronize_sched); 2269EXPORT_SYMBOL_GPL(synchronize_sched);
2230 2270
@@ -2236,6 +2276,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
2236 * read-side critical sections have completed. RCU read-side critical 2276 * read-side critical sections have completed. RCU read-side critical
2237 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), 2277 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
2238 * and may be nested. 2278 * and may be nested.
2279 *
2280 * See the description of synchronize_sched() for more detailed information
2281 * on memory ordering guarantees.
2239 */ 2282 */
2240void synchronize_rcu_bh(void) 2283void synchronize_rcu_bh(void)
2241{ 2284{
@@ -2245,13 +2288,13 @@ void synchronize_rcu_bh(void)
2245 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); 2288 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
2246 if (rcu_blocking_is_gp()) 2289 if (rcu_blocking_is_gp())
2247 return; 2290 return;
2248 wait_rcu_gp(call_rcu_bh); 2291 if (rcu_expedited)
2292 synchronize_rcu_bh_expedited();
2293 else
2294 wait_rcu_gp(call_rcu_bh);
2249} 2295}
2250EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 2296EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
2251 2297
2252static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
2253static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
2254
2255static int synchronize_sched_expedited_cpu_stop(void *data) 2298static int synchronize_sched_expedited_cpu_stop(void *data)
2256{ 2299{
2257 /* 2300 /*
@@ -2308,10 +2351,32 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
2308 */ 2351 */
2309void synchronize_sched_expedited(void) 2352void synchronize_sched_expedited(void)
2310{ 2353{
2311 int firstsnap, s, snap, trycount = 0; 2354 long firstsnap, s, snap;
2355 int trycount = 0;
2356 struct rcu_state *rsp = &rcu_sched_state;
2357
2358 /*
2359 * If we are in danger of counter wrap, just do synchronize_sched().
2360 * By allowing sync_sched_expedited_started to advance no more than
2361 * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
2362 * that more than 3.5 billion CPUs would be required to force a
2363 * counter wrap on a 32-bit system. Quite a few more CPUs would of
2364 * course be required on a 64-bit system.
2365 */
2366 if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
2367 (ulong)atomic_long_read(&rsp->expedited_done) +
2368 ULONG_MAX / 8)) {
2369 synchronize_sched();
2370 atomic_long_inc(&rsp->expedited_wrap);
2371 return;
2372 }
2312 2373
2313 /* Note that atomic_inc_return() implies full memory barrier. */ 2374 /*
2314 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); 2375 * Take a ticket. Note that atomic_inc_return() implies a
2376 * full memory barrier.
2377 */
2378 snap = atomic_long_inc_return(&rsp->expedited_start);
2379 firstsnap = snap;
2315 get_online_cpus(); 2380 get_online_cpus();
2316 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); 2381 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
2317 2382
@@ -2323,48 +2388,65 @@ void synchronize_sched_expedited(void)
2323 synchronize_sched_expedited_cpu_stop, 2388 synchronize_sched_expedited_cpu_stop,
2324 NULL) == -EAGAIN) { 2389 NULL) == -EAGAIN) {
2325 put_online_cpus(); 2390 put_online_cpus();
2391 atomic_long_inc(&rsp->expedited_tryfail);
2392
2393 /* Check to see if someone else did our work for us. */
2394 s = atomic_long_read(&rsp->expedited_done);
2395 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2396 /* ensure test happens before caller kfree */
2397 smp_mb__before_atomic_inc(); /* ^^^ */
2398 atomic_long_inc(&rsp->expedited_workdone1);
2399 return;
2400 }
2326 2401
2327 /* No joy, try again later. Or just synchronize_sched(). */ 2402 /* No joy, try again later. Or just synchronize_sched(). */
2328 if (trycount++ < 10) { 2403 if (trycount++ < 10) {
2329 udelay(trycount * num_online_cpus()); 2404 udelay(trycount * num_online_cpus());
2330 } else { 2405 } else {
2331 synchronize_sched(); 2406 wait_rcu_gp(call_rcu_sched);
2407 atomic_long_inc(&rsp->expedited_normal);
2332 return; 2408 return;
2333 } 2409 }
2334 2410
2335 /* Check to see if someone else did our work for us. */ 2411 /* Recheck to see if someone else did our work for us. */
2336 s = atomic_read(&sync_sched_expedited_done); 2412 s = atomic_long_read(&rsp->expedited_done);
2337 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { 2413 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2338 smp_mb(); /* ensure test happens before caller kfree */ 2414 /* ensure test happens before caller kfree */
2415 smp_mb__before_atomic_inc(); /* ^^^ */
2416 atomic_long_inc(&rsp->expedited_workdone2);
2339 return; 2417 return;
2340 } 2418 }
2341 2419
2342 /* 2420 /*
2343 * Refetching sync_sched_expedited_started allows later 2421 * Refetching sync_sched_expedited_started allows later
2344 * callers to piggyback on our grace period. We subtract 2422 * callers to piggyback on our grace period. We retry
2345 * 1 to get the same token that the last incrementer got. 2423 * after they started, so our grace period works for them,
2346 * We retry after they started, so our grace period works 2424 * and they started after our first try, so their grace
2347 * for them, and they started after our first try, so their 2425 * period works for us.
2348 * grace period works for us.
2349 */ 2426 */
2350 get_online_cpus(); 2427 get_online_cpus();
2351 snap = atomic_read(&sync_sched_expedited_started); 2428 snap = atomic_long_read(&rsp->expedited_start);
2352 smp_mb(); /* ensure read is before try_stop_cpus(). */ 2429 smp_mb(); /* ensure read is before try_stop_cpus(). */
2353 } 2430 }
2431 atomic_long_inc(&rsp->expedited_stoppedcpus);
2354 2432
2355 /* 2433 /*
2356 * Everyone up to our most recent fetch is covered by our grace 2434 * Everyone up to our most recent fetch is covered by our grace
2357 * period. Update the counter, but only if our work is still 2435 * period. Update the counter, but only if our work is still
2358 * relevant -- which it won't be if someone who started later 2436 * relevant -- which it won't be if someone who started later
2359 * than we did beat us to the punch. 2437 * than we did already did their update.
2360 */ 2438 */
2361 do { 2439 do {
2362 s = atomic_read(&sync_sched_expedited_done); 2440 atomic_long_inc(&rsp->expedited_done_tries);
2363 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { 2441 s = atomic_long_read(&rsp->expedited_done);
2364 smp_mb(); /* ensure test happens before caller kfree */ 2442 if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
2443 /* ensure test happens before caller kfree */
2444 smp_mb__before_atomic_inc(); /* ^^^ */
2445 atomic_long_inc(&rsp->expedited_done_lost);
2365 break; 2446 break;
2366 } 2447 }
2367 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); 2448 } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
2449 atomic_long_inc(&rsp->expedited_done_exit);
2368 2450
2369 put_online_cpus(); 2451 put_online_cpus();
2370} 2452}
@@ -2558,9 +2640,17 @@ static void _rcu_barrier(struct rcu_state *rsp)
2558 * When that callback is invoked, we will know that all of the 2640 * When that callback is invoked, we will know that all of the
2559 * corresponding CPU's preceding callbacks have been invoked. 2641 * corresponding CPU's preceding callbacks have been invoked.
2560 */ 2642 */
2561 for_each_online_cpu(cpu) { 2643 for_each_possible_cpu(cpu) {
2644 if (!cpu_online(cpu) && !is_nocb_cpu(cpu))
2645 continue;
2562 rdp = per_cpu_ptr(rsp->rda, cpu); 2646 rdp = per_cpu_ptr(rsp->rda, cpu);
2563 if (ACCESS_ONCE(rdp->qlen)) { 2647 if (is_nocb_cpu(cpu)) {
2648 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
2649 rsp->n_barrier_done);
2650 atomic_inc(&rsp->barrier_cpu_count);
2651 __call_rcu(&rdp->barrier_head, rcu_barrier_callback,
2652 rsp, cpu, 0);
2653 } else if (ACCESS_ONCE(rdp->qlen)) {
2564 _rcu_barrier_trace(rsp, "OnlineQ", cpu, 2654 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
2565 rsp->n_barrier_done); 2655 rsp->n_barrier_done);
2566 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); 2656 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
@@ -2634,6 +2724,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
2634#endif 2724#endif
2635 rdp->cpu = cpu; 2725 rdp->cpu = cpu;
2636 rdp->rsp = rsp; 2726 rdp->rsp = rsp;
2727 rcu_boot_init_nocb_percpu_data(rdp);
2637 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2728 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2638} 2729}
2639 2730
@@ -2715,6 +2806,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2715 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 2806 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2716 struct rcu_node *rnp = rdp->mynode; 2807 struct rcu_node *rnp = rdp->mynode;
2717 struct rcu_state *rsp; 2808 struct rcu_state *rsp;
2809 int ret = NOTIFY_OK;
2718 2810
2719 trace_rcu_utilization("Start CPU hotplug"); 2811 trace_rcu_utilization("Start CPU hotplug");
2720 switch (action) { 2812 switch (action) {
@@ -2728,7 +2820,10 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2728 rcu_boost_kthread_setaffinity(rnp, -1); 2820 rcu_boost_kthread_setaffinity(rnp, -1);
2729 break; 2821 break;
2730 case CPU_DOWN_PREPARE: 2822 case CPU_DOWN_PREPARE:
2731 rcu_boost_kthread_setaffinity(rnp, cpu); 2823 if (nocb_cpu_expendable(cpu))
2824 rcu_boost_kthread_setaffinity(rnp, cpu);
2825 else
2826 ret = NOTIFY_BAD;
2732 break; 2827 break;
2733 case CPU_DYING: 2828 case CPU_DYING:
2734 case CPU_DYING_FROZEN: 2829 case CPU_DYING_FROZEN:
@@ -2752,7 +2847,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2752 break; 2847 break;
2753 } 2848 }
2754 trace_rcu_utilization("End CPU hotplug"); 2849 trace_rcu_utilization("End CPU hotplug");
2755 return NOTIFY_OK; 2850 return ret;
2756} 2851}
2757 2852
2758/* 2853/*
@@ -2772,6 +2867,7 @@ static int __init rcu_spawn_gp_kthread(void)
2772 raw_spin_lock_irqsave(&rnp->lock, flags); 2867 raw_spin_lock_irqsave(&rnp->lock, flags);
2773 rsp->gp_kthread = t; 2868 rsp->gp_kthread = t;
2774 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2869 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2870 rcu_spawn_nocb_kthreads(rsp);
2775 } 2871 }
2776 return 0; 2872 return 0;
2777} 2873}
@@ -2967,6 +3063,7 @@ void __init rcu_init(void)
2967 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 3063 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
2968 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 3064 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
2969 __rcu_init_preempt(); 3065 __rcu_init_preempt();
3066 rcu_init_nocb();
2970 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 3067 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
2971 3068
2972 /* 3069 /*
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index a240f032848e..4b69291b093d 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -287,6 +287,7 @@ struct rcu_data {
287 long qlen_last_fqs_check; 287 long qlen_last_fqs_check;
288 /* qlen at last check for QS forcing */ 288 /* qlen at last check for QS forcing */
289 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 289 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
290 unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */
290 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ 291 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
291 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ 292 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
292 unsigned long n_force_qs_snap; 293 unsigned long n_force_qs_snap;
@@ -317,6 +318,18 @@ struct rcu_data {
317 struct rcu_head oom_head; 318 struct rcu_head oom_head;
318#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 319#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
319 320
321 /* 7) Callback offloading. */
322#ifdef CONFIG_RCU_NOCB_CPU
323 struct rcu_head *nocb_head; /* CBs waiting for kthread. */
324 struct rcu_head **nocb_tail;
325 atomic_long_t nocb_q_count; /* # CBs waiting for kthread */
326 atomic_long_t nocb_q_count_lazy; /* (approximate). */
327 int nocb_p_count; /* # CBs being invoked by kthread */
328 int nocb_p_count_lazy; /* (approximate). */
329 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
330 struct task_struct *nocb_kthread;
331#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
332
320 int cpu; 333 int cpu;
321 struct rcu_state *rsp; 334 struct rcu_state *rsp;
322}; 335};
@@ -369,6 +382,12 @@ struct rcu_state {
369 struct rcu_data __percpu *rda; /* pointer of percpu rcu_data. */ 382 struct rcu_data __percpu *rda; /* pointer of percpu rcu_data. */
370 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ 383 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
371 void (*func)(struct rcu_head *head)); 384 void (*func)(struct rcu_head *head));
385#ifdef CONFIG_RCU_NOCB_CPU
386 void (*call_remote)(struct rcu_head *head,
387 void (*func)(struct rcu_head *head));
388 /* call_rcu() flavor, but for */
389 /* placing on remote CPU. */
390#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
372 391
373 /* The following fields are guarded by the root rcu_node's lock. */ 392 /* The following fields are guarded by the root rcu_node's lock. */
374 393
@@ -383,9 +402,8 @@ struct rcu_state {
383 402
384 /* End of fields guarded by root rcu_node's lock. */ 403 /* End of fields guarded by root rcu_node's lock. */
385 404
386 raw_spinlock_t onofflock ____cacheline_internodealigned_in_smp; 405 raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
387 /* exclude on/offline and */ 406 /* Protect following fields. */
388 /* starting new GP. */
389 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ 407 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
390 /* need a grace period. */ 408 /* need a grace period. */
391 struct rcu_head **orphan_nxttail; /* Tail of above. */ 409 struct rcu_head **orphan_nxttail; /* Tail of above. */
@@ -394,7 +412,7 @@ struct rcu_state {
394 struct rcu_head **orphan_donetail; /* Tail of above. */ 412 struct rcu_head **orphan_donetail; /* Tail of above. */
395 long qlen_lazy; /* Number of lazy callbacks. */ 413 long qlen_lazy; /* Number of lazy callbacks. */
396 long qlen; /* Total number of callbacks. */ 414 long qlen; /* Total number of callbacks. */
397 /* End of fields guarded by onofflock. */ 415 /* End of fields guarded by orphan_lock. */
398 416
399 struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ 417 struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */
400 418
@@ -405,6 +423,18 @@ struct rcu_state {
405 /* _rcu_barrier(). */ 423 /* _rcu_barrier(). */
406 /* End of fields guarded by barrier_mutex. */ 424 /* End of fields guarded by barrier_mutex. */
407 425
426 atomic_long_t expedited_start; /* Starting ticket. */
427 atomic_long_t expedited_done; /* Done ticket. */
428 atomic_long_t expedited_wrap; /* # near-wrap incidents. */
429 atomic_long_t expedited_tryfail; /* # acquisition failures. */
430 atomic_long_t expedited_workdone1; /* # done by others #1. */
431 atomic_long_t expedited_workdone2; /* # done by others #2. */
432 atomic_long_t expedited_normal; /* # fallbacks to normal. */
433 atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */
434 atomic_long_t expedited_done_tries; /* # tries to update _done. */
435 atomic_long_t expedited_done_lost; /* # times beaten to _done. */
436 atomic_long_t expedited_done_exit; /* # times exited _done loop. */
437
408 unsigned long jiffies_force_qs; /* Time at which to invoke */ 438 unsigned long jiffies_force_qs; /* Time at which to invoke */
409 /* force_quiescent_state(). */ 439 /* force_quiescent_state(). */
410 unsigned long n_force_qs; /* Number of calls to */ 440 unsigned long n_force_qs; /* Number of calls to */
@@ -428,6 +458,8 @@ struct rcu_state {
428#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ 458#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
429 459
430extern struct list_head rcu_struct_flavors; 460extern struct list_head rcu_struct_flavors;
461
462/* Sequence through rcu_state structures for each RCU flavor. */
431#define for_each_rcu_flavor(rsp) \ 463#define for_each_rcu_flavor(rsp) \
432 list_for_each_entry((rsp), &rcu_struct_flavors, flavors) 464 list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
433 465
@@ -504,5 +536,32 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
504static void print_cpu_stall_info_end(void); 536static void print_cpu_stall_info_end(void);
505static void zero_cpu_stall_ticks(struct rcu_data *rdp); 537static void zero_cpu_stall_ticks(struct rcu_data *rdp);
506static void increment_cpu_stall_ticks(void); 538static void increment_cpu_stall_ticks(void);
539static bool is_nocb_cpu(int cpu);
540static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
541 bool lazy);
542static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
543 struct rcu_data *rdp);
544static bool nocb_cpu_expendable(int cpu);
545static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
546static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
547static void init_nocb_callback_list(struct rcu_data *rdp);
548static void __init rcu_init_nocb(void);
507 549
508#endif /* #ifndef RCU_TREE_NONCORE */ 550#endif /* #ifndef RCU_TREE_NONCORE */
551
552#ifdef CONFIG_RCU_TRACE
553#ifdef CONFIG_RCU_NOCB_CPU
554/* Sum up queue lengths for tracing. */
555static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
556{
557 *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count;
558 *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy;
559}
560#else /* #ifdef CONFIG_RCU_NOCB_CPU */
/* Without CONFIG_RCU_NOCB_CPU there are no no-CBs queues: report zero. */
static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
{
	*ql = 0;
	*qll = 0;
}
566#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
567#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index f92115488187..f6e5ec2932b4 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@
25 */ 25 */
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/gfp.h>
28#include <linux/oom.h> 29#include <linux/oom.h>
29#include <linux/smpboot.h> 30#include <linux/smpboot.h>
30 31
@@ -36,6 +37,14 @@
36#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO 37#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
37#endif 38#endif
38 39
40#ifdef CONFIG_RCU_NOCB_CPU
41static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
42static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
43static bool rcu_nocb_poll; /* Offload kthreads are to poll. */
44module_param(rcu_nocb_poll, bool, 0444); /* Exposed with mode 0444. */
45static char __initdata nocb_buf[NR_CPUS * 5]; /* Boot-time text of the no-CBs CPU list. */
46#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
47
39/* 48/*
40 * Check the RCU kernel configuration parameters and print informative 49 * Check the RCU kernel configuration parameters and print informative
41 * messages about anything out of the ordinary. If you like #ifdef, you 50 * messages about anything out of the ordinary. If you like #ifdef, you
@@ -76,6 +85,18 @@ static void __init rcu_bootup_announce_oddness(void)
76 printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); 85 printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
77 if (nr_cpu_ids != NR_CPUS) 86 if (nr_cpu_ids != NR_CPUS)
78 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 87 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
88#ifdef CONFIG_RCU_NOCB_CPU
89 if (have_rcu_nocb_mask) {
90 if (cpumask_test_cpu(0, rcu_nocb_mask)) {
91 cpumask_clear_cpu(0, rcu_nocb_mask);
92 pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n");
93 }
94 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
95 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf);
96 if (rcu_nocb_poll)
97 pr_info("\tExperimental polled no-CBs CPUs.\n");
98 }
99#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
79} 100}
80 101
81#ifdef CONFIG_TREE_PREEMPT_RCU 102#ifdef CONFIG_TREE_PREEMPT_RCU
@@ -642,7 +663,7 @@ static void rcu_preempt_do_callbacks(void)
642 */ 663 */
643void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 664void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
644{ 665{
645 __call_rcu(head, func, &rcu_preempt_state, 0); 666 __call_rcu(head, func, &rcu_preempt_state, -1, 0);
646} 667}
647EXPORT_SYMBOL_GPL(call_rcu); 668EXPORT_SYMBOL_GPL(call_rcu);
648 669
@@ -656,7 +677,7 @@ EXPORT_SYMBOL_GPL(call_rcu);
656void kfree_call_rcu(struct rcu_head *head, 677void kfree_call_rcu(struct rcu_head *head,
657 void (*func)(struct rcu_head *rcu)) 678 void (*func)(struct rcu_head *rcu))
658{ 679{
659 __call_rcu(head, func, &rcu_preempt_state, 1); 680 __call_rcu(head, func, &rcu_preempt_state, -1, 1);
660} 681}
661EXPORT_SYMBOL_GPL(kfree_call_rcu); 682EXPORT_SYMBOL_GPL(kfree_call_rcu);
662 683
@@ -670,6 +691,9 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu);
670 * concurrently with new RCU read-side critical sections that began while 691 * concurrently with new RCU read-side critical sections that began while
671 * synchronize_rcu() was waiting. RCU read-side critical sections are 692 * synchronize_rcu() was waiting. RCU read-side critical sections are
672 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. 693 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
694 *
695 * See the description of synchronize_sched() for more detailed information
696 * on memory ordering guarantees.
673 */ 697 */
674void synchronize_rcu(void) 698void synchronize_rcu(void)
675{ 699{
@@ -679,7 +703,10 @@ void synchronize_rcu(void)
679 "Illegal synchronize_rcu() in RCU read-side critical section"); 703 "Illegal synchronize_rcu() in RCU read-side critical section");
680 if (!rcu_scheduler_active) 704 if (!rcu_scheduler_active)
681 return; 705 return;
682 wait_rcu_gp(call_rcu); 706 if (rcu_expedited)
707 synchronize_rcu_expedited();
708 else
709 wait_rcu_gp(call_rcu);
683} 710}
684EXPORT_SYMBOL_GPL(synchronize_rcu); 711EXPORT_SYMBOL_GPL(synchronize_rcu);
685 712
@@ -757,7 +784,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
757 * grace period for the specified rcu_node structure. If there are no such 784 * grace period for the specified rcu_node structure. If there are no such
758 * tasks, report it up the rcu_node hierarchy. 785 * tasks, report it up the rcu_node hierarchy.
759 * 786 *
760 * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock. 787 * Caller must hold sync_rcu_preempt_exp_mutex and must exclude
788 * CPU hotplug operations.
761 */ 789 */
762static void 790static void
763sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) 791sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
@@ -831,7 +859,7 @@ void synchronize_rcu_expedited(void)
831 udelay(trycount * num_online_cpus()); 859 udelay(trycount * num_online_cpus());
832 } else { 860 } else {
833 put_online_cpus(); 861 put_online_cpus();
834 synchronize_rcu(); 862 wait_rcu_gp(call_rcu);
835 return; 863 return;
836 } 864 }
837 } 865 }
@@ -875,6 +903,11 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
875 903
876/** 904/**
877 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. 905 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
906 *
907 * Note that this primitive does not necessarily wait for an RCU grace period
908 * to complete. For example, if there are no RCU callbacks queued anywhere
909 * in the system, then rcu_barrier() is within its rights to return
910 * immediately, without waiting for anything, much less an RCU grace period.
878 */ 911 */
879void rcu_barrier(void) 912void rcu_barrier(void)
880{ 913{
@@ -1013,7 +1046,7 @@ static void rcu_preempt_check_callbacks(int cpu)
1013void kfree_call_rcu(struct rcu_head *head, 1046void kfree_call_rcu(struct rcu_head *head,
1014 void (*func)(struct rcu_head *rcu)) 1047 void (*func)(struct rcu_head *rcu))
1015{ 1048{
1016 __call_rcu(head, func, &rcu_sched_state, 1); 1049 __call_rcu(head, func, &rcu_sched_state, -1, 1);
1017} 1050}
1018EXPORT_SYMBOL_GPL(kfree_call_rcu); 1051EXPORT_SYMBOL_GPL(kfree_call_rcu);
1019 1052
@@ -2092,3 +2125,373 @@ static void increment_cpu_stall_ticks(void)
2092} 2125}
2093 2126
2094#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ 2127#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
2128
2129#ifdef CONFIG_RCU_NOCB_CPU
2130
2131/*
2132 * Offload callback processing from the boot-time-specified set of CPUs
2133 * specified by rcu_nocb_mask. For each CPU in the set, there is a
2134 * kthread created that pulls the callbacks from the corresponding CPU,
2135 * waits for a grace period to elapse, and invokes the callbacks.
2136 * The no-CBs CPUs do a wake_up() on their kthread when they insert
2137 * a callback into any empty list, unless the rcu_nocb_poll boot parameter
2138 * has been specified, in which case each kthread actively polls its
2139 * CPU. (Which isn't so great for energy efficiency, but which does
2140 * reduce RCU's overhead on that CPU.)
2141 *
2142 * This is intended to be used in conjunction with Frederic Weisbecker's
2143 * adaptive-idle work, which would seriously reduce OS jitter on CPUs
2144 * running CPU-bound user-mode computations.
2145 *
2146 * Offloading of callback processing could also in theory be used as
2147 * an energy-efficiency measure because CPUs with no RCU callbacks
2148 * queued are more aggressive about entering dyntick-idle mode.
2149 */
2150
2151
2152/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */
2153static int __init rcu_nocb_setup(char *str)
2154{
2155 alloc_bootmem_cpumask_var(&rcu_nocb_mask);
2156 have_rcu_nocb_mask = true;
2157 cpulist_parse(str, rcu_nocb_mask);
2158 return 1;
2159}
2160__setup("rcu_nocbs=", rcu_nocb_setup);
2161
2162/* Is the specified CPU a no-CPUs CPU? */
2163static bool is_nocb_cpu(int cpu)
2164{
2165 if (have_rcu_nocb_mask)
2166 return cpumask_test_cpu(cpu, rcu_nocb_mask);
2167 return false;
2168}
2169
2170/*
2171 * Enqueue the specified string of rcu_head structures onto the specified
2172 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the
2173 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
2174 * counts are supplied by rhcount and rhcount_lazy.
2175 *
2176 * If warranted, also wake up the kthread servicing this CPUs queues.
2177 */
2178static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2179 struct rcu_head *rhp,
2180 struct rcu_head **rhtp,
2181 int rhcount, int rhcount_lazy)
2182{
2183 int len;
2184 struct rcu_head **old_rhpp;
2185 struct task_struct *t;
2186
2187 /* Enqueue the callback on the nocb list and update counts. */
2188 old_rhpp = xchg(&rdp->nocb_tail, rhtp);
2189 ACCESS_ONCE(*old_rhpp) = rhp;
2190 atomic_long_add(rhcount, &rdp->nocb_q_count);
2191 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
2192
2193 /* If we are not being polled and there is a kthread, awaken it ... */
2194 t = ACCESS_ONCE(rdp->nocb_kthread);
2195 if (rcu_nocb_poll | !t)
2196 return;
2197 len = atomic_long_read(&rdp->nocb_q_count);
2198 if (old_rhpp == &rdp->nocb_head) {
2199 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */
2200 rdp->qlen_last_fqs_check = 0;
2201 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2202 wake_up_process(t); /* ... or if many callbacks queued. */
2203 rdp->qlen_last_fqs_check = LONG_MAX / 2;
2204 }
2205 return;
2206}
2207
2208/*
2209 * This is a helper for __call_rcu(), which invokes this when the normal
2210 * callback queue is inoperable. If this is not a no-CBs CPU, this
2211 * function returns failure back to __call_rcu(), which can complain
2212 * appropriately.
2213 *
2214 * Otherwise, this function queues the callback where the corresponding
2215 * "rcuo" kthread can find it.
2216 */
2217static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2218 bool lazy)
2219{
2220
2221 if (!is_nocb_cpu(rdp->cpu))
2222 return 0;
2223 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
2224 return 1;
2225}
2226
2227/*
2228 * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
2229 * not a no-CBs CPU.
2230 */
2231static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2232 struct rcu_data *rdp)
2233{
2234 long ql = rsp->qlen;
2235 long qll = rsp->qlen_lazy;
2236
2237 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
2238 if (!is_nocb_cpu(smp_processor_id()))
2239 return 0;
2240 rsp->qlen = 0;
2241 rsp->qlen_lazy = 0;
2242
2243 /* First, enqueue the donelist, if any. This preserves CB ordering. */
2244 if (rsp->orphan_donelist != NULL) {
2245 __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
2246 rsp->orphan_donetail, ql, qll);
2247 ql = qll = 0;
2248 rsp->orphan_donelist = NULL;
2249 rsp->orphan_donetail = &rsp->orphan_donelist;
2250 }
2251 if (rsp->orphan_nxtlist != NULL) {
2252 __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
2253 rsp->orphan_nxttail, ql, qll);
2254 ql = qll = 0;
2255 rsp->orphan_nxtlist = NULL;
2256 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
2257 }
2258 return 1;
2259}
2260
2261/*
2262 * There must be at least one non-no-CBs CPU in operation at any given
2263 * time, because no-CBs CPUs are not capable of initiating grace periods
2264 * independently. This function therefore complains if the specified
2265 * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to
2266 * avoid offlining the last such CPU. (Recursion is a wonderful thing,
2267 * but you have to have a base case!)
2268 */
2269static bool nocb_cpu_expendable(int cpu)
2270{
2271 cpumask_var_t non_nocb_cpus;
2272 int ret;
2273
2274 /*
2275 * If there are no no-CB CPUs or if this CPU is not a no-CB CPU,
2276 * then offlining this CPU is harmless. Let it happen.
2277 */
2278 if (!have_rcu_nocb_mask || is_nocb_cpu(cpu))
2279 return 1;
2280
2281 /* If no memory, play it safe and keep the CPU around. */
2282 if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO))
2283 return 0;
2284 cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask);
2285 cpumask_clear_cpu(cpu, non_nocb_cpus);
2286 ret = !cpumask_empty(non_nocb_cpus);
2287 free_cpumask_var(non_nocb_cpus);
2288 return ret;
2289}
2290
2291/*
2292 * Helper structure for remote registry of RCU callbacks.
2293 * This is needed for when a no-CBs CPU needs to start a grace period.
2294 * If it just invokes call_rcu(), the resulting callback will be queued,
2295 * which can result in deadlock.
2296 */
2297struct rcu_head_remote {
2298 struct rcu_head *rhp;
2299 call_rcu_func_t *crf;
2300 void (*func)(struct rcu_head *rhp);
2301};
2302
2303/*
2304 * Register a callback as specified by the rcu_head_remote struct.
2305 * This function is intended to be invoked via smp_call_function_single().
2306 */
2307static void call_rcu_local(void *arg)
2308{
2309 struct rcu_head_remote *rhrp =
2310 container_of(arg, struct rcu_head_remote, rhp);
2311
2312 rhrp->crf(rhrp->rhp, rhrp->func);
2313}
2314
2315/*
2316 * Set up an rcu_head_remote structure and the invoke call_rcu_local()
2317 * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via
2318 * smp_call_function_single().
2319 */
2320static void invoke_crf_remote(struct rcu_head *rhp,
2321 void (*func)(struct rcu_head *rhp),
2322 call_rcu_func_t crf)
2323{
2324 struct rcu_head_remote rhr;
2325
2326 rhr.rhp = rhp;
2327 rhr.crf = crf;
2328 rhr.func = func;
2329 smp_call_function_single(0, call_rcu_local, &rhr, 1);
2330}
2331
2332/*
2333 * Helper functions to be passed to wait_rcu_gp(), each of which
2334 * invokes invoke_crf_remote() to register a callback appropriately.
2335 */
2336static void __maybe_unused
2337call_rcu_preempt_remote(struct rcu_head *rhp,
2338 void (*func)(struct rcu_head *rhp))
2339{
2340 invoke_crf_remote(rhp, func, call_rcu);
2341}
2342static void call_rcu_bh_remote(struct rcu_head *rhp,
2343 void (*func)(struct rcu_head *rhp))
2344{
2345 invoke_crf_remote(rhp, func, call_rcu_bh);
2346}
2347static void call_rcu_sched_remote(struct rcu_head *rhp,
2348 void (*func)(struct rcu_head *rhp))
2349{
2350 invoke_crf_remote(rhp, func, call_rcu_sched);
2351}
2352
2353/*
2354 * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
2355 * callbacks queued by the corresponding no-CBs CPU.
2356 */
2357static int rcu_nocb_kthread(void *arg)
2358{
2359 int c, cl;
2360 struct rcu_head *list;
2361 struct rcu_head *next;
2362 struct rcu_head **tail;
2363 struct rcu_data *rdp = arg;
2364
2365 /* Each pass through this loop invokes one batch of callbacks */
2366 for (;;) {
2367 /* If not polling, wait for next batch of callbacks. */
2368 if (!rcu_nocb_poll)
2369 wait_event(rdp->nocb_wq, rdp->nocb_head);
2370 list = ACCESS_ONCE(rdp->nocb_head);
2371 if (!list) {
2372 schedule_timeout_interruptible(1);
2373 continue;
2374 }
2375
2376 /*
2377 * Extract queued callbacks, update counts, and wait
2378 * for a grace period to elapse.
2379 */
2380 ACCESS_ONCE(rdp->nocb_head) = NULL;
2381 tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
2382 c = atomic_long_xchg(&rdp->nocb_q_count, 0);
2383 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2384 ACCESS_ONCE(rdp->nocb_p_count) += c;
2385 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
2386 wait_rcu_gp(rdp->rsp->call_remote);
2387
2388 /* Each pass through the following loop invokes a callback. */
2389 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
2390 c = cl = 0;
2391 while (list) {
2392 next = list->next;
2393 /* Wait for enqueuing to complete, if needed. */
2394 while (next == NULL && &list->next != tail) {
2395 schedule_timeout_interruptible(1);
2396 next = list->next;
2397 }
2398 debug_rcu_head_unqueue(list);
2399 local_bh_disable();
2400 if (__rcu_reclaim(rdp->rsp->name, list))
2401 cl++;
2402 c++;
2403 local_bh_enable();
2404 list = next;
2405 }
2406 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
2407 ACCESS_ONCE(rdp->nocb_p_count) -= c;
2408 ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl;
2409 rdp->n_nocbs_invoked += c;
2410 }
2411 return 0;
2412}
2413
2414/* Initialize per-rcu_data variables for no-CBs CPUs. */
2415static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2416{
2417 rdp->nocb_tail = &rdp->nocb_head;
2418 init_waitqueue_head(&rdp->nocb_wq);
2419}
2420
2421/* Create a kthread for each RCU flavor for each no-CBs CPU. */
2422static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2423{
2424 int cpu;
2425 struct rcu_data *rdp;
2426 struct task_struct *t;
2427
2428 if (rcu_nocb_mask == NULL)
2429 return;
2430 for_each_cpu(cpu, rcu_nocb_mask) {
2431 rdp = per_cpu_ptr(rsp->rda, cpu);
2432 t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu);
2433 BUG_ON(IS_ERR(t));
2434 ACCESS_ONCE(rdp->nocb_kthread) = t;
2435 }
2436}
2437
2438/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
2439static void init_nocb_callback_list(struct rcu_data *rdp)
2440{
2441 if (rcu_nocb_mask == NULL ||
2442 !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
2443 return;
2444 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2445}
2446
2447/* Initialize the ->call_remote fields in the rcu_state structures. */
2448static void __init rcu_init_nocb(void)
2449{
2450#ifdef CONFIG_PREEMPT_RCU
2451 rcu_preempt_state.call_remote = call_rcu_preempt_remote;
2452#endif /* #ifdef CONFIG_PREEMPT_RCU */
2453 rcu_bh_state.call_remote = call_rcu_bh_remote;
2454 rcu_sched_state.call_remote = call_rcu_sched_remote;
2455}
2456
2457#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2458
/* Without CONFIG_RCU_NOCB_CPU, no CPU is ever a no-CBs CPU. */
static bool is_nocb_cpu(int cpu)
{
	return false;
}
2463
/* Without CONFIG_RCU_NOCB_CPU, nothing is ever queued: report failure. */
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
			    bool lazy)
{
	return false;
}
2469
2470static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2471 struct rcu_data *rdp)
2472{
2473 return 0;
2474}
2475
/* Without CONFIG_RCU_NOCB_CPU, every CPU is reported as expendable. */
static bool nocb_cpu_expendable(int cpu)
{
	return true;
}
2480
2481static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2482{
2483}
2484
2485static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2486{
2487}
2488
/* Without CONFIG_RCU_NOCB_CPU, the callback list is left as it is. */
static void init_nocb_callback_list(struct rcu_data *rdp)
{
}
2492
2493static void __init rcu_init_nocb(void)
2494{
2495}
2496
2497#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 693513bc50e6..0d095dcaa670 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,29 +46,58 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49static int show_rcubarrier(struct seq_file *m, void *unused) 49#define ulong2long(a) (*(long *)(&(a)))
50
51static int r_open(struct inode *inode, struct file *file,
52 const struct seq_operations *op)
50{ 53{
51 struct rcu_state *rsp; 54 int ret = seq_open(file, op);
55 if (!ret) {
56 struct seq_file *m = (struct seq_file *)file->private_data;
57 m->private = inode->i_private;
58 }
59 return ret;
60}
61
62static void *r_start(struct seq_file *m, loff_t *pos)
63{
64 struct rcu_state *rsp = (struct rcu_state *)m->private;
65 *pos = cpumask_next(*pos - 1, cpu_possible_mask);
66 if ((*pos) < nr_cpu_ids)
67 return per_cpu_ptr(rsp->rda, *pos);
68 return NULL;
69}
52 70
53 for_each_rcu_flavor(rsp) 71static void *r_next(struct seq_file *m, void *v, loff_t *pos)
54 seq_printf(m, "%s: bcc: %d nbd: %lu\n", 72{
55 rsp->name, 73 (*pos)++;
56 atomic_read(&rsp->barrier_cpu_count), 74 return r_start(m, pos);
57 rsp->n_barrier_done); 75}
76
77static void r_stop(struct seq_file *m, void *v)
78{
79}
80
81static int show_rcubarrier(struct seq_file *m, void *v)
82{
83 struct rcu_state *rsp = (struct rcu_state *)m->private;
84 seq_printf(m, "bcc: %d nbd: %lu\n",
85 atomic_read(&rsp->barrier_cpu_count),
86 rsp->n_barrier_done);
58 return 0; 87 return 0;
59} 88}
60 89
61static int rcubarrier_open(struct inode *inode, struct file *file) 90static int rcubarrier_open(struct inode *inode, struct file *file)
62{ 91{
63 return single_open(file, show_rcubarrier, NULL); 92 return single_open(file, show_rcubarrier, inode->i_private);
64} 93}
65 94
66static const struct file_operations rcubarrier_fops = { 95static const struct file_operations rcubarrier_fops = {
67 .owner = THIS_MODULE, 96 .owner = THIS_MODULE,
68 .open = rcubarrier_open, 97 .open = rcubarrier_open,
69 .read = seq_read, 98 .read = seq_read,
70 .llseek = seq_lseek, 99 .llseek = no_llseek,
71 .release = single_release, 100 .release = seq_release,
72}; 101};
73 102
74#ifdef CONFIG_RCU_BOOST 103#ifdef CONFIG_RCU_BOOST
@@ -84,12 +113,14 @@ static char convert_kthread_status(unsigned int kthread_status)
84 113
85static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 114static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
86{ 115{
116 long ql, qll;
117
87 if (!rdp->beenonline) 118 if (!rdp->beenonline)
88 return; 119 return;
89 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d qp=%d", 120 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d",
90 rdp->cpu, 121 rdp->cpu,
91 cpu_is_offline(rdp->cpu) ? '!' : ' ', 122 cpu_is_offline(rdp->cpu) ? '!' : ' ',
92 rdp->completed, rdp->gpnum, 123 ulong2long(rdp->completed), ulong2long(rdp->gpnum),
93 rdp->passed_quiesce, rdp->qs_pending); 124 rdp->passed_quiesce, rdp->qs_pending);
94 seq_printf(m, " dt=%d/%llx/%d df=%lu", 125 seq_printf(m, " dt=%d/%llx/%d df=%lu",
95 atomic_read(&rdp->dynticks->dynticks), 126 atomic_read(&rdp->dynticks->dynticks),
@@ -97,8 +128,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
97 rdp->dynticks->dynticks_nmi_nesting, 128 rdp->dynticks->dynticks_nmi_nesting,
98 rdp->dynticks_fqs); 129 rdp->dynticks_fqs);
99 seq_printf(m, " of=%lu", rdp->offline_fqs); 130 seq_printf(m, " of=%lu", rdp->offline_fqs);
131 rcu_nocb_q_lengths(rdp, &ql, &qll);
132 qll += rdp->qlen_lazy;
133 ql += rdp->qlen;
100 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", 134 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
101 rdp->qlen_lazy, rdp->qlen, 135 qll, ql,
102 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 136 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
103 rdp->nxttail[RCU_NEXT_TAIL]], 137 rdp->nxttail[RCU_NEXT_TAIL]],
104 ".R"[rdp->nxttail[RCU_WAIT_TAIL] != 138 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
@@ -114,101 +148,67 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
114 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); 148 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
115#endif /* #ifdef CONFIG_RCU_BOOST */ 149#endif /* #ifdef CONFIG_RCU_BOOST */
116 seq_printf(m, " b=%ld", rdp->blimit); 150 seq_printf(m, " b=%ld", rdp->blimit);
117 seq_printf(m, " ci=%lu co=%lu ca=%lu\n", 151 seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n",
118 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 152 rdp->n_cbs_invoked, rdp->n_nocbs_invoked,
153 rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
119} 154}
120 155
121static int show_rcudata(struct seq_file *m, void *unused) 156static int show_rcudata(struct seq_file *m, void *v)
122{ 157{
123 int cpu; 158 print_one_rcu_data(m, (struct rcu_data *)v);
124 struct rcu_state *rsp;
125
126 for_each_rcu_flavor(rsp) {
127 seq_printf(m, "%s:\n", rsp->name);
128 for_each_possible_cpu(cpu)
129 print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu));
130 }
131 return 0; 159 return 0;
132} 160}
133 161
162static const struct seq_operations rcudate_op = {
163 .start = r_start,
164 .next = r_next,
165 .stop = r_stop,
166 .show = show_rcudata,
167};
168
134static int rcudata_open(struct inode *inode, struct file *file) 169static int rcudata_open(struct inode *inode, struct file *file)
135{ 170{
136 return single_open(file, show_rcudata, NULL); 171 return r_open(inode, file, &rcudate_op);
137} 172}
138 173
139static const struct file_operations rcudata_fops = { 174static const struct file_operations rcudata_fops = {
140 .owner = THIS_MODULE, 175 .owner = THIS_MODULE,
141 .open = rcudata_open, 176 .open = rcudata_open,
142 .read = seq_read, 177 .read = seq_read,
143 .llseek = seq_lseek, 178 .llseek = no_llseek,
144 .release = single_release, 179 .release = seq_release,
145}; 180};
146 181
147static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) 182static int show_rcuexp(struct seq_file *m, void *v)
148{
149 if (!rdp->beenonline)
150 return;
151 seq_printf(m, "%d,%s,%lu,%lu,%d,%d",
152 rdp->cpu,
153 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
154 rdp->completed, rdp->gpnum,
155 rdp->passed_quiesce, rdp->qs_pending);
156 seq_printf(m, ",%d,%llx,%d,%lu",
157 atomic_read(&rdp->dynticks->dynticks),
158 rdp->dynticks->dynticks_nesting,
159 rdp->dynticks->dynticks_nmi_nesting,
160 rdp->dynticks_fqs);
161 seq_printf(m, ",%lu", rdp->offline_fqs);
162 seq_printf(m, ",%ld,%ld,\"%c%c%c%c\"", rdp->qlen_lazy, rdp->qlen,
163 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
164 rdp->nxttail[RCU_NEXT_TAIL]],
165 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
166 rdp->nxttail[RCU_NEXT_READY_TAIL]],
167 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
168 rdp->nxttail[RCU_WAIT_TAIL]],
169 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
170#ifdef CONFIG_RCU_BOOST
171 seq_printf(m, ",%d,\"%c\"",
172 per_cpu(rcu_cpu_has_work, rdp->cpu),
173 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
174 rdp->cpu)));
175#endif /* #ifdef CONFIG_RCU_BOOST */
176 seq_printf(m, ",%ld", rdp->blimit);
177 seq_printf(m, ",%lu,%lu,%lu\n",
178 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
179}
180
181static int show_rcudata_csv(struct seq_file *m, void *unused)
182{ 183{
183 int cpu; 184 struct rcu_state *rsp = (struct rcu_state *)m->private;
184 struct rcu_state *rsp; 185
185 186 seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n",
186 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pq\","); 187 atomic_long_read(&rsp->expedited_start),
187 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 188 atomic_long_read(&rsp->expedited_done),
188 seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); 189 atomic_long_read(&rsp->expedited_wrap),
189#ifdef CONFIG_RCU_BOOST 190 atomic_long_read(&rsp->expedited_tryfail),
190 seq_puts(m, "\"kt\",\"ktl\""); 191 atomic_long_read(&rsp->expedited_workdone1),
191#endif /* #ifdef CONFIG_RCU_BOOST */ 192 atomic_long_read(&rsp->expedited_workdone2),
192 seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); 193 atomic_long_read(&rsp->expedited_normal),
193 for_each_rcu_flavor(rsp) { 194 atomic_long_read(&rsp->expedited_stoppedcpus),
194 seq_printf(m, "\"%s:\"\n", rsp->name); 195 atomic_long_read(&rsp->expedited_done_tries),
195 for_each_possible_cpu(cpu) 196 atomic_long_read(&rsp->expedited_done_lost),
196 print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu)); 197 atomic_long_read(&rsp->expedited_done_exit));
197 }
198 return 0; 198 return 0;
199} 199}
200 200
201static int rcudata_csv_open(struct inode *inode, struct file *file) 201static int rcuexp_open(struct inode *inode, struct file *file)
202{ 202{
203 return single_open(file, show_rcudata_csv, NULL); 203 return single_open(file, show_rcuexp, inode->i_private);
204} 204}
205 205
206static const struct file_operations rcudata_csv_fops = { 206static const struct file_operations rcuexp_fops = {
207 .owner = THIS_MODULE, 207 .owner = THIS_MODULE,
208 .open = rcudata_csv_open, 208 .open = rcuexp_open,
209 .read = seq_read, 209 .read = seq_read,
210 .llseek = seq_lseek, 210 .llseek = no_llseek,
211 .release = single_release, 211 .release = seq_release,
212}; 212};
213 213
214#ifdef CONFIG_RCU_BOOST 214#ifdef CONFIG_RCU_BOOST
@@ -254,27 +254,11 @@ static const struct file_operations rcu_node_boost_fops = {
254 .owner = THIS_MODULE, 254 .owner = THIS_MODULE,
255 .open = rcu_node_boost_open, 255 .open = rcu_node_boost_open,
256 .read = seq_read, 256 .read = seq_read,
257 .llseek = seq_lseek, 257 .llseek = no_llseek,
258 .release = single_release, 258 .release = single_release,
259}; 259};
260 260
261/* 261#endif /* #ifdef CONFIG_RCU_BOOST */
262 * Create the rcuboost debugfs entry. Standard error return.
263 */
264static int rcu_boost_trace_create_file(struct dentry *rcudir)
265{
266 return !debugfs_create_file("rcuboost", 0444, rcudir, NULL,
267 &rcu_node_boost_fops);
268}
269
270#else /* #ifdef CONFIG_RCU_BOOST */
271
272static int rcu_boost_trace_create_file(struct dentry *rcudir)
273{
274 return 0; /* There cannot be an error if we didn't create it! */
275}
276
277#endif /* #else #ifdef CONFIG_RCU_BOOST */
278 262
279static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 263static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
280{ 264{
@@ -283,8 +267,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
283 struct rcu_node *rnp; 267 struct rcu_node *rnp;
284 268
285 gpnum = rsp->gpnum; 269 gpnum = rsp->gpnum;
286 seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ", 270 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ",
287 rsp->name, rsp->completed, gpnum, rsp->fqs_state, 271 ulong2long(rsp->completed), ulong2long(gpnum),
272 rsp->fqs_state,
288 (long)(rsp->jiffies_force_qs - jiffies), 273 (long)(rsp->jiffies_force_qs - jiffies),
289 (int)(jiffies & 0xffff)); 274 (int)(jiffies & 0xffff));
290 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", 275 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
@@ -306,26 +291,24 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
306 seq_puts(m, "\n"); 291 seq_puts(m, "\n");
307} 292}
308 293
309static int show_rcuhier(struct seq_file *m, void *unused) 294static int show_rcuhier(struct seq_file *m, void *v)
310{ 295{
311 struct rcu_state *rsp; 296 struct rcu_state *rsp = (struct rcu_state *)m->private;
312 297 print_one_rcu_state(m, rsp);
313 for_each_rcu_flavor(rsp)
314 print_one_rcu_state(m, rsp);
315 return 0; 298 return 0;
316} 299}
317 300
318static int rcuhier_open(struct inode *inode, struct file *file) 301static int rcuhier_open(struct inode *inode, struct file *file)
319{ 302{
320 return single_open(file, show_rcuhier, NULL); 303 return single_open(file, show_rcuhier, inode->i_private);
321} 304}
322 305
323static const struct file_operations rcuhier_fops = { 306static const struct file_operations rcuhier_fops = {
324 .owner = THIS_MODULE, 307 .owner = THIS_MODULE,
325 .open = rcuhier_open, 308 .open = rcuhier_open,
326 .read = seq_read, 309 .read = seq_read,
327 .llseek = seq_lseek, 310 .llseek = no_llseek,
328 .release = single_release, 311 .release = seq_release,
329}; 312};
330 313
331static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) 314static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
@@ -338,42 +321,42 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
338 struct rcu_node *rnp = &rsp->node[0]; 321 struct rcu_node *rnp = &rsp->node[0];
339 322
340 raw_spin_lock_irqsave(&rnp->lock, flags); 323 raw_spin_lock_irqsave(&rnp->lock, flags);
341 completed = rsp->completed; 324 completed = ACCESS_ONCE(rsp->completed);
342 gpnum = rsp->gpnum; 325 gpnum = ACCESS_ONCE(rsp->gpnum);
343 if (rsp->completed == rsp->gpnum) 326 if (completed == gpnum)
344 gpage = 0; 327 gpage = 0;
345 else 328 else
346 gpage = jiffies - rsp->gp_start; 329 gpage = jiffies - rsp->gp_start;
347 gpmax = rsp->gp_max; 330 gpmax = rsp->gp_max;
348 raw_spin_unlock_irqrestore(&rnp->lock, flags); 331 raw_spin_unlock_irqrestore(&rnp->lock, flags);
349 seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n", 332 seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n",
350 rsp->name, completed, gpnum, gpage, gpmax); 333 ulong2long(completed), ulong2long(gpnum), gpage, gpmax);
351} 334}
352 335
353static int show_rcugp(struct seq_file *m, void *unused) 336static int show_rcugp(struct seq_file *m, void *v)
354{ 337{
355 struct rcu_state *rsp; 338 struct rcu_state *rsp = (struct rcu_state *)m->private;
356 339 show_one_rcugp(m, rsp);
357 for_each_rcu_flavor(rsp)
358 show_one_rcugp(m, rsp);
359 return 0; 340 return 0;
360} 341}
361 342
362static int rcugp_open(struct inode *inode, struct file *file) 343static int rcugp_open(struct inode *inode, struct file *file)
363{ 344{
364 return single_open(file, show_rcugp, NULL); 345 return single_open(file, show_rcugp, inode->i_private);
365} 346}
366 347
367static const struct file_operations rcugp_fops = { 348static const struct file_operations rcugp_fops = {
368 .owner = THIS_MODULE, 349 .owner = THIS_MODULE,
369 .open = rcugp_open, 350 .open = rcugp_open,
370 .read = seq_read, 351 .read = seq_read,
371 .llseek = seq_lseek, 352 .llseek = no_llseek,
372 .release = single_release, 353 .release = seq_release,
373}; 354};
374 355
375static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 356static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
376{ 357{
358 if (!rdp->beenonline)
359 return;
377 seq_printf(m, "%3d%cnp=%ld ", 360 seq_printf(m, "%3d%cnp=%ld ",
378 rdp->cpu, 361 rdp->cpu,
379 cpu_is_offline(rdp->cpu) ? '!' : ' ', 362 cpu_is_offline(rdp->cpu) ? '!' : ' ',
@@ -389,34 +372,30 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
389 rdp->n_rp_need_nothing); 372 rdp->n_rp_need_nothing);
390} 373}
391 374
392static int show_rcu_pending(struct seq_file *m, void *unused) 375static int show_rcu_pending(struct seq_file *m, void *v)
393{ 376{
394 int cpu; 377 print_one_rcu_pending(m, (struct rcu_data *)v);
395 struct rcu_data *rdp;
396 struct rcu_state *rsp;
397
398 for_each_rcu_flavor(rsp) {
399 seq_printf(m, "%s:\n", rsp->name);
400 for_each_possible_cpu(cpu) {
401 rdp = per_cpu_ptr(rsp->rda, cpu);
402 if (rdp->beenonline)
403 print_one_rcu_pending(m, rdp);
404 }
405 }
406 return 0; 378 return 0;
407} 379}
408 380
381static const struct seq_operations rcu_pending_op = {
382 .start = r_start,
383 .next = r_next,
384 .stop = r_stop,
385 .show = show_rcu_pending,
386};
387
409static int rcu_pending_open(struct inode *inode, struct file *file) 388static int rcu_pending_open(struct inode *inode, struct file *file)
410{ 389{
411 return single_open(file, show_rcu_pending, NULL); 390 return r_open(inode, file, &rcu_pending_op);
412} 391}
413 392
414static const struct file_operations rcu_pending_fops = { 393static const struct file_operations rcu_pending_fops = {
415 .owner = THIS_MODULE, 394 .owner = THIS_MODULE,
416 .open = rcu_pending_open, 395 .open = rcu_pending_open,
417 .read = seq_read, 396 .read = seq_read,
418 .llseek = seq_lseek, 397 .llseek = no_llseek,
419 .release = single_release, 398 .release = seq_release,
420}; 399};
421 400
422static int show_rcutorture(struct seq_file *m, void *unused) 401static int show_rcutorture(struct seq_file *m, void *unused)
@@ -446,43 +425,58 @@ static struct dentry *rcudir;
446 425
447static int __init rcutree_trace_init(void) 426static int __init rcutree_trace_init(void)
448{ 427{
428 struct rcu_state *rsp;
449 struct dentry *retval; 429 struct dentry *retval;
430 struct dentry *rspdir;
450 431
451 rcudir = debugfs_create_dir("rcu", NULL); 432 rcudir = debugfs_create_dir("rcu", NULL);
452 if (!rcudir) 433 if (!rcudir)
453 goto free_out; 434 goto free_out;
454 435
455 retval = debugfs_create_file("rcubarrier", 0444, rcudir, 436 for_each_rcu_flavor(rsp) {
456 NULL, &rcubarrier_fops); 437 rspdir = debugfs_create_dir(rsp->name, rcudir);
457 if (!retval) 438 if (!rspdir)
458 goto free_out; 439 goto free_out;
459 440
460 retval = debugfs_create_file("rcudata", 0444, rcudir, 441 retval = debugfs_create_file("rcudata", 0444,
461 NULL, &rcudata_fops); 442 rspdir, rsp, &rcudata_fops);
462 if (!retval) 443 if (!retval)
463 goto free_out; 444 goto free_out;
464 445
465 retval = debugfs_create_file("rcudata.csv", 0444, rcudir, 446 retval = debugfs_create_file("rcuexp", 0444,
466 NULL, &rcudata_csv_fops); 447 rspdir, rsp, &rcuexp_fops);
467 if (!retval) 448 if (!retval)
468 goto free_out; 449 goto free_out;
469 450
470 if (rcu_boost_trace_create_file(rcudir)) 451 retval = debugfs_create_file("rcu_pending", 0444,
471 goto free_out; 452 rspdir, rsp, &rcu_pending_fops);
453 if (!retval)
454 goto free_out;
455
456 retval = debugfs_create_file("rcubarrier", 0444,
457 rspdir, rsp, &rcubarrier_fops);
458 if (!retval)
459 goto free_out;
472 460
473 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); 461#ifdef CONFIG_RCU_BOOST
474 if (!retval) 462 if (rsp == &rcu_preempt_state) {
475 goto free_out; 463 retval = debugfs_create_file("rcuboost", 0444,
464 rspdir, NULL, &rcu_node_boost_fops);
465 if (!retval)
466 goto free_out;
467 }
468#endif
476 469
477 retval = debugfs_create_file("rcuhier", 0444, rcudir, 470 retval = debugfs_create_file("rcugp", 0444,
478 NULL, &rcuhier_fops); 471 rspdir, rsp, &rcugp_fops);
479 if (!retval) 472 if (!retval)
480 goto free_out; 473 goto free_out;
481 474
482 retval = debugfs_create_file("rcu_pending", 0444, rcudir, 475 retval = debugfs_create_file("rcuhier", 0444,
483 NULL, &rcu_pending_fops); 476 rspdir, rsp, &rcuhier_fops);
484 if (!retval) 477 if (!retval)
485 goto free_out; 478 goto free_out;
479 }
486 480
487 retval = debugfs_create_file("rcutorture", 0444, rcudir, 481 retval = debugfs_create_file("rcutorture", 0444, rcudir,
488 NULL, &rcutorture_fops); 482 NULL, &rcutorture_fops);
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index ad581aa2369a..ff55247e7049 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -86,33 +86,39 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
86 return __res_counter_charge(counter, val, limit_fail_at, true); 86 return __res_counter_charge(counter, val, limit_fail_at, true);
87} 87}
88 88
89void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) 89u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
90{ 90{
91 if (WARN_ON(counter->usage < val)) 91 if (WARN_ON(counter->usage < val))
92 val = counter->usage; 92 val = counter->usage;
93 93
94 counter->usage -= val; 94 counter->usage -= val;
95 return counter->usage;
95} 96}
96 97
97void res_counter_uncharge_until(struct res_counter *counter, 98u64 res_counter_uncharge_until(struct res_counter *counter,
98 struct res_counter *top, 99 struct res_counter *top,
99 unsigned long val) 100 unsigned long val)
100{ 101{
101 unsigned long flags; 102 unsigned long flags;
102 struct res_counter *c; 103 struct res_counter *c;
104 u64 ret = 0;
103 105
104 local_irq_save(flags); 106 local_irq_save(flags);
105 for (c = counter; c != top; c = c->parent) { 107 for (c = counter; c != top; c = c->parent) {
108 u64 r;
106 spin_lock(&c->lock); 109 spin_lock(&c->lock);
107 res_counter_uncharge_locked(c, val); 110 r = res_counter_uncharge_locked(c, val);
111 if (c == counter)
112 ret = r;
108 spin_unlock(&c->lock); 113 spin_unlock(&c->lock);
109 } 114 }
110 local_irq_restore(flags); 115 local_irq_restore(flags);
116 return ret;
111} 117}
112 118
113void res_counter_uncharge(struct res_counter *counter, unsigned long val) 119u64 res_counter_uncharge(struct res_counter *counter, unsigned long val)
114{ 120{
115 res_counter_uncharge_until(counter, NULL, val); 121 return res_counter_uncharge_until(counter, NULL, val);
116} 122}
117 123
118static inline unsigned long long * 124static inline unsigned long long *
@@ -192,25 +198,3 @@ int res_counter_memparse_write_strategy(const char *buf,
192 *res = PAGE_ALIGN(*res); 198 *res = PAGE_ALIGN(*res);
193 return 0; 199 return 0;
194} 200}
195
196int res_counter_write(struct res_counter *counter, int member,
197 const char *buf, write_strategy_fn write_strategy)
198{
199 char *end;
200 unsigned long flags;
201 unsigned long long tmp, *val;
202
203 if (write_strategy) {
204 if (write_strategy(buf, &tmp))
205 return -EINVAL;
206 } else {
207 tmp = simple_strtoull(buf, &end, 10);
208 if (*end != '\0')
209 return -EINVAL;
210 }
211 spin_lock_irqsave(&counter->lock, flags);
212 val = res_counter_member(counter, member);
213 *val = tmp;
214 spin_unlock_irqrestore(&counter->lock, flags);
215 return 0;
216}
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 15f60d01198b..0984a21076a3 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -143,11 +143,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
143 143
144 p->signal->autogroup = autogroup_kref_get(ag); 144 p->signal->autogroup = autogroup_kref_get(ag);
145 145
146 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
147 goto out;
148
146 t = p; 149 t = p;
147 do { 150 do {
148 sched_move_task(t); 151 sched_move_task(t);
149 } while_each_thread(p, t); 152 } while_each_thread(p, t);
150 153
154out:
151 unlock_task_sighand(p, &flags); 155 unlock_task_sighand(p, &flags);
152 autogroup_kref_put(prev); 156 autogroup_kref_put(prev);
153} 157}
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
index 443232ebbb53..8bd047142816 100644
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/auto_group.h
@@ -4,6 +4,11 @@
4#include <linux/rwsem.h> 4#include <linux/rwsem.h>
5 5
6struct autogroup { 6struct autogroup {
7 /*
8 * reference doesn't mean how many thread attach to this
9 * autogroup now. It just stands for the number of task
10 * could use this autogroup.
11 */
7 struct kref kref; 12 struct kref kref;
8 struct task_group *tg; 13 struct task_group *tg;
9 struct rw_semaphore lock; 14 struct rw_semaphore lock;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda712..257002c13bb0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -72,6 +72,7 @@
72#include <linux/slab.h> 72#include <linux/slab.h>
73#include <linux/init_task.h> 73#include <linux/init_task.h>
74#include <linux/binfmts.h> 74#include <linux/binfmts.h>
75#include <linux/context_tracking.h>
75 76
76#include <asm/switch_to.h> 77#include <asm/switch_to.h>
77#include <asm/tlb.h> 78#include <asm/tlb.h>
@@ -192,23 +193,10 @@ static void sched_feat_disable(int i) { };
192static void sched_feat_enable(int i) { }; 193static void sched_feat_enable(int i) { };
193#endif /* HAVE_JUMP_LABEL */ 194#endif /* HAVE_JUMP_LABEL */
194 195
195static ssize_t 196static int sched_feat_set(char *cmp)
196sched_feat_write(struct file *filp, const char __user *ubuf,
197 size_t cnt, loff_t *ppos)
198{ 197{
199 char buf[64];
200 char *cmp;
201 int neg = 0;
202 int i; 198 int i;
203 199 int neg = 0;
204 if (cnt > 63)
205 cnt = 63;
206
207 if (copy_from_user(&buf, ubuf, cnt))
208 return -EFAULT;
209
210 buf[cnt] = 0;
211 cmp = strstrip(buf);
212 200
213 if (strncmp(cmp, "NO_", 3) == 0) { 201 if (strncmp(cmp, "NO_", 3) == 0) {
214 neg = 1; 202 neg = 1;
@@ -228,6 +216,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
228 } 216 }
229 } 217 }
230 218
219 return i;
220}
221
222static ssize_t
223sched_feat_write(struct file *filp, const char __user *ubuf,
224 size_t cnt, loff_t *ppos)
225{
226 char buf[64];
227 char *cmp;
228 int i;
229
230 if (cnt > 63)
231 cnt = 63;
232
233 if (copy_from_user(&buf, ubuf, cnt))
234 return -EFAULT;
235
236 buf[cnt] = 0;
237 cmp = strstrip(buf);
238
239 i = sched_feat_set(cmp);
231 if (i == __SCHED_FEAT_NR) 240 if (i == __SCHED_FEAT_NR)
232 return -EINVAL; 241 return -EINVAL;
233 242
@@ -922,6 +931,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
922 rq->skip_clock_update = 1; 931 rq->skip_clock_update = 1;
923} 932}
924 933
934static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
935
936void register_task_migration_notifier(struct notifier_block *n)
937{
938 atomic_notifier_chain_register(&task_migration_notifier, n);
939}
940
925#ifdef CONFIG_SMP 941#ifdef CONFIG_SMP
926void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 942void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
927{ 943{
@@ -952,8 +968,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
952 trace_sched_migrate_task(p, new_cpu); 968 trace_sched_migrate_task(p, new_cpu);
953 969
954 if (task_cpu(p) != new_cpu) { 970 if (task_cpu(p) != new_cpu) {
971 struct task_migration_notifier tmn;
972
973 if (p->sched_class->migrate_task_rq)
974 p->sched_class->migrate_task_rq(p, new_cpu);
955 p->se.nr_migrations++; 975 p->se.nr_migrations++;
956 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); 976 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
977
978 tmn.task = p;
979 tmn.from_cpu = task_cpu(p);
980 tmn.to_cpu = new_cpu;
981
982 atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
957 } 983 }
958 984
959 __set_task_cpu(p, new_cpu); 985 __set_task_cpu(p, new_cpu);
@@ -1524,6 +1550,15 @@ static void __sched_fork(struct task_struct *p)
1524 p->se.vruntime = 0; 1550 p->se.vruntime = 0;
1525 INIT_LIST_HEAD(&p->se.group_node); 1551 INIT_LIST_HEAD(&p->se.group_node);
1526 1552
1553/*
1554 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
1555 * removed when useful for applications beyond shares distribution (e.g.
1556 * load-balance).
1557 */
1558#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1559 p->se.avg.runnable_avg_period = 0;
1560 p->se.avg.runnable_avg_sum = 0;
1561#endif
1527#ifdef CONFIG_SCHEDSTATS 1562#ifdef CONFIG_SCHEDSTATS
1528 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1563 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1529#endif 1564#endif
@@ -1533,7 +1568,40 @@ static void __sched_fork(struct task_struct *p)
1533#ifdef CONFIG_PREEMPT_NOTIFIERS 1568#ifdef CONFIG_PREEMPT_NOTIFIERS
1534 INIT_HLIST_HEAD(&p->preempt_notifiers); 1569 INIT_HLIST_HEAD(&p->preempt_notifiers);
1535#endif 1570#endif
1571
1572#ifdef CONFIG_NUMA_BALANCING
1573 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1574 p->mm->numa_next_scan = jiffies;
1575 p->mm->numa_next_reset = jiffies;
1576 p->mm->numa_scan_seq = 0;
1577 }
1578
1579 p->node_stamp = 0ULL;
1580 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1581 p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
1582 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1583 p->numa_work.next = &p->numa_work;
1584#endif /* CONFIG_NUMA_BALANCING */
1585}
1586
1587#ifdef CONFIG_NUMA_BALANCING
1588#ifdef CONFIG_SCHED_DEBUG
1589void set_numabalancing_state(bool enabled)
1590{
1591 if (enabled)
1592 sched_feat_set("NUMA");
1593 else
1594 sched_feat_set("NO_NUMA");
1536} 1595}
1596#else
1597__read_mostly bool numabalancing_enabled;
1598
1599void set_numabalancing_state(bool enabled)
1600{
1601 numabalancing_enabled = enabled;
1602}
1603#endif /* CONFIG_SCHED_DEBUG */
1604#endif /* CONFIG_NUMA_BALANCING */
1537 1605
1538/* 1606/*
1539 * fork()/clone()-time setup: 1607 * fork()/clone()-time setup:
@@ -1886,8 +1954,8 @@ context_switch(struct rq *rq, struct task_struct *prev,
1886 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 1954 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1887#endif 1955#endif
1888 1956
1957 context_tracking_task_switch(prev, next);
1889 /* Here we just switch the register state and the stack. */ 1958 /* Here we just switch the register state and the stack. */
1890 rcu_switch(prev, next);
1891 switch_to(prev, next, prev); 1959 switch_to(prev, next, prev);
1892 1960
1893 barrier(); 1961 barrier();
@@ -2911,7 +2979,7 @@ asmlinkage void __sched schedule(void)
2911} 2979}
2912EXPORT_SYMBOL(schedule); 2980EXPORT_SYMBOL(schedule);
2913 2981
2914#ifdef CONFIG_RCU_USER_QS 2982#ifdef CONFIG_CONTEXT_TRACKING
2915asmlinkage void __sched schedule_user(void) 2983asmlinkage void __sched schedule_user(void)
2916{ 2984{
2917 /* 2985 /*
@@ -2920,9 +2988,9 @@ asmlinkage void __sched schedule_user(void)
2920 * we haven't yet exited the RCU idle mode. Do it here manually until 2988 * we haven't yet exited the RCU idle mode. Do it here manually until
2921 * we find a better solution. 2989 * we find a better solution.
2922 */ 2990 */
2923 rcu_user_exit(); 2991 user_exit();
2924 schedule(); 2992 schedule();
2925 rcu_user_enter(); 2993 user_enter();
2926} 2994}
2927#endif 2995#endif
2928 2996
@@ -3027,7 +3095,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
3027 /* Catch callers which need to be fixed */ 3095 /* Catch callers which need to be fixed */
3028 BUG_ON(ti->preempt_count || !irqs_disabled()); 3096 BUG_ON(ti->preempt_count || !irqs_disabled());
3029 3097
3030 rcu_user_exit(); 3098 user_exit();
3031 do { 3099 do {
3032 add_preempt_count(PREEMPT_ACTIVE); 3100 add_preempt_count(PREEMPT_ACTIVE);
3033 local_irq_enable(); 3101 local_irq_enable();
@@ -4029,8 +4097,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4029 goto out_free_cpus_allowed; 4097 goto out_free_cpus_allowed;
4030 } 4098 }
4031 retval = -EPERM; 4099 retval = -EPERM;
4032 if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) 4100 if (!check_same_owner(p)) {
4033 goto out_unlock; 4101 rcu_read_lock();
4102 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
4103 rcu_read_unlock();
4104 goto out_unlock;
4105 }
4106 rcu_read_unlock();
4107 }
4034 4108
4035 retval = security_task_setscheduler(p); 4109 retval = security_task_setscheduler(p);
4036 if (retval) 4110 if (retval)
@@ -4474,6 +4548,7 @@ static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
4474void sched_show_task(struct task_struct *p) 4548void sched_show_task(struct task_struct *p)
4475{ 4549{
4476 unsigned long free = 0; 4550 unsigned long free = 0;
4551 int ppid;
4477 unsigned state; 4552 unsigned state;
4478 4553
4479 state = p->state ? __ffs(p->state) + 1 : 0; 4554 state = p->state ? __ffs(p->state) + 1 : 0;
@@ -4493,8 +4568,11 @@ void sched_show_task(struct task_struct *p)
4493#ifdef CONFIG_DEBUG_STACK_USAGE 4568#ifdef CONFIG_DEBUG_STACK_USAGE
4494 free = stack_not_used(p); 4569 free = stack_not_used(p);
4495#endif 4570#endif
4571 rcu_read_lock();
4572 ppid = task_pid_nr(rcu_dereference(p->real_parent));
4573 rcu_read_unlock();
4496 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4574 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4497 task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), 4575 task_pid_nr(p), ppid,
4498 (unsigned long)task_thread_info(p)->flags); 4576 (unsigned long)task_thread_info(p)->flags);
4499 4577
4500 show_stack(p, NULL); 4578 show_stack(p, NULL);
@@ -7468,7 +7546,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7468 struct task_group, css); 7546 struct task_group, css);
7469} 7547}
7470 7548
7471static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) 7549static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7472{ 7550{
7473 struct task_group *tg, *parent; 7551 struct task_group *tg, *parent;
7474 7552
@@ -7485,7 +7563,7 @@ static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
7485 return &tg->css; 7563 return &tg->css;
7486} 7564}
7487 7565
7488static void cpu_cgroup_destroy(struct cgroup *cgrp) 7566static void cpu_cgroup_css_free(struct cgroup *cgrp)
7489{ 7567{
7490 struct task_group *tg = cgroup_tg(cgrp); 7568 struct task_group *tg = cgroup_tg(cgrp);
7491 7569
@@ -7845,8 +7923,8 @@ static struct cftype cpu_files[] = {
7845 7923
7846struct cgroup_subsys cpu_cgroup_subsys = { 7924struct cgroup_subsys cpu_cgroup_subsys = {
7847 .name = "cpu", 7925 .name = "cpu",
7848 .create = cpu_cgroup_create, 7926 .css_alloc = cpu_cgroup_css_alloc,
7849 .destroy = cpu_cgroup_destroy, 7927 .css_free = cpu_cgroup_css_free,
7850 .can_attach = cpu_cgroup_can_attach, 7928 .can_attach = cpu_cgroup_can_attach,
7851 .attach = cpu_cgroup_attach, 7929 .attach = cpu_cgroup_attach,
7852 .exit = cpu_cgroup_exit, 7930 .exit = cpu_cgroup_exit,
@@ -7869,7 +7947,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7869struct cpuacct root_cpuacct; 7947struct cpuacct root_cpuacct;
7870 7948
7871/* create a new cpu accounting group */ 7949/* create a new cpu accounting group */
7872static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) 7950static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
7873{ 7951{
7874 struct cpuacct *ca; 7952 struct cpuacct *ca;
7875 7953
@@ -7899,7 +7977,7 @@ out:
7899} 7977}
7900 7978
7901/* destroy an existing cpu accounting group */ 7979/* destroy an existing cpu accounting group */
7902static void cpuacct_destroy(struct cgroup *cgrp) 7980static void cpuacct_css_free(struct cgroup *cgrp)
7903{ 7981{
7904 struct cpuacct *ca = cgroup_ca(cgrp); 7982 struct cpuacct *ca = cgroup_ca(cgrp);
7905 7983
@@ -8070,9 +8148,15 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8070 8148
8071struct cgroup_subsys cpuacct_subsys = { 8149struct cgroup_subsys cpuacct_subsys = {
8072 .name = "cpuacct", 8150 .name = "cpuacct",
8073 .create = cpuacct_create, 8151 .css_alloc = cpuacct_css_alloc,
8074 .destroy = cpuacct_destroy, 8152 .css_free = cpuacct_css_free,
8075 .subsys_id = cpuacct_subsys_id, 8153 .subsys_id = cpuacct_subsys_id,
8076 .base_cftypes = files, 8154 .base_cftypes = files,
8077}; 8155};
8078#endif /* CONFIG_CGROUP_CPUACCT */ 8156#endif /* CONFIG_CGROUP_CPUACCT */
8157
8158void dump_cpu_task(int cpu)
8159{
8160 pr_info("Task dump for CPU %d:\n", cpu);
8161 sched_show_task(cpu_curr(cpu));
8162}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 81b763ba58a6..293b202fcf79 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -43,7 +43,7 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq);
43 * Called before incrementing preempt_count on {soft,}irq_enter 43 * Called before incrementing preempt_count on {soft,}irq_enter
44 * and before decrementing preempt_count on {soft,}irq_exit. 44 * and before decrementing preempt_count on {soft,}irq_exit.
45 */ 45 */
46void vtime_account(struct task_struct *curr) 46void irqtime_account_irq(struct task_struct *curr)
47{ 47{
48 unsigned long flags; 48 unsigned long flags;
49 s64 delta; 49 s64 delta;
@@ -73,7 +73,7 @@ void vtime_account(struct task_struct *curr)
73 irq_time_write_end(); 73 irq_time_write_end();
74 local_irq_restore(flags); 74 local_irq_restore(flags);
75} 75}
76EXPORT_SYMBOL_GPL(vtime_account); 76EXPORT_SYMBOL_GPL(irqtime_account_irq);
77 77
78static int irqtime_account_hi_update(void) 78static int irqtime_account_hi_update(void)
79{ 79{
@@ -288,6 +288,34 @@ static __always_inline bool steal_account_process_tick(void)
288 return false; 288 return false;
289} 289}
290 290
291/*
292 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
293 * tasks (sum on group iteration) belonging to @tsk's group.
294 */
295void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
296{
297 struct signal_struct *sig = tsk->signal;
298 struct task_struct *t;
299
300 times->utime = sig->utime;
301 times->stime = sig->stime;
302 times->sum_exec_runtime = sig->sum_sched_runtime;
303
304 rcu_read_lock();
305 /* make sure we can trust tsk->thread_group list */
306 if (!likely(pid_alive(tsk)))
307 goto out;
308
309 t = tsk;
310 do {
311 times->utime += t->utime;
312 times->stime += t->stime;
313 times->sum_exec_runtime += task_sched_runtime(t);
314 } while_each_thread(tsk, t);
315out:
316 rcu_read_unlock();
317}
318
291#ifndef CONFIG_VIRT_CPU_ACCOUNTING 319#ifndef CONFIG_VIRT_CPU_ACCOUNTING
292 320
293#ifdef CONFIG_IRQ_TIME_ACCOUNTING 321#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -417,13 +445,13 @@ void account_idle_ticks(unsigned long ticks)
417 * Use precise platform statistics if available: 445 * Use precise platform statistics if available:
418 */ 446 */
419#ifdef CONFIG_VIRT_CPU_ACCOUNTING 447#ifdef CONFIG_VIRT_CPU_ACCOUNTING
420void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 448void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
421{ 449{
422 *ut = p->utime; 450 *ut = p->utime;
423 *st = p->stime; 451 *st = p->stime;
424} 452}
425 453
426void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 454void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
427{ 455{
428 struct task_cputime cputime; 456 struct task_cputime cputime;
429 457
@@ -433,6 +461,29 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
433 *st = cputime.stime; 461 *st = cputime.stime;
434} 462}
435 463
464void vtime_account_system_irqsafe(struct task_struct *tsk)
465{
466 unsigned long flags;
467
468 local_irq_save(flags);
469 vtime_account_system(tsk);
470 local_irq_restore(flags);
471}
472EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
473
474#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
475void vtime_task_switch(struct task_struct *prev)
476{
477 if (is_idle_task(prev))
478 vtime_account_idle(prev);
479 else
480 vtime_account_system(prev);
481
482 vtime_account_user(prev);
483 arch_vtime_task_switch(prev);
484}
485#endif
486
436/* 487/*
437 * Archs that account the whole time spent in the idle task 488 * Archs that account the whole time spent in the idle task
438 * (outside irq) as idle time can rely on this and just implement 489 * (outside irq) as idle time can rely on this and just implement
@@ -444,16 +495,10 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
444#ifndef __ARCH_HAS_VTIME_ACCOUNT 495#ifndef __ARCH_HAS_VTIME_ACCOUNT
445void vtime_account(struct task_struct *tsk) 496void vtime_account(struct task_struct *tsk)
446{ 497{
447 unsigned long flags;
448
449 local_irq_save(flags);
450
451 if (in_interrupt() || !is_idle_task(tsk)) 498 if (in_interrupt() || !is_idle_task(tsk))
452 vtime_account_system(tsk); 499 vtime_account_system(tsk);
453 else 500 else
454 vtime_account_idle(tsk); 501 vtime_account_idle(tsk);
455
456 local_irq_restore(flags);
457} 502}
458EXPORT_SYMBOL_GPL(vtime_account); 503EXPORT_SYMBOL_GPL(vtime_account);
459#endif /* __ARCH_HAS_VTIME_ACCOUNT */ 504#endif /* __ARCH_HAS_VTIME_ACCOUNT */
@@ -478,14 +523,30 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
478 return (__force cputime_t) temp; 523 return (__force cputime_t) temp;
479} 524}
480 525
481void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 526/*
527 * Adjust tick based cputime random precision against scheduler
528 * runtime accounting.
529 */
530static void cputime_adjust(struct task_cputime *curr,
531 struct cputime *prev,
532 cputime_t *ut, cputime_t *st)
482{ 533{
483 cputime_t rtime, utime = p->utime, total = utime + p->stime; 534 cputime_t rtime, utime, total;
535
536 utime = curr->utime;
537 total = utime + curr->stime;
484 538
485 /* 539 /*
486 * Use CFS's precise accounting: 540 * Tick based cputime accounting depend on random scheduling
541 * timeslices of a task to be interrupted or not by the timer.
542 * Depending on these circumstances, the number of these interrupts
543 * may be over or under-optimistic, matching the real user and system
544 * cputime with a variable precision.
545 *
546 * Fix this by scaling these tick based values against the total
547 * runtime accounted by the CFS scheduler.
487 */ 548 */
488 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 549 rtime = nsecs_to_cputime(curr->sum_exec_runtime);
489 550
490 if (total) 551 if (total)
491 utime = scale_utime(utime, rtime, total); 552 utime = scale_utime(utime, rtime, total);
@@ -493,38 +554,36 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
493 utime = rtime; 554 utime = rtime;
494 555
495 /* 556 /*
496 * Compare with previous values, to keep monotonicity: 557 * If the tick based count grows faster than the scheduler one,
558 * the result of the scaling may go backward.
559 * Let's enforce monotonicity.
497 */ 560 */
498 p->prev_utime = max(p->prev_utime, utime); 561 prev->utime = max(prev->utime, utime);
499 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); 562 prev->stime = max(prev->stime, rtime - prev->utime);
500 563
501 *ut = p->prev_utime; 564 *ut = prev->utime;
502 *st = p->prev_stime; 565 *st = prev->stime;
566}
567
568void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
569{
570 struct task_cputime cputime = {
571 .utime = p->utime,
572 .stime = p->stime,
573 .sum_exec_runtime = p->se.sum_exec_runtime,
574 };
575
576 cputime_adjust(&cputime, &p->prev_cputime, ut, st);
503} 577}
504 578
505/* 579/*
506 * Must be called with siglock held. 580 * Must be called with siglock held.
507 */ 581 */
508void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 582void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
509{ 583{
510 struct signal_struct *sig = p->signal;
511 struct task_cputime cputime; 584 struct task_cputime cputime;
512 cputime_t rtime, utime, total;
513 585
514 thread_group_cputime(p, &cputime); 586 thread_group_cputime(p, &cputime);
515 587 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
516 total = cputime.utime + cputime.stime;
517 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
518
519 if (total)
520 utime = scale_utime(cputime.utime, rtime, total);
521 else
522 utime = rtime;
523
524 sig->prev_utime = max(sig->prev_utime, utime);
525 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
526
527 *ut = sig->prev_utime;
528 *st = sig->prev_stime;
529} 588}
530#endif 589#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 6f79596e0ea9..2cd3c1b4e582 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec)
61static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) 61static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
62{ 62{
63 struct sched_entity *se = tg->se[cpu]; 63 struct sched_entity *se = tg->se[cpu];
64 if (!se)
65 return;
66 64
67#define P(F) \ 65#define P(F) \
68 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) 66 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
69#define PN(F) \ 67#define PN(F) \
70 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) 68 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
71 69
70 if (!se) {
71 struct sched_avg *avg = &cpu_rq(cpu)->avg;
72 P(avg->runnable_avg_sum);
73 P(avg->runnable_avg_period);
74 return;
75 }
76
77
72 PN(se->exec_start); 78 PN(se->exec_start);
73 PN(se->vruntime); 79 PN(se->vruntime);
74 PN(se->sum_exec_runtime); 80 PN(se->sum_exec_runtime);
@@ -85,6 +91,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
85 P(se->statistics.wait_count); 91 P(se->statistics.wait_count);
86#endif 92#endif
87 P(se->load.weight); 93 P(se->load.weight);
94#ifdef CONFIG_SMP
95 P(se->avg.runnable_avg_sum);
96 P(se->avg.runnable_avg_period);
97 P(se->avg.load_avg_contrib);
98 P(se->avg.decay_count);
99#endif
88#undef PN 100#undef PN
89#undef P 101#undef P
90} 102}
@@ -206,14 +218,18 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 218 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207#ifdef CONFIG_FAIR_GROUP_SCHED 219#ifdef CONFIG_FAIR_GROUP_SCHED
208#ifdef CONFIG_SMP 220#ifdef CONFIG_SMP
209 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", 221 SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg",
210 SPLIT_NS(cfs_rq->load_avg)); 222 cfs_rq->runnable_load_avg);
211 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", 223 SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg",
212 SPLIT_NS(cfs_rq->load_period)); 224 cfs_rq->blocked_load_avg);
213 SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", 225 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
214 cfs_rq->load_contribution); 226 atomic64_read(&cfs_rq->tg->load_avg));
215 SEQ_printf(m, " .%-30s: %d\n", "load_tg", 227 SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
216 atomic_read(&cfs_rq->tg->load_weight)); 228 cfs_rq->tg_load_contrib);
229 SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
230 cfs_rq->tg_runnable_contrib);
231 SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
232 atomic_read(&cfs_rq->tg->runnable_avg));
217#endif 233#endif
218 234
219 print_cfs_group_stats(m, cpu, cfs_rq->tg); 235 print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a14b990..5eea8707234a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,9 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/profile.h> 27#include <linux/profile.h>
28#include <linux/interrupt.h> 28#include <linux/interrupt.h>
29#include <linux/mempolicy.h>
30#include <linux/migrate.h>
31#include <linux/task_work.h>
29 32
30#include <trace/events/sched.h> 33#include <trace/events/sched.h>
31 34
@@ -259,6 +262,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
259 return grp->my_q; 262 return grp->my_q;
260} 263}
261 264
265static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
266 int force_update);
267
262static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 268static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
263{ 269{
264 if (!cfs_rq->on_list) { 270 if (!cfs_rq->on_list) {
@@ -278,6 +284,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
278 } 284 }
279 285
280 cfs_rq->on_list = 1; 286 cfs_rq->on_list = 1;
287 /* We should have no load, but we need to update last_decay. */
288 update_cfs_rq_blocked_load(cfs_rq, 0);
281 } 289 }
282} 290}
283 291
@@ -653,9 +661,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
653 return calc_delta_fair(sched_slice(cfs_rq, se), se); 661 return calc_delta_fair(sched_slice(cfs_rq, se), se);
654} 662}
655 663
656static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
657static void update_cfs_shares(struct cfs_rq *cfs_rq);
658
659/* 664/*
660 * Update the current task's runtime statistics. Skip current tasks that 665 * Update the current task's runtime statistics. Skip current tasks that
661 * are not in our scheduling class. 666 * are not in our scheduling class.
@@ -675,10 +680,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
675 680
676 curr->vruntime += delta_exec_weighted; 681 curr->vruntime += delta_exec_weighted;
677 update_min_vruntime(cfs_rq); 682 update_min_vruntime(cfs_rq);
678
679#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
680 cfs_rq->load_unacc_exec_time += delta_exec;
681#endif
682} 683}
683 684
684static void update_curr(struct cfs_rq *cfs_rq) 685static void update_curr(struct cfs_rq *cfs_rq)
@@ -776,6 +777,230 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
776 * Scheduling class queueing methods: 777 * Scheduling class queueing methods:
777 */ 778 */
778 779
780#ifdef CONFIG_NUMA_BALANCING
781/*
782 * numa task sample period in ms
783 */
784unsigned int sysctl_numa_balancing_scan_period_min = 100;
785unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
786unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
787
788/* Portion of address space to scan in MB */
789unsigned int sysctl_numa_balancing_scan_size = 256;
790
791/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
792unsigned int sysctl_numa_balancing_scan_delay = 1000;
793
794static void task_numa_placement(struct task_struct *p)
795{
796 int seq;
797
798 if (!p->mm) /* for example, ksmd faulting in a user's mm */
799 return;
800 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
801 if (p->numa_scan_seq == seq)
802 return;
803 p->numa_scan_seq = seq;
804
805 /* FIXME: Scheduling placement policy hints go here */
806}
807
808/*
809 * Got a PROT_NONE fault for a page on @node.
810 */
811void task_numa_fault(int node, int pages, bool migrated)
812{
813 struct task_struct *p = current;
814
815 if (!sched_feat_numa(NUMA))
816 return;
817
818 /* FIXME: Allocate task-specific structure for placement policy here */
819
820 /*
821 * If pages are properly placed (did not migrate) then scan slower.
822 * This is reset periodically in case of phase changes
823 */
824 if (!migrated)
825 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
826 p->numa_scan_period + jiffies_to_msecs(10));
827
828 task_numa_placement(p);
829}
830
831static void reset_ptenuma_scan(struct task_struct *p)
832{
833 ACCESS_ONCE(p->mm->numa_scan_seq)++;
834 p->mm->numa_scan_offset = 0;
835}
836
837/*
838 * The expensive part of numa migration is done from task_work context.
839 * Triggered from task_tick_numa().
840 */
841void task_numa_work(struct callback_head *work)
842{
843 unsigned long migrate, next_scan, now = jiffies;
844 struct task_struct *p = current;
845 struct mm_struct *mm = p->mm;
846 struct vm_area_struct *vma;
847 unsigned long start, end;
848 long pages;
849
850 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
851
852 work->next = work; /* protect against double add */
853 /*
854 * Who cares about NUMA placement when they're dying.
855 *
856 * NOTE: make sure not to dereference p->mm before this check,
857 * exit_task_work() happens _after_ exit_mm() so we could be called
858 * without p->mm even though we still had it when we enqueued this
859 * work.
860 */
861 if (p->flags & PF_EXITING)
862 return;
863
864 /*
865 * We do not care about task placement until a task runs on a node
866 * other than the first one used by the address space. This is
867 * largely because migrations are driven by what CPU the task
868 * is running on. If it's never scheduled on another node, it'll
869 * not migrate so why bother trapping the fault.
870 */
871 if (mm->first_nid == NUMA_PTE_SCAN_INIT)
872 mm->first_nid = numa_node_id();
873 if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
874 /* Are we running on a new node yet? */
875 if (numa_node_id() == mm->first_nid &&
876 !sched_feat_numa(NUMA_FORCE))
877 return;
878
879 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
880 }
881
882 /*
883 * Reset the scan period if enough time has gone by. Objective is that
884 * scanning will be reduced if pages are properly placed. As tasks
885 * can enter different phases this needs to be re-examined. Lacking
886 * proper tracking of reference behaviour, this blunt hammer is used.
887 */
888 migrate = mm->numa_next_reset;
889 if (time_after(now, migrate)) {
890 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
891 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
892 xchg(&mm->numa_next_reset, next_scan);
893 }
894
895 /*
896 * Enforce maximal scan/migration frequency..
897 */
898 migrate = mm->numa_next_scan;
899 if (time_before(now, migrate))
900 return;
901
902 if (p->numa_scan_period == 0)
903 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
904
905 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
906 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
907 return;
908
909 /*
910 * Do not set pte_numa if the current running node is rate-limited.
911 * This loses statistics on the fault but if we are unwilling to
912 * migrate to this node, it is less likely we can do useful work
913 */
914 if (migrate_ratelimited(numa_node_id()))
915 return;
916
917 start = mm->numa_scan_offset;
918 pages = sysctl_numa_balancing_scan_size;
919 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
920 if (!pages)
921 return;
922
923 down_read(&mm->mmap_sem);
924 vma = find_vma(mm, start);
925 if (!vma) {
926 reset_ptenuma_scan(p);
927 start = 0;
928 vma = mm->mmap;
929 }
930 for (; vma; vma = vma->vm_next) {
931 if (!vma_migratable(vma))
932 continue;
933
934 /* Skip small VMAs. They are not likely to be of relevance */
935 if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
936 continue;
937
938 do {
939 start = max(start, vma->vm_start);
940 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
941 end = min(end, vma->vm_end);
942 pages -= change_prot_numa(vma, start, end);
943
944 start = end;
945 if (pages <= 0)
946 goto out;
947 } while (end != vma->vm_end);
948 }
949
950out:
951 /*
952 * It is possible to reach the end of the VMA list but the last few VMAs are
953 * not guaranteed to be vma_migratable. If they are not, we would find the
954 * !migratable VMA on the next scan but not reset the scanner to the start
955 * so check it now.
956 */
957 if (vma)
958 mm->numa_scan_offset = start;
959 else
960 reset_ptenuma_scan(p);
961 up_read(&mm->mmap_sem);
962}
963
964/*
965 * Drive the periodic memory faults..
966 */
967void task_tick_numa(struct rq *rq, struct task_struct *curr)
968{
969 struct callback_head *work = &curr->numa_work;
970 u64 period, now;
971
972 /*
973 * We don't care about NUMA placement if we don't have memory.
974 */
975 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
976 return;
977
978 /*
979 * Using runtime rather than walltime has the dual advantage that
980 * we (mostly) drive the selection from busy threads and that the
981 * task needs to have done some actual work before we bother with
982 * NUMA placement.
983 */
984 now = curr->se.sum_exec_runtime;
985 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
986
987 if (now - curr->node_stamp > period) {
988 if (!curr->node_stamp)
989 curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
990 curr->node_stamp = now;
991
992 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
993 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
994 task_work_add(curr, work, true);
995 }
996 }
997}
998#else
999static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1000{
1001}
1002#endif /* CONFIG_NUMA_BALANCING */
1003
779static void 1004static void
780account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 1005account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
781{ 1006{
@@ -801,72 +1026,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
801} 1026}
802 1027
803#ifdef CONFIG_FAIR_GROUP_SCHED 1028#ifdef CONFIG_FAIR_GROUP_SCHED
804/* we need this in update_cfs_load and load-balance functions below */
805static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
806# ifdef CONFIG_SMP 1029# ifdef CONFIG_SMP
807static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
808 int global_update)
809{
810 struct task_group *tg = cfs_rq->tg;
811 long load_avg;
812
813 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
814 load_avg -= cfs_rq->load_contribution;
815
816 if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
817 atomic_add(load_avg, &tg->load_weight);
818 cfs_rq->load_contribution += load_avg;
819 }
820}
821
822static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
823{
824 u64 period = sysctl_sched_shares_window;
825 u64 now, delta;
826 unsigned long load = cfs_rq->load.weight;
827
828 if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
829 return;
830
831 now = rq_of(cfs_rq)->clock_task;
832 delta = now - cfs_rq->load_stamp;
833
834 /* truncate load history at 4 idle periods */
835 if (cfs_rq->load_stamp > cfs_rq->load_last &&
836 now - cfs_rq->load_last > 4 * period) {
837 cfs_rq->load_period = 0;
838 cfs_rq->load_avg = 0;
839 delta = period - 1;
840 }
841
842 cfs_rq->load_stamp = now;
843 cfs_rq->load_unacc_exec_time = 0;
844 cfs_rq->load_period += delta;
845 if (load) {
846 cfs_rq->load_last = now;
847 cfs_rq->load_avg += delta * load;
848 }
849
850 /* consider updating load contribution on each fold or truncate */
851 if (global_update || cfs_rq->load_period > period
852 || !cfs_rq->load_period)
853 update_cfs_rq_load_contribution(cfs_rq, global_update);
854
855 while (cfs_rq->load_period > period) {
856 /*
857 * Inline assembly required to prevent the compiler
858 * optimising this loop into a divmod call.
859 * See __iter_div_u64_rem() for another example of this.
860 */
861 asm("" : "+rm" (cfs_rq->load_period));
862 cfs_rq->load_period /= 2;
863 cfs_rq->load_avg /= 2;
864 }
865
866 if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
867 list_del_leaf_cfs_rq(cfs_rq);
868}
869
870static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) 1030static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
871{ 1031{
872 long tg_weight; 1032 long tg_weight;
@@ -876,8 +1036,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
876 * to gain a more accurate current total weight. See 1036 * to gain a more accurate current total weight. See
877 * update_cfs_rq_load_contribution(). 1037 * update_cfs_rq_load_contribution().
878 */ 1038 */
879 tg_weight = atomic_read(&tg->load_weight); 1039 tg_weight = atomic64_read(&tg->load_avg);
880 tg_weight -= cfs_rq->load_contribution; 1040 tg_weight -= cfs_rq->tg_load_contrib;
881 tg_weight += cfs_rq->load.weight; 1041 tg_weight += cfs_rq->load.weight;
882 1042
883 return tg_weight; 1043 return tg_weight;
@@ -901,27 +1061,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
901 1061
902 return shares; 1062 return shares;
903} 1063}
904
905static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
906{
907 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
908 update_cfs_load(cfs_rq, 0);
909 update_cfs_shares(cfs_rq);
910 }
911}
912# else /* CONFIG_SMP */ 1064# else /* CONFIG_SMP */
913static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
914{
915}
916
917static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) 1065static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
918{ 1066{
919 return tg->shares; 1067 return tg->shares;
920} 1068}
921
922static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
923{
924}
925# endif /* CONFIG_SMP */ 1069# endif /* CONFIG_SMP */
926static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 1070static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
927 unsigned long weight) 1071 unsigned long weight)
@@ -939,6 +1083,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
939 account_entity_enqueue(cfs_rq, se); 1083 account_entity_enqueue(cfs_rq, se);
940} 1084}
941 1085
1086static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
1087
942static void update_cfs_shares(struct cfs_rq *cfs_rq) 1088static void update_cfs_shares(struct cfs_rq *cfs_rq)
943{ 1089{
944 struct task_group *tg; 1090 struct task_group *tg;
@@ -958,18 +1104,477 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
958 reweight_entity(cfs_rq_of(se), se, shares); 1104 reweight_entity(cfs_rq_of(se), se, shares);
959} 1105}
960#else /* CONFIG_FAIR_GROUP_SCHED */ 1106#else /* CONFIG_FAIR_GROUP_SCHED */
961static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) 1107static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
962{ 1108{
963} 1109}
1110#endif /* CONFIG_FAIR_GROUP_SCHED */
964 1111
965static inline void update_cfs_shares(struct cfs_rq *cfs_rq) 1112/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
1113#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1114/*
1115 * We choose a half-life close to 1 scheduling period.
1116 * Note: The tables below are dependent on this value.
1117 */
1118#define LOAD_AVG_PERIOD 32
1119#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
1120#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
1121
1122/* Precomputed fixed inverse multiplies for multiplication by y^n */
1123static const u32 runnable_avg_yN_inv[] = {
1124 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
1125 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
1126 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
1127 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
1128 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
1129 0x85aac367, 0x82cd8698,
1130};
1131
1132/*
1133 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
1134 * over-estimates when re-combining.
1135 */
1136static const u32 runnable_avg_yN_sum[] = {
1137 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
1138 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
1139 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
1140};
1141
1142/*
1143 * Approximate:
1144 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
1145 */
1146static __always_inline u64 decay_load(u64 val, u64 n)
966{ 1147{
1148 unsigned int local_n;
1149
1150 if (!n)
1151 return val;
1152 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
1153 return 0;
1154
1155 /* after bounds checking we can collapse to 32-bit */
1156 local_n = n;
1157
1158 /*
1159 * As y^PERIOD = 1/2, we can combine
1160 * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
1161 * With a look-up table which covers k^n (n<PERIOD)
1162 *
1163 * To achieve constant time decay_load.
1164 */
1165 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
1166 val >>= local_n / LOAD_AVG_PERIOD;
1167 local_n %= LOAD_AVG_PERIOD;
1168 }
1169
1170 val *= runnable_avg_yN_inv[local_n];
1171 /* We don't use SRR here since we always want to round down. */
1172 return val >> 32;
967} 1173}
968 1174
969static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) 1175/*
1176 * For updates fully spanning n periods, the contribution to runnable
1177 * average will be: \Sum 1024*y^n
1178 *
1179 * We can compute this reasonably efficiently by combining:
1180 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
1181 */
1182static u32 __compute_runnable_contrib(u64 n)
970{ 1183{
1184 u32 contrib = 0;
1185
1186 if (likely(n <= LOAD_AVG_PERIOD))
1187 return runnable_avg_yN_sum[n];
1188 else if (unlikely(n >= LOAD_AVG_MAX_N))
1189 return LOAD_AVG_MAX;
1190
1191 /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
1192 do {
1193 contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
1194 contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
1195
1196 n -= LOAD_AVG_PERIOD;
1197 } while (n > LOAD_AVG_PERIOD);
1198
1199 contrib = decay_load(contrib, n);
1200 return contrib + runnable_avg_yN_sum[n];
971} 1201}
972#endif /* CONFIG_FAIR_GROUP_SCHED */ 1202
1203/*
1204 * We can represent the historical contribution to runnable average as the
1205 * coefficients of a geometric series. To do this we sub-divide our runnable
1206 * history into segments of approximately 1ms (1024us); label the segment that
1207 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
1208 *
1209 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
1210 * p0 p1 p2
1211 * (now) (~1ms ago) (~2ms ago)
1212 *
1213 * Let u_i denote the fraction of p_i that the entity was runnable.
1214 *
1215 * We then designate the fractions u_i as our co-efficients, yielding the
1216 * following representation of historical load:
1217 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
1218 *
1219 * We choose y based on the width of a reasonable scheduling period, fixing:
1220 * y^32 = 0.5
1221 *
1222 * This means that the contribution to load ~32ms ago (u_32) will be weighted
1223 * approximately half as much as the contribution to load within the last ms
1224 * (u_0).
1225 *
1226 * When a period "rolls over" and we have new u_0`, multiplying the previous
1227 * sum again by y is sufficient to update:
1228 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
1229 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
1230 */
1231static __always_inline int __update_entity_runnable_avg(u64 now,
1232 struct sched_avg *sa,
1233 int runnable)
1234{
1235 u64 delta, periods;
1236 u32 runnable_contrib;
1237 int delta_w, decayed = 0;
1238
1239 delta = now - sa->last_runnable_update;
1240 /*
1241 * This should only happen when time goes backwards, which it
1242 * unfortunately does during sched clock init when we swap over to TSC.
1243 */
1244 if ((s64)delta < 0) {
1245 sa->last_runnable_update = now;
1246 return 0;
1247 }
1248
1249 /*
1250 * Use 1024ns as the unit of measurement since it's a reasonable
1251 * approximation of 1us and fast to compute.
1252 */
1253 delta >>= 10;
1254 if (!delta)
1255 return 0;
1256 sa->last_runnable_update = now;
1257
1258 /* delta_w is the amount already accumulated against our next period */
1259 delta_w = sa->runnable_avg_period % 1024;
1260 if (delta + delta_w >= 1024) {
1261 /* period roll-over */
1262 decayed = 1;
1263
1264 /*
1265 * Now that we know we're crossing a period boundary, figure
1266 * out how much from delta we need to complete the current
1267 * period and accrue it.
1268 */
1269 delta_w = 1024 - delta_w;
1270 if (runnable)
1271 sa->runnable_avg_sum += delta_w;
1272 sa->runnable_avg_period += delta_w;
1273
1274 delta -= delta_w;
1275
1276 /* Figure out how many additional periods this update spans */
1277 periods = delta / 1024;
1278 delta %= 1024;
1279
1280 sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
1281 periods + 1);
1282 sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
1283 periods + 1);
1284
1285 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
1286 runnable_contrib = __compute_runnable_contrib(periods);
1287 if (runnable)
1288 sa->runnable_avg_sum += runnable_contrib;
1289 sa->runnable_avg_period += runnable_contrib;
1290 }
1291
1292 /* Remainder of delta accrued against u_0` */
1293 if (runnable)
1294 sa->runnable_avg_sum += delta;
1295 sa->runnable_avg_period += delta;
1296
1297 return decayed;
1298}
1299
1300/* Synchronize an entity's decay with its parent cfs_rq. */
1301static inline u64 __synchronize_entity_decay(struct sched_entity *se)
1302{
1303 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1304 u64 decays = atomic64_read(&cfs_rq->decay_counter);
1305
1306 decays -= se->avg.decay_count;
1307 if (!decays)
1308 return 0;
1309
1310 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
1311 se->avg.decay_count = 0;
1312
1313 return decays;
1314}
1315
1316#ifdef CONFIG_FAIR_GROUP_SCHED
1317static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1318 int force_update)
1319{
1320 struct task_group *tg = cfs_rq->tg;
1321 s64 tg_contrib;
1322
1323 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1324 tg_contrib -= cfs_rq->tg_load_contrib;
1325
1326 if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1327 atomic64_add(tg_contrib, &tg->load_avg);
1328 cfs_rq->tg_load_contrib += tg_contrib;
1329 }
1330}
1331
1332/*
1333 * Aggregate cfs_rq runnable averages into an equivalent task_group
1334 * representation for computing load contributions.
1335 */
1336static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1337 struct cfs_rq *cfs_rq)
1338{
1339 struct task_group *tg = cfs_rq->tg;
1340 long contrib;
1341
1342 /* The fraction of a cpu used by this cfs_rq */
1343 contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
1344 sa->runnable_avg_period + 1);
1345 contrib -= cfs_rq->tg_runnable_contrib;
1346
1347 if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
1348 atomic_add(contrib, &tg->runnable_avg);
1349 cfs_rq->tg_runnable_contrib += contrib;
1350 }
1351}
1352
1353static inline void __update_group_entity_contrib(struct sched_entity *se)
1354{
1355 struct cfs_rq *cfs_rq = group_cfs_rq(se);
1356 struct task_group *tg = cfs_rq->tg;
1357 int runnable_avg;
1358
1359 u64 contrib;
1360
1361 contrib = cfs_rq->tg_load_contrib * tg->shares;
1362 se->avg.load_avg_contrib = div64_u64(contrib,
1363 atomic64_read(&tg->load_avg) + 1);
1364
1365 /*
1366 * For group entities we need to compute a correction term in the case
1367 * that they are consuming <1 cpu so that we would contribute the same
1368 * load as a task of equal weight.
1369 *
1370 * Explicitly co-ordinating this measurement would be expensive, but
1371 * fortunately the sum of each cpus contribution forms a usable
1372 * lower-bound on the true value.
1373 *
1374 * Consider the aggregate of 2 contributions. Either they are disjoint
1375 * (and the sum represents true value) or they are not disjoint and we are
1376 * understating by the aggregate of their overlap.
1377 *
1378 * Extending this to N cpus, for a given overlap, the maximum amount we
1379 * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
1380 * cpus that overlap for this interval and w_i is the interval width.
1381 *
1382 * On a small machine; the first term is well-bounded which bounds the
1383 * total error since w_i is a subset of the period. Whereas on a
1384 * larger machine, while this first term can be larger, if w_i is of
1385 * consequential size we are guaranteed to see n_i*w_i quickly converge to
1386 * our upper bound of 1-cpu.
1387 */
1388 runnable_avg = atomic_read(&tg->runnable_avg);
1389 if (runnable_avg < NICE_0_LOAD) {
1390 se->avg.load_avg_contrib *= runnable_avg;
1391 se->avg.load_avg_contrib >>= NICE_0_SHIFT;
1392 }
1393}
1394#else
1395static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1396 int force_update) {}
1397static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1398 struct cfs_rq *cfs_rq) {}
1399static inline void __update_group_entity_contrib(struct sched_entity *se) {}
1400#endif
1401
1402static inline void __update_task_entity_contrib(struct sched_entity *se)
1403{
1404 u32 contrib;
1405
1406 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
1407 contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
1408 contrib /= (se->avg.runnable_avg_period + 1);
1409 se->avg.load_avg_contrib = scale_load(contrib);
1410}
1411
1412/* Compute the current contribution to load_avg by se, return any delta */
1413static long __update_entity_load_avg_contrib(struct sched_entity *se)
1414{
1415 long old_contrib = se->avg.load_avg_contrib;
1416
1417 if (entity_is_task(se)) {
1418 __update_task_entity_contrib(se);
1419 } else {
1420 __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
1421 __update_group_entity_contrib(se);
1422 }
1423
1424 return se->avg.load_avg_contrib - old_contrib;
1425}
1426
1427static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
1428 long load_contrib)
1429{
1430 if (likely(load_contrib < cfs_rq->blocked_load_avg))
1431 cfs_rq->blocked_load_avg -= load_contrib;
1432 else
1433 cfs_rq->blocked_load_avg = 0;
1434}
1435
1436static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
1437
1438/* Update a sched_entity's runnable average */
1439static inline void update_entity_load_avg(struct sched_entity *se,
1440 int update_cfs_rq)
1441{
1442 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1443 long contrib_delta;
1444 u64 now;
1445
1446 /*
1447 * For a group entity we need to use their owned cfs_rq_clock_task() in
1448 * case they are the parent of a throttled hierarchy.
1449 */
1450 if (entity_is_task(se))
1451 now = cfs_rq_clock_task(cfs_rq);
1452 else
1453 now = cfs_rq_clock_task(group_cfs_rq(se));
1454
1455 if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
1456 return;
1457
1458 contrib_delta = __update_entity_load_avg_contrib(se);
1459
1460 if (!update_cfs_rq)
1461 return;
1462
1463 if (se->on_rq)
1464 cfs_rq->runnable_load_avg += contrib_delta;
1465 else
1466 subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
1467}
1468
1469/*
1470 * Decay the load contributed by all blocked children and account this so that
1471 * their contribution may be appropriately discounted when they wake up.
1472 */
1473static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
1474{
1475 u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
1476 u64 decays;
1477
1478 decays = now - cfs_rq->last_decay;
1479 if (!decays && !force_update)
1480 return;
1481
1482 if (atomic64_read(&cfs_rq->removed_load)) {
1483 u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
1484 subtract_blocked_load_contrib(cfs_rq, removed_load);
1485 }
1486
1487 if (decays) {
1488 cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
1489 decays);
1490 atomic64_add(decays, &cfs_rq->decay_counter);
1491 cfs_rq->last_decay = now;
1492 }
1493
1494 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
1495}
1496
1497static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
1498{
1499 __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
1500 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
1501}
1502
1503/* Add the load generated by se into cfs_rq's child load-average */
1504static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1505 struct sched_entity *se,
1506 int wakeup)
1507{
1508 /*
1509 * We track migrations using entity decay_count <= 0, on a wake-up
1510 * migration we use a negative decay count to track the remote decays
1511 * accumulated while sleeping.
1512 */
1513 if (unlikely(se->avg.decay_count <= 0)) {
1514 se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
1515 if (se->avg.decay_count) {
1516 /*
1517 * In a wake-up migration we have to approximate the
1518 * time sleeping. This is because we can't synchronize
1519 * clock_task between the two cpus, and it is not
1520 * guaranteed to be read-safe. Instead, we can
1521 * approximate this using our carried decays, which are
1522 * explicitly atomically readable.
1523 */
1524 se->avg.last_runnable_update -= (-se->avg.decay_count)
1525 << 20;
1526 update_entity_load_avg(se, 0);
1527 /* Indicate that we're now synchronized and on-rq */
1528 se->avg.decay_count = 0;
1529 }
1530 wakeup = 0;
1531 } else {
1532 __synchronize_entity_decay(se);
1533 }
1534
1535 /* migrated tasks did not contribute to our blocked load */
1536 if (wakeup) {
1537 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
1538 update_entity_load_avg(se, 0);
1539 }
1540
1541 cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
1542 /* we force update consideration on load-balancer moves */
1543 update_cfs_rq_blocked_load(cfs_rq, !wakeup);
1544}
1545
1546/*
1547 * Remove se's load from this cfs_rq child load-average, if the entity is
1548 * transitioning to a blocked state we track its projected decay using
1549 * blocked_load_avg.
1550 */
1551static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1552 struct sched_entity *se,
1553 int sleep)
1554{
1555 update_entity_load_avg(se, 1);
1556 /* we force update consideration on load-balancer moves */
1557 update_cfs_rq_blocked_load(cfs_rq, !sleep);
1558
1559 cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
1560 if (sleep) {
1561 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
1562 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
1563 } /* migrations, e.g. sleep=0 leave decay_count == 0 */
1564}
1565#else
1566static inline void update_entity_load_avg(struct sched_entity *se,
1567 int update_cfs_rq) {}
1568static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
1569static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1570 struct sched_entity *se,
1571 int wakeup) {}
1572static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1573 struct sched_entity *se,
1574 int sleep) {}
1575static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
1576 int force_update) {}
1577#endif
973 1578
974static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 1579static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
975{ 1580{
@@ -1096,7 +1701,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1096 * Update run-time statistics of the 'current'. 1701 * Update run-time statistics of the 'current'.
1097 */ 1702 */
1098 update_curr(cfs_rq); 1703 update_curr(cfs_rq);
1099 update_cfs_load(cfs_rq, 0); 1704 enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
1100 account_entity_enqueue(cfs_rq, se); 1705 account_entity_enqueue(cfs_rq, se);
1101 update_cfs_shares(cfs_rq); 1706 update_cfs_shares(cfs_rq);
1102 1707
@@ -1171,6 +1776,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1171 * Update run-time statistics of the 'current'. 1776 * Update run-time statistics of the 'current'.
1172 */ 1777 */
1173 update_curr(cfs_rq); 1778 update_curr(cfs_rq);
1779 dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
1174 1780
1175 update_stats_dequeue(cfs_rq, se); 1781 update_stats_dequeue(cfs_rq, se);
1176 if (flags & DEQUEUE_SLEEP) { 1782 if (flags & DEQUEUE_SLEEP) {
@@ -1191,7 +1797,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1191 if (se != cfs_rq->curr) 1797 if (se != cfs_rq->curr)
1192 __dequeue_entity(cfs_rq, se); 1798 __dequeue_entity(cfs_rq, se);
1193 se->on_rq = 0; 1799 se->on_rq = 0;
1194 update_cfs_load(cfs_rq, 0);
1195 account_entity_dequeue(cfs_rq, se); 1800 account_entity_dequeue(cfs_rq, se);
1196 1801
1197 /* 1802 /*
@@ -1340,6 +1945,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1340 update_stats_wait_start(cfs_rq, prev); 1945 update_stats_wait_start(cfs_rq, prev);
1341 /* Put 'current' back into the tree. */ 1946 /* Put 'current' back into the tree. */
1342 __enqueue_entity(cfs_rq, prev); 1947 __enqueue_entity(cfs_rq, prev);
1948 /* in !on_rq case, update occurred at dequeue */
1949 update_entity_load_avg(prev, 1);
1343 } 1950 }
1344 cfs_rq->curr = NULL; 1951 cfs_rq->curr = NULL;
1345} 1952}
@@ -1353,9 +1960,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1353 update_curr(cfs_rq); 1960 update_curr(cfs_rq);
1354 1961
1355 /* 1962 /*
1356 * Update share accounting for long-running entities. 1963 * Ensure that runnable average is periodically updated.
1357 */ 1964 */
1358 update_entity_shares_tick(cfs_rq); 1965 update_entity_load_avg(curr, 1);
1966 update_cfs_rq_blocked_load(cfs_rq, 1);
1359 1967
1360#ifdef CONFIG_SCHED_HRTICK 1968#ifdef CONFIG_SCHED_HRTICK
1361 /* 1969 /*
@@ -1448,6 +2056,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
1448 return &tg->cfs_bandwidth; 2056 return &tg->cfs_bandwidth;
1449} 2057}
1450 2058
2059/* rq->clock_task normalized against any time this cfs_rq has spent throttled */
2060static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2061{
2062 if (unlikely(cfs_rq->throttle_count))
2063 return cfs_rq->throttled_clock_task;
2064
2065 return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
2066}
2067
1451/* returns 0 on failure to allocate runtime */ 2068/* returns 0 on failure to allocate runtime */
1452static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) 2069static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1453{ 2070{
@@ -1592,14 +2209,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
1592 cfs_rq->throttle_count--; 2209 cfs_rq->throttle_count--;
1593#ifdef CONFIG_SMP 2210#ifdef CONFIG_SMP
1594 if (!cfs_rq->throttle_count) { 2211 if (!cfs_rq->throttle_count) {
1595 u64 delta = rq->clock_task - cfs_rq->load_stamp; 2212 /* adjust cfs_rq_clock_task() */
1596 2213 cfs_rq->throttled_clock_task_time += rq->clock_task -
1597 /* leaving throttled state, advance shares averaging windows */ 2214 cfs_rq->throttled_clock_task;
1598 cfs_rq->load_stamp += delta;
1599 cfs_rq->load_last += delta;
1600
1601 /* update entity weight now that we are on_rq again */
1602 update_cfs_shares(cfs_rq);
1603 } 2215 }
1604#endif 2216#endif
1605 2217
@@ -1611,9 +2223,9 @@ static int tg_throttle_down(struct task_group *tg, void *data)
1611 struct rq *rq = data; 2223 struct rq *rq = data;
1612 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; 2224 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1613 2225
1614 /* group is entering throttled state, record last load */ 2226 /* group is entering throttled state, stop time */
1615 if (!cfs_rq->throttle_count) 2227 if (!cfs_rq->throttle_count)
1616 update_cfs_load(cfs_rq, 0); 2228 cfs_rq->throttled_clock_task = rq->clock_task;
1617 cfs_rq->throttle_count++; 2229 cfs_rq->throttle_count++;
1618 2230
1619 return 0; 2231 return 0;
@@ -1628,7 +2240,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1628 2240
1629 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; 2241 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1630 2242
1631 /* account load preceding throttle */ 2243 /* freeze hierarchy runnable averages while throttled */
1632 rcu_read_lock(); 2244 rcu_read_lock();
1633 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); 2245 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
1634 rcu_read_unlock(); 2246 rcu_read_unlock();
@@ -1652,7 +2264,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1652 rq->nr_running -= task_delta; 2264 rq->nr_running -= task_delta;
1653 2265
1654 cfs_rq->throttled = 1; 2266 cfs_rq->throttled = 1;
1655 cfs_rq->throttled_timestamp = rq->clock; 2267 cfs_rq->throttled_clock = rq->clock;
1656 raw_spin_lock(&cfs_b->lock); 2268 raw_spin_lock(&cfs_b->lock);
1657 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 2269 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
1658 raw_spin_unlock(&cfs_b->lock); 2270 raw_spin_unlock(&cfs_b->lock);
@@ -1670,10 +2282,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1670 2282
1671 cfs_rq->throttled = 0; 2283 cfs_rq->throttled = 0;
1672 raw_spin_lock(&cfs_b->lock); 2284 raw_spin_lock(&cfs_b->lock);
1673 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; 2285 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
1674 list_del_rcu(&cfs_rq->throttled_list); 2286 list_del_rcu(&cfs_rq->throttled_list);
1675 raw_spin_unlock(&cfs_b->lock); 2287 raw_spin_unlock(&cfs_b->lock);
1676 cfs_rq->throttled_timestamp = 0;
1677 2288
1678 update_rq_clock(rq); 2289 update_rq_clock(rq);
1679 /* update hierarchical throttle state */ 2290 /* update hierarchical throttle state */
@@ -2073,8 +2684,13 @@ static void unthrottle_offline_cfs_rqs(struct rq *rq)
2073} 2684}
2074 2685
2075#else /* CONFIG_CFS_BANDWIDTH */ 2686#else /* CONFIG_CFS_BANDWIDTH */
2076static __always_inline 2687static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2077void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} 2688{
2689 return rq_of(cfs_rq)->clock_task;
2690}
2691
2692static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2693 unsigned long delta_exec) {}
2078static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2694static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2079static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 2695static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2080static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2696static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -2207,12 +2823,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2207 if (cfs_rq_throttled(cfs_rq)) 2823 if (cfs_rq_throttled(cfs_rq))
2208 break; 2824 break;
2209 2825
2210 update_cfs_load(cfs_rq, 0);
2211 update_cfs_shares(cfs_rq); 2826 update_cfs_shares(cfs_rq);
2827 update_entity_load_avg(se, 1);
2212 } 2828 }
2213 2829
2214 if (!se) 2830 if (!se) {
2831 update_rq_runnable_avg(rq, rq->nr_running);
2215 inc_nr_running(rq); 2832 inc_nr_running(rq);
2833 }
2216 hrtick_update(rq); 2834 hrtick_update(rq);
2217} 2835}
2218 2836
@@ -2266,12 +2884,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2266 if (cfs_rq_throttled(cfs_rq)) 2884 if (cfs_rq_throttled(cfs_rq))
2267 break; 2885 break;
2268 2886
2269 update_cfs_load(cfs_rq, 0);
2270 update_cfs_shares(cfs_rq); 2887 update_cfs_shares(cfs_rq);
2888 update_entity_load_avg(se, 1);
2271 } 2889 }
2272 2890
2273 if (!se) 2891 if (!se) {
2274 dec_nr_running(rq); 2892 dec_nr_running(rq);
2893 update_rq_runnable_avg(rq, 1);
2894 }
2275 hrtick_update(rq); 2895 hrtick_update(rq);
2276} 2896}
2277 2897
@@ -2781,6 +3401,37 @@ unlock:
2781 3401
2782 return new_cpu; 3402 return new_cpu;
2783} 3403}
3404
3405/*
3406 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
3407 * removed when useful for applications beyond shares distribution (e.g.
3408 * load-balance).
3409 */
3410#ifdef CONFIG_FAIR_GROUP_SCHED
3411/*
3412 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
3413 * cfs_rq_of(p) references at time of call are still valid and identify the
3414 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
3415 * other assumptions, including the state of rq->lock, should be made.
3416 */
3417static void
3418migrate_task_rq_fair(struct task_struct *p, int next_cpu)
3419{
3420 struct sched_entity *se = &p->se;
3421 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3422
3423 /*
3424 * Load tracking: accumulate removed load so that it can be processed
3425 * when we next update owning cfs_rq under rq->lock. Tasks contribute
3426 * to blocked load iff they have a positive decay-count. It can never
3427 * be negative here since on-rq tasks have decay-count == 0.
3428 */
3429 if (se->avg.decay_count) {
3430 se->avg.decay_count = -__synchronize_entity_decay(se);
3431 atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
3432 }
3433}
3434#endif
2784#endif /* CONFIG_SMP */ 3435#endif /* CONFIG_SMP */
2785 3436
2786static unsigned long 3437static unsigned long
@@ -2907,7 +3558,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
2907 * Batch and idle tasks do not preempt non-idle tasks (their preemption 3558 * Batch and idle tasks do not preempt non-idle tasks (their preemption
2908 * is driven by the tick): 3559 * is driven by the tick):
2909 */ 3560 */
2910 if (unlikely(p->policy != SCHED_NORMAL)) 3561 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
2911 return; 3562 return;
2912 3563
2913 find_matching_se(&se, &pse); 3564 find_matching_se(&se, &pse);
@@ -3033,8 +3684,122 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
3033 3684
3034#ifdef CONFIG_SMP 3685#ifdef CONFIG_SMP
3035/************************************************** 3686/**************************************************
3036 * Fair scheduling class load-balancing methods: 3687 * Fair scheduling class load-balancing methods.
3037 */ 3688 *
3689 * BASICS
3690 *
3691 * The purpose of load-balancing is to achieve the same basic fairness the
3692 * per-cpu scheduler provides, namely provide a proportional amount of compute
3693 * time to each task. This is expressed in the following equation:
3694 *
3695 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
3696 *
3697 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
3698 * W_i,0 is defined as:
3699 *
3700 * W_i,0 = \Sum_j w_i,j (2)
3701 *
3702 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
3703 * is derived from the nice value as per prio_to_weight[].
3704 *
3705 * The weight average is an exponential decay average of the instantaneous
3706 * weight:
3707 *
3708 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
3709 *
3710 * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
3711 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
3712 * can also include other factors [XXX].
3713 *
3714 * To achieve this balance we define a measure of imbalance which follows
3715 * directly from (1):
3716 *
3717 * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
3718 *
3719 * We then move tasks around to minimize the imbalance. In the continuous
3720 * function space it is obvious this converges, in the discrete case we get
3721 * a few fun cases generally called infeasible weight scenarios.
3722 *
3723 * [XXX expand on:
3724 * - infeasible weights;
3725 * - local vs global optima in the discrete case. ]
3726 *
3727 *
3728 * SCHED DOMAINS
3729 *
3730 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
3731 * for all i,j solution, we create a tree of cpus that follows the hardware
3732 * topology where each level pairs two lower groups (or better). This results
3733 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
3734 * tree to only the first of the previous level and we decrease the frequency
3735 * of load-balance at each level inv. proportional to the number of cpus in
3736 * the groups.
3737 *
3738 * This yields:
3739 *
3740 * \Sum_{i=0}^{log_2 n} { 1/2^i * n/2^i * 2^i } = O(n) (5)
3741 *
3742 * Where, for each level i:
3743 * - \Sum_{i=0}^{log_2 n} is the sum over all levels
3744 * - 1/2^i is the freq
3745 * - n/2^i is the number of cpus doing load-balance
3746 * - 2^i is the size of each group
3747 *
3748 * Coupled with a limit on how many tasks we can migrate every balance pass,
3749 * this makes (5) the runtime complexity of the balancer.
3750 *
3751 * An important property here is that each CPU is still (indirectly) connected
3752 * to every other cpu in at most O(log n) steps:
3753 *
3754 * The adjacency matrix of the resulting graph is given by:
3755 *
3756 * A_i,j = \Union_{k=0}^{log_2 n}
3757 * ( (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) ) (6)
3758 *
3759 *
3760 * And you'll find that:
3761 *
3762 * A^(log_2 n)_i,j != 0 for all i,j (7)
3763 *
3764 * Showing there's indeed a path between every cpu in at most O(log n) steps.
3765 * The task movement gives a factor of O(m), giving a convergence complexity
3766 * of:
3767 *
3768 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
3769 *
3770 *
3771 * WORK CONSERVING
3772 *
3773 * In order to avoid CPUs going idle while there's still work to do, new idle
3774 * balancing is more aggressive and has the newly idle cpu iterate up the domain
3775 * tree itself instead of relying on other CPUs to bring it work.
3776 *
3777 * This adds some complexity to both (5) and (8) but it reduces the total idle
3778 * time.
3779 *
3780 * [XXX more?]
3781 *
3782 *
3783 * CGROUPS
3784 *
3785 * Cgroups make a horror show out of (2), instead of a simple sum we get:
3786 *
3787 *
3788 * W_i,0 = \Sum_j \Prod_k w_k * (s_k,i / S_k) (9)
3789 *
3790 *
3791 * Where
3792 *
3793 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
3794 *
3795 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
3796 *
3797 * The big problem is S_k, it's a global sum needed to compute a local (W_i)
3798 * property.
3799 *
3800 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
3801 * rewrite all of this once again.]
3802 */
3038 3803
3039static unsigned long __read_mostly max_load_balance_interval = HZ/10; 3804static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3040 3805
@@ -3300,52 +4065,58 @@ next:
3300/* 4065/*
3301 * update tg->load_weight by folding this cpu's load_avg 4066 * update tg->load_weight by folding this cpu's load_avg
3302 */ 4067 */
3303static int update_shares_cpu(struct task_group *tg, int cpu) 4068static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
3304{ 4069{
3305 struct cfs_rq *cfs_rq; 4070 struct sched_entity *se = tg->se[cpu];
3306 unsigned long flags; 4071 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
3307 struct rq *rq;
3308
3309 if (!tg->se[cpu])
3310 return 0;
3311
3312 rq = cpu_rq(cpu);
3313 cfs_rq = tg->cfs_rq[cpu];
3314
3315 raw_spin_lock_irqsave(&rq->lock, flags);
3316
3317 update_rq_clock(rq);
3318 update_cfs_load(cfs_rq, 1);
3319 4072
3320 /* 4073 /* throttled entities do not contribute to load */
3321 * We need to update shares after updating tg->load_weight in 4074 if (throttled_hierarchy(cfs_rq))
3322 * order to adjust the weight of groups with long running tasks. 4075 return;
3323 */
3324 update_cfs_shares(cfs_rq);
3325 4076
3326 raw_spin_unlock_irqrestore(&rq->lock, flags); 4077 update_cfs_rq_blocked_load(cfs_rq, 1);
3327 4078
3328 return 0; 4079 if (se) {
4080 update_entity_load_avg(se, 1);
4081 /*
4082 * We pivot on our runnable average having decayed to zero for
4083 * list removal. This generally implies that all our children
4084 * have also been removed (modulo rounding error or bandwidth
4085 * control); however, such cases are rare and we can fix these
4086 * at enqueue.
4087 *
4088 * TODO: fix up out-of-order children on enqueue.
4089 */
4090 if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
4091 list_del_leaf_cfs_rq(cfs_rq);
4092 } else {
4093 struct rq *rq = rq_of(cfs_rq);
4094 update_rq_runnable_avg(rq, rq->nr_running);
4095 }
3329} 4096}
3330 4097
3331static void update_shares(int cpu) 4098static void update_blocked_averages(int cpu)
3332{ 4099{
3333 struct cfs_rq *cfs_rq;
3334 struct rq *rq = cpu_rq(cpu); 4100 struct rq *rq = cpu_rq(cpu);
4101 struct cfs_rq *cfs_rq;
4102 unsigned long flags;
3335 4103
3336 rcu_read_lock(); 4104 raw_spin_lock_irqsave(&rq->lock, flags);
4105 update_rq_clock(rq);
3337 /* 4106 /*
3338 * Iterates the task_group tree in a bottom up fashion, see 4107 * Iterates the task_group tree in a bottom up fashion, see
3339 * list_add_leaf_cfs_rq() for details. 4108 * list_add_leaf_cfs_rq() for details.
3340 */ 4109 */
3341 for_each_leaf_cfs_rq(rq, cfs_rq) { 4110 for_each_leaf_cfs_rq(rq, cfs_rq) {
3342 /* throttled entities do not contribute to load */ 4111 /*
3343 if (throttled_hierarchy(cfs_rq)) 4112 * Note: We may want to consider periodically releasing
3344 continue; 4113 * rq->lock about these updates so that creating many task
3345 4114 * groups does not result in continually extending hold time.
3346 update_shares_cpu(cfs_rq->tg, cpu); 4115 */
4116 __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
3347 } 4117 }
3348 rcu_read_unlock(); 4118
4119 raw_spin_unlock_irqrestore(&rq->lock, flags);
3349} 4120}
3350 4121
3351/* 4122/*
@@ -3397,7 +4168,7 @@ static unsigned long task_h_load(struct task_struct *p)
3397 return load; 4168 return load;
3398} 4169}
3399#else 4170#else
3400static inline void update_shares(int cpu) 4171static inline void update_blocked_averages(int cpu)
3401{ 4172{
3402} 4173}
3403 4174
@@ -4457,12 +5228,14 @@ void idle_balance(int this_cpu, struct rq *this_rq)
4457 if (this_rq->avg_idle < sysctl_sched_migration_cost) 5228 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4458 return; 5229 return;
4459 5230
5231 update_rq_runnable_avg(this_rq, 1);
5232
4460 /* 5233 /*
4461 * Drop the rq->lock, but keep IRQ/preempt disabled. 5234 * Drop the rq->lock, but keep IRQ/preempt disabled.
4462 */ 5235 */
4463 raw_spin_unlock(&this_rq->lock); 5236 raw_spin_unlock(&this_rq->lock);
4464 5237
4465 update_shares(this_cpu); 5238 update_blocked_averages(this_cpu);
4466 rcu_read_lock(); 5239 rcu_read_lock();
4467 for_each_domain(this_cpu, sd) { 5240 for_each_domain(this_cpu, sd) {
4468 unsigned long interval; 5241 unsigned long interval;
@@ -4717,7 +5490,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4717 int update_next_balance = 0; 5490 int update_next_balance = 0;
4718 int need_serialize; 5491 int need_serialize;
4719 5492
4720 update_shares(cpu); 5493 update_blocked_averages(cpu);
4721 5494
4722 rcu_read_lock(); 5495 rcu_read_lock();
4723 for_each_domain(cpu, sd) { 5496 for_each_domain(cpu, sd) {
@@ -4954,6 +5727,11 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
4954 cfs_rq = cfs_rq_of(se); 5727 cfs_rq = cfs_rq_of(se);
4955 entity_tick(cfs_rq, se, queued); 5728 entity_tick(cfs_rq, se, queued);
4956 } 5729 }
5730
5731 if (sched_feat_numa(NUMA))
5732 task_tick_numa(rq, curr);
5733
5734 update_rq_runnable_avg(rq, 1);
4957} 5735}
4958 5736
4959/* 5737/*
@@ -5046,6 +5824,20 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
5046 place_entity(cfs_rq, se, 0); 5824 place_entity(cfs_rq, se, 0);
5047 se->vruntime -= cfs_rq->min_vruntime; 5825 se->vruntime -= cfs_rq->min_vruntime;
5048 } 5826 }
5827
5828#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5829 /*
5830 * Remove our load from contribution when we leave sched_fair
5831 * and ensure we don't carry in an old decay_count if we
5832 * switch back.
5833 */
5834 if (p->se.avg.decay_count) {
5835 struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
5836 __synchronize_entity_decay(&p->se);
5837 subtract_blocked_load_contrib(cfs_rq,
5838 p->se.avg.load_avg_contrib);
5839 }
5840#endif
5049} 5841}
5050 5842
5051/* 5843/*
@@ -5092,11 +5884,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
5092#ifndef CONFIG_64BIT 5884#ifndef CONFIG_64BIT
5093 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 5885 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5094#endif 5886#endif
5887#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5888 atomic64_set(&cfs_rq->decay_counter, 1);
5889 atomic64_set(&cfs_rq->removed_load, 0);
5890#endif
5095} 5891}
5096 5892
5097#ifdef CONFIG_FAIR_GROUP_SCHED 5893#ifdef CONFIG_FAIR_GROUP_SCHED
5098static void task_move_group_fair(struct task_struct *p, int on_rq) 5894static void task_move_group_fair(struct task_struct *p, int on_rq)
5099{ 5895{
5896 struct cfs_rq *cfs_rq;
5100 /* 5897 /*
5101 * If the task was not on the rq at the time of this cgroup movement 5898 * If the task was not on the rq at the time of this cgroup movement
5102 * it must have been asleep, sleeping tasks keep their ->vruntime 5899 * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -5128,8 +5925,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
5128 if (!on_rq) 5925 if (!on_rq)
5129 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; 5926 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
5130 set_task_rq(p, task_cpu(p)); 5927 set_task_rq(p, task_cpu(p));
5131 if (!on_rq) 5928 if (!on_rq) {
5132 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; 5929 cfs_rq = cfs_rq_of(&p->se);
5930 p->se.vruntime += cfs_rq->min_vruntime;
5931#ifdef CONFIG_SMP
5932 /*
5933 * migrate_task_rq_fair() will have removed our previous
5934 * contribution, but we must synchronize for ongoing future
5935 * decay.
5936 */
5937 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
5938 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
5939#endif
5940 }
5133} 5941}
5134 5942
5135void free_fair_sched_group(struct task_group *tg) 5943void free_fair_sched_group(struct task_group *tg)
@@ -5214,10 +6022,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
5214 6022
5215 cfs_rq->tg = tg; 6023 cfs_rq->tg = tg;
5216 cfs_rq->rq = rq; 6024 cfs_rq->rq = rq;
5217#ifdef CONFIG_SMP
5218 /* allow initial update_cfs_load() to truncate */
5219 cfs_rq->load_stamp = 1;
5220#endif
5221 init_cfs_rq_runtime(cfs_rq); 6025 init_cfs_rq_runtime(cfs_rq);
5222 6026
5223 tg->cfs_rq[cpu] = cfs_rq; 6027 tg->cfs_rq[cpu] = cfs_rq;
@@ -5319,7 +6123,9 @@ const struct sched_class fair_sched_class = {
5319 6123
5320#ifdef CONFIG_SMP 6124#ifdef CONFIG_SMP
5321 .select_task_rq = select_task_rq_fair, 6125 .select_task_rq = select_task_rq_fair,
5322 6126#ifdef CONFIG_FAIR_GROUP_SCHED
6127 .migrate_task_rq = migrate_task_rq_fair,
6128#endif
5323 .rq_online = rq_online_fair, 6129 .rq_online = rq_online_fair,
5324 .rq_offline = rq_offline_fair, 6130 .rq_offline = rq_offline_fair,
5325 6131
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index eebefcad7027..1ad1d2b5395f 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -32,6 +32,11 @@ SCHED_FEAT(LAST_BUDDY, true)
32SCHED_FEAT(CACHE_HOT_BUDDY, true) 32SCHED_FEAT(CACHE_HOT_BUDDY, true)
33 33
34/* 34/*
35 * Allow wakeup-time preemption of the current task:
36 */
37SCHED_FEAT(WAKEUP_PREEMPTION, true)
38
39/*
35 * Use arch dependent cpu power functions 40 * Use arch dependent cpu power functions
36 */ 41 */
37SCHED_FEAT(ARCH_POWER, true) 42SCHED_FEAT(ARCH_POWER, true)
@@ -61,3 +66,14 @@ SCHED_FEAT(TTWU_QUEUE, true)
61SCHED_FEAT(FORCE_SD_OVERLAP, false) 66SCHED_FEAT(FORCE_SD_OVERLAP, false)
62SCHED_FEAT(RT_RUNTIME_SHARE, true) 67SCHED_FEAT(RT_RUNTIME_SHARE, true)
63SCHED_FEAT(LB_MIN, false) 68SCHED_FEAT(LB_MIN, false)
69
70/*
71 * Apply the automatic NUMA scheduling policy. Enabled automatically
72 * at runtime if running on a NUMA machine. Can be controlled via
73 * numa_balancing=. Allow PTE scanning to be forced on UMA machines
74 * for debugging the core machinery.
75 */
76#ifdef CONFIG_NUMA_BALANCING
77SCHED_FEAT(NUMA, false)
78SCHED_FEAT(NUMA_FORCE, false)
79#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7a7db09cfabc..fc886441436a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -112,6 +112,8 @@ struct task_group {
112 unsigned long shares; 112 unsigned long shares;
113 113
114 atomic_t load_weight; 114 atomic_t load_weight;
115 atomic64_t load_avg;
116 atomic_t runnable_avg;
115#endif 117#endif
116 118
117#ifdef CONFIG_RT_GROUP_SCHED 119#ifdef CONFIG_RT_GROUP_SCHED
@@ -222,22 +224,29 @@ struct cfs_rq {
222 unsigned int nr_spread_over; 224 unsigned int nr_spread_over;
223#endif 225#endif
224 226
227#ifdef CONFIG_SMP
228/*
229 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
230 * removed when useful for applications beyond shares distribution (e.g.
231 * load-balance).
232 */
225#ifdef CONFIG_FAIR_GROUP_SCHED 233#ifdef CONFIG_FAIR_GROUP_SCHED
226 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
227
228 /* 234 /*
229 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 235 * CFS Load tracking
230 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 236 * Under CFS, load is tracked on a per-entity basis and aggregated up.
231 * (like users, containers etc.) 237 * This allows for the description of both thread and group usage (in
232 * 238 * the FAIR_GROUP_SCHED case).
233 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
234 * list is used during load balance.
235 */ 239 */
236 int on_list; 240 u64 runnable_load_avg, blocked_load_avg;
237 struct list_head leaf_cfs_rq_list; 241 atomic64_t decay_counter, removed_load;
238 struct task_group *tg; /* group that "owns" this runqueue */ 242 u64 last_decay;
243#endif /* CONFIG_FAIR_GROUP_SCHED */
244/* These always depend on CONFIG_FAIR_GROUP_SCHED */
245#ifdef CONFIG_FAIR_GROUP_SCHED
246 u32 tg_runnable_contrib;
247 u64 tg_load_contrib;
248#endif /* CONFIG_FAIR_GROUP_SCHED */
239 249
240#ifdef CONFIG_SMP
241 /* 250 /*
242 * h_load = weight * f(tg) 251 * h_load = weight * f(tg)
243 * 252 *
@@ -245,26 +254,30 @@ struct cfs_rq {
245 * this group. 254 * this group.
246 */ 255 */
247 unsigned long h_load; 256 unsigned long h_load;
257#endif /* CONFIG_SMP */
258
259#ifdef CONFIG_FAIR_GROUP_SCHED
260 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
248 261
249 /* 262 /*
250 * Maintaining per-cpu shares distribution for group scheduling 263 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
264 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
265 * (like users, containers etc.)
251 * 266 *
252 * load_stamp is the last time we updated the load average 267 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
253 * load_last is the last time we updated the load average and saw load 268 * list is used during load balance.
254 * load_unacc_exec_time is currently unaccounted execution time
255 */ 269 */
256 u64 load_avg; 270 int on_list;
257 u64 load_period; 271 struct list_head leaf_cfs_rq_list;
258 u64 load_stamp, load_last, load_unacc_exec_time; 272 struct task_group *tg; /* group that "owns" this runqueue */
259 273
260 unsigned long load_contribution;
261#endif /* CONFIG_SMP */
262#ifdef CONFIG_CFS_BANDWIDTH 274#ifdef CONFIG_CFS_BANDWIDTH
263 int runtime_enabled; 275 int runtime_enabled;
264 u64 runtime_expires; 276 u64 runtime_expires;
265 s64 runtime_remaining; 277 s64 runtime_remaining;
266 278
267 u64 throttled_timestamp; 279 u64 throttled_clock, throttled_clock_task;
280 u64 throttled_clock_task_time;
268 int throttled, throttle_count; 281 int throttled, throttle_count;
269 struct list_head throttled_list; 282 struct list_head throttled_list;
270#endif /* CONFIG_CFS_BANDWIDTH */ 283#endif /* CONFIG_CFS_BANDWIDTH */
@@ -467,6 +480,8 @@ struct rq {
467#ifdef CONFIG_SMP 480#ifdef CONFIG_SMP
468 struct llist_head wake_list; 481 struct llist_head wake_list;
469#endif 482#endif
483
484 struct sched_avg avg;
470}; 485};
471 486
472static inline int cpu_of(struct rq *rq) 487static inline int cpu_of(struct rq *rq)
@@ -648,6 +663,18 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
648#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) 663#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
649#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ 664#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
650 665
666#ifdef CONFIG_NUMA_BALANCING
667#define sched_feat_numa(x) sched_feat(x)
668#ifdef CONFIG_SCHED_DEBUG
669#define numabalancing_enabled sched_feat_numa(NUMA)
670#else
671extern bool numabalancing_enabled;
672#endif /* CONFIG_SCHED_DEBUG */
673#else
674#define sched_feat_numa(x) (0)
675#define numabalancing_enabled (0)
676#endif /* CONFIG_NUMA_BALANCING */
677
651static inline u64 global_rt_period(void) 678static inline u64 global_rt_period(void)
652{ 679{
653 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; 680 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
@@ -1212,4 +1239,3 @@ static inline u64 irq_time_read(int cpu)
1212} 1239}
1213#endif /* CONFIG_64BIT */ 1240#endif /* CONFIG_64BIT */
1214#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 1241#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1215
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index ee376beedaf9..5af44b593770 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -396,25 +396,29 @@ int __secure_computing(int this_syscall)
396#ifdef CONFIG_SECCOMP_FILTER 396#ifdef CONFIG_SECCOMP_FILTER
397 case SECCOMP_MODE_FILTER: { 397 case SECCOMP_MODE_FILTER: {
398 int data; 398 int data;
399 struct pt_regs *regs = task_pt_regs(current);
399 ret = seccomp_run_filters(this_syscall); 400 ret = seccomp_run_filters(this_syscall);
400 data = ret & SECCOMP_RET_DATA; 401 data = ret & SECCOMP_RET_DATA;
401 ret &= SECCOMP_RET_ACTION; 402 ret &= SECCOMP_RET_ACTION;
402 switch (ret) { 403 switch (ret) {
403 case SECCOMP_RET_ERRNO: 404 case SECCOMP_RET_ERRNO:
404 /* Set the low-order 16-bits as an errno. */ 405 /* Set the low-order 16-bits as an errno. */
405 syscall_set_return_value(current, task_pt_regs(current), 406 syscall_set_return_value(current, regs,
406 -data, 0); 407 -data, 0);
407 goto skip; 408 goto skip;
408 case SECCOMP_RET_TRAP: 409 case SECCOMP_RET_TRAP:
409 /* Show the handler the original registers. */ 410 /* Show the handler the original registers. */
410 syscall_rollback(current, task_pt_regs(current)); 411 syscall_rollback(current, regs);
411 /* Let the filter pass back 16 bits of data. */ 412 /* Let the filter pass back 16 bits of data. */
412 seccomp_send_sigsys(this_syscall, data); 413 seccomp_send_sigsys(this_syscall, data);
413 goto skip; 414 goto skip;
414 case SECCOMP_RET_TRACE: 415 case SECCOMP_RET_TRACE:
415 /* Skip these calls if there is no tracer. */ 416 /* Skip these calls if there is no tracer. */
416 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) 417 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
418 syscall_set_return_value(current, regs,
419 -ENOSYS, 0);
417 goto skip; 420 goto skip;
421 }
418 /* Allow the BPF to provide the event message */ 422 /* Allow the BPF to provide the event message */
419 ptrace_event(PTRACE_EVENT_SECCOMP, data); 423 ptrace_event(PTRACE_EVENT_SECCOMP, data);
420 /* 424 /*
@@ -425,6 +429,9 @@ int __secure_computing(int this_syscall)
425 */ 429 */
426 if (fatal_signal_pending(current)) 430 if (fatal_signal_pending(current))
427 break; 431 break;
432 if (syscall_get_nr(current, regs) < 0)
433 goto skip; /* Explicit request to skip. */
434
428 return 0; 435 return 0;
429 case SECCOMP_RET_ALLOW: 436 case SECCOMP_RET_ALLOW:
430 return 0; 437 return 0;
diff --git a/kernel/signal.c b/kernel/signal.c
index 0af8868525d6..372771e948c2 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -31,6 +31,7 @@
31#include <linux/nsproxy.h> 31#include <linux/nsproxy.h>
32#include <linux/user_namespace.h> 32#include <linux/user_namespace.h>
33#include <linux/uprobes.h> 33#include <linux/uprobes.h>
34#include <linux/compat.h>
34#define CREATE_TRACE_POINTS 35#define CREATE_TRACE_POINTS
35#include <trace/events/signal.h> 36#include <trace/events/signal.h>
36 37
@@ -1159,8 +1160,9 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
1159 return __send_signal(sig, info, t, group, from_ancestor_ns); 1160 return __send_signal(sig, info, t, group, from_ancestor_ns);
1160} 1161}
1161 1162
1162static void print_fatal_signal(struct pt_regs *regs, int signr) 1163static void print_fatal_signal(int signr)
1163{ 1164{
1165 struct pt_regs *regs = signal_pt_regs();
1164 printk("%s/%d: potentially unexpected fatal signal %d.\n", 1166 printk("%s/%d: potentially unexpected fatal signal %d.\n",
1165 current->comm, task_pid_nr(current), signr); 1167 current->comm, task_pid_nr(current), signr);
1166 1168
@@ -1752,7 +1754,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1752 * see comment in do_notify_parent() about the following 4 lines 1754 * see comment in do_notify_parent() about the following 4 lines
1753 */ 1755 */
1754 rcu_read_lock(); 1756 rcu_read_lock();
1755 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); 1757 info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent));
1756 info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); 1758 info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
1757 rcu_read_unlock(); 1759 rcu_read_unlock();
1758 1760
@@ -1908,7 +1910,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1908 preempt_disable(); 1910 preempt_disable();
1909 read_unlock(&tasklist_lock); 1911 read_unlock(&tasklist_lock);
1910 preempt_enable_no_resched(); 1912 preempt_enable_no_resched();
1911 schedule(); 1913 freezable_schedule();
1912 } else { 1914 } else {
1913 /* 1915 /*
1914 * By the time we got the lock, our tracer went away. 1916 * By the time we got the lock, our tracer went away.
@@ -1930,13 +1932,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1930 } 1932 }
1931 1933
1932 /* 1934 /*
1933 * While in TASK_TRACED, we were considered "frozen enough".
1934 * Now that we woke up, it's crucial if we're supposed to be
1935 * frozen that we freeze now before running anything substantial.
1936 */
1937 try_to_freeze();
1938
1939 /*
1940 * We are back. Now reacquire the siglock before touching 1935 * We are back. Now reacquire the siglock before touching
1941 * last_siginfo, so that we are sure to have synchronized with 1936 * last_siginfo, so that we are sure to have synchronized with
1942 * any signal-sending on another CPU that wants to examine it. 1937 * any signal-sending on another CPU that wants to examine it.
@@ -2092,7 +2087,7 @@ static bool do_signal_stop(int signr)
2092 } 2087 }
2093 2088
2094 /* Now we don't run again until woken by SIGCONT or SIGKILL */ 2089 /* Now we don't run again until woken by SIGCONT or SIGKILL */
2095 schedule(); 2090 freezable_schedule();
2096 return true; 2091 return true;
2097 } else { 2092 } else {
2098 /* 2093 /*
@@ -2138,10 +2133,9 @@ static void do_jobctl_trap(void)
2138 } 2133 }
2139} 2134}
2140 2135
2141static int ptrace_signal(int signr, siginfo_t *info, 2136static int ptrace_signal(int signr, siginfo_t *info)
2142 struct pt_regs *regs, void *cookie)
2143{ 2137{
2144 ptrace_signal_deliver(regs, cookie); 2138 ptrace_signal_deliver();
2145 /* 2139 /*
2146 * We do not check sig_kernel_stop(signr) but set this marker 2140 * We do not check sig_kernel_stop(signr) but set this marker
2147 * unconditionally because we do not know whether debugger will 2141 * unconditionally because we do not know whether debugger will
@@ -2200,15 +2194,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
2200 if (unlikely(uprobe_deny_signal())) 2194 if (unlikely(uprobe_deny_signal()))
2201 return 0; 2195 return 0;
2202 2196
2203relock:
2204 /* 2197 /*
2205 * We'll jump back here after any time we were stopped in TASK_STOPPED. 2198 * Do this once, we can't return to user-mode if freezing() == T.
2206 * While in TASK_STOPPED, we were considered "frozen enough". 2199 * do_signal_stop() and ptrace_stop() do freezable_schedule() and
2207 * Now that we woke up, it's crucial if we're supposed to be 2200 * thus do not need another check after return.
2208 * frozen that we freeze now before running anything substantial.
2209 */ 2201 */
2210 try_to_freeze(); 2202 try_to_freeze();
2211 2203
2204relock:
2212 spin_lock_irq(&sighand->siglock); 2205 spin_lock_irq(&sighand->siglock);
2213 /* 2206 /*
2214 * Every stopped thread goes here after wakeup. Check to see if 2207 * Every stopped thread goes here after wakeup. Check to see if
@@ -2265,8 +2258,7 @@ relock:
2265 break; /* will return 0 */ 2258 break; /* will return 0 */
2266 2259
2267 if (unlikely(current->ptrace) && signr != SIGKILL) { 2260 if (unlikely(current->ptrace) && signr != SIGKILL) {
2268 signr = ptrace_signal(signr, info, 2261 signr = ptrace_signal(signr, info);
2269 regs, cookie);
2270 if (!signr) 2262 if (!signr)
2271 continue; 2263 continue;
2272 } 2264 }
@@ -2351,7 +2343,7 @@ relock:
2351 2343
2352 if (sig_kernel_coredump(signr)) { 2344 if (sig_kernel_coredump(signr)) {
2353 if (print_fatal_signals) 2345 if (print_fatal_signals)
2354 print_fatal_signal(regs, info->si_signo); 2346 print_fatal_signal(info->si_signo);
2355 /* 2347 /*
2356 * If it was able to dump core, this kills all 2348 * If it was able to dump core, this kills all
2357 * other threads in the group and synchronizes with 2349 * other threads in the group and synchronizes with
@@ -2360,7 +2352,7 @@ relock:
2360 * first and our do_group_exit call below will use 2352 * first and our do_group_exit call below will use
2361 * that value and ignore the one we pass it. 2353 * that value and ignore the one we pass it.
2362 */ 2354 */
2363 do_coredump(info, regs); 2355 do_coredump(info);
2364 } 2356 }
2365 2357
2366 /* 2358 /*
@@ -2536,11 +2528,8 @@ static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
2536 */ 2528 */
2537void set_current_blocked(sigset_t *newset) 2529void set_current_blocked(sigset_t *newset)
2538{ 2530{
2539 struct task_struct *tsk = current;
2540 sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP)); 2531 sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP));
2541 spin_lock_irq(&tsk->sighand->siglock); 2532 __set_current_blocked(newset);
2542 __set_task_blocked(tsk, newset);
2543 spin_unlock_irq(&tsk->sighand->siglock);
2544} 2533}
2545 2534
2546void __set_current_blocked(const sigset_t *newset) 2535void __set_current_blocked(const sigset_t *newset)
@@ -3103,6 +3092,79 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
3103out: 3092out:
3104 return error; 3093 return error;
3105} 3094}
3095#ifdef CONFIG_GENERIC_SIGALTSTACK
3096SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
3097{
3098 return do_sigaltstack(uss, uoss, current_user_stack_pointer());
3099}
3100#endif
3101
3102int restore_altstack(const stack_t __user *uss)
3103{
3104 int err = do_sigaltstack(uss, NULL, current_user_stack_pointer());
3105 /* squash all but EFAULT for now */
3106 return err == -EFAULT ? err : 0;
3107}
3108
3109int __save_altstack(stack_t __user *uss, unsigned long sp)
3110{
3111 struct task_struct *t = current;
3112 return __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) |
3113 __put_user(sas_ss_flags(sp), &uss->ss_flags) |
3114 __put_user(t->sas_ss_size, &uss->ss_size);
3115}
3116
3117#ifdef CONFIG_COMPAT
3118#ifdef CONFIG_GENERIC_SIGALTSTACK
3119asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr,
3120 compat_stack_t __user *uoss_ptr)
3121{
3122 stack_t uss, uoss;
3123 int ret;
3124 mm_segment_t seg;
3125
3126 if (uss_ptr) {
3127 compat_stack_t uss32;
3128
3129 memset(&uss, 0, sizeof(stack_t));
3130 if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t)))
3131 return -EFAULT;
3132 uss.ss_sp = compat_ptr(uss32.ss_sp);
3133 uss.ss_flags = uss32.ss_flags;
3134 uss.ss_size = uss32.ss_size;
3135 }
3136 seg = get_fs();
3137 set_fs(KERNEL_DS);
3138 ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL),
3139 (stack_t __force __user *) &uoss,
3140 compat_user_stack_pointer());
3141 set_fs(seg);
3142 if (ret >= 0 && uoss_ptr) {
3143 if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) ||
3144 __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) ||
3145 __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) ||
3146 __put_user(uoss.ss_size, &uoss_ptr->ss_size))
3147 ret = -EFAULT;
3148 }
3149 return ret;
3150}
3151
3152int compat_restore_altstack(const compat_stack_t __user *uss)
3153{
3154 int err = compat_sys_sigaltstack(uss, NULL);
3155 /* squash all but -EFAULT for now */
3156 return err == -EFAULT ? err : 0;
3157}
3158
3159int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
3160{
3161 struct task_struct *t = current;
3162 return __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) |
3163 __put_user(sas_ss_flags(sp), &uss->ss_flags) |
3164 __put_user(t->sas_ss_size, &uss->ss_size);
3165}
3166#endif
3167#endif
3106 3168
3107#ifdef __ARCH_WANT_SYS_SIGPENDING 3169#ifdef __ARCH_WANT_SYS_SIGPENDING
3108 3170
@@ -3139,7 +3201,6 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
3139 if (nset) { 3201 if (nset) {
3140 if (copy_from_user(&new_set, nset, sizeof(*nset))) 3202 if (copy_from_user(&new_set, nset, sizeof(*nset)))
3141 return -EFAULT; 3203 return -EFAULT;
3142 new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
3143 3204
3144 new_blocked = current->blocked; 3205 new_blocked = current->blocked;
3145 3206
@@ -3157,7 +3218,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
3157 return -EINVAL; 3218 return -EINVAL;
3158 } 3219 }
3159 3220
3160 __set_current_blocked(&new_blocked); 3221 set_current_blocked(&new_blocked);
3161 } 3222 }
3162 3223
3163 if (oset) { 3224 if (oset) {
@@ -3221,6 +3282,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask)
3221 int old = current->blocked.sig[0]; 3282 int old = current->blocked.sig[0];
3222 sigset_t newset; 3283 sigset_t newset;
3223 3284
3285 siginitset(&newset, newmask);
3224 set_current_blocked(&newset); 3286 set_current_blocked(&newset);
3225 3287
3226 return old; 3288 return old;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index cc96bdc0c2c9..ed567babe789 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void)
221 current->flags &= ~PF_MEMALLOC; 221 current->flags &= ~PF_MEMALLOC;
222 222
223 pending = local_softirq_pending(); 223 pending = local_softirq_pending();
224 vtime_account(current); 224 vtime_account_irq_enter(current);
225 225
226 __local_bh_disable((unsigned long)__builtin_return_address(0), 226 __local_bh_disable((unsigned long)__builtin_return_address(0),
227 SOFTIRQ_OFFSET); 227 SOFTIRQ_OFFSET);
@@ -272,7 +272,7 @@ restart:
272 272
273 lockdep_softirq_exit(); 273 lockdep_softirq_exit();
274 274
275 vtime_account(current); 275 vtime_account_irq_exit(current);
276 __local_bh_enable(SOFTIRQ_OFFSET); 276 __local_bh_enable(SOFTIRQ_OFFSET);
277 tsk_restore_flags(current, old_flags, PF_MEMALLOC); 277 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
278} 278}
@@ -341,7 +341,7 @@ static inline void invoke_softirq(void)
341 */ 341 */
342void irq_exit(void) 342void irq_exit(void)
343{ 343{
344 vtime_account(current); 344 vtime_account_irq_exit(current);
345 trace_hardirq_exit(); 345 trace_hardirq_exit();
346 sub_preempt_count(IRQ_EXIT_OFFSET); 346 sub_preempt_count(IRQ_EXIT_OFFSET);
347 if (!in_interrupt() && local_softirq_pending()) 347 if (!in_interrupt() && local_softirq_pending())
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 97c465ebd844..2b859828cdc3 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -16,8 +16,10 @@
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 * 17 *
18 * Copyright (C) IBM Corporation, 2006 18 * Copyright (C) IBM Corporation, 2006
19 * Copyright (C) Fujitsu, 2012
19 * 20 *
20 * Author: Paul McKenney <paulmck@us.ibm.com> 21 * Author: Paul McKenney <paulmck@us.ibm.com>
22 * Lai Jiangshan <laijs@cn.fujitsu.com>
21 * 23 *
22 * For detailed explanation of Read-Copy Update mechanism see - 24 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU/ *.txt 25 * Documentation/RCU/ *.txt
@@ -34,6 +36,10 @@
34#include <linux/delay.h> 36#include <linux/delay.h>
35#include <linux/srcu.h> 37#include <linux/srcu.h>
36 38
39#include <trace/events/rcu.h>
40
41#include "rcu.h"
42
37/* 43/*
38 * Initialize an rcu_batch structure to empty. 44 * Initialize an rcu_batch structure to empty.
39 */ 45 */
@@ -92,9 +98,6 @@ static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
92 } 98 }
93} 99}
94 100
95/* single-thread state-machine */
96static void process_srcu(struct work_struct *work);
97
98static int init_srcu_struct_fields(struct srcu_struct *sp) 101static int init_srcu_struct_fields(struct srcu_struct *sp)
99{ 102{
100 sp->completed = 0; 103 sp->completed = 0;
@@ -464,7 +467,9 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
464 */ 467 */
465void synchronize_srcu(struct srcu_struct *sp) 468void synchronize_srcu(struct srcu_struct *sp)
466{ 469{
467 __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT); 470 __synchronize_srcu(sp, rcu_expedited
471 ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
472 : SYNCHRONIZE_SRCU_TRYCOUNT);
468} 473}
469EXPORT_SYMBOL_GPL(synchronize_srcu); 474EXPORT_SYMBOL_GPL(synchronize_srcu);
470 475
@@ -637,7 +642,7 @@ static void srcu_reschedule(struct srcu_struct *sp)
637/* 642/*
638 * This is the work-queue function that handles SRCU grace periods. 643 * This is the work-queue function that handles SRCU grace periods.
639 */ 644 */
640static void process_srcu(struct work_struct *work) 645void process_srcu(struct work_struct *work)
641{ 646{
642 struct srcu_struct *sp; 647 struct srcu_struct *sp;
643 648
@@ -648,3 +653,4 @@ static void process_srcu(struct work_struct *work)
648 srcu_invoke_callbacks(sp); 653 srcu_invoke_callbacks(sp);
649 srcu_reschedule(sp); 654 srcu_reschedule(sp);
650} 655}
656EXPORT_SYMBOL_GPL(process_srcu);
diff --git a/kernel/sys.c b/kernel/sys.c
index e6e0ece5f6a0..265b37690421 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1046,7 +1046,7 @@ void do_sys_times(struct tms *tms)
1046 cputime_t tgutime, tgstime, cutime, cstime; 1046 cputime_t tgutime, tgstime, cutime, cstime;
1047 1047
1048 spin_lock_irq(&current->sighand->siglock); 1048 spin_lock_irq(&current->sighand->siglock);
1049 thread_group_times(current, &tgutime, &tgstime); 1049 thread_group_cputime_adjusted(current, &tgutime, &tgstime);
1050 cutime = current->signal->cutime; 1050 cutime = current->signal->cutime;
1051 cstime = current->signal->cstime; 1051 cstime = current->signal->cstime;
1052 spin_unlock_irq(&current->sighand->siglock); 1052 spin_unlock_irq(&current->sighand->siglock);
@@ -1704,7 +1704,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1704 utime = stime = 0; 1704 utime = stime = 0;
1705 1705
1706 if (who == RUSAGE_THREAD) { 1706 if (who == RUSAGE_THREAD) {
1707 task_times(current, &utime, &stime); 1707 task_cputime_adjusted(current, &utime, &stime);
1708 accumulate_thread_rusage(p, r); 1708 accumulate_thread_rusage(p, r);
1709 maxrss = p->signal->maxrss; 1709 maxrss = p->signal->maxrss;
1710 goto out; 1710 goto out;
@@ -1730,7 +1730,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1730 break; 1730 break;
1731 1731
1732 case RUSAGE_SELF: 1732 case RUSAGE_SELF:
1733 thread_group_times(p, &tgutime, &tgstime); 1733 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1734 utime += tgutime; 1734 utime += tgutime;
1735 stime += tgstime; 1735 stime += tgstime;
1736 r->ru_nvcsw += p->signal->nvcsw; 1736 r->ru_nvcsw += p->signal->nvcsw;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index dbff751e4086..395084d4ce16 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -25,6 +25,7 @@ cond_syscall(sys_swapoff);
25cond_syscall(sys_kexec_load); 25cond_syscall(sys_kexec_load);
26cond_syscall(compat_sys_kexec_load); 26cond_syscall(compat_sys_kexec_load);
27cond_syscall(sys_init_module); 27cond_syscall(sys_init_module);
28cond_syscall(sys_finit_module);
28cond_syscall(sys_delete_module); 29cond_syscall(sys_delete_module);
29cond_syscall(sys_socketpair); 30cond_syscall(sys_socketpair);
30cond_syscall(sys_bind); 31cond_syscall(sys_bind);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 26f65eaa01f9..c88878db491e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
256static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ 256static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
257static int min_wakeup_granularity_ns; /* 0 usecs */ 257static int min_wakeup_granularity_ns; /* 0 usecs */
258static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 258static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
259#ifdef CONFIG_SMP
259static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; 260static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
260static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; 261static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
261#endif 262#endif /* CONFIG_SMP */
263#endif /* CONFIG_SCHED_DEBUG */
262 264
263#ifdef CONFIG_COMPACTION 265#ifdef CONFIG_COMPACTION
264static int min_extfrag_threshold; 266static int min_extfrag_threshold;
@@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = {
301 .extra1 = &min_wakeup_granularity_ns, 303 .extra1 = &min_wakeup_granularity_ns,
302 .extra2 = &max_wakeup_granularity_ns, 304 .extra2 = &max_wakeup_granularity_ns,
303 }, 305 },
306#ifdef CONFIG_SMP
304 { 307 {
305 .procname = "sched_tunable_scaling", 308 .procname = "sched_tunable_scaling",
306 .data = &sysctl_sched_tunable_scaling, 309 .data = &sysctl_sched_tunable_scaling,
@@ -347,7 +350,45 @@ static struct ctl_table kern_table[] = {
347 .extra1 = &zero, 350 .extra1 = &zero,
348 .extra2 = &one, 351 .extra2 = &one,
349 }, 352 },
350#endif 353#endif /* CONFIG_SMP */
354#ifdef CONFIG_NUMA_BALANCING
355 {
356 .procname = "numa_balancing_scan_delay_ms",
357 .data = &sysctl_numa_balancing_scan_delay,
358 .maxlen = sizeof(unsigned int),
359 .mode = 0644,
360 .proc_handler = proc_dointvec,
361 },
362 {
363 .procname = "numa_balancing_scan_period_min_ms",
364 .data = &sysctl_numa_balancing_scan_period_min,
365 .maxlen = sizeof(unsigned int),
366 .mode = 0644,
367 .proc_handler = proc_dointvec,
368 },
369 {
370 .procname = "numa_balancing_scan_period_reset",
371 .data = &sysctl_numa_balancing_scan_period_reset,
372 .maxlen = sizeof(unsigned int),
373 .mode = 0644,
374 .proc_handler = proc_dointvec,
375 },
376 {
377 .procname = "numa_balancing_scan_period_max_ms",
378 .data = &sysctl_numa_balancing_scan_period_max,
379 .maxlen = sizeof(unsigned int),
380 .mode = 0644,
381 .proc_handler = proc_dointvec,
382 },
383 {
384 .procname = "numa_balancing_scan_size_mb",
385 .data = &sysctl_numa_balancing_scan_size,
386 .maxlen = sizeof(unsigned int),
387 .mode = 0644,
388 .proc_handler = proc_dointvec,
389 },
390#endif /* CONFIG_NUMA_BALANCING */
391#endif /* CONFIG_SCHED_DEBUG */
351 { 392 {
352 .procname = "sched_rt_period_us", 393 .procname = "sched_rt_period_us",
353 .data = &sysctl_sched_rt_period, 394 .data = &sysctl_sched_rt_period,
@@ -565,7 +606,7 @@ static struct ctl_table kern_table[] = {
565 .extra2 = &one, 606 .extra2 = &one,
566 }, 607 },
567#endif 608#endif
568#ifdef CONFIG_HOTPLUG 609
569 { 610 {
570 .procname = "hotplug", 611 .procname = "hotplug",
571 .data = &uevent_helper, 612 .data = &uevent_helper,
@@ -573,7 +614,7 @@ static struct ctl_table kern_table[] = {
573 .mode = 0644, 614 .mode = 0644,
574 .proc_handler = proc_dostring, 615 .proc_handler = proc_dostring,
575 }, 616 },
576#endif 617
577#ifdef CONFIG_CHR_DEV_SG 618#ifdef CONFIG_CHR_DEV_SG
578 { 619 {
579 .procname = "sg-big-buff", 620 .procname = "sg-big-buff",
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 65bdcf198d4e..5a6384450501 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1344,7 +1344,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1344 goto out_putname; 1344 goto out_putname;
1345 } 1345 }
1346 1346
1347 mnt = current->nsproxy->pid_ns->proc_mnt; 1347 mnt = task_active_pid_ns(current)->proc_mnt;
1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags); 1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
1349 result = PTR_ERR(file); 1349 result = PTR_ERR(file);
1350 if (IS_ERR(file)) 1350 if (IS_ERR(file))
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index e2fd74b8e8c2..ff7d9d2ab504 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,4 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
2obj-y += timeconv.o posix-clock.o alarmtimer.o 2obj-y += timeconv.o posix-clock.o alarmtimer.o
3 3
4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 6629bf7b5285..7a925ba456fb 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -58,7 +58,7 @@ static cycle_t jiffies_read(struct clocksource *cs)
58 return (cycle_t) jiffies; 58 return (cycle_t) jiffies;
59} 59}
60 60
61struct clocksource clocksource_jiffies = { 61static struct clocksource clocksource_jiffies = {
62 .name = "jiffies", 62 .name = "jiffies",
63 .rating = 1, /* lowest valid rating*/ 63 .rating = 1, /* lowest valid rating*/
64 .read = jiffies_read, 64 .read = jiffies_read,
@@ -67,6 +67,8 @@ struct clocksource clocksource_jiffies = {
67 .shift = JIFFIES_SHIFT, 67 .shift = JIFFIES_SHIFT,
68}; 68};
69 69
70__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
71
70#if (BITS_PER_LONG < 64) 72#if (BITS_PER_LONG < 64)
71u64 get_jiffies_64(void) 73u64 get_jiffies_64(void)
72{ 74{
@@ -74,9 +76,9 @@ u64 get_jiffies_64(void)
74 u64 ret; 76 u64 ret;
75 77
76 do { 78 do {
77 seq = read_seqbegin(&xtime_lock); 79 seq = read_seqbegin(&jiffies_lock);
78 ret = jiffies_64; 80 ret = jiffies_64;
79 } while (read_seqretry(&xtime_lock, seq)); 81 } while (read_seqretry(&jiffies_lock, seq));
80 return ret; 82 return ret;
81} 83}
82EXPORT_SYMBOL(get_jiffies_64); 84EXPORT_SYMBOL(get_jiffies_64);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index da6c9ecad4e4..b1600a6973f4 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -63,13 +63,13 @@ int tick_is_oneshot_available(void)
63static void tick_periodic(int cpu) 63static void tick_periodic(int cpu)
64{ 64{
65 if (tick_do_timer_cpu == cpu) { 65 if (tick_do_timer_cpu == cpu) {
66 write_seqlock(&xtime_lock); 66 write_seqlock(&jiffies_lock);
67 67
68 /* Keep track of the next tick event */ 68 /* Keep track of the next tick event */
69 tick_next_period = ktime_add(tick_next_period, tick_period); 69 tick_next_period = ktime_add(tick_next_period, tick_period);
70 70
71 do_timer(1); 71 do_timer(1);
72 write_sequnlock(&xtime_lock); 72 write_sequnlock(&jiffies_lock);
73 } 73 }
74 74
75 update_process_times(user_mode(get_irq_regs())); 75 update_process_times(user_mode(get_irq_regs()));
@@ -130,9 +130,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
130 ktime_t next; 130 ktime_t next;
131 131
132 do { 132 do {
133 seq = read_seqbegin(&xtime_lock); 133 seq = read_seqbegin(&jiffies_lock);
134 next = tick_next_period; 134 next = tick_next_period;
135 } while (read_seqretry(&xtime_lock, seq)); 135 } while (read_seqretry(&jiffies_lock, seq));
136 136
137 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 137 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
138 138
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 4e265b901fed..cf3e59ed6dc0 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -141,4 +141,3 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
141#endif 141#endif
142 142
143extern void do_timer(unsigned long ticks); 143extern void do_timer(unsigned long ticks);
144extern seqlock_t xtime_lock;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a40260885265..d58e552d9fd1 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -31,7 +31,7 @@
31static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); 31static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
32 32
33/* 33/*
34 * The time, when the last jiffy update happened. Protected by xtime_lock. 34 * The time, when the last jiffy update happened. Protected by jiffies_lock.
35 */ 35 */
36static ktime_t last_jiffies_update; 36static ktime_t last_jiffies_update;
37 37
@@ -49,14 +49,14 @@ static void tick_do_update_jiffies64(ktime_t now)
49 ktime_t delta; 49 ktime_t delta;
50 50
51 /* 51 /*
52 * Do a quick check without holding xtime_lock: 52 * Do a quick check without holding jiffies_lock:
53 */ 53 */
54 delta = ktime_sub(now, last_jiffies_update); 54 delta = ktime_sub(now, last_jiffies_update);
55 if (delta.tv64 < tick_period.tv64) 55 if (delta.tv64 < tick_period.tv64)
56 return; 56 return;
57 57
58 /* Reevaluate with xtime_lock held */ 58 /* Reevaluate with jiffies_lock held */
59 write_seqlock(&xtime_lock); 59 write_seqlock(&jiffies_lock);
60 60
61 delta = ktime_sub(now, last_jiffies_update); 61 delta = ktime_sub(now, last_jiffies_update);
62 if (delta.tv64 >= tick_period.tv64) { 62 if (delta.tv64 >= tick_period.tv64) {
@@ -79,7 +79,7 @@ static void tick_do_update_jiffies64(ktime_t now)
79 /* Keep the tick_next_period variable up to date */ 79 /* Keep the tick_next_period variable up to date */
80 tick_next_period = ktime_add(last_jiffies_update, tick_period); 80 tick_next_period = ktime_add(last_jiffies_update, tick_period);
81 } 81 }
82 write_sequnlock(&xtime_lock); 82 write_sequnlock(&jiffies_lock);
83} 83}
84 84
85/* 85/*
@@ -89,15 +89,58 @@ static ktime_t tick_init_jiffy_update(void)
89{ 89{
90 ktime_t period; 90 ktime_t period;
91 91
92 write_seqlock(&xtime_lock); 92 write_seqlock(&jiffies_lock);
93 /* Did we start the jiffies update yet ? */ 93 /* Did we start the jiffies update yet ? */
94 if (last_jiffies_update.tv64 == 0) 94 if (last_jiffies_update.tv64 == 0)
95 last_jiffies_update = tick_next_period; 95 last_jiffies_update = tick_next_period;
96 period = last_jiffies_update; 96 period = last_jiffies_update;
97 write_sequnlock(&xtime_lock); 97 write_sequnlock(&jiffies_lock);
98 return period; 98 return period;
99} 99}
100 100
101
102static void tick_sched_do_timer(ktime_t now)
103{
104 int cpu = smp_processor_id();
105
106#ifdef CONFIG_NO_HZ
107 /*
108 * Check if the do_timer duty was dropped. We don't care about
109 * concurrency: This happens only when the cpu in charge went
110 * into a long sleep. If two cpus happen to assign themselves to
111 * this duty, then the jiffies update is still serialized by
112 * jiffies_lock.
113 */
114 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
115 tick_do_timer_cpu = cpu;
116#endif
117
118 /* Check, if the jiffies need an update */
119 if (tick_do_timer_cpu == cpu)
120 tick_do_update_jiffies64(now);
121}
122
123static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
124{
125#ifdef CONFIG_NO_HZ
126 /*
127 * When we are idle and the tick is stopped, we have to touch
128 * the watchdog as we might not schedule for a really long
129 * time. This happens on complete idle SMP systems while
130 * waiting on the login prompt. We also increment the "start of
131 * idle" jiffy stamp so the idle accounting adjustment we do
132 * when we go busy again does not account for too many ticks.
133 */
134 if (ts->tick_stopped) {
135 touch_softlockup_watchdog();
136 if (is_idle_task(current))
137 ts->idle_jiffies++;
138 }
139#endif
140 update_process_times(user_mode(regs));
141 profile_tick(CPU_PROFILING);
142}
143
101/* 144/*
102 * NOHZ - aka dynamic tick functionality 145 * NOHZ - aka dynamic tick functionality
103 */ 146 */
@@ -282,11 +325,11 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
282 325
283 /* Read jiffies and the time when jiffies were updated last */ 326 /* Read jiffies and the time when jiffies were updated last */
284 do { 327 do {
285 seq = read_seqbegin(&xtime_lock); 328 seq = read_seqbegin(&jiffies_lock);
286 last_update = last_jiffies_update; 329 last_update = last_jiffies_update;
287 last_jiffies = jiffies; 330 last_jiffies = jiffies;
288 time_delta = timekeeping_max_deferment(); 331 time_delta = timekeeping_max_deferment();
289 } while (read_seqretry(&xtime_lock, seq)); 332 } while (read_seqretry(&jiffies_lock, seq));
290 333
291 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || 334 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) ||
292 arch_needs_cpu(cpu)) { 335 arch_needs_cpu(cpu)) {
@@ -526,6 +569,8 @@ void tick_nohz_irq_exit(void)
526 if (!ts->inidle) 569 if (!ts->inidle)
527 return; 570 return;
528 571
572 /* Cancel the timer because the CPU has already woken up from the C-states */
573 menu_hrtimer_cancel();
529 __tick_nohz_idle_enter(ts); 574 __tick_nohz_idle_enter(ts);
530} 575}
531 576
@@ -621,6 +666,8 @@ void tick_nohz_idle_exit(void)
621 666
622 ts->inidle = 0; 667 ts->inidle = 0;
623 668
669 /* Cancel the timer because the CPU has already woken up from the C-states */
670 menu_hrtimer_cancel();
624 if (ts->idle_active || ts->tick_stopped) 671 if (ts->idle_active || ts->tick_stopped)
625 now = ktime_get(); 672 now = ktime_get();
626 673
@@ -648,40 +695,12 @@ static void tick_nohz_handler(struct clock_event_device *dev)
648{ 695{
649 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 696 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
650 struct pt_regs *regs = get_irq_regs(); 697 struct pt_regs *regs = get_irq_regs();
651 int cpu = smp_processor_id();
652 ktime_t now = ktime_get(); 698 ktime_t now = ktime_get();
653 699
654 dev->next_event.tv64 = KTIME_MAX; 700 dev->next_event.tv64 = KTIME_MAX;
655 701
656 /* 702 tick_sched_do_timer(now);
657 * Check if the do_timer duty was dropped. We don't care about 703 tick_sched_handle(ts, regs);
658 * concurrency: This happens only when the cpu in charge went
659 * into a long sleep. If two cpus happen to assign themself to
660 * this duty, then the jiffies update is still serialized by
661 * xtime_lock.
662 */
663 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
664 tick_do_timer_cpu = cpu;
665
666 /* Check, if the jiffies need an update */
667 if (tick_do_timer_cpu == cpu)
668 tick_do_update_jiffies64(now);
669
670 /*
671 * When we are idle and the tick is stopped, we have to touch
672 * the watchdog as we might not schedule for a really long
673 * time. This happens on complete idle SMP systems while
674 * waiting on the login prompt. We also increment the "start
675 * of idle" jiffy stamp so the idle accounting adjustment we
676 * do when we go busy again does not account too much ticks.
677 */
678 if (ts->tick_stopped) {
679 touch_softlockup_watchdog();
680 ts->idle_jiffies++;
681 }
682
683 update_process_times(user_mode(regs));
684 profile_tick(CPU_PROFILING);
685 704
686 while (tick_nohz_reprogram(ts, now)) { 705 while (tick_nohz_reprogram(ts, now)) {
687 now = ktime_get(); 706 now = ktime_get();
@@ -794,7 +813,7 @@ void tick_check_idle(int cpu)
794#ifdef CONFIG_HIGH_RES_TIMERS 813#ifdef CONFIG_HIGH_RES_TIMERS
795/* 814/*
796 * We rearm the timer until we get disabled by the idle code. 815 * We rearm the timer until we get disabled by the idle code.
797 * Called with interrupts disabled and timer->base->cpu_base->lock held. 816 * Called with interrupts disabled.
798 */ 817 */
799static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) 818static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
800{ 819{
@@ -802,45 +821,15 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
802 container_of(timer, struct tick_sched, sched_timer); 821 container_of(timer, struct tick_sched, sched_timer);
803 struct pt_regs *regs = get_irq_regs(); 822 struct pt_regs *regs = get_irq_regs();
804 ktime_t now = ktime_get(); 823 ktime_t now = ktime_get();
805 int cpu = smp_processor_id();
806 824
807#ifdef CONFIG_NO_HZ 825 tick_sched_do_timer(now);
808 /*
809 * Check if the do_timer duty was dropped. We don't care about
810 * concurrency: This happens only when the cpu in charge went
811 * into a long sleep. If two cpus happen to assign themself to
812 * this duty, then the jiffies update is still serialized by
813 * xtime_lock.
814 */
815 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
816 tick_do_timer_cpu = cpu;
817#endif
818
819 /* Check, if the jiffies need an update */
820 if (tick_do_timer_cpu == cpu)
821 tick_do_update_jiffies64(now);
822 826
823 /* 827 /*
824 * Do not call, when we are not in irq context and have 828 * Do not call, when we are not in irq context and have
825 * no valid regs pointer 829 * no valid regs pointer
826 */ 830 */
827 if (regs) { 831 if (regs)
828 /* 832 tick_sched_handle(ts, regs);
829 * When we are idle and the tick is stopped, we have to touch
830 * the watchdog as we might not schedule for a really long
831 * time. This happens on complete idle SMP systems while
832 * waiting on the login prompt. We also increment the "start of
833 * idle" jiffy stamp so the idle accounting adjustment we do
834 * when we go busy again does not account too much ticks.
835 */
836 if (ts->tick_stopped) {
837 touch_softlockup_watchdog();
838 if (is_idle_task(current))
839 ts->idle_jiffies++;
840 }
841 update_process_times(user_mode(regs));
842 profile_tick(CPU_PROFILING);
843 }
844 833
845 hrtimer_forward(timer, now, tick_period); 834 hrtimer_forward(timer, now, tick_period);
846 835
@@ -874,7 +863,7 @@ void tick_setup_sched_timer(void)
874 /* Get the next period (per cpu) */ 863 /* Get the next period (per cpu) */
875 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 864 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
876 865
877 /* Offset the tick to avert xtime_lock contention. */ 866 /* Offset the tick to avert jiffies_lock contention. */
878 if (sched_skew_tick) { 867 if (sched_skew_tick) {
879 u64 offset = ktime_to_ns(tick_period) >> 1; 868 u64 offset = ktime_to_ns(tick_period) >> 1;
880 do_div(offset, num_possible_cpus()); 869 do_div(offset, num_possible_cpus());
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
deleted file mode 100644
index a9ae369925ce..000000000000
--- a/kernel/time/timecompare.c
+++ /dev/null
@@ -1,193 +0,0 @@
1/*
2 * Copyright (C) 2009 Intel Corporation.
3 * Author: Patrick Ohly <patrick.ohly@intel.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20#include <linux/timecompare.h>
21#include <linux/module.h>
22#include <linux/slab.h>
23#include <linux/math64.h>
24#include <linux/kernel.h>
25
26/*
27 * fixed point arithmetic scale factor for skew
28 *
29 * Usually one would measure skew in ppb (parts per billion, 1e9), but
30 * using a power-of-2 scale factor (2^30) simplifies the math.
31 */
32#define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30)
33
34ktime_t timecompare_transform(struct timecompare *sync,
35 u64 source_tstamp)
36{
37 u64 nsec;
38
39 nsec = source_tstamp + sync->offset;
40 nsec += (s64)(source_tstamp - sync->last_update) * sync->skew /
41 TIMECOMPARE_SKEW_RESOLUTION;
42
43 return ns_to_ktime(nsec);
44}
45EXPORT_SYMBOL_GPL(timecompare_transform);
46
47int timecompare_offset(struct timecompare *sync,
48 s64 *offset,
49 u64 *source_tstamp)
50{
51 u64 start_source = 0, end_source = 0;
52 struct {
53 s64 offset;
54 s64 duration_target;
55 } buffer[10], sample, *samples;
56 int counter = 0, i;
57 int used;
58 int index;
59 int num_samples = sync->num_samples;
60
61 if (num_samples > ARRAY_SIZE(buffer)) {
62 samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
63 if (!samples) {
64 samples = buffer;
65 num_samples = ARRAY_SIZE(buffer);
66 }
67 } else {
68 samples = buffer;
69 }
70
71 /* run until we have enough valid samples, but do not try forever */
72 i = 0;
73 counter = 0;
74 while (1) {
75 u64 ts;
76 ktime_t start, end;
77
78 start = sync->target();
79 ts = timecounter_read(sync->source);
80 end = sync->target();
81
82 if (!i)
83 start_source = ts;
84
85 /* ignore negative durations */
86 sample.duration_target = ktime_to_ns(ktime_sub(end, start));
87 if (sample.duration_target >= 0) {
88 /*
89 * assume symetric delay to and from source:
90 * average target time corresponds to measured
91 * source time
92 */
93 sample.offset =
94 (ktime_to_ns(end) + ktime_to_ns(start)) / 2 -
95 ts;
96
97 /* simple insertion sort based on duration */
98 index = counter - 1;
99 while (index >= 0) {
100 if (samples[index].duration_target <
101 sample.duration_target)
102 break;
103 samples[index + 1] = samples[index];
104 index--;
105 }
106 samples[index + 1] = sample;
107 counter++;
108 }
109
110 i++;
111 if (counter >= num_samples || i >= 100000) {
112 end_source = ts;
113 break;
114 }
115 }
116
117 *source_tstamp = (end_source + start_source) / 2;
118
119 /* remove outliers by only using 75% of the samples */
120 used = counter * 3 / 4;
121 if (!used)
122 used = counter;
123 if (used) {
124 /* calculate average */
125 s64 off = 0;
126 for (index = 0; index < used; index++)
127 off += samples[index].offset;
128 *offset = div_s64(off, used);
129 }
130
131 if (samples && samples != buffer)
132 kfree(samples);
133
134 return used;
135}
136EXPORT_SYMBOL_GPL(timecompare_offset);
137
138void __timecompare_update(struct timecompare *sync,
139 u64 source_tstamp)
140{
141 s64 offset;
142 u64 average_time;
143
144 if (!timecompare_offset(sync, &offset, &average_time))
145 return;
146
147 if (!sync->last_update) {
148 sync->last_update = average_time;
149 sync->offset = offset;
150 sync->skew = 0;
151 } else {
152 s64 delta_nsec = average_time - sync->last_update;
153
154 /* avoid division by negative or small deltas */
155 if (delta_nsec >= 10000) {
156 s64 delta_offset_nsec = offset - sync->offset;
157 s64 skew; /* delta_offset_nsec *
158 TIMECOMPARE_SKEW_RESOLUTION /
159 delta_nsec */
160 u64 divisor;
161
162 /* div_s64() is limited to 32 bit divisor */
163 skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION;
164 divisor = delta_nsec;
165 while (unlikely(divisor >= ((s64)1) << 32)) {
166 /* divide both by 2; beware, right shift
167 of negative value has undefined
168 behavior and can only be used for
169 the positive divisor */
170 skew = div_s64(skew, 2);
171 divisor >>= 1;
172 }
173 skew = div_s64(skew, divisor);
174
175 /*
176 * Calculate new overall skew as 4/16 the
177 * old value and 12/16 the new one. This is
178 * a rather arbitrary tradeoff between
179 * only using the latest measurement (0/16 and
180 * 16/16) and even more weight on past measurements.
181 */
182#define TIMECOMPARE_NEW_SKEW_PER_16 12
183 sync->skew =
184 div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) *
185 sync->skew +
186 TIMECOMPARE_NEW_SKEW_PER_16 * skew,
187 16);
188 sync->last_update = average_time;
189 sync->offset = offset;
190 }
191 }
192}
193EXPORT_SYMBOL_GPL(__timecompare_update);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e424970bb562..cbc6acb0db3f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -21,16 +21,11 @@
21#include <linux/time.h> 21#include <linux/time.h>
22#include <linux/tick.h> 22#include <linux/tick.h>
23#include <linux/stop_machine.h> 23#include <linux/stop_machine.h>
24#include <linux/pvclock_gtod.h>
24 25
25 26
26static struct timekeeper timekeeper; 27static struct timekeeper timekeeper;
27 28
28/*
29 * This read-write spinlock protects us from races in SMP while
30 * playing with xtime.
31 */
32__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
33
34/* flag for if timekeeping is suspended */ 29/* flag for if timekeeping is suspended */
35int __read_mostly timekeeping_suspended; 30int __read_mostly timekeeping_suspended;
36 31
@@ -180,6 +175,54 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
180 return nsec + arch_gettimeoffset(); 175 return nsec + arch_gettimeoffset();
181} 176}
182 177
178static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
179
180static void update_pvclock_gtod(struct timekeeper *tk)
181{
182 raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk);
183}
184
185/**
186 * pvclock_gtod_register_notifier - register a pvclock timedata update listener
187 *
188 * Must hold write on timekeeper.lock
189 */
190int pvclock_gtod_register_notifier(struct notifier_block *nb)
191{
192 struct timekeeper *tk = &timekeeper;
193 unsigned long flags;
194 int ret;
195
196 write_seqlock_irqsave(&tk->lock, flags);
197 ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
198 /* update timekeeping data */
199 update_pvclock_gtod(tk);
200 write_sequnlock_irqrestore(&tk->lock, flags);
201
202 return ret;
203}
204EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);
205
206/**
207 * pvclock_gtod_unregister_notifier - unregister a pvclock
208 * timedata update listener
209 *
210 * Must hold write on timekeeper.lock
211 */
212int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
213{
214 struct timekeeper *tk = &timekeeper;
215 unsigned long flags;
216 int ret;
217
218 write_seqlock_irqsave(&tk->lock, flags);
219 ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
220 write_sequnlock_irqrestore(&tk->lock, flags);
221
222 return ret;
223}
224EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
225
183/* must hold write on timekeeper.lock */ 226/* must hold write on timekeeper.lock */
184static void timekeeping_update(struct timekeeper *tk, bool clearntp) 227static void timekeeping_update(struct timekeeper *tk, bool clearntp)
185{ 228{
@@ -188,6 +231,7 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp)
188 ntp_clear(); 231 ntp_clear();
189 } 232 }
190 update_vsyscall(tk); 233 update_vsyscall(tk);
234 update_pvclock_gtod(tk);
191} 235}
192 236
193/** 237/**
@@ -1299,9 +1343,7 @@ struct timespec get_monotonic_coarse(void)
1299} 1343}
1300 1344
1301/* 1345/*
1302 * The 64-bit jiffies value is not atomic - you MUST NOT read it 1346 * Must hold jiffies_lock
1303 * without sampling the sequence number in xtime_lock.
1304 * jiffies is defined in the linker script...
1305 */ 1347 */
1306void do_timer(unsigned long ticks) 1348void do_timer(unsigned long ticks)
1307{ 1349{
@@ -1389,7 +1431,7 @@ EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1389 */ 1431 */
1390void xtime_update(unsigned long ticks) 1432void xtime_update(unsigned long ticks)
1391{ 1433{
1392 write_seqlock(&xtime_lock); 1434 write_seqlock(&jiffies_lock);
1393 do_timer(ticks); 1435 do_timer(ticks);
1394 write_sequnlock(&xtime_lock); 1436 write_sequnlock(&jiffies_lock);
1395} 1437}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 4cea4f41c1d9..5d89335a485f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -119,6 +119,7 @@ config TRACING
119 select BINARY_PRINTF 119 select BINARY_PRINTF
120 select EVENT_TRACING 120 select EVENT_TRACING
121 select TRACE_CLOCK 121 select TRACE_CLOCK
122 select IRQ_WORK
122 123
123config GENERIC_TRACER 124config GENERIC_TRACER
124 bool 125 bool
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9dcf15d38380..3ffe4c5ad3f3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -10,7 +10,7 @@
10 * Based on code in the latency_tracer, that is: 10 * Based on code in the latency_tracer, that is:
11 * 11 *
12 * Copyright (C) 2004-2006 Ingo Molnar 12 * Copyright (C) 2004-2006 Ingo Molnar
13 * Copyright (C) 2004 William Lee Irwin III 13 * Copyright (C) 2004 Nadia Yvette Chambers
14 */ 14 */
15 15
16#include <linux/stop_machine.h> 16#include <linux/stop_machine.h>
@@ -2437,7 +2437,7 @@ static void reset_iter_read(struct ftrace_iterator *iter)
2437{ 2437{
2438 iter->pos = 0; 2438 iter->pos = 0;
2439 iter->func_pos = 0; 2439 iter->func_pos = 0;
2440 iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH); 2440 iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH);
2441} 2441}
2442 2442
2443static void *t_start(struct seq_file *m, loff_t *pos) 2443static void *t_start(struct seq_file *m, loff_t *pos)
@@ -2675,12 +2675,12 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
2675} 2675}
2676 2676
2677loff_t 2677loff_t
2678ftrace_regex_lseek(struct file *file, loff_t offset, int origin) 2678ftrace_regex_lseek(struct file *file, loff_t offset, int whence)
2679{ 2679{
2680 loff_t ret; 2680 loff_t ret;
2681 2681
2682 if (file->f_mode & FMODE_READ) 2682 if (file->f_mode & FMODE_READ)
2683 ret = seq_lseek(file, offset, origin); 2683 ret = seq_lseek(file, offset, whence);
2684 else 2684 else
2685 file->f_pos = ret = 1; 2685 file->f_pos = ret = 1;
2686 2686
@@ -2868,7 +2868,7 @@ static int __init ftrace_mod_cmd_init(void)
2868{ 2868{
2869 return register_ftrace_command(&ftrace_mod_cmd); 2869 return register_ftrace_command(&ftrace_mod_cmd);
2870} 2870}
2871device_initcall(ftrace_mod_cmd_init); 2871core_initcall(ftrace_mod_cmd_init);
2872 2872
2873static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, 2873static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
2874 struct ftrace_ops *op, struct pt_regs *pt_regs) 2874 struct ftrace_ops *op, struct pt_regs *pt_regs)
@@ -4055,7 +4055,7 @@ static int __init ftrace_nodyn_init(void)
4055 ftrace_enabled = 1; 4055 ftrace_enabled = 1;
4056 return 0; 4056 return 0;
4057} 4057}
4058device_initcall(ftrace_nodyn_init); 4058core_initcall(ftrace_nodyn_init);
4059 4059
4060static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 4060static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
4061static inline void ftrace_startup_enable(int command) { } 4061static inline void ftrace_startup_enable(int command) { }
@@ -4381,7 +4381,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
4381 if (strlen(tmp) == 0) 4381 if (strlen(tmp) == 0)
4382 return 1; 4382 return 1;
4383 4383
4384 ret = strict_strtol(tmp, 10, &val); 4384 ret = kstrtol(tmp, 10, &val);
4385 if (ret < 0) 4385 if (ret < 0)
4386 return ret; 4386 return ret;
4387 4387
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b979426d16c6..ce8514feedcd 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -460,9 +460,10 @@ struct ring_buffer_per_cpu {
460 unsigned long lost_events; 460 unsigned long lost_events;
461 unsigned long last_overrun; 461 unsigned long last_overrun;
462 local_t entries_bytes; 462 local_t entries_bytes;
463 local_t commit_overrun;
464 local_t overrun;
465 local_t entries; 463 local_t entries;
464 local_t overrun;
465 local_t commit_overrun;
466 local_t dropped_events;
466 local_t committing; 467 local_t committing;
467 local_t commits; 468 local_t commits;
468 unsigned long read; 469 unsigned long read;
@@ -1396,6 +1397,8 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
1396 struct list_head *head_page_with_bit; 1397 struct list_head *head_page_with_bit;
1397 1398
1398 head_page = &rb_set_head_page(cpu_buffer)->list; 1399 head_page = &rb_set_head_page(cpu_buffer)->list;
1400 if (!head_page)
1401 break;
1399 prev_page = head_page->prev; 1402 prev_page = head_page->prev;
1400 1403
1401 first_page = pages->next; 1404 first_page = pages->next;
@@ -1820,7 +1823,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
1820} 1823}
1821 1824
1822/** 1825/**
1823 * ring_buffer_update_event - update event type and data 1826 * rb_update_event - update event type and data
1824 * @event: the even to update 1827 * @event: the even to update
1825 * @type: the type of event 1828 * @type: the type of event
1826 * @length: the size of the event field in the ring buffer 1829 * @length: the size of the event field in the ring buffer
@@ -2155,8 +2158,10 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
2155 * If we are not in overwrite mode, 2158 * If we are not in overwrite mode,
2156 * this is easy, just stop here. 2159 * this is easy, just stop here.
2157 */ 2160 */
2158 if (!(buffer->flags & RB_FL_OVERWRITE)) 2161 if (!(buffer->flags & RB_FL_OVERWRITE)) {
2162 local_inc(&cpu_buffer->dropped_events);
2159 goto out_reset; 2163 goto out_reset;
2164 }
2160 2165
2161 ret = rb_handle_head_page(cpu_buffer, 2166 ret = rb_handle_head_page(cpu_buffer,
2162 tail_page, 2167 tail_page,
@@ -2720,8 +2725,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
2720 * and not the length of the event which would hold the header. 2725 * and not the length of the event which would hold the header.
2721 */ 2726 */
2722int ring_buffer_write(struct ring_buffer *buffer, 2727int ring_buffer_write(struct ring_buffer *buffer,
2723 unsigned long length, 2728 unsigned long length,
2724 void *data) 2729 void *data)
2725{ 2730{
2726 struct ring_buffer_per_cpu *cpu_buffer; 2731 struct ring_buffer_per_cpu *cpu_buffer;
2727 struct ring_buffer_event *event; 2732 struct ring_buffer_event *event;
@@ -2929,12 +2934,12 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
2929 * @buffer: The ring buffer 2934 * @buffer: The ring buffer
2930 * @cpu: The per CPU buffer to read from. 2935 * @cpu: The per CPU buffer to read from.
2931 */ 2936 */
2932unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) 2937u64 ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
2933{ 2938{
2934 unsigned long flags; 2939 unsigned long flags;
2935 struct ring_buffer_per_cpu *cpu_buffer; 2940 struct ring_buffer_per_cpu *cpu_buffer;
2936 struct buffer_page *bpage; 2941 struct buffer_page *bpage;
2937 unsigned long ret; 2942 u64 ret = 0;
2938 2943
2939 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2944 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2940 return 0; 2945 return 0;
@@ -2949,7 +2954,8 @@ unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
2949 bpage = cpu_buffer->reader_page; 2954 bpage = cpu_buffer->reader_page;
2950 else 2955 else
2951 bpage = rb_set_head_page(cpu_buffer); 2956 bpage = rb_set_head_page(cpu_buffer);
2952 ret = bpage->page->time_stamp; 2957 if (bpage)
2958 ret = bpage->page->time_stamp;
2953 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2959 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2954 2960
2955 return ret; 2961 return ret;
@@ -2995,7 +3001,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
2995EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); 3001EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
2996 3002
2997/** 3003/**
2998 * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer 3004 * ring_buffer_overrun_cpu - get the number of overruns caused by the ring
3005 * buffer wrapping around (only if RB_FL_OVERWRITE is on).
2999 * @buffer: The ring buffer 3006 * @buffer: The ring buffer
3000 * @cpu: The per CPU buffer to get the number of overruns from 3007 * @cpu: The per CPU buffer to get the number of overruns from
3001 */ 3008 */
@@ -3015,7 +3022,9 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
3015EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); 3022EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
3016 3023
3017/** 3024/**
3018 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits 3025 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by
3026 * commits failing due to the buffer wrapping around while there are uncommitted
3027 * events, such as during an interrupt storm.
3019 * @buffer: The ring buffer 3028 * @buffer: The ring buffer
3020 * @cpu: The per CPU buffer to get the number of overruns from 3029 * @cpu: The per CPU buffer to get the number of overruns from
3021 */ 3030 */
@@ -3036,6 +3045,28 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
3036EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); 3045EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
3037 3046
3038/** 3047/**
3048 * ring_buffer_dropped_events_cpu - get the number of dropped events caused by
3049 * the ring buffer filling up (only if RB_FL_OVERWRITE is off).
3050 * @buffer: The ring buffer
3051 * @cpu: The per CPU buffer to get the number of dropped events from
3052 */
3053unsigned long
3054ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu)
3055{
3056 struct ring_buffer_per_cpu *cpu_buffer;
3057 unsigned long ret;
3058
3059 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3060 return 0;
3061
3062 cpu_buffer = buffer->buffers[cpu];
3063 ret = local_read(&cpu_buffer->dropped_events);
3064
3065 return ret;
3066}
3067EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu);
3068
3069/**
3039 * ring_buffer_entries - get the number of entries in a buffer 3070 * ring_buffer_entries - get the number of entries in a buffer
3040 * @buffer: The ring buffer 3071 * @buffer: The ring buffer
3041 * 3072 *
@@ -3260,6 +3291,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
3260 * Splice the empty reader page into the list around the head. 3291 * Splice the empty reader page into the list around the head.
3261 */ 3292 */
3262 reader = rb_set_head_page(cpu_buffer); 3293 reader = rb_set_head_page(cpu_buffer);
3294 if (!reader)
3295 goto out;
3263 cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); 3296 cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next);
3264 cpu_buffer->reader_page->list.prev = reader->list.prev; 3297 cpu_buffer->reader_page->list.prev = reader->list.prev;
3265 3298
@@ -3778,12 +3811,17 @@ void
3778ring_buffer_read_finish(struct ring_buffer_iter *iter) 3811ring_buffer_read_finish(struct ring_buffer_iter *iter)
3779{ 3812{
3780 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3813 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3814 unsigned long flags;
3781 3815
3782 /* 3816 /*
3783 * Ring buffer is disabled from recording, here's a good place 3817 * Ring buffer is disabled from recording, here's a good place
3784 * to check the integrity of the ring buffer. 3818 * to check the integrity of the ring buffer.
3819 * Must prevent readers from trying to read, as the check
3820 * clears the HEAD page and readers require it.
3785 */ 3821 */
3822 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3786 rb_check_pages(cpu_buffer); 3823 rb_check_pages(cpu_buffer);
3824 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3787 3825
3788 atomic_dec(&cpu_buffer->record_disabled); 3826 atomic_dec(&cpu_buffer->record_disabled);
3789 atomic_dec(&cpu_buffer->buffer->resize_disabled); 3827 atomic_dec(&cpu_buffer->buffer->resize_disabled);
@@ -3864,9 +3902,10 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3864 local_set(&cpu_buffer->reader_page->page->commit, 0); 3902 local_set(&cpu_buffer->reader_page->page->commit, 0);
3865 cpu_buffer->reader_page->read = 0; 3903 cpu_buffer->reader_page->read = 0;
3866 3904
3867 local_set(&cpu_buffer->commit_overrun, 0);
3868 local_set(&cpu_buffer->entries_bytes, 0); 3905 local_set(&cpu_buffer->entries_bytes, 0);
3869 local_set(&cpu_buffer->overrun, 0); 3906 local_set(&cpu_buffer->overrun, 0);
3907 local_set(&cpu_buffer->commit_overrun, 0);
3908 local_set(&cpu_buffer->dropped_events, 0);
3870 local_set(&cpu_buffer->entries, 0); 3909 local_set(&cpu_buffer->entries, 0);
3871 local_set(&cpu_buffer->committing, 0); 3910 local_set(&cpu_buffer->committing, 0);
3872 local_set(&cpu_buffer->commits, 0); 3911 local_set(&cpu_buffer->commits, 0);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 31e4f55773f1..e5125677efa0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9,7 +9,7 @@
9 * 9 *
10 * Based on code from the latency_tracer, that is: 10 * Based on code from the latency_tracer, that is:
11 * Copyright (C) 2004-2006 Ingo Molnar 11 * Copyright (C) 2004-2006 Ingo Molnar
12 * Copyright (C) 2004 William Lee Irwin III 12 * Copyright (C) 2004 Nadia Yvette Chambers
13 */ 13 */
14#include <linux/ring_buffer.h> 14#include <linux/ring_buffer.h>
15#include <generated/utsrelease.h> 15#include <generated/utsrelease.h>
@@ -19,6 +19,7 @@
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/notifier.h> 20#include <linux/notifier.h>
21#include <linux/irqflags.h> 21#include <linux/irqflags.h>
22#include <linux/irq_work.h>
22#include <linux/debugfs.h> 23#include <linux/debugfs.h>
23#include <linux/pagemap.h> 24#include <linux/pagemap.h>
24#include <linux/hardirq.h> 25#include <linux/hardirq.h>
@@ -78,6 +79,21 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
78} 79}
79 80
80/* 81/*
82 * To prevent the comm cache from being overwritten when no
83 * tracing is active, only save the comm when a trace event
84 * occurred.
85 */
86static DEFINE_PER_CPU(bool, trace_cmdline_save);
87
88/*
89 * When a reader is waiting for data, then this variable is
90 * set to true.
91 */
92static bool trace_wakeup_needed;
93
94static struct irq_work trace_work_wakeup;
95
96/*
81 * Kill all tracing for good (never come back). 97 * Kill all tracing for good (never come back).
82 * It is initialized to 1 but will turn to zero if the initialization 98 * It is initialized to 1 but will turn to zero if the initialization
83 * of the tracer is successful. But that is the only place that sets 99 * of the tracer is successful. But that is the only place that sets
@@ -139,6 +155,18 @@ static int __init set_ftrace_dump_on_oops(char *str)
139} 155}
140__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 156__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
141 157
158
159static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
160static char *trace_boot_options __initdata;
161
162static int __init set_trace_boot_options(char *str)
163{
164 strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
165 trace_boot_options = trace_boot_options_buf;
166 return 0;
167}
168__setup("trace_options=", set_trace_boot_options);
169
142unsigned long long ns2usecs(cycle_t nsec) 170unsigned long long ns2usecs(cycle_t nsec)
143{ 171{
144 nsec += 500; 172 nsec += 500;
@@ -198,20 +226,9 @@ static struct trace_array max_tr;
198 226
199static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); 227static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
200 228
201/* tracer_enabled is used to toggle activation of a tracer */
202static int tracer_enabled = 1;
203
204/**
205 * tracing_is_enabled - return tracer_enabled status
206 *
207 * This function is used by other tracers to know the status
208 * of the tracer_enabled flag. Tracers may use this function
209 * to know if it should enable their features when starting
210 * up. See irqsoff tracer for an example (start_irqsoff_tracer).
211 */
212int tracing_is_enabled(void) 229int tracing_is_enabled(void)
213{ 230{
214 return tracer_enabled; 231 return tracing_is_on();
215} 232}
216 233
217/* 234/*
@@ -333,12 +350,18 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
333static int trace_stop_count; 350static int trace_stop_count;
334static DEFINE_RAW_SPINLOCK(tracing_start_lock); 351static DEFINE_RAW_SPINLOCK(tracing_start_lock);
335 352
336static void wakeup_work_handler(struct work_struct *work) 353/**
354 * trace_wake_up - wake up tasks waiting for trace input
355 *
356 * Schedules a delayed work to wake up any task that is blocked on the
357 * trace_wait queue. These is used with trace_poll for tasks polling the
358 * trace.
359 */
360static void trace_wake_up(struct irq_work *work)
337{ 361{
338 wake_up(&trace_wait); 362 wake_up_all(&trace_wait);
339}
340 363
341static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); 364}
342 365
343/** 366/**
344 * tracing_on - enable tracing buffers 367 * tracing_on - enable tracing buffers
@@ -393,22 +416,6 @@ int tracing_is_on(void)
393} 416}
394EXPORT_SYMBOL_GPL(tracing_is_on); 417EXPORT_SYMBOL_GPL(tracing_is_on);
395 418
396/**
397 * trace_wake_up - wake up tasks waiting for trace input
398 *
399 * Schedules a delayed work to wake up any task that is blocked on the
400 * trace_wait queue. These is used with trace_poll for tasks polling the
401 * trace.
402 */
403void trace_wake_up(void)
404{
405 const unsigned long delay = msecs_to_jiffies(2);
406
407 if (trace_flags & TRACE_ITER_BLOCK)
408 return;
409 schedule_delayed_work(&wakeup_work, delay);
410}
411
412static int __init set_buf_size(char *str) 419static int __init set_buf_size(char *str)
413{ 420{
414 unsigned long buf_size; 421 unsigned long buf_size;
@@ -431,7 +438,7 @@ static int __init set_tracing_thresh(char *str)
431 438
432 if (!str) 439 if (!str)
433 return 0; 440 return 0;
434 ret = strict_strtoul(str, 0, &threshold); 441 ret = kstrtoul(str, 0, &threshold);
435 if (ret < 0) 442 if (ret < 0)
436 return 0; 443 return 0;
437 tracing_thresh = threshold * 1000; 444 tracing_thresh = threshold * 1000;
@@ -477,10 +484,12 @@ static const char *trace_options[] = {
477static struct { 484static struct {
478 u64 (*func)(void); 485 u64 (*func)(void);
479 const char *name; 486 const char *name;
487 int in_ns; /* is this clock in nanoseconds? */
480} trace_clocks[] = { 488} trace_clocks[] = {
481 { trace_clock_local, "local" }, 489 { trace_clock_local, "local", 1 },
482 { trace_clock_global, "global" }, 490 { trace_clock_global, "global", 1 },
483 { trace_clock_counter, "counter" }, 491 { trace_clock_counter, "counter", 0 },
492 ARCH_TRACE_CLOCKS
484}; 493};
485 494
486int trace_clock_id; 495int trace_clock_id;
@@ -757,6 +766,40 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
757} 766}
758#endif /* CONFIG_TRACER_MAX_TRACE */ 767#endif /* CONFIG_TRACER_MAX_TRACE */
759 768
769static void default_wait_pipe(struct trace_iterator *iter)
770{
771 DEFINE_WAIT(wait);
772
773 prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE);
774
775 /*
776 * The events can happen in critical sections where
777 * checking a work queue can cause deadlocks.
778 * After adding a task to the queue, this flag is set
779 * only to notify events to try to wake up the queue
780 * using irq_work.
781 *
782 * We don't clear it even if the buffer is no longer
783 * empty. The flag only causes the next event to run
784 * irq_work to do the work queue wake up. The worse
785 * that can happen if we race with !trace_empty() is that
786 * an event will cause an irq_work to try to wake up
787 * an empty queue.
788 *
789 * There's no reason to protect this flag either, as
790 * the work queue and irq_work logic will do the necessary
791 * synchronization for the wake ups. The only thing
792 * that is necessary is that the wake up happens after
793 * a task has been queued. It's OK for spurious wake ups.
794 */
795 trace_wakeup_needed = true;
796
797 if (trace_empty(iter))
798 schedule();
799
800 finish_wait(&trace_wait, &wait);
801}
802
760/** 803/**
761 * register_tracer - register a tracer with the ftrace system. 804 * register_tracer - register a tracer with the ftrace system.
762 * @type - the plugin for the tracer 805 * @type - the plugin for the tracer
@@ -875,32 +918,6 @@ int register_tracer(struct tracer *type)
875 return ret; 918 return ret;
876} 919}
877 920
878void unregister_tracer(struct tracer *type)
879{
880 struct tracer **t;
881
882 mutex_lock(&trace_types_lock);
883 for (t = &trace_types; *t; t = &(*t)->next) {
884 if (*t == type)
885 goto found;
886 }
887 pr_info("Tracer %s not registered\n", type->name);
888 goto out;
889
890 found:
891 *t = (*t)->next;
892
893 if (type == current_trace && tracer_enabled) {
894 tracer_enabled = 0;
895 tracing_stop();
896 if (current_trace->stop)
897 current_trace->stop(&global_trace);
898 current_trace = &nop_trace;
899 }
900out:
901 mutex_unlock(&trace_types_lock);
902}
903
904void tracing_reset(struct trace_array *tr, int cpu) 921void tracing_reset(struct trace_array *tr, int cpu)
905{ 922{
906 struct ring_buffer *buffer = tr->buffer; 923 struct ring_buffer *buffer = tr->buffer;
@@ -1131,10 +1148,14 @@ void trace_find_cmdline(int pid, char comm[])
1131 1148
1132void tracing_record_cmdline(struct task_struct *tsk) 1149void tracing_record_cmdline(struct task_struct *tsk)
1133{ 1150{
1134 if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled || 1151 if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on())
1135 !tracing_is_on())
1136 return; 1152 return;
1137 1153
1154 if (!__this_cpu_read(trace_cmdline_save))
1155 return;
1156
1157 __this_cpu_write(trace_cmdline_save, false);
1158
1138 trace_save_cmdline(tsk); 1159 trace_save_cmdline(tsk);
1139} 1160}
1140 1161
@@ -1178,27 +1199,36 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer,
1178 return event; 1199 return event;
1179} 1200}
1180 1201
/*
 * Commit @event to @buffer.
 *
 * Before the commit, set the per-cpu trace_cmdline_save flag (which
 * tracing_record_cmdline() tests and clears) and, if trace_wakeup_needed
 * is set, clear it and queue the trace_work_wakeup irq_work.
 */
void
__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
{
	__this_cpu_write(trace_cmdline_save, true);
	if (trace_wakeup_needed) {
		trace_wakeup_needed = false;
		/* irq_work_queue() supplies its own memory barriers */
		irq_work_queue(&trace_work_wakeup);
	}
	ring_buffer_unlock_commit(buffer, event);
}
1213
/*
 * Commit @event through __buffer_unlock_commit(), then hand @buffer,
 * @flags and @pc to ftrace_trace_stack() and ftrace_trace_userstack().
 */
static inline void
__trace_buffer_unlock_commit(struct ring_buffer *buffer,
			     struct ring_buffer_event *event,
			     unsigned long flags, int pc)
{
	__buffer_unlock_commit(buffer, event);

	ftrace_trace_stack(buffer, flags, 6, pc);
	ftrace_trace_userstack(buffer, flags, pc);
}
1195 1224
/*
 * Exported (GPL) entry point that forwards all of its arguments to
 * __trace_buffer_unlock_commit().
 */
void trace_buffer_unlock_commit(struct ring_buffer *buffer,
				struct ring_buffer_event *event,
				unsigned long flags, int pc)
{
	__trace_buffer_unlock_commit(buffer, event, flags, pc);
}
EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
1202 1232
1203struct ring_buffer_event * 1233struct ring_buffer_event *
1204trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, 1234trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
@@ -1215,29 +1245,21 @@ void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
1215 struct ring_buffer_event *event, 1245 struct ring_buffer_event *event,
1216 unsigned long flags, int pc) 1246 unsigned long flags, int pc)
1217{ 1247{
1218 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); 1248 __trace_buffer_unlock_commit(buffer, event, flags, pc);
1219} 1249}
1220EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); 1250EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
1221 1251
/*
 * Like trace_buffer_unlock_commit(), but the kernel stack step goes
 * through ftrace_trace_stack_regs() with the caller-supplied @regs
 * instead of ftrace_trace_stack().
 */
void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer,
				     struct ring_buffer_event *event,
				     unsigned long flags, int pc,
				     struct pt_regs *regs)
{
	__buffer_unlock_commit(buffer, event);

	ftrace_trace_stack_regs(buffer, flags, 0, pc, regs);
	ftrace_trace_userstack(buffer, flags, pc);
}
EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs);
1241 1263
1242void trace_current_buffer_discard_commit(struct ring_buffer *buffer, 1264void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
1243 struct ring_buffer_event *event) 1265 struct ring_buffer_event *event)
@@ -1269,7 +1291,7 @@ trace_function(struct trace_array *tr,
1269 entry->parent_ip = parent_ip; 1291 entry->parent_ip = parent_ip;
1270 1292
1271 if (!filter_check_discard(call, entry, buffer, event)) 1293 if (!filter_check_discard(call, entry, buffer, event))
1272 ring_buffer_unlock_commit(buffer, event); 1294 __buffer_unlock_commit(buffer, event);
1273} 1295}
1274 1296
1275void 1297void
@@ -1362,7 +1384,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1362 entry->size = trace.nr_entries; 1384 entry->size = trace.nr_entries;
1363 1385
1364 if (!filter_check_discard(call, entry, buffer, event)) 1386 if (!filter_check_discard(call, entry, buffer, event))
1365 ring_buffer_unlock_commit(buffer, event); 1387 __buffer_unlock_commit(buffer, event);
1366 1388
1367 out: 1389 out:
1368 /* Again, don't let gcc optimize things here */ 1390 /* Again, don't let gcc optimize things here */
@@ -1458,7 +1480,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1458 1480
1459 save_stack_trace_user(&trace); 1481 save_stack_trace_user(&trace);
1460 if (!filter_check_discard(call, entry, buffer, event)) 1482 if (!filter_check_discard(call, entry, buffer, event))
1461 ring_buffer_unlock_commit(buffer, event); 1483 __buffer_unlock_commit(buffer, event);
1462 1484
1463 out_drop_count: 1485 out_drop_count:
1464 __this_cpu_dec(user_stack_count); 1486 __this_cpu_dec(user_stack_count);
@@ -1559,10 +1581,10 @@ static int alloc_percpu_trace_buffer(void)
1559 return -ENOMEM; 1581 return -ENOMEM;
1560} 1582}
1561 1583
1584static int buffers_allocated;
1585
1562void trace_printk_init_buffers(void) 1586void trace_printk_init_buffers(void)
1563{ 1587{
1564 static int buffers_allocated;
1565
1566 if (buffers_allocated) 1588 if (buffers_allocated)
1567 return; 1589 return;
1568 1590
@@ -1571,7 +1593,38 @@ void trace_printk_init_buffers(void)
1571 1593
1572 pr_info("ftrace: Allocated trace_printk buffers\n"); 1594 pr_info("ftrace: Allocated trace_printk buffers\n");
1573 1595
1596 /* Expand the buffers to set size */
1597 tracing_update_buffers();
1598
1574 buffers_allocated = 1; 1599 buffers_allocated = 1;
1600
1601 /*
1602 * trace_printk_init_buffers() can be called by modules.
1603 * If that happens, then we need to start cmdline recording
1604 * directly here. If the global_trace.buffer is already
1605 * allocated here, then this was called by module code.
1606 */
1607 if (global_trace.buffer)
1608 tracing_start_cmdline_record();
1609}
1610
1611void trace_printk_start_comm(void)
1612{
1613 /* Start tracing comms if trace printk is set */
1614 if (!buffers_allocated)
1615 return;
1616 tracing_start_cmdline_record();
1617}
1618
1619static void trace_printk_start_stop_comm(int enabled)
1620{
1621 if (!buffers_allocated)
1622 return;
1623
1624 if (enabled)
1625 tracing_start_cmdline_record();
1626 else
1627 tracing_stop_cmdline_record();
1575} 1628}
1576 1629
1577/** 1630/**
@@ -1622,7 +1675,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1622 1675
1623 memcpy(entry->buf, tbuffer, sizeof(u32) * len); 1676 memcpy(entry->buf, tbuffer, sizeof(u32) * len);
1624 if (!filter_check_discard(call, entry, buffer, event)) { 1677 if (!filter_check_discard(call, entry, buffer, event)) {
1625 ring_buffer_unlock_commit(buffer, event); 1678 __buffer_unlock_commit(buffer, event);
1626 ftrace_trace_stack(buffer, flags, 6, pc); 1679 ftrace_trace_stack(buffer, flags, 6, pc);
1627 } 1680 }
1628 1681
@@ -1693,7 +1746,7 @@ int trace_array_vprintk(struct trace_array *tr,
1693 memcpy(&entry->buf, tbuffer, len); 1746 memcpy(&entry->buf, tbuffer, len);
1694 entry->buf[len] = '\0'; 1747 entry->buf[len] = '\0';
1695 if (!filter_check_discard(call, entry, buffer, event)) { 1748 if (!filter_check_discard(call, entry, buffer, event)) {
1696 ring_buffer_unlock_commit(buffer, event); 1749 __buffer_unlock_commit(buffer, event);
1697 ftrace_trace_stack(buffer, flags, 6, pc); 1750 ftrace_trace_stack(buffer, flags, 6, pc);
1698 } 1751 }
1699 out: 1752 out:
@@ -2426,6 +2479,10 @@ __tracing_open(struct inode *inode, struct file *file)
2426 if (ring_buffer_overruns(iter->tr->buffer)) 2479 if (ring_buffer_overruns(iter->tr->buffer))
2427 iter->iter_flags |= TRACE_FILE_ANNOTATE; 2480 iter->iter_flags |= TRACE_FILE_ANNOTATE;
2428 2481
2482 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
2483 if (trace_clocks[trace_clock_id].in_ns)
2484 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
2485
2429 /* stop the trace while dumping */ 2486 /* stop the trace while dumping */
2430 tracing_stop(); 2487 tracing_stop();
2431 2488
@@ -2794,26 +2851,19 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2794 2851
2795 if (mask == TRACE_ITER_OVERWRITE) 2852 if (mask == TRACE_ITER_OVERWRITE)
2796 ring_buffer_change_overwrite(global_trace.buffer, enabled); 2853 ring_buffer_change_overwrite(global_trace.buffer, enabled);
2854
2855 if (mask == TRACE_ITER_PRINTK)
2856 trace_printk_start_stop_comm(enabled);
2797} 2857}
2798 2858
2799static ssize_t 2859static int trace_set_options(char *option)
2800tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2801 size_t cnt, loff_t *ppos)
2802{ 2860{
2803 char buf[64];
2804 char *cmp; 2861 char *cmp;
2805 int neg = 0; 2862 int neg = 0;
2806 int ret; 2863 int ret = 0;
2807 int i; 2864 int i;
2808 2865
2809 if (cnt >= sizeof(buf)) 2866 cmp = strstrip(option);
2810 return -EINVAL;
2811
2812 if (copy_from_user(&buf, ubuf, cnt))
2813 return -EFAULT;
2814
2815 buf[cnt] = 0;
2816 cmp = strstrip(buf);
2817 2867
2818 if (strncmp(cmp, "no", 2) == 0) { 2868 if (strncmp(cmp, "no", 2) == 0) {
2819 neg = 1; 2869 neg = 1;
@@ -2832,10 +2882,25 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2832 mutex_lock(&trace_types_lock); 2882 mutex_lock(&trace_types_lock);
2833 ret = set_tracer_option(current_trace, cmp, neg); 2883 ret = set_tracer_option(current_trace, cmp, neg);
2834 mutex_unlock(&trace_types_lock); 2884 mutex_unlock(&trace_types_lock);
2835 if (ret)
2836 return ret;
2837 } 2885 }
2838 2886
2887 return ret;
2888}
2889
2890static ssize_t
2891tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2892 size_t cnt, loff_t *ppos)
2893{
2894 char buf[64];
2895
2896 if (cnt >= sizeof(buf))
2897 return -EINVAL;
2898
2899 if (copy_from_user(&buf, ubuf, cnt))
2900 return -EFAULT;
2901
2902 trace_set_options(buf);
2903
2839 *ppos += cnt; 2904 *ppos += cnt;
2840 2905
2841 return cnt; 2906 return cnt;
@@ -2940,56 +3005,6 @@ static const struct file_operations tracing_saved_cmdlines_fops = {
2940}; 3005};
2941 3006
2942static ssize_t 3007static ssize_t
2943tracing_ctrl_read(struct file *filp, char __user *ubuf,
2944 size_t cnt, loff_t *ppos)
2945{
2946 char buf[64];
2947 int r;
2948
2949 r = sprintf(buf, "%u\n", tracer_enabled);
2950 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2951}
2952
2953static ssize_t
2954tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2955 size_t cnt, loff_t *ppos)
2956{
2957 struct trace_array *tr = filp->private_data;
2958 unsigned long val;
2959 int ret;
2960
2961 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
2962 if (ret)
2963 return ret;
2964
2965 val = !!val;
2966
2967 mutex_lock(&trace_types_lock);
2968 if (tracer_enabled ^ val) {
2969
2970 /* Only need to warn if this is used to change the state */
2971 WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on");
2972
2973 if (val) {
2974 tracer_enabled = 1;
2975 if (current_trace->start)
2976 current_trace->start(tr);
2977 tracing_start();
2978 } else {
2979 tracer_enabled = 0;
2980 tracing_stop();
2981 if (current_trace->stop)
2982 current_trace->stop(tr);
2983 }
2984 }
2985 mutex_unlock(&trace_types_lock);
2986
2987 *ppos += cnt;
2988
2989 return cnt;
2990}
2991
2992static ssize_t
2993tracing_set_trace_read(struct file *filp, char __user *ubuf, 3008tracing_set_trace_read(struct file *filp, char __user *ubuf,
2994 size_t cnt, loff_t *ppos) 3009 size_t cnt, loff_t *ppos)
2995{ 3010{
@@ -3019,6 +3034,31 @@ static void set_buffer_entries(struct trace_array *tr, unsigned long val)
3019 tr->data[cpu]->entries = val; 3034 tr->data[cpu]->entries = val;
3020} 3035}
3021 3036
3037/* resize @tr's buffer to the size of @size_tr's entries */
3038static int resize_buffer_duplicate_size(struct trace_array *tr,
3039 struct trace_array *size_tr, int cpu_id)
3040{
3041 int cpu, ret = 0;
3042
3043 if (cpu_id == RING_BUFFER_ALL_CPUS) {
3044 for_each_tracing_cpu(cpu) {
3045 ret = ring_buffer_resize(tr->buffer,
3046 size_tr->data[cpu]->entries, cpu);
3047 if (ret < 0)
3048 break;
3049 tr->data[cpu]->entries = size_tr->data[cpu]->entries;
3050 }
3051 } else {
3052 ret = ring_buffer_resize(tr->buffer,
3053 size_tr->data[cpu_id]->entries, cpu_id);
3054 if (ret == 0)
3055 tr->data[cpu_id]->entries =
3056 size_tr->data[cpu_id]->entries;
3057 }
3058
3059 return ret;
3060}
3061
3022static int __tracing_resize_ring_buffer(unsigned long size, int cpu) 3062static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3023{ 3063{
3024 int ret; 3064 int ret;
@@ -3030,6 +3070,10 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3030 */ 3070 */
3031 ring_buffer_expanded = 1; 3071 ring_buffer_expanded = 1;
3032 3072
3073 /* May be called before buffers are initialized */
3074 if (!global_trace.buffer)
3075 return 0;
3076
3033 ret = ring_buffer_resize(global_trace.buffer, size, cpu); 3077 ret = ring_buffer_resize(global_trace.buffer, size, cpu);
3034 if (ret < 0) 3078 if (ret < 0)
3035 return ret; 3079 return ret;
@@ -3039,23 +3083,8 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3039 3083
3040 ret = ring_buffer_resize(max_tr.buffer, size, cpu); 3084 ret = ring_buffer_resize(max_tr.buffer, size, cpu);
3041 if (ret < 0) { 3085 if (ret < 0) {
3042 int r = 0; 3086 int r = resize_buffer_duplicate_size(&global_trace,
3043 3087 &global_trace, cpu);
3044 if (cpu == RING_BUFFER_ALL_CPUS) {
3045 int i;
3046 for_each_tracing_cpu(i) {
3047 r = ring_buffer_resize(global_trace.buffer,
3048 global_trace.data[i]->entries,
3049 i);
3050 if (r < 0)
3051 break;
3052 }
3053 } else {
3054 r = ring_buffer_resize(global_trace.buffer,
3055 global_trace.data[cpu]->entries,
3056 cpu);
3057 }
3058
3059 if (r < 0) { 3088 if (r < 0) {
3060 /* 3089 /*
3061 * AARGH! We are left with different 3090 * AARGH! We are left with different
@@ -3193,17 +3222,11 @@ static int tracing_set_tracer(const char *buf)
3193 3222
3194 topts = create_trace_option_files(t); 3223 topts = create_trace_option_files(t);
3195 if (t->use_max_tr) { 3224 if (t->use_max_tr) {
3196 int cpu;
3197 /* we need to make per cpu buffer sizes equivalent */ 3225 /* we need to make per cpu buffer sizes equivalent */
3198 for_each_tracing_cpu(cpu) { 3226 ret = resize_buffer_duplicate_size(&max_tr, &global_trace,
3199 ret = ring_buffer_resize(max_tr.buffer, 3227 RING_BUFFER_ALL_CPUS);
3200 global_trace.data[cpu]->entries, 3228 if (ret < 0)
3201 cpu); 3229 goto out;
3202 if (ret < 0)
3203 goto out;
3204 max_tr.data[cpu]->entries =
3205 global_trace.data[cpu]->entries;
3206 }
3207 } 3230 }
3208 3231
3209 if (t->init) { 3232 if (t->init) {
@@ -3325,6 +3348,10 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3325 if (trace_flags & TRACE_ITER_LATENCY_FMT) 3348 if (trace_flags & TRACE_ITER_LATENCY_FMT)
3326 iter->iter_flags |= TRACE_FILE_LAT_FMT; 3349 iter->iter_flags |= TRACE_FILE_LAT_FMT;
3327 3350
3351 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
3352 if (trace_clocks[trace_clock_id].in_ns)
3353 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
3354
3328 iter->cpu_file = cpu_file; 3355 iter->cpu_file = cpu_file;
3329 iter->tr = &global_trace; 3356 iter->tr = &global_trace;
3330 mutex_init(&iter->mutex); 3357 mutex_init(&iter->mutex);
@@ -3385,19 +3412,6 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
3385 } 3412 }
3386} 3413}
3387 3414
3388
3389void default_wait_pipe(struct trace_iterator *iter)
3390{
3391 DEFINE_WAIT(wait);
3392
3393 prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE);
3394
3395 if (trace_empty(iter))
3396 schedule();
3397
3398 finish_wait(&trace_wait, &wait);
3399}
3400
3401/* 3415/*
3402 * This is a make-shift waitqueue. 3416 * This is a make-shift waitqueue.
3403 * A tracer might use this callback on some rare cases: 3417 * A tracer might use this callback on some rare cases:
@@ -3438,7 +3452,7 @@ static int tracing_wait_pipe(struct file *filp)
3438 return -EINTR; 3452 return -EINTR;
3439 3453
3440 /* 3454 /*
3441 * We block until we read something and tracing is disabled. 3455 * We block until we read something and tracing is enabled.
3442 * We still block if tracing is disabled, but we have never 3456 * We still block if tracing is disabled, but we have never
3443 * read anything. This allows a user to cat this file, and 3457 * read anything. This allows a user to cat this file, and
3444 * then enable tracing. But after we have read something, 3458 * then enable tracing. But after we have read something,
@@ -3446,7 +3460,7 @@ static int tracing_wait_pipe(struct file *filp)
3446 * 3460 *
3447 * iter->pos will be 0 if we haven't read anything. 3461 * iter->pos will be 0 if we haven't read anything.
3448 */ 3462 */
3449 if (!tracer_enabled && iter->pos) 3463 if (tracing_is_enabled() && iter->pos)
3450 break; 3464 break;
3451 } 3465 }
3452 3466
@@ -3955,7 +3969,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3955 } else 3969 } else
3956 entry->buf[cnt] = '\0'; 3970 entry->buf[cnt] = '\0';
3957 3971
3958 ring_buffer_unlock_commit(buffer, event); 3972 __buffer_unlock_commit(buffer, event);
3959 3973
3960 written = cnt; 3974 written = cnt;
3961 3975
@@ -4016,6 +4030,14 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4016 if (max_tr.buffer) 4030 if (max_tr.buffer)
4017 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); 4031 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func);
4018 4032
4033 /*
4034 * New clock may not be consistent with the previous clock.
4035 * Reset the buffer so that it doesn't have incomparable timestamps.
4036 */
4037 tracing_reset_online_cpus(&global_trace);
4038 if (max_tr.buffer)
4039 tracing_reset_online_cpus(&max_tr);
4040
4019 mutex_unlock(&trace_types_lock); 4041 mutex_unlock(&trace_types_lock);
4020 4042
4021 *fpos += cnt; 4043 *fpos += cnt;
@@ -4037,13 +4059,6 @@ static const struct file_operations tracing_max_lat_fops = {
4037 .llseek = generic_file_llseek, 4059 .llseek = generic_file_llseek,
4038}; 4060};
4039 4061
4040static const struct file_operations tracing_ctrl_fops = {
4041 .open = tracing_open_generic,
4042 .read = tracing_ctrl_read,
4043 .write = tracing_ctrl_write,
4044 .llseek = generic_file_llseek,
4045};
4046
4047static const struct file_operations set_tracer_fops = { 4062static const struct file_operations set_tracer_fops = {
4048 .open = tracing_open_generic, 4063 .open = tracing_open_generic,
4049 .read = tracing_set_trace_read, 4064 .read = tracing_set_trace_read,
@@ -4260,13 +4275,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4260 return -ENOMEM; 4275 return -ENOMEM;
4261 4276
4262 if (*ppos & (PAGE_SIZE - 1)) { 4277 if (*ppos & (PAGE_SIZE - 1)) {
4263 WARN_ONCE(1, "Ftrace: previous read must page-align\n");
4264 ret = -EINVAL; 4278 ret = -EINVAL;
4265 goto out; 4279 goto out;
4266 } 4280 }
4267 4281
4268 if (len & (PAGE_SIZE - 1)) { 4282 if (len & (PAGE_SIZE - 1)) {
4269 WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
4270 if (len < PAGE_SIZE) { 4283 if (len < PAGE_SIZE) {
4271 ret = -EINVAL; 4284 ret = -EINVAL;
4272 goto out; 4285 goto out;
@@ -4377,13 +4390,27 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4377 cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); 4390 cnt = ring_buffer_bytes_cpu(tr->buffer, cpu);
4378 trace_seq_printf(s, "bytes: %ld\n", cnt); 4391 trace_seq_printf(s, "bytes: %ld\n", cnt);
4379 4392
4380 t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); 4393 if (trace_clocks[trace_clock_id].in_ns) {
4381 usec_rem = do_div(t, USEC_PER_SEC); 4394 /* local or global for trace_clock */
4382 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); 4395 t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu));
4396 usec_rem = do_div(t, USEC_PER_SEC);
4397 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n",
4398 t, usec_rem);
4383 4399
4384 t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); 4400 t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu));
4385 usec_rem = do_div(t, USEC_PER_SEC); 4401 usec_rem = do_div(t, USEC_PER_SEC);
4386 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); 4402 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
4403 } else {
4404 /* counter or tsc mode for trace_clock */
4405 trace_seq_printf(s, "oldest event ts: %llu\n",
4406 ring_buffer_oldest_event_ts(tr->buffer, cpu));
4407
4408 trace_seq_printf(s, "now ts: %llu\n",
4409 ring_buffer_time_stamp(tr->buffer, cpu));
4410 }
4411
4412 cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu);
4413 trace_seq_printf(s, "dropped events: %ld\n", cnt);
4387 4414
4388 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 4415 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
4389 4416
@@ -4815,9 +4842,6 @@ static __init int tracer_init_debugfs(void)
4815 4842
4816 d_tracer = tracing_init_dentry(); 4843 d_tracer = tracing_init_dentry();
4817 4844
4818 trace_create_file("tracing_enabled", 0644, d_tracer,
4819 &global_trace, &tracing_ctrl_fops);
4820
4821 trace_create_file("trace_options", 0644, d_tracer, 4845 trace_create_file("trace_options", 0644, d_tracer,
4822 NULL, &tracing_iter_fops); 4846 NULL, &tracing_iter_fops);
4823 4847
@@ -5089,6 +5113,7 @@ __init static int tracer_alloc_buffers(void)
5089 5113
5090 /* Only allocate trace_printk buffers if a trace_printk exists */ 5114 /* Only allocate trace_printk buffers if a trace_printk exists */
5091 if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) 5115 if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt)
5116 /* Must be called before global_trace.buffer is allocated */
5092 trace_printk_init_buffers(); 5117 trace_printk_init_buffers();
5093 5118
5094 /* To save memory, keep the ring buffer size to its minimum */ 5119 /* To save memory, keep the ring buffer size to its minimum */
@@ -5136,6 +5161,7 @@ __init static int tracer_alloc_buffers(void)
5136#endif 5161#endif
5137 5162
5138 trace_init_cmdlines(); 5163 trace_init_cmdlines();
5164 init_irq_work(&trace_work_wakeup, trace_wake_up);
5139 5165
5140 register_tracer(&nop_trace); 5166 register_tracer(&nop_trace);
5141 current_trace = &nop_trace; 5167 current_trace = &nop_trace;
@@ -5147,6 +5173,13 @@ __init static int tracer_alloc_buffers(void)
5147 5173
5148 register_die_notifier(&trace_die_notifier); 5174 register_die_notifier(&trace_die_notifier);
5149 5175
5176 while (trace_boot_options) {
5177 char *option;
5178
5179 option = strsep(&trace_boot_options, ",");
5180 trace_set_options(option);
5181 }
5182
5150 return 0; 5183 return 0;
5151 5184
5152out_free_cpumask: 5185out_free_cpumask:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c15f528c1af4..c75d7988902c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -285,8 +285,8 @@ struct tracer {
285 int (*set_flag)(u32 old_flags, u32 bit, int set); 285 int (*set_flag)(u32 old_flags, u32 bit, int set);
286 struct tracer *next; 286 struct tracer *next;
287 struct tracer_flags *flags; 287 struct tracer_flags *flags;
288 int print_max; 288 bool print_max;
289 int use_max_tr; 289 bool use_max_tr;
290}; 290};
291 291
292 292
@@ -327,7 +327,6 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)
327 327
328int tracer_init(struct tracer *t, struct trace_array *tr); 328int tracer_init(struct tracer *t, struct trace_array *tr);
329int tracing_is_enabled(void); 329int tracing_is_enabled(void);
330void trace_wake_up(void);
331void tracing_reset(struct trace_array *tr, int cpu); 330void tracing_reset(struct trace_array *tr, int cpu);
332void tracing_reset_online_cpus(struct trace_array *tr); 331void tracing_reset_online_cpus(struct trace_array *tr);
333void tracing_reset_current(int cpu); 332void tracing_reset_current(int cpu);
@@ -349,9 +348,6 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer,
349 unsigned long len, 348 unsigned long len,
350 unsigned long flags, 349 unsigned long flags,
351 int pc); 350 int pc);
352void trace_buffer_unlock_commit(struct ring_buffer *buffer,
353 struct ring_buffer_event *event,
354 unsigned long flags, int pc);
355 351
356struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, 352struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
357 struct trace_array_cpu *data); 353 struct trace_array_cpu *data);
@@ -359,6 +355,9 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
359struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 355struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
360 int *ent_cpu, u64 *ent_ts); 356 int *ent_cpu, u64 *ent_ts);
361 357
358void __buffer_unlock_commit(struct ring_buffer *buffer,
359 struct ring_buffer_event *event);
360
362int trace_empty(struct trace_iterator *iter); 361int trace_empty(struct trace_iterator *iter);
363 362
364void *trace_find_next_entry_inc(struct trace_iterator *iter); 363void *trace_find_next_entry_inc(struct trace_iterator *iter);
@@ -367,7 +366,6 @@ void trace_init_global_iter(struct trace_iterator *iter);
367 366
368void tracing_iter_reset(struct trace_iterator *iter, int cpu); 367void tracing_iter_reset(struct trace_iterator *iter, int cpu);
369 368
370void default_wait_pipe(struct trace_iterator *iter);
371void poll_wait_pipe(struct trace_iterator *iter); 369void poll_wait_pipe(struct trace_iterator *iter);
372 370
373void ftrace(struct trace_array *tr, 371void ftrace(struct trace_array *tr,
@@ -407,12 +405,7 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr);
407void tracing_stop_sched_switch_record(void); 405void tracing_stop_sched_switch_record(void);
408void tracing_start_sched_switch_record(void); 406void tracing_start_sched_switch_record(void);
409int register_tracer(struct tracer *type); 407int register_tracer(struct tracer *type);
410void unregister_tracer(struct tracer *type);
411int is_tracing_stopped(void); 408int is_tracing_stopped(void);
412enum trace_file_type {
413 TRACE_FILE_LAT_FMT = 1,
414 TRACE_FILE_ANNOTATE = 2,
415};
416 409
417extern cpumask_var_t __read_mostly tracing_buffer_mask; 410extern cpumask_var_t __read_mostly tracing_buffer_mask;
418 411
@@ -841,6 +834,7 @@ extern const char *__start___trace_bprintk_fmt[];
841extern const char *__stop___trace_bprintk_fmt[]; 834extern const char *__stop___trace_bprintk_fmt[];
842 835
843void trace_printk_init_buffers(void); 836void trace_printk_init_buffers(void);
837void trace_printk_start_comm(void);
844 838
845#undef FTRACE_ENTRY 839#undef FTRACE_ENTRY
846#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ 840#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 8d3538b4ea5f..95e96842ed29 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -77,7 +77,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
77 entry->correct = val == expect; 77 entry->correct = val == expect;
78 78
79 if (!filter_check_discard(call, entry, buffer, event)) 79 if (!filter_check_discard(call, entry, buffer, event))
80 ring_buffer_unlock_commit(buffer, event); 80 __buffer_unlock_commit(buffer, event);
81 81
82 out: 82 out:
83 atomic_dec(&tr->data[cpu]->disabled); 83 atomic_dec(&tr->data[cpu]->disabled);
@@ -199,7 +199,7 @@ __init static int init_branch_tracer(void)
199 } 199 }
200 return register_tracer(&branch_trace); 200 return register_tracer(&branch_trace);
201} 201}
202device_initcall(init_branch_tracer); 202core_initcall(init_branch_tracer);
203 203
204#else 204#else
205static inline 205static inline
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index d608d09d08c0..880073d0b946 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -491,19 +491,6 @@ static void t_stop(struct seq_file *m, void *p)
491 mutex_unlock(&event_mutex); 491 mutex_unlock(&event_mutex);
492} 492}
493 493
494static int
495ftrace_event_seq_open(struct inode *inode, struct file *file)
496{
497 const struct seq_operations *seq_ops;
498
499 if ((file->f_mode & FMODE_WRITE) &&
500 (file->f_flags & O_TRUNC))
501 ftrace_clear_events();
502
503 seq_ops = inode->i_private;
504 return seq_open(file, seq_ops);
505}
506
507static ssize_t 494static ssize_t
508event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, 495event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
509 loff_t *ppos) 496 loff_t *ppos)
@@ -980,6 +967,9 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
980 return r; 967 return r;
981} 968}
982 969
970static int ftrace_event_avail_open(struct inode *inode, struct file *file);
971static int ftrace_event_set_open(struct inode *inode, struct file *file);
972
983static const struct seq_operations show_event_seq_ops = { 973static const struct seq_operations show_event_seq_ops = {
984 .start = t_start, 974 .start = t_start,
985 .next = t_next, 975 .next = t_next,
@@ -995,14 +985,14 @@ static const struct seq_operations show_set_event_seq_ops = {
995}; 985};
996 986
997static const struct file_operations ftrace_avail_fops = { 987static const struct file_operations ftrace_avail_fops = {
998 .open = ftrace_event_seq_open, 988 .open = ftrace_event_avail_open,
999 .read = seq_read, 989 .read = seq_read,
1000 .llseek = seq_lseek, 990 .llseek = seq_lseek,
1001 .release = seq_release, 991 .release = seq_release,
1002}; 992};
1003 993
1004static const struct file_operations ftrace_set_event_fops = { 994static const struct file_operations ftrace_set_event_fops = {
1005 .open = ftrace_event_seq_open, 995 .open = ftrace_event_set_open,
1006 .read = seq_read, 996 .read = seq_read,
1007 .write = ftrace_event_write, 997 .write = ftrace_event_write,
1008 .llseek = seq_lseek, 998 .llseek = seq_lseek,
@@ -1078,6 +1068,26 @@ static struct dentry *event_trace_events_dir(void)
1078 return d_events; 1068 return d_events;
1079} 1069}
1080 1070
1071static int
1072ftrace_event_avail_open(struct inode *inode, struct file *file)
1073{
1074 const struct seq_operations *seq_ops = &show_event_seq_ops;
1075
1076 return seq_open(file, seq_ops);
1077}
1078
1079static int
1080ftrace_event_set_open(struct inode *inode, struct file *file)
1081{
1082 const struct seq_operations *seq_ops = &show_set_event_seq_ops;
1083
1084 if ((file->f_mode & FMODE_WRITE) &&
1085 (file->f_flags & O_TRUNC))
1086 ftrace_clear_events();
1087
1088 return seq_open(file, seq_ops);
1089}
1090
1081static struct dentry * 1091static struct dentry *
1082event_subsystem_dir(const char *name, struct dentry *d_events) 1092event_subsystem_dir(const char *name, struct dentry *d_events)
1083{ 1093{
@@ -1489,6 +1499,9 @@ static __init int event_trace_enable(void)
1489 if (ret) 1499 if (ret)
1490 pr_warn("Failed to enable trace event: %s\n", token); 1500 pr_warn("Failed to enable trace event: %s\n", token);
1491 } 1501 }
1502
1503 trace_printk_start_comm();
1504
1492 return 0; 1505 return 0;
1493} 1506}
1494 1507
@@ -1505,15 +1518,13 @@ static __init int event_trace_init(void)
1505 return 0; 1518 return 0;
1506 1519
1507 entry = debugfs_create_file("available_events", 0444, d_tracer, 1520 entry = debugfs_create_file("available_events", 0444, d_tracer,
1508 (void *)&show_event_seq_ops, 1521 NULL, &ftrace_avail_fops);
1509 &ftrace_avail_fops);
1510 if (!entry) 1522 if (!entry)
1511 pr_warning("Could not create debugfs " 1523 pr_warning("Could not create debugfs "
1512 "'available_events' entry\n"); 1524 "'available_events' entry\n");
1513 1525
1514 entry = debugfs_create_file("set_event", 0644, d_tracer, 1526 entry = debugfs_create_file("set_event", 0644, d_tracer,
1515 (void *)&show_set_event_seq_ops, 1527 NULL, &ftrace_set_event_fops);
1516 &ftrace_set_event_fops);
1517 if (!entry) 1528 if (!entry)
1518 pr_warning("Could not create debugfs " 1529 pr_warning("Could not create debugfs "
1519 "'set_event' entry\n"); 1530 "'set_event' entry\n");
@@ -1749,7 +1760,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
1749 entry->ip = ip; 1760 entry->ip = ip;
1750 entry->parent_ip = parent_ip; 1761 entry->parent_ip = parent_ip;
1751 1762
1752 trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); 1763 trace_buffer_unlock_commit(buffer, event, flags, pc);
1753 1764
1754 out: 1765 out:
1755 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); 1766 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index c154797a7ff7..e5b0ca8b8d4d 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1000,9 +1000,9 @@ static int init_pred(struct filter_parse_state *ps,
1000 } 1000 }
1001 } else { 1001 } else {
1002 if (field->is_signed) 1002 if (field->is_signed)
1003 ret = strict_strtoll(pred->regex.pattern, 0, &val); 1003 ret = kstrtoll(pred->regex.pattern, 0, &val);
1004 else 1004 else
1005 ret = strict_strtoull(pred->regex.pattern, 0, &val); 1005 ret = kstrtoull(pred->regex.pattern, 0, &val);
1006 if (ret) { 1006 if (ret) {
1007 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); 1007 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
1008 return -EINVAL; 1008 return -EINVAL;
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 507a7a9630bf..8e3ad8082ab7 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -7,7 +7,7 @@
7 * Based on code from the latency_tracer, that is: 7 * Based on code from the latency_tracer, that is:
8 * 8 *
9 * Copyright (C) 2004-2006 Ingo Molnar 9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III 10 * Copyright (C) 2004 Nadia Yvette Chambers
11 */ 11 */
12#include <linux/ring_buffer.h> 12#include <linux/ring_buffer.h>
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
@@ -366,7 +366,7 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
366 * We use the callback data field (which is a pointer) 366 * We use the callback data field (which is a pointer)
367 * as our counter. 367 * as our counter.
368 */ 368 */
369 ret = strict_strtoul(number, 0, (unsigned long *)&count); 369 ret = kstrtoul(number, 0, (unsigned long *)&count);
370 if (ret) 370 if (ret)
371 return ret; 371 return ret;
372 372
@@ -411,5 +411,4 @@ static __init int init_function_trace(void)
411 init_func_cmd_traceon(); 411 init_func_cmd_traceon();
412 return register_tracer(&function_trace); 412 return register_tracer(&function_trace);
413} 413}
414device_initcall(init_function_trace); 414core_initcall(init_function_trace);
415
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 99b4378393d5..4edb4b74eb7e 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -223,7 +223,7 @@ int __trace_graph_entry(struct trace_array *tr,
223 entry = ring_buffer_event_data(event); 223 entry = ring_buffer_event_data(event);
224 entry->graph_ent = *trace; 224 entry->graph_ent = *trace;
225 if (!filter_current_check_discard(buffer, call, entry, event)) 225 if (!filter_current_check_discard(buffer, call, entry, event))
226 ring_buffer_unlock_commit(buffer, event); 226 __buffer_unlock_commit(buffer, event);
227 227
228 return 1; 228 return 1;
229} 229}
@@ -327,7 +327,7 @@ void __trace_graph_return(struct trace_array *tr,
327 entry = ring_buffer_event_data(event); 327 entry = ring_buffer_event_data(event);
328 entry->ret = *trace; 328 entry->ret = *trace;
329 if (!filter_current_check_discard(buffer, call, entry, event)) 329 if (!filter_current_check_discard(buffer, call, entry, event))
330 ring_buffer_unlock_commit(buffer, event); 330 __buffer_unlock_commit(buffer, event);
331} 331}
332 332
333void trace_graph_return(struct ftrace_graph_ret *trace) 333void trace_graph_return(struct ftrace_graph_ret *trace)
@@ -1474,4 +1474,4 @@ static __init int init_graph_trace(void)
1474 return register_tracer(&graph_trace); 1474 return register_tracer(&graph_trace);
1475} 1475}
1476 1476
1477device_initcall(init_graph_trace); 1477core_initcall(init_graph_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index d98ee8283b29..713a2cac4881 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -7,7 +7,7 @@
7 * From code in the latency_tracer, that is: 7 * From code in the latency_tracer, that is:
8 * 8 *
9 * Copyright (C) 2004-2006 Ingo Molnar 9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III 10 * Copyright (C) 2004 Nadia Yvette Chambers
11 */ 11 */
12#include <linux/kallsyms.h> 12#include <linux/kallsyms.h>
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
@@ -604,7 +604,7 @@ static struct tracer irqsoff_tracer __read_mostly =
604 .reset = irqsoff_tracer_reset, 604 .reset = irqsoff_tracer_reset,
605 .start = irqsoff_tracer_start, 605 .start = irqsoff_tracer_start,
606 .stop = irqsoff_tracer_stop, 606 .stop = irqsoff_tracer_stop,
607 .print_max = 1, 607 .print_max = true,
608 .print_header = irqsoff_print_header, 608 .print_header = irqsoff_print_header,
609 .print_line = irqsoff_print_line, 609 .print_line = irqsoff_print_line,
610 .flags = &tracer_flags, 610 .flags = &tracer_flags,
@@ -614,7 +614,7 @@ static struct tracer irqsoff_tracer __read_mostly =
614#endif 614#endif
615 .open = irqsoff_trace_open, 615 .open = irqsoff_trace_open,
616 .close = irqsoff_trace_close, 616 .close = irqsoff_trace_close,
617 .use_max_tr = 1, 617 .use_max_tr = true,
618}; 618};
619# define register_irqsoff(trace) register_tracer(&trace) 619# define register_irqsoff(trace) register_tracer(&trace)
620#else 620#else
@@ -637,7 +637,7 @@ static struct tracer preemptoff_tracer __read_mostly =
637 .reset = irqsoff_tracer_reset, 637 .reset = irqsoff_tracer_reset,
638 .start = irqsoff_tracer_start, 638 .start = irqsoff_tracer_start,
639 .stop = irqsoff_tracer_stop, 639 .stop = irqsoff_tracer_stop,
640 .print_max = 1, 640 .print_max = true,
641 .print_header = irqsoff_print_header, 641 .print_header = irqsoff_print_header,
642 .print_line = irqsoff_print_line, 642 .print_line = irqsoff_print_line,
643 .flags = &tracer_flags, 643 .flags = &tracer_flags,
@@ -647,7 +647,7 @@ static struct tracer preemptoff_tracer __read_mostly =
647#endif 647#endif
648 .open = irqsoff_trace_open, 648 .open = irqsoff_trace_open,
649 .close = irqsoff_trace_close, 649 .close = irqsoff_trace_close,
650 .use_max_tr = 1, 650 .use_max_tr = true,
651}; 651};
652# define register_preemptoff(trace) register_tracer(&trace) 652# define register_preemptoff(trace) register_tracer(&trace)
653#else 653#else
@@ -672,7 +672,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
672 .reset = irqsoff_tracer_reset, 672 .reset = irqsoff_tracer_reset,
673 .start = irqsoff_tracer_start, 673 .start = irqsoff_tracer_start,
674 .stop = irqsoff_tracer_stop, 674 .stop = irqsoff_tracer_stop,
675 .print_max = 1, 675 .print_max = true,
676 .print_header = irqsoff_print_header, 676 .print_header = irqsoff_print_header,
677 .print_line = irqsoff_print_line, 677 .print_line = irqsoff_print_line,
678 .flags = &tracer_flags, 678 .flags = &tracer_flags,
@@ -682,7 +682,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
682#endif 682#endif
683 .open = irqsoff_trace_open, 683 .open = irqsoff_trace_open,
684 .close = irqsoff_trace_close, 684 .close = irqsoff_trace_close,
685 .use_max_tr = 1, 685 .use_max_tr = true,
686}; 686};
687 687
688# define register_preemptirqsoff(trace) register_tracer(&trace) 688# define register_preemptirqsoff(trace) register_tracer(&trace)
@@ -698,4 +698,4 @@ __init static int init_irqsoff_tracer(void)
698 698
699 return 0; 699 return 0;
700} 700}
701device_initcall(init_irqsoff_tracer); 701core_initcall(init_irqsoff_tracer);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1a2117043bb1..1865d5f76538 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -444,7 +444,7 @@ static int create_trace_probe(int argc, char **argv)
444 return -EINVAL; 444 return -EINVAL;
445 } 445 }
446 /* an address specified */ 446 /* an address specified */
447 ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr); 447 ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr);
448 if (ret) { 448 if (ret) {
449 pr_info("Failed to parse address.\n"); 449 pr_info("Failed to parse address.\n");
450 return ret; 450 return ret;
@@ -751,8 +751,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
751 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 751 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
752 752
753 if (!filter_current_check_discard(buffer, call, entry, event)) 753 if (!filter_current_check_discard(buffer, call, entry, event))
754 trace_nowake_buffer_unlock_commit_regs(buffer, event, 754 trace_buffer_unlock_commit_regs(buffer, event,
755 irq_flags, pc, regs); 755 irq_flags, pc, regs);
756} 756}
757 757
758/* Kretprobe handler */ 758/* Kretprobe handler */
@@ -784,8 +784,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
784 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 784 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
785 785
786 if (!filter_current_check_discard(buffer, call, entry, event)) 786 if (!filter_current_check_discard(buffer, call, entry, event))
787 trace_nowake_buffer_unlock_commit_regs(buffer, event, 787 trace_buffer_unlock_commit_regs(buffer, event,
788 irq_flags, pc, regs); 788 irq_flags, pc, regs);
789} 789}
790 790
791/* Event entry printers */ 791/* Event entry printers */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 123b189c732c..194d79602dc7 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -610,24 +610,54 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
610 return trace_print_lat_fmt(s, entry); 610 return trace_print_lat_fmt(s, entry);
611} 611}
612 612
613static unsigned long preempt_mark_thresh = 100; 613static unsigned long preempt_mark_thresh_us = 100;
614 614
615static int 615static int
616lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, 616lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
617 unsigned long rel_usecs)
618{ 617{
619 return trace_seq_printf(s, " %4lldus%c: ", abs_usecs, 618 unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE;
620 rel_usecs > preempt_mark_thresh ? '!' : 619 unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS;
621 rel_usecs > 1 ? '+' : ' '); 620 unsigned long long abs_ts = iter->ts - iter->tr->time_start;
621 unsigned long long rel_ts = next_ts - iter->ts;
622 struct trace_seq *s = &iter->seq;
623
624 if (in_ns) {
625 abs_ts = ns2usecs(abs_ts);
626 rel_ts = ns2usecs(rel_ts);
627 }
628
629 if (verbose && in_ns) {
630 unsigned long abs_usec = do_div(abs_ts, USEC_PER_MSEC);
631 unsigned long abs_msec = (unsigned long)abs_ts;
632 unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC);
633 unsigned long rel_msec = (unsigned long)rel_ts;
634
635 return trace_seq_printf(
636 s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ",
637 ns2usecs(iter->ts),
638 abs_msec, abs_usec,
639 rel_msec, rel_usec);
640 } else if (verbose && !in_ns) {
641 return trace_seq_printf(
642 s, "[%016llx] %lld (+%lld): ",
643 iter->ts, abs_ts, rel_ts);
644 } else if (!verbose && in_ns) {
645 return trace_seq_printf(
646 s, " %4lldus%c: ",
647 abs_ts,
648 rel_ts > preempt_mark_thresh_us ? '!' :
649 rel_ts > 1 ? '+' : ' ');
650 } else { /* !verbose && !in_ns */
651 return trace_seq_printf(s, " %4lld: ", abs_ts);
652 }
622} 653}
623 654
624int trace_print_context(struct trace_iterator *iter) 655int trace_print_context(struct trace_iterator *iter)
625{ 656{
626 struct trace_seq *s = &iter->seq; 657 struct trace_seq *s = &iter->seq;
627 struct trace_entry *entry = iter->ent; 658 struct trace_entry *entry = iter->ent;
628 unsigned long long t = ns2usecs(iter->ts); 659 unsigned long long t;
629 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 660 unsigned long secs, usec_rem;
630 unsigned long secs = (unsigned long)t;
631 char comm[TASK_COMM_LEN]; 661 char comm[TASK_COMM_LEN];
632 int ret; 662 int ret;
633 663
@@ -644,8 +674,13 @@ int trace_print_context(struct trace_iterator *iter)
644 return 0; 674 return 0;
645 } 675 }
646 676
647 return trace_seq_printf(s, " %5lu.%06lu: ", 677 if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
648 secs, usec_rem); 678 t = ns2usecs(iter->ts);
679 usec_rem = do_div(t, USEC_PER_SEC);
680 secs = (unsigned long)t;
681 return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
682 } else
683 return trace_seq_printf(s, " %12llu: ", iter->ts);
649} 684}
650 685
651int trace_print_lat_context(struct trace_iterator *iter) 686int trace_print_lat_context(struct trace_iterator *iter)
@@ -659,36 +694,29 @@ int trace_print_lat_context(struct trace_iterator *iter)
659 *next_entry = trace_find_next_entry(iter, NULL, 694 *next_entry = trace_find_next_entry(iter, NULL,
660 &next_ts); 695 &next_ts);
661 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); 696 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
662 unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
663 unsigned long rel_usecs;
664 697
665 /* Restore the original ent_size */ 698 /* Restore the original ent_size */
666 iter->ent_size = ent_size; 699 iter->ent_size = ent_size;
667 700
668 if (!next_entry) 701 if (!next_entry)
669 next_ts = iter->ts; 702 next_ts = iter->ts;
670 rel_usecs = ns2usecs(next_ts - iter->ts);
671 703
672 if (verbose) { 704 if (verbose) {
673 char comm[TASK_COMM_LEN]; 705 char comm[TASK_COMM_LEN];
674 706
675 trace_find_cmdline(entry->pid, comm); 707 trace_find_cmdline(entry->pid, comm);
676 708
677 ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]" 709 ret = trace_seq_printf(
678 " %ld.%03ldms (+%ld.%03ldms): ", comm, 710 s, "%16s %5d %3d %d %08x %08lx ",
679 entry->pid, iter->cpu, entry->flags, 711 comm, entry->pid, iter->cpu, entry->flags,
680 entry->preempt_count, iter->idx, 712 entry->preempt_count, iter->idx);
681 ns2usecs(iter->ts),
682 abs_usecs / USEC_PER_MSEC,
683 abs_usecs % USEC_PER_MSEC,
684 rel_usecs / USEC_PER_MSEC,
685 rel_usecs % USEC_PER_MSEC);
686 } else { 713 } else {
687 ret = lat_print_generic(s, entry, iter->cpu); 714 ret = lat_print_generic(s, entry, iter->cpu);
688 if (ret)
689 ret = lat_print_timestamp(s, abs_usecs, rel_usecs);
690 } 715 }
691 716
717 if (ret)
718 ret = lat_print_timestamp(iter, next_ts);
719
692 return ret; 720 return ret;
693} 721}
694 722
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index daa9980153af..412e959709b4 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -441,7 +441,7 @@ static const struct fetch_type *find_fetch_type(const char *type)
441 goto fail; 441 goto fail;
442 442
443 type++; 443 type++;
444 if (strict_strtoul(type, 0, &bs)) 444 if (kstrtoul(type, 0, &bs))
445 goto fail; 445 goto fail;
446 446
447 switch (bs) { 447 switch (bs) {
@@ -501,8 +501,8 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset)
501 501
502 tmp = strchr(symbol, '+'); 502 tmp = strchr(symbol, '+');
503 if (tmp) { 503 if (tmp) {
504 /* skip sign because strict_strtol doesn't accept '+' */ 504 /* skip sign because kstrtoul doesn't accept '+' */
505 ret = strict_strtoul(tmp + 1, 0, offset); 505 ret = kstrtoul(tmp + 1, 0, offset);
506 if (ret) 506 if (ret)
507 return ret; 507 return ret;
508 508
@@ -533,7 +533,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
533 else 533 else
534 ret = -EINVAL; 534 ret = -EINVAL;
535 } else if (isdigit(arg[5])) { 535 } else if (isdigit(arg[5])) {
536 ret = strict_strtoul(arg + 5, 10, &param); 536 ret = kstrtoul(arg + 5, 10, &param);
537 if (ret || param > PARAM_MAX_STACK) 537 if (ret || param > PARAM_MAX_STACK)
538 ret = -EINVAL; 538 ret = -EINVAL;
539 else { 539 else {
@@ -579,7 +579,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
579 579
580 case '@': /* memory or symbol */ 580 case '@': /* memory or symbol */
581 if (isdigit(arg[1])) { 581 if (isdigit(arg[1])) {
582 ret = strict_strtoul(arg + 1, 0, &param); 582 ret = kstrtoul(arg + 1, 0, &param);
583 if (ret) 583 if (ret)
584 break; 584 break;
585 585
@@ -597,14 +597,14 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
597 break; 597 break;
598 598
599 case '+': /* deref memory */ 599 case '+': /* deref memory */
600 arg++; /* Skip '+', because strict_strtol() rejects it. */ 600 arg++; /* Skip '+', because kstrtol() rejects it. */
601 case '-': 601 case '-':
602 tmp = strchr(arg, '('); 602 tmp = strchr(arg, '(');
603 if (!tmp) 603 if (!tmp)
604 break; 604 break;
605 605
606 *tmp = '\0'; 606 *tmp = '\0';
607 ret = strict_strtol(arg, 0, &offset); 607 ret = kstrtol(arg, 0, &offset);
608 608
609 if (ret) 609 if (ret)
610 break; 610 break;
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 7e62c0a18456..3374c792ccd8 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -102,9 +102,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
102 entry->next_cpu = task_cpu(wakee); 102 entry->next_cpu = task_cpu(wakee);
103 103
104 if (!filter_check_discard(call, entry, buffer, event)) 104 if (!filter_check_discard(call, entry, buffer, event))
105 ring_buffer_unlock_commit(buffer, event); 105 trace_buffer_unlock_commit(buffer, event, flags, pc);
106 ftrace_trace_stack(tr->buffer, flags, 6, pc);
107 ftrace_trace_userstack(tr->buffer, flags, pc);
108} 106}
109 107
110static void 108static void
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 02170c00c413..9fe45fcefca0 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -7,7 +7,7 @@
7 * Based on code from the latency_tracer, that is: 7 * Based on code from the latency_tracer, that is:
8 * 8 *
9 * Copyright (C) 2004-2006 Ingo Molnar 9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III 10 * Copyright (C) 2004 Nadia Yvette Chambers
11 */ 11 */
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
@@ -589,7 +589,7 @@ static struct tracer wakeup_tracer __read_mostly =
589 .reset = wakeup_tracer_reset, 589 .reset = wakeup_tracer_reset,
590 .start = wakeup_tracer_start, 590 .start = wakeup_tracer_start,
591 .stop = wakeup_tracer_stop, 591 .stop = wakeup_tracer_stop,
592 .print_max = 1, 592 .print_max = true,
593 .print_header = wakeup_print_header, 593 .print_header = wakeup_print_header,
594 .print_line = wakeup_print_line, 594 .print_line = wakeup_print_line,
595 .flags = &tracer_flags, 595 .flags = &tracer_flags,
@@ -599,7 +599,7 @@ static struct tracer wakeup_tracer __read_mostly =
599#endif 599#endif
600 .open = wakeup_trace_open, 600 .open = wakeup_trace_open,
601 .close = wakeup_trace_close, 601 .close = wakeup_trace_close,
602 .use_max_tr = 1, 602 .use_max_tr = true,
603}; 603};
604 604
605static struct tracer wakeup_rt_tracer __read_mostly = 605static struct tracer wakeup_rt_tracer __read_mostly =
@@ -610,7 +610,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
610 .start = wakeup_tracer_start, 610 .start = wakeup_tracer_start,
611 .stop = wakeup_tracer_stop, 611 .stop = wakeup_tracer_stop,
612 .wait_pipe = poll_wait_pipe, 612 .wait_pipe = poll_wait_pipe,
613 .print_max = 1, 613 .print_max = true,
614 .print_header = wakeup_print_header, 614 .print_header = wakeup_print_header,
615 .print_line = wakeup_print_line, 615 .print_line = wakeup_print_line,
616 .flags = &tracer_flags, 616 .flags = &tracer_flags,
@@ -620,7 +620,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
620#endif 620#endif
621 .open = wakeup_trace_open, 621 .open = wakeup_trace_open,
622 .close = wakeup_trace_close, 622 .close = wakeup_trace_close,
623 .use_max_tr = 1, 623 .use_max_tr = true,
624}; 624};
625 625
626__init static int init_wakeup_tracer(void) 626__init static int init_wakeup_tracer(void)
@@ -637,4 +637,4 @@ __init static int init_wakeup_tracer(void)
637 637
638 return 0; 638 return 0;
639} 639}
640device_initcall(init_wakeup_tracer); 640core_initcall(init_wakeup_tracer);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 2c00a691a540..47623169a815 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -320,7 +320,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
320 int (*func)(void)) 320 int (*func)(void))
321{ 321{
322 int save_ftrace_enabled = ftrace_enabled; 322 int save_ftrace_enabled = ftrace_enabled;
323 int save_tracer_enabled = tracer_enabled;
324 unsigned long count; 323 unsigned long count;
325 char *func_name; 324 char *func_name;
326 int ret; 325 int ret;
@@ -331,7 +330,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
331 330
332 /* enable tracing, and record the filter function */ 331 /* enable tracing, and record the filter function */
333 ftrace_enabled = 1; 332 ftrace_enabled = 1;
334 tracer_enabled = 1;
335 333
336 /* passed in by parameter to fool gcc from optimizing */ 334 /* passed in by parameter to fool gcc from optimizing */
337 func(); 335 func();
@@ -395,7 +393,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
395 393
396 out: 394 out:
397 ftrace_enabled = save_ftrace_enabled; 395 ftrace_enabled = save_ftrace_enabled;
398 tracer_enabled = save_tracer_enabled;
399 396
400 /* Enable tracing on all functions again */ 397 /* Enable tracing on all functions again */
401 ftrace_set_global_filter(NULL, 0, 1); 398 ftrace_set_global_filter(NULL, 0, 1);
@@ -452,7 +449,6 @@ static int
452trace_selftest_function_recursion(void) 449trace_selftest_function_recursion(void)
453{ 450{
454 int save_ftrace_enabled = ftrace_enabled; 451 int save_ftrace_enabled = ftrace_enabled;
455 int save_tracer_enabled = tracer_enabled;
456 char *func_name; 452 char *func_name;
457 int len; 453 int len;
458 int ret; 454 int ret;
@@ -465,7 +461,6 @@ trace_selftest_function_recursion(void)
465 461
466 /* enable tracing, and record the filter function */ 462 /* enable tracing, and record the filter function */
467 ftrace_enabled = 1; 463 ftrace_enabled = 1;
468 tracer_enabled = 1;
469 464
470 /* Handle PPC64 '.' name */ 465 /* Handle PPC64 '.' name */
471 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); 466 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
@@ -534,7 +529,6 @@ trace_selftest_function_recursion(void)
534 ret = 0; 529 ret = 0;
535out: 530out:
536 ftrace_enabled = save_ftrace_enabled; 531 ftrace_enabled = save_ftrace_enabled;
537 tracer_enabled = save_tracer_enabled;
538 532
539 return ret; 533 return ret;
540} 534}
@@ -569,7 +563,6 @@ static int
569trace_selftest_function_regs(void) 563trace_selftest_function_regs(void)
570{ 564{
571 int save_ftrace_enabled = ftrace_enabled; 565 int save_ftrace_enabled = ftrace_enabled;
572 int save_tracer_enabled = tracer_enabled;
573 char *func_name; 566 char *func_name;
574 int len; 567 int len;
575 int ret; 568 int ret;
@@ -586,7 +579,6 @@ trace_selftest_function_regs(void)
586 579
587 /* enable tracing, and record the filter function */ 580 /* enable tracing, and record the filter function */
588 ftrace_enabled = 1; 581 ftrace_enabled = 1;
589 tracer_enabled = 1;
590 582
591 /* Handle PPC64 '.' name */ 583 /* Handle PPC64 '.' name */
592 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); 584 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
@@ -648,7 +640,6 @@ trace_selftest_function_regs(void)
648 ret = 0; 640 ret = 0;
649out: 641out:
650 ftrace_enabled = save_ftrace_enabled; 642 ftrace_enabled = save_ftrace_enabled;
651 tracer_enabled = save_tracer_enabled;
652 643
653 return ret; 644 return ret;
654} 645}
@@ -662,7 +653,6 @@ int
662trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) 653trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
663{ 654{
664 int save_ftrace_enabled = ftrace_enabled; 655 int save_ftrace_enabled = ftrace_enabled;
665 int save_tracer_enabled = tracer_enabled;
666 unsigned long count; 656 unsigned long count;
667 int ret; 657 int ret;
668 658
@@ -671,7 +661,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
671 661
672 /* start the tracing */ 662 /* start the tracing */
673 ftrace_enabled = 1; 663 ftrace_enabled = 1;
674 tracer_enabled = 1;
675 664
676 ret = tracer_init(trace, tr); 665 ret = tracer_init(trace, tr);
677 if (ret) { 666 if (ret) {
@@ -708,7 +697,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
708 ret = trace_selftest_function_regs(); 697 ret = trace_selftest_function_regs();
709 out: 698 out:
710 ftrace_enabled = save_ftrace_enabled; 699 ftrace_enabled = save_ftrace_enabled;
711 tracer_enabled = save_tracer_enabled;
712 700
713 /* kill ftrace totally if we failed */ 701 /* kill ftrace totally if we failed */
714 if (ret) 702 if (ret)
@@ -1106,6 +1094,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1106 tracing_stop(); 1094 tracing_stop();
1107 /* check both trace buffers */ 1095 /* check both trace buffers */
1108 ret = trace_test_buffer(tr, NULL); 1096 ret = trace_test_buffer(tr, NULL);
1097 printk("ret = %d\n", ret);
1109 if (!ret) 1098 if (!ret)
1110 ret = trace_test_buffer(&max_tr, &count); 1099 ret = trace_test_buffer(&max_tr, &count);
1111 1100
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 0c1b165778e5..42ca822fc701 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -33,7 +33,6 @@ static unsigned long max_stack_size;
33static arch_spinlock_t max_stack_lock = 33static arch_spinlock_t max_stack_lock =
34 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 34 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
35 35
36static int stack_trace_disabled __read_mostly;
37static DEFINE_PER_CPU(int, trace_active); 36static DEFINE_PER_CPU(int, trace_active);
38static DEFINE_MUTEX(stack_sysctl_mutex); 37static DEFINE_MUTEX(stack_sysctl_mutex);
39 38
@@ -116,9 +115,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
116{ 115{
117 int cpu; 116 int cpu;
118 117
119 if (unlikely(!ftrace_enabled || stack_trace_disabled))
120 return;
121
122 preempt_disable_notrace(); 118 preempt_disable_notrace();
123 119
124 cpu = raw_smp_processor_id(); 120 cpu = raw_smp_processor_id();
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 2485a7d09b11..7609dd6714c2 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -21,9 +21,6 @@ static int syscall_enter_register(struct ftrace_event_call *event,
21static int syscall_exit_register(struct ftrace_event_call *event, 21static int syscall_exit_register(struct ftrace_event_call *event,
22 enum trace_reg type, void *data); 22 enum trace_reg type, void *data);
23 23
24static int syscall_enter_define_fields(struct ftrace_event_call *call);
25static int syscall_exit_define_fields(struct ftrace_event_call *call);
26
27static struct list_head * 24static struct list_head *
28syscall_get_enter_fields(struct ftrace_event_call *call) 25syscall_get_enter_fields(struct ftrace_event_call *call)
29{ 26{
@@ -32,30 +29,6 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
32 return &entry->enter_fields; 29 return &entry->enter_fields;
33} 30}
34 31
35struct trace_event_functions enter_syscall_print_funcs = {
36 .trace = print_syscall_enter,
37};
38
39struct trace_event_functions exit_syscall_print_funcs = {
40 .trace = print_syscall_exit,
41};
42
43struct ftrace_event_class event_class_syscall_enter = {
44 .system = "syscalls",
45 .reg = syscall_enter_register,
46 .define_fields = syscall_enter_define_fields,
47 .get_fields = syscall_get_enter_fields,
48 .raw_init = init_syscall_trace,
49};
50
51struct ftrace_event_class event_class_syscall_exit = {
52 .system = "syscalls",
53 .reg = syscall_exit_register,
54 .define_fields = syscall_exit_define_fields,
55 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
56 .raw_init = init_syscall_trace,
57};
58
59extern struct syscall_metadata *__start_syscalls_metadata[]; 32extern struct syscall_metadata *__start_syscalls_metadata[];
60extern struct syscall_metadata *__stop_syscalls_metadata[]; 33extern struct syscall_metadata *__stop_syscalls_metadata[];
61 34
@@ -432,7 +405,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
432 mutex_unlock(&syscall_trace_lock); 405 mutex_unlock(&syscall_trace_lock);
433} 406}
434 407
435int init_syscall_trace(struct ftrace_event_call *call) 408static int init_syscall_trace(struct ftrace_event_call *call)
436{ 409{
437 int id; 410 int id;
438 int num; 411 int num;
@@ -457,6 +430,30 @@ int init_syscall_trace(struct ftrace_event_call *call)
457 return id; 430 return id;
458} 431}
459 432
433struct trace_event_functions enter_syscall_print_funcs = {
434 .trace = print_syscall_enter,
435};
436
437struct trace_event_functions exit_syscall_print_funcs = {
438 .trace = print_syscall_exit,
439};
440
441struct ftrace_event_class event_class_syscall_enter = {
442 .system = "syscalls",
443 .reg = syscall_enter_register,
444 .define_fields = syscall_enter_define_fields,
445 .get_fields = syscall_get_enter_fields,
446 .raw_init = init_syscall_trace,
447};
448
449struct ftrace_event_class event_class_syscall_exit = {
450 .system = "syscalls",
451 .reg = syscall_exit_register,
452 .define_fields = syscall_exit_define_fields,
453 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
454 .raw_init = init_syscall_trace,
455};
456
460unsigned long __init __weak arch_syscall_addr(int nr) 457unsigned long __init __weak arch_syscall_addr(int nr)
461{ 458{
462 return (unsigned long)sys_call_table[nr]; 459 return (unsigned long)sys_call_table[nr];
@@ -537,7 +534,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
537 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); 534 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
538} 535}
539 536
540int perf_sysenter_enable(struct ftrace_event_call *call) 537static int perf_sysenter_enable(struct ftrace_event_call *call)
541{ 538{
542 int ret = 0; 539 int ret = 0;
543 int num; 540 int num;
@@ -558,7 +555,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call)
558 return ret; 555 return ret;
559} 556}
560 557
561void perf_sysenter_disable(struct ftrace_event_call *call) 558static void perf_sysenter_disable(struct ftrace_event_call *call)
562{ 559{
563 int num; 560 int num;
564 561
@@ -615,7 +612,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
615 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); 612 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
616} 613}
617 614
618int perf_sysexit_enable(struct ftrace_event_call *call) 615static int perf_sysexit_enable(struct ftrace_event_call *call)
619{ 616{
620 int ret = 0; 617 int ret = 0;
621 int num; 618 int num;
@@ -636,7 +633,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call)
636 return ret; 633 return ret;
637} 634}
638 635
639void perf_sysexit_disable(struct ftrace_event_call *call) 636static void perf_sysexit_disable(struct ftrace_event_call *call)
640{ 637{
641 int num; 638 int num;
642 639
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 03003cd7dd96..c86e6d4f67fb 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -22,6 +22,7 @@
22#include <linux/uaccess.h> 22#include <linux/uaccess.h>
23#include <linux/uprobes.h> 23#include <linux/uprobes.h>
24#include <linux/namei.h> 24#include <linux/namei.h>
25#include <linux/string.h>
25 26
26#include "trace_probe.h" 27#include "trace_probe.h"
27 28
@@ -189,7 +190,7 @@ static int create_trace_uprobe(int argc, char **argv)
189 if (argv[0][0] == '-') 190 if (argv[0][0] == '-')
190 is_delete = true; 191 is_delete = true;
191 else if (argv[0][0] != 'p') { 192 else if (argv[0][0] != 'p') {
192 pr_info("Probe definition must be started with 'p', 'r' or" " '-'.\n"); 193 pr_info("Probe definition must be started with 'p' or '-'.\n");
193 return -EINVAL; 194 return -EINVAL;
194 } 195 }
195 196
@@ -252,7 +253,7 @@ static int create_trace_uprobe(int argc, char **argv)
252 if (ret) 253 if (ret)
253 goto fail_address_parse; 254 goto fail_address_parse;
254 255
255 ret = strict_strtoul(arg, 0, &offset); 256 ret = kstrtoul(arg, 0, &offset);
256 if (ret) 257 if (ret)
257 goto fail_address_parse; 258 goto fail_address_parse;
258 259
@@ -263,16 +264,15 @@ static int create_trace_uprobe(int argc, char **argv)
263 264
264 /* setup a probe */ 265 /* setup a probe */
265 if (!event) { 266 if (!event) {
266 char *tail = strrchr(filename, '/'); 267 char *tail;
267 char *ptr; 268 char *ptr;
268 269
269 ptr = kstrdup((tail ? tail + 1 : filename), GFP_KERNEL); 270 tail = kstrdup(kbasename(filename), GFP_KERNEL);
270 if (!ptr) { 271 if (!tail) {
271 ret = -ENOMEM; 272 ret = -ENOMEM;
272 goto fail_address_parse; 273 goto fail_address_parse;
273 } 274 }
274 275
275 tail = ptr;
276 ptr = strpbrk(tail, ".-_"); 276 ptr = strpbrk(tail, ".-_");
277 if (ptr) 277 if (ptr)
278 *ptr = '\0'; 278 *ptr = '\0';
diff --git a/kernel/user.c b/kernel/user.c
index 750acffbe9ec..33acb5e53a5f 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,6 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/export.h> 17#include <linux/export.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include <linux/proc_fs.h>
19 20
20/* 21/*
21 * userns count is 1 for root user, 1 for init_uts_ns, 22 * userns count is 1 for root user, 1 for init_uts_ns,
@@ -51,6 +52,7 @@ struct user_namespace init_user_ns = {
51 }, 52 },
52 .owner = GLOBAL_ROOT_UID, 53 .owner = GLOBAL_ROOT_UID,
53 .group = GLOBAL_ROOT_GID, 54 .group = GLOBAL_ROOT_GID,
55 .proc_inum = PROC_USER_INIT_INO,
54}; 56};
55EXPORT_SYMBOL_GPL(init_user_ns); 57EXPORT_SYMBOL_GPL(init_user_ns);
56 58
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 456a6b9fba34..2b042c42fbc4 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12#include <linux/proc_fs.h>
12#include <linux/highuid.h> 13#include <linux/highuid.h>
13#include <linux/cred.h> 14#include <linux/cred.h>
14#include <linux/securebits.h> 15#include <linux/securebits.h>
@@ -26,6 +27,24 @@ static struct kmem_cache *user_ns_cachep __read_mostly;
26static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, 27static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
27 struct uid_gid_map *map); 28 struct uid_gid_map *map);
28 29
30static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
31{
32 /* Start with the same capabilities as init but useless for doing
33 * anything as the capabilities are bound to the new user namespace.
34 */
35 cred->securebits = SECUREBITS_DEFAULT;
36 cred->cap_inheritable = CAP_EMPTY_SET;
37 cred->cap_permitted = CAP_FULL_SET;
38 cred->cap_effective = CAP_FULL_SET;
39 cred->cap_bset = CAP_FULL_SET;
40#ifdef CONFIG_KEYS
41 key_put(cred->request_key_auth);
42 cred->request_key_auth = NULL;
43#endif
44 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
45 cred->user_ns = user_ns;
46}
47
29/* 48/*
30 * Create a new user namespace, deriving the creator from the user in the 49 * Create a new user namespace, deriving the creator from the user in the
31 * passed credentials, and replacing that user with the new root user for the 50 * passed credentials, and replacing that user with the new root user for the
@@ -39,6 +58,7 @@ int create_user_ns(struct cred *new)
39 struct user_namespace *ns, *parent_ns = new->user_ns; 58 struct user_namespace *ns, *parent_ns = new->user_ns;
40 kuid_t owner = new->euid; 59 kuid_t owner = new->euid;
41 kgid_t group = new->egid; 60 kgid_t group = new->egid;
61 int ret;
42 62
43 /* The creator needs a mapping in the parent user namespace 63 /* The creator needs a mapping in the parent user namespace
44 * or else we won't be able to reasonably tell userspace who 64 * or else we won't be able to reasonably tell userspace who
@@ -52,38 +72,45 @@ int create_user_ns(struct cred *new)
52 if (!ns) 72 if (!ns)
53 return -ENOMEM; 73 return -ENOMEM;
54 74
75 ret = proc_alloc_inum(&ns->proc_inum);
76 if (ret) {
77 kmem_cache_free(user_ns_cachep, ns);
78 return ret;
79 }
80
55 kref_init(&ns->kref); 81 kref_init(&ns->kref);
82 /* Leave the new->user_ns reference with the new user namespace. */
56 ns->parent = parent_ns; 83 ns->parent = parent_ns;
57 ns->owner = owner; 84 ns->owner = owner;
58 ns->group = group; 85 ns->group = group;
59 86
60 /* Start with the same capabilities as init but useless for doing 87 set_cred_user_ns(new, ns);
61 * anything as the capabilities are bound to the new user namespace.
62 */
63 new->securebits = SECUREBITS_DEFAULT;
64 new->cap_inheritable = CAP_EMPTY_SET;
65 new->cap_permitted = CAP_FULL_SET;
66 new->cap_effective = CAP_FULL_SET;
67 new->cap_bset = CAP_FULL_SET;
68#ifdef CONFIG_KEYS
69 key_put(new->request_key_auth);
70 new->request_key_auth = NULL;
71#endif
72 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
73
74 /* Leave the new->user_ns reference with the new user namespace. */
75 /* Leave the reference to our user_ns with the new cred. */
76 new->user_ns = ns;
77 88
78 return 0; 89 return 0;
79} 90}
80 91
92int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
93{
94 struct cred *cred;
95
96 if (!(unshare_flags & CLONE_NEWUSER))
97 return 0;
98
99 cred = prepare_creds();
100 if (!cred)
101 return -ENOMEM;
102
103 *new_cred = cred;
104 return create_user_ns(cred);
105}
106
81void free_user_ns(struct kref *kref) 107void free_user_ns(struct kref *kref)
82{ 108{
83 struct user_namespace *parent, *ns = 109 struct user_namespace *parent, *ns =
84 container_of(kref, struct user_namespace, kref); 110 container_of(kref, struct user_namespace, kref);
85 111
86 parent = ns->parent; 112 parent = ns->parent;
113 proc_free_inum(ns->proc_inum);
87 kmem_cache_free(user_ns_cachep, ns); 114 kmem_cache_free(user_ns_cachep, ns);
88 put_user_ns(parent); 115 put_user_ns(parent);
89} 116}
@@ -372,7 +399,7 @@ static int uid_m_show(struct seq_file *seq, void *v)
372 struct user_namespace *lower_ns; 399 struct user_namespace *lower_ns;
373 uid_t lower; 400 uid_t lower;
374 401
375 lower_ns = current_user_ns(); 402 lower_ns = seq_user_ns(seq);
376 if ((lower_ns == ns) && lower_ns->parent) 403 if ((lower_ns == ns) && lower_ns->parent)
377 lower_ns = lower_ns->parent; 404 lower_ns = lower_ns->parent;
378 405
@@ -393,7 +420,7 @@ static int gid_m_show(struct seq_file *seq, void *v)
393 struct user_namespace *lower_ns; 420 struct user_namespace *lower_ns;
394 gid_t lower; 421 gid_t lower;
395 422
396 lower_ns = current_user_ns(); 423 lower_ns = seq_user_ns(seq);
397 if ((lower_ns == ns) && lower_ns->parent) 424 if ((lower_ns == ns) && lower_ns->parent)
398 lower_ns = lower_ns->parent; 425 lower_ns = lower_ns->parent;
399 426
@@ -669,10 +696,14 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz
669{ 696{
670 struct seq_file *seq = file->private_data; 697 struct seq_file *seq = file->private_data;
671 struct user_namespace *ns = seq->private; 698 struct user_namespace *ns = seq->private;
699 struct user_namespace *seq_ns = seq_user_ns(seq);
672 700
673 if (!ns->parent) 701 if (!ns->parent)
674 return -EPERM; 702 return -EPERM;
675 703
704 if ((seq_ns != ns) && (seq_ns != ns->parent))
705 return -EPERM;
706
676 return map_write(file, buf, size, ppos, CAP_SETUID, 707 return map_write(file, buf, size, ppos, CAP_SETUID,
677 &ns->uid_map, &ns->parent->uid_map); 708 &ns->uid_map, &ns->parent->uid_map);
678} 709}
@@ -681,10 +712,14 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz
681{ 712{
682 struct seq_file *seq = file->private_data; 713 struct seq_file *seq = file->private_data;
683 struct user_namespace *ns = seq->private; 714 struct user_namespace *ns = seq->private;
715 struct user_namespace *seq_ns = seq_user_ns(seq);
684 716
685 if (!ns->parent) 717 if (!ns->parent)
686 return -EPERM; 718 return -EPERM;
687 719
720 if ((seq_ns != ns) && (seq_ns != ns->parent))
721 return -EPERM;
722
688 return map_write(file, buf, size, ppos, CAP_SETGID, 723 return map_write(file, buf, size, ppos, CAP_SETGID,
689 &ns->gid_map, &ns->parent->gid_map); 724 &ns->gid_map, &ns->parent->gid_map);
690} 725}
@@ -709,6 +744,21 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t
709static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, 744static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
710 struct uid_gid_map *new_map) 745 struct uid_gid_map *new_map)
711{ 746{
747 /* Allow mapping to your own filesystem ids */
748 if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) {
749 u32 id = new_map->extent[0].lower_first;
750 if (cap_setid == CAP_SETUID) {
751 kuid_t uid = make_kuid(ns->parent, id);
752 if (uid_eq(uid, current_fsuid()))
753 return true;
754 }
755 else if (cap_setid == CAP_SETGID) {
756 kgid_t gid = make_kgid(ns->parent, id);
757 if (gid_eq(gid, current_fsgid()))
758 return true;
759 }
760 }
761
712 /* Allow anyone to set a mapping that doesn't require privilege */ 762 /* Allow anyone to set a mapping that doesn't require privilege */
713 if (!cap_valid(cap_setid)) 763 if (!cap_valid(cap_setid))
714 return true; 764 return true;
@@ -722,6 +772,65 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
722 return false; 772 return false;
723} 773}
724 774
775static void *userns_get(struct task_struct *task)
776{
777 struct user_namespace *user_ns;
778
779 rcu_read_lock();
780 user_ns = get_user_ns(__task_cred(task)->user_ns);
781 rcu_read_unlock();
782
783 return user_ns;
784}
785
786static void userns_put(void *ns)
787{
788 put_user_ns(ns);
789}
790
791static int userns_install(struct nsproxy *nsproxy, void *ns)
792{
793 struct user_namespace *user_ns = ns;
794 struct cred *cred;
795
796 /* Don't allow gaining capabilities by reentering
797 * the same user namespace.
798 */
799 if (user_ns == current_user_ns())
800 return -EINVAL;
801
802 /* Threaded processes may not enter a different user namespace */
803 if (atomic_read(&current->mm->mm_users) > 1)
804 return -EINVAL;
805
806 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
807 return -EPERM;
808
809 cred = prepare_creds();
810 if (!cred)
811 return -ENOMEM;
812
813 put_user_ns(cred->user_ns);
814 set_cred_user_ns(cred, get_user_ns(user_ns));
815
816 return commit_creds(cred);
817}
818
819static unsigned int userns_inum(void *ns)
820{
821 struct user_namespace *user_ns = ns;
822 return user_ns->proc_inum;
823}
824
825const struct proc_ns_operations userns_operations = {
826 .name = "user",
827 .type = CLONE_NEWUSER,
828 .get = userns_get,
829 .put = userns_put,
830 .install = userns_install,
831 .inum = userns_inum,
832};
833
725static __init int user_namespaces_init(void) 834static __init int user_namespaces_init(void)
726{ 835{
727 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); 836 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 679d97a5d3fd..08b197e8c485 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -32,18 +32,25 @@ static struct uts_namespace *create_uts_ns(void)
32 * @old_ns: namespace to clone 32 * @old_ns: namespace to clone
33 * Return NULL on error (failure to kmalloc), new ns otherwise 33 * Return NULL on error (failure to kmalloc), new ns otherwise
34 */ 34 */
35static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, 35static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
36 struct uts_namespace *old_ns) 36 struct uts_namespace *old_ns)
37{ 37{
38 struct uts_namespace *ns; 38 struct uts_namespace *ns;
39 int err;
39 40
40 ns = create_uts_ns(); 41 ns = create_uts_ns();
41 if (!ns) 42 if (!ns)
42 return ERR_PTR(-ENOMEM); 43 return ERR_PTR(-ENOMEM);
43 44
45 err = proc_alloc_inum(&ns->proc_inum);
46 if (err) {
47 kfree(ns);
48 return ERR_PTR(err);
49 }
50
44 down_read(&uts_sem); 51 down_read(&uts_sem);
45 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 52 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
46 ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); 53 ns->user_ns = get_user_ns(user_ns);
47 up_read(&uts_sem); 54 up_read(&uts_sem);
48 return ns; 55 return ns;
49} 56}
@@ -55,9 +62,8 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
55 * versa. 62 * versa.
56 */ 63 */
57struct uts_namespace *copy_utsname(unsigned long flags, 64struct uts_namespace *copy_utsname(unsigned long flags,
58 struct task_struct *tsk) 65 struct user_namespace *user_ns, struct uts_namespace *old_ns)
59{ 66{
60 struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
61 struct uts_namespace *new_ns; 67 struct uts_namespace *new_ns;
62 68
63 BUG_ON(!old_ns); 69 BUG_ON(!old_ns);
@@ -66,7 +72,7 @@ struct uts_namespace *copy_utsname(unsigned long flags,
66 if (!(flags & CLONE_NEWUTS)) 72 if (!(flags & CLONE_NEWUTS))
67 return old_ns; 73 return old_ns;
68 74
69 new_ns = clone_uts_ns(tsk, old_ns); 75 new_ns = clone_uts_ns(user_ns, old_ns);
70 76
71 put_uts_ns(old_ns); 77 put_uts_ns(old_ns);
72 return new_ns; 78 return new_ns;
@@ -78,6 +84,7 @@ void free_uts_ns(struct kref *kref)
78 84
79 ns = container_of(kref, struct uts_namespace, kref); 85 ns = container_of(kref, struct uts_namespace, kref);
80 put_user_ns(ns->user_ns); 86 put_user_ns(ns->user_ns);
87 proc_free_inum(ns->proc_inum);
81 kfree(ns); 88 kfree(ns);
82} 89}
83 90
@@ -102,19 +109,32 @@ static void utsns_put(void *ns)
102 put_uts_ns(ns); 109 put_uts_ns(ns);
103} 110}
104 111
105static int utsns_install(struct nsproxy *nsproxy, void *ns) 112static int utsns_install(struct nsproxy *nsproxy, void *new)
106{ 113{
114 struct uts_namespace *ns = new;
115
116 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
117 !nsown_capable(CAP_SYS_ADMIN))
118 return -EPERM;
119
107 get_uts_ns(ns); 120 get_uts_ns(ns);
108 put_uts_ns(nsproxy->uts_ns); 121 put_uts_ns(nsproxy->uts_ns);
109 nsproxy->uts_ns = ns; 122 nsproxy->uts_ns = ns;
110 return 0; 123 return 0;
111} 124}
112 125
126static unsigned int utsns_inum(void *vp)
127{
128 struct uts_namespace *ns = vp;
129
130 return ns->proc_inum;
131}
132
113const struct proc_ns_operations utsns_operations = { 133const struct proc_ns_operations utsns_operations = {
114 .name = "uts", 134 .name = "uts",
115 .type = CLONE_NEWUTS, 135 .type = CLONE_NEWUTS,
116 .get = utsns_get, 136 .get = utsns_get,
117 .put = utsns_put, 137 .put = utsns_put,
118 .install = utsns_install, 138 .install = utsns_install,
139 .inum = utsns_inum,
119}; 140};
120
diff --git a/kernel/wait.c b/kernel/wait.c
index 7fdd9eaca2c3..6698e0c04ead 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Generic waiting primitives. 2 * Generic waiting primitives.
3 * 3 *
4 * (C) 2004 William Irwin, Oracle 4 * (C) 2004 Nadia Yvette Chambers, Oracle
5 */ 5 */
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/export.h> 7#include <linux/export.h>
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index c8c21be11ab4..75a2ab3d0b02 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -31,6 +31,7 @@
31int watchdog_enabled = 1; 31int watchdog_enabled = 1;
32int __read_mostly watchdog_thresh = 10; 32int __read_mostly watchdog_thresh = 10;
33static int __read_mostly watchdog_disabled; 33static int __read_mostly watchdog_disabled;
34static u64 __read_mostly sample_period;
34 35
35static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 36static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
36static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); 37static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -116,7 +117,7 @@ static unsigned long get_timestamp(int this_cpu)
116 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ 117 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
117} 118}
118 119
119static u64 get_sample_period(void) 120static void set_sample_period(void)
120{ 121{
121 /* 122 /*
122 * convert watchdog_thresh from seconds to ns 123 * convert watchdog_thresh from seconds to ns
@@ -125,7 +126,7 @@ static u64 get_sample_period(void)
125 * and hard thresholds) to increment before the 126 * and hard thresholds) to increment before the
126 * hardlockup detector generates a warning 127 * hardlockup detector generates a warning
127 */ 128 */
128 return get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); 129 sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
129} 130}
130 131
131/* Commands for resetting the watchdog */ 132/* Commands for resetting the watchdog */
@@ -275,7 +276,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
275 wake_up_process(__this_cpu_read(softlockup_watchdog)); 276 wake_up_process(__this_cpu_read(softlockup_watchdog));
276 277
277 /* .. and repeat */ 278 /* .. and repeat */
278 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); 279 hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
279 280
280 if (touch_ts == 0) { 281 if (touch_ts == 0) {
281 if (unlikely(__this_cpu_read(softlockup_touch_sync))) { 282 if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
@@ -343,6 +344,10 @@ static void watchdog_enable(unsigned int cpu)
343{ 344{
344 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 345 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
345 346
347 /* kick off the timer for the hardlockup detector */
348 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
349 hrtimer->function = watchdog_timer_fn;
350
346 if (!watchdog_enabled) { 351 if (!watchdog_enabled) {
347 kthread_park(current); 352 kthread_park(current);
348 return; 353 return;
@@ -351,12 +356,8 @@ static void watchdog_enable(unsigned int cpu)
351 /* Enable the perf event */ 356 /* Enable the perf event */
352 watchdog_nmi_enable(cpu); 357 watchdog_nmi_enable(cpu);
353 358
354 /* kick off the timer for the hardlockup detector */
355 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
356 hrtimer->function = watchdog_timer_fn;
357
358 /* done here because hrtimer_start can only pin to smp_processor_id() */ 359 /* done here because hrtimer_start can only pin to smp_processor_id() */
359 hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), 360 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
360 HRTIMER_MODE_REL_PINNED); 361 HRTIMER_MODE_REL_PINNED);
361 362
362 /* initialize timestamp */ 363 /* initialize timestamp */
@@ -368,9 +369,6 @@ static void watchdog_disable(unsigned int cpu)
368{ 369{
369 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 370 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
370 371
371 if (!watchdog_enabled)
372 return;
373
374 watchdog_set_prio(SCHED_NORMAL, 0); 372 watchdog_set_prio(SCHED_NORMAL, 0);
375 hrtimer_cancel(hrtimer); 373 hrtimer_cancel(hrtimer);
376 /* disable the perf event */ 374 /* disable the perf event */
@@ -386,7 +384,7 @@ static int watchdog_should_run(unsigned int cpu)
386/* 384/*
387 * The watchdog thread function - touches the timestamp. 385 * The watchdog thread function - touches the timestamp.
388 * 386 *
389 * It only runs once every get_sample_period() seconds (4 seconds by 387 * It only runs once every sample_period seconds (4 seconds by
390 * default) to reset the softlockup timestamp. If this gets delayed 388 * default) to reset the softlockup timestamp. If this gets delayed
391 * for more than 2*watchdog_thresh seconds then the debug-printout 389 * for more than 2*watchdog_thresh seconds then the debug-printout
392 * triggers in watchdog_timer_fn(). 390 * triggers in watchdog_timer_fn().
@@ -519,6 +517,7 @@ int proc_dowatchdog(struct ctl_table *table, int write,
519 if (ret || !write) 517 if (ret || !write)
520 return ret; 518 return ret;
521 519
520 set_sample_period();
522 if (watchdog_enabled && watchdog_thresh) 521 if (watchdog_enabled && watchdog_thresh)
523 watchdog_enable_all_cpus(); 522 watchdog_enable_all_cpus();
524 else 523 else
@@ -540,6 +539,7 @@ static struct smp_hotplug_thread watchdog_threads = {
540 539
541void __init lockup_detector_init(void) 540void __init lockup_detector_init(void)
542{ 541{
542 set_sample_period();
543 if (smpboot_register_percpu_thread(&watchdog_threads)) { 543 if (smpboot_register_percpu_thread(&watchdog_threads)) {
544 pr_err("Failed to create watchdog threads, disabled\n"); 544 pr_err("Failed to create watchdog threads, disabled\n");
545 watchdog_disabled = -ENODEV; 545 watchdog_disabled = -ENODEV;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1dae900df798..fbc6576a83c3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -739,8 +739,10 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
739{ 739{
740 struct worker *worker = kthread_data(task); 740 struct worker *worker = kthread_data(task);
741 741
742 if (!(worker->flags & WORKER_NOT_RUNNING)) 742 if (!(worker->flags & WORKER_NOT_RUNNING)) {
743 WARN_ON_ONCE(worker->pool->gcwq->cpu != cpu);
743 atomic_inc(get_pool_nr_running(worker->pool)); 744 atomic_inc(get_pool_nr_running(worker->pool));
745 }
744} 746}
745 747
746/** 748/**
@@ -3485,7 +3487,7 @@ unsigned int work_busy(struct work_struct *work)
3485 unsigned int ret = 0; 3487 unsigned int ret = 0;
3486 3488
3487 if (!gcwq) 3489 if (!gcwq)
3488 return false; 3490 return 0;
3489 3491
3490 spin_lock_irqsave(&gcwq->lock, flags); 3492 spin_lock_irqsave(&gcwq->lock, flags);
3491 3493