Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.freezer        |   2
-rw-r--r--  kernel/Makefile               |   2
-rw-r--r--  kernel/acct.c                 |   2
-rw-r--r--  kernel/auditsc.c              |   9
-rw-r--r--  kernel/cgroup.c               | 274
-rw-r--r--  kernel/cgroup_debug.c         |   4
-rw-r--r--  kernel/cgroup_freezer.c       | 379
-rw-r--r--  kernel/compat.c               |  58
-rw-r--r--  kernel/configs.c              |   9
-rw-r--r--  kernel/cpu.c                  |  24
-rw-r--r--  kernel/cpuset.c               |  19
-rw-r--r--  kernel/dma-coherent.c         |   2
-rw-r--r--  kernel/dma.c                  |   2
-rw-r--r--  kernel/exit.c                 |  21
-rw-r--r--  kernel/fork.c                 |   5
-rw-r--r--  kernel/freezer.c              | 154
-rw-r--r--  kernel/hrtimer.c              |  95
-rw-r--r--  kernel/irq/manage.c           |   9
-rw-r--r--  kernel/kallsyms.c             |   1
-rw-r--r--  kernel/kexec.c                |  10
-rw-r--r--  kernel/kgdb.c                 |  13
-rw-r--r--  kernel/kmod.c                 |  67
-rw-r--r--  kernel/kprobes.c              |   2
-rw-r--r--  kernel/ksysfs.c               |  35
-rw-r--r--  kernel/kthread.c              |   5
-rw-r--r--  kernel/module.c               |  56
-rw-r--r--  kernel/panic.c                |  67
-rw-r--r--  kernel/posix-timers.c         |   2
-rw-r--r--  kernel/power/disk.c           |  11
-rw-r--r--  kernel/power/main.c           |   7
-rw-r--r--  kernel/power/process.c        | 119
-rw-r--r--  kernel/power/user.c           |  10
-rw-r--r--  kernel/printk.c               |  42
-rw-r--r--  kernel/profile.c              |  41
-rw-r--r--  kernel/ptrace.c               |   2
-rw-r--r--  kernel/rcuclassic.c           | 337
-rw-r--r--  kernel/rcupreempt.c           |  10
-rw-r--r--  kernel/rcupreempt_trace.c     |   7
-rw-r--r--  kernel/rcutorture.c           |   2
-rw-r--r--  kernel/resource.c             | 150
-rw-r--r--  kernel/sched.c                | 389
-rw-r--r--  kernel/sched_clock.c          |   6
-rw-r--r--  kernel/sched_debug.c          |   2
-rw-r--r--  kernel/sched_fair.c           | 234
-rw-r--r--  kernel/sched_features.h       |   1
-rw-r--r--  kernel/sched_idletask.c       |   6
-rw-r--r--  kernel/sched_rt.c             |  58
-rw-r--r--  kernel/softirq.c              | 142
-rw-r--r--  kernel/softlockup.c           |   2
-rw-r--r--  kernel/sys.c                  |  38
-rw-r--r--  kernel/sys_ni.c               |   6
-rw-r--r--  kernel/sysctl.c               | 133
-rw-r--r--  kernel/time/Kconfig           |   1
-rw-r--r--  kernel/time/clockevents.c     |  12
-rw-r--r--  kernel/time/tick-broadcast.c  |  23
-rw-r--r--  kernel/time/tick-common.c     |  14
-rw-r--r--  kernel/time/tick-internal.h   |   9
-rw-r--r--  kernel/time/tick-sched.c      |  26
-rw-r--r--  kernel/timer.c                |   1
-rw-r--r--  kernel/trace/trace_sysprof.c  |   2
-rw-r--r--  kernel/user.c                 |   4
-rw-r--r--  kernel/utsname_sysctl.c       |   5
-rw-r--r--  kernel/wait.c                 |  14
-rw-r--r--  kernel/workqueue.c            |   2
64 files changed, 2232 insertions(+), 964 deletions(-)
diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer
new file mode 100644
index 000000000000..a3bb4cb52539
--- /dev/null
+++ b/kernel/Kconfig.freezer
@@ -0,0 +1,2 @@
1config FREEZER
2 def_bool PM_SLEEP || CGROUP_FREEZER
diff --git a/kernel/Makefile b/kernel/Makefile
index 4e1d7df7c3e2..066550aa61c5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -24,6 +24,7 @@ CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_sched.o = -mno-spe -pg 24CFLAGS_REMOVE_sched.o = -mno-spe -pg
25endif 25endif
26 26
27obj-$(CONFIG_FREEZER) += freezer.o
27obj-$(CONFIG_PROFILING) += profile.o 28obj-$(CONFIG_PROFILING) += profile.o
28obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o 29obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
29obj-$(CONFIG_STACKTRACE) += stacktrace.o 30obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -55,6 +56,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
55obj-$(CONFIG_COMPAT) += compat.o 56obj-$(CONFIG_COMPAT) += compat.o
56obj-$(CONFIG_CGROUPS) += cgroup.o 57obj-$(CONFIG_CGROUPS) += cgroup.o
57obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o 58obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
59obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
58obj-$(CONFIG_CPUSETS) += cpuset.o 60obj-$(CONFIG_CPUSETS) += cpuset.o
59obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o 61obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
60obj-$(CONFIG_UTS_NS) += utsname.o 62obj-$(CONFIG_UTS_NS) += utsname.o
diff --git a/kernel/acct.c b/kernel/acct.c
index dd68b9059418..f6006a60df5d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -548,7 +548,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
548#endif 548#endif
549 549
550 spin_lock_irq(&current->sighand->siglock); 550 spin_lock_irq(&current->sighand->siglock);
551 tty = current->signal->tty; 551 tty = current->signal->tty; /* Safe as we hold the siglock */
552 ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; 552 ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
553 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); 553 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
554 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); 554 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 59cedfb040e7..cf5bc2f5f9c3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -246,8 +246,8 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
246 unsigned n; 246 unsigned n;
247 if (unlikely(!ctx)) 247 if (unlikely(!ctx))
248 return 0; 248 return 0;
249
250 n = ctx->major; 249 n = ctx->major;
250
251 switch (audit_classify_syscall(ctx->arch, n)) { 251 switch (audit_classify_syscall(ctx->arch, n)) {
252 case 0: /* native */ 252 case 0: /* native */
253 if ((mask & AUDIT_PERM_WRITE) && 253 if ((mask & AUDIT_PERM_WRITE) &&
@@ -1204,13 +1204,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1204 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", 1204 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
1205 context->return_code); 1205 context->return_code);
1206 1206
1207 mutex_lock(&tty_mutex); 1207 spin_lock_irq(&tsk->sighand->siglock);
1208 read_lock(&tasklist_lock);
1209 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) 1208 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
1210 tty = tsk->signal->tty->name; 1209 tty = tsk->signal->tty->name;
1211 else 1210 else
1212 tty = "(none)"; 1211 tty = "(none)";
1213 read_unlock(&tasklist_lock); 1212 spin_unlock_irq(&tsk->sighand->siglock);
1213
1214 audit_log_format(ab, 1214 audit_log_format(ab,
1215 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" 1215 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
1216 " ppid=%d pid=%d auid=%u uid=%u gid=%u" 1216 " ppid=%d pid=%d auid=%u uid=%u gid=%u"
@@ -1230,7 +1230,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1230 context->egid, context->sgid, context->fsgid, tty, 1230 context->egid, context->sgid, context->fsgid, tty,
1231 tsk->sessionid); 1231 tsk->sessionid);
1232 1232
1233 mutex_unlock(&tty_mutex);
1234 1233
1235 audit_log_task_info(ab, tsk); 1234 audit_log_task_info(ab, tsk);
1236 if (context->filterkey) { 1235 if (context->filterkey) {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 13932abde159..046c1609606b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -241,7 +241,6 @@ static void unlink_css_set(struct css_set *cg)
241 struct cg_cgroup_link *link; 241 struct cg_cgroup_link *link;
242 struct cg_cgroup_link *saved_link; 242 struct cg_cgroup_link *saved_link;
243 243
244 write_lock(&css_set_lock);
245 hlist_del(&cg->hlist); 244 hlist_del(&cg->hlist);
246 css_set_count--; 245 css_set_count--;
247 246
@@ -251,16 +250,25 @@ static void unlink_css_set(struct css_set *cg)
251 list_del(&link->cgrp_link_list); 250 list_del(&link->cgrp_link_list);
252 kfree(link); 251 kfree(link);
253 } 252 }
254
255 write_unlock(&css_set_lock);
256} 253}
257 254
258static void __release_css_set(struct kref *k, int taskexit) 255static void __put_css_set(struct css_set *cg, int taskexit)
259{ 256{
260 int i; 257 int i;
261 struct css_set *cg = container_of(k, struct css_set, ref); 258 /*
262 259 * Ensure that the refcount doesn't hit zero while any readers
260 * can see it. Similar to atomic_dec_and_lock(), but for an
261 * rwlock
262 */
263 if (atomic_add_unless(&cg->refcount, -1, 1))
264 return;
265 write_lock(&css_set_lock);
266 if (!atomic_dec_and_test(&cg->refcount)) {
267 write_unlock(&css_set_lock);
268 return;
269 }
263 unlink_css_set(cg); 270 unlink_css_set(cg);
271 write_unlock(&css_set_lock);
264 272
265 rcu_read_lock(); 273 rcu_read_lock();
266 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 274 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
@@ -276,32 +284,22 @@ static void __release_css_set(struct kref *k, int taskexit)
276 kfree(cg); 284 kfree(cg);
277} 285}
278 286
279static void release_css_set(struct kref *k)
280{
281 __release_css_set(k, 0);
282}
283
284static void release_css_set_taskexit(struct kref *k)
285{
286 __release_css_set(k, 1);
287}
288
289/* 287/*
290 * refcounted get/put for css_set objects 288 * refcounted get/put for css_set objects
291 */ 289 */
292static inline void get_css_set(struct css_set *cg) 290static inline void get_css_set(struct css_set *cg)
293{ 291{
294 kref_get(&cg->ref); 292 atomic_inc(&cg->refcount);
295} 293}
296 294
297static inline void put_css_set(struct css_set *cg) 295static inline void put_css_set(struct css_set *cg)
298{ 296{
299 kref_put(&cg->ref, release_css_set); 297 __put_css_set(cg, 0);
300} 298}
301 299
302static inline void put_css_set_taskexit(struct css_set *cg) 300static inline void put_css_set_taskexit(struct css_set *cg)
303{ 301{
304 kref_put(&cg->ref, release_css_set_taskexit); 302 __put_css_set(cg, 1);
305} 303}
306 304
307/* 305/*
@@ -427,7 +425,7 @@ static struct css_set *find_css_set(
427 return NULL; 425 return NULL;
428 } 426 }
429 427
430 kref_init(&res->ref); 428 atomic_set(&res->refcount, 1);
431 INIT_LIST_HEAD(&res->cg_links); 429 INIT_LIST_HEAD(&res->cg_links);
432 INIT_LIST_HEAD(&res->tasks); 430 INIT_LIST_HEAD(&res->tasks);
433 INIT_HLIST_NODE(&res->hlist); 431 INIT_HLIST_NODE(&res->hlist);
@@ -870,6 +868,14 @@ static struct super_operations cgroup_ops = {
870 .remount_fs = cgroup_remount, 868 .remount_fs = cgroup_remount,
871}; 869};
872 870
871static void init_cgroup_housekeeping(struct cgroup *cgrp)
872{
873 INIT_LIST_HEAD(&cgrp->sibling);
874 INIT_LIST_HEAD(&cgrp->children);
875 INIT_LIST_HEAD(&cgrp->css_sets);
876 INIT_LIST_HEAD(&cgrp->release_list);
877 init_rwsem(&cgrp->pids_mutex);
878}
873static void init_cgroup_root(struct cgroupfs_root *root) 879static void init_cgroup_root(struct cgroupfs_root *root)
874{ 880{
875 struct cgroup *cgrp = &root->top_cgroup; 881 struct cgroup *cgrp = &root->top_cgroup;
@@ -878,10 +884,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
878 root->number_of_cgroups = 1; 884 root->number_of_cgroups = 1;
879 cgrp->root = root; 885 cgrp->root = root;
880 cgrp->top_cgroup = cgrp; 886 cgrp->top_cgroup = cgrp;
881 INIT_LIST_HEAD(&cgrp->sibling); 887 init_cgroup_housekeeping(cgrp);
882 INIT_LIST_HEAD(&cgrp->children);
883 INIT_LIST_HEAD(&cgrp->css_sets);
884 INIT_LIST_HEAD(&cgrp->release_list);
885} 888}
886 889
887static int cgroup_test_super(struct super_block *sb, void *data) 890static int cgroup_test_super(struct super_block *sb, void *data)
@@ -1728,7 +1731,7 @@ int cgroup_task_count(const struct cgroup *cgrp)
1728 1731
1729 read_lock(&css_set_lock); 1732 read_lock(&css_set_lock);
1730 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { 1733 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
1731 count += atomic_read(&link->cg->ref.refcount); 1734 count += atomic_read(&link->cg->refcount);
1732 } 1735 }
1733 read_unlock(&css_set_lock); 1736 read_unlock(&css_set_lock);
1734 return count; 1737 return count;
@@ -1997,16 +2000,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
1997 * but we cannot guarantee that the information we produce is correct 2000 * but we cannot guarantee that the information we produce is correct
1998 * unless we produce it entirely atomically. 2001 * unless we produce it entirely atomically.
1999 * 2002 *
2000 * Upon tasks file open(), a struct ctr_struct is allocated, that
2001 * will have a pointer to an array (also allocated here). The struct
2002 * ctr_struct * is stored in file->private_data. Its resources will
2003 * be freed by release() when the file is closed. The array is used
2004 * to sprintf the PIDs and then used by read().
2005 */ 2003 */
2006struct ctr_struct {
2007 char *buf;
2008 int bufsz;
2009};
2010 2004
2011/* 2005/*
2012 * Load into 'pidarray' up to 'npids' of the tasks using cgroup 2006 * Load into 'pidarray' up to 'npids' of the tasks using cgroup
@@ -2088,42 +2082,132 @@ static int cmppid(const void *a, const void *b)
2088 return *(pid_t *)a - *(pid_t *)b; 2082 return *(pid_t *)a - *(pid_t *)b;
2089} 2083}
2090 2084
2085
2091/* 2086/*
2092 * Convert array 'a' of 'npids' pid_t's to a string of newline separated 2087 * seq_file methods for the "tasks" file. The seq_file position is the
2093 * decimal pids in 'buf'. Don't write more than 'sz' chars, but return 2088 * next pid to display; the seq_file iterator is a pointer to the pid
2094 * count 'cnt' of how many chars would be written if buf were large enough. 2089 * in the cgroup->tasks_pids array.
2095 */ 2090 */
2096static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) 2091
2092static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
2097{ 2093{
2098 int cnt = 0; 2094 /*
2099 int i; 2095 * Initially we receive a position value that corresponds to
2096 * one more than the last pid shown (or 0 on the first call or
2097 * after a seek to the start). Use a binary-search to find the
2098 * next pid to display, if any
2099 */
2100 struct cgroup *cgrp = s->private;
2101 int index = 0, pid = *pos;
2102 int *iter;
2103
2104 down_read(&cgrp->pids_mutex);
2105 if (pid) {
2106 int end = cgrp->pids_length;
2107 int i;
2108 while (index < end) {
2109 int mid = (index + end) / 2;
2110 if (cgrp->tasks_pids[mid] == pid) {
2111 index = mid;
2112 break;
2113 } else if (cgrp->tasks_pids[mid] <= pid)
2114 index = mid + 1;
2115 else
2116 end = mid;
2117 }
2118 }
2119 /* If we're off the end of the array, we're done */
2120 if (index >= cgrp->pids_length)
2121 return NULL;
2122 /* Update the abstract position to be the actual pid that we found */
2123 iter = cgrp->tasks_pids + index;
2124 *pos = *iter;
2125 return iter;
2126}
2127
2128static void cgroup_tasks_stop(struct seq_file *s, void *v)
2129{
2130 struct cgroup *cgrp = s->private;
2131 up_read(&cgrp->pids_mutex);
2132}
2133
2134static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
2135{
2136 struct cgroup *cgrp = s->private;
2137 int *p = v;
2138 int *end = cgrp->tasks_pids + cgrp->pids_length;
2139
2140 /*
2141 * Advance to the next pid in the array. If this goes off the
2142 * end, we're done
2143 */
2144 p++;
2145 if (p >= end) {
2146 return NULL;
2147 } else {
2148 *pos = *p;
2149 return p;
2150 }
2151}
2152
2153static int cgroup_tasks_show(struct seq_file *s, void *v)
2154{
2155 return seq_printf(s, "%d\n", *(int *)v);
2156}
2157
2158static struct seq_operations cgroup_tasks_seq_operations = {
2159 .start = cgroup_tasks_start,
2160 .stop = cgroup_tasks_stop,
2161 .next = cgroup_tasks_next,
2162 .show = cgroup_tasks_show,
2163};
2164
2165static void release_cgroup_pid_array(struct cgroup *cgrp)
2166{
2167 down_write(&cgrp->pids_mutex);
2168 BUG_ON(!cgrp->pids_use_count);
2169 if (!--cgrp->pids_use_count) {
2170 kfree(cgrp->tasks_pids);
2171 cgrp->tasks_pids = NULL;
2172 cgrp->pids_length = 0;
2173 }
2174 up_write(&cgrp->pids_mutex);
2175}
2176
2177static int cgroup_tasks_release(struct inode *inode, struct file *file)
2178{
2179 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2100 2180
2101 for (i = 0; i < npids; i++) 2181 if (!(file->f_mode & FMODE_READ))
2102 cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]); 2182 return 0;
2103 return cnt; 2183
2184 release_cgroup_pid_array(cgrp);
2185 return seq_release(inode, file);
2104} 2186}
2105 2187
2188static struct file_operations cgroup_tasks_operations = {
2189 .read = seq_read,
2190 .llseek = seq_lseek,
2191 .write = cgroup_file_write,
2192 .release = cgroup_tasks_release,
2193};
2194
2106/* 2195/*
2107 * Handle an open on 'tasks' file. Prepare a buffer listing the 2196 * Handle an open on 'tasks' file. Prepare an array containing the
2108 * process id's of tasks currently attached to the cgroup being opened. 2197 * process id's of tasks currently attached to the cgroup being opened.
2109 *
2110 * Does not require any specific cgroup mutexes, and does not take any.
2111 */ 2198 */
2199
2112static int cgroup_tasks_open(struct inode *unused, struct file *file) 2200static int cgroup_tasks_open(struct inode *unused, struct file *file)
2113{ 2201{
2114 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2202 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2115 struct ctr_struct *ctr;
2116 pid_t *pidarray; 2203 pid_t *pidarray;
2117 int npids; 2204 int npids;
2118 char c; 2205 int retval;
2119 2206
2207 /* Nothing to do for write-only files */
2120 if (!(file->f_mode & FMODE_READ)) 2208 if (!(file->f_mode & FMODE_READ))
2121 return 0; 2209 return 0;
2122 2210
2123 ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
2124 if (!ctr)
2125 goto err0;
2126
2127 /* 2211 /*
2128 * If cgroup gets more users after we read count, we won't have 2212 * If cgroup gets more users after we read count, we won't have
2129 * enough space - tough. This race is indistinguishable to the 2213 * enough space - tough. This race is indistinguishable to the
@@ -2131,57 +2215,31 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
2131 * show up until sometime later on. 2215 * show up until sometime later on.
2132 */ 2216 */
2133 npids = cgroup_task_count(cgrp); 2217 npids = cgroup_task_count(cgrp);
2134 if (npids) { 2218 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
2135 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); 2219 if (!pidarray)
2136 if (!pidarray) 2220 return -ENOMEM;
2137 goto err1; 2221 npids = pid_array_load(pidarray, npids, cgrp);
2138 2222 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
2139 npids = pid_array_load(pidarray, npids, cgrp);
2140 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
2141
2142 /* Call pid_array_to_buf() twice, first just to get bufsz */
2143 ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
2144 ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
2145 if (!ctr->buf)
2146 goto err2;
2147 ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
2148
2149 kfree(pidarray);
2150 } else {
2151 ctr->buf = NULL;
2152 ctr->bufsz = 0;
2153 }
2154 file->private_data = ctr;
2155 return 0;
2156
2157err2:
2158 kfree(pidarray);
2159err1:
2160 kfree(ctr);
2161err0:
2162 return -ENOMEM;
2163}
2164
2165static ssize_t cgroup_tasks_read(struct cgroup *cgrp,
2166 struct cftype *cft,
2167 struct file *file, char __user *buf,
2168 size_t nbytes, loff_t *ppos)
2169{
2170 struct ctr_struct *ctr = file->private_data;
2171 2223
2172 return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz); 2224 /*
2173} 2225 * Store the array in the cgroup, freeing the old
2226 * array if necessary
2227 */
2228 down_write(&cgrp->pids_mutex);
2229 kfree(cgrp->tasks_pids);
2230 cgrp->tasks_pids = pidarray;
2231 cgrp->pids_length = npids;
2232 cgrp->pids_use_count++;
2233 up_write(&cgrp->pids_mutex);
2174 2234
2175static int cgroup_tasks_release(struct inode *unused_inode, 2235 file->f_op = &cgroup_tasks_operations;
2176 struct file *file)
2177{
2178 struct ctr_struct *ctr;
2179 2236
2180 if (file->f_mode & FMODE_READ) { 2237 retval = seq_open(file, &cgroup_tasks_seq_operations);
2181 ctr = file->private_data; 2238 if (retval) {
2182 kfree(ctr->buf); 2239 release_cgroup_pid_array(cgrp);
2183 kfree(ctr); 2240 return retval;
2184 } 2241 }
2242 ((struct seq_file *)file->private_data)->private = cgrp;
2185 return 0; 2243 return 0;
2186} 2244}
2187 2245
@@ -2210,7 +2268,6 @@ static struct cftype files[] = {
2210 { 2268 {
2211 .name = "tasks", 2269 .name = "tasks",
2212 .open = cgroup_tasks_open, 2270 .open = cgroup_tasks_open,
2213 .read = cgroup_tasks_read,
2214 .write_u64 = cgroup_tasks_write, 2271 .write_u64 = cgroup_tasks_write,
2215 .release = cgroup_tasks_release, 2272 .release = cgroup_tasks_release,
2216 .private = FILE_TASKLIST, 2273 .private = FILE_TASKLIST,
@@ -2300,10 +2357,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2300 2357
2301 mutex_lock(&cgroup_mutex); 2358 mutex_lock(&cgroup_mutex);
2302 2359
2303 INIT_LIST_HEAD(&cgrp->sibling); 2360 init_cgroup_housekeeping(cgrp);
2304 INIT_LIST_HEAD(&cgrp->children);
2305 INIT_LIST_HEAD(&cgrp->css_sets);
2306 INIT_LIST_HEAD(&cgrp->release_list);
2307 2361
2308 cgrp->parent = parent; 2362 cgrp->parent = parent;
2309 cgrp->root = parent->root; 2363 cgrp->root = parent->root;
@@ -2495,8 +2549,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2495int __init cgroup_init_early(void) 2549int __init cgroup_init_early(void)
2496{ 2550{
2497 int i; 2551 int i;
2498 kref_init(&init_css_set.ref); 2552 atomic_set(&init_css_set.refcount, 1);
2499 kref_get(&init_css_set.ref);
2500 INIT_LIST_HEAD(&init_css_set.cg_links); 2553 INIT_LIST_HEAD(&init_css_set.cg_links);
2501 INIT_LIST_HEAD(&init_css_set.tasks); 2554 INIT_LIST_HEAD(&init_css_set.tasks);
2502 INIT_HLIST_NODE(&init_css_set.hlist); 2555 INIT_HLIST_NODE(&init_css_set.hlist);
@@ -2735,21 +2788,24 @@ void cgroup_fork_callbacks(struct task_struct *child)
2735 * Called on every change to mm->owner. mm_init_owner() does not 2788 * Called on every change to mm->owner. mm_init_owner() does not
2736 * invoke this routine, since it assigns the mm->owner the first time 2789 * invoke this routine, since it assigns the mm->owner the first time
2737 * and does not change it. 2790 * and does not change it.
2791 *
2792 * The callbacks are invoked with mmap_sem held in read mode.
2738 */ 2793 */
2739void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) 2794void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
2740{ 2795{
2741 struct cgroup *oldcgrp, *newcgrp; 2796 struct cgroup *oldcgrp, *newcgrp = NULL;
2742 2797
2743 if (need_mm_owner_callback) { 2798 if (need_mm_owner_callback) {
2744 int i; 2799 int i;
2745 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2800 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2746 struct cgroup_subsys *ss = subsys[i]; 2801 struct cgroup_subsys *ss = subsys[i];
2747 oldcgrp = task_cgroup(old, ss->subsys_id); 2802 oldcgrp = task_cgroup(old, ss->subsys_id);
2748 newcgrp = task_cgroup(new, ss->subsys_id); 2803 if (new)
2804 newcgrp = task_cgroup(new, ss->subsys_id);
2749 if (oldcgrp == newcgrp) 2805 if (oldcgrp == newcgrp)
2750 continue; 2806 continue;
2751 if (ss->mm_owner_changed) 2807 if (ss->mm_owner_changed)
2752 ss->mm_owner_changed(ss, oldcgrp, newcgrp); 2808 ss->mm_owner_changed(ss, oldcgrp, newcgrp, new);
2753 } 2809 }
2754 } 2810 }
2755} 2811}
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
index c3dc3aba4c02..daca6209202d 100644
--- a/kernel/cgroup_debug.c
+++ b/kernel/cgroup_debug.c
@@ -57,7 +57,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cont,
57 u64 count; 57 u64 count;
58 58
59 rcu_read_lock(); 59 rcu_read_lock();
60 count = atomic_read(&current->cgroups->ref.refcount); 60 count = atomic_read(&current->cgroups->refcount);
61 rcu_read_unlock(); 61 rcu_read_unlock();
62 return count; 62 return count;
63} 63}
@@ -90,7 +90,7 @@ static struct cftype files[] = {
90 { 90 {
91 .name = "releasable", 91 .name = "releasable",
92 .read_u64 = releasable_read, 92 .read_u64 = releasable_read,
93 } 93 },
94}; 94};
95 95
96static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) 96static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
new file mode 100644
index 000000000000..e95056954498
--- /dev/null
+++ b/kernel/cgroup_freezer.c
@@ -0,0 +1,379 @@
1/*
2 * cgroup_freezer.c - control group freezer subsystem
3 *
4 * Copyright IBM Corporation, 2007
5 *
6 * Author : Cedric Le Goater <clg@fr.ibm.com>
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms of version 2.1 of the GNU Lesser General Public License
10 * as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it would be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
15 */
16
17#include <linux/module.h>
18#include <linux/cgroup.h>
19#include <linux/fs.h>
20#include <linux/uaccess.h>
21#include <linux/freezer.h>
22#include <linux/seq_file.h>
23
24enum freezer_state {
25 CGROUP_THAWED = 0,
26 CGROUP_FREEZING,
27 CGROUP_FROZEN,
28};
29
30struct freezer {
31 struct cgroup_subsys_state css;
32 enum freezer_state state;
33 spinlock_t lock; /* protects _writes_ to state */
34};
35
36static inline struct freezer *cgroup_freezer(
37 struct cgroup *cgroup)
38{
39 return container_of(
40 cgroup_subsys_state(cgroup, freezer_subsys_id),
41 struct freezer, css);
42}
43
44static inline struct freezer *task_freezer(struct task_struct *task)
45{
46 return container_of(task_subsys_state(task, freezer_subsys_id),
47 struct freezer, css);
48}
49
50int cgroup_frozen(struct task_struct *task)
51{
52 struct freezer *freezer;
53 enum freezer_state state;
54
55 task_lock(task);
56 freezer = task_freezer(task);
57 state = freezer->state;
58 task_unlock(task);
59
60 return state == CGROUP_FROZEN;
61}
62
63/*
64 * cgroups_write_string() limits the size of freezer state strings to
65 * CGROUP_LOCAL_BUFFER_SIZE
66 */
67static const char *freezer_state_strs[] = {
68 "THAWED",
69 "FREEZING",
70 "FROZEN",
71};
72
73/*
74 * State diagram
75 * Transitions are caused by userspace writes to the freezer.state file.
76 * The values in parenthesis are state labels. The rest are edge labels.
77 *
78 * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
79 * ^ ^ | |
80 * | \_______THAWED_______/ |
81 * \__________________________THAWED____________/
82 */
83
84struct cgroup_subsys freezer_subsys;
85
86/* Locks taken and their ordering
87 * ------------------------------
88 * css_set_lock
89 * cgroup_mutex (AKA cgroup_lock)
90 * task->alloc_lock (AKA task_lock)
91 * freezer->lock
92 * task->sighand->siglock
93 *
94 * cgroup code forces css_set_lock to be taken before task->alloc_lock
95 *
96 * freezer_create(), freezer_destroy():
97 * cgroup_mutex [ by cgroup core ]
98 *
99 * can_attach():
100 * cgroup_mutex
101 *
102 * cgroup_frozen():
103 * task->alloc_lock (to get task's cgroup)
104 *
105 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
106 * task->alloc_lock (to get task's cgroup)
107 * freezer->lock
108 * sighand->siglock (if the cgroup is freezing)
109 *
110 * freezer_read():
111 * cgroup_mutex
112 * freezer->lock
113 * read_lock css_set_lock (cgroup iterator start)
114 *
115 * freezer_write() (freeze):
116 * cgroup_mutex
117 * freezer->lock
118 * read_lock css_set_lock (cgroup iterator start)
119 * sighand->siglock
120 *
121 * freezer_write() (unfreeze):
122 * cgroup_mutex
123 * freezer->lock
124 * read_lock css_set_lock (cgroup iterator start)
125 * task->alloc_lock (to prevent races with freeze_task())
126 * sighand->siglock
127 */
128static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
129 struct cgroup *cgroup)
130{
131 struct freezer *freezer;
132
133 freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL);
134 if (!freezer)
135 return ERR_PTR(-ENOMEM);
136
137 spin_lock_init(&freezer->lock);
138 freezer->state = CGROUP_THAWED;
139 return &freezer->css;
140}
141
142static void freezer_destroy(struct cgroup_subsys *ss,
143 struct cgroup *cgroup)
144{
145 kfree(cgroup_freezer(cgroup));
146}
147
148/* Task is frozen or will freeze immediately when next it gets woken */
149static bool is_task_frozen_enough(struct task_struct *task)
150{
151 return frozen(task) ||
152 (task_is_stopped_or_traced(task) && freezing(task));
153}
154
155/*
156 * The call to cgroup_lock() in the freezer.state write method prevents
157 * a write to that file racing against an attach, and hence the
158 * can_attach() result will remain valid until the attach completes.
159 */
160static int freezer_can_attach(struct cgroup_subsys *ss,
161 struct cgroup *new_cgroup,
162 struct task_struct *task)
163{
164 struct freezer *freezer;
165 int retval;
166
167 /* Anything frozen can't move or be moved to/from */
168
169 if (is_task_frozen_enough(task))
170 return -EBUSY;
171
172 freezer = cgroup_freezer(new_cgroup);
173 if (freezer->state == CGROUP_FROZEN)
174 return -EBUSY;
175
176 retval = 0;
177 task_lock(task);
178 freezer = task_freezer(task);
179 if (freezer->state == CGROUP_FROZEN)
180 retval = -EBUSY;
181 task_unlock(task);
182 return retval;
183}
184
185static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
186{
187 struct freezer *freezer;
188
189 task_lock(task);
190 freezer = task_freezer(task);
191 task_unlock(task);
192
193 BUG_ON(freezer->state == CGROUP_FROZEN);
194 spin_lock_irq(&freezer->lock);
195 /* Locking avoids race with FREEZING -> THAWED transitions. */
196 if (freezer->state == CGROUP_FREEZING)
197 freeze_task(task, true);
198 spin_unlock_irq(&freezer->lock);
199}
200
201/*
202 * caller must hold freezer->lock
203 */
204static void update_freezer_state(struct cgroup *cgroup,
205 struct freezer *freezer)
206{
207 struct cgroup_iter it;
208 struct task_struct *task;
209 unsigned int nfrozen = 0, ntotal = 0;
210
211 cgroup_iter_start(cgroup, &it);
212 while ((task = cgroup_iter_next(cgroup, &it))) {
213 ntotal++;
214 if (is_task_frozen_enough(task))
215 nfrozen++;
216 }
217
218 /*
219 * Transition to FROZEN when no new tasks can be added ensures
220 * that we never exist in the FROZEN state while there are unfrozen
221 * tasks.
222 */
223 if (nfrozen == ntotal)
224 freezer->state = CGROUP_FROZEN;
225 else if (nfrozen > 0)
226 freezer->state = CGROUP_FREEZING;
227 else
228 freezer->state = CGROUP_THAWED;
229 cgroup_iter_end(cgroup, &it);
230}
231
232static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
233 struct seq_file *m)
234{
235 struct freezer *freezer;
236 enum freezer_state state;
237
238 if (!cgroup_lock_live_group(cgroup))
239 return -ENODEV;
240
241 freezer = cgroup_freezer(cgroup);
242 spin_lock_irq(&freezer->lock);
243 state = freezer->state;
244 if (state == CGROUP_FREEZING) {
245 /* We change from FREEZING to FROZEN lazily if the cgroup was
246 * only partially frozen when we exitted write. */
247 update_freezer_state(cgroup, freezer);
248 state = freezer->state;
249 }
250 spin_unlock_irq(&freezer->lock);
251 cgroup_unlock();
252
253 seq_puts(m, freezer_state_strs[state]);
254 seq_putc(m, '\n');
255 return 0;
256}
257
258static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
259{
260 struct cgroup_iter it;
261 struct task_struct *task;
262 unsigned int num_cant_freeze_now = 0;
263
264 freezer->state = CGROUP_FREEZING;
265 cgroup_iter_start(cgroup, &it);
266 while ((task = cgroup_iter_next(cgroup, &it))) {
267 if (!freeze_task(task, true))
268 continue;
269 if (is_task_frozen_enough(task))
270 continue;
271 if (!freezing(task) && !freezer_should_skip(task))
272 num_cant_freeze_now++;
273 }
274 cgroup_iter_end(cgroup, &it);
275
276 return num_cant_freeze_now ? -EBUSY : 0;
277}
278
279static int unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
280{
281 struct cgroup_iter it;
282 struct task_struct *task;
283
284 cgroup_iter_start(cgroup, &it);
285 while ((task = cgroup_iter_next(cgroup, &it))) {
286 int do_wake;
287
288 task_lock(task);
289 do_wake = __thaw_process(task);
290 task_unlock(task);
291 if (do_wake)
292 wake_up_process(task);
293 }
294 cgroup_iter_end(cgroup, &it);
295 freezer->state = CGROUP_THAWED;
296
297 return 0;
298}
299
300static int freezer_change_state(struct cgroup *cgroup,
301 enum freezer_state goal_state)
302{
303 struct freezer *freezer;
304 int retval = 0;
305
306 freezer = cgroup_freezer(cgroup);
307 spin_lock_irq(&freezer->lock);
308 update_freezer_state(cgroup, freezer);
309 if (goal_state == freezer->state)
310 goto out;
311 switch (freezer->state) {
312 case CGROUP_THAWED:
313 retval = try_to_freeze_cgroup(cgroup, freezer);
314 break;
315 case CGROUP_FREEZING:
316 if (goal_state == CGROUP_FROZEN) {
317 /* Userspace is retrying after
318 * "/bin/echo FROZEN > freezer.state" returned -EBUSY */
319 retval = try_to_freeze_cgroup(cgroup, freezer);
320 break;
321 }
322 /* state == FREEZING and goal_state == THAWED, so unfreeze */
323 case CGROUP_FROZEN:
324 retval = unfreeze_cgroup(cgroup, freezer);
325 break;
326 default:
327 break;
328 }
329out:
330 spin_unlock_irq(&freezer->lock);
331
332 return retval;
333}
334
335static int freezer_write(struct cgroup *cgroup,
336 struct cftype *cft,
337 const char *buffer)
338{
339 int retval;
340 enum freezer_state goal_state;
341
342 if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0)
343 goal_state = CGROUP_THAWED;
344 else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
345 goal_state = CGROUP_FROZEN;
346 else
347 return -EIO;
348
349 if (!cgroup_lock_live_group(cgroup))
350 return -ENODEV;
351 retval = freezer_change_state(cgroup, goal_state);
352 cgroup_unlock();
353 return retval;
354}
355
356static struct cftype files[] = {
357 {
358 .name = "state",
359 .read_seq_string = freezer_read,
360 .write_string = freezer_write,
361 },
362};
363
364static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
365{
366 return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
367}
368
369struct cgroup_subsys freezer_subsys = {
370 .name = "freezer",
371 .create = freezer_create,
372 .destroy = freezer_destroy,
373 .populate = freezer_populate,
374 .subsys_id = freezer_subsys_id,
375 .can_attach = freezer_can_attach,
376 .attach = NULL,
377 .fork = freezer_fork,
378 .exit = NULL,
379};
diff --git a/kernel/compat.c b/kernel/compat.c
index 32c254a8ab9a..143990e48cb9 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -26,6 +26,64 @@
26 26
27#include <asm/uaccess.h> 27#include <asm/uaccess.h>
28 28
29/*
30 * Note that the native side is already converted to a timespec, because
31 * that's what we want anyway.
32 */
33static int compat_get_timeval(struct timespec *o,
34 struct compat_timeval __user *i)
35{
36 long usec;
37
38 if (get_user(o->tv_sec, &i->tv_sec) ||
39 get_user(usec, &i->tv_usec))
40 return -EFAULT;
41 o->tv_nsec = usec * 1000;
42 return 0;
43}
44
45static int compat_put_timeval(struct compat_timeval __user *o,
46 struct timeval *i)
47{
48 return (put_user(i->tv_sec, &o->tv_sec) ||
49 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
50}
51
52asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
53 struct timezone __user *tz)
54{
55 if (tv) {
56 struct timeval ktv;
57 do_gettimeofday(&ktv);
58 if (compat_put_timeval(tv, &ktv))
59 return -EFAULT;
60 }
61 if (tz) {
62 if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
63 return -EFAULT;
64 }
65
66 return 0;
67}
68
69asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
70 struct timezone __user *tz)
71{
72 struct timespec kts;
73 struct timezone ktz;
74
75 if (tv) {
76 if (compat_get_timeval(&kts, tv))
77 return -EFAULT;
78 }
79 if (tz) {
80 if (copy_from_user(&ktz, tz, sizeof(ktz)))
81 return -EFAULT;
82 }
83
84 return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
85}
86
29int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) 87int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
30{ 88{
31 return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || 89 return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
diff --git a/kernel/configs.c b/kernel/configs.c
index 4c345210ed8c..abaee684ecbf 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -54,9 +54,6 @@
54 54
55#ifdef CONFIG_IKCONFIG_PROC 55#ifdef CONFIG_IKCONFIG_PROC
56 56
57/**************************************************/
58/* globals and useful constants */
59
60static ssize_t 57static ssize_t
61ikconfig_read_current(struct file *file, char __user *buf, 58ikconfig_read_current(struct file *file, char __user *buf,
62 size_t len, loff_t * offset) 59 size_t len, loff_t * offset)
@@ -71,9 +68,6 @@ static const struct file_operations ikconfig_file_ops = {
71 .read = ikconfig_read_current, 68 .read = ikconfig_read_current,
72}; 69};
73 70
74/***************************************************/
75/* ikconfig_init: start up everything we need to */
76
77static int __init ikconfig_init(void) 71static int __init ikconfig_init(void)
78{ 72{
79 struct proc_dir_entry *entry; 73 struct proc_dir_entry *entry;
@@ -89,9 +83,6 @@ static int __init ikconfig_init(void)
89 return 0; 83 return 0;
90} 84}
91 85
92/***************************************************/
93/* ikconfig_cleanup: clean up our mess */
94
95static void __exit ikconfig_cleanup(void) 86static void __exit ikconfig_cleanup(void)
96{ 87{
97 remove_proc_entry("config.gz", NULL); 88 remove_proc_entry("config.gz", NULL);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f17e9854c246..86d49045daed 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -199,13 +199,14 @@ static int __ref take_cpu_down(void *_param)
199 struct take_cpu_down_param *param = _param; 199 struct take_cpu_down_param *param = _param;
200 int err; 200 int err;
201 201
202 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
203 param->hcpu);
204 /* Ensure this CPU doesn't handle any more interrupts. */ 202 /* Ensure this CPU doesn't handle any more interrupts. */
205 err = __cpu_disable(); 203 err = __cpu_disable();
206 if (err < 0) 204 if (err < 0)
207 return err; 205 return err;
208 206
207 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
208 param->hcpu);
209
209 /* Force idle task to run as soon as we yield: it should 210 /* Force idle task to run as soon as we yield: it should
210 immediately notice cpu is offline and die quickly. */ 211 immediately notice cpu is offline and die quickly. */
211 sched_idle_next(); 212 sched_idle_next();
@@ -453,6 +454,25 @@ out:
453} 454}
454#endif /* CONFIG_PM_SLEEP_SMP */ 455#endif /* CONFIG_PM_SLEEP_SMP */
455 456
457/**
458 * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
459 * @cpu: cpu that just started
460 *
461 * This function calls the cpu_chain notifiers with CPU_STARTING.
462 * It must be called by the arch code on the new cpu, before the new cpu
463 * enables interrupts and before the "boot" cpu returns from __cpu_up().
464 */
465void notify_cpu_starting(unsigned int cpu)
466{
467 unsigned long val = CPU_STARTING;
468
469#ifdef CONFIG_PM_SLEEP_SMP
470 if (cpu_isset(cpu, frozen_cpus))
471 val = CPU_STARTING_FROZEN;
472#endif /* CONFIG_PM_SLEEP_SMP */
473 raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
474}
475
456#endif /* CONFIG_SMP */ 476#endif /* CONFIG_SMP */
457 477
458/* 478/*
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 827cd9adccb2..3e00526f52ec 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1172,7 +1172,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1172{ 1172{
1173 struct cpuset trialcs; 1173 struct cpuset trialcs;
1174 int err; 1174 int err;
1175 int cpus_nonempty, balance_flag_changed; 1175 int balance_flag_changed;
1176 1176
1177 trialcs = *cs; 1177 trialcs = *cs;
1178 if (turning_on) 1178 if (turning_on)
@@ -1184,7 +1184,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1184 if (err < 0) 1184 if (err < 0)
1185 return err; 1185 return err;
1186 1186
1187 cpus_nonempty = !cpus_empty(trialcs.cpus_allowed);
1188 balance_flag_changed = (is_sched_load_balance(cs) != 1187 balance_flag_changed = (is_sched_load_balance(cs) !=
1189 is_sched_load_balance(&trialcs)); 1188 is_sched_load_balance(&trialcs));
1190 1189
@@ -1192,7 +1191,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1192 cs->flags = trialcs.flags; 1191 cs->flags = trialcs.flags;
1193 mutex_unlock(&callback_mutex); 1192 mutex_unlock(&callback_mutex);
1194 1193
1195 if (cpus_nonempty && balance_flag_changed) 1194 if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed)
1196 async_rebuild_sched_domains(); 1195 async_rebuild_sched_domains();
1197 1196
1198 return 0; 1197 return 0;
@@ -1921,7 +1920,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1921 * that has tasks along with an empty 'mems'. But if we did see such 1920 * that has tasks along with an empty 'mems'. But if we did see such
1922 * a cpuset, we'd handle it just like we do if its 'cpus' was empty. 1921 * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
1923 */ 1922 */
1924static void scan_for_empty_cpusets(const struct cpuset *root) 1923static void scan_for_empty_cpusets(struct cpuset *root)
1925{ 1924{
1926 LIST_HEAD(queue); 1925 LIST_HEAD(queue);
1927 struct cpuset *cp; /* scans cpusets being updated */ 1926 struct cpuset *cp; /* scans cpusets being updated */
@@ -2437,19 +2436,15 @@ const struct file_operations proc_cpuset_operations = {
2437void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) 2436void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2438{ 2437{
2439 seq_printf(m, "Cpus_allowed:\t"); 2438 seq_printf(m, "Cpus_allowed:\t");
2440 m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, 2439 seq_cpumask(m, &task->cpus_allowed);
2441 task->cpus_allowed);
2442 seq_printf(m, "\n"); 2440 seq_printf(m, "\n");
2443 seq_printf(m, "Cpus_allowed_list:\t"); 2441 seq_printf(m, "Cpus_allowed_list:\t");
2444 m->count += cpulist_scnprintf(m->buf + m->count, m->size - m->count, 2442 seq_cpumask_list(m, &task->cpus_allowed);
2445 task->cpus_allowed);
2446 seq_printf(m, "\n"); 2443 seq_printf(m, "\n");
2447 seq_printf(m, "Mems_allowed:\t"); 2444 seq_printf(m, "Mems_allowed:\t");
2448 m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, 2445 seq_nodemask(m, &task->mems_allowed);
2449 task->mems_allowed);
2450 seq_printf(m, "\n"); 2446 seq_printf(m, "\n");
2451 seq_printf(m, "Mems_allowed_list:\t"); 2447 seq_printf(m, "Mems_allowed_list:\t");
2452 m->count += nodelist_scnprintf(m->buf + m->count, m->size - m->count, 2448 seq_nodemask_list(m, &task->mems_allowed);
2453 task->mems_allowed);
2454 seq_printf(m, "\n"); 2449 seq_printf(m, "\n");
2455} 2450}
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index c1d4d5b4c61c..f013a0c2e111 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -124,6 +124,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
124 } 124 }
125 return (mem != NULL); 125 return (mem != NULL);
126} 126}
127EXPORT_SYMBOL(dma_alloc_from_coherent);
127 128
128/** 129/**
129 * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool 130 * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
@@ -151,3 +152,4 @@ int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
151 } 152 }
152 return 0; 153 return 0;
153} 154}
155EXPORT_SYMBOL(dma_release_from_coherent);
diff --git a/kernel/dma.c b/kernel/dma.c
index d2c60a822790..f903189c5304 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -1,4 +1,4 @@
1/* $Id: dma.c,v 1.7 1994/12/28 03:35:33 root Exp root $ 1/*
2 * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c. 2 * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c.
3 * 3 *
4 * Written by Hennus Bergman, 1992. 4 * Written by Hennus Bergman, 1992.
diff --git a/kernel/exit.c b/kernel/exit.c
index 16395644a98f..0ef4673e351b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -583,8 +583,6 @@ mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
583 * If there are other users of the mm and the owner (us) is exiting 583 * If there are other users of the mm and the owner (us) is exiting
584 * we need to find a new owner to take on the responsibility. 584 * we need to find a new owner to take on the responsibility.
585 */ 585 */
586 if (!mm)
587 return 0;
588 if (atomic_read(&mm->mm_users) <= 1) 586 if (atomic_read(&mm->mm_users) <= 1)
589 return 0; 587 return 0;
590 if (mm->owner != p) 588 if (mm->owner != p)
@@ -627,29 +625,38 @@ retry:
627 } while_each_thread(g, c); 625 } while_each_thread(g, c);
628 626
629 read_unlock(&tasklist_lock); 627 read_unlock(&tasklist_lock);
628 /*
629 * We found no owner yet mm_users > 1: this implies that we are
630 * most likely racing with swapoff (try_to_unuse()) or /proc or
631 * ptrace or page migration (get_task_mm()). Mark owner as NULL,
632 * so that subsystems can understand the callback and take action.
633 */
634 down_write(&mm->mmap_sem);
635 cgroup_mm_owner_callbacks(mm->owner, NULL);
636 mm->owner = NULL;
637 up_write(&mm->mmap_sem);
630 return; 638 return;
631 639
632assign_new_owner: 640assign_new_owner:
633 BUG_ON(c == p); 641 BUG_ON(c == p);
634 get_task_struct(c); 642 get_task_struct(c);
643 read_unlock(&tasklist_lock);
644 down_write(&mm->mmap_sem);
635 /* 645 /*
636 * The task_lock protects c->mm from changing. 646 * The task_lock protects c->mm from changing.
637 * We always want mm->owner->mm == mm 647 * We always want mm->owner->mm == mm
638 */ 648 */
639 task_lock(c); 649 task_lock(c);
640 /*
641 * Delay read_unlock() till we have the task_lock()
642 * to ensure that c does not slip away underneath us
643 */
644 read_unlock(&tasklist_lock);
645 if (c->mm != mm) { 650 if (c->mm != mm) {
646 task_unlock(c); 651 task_unlock(c);
652 up_write(&mm->mmap_sem);
647 put_task_struct(c); 653 put_task_struct(c);
648 goto retry; 654 goto retry;
649 } 655 }
650 cgroup_mm_owner_callbacks(mm->owner, c); 656 cgroup_mm_owner_callbacks(mm->owner, c);
651 mm->owner = c; 657 mm->owner = c;
652 task_unlock(c); 658 task_unlock(c);
659 up_write(&mm->mmap_sem);
653 put_task_struct(c); 660 put_task_struct(c);
654} 661}
655#endif /* CONFIG_MM_OWNER */ 662#endif /* CONFIG_MM_OWNER */
diff --git a/kernel/fork.c b/kernel/fork.c
index 7ce2ebe84796..30de644a40c4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -802,6 +802,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
802 802
803 sig->leader = 0; /* session leadership doesn't inherit */ 803 sig->leader = 0; /* session leadership doesn't inherit */
804 sig->tty_old_pgrp = NULL; 804 sig->tty_old_pgrp = NULL;
805 sig->tty = NULL;
805 806
806 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; 807 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
807 sig->gtime = cputime_zero; 808 sig->gtime = cputime_zero;
@@ -838,6 +839,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
838void __cleanup_signal(struct signal_struct *sig) 839void __cleanup_signal(struct signal_struct *sig)
839{ 840{
840 exit_thread_group_keys(sig); 841 exit_thread_group_keys(sig);
842 tty_kref_put(sig->tty);
841 kmem_cache_free(signal_cachep, sig); 843 kmem_cache_free(signal_cachep, sig);
842} 844}
843 845
@@ -1227,7 +1229,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1227 p->nsproxy->pid_ns->child_reaper = p; 1229 p->nsproxy->pid_ns->child_reaper = p;
1228 1230
1229 p->signal->leader_pid = pid; 1231 p->signal->leader_pid = pid;
1230 p->signal->tty = current->signal->tty; 1232 tty_kref_put(p->signal->tty);
1233 p->signal->tty = tty_kref_get(current->signal->tty);
1231 set_task_pgrp(p, task_pgrp_nr(current)); 1234 set_task_pgrp(p, task_pgrp_nr(current));
1232 set_task_session(p, task_session_nr(current)); 1235 set_task_session(p, task_session_nr(current));
1233 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1236 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
diff --git a/kernel/freezer.c b/kernel/freezer.c
new file mode 100644
index 000000000000..ba6248b323ef
--- /dev/null
+++ b/kernel/freezer.c
@@ -0,0 +1,154 @@
1/*
2 * kernel/freezer.c - Function to freeze a process
3 *
4 * Originally from kernel/power/process.c
5 */
6
7#include <linux/interrupt.h>
8#include <linux/suspend.h>
9#include <linux/module.h>
10#include <linux/syscalls.h>
11#include <linux/freezer.h>
12
13/*
14 * freezing is complete, mark current process as frozen
15 */
16static inline void frozen_process(void)
17{
18 if (!unlikely(current->flags & PF_NOFREEZE)) {
19 current->flags |= PF_FROZEN;
20 wmb();
21 }
22 clear_freeze_flag(current);
23}
24
25/* Refrigerator is place where frozen processes are stored :-). */
26void refrigerator(void)
27{
28 /* Hmm, should we be allowed to suspend when there are realtime
29 processes around? */
30 long save;
31
32 task_lock(current);
33 if (freezing(current)) {
34 frozen_process();
35 task_unlock(current);
36 } else {
37 task_unlock(current);
38 return;
39 }
40 save = current->state;
41 pr_debug("%s entered refrigerator\n", current->comm);
42
43 spin_lock_irq(&current->sighand->siglock);
44 recalc_sigpending(); /* We sent fake signal, clean it up */
45 spin_unlock_irq(&current->sighand->siglock);
46
47 for (;;) {
48 set_current_state(TASK_UNINTERRUPTIBLE);
49 if (!frozen(current))
50 break;
51 schedule();
52 }
53 pr_debug("%s left refrigerator\n", current->comm);
54 __set_current_state(save);
55}
56EXPORT_SYMBOL(refrigerator);
57
58static void fake_signal_wake_up(struct task_struct *p)
59{
60 unsigned long flags;
61
62 spin_lock_irqsave(&p->sighand->siglock, flags);
63 signal_wake_up(p, 0);
64 spin_unlock_irqrestore(&p->sighand->siglock, flags);
65}
66
67/**
68 * freeze_task - send a freeze request to given task
69 * @p: task to send the request to
70 * @sig_only: if set, the request will only be sent if the task has the
71 * PF_FREEZER_NOSIG flag unset
72 * Return value: 'false', if @sig_only is set and the task has
73 * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
74 *
75 * The freeze request is sent by setting the tasks's TIF_FREEZE flag and
76 * either sending a fake signal to it or waking it up, depending on whether
77 * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
78 * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
79 * TIF_FREEZE flag will not be set.
80 */
81bool freeze_task(struct task_struct *p, bool sig_only)
82{
83 /*
84 * We first check if the task is freezing and next if it has already
85 * been frozen to avoid the race with frozen_process() which first marks
86 * the task as frozen and next clears its TIF_FREEZE.
87 */
88 if (!freezing(p)) {
89 rmb();
90 if (frozen(p))
91 return false;
92
93 if (!sig_only || should_send_signal(p))
94 set_freeze_flag(p);
95 else
96 return false;
97 }
98
99 if (should_send_signal(p)) {
100 if (!signal_pending(p))
101 fake_signal_wake_up(p);
102 } else if (sig_only) {
103 return false;
104 } else {
105 wake_up_state(p, TASK_INTERRUPTIBLE);
106 }
107
108 return true;
109}
110
111void cancel_freezing(struct task_struct *p)
112{
113 unsigned long flags;
114
115 if (freezing(p)) {
116 pr_debug(" clean up: %s\n", p->comm);
117 clear_freeze_flag(p);
118 spin_lock_irqsave(&p->sighand->siglock, flags);
119 recalc_sigpending_and_wake(p);
120 spin_unlock_irqrestore(&p->sighand->siglock, flags);
121 }
122}
123
124/*
125 * Wake up a frozen process
126 *
127 * task_lock() is needed to prevent the race with refrigerator() which may
128 * occur if the freezing of tasks fails. Namely, without the lock, if the
129 * freezing of tasks failed, thaw_tasks() might have run before a task in
130 * refrigerator() could call frozen_process(), in which case the task would be
131 * frozen and no one would thaw it.
132 */
133int __thaw_process(struct task_struct *p)
134{
135 if (frozen(p)) {
136 p->flags &= ~PF_FROZEN;
137 return 1;
138 }
139 clear_freeze_flag(p);
140 return 0;
141}
142
143int thaw_process(struct task_struct *p)
144{
145 task_lock(p);
146 if (__thaw_process(p) == 1) {
147 task_unlock(p);
148 wake_up_process(p);
149 return 1;
150 }
151 task_unlock(p);
152 return 0;
153}
154EXPORT_SYMBOL(thaw_process);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b8e4dce80a74..cdec83e722fa 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -672,13 +672,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
672 */ 672 */
673 BUG_ON(timer->function(timer) != HRTIMER_NORESTART); 673 BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
674 return 1; 674 return 1;
675 case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: 675 case HRTIMER_CB_IRQSAFE_PERCPU:
676 case HRTIMER_CB_IRQSAFE_UNLOCKED:
676 /* 677 /*
677 * This is solely for the sched tick emulation with 678 * This is solely for the sched tick emulation with
678 * dynamic tick support to ensure that we do not 679 * dynamic tick support to ensure that we do not
679 * restart the tick right on the edge and end up with 680 * restart the tick right on the edge and end up with
680 * the tick timer in the softirq ! The calling site 681 * the tick timer in the softirq ! The calling site
681 * takes care of this. 682 * takes care of this. Also used for hrtimer sleeper !
682 */ 683 */
683 debug_hrtimer_deactivate(timer); 684 debug_hrtimer_deactivate(timer);
684 return 1; 685 return 1;
@@ -1245,7 +1246,8 @@ static void __run_hrtimer(struct hrtimer *timer)
1245 timer_stats_account_hrtimer(timer); 1246 timer_stats_account_hrtimer(timer);
1246 1247
1247 fn = timer->function; 1248 fn = timer->function;
1248 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { 1249 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
1250 timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
1249 /* 1251 /*
1250 * Used for scheduler timers, avoid lock inversion with 1252 * Used for scheduler timers, avoid lock inversion with
1251 * rq->lock and tasklist_lock. 1253 * rq->lock and tasklist_lock.
@@ -1452,7 +1454,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1452 sl->timer.function = hrtimer_wakeup; 1454 sl->timer.function = hrtimer_wakeup;
1453 sl->task = task; 1455 sl->task = task;
1454#ifdef CONFIG_HIGH_RES_TIMERS 1456#ifdef CONFIG_HIGH_RES_TIMERS
1455 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 1457 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
1456#endif 1458#endif
1457} 1459}
1458 1460
@@ -1591,29 +1593,95 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1591 1593
1592#ifdef CONFIG_HOTPLUG_CPU 1594#ifdef CONFIG_HOTPLUG_CPU
1593 1595
1594static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, 1596static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1595 struct hrtimer_clock_base *new_base) 1597 struct hrtimer_clock_base *new_base, int dcpu)
1596{ 1598{
1597 struct hrtimer *timer; 1599 struct hrtimer *timer;
1598 struct rb_node *node; 1600 struct rb_node *node;
1601 int raise = 0;
1599 1602
1600 while ((node = rb_first(&old_base->active))) { 1603 while ((node = rb_first(&old_base->active))) {
1601 timer = rb_entry(node, struct hrtimer, node); 1604 timer = rb_entry(node, struct hrtimer, node);
1602 BUG_ON(hrtimer_callback_running(timer)); 1605 BUG_ON(hrtimer_callback_running(timer));
1603 debug_hrtimer_deactivate(timer); 1606 debug_hrtimer_deactivate(timer);
1604 __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); 1607
1608 /*
1609 * Should not happen. Per CPU timers should be
1610 * canceled _before_ the migration code is called
1611 */
1612 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
1613 __remove_hrtimer(timer, old_base,
1614 HRTIMER_STATE_INACTIVE, 0);
1615 WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
1616 timer, timer->function, dcpu);
1617 continue;
1618 }
1619
1620 /*
1621 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
1622 * timer could be seen as !active and just vanish away
1623 * under us on another CPU
1624 */
1625 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
1605 timer->base = new_base; 1626 timer->base = new_base;
1606 /* 1627 /*
1607 * Enqueue the timer. Allow reprogramming of the event device 1628 * Enqueue the timer. Allow reprogramming of the event device
1608 */ 1629 */
1609 enqueue_hrtimer(timer, new_base, 1); 1630 enqueue_hrtimer(timer, new_base, 1);
1631
1632#ifdef CONFIG_HIGH_RES_TIMERS
1633 /*
1634 * Happens with high res enabled when the timer was
1635 * already expired and the callback mode is
1636 * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
1637 * enqueue code does not move them to the soft irq
1638 * pending list for performance/latency reasons, but
1639 * in the migration state, we need to do that
1640 * otherwise we end up with a stale timer.
1641 */
1642 if (timer->state == HRTIMER_STATE_MIGRATE) {
1643 timer->state = HRTIMER_STATE_PENDING;
1644 list_add_tail(&timer->cb_entry,
1645 &new_base->cpu_base->cb_pending);
1646 raise = 1;
1647 }
1648#endif
1649 /* Clear the migration state bit */
1650 timer->state &= ~HRTIMER_STATE_MIGRATE;
1651 }
1652 return raise;
1653}
1654
1655#ifdef CONFIG_HIGH_RES_TIMERS
1656static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1657 struct hrtimer_cpu_base *new_base)
1658{
1659 struct hrtimer *timer;
1660 int raise = 0;
1661
1662 while (!list_empty(&old_base->cb_pending)) {
1663 timer = list_entry(old_base->cb_pending.next,
1664 struct hrtimer, cb_entry);
1665
1666 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
1667 timer->base = &new_base->clock_base[timer->base->index];
1668 list_add_tail(&timer->cb_entry, &new_base->cb_pending);
1669 raise = 1;
1610 } 1670 }
1671 return raise;
1672}
1673#else
1674static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1675 struct hrtimer_cpu_base *new_base)
1676{
1677 return 0;
1611} 1678}
1679#endif
1612 1680
1613static void migrate_hrtimers(int cpu) 1681static void migrate_hrtimers(int cpu)
1614{ 1682{
1615 struct hrtimer_cpu_base *old_base, *new_base; 1683 struct hrtimer_cpu_base *old_base, *new_base;
1616 int i; 1684 int i, raise = 0;
1617 1685
1618 BUG_ON(cpu_online(cpu)); 1686 BUG_ON(cpu_online(cpu));
1619 old_base = &per_cpu(hrtimer_bases, cpu); 1687 old_base = &per_cpu(hrtimer_bases, cpu);
@@ -1626,14 +1694,21 @@ static void migrate_hrtimers(int cpu)
1626 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1694 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1627 1695
1628 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1696 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1629 migrate_hrtimer_list(&old_base->clock_base[i], 1697 if (migrate_hrtimer_list(&old_base->clock_base[i],
1630 &new_base->clock_base[i]); 1698 &new_base->clock_base[i], cpu))
1699 raise = 1;
1631 } 1700 }
1632 1701
1702 if (migrate_hrtimer_pending(old_base, new_base))
1703 raise = 1;
1704
1633 spin_unlock(&old_base->lock); 1705 spin_unlock(&old_base->lock);
1634 spin_unlock(&new_base->lock); 1706 spin_unlock(&new_base->lock);
1635 local_irq_enable(); 1707 local_irq_enable();
1636 put_cpu_var(hrtimer_bases); 1708 put_cpu_var(hrtimer_bases);
1709
1710 if (raise)
1711 hrtimer_raise_softirq();
1637} 1712}
1638#endif /* CONFIG_HOTPLUG_CPU */ 1713#endif /* CONFIG_HOTPLUG_CPU */
1639 1714
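[editor's note] The migration path above now refuses to move HRTIMER_CB_IRQSAFE_PERCPU timers and only warns, so the owner of such a timer is expected to cancel it before the CPU goes away. A minimal sketch of that owner-side responsibility, using a hypothetical per-CPU timer and hotplug notifier (driver names are illustrative, not from this patch):

        #include <linux/cpu.h>
        #include <linux/hrtimer.h>
        #include <linux/init.h>
        #include <linux/notifier.h>
        #include <linux/percpu.h>

        static DEFINE_PER_CPU(struct hrtimer, my_timer);   /* hypothetical */

        static int my_cpu_callback(struct notifier_block *nb,
                                   unsigned long action, void *hcpu)
        {
                int cpu = (long)hcpu;

                switch (action) {
                case CPU_DOWN_PREPARE:
                case CPU_DOWN_PREPARE_FROZEN:
                        /* Cancel before the CPU is gone; sleeps until the
                         * callback (if running) has finished, so the WARN in
                         * migrate_hrtimer_list() never triggers for us. */
                        hrtimer_cancel(&per_cpu(my_timer, cpu));
                        break;
                }
                return NOTIFY_OK;
        }

        static struct notifier_block my_cpu_nb = {
                .notifier_call = my_cpu_callback,
        };

        static int __init my_init(void)
        {
                register_cpu_notifier(&my_cpu_nb);
                return 0;
        }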
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0314074fa232..60c49e324390 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -89,7 +89,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
89 set_balance_irq_affinity(irq, cpumask); 89 set_balance_irq_affinity(irq, cpumask);
90 90
91#ifdef CONFIG_GENERIC_PENDING_IRQ 91#ifdef CONFIG_GENERIC_PENDING_IRQ
92 set_pending_irq(irq, cpumask); 92 if (desc->status & IRQ_MOVE_PCNTXT) {
93 unsigned long flags;
94
95 spin_lock_irqsave(&desc->lock, flags);
96 desc->chip->set_affinity(irq, cpumask);
97 spin_unlock_irqrestore(&desc->lock, flags);
98 } else
99 set_pending_irq(irq, cpumask);
93#else 100#else
94 desc->affinity = cpumask; 101 desc->affinity = cpumask;
95 desc->chip->set_affinity(irq, cpumask); 102 desc->chip->set_affinity(irq, cpumask);
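[editor's note] With the IRQ_MOVE_PCNTXT branch above, an affinity change requested from process context can be applied immediately under desc->lock instead of being deferred to the next interrupt. The caller-side API is unchanged; a small hedged sketch (IRQ number and mask are made up):

        #include <linux/cpumask.h>
        #include <linux/interrupt.h>
        #include <linux/irq.h>

        static int pin_irq_to_cpu0(unsigned int irq)
        {
                /* Still deferred via set_pending_irq() unless the chip/arch
                 * sets IRQ_MOVE_PCNTXT, in which case it takes effect now. */
                return irq_set_affinity(irq, cpumask_of_cpu(0));
        }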
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 38fc10ac7541..5072cf1685a2 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -260,7 +260,6 @@ const char *kallsyms_lookup(unsigned long addr,
260 /* see if it's in a module */ 260 /* see if it's in a module */
261 return module_address_lookup(addr, symbolsize, offset, modname, 261 return module_address_lookup(addr, symbolsize, offset, modname,
262 namebuf); 262 namebuf);
263 return NULL;
264} 263}
265 264
266int lookup_symbol_name(unsigned long addr, char *symname) 265int lookup_symbol_name(unsigned long addr, char *symname)
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 59f3f0df35d4..777ac458ac99 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -753,8 +753,14 @@ static struct page *kimage_alloc_page(struct kimage *image,
753 *old = addr | (*old & ~PAGE_MASK); 753 *old = addr | (*old & ~PAGE_MASK);
754 754
755 /* The old page I have found cannot be a 755 /* The old page I have found cannot be a
756 * destination page, so return it. 756 * destination page, so return it if it's
757 * gfp_flags honor the ones passed in.
757 */ 758 */
759 if (!(gfp_mask & __GFP_HIGHMEM) &&
760 PageHighMem(old_page)) {
761 kimage_free_pages(old_page);
762 continue;
763 }
758 addr = old_addr; 764 addr = old_addr;
759 page = old_page; 765 page = old_page;
760 break; 766 break;
@@ -1365,6 +1371,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1365 VMCOREINFO_SYMBOL(node_online_map); 1371 VMCOREINFO_SYMBOL(node_online_map);
1366 VMCOREINFO_SYMBOL(swapper_pg_dir); 1372 VMCOREINFO_SYMBOL(swapper_pg_dir);
1367 VMCOREINFO_SYMBOL(_stext); 1373 VMCOREINFO_SYMBOL(_stext);
1374 VMCOREINFO_SYMBOL(vmlist);
1368 1375
1369#ifndef CONFIG_NEED_MULTIPLE_NODES 1376#ifndef CONFIG_NEED_MULTIPLE_NODES
1370 VMCOREINFO_SYMBOL(mem_map); 1377 VMCOREINFO_SYMBOL(mem_map);
@@ -1400,6 +1407,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1400 VMCOREINFO_OFFSET(free_area, free_list); 1407 VMCOREINFO_OFFSET(free_area, free_list);
1401 VMCOREINFO_OFFSET(list_head, next); 1408 VMCOREINFO_OFFSET(list_head, next);
1402 VMCOREINFO_OFFSET(list_head, prev); 1409 VMCOREINFO_OFFSET(list_head, prev);
1410 VMCOREINFO_OFFSET(vm_struct, addr);
1403 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); 1411 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1404 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); 1412 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1405 VMCOREINFO_NUMBER(NR_FREE_PAGES); 1413 VMCOREINFO_NUMBER(NR_FREE_PAGES);
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index eaa21fc9ad1d..e4dcfb2272a4 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -488,7 +488,7 @@ static int write_mem_msg(int binary)
488 if (err) 488 if (err)
489 return err; 489 return err;
490 if (CACHE_FLUSH_IS_SAFE) 490 if (CACHE_FLUSH_IS_SAFE)
491 flush_icache_range(addr, addr + length + 1); 491 flush_icache_range(addr, addr + length);
492 return 0; 492 return 0;
493 } 493 }
494 494
@@ -590,6 +590,7 @@ static void kgdb_wait(struct pt_regs *regs)
590 590
591 /* Signal the primary CPU that we are done: */ 591 /* Signal the primary CPU that we are done: */
592 atomic_set(&cpu_in_kgdb[cpu], 0); 592 atomic_set(&cpu_in_kgdb[cpu], 0);
593 touch_softlockup_watchdog();
593 clocksource_touch_watchdog(); 594 clocksource_touch_watchdog();
594 local_irq_restore(flags); 595 local_irq_restore(flags);
595} 596}
@@ -1432,6 +1433,7 @@ acquirelock:
1432 atomic_read(&kgdb_cpu_doing_single_step) != cpu) { 1433 atomic_read(&kgdb_cpu_doing_single_step) != cpu) {
1433 1434
1434 atomic_set(&kgdb_active, -1); 1435 atomic_set(&kgdb_active, -1);
1436 touch_softlockup_watchdog();
1435 clocksource_touch_watchdog(); 1437 clocksource_touch_watchdog();
1436 local_irq_restore(flags); 1438 local_irq_restore(flags);
1437 1439
@@ -1462,7 +1464,7 @@ acquirelock:
1462 * Get the passive CPU lock which will hold all the non-primary 1464 * Get the passive CPU lock which will hold all the non-primary
1463 * CPU in a spin state while the debugger is active 1465 * CPU in a spin state while the debugger is active
1464 */ 1466 */
1465 if (!kgdb_single_step || !kgdb_contthread) { 1467 if (!kgdb_single_step) {
1466 for (i = 0; i < NR_CPUS; i++) 1468 for (i = 0; i < NR_CPUS; i++)
1467 atomic_set(&passive_cpu_wait[i], 1); 1469 atomic_set(&passive_cpu_wait[i], 1);
1468 } 1470 }
@@ -1475,7 +1477,7 @@ acquirelock:
1475 1477
1476#ifdef CONFIG_SMP 1478#ifdef CONFIG_SMP
1477 /* Signal the other CPUs to enter kgdb_wait() */ 1479 /* Signal the other CPUs to enter kgdb_wait() */
1478 if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup) 1480 if ((!kgdb_single_step) && kgdb_do_roundup)
1479 kgdb_roundup_cpus(flags); 1481 kgdb_roundup_cpus(flags);
1480#endif 1482#endif
1481 1483
@@ -1494,7 +1496,7 @@ acquirelock:
1494 kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code); 1496 kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
1495 kgdb_deactivate_sw_breakpoints(); 1497 kgdb_deactivate_sw_breakpoints();
1496 kgdb_single_step = 0; 1498 kgdb_single_step = 0;
1497 kgdb_contthread = NULL; 1499 kgdb_contthread = current;
1498 exception_level = 0; 1500 exception_level = 0;
1499 1501
1500 /* Talk to debugger with gdbserial protocol */ 1502 /* Talk to debugger with gdbserial protocol */
@@ -1508,7 +1510,7 @@ acquirelock:
1508 kgdb_info[ks->cpu].task = NULL; 1510 kgdb_info[ks->cpu].task = NULL;
1509 atomic_set(&cpu_in_kgdb[ks->cpu], 0); 1511 atomic_set(&cpu_in_kgdb[ks->cpu], 0);
1510 1512
1511 if (!kgdb_single_step || !kgdb_contthread) { 1513 if (!kgdb_single_step) {
1512 for (i = NR_CPUS-1; i >= 0; i--) 1514 for (i = NR_CPUS-1; i >= 0; i--)
1513 atomic_set(&passive_cpu_wait[i], 0); 1515 atomic_set(&passive_cpu_wait[i], 0);
1514 /* 1516 /*
@@ -1524,6 +1526,7 @@ acquirelock:
1524kgdb_restore: 1526kgdb_restore:
1525 /* Free kgdb_active */ 1527 /* Free kgdb_active */
1526 atomic_set(&kgdb_active, -1); 1528 atomic_set(&kgdb_active, -1);
1529 touch_softlockup_watchdog();
1527 clocksource_touch_watchdog(); 1530 clocksource_touch_watchdog();
1528 local_irq_restore(flags); 1531 local_irq_restore(flags);
1529 1532
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 2456d1a0befb..3d3c3ea3a023 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -113,7 +113,7 @@ int request_module(const char *fmt, ...)
113 return ret; 113 return ret;
114} 114}
115EXPORT_SYMBOL(request_module); 115EXPORT_SYMBOL(request_module);
116#endif /* CONFIG_KMOD */ 116#endif /* CONFIG_MODULES */
117 117
118struct subprocess_info { 118struct subprocess_info {
119 struct work_struct work; 119 struct work_struct work;
@@ -265,7 +265,7 @@ static void __call_usermodehelper(struct work_struct *work)
265 } 265 }
266} 266}
267 267
268#ifdef CONFIG_PM 268#ifdef CONFIG_PM_SLEEP
269/* 269/*
270 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY 270 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
271 * (used for preventing user land processes from being created after the user 271 * (used for preventing user land processes from being created after the user
@@ -288,39 +288,37 @@ static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
288 */ 288 */
289#define RUNNING_HELPERS_TIMEOUT (5 * HZ) 289#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
290 290
291static int usermodehelper_pm_callback(struct notifier_block *nfb, 291/**
292 unsigned long action, 292 * usermodehelper_disable - prevent new helpers from being started
293 void *ignored) 293 */
294int usermodehelper_disable(void)
294{ 295{
295 long retval; 296 long retval;
296 297
297 switch (action) { 298 usermodehelper_disabled = 1;
298 case PM_HIBERNATION_PREPARE: 299 smp_mb();
299 case PM_SUSPEND_PREPARE: 300 /*
300 usermodehelper_disabled = 1; 301 * From now on call_usermodehelper_exec() won't start any new
301 smp_mb(); 302 * helpers, so it is sufficient if running_helpers turns out to
302 /* 303 * be zero at one point (it may be increased later, but that
303 * From now on call_usermodehelper_exec() won't start any new 304 * doesn't matter).
304 * helpers, so it is sufficient if running_helpers turns out to 305 */
305 * be zero at one point (it may be increased later, but that 306 retval = wait_event_timeout(running_helpers_waitq,
306 * doesn't matter).
307 */
308 retval = wait_event_timeout(running_helpers_waitq,
309 atomic_read(&running_helpers) == 0, 307 atomic_read(&running_helpers) == 0,
310 RUNNING_HELPERS_TIMEOUT); 308 RUNNING_HELPERS_TIMEOUT);
311 if (retval) { 309 if (retval)
312 return NOTIFY_OK; 310 return 0;
313 } else {
314 usermodehelper_disabled = 0;
315 return NOTIFY_BAD;
316 }
317 case PM_POST_HIBERNATION:
318 case PM_POST_SUSPEND:
319 usermodehelper_disabled = 0;
320 return NOTIFY_OK;
321 }
322 311
323 return NOTIFY_DONE; 312 usermodehelper_disabled = 0;
313 return -EAGAIN;
314}
315
316/**
317 * usermodehelper_enable - allow new helpers to be started again
318 */
319void usermodehelper_enable(void)
320{
321 usermodehelper_disabled = 0;
324} 322}
325 323
326static void helper_lock(void) 324static void helper_lock(void)
@@ -334,18 +332,12 @@ static void helper_unlock(void)
334 if (atomic_dec_and_test(&running_helpers)) 332 if (atomic_dec_and_test(&running_helpers))
335 wake_up(&running_helpers_waitq); 333 wake_up(&running_helpers_waitq);
336} 334}
337 335#else /* CONFIG_PM_SLEEP */
338static void register_pm_notifier_callback(void)
339{
340 pm_notifier(usermodehelper_pm_callback, 0);
341}
342#else /* CONFIG_PM */
343#define usermodehelper_disabled 0 336#define usermodehelper_disabled 0
344 337
345static inline void helper_lock(void) {} 338static inline void helper_lock(void) {}
346static inline void helper_unlock(void) {} 339static inline void helper_unlock(void) {}
347static inline void register_pm_notifier_callback(void) {} 340#endif /* CONFIG_PM_SLEEP */
348#endif /* CONFIG_PM */
349 341
350/** 342/**
351 * call_usermodehelper_setup - prepare to call a usermode helper 343 * call_usermodehelper_setup - prepare to call a usermode helper
@@ -515,5 +507,4 @@ void __init usermodehelper_init(void)
515{ 507{
516 khelper_wq = create_singlethread_workqueue("khelper"); 508 khelper_wq = create_singlethread_workqueue("khelper");
517 BUG_ON(!khelper_wq); 509 BUG_ON(!khelper_wq);
518 register_pm_notifier_callback();
519} 510}
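[editor's note] The PM-notifier plumbing removed above is replaced by an explicit usermodehelper_disable()/usermodehelper_enable() pair, which the power-management code later in this diff calls directly. A sketch of the intended calling pattern (the surrounding function is hypothetical):

        #include <linux/kmod.h>

        static int enter_quiet_period(void)
        {
                int error;

                error = usermodehelper_disable();  /* no new helpers from here on */
                if (error)
                        return error;              /* -EAGAIN: helpers still running */

                /* ... freeze tasks, create the snapshot, etc. ... */

                usermodehelper_enable();           /* allow helpers again */
                return 0;
        }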
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 75bc2cd9ebc6..8b57a2597f21 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -404,7 +404,7 @@ void kretprobe_hash_lock(struct task_struct *tsk,
404 spin_lock_irqsave(hlist_lock, *flags); 404 spin_lock_irqsave(hlist_lock, *flags);
405} 405}
406 406
407void kretprobe_table_lock(unsigned long hash, unsigned long *flags) 407static void kretprobe_table_lock(unsigned long hash, unsigned long *flags)
408{ 408{
409 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 409 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
410 spin_lock_irqsave(hlist_lock, *flags); 410 spin_lock_irqsave(hlist_lock, *flags);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index e53bc30e9ba5..08dd8ed86c77 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -14,6 +14,7 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/kexec.h> 16#include <linux/kexec.h>
17#include <linux/profile.h>
17#include <linux/sched.h> 18#include <linux/sched.h>
18 19
19#define KERNEL_ATTR_RO(_name) \ 20#define KERNEL_ATTR_RO(_name) \
@@ -53,6 +54,37 @@ static ssize_t uevent_helper_store(struct kobject *kobj,
53KERNEL_ATTR_RW(uevent_helper); 54KERNEL_ATTR_RW(uevent_helper);
54#endif 55#endif
55 56
57#ifdef CONFIG_PROFILING
58static ssize_t profiling_show(struct kobject *kobj,
59 struct kobj_attribute *attr, char *buf)
60{
61 return sprintf(buf, "%d\n", prof_on);
62}
63static ssize_t profiling_store(struct kobject *kobj,
64 struct kobj_attribute *attr,
65 const char *buf, size_t count)
66{
67 int ret;
68
69 if (prof_on)
70 return -EEXIST;
71 /*
72 * This eventually calls into get_option() which
73 * has a ton of callers and is not const. It is
74 * easiest to cast it away here.
75 */
76 profile_setup((char *)buf);
77 ret = profile_init();
78 if (ret)
79 return ret;
80 ret = create_proc_profile();
81 if (ret)
82 return ret;
83 return count;
84}
85KERNEL_ATTR_RW(profiling);
86#endif
87
56#ifdef CONFIG_KEXEC 88#ifdef CONFIG_KEXEC
57static ssize_t kexec_loaded_show(struct kobject *kobj, 89static ssize_t kexec_loaded_show(struct kobject *kobj,
58 struct kobj_attribute *attr, char *buf) 90 struct kobj_attribute *attr, char *buf)
@@ -109,6 +141,9 @@ static struct attribute * kernel_attrs[] = {
109 &uevent_seqnum_attr.attr, 141 &uevent_seqnum_attr.attr,
110 &uevent_helper_attr.attr, 142 &uevent_helper_attr.attr,
111#endif 143#endif
144#ifdef CONFIG_PROFILING
145 &profiling_attr.attr,
146#endif
112#ifdef CONFIG_KEXEC 147#ifdef CONFIG_KEXEC
113 &kexec_loaded_attr.attr, 148 &kexec_loaded_attr.attr,
114 &kexec_crash_loaded_attr.attr, 149 &kexec_crash_loaded_attr.attr,
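[editor's note] profiling_store() above accepts the same string syntax as the profile= boot option and then sets up the buffers and /proc/profile on the fly. A userspace sketch of enabling it at runtime (error handling kept minimal; writing again once profiling is on fails with EEXIST, matching the code above):

        #include <fcntl.h>
        #include <stdio.h>
        #include <string.h>
        #include <unistd.h>

        int main(void)
        {
                const char *arg = "schedule";   /* same syntax as profile= */
                int fd = open("/sys/kernel/profiling", O_WRONLY);

                if (fd < 0) {
                        perror("open");
                        return 1;
                }
                if (write(fd, arg, strlen(arg)) < 0)
                        perror("write");        /* EEXIST if already enabled */
                close(fd);
                return 0;
        }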
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 96cff2f8710b..14ec64fe175a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -171,12 +171,11 @@ EXPORT_SYMBOL(kthread_create);
171 */ 171 */
172void kthread_bind(struct task_struct *k, unsigned int cpu) 172void kthread_bind(struct task_struct *k, unsigned int cpu)
173{ 173{
174 if (k->state != TASK_UNINTERRUPTIBLE) { 174 /* Must have done schedule() in kthread() before we set_task_cpu */
175 if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) {
175 WARN_ON(1); 176 WARN_ON(1);
176 return; 177 return;
177 } 178 }
178 /* Must have done schedule() in kthread() before we set_task_cpu */
179 wait_task_inactive(k, 0);
180 set_task_cpu(k, cpu); 179 set_task_cpu(k, cpu);
181 k->cpus_allowed = cpumask_of_cpu(cpu); 180 k->cpus_allowed = cpumask_of_cpu(cpu);
182 k->rt.nr_cpus_allowed = 1; 181 k->rt.nr_cpus_allowed = 1;
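[editor's note] kthread_bind() above now folds the state check into wait_task_inactive(k, TASK_UNINTERRUPTIBLE), which only succeeds once the new thread has parked itself inside kthread(). The usual creator-side sequence it protects, sketched with a hypothetical worker function:

        #include <linux/err.h>
        #include <linux/kthread.h>
        #include <linux/sched.h>

        static struct task_struct *start_worker_on(int cpu, int (*fn)(void *))
        {
                struct task_struct *t = kthread_create(fn, NULL, "worker/%d", cpu);

                if (!IS_ERR(t)) {
                        kthread_bind(t, cpu);   /* relies on wait_task_inactive() */
                        wake_up_process(t);
                }
                return t;
        }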
diff --git a/kernel/module.c b/kernel/module.c
index 9db11911e04b..25bc9ac9e226 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -100,7 +100,7 @@ static inline int strong_try_module_get(struct module *mod)
100static inline void add_taint_module(struct module *mod, unsigned flag) 100static inline void add_taint_module(struct module *mod, unsigned flag)
101{ 101{
102 add_taint(flag); 102 add_taint(flag);
103 mod->taints |= flag; 103 mod->taints |= (1U << flag);
104} 104}
105 105
106/* 106/*
@@ -784,6 +784,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
784 mutex_lock(&module_mutex); 784 mutex_lock(&module_mutex);
785 /* Store the name of the last unloaded module for diagnostic purposes */ 785 /* Store the name of the last unloaded module for diagnostic purposes */
786 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); 786 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
787 unregister_dynamic_debug_module(mod->name);
787 free_module(mod); 788 free_module(mod);
788 789
789 out: 790 out:
@@ -923,7 +924,7 @@ static const char vermagic[] = VERMAGIC_STRING;
923static int try_to_force_load(struct module *mod, const char *symname) 924static int try_to_force_load(struct module *mod, const char *symname)
924{ 925{
925#ifdef CONFIG_MODULE_FORCE_LOAD 926#ifdef CONFIG_MODULE_FORCE_LOAD
926 if (!(tainted & TAINT_FORCED_MODULE)) 927 if (!test_taint(TAINT_FORCED_MODULE))
927 printk("%s: no version for \"%s\" found: kernel tainted.\n", 928 printk("%s: no version for \"%s\" found: kernel tainted.\n",
928 mod->name, symname); 929 mod->name, symname);
929 add_taint_module(mod, TAINT_FORCED_MODULE); 930 add_taint_module(mod, TAINT_FORCED_MODULE);
@@ -1033,7 +1034,7 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
1033 const unsigned long *crc; 1034 const unsigned long *crc;
1034 1035
1035 ret = find_symbol(name, &owner, &crc, 1036 ret = find_symbol(name, &owner, &crc,
1036 !(mod->taints & TAINT_PROPRIETARY_MODULE), true); 1037 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
1037 if (!IS_ERR_VALUE(ret)) { 1038 if (!IS_ERR_VALUE(ret)) {
1038 /* use_module can fail due to OOM, 1039 /* use_module can fail due to OOM,
1039 or module initialization or unloading */ 1040 or module initialization or unloading */
@@ -1173,7 +1174,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
1173 while (i-- > 0) 1174 while (i-- > 0)
1174 sysfs_remove_bin_file(notes_attrs->dir, 1175 sysfs_remove_bin_file(notes_attrs->dir,
1175 &notes_attrs->attrs[i]); 1176 &notes_attrs->attrs[i]);
1176 kobject_del(notes_attrs->dir); 1177 kobject_put(notes_attrs->dir);
1177 } 1178 }
1178 kfree(notes_attrs); 1179 kfree(notes_attrs);
1179} 1180}
@@ -1634,7 +1635,7 @@ static void set_license(struct module *mod, const char *license)
1634 license = "unspecified"; 1635 license = "unspecified";
1635 1636
1636 if (!license_is_gpl_compatible(license)) { 1637 if (!license_is_gpl_compatible(license)) {
1637 if (!(tainted & TAINT_PROPRIETARY_MODULE)) 1638 if (!test_taint(TAINT_PROPRIETARY_MODULE))
1638 printk(KERN_WARNING "%s: module license '%s' taints " 1639 printk(KERN_WARNING "%s: module license '%s' taints "
1639 "kernel.\n", mod->name, license); 1640 "kernel.\n", mod->name, license);
1640 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 1641 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
@@ -1783,6 +1784,33 @@ static inline void add_kallsyms(struct module *mod,
1783} 1784}
1784#endif /* CONFIG_KALLSYMS */ 1785#endif /* CONFIG_KALLSYMS */
1785 1786
1787#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG
1788static void dynamic_printk_setup(Elf_Shdr *sechdrs, unsigned int verboseindex)
1789{
1790 struct mod_debug *debug_info;
1791 unsigned long pos, end;
1792 unsigned int num_verbose;
1793
1794 pos = sechdrs[verboseindex].sh_addr;
1795 num_verbose = sechdrs[verboseindex].sh_size /
1796 sizeof(struct mod_debug);
1797 end = pos + (num_verbose * sizeof(struct mod_debug));
1798
1799 for (; pos < end; pos += sizeof(struct mod_debug)) {
1800 debug_info = (struct mod_debug *)pos;
1801 register_dynamic_debug_module(debug_info->modname,
1802 debug_info->type, debug_info->logical_modname,
1803 debug_info->flag_names, debug_info->hash,
1804 debug_info->hash2);
1805 }
1806}
1807#else
1808static inline void dynamic_printk_setup(Elf_Shdr *sechdrs,
1809 unsigned int verboseindex)
1810{
1811}
1812#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */
1813
1786static void *module_alloc_update_bounds(unsigned long size) 1814static void *module_alloc_update_bounds(unsigned long size)
1787{ 1815{
1788 void *ret = module_alloc(size); 1816 void *ret = module_alloc(size);
@@ -1806,6 +1834,7 @@ static noinline struct module *load_module(void __user *umod,
1806 Elf_Ehdr *hdr; 1834 Elf_Ehdr *hdr;
1807 Elf_Shdr *sechdrs; 1835 Elf_Shdr *sechdrs;
1808 char *secstrings, *args, *modmagic, *strtab = NULL; 1836 char *secstrings, *args, *modmagic, *strtab = NULL;
1837 char *staging;
1809 unsigned int i; 1838 unsigned int i;
1810 unsigned int symindex = 0; 1839 unsigned int symindex = 0;
1811 unsigned int strindex = 0; 1840 unsigned int strindex = 0;
@@ -1831,6 +1860,7 @@ static noinline struct module *load_module(void __user *umod,
1831#endif 1860#endif
1832 unsigned int markersindex; 1861 unsigned int markersindex;
1833 unsigned int markersstringsindex; 1862 unsigned int markersstringsindex;
1863 unsigned int verboseindex;
1834 struct module *mod; 1864 struct module *mod;
1835 long err = 0; 1865 long err = 0;
1836 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1866 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1960,6 +1990,14 @@ static noinline struct module *load_module(void __user *umod,
1960 goto free_hdr; 1990 goto free_hdr;
1961 } 1991 }
1962 1992
1993 staging = get_modinfo(sechdrs, infoindex, "staging");
1994 if (staging) {
1995 add_taint_module(mod, TAINT_CRAP);
1996 printk(KERN_WARNING "%s: module is from the staging directory,"
1997 " the quality is unknown, you have been warned.\n",
1998 mod->name);
1999 }
2000
1963 /* Now copy in args */ 2001 /* Now copy in args */
1964 args = strndup_user(uargs, ~0UL >> 1); 2002 args = strndup_user(uargs, ~0UL >> 1);
1965 if (IS_ERR(args)) { 2003 if (IS_ERR(args)) {
@@ -2117,6 +2155,7 @@ static noinline struct module *load_module(void __user *umod,
2117 markersindex = find_sec(hdr, sechdrs, secstrings, "__markers"); 2155 markersindex = find_sec(hdr, sechdrs, secstrings, "__markers");
2118 markersstringsindex = find_sec(hdr, sechdrs, secstrings, 2156 markersstringsindex = find_sec(hdr, sechdrs, secstrings,
2119 "__markers_strings"); 2157 "__markers_strings");
2158 verboseindex = find_sec(hdr, sechdrs, secstrings, "__verbose");
2120 2159
2121 /* Now do relocations. */ 2160 /* Now do relocations. */
2122 for (i = 1; i < hdr->e_shnum; i++) { 2161 for (i = 1; i < hdr->e_shnum; i++) {
@@ -2167,6 +2206,7 @@ static noinline struct module *load_module(void __user *umod,
2167 marker_update_probe_range(mod->markers, 2206 marker_update_probe_range(mod->markers,
2168 mod->markers + mod->num_markers); 2207 mod->markers + mod->num_markers);
2169#endif 2208#endif
2209 dynamic_printk_setup(sechdrs, verboseindex);
2170 err = module_finalize(hdr, sechdrs, mod); 2210 err = module_finalize(hdr, sechdrs, mod);
2171 if (err < 0) 2211 if (err < 0)
2172 goto cleanup; 2212 goto cleanup;
@@ -2552,10 +2592,12 @@ static char *module_flags(struct module *mod, char *buf)
2552 mod->state == MODULE_STATE_GOING || 2592 mod->state == MODULE_STATE_GOING ||
2553 mod->state == MODULE_STATE_COMING) { 2593 mod->state == MODULE_STATE_COMING) {
2554 buf[bx++] = '('; 2594 buf[bx++] = '(';
2555 if (mod->taints & TAINT_PROPRIETARY_MODULE) 2595 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
2556 buf[bx++] = 'P'; 2596 buf[bx++] = 'P';
2557 if (mod->taints & TAINT_FORCED_MODULE) 2597 if (mod->taints & (1 << TAINT_FORCED_MODULE))
2558 buf[bx++] = 'F'; 2598 buf[bx++] = 'F';
2599 if (mod->taints & (1 << TAINT_CRAP))
2600 buf[bx++] = 'C';
2559 /* 2601 /*
2560 * TAINT_FORCED_RMMOD: could be added. 2602 * TAINT_FORCED_RMMOD: could be added.
2561 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't 2603 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
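[editor's note] mod->taints now stores taint numbers as bit positions (1 << TAINT_*) rather than pre-shifted mask values, matching the new bitmap-based tainted_mask in kernel/panic.c below. A short sketch of how a reader of that field is expected to test it (helper names are illustrative):

        #include <linux/kernel.h>
        #include <linux/module.h>

        static int module_taints_proprietary(const struct module *mod)
        {
                return mod->taints & (1 << TAINT_PROPRIETARY_MODULE);
        }

        static int module_is_staging(const struct module *mod)
        {
                /* TAINT_CRAP is the new flag set for drivers/staging modules. */
                return mod->taints & (1 << TAINT_CRAP);
        }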
diff --git a/kernel/panic.c b/kernel/panic.c
index 12c5a0a6c89b..bda561ef3cdf 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -23,7 +23,7 @@
23#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
24 24
25int panic_on_oops; 25int panic_on_oops;
26int tainted; 26static unsigned long tainted_mask;
27static int pause_on_oops; 27static int pause_on_oops;
28static int pause_on_oops_flag; 28static int pause_on_oops_flag;
29static DEFINE_SPINLOCK(pause_on_oops_lock); 29static DEFINE_SPINLOCK(pause_on_oops_lock);
@@ -143,6 +143,27 @@ NORET_TYPE void panic(const char * fmt, ...)
143 143
144EXPORT_SYMBOL(panic); 144EXPORT_SYMBOL(panic);
145 145
146
147struct tnt {
148 u8 bit;
149 char true;
150 char false;
151};
152
153static const struct tnt tnts[] = {
154 { TAINT_PROPRIETARY_MODULE, 'P', 'G' },
155 { TAINT_FORCED_MODULE, 'F', ' ' },
156 { TAINT_UNSAFE_SMP, 'S', ' ' },
157 { TAINT_FORCED_RMMOD, 'R', ' ' },
158 { TAINT_MACHINE_CHECK, 'M', ' ' },
159 { TAINT_BAD_PAGE, 'B', ' ' },
160 { TAINT_USER, 'U', ' ' },
161 { TAINT_DIE, 'D', ' ' },
162 { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
163 { TAINT_WARN, 'W', ' ' },
164 { TAINT_CRAP, 'C', ' ' },
165};
166
146/** 167/**
147 * print_tainted - return a string to represent the kernel taint state. 168 * print_tainted - return a string to represent the kernel taint state.
148 * 169 *
@@ -155,35 +176,45 @@ EXPORT_SYMBOL(panic);
155 * 'U' - Userspace-defined naughtiness. 176 * 'U' - Userspace-defined naughtiness.
156 * 'A' - ACPI table overridden. 177 * 'A' - ACPI table overridden.
157 * 'W' - Taint on warning. 178 * 'W' - Taint on warning.
179 * 'C' - modules from drivers/staging are loaded.
158 * 180 *
159 * The string is overwritten by the next call to print_taint(). 181 * The string is overwritten by the next call to print_taint().
160 */ 182 */
161
162const char *print_tainted(void) 183const char *print_tainted(void)
163{ 184{
164 static char buf[20]; 185 static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1];
165 if (tainted) { 186
166 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c", 187 if (tainted_mask) {
167 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', 188 char *s;
168 tainted & TAINT_FORCED_MODULE ? 'F' : ' ', 189 int i;
169 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', 190
170 tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', 191 s = buf + sprintf(buf, "Tainted: ");
171 tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', 192 for (i = 0; i < ARRAY_SIZE(tnts); i++) {
172 tainted & TAINT_BAD_PAGE ? 'B' : ' ', 193 const struct tnt *t = &tnts[i];
173 tainted & TAINT_USER ? 'U' : ' ', 194 *s++ = test_bit(t->bit, &tainted_mask) ?
174 tainted & TAINT_DIE ? 'D' : ' ', 195 t->true : t->false;
175 tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ', 196 }
176 tainted & TAINT_WARN ? 'W' : ' '); 197 *s = 0;
177 } 198 } else
178 else
179 snprintf(buf, sizeof(buf), "Not tainted"); 199 snprintf(buf, sizeof(buf), "Not tainted");
180 return(buf); 200 return(buf);
181} 201}
182 202
203int test_taint(unsigned flag)
204{
205 return test_bit(flag, &tainted_mask);
206}
207EXPORT_SYMBOL(test_taint);
208
209unsigned long get_taint(void)
210{
211 return tainted_mask;
212}
213
183void add_taint(unsigned flag) 214void add_taint(unsigned flag)
184{ 215{
185 debug_locks = 0; /* can't trust the integrity of the kernel anymore */ 216 debug_locks = 0; /* can't trust the integrity of the kernel anymore */
186 tainted |= flag; 217 set_bit(flag, &tainted_mask);
187} 218}
188EXPORT_SYMBOL(add_taint); 219EXPORT_SYMBOL(add_taint);
189 220
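[editor's note] With tainted_mask now a bitmap, callers are expected to go through add_taint()/test_taint() rather than reading or ORing the old integer flags. A minimal sketch:

        #include <linux/kernel.h>

        static void note_userspace_damage(void)
        {
                if (!test_taint(TAINT_USER))
                        add_taint(TAINT_USER);  /* set_bit() on tainted_mask */
        }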
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index e36d5798cbff..5131e5471169 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -441,7 +441,7 @@ static struct k_itimer * alloc_posix_timer(void)
441 return tmr; 441 return tmr;
442 if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { 442 if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
443 kmem_cache_free(posix_timers_cache, tmr); 443 kmem_cache_free(posix_timers_cache, tmr);
444 tmr = NULL; 444 return NULL;
445 } 445 }
446 memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); 446 memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
447 return tmr; 447 return tmr;
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index bbd85c60f741..331f9836383f 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -14,6 +14,7 @@
14#include <linux/reboot.h> 14#include <linux/reboot.h>
15#include <linux/string.h> 15#include <linux/string.h>
16#include <linux/device.h> 16#include <linux/device.h>
17#include <linux/kmod.h>
17#include <linux/delay.h> 18#include <linux/delay.h>
18#include <linux/fs.h> 19#include <linux/fs.h>
19#include <linux/mount.h> 20#include <linux/mount.h>
@@ -520,6 +521,10 @@ int hibernate(void)
520 if (error) 521 if (error)
521 goto Exit; 522 goto Exit;
522 523
524 error = usermodehelper_disable();
525 if (error)
526 goto Exit;
527
523 /* Allocate memory management structures */ 528 /* Allocate memory management structures */
524 error = create_basic_memory_bitmaps(); 529 error = create_basic_memory_bitmaps();
525 if (error) 530 if (error)
@@ -558,6 +563,7 @@ int hibernate(void)
558 thaw_processes(); 563 thaw_processes();
559 Finish: 564 Finish:
560 free_basic_memory_bitmaps(); 565 free_basic_memory_bitmaps();
566 usermodehelper_enable();
561 Exit: 567 Exit:
562 pm_notifier_call_chain(PM_POST_HIBERNATION); 568 pm_notifier_call_chain(PM_POST_HIBERNATION);
563 pm_restore_console(); 569 pm_restore_console();
@@ -634,6 +640,10 @@ static int software_resume(void)
634 if (error) 640 if (error)
635 goto Finish; 641 goto Finish;
636 642
643 error = usermodehelper_disable();
644 if (error)
645 goto Finish;
646
637 error = create_basic_memory_bitmaps(); 647 error = create_basic_memory_bitmaps();
638 if (error) 648 if (error)
639 goto Finish; 649 goto Finish;
@@ -656,6 +666,7 @@ static int software_resume(void)
656 thaw_processes(); 666 thaw_processes();
657 Done: 667 Done:
658 free_basic_memory_bitmaps(); 668 free_basic_memory_bitmaps();
669 usermodehelper_enable();
659 Finish: 670 Finish:
660 pm_notifier_call_chain(PM_POST_RESTORE); 671 pm_notifier_call_chain(PM_POST_RESTORE);
661 pm_restore_console(); 672 pm_restore_console();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 540b16b68565..19122cf6d827 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -14,6 +14,7 @@
14#include <linux/string.h> 14#include <linux/string.h>
15#include <linux/delay.h> 15#include <linux/delay.h>
16#include <linux/errno.h> 16#include <linux/errno.h>
17#include <linux/kmod.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/console.h> 19#include <linux/console.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
@@ -237,6 +238,10 @@ static int suspend_prepare(void)
237 if (error) 238 if (error)
238 goto Finish; 239 goto Finish;
239 240
241 error = usermodehelper_disable();
242 if (error)
243 goto Finish;
244
240 if (suspend_freeze_processes()) { 245 if (suspend_freeze_processes()) {
241 error = -EAGAIN; 246 error = -EAGAIN;
242 goto Thaw; 247 goto Thaw;
@@ -256,6 +261,7 @@ static int suspend_prepare(void)
256 261
257 Thaw: 262 Thaw:
258 suspend_thaw_processes(); 263 suspend_thaw_processes();
264 usermodehelper_enable();
259 Finish: 265 Finish:
260 pm_notifier_call_chain(PM_POST_SUSPEND); 266 pm_notifier_call_chain(PM_POST_SUSPEND);
261 pm_restore_console(); 267 pm_restore_console();
@@ -376,6 +382,7 @@ int suspend_devices_and_enter(suspend_state_t state)
376static void suspend_finish(void) 382static void suspend_finish(void)
377{ 383{
378 suspend_thaw_processes(); 384 suspend_thaw_processes();
385 usermodehelper_enable();
379 pm_notifier_call_chain(PM_POST_SUSPEND); 386 pm_notifier_call_chain(PM_POST_SUSPEND);
380 pm_restore_console(); 387 pm_restore_console();
381} 388}
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 278946aecaf0..ca634019497a 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -28,121 +28,6 @@ static inline int freezeable(struct task_struct * p)
28 return 1; 28 return 1;
29} 29}
30 30
31/*
32 * freezing is complete, mark current process as frozen
33 */
34static inline void frozen_process(void)
35{
36 if (!unlikely(current->flags & PF_NOFREEZE)) {
37 current->flags |= PF_FROZEN;
38 wmb();
39 }
40 clear_freeze_flag(current);
41}
42
43/* Refrigerator is place where frozen processes are stored :-). */
44void refrigerator(void)
45{
46 /* Hmm, should we be allowed to suspend when there are realtime
47 processes around? */
48 long save;
49
50 task_lock(current);
51 if (freezing(current)) {
52 frozen_process();
53 task_unlock(current);
54 } else {
55 task_unlock(current);
56 return;
57 }
58 save = current->state;
59 pr_debug("%s entered refrigerator\n", current->comm);
60
61 spin_lock_irq(&current->sighand->siglock);
62 recalc_sigpending(); /* We sent fake signal, clean it up */
63 spin_unlock_irq(&current->sighand->siglock);
64
65 for (;;) {
66 set_current_state(TASK_UNINTERRUPTIBLE);
67 if (!frozen(current))
68 break;
69 schedule();
70 }
71 pr_debug("%s left refrigerator\n", current->comm);
72 __set_current_state(save);
73}
74
75static void fake_signal_wake_up(struct task_struct *p)
76{
77 unsigned long flags;
78
79 spin_lock_irqsave(&p->sighand->siglock, flags);
80 signal_wake_up(p, 0);
81 spin_unlock_irqrestore(&p->sighand->siglock, flags);
82}
83
84static inline bool should_send_signal(struct task_struct *p)
85{
86 return !(p->flags & PF_FREEZER_NOSIG);
87}
88
89/**
90 * freeze_task - send a freeze request to given task
91 * @p: task to send the request to
92 * @sig_only: if set, the request will only be sent if the task has the
93 * PF_FREEZER_NOSIG flag unset
94 * Return value: 'false', if @sig_only is set and the task has
95 * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
96 *
97 * The freeze request is sent by setting the tasks's TIF_FREEZE flag and
98 * either sending a fake signal to it or waking it up, depending on whether
99 * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
100 * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
101 * TIF_FREEZE flag will not be set.
102 */
103static bool freeze_task(struct task_struct *p, bool sig_only)
104{
105 /*
106 * We first check if the task is freezing and next if it has already
107 * been frozen to avoid the race with frozen_process() which first marks
108 * the task as frozen and next clears its TIF_FREEZE.
109 */
110 if (!freezing(p)) {
111 rmb();
112 if (frozen(p))
113 return false;
114
115 if (!sig_only || should_send_signal(p))
116 set_freeze_flag(p);
117 else
118 return false;
119 }
120
121 if (should_send_signal(p)) {
122 if (!signal_pending(p))
123 fake_signal_wake_up(p);
124 } else if (sig_only) {
125 return false;
126 } else {
127 wake_up_state(p, TASK_INTERRUPTIBLE);
128 }
129
130 return true;
131}
132
133static void cancel_freezing(struct task_struct *p)
134{
135 unsigned long flags;
136
137 if (freezing(p)) {
138 pr_debug(" clean up: %s\n", p->comm);
139 clear_freeze_flag(p);
140 spin_lock_irqsave(&p->sighand->siglock, flags);
141 recalc_sigpending_and_wake(p);
142 spin_unlock_irqrestore(&p->sighand->siglock, flags);
143 }
144}
145
146static int try_to_freeze_tasks(bool sig_only) 31static int try_to_freeze_tasks(bool sig_only)
147{ 32{
148 struct task_struct *g, *p; 33 struct task_struct *g, *p;
@@ -250,6 +135,9 @@ static void thaw_tasks(bool nosig_only)
250 if (nosig_only && should_send_signal(p)) 135 if (nosig_only && should_send_signal(p))
251 continue; 136 continue;
252 137
138 if (cgroup_frozen(p))
139 continue;
140
253 thaw_process(p); 141 thaw_process(p);
254 } while_each_thread(g, p); 142 } while_each_thread(g, p);
255 read_unlock(&tasklist_lock); 143 read_unlock(&tasklist_lock);
@@ -264,4 +152,3 @@ void thaw_processes(void)
264 printk("done.\n"); 152 printk("done.\n");
265} 153}
266 154
267EXPORT_SYMBOL(refrigerator);
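[editor's note] refrigerator(), freeze_task() and the related helpers deleted above now live in kernel/freezer.c (built when CONFIG_FREEZER is set, per the new Kconfig.freezer and Makefile entries), so suspend/hibernate and the cgroup freezer can share them. Callers are untouched; the usual freezable-kthread pattern, sketched with a hypothetical thread function:

        #include <linux/freezer.h>
        #include <linux/kthread.h>
        #include <linux/sched.h>

        static int my_thread(void *unused)
        {
                set_freezable();                /* clear PF_NOFREEZE */
                while (!kthread_should_stop()) {
                        try_to_freeze();        /* enters refrigerator() if asked */
                        /* ... do some work, then sleep ... */
                        schedule_timeout_interruptible(HZ);
                }
                return 0;
        }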
diff --git a/kernel/power/user.c b/kernel/power/user.c
index a6332a313262..005b93d839ba 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -212,13 +212,20 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
212 case SNAPSHOT_FREEZE: 212 case SNAPSHOT_FREEZE:
213 if (data->frozen) 213 if (data->frozen)
214 break; 214 break;
215
215 printk("Syncing filesystems ... "); 216 printk("Syncing filesystems ... ");
216 sys_sync(); 217 sys_sync();
217 printk("done.\n"); 218 printk("done.\n");
218 219
219 error = freeze_processes(); 220 error = usermodehelper_disable();
220 if (error) 221 if (error)
222 break;
223
224 error = freeze_processes();
225 if (error) {
221 thaw_processes(); 226 thaw_processes();
227 usermodehelper_enable();
228 }
222 if (!error) 229 if (!error)
223 data->frozen = 1; 230 data->frozen = 1;
224 break; 231 break;
@@ -227,6 +234,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
227 if (!data->frozen || data->ready) 234 if (!data->frozen || data->ready)
228 break; 235 break;
229 thaw_processes(); 236 thaw_processes();
237 usermodehelper_enable();
230 data->frozen = 0; 238 data->frozen = 0;
231 break; 239 break;
232 240
diff --git a/kernel/printk.c b/kernel/printk.c
index b51b1567bb55..6341af77eb65 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -13,7 +13,7 @@
13 * Fixed SMP synchronization, 08/08/99, Manfred Spraul 13 * Fixed SMP synchronization, 08/08/99, Manfred Spraul
14 * manfred@colorfullife.com 14 * manfred@colorfullife.com
15 * Rewrote bits to get rid of console_lock 15 * Rewrote bits to get rid of console_lock
16 * 01Mar01 Andrew Morton <andrewm@uow.edu.au> 16 * 01Mar01 Andrew Morton
17 */ 17 */
18 18
19#include <linux/kernel.h> 19#include <linux/kernel.h>
@@ -577,9 +577,6 @@ static int have_callable_console(void)
577 * @fmt: format string 577 * @fmt: format string
578 * 578 *
579 * This is printk(). It can be called from any context. We want it to work. 579 * This is printk(). It can be called from any context. We want it to work.
580 * Be aware of the fact that if oops_in_progress is not set, we might try to
581 * wake klogd up which could deadlock on runqueue lock if printk() is called
582 * from scheduler code.
583 * 580 *
584 * We try to grab the console_sem. If we succeed, it's easy - we log the output and 581 * We try to grab the console_sem. If we succeed, it's easy - we log the output and
585 * call the console drivers. If we fail to get the semaphore we place the output 582 * call the console drivers. If we fail to get the semaphore we place the output
@@ -593,6 +590,8 @@ static int have_callable_console(void)
593 * 590 *
594 * See also: 591 * See also:
595 * printf(3) 592 * printf(3)
593 *
594 * See the vsnprintf() documentation for format string extensions over C99.
596 */ 595 */
597 596
598asmlinkage int printk(const char *fmt, ...) 597asmlinkage int printk(const char *fmt, ...)
@@ -982,10 +981,25 @@ int is_console_locked(void)
982 return console_locked; 981 return console_locked;
983} 982}
984 983
985void wake_up_klogd(void) 984static DEFINE_PER_CPU(int, printk_pending);
985
986void printk_tick(void)
986{ 987{
987 if (!oops_in_progress && waitqueue_active(&log_wait)) 988 if (__get_cpu_var(printk_pending)) {
989 __get_cpu_var(printk_pending) = 0;
988 wake_up_interruptible(&log_wait); 990 wake_up_interruptible(&log_wait);
991 }
992}
993
994int printk_needs_cpu(int cpu)
995{
996 return per_cpu(printk_pending, cpu);
997}
998
999void wake_up_klogd(void)
1000{
1001 if (waitqueue_active(&log_wait))
1002 __raw_get_cpu_var(printk_pending) = 1;
989} 1003}
990 1004
991/** 1005/**
@@ -1291,22 +1305,6 @@ static int __init disable_boot_consoles(void)
1291} 1305}
1292late_initcall(disable_boot_consoles); 1306late_initcall(disable_boot_consoles);
1293 1307
1294/**
1295 * tty_write_message - write a message to a certain tty, not just the console.
1296 * @tty: the destination tty_struct
1297 * @msg: the message to write
1298 *
1299 * This is used for messages that need to be redirected to a specific tty.
1300 * We don't put it into the syslog queue right now maybe in the future if
1301 * really needed.
1302 */
1303void tty_write_message(struct tty_struct *tty, char *msg)
1304{
1305 if (tty && tty->ops->write)
1306 tty->ops->write(tty, msg, strlen(msg));
1307 return;
1308}
1309
1310#if defined CONFIG_PRINTK 1308#if defined CONFIG_PRINTK
1311 1309
1312/* 1310/*
diff --git a/kernel/profile.c b/kernel/profile.c
index cd26bed4cc26..a9e422df6bf6 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -22,6 +22,8 @@
22#include <linux/cpu.h> 22#include <linux/cpu.h>
23#include <linux/highmem.h> 23#include <linux/highmem.h>
24#include <linux/mutex.h> 24#include <linux/mutex.h>
25#include <linux/slab.h>
26#include <linux/vmalloc.h>
25#include <asm/sections.h> 27#include <asm/sections.h>
26#include <asm/irq_regs.h> 28#include <asm/irq_regs.h>
27#include <asm/ptrace.h> 29#include <asm/ptrace.h>
@@ -50,11 +52,11 @@ static DEFINE_PER_CPU(int, cpu_profile_flip);
50static DEFINE_MUTEX(profile_flip_mutex); 52static DEFINE_MUTEX(profile_flip_mutex);
51#endif /* CONFIG_SMP */ 53#endif /* CONFIG_SMP */
52 54
53static int __init profile_setup(char *str) 55int profile_setup(char *str)
54{ 56{
55 static char __initdata schedstr[] = "schedule"; 57 static char schedstr[] = "schedule";
56 static char __initdata sleepstr[] = "sleep"; 58 static char sleepstr[] = "sleep";
57 static char __initdata kvmstr[] = "kvm"; 59 static char kvmstr[] = "kvm";
58 int par; 60 int par;
59 61
60 if (!strncmp(str, sleepstr, strlen(sleepstr))) { 62 if (!strncmp(str, sleepstr, strlen(sleepstr))) {
@@ -100,14 +102,33 @@ static int __init profile_setup(char *str)
100__setup("profile=", profile_setup); 102__setup("profile=", profile_setup);
101 103
102 104
103void __init profile_init(void) 105int profile_init(void)
104{ 106{
107 int buffer_bytes;
105 if (!prof_on) 108 if (!prof_on)
106 return; 109 return 0;
107 110
108 /* only text is profiled */ 111 /* only text is profiled */
109 prof_len = (_etext - _stext) >> prof_shift; 112 prof_len = (_etext - _stext) >> prof_shift;
110 prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); 113 buffer_bytes = prof_len*sizeof(atomic_t);
114 if (!slab_is_available()) {
115 prof_buffer = alloc_bootmem(buffer_bytes);
116 return 0;
117 }
118
119 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL);
120 if (prof_buffer)
121 return 0;
122
123 prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO);
124 if (prof_buffer)
125 return 0;
126
127 prof_buffer = vmalloc(buffer_bytes);
128 if (prof_buffer)
129 return 0;
130
131 return -ENOMEM;
111} 132}
112 133
113/* Profile event notifications */ 134/* Profile event notifications */
@@ -527,7 +548,7 @@ static void __init profile_nop(void *unused)
527{ 548{
528} 549}
529 550
530static int __init create_hash_tables(void) 551static int create_hash_tables(void)
531{ 552{
532 int cpu; 553 int cpu;
533 554
@@ -575,14 +596,14 @@ out_cleanup:
575#define create_hash_tables() ({ 0; }) 596#define create_hash_tables() ({ 0; })
576#endif 597#endif
577 598
578static int __init create_proc_profile(void) 599int create_proc_profile(void)
579{ 600{
580 struct proc_dir_entry *entry; 601 struct proc_dir_entry *entry;
581 602
582 if (!prof_on) 603 if (!prof_on)
583 return 0; 604 return 0;
584 if (create_hash_tables()) 605 if (create_hash_tables())
585 return -1; 606 return -ENOMEM;
586 entry = proc_create("profile", S_IWUSR | S_IRUGO, 607 entry = proc_create("profile", S_IWUSR | S_IRUGO,
587 NULL, &proc_profile_operations); 608 NULL, &proc_profile_operations);
588 if (!entry) 609 if (!entry)
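[editor's note] profile_init() above can now run after boot (via the new sysfs hook), so it tries kzalloc(), then alloc_pages_exact(), then vmalloc() before giving up, and only uses bootmem when slab is not yet available. The same fallback chain as a standalone sketch (hypothetical helper; a real caller would also have to remember which allocator succeeded in order to free correctly, something profile.c sidesteps by never freeing the buffer):

        #include <linux/mm.h>
        #include <linux/slab.h>
        #include <linux/string.h>
        #include <linux/vmalloc.h>

        static void *alloc_zeroed_buffer(size_t bytes)
        {
                void *buf = kzalloc(bytes, GFP_KERNEL);

                if (buf)
                        return buf;
                buf = alloc_pages_exact(bytes, GFP_KERNEL | __GFP_ZERO);
                if (buf)
                        return buf;
                buf = vmalloc(bytes);
                if (buf)
                        memset(buf, 0, bytes);
                return buf;
        }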
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 356699a96d56..1e68e4c39e2c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -45,7 +45,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
45 * TASK_TRACED, resume it now. 45 * TASK_TRACED, resume it now.
46 * Requires that irqs be disabled. 46 * Requires that irqs be disabled.
47 */ 47 */
48void ptrace_untrace(struct task_struct *child) 48static void ptrace_untrace(struct task_struct *child)
49{ 49{
50 spin_lock(&child->sighand->siglock); 50 spin_lock(&child->sighand->siglock);
51 if (task_is_traced(child)) { 51 if (task_is_traced(child)) {
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index aad93cdc9f68..37f72e551542 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -47,6 +47,7 @@
47#include <linux/notifier.h> 47#include <linux/notifier.h>
48#include <linux/cpu.h> 48#include <linux/cpu.h>
49#include <linux/mutex.h> 49#include <linux/mutex.h>
50#include <linux/time.h>
50 51
51#ifdef CONFIG_DEBUG_LOCK_ALLOC 52#ifdef CONFIG_DEBUG_LOCK_ALLOC
52static struct lock_class_key rcu_lock_key; 53static struct lock_class_key rcu_lock_key;
@@ -60,12 +61,14 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
60static struct rcu_ctrlblk rcu_ctrlblk = { 61static struct rcu_ctrlblk rcu_ctrlblk = {
61 .cur = -300, 62 .cur = -300,
62 .completed = -300, 63 .completed = -300,
64 .pending = -300,
63 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), 65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
64 .cpumask = CPU_MASK_NONE, 66 .cpumask = CPU_MASK_NONE,
65}; 67};
66static struct rcu_ctrlblk rcu_bh_ctrlblk = { 68static struct rcu_ctrlblk rcu_bh_ctrlblk = {
67 .cur = -300, 69 .cur = -300,
68 .completed = -300, 70 .completed = -300,
71 .pending = -300,
69 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), 72 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
70 .cpumask = CPU_MASK_NONE, 73 .cpumask = CPU_MASK_NONE,
71}; 74};
@@ -83,7 +86,10 @@ static void force_quiescent_state(struct rcu_data *rdp,
83{ 86{
84 int cpu; 87 int cpu;
85 cpumask_t cpumask; 88 cpumask_t cpumask;
89 unsigned long flags;
90
86 set_need_resched(); 91 set_need_resched();
92 spin_lock_irqsave(&rcp->lock, flags);
87 if (unlikely(!rcp->signaled)) { 93 if (unlikely(!rcp->signaled)) {
88 rcp->signaled = 1; 94 rcp->signaled = 1;
89 /* 95 /*
@@ -109,6 +115,7 @@ static void force_quiescent_state(struct rcu_data *rdp,
109 for_each_cpu_mask_nr(cpu, cpumask) 115 for_each_cpu_mask_nr(cpu, cpumask)
110 smp_send_reschedule(cpu); 116 smp_send_reschedule(cpu);
111 } 117 }
118 spin_unlock_irqrestore(&rcp->lock, flags);
112} 119}
113#else 120#else
114static inline void force_quiescent_state(struct rcu_data *rdp, 121static inline void force_quiescent_state(struct rcu_data *rdp,
@@ -118,6 +125,126 @@ static inline void force_quiescent_state(struct rcu_data *rdp,
118} 125}
119#endif 126#endif
120 127
128static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
129 struct rcu_data *rdp)
130{
131 long batch;
132
133 head->next = NULL;
134 smp_mb(); /* Read of rcu->cur must happen after any change by caller. */
135
136 /*
137 * Determine the batch number of this callback.
138 *
139 * Using ACCESS_ONCE to avoid the following error when gcc eliminates
140 * local variable "batch" and emits codes like this:
141 * 1) rdp->batch = rcp->cur + 1 # gets old value
142 * ......
143 * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
144 * then [*nxttail[0], *nxttail[1]) may contain callbacks
145 * that batch# = rdp->batch, see the comment of struct rcu_data.
146 */
147 batch = ACCESS_ONCE(rcp->cur) + 1;
148
149 if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
150 /* process callbacks */
151 rdp->nxttail[0] = rdp->nxttail[1];
152 rdp->nxttail[1] = rdp->nxttail[2];
153 if (rcu_batch_after(batch - 1, rdp->batch))
154 rdp->nxttail[0] = rdp->nxttail[2];
155 }
156
157 rdp->batch = batch;
158 *rdp->nxttail[2] = head;
159 rdp->nxttail[2] = &head->next;
160
161 if (unlikely(++rdp->qlen > qhimark)) {
162 rdp->blimit = INT_MAX;
163 force_quiescent_state(rdp, &rcu_ctrlblk);
164 }
165}
166
167#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
168
169static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
170{
171 rcp->gp_start = jiffies;
172 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
173}
174
175static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
176{
177 int cpu;
178 long delta;
179 unsigned long flags;
180
181 /* Only let one CPU complain about others per time interval. */
182
183 spin_lock_irqsave(&rcp->lock, flags);
184 delta = jiffies - rcp->jiffies_stall;
185 if (delta < 2 || rcp->cur != rcp->completed) {
186 spin_unlock_irqrestore(&rcp->lock, flags);
187 return;
188 }
189 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
190 spin_unlock_irqrestore(&rcp->lock, flags);
191
192 /* OK, time to rat on our buddy... */
193
194 printk(KERN_ERR "RCU detected CPU stalls:");
195 for_each_possible_cpu(cpu) {
196 if (cpu_isset(cpu, rcp->cpumask))
197 printk(" %d", cpu);
198 }
199 printk(" (detected by %d, t=%ld jiffies)\n",
200 smp_processor_id(), (long)(jiffies - rcp->gp_start));
201}
202
203static void print_cpu_stall(struct rcu_ctrlblk *rcp)
204{
205 unsigned long flags;
206
207 printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
208 smp_processor_id(), jiffies,
209 jiffies - rcp->gp_start);
210 dump_stack();
211 spin_lock_irqsave(&rcp->lock, flags);
212 if ((long)(jiffies - rcp->jiffies_stall) >= 0)
213 rcp->jiffies_stall =
214 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
215 spin_unlock_irqrestore(&rcp->lock, flags);
216 set_need_resched(); /* kick ourselves to get things going. */
217}
218
219static void check_cpu_stall(struct rcu_ctrlblk *rcp)
220{
221 long delta;
222
223 delta = jiffies - rcp->jiffies_stall;
224 if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) {
225
226 /* We haven't checked in, so go dump stack. */
227 print_cpu_stall(rcp);
228
229 } else if (rcp->cur != rcp->completed && delta >= 2) {
230
231 /* They had two seconds to dump stack, so complain. */
232 print_other_cpu_stall(rcp);
233 }
234}
235
236#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
237
238static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
239{
240}
241
242static inline void check_cpu_stall(struct rcu_ctrlblk *rcp)
243{
244}
245
246#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
247
121/** 248/**
122 * call_rcu - Queue an RCU callback for invocation after a grace period. 249 * call_rcu - Queue an RCU callback for invocation after a grace period.
123 * @head: structure to be used for queueing the RCU updates. 250 * @head: structure to be used for queueing the RCU updates.
@@ -133,18 +260,10 @@ void call_rcu(struct rcu_head *head,
133 void (*func)(struct rcu_head *rcu)) 260 void (*func)(struct rcu_head *rcu))
134{ 261{
135 unsigned long flags; 262 unsigned long flags;
136 struct rcu_data *rdp;
137 263
138 head->func = func; 264 head->func = func;
139 head->next = NULL;
140 local_irq_save(flags); 265 local_irq_save(flags);
141 rdp = &__get_cpu_var(rcu_data); 266 __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
142 *rdp->nxttail = head;
143 rdp->nxttail = &head->next;
144 if (unlikely(++rdp->qlen > qhimark)) {
145 rdp->blimit = INT_MAX;
146 force_quiescent_state(rdp, &rcu_ctrlblk);
147 }
148 local_irq_restore(flags); 267 local_irq_restore(flags);
149} 268}
150EXPORT_SYMBOL_GPL(call_rcu); 269EXPORT_SYMBOL_GPL(call_rcu);
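[editor's note] The queueing rework above (the three-segment nxttail[] plus the batch numbering in __call_rcu()) is internal to rcuclassic; the call_rcu() contract is unchanged. A standard usage sketch with a made-up structure:

        #include <linux/rcupdate.h>
        #include <linux/slab.h>

        struct my_obj {                         /* hypothetical */
                struct rcu_head rcu;
                int data;
        };

        static void my_obj_free(struct rcu_head *head)
        {
                kfree(container_of(head, struct my_obj, rcu));
        }

        static void my_obj_release(struct my_obj *obj)
        {
                /* Freed only after all pre-existing RCU readers are done. */
                call_rcu(&obj->rcu, my_obj_free);
        }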
@@ -169,20 +288,10 @@ void call_rcu_bh(struct rcu_head *head,
169 void (*func)(struct rcu_head *rcu)) 288 void (*func)(struct rcu_head *rcu))
170{ 289{
171 unsigned long flags; 290 unsigned long flags;
172 struct rcu_data *rdp;
173 291
174 head->func = func; 292 head->func = func;
175 head->next = NULL;
176 local_irq_save(flags); 293 local_irq_save(flags);
177 rdp = &__get_cpu_var(rcu_bh_data); 294 __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
178 *rdp->nxttail = head;
179 rdp->nxttail = &head->next;
180
181 if (unlikely(++rdp->qlen > qhimark)) {
182 rdp->blimit = INT_MAX;
183 force_quiescent_state(rdp, &rcu_bh_ctrlblk);
184 }
185
186 local_irq_restore(flags); 295 local_irq_restore(flags);
187} 296}
188EXPORT_SYMBOL_GPL(call_rcu_bh); 297EXPORT_SYMBOL_GPL(call_rcu_bh);
@@ -211,12 +320,6 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
211static inline void raise_rcu_softirq(void) 320static inline void raise_rcu_softirq(void)
212{ 321{
213 raise_softirq(RCU_SOFTIRQ); 322 raise_softirq(RCU_SOFTIRQ);
214 /*
215 * The smp_mb() here is required to ensure that this cpu's
216 * __rcu_process_callbacks() reads the most recently updated
217 * value of rcu->cur.
218 */
219 smp_mb();
220} 323}
221 324
222/* 325/*
@@ -225,6 +328,7 @@ static inline void raise_rcu_softirq(void)
225 */ 328 */
226static void rcu_do_batch(struct rcu_data *rdp) 329static void rcu_do_batch(struct rcu_data *rdp)
227{ 330{
331 unsigned long flags;
228 struct rcu_head *next, *list; 332 struct rcu_head *next, *list;
229 int count = 0; 333 int count = 0;
230 334
@@ -239,9 +343,9 @@ static void rcu_do_batch(struct rcu_data *rdp)
239 } 343 }
240 rdp->donelist = list; 344 rdp->donelist = list;
241 345
242 local_irq_disable(); 346 local_irq_save(flags);
243 rdp->qlen -= count; 347 rdp->qlen -= count;
244 local_irq_enable(); 348 local_irq_restore(flags);
245 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) 349 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
246 rdp->blimit = blimit; 350 rdp->blimit = blimit;
247 351
@@ -269,6 +373,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
269 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace 373 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
270 * period (if necessary). 374 * period (if necessary).
271 */ 375 */
376
272/* 377/*
273 * Register a new batch of callbacks, and start it up if there is currently no 378 * Register a new batch of callbacks, and start it up if there is currently no
274 * active batch and the batch to be registered has not already occurred. 379 * active batch and the batch to be registered has not already occurred.
@@ -276,15 +381,10 @@ static void rcu_do_batch(struct rcu_data *rdp)
276 */ 381 */
277static void rcu_start_batch(struct rcu_ctrlblk *rcp) 382static void rcu_start_batch(struct rcu_ctrlblk *rcp)
278{ 383{
279 if (rcp->next_pending && 384 if (rcp->cur != rcp->pending &&
280 rcp->completed == rcp->cur) { 385 rcp->completed == rcp->cur) {
281 rcp->next_pending = 0;
282 /*
283 * next_pending == 0 must be visible in
284 * __rcu_process_callbacks() before it can see new value of cur.
285 */
286 smp_wmb();
287 rcp->cur++; 386 rcp->cur++;
387 record_gp_stall_check_time(rcp);
288 388
289 /* 389 /*
290 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a 390 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
@@ -322,6 +422,8 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
322static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, 422static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
323 struct rcu_data *rdp) 423 struct rcu_data *rdp)
324{ 424{
425 unsigned long flags;
426
325 if (rdp->quiescbatch != rcp->cur) { 427 if (rdp->quiescbatch != rcp->cur) {
326 /* start new grace period: */ 428 /* start new grace period: */
327 rdp->qs_pending = 1; 429 rdp->qs_pending = 1;
@@ -345,7 +447,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
345 return; 447 return;
346 rdp->qs_pending = 0; 448 rdp->qs_pending = 0;
347 449
348 spin_lock(&rcp->lock); 450 spin_lock_irqsave(&rcp->lock, flags);
349 /* 451 /*
350 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync 452 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
351 * during cpu startup. Ignore the quiescent state. 453 * during cpu startup. Ignore the quiescent state.
@@ -353,7 +455,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
353 if (likely(rdp->quiescbatch == rcp->cur)) 455 if (likely(rdp->quiescbatch == rcp->cur))
354 cpu_quiet(rdp->cpu, rcp); 456 cpu_quiet(rdp->cpu, rcp);
355 457
356 spin_unlock(&rcp->lock); 458 spin_unlock_irqrestore(&rcp->lock, flags);
357} 459}
358 460
359 461
@@ -364,33 +466,38 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
364 * which is dead and hence not processing interrupts. 466 * which is dead and hence not processing interrupts.
365 */ 467 */
366static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, 468static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
367 struct rcu_head **tail) 469 struct rcu_head **tail, long batch)
368{ 470{
369 local_irq_disable(); 471 unsigned long flags;
370 *this_rdp->nxttail = list; 472
371 if (list) 473 if (list) {
372 this_rdp->nxttail = tail; 474 local_irq_save(flags);
373 local_irq_enable(); 475 this_rdp->batch = batch;
476 *this_rdp->nxttail[2] = list;
477 this_rdp->nxttail[2] = tail;
478 local_irq_restore(flags);
479 }
374} 480}
375 481
376static void __rcu_offline_cpu(struct rcu_data *this_rdp, 482static void __rcu_offline_cpu(struct rcu_data *this_rdp,
377 struct rcu_ctrlblk *rcp, struct rcu_data *rdp) 483 struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
378{ 484{
379 /* if the cpu going offline owns the grace period 485 unsigned long flags;
486
487 /*
488 * if the cpu going offline owns the grace period
380 * we can block indefinitely waiting for it, so flush 489 * we can block indefinitely waiting for it, so flush
381 * it here 490 * it here
382 */ 491 */
383 spin_lock_bh(&rcp->lock); 492 spin_lock_irqsave(&rcp->lock, flags);
384 if (rcp->cur != rcp->completed) 493 if (rcp->cur != rcp->completed)
385 cpu_quiet(rdp->cpu, rcp); 494 cpu_quiet(rdp->cpu, rcp);
386 spin_unlock_bh(&rcp->lock); 495 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
387 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); 496 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
388 rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); 497 spin_unlock(&rcp->lock);
389 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
390 498
391 local_irq_disable();
392 this_rdp->qlen += rdp->qlen; 499 this_rdp->qlen += rdp->qlen;
393 local_irq_enable(); 500 local_irq_restore(flags);
394} 501}
395 502
396static void rcu_offline_cpu(int cpu) 503static void rcu_offline_cpu(int cpu)
@@ -420,38 +527,52 @@ static void rcu_offline_cpu(int cpu)
420static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, 527static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
421 struct rcu_data *rdp) 528 struct rcu_data *rdp)
422{ 529{
423 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { 530 unsigned long flags;
424 *rdp->donetail = rdp->curlist; 531 long completed_snap;
425 rdp->donetail = rdp->curtail;
426 rdp->curlist = NULL;
427 rdp->curtail = &rdp->curlist;
428 }
429 532
430 if (rdp->nxtlist && !rdp->curlist) { 533 if (rdp->nxtlist) {
431 local_irq_disable(); 534 local_irq_save(flags);
432 rdp->curlist = rdp->nxtlist; 535 completed_snap = ACCESS_ONCE(rcp->completed);
433 rdp->curtail = rdp->nxttail;
434 rdp->nxtlist = NULL;
435 rdp->nxttail = &rdp->nxtlist;
436 local_irq_enable();
437 536
438 /* 537 /*
439 * start the next batch of callbacks 538 * move the other grace-period-completed entries to
539 * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
440 */ 540 */
541 if (!rcu_batch_before(completed_snap, rdp->batch))
542 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
543 else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
544 rdp->nxttail[0] = rdp->nxttail[1];
441 545
442 /* determine batch number */ 546 /*
443 rdp->batch = rcp->cur + 1; 547 * the grace period for entries in
444 /* see the comment and corresponding wmb() in 548 * [rdp->nxtlist, *rdp->nxttail[0]) has completed and
445 * the rcu_start_batch() 549 * move these entries to donelist
446 */ 550 */
447 smp_rmb(); 551 if (rdp->nxttail[0] != &rdp->nxtlist) {
552 *rdp->donetail = rdp->nxtlist;
553 rdp->donetail = rdp->nxttail[0];
554 rdp->nxtlist = *rdp->nxttail[0];
555 *rdp->donetail = NULL;
556
557 if (rdp->nxttail[1] == rdp->nxttail[0])
558 rdp->nxttail[1] = &rdp->nxtlist;
559 if (rdp->nxttail[2] == rdp->nxttail[0])
560 rdp->nxttail[2] = &rdp->nxtlist;
561 rdp->nxttail[0] = &rdp->nxtlist;
562 }
563
564 local_irq_restore(flags);
565
566 if (rcu_batch_after(rdp->batch, rcp->pending)) {
567 unsigned long flags2;
448 568
449 if (!rcp->next_pending) {
450 /* and start it/schedule start if it's a new batch */ 569 /* and start it/schedule start if it's a new batch */
451 spin_lock(&rcp->lock); 570 spin_lock_irqsave(&rcp->lock, flags2);
452 rcp->next_pending = 1; 571 if (rcu_batch_after(rdp->batch, rcp->pending)) {
453 rcu_start_batch(rcp); 572 rcp->pending = rdp->batch;
454 spin_unlock(&rcp->lock); 573 rcu_start_batch(rcp);
574 }
575 spin_unlock_irqrestore(&rcp->lock, flags2);
455 } 576 }
456 } 577 }
457 578
@@ -462,21 +583,53 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
462 583
463static void rcu_process_callbacks(struct softirq_action *unused) 584static void rcu_process_callbacks(struct softirq_action *unused)
464{ 585{
586 /*
587 * Memory references from any prior RCU read-side critical sections
 588 * executed by the interrupted code must be seen before any RCU
 589 * grace-period manipulations below.
590 */
591
592 smp_mb(); /* See above block comment. */
593
465 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); 594 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
466 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); 595 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
596
597 /*
598 * Memory references from any later RCU read-side critical sections
 599 * executed by the interrupted code must be seen after any RCU
 600 * grace-period manipulations above.
601 */
602
603 smp_mb(); /* See above block comment. */
467} 604}
468 605
469static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) 606static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
470{ 607{
471 /* This cpu has pending rcu entries and the grace period 608 /* Check for CPU stalls, if enabled. */
472 * for them has completed. 609 check_cpu_stall(rcp);
473 */
474 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
475 return 1;
476 610
477 /* This cpu has no pending entries, but there are new entries */ 611 if (rdp->nxtlist) {
478 if (!rdp->curlist && rdp->nxtlist) 612 long completed_snap = ACCESS_ONCE(rcp->completed);
479 return 1; 613
614 /*
615 * This cpu has pending rcu entries and the grace period
616 * for them has completed.
617 */
618 if (!rcu_batch_before(completed_snap, rdp->batch))
619 return 1;
620 if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
621 rdp->nxttail[0] != rdp->nxttail[1])
622 return 1;
623 if (rdp->nxttail[0] != &rdp->nxtlist)
624 return 1;
625
626 /*
627 * This cpu has pending rcu entries and the new batch
628 * for then hasn't been started nor scheduled start
629 */
630 if (rcu_batch_after(rdp->batch, rcp->pending))
631 return 1;
632 }
480 633
481 /* This cpu has finished callbacks to invoke */ 634 /* This cpu has finished callbacks to invoke */
482 if (rdp->donelist) 635 if (rdp->donelist)
@@ -512,9 +665,15 @@ int rcu_needs_cpu(int cpu)
512 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); 665 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
513 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); 666 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
514 667
515 return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); 668 return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
516} 669}
517 670
671/*
672 * Top-level function driving RCU grace-period detection, normally
673 * invoked from the scheduler-clock interrupt. This function simply
674 * increments counters that are read only from softirq by this same
675 * CPU, so there are no memory barriers required.
676 */
518void rcu_check_callbacks(int cpu, int user) 677void rcu_check_callbacks(int cpu, int user)
519{ 678{
520 if (user || 679 if (user ||
@@ -558,14 +717,17 @@ void rcu_check_callbacks(int cpu, int user)
558static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, 717static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
559 struct rcu_data *rdp) 718 struct rcu_data *rdp)
560{ 719{
720 unsigned long flags;
721
722 spin_lock_irqsave(&rcp->lock, flags);
561 memset(rdp, 0, sizeof(*rdp)); 723 memset(rdp, 0, sizeof(*rdp));
562 rdp->curtail = &rdp->curlist; 724 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
563 rdp->nxttail = &rdp->nxtlist;
564 rdp->donetail = &rdp->donelist; 725 rdp->donetail = &rdp->donelist;
565 rdp->quiescbatch = rcp->completed; 726 rdp->quiescbatch = rcp->completed;
566 rdp->qs_pending = 0; 727 rdp->qs_pending = 0;
567 rdp->cpu = cpu; 728 rdp->cpu = cpu;
568 rdp->blimit = blimit; 729 rdp->blimit = blimit;
730 spin_unlock_irqrestore(&rcp->lock, flags);
569} 731}
570 732
571static void __cpuinit rcu_online_cpu(int cpu) 733static void __cpuinit rcu_online_cpu(int cpu)
@@ -610,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = {
610 */ 772 */
611void __init __rcu_init(void) 773void __init __rcu_init(void)
612{ 774{
775#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
776 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
777#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
613 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, 778 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
614 (void *)(long)smp_processor_id()); 779 (void *)(long)smp_processor_id());
615 /* Register notifier for non-boot CPUs */ 780 /* Register notifier for non-boot CPUs */
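A note on the batch arithmetic used throughout the rcuclassic.c hunks above: rdp->batch, rcp->cur, rcp->completed and rcp->pending are compared with rcu_batch_before()/rcu_batch_after() rather than raw relational operators, so the ordering survives counter wrap. This is also what lets __rcu_pending() test a completed_snap taken with ACCESS_ONCE() against rdp->batch without locking. A minimal userspace sketch of that idiom (names simplified; the exact kernel helpers live in the RCU headers and take the difference first):

#include <assert.h>

/*
 * Wrap-safe batch ordering, modelled on rcu_batch_before()/rcu_batch_after():
 * the difference is computed first, so the comparison keeps working when the
 * batch counters eventually wrap (the kernel builds with wrapping signed
 * arithmetic; a plain '<' on the raw values would not).
 */
static int batch_before(long a, long b) { return (a - b) < 0; }
static int batch_after(long a, long b)  { return (a - b) > 0; }

int main(void)
{
        long completed = 41, batch = 42;

        assert(batch_before(completed, batch));  /* grace period not finished yet */
        assert(batch_after(batch, completed));
        assert(!batch_before(batch, batch));     /* completed == batch counts as done */
        return 0;
}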
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 27827931ca0d..59236e8b9daa 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -54,17 +54,9 @@
54#include <linux/cpu.h> 54#include <linux/cpu.h>
55#include <linux/random.h> 55#include <linux/random.h>
56#include <linux/delay.h> 56#include <linux/delay.h>
57#include <linux/byteorder/swabb.h>
58#include <linux/cpumask.h> 57#include <linux/cpumask.h>
59#include <linux/rcupreempt_trace.h> 58#include <linux/rcupreempt_trace.h>
60 59#include <asm/byteorder.h>
61/*
62 * Macro that prevents the compiler from reordering accesses, but does
63 * absolutely -nothing- to prevent CPUs from reordering. This is used
64 * only to mediate communication between mainline code and hardware
65 * interrupt and NMI handlers.
66 */
67#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
68 60
69/* 61/*
70 * PREEMPT_RCU data structures. 62 * PREEMPT_RCU data structures.
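The ACCESS_ONCE() definition deleted above is the same volatile-cast idiom now provided centrally; it constrains only the compiler, not the CPU. A small userspace illustration of the idiom, reusing the definition visible in the removed lines (GCC typeof; the 'completed' variable is made up, mirroring the completed_snap usage in rcuclassic.c):

#include <stdio.h>

/* Same definition as the one removed above: force exactly one load/store
 * through a volatile lvalue so the compiler cannot re-read, cache or tear it. */
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

static long completed;

static void reader(void)
{
        long snap = ACCESS_ONCE(completed);   /* snapshot once, use 'snap' afterwards */

        printf("completed snapshot: %ld\n", snap);
}

int main(void)
{
        ACCESS_ONCE(completed) = 42;          /* single store, never merged by the compiler */
        reader();
        return 0;
}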
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 5edf82c34bbc..35c2d3360ecf 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -308,11 +308,16 @@ out:
308 308
309static int __init rcupreempt_trace_init(void) 309static int __init rcupreempt_trace_init(void)
310{ 310{
311 int ret;
312
311 mutex_init(&rcupreempt_trace_mutex); 313 mutex_init(&rcupreempt_trace_mutex);
312 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); 314 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
313 if (!rcupreempt_trace_buf) 315 if (!rcupreempt_trace_buf)
314 return 1; 316 return 1;
315 return rcupreempt_debugfs_init(); 317 ret = rcupreempt_debugfs_init();
318 if (ret)
319 kfree(rcupreempt_trace_buf);
320 return ret;
316} 321}
317 322
318static void __exit rcupreempt_trace_cleanup(void) 323static void __exit rcupreempt_trace_cleanup(void)
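The rcupreempt_trace_init() fix is the standard error-unwind pattern: anything allocated before a failing step is released before the error is returned. A generic sketch of that pattern under assumed names (not the trace code itself):

#include <stdlib.h>

/* Hypothetical two-step init mirroring the fix: if the second step fails,
 * release what the first step allocated before returning its error. */
static char *trace_buf;

static int debugfs_like_init(void)      /* stand-in for rcupreempt_debugfs_init() */
{
        return -1;                      /* simulate a registration failure */
}

static int trace_like_init(void)
{
        int ret;

        trace_buf = malloc(4096);
        if (!trace_buf)
                return 1;

        ret = debugfs_like_init();
        if (ret) {
                free(trace_buf);        /* the cleanup the patch adds */
                trace_buf = NULL;
        }
        return ret;
}

int main(void)
{
        return trace_like_init() ? 1 : 0;
}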
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 90b5b123f7a1..85cb90588a55 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -42,10 +42,10 @@
42#include <linux/freezer.h> 42#include <linux/freezer.h>
43#include <linux/cpu.h> 43#include <linux/cpu.h>
44#include <linux/delay.h> 44#include <linux/delay.h>
45#include <linux/byteorder/swabb.h>
46#include <linux/stat.h> 45#include <linux/stat.h>
47#include <linux/srcu.h> 46#include <linux/srcu.h>
48#include <linux/slab.h> 47#include <linux/slab.h>
48#include <asm/byteorder.h>
49 49
50MODULE_LICENSE("GPL"); 50MODULE_LICENSE("GPL");
51MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 51MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
diff --git a/kernel/resource.c b/kernel/resource.c
index 03d796c1b2e9..4089d12af6e0 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -38,10 +38,6 @@ EXPORT_SYMBOL(iomem_resource);
38 38
39static DEFINE_RWLOCK(resource_lock); 39static DEFINE_RWLOCK(resource_lock);
40 40
41#ifdef CONFIG_PROC_FS
42
43enum { MAX_IORES_LEVEL = 5 };
44
45static void *r_next(struct seq_file *m, void *v, loff_t *pos) 41static void *r_next(struct seq_file *m, void *v, loff_t *pos)
46{ 42{
47 struct resource *p = v; 43 struct resource *p = v;
@@ -53,6 +49,10 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos)
53 return p->sibling; 49 return p->sibling;
54} 50}
55 51
52#ifdef CONFIG_PROC_FS
53
54enum { MAX_IORES_LEVEL = 5 };
55
56static void *r_start(struct seq_file *m, loff_t *pos) 56static void *r_start(struct seq_file *m, loff_t *pos)
57 __acquires(resource_lock) 57 __acquires(resource_lock)
58{ 58{
@@ -516,6 +516,70 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
516 return result; 516 return result;
517} 517}
518 518
519static void __init __reserve_region_with_split(struct resource *root,
520 resource_size_t start, resource_size_t end,
521 const char *name)
522{
523 struct resource *parent = root;
524 struct resource *conflict;
525 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
526
527 if (!res)
528 return;
529
530 res->name = name;
531 res->start = start;
532 res->end = end;
533 res->flags = IORESOURCE_BUSY;
534
535 for (;;) {
536 conflict = __request_resource(parent, res);
537 if (!conflict)
538 break;
539 if (conflict != parent) {
540 parent = conflict;
541 if (!(conflict->flags & IORESOURCE_BUSY))
542 continue;
543 }
544
545 /* Uhhuh, that didn't work out.. */
546 kfree(res);
547 res = NULL;
548 break;
549 }
550
551 if (!res) {
552 /* failed, split and try again */
553
554 /* conflict covered whole area */
555 if (conflict->start <= start && conflict->end >= end)
556 return;
557
558 if (conflict->start > start)
559 __reserve_region_with_split(root, start, conflict->start-1, name);
560 if (!(conflict->flags & IORESOURCE_BUSY)) {
561 resource_size_t common_start, common_end;
562
563 common_start = max(conflict->start, start);
564 common_end = min(conflict->end, end);
565 if (common_start < common_end)
566 __reserve_region_with_split(root, common_start, common_end, name);
567 }
568 if (conflict->end < end)
569 __reserve_region_with_split(root, conflict->end+1, end, name);
570 }
571
572}
573
574void reserve_region_with_split(struct resource *root,
575 resource_size_t start, resource_size_t end,
576 const char *name)
577{
578 write_lock(&resource_lock);
579 __reserve_region_with_split(root, start, end, name);
580 write_unlock(&resource_lock);
581}
582
519EXPORT_SYMBOL(adjust_resource); 583EXPORT_SYMBOL(adjust_resource);
520 584
521/** 585/**
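reserve_region_with_split() walks the requested window and recurses around any busy conflict, so only the still-free pieces end up reserved. A hedged sketch of a possible caller (the address range, name and caller are invented for illustration; since the function is not exported, real users would be built-in arch setup code):

#include <linux/init.h>
#include <linux/ioport.h>

/*
 * Hypothetical built-in caller: reserve a firmware window in the iomem tree,
 * splitting around anything in that window already marked busy instead of
 * failing the whole request.
 */
static void __init reserve_firmware_hole(void)
{
        reserve_region_with_split(&iomem_resource,
                                  0xd0000000, 0xd00fffff, "firmware hole");
}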
@@ -562,33 +626,34 @@ struct resource * __request_region(struct resource *parent,
562{ 626{
563 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 627 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
564 628
565 if (res) { 629 if (!res)
566 res->name = name; 630 return NULL;
567 res->start = start;
568 res->end = start + n - 1;
569 res->flags = IORESOURCE_BUSY;
570 631
571 write_lock(&resource_lock); 632 res->name = name;
633 res->start = start;
634 res->end = start + n - 1;
635 res->flags = IORESOURCE_BUSY;
572 636
573 for (;;) { 637 write_lock(&resource_lock);
574 struct resource *conflict;
575 638
576 conflict = __request_resource(parent, res); 639 for (;;) {
577 if (!conflict) 640 struct resource *conflict;
578 break;
579 if (conflict != parent) {
580 parent = conflict;
581 if (!(conflict->flags & IORESOURCE_BUSY))
582 continue;
583 }
584 641
585 /* Uhhuh, that didn't work out.. */ 642 conflict = __request_resource(parent, res);
586 kfree(res); 643 if (!conflict)
587 res = NULL;
588 break; 644 break;
645 if (conflict != parent) {
646 parent = conflict;
647 if (!(conflict->flags & IORESOURCE_BUSY))
648 continue;
589 } 649 }
590 write_unlock(&resource_lock); 650
651 /* Uhhuh, that didn't work out.. */
652 kfree(res);
653 res = NULL;
654 break;
591 } 655 }
656 write_unlock(&resource_lock);
592 return res; 657 return res;
593} 658}
594EXPORT_SYMBOL(__request_region); 659EXPORT_SYMBOL(__request_region);
@@ -763,3 +828,40 @@ static int __init reserve_setup(char *str)
763} 828}
764 829
765__setup("reserve=", reserve_setup); 830__setup("reserve=", reserve_setup);
831
832/*
 833 * Check if the requested addr and size span more than any slot in the
834 * iomem resource tree.
835 */
836int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
837{
838 struct resource *p = &iomem_resource;
839 int err = 0;
840 loff_t l;
841
842 read_lock(&resource_lock);
843 for (p = p->child; p ; p = r_next(NULL, p, &l)) {
844 /*
845 * We can probably skip the resources without
846 * IORESOURCE_IO attribute?
847 */
848 if (p->start >= addr + size)
849 continue;
850 if (p->end < addr)
851 continue;
852 if (p->start <= addr && (p->end >= addr + size - 1))
853 continue;
854 printk(KERN_WARNING "resource map sanity check conflict: "
855 "0x%llx 0x%llx 0x%llx 0x%llx %s\n",
856 (unsigned long long)addr,
857 (unsigned long long)(addr + size - 1),
858 (unsigned long long)p->start,
859 (unsigned long long)p->end,
860 p->name);
861 err = -1;
862 break;
863 }
864 read_unlock(&resource_lock);
865
866 return err;
867}
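iomem_map_sanity_check() returns 0 when [addr, addr + size) stays inside one node of the iomem tree, and -1 after printing a KERN_WARNING otherwise. One way a mapping path might consult it is sketched below; the wrapper name and the refuse-on-conflict policy are assumptions, and real callers may prefer to warn and map anyway:

#include <linux/io.h>
#include <linux/ioport.h>

/*
 * Hypothetical wrapper around ioremap(): reject mappings that straddle
 * iomem slots.  The prototype of iomem_map_sanity_check() is assumed to be
 * visible through the io headers.
 */
static void __iomem *ioremap_checked(resource_size_t phys, unsigned long size)
{
        if (iomem_map_sanity_check(phys, size))
                return NULL;    /* the check already printed the conflicting slot */

        return ioremap(phys, size);
}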
diff --git a/kernel/sched.c b/kernel/sched.c
index cc1f81b50b82..6f230596bd0c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -201,14 +201,19 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
201 hrtimer_init(&rt_b->rt_period_timer, 201 hrtimer_init(&rt_b->rt_period_timer,
202 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 202 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
203 rt_b->rt_period_timer.function = sched_rt_period_timer; 203 rt_b->rt_period_timer.function = sched_rt_period_timer;
204 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 204 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
205}
206
207static inline int rt_bandwidth_enabled(void)
208{
209 return sysctl_sched_rt_runtime >= 0;
205} 210}
206 211
207static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 212static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
208{ 213{
209 ktime_t now; 214 ktime_t now;
210 215
211 if (rt_b->rt_runtime == RUNTIME_INF) 216 if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
212 return; 217 return;
213 218
214 if (hrtimer_active(&rt_b->rt_period_timer)) 219 if (hrtimer_active(&rt_b->rt_period_timer))
@@ -298,9 +303,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
298static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 303static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
299static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 304static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
300#endif /* CONFIG_RT_GROUP_SCHED */ 305#endif /* CONFIG_RT_GROUP_SCHED */
301#else /* !CONFIG_FAIR_GROUP_SCHED */ 306#else /* !CONFIG_USER_SCHED */
302#define root_task_group init_task_group 307#define root_task_group init_task_group
303#endif /* CONFIG_FAIR_GROUP_SCHED */ 308#endif /* CONFIG_USER_SCHED */
304 309
305/* task_group_lock serializes add/remove of task groups and also changes to 310/* task_group_lock serializes add/remove of task groups and also changes to
306 * a task group's cpu shares. 311 * a task group's cpu shares.
@@ -604,9 +609,9 @@ struct rq {
604 609
605static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 610static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
606 611
607static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) 612static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
608{ 613{
609 rq->curr->sched_class->check_preempt_curr(rq, p); 614 rq->curr->sched_class->check_preempt_curr(rq, p, sync);
610} 615}
611 616
612static inline int cpu_of(struct rq *rq) 617static inline int cpu_of(struct rq *rq)
@@ -1087,7 +1092,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1087 return NOTIFY_DONE; 1092 return NOTIFY_DONE;
1088} 1093}
1089 1094
1090static void init_hrtick(void) 1095static __init void init_hrtick(void)
1091{ 1096{
1092 hotcpu_notifier(hotplug_hrtick, 0); 1097 hotcpu_notifier(hotplug_hrtick, 0);
1093} 1098}
@@ -1102,7 +1107,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
1102 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); 1107 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
1103} 1108}
1104 1109
1105static void init_hrtick(void) 1110static inline void init_hrtick(void)
1106{ 1111{
1107} 1112}
1108#endif /* CONFIG_SMP */ 1113#endif /* CONFIG_SMP */
@@ -1119,9 +1124,9 @@ static void init_rq_hrtick(struct rq *rq)
1119 1124
1120 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1125 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1121 rq->hrtick_timer.function = hrtick; 1126 rq->hrtick_timer.function = hrtick;
1122 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 1127 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1123} 1128}
1124#else 1129#else /* CONFIG_SCHED_HRTICK */
1125static inline void hrtick_clear(struct rq *rq) 1130static inline void hrtick_clear(struct rq *rq)
1126{ 1131{
1127} 1132}
@@ -1133,7 +1138,7 @@ static inline void init_rq_hrtick(struct rq *rq)
1133static inline void init_hrtick(void) 1138static inline void init_hrtick(void)
1134{ 1139{
1135} 1140}
1136#endif 1141#endif /* CONFIG_SCHED_HRTICK */
1137 1142
1138/* 1143/*
1139 * resched_task - mark a task 'to be rescheduled now'. 1144 * resched_task - mark a task 'to be rescheduled now'.
@@ -1380,38 +1385,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1380 update_load_sub(&rq->load, load); 1385 update_load_sub(&rq->load, load);
1381} 1386}
1382 1387
1383#ifdef CONFIG_SMP 1388#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1384static unsigned long source_load(int cpu, int type); 1389typedef int (*tg_visitor)(struct task_group *, void *);
1385static unsigned long target_load(int cpu, int type);
1386static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1387
1388static unsigned long cpu_avg_load_per_task(int cpu)
1389{
1390 struct rq *rq = cpu_rq(cpu);
1391
1392 if (rq->nr_running)
1393 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1394
1395 return rq->avg_load_per_task;
1396}
1397
1398#ifdef CONFIG_FAIR_GROUP_SCHED
1399
1400typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
1401 1390
1402/* 1391/*
1403 * Iterate the full tree, calling @down when first entering a node and @up when 1392 * Iterate the full tree, calling @down when first entering a node and @up when
1404 * leaving it for the final time. 1393 * leaving it for the final time.
1405 */ 1394 */
1406static void 1395static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1407walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
1408{ 1396{
1409 struct task_group *parent, *child; 1397 struct task_group *parent, *child;
1398 int ret;
1410 1399
1411 rcu_read_lock(); 1400 rcu_read_lock();
1412 parent = &root_task_group; 1401 parent = &root_task_group;
1413down: 1402down:
1414 (*down)(parent, cpu, sd); 1403 ret = (*down)(parent, data);
1404 if (ret)
1405 goto out_unlock;
1415 list_for_each_entry_rcu(child, &parent->children, siblings) { 1406 list_for_each_entry_rcu(child, &parent->children, siblings) {
1416 parent = child; 1407 parent = child;
1417 goto down; 1408 goto down;
@@ -1419,14 +1410,42 @@ down:
1419up: 1410up:
1420 continue; 1411 continue;
1421 } 1412 }
1422 (*up)(parent, cpu, sd); 1413 ret = (*up)(parent, data);
1414 if (ret)
1415 goto out_unlock;
1423 1416
1424 child = parent; 1417 child = parent;
1425 parent = parent->parent; 1418 parent = parent->parent;
1426 if (parent) 1419 if (parent)
1427 goto up; 1420 goto up;
1421out_unlock:
1428 rcu_read_unlock(); 1422 rcu_read_unlock();
1423
1424 return ret;
1425}
1426
1427static int tg_nop(struct task_group *tg, void *data)
1428{
1429 return 0;
1429} 1430}
1431#endif
1432
1433#ifdef CONFIG_SMP
1434static unsigned long source_load(int cpu, int type);
1435static unsigned long target_load(int cpu, int type);
1436static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1437
1438static unsigned long cpu_avg_load_per_task(int cpu)
1439{
1440 struct rq *rq = cpu_rq(cpu);
1441
1442 if (rq->nr_running)
1443 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1444
1445 return rq->avg_load_per_task;
1446}
1447
1448#ifdef CONFIG_FAIR_GROUP_SCHED
1430 1449
1431static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1450static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1432 1451
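The reworked walk_tg_tree() threads an opaque void *data cookie through int-returning visitors and aborts the walk on the first non-zero return. A self-contained userspace model of that contract (recursive for brevity, unlike the iterative kernel version; all names hypothetical):

#include <stdio.h>

struct node { int id; struct node *child, *sibling; };
typedef int (*visitor)(struct node *, void *);

/* Depth-first walk: 'down' on entry, 'up' on final exit, stop on non-zero. */
static int walk(struct node *n, visitor down, visitor up, void *data)
{
        struct node *c;
        int ret = down(n, data);

        if (ret)
                return ret;
        for (c = n->child; c; c = c->sibling) {
                ret = walk(c, down, up, data);
                if (ret)
                        return ret;
        }
        return up(n, data);
}

static int nop(struct node *n, void *data) { return 0; }

static int count(struct node *n, void *data)
{
        ++*(int *)data;
        return 0;                       /* zero means keep walking */
}

int main(void)
{
        struct node leaf = { 2, NULL, NULL };
        struct node root = { 1, &leaf, NULL };
        int seen = 0;

        walk(&root, count, nop, &seen); /* like walk_tg_tree(tg_load_down, tg_nop, ...) */
        printf("visited %d groups\n", seen);
        return 0;
}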
@@ -1486,11 +1505,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
1486 * This needs to be done in a bottom-up fashion because the rq weight of a 1505 * This needs to be done in a bottom-up fashion because the rq weight of a
1487 * parent group depends on the shares of its child groups. 1506 * parent group depends on the shares of its child groups.
1488 */ 1507 */
1489static void 1508static int tg_shares_up(struct task_group *tg, void *data)
1490tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1491{ 1509{
1492 unsigned long rq_weight = 0; 1510 unsigned long rq_weight = 0;
1493 unsigned long shares = 0; 1511 unsigned long shares = 0;
1512 struct sched_domain *sd = data;
1494 int i; 1513 int i;
1495 1514
1496 for_each_cpu_mask(i, sd->span) { 1515 for_each_cpu_mask(i, sd->span) {
@@ -1515,6 +1534,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1515 __update_group_shares_cpu(tg, i, shares, rq_weight); 1534 __update_group_shares_cpu(tg, i, shares, rq_weight);
1516 spin_unlock_irqrestore(&rq->lock, flags); 1535 spin_unlock_irqrestore(&rq->lock, flags);
1517 } 1536 }
1537
1538 return 0;
1518} 1539}
1519 1540
1520/* 1541/*
@@ -1522,10 +1543,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1522 * This needs to be done in a top-down fashion because the load of a child 1543 * This needs to be done in a top-down fashion because the load of a child
1523 * group is a fraction of its parents load. 1544 * group is a fraction of its parents load.
1524 */ 1545 */
1525static void 1546static int tg_load_down(struct task_group *tg, void *data)
1526tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1527{ 1547{
1528 unsigned long load; 1548 unsigned long load;
1549 long cpu = (long)data;
1529 1550
1530 if (!tg->parent) { 1551 if (!tg->parent) {
1531 load = cpu_rq(cpu)->load.weight; 1552 load = cpu_rq(cpu)->load.weight;
@@ -1536,11 +1557,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1536 } 1557 }
1537 1558
1538 tg->cfs_rq[cpu]->h_load = load; 1559 tg->cfs_rq[cpu]->h_load = load;
1539}
1540 1560
1541static void 1561 return 0;
1542tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
1543{
1544} 1562}
1545 1563
1546static void update_shares(struct sched_domain *sd) 1564static void update_shares(struct sched_domain *sd)
@@ -1550,7 +1568,7 @@ static void update_shares(struct sched_domain *sd)
1550 1568
1551 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1569 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1552 sd->last_update = now; 1570 sd->last_update = now;
1553 walk_tg_tree(tg_nop, tg_shares_up, 0, sd); 1571 walk_tg_tree(tg_nop, tg_shares_up, sd);
1554 } 1572 }
1555} 1573}
1556 1574
@@ -1561,9 +1579,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1561 spin_lock(&rq->lock); 1579 spin_lock(&rq->lock);
1562} 1580}
1563 1581
1564static void update_h_load(int cpu) 1582static void update_h_load(long cpu)
1565{ 1583{
1566 walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); 1584 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1567} 1585}
1568 1586
1569#else 1587#else
@@ -1921,11 +1939,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1921 running = task_running(rq, p); 1939 running = task_running(rq, p);
1922 on_rq = p->se.on_rq; 1940 on_rq = p->se.on_rq;
1923 ncsw = 0; 1941 ncsw = 0;
1924 if (!match_state || p->state == match_state) { 1942 if (!match_state || p->state == match_state)
1925 ncsw = p->nivcsw + p->nvcsw; 1943 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
1926 if (unlikely(!ncsw))
1927 ncsw = 1;
1928 }
1929 task_rq_unlock(rq, &flags); 1944 task_rq_unlock(rq, &flags);
1930 1945
1931 /* 1946 /*
@@ -2285,7 +2300,7 @@ out_running:
2285 trace_mark(kernel_sched_wakeup, 2300 trace_mark(kernel_sched_wakeup,
2286 "pid %d state %ld ## rq %p task %p rq->curr %p", 2301 "pid %d state %ld ## rq %p task %p rq->curr %p",
2287 p->pid, p->state, rq, p, rq->curr); 2302 p->pid, p->state, rq, p, rq->curr);
2288 check_preempt_curr(rq, p); 2303 check_preempt_curr(rq, p, sync);
2289 2304
2290 p->state = TASK_RUNNING; 2305 p->state = TASK_RUNNING;
2291#ifdef CONFIG_SMP 2306#ifdef CONFIG_SMP
@@ -2420,7 +2435,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2420 trace_mark(kernel_sched_wakeup_new, 2435 trace_mark(kernel_sched_wakeup_new,
2421 "pid %d state %ld ## rq %p task %p rq->curr %p", 2436 "pid %d state %ld ## rq %p task %p rq->curr %p",
2422 p->pid, p->state, rq, p, rq->curr); 2437 p->pid, p->state, rq, p, rq->curr);
2423 check_preempt_curr(rq, p); 2438 check_preempt_curr(rq, p, 0);
2424#ifdef CONFIG_SMP 2439#ifdef CONFIG_SMP
2425 if (p->sched_class->task_wake_up) 2440 if (p->sched_class->task_wake_up)
2426 p->sched_class->task_wake_up(rq, p); 2441 p->sched_class->task_wake_up(rq, p);
@@ -2880,7 +2895,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
2880 * Note that idle threads have a prio of MAX_PRIO, for this test 2895 * Note that idle threads have a prio of MAX_PRIO, for this test
2881 * to be always true for them. 2896 * to be always true for them.
2882 */ 2897 */
2883 check_preempt_curr(this_rq, p); 2898 check_preempt_curr(this_rq, p, 0);
2884} 2899}
2885 2900
2886/* 2901/*
@@ -4627,6 +4642,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4627} 4642}
4628EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 4643EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
4629 4644
4645/**
4646 * complete: - signals a single thread waiting on this completion
4647 * @x: holds the state of this particular completion
4648 *
4649 * This will wake up a single thread waiting on this completion. Threads will be
4650 * awakened in the same order in which they were queued.
4651 *
4652 * See also complete_all(), wait_for_completion() and related routines.
4653 */
4630void complete(struct completion *x) 4654void complete(struct completion *x)
4631{ 4655{
4632 unsigned long flags; 4656 unsigned long flags;
@@ -4638,6 +4662,12 @@ void complete(struct completion *x)
4638} 4662}
4639EXPORT_SYMBOL(complete); 4663EXPORT_SYMBOL(complete);
4640 4664
4665/**
4666 * complete_all: - signals all threads waiting on this completion
4667 * @x: holds the state of this particular completion
4668 *
4669 * This will wake up all threads waiting on this particular completion event.
4670 */
4641void complete_all(struct completion *x) 4671void complete_all(struct completion *x)
4642{ 4672{
4643 unsigned long flags; 4673 unsigned long flags;
@@ -4658,10 +4688,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
4658 wait.flags |= WQ_FLAG_EXCLUSIVE; 4688 wait.flags |= WQ_FLAG_EXCLUSIVE;
4659 __add_wait_queue_tail(&x->wait, &wait); 4689 __add_wait_queue_tail(&x->wait, &wait);
4660 do { 4690 do {
4661 if ((state == TASK_INTERRUPTIBLE && 4691 if (signal_pending_state(state, current)) {
4662 signal_pending(current)) ||
4663 (state == TASK_KILLABLE &&
4664 fatal_signal_pending(current))) {
4665 timeout = -ERESTARTSYS; 4692 timeout = -ERESTARTSYS;
4666 break; 4693 break;
4667 } 4694 }
@@ -4689,12 +4716,31 @@ wait_for_common(struct completion *x, long timeout, int state)
4689 return timeout; 4716 return timeout;
4690} 4717}
4691 4718
4719/**
4720 * wait_for_completion: - waits for completion of a task
4721 * @x: holds the state of this particular completion
4722 *
4723 * This waits to be signaled for completion of a specific task. It is NOT
4724 * interruptible and there is no timeout.
4725 *
4726 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
4727 * and interrupt capability. Also see complete().
4728 */
4692void __sched wait_for_completion(struct completion *x) 4729void __sched wait_for_completion(struct completion *x)
4693{ 4730{
4694 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 4731 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4695} 4732}
4696EXPORT_SYMBOL(wait_for_completion); 4733EXPORT_SYMBOL(wait_for_completion);
4697 4734
4735/**
4736 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
4737 * @x: holds the state of this particular completion
4738 * @timeout: timeout value in jiffies
4739 *
4740 * This waits for either a completion of a specific task to be signaled or for a
4741 * specified timeout to expire. The timeout is in jiffies. It is not
4742 * interruptible.
4743 */
4698unsigned long __sched 4744unsigned long __sched
4699wait_for_completion_timeout(struct completion *x, unsigned long timeout) 4745wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4700{ 4746{
@@ -4702,6 +4748,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4702} 4748}
4703EXPORT_SYMBOL(wait_for_completion_timeout); 4749EXPORT_SYMBOL(wait_for_completion_timeout);
4704 4750
4751/**
4752 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
4753 * @x: holds the state of this particular completion
4754 *
4755 * This waits for completion of a specific task to be signaled. It is
4756 * interruptible.
4757 */
4705int __sched wait_for_completion_interruptible(struct completion *x) 4758int __sched wait_for_completion_interruptible(struct completion *x)
4706{ 4759{
4707 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 4760 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
@@ -4711,6 +4764,14 @@ int __sched wait_for_completion_interruptible(struct completion *x)
4711} 4764}
4712EXPORT_SYMBOL(wait_for_completion_interruptible); 4765EXPORT_SYMBOL(wait_for_completion_interruptible);
4713 4766
4767/**
4768 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
4769 * @x: holds the state of this particular completion
4770 * @timeout: timeout value in jiffies
4771 *
4772 * This waits for either a completion of a specific task to be signaled or for a
4773 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4774 */
4714unsigned long __sched 4775unsigned long __sched
4715wait_for_completion_interruptible_timeout(struct completion *x, 4776wait_for_completion_interruptible_timeout(struct completion *x,
4716 unsigned long timeout) 4777 unsigned long timeout)
@@ -4719,6 +4780,13 @@ wait_for_completion_interruptible_timeout(struct completion *x,
4719} 4780}
4720EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 4781EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4721 4782
4783/**
4784 * wait_for_completion_killable: - waits for completion of a task (killable)
4785 * @x: holds the state of this particular completion
4786 *
4787 * This waits to be signaled for completion of a specific task. It can be
4788 * interrupted by a kill signal.
4789 */
4722int __sched wait_for_completion_killable(struct completion *x) 4790int __sched wait_for_completion_killable(struct completion *x)
4723{ 4791{
4724 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 4792 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
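The kernel-doc added above describes the two halves of the completion API: one context blocks in wait_for_completion*() while another signals with complete() or complete_all(). A minimal kernel-style pairing is sketched below; the worker function and its purpose are invented, and error handling is reduced to the essentials:

#include <linux/completion.h>
#include <linux/err.h>
#include <linux/kthread.h>

static DECLARE_COMPLETION(setup_done);

static int worker_fn(void *unused)
{
        /* ... perform the one-off setup work ... */
        complete(&setup_done);          /* wakes exactly one waiter, in queue order */
        return 0;
}

static int start_and_wait(void)
{
        struct task_struct *t = kthread_run(worker_fn, NULL, "setup-worker");

        if (IS_ERR(t))
                return PTR_ERR(t);

        /* uninterruptible and without timeout; see the *_timeout()/*_killable()
         * variants documented above for the other flavours */
        wait_for_completion(&setup_done);
        return 0;
}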
@@ -5121,7 +5189,8 @@ recheck:
5121 * Do not allow realtime tasks into groups that have no runtime 5189 * Do not allow realtime tasks into groups that have no runtime
5122 * assigned. 5190 * assigned.
5123 */ 5191 */
5124 if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) 5192 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5193 task_group(p)->rt_bandwidth.rt_runtime == 0)
5125 return -EPERM; 5194 return -EPERM;
5126#endif 5195#endif
5127 5196
@@ -5957,7 +6026,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5957 set_task_cpu(p, dest_cpu); 6026 set_task_cpu(p, dest_cpu);
5958 if (on_rq) { 6027 if (on_rq) {
5959 activate_task(rq_dest, p, 0); 6028 activate_task(rq_dest, p, 0);
5960 check_preempt_curr(rq_dest, p); 6029 check_preempt_curr(rq_dest, p, 0);
5961 } 6030 }
5962done: 6031done:
5963 ret = 1; 6032 ret = 1;
@@ -6282,7 +6351,7 @@ set_table_entry(struct ctl_table *entry,
6282static struct ctl_table * 6351static struct ctl_table *
6283sd_alloc_ctl_domain_table(struct sched_domain *sd) 6352sd_alloc_ctl_domain_table(struct sched_domain *sd)
6284{ 6353{
6285 struct ctl_table *table = sd_alloc_ctl_entry(12); 6354 struct ctl_table *table = sd_alloc_ctl_entry(13);
6286 6355
6287 if (table == NULL) 6356 if (table == NULL)
6288 return NULL; 6357 return NULL;
@@ -6310,7 +6379,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
6310 sizeof(int), 0644, proc_dointvec_minmax); 6379 sizeof(int), 0644, proc_dointvec_minmax);
6311 set_table_entry(&table[10], "flags", &sd->flags, 6380 set_table_entry(&table[10], "flags", &sd->flags,
6312 sizeof(int), 0644, proc_dointvec_minmax); 6381 sizeof(int), 0644, proc_dointvec_minmax);
6313 /* &table[11] is terminator */ 6382 set_table_entry(&table[11], "name", sd->name,
6383 CORENAME_MAX_SIZE, 0444, proc_dostring);
6384 /* &table[12] is terminator */
6314 6385
6315 return table; 6386 return table;
6316} 6387}
@@ -7194,13 +7265,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7194 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 7265 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
7195 */ 7266 */
7196 7267
7268#ifdef CONFIG_SCHED_DEBUG
7269# define SD_INIT_NAME(sd, type) sd->name = #type
7270#else
7271# define SD_INIT_NAME(sd, type) do { } while (0)
7272#endif
7273
7197#define SD_INIT(sd, type) sd_init_##type(sd) 7274#define SD_INIT(sd, type) sd_init_##type(sd)
7275
7198#define SD_INIT_FUNC(type) \ 7276#define SD_INIT_FUNC(type) \
7199static noinline void sd_init_##type(struct sched_domain *sd) \ 7277static noinline void sd_init_##type(struct sched_domain *sd) \
7200{ \ 7278{ \
7201 memset(sd, 0, sizeof(*sd)); \ 7279 memset(sd, 0, sizeof(*sd)); \
7202 *sd = SD_##type##_INIT; \ 7280 *sd = SD_##type##_INIT; \
7203 sd->level = SD_LV_##type; \ 7281 sd->level = SD_LV_##type; \
7282 SD_INIT_NAME(sd, type); \
7204} 7283}
7205 7284
7206SD_INIT_FUNC(CPU) 7285SD_INIT_FUNC(CPU)
@@ -8242,20 +8321,25 @@ void __might_sleep(char *file, int line)
8242#ifdef in_atomic 8321#ifdef in_atomic
8243 static unsigned long prev_jiffy; /* ratelimiting */ 8322 static unsigned long prev_jiffy; /* ratelimiting */
8244 8323
8245 if ((in_atomic() || irqs_disabled()) && 8324 if ((!in_atomic() && !irqs_disabled()) ||
8246 system_state == SYSTEM_RUNNING && !oops_in_progress) { 8325 system_state != SYSTEM_RUNNING || oops_in_progress)
8247 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 8326 return;
8248 return; 8327 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8249 prev_jiffy = jiffies; 8328 return;
8250 printk(KERN_ERR "BUG: sleeping function called from invalid" 8329 prev_jiffy = jiffies;
8251 " context at %s:%d\n", file, line); 8330
8252 printk("in_atomic():%d, irqs_disabled():%d\n", 8331 printk(KERN_ERR
8253 in_atomic(), irqs_disabled()); 8332 "BUG: sleeping function called from invalid context at %s:%d\n",
8254 debug_show_held_locks(current); 8333 file, line);
8255 if (irqs_disabled()) 8334 printk(KERN_ERR
8256 print_irqtrace_events(current); 8335 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8257 dump_stack(); 8336 in_atomic(), irqs_disabled(),
8258 } 8337 current->pid, current->comm);
8338
8339 debug_show_held_locks(current);
8340 if (irqs_disabled())
8341 print_irqtrace_events(current);
8342 dump_stack();
8259#endif 8343#endif
8260} 8344}
8261EXPORT_SYMBOL(__might_sleep); 8345EXPORT_SYMBOL(__might_sleep);
@@ -8753,73 +8837,95 @@ static DEFINE_MUTEX(rt_constraints_mutex);
8753static unsigned long to_ratio(u64 period, u64 runtime) 8837static unsigned long to_ratio(u64 period, u64 runtime)
8754{ 8838{
8755 if (runtime == RUNTIME_INF) 8839 if (runtime == RUNTIME_INF)
8756 return 1ULL << 16; 8840 return 1ULL << 20;
8757 8841
8758 return div64_u64(runtime << 16, period); 8842 return div64_u64(runtime << 20, period);
8759} 8843}
8760 8844
8761#ifdef CONFIG_CGROUP_SCHED 8845/* Must be called with tasklist_lock held */
8762static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8846static inline int tg_has_rt_tasks(struct task_group *tg)
8763{ 8847{
8764 struct task_group *tgi, *parent = tg->parent; 8848 struct task_struct *g, *p;
8765 unsigned long total = 0;
8766 8849
8767 if (!parent) { 8850 do_each_thread(g, p) {
8768 if (global_rt_period() < period) 8851 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
8769 return 0; 8852 return 1;
8853 } while_each_thread(g, p);
8770 8854
8771 return to_ratio(period, runtime) < 8855 return 0;
8772 to_ratio(global_rt_period(), global_rt_runtime()); 8856}
8773 }
8774 8857
8775 if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) 8858struct rt_schedulable_data {
8776 return 0; 8859 struct task_group *tg;
8860 u64 rt_period;
8861 u64 rt_runtime;
8862};
8777 8863
8778 rcu_read_lock(); 8864static int tg_schedulable(struct task_group *tg, void *data)
8779 list_for_each_entry_rcu(tgi, &parent->children, siblings) { 8865{
8780 if (tgi == tg) 8866 struct rt_schedulable_data *d = data;
8781 continue; 8867 struct task_group *child;
8868 unsigned long total, sum = 0;
8869 u64 period, runtime;
8870
8871 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8872 runtime = tg->rt_bandwidth.rt_runtime;
8782 8873
8783 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), 8874 if (tg == d->tg) {
8784 tgi->rt_bandwidth.rt_runtime); 8875 period = d->rt_period;
8876 runtime = d->rt_runtime;
8785 } 8877 }
8786 rcu_read_unlock();
8787 8878
8788 return total + to_ratio(period, runtime) <= 8879 /*
8789 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), 8880 * Cannot have more runtime than the period.
8790 parent->rt_bandwidth.rt_runtime); 8881 */
8791} 8882 if (runtime > period && runtime != RUNTIME_INF)
8792#elif defined CONFIG_USER_SCHED 8883 return -EINVAL;
8793static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8794{
8795 struct task_group *tgi;
8796 unsigned long total = 0;
8797 unsigned long global_ratio =
8798 to_ratio(global_rt_period(), global_rt_runtime());
8799 8884
8800 rcu_read_lock(); 8885 /*
8801 list_for_each_entry_rcu(tgi, &task_groups, list) { 8886 * Ensure we don't starve existing RT tasks.
8802 if (tgi == tg) 8887 */
8803 continue; 8888 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8889 return -EBUSY;
8890
8891 total = to_ratio(period, runtime);
8804 8892
8805 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), 8893 /*
8806 tgi->rt_bandwidth.rt_runtime); 8894 * Nobody can have more than the global setting allows.
8895 */
8896 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8897 return -EINVAL;
8898
8899 /*
8900 * The sum of our children's runtime should not exceed our own.
8901 */
8902 list_for_each_entry_rcu(child, &tg->children, siblings) {
8903 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8904 runtime = child->rt_bandwidth.rt_runtime;
8905
8906 if (child == d->tg) {
8907 period = d->rt_period;
8908 runtime = d->rt_runtime;
8909 }
8910
8911 sum += to_ratio(period, runtime);
8807 } 8912 }
8808 rcu_read_unlock();
8809 8913
8810 return total + to_ratio(period, runtime) < global_ratio; 8914 if (sum > total)
8915 return -EINVAL;
8916
8917 return 0;
8811} 8918}
8812#endif
8813 8919
8814/* Must be called with tasklist_lock held */ 8920static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8815static inline int tg_has_rt_tasks(struct task_group *tg)
8816{ 8921{
8817 struct task_struct *g, *p; 8922 struct rt_schedulable_data data = {
8818 do_each_thread(g, p) { 8923 .tg = tg,
8819 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 8924 .rt_period = period,
8820 return 1; 8925 .rt_runtime = runtime,
8821 } while_each_thread(g, p); 8926 };
8822 return 0; 8927
8928 return walk_tg_tree(tg_schedulable, tg_nop, &data);
8823} 8929}
8824 8930
8825static int tg_set_bandwidth(struct task_group *tg, 8931static int tg_set_bandwidth(struct task_group *tg,
@@ -8829,14 +8935,9 @@ static int tg_set_bandwidth(struct task_group *tg,
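to_ratio() now keeps 20 fractional bits instead of 16, and tg_schedulable() rejects a configuration whose children sum to more RT bandwidth than the parent provides. A small worked example of that fixed-point arithmetic (userspace C; the 1s period / 950ms runtime defaults and the two child groups are assumptions for illustration):

#include <stdio.h>
#include <stdint.h>

/* Same fixed-point conversion as the kernel's to_ratio(), 20 fractional bits. */
static uint64_t to_ratio(uint64_t period_ns, uint64_t runtime_ns)
{
        return (runtime_ns << 20) / period_ns;
}

int main(void)
{
        /* default global limits: 1s period, 950ms runtime */
        uint64_t parent = to_ratio(1000000000ULL, 950000000ULL);    /* ~0.95 * 2^20 */
        /* two hypothetical child groups asking for 600ms and 400ms of each second */
        uint64_t sum = to_ratio(1000000000ULL, 600000000ULL) +
                       to_ratio(1000000000ULL, 400000000ULL);       /* ~1.00 * 2^20 */

        printf("parent=%llu children=%llu -> %s\n",
               (unsigned long long)parent, (unsigned long long)sum,
               sum > parent ? "-EINVAL (over-committed)" : "ok");
        return 0;
}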
8829 8935
8830 mutex_lock(&rt_constraints_mutex); 8936 mutex_lock(&rt_constraints_mutex);
8831 read_lock(&tasklist_lock); 8937 read_lock(&tasklist_lock);
8832 if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { 8938 err = __rt_schedulable(tg, rt_period, rt_runtime);
8833 err = -EBUSY; 8939 if (err)
8834 goto unlock;
8835 }
8836 if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
8837 err = -EINVAL;
8838 goto unlock; 8940 goto unlock;
8839 }
8840 8941
8841 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8942 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8842 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 8943 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@ -8905,16 +9006,25 @@ long sched_group_rt_period(struct task_group *tg)
8905 9006
8906static int sched_rt_global_constraints(void) 9007static int sched_rt_global_constraints(void)
8907{ 9008{
8908 struct task_group *tg = &root_task_group; 9009 u64 runtime, period;
8909 u64 rt_runtime, rt_period;
8910 int ret = 0; 9010 int ret = 0;
8911 9011
8912 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 9012 if (sysctl_sched_rt_period <= 0)
8913 rt_runtime = tg->rt_bandwidth.rt_runtime; 9013 return -EINVAL;
9014
9015 runtime = global_rt_runtime();
9016 period = global_rt_period();
9017
9018 /*
9019 * Sanity check on the sysctl variables.
9020 */
9021 if (runtime > period && runtime != RUNTIME_INF)
9022 return -EINVAL;
8914 9023
8915 mutex_lock(&rt_constraints_mutex); 9024 mutex_lock(&rt_constraints_mutex);
8916 if (!__rt_schedulable(tg, rt_period, rt_runtime)) 9025 read_lock(&tasklist_lock);
8917 ret = -EINVAL; 9026 ret = __rt_schedulable(NULL, 0, 0);
9027 read_unlock(&tasklist_lock);
8918 mutex_unlock(&rt_constraints_mutex); 9028 mutex_unlock(&rt_constraints_mutex);
8919 9029
8920 return ret; 9030 return ret;
@@ -8925,6 +9035,9 @@ static int sched_rt_global_constraints(void)
8925 unsigned long flags; 9035 unsigned long flags;
8926 int i; 9036 int i;
8927 9037
9038 if (sysctl_sched_rt_period <= 0)
9039 return -EINVAL;
9040
8928 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 9041 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
8929 for_each_possible_cpu(i) { 9042 for_each_possible_cpu(i) {
8930 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 9043 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
@@ -8985,7 +9098,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8985 9098
8986 if (!cgrp->parent) { 9099 if (!cgrp->parent) {
8987 /* This is early initialization for the top cgroup */ 9100 /* This is early initialization for the top cgroup */
8988 init_task_group.css.cgroup = cgrp;
8989 return &init_task_group.css; 9101 return &init_task_group.css;
8990 } 9102 }
8991 9103
@@ -8994,9 +9106,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8994 if (IS_ERR(tg)) 9106 if (IS_ERR(tg))
8995 return ERR_PTR(-ENOMEM); 9107 return ERR_PTR(-ENOMEM);
8996 9108
8997 /* Bind the cgroup to task_group object we just created */
8998 tg->css.cgroup = cgrp;
8999
9000 return &tg->css; 9109 return &tg->css;
9001} 9110}
9002 9111
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index e8ab096ddfe3..81787248b60f 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -118,13 +118,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
118 118
119 /* 119 /*
120 * scd->clock = clamp(scd->tick_gtod + delta, 120 * scd->clock = clamp(scd->tick_gtod + delta,
121 * max(scd->tick_gtod, scd->clock), 121 * max(scd->tick_gtod, scd->clock),
122 * scd->tick_gtod + TICK_NSEC); 122 * max(scd->clock, scd->tick_gtod + TICK_NSEC));
123 */ 123 */
124 124
125 clock = scd->tick_gtod + delta; 125 clock = scd->tick_gtod + delta;
126 min_clock = wrap_max(scd->tick_gtod, scd->clock); 126 min_clock = wrap_max(scd->tick_gtod, scd->clock);
127 max_clock = scd->tick_gtod + TICK_NSEC; 127 max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC);
128 128
129 clock = wrap_max(clock, min_clock); 129 clock = wrap_max(clock, min_clock);
130 clock = wrap_min(clock, max_clock); 130 clock = wrap_min(clock, max_clock);
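The sched_clock.c change raises the clamp ceiling to max(scd->clock, scd->tick_gtod + TICK_NSEC), so a per-cpu clock that has already run ahead is never pulled backwards. A toy numeric model of the old versus new clamp (userspace C; wrap_max/wrap_min reduced to plain max/min and TICK_NSEC assumed to be 1ms):

#include <stdio.h>
#include <stdint.h>

#define TICK_NSEC 1000000ULL            /* assume 1ms ticks for the example */

static uint64_t max64(uint64_t a, uint64_t b) { return a > b ? a : b; }
static uint64_t min64(uint64_t a, uint64_t b) { return a < b ? a : b; }

int main(void)
{
        uint64_t tick_gtod = 5000000, prev_clock = 6500000, delta = 100000;
        uint64_t clock = tick_gtod + delta;

        uint64_t lo = max64(tick_gtod, prev_clock);
        uint64_t old_hi = tick_gtod + TICK_NSEC;        /* old ceiling: 6000000 */
        uint64_t new_hi = max64(prev_clock, old_hi);    /* new ceiling: 6500000 */

        printf("old clamp: %llu (goes backwards from %llu)\n",
               (unsigned long long)min64(max64(clock, lo), old_hi),
               (unsigned long long)prev_clock);
        printf("new clamp: %llu (monotonic)\n",
               (unsigned long long)min64(max64(clock, lo), new_hi));
        return 0;
}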
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index bbe6b31c3c56..ad958c1ec708 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -333,12 +333,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
333 unsigned long flags; 333 unsigned long flags;
334 int num_threads = 1; 334 int num_threads = 1;
335 335
336 rcu_read_lock();
337 if (lock_task_sighand(p, &flags)) { 336 if (lock_task_sighand(p, &flags)) {
338 num_threads = atomic_read(&p->signal->count); 337 num_threads = atomic_read(&p->signal->count);
339 unlock_task_sighand(p, &flags); 338 unlock_task_sighand(p, &flags);
340 } 339 }
341 rcu_read_unlock();
342 340
343 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); 341 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
344 SEQ_printf(m, 342 SEQ_printf(m,
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index fb8994c6d4bb..18fd17172eb6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -409,64 +409,6 @@ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
409} 409}
410 410
411/* 411/*
412 * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
413 * that it favours >=0 over <0.
414 *
415 * -20 |
416 * |
417 * 0 --------+-------
418 * .'
419 * 19 .'
420 *
421 */
422static unsigned long
423calc_delta_asym(unsigned long delta, struct sched_entity *se)
424{
425 struct load_weight lw = {
426 .weight = NICE_0_LOAD,
427 .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
428 };
429
430 for_each_sched_entity(se) {
431 struct load_weight *se_lw = &se->load;
432 unsigned long rw = cfs_rq_of(se)->load.weight;
433
434#ifdef CONFIG_FAIR_SCHED_GROUP
435 struct cfs_rq *cfs_rq = se->my_q;
436 struct task_group *tg = NULL
437
438 if (cfs_rq)
439 tg = cfs_rq->tg;
440
441 if (tg && tg->shares < NICE_0_LOAD) {
442 /*
443 * scale shares to what it would have been had
444 * tg->weight been NICE_0_LOAD:
445 *
446 * weight = 1024 * shares / tg->weight
447 */
448 lw.weight *= se->load.weight;
449 lw.weight /= tg->shares;
450
451 lw.inv_weight = 0;
452
453 se_lw = &lw;
454 rw += lw.weight - se->load.weight;
455 } else
456#endif
457
458 if (se->load.weight < NICE_0_LOAD) {
459 se_lw = &lw;
460 rw += NICE_0_LOAD - se->load.weight;
461 }
462
463 delta = calc_delta_mine(delta, rw, se_lw);
464 }
465
466 return delta;
467}
468
469/*
470 * Update the current task's runtime statistics. Skip current tasks that 412 * Update the current task's runtime statistics. Skip current tasks that
471 * are not in our scheduling class. 413 * are not in our scheduling class.
472 */ 414 */
@@ -586,11 +528,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
586 update_load_add(&cfs_rq->load, se->load.weight); 528 update_load_add(&cfs_rq->load, se->load.weight);
587 if (!parent_entity(se)) 529 if (!parent_entity(se))
588 inc_cpu_load(rq_of(cfs_rq), se->load.weight); 530 inc_cpu_load(rq_of(cfs_rq), se->load.weight);
589 if (entity_is_task(se)) 531 if (entity_is_task(se)) {
590 add_cfs_task_weight(cfs_rq, se->load.weight); 532 add_cfs_task_weight(cfs_rq, se->load.weight);
533 list_add(&se->group_node, &cfs_rq->tasks);
534 }
591 cfs_rq->nr_running++; 535 cfs_rq->nr_running++;
592 se->on_rq = 1; 536 se->on_rq = 1;
593 list_add(&se->group_node, &cfs_rq->tasks);
594} 537}
595 538
596static void 539static void
@@ -599,11 +542,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
599 update_load_sub(&cfs_rq->load, se->load.weight); 542 update_load_sub(&cfs_rq->load, se->load.weight);
600 if (!parent_entity(se)) 543 if (!parent_entity(se))
601 dec_cpu_load(rq_of(cfs_rq), se->load.weight); 544 dec_cpu_load(rq_of(cfs_rq), se->load.weight);
602 if (entity_is_task(se)) 545 if (entity_is_task(se)) {
603 add_cfs_task_weight(cfs_rq, -se->load.weight); 546 add_cfs_task_weight(cfs_rq, -se->load.weight);
547 list_del_init(&se->group_node);
548 }
604 cfs_rq->nr_running--; 549 cfs_rq->nr_running--;
605 se->on_rq = 0; 550 se->on_rq = 0;
606 list_del_init(&se->group_node);
607} 551}
608 552
609static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 553static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -1085,7 +1029,6 @@ static long effective_load(struct task_group *tg, int cpu,
1085 long wl, long wg) 1029 long wl, long wg)
1086{ 1030{
1087 struct sched_entity *se = tg->se[cpu]; 1031 struct sched_entity *se = tg->se[cpu];
1088 long more_w;
1089 1032
1090 if (!tg->parent) 1033 if (!tg->parent)
1091 return wl; 1034 return wl;
@@ -1097,18 +1040,17 @@ static long effective_load(struct task_group *tg, int cpu,
1097 if (!wl && sched_feat(ASYM_EFF_LOAD)) 1040 if (!wl && sched_feat(ASYM_EFF_LOAD))
1098 return wl; 1041 return wl;
1099 1042
1100 /*
1101 * Instead of using this increment, also add the difference
1102 * between when the shares were last updated and now.
1103 */
1104 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1105 wl += more_w;
1106 wg += more_w;
1107
1108 for_each_sched_entity(se) { 1043 for_each_sched_entity(se) {
1109#define D(n) (likely(n) ? (n) : 1)
1110
1111 long S, rw, s, a, b; 1044 long S, rw, s, a, b;
1045 long more_w;
1046
1047 /*
1048 * Instead of using this increment, also add the difference
1049 * between when the shares were last updated and now.
1050 */
1051 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1052 wl += more_w;
1053 wg += more_w;
1112 1054
1113 S = se->my_q->tg->shares; 1055 S = se->my_q->tg->shares;
1114 s = se->my_q->shares; 1056 s = se->my_q->shares;
@@ -1117,7 +1059,11 @@ static long effective_load(struct task_group *tg, int cpu,
1117 a = S*(rw + wl); 1059 a = S*(rw + wl);
1118 b = S*rw + s*wg; 1060 b = S*rw + s*wg;
1119 1061
1120 wl = s*(a-b)/D(b); 1062 wl = s*(a-b);
1063
1064 if (likely(b))
1065 wl /= b;
1066
1121 /* 1067 /*
1122 * Assume the group is already running and will 1068 * Assume the group is already running and will
1123 * thus already be accounted for in the weight. 1069 * thus already be accounted for in the weight.
@@ -1126,7 +1072,6 @@ static long effective_load(struct task_group *tg, int cpu,
1126 * alter the group weight. 1072 * alter the group weight.
1127 */ 1073 */
1128 wg = 0; 1074 wg = 0;
1129#undef D
1130 } 1075 }
1131 1076
1132 return wl; 1077 return wl;
@@ -1143,7 +1088,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1143#endif 1088#endif
1144 1089
1145static int 1090static int
1146wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, 1091wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1147 struct task_struct *p, int prev_cpu, int this_cpu, int sync, 1092 struct task_struct *p, int prev_cpu, int this_cpu, int sync,
1148 int idx, unsigned long load, unsigned long this_load, 1093 int idx, unsigned long load, unsigned long this_load,
1149 unsigned int imbalance) 1094 unsigned int imbalance)
@@ -1158,6 +1103,11 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
1158 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) 1103 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
1159 return 0; 1104 return 0;
1160 1105
1106 if (!sync && sched_feat(SYNC_WAKEUPS) &&
1107 curr->se.avg_overlap < sysctl_sched_migration_cost &&
1108 p->se.avg_overlap < sysctl_sched_migration_cost)
1109 sync = 1;
1110
1161 /* 1111 /*
1162 * If sync wakeup then subtract the (maximum possible) 1112 * If sync wakeup then subtract the (maximum possible)
1163 * effect of the currently running task from the load 1113 * effect of the currently running task from the load
@@ -1182,17 +1132,14 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
1182 * a reasonable amount of time then attract this newly 1132 * a reasonable amount of time then attract this newly
1183 * woken task: 1133 * woken task:
1184 */ 1134 */
1185 if (sync && balanced) { 1135 if (sync && balanced)
1186 if (curr->se.avg_overlap < sysctl_sched_migration_cost && 1136 return 1;
1187 p->se.avg_overlap < sysctl_sched_migration_cost)
1188 return 1;
1189 }
1190 1137
1191 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1138 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1192 tl_per_task = cpu_avg_load_per_task(this_cpu); 1139 tl_per_task = cpu_avg_load_per_task(this_cpu);
1193 1140
1194 if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || 1141 if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
1195 balanced) { 1142 tl_per_task)) {
1196 /* 1143 /*
1197 * This domain has SD_WAKE_AFFINE and 1144 * This domain has SD_WAKE_AFFINE and
1198 * p is cache cold in this domain, and 1145 * p is cache cold in this domain, and
@@ -1211,16 +1158,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1211 struct sched_domain *sd, *this_sd = NULL; 1158 struct sched_domain *sd, *this_sd = NULL;
1212 int prev_cpu, this_cpu, new_cpu; 1159 int prev_cpu, this_cpu, new_cpu;
1213 unsigned long load, this_load; 1160 unsigned long load, this_load;
1214 struct rq *rq, *this_rq; 1161 struct rq *this_rq;
1215 unsigned int imbalance; 1162 unsigned int imbalance;
1216 int idx; 1163 int idx;
1217 1164
1218 prev_cpu = task_cpu(p); 1165 prev_cpu = task_cpu(p);
1219 rq = task_rq(p);
1220 this_cpu = smp_processor_id(); 1166 this_cpu = smp_processor_id();
1221 this_rq = cpu_rq(this_cpu); 1167 this_rq = cpu_rq(this_cpu);
1222 new_cpu = prev_cpu; 1168 new_cpu = prev_cpu;
1223 1169
1170 if (prev_cpu == this_cpu)
1171 goto out;
1224 /* 1172 /*
1225 * 'this_sd' is the first domain that both 1173 * 'this_sd' is the first domain that both
1226 * this_cpu and prev_cpu are present in: 1174 * this_cpu and prev_cpu are present in:
@@ -1248,13 +1196,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1248 load = source_load(prev_cpu, idx); 1196 load = source_load(prev_cpu, idx);
1249 this_load = target_load(this_cpu, idx); 1197 this_load = target_load(this_cpu, idx);
1250 1198
1251 if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, 1199 if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
1252 load, this_load, imbalance)) 1200 load, this_load, imbalance))
1253 return this_cpu; 1201 return this_cpu;
1254 1202
1255 if (prev_cpu == this_cpu)
1256 goto out;
1257
1258 /* 1203 /*
1259 * Start passive balancing when half the imbalance_pct 1204 * Start passive balancing when half the imbalance_pct
1260 * limit is reached. 1205 * limit is reached.
@@ -1281,62 +1226,20 @@ static unsigned long wakeup_gran(struct sched_entity *se)
1281 * + nice tasks. 1226 * + nice tasks.
1282 */ 1227 */
1283 if (sched_feat(ASYM_GRAN)) 1228 if (sched_feat(ASYM_GRAN))
1284 gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); 1229 gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);
1285 else
1286 gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
1287 1230
1288 return gran; 1231 return gran;
1289} 1232}
1290 1233
1291/* 1234/*
1292 * Should 'se' preempt 'curr'.
1293 *
1294 * |s1
1295 * |s2
1296 * |s3
1297 * g
1298 * |<--->|c
1299 *
1300 * w(c, s1) = -1
1301 * w(c, s2) = 0
1302 * w(c, s3) = 1
1303 *
1304 */
1305static int
1306wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1307{
1308 s64 gran, vdiff = curr->vruntime - se->vruntime;
1309
1310 if (vdiff < 0)
1311 return -1;
1312
1313 gran = wakeup_gran(curr);
1314 if (vdiff > gran)
1315 return 1;
1316
1317 return 0;
1318}
1319
1320/* return depth at which a sched entity is present in the hierarchy */
1321static inline int depth_se(struct sched_entity *se)
1322{
1323 int depth = 0;
1324
1325 for_each_sched_entity(se)
1326 depth++;
1327
1328 return depth;
1329}
1330
1331/*
1332 * Preempt the current task with a newly woken task if needed: 1235 * Preempt the current task with a newly woken task if needed:
1333 */ 1236 */
1334static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) 1237static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1335{ 1238{
1336 struct task_struct *curr = rq->curr; 1239 struct task_struct *curr = rq->curr;
1337 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1240 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1338 struct sched_entity *se = &curr->se, *pse = &p->se; 1241 struct sched_entity *se = &curr->se, *pse = &p->se;
1339 int se_depth, pse_depth; 1242 s64 delta_exec;
1340 1243
1341 if (unlikely(rt_prio(p->prio))) { 1244 if (unlikely(rt_prio(p->prio))) {
1342 update_rq_clock(rq); 1245 update_rq_clock(rq);
@@ -1351,6 +1254,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1351 cfs_rq_of(pse)->next = pse; 1254 cfs_rq_of(pse)->next = pse;
1352 1255
1353 /* 1256 /*
 1257 * We can come here with TIF_NEED_RESCHED already set from the
 1258 * new-task wake-up path.
1259 */
1260 if (test_tsk_need_resched(curr))
1261 return;
1262
1263 /*
1354 * Batch tasks do not preempt (their preemption is driven by 1264 * Batch tasks do not preempt (their preemption is driven by
1355 * the tick): 1265 * the tick):
1356 */ 1266 */
@@ -1360,33 +1270,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1360 if (!sched_feat(WAKEUP_PREEMPT)) 1270 if (!sched_feat(WAKEUP_PREEMPT))
1361 return; 1271 return;
1362 1272
1363 /* 1273 if (sched_feat(WAKEUP_OVERLAP) && (sync ||
1364 * preemption test can be made between sibling entities who are in the 1274 (se->avg_overlap < sysctl_sched_migration_cost &&
1365 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of 1275 pse->avg_overlap < sysctl_sched_migration_cost))) {
1366 * both tasks until we find their ancestors who are siblings of common 1276 resched_task(curr);
1367 * parent. 1277 return;
1368 */
1369
1370 /* First walk up until both entities are at same depth */
1371 se_depth = depth_se(se);
1372 pse_depth = depth_se(pse);
1373
1374 while (se_depth > pse_depth) {
1375 se_depth--;
1376 se = parent_entity(se);
1377 }
1378
1379 while (pse_depth > se_depth) {
1380 pse_depth--;
1381 pse = parent_entity(pse);
1382 }
1383
1384 while (!is_same_group(se, pse)) {
1385 se = parent_entity(se);
1386 pse = parent_entity(pse);
1387 } 1278 }
1388 1279
1389 if (wakeup_preempt_entity(se, pse) == 1) 1280 delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
1281 if (delta_exec > wakeup_gran(pse))
1390 resched_task(curr); 1282 resched_task(curr);
1391} 1283}
1392 1284
@@ -1445,19 +1337,9 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
1445 if (next == &cfs_rq->tasks) 1337 if (next == &cfs_rq->tasks)
1446 return NULL; 1338 return NULL;
1447 1339
1448 /* Skip over entities that are not tasks */ 1340 se = list_entry(next, struct sched_entity, group_node);
1449 do { 1341 p = task_of(se);
1450 se = list_entry(next, struct sched_entity, group_node); 1342 cfs_rq->balance_iterator = next->next;
1451 next = next->next;
1452 } while (next != &cfs_rq->tasks && !entity_is_task(se));
1453
1454 if (next == &cfs_rq->tasks)
1455 return NULL;
1456
1457 cfs_rq->balance_iterator = next;
1458
1459 if (entity_is_task(se))
1460 p = task_of(se);
1461 1343
1462 return p; 1344 return p;
1463} 1345}
@@ -1507,7 +1389,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1507 rcu_read_lock(); 1389 rcu_read_lock();
1508 update_h_load(busiest_cpu); 1390 update_h_load(busiest_cpu);
1509 1391
1510 list_for_each_entry(tg, &task_groups, list) { 1392 list_for_each_entry_rcu(tg, &task_groups, list) {
1511 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; 1393 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
1512 unsigned long busiest_h_load = busiest_cfs_rq->h_load; 1394 unsigned long busiest_h_load = busiest_cfs_rq->h_load;
1513 unsigned long busiest_weight = busiest_cfs_rq->load.weight; 1395 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
@@ -1620,10 +1502,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1620 * 'current' within the tree based on its new key value. 1502 * 'current' within the tree based on its new key value.
1621 */ 1503 */
1622 swap(curr->vruntime, se->vruntime); 1504 swap(curr->vruntime, se->vruntime);
1505 resched_task(rq->curr);
1623 } 1506 }
1624 1507
1625 enqueue_task_fair(rq, p, 0); 1508 enqueue_task_fair(rq, p, 0);
1626 resched_task(rq->curr);
1627} 1509}
1628 1510
1629/* 1511/*
@@ -1642,7 +1524,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p,
1642 if (p->prio > oldprio) 1524 if (p->prio > oldprio)
1643 resched_task(rq->curr); 1525 resched_task(rq->curr);
1644 } else 1526 } else
1645 check_preempt_curr(rq, p); 1527 check_preempt_curr(rq, p, 0);
1646} 1528}
1647 1529
1648/* 1530/*
@@ -1659,7 +1541,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p,
1659 if (running) 1541 if (running)
1660 resched_task(rq->curr); 1542 resched_task(rq->curr);
1661 else 1543 else
1662 check_preempt_curr(rq, p); 1544 check_preempt_curr(rq, p, 0);
1663} 1545}
1664 1546
1665/* Account for a task changing its policy or group. 1547/* Account for a task changing its policy or group.
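
The effective_load() hunk above drops the D() macro in favour of an explicit guard before the divide. A minimal user-space sketch of that idiom, using illustrative names rather than the kernel's own:

#include <stdio.h>

/*
 * Sketch of the open-coded guard that replaces the old D() macro:
 * compute s*(a-b)/b, but skip the divide entirely when b is zero.
 */
static long scaled_delta(long s, long a, long b)
{
	long wl = s * (a - b);

	if (b)		/* the guard that used to hide inside D(b) */
		wl /= b;

	return wl;
}

int main(void)
{
	printf("%ld\n", scaled_delta(3, 10, 4));	/* 3*(10-4)/4 = 4 */
	printf("%ld\n", scaled_delta(3, 10, 0));	/* divide skipped: 30 */
	return 0;
}

The open-coded form keeps the common case on a predicted branch while making the zero-divisor handling visible at the call site instead of behind a macro.
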
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 9353ca78154e..7c9e8f4a049f 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -11,3 +11,4 @@ SCHED_FEAT(ASYM_GRAN, 1)
11SCHED_FEAT(LB_BIAS, 1) 11SCHED_FEAT(LB_BIAS, 1)
12SCHED_FEAT(LB_WAKEUP_UPDATE, 1) 12SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
13SCHED_FEAT(ASYM_EFF_LOAD, 1) 13SCHED_FEAT(ASYM_EFF_LOAD, 1)
14SCHED_FEAT(WAKEUP_OVERLAP, 0)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 3a4f92dbbe66..dec4ccabe2f5 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
14/* 14/*
15 * Idle tasks are unconditionally rescheduled: 15 * Idle tasks are unconditionally rescheduled:
16 */ 16 */
17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) 17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync)
18{ 18{
19 resched_task(rq->idle); 19 resched_task(rq->idle);
20} 20}
@@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p,
76 if (running) 76 if (running)
77 resched_task(rq->curr); 77 resched_task(rq->curr);
78 else 78 else
79 check_preempt_curr(rq, p); 79 check_preempt_curr(rq, p, 0);
80} 80}
81 81
82static void prio_changed_idle(struct rq *rq, struct task_struct *p, 82static void prio_changed_idle(struct rq *rq, struct task_struct *p,
@@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
93 if (p->prio > oldprio) 93 if (p->prio > oldprio)
94 resched_task(rq->curr); 94 resched_task(rq->curr);
95 } else 95 } else
96 check_preempt_curr(rq, p); 96 check_preempt_curr(rq, p, 0);
97} 97}
98 98
99/* 99/*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 552310798dad..cdf5740ab03e 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -102,12 +102,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
102 102
103static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 103static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
104{ 104{
105 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
105 struct sched_rt_entity *rt_se = rt_rq->rt_se; 106 struct sched_rt_entity *rt_se = rt_rq->rt_se;
106 107
107 if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { 108 if (rt_rq->rt_nr_running) {
108 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 109 if (rt_se && !on_rt_rq(rt_se))
109 110 enqueue_rt_entity(rt_se);
110 enqueue_rt_entity(rt_se);
111 if (rt_rq->highest_prio < curr->prio) 111 if (rt_rq->highest_prio < curr->prio)
112 resched_task(curr); 112 resched_task(curr);
113 } 113 }
@@ -231,6 +231,9 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
231#endif /* CONFIG_RT_GROUP_SCHED */ 231#endif /* CONFIG_RT_GROUP_SCHED */
232 232
233#ifdef CONFIG_SMP 233#ifdef CONFIG_SMP
234/*
235 * We ran out of runtime, see if we can borrow some from our neighbours.
236 */
234static int do_balance_runtime(struct rt_rq *rt_rq) 237static int do_balance_runtime(struct rt_rq *rt_rq)
235{ 238{
236 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 239 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
@@ -250,9 +253,18 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
250 continue; 253 continue;
251 254
252 spin_lock(&iter->rt_runtime_lock); 255 spin_lock(&iter->rt_runtime_lock);
256 /*
257 * Either all rqs have inf runtime and there's nothing to steal
258 * or __disable_runtime() below sets a specific rq to inf to
 259 * indicate it's been disabled and disallow stealing.
260 */
253 if (iter->rt_runtime == RUNTIME_INF) 261 if (iter->rt_runtime == RUNTIME_INF)
254 goto next; 262 goto next;
255 263
264 /*
265 * From runqueues with spare time, take 1/n part of their
266 * spare time, but no more than our period.
267 */
256 diff = iter->rt_runtime - iter->rt_time; 268 diff = iter->rt_runtime - iter->rt_time;
257 if (diff > 0) { 269 if (diff > 0) {
258 diff = div_u64((u64)diff, weight); 270 diff = div_u64((u64)diff, weight);
@@ -274,6 +286,9 @@ next:
274 return more; 286 return more;
275} 287}
276 288
289/*
 290 * Ensure this RQ takes back all the runtime it lent to its neighbours.
291 */
277static void __disable_runtime(struct rq *rq) 292static void __disable_runtime(struct rq *rq)
278{ 293{
279 struct root_domain *rd = rq->rd; 294 struct root_domain *rd = rq->rd;
@@ -289,17 +304,33 @@ static void __disable_runtime(struct rq *rq)
289 304
290 spin_lock(&rt_b->rt_runtime_lock); 305 spin_lock(&rt_b->rt_runtime_lock);
291 spin_lock(&rt_rq->rt_runtime_lock); 306 spin_lock(&rt_rq->rt_runtime_lock);
307 /*
308 * Either we're all inf and nobody needs to borrow, or we're
309 * already disabled and thus have nothing to do, or we have
310 * exactly the right amount of runtime to take out.
311 */
292 if (rt_rq->rt_runtime == RUNTIME_INF || 312 if (rt_rq->rt_runtime == RUNTIME_INF ||
293 rt_rq->rt_runtime == rt_b->rt_runtime) 313 rt_rq->rt_runtime == rt_b->rt_runtime)
294 goto balanced; 314 goto balanced;
295 spin_unlock(&rt_rq->rt_runtime_lock); 315 spin_unlock(&rt_rq->rt_runtime_lock);
296 316
317 /*
318 * Calculate the difference between what we started out with
 319 * and what we currently have; that's the amount of runtime
 320 * we lent out and now have to reclaim.
321 */
297 want = rt_b->rt_runtime - rt_rq->rt_runtime; 322 want = rt_b->rt_runtime - rt_rq->rt_runtime;
298 323
324 /*
325 * Greedy reclaim, take back as much as we can.
326 */
299 for_each_cpu_mask(i, rd->span) { 327 for_each_cpu_mask(i, rd->span) {
300 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 328 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
301 s64 diff; 329 s64 diff;
302 330
331 /*
332 * Can't reclaim from ourselves or disabled runqueues.
333 */
303 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) 334 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
304 continue; 335 continue;
305 336
@@ -319,8 +350,16 @@ static void __disable_runtime(struct rq *rq)
319 } 350 }
320 351
321 spin_lock(&rt_rq->rt_runtime_lock); 352 spin_lock(&rt_rq->rt_runtime_lock);
353 /*
354 * We cannot be left wanting - that would mean some runtime
355 * leaked out of the system.
356 */
322 BUG_ON(want); 357 BUG_ON(want);
323balanced: 358balanced:
359 /*
360 * Disable all the borrow logic by pretending we have inf
361 * runtime - in which case borrowing doesn't make sense.
362 */
324 rt_rq->rt_runtime = RUNTIME_INF; 363 rt_rq->rt_runtime = RUNTIME_INF;
325 spin_unlock(&rt_rq->rt_runtime_lock); 364 spin_unlock(&rt_rq->rt_runtime_lock);
326 spin_unlock(&rt_b->rt_runtime_lock); 365 spin_unlock(&rt_b->rt_runtime_lock);
@@ -343,6 +382,9 @@ static void __enable_runtime(struct rq *rq)
343 if (unlikely(!scheduler_running)) 382 if (unlikely(!scheduler_running))
344 return; 383 return;
345 384
385 /*
386 * Reset each runqueue's bandwidth settings
387 */
346 for_each_leaf_rt_rq(rt_rq, rq) { 388 for_each_leaf_rt_rq(rt_rq, rq) {
347 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 389 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
348 390
@@ -350,6 +392,7 @@ static void __enable_runtime(struct rq *rq)
350 spin_lock(&rt_rq->rt_runtime_lock); 392 spin_lock(&rt_rq->rt_runtime_lock);
351 rt_rq->rt_runtime = rt_b->rt_runtime; 393 rt_rq->rt_runtime = rt_b->rt_runtime;
352 rt_rq->rt_time = 0; 394 rt_rq->rt_time = 0;
395 rt_rq->rt_throttled = 0;
353 spin_unlock(&rt_rq->rt_runtime_lock); 396 spin_unlock(&rt_rq->rt_runtime_lock);
354 spin_unlock(&rt_b->rt_runtime_lock); 397 spin_unlock(&rt_b->rt_runtime_lock);
355 } 398 }
@@ -388,7 +431,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
388 int i, idle = 1; 431 int i, idle = 1;
389 cpumask_t span; 432 cpumask_t span;
390 433
391 if (rt_b->rt_runtime == RUNTIME_INF) 434 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
392 return 1; 435 return 1;
393 436
394 span = sched_rt_period_mask(); 437 span = sched_rt_period_mask();
@@ -486,6 +529,9 @@ static void update_curr_rt(struct rq *rq)
486 curr->se.exec_start = rq->clock; 529 curr->se.exec_start = rq->clock;
487 cpuacct_charge(curr, delta_exec); 530 cpuacct_charge(curr, delta_exec);
488 531
532 if (!rt_bandwidth_enabled())
533 return;
534
489 for_each_sched_rt_entity(rt_se) { 535 for_each_sched_rt_entity(rt_se) {
490 rt_rq = rt_rq_of_se(rt_se); 536 rt_rq = rt_rq_of_se(rt_se);
491 537
@@ -783,7 +829,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
783/* 829/*
784 * Preempt the current task with a newly woken task if needed: 830 * Preempt the current task with a newly woken task if needed:
785 */ 831 */
786static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) 832static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
787{ 833{
788 if (p->prio < rq->curr->prio) { 834 if (p->prio < rq->curr->prio) {
789 resched_task(rq->curr); 835 resched_task(rq->curr);
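
The new comments in do_balance_runtime() describe taking 1/n of each neighbour's spare runtime without letting the local budget exceed its period. A rough user-space sketch of that arithmetic, with illustrative names (the real code walks rd->span under the per-rq runtime locks):

#include <stdio.h>

#define NO_RUNTIME	(-1LL)	/* stand-in for the kernel's infinite-runtime marker */

/*
 * Borrow up to 1/weight of each neighbour's spare runtime, never letting
 * the local budget exceed its period, the same shape as the loop in
 * do_balance_runtime() above.
 */
static long long borrow(long long *spare, int weight, long long have,
			long long period)
{
	int i;

	for (i = 0; i < weight && have < period; i++) {
		long long diff = spare[i];

		if (diff == NO_RUNTIME || diff <= 0)
			continue;

		diff /= weight;			/* take 1/n of the spare time */
		if (have + diff > period)	/* ...but no more than our period */
			diff = period - have;

		spare[i] -= diff;
		have += diff;
	}
	return have;
}

int main(void)
{
	long long spare[3] = { 900000, NO_RUNTIME, 300000 };

	printf("%lld\n", borrow(spare, 3, 500000, 950000));	/* 900000 */
	return 0;
}
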
diff --git a/kernel/softirq.c b/kernel/softirq.c
index c506f266a6b9..83ba21a13bd4 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -6,6 +6,8 @@
6 * Distribute under GPLv2. 6 * Distribute under GPLv2.
7 * 7 *
8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) 8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
9 *
10 * Remote softirq infrastructure is by Jens Axboe.
9 */ 11 */
10 12
11#include <linux/module.h> 13#include <linux/module.h>
@@ -46,7 +48,7 @@ irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned;
46EXPORT_SYMBOL(irq_stat); 48EXPORT_SYMBOL(irq_stat);
47#endif 49#endif
48 50
49static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp; 51static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
50 52
51static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 53static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
52 54
@@ -205,7 +207,18 @@ restart:
205 207
206 do { 208 do {
207 if (pending & 1) { 209 if (pending & 1) {
210 int prev_count = preempt_count();
211
208 h->action(h); 212 h->action(h);
213
214 if (unlikely(prev_count != preempt_count())) {
215 printk(KERN_ERR "huh, entered softirq %td %p"
216 "with preempt_count %08x,"
217 " exited with %08x?\n", h - softirq_vec,
218 h->action, prev_count, preempt_count());
219 preempt_count() = prev_count;
220 }
221
209 rcu_bh_qsctr_inc(cpu); 222 rcu_bh_qsctr_inc(cpu);
210 } 223 }
211 h++; 224 h++;
@@ -463,17 +476,144 @@ void tasklet_kill(struct tasklet_struct *t)
463 476
464EXPORT_SYMBOL(tasklet_kill); 477EXPORT_SYMBOL(tasklet_kill);
465 478
479DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
480EXPORT_PER_CPU_SYMBOL(softirq_work_list);
481
482static void __local_trigger(struct call_single_data *cp, int softirq)
483{
484 struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]);
485
486 list_add_tail(&cp->list, head);
487
488 /* Trigger the softirq only if the list was previously empty. */
489 if (head->next == &cp->list)
490 raise_softirq_irqoff(softirq);
491}
492
493#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
494static void remote_softirq_receive(void *data)
495{
496 struct call_single_data *cp = data;
497 unsigned long flags;
498 int softirq;
499
500 softirq = cp->priv;
501
502 local_irq_save(flags);
503 __local_trigger(cp, softirq);
504 local_irq_restore(flags);
505}
506
507static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
508{
509 if (cpu_online(cpu)) {
510 cp->func = remote_softirq_receive;
511 cp->info = cp;
512 cp->flags = 0;
513 cp->priv = softirq;
514
515 __smp_call_function_single(cpu, cp);
516 return 0;
517 }
518 return 1;
519}
520#else /* CONFIG_USE_GENERIC_SMP_HELPERS */
521static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
522{
523 return 1;
524}
525#endif
526
527/**
528 * __send_remote_softirq - try to schedule softirq work on a remote cpu
529 * @cp: private SMP call function data area
530 * @cpu: the remote cpu
531 * @this_cpu: the currently executing cpu
532 * @softirq: the softirq for the work
533 *
534 * Attempt to schedule softirq work on a remote cpu. If this cannot be
535 * done, the work is instead queued up on the local cpu.
536 *
537 * Interrupts must be disabled.
538 */
539void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq)
540{
541 if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq))
542 __local_trigger(cp, softirq);
543}
544EXPORT_SYMBOL(__send_remote_softirq);
545
546/**
547 * send_remote_softirq - try to schedule softirq work on a remote cpu
548 * @cp: private SMP call function data area
549 * @cpu: the remote cpu
550 * @softirq: the softirq for the work
551 *
552 * Like __send_remote_softirq except that disabling interrupts and
553 * computing the current cpu is done for the caller.
554 */
555void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
556{
557 unsigned long flags;
558 int this_cpu;
559
560 local_irq_save(flags);
561 this_cpu = smp_processor_id();
562 __send_remote_softirq(cp, cpu, this_cpu, softirq);
563 local_irq_restore(flags);
564}
565EXPORT_SYMBOL(send_remote_softirq);
566
567static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self,
568 unsigned long action, void *hcpu)
569{
570 /*
571 * If a CPU goes away, splice its entries to the current CPU
572 * and trigger a run of the softirq
573 */
574 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
575 int cpu = (unsigned long) hcpu;
576 int i;
577
578 local_irq_disable();
579 for (i = 0; i < NR_SOFTIRQS; i++) {
580 struct list_head *head = &per_cpu(softirq_work_list[i], cpu);
581 struct list_head *local_head;
582
583 if (list_empty(head))
584 continue;
585
586 local_head = &__get_cpu_var(softirq_work_list[i]);
587 list_splice_init(head, local_head);
588 raise_softirq_irqoff(i);
589 }
590 local_irq_enable();
591 }
592
593 return NOTIFY_OK;
594}
595
596static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = {
597 .notifier_call = remote_softirq_cpu_notify,
598};
599
466void __init softirq_init(void) 600void __init softirq_init(void)
467{ 601{
468 int cpu; 602 int cpu;
469 603
470 for_each_possible_cpu(cpu) { 604 for_each_possible_cpu(cpu) {
605 int i;
606
471 per_cpu(tasklet_vec, cpu).tail = 607 per_cpu(tasklet_vec, cpu).tail =
472 &per_cpu(tasklet_vec, cpu).head; 608 &per_cpu(tasklet_vec, cpu).head;
473 per_cpu(tasklet_hi_vec, cpu).tail = 609 per_cpu(tasklet_hi_vec, cpu).tail =
474 &per_cpu(tasklet_hi_vec, cpu).head; 610 &per_cpu(tasklet_hi_vec, cpu).head;
611 for (i = 0; i < NR_SOFTIRQS; i++)
612 INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));
475 } 613 }
476 614
615 register_hotcpu_notifier(&remote_softirq_cpu_notifier);
616
477 open_softirq(TASKLET_SOFTIRQ, tasklet_action); 617 open_softirq(TASKLET_SOFTIRQ, tasklet_action);
478 open_softirq(HI_SOFTIRQ, tasklet_hi_action); 618 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
479} 619}
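
send_remote_softirq() above only requires the caller to provide stable call_single_data storage; func, info and priv are filled in internally. A hedged sketch of a hypothetical user follows. MY_DONE_SOFTIRQ, struct my_done and my_complete_on() are made up for illustration, and the softirq's action handler is assumed to drain its per-cpu softirq_work_list entry:

#include <linux/interrupt.h>
#include <linux/smp.h>

#define MY_DONE_SOFTIRQ	HI_SOFTIRQ	/* placeholder index; a real user owns its own softirq */

struct my_done {
	struct call_single_data csd;	/* must stay valid until the handler runs */
	int status;
};

/*
 * Queue completion handling on @cpu; the work falls back to the local
 * CPU if the remote one is offline, as __send_remote_softirq() documents.
 */
static void my_complete_on(struct my_done *d, int cpu)
{
	d->status = 0;
	send_remote_softirq(&d->csd, cpu, MY_DONE_SOFTIRQ);
}
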
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index cb838ee93a82..3953e4aed733 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -226,7 +226,7 @@ static void check_hung_uninterruptible_tasks(int this_cpu)
226 * If the system crashed already then all bets are off, 226 * If the system crashed already then all bets are off,
227 * do not report extra hung tasks: 227 * do not report extra hung tasks:
228 */ 228 */
229 if ((tainted & TAINT_DIE) || did_panic) 229 if (test_taint(TAINT_DIE) || did_panic)
230 return; 230 return;
231 231
232 read_lock(&tasklist_lock); 232 read_lock(&tasklist_lock);
diff --git a/kernel/sys.c b/kernel/sys.c
index 038a7bc0901d..0bc8fa3c2288 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1060,9 +1060,7 @@ asmlinkage long sys_setsid(void)
1060 group_leader->signal->leader = 1; 1060 group_leader->signal->leader = 1;
1061 __set_special_pids(sid); 1061 __set_special_pids(sid);
1062 1062
1063 spin_lock(&group_leader->sighand->siglock); 1063 proc_clear_tty(group_leader);
1064 group_leader->signal->tty = NULL;
1065 spin_unlock(&group_leader->sighand->siglock);
1066 1064
1067 err = session; 1065 err = session;
1068out: 1066out:
@@ -1351,8 +1349,10 @@ asmlinkage long sys_sethostname(char __user *name, int len)
1351 down_write(&uts_sem); 1349 down_write(&uts_sem);
1352 errno = -EFAULT; 1350 errno = -EFAULT;
1353 if (!copy_from_user(tmp, name, len)) { 1351 if (!copy_from_user(tmp, name, len)) {
1354 memcpy(utsname()->nodename, tmp, len); 1352 struct new_utsname *u = utsname();
1355 utsname()->nodename[len] = 0; 1353
1354 memcpy(u->nodename, tmp, len);
1355 memset(u->nodename + len, 0, sizeof(u->nodename) - len);
1356 errno = 0; 1356 errno = 0;
1357 } 1357 }
1358 up_write(&uts_sem); 1358 up_write(&uts_sem);
@@ -1364,15 +1364,17 @@ asmlinkage long sys_sethostname(char __user *name, int len)
1364asmlinkage long sys_gethostname(char __user *name, int len) 1364asmlinkage long sys_gethostname(char __user *name, int len)
1365{ 1365{
1366 int i, errno; 1366 int i, errno;
1367 struct new_utsname *u;
1367 1368
1368 if (len < 0) 1369 if (len < 0)
1369 return -EINVAL; 1370 return -EINVAL;
1370 down_read(&uts_sem); 1371 down_read(&uts_sem);
1371 i = 1 + strlen(utsname()->nodename); 1372 u = utsname();
1373 i = 1 + strlen(u->nodename);
1372 if (i > len) 1374 if (i > len)
1373 i = len; 1375 i = len;
1374 errno = 0; 1376 errno = 0;
1375 if (copy_to_user(name, utsname()->nodename, i)) 1377 if (copy_to_user(name, u->nodename, i))
1376 errno = -EFAULT; 1378 errno = -EFAULT;
1377 up_read(&uts_sem); 1379 up_read(&uts_sem);
1378 return errno; 1380 return errno;
@@ -1397,8 +1399,10 @@ asmlinkage long sys_setdomainname(char __user *name, int len)
1397 down_write(&uts_sem); 1399 down_write(&uts_sem);
1398 errno = -EFAULT; 1400 errno = -EFAULT;
1399 if (!copy_from_user(tmp, name, len)) { 1401 if (!copy_from_user(tmp, name, len)) {
1400 memcpy(utsname()->domainname, tmp, len); 1402 struct new_utsname *u = utsname();
1401 utsname()->domainname[len] = 0; 1403
1404 memcpy(u->domainname, tmp, len);
1405 memset(u->domainname + len, 0, sizeof(u->domainname) - len);
1402 errno = 0; 1406 errno = 0;
1403 } 1407 }
1404 up_write(&uts_sem); 1408 up_write(&uts_sem);
@@ -1452,14 +1456,22 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1452 return -EINVAL; 1456 return -EINVAL;
1453 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) 1457 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
1454 return -EFAULT; 1458 return -EFAULT;
1455 if (new_rlim.rlim_cur > new_rlim.rlim_max)
1456 return -EINVAL;
1457 old_rlim = current->signal->rlim + resource; 1459 old_rlim = current->signal->rlim + resource;
1458 if ((new_rlim.rlim_max > old_rlim->rlim_max) && 1460 if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
1459 !capable(CAP_SYS_RESOURCE)) 1461 !capable(CAP_SYS_RESOURCE))
1460 return -EPERM; 1462 return -EPERM;
1461 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) 1463
1462 return -EPERM; 1464 if (resource == RLIMIT_NOFILE) {
1465 if (new_rlim.rlim_max == RLIM_INFINITY)
1466 new_rlim.rlim_max = sysctl_nr_open;
1467 if (new_rlim.rlim_cur == RLIM_INFINITY)
1468 new_rlim.rlim_cur = sysctl_nr_open;
1469 if (new_rlim.rlim_max > sysctl_nr_open)
1470 return -EPERM;
1471 }
1472
1473 if (new_rlim.rlim_cur > new_rlim.rlim_max)
1474 return -EINVAL;
1463 1475
1464 retval = security_task_setrlimit(resource, &new_rlim); 1476 retval = security_task_setrlimit(resource, &new_rlim);
1465 if (retval) 1477 if (retval)
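
The sys_sethostname()/sys_setdomainname() hunks switch from terminating the copied name with a single NUL to clearing the whole remainder of the buffer, so shrinking the name cannot leave bytes of the previous one readable. A small user-space sketch of the pattern (buffer size and names are illustrative):

#include <stdio.h>
#include <string.h>

#define NODENAME_LEN 65	/* illustrative; the kernel buffer is __NEW_UTS_LEN + 1 */

static char nodename[NODENAME_LEN] = "very-long-old-hostname";

/*
 * Same pattern as the fix above: clear everything past the new name
 * instead of writing a single NUL at 'len', so no bytes of the previous
 * (longer) name survive in the buffer.
 */
static void set_nodename(const char *tmp, size_t len)
{
	memcpy(nodename, tmp, len);
	memset(nodename + len, 0, sizeof(nodename) - len);
}

int main(void)
{
	set_nodename("box", 3);
	printf("%s, byte 4 is %d\n", nodename, nodename[4]);	/* "box, byte 4 is 0" */
	return 0;
}
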
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 08d6e1bb99ac..a77b27b11b04 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -125,6 +125,12 @@ cond_syscall(sys_vm86old);
125cond_syscall(sys_vm86); 125cond_syscall(sys_vm86);
126cond_syscall(compat_sys_ipc); 126cond_syscall(compat_sys_ipc);
127cond_syscall(compat_sys_sysctl); 127cond_syscall(compat_sys_sysctl);
128cond_syscall(sys_flock);
129cond_syscall(sys_io_setup);
130cond_syscall(sys_io_destroy);
131cond_syscall(sys_io_submit);
132cond_syscall(sys_io_cancel);
133cond_syscall(sys_io_getevents);
128 134
129/* arch-specific weak syscall entries */ 135/* arch-specific weak syscall entries */
130cond_syscall(sys_pciconfig_read); 136cond_syscall(sys_pciconfig_read);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 50ec0886fa3d..b3cc73931d1f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -80,7 +80,6 @@ extern int pid_max_min, pid_max_max;
80extern int sysctl_drop_caches; 80extern int sysctl_drop_caches;
81extern int percpu_pagelist_fraction; 81extern int percpu_pagelist_fraction;
82extern int compat_log; 82extern int compat_log;
83extern int maps_protect;
84extern int latencytop_enabled; 83extern int latencytop_enabled;
85extern int sysctl_nr_open_min, sysctl_nr_open_max; 84extern int sysctl_nr_open_min, sysctl_nr_open_max;
86#ifdef CONFIG_RCU_TORTURE_TEST 85#ifdef CONFIG_RCU_TORTURE_TEST
@@ -97,7 +96,7 @@ static int sixty = 60;
97static int neg_one = -1; 96static int neg_one = -1;
98#endif 97#endif
99 98
100#ifdef CONFIG_MMU 99#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING)
101static int two = 2; 100static int two = 2;
102#endif 101#endif
103 102
@@ -118,10 +117,8 @@ extern char modprobe_path[];
118extern int sg_big_buff; 117extern int sg_big_buff;
119#endif 118#endif
120 119
121#ifdef __sparc__ 120#ifdef CONFIG_SPARC
122extern char reboot_command []; 121#include <asm/system.h>
123extern int stop_a_enabled;
124extern int scons_pwroff;
125#endif 122#endif
126 123
127#ifdef __hppa__ 124#ifdef __hppa__
@@ -152,7 +149,7 @@ extern int max_lock_depth;
152#ifdef CONFIG_PROC_SYSCTL 149#ifdef CONFIG_PROC_SYSCTL
153static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, 150static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
154 void __user *buffer, size_t *lenp, loff_t *ppos); 151 void __user *buffer, size_t *lenp, loff_t *ppos);
155static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, 152static int proc_taint(struct ctl_table *table, int write, struct file *filp,
156 void __user *buffer, size_t *lenp, loff_t *ppos); 153 void __user *buffer, size_t *lenp, loff_t *ppos);
157#endif 154#endif
158 155
@@ -382,10 +379,9 @@ static struct ctl_table kern_table[] = {
382#ifdef CONFIG_PROC_SYSCTL 379#ifdef CONFIG_PROC_SYSCTL
383 { 380 {
384 .procname = "tainted", 381 .procname = "tainted",
385 .data = &tainted, 382 .maxlen = sizeof(long),
386 .maxlen = sizeof(int),
387 .mode = 0644, 383 .mode = 0644,
388 .proc_handler = &proc_dointvec_taint, 384 .proc_handler = &proc_taint,
389 }, 385 },
390#endif 386#endif
391#ifdef CONFIG_LATENCYTOP 387#ifdef CONFIG_LATENCYTOP
@@ -415,7 +411,7 @@ static struct ctl_table kern_table[] = {
415 .mode = 0644, 411 .mode = 0644,
416 .proc_handler = &proc_dointvec, 412 .proc_handler = &proc_dointvec,
417 }, 413 },
418#ifdef __sparc__ 414#ifdef CONFIG_SPARC
419 { 415 {
420 .ctl_name = KERN_SPARC_REBOOT, 416 .ctl_name = KERN_SPARC_REBOOT,
421 .procname = "reboot-cmd", 417 .procname = "reboot-cmd",
@@ -810,16 +806,6 @@ static struct ctl_table kern_table[] = {
810 .proc_handler = &proc_dointvec, 806 .proc_handler = &proc_dointvec,
811 }, 807 },
812#endif 808#endif
813#ifdef CONFIG_PROC_FS
814 {
815 .ctl_name = CTL_UNNUMBERED,
816 .procname = "maps_protect",
817 .data = &maps_protect,
818 .maxlen = sizeof(int),
819 .mode = 0644,
820 .proc_handler = &proc_dointvec,
821 },
822#endif
823 { 809 {
824 .ctl_name = CTL_UNNUMBERED, 810 .ctl_name = CTL_UNNUMBERED,
825 .procname = "poweroff_cmd", 811 .procname = "poweroff_cmd",
@@ -847,6 +833,16 @@ static struct ctl_table kern_table[] = {
847 .proc_handler = &proc_dointvec, 833 .proc_handler = &proc_dointvec,
848 }, 834 },
849#endif 835#endif
836#ifdef CONFIG_UNEVICTABLE_LRU
837 {
838 .ctl_name = CTL_UNNUMBERED,
839 .procname = "scan_unevictable_pages",
840 .data = &scan_unevictable_pages,
841 .maxlen = sizeof(scan_unevictable_pages),
842 .mode = 0644,
843 .proc_handler = &scan_unevictable_handler,
844 },
845#endif
850/* 846/*
851 * NOTE: do not add new entries to this table unless you have read 847 * NOTE: do not add new entries to this table unless you have read
852 * Documentation/sysctl/ctl_unnumbered.txt 848 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1261,6 +1257,7 @@ static struct ctl_table fs_table[] = {
1261 .extra1 = &minolduid, 1257 .extra1 = &minolduid,
1262 .extra2 = &maxolduid, 1258 .extra2 = &maxolduid,
1263 }, 1259 },
1260#ifdef CONFIG_FILE_LOCKING
1264 { 1261 {
1265 .ctl_name = FS_LEASES, 1262 .ctl_name = FS_LEASES,
1266 .procname = "leases-enable", 1263 .procname = "leases-enable",
@@ -1269,6 +1266,7 @@ static struct ctl_table fs_table[] = {
1269 .mode = 0644, 1266 .mode = 0644,
1270 .proc_handler = &proc_dointvec, 1267 .proc_handler = &proc_dointvec,
1271 }, 1268 },
1269#endif
1272#ifdef CONFIG_DNOTIFY 1270#ifdef CONFIG_DNOTIFY
1273 { 1271 {
1274 .ctl_name = FS_DIR_NOTIFY, 1272 .ctl_name = FS_DIR_NOTIFY,
@@ -1280,6 +1278,7 @@ static struct ctl_table fs_table[] = {
1280 }, 1278 },
1281#endif 1279#endif
1282#ifdef CONFIG_MMU 1280#ifdef CONFIG_MMU
1281#ifdef CONFIG_FILE_LOCKING
1283 { 1282 {
1284 .ctl_name = FS_LEASE_TIME, 1283 .ctl_name = FS_LEASE_TIME,
1285 .procname = "lease-break-time", 1284 .procname = "lease-break-time",
@@ -1291,6 +1290,8 @@ static struct ctl_table fs_table[] = {
1291 .extra1 = &zero, 1290 .extra1 = &zero,
1292 .extra2 = &two, 1291 .extra2 = &two,
1293 }, 1292 },
1293#endif
1294#ifdef CONFIG_AIO
1294 { 1295 {
1295 .procname = "aio-nr", 1296 .procname = "aio-nr",
1296 .data = &aio_nr, 1297 .data = &aio_nr,
@@ -1305,6 +1306,7 @@ static struct ctl_table fs_table[] = {
1305 .mode = 0644, 1306 .mode = 0644,
1306 .proc_handler = &proc_doulongvec_minmax, 1307 .proc_handler = &proc_doulongvec_minmax,
1307 }, 1308 },
1309#endif /* CONFIG_AIO */
1308#ifdef CONFIG_INOTIFY_USER 1310#ifdef CONFIG_INOTIFY_USER
1309 { 1311 {
1310 .ctl_name = FS_INOTIFY, 1312 .ctl_name = FS_INOTIFY,
@@ -1510,7 +1512,6 @@ void register_sysctl_root(struct ctl_table_root *root)
1510/* Perform the actual read/write of a sysctl table entry. */ 1512/* Perform the actual read/write of a sysctl table entry. */
1511static int do_sysctl_strategy(struct ctl_table_root *root, 1513static int do_sysctl_strategy(struct ctl_table_root *root,
1512 struct ctl_table *table, 1514 struct ctl_table *table,
1513 int __user *name, int nlen,
1514 void __user *oldval, size_t __user *oldlenp, 1515 void __user *oldval, size_t __user *oldlenp,
1515 void __user *newval, size_t newlen) 1516 void __user *newval, size_t newlen)
1516{ 1517{
@@ -1524,8 +1525,7 @@ static int do_sysctl_strategy(struct ctl_table_root *root,
1524 return -EPERM; 1525 return -EPERM;
1525 1526
1526 if (table->strategy) { 1527 if (table->strategy) {
1527 rc = table->strategy(table, name, nlen, oldval, oldlenp, 1528 rc = table->strategy(table, oldval, oldlenp, newval, newlen);
1528 newval, newlen);
1529 if (rc < 0) 1529 if (rc < 0)
1530 return rc; 1530 return rc;
1531 if (rc > 0) 1531 if (rc > 0)
@@ -1535,8 +1535,7 @@ static int do_sysctl_strategy(struct ctl_table_root *root,
1535 /* If there is no strategy routine, or if the strategy returns 1535 /* If there is no strategy routine, or if the strategy returns
1536 * zero, proceed with automatic r/w */ 1536 * zero, proceed with automatic r/w */
1537 if (table->data && table->maxlen) { 1537 if (table->data && table->maxlen) {
1538 rc = sysctl_data(table, name, nlen, oldval, oldlenp, 1538 rc = sysctl_data(table, oldval, oldlenp, newval, newlen);
1539 newval, newlen);
1540 if (rc < 0) 1539 if (rc < 0)
1541 return rc; 1540 return rc;
1542 } 1541 }
@@ -1568,7 +1567,7 @@ repeat:
1568 table = table->child; 1567 table = table->child;
1569 goto repeat; 1568 goto repeat;
1570 } 1569 }
1571 error = do_sysctl_strategy(root, table, name, nlen, 1570 error = do_sysctl_strategy(root, table,
1572 oldval, oldlenp, 1571 oldval, oldlenp,
1573 newval, newlen); 1572 newval, newlen);
1574 return error; 1573 return error;
@@ -2237,49 +2236,39 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
2237 NULL,NULL); 2236 NULL,NULL);
2238} 2237}
2239 2238
2240#define OP_SET 0
2241#define OP_AND 1
2242#define OP_OR 2
2243
2244static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
2245 int *valp,
2246 int write, void *data)
2247{
2248 int op = *(int *)data;
2249 if (write) {
2250 int val = *negp ? -*lvalp : *lvalp;
2251 switch(op) {
2252 case OP_SET: *valp = val; break;
2253 case OP_AND: *valp &= val; break;
2254 case OP_OR: *valp |= val; break;
2255 }
2256 } else {
2257 int val = *valp;
2258 if (val < 0) {
2259 *negp = -1;
2260 *lvalp = (unsigned long)-val;
2261 } else {
2262 *negp = 0;
2263 *lvalp = (unsigned long)val;
2264 }
2265 }
2266 return 0;
2267}
2268
2269/* 2239/*
2270 * Taint values can only be increased 2240 * Taint values can only be increased
2241 * This means we can safely use a temporary.
2271 */ 2242 */
2272static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, 2243static int proc_taint(struct ctl_table *table, int write, struct file *filp,
2273 void __user *buffer, size_t *lenp, loff_t *ppos) 2244 void __user *buffer, size_t *lenp, loff_t *ppos)
2274{ 2245{
2275 int op; 2246 struct ctl_table t;
2247 unsigned long tmptaint = get_taint();
2248 int err;
2276 2249
2277 if (write && !capable(CAP_SYS_ADMIN)) 2250 if (write && !capable(CAP_SYS_ADMIN))
2278 return -EPERM; 2251 return -EPERM;
2279 2252
2280 op = OP_OR; 2253 t = *table;
2281 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2254 t.data = &tmptaint;
2282 do_proc_dointvec_bset_conv,&op); 2255 err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos);
2256 if (err < 0)
2257 return err;
2258
2259 if (write) {
2260 /*
2261 * Poor man's atomic or. Not worth adding a primitive
2262 * to everyone's atomic.h for this
2263 */
2264 int i;
2265 for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
2266 if ((tmptaint >> i) & 1)
2267 add_taint(i);
2268 }
2269 }
2270
2271 return err;
2283} 2272}
2284 2273
2285struct do_proc_dointvec_minmax_conv_param { 2274struct do_proc_dointvec_minmax_conv_param {
@@ -2727,7 +2716,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2727 */ 2716 */
2728 2717
2729/* The generic sysctl data routine (used if no strategy routine supplied) */ 2718/* The generic sysctl data routine (used if no strategy routine supplied) */
2730int sysctl_data(struct ctl_table *table, int __user *name, int nlen, 2719int sysctl_data(struct ctl_table *table,
2731 void __user *oldval, size_t __user *oldlenp, 2720 void __user *oldval, size_t __user *oldlenp,
2732 void __user *newval, size_t newlen) 2721 void __user *newval, size_t newlen)
2733{ 2722{
@@ -2761,7 +2750,7 @@ int sysctl_data(struct ctl_table *table, int __user *name, int nlen,
2761} 2750}
2762 2751
2763/* The generic string strategy routine: */ 2752/* The generic string strategy routine: */
2764int sysctl_string(struct ctl_table *table, int __user *name, int nlen, 2753int sysctl_string(struct ctl_table *table,
2765 void __user *oldval, size_t __user *oldlenp, 2754 void __user *oldval, size_t __user *oldlenp,
2766 void __user *newval, size_t newlen) 2755 void __user *newval, size_t newlen)
2767{ 2756{
@@ -2807,7 +2796,7 @@ int sysctl_string(struct ctl_table *table, int __user *name, int nlen,
2807 * are between the minimum and maximum values given in the arrays 2796 * are between the minimum and maximum values given in the arrays
2808 * table->extra1 and table->extra2, respectively. 2797 * table->extra1 and table->extra2, respectively.
2809 */ 2798 */
2810int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, 2799int sysctl_intvec(struct ctl_table *table,
2811 void __user *oldval, size_t __user *oldlenp, 2800 void __user *oldval, size_t __user *oldlenp,
2812 void __user *newval, size_t newlen) 2801 void __user *newval, size_t newlen)
2813{ 2802{
@@ -2843,7 +2832,7 @@ int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen,
2843} 2832}
2844 2833
2845/* Strategy function to convert jiffies to seconds */ 2834/* Strategy function to convert jiffies to seconds */
2846int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, 2835int sysctl_jiffies(struct ctl_table *table,
2847 void __user *oldval, size_t __user *oldlenp, 2836 void __user *oldval, size_t __user *oldlenp,
2848 void __user *newval, size_t newlen) 2837 void __user *newval, size_t newlen)
2849{ 2838{
@@ -2877,7 +2866,7 @@ int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen,
2877} 2866}
2878 2867
2879/* Strategy function to convert jiffies to seconds */ 2868/* Strategy function to convert jiffies to seconds */
2880int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen, 2869int sysctl_ms_jiffies(struct ctl_table *table,
2881 void __user *oldval, size_t __user *oldlenp, 2870 void __user *oldval, size_t __user *oldlenp,
2882 void __user *newval, size_t newlen) 2871 void __user *newval, size_t newlen)
2883{ 2872{
@@ -2932,35 +2921,35 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
2932 return error; 2921 return error;
2933} 2922}
2934 2923
2935int sysctl_data(struct ctl_table *table, int __user *name, int nlen, 2924int sysctl_data(struct ctl_table *table,
2936 void __user *oldval, size_t __user *oldlenp, 2925 void __user *oldval, size_t __user *oldlenp,
2937 void __user *newval, size_t newlen) 2926 void __user *newval, size_t newlen)
2938{ 2927{
2939 return -ENOSYS; 2928 return -ENOSYS;
2940} 2929}
2941 2930
2942int sysctl_string(struct ctl_table *table, int __user *name, int nlen, 2931int sysctl_string(struct ctl_table *table,
2943 void __user *oldval, size_t __user *oldlenp, 2932 void __user *oldval, size_t __user *oldlenp,
2944 void __user *newval, size_t newlen) 2933 void __user *newval, size_t newlen)
2945{ 2934{
2946 return -ENOSYS; 2935 return -ENOSYS;
2947} 2936}
2948 2937
2949int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, 2938int sysctl_intvec(struct ctl_table *table,
2950 void __user *oldval, size_t __user *oldlenp, 2939 void __user *oldval, size_t __user *oldlenp,
2951 void __user *newval, size_t newlen) 2940 void __user *newval, size_t newlen)
2952{ 2941{
2953 return -ENOSYS; 2942 return -ENOSYS;
2954} 2943}
2955 2944
2956int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, 2945int sysctl_jiffies(struct ctl_table *table,
2957 void __user *oldval, size_t __user *oldlenp, 2946 void __user *oldval, size_t __user *oldlenp,
2958 void __user *newval, size_t newlen) 2947 void __user *newval, size_t newlen)
2959{ 2948{
2960 return -ENOSYS; 2949 return -ENOSYS;
2961} 2950}
2962 2951
2963int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen, 2952int sysctl_ms_jiffies(struct ctl_table *table,
2964 void __user *oldval, size_t __user *oldlenp, 2953 void __user *oldval, size_t __user *oldlenp,
2965 void __user *newval, size_t newlen) 2954 void __user *newval, size_t newlen)
2966{ 2955{
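
proc_taint() above reads the mask into a temporary and then feeds each set bit through add_taint(), the "poor man's atomic or" its comment mentions. A standalone sketch of that per-bit loop, where add_taint() is a stand-in for the kernel's:

#include <stdio.h>

#define BITS_PER_LONG (8 * (int)sizeof(long))

static unsigned long taint_flags;

static void add_taint(int bit)
{
	taint_flags |= 1UL << bit;
}

/*
 * Apply each set bit of the user-supplied mask through add_taint()
 * instead of or-ing the whole word in at once, mirroring proc_taint().
 */
static void apply_taint_mask(unsigned long tmptaint)
{
	int i;

	for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++)
		if ((tmptaint >> i) & 1)
			add_taint(i);
}

int main(void)
{
	apply_taint_mask(0x5);		/* bits 0 and 2 */
	printf("%#lx\n", taint_flags);	/* 0x5 */
	return 0;
}
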
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 8d53106a0a92..95ed42951e0a 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -3,7 +3,6 @@
3# 3#
4config TICK_ONESHOT 4config TICK_ONESHOT
5 bool 5 bool
6 default n
7 6
8config NO_HZ 7config NO_HZ
9 bool "Tickless System (Dynamic Ticks)" 8 bool "Tickless System (Dynamic Ticks)"
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 1876b526c778..f8d968063cea 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -72,6 +72,16 @@ void clockevents_set_mode(struct clock_event_device *dev,
72} 72}
73 73
74/** 74/**
75 * clockevents_shutdown - shutdown the device and clear next_event
76 * @dev: device to shutdown
77 */
78void clockevents_shutdown(struct clock_event_device *dev)
79{
80 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
81 dev->next_event.tv64 = KTIME_MAX;
82}
83
84/**
75 * clockevents_program_event - Reprogram the clock event device. 85 * clockevents_program_event - Reprogram the clock event device.
76 * @expires: absolute expiry time (monotonic clock) 86 * @expires: absolute expiry time (monotonic clock)
77 * 87 *
@@ -206,7 +216,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
206 216
207 if (new) { 217 if (new) {
208 BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); 218 BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
209 clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN); 219 clockevents_shutdown(new);
210 } 220 }
211 local_irq_restore(flags); 221 local_irq_restore(flags);
212} 222}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 2f5a38294bf9..cb01cd8f919b 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -235,9 +235,9 @@ static void tick_do_broadcast_on_off(void *why)
235 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 235 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
236 if (!cpu_isset(cpu, tick_broadcast_mask)) { 236 if (!cpu_isset(cpu, tick_broadcast_mask)) {
237 cpu_set(cpu, tick_broadcast_mask); 237 cpu_set(cpu, tick_broadcast_mask);
238 if (td->mode == TICKDEV_MODE_PERIODIC) 238 if (tick_broadcast_device.mode ==
239 clockevents_set_mode(dev, 239 TICKDEV_MODE_PERIODIC)
240 CLOCK_EVT_MODE_SHUTDOWN); 240 clockevents_shutdown(dev);
241 } 241 }
242 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) 242 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
243 tick_broadcast_force = 1; 243 tick_broadcast_force = 1;
@@ -246,7 +246,8 @@ static void tick_do_broadcast_on_off(void *why)
246 if (!tick_broadcast_force && 246 if (!tick_broadcast_force &&
247 cpu_isset(cpu, tick_broadcast_mask)) { 247 cpu_isset(cpu, tick_broadcast_mask)) {
248 cpu_clear(cpu, tick_broadcast_mask); 248 cpu_clear(cpu, tick_broadcast_mask);
249 if (td->mode == TICKDEV_MODE_PERIODIC) 249 if (tick_broadcast_device.mode ==
250 TICKDEV_MODE_PERIODIC)
250 tick_setup_periodic(dev, 0); 251 tick_setup_periodic(dev, 0);
251 } 252 }
252 break; 253 break;
@@ -254,7 +255,7 @@ static void tick_do_broadcast_on_off(void *why)
254 255
255 if (cpus_empty(tick_broadcast_mask)) { 256 if (cpus_empty(tick_broadcast_mask)) {
256 if (!bc_stopped) 257 if (!bc_stopped)
257 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 258 clockevents_shutdown(bc);
258 } else if (bc_stopped) { 259 } else if (bc_stopped) {
259 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) 260 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
260 tick_broadcast_start_periodic(bc); 261 tick_broadcast_start_periodic(bc);
@@ -306,7 +307,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
306 307
307 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { 308 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
308 if (bc && cpus_empty(tick_broadcast_mask)) 309 if (bc && cpus_empty(tick_broadcast_mask))
309 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 310 clockevents_shutdown(bc);
310 } 311 }
311 312
312 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 313 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
@@ -321,7 +322,7 @@ void tick_suspend_broadcast(void)
321 322
322 bc = tick_broadcast_device.evtdev; 323 bc = tick_broadcast_device.evtdev;
323 if (bc) 324 if (bc)
324 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 325 clockevents_shutdown(bc);
325 326
326 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 327 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
327} 328}
@@ -576,4 +577,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
576 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 577 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
577} 578}
578 579
580/*
 581 * Check whether the broadcast device is in oneshot mode
582 */
583int tick_broadcast_oneshot_active(void)
584{
585 return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
586}
587
579#endif 588#endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index c4777193d567..df12434b43ca 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
33 */ 33 */
34ktime_t tick_next_period; 34ktime_t tick_next_period;
35ktime_t tick_period; 35ktime_t tick_period;
36int tick_do_timer_cpu __read_mostly = -1; 36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
37DEFINE_SPINLOCK(tick_device_lock); 37DEFINE_SPINLOCK(tick_device_lock);
38 38
39/* 39/*
@@ -109,7 +109,8 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
109 if (!tick_device_is_functional(dev)) 109 if (!tick_device_is_functional(dev))
110 return; 110 return;
111 111
112 if (dev->features & CLOCK_EVT_FEAT_PERIODIC) { 112 if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
113 !tick_broadcast_oneshot_active()) {
113 clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); 114 clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
114 } else { 115 } else {
115 unsigned long seq; 116 unsigned long seq;
@@ -148,7 +149,7 @@ static void tick_setup_device(struct tick_device *td,
148 * If no cpu took the do_timer update, assign it to 149 * If no cpu took the do_timer update, assign it to
149 * this cpu: 150 * this cpu:
150 */ 151 */
151 if (tick_do_timer_cpu == -1) { 152 if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
152 tick_do_timer_cpu = cpu; 153 tick_do_timer_cpu = cpu;
153 tick_next_period = ktime_get(); 154 tick_next_period = ktime_get();
154 tick_period = ktime_set(0, NSEC_PER_SEC / HZ); 155 tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
@@ -249,7 +250,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
249 * not give it back to the clockevents layer ! 250 * not give it back to the clockevents layer !
250 */ 251 */
251 if (tick_is_broadcast_device(curdev)) { 252 if (tick_is_broadcast_device(curdev)) {
252 clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN); 253 clockevents_shutdown(curdev);
253 curdev = NULL; 254 curdev = NULL;
254 } 255 }
255 clockevents_exchange_device(curdev, newdev); 256 clockevents_exchange_device(curdev, newdev);
@@ -300,7 +301,8 @@ static void tick_shutdown(unsigned int *cpup)
300 if (*cpup == tick_do_timer_cpu) { 301 if (*cpup == tick_do_timer_cpu) {
301 int cpu = first_cpu(cpu_online_map); 302 int cpu = first_cpu(cpu_online_map);
302 303
303 tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1; 304 tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu :
305 TICK_DO_TIMER_NONE;
304 } 306 }
305 spin_unlock_irqrestore(&tick_device_lock, flags); 307 spin_unlock_irqrestore(&tick_device_lock, flags);
306} 308}
@@ -311,7 +313,7 @@ static void tick_suspend(void)
311 unsigned long flags; 313 unsigned long flags;
312 314
313 spin_lock_irqsave(&tick_device_lock, flags); 315 spin_lock_irqsave(&tick_device_lock, flags);
314 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_SHUTDOWN); 316 clockevents_shutdown(td->evtdev);
315 spin_unlock_irqrestore(&tick_device_lock, flags); 317 spin_unlock_irqrestore(&tick_device_lock, flags);
316} 318}
317 319
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 0ffc2918ea6f..469248782c23 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -1,6 +1,10 @@
1/* 1/*
2 * tick internal variable and functions used by low/high res code 2 * tick internal variable and functions used by low/high res code
3 */ 3 */
4
5#define TICK_DO_TIMER_NONE -1
6#define TICK_DO_TIMER_BOOT -2
7
4DECLARE_PER_CPU(struct tick_device, tick_cpu_device); 8DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
5extern spinlock_t tick_device_lock; 9extern spinlock_t tick_device_lock;
6extern ktime_t tick_next_period; 10extern ktime_t tick_next_period;
@@ -10,6 +14,8 @@ extern int tick_do_timer_cpu __read_mostly;
10extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); 14extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
11extern void tick_handle_periodic(struct clock_event_device *dev); 15extern void tick_handle_periodic(struct clock_event_device *dev);
12 16
17extern void clockevents_shutdown(struct clock_event_device *dev);
18
13/* 19/*
14 * NO_HZ / high resolution timer shared code 20 * NO_HZ / high resolution timer shared code
15 */ 21 */
@@ -29,6 +35,7 @@ extern void tick_broadcast_oneshot_control(unsigned long reason);
29extern void tick_broadcast_switch_to_oneshot(void); 35extern void tick_broadcast_switch_to_oneshot(void);
30extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); 36extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
31extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); 37extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
38extern int tick_broadcast_oneshot_active(void);
32# else /* BROADCAST */ 39# else /* BROADCAST */
33static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 40static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
34{ 41{
@@ -37,6 +44,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
37static inline void tick_broadcast_oneshot_control(unsigned long reason) { } 44static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
38static inline void tick_broadcast_switch_to_oneshot(void) { } 45static inline void tick_broadcast_switch_to_oneshot(void) { }
39static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 46static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
47static inline int tick_broadcast_oneshot_active(void) { return 0; }
40# endif /* !BROADCAST */ 48# endif /* !BROADCAST */
41 49
42#else /* !ONESHOT */ 50#else /* !ONESHOT */
@@ -66,6 +74,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
66{ 74{
67 return 0; 75 return 0;
68} 76}
77static inline int tick_broadcast_oneshot_active(void) { return 0; }
69#endif /* !TICK_ONESHOT */ 78#endif /* !TICK_ONESHOT */
70 79
71/* 80/*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a87b0468568b..b711ffcb106c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -20,6 +20,7 @@
20#include <linux/profile.h> 20#include <linux/profile.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/tick.h> 22#include <linux/tick.h>
23#include <linux/module.h>
23 24
24#include <asm/irq_regs.h> 25#include <asm/irq_regs.h>
25 26
@@ -75,6 +76,9 @@ static void tick_do_update_jiffies64(ktime_t now)
75 incr * ticks); 76 incr * ticks);
76 } 77 }
77 do_timer(++ticks); 78 do_timer(++ticks);
79
80 /* Keep the tick_next_period variable up to date */
81 tick_next_period = ktime_add(last_jiffies_update, tick_period);
78 } 82 }
79 write_sequnlock(&xtime_lock); 83 write_sequnlock(&xtime_lock);
80} 84}
@@ -187,9 +191,17 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
187{ 191{
188 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 192 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
189 193
190 *last_update_time = ktime_to_us(ts->idle_lastupdate); 194 if (!tick_nohz_enabled)
195 return -1;
196
197 if (ts->idle_active)
198 *last_update_time = ktime_to_us(ts->idle_lastupdate);
199 else
200 *last_update_time = ktime_to_us(ktime_get());
201
191 return ktime_to_us(ts->idle_sleeptime); 202 return ktime_to_us(ts->idle_sleeptime);
192} 203}
204EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
193 205
194/** 206/**
195 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task 207 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
@@ -221,7 +233,7 @@ void tick_nohz_stop_sched_tick(int inidle)
221 */ 233 */
222 if (unlikely(!cpu_online(cpu))) { 234 if (unlikely(!cpu_online(cpu))) {
223 if (cpu == tick_do_timer_cpu) 235 if (cpu == tick_do_timer_cpu)
224 tick_do_timer_cpu = -1; 236 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
225 } 237 }
226 238
227 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 239 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
@@ -258,7 +270,7 @@ void tick_nohz_stop_sched_tick(int inidle)
258 next_jiffies = get_next_timer_interrupt(last_jiffies); 270 next_jiffies = get_next_timer_interrupt(last_jiffies);
259 delta_jiffies = next_jiffies - last_jiffies; 271 delta_jiffies = next_jiffies - last_jiffies;
260 272
261 if (rcu_needs_cpu(cpu)) 273 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu))
262 delta_jiffies = 1; 274 delta_jiffies = 1;
263 /* 275 /*
264 * Do not stop the tick, if we are only one off 276 * Do not stop the tick, if we are only one off
@@ -303,7 +315,7 @@ void tick_nohz_stop_sched_tick(int inidle)
303 * invoked. 315 * invoked.
304 */ 316 */
305 if (cpu == tick_do_timer_cpu) 317 if (cpu == tick_do_timer_cpu)
306 tick_do_timer_cpu = -1; 318 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
307 319
308 ts->idle_sleeps++; 320 ts->idle_sleeps++;
309 321
@@ -468,7 +480,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
468 * this duty, then the jiffies update is still serialized by 480 * this duty, then the jiffies update is still serialized by
469 * xtime_lock. 481 * xtime_lock.
470 */ 482 */
471 if (unlikely(tick_do_timer_cpu == -1)) 483 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
472 tick_do_timer_cpu = cpu; 484 tick_do_timer_cpu = cpu;
473 485
474 /* Check, if the jiffies need an update */ 486 /* Check, if the jiffies need an update */
@@ -570,7 +582,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
570 * this duty, then the jiffies update is still serialized by 582 * this duty, then the jiffies update is still serialized by
571 * xtime_lock. 583 * xtime_lock.
572 */ 584 */
573 if (unlikely(tick_do_timer_cpu == -1)) 585 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
574 tick_do_timer_cpu = cpu; 586 tick_do_timer_cpu = cpu;
575#endif 587#endif
576 588
@@ -622,7 +634,7 @@ void tick_setup_sched_timer(void)
622 */ 634 */
623 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 635 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
624 ts->sched_timer.function = tick_sched_timer; 636 ts->sched_timer.function = tick_sched_timer;
625 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 637 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
626 638
627 /* Get the next period (per cpu) */ 639 /* Get the next period (per cpu) */
628 ts->sched_timer.expires = tick_init_jiffy_update(); 640 ts->sched_timer.expires = tick_init_jiffy_update();
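Several tick-sched.c hunks replace the bare -1 marker for "no CPU currently owns the jiffies-update duty" with the named constant TICK_DO_TIMER_NONE, and get_cpu_idle_time_us() is exported and now bails out when nohz is disabled. A small user-space sketch of the named-sentinel pattern; TIMER_OWNER_NONE and maybe_take_timer_duty() are made-up names used only for illustration.

    #include <stdio.h>

    #define TIMER_OWNER_NONE  (-1)          /* named sentinel instead of a magic -1 */

    static int timer_owner_cpu = TIMER_OWNER_NONE;

    /* The first CPU that sees the duty unclaimed takes it over. */
    static void maybe_take_timer_duty(int cpu)
    {
            if (timer_owner_cpu == TIMER_OWNER_NONE)
                    timer_owner_cpu = cpu;
    }

    int main(void)
    {
            maybe_take_timer_duty(2);
            maybe_take_timer_duty(5);       /* too late, CPU 2 already owns it */
            printf("jiffies duty owned by CPU %d\n", timer_owner_cpu);
            return 0;
    }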
diff --git a/kernel/timer.c b/kernel/timer.c
index 03bc7f1f1593..510fe69351ca 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -978,6 +978,7 @@ void update_process_times(int user_tick)
978 run_local_timers(); 978 run_local_timers();
979 if (rcu_pending(cpu)) 979 if (rcu_pending(cpu))
980 rcu_check_callbacks(cpu, user_tick); 980 rcu_check_callbacks(cpu, user_tick);
981 printk_tick();
981 scheduler_tick(); 982 scheduler_tick();
982 run_posix_cpu_timers(p); 983 run_posix_cpu_timers(p);
983} 984}
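update_process_times() gains a printk_tick() call, and the nohz path earlier in this diff now also consults printk_needs_cpu() before stopping the tick, so pending deferred printk work keeps the tick running for one more jiffy. A compilable sketch of that "pending flag + tick hook + needs-CPU check" pattern; defer_work(), work_needs_cpu() and work_tick() are invented names, not the kernel API.

    #include <stdio.h>
    #include <stdbool.h>

    static bool work_pending;               /* set from a context that cannot do the work itself */

    static void defer_work(void)      { work_pending = true; }
    static bool work_needs_cpu(void)  { return work_pending; }

    /* Called from the periodic tick: drain the deferred work in a safe context. */
    static void work_tick(void)
    {
            if (work_pending) {
                    work_pending = false;
                    printf("deferred work drained\n");
            }
    }

    int main(void)
    {
            defer_work();
            if (work_needs_cpu())
                    printf("keep the tick running\n");
            work_tick();
            if (!work_needs_cpu())
                    printf("safe to stop the tick now\n");
            return 0;
    }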
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index bb948e52ce20..db58fb66a135 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -202,7 +202,7 @@ static void start_stack_timer(int cpu)
202 202
203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
204 hrtimer->function = stack_trace_timer_fn; 204 hrtimer->function = stack_trace_timer_fn;
205 hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 205 hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
206 206
207 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); 207 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
208} 208}
diff --git a/kernel/user.c b/kernel/user.c
index 865ecf57a096..39d6159fae43 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
169{ 169{
170 struct user_struct *up = container_of(kobj, struct user_struct, kobj); 170 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
171 171
172 return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg)); 172 return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
173} 173}
174 174
175static ssize_t cpu_rt_runtime_store(struct kobject *kobj, 175static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
@@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
180 unsigned long rt_runtime; 180 unsigned long rt_runtime;
181 int rc; 181 int rc;
182 182
183 sscanf(buf, "%lu", &rt_runtime); 183 sscanf(buf, "%ld", &rt_runtime);
184 184
185 rc = sched_group_set_rt_runtime(up->tg, rt_runtime); 185 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
186 186
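The user.c hunks switch the printf/sscanf conversions from %lu to %ld: sched_group_rt_runtime() returns a signed long, and a negative value (conventionally -1 for "unlimited runtime") printed with %lu would show up as a huge unsigned number, while the matching %ld in sscanf() lets that value be written back with its intended sign. A short user-space demonstration of the difference:

    #include <stdio.h>

    int main(void)
    {
            long rt_runtime = -1;   /* -1 here stands for "no runtime limit" */

            /* Interpreted as unsigned: 18446744073709551615 on a 64-bit machine. */
            printf("with %%lu: %lu\n", (unsigned long)rt_runtime);
            /* Interpreted as signed: -1, the intended meaning. */
            printf("with %%ld: %ld\n", rt_runtime);
            return 0;
    }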
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 4ab9659d269e..3b34b3545936 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -60,7 +60,7 @@ static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
60 60
61#ifdef CONFIG_SYSCTL_SYSCALL 61#ifdef CONFIG_SYSCTL_SYSCALL
62/* The generic string strategy routine: */ 62/* The generic string strategy routine: */
63static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, 63static int sysctl_uts_string(ctl_table *table,
64 void __user *oldval, size_t __user *oldlenp, 64 void __user *oldval, size_t __user *oldlenp,
65 void __user *newval, size_t newlen) 65 void __user *newval, size_t newlen)
66{ 66{
@@ -69,8 +69,7 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
69 write = newval && newlen; 69 write = newval && newlen;
70 memcpy(&uts_table, table, sizeof(uts_table)); 70 memcpy(&uts_table, table, sizeof(uts_table));
71 uts_table.data = get_uts(table, write); 71 uts_table.data = get_uts(table, write);
72 r = sysctl_string(&uts_table, name, nlen, 72 r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen);
73 oldval, oldlenp, newval, newlen);
74 put_uts(table, write, uts_table.data); 73 put_uts(table, write, uts_table.data);
75 return r; 74 return r;
76} 75}
diff --git a/kernel/wait.c b/kernel/wait.c
index c275c56cf2d3..cd87131f2fc2 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -72,12 +72,7 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
72 spin_lock_irqsave(&q->lock, flags); 72 spin_lock_irqsave(&q->lock, flags);
73 if (list_empty(&wait->task_list)) 73 if (list_empty(&wait->task_list))
74 __add_wait_queue(q, wait); 74 __add_wait_queue(q, wait);
75 /* 75 set_current_state(state);
76 * don't alter the task state if this is just going to
77 * queue an async wait queue callback
78 */
79 if (is_sync_wait(wait))
80 set_current_state(state);
81 spin_unlock_irqrestore(&q->lock, flags); 76 spin_unlock_irqrestore(&q->lock, flags);
82} 77}
83EXPORT_SYMBOL(prepare_to_wait); 78EXPORT_SYMBOL(prepare_to_wait);
@@ -91,12 +86,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
91 spin_lock_irqsave(&q->lock, flags); 86 spin_lock_irqsave(&q->lock, flags);
92 if (list_empty(&wait->task_list)) 87 if (list_empty(&wait->task_list))
93 __add_wait_queue_tail(q, wait); 88 __add_wait_queue_tail(q, wait);
94 /* 89 set_current_state(state);
95 * don't alter the task state if this is just going to
96 * queue an async wait queue callback
97 */
98 if (is_sync_wait(wait))
99 set_current_state(state);
100 spin_unlock_irqrestore(&q->lock, flags); 90 spin_unlock_irqrestore(&q->lock, flags);
101} 91}
102EXPORT_SYMBOL(prepare_to_wait_exclusive); 92EXPORT_SYMBOL(prepare_to_wait_exclusive);
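With the is_sync_wait() special case removed, prepare_to_wait() and prepare_to_wait_exclusive() now always set the task state, which is exactly what the standard wait-loop idiom expects. A sketch of that idiom as the wait-queue API is typically used (kernel-style pseudocode; wq and condition are placeholders, not code from this patch):

    DEFINE_WAIT(wait);

    for (;;) {
            prepare_to_wait(&wq, &wait, TASK_INTERRUPTIBLE);
            if (condition)                  /* the event being waited for */
                    break;
            if (signal_pending(current))    /* optionally give up on a signal */
                    break;
            schedule();
    }
    finish_wait(&wq, &wait);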
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4048e92aa04f..714afad46539 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -9,7 +9,7 @@
9 * Derived from the taskqueue/keventd code by: 9 * Derived from the taskqueue/keventd code by:
10 * 10 *
11 * David Woodhouse <dwmw2@infradead.org> 11 * David Woodhouse <dwmw2@infradead.org>
12 * Andrew Morton <andrewm@uow.edu.au> 12 * Andrew Morton
13 * Kai Petzke <wpp@marie.physik.tu-berlin.de> 13 * Kai Petzke <wpp@marie.physik.tu-berlin.de>
14 * Theodore Ts'o <tytso@mit.edu> 14 * Theodore Ts'o <tytso@mit.edu>
15 * 15 *