Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.preempt      |  15
-rw-r--r--  kernel/audit.c              |  19
-rw-r--r--  kernel/auditsc.c            |   2
-rw-r--r--  kernel/cgroup.c             |   4
-rw-r--r--  kernel/cpuset.c             |   4
-rw-r--r--  kernel/exit.c               |  98
-rw-r--r--  kernel/kprobes.c            |  52
-rw-r--r--  kernel/lockdep.c            |   8
-rw-r--r--  kernel/marker.c             |   9
-rw-r--r--  kernel/module.c             |  24
-rw-r--r--  kernel/power/Kconfig        |   2
-rw-r--r--  kernel/power/process.c      |  29
-rw-r--r--  kernel/power/snapshot.c     |  41
-rw-r--r--  kernel/printk.c             |   2
-rw-r--r--  kernel/rcupreempt.c         | 233
-rw-r--r--  kernel/res_counter.c        |   1
-rw-r--r--  kernel/sched.c              | 382
-rw-r--r--  kernel/sched_fair.c         | 228
-rw-r--r--  kernel/sched_rt.c           |  10
-rw-r--r--  kernel/signal.c             |  16
-rw-r--r--  kernel/softirq.c            |   1
-rw-r--r--  kernel/softlockup.c         |  13
-rw-r--r--  kernel/sysctl.c             |  18
-rw-r--r--  kernel/time/ntp.c           |  23
-rw-r--r--  kernel/time/tick-sched.c    |   5
-rw-r--r--  kernel/time/timekeeping.c   |   6
26 files changed, 670 insertions(+), 575 deletions(-)
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 0669b70fa6a3..9fdba03dc1fc 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,8 +52,23 @@ config PREEMPT
52 52
53endchoice 53endchoice
54 54
55config PREEMPT_RCU
56 bool "Preemptible RCU"
57 depends on PREEMPT
58 default n
59 help
60 This option reduces the latency of the kernel by making certain
61 RCU sections preemptible. Normally RCU code is non-preemptible, if
62 this option is selected then read-only RCU sections become
63 preemptible. This helps latency, but may expose bugs due to
64 now-naive assumptions about each RCU read-side critical section
65 remaining on a given CPU through its execution.
66
67 Say N if you are unsure.
68
55config RCU_TRACE 69config RCU_TRACE
56 bool "Enable tracing for RCU - currently stats in debugfs" 70 bool "Enable tracing for RCU - currently stats in debugfs"
71 depends on PREEMPT_RCU
57 select DEBUG_FS 72 select DEBUG_FS
58 default y 73 default y
59 help 74 help
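Editor's note: the PREEMPT_RCU help text above warns that read-side critical sections may now be preempted. As an illustrative sketch (the struct, pointer and function names below are made up, not part of this patch), this is the kind of read-side code whose assumptions change: the section may be preempted and may migrate off its CPU, but the grace period still waits for rcu_read_unlock().

#include <linux/rcupdate.h>

struct foo {
	int val;
};
static struct foo *global_foo;		/* hypothetical RCU-protected pointer */

static int read_foo_val(void)
{
	struct foo *f;
	int val = 0;

	rcu_read_lock();		/* with PREEMPT_RCU this section is preemptible */
	f = rcu_dereference(global_foo);
	if (f)
		val = f->val;
	rcu_read_unlock();		/* grace periods still wait for this point */
	return val;
}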
diff --git a/kernel/audit.c b/kernel/audit.c
index 2eeea9a14240..10c4930c2bbf 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -170,7 +170,9 @@ void audit_panic(const char *message)
170 printk(KERN_ERR "audit: %s\n", message); 170 printk(KERN_ERR "audit: %s\n", message);
171 break; 171 break;
172 case AUDIT_FAIL_PANIC: 172 case AUDIT_FAIL_PANIC:
173 panic("audit: %s\n", message); 173 /* test audit_pid since printk is always losey, why bother? */
174 if (audit_pid)
175 panic("audit: %s\n", message);
174 break; 176 break;
175 } 177 }
176} 178}
@@ -352,6 +354,7 @@ static int kauditd_thread(void *dummy)
352 if (err < 0) { 354 if (err < 0) {
353 BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ 355 BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
354 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); 356 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
357 audit_log_lost("auditd dissapeared\n");
355 audit_pid = 0; 358 audit_pid = 0;
356 } 359 }
357 } else { 360 } else {
@@ -1350,17 +1353,19 @@ void audit_log_end(struct audit_buffer *ab)
1350 if (!audit_rate_check()) { 1353 if (!audit_rate_check()) {
1351 audit_log_lost("rate limit exceeded"); 1354 audit_log_lost("rate limit exceeded");
1352 } else { 1355 } else {
1356 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
1353 if (audit_pid) { 1357 if (audit_pid) {
1354 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
1355 nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); 1358 nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
1356 skb_queue_tail(&audit_skb_queue, ab->skb); 1359 skb_queue_tail(&audit_skb_queue, ab->skb);
1357 ab->skb = NULL; 1360 ab->skb = NULL;
1358 wake_up_interruptible(&kauditd_wait); 1361 wake_up_interruptible(&kauditd_wait);
1359 } else if (printk_ratelimit()) { 1362 } else if (nlh->nlmsg_type != AUDIT_EOE) {
1360 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); 1363 if (printk_ratelimit()) {
1361 printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, ab->skb->data + NLMSG_SPACE(0)); 1364 printk(KERN_NOTICE "type=%d %s\n",
1362 } else { 1365 nlh->nlmsg_type,
1363 audit_log_lost("printk limit exceeded\n"); 1366 ab->skb->data + NLMSG_SPACE(0));
1367 } else
1368 audit_log_lost("printk limit exceeded\n");
1364 } 1369 }
1365 } 1370 }
1366 audit_buffer_free(ab); 1371 audit_buffer_free(ab);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 2087d6de67ea..782262e4107d 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1070,7 +1070,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1070 * so we can be sure nothing was lost. 1070 * so we can be sure nothing was lost.
1071 */ 1071 */
1072 if ((i == 0) && (too_long)) 1072 if ((i == 0) && (too_long))
1073 audit_log_format(*ab, "a%d_len=%ld ", arg_num, 1073 audit_log_format(*ab, "a%d_len=%zu ", arg_num,
1074 has_cntl ? 2*len : len); 1074 has_cntl ? 2*len : len);
1075 1075
1076 /* 1076 /*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d8abe996e009..e9c2fb01e89b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2232,7 +2232,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2232 2232
2233 mutex_lock(&cgroup_mutex); 2233 mutex_lock(&cgroup_mutex);
2234 2234
2235 cgrp->flags = 0;
2236 INIT_LIST_HEAD(&cgrp->sibling); 2235 INIT_LIST_HEAD(&cgrp->sibling);
2237 INIT_LIST_HEAD(&cgrp->children); 2236 INIT_LIST_HEAD(&cgrp->children);
2238 INIT_LIST_HEAD(&cgrp->css_sets); 2237 INIT_LIST_HEAD(&cgrp->css_sets);
@@ -2242,6 +2241,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2242 cgrp->root = parent->root; 2241 cgrp->root = parent->root;
2243 cgrp->top_cgroup = parent->top_cgroup; 2242 cgrp->top_cgroup = parent->top_cgroup;
2244 2243
2244 if (notify_on_release(parent))
2245 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
2246
2245 for_each_subsys(root, ss) { 2247 for_each_subsys(root, ss) {
2246 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 2248 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
2247 if (IS_ERR(css)) { 2249 if (IS_ERR(css)) {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3e296ed81d4d..a1b61f414228 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -322,8 +322,8 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
322 * Call without callback_mutex or task_lock() held. May be 322 * Call without callback_mutex or task_lock() held. May be
323 * called with or without cgroup_mutex held. Thanks in part to 323 * called with or without cgroup_mutex held. Thanks in part to
324 * 'the_top_cpuset_hack', the task's cpuset pointer will never 324 * 'the_top_cpuset_hack', the task's cpuset pointer will never
325 * be NULL. This routine also might acquire callback_mutex and 325 * be NULL. This routine also might acquire callback_mutex during
326 * current->mm->mmap_sem during call. 326 * call.
327 * 327 *
328 * Reading current->cpuset->mems_generation doesn't need task_lock 328 * Reading current->cpuset->mems_generation doesn't need task_lock
329 * to guard the current->cpuset derefence, because it is guarded 329 * to guard the current->cpuset derefence, because it is guarded
diff --git a/kernel/exit.c b/kernel/exit.c
index 506a957b665a..53872bf993fa 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -214,20 +214,19 @@ struct pid *session_of_pgrp(struct pid *pgrp)
214static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) 214static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task)
215{ 215{
216 struct task_struct *p; 216 struct task_struct *p;
217 int ret = 1;
218 217
219 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 218 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
220 if (p == ignored_task 219 if ((p == ignored_task) ||
221 || p->exit_state 220 (p->exit_state && thread_group_empty(p)) ||
222 || is_global_init(p->real_parent)) 221 is_global_init(p->real_parent))
223 continue; 222 continue;
223
224 if (task_pgrp(p->real_parent) != pgrp && 224 if (task_pgrp(p->real_parent) != pgrp &&
225 task_session(p->real_parent) == task_session(p)) { 225 task_session(p->real_parent) == task_session(p))
226 ret = 0; 226 return 0;
227 break;
228 }
229 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 227 } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
230 return ret; /* (sighing) "Often!" */ 228
229 return 1;
231} 230}
232 231
233int is_current_pgrp_orphaned(void) 232int is_current_pgrp_orphaned(void)
@@ -255,6 +254,37 @@ static int has_stopped_jobs(struct pid *pgrp)
255 return retval; 254 return retval;
256} 255}
257 256
257/*
258 * Check to see if any process groups have become orphaned as
259 * a result of our exiting, and if they have any stopped jobs,
260 * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
261 */
262static void
263kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
264{
265 struct pid *pgrp = task_pgrp(tsk);
266 struct task_struct *ignored_task = tsk;
267
268 if (!parent)
269 /* exit: our father is in a different pgrp than
270 * we are and we were the only connection outside.
271 */
272 parent = tsk->real_parent;
273 else
274 /* reparent: our child is in a different pgrp than
275 * we are, and it was the only connection outside.
276 */
277 ignored_task = NULL;
278
279 if (task_pgrp(parent) != pgrp &&
280 task_session(parent) == task_session(tsk) &&
281 will_become_orphaned_pgrp(pgrp, ignored_task) &&
282 has_stopped_jobs(pgrp)) {
283 __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
284 __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
285 }
286}
287
258/** 288/**
259 * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd 289 * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
260 * 290 *
@@ -635,22 +665,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
635 p->exit_signal != -1 && thread_group_empty(p)) 665 p->exit_signal != -1 && thread_group_empty(p))
636 do_notify_parent(p, p->exit_signal); 666 do_notify_parent(p, p->exit_signal);
637 667
638 /* 668 kill_orphaned_pgrp(p, father);
639 * process group orphan check
640 * Case ii: Our child is in a different pgrp
641 * than we are, and it was the only connection
642 * outside, so the child pgrp is now orphaned.
643 */
644 if ((task_pgrp(p) != task_pgrp(father)) &&
645 (task_session(p) == task_session(father))) {
646 struct pid *pgrp = task_pgrp(p);
647
648 if (will_become_orphaned_pgrp(pgrp, NULL) &&
649 has_stopped_jobs(pgrp)) {
650 __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
651 __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
652 }
653 }
654} 669}
655 670
656/* 671/*
@@ -735,11 +750,9 @@ static void forget_original_parent(struct task_struct *father)
735 * Send signals to all our closest relatives so that they know 750 * Send signals to all our closest relatives so that they know
736 * to properly mourn us.. 751 * to properly mourn us..
737 */ 752 */
738static void exit_notify(struct task_struct *tsk) 753static void exit_notify(struct task_struct *tsk, int group_dead)
739{ 754{
740 int state; 755 int state;
741 struct task_struct *t;
742 struct pid *pgrp;
743 756
744 /* 757 /*
745 * This does two things: 758 * This does two things:
@@ -753,25 +766,8 @@ static void exit_notify(struct task_struct *tsk)
753 exit_task_namespaces(tsk); 766 exit_task_namespaces(tsk);
754 767
755 write_lock_irq(&tasklist_lock); 768 write_lock_irq(&tasklist_lock);
756 /* 769 if (group_dead)
757 * Check to see if any process groups have become orphaned 770 kill_orphaned_pgrp(tsk->group_leader, NULL);
758 * as a result of our exiting, and if they have any stopped
759 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
760 *
761 * Case i: Our father is in a different pgrp than we are
762 * and we were the only connection outside, so our pgrp
763 * is about to become orphaned.
764 */
765 t = tsk->real_parent;
766
767 pgrp = task_pgrp(tsk);
768 if ((task_pgrp(t) != pgrp) &&
769 (task_session(t) == task_session(tsk)) &&
770 will_become_orphaned_pgrp(pgrp, tsk) &&
771 has_stopped_jobs(pgrp)) {
772 __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
773 __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
774 }
775 771
776 /* Let father know we died 772 /* Let father know we died
777 * 773 *
@@ -788,8 +784,8 @@ static void exit_notify(struct task_struct *tsk)
788 * the same after a fork. 784 * the same after a fork.
789 */ 785 */
790 if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 && 786 if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 &&
791 ( tsk->parent_exec_id != t->self_exec_id || 787 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
792 tsk->self_exec_id != tsk->parent_exec_id) 788 tsk->self_exec_id != tsk->parent_exec_id)
793 && !capable(CAP_KILL)) 789 && !capable(CAP_KILL))
794 tsk->exit_signal = SIGCHLD; 790 tsk->exit_signal = SIGCHLD;
795 791
@@ -986,7 +982,7 @@ NORET_TYPE void do_exit(long code)
986 module_put(tsk->binfmt->module); 982 module_put(tsk->binfmt->module);
987 983
988 proc_exit_connector(tsk); 984 proc_exit_connector(tsk);
989 exit_notify(tsk); 985 exit_notify(tsk, group_dead);
990#ifdef CONFIG_NUMA 986#ifdef CONFIG_NUMA
991 mpol_free(tsk->mempolicy); 987 mpol_free(tsk->mempolicy);
992 tsk->mempolicy = NULL; 988 tsk->mempolicy = NULL;
@@ -1382,7 +1378,7 @@ unlock_sig:
1382 if (!retval && infop) 1378 if (!retval && infop)
1383 retval = put_user(0, &infop->si_errno); 1379 retval = put_user(0, &infop->si_errno);
1384 if (!retval && infop) 1380 if (!retval && infop)
1385 retval = put_user(why, &infop->si_code); 1381 retval = put_user((short)why, &infop->si_code);
1386 if (!retval && infop) 1382 if (!retval && infop)
1387 retval = put_user(exit_code, &infop->si_status); 1383 retval = put_user(exit_code, &infop->si_status);
1388 if (!retval && infop) 1384 if (!retval && infop)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 7a86e6432338..fcfb580c3afc 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -498,27 +498,36 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
498 return 0; 498 return 0;
499} 499}
500 500
501/*
502 * If we have a symbol_name argument, look it up and add the offset field
503 * to it. This way, we can specify a relative address to a symbol.
504 */
505static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
506{
507 kprobe_opcode_t *addr = p->addr;
508 if (p->symbol_name) {
509 if (addr)
510 return NULL;
511 kprobe_lookup_name(p->symbol_name, addr);
512 }
513
514 if (!addr)
515 return NULL;
516 return (kprobe_opcode_t *)(((char *)addr) + p->offset);
517}
518
501static int __kprobes __register_kprobe(struct kprobe *p, 519static int __kprobes __register_kprobe(struct kprobe *p,
502 unsigned long called_from) 520 unsigned long called_from)
503{ 521{
504 int ret = 0; 522 int ret = 0;
505 struct kprobe *old_p; 523 struct kprobe *old_p;
506 struct module *probed_mod; 524 struct module *probed_mod;
525 kprobe_opcode_t *addr;
507 526
508 /* 527 addr = kprobe_addr(p);
509 * If we have a symbol_name argument look it up, 528 if (!addr)
510 * and add it to the address. That way the addr
511 * field can either be global or relative to a symbol.
512 */
513 if (p->symbol_name) {
514 if (p->addr)
515 return -EINVAL;
516 kprobe_lookup_name(p->symbol_name, p->addr);
517 }
518
519 if (!p->addr)
520 return -EINVAL; 529 return -EINVAL;
521 p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset); 530 p->addr = addr;
522 531
523 if (!kernel_text_address((unsigned long) p->addr) || 532 if (!kernel_text_address((unsigned long) p->addr) ||
524 in_kprobes_functions((unsigned long) p->addr)) 533 in_kprobes_functions((unsigned long) p->addr))
@@ -678,8 +687,7 @@ void __kprobes unregister_jprobe(struct jprobe *jp)
678 unregister_kprobe(&jp->kp); 687 unregister_kprobe(&jp->kp);
679} 688}
680 689
681#ifdef ARCH_SUPPORTS_KRETPROBES 690#ifdef CONFIG_KRETPROBES
682
683/* 691/*
684 * This kprobe pre_handler is registered with every kretprobe. When probe 692 * This kprobe pre_handler is registered with every kretprobe. When probe
685 * hits it will set up the return probe. 693 * hits it will set up the return probe.
@@ -722,12 +730,12 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
722 int ret = 0; 730 int ret = 0;
723 struct kretprobe_instance *inst; 731 struct kretprobe_instance *inst;
724 int i; 732 int i;
725 void *addr = rp->kp.addr; 733 void *addr;
726 734
727 if (kretprobe_blacklist_size) { 735 if (kretprobe_blacklist_size) {
728 if (addr == NULL) 736 addr = kprobe_addr(&rp->kp);
729 kprobe_lookup_name(rp->kp.symbol_name, addr); 737 if (!addr)
730 addr += rp->kp.offset; 738 return -EINVAL;
731 739
732 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { 740 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
733 if (kretprobe_blacklist[i].addr == addr) 741 if (kretprobe_blacklist[i].addr == addr)
@@ -769,8 +777,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
769 return ret; 777 return ret;
770} 778}
771 779
772#else /* ARCH_SUPPORTS_KRETPROBES */ 780#else /* CONFIG_KRETPROBES */
773
774int __kprobes register_kretprobe(struct kretprobe *rp) 781int __kprobes register_kretprobe(struct kretprobe *rp)
775{ 782{
776 return -ENOSYS; 783 return -ENOSYS;
@@ -781,8 +788,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
781{ 788{
782 return 0; 789 return 0;
783} 790}
784 791#endif /* CONFIG_KRETPROBES */
785#endif /* ARCH_SUPPORTS_KRETPROBES */
786 792
787void __kprobes unregister_kretprobe(struct kretprobe *rp) 793void __kprobes unregister_kretprobe(struct kretprobe *rp)
788{ 794{
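Editor's note: the new kprobe_addr() helper above centralizes symbol resolution for both kprobes and kretprobes. A minimal usage sketch (the probed symbol and handler below are assumptions, not part of this patch): a probe supplies either ->addr or ->symbol_name plus ->offset, and __register_kprobe() now rejects the ambiguous case where both are set with -EINVAL.

#include <linux/kprobes.h>

static int my_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	/* runs just before the probed instruction executes */
	return 0;
}

static struct kprobe my_probe = {
	.symbol_name	= "do_fork",	/* resolved via kprobe_lookup_name() */
	.offset		= 0,
	.pre_handler	= my_pre_handler,
	/* .addr is left NULL; setting both .addr and .symbol_name fails */
};

/* register_kprobe(&my_probe) then resolves the address through kprobe_addr(). */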
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 3574379f4d62..81a4e4a3f087 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -779,6 +779,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
779 * parallel walking of the hash-list safe: 779 * parallel walking of the hash-list safe:
780 */ 780 */
781 list_add_tail_rcu(&class->hash_entry, hash_head); 781 list_add_tail_rcu(&class->hash_entry, hash_head);
782 /*
783 * Add it to the global list of classes:
784 */
785 list_add_tail_rcu(&class->lock_entry, &all_lock_classes);
782 786
783 if (verbose(class)) { 787 if (verbose(class)) {
784 graph_unlock(); 788 graph_unlock();
@@ -2282,10 +2286,6 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2282 return 0; 2286 return 0;
2283 break; 2287 break;
2284 case LOCK_USED: 2288 case LOCK_USED:
2285 /*
2286 * Add it to the global list of classes:
2287 */
2288 list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes);
2289 debug_atomic_dec(&nr_unused_locks); 2289 debug_atomic_dec(&nr_unused_locks);
2290 break; 2290 break;
2291 default: 2291 default:
diff --git a/kernel/marker.c b/kernel/marker.c
index 50effc01d9a2..48a4ea5afffd 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -698,14 +698,12 @@ int marker_probe_unregister(const char *name,
698{ 698{
699 struct marker_entry *entry; 699 struct marker_entry *entry;
700 struct marker_probe_closure *old; 700 struct marker_probe_closure *old;
701 int ret = 0; 701 int ret = -ENOENT;
702 702
703 mutex_lock(&markers_mutex); 703 mutex_lock(&markers_mutex);
704 entry = get_marker(name); 704 entry = get_marker(name);
705 if (!entry) { 705 if (!entry)
706 ret = -ENOENT;
707 goto end; 706 goto end;
708 }
709 if (entry->rcu_pending) 707 if (entry->rcu_pending)
710 rcu_barrier(); 708 rcu_barrier();
711 old = marker_entry_remove_probe(entry, probe, probe_private); 709 old = marker_entry_remove_probe(entry, probe, probe_private);
@@ -713,12 +711,15 @@ int marker_probe_unregister(const char *name,
713 marker_update_probes(); /* may update entry */ 711 marker_update_probes(); /* may update entry */
714 mutex_lock(&markers_mutex); 712 mutex_lock(&markers_mutex);
715 entry = get_marker(name); 713 entry = get_marker(name);
714 if (!entry)
715 goto end;
716 entry->oldptr = old; 716 entry->oldptr = old;
717 entry->rcu_pending = 1; 717 entry->rcu_pending = 1;
718 /* write rcu_pending before calling the RCU callback */ 718 /* write rcu_pending before calling the RCU callback */
719 smp_wmb(); 719 smp_wmb();
720 call_rcu(&entry->rcu, free_old_closure); 720 call_rcu(&entry->rcu, free_old_closure);
721 remove_marker(name); /* Ignore busy error message */ 721 remove_marker(name); /* Ignore busy error message */
722 ret = 0;
722end: 723end:
723 mutex_unlock(&markers_mutex); 724 mutex_unlock(&markers_mutex);
724 return ret; 725 return ret;
diff --git a/kernel/module.c b/kernel/module.c
index 901cd6ac2f11..5d437bffd8dc 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1933,8 +1933,15 @@ static struct module *load_module(void __user *umod,
1933 /* Set up license info based on the info section */ 1933 /* Set up license info based on the info section */
1934 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 1934 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
1935 1935
1936 /*
1937 * ndiswrapper is under GPL by itself, but loads proprietary modules.
1938 * Don't use add_taint_module(), as it would prevent ndiswrapper from
1939 * using GPL-only symbols it needs.
1940 */
1936 if (strcmp(mod->name, "ndiswrapper") == 0) 1941 if (strcmp(mod->name, "ndiswrapper") == 0)
1937 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 1942 add_taint(TAINT_PROPRIETARY_MODULE);
1943
1944 /* driverloader was caught wrongly pretending to be under GPL */
1938 if (strcmp(mod->name, "driverloader") == 0) 1945 if (strcmp(mod->name, "driverloader") == 0)
1939 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 1946 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
1940 1947
@@ -2171,10 +2178,20 @@ sys_init_module(void __user *umod,
2171 wake_up(&module_wq); 2178 wake_up(&module_wq);
2172 return ret; 2179 return ret;
2173 } 2180 }
2181 if (ret > 0) {
2182 printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, "
2183 "it should follow 0/-E convention\n"
2184 KERN_WARNING "%s: loading module anyway...\n",
2185 __func__, mod->name, ret,
2186 __func__);
2187 dump_stack();
2188 }
2174 2189
2175 /* Now it's a first class citizen! */ 2190 /* Now it's a first class citizen! Wake up anyone waiting for it. */
2176 mutex_lock(&module_mutex);
2177 mod->state = MODULE_STATE_LIVE; 2191 mod->state = MODULE_STATE_LIVE;
2192 wake_up(&module_wq);
2193
2194 mutex_lock(&module_mutex);
2178 /* Drop initial reference. */ 2195 /* Drop initial reference. */
2179 module_put(mod); 2196 module_put(mod);
2180 unwind_remove_table(mod->unwind_info, 1); 2197 unwind_remove_table(mod->unwind_info, 1);
@@ -2183,7 +2200,6 @@ sys_init_module(void __user *umod,
2183 mod->init_size = 0; 2200 mod->init_size = 0;
2184 mod->init_text_size = 0; 2201 mod->init_text_size = 0;
2185 mutex_unlock(&module_mutex); 2202 mutex_unlock(&module_mutex);
2186 wake_up(&module_wq);
2187 2203
2188 return 0; 2204 return 0;
2189} 2205}
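Editor's note: the new warning above fires when a module's init function returns a positive value. A short sketch of the convention being enforced (the driver name and setup helper are hypothetical): init must return 0 on success or a negative errno on failure, never a positive number.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/errno.h>

static int example_setup(void)
{
	return 0;			/* stand-in for real device/driver setup */
}

static int __init example_init(void)
{
	int err;

	err = example_setup();
	if (err)
		return err;		/* negative errno on failure */

	return 0;			/* 0 on success; a positive value triggers the warning */
}
module_init(example_init);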
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 79833170bb9c..6233f3b4ae66 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -190,7 +190,7 @@ config APM_EMULATION
190 notification of APM "events" (e.g. battery status change). 190 notification of APM "events" (e.g. battery status change).
191 191
192 In order to use APM, you will need supporting software. For location 192 In order to use APM, you will need supporting software. For location
193 and more information, read <file:Documentation/pm.txt> and the 193 and more information, read <file:Documentation/power/pm.txt> and the
194 Battery Powered Linux mini-HOWTO, available from 194 Battery Powered Linux mini-HOWTO, available from
195 <http://www.tldp.org/docs.html#howto>. 195 <http://www.tldp.org/docs.html#howto>.
196 196
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 7c2118f9597f..f1d0b345c9ba 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -75,22 +75,15 @@ void refrigerator(void)
75 __set_current_state(save); 75 __set_current_state(save);
76} 76}
77 77
78static void fake_signal_wake_up(struct task_struct *p, int resume) 78static void fake_signal_wake_up(struct task_struct *p)
79{ 79{
80 unsigned long flags; 80 unsigned long flags;
81 81
82 spin_lock_irqsave(&p->sighand->siglock, flags); 82 spin_lock_irqsave(&p->sighand->siglock, flags);
83 signal_wake_up(p, resume); 83 signal_wake_up(p, 0);
84 spin_unlock_irqrestore(&p->sighand->siglock, flags); 84 spin_unlock_irqrestore(&p->sighand->siglock, flags);
85} 85}
86 86
87static void send_fake_signal(struct task_struct *p)
88{
89 if (task_is_stopped(p))
90 force_sig_specific(SIGSTOP, p);
91 fake_signal_wake_up(p, task_is_stopped(p));
92}
93
94static int has_mm(struct task_struct *p) 87static int has_mm(struct task_struct *p)
95{ 88{
96 return (p->mm && !(p->flags & PF_BORROWED_MM)); 89 return (p->mm && !(p->flags & PF_BORROWED_MM));
@@ -121,7 +114,7 @@ static int freeze_task(struct task_struct *p, int with_mm_only)
121 if (freezing(p)) { 114 if (freezing(p)) {
122 if (has_mm(p)) { 115 if (has_mm(p)) {
123 if (!signal_pending(p)) 116 if (!signal_pending(p))
124 fake_signal_wake_up(p, 0); 117 fake_signal_wake_up(p);
125 } else { 118 } else {
126 if (with_mm_only) 119 if (with_mm_only)
127 ret = 0; 120 ret = 0;
@@ -135,7 +128,7 @@ static int freeze_task(struct task_struct *p, int with_mm_only)
135 } else { 128 } else {
136 if (has_mm(p)) { 129 if (has_mm(p)) {
137 set_freeze_flag(p); 130 set_freeze_flag(p);
138 send_fake_signal(p); 131 fake_signal_wake_up(p);
139 } else { 132 } else {
140 if (with_mm_only) { 133 if (with_mm_only) {
141 ret = 0; 134 ret = 0;
@@ -182,15 +175,17 @@ static int try_to_freeze_tasks(int freeze_user_space)
182 if (frozen(p) || !freezeable(p)) 175 if (frozen(p) || !freezeable(p))
183 continue; 176 continue;
184 177
185 if (task_is_traced(p) && frozen(p->parent)) {
186 cancel_freezing(p);
187 continue;
188 }
189
190 if (!freeze_task(p, freeze_user_space)) 178 if (!freeze_task(p, freeze_user_space))
191 continue; 179 continue;
192 180
193 if (!freezer_should_skip(p)) 181 /*
182 * Now that we've done set_freeze_flag, don't
183 * perturb a task in TASK_STOPPED or TASK_TRACED.
184 * It is "frozen enough". If the task does wake
185 * up, it will immediately call try_to_freeze.
186 */
187 if (!task_is_stopped_or_traced(p) &&
188 !freezer_should_skip(p))
194 todo++; 189 todo++;
195 } while_each_thread(g, p); 190 } while_each_thread(g, p);
196 read_unlock(&tasklist_lock); 191 read_unlock(&tasklist_lock);
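Editor's note: the comment added above relies on tasks that wake up calling try_to_freeze() themselves. As a sketch of that contract (the thread body is illustrative, not from this patch), a freezable kernel thread typically polls try_to_freeze() in its main loop.

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int example_thread(void *unused)
{
	set_freezable();			/* opt in to the freezer */
	while (!kthread_should_stop()) {
		try_to_freeze();		/* enters refrigerator() if freezing(current) */

		/* ... do one unit of work, then sleep ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}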
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 72a020cabb4c..5f91a07c4eac 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -447,7 +447,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
447 * of @bm->cur_zone_bm are updated. 447 * of @bm->cur_zone_bm are updated.
448 */ 448 */
449 449
450static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, 450static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
451 void **addr, unsigned int *bit_nr) 451 void **addr, unsigned int *bit_nr)
452{ 452{
453 struct zone_bitmap *zone_bm; 453 struct zone_bitmap *zone_bm;
@@ -461,7 +461,8 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
461 while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { 461 while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
462 zone_bm = zone_bm->next; 462 zone_bm = zone_bm->next;
463 463
464 BUG_ON(!zone_bm); 464 if (!zone_bm)
465 return -EFAULT;
465 } 466 }
466 bm->cur.zone_bm = zone_bm; 467 bm->cur.zone_bm = zone_bm;
467 } 468 }
@@ -479,23 +480,40 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
479 pfn -= bb->start_pfn; 480 pfn -= bb->start_pfn;
480 *bit_nr = pfn % BM_BITS_PER_CHUNK; 481 *bit_nr = pfn % BM_BITS_PER_CHUNK;
481 *addr = bb->data + pfn / BM_BITS_PER_CHUNK; 482 *addr = bb->data + pfn / BM_BITS_PER_CHUNK;
483 return 0;
482} 484}
483 485
484static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) 486static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
485{ 487{
486 void *addr; 488 void *addr;
487 unsigned int bit; 489 unsigned int bit;
490 int error;
488 491
489 memory_bm_find_bit(bm, pfn, &addr, &bit); 492 error = memory_bm_find_bit(bm, pfn, &addr, &bit);
493 BUG_ON(error);
490 set_bit(bit, addr); 494 set_bit(bit, addr);
491} 495}
492 496
497static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
498{
499 void *addr;
500 unsigned int bit;
501 int error;
502
503 error = memory_bm_find_bit(bm, pfn, &addr, &bit);
504 if (!error)
505 set_bit(bit, addr);
506 return error;
507}
508
493static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) 509static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
494{ 510{
495 void *addr; 511 void *addr;
496 unsigned int bit; 512 unsigned int bit;
513 int error;
497 514
498 memory_bm_find_bit(bm, pfn, &addr, &bit); 515 error = memory_bm_find_bit(bm, pfn, &addr, &bit);
516 BUG_ON(error);
499 clear_bit(bit, addr); 517 clear_bit(bit, addr);
500} 518}
501 519
@@ -503,8 +521,10 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
503{ 521{
504 void *addr; 522 void *addr;
505 unsigned int bit; 523 unsigned int bit;
524 int error;
506 525
507 memory_bm_find_bit(bm, pfn, &addr, &bit); 526 error = memory_bm_find_bit(bm, pfn, &addr, &bit);
527 BUG_ON(error);
508 return test_bit(bit, addr); 528 return test_bit(bit, addr);
509} 529}
510 530
@@ -709,8 +729,15 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
709 region->end_pfn << PAGE_SHIFT); 729 region->end_pfn << PAGE_SHIFT);
710 730
711 for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) 731 for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
712 if (pfn_valid(pfn)) 732 if (pfn_valid(pfn)) {
713 memory_bm_set_bit(bm, pfn); 733 /*
734 * It is safe to ignore the result of
735 * mem_bm_set_bit_check() here, since we won't
736 * touch the PFNs for which the error is
737 * returned anyway.
738 */
739 mem_bm_set_bit_check(bm, pfn);
740 }
714 } 741 }
715} 742}
716 743
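Editor's note: with memory_bm_find_bit() now returning an error instead of hitting BUG_ON(), the two setters split the policy between callers. An illustrative fragment, written as if inside snapshot.c (both helpers are static there, and the pfn value is arbitrary): memory_bm_set_bit() still treats an uncovered pfn as a bug, while mem_bm_set_bit_check() lets the caller ignore it, as mark_nosave_pages() does.

static void example_mark(struct memory_bitmap *bm)
{
	unsigned long pfn = 0x1234;		/* arbitrary example pfn */

	/* caller guarantees the pfn is covered; failure is a bug */
	memory_bm_set_bit(bm, pfn);

	/* pfn may fall outside every zone bitmap (e.g. a nosave region);
	 * the error is deliberately ignored, as in mark_nosave_pages() */
	if (mem_bm_set_bit_check(bm, pfn)) {
		/* pfn not covered by the bitmap: nothing to mark */
	}
}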
diff --git a/kernel/printk.c b/kernel/printk.c
index bee36100f110..9adc2a473e6e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -666,7 +666,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
666 } 666 }
667 /* Emit the output into the temporary buffer */ 667 /* Emit the output into the temporary buffer */
668 printed_len += vscnprintf(printk_buf + printed_len, 668 printed_len += vscnprintf(printk_buf + printed_len,
669 sizeof(printk_buf), fmt, args); 669 sizeof(printk_buf) - printed_len, fmt, args);
670 670
671 /* 671 /*
672 * Copy the output into log_buf. If the caller didn't provide 672 * Copy the output into log_buf. If the caller didn't provide
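Editor's note: the one-line fix above matters because printed_len may already be non-zero when vscnprintf() runs (a loglevel prefix or the recursion notice is emitted first), so the remaining space, not the full buffer size, must be passed. A minimal illustration of the append pattern (buffer and strings are hypothetical), using the kernel's scnprintf(), which returns the number of characters actually written:

#include <linux/kernel.h>

static void example_append(void)
{
	char buf[32];
	int off = 0;

	off += scnprintf(buf + off, sizeof(buf) - off, "<4>");		/* prefix */
	off += scnprintf(buf + off, sizeof(buf) - off, "hello %d\n", 42);

	/*
	 * Passing sizeof(buf) instead of sizeof(buf) - off to the second
	 * call would let it run past the end of buf once off > 0, which
	 * is exactly what the fix above prevents for printk_buf.
	 */
	printk(KERN_DEBUG "%s", buf);
}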
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 987cfb7ade89..e9517014b57c 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -23,6 +23,10 @@
23 * to Suparna Bhattacharya for pushing me completely away 23 * to Suparna Bhattacharya for pushing me completely away
24 * from atomic instructions on the read side. 24 * from atomic instructions on the read side.
25 * 25 *
26 * - Added handling of Dynamic Ticks
27 * Copyright 2007 - Paul E. Mckenney <paulmck@us.ibm.com>
28 * - Steven Rostedt <srostedt@redhat.com>
29 *
26 * Papers: http://www.rdrop.com/users/paulmck/RCU 30 * Papers: http://www.rdrop.com/users/paulmck/RCU
27 * 31 *
28 * Design Document: http://lwn.net/Articles/253651/ 32 * Design Document: http://lwn.net/Articles/253651/
@@ -409,6 +413,212 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp)
409 } 413 }
410} 414}
411 415
416#ifdef CONFIG_NO_HZ
417
418DEFINE_PER_CPU(long, dynticks_progress_counter) = 1;
419static DEFINE_PER_CPU(long, rcu_dyntick_snapshot);
420static DEFINE_PER_CPU(int, rcu_update_flag);
421
422/**
423 * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
424 *
425 * If the CPU was idle with dynamic ticks active, this updates the
426 * dynticks_progress_counter to let the RCU handling know that the
427 * CPU is active.
428 */
429void rcu_irq_enter(void)
430{
431 int cpu = smp_processor_id();
432
433 if (per_cpu(rcu_update_flag, cpu))
434 per_cpu(rcu_update_flag, cpu)++;
435
436 /*
437 * Only update if we are coming from a stopped ticks mode
438 * (dynticks_progress_counter is even).
439 */
440 if (!in_interrupt() &&
441 (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) {
442 /*
443 * The following might seem like we could have a race
444 * with NMI/SMIs. But this really isn't a problem.
445 * Here we do a read/modify/write, and the race happens
446 * when an NMI/SMI comes in after the read and before
447 * the write. But NMI/SMIs will increment this counter
448 * twice before returning, so the zero bit will not
449 * be corrupted by the NMI/SMI which is the most important
450 * part.
451 *
452 * The only thing is that we would bring back the counter
453 * to a postion that it was in during the NMI/SMI.
454 * But the zero bit would be set, so the rest of the
455 * counter would again be ignored.
456 *
457 * On return from the IRQ, the counter may have the zero
458 * bit be 0 and the counter the same as the return from
459 * the NMI/SMI. If the state machine was so unlucky to
460 * see that, it still doesn't matter, since all
461 * RCU read-side critical sections on this CPU would
462 * have already completed.
463 */
464 per_cpu(dynticks_progress_counter, cpu)++;
465 /*
466 * The following memory barrier ensures that any
467 * rcu_read_lock() primitives in the irq handler
468 * are seen by other CPUs to follow the above
469 * increment to dynticks_progress_counter. This is
470 * required in order for other CPUs to correctly
471 * determine when it is safe to advance the RCU
472 * grace-period state machine.
473 */
474 smp_mb(); /* see above block comment. */
475 /*
476 * Since we can't determine the dynamic tick mode from
477 * the dynticks_progress_counter after this routine,
478 * we use a second flag to acknowledge that we came
479 * from an idle state with ticks stopped.
480 */
481 per_cpu(rcu_update_flag, cpu)++;
482 /*
483 * If we take an NMI/SMI now, they will also increment
484 * the rcu_update_flag, and will not update the
485 * dynticks_progress_counter on exit. That is for
486 * this IRQ to do.
487 */
488 }
489}
490
491/**
492 * rcu_irq_exit - Called from exiting Hard irq context.
493 *
494 * If the CPU was idle with dynamic ticks active, update the
495 * dynticks_progress_counter to put let the RCU handling be
496 * aware that the CPU is going back to idle with no ticks.
497 */
498void rcu_irq_exit(void)
499{
500 int cpu = smp_processor_id();
501
502 /*
503 * rcu_update_flag is set if we interrupted the CPU
504 * when it was idle with ticks stopped.
505 * Once this occurs, we keep track of interrupt nesting
506 * because a NMI/SMI could also come in, and we still
507 * only want the IRQ that started the increment of the
508 * dynticks_progress_counter to be the one that modifies
509 * it on exit.
510 */
511 if (per_cpu(rcu_update_flag, cpu)) {
512 if (--per_cpu(rcu_update_flag, cpu))
513 return;
514
515 /* This must match the interrupt nesting */
516 WARN_ON(in_interrupt());
517
518 /*
519 * If an NMI/SMI happens now we are still
520 * protected by the dynticks_progress_counter being odd.
521 */
522
523 /*
524 * The following memory barrier ensures that any
525 * rcu_read_unlock() primitives in the irq handler
526 * are seen by other CPUs to preceed the following
527 * increment to dynticks_progress_counter. This
528 * is required in order for other CPUs to determine
529 * when it is safe to advance the RCU grace-period
530 * state machine.
531 */
532 smp_mb(); /* see above block comment. */
533 per_cpu(dynticks_progress_counter, cpu)++;
534 WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1);
535 }
536}
537
538static void dyntick_save_progress_counter(int cpu)
539{
540 per_cpu(rcu_dyntick_snapshot, cpu) =
541 per_cpu(dynticks_progress_counter, cpu);
542}
543
544static inline int
545rcu_try_flip_waitack_needed(int cpu)
546{
547 long curr;
548 long snap;
549
550 curr = per_cpu(dynticks_progress_counter, cpu);
551 snap = per_cpu(rcu_dyntick_snapshot, cpu);
552 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
553
554 /*
555 * If the CPU remained in dynticks mode for the entire time
556 * and didn't take any interrupts, NMIs, SMIs, or whatever,
557 * then it cannot be in the middle of an rcu_read_lock(), so
558 * the next rcu_read_lock() it executes must use the new value
559 * of the counter. So we can safely pretend that this CPU
560 * already acknowledged the counter.
561 */
562
563 if ((curr == snap) && ((curr & 0x1) == 0))
564 return 0;
565
566 /*
567 * If the CPU passed through or entered a dynticks idle phase with
568 * no active irq handlers, then, as above, we can safely pretend
569 * that this CPU already acknowledged the counter.
570 */
571
572 if ((curr - snap) > 2 || (snap & 0x1) == 0)
573 return 0;
574
575 /* We need this CPU to explicitly acknowledge the counter flip. */
576
577 return 1;
578}
579
580static inline int
581rcu_try_flip_waitmb_needed(int cpu)
582{
583 long curr;
584 long snap;
585
586 curr = per_cpu(dynticks_progress_counter, cpu);
587 snap = per_cpu(rcu_dyntick_snapshot, cpu);
588 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
589
590 /*
591 * If the CPU remained in dynticks mode for the entire time
592 * and didn't take any interrupts, NMIs, SMIs, or whatever,
593 * then it cannot have executed an RCU read-side critical section
594 * during that time, so there is no need for it to execute a
595 * memory barrier.
596 */
597
598 if ((curr == snap) && ((curr & 0x1) == 0))
599 return 0;
600
601 /*
602 * If the CPU either entered or exited an outermost interrupt,
603 * SMI, NMI, or whatever handler, then we know that it executed
604 * a memory barrier when doing so. So we don't need another one.
605 */
606 if (curr != snap)
607 return 0;
608
609 /* We need the CPU to execute a memory barrier. */
610
611 return 1;
612}
613
614#else /* !CONFIG_NO_HZ */
615
616# define dyntick_save_progress_counter(cpu) do { } while (0)
617# define rcu_try_flip_waitack_needed(cpu) (1)
618# define rcu_try_flip_waitmb_needed(cpu) (1)
619
620#endif /* CONFIG_NO_HZ */
621
412/* 622/*
413 * Get here when RCU is idle. Decide whether we need to 623 * Get here when RCU is idle. Decide whether we need to
414 * move out of idle state, and return non-zero if so. 624 * move out of idle state, and return non-zero if so.
@@ -447,8 +657,10 @@ rcu_try_flip_idle(void)
447 657
448 /* Now ask each CPU for acknowledgement of the flip. */ 658 /* Now ask each CPU for acknowledgement of the flip. */
449 659
450 for_each_cpu_mask(cpu, rcu_cpu_online_map) 660 for_each_cpu_mask(cpu, rcu_cpu_online_map) {
451 per_cpu(rcu_flip_flag, cpu) = rcu_flipped; 661 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
662 dyntick_save_progress_counter(cpu);
663 }
452 664
453 return 1; 665 return 1;
454} 666}
@@ -464,7 +676,8 @@ rcu_try_flip_waitack(void)
464 676
465 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); 677 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
466 for_each_cpu_mask(cpu, rcu_cpu_online_map) 678 for_each_cpu_mask(cpu, rcu_cpu_online_map)
467 if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { 679 if (rcu_try_flip_waitack_needed(cpu) &&
680 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
468 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); 681 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
469 return 0; 682 return 0;
470 } 683 }
@@ -509,8 +722,10 @@ rcu_try_flip_waitzero(void)
509 smp_mb(); /* ^^^^^^^^^^^^ */ 722 smp_mb(); /* ^^^^^^^^^^^^ */
510 723
511 /* Call for a memory barrier from each CPU. */ 724 /* Call for a memory barrier from each CPU. */
512 for_each_cpu_mask(cpu, rcu_cpu_online_map) 725 for_each_cpu_mask(cpu, rcu_cpu_online_map) {
513 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; 726 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
727 dyntick_save_progress_counter(cpu);
728 }
514 729
515 RCU_TRACE_ME(rcupreempt_trace_try_flip_z2); 730 RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
516 return 1; 731 return 1;
@@ -528,7 +743,8 @@ rcu_try_flip_waitmb(void)
528 743
529 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); 744 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
530 for_each_cpu_mask(cpu, rcu_cpu_online_map) 745 for_each_cpu_mask(cpu, rcu_cpu_online_map)
531 if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { 746 if (rcu_try_flip_waitmb_needed(cpu) &&
747 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
532 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); 748 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
533 return 0; 749 return 0;
534 } 750 }
@@ -702,8 +918,9 @@ void rcu_offline_cpu(int cpu)
702 * fix. 918 * fix.
703 */ 919 */
704 920
921 local_irq_save(flags);
705 rdp = RCU_DATA_ME(); 922 rdp = RCU_DATA_ME();
706 spin_lock_irqsave(&rdp->lock, flags); 923 spin_lock(&rdp->lock);
707 *rdp->nexttail = list; 924 *rdp->nexttail = list;
708 if (list) 925 if (list)
709 rdp->nexttail = tail; 926 rdp->nexttail = tail;
@@ -735,9 +952,11 @@ static void rcu_process_callbacks(struct softirq_action *unused)
735{ 952{
736 unsigned long flags; 953 unsigned long flags;
737 struct rcu_head *next, *list; 954 struct rcu_head *next, *list;
738 struct rcu_data *rdp = RCU_DATA_ME(); 955 struct rcu_data *rdp;
739 956
740 spin_lock_irqsave(&rdp->lock, flags); 957 local_irq_save(flags);
958 rdp = RCU_DATA_ME();
959 spin_lock(&rdp->lock);
741 list = rdp->donelist; 960 list = rdp->donelist;
742 if (list == NULL) { 961 if (list == NULL) {
743 spin_unlock_irqrestore(&rdp->lock, flags); 962 spin_unlock_irqrestore(&rdp->lock, flags);
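Editor's note: the long comments above describe the parity protocol for dynticks_progress_counter. A compact walk-through of one CPU (the counter values are made up) may help tie rcu_irq_enter(), rcu_irq_exit() and the two *_needed() checks together.

/*
 * Example sequence on one CPU (hypothetical values):
 *
 *   counter = 4 (even)    CPU idle, ticks stopped; RCU may ignore it
 *   rcu_irq_enter():      counter -> 5 (odd), rcu_update_flag = 1
 *   nested NMI/SMI:       rcu_update_flag -> 2, then back to 1 on its exit;
 *                         the counter is left for the outermost IRQ
 *   rcu_irq_exit():       rcu_update_flag -> 0, counter -> 6 (even)
 *
 * The grace-period state machine snapshots the counter in
 * dyntick_save_progress_counter() and later applies, in effect:
 */
static int cpu_was_idle_throughout(long curr, long snap)
{
	/* even and unchanged: the CPU slept through the whole interval,
	 * so no acknowledgement or memory barrier is needed from it */
	return (curr == snap) && ((curr & 0x1) == 0);
}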
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 16cbec2d5d60..efbfc0fc232f 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -113,6 +113,7 @@ ssize_t res_counter_write(struct res_counter *counter, int member,
113 113
114 ret = -EINVAL; 114 ret = -EINVAL;
115 115
116 strstrip(buf);
116 if (write_strategy) { 117 if (write_strategy) {
117 if (write_strategy(buf, &tmp)) { 118 if (write_strategy(buf, &tmp)) {
118 goto out_free; 119 goto out_free;
diff --git a/kernel/sched.c b/kernel/sched.c
index b387a8de26a5..d1ad69b270ca 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -174,41 +174,6 @@ struct task_group {
174 struct sched_entity **se; 174 struct sched_entity **se;
175 /* runqueue "owned" by this group on each cpu */ 175 /* runqueue "owned" by this group on each cpu */
176 struct cfs_rq **cfs_rq; 176 struct cfs_rq **cfs_rq;
177
178 /*
179 * shares assigned to a task group governs how much of cpu bandwidth
180 * is allocated to the group. The more shares a group has, the more is
181 * the cpu bandwidth allocated to it.
182 *
183 * For ex, lets say that there are three task groups, A, B and C which
184 * have been assigned shares 1000, 2000 and 3000 respectively. Then,
185 * cpu bandwidth allocated by the scheduler to task groups A, B and C
186 * should be:
187 *
188 * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
189 * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
190 * Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
191 *
192 * The weight assigned to a task group's schedulable entities on every
193 * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
194 * group's shares. For ex: lets say that task group A has been
195 * assigned shares of 1000 and there are two CPUs in a system. Then,
196 *
197 * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
198 *
199 * Note: It's not necessary that each of a task's group schedulable
200 * entity have the same weight on all CPUs. If the group
201 * has 2 of its tasks on CPU0 and 1 task on CPU1, then a
202 * better distribution of weight could be:
203 *
204 * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
205 * tg_A->se[1]->load.weight = 1/2 * 2000 = 667
206 *
207 * rebalance_shares() is responsible for distributing the shares of a
208 * task groups like this among the group's schedulable entities across
209 * cpus.
210 *
211 */
212 unsigned long shares; 177 unsigned long shares;
213#endif 178#endif
214 179
@@ -250,22 +215,12 @@ static DEFINE_SPINLOCK(task_group_lock);
250static DEFINE_MUTEX(doms_cur_mutex); 215static DEFINE_MUTEX(doms_cur_mutex);
251 216
252#ifdef CONFIG_FAIR_GROUP_SCHED 217#ifdef CONFIG_FAIR_GROUP_SCHED
253#ifdef CONFIG_SMP
254/* kernel thread that runs rebalance_shares() periodically */
255static struct task_struct *lb_monitor_task;
256static int load_balance_monitor(void *unused);
257#endif
258
259static void set_se_shares(struct sched_entity *se, unsigned long shares);
260
261#ifdef CONFIG_USER_SCHED 218#ifdef CONFIG_USER_SCHED
262# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 219# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
263#else 220#else
264# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 221# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
265#endif 222#endif
266 223
267#define MIN_GROUP_SHARES 2
268
269static int init_task_group_load = INIT_TASK_GROUP_LOAD; 224static int init_task_group_load = INIT_TASK_GROUP_LOAD;
270#endif 225#endif
271 226
@@ -346,7 +301,7 @@ struct cfs_rq {
346 /* 'curr' points to currently running entity on this cfs_rq. 301 /* 'curr' points to currently running entity on this cfs_rq.
347 * It is set to NULL otherwise (i.e when none are currently running). 302 * It is set to NULL otherwise (i.e when none are currently running).
348 */ 303 */
349 struct sched_entity *curr; 304 struct sched_entity *curr, *next;
350 305
351 unsigned long nr_spread_over; 306 unsigned long nr_spread_over;
352 307
@@ -668,6 +623,8 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
668 */ 623 */
669unsigned int sysctl_sched_rt_period = 1000000; 624unsigned int sysctl_sched_rt_period = 1000000;
670 625
626static __read_mostly int scheduler_running;
627
671/* 628/*
672 * part of the period that we allow rt tasks to run in us. 629 * part of the period that we allow rt tasks to run in us.
673 * default: 0.95s 630 * default: 0.95s
@@ -689,14 +646,16 @@ unsigned long long cpu_clock(int cpu)
689 unsigned long flags; 646 unsigned long flags;
690 struct rq *rq; 647 struct rq *rq;
691 648
692 local_irq_save(flags);
693 rq = cpu_rq(cpu);
694 /* 649 /*
695 * Only call sched_clock() if the scheduler has already been 650 * Only call sched_clock() if the scheduler has already been
696 * initialized (some code might call cpu_clock() very early): 651 * initialized (some code might call cpu_clock() very early):
697 */ 652 */
698 if (rq->idle) 653 if (unlikely(!scheduler_running))
699 update_rq_clock(rq); 654 return 0;
655
656 local_irq_save(flags);
657 rq = cpu_rq(cpu);
658 update_rq_clock(rq);
700 now = rq->clock; 659 now = rq->clock;
701 local_irq_restore(flags); 660 local_irq_restore(flags);
702 661
@@ -1125,7 +1084,7 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1125 u64 tmp; 1084 u64 tmp;
1126 1085
1127 if (unlikely(!lw->inv_weight)) 1086 if (unlikely(!lw->inv_weight))
1128 lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1; 1087 lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1);
1129 1088
1130 tmp = (u64)delta_exec * weight; 1089 tmp = (u64)delta_exec * weight;
1131 /* 1090 /*
@@ -1149,11 +1108,13 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
1149static inline void update_load_add(struct load_weight *lw, unsigned long inc) 1108static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1150{ 1109{
1151 lw->weight += inc; 1110 lw->weight += inc;
1111 lw->inv_weight = 0;
1152} 1112}
1153 1113
1154static inline void update_load_sub(struct load_weight *lw, unsigned long dec) 1114static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1155{ 1115{
1156 lw->weight -= dec; 1116 lw->weight -= dec;
1117 lw->inv_weight = 0;
1157} 1118}
1158 1119
1159/* 1120/*
@@ -1241,16 +1202,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1241static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1202static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1242#endif 1203#endif
1243 1204
1244static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1245{
1246 update_load_add(&rq->load, load);
1247}
1248
1249static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1250{
1251 update_load_sub(&rq->load, load);
1252}
1253
1254#ifdef CONFIG_SMP 1205#ifdef CONFIG_SMP
1255static unsigned long source_load(int cpu, int type); 1206static unsigned long source_load(int cpu, int type);
1256static unsigned long target_load(int cpu, int type); 1207static unsigned long target_load(int cpu, int type);
@@ -1268,14 +1219,26 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1268 1219
1269#define sched_class_highest (&rt_sched_class) 1220#define sched_class_highest (&rt_sched_class)
1270 1221
1271static void inc_nr_running(struct rq *rq) 1222static inline void inc_load(struct rq *rq, const struct task_struct *p)
1223{
1224 update_load_add(&rq->load, p->se.load.weight);
1225}
1226
1227static inline void dec_load(struct rq *rq, const struct task_struct *p)
1228{
1229 update_load_sub(&rq->load, p->se.load.weight);
1230}
1231
1232static void inc_nr_running(struct task_struct *p, struct rq *rq)
1272{ 1233{
1273 rq->nr_running++; 1234 rq->nr_running++;
1235 inc_load(rq, p);
1274} 1236}
1275 1237
1276static void dec_nr_running(struct rq *rq) 1238static void dec_nr_running(struct task_struct *p, struct rq *rq)
1277{ 1239{
1278 rq->nr_running--; 1240 rq->nr_running--;
1241 dec_load(rq, p);
1279} 1242}
1280 1243
1281static void set_load_weight(struct task_struct *p) 1244static void set_load_weight(struct task_struct *p)
@@ -1367,7 +1330,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1367 rq->nr_uninterruptible--; 1330 rq->nr_uninterruptible--;
1368 1331
1369 enqueue_task(rq, p, wakeup); 1332 enqueue_task(rq, p, wakeup);
1370 inc_nr_running(rq); 1333 inc_nr_running(p, rq);
1371} 1334}
1372 1335
1373/* 1336/*
@@ -1379,7 +1342,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1379 rq->nr_uninterruptible++; 1342 rq->nr_uninterruptible++;
1380 1343
1381 dequeue_task(rq, p, sleep); 1344 dequeue_task(rq, p, sleep);
1382 dec_nr_running(rq); 1345 dec_nr_running(p, rq);
1383} 1346}
1384 1347
1385/** 1348/**
@@ -2019,7 +1982,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2019 * management (if any): 1982 * management (if any):
2020 */ 1983 */
2021 p->sched_class->task_new(rq, p); 1984 p->sched_class->task_new(rq, p);
2022 inc_nr_running(rq); 1985 inc_nr_running(p, rq);
2023 } 1986 }
2024 check_preempt_curr(rq, p); 1987 check_preempt_curr(rq, p);
2025#ifdef CONFIG_SMP 1988#ifdef CONFIG_SMP
@@ -3885,7 +3848,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
3885asmlinkage void __sched schedule(void) 3848asmlinkage void __sched schedule(void)
3886{ 3849{
3887 struct task_struct *prev, *next; 3850 struct task_struct *prev, *next;
3888 long *switch_count; 3851 unsigned long *switch_count;
3889 struct rq *rq; 3852 struct rq *rq;
3890 int cpu; 3853 int cpu;
3891 3854
@@ -4307,11 +4270,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4307 oldprio = p->prio; 4270 oldprio = p->prio;
4308 on_rq = p->se.on_rq; 4271 on_rq = p->se.on_rq;
4309 running = task_current(rq, p); 4272 running = task_current(rq, p);
4310 if (on_rq) { 4273 if (on_rq)
4311 dequeue_task(rq, p, 0); 4274 dequeue_task(rq, p, 0);
4312 if (running) 4275 if (running)
4313 p->sched_class->put_prev_task(rq, p); 4276 p->sched_class->put_prev_task(rq, p);
4314 }
4315 4277
4316 if (rt_prio(prio)) 4278 if (rt_prio(prio))
4317 p->sched_class = &rt_sched_class; 4279 p->sched_class = &rt_sched_class;
@@ -4320,10 +4282,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4320 4282
4321 p->prio = prio; 4283 p->prio = prio;
4322 4284
4285 if (running)
4286 p->sched_class->set_curr_task(rq);
4323 if (on_rq) { 4287 if (on_rq) {
4324 if (running)
4325 p->sched_class->set_curr_task(rq);
4326
4327 enqueue_task(rq, p, 0); 4288 enqueue_task(rq, p, 0);
4328 4289
4329 check_class_changed(rq, p, prev_class, oldprio, running); 4290 check_class_changed(rq, p, prev_class, oldprio, running);
@@ -4358,8 +4319,10 @@ void set_user_nice(struct task_struct *p, long nice)
4358 goto out_unlock; 4319 goto out_unlock;
4359 } 4320 }
4360 on_rq = p->se.on_rq; 4321 on_rq = p->se.on_rq;
4361 if (on_rq) 4322 if (on_rq) {
4362 dequeue_task(rq, p, 0); 4323 dequeue_task(rq, p, 0);
4324 dec_load(rq, p);
4325 }
4363 4326
4364 p->static_prio = NICE_TO_PRIO(nice); 4327 p->static_prio = NICE_TO_PRIO(nice);
4365 set_load_weight(p); 4328 set_load_weight(p);
@@ -4369,6 +4332,7 @@ void set_user_nice(struct task_struct *p, long nice)
4369 4332
4370 if (on_rq) { 4333 if (on_rq) {
4371 enqueue_task(rq, p, 0); 4334 enqueue_task(rq, p, 0);
4335 inc_load(rq, p);
4372 /* 4336 /*
4373 * If the task increased its priority or is running and 4337 * If the task increased its priority or is running and
4374 * lowered its priority, then reschedule its CPU: 4338 * lowered its priority, then reschedule its CPU:
@@ -4458,7 +4422,7 @@ int task_nice(const struct task_struct *p)
4458{ 4422{
4459 return TASK_NICE(p); 4423 return TASK_NICE(p);
4460} 4424}
4461EXPORT_SYMBOL_GPL(task_nice); 4425EXPORT_SYMBOL(task_nice);
4462 4426
4463/** 4427/**
4464 * idle_cpu - is a given cpu idle currently? 4428 * idle_cpu - is a given cpu idle currently?
@@ -4617,19 +4581,17 @@ recheck:
4617 update_rq_clock(rq); 4581 update_rq_clock(rq);
4618 on_rq = p->se.on_rq; 4582 on_rq = p->se.on_rq;
4619 running = task_current(rq, p); 4583 running = task_current(rq, p);
4620 if (on_rq) { 4584 if (on_rq)
4621 deactivate_task(rq, p, 0); 4585 deactivate_task(rq, p, 0);
4622 if (running) 4586 if (running)
4623 p->sched_class->put_prev_task(rq, p); 4587 p->sched_class->put_prev_task(rq, p);
4624 }
4625 4588
4626 oldprio = p->prio; 4589 oldprio = p->prio;
4627 __setscheduler(rq, p, policy, param->sched_priority); 4590 __setscheduler(rq, p, policy, param->sched_priority);
4628 4591
4592 if (running)
4593 p->sched_class->set_curr_task(rq);
4629 if (on_rq) { 4594 if (on_rq) {
4630 if (running)
4631 p->sched_class->set_curr_task(rq);
4632
4633 activate_task(rq, p, 0); 4595 activate_task(rq, p, 0);
4634 4596
4635 check_class_changed(rq, p, prev_class, oldprio, running); 4597 check_class_changed(rq, p, prev_class, oldprio, running);
@@ -5136,7 +5098,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
5136 time_slice = 0; 5098 time_slice = 0;
5137 if (p->policy == SCHED_RR) { 5099 if (p->policy == SCHED_RR) {
5138 time_slice = DEF_TIMESLICE; 5100 time_slice = DEF_TIMESLICE;
5139 } else { 5101 } else if (p->policy != SCHED_FIFO) {
5140 struct sched_entity *se = &p->se; 5102 struct sched_entity *se = &p->se;
5141 unsigned long flags; 5103 unsigned long flags;
5142 struct rq *rq; 5104 struct rq *rq;
@@ -5917,7 +5879,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5917 spin_unlock_irq(&rq->lock); 5879 spin_unlock_irq(&rq->lock);
5918 break; 5880 break;
5919 5881
5920 case CPU_DOWN_PREPARE: 5882 case CPU_DYING:
5883 case CPU_DYING_FROZEN:
5921 /* Update our root-domain */ 5884 /* Update our root-domain */
5922 rq = cpu_rq(cpu); 5885 rq = cpu_rq(cpu);
5923 spin_lock_irqsave(&rq->lock, flags); 5886 spin_lock_irqsave(&rq->lock, flags);
@@ -7083,21 +7046,6 @@ void __init sched_init_smp(void)
7083 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 7046 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
7084 BUG(); 7047 BUG();
7085 sched_init_granularity(); 7048 sched_init_granularity();
7086
7087#ifdef CONFIG_FAIR_GROUP_SCHED
7088 if (nr_cpu_ids == 1)
7089 return;
7090
7091 lb_monitor_task = kthread_create(load_balance_monitor, NULL,
7092 "group_balance");
7093 if (!IS_ERR(lb_monitor_task)) {
7094 lb_monitor_task->flags |= PF_NOFREEZE;
7095 wake_up_process(lb_monitor_task);
7096 } else {
7097 printk(KERN_ERR "Could not create load balance monitor thread"
7098 "(error = %ld) \n", PTR_ERR(lb_monitor_task));
7099 }
7100#endif
7101} 7049}
7102#else 7050#else
7103void __init sched_init_smp(void) 7051void __init sched_init_smp(void)
@@ -7284,6 +7232,8 @@ void __init sched_init(void)
7284 * During early bootup we pretend to be a normal task: 7232 * During early bootup we pretend to be a normal task:
7285 */ 7233 */
7286 current->sched_class = &fair_sched_class; 7234 current->sched_class = &fair_sched_class;
7235
7236 scheduler_running = 1;
7287} 7237}
7288 7238
7289#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 7239#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -7418,157 +7368,6 @@ void set_curr_task(int cpu, struct task_struct *p)
7418 7368
7419#ifdef CONFIG_GROUP_SCHED 7369#ifdef CONFIG_GROUP_SCHED
7420 7370
7421#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
7422/*
7423 * distribute shares of all task groups among their schedulable entities,
7424 * to reflect load distribution across cpus.
7425 */
7426static int rebalance_shares(struct sched_domain *sd, int this_cpu)
7427{
7428 struct cfs_rq *cfs_rq;
7429 struct rq *rq = cpu_rq(this_cpu);
7430 cpumask_t sdspan = sd->span;
7431 int balanced = 1;
7432
7433	/* Walk through all the task groups that we have */
7434 for_each_leaf_cfs_rq(rq, cfs_rq) {
7435 int i;
7436 unsigned long total_load = 0, total_shares;
7437 struct task_group *tg = cfs_rq->tg;
7438
7439 /* Gather total task load of this group across cpus */
7440 for_each_cpu_mask(i, sdspan)
7441 total_load += tg->cfs_rq[i]->load.weight;
7442
7443 /* Nothing to do if this group has no load */
7444 if (!total_load)
7445 continue;
7446
7447 /*
7448 * tg->shares represents the number of cpu shares the task group
7449 * is eligible to hold on a single cpu. On N cpus, it is
7450 * eligible to hold (N * tg->shares) number of cpu shares.
7451 */
7452 total_shares = tg->shares * cpus_weight(sdspan);
7453
7454 /*
7455 * redistribute total_shares across cpus as per the task load
7456 * distribution.
7457 */
7458 for_each_cpu_mask(i, sdspan) {
7459 unsigned long local_load, local_shares;
7460
7461 local_load = tg->cfs_rq[i]->load.weight;
7462 local_shares = (local_load * total_shares) / total_load;
7463 if (!local_shares)
7464 local_shares = MIN_GROUP_SHARES;
7465 if (local_shares == tg->se[i]->load.weight)
7466 continue;
7467
7468 spin_lock_irq(&cpu_rq(i)->lock);
7469 set_se_shares(tg->se[i], local_shares);
7470 spin_unlock_irq(&cpu_rq(i)->lock);
7471 balanced = 0;
7472 }
7473 }
7474
7475 return balanced;
7476}
7477
7478/*
7479 * How frequently should we rebalance_shares() across cpus?
7480 *
7481 * The more frequently we rebalance shares, the more accurate is the fairness
7482 * of cpu bandwidth distribution between task groups. However higher frequency
7483 * also implies increased scheduling overhead.
7484 *
7485 * sysctl_sched_min_bal_int_shares represents the minimum interval between
7486 * consecutive calls to rebalance_shares() in the same sched domain.
7487 *
7488 * sysctl_sched_max_bal_int_shares represents the maximum interval between
7489 * consecutive calls to rebalance_shares() in the same sched domain.
7490 *
7491 * These settings allow for the appropriate trade-off between accuracy of
7492 * fairness and the associated overhead.
7493 *
7494 */
7495
7496/* default: 8ms, units: milliseconds */
7497const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
7498
7499/* default: 128ms, units: milliseconds */
7500const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
7501
7502/* kernel thread that runs rebalance_shares() periodically */
7503static int load_balance_monitor(void *unused)
7504{
7505 unsigned int timeout = sysctl_sched_min_bal_int_shares;
7506 struct sched_param schedparm;
7507 int ret;
7508
7509 /*
7510 * We don't want this thread's execution to be limited by the shares
7511 * assigned to default group (init_task_group). Hence make it run
7512 * as a SCHED_RR RT task at the lowest priority.
7513 */
7514 schedparm.sched_priority = 1;
7515 ret = sched_setscheduler(current, SCHED_RR, &schedparm);
7516 if (ret)
7517 printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
7518 " monitor thread (error = %d) \n", ret);
7519
7520 while (!kthread_should_stop()) {
7521 int i, cpu, balanced = 1;
7522
7523 /* Prevent cpus going down or coming up */
7524 get_online_cpus();
7525 /* lockout changes to doms_cur[] array */
7526 lock_doms_cur();
7527 /*
7528 * Enter a rcu read-side critical section to safely walk rq->sd
7529 * chain on various cpus and to walk task group list
7530 * (rq->leaf_cfs_rq_list) in rebalance_shares().
7531 */
7532 rcu_read_lock();
7533
7534 for (i = 0; i < ndoms_cur; i++) {
7535 cpumask_t cpumap = doms_cur[i];
7536 struct sched_domain *sd = NULL, *sd_prev = NULL;
7537
7538 cpu = first_cpu(cpumap);
7539
7540 /* Find the highest domain at which to balance shares */
7541 for_each_domain(cpu, sd) {
7542 if (!(sd->flags & SD_LOAD_BALANCE))
7543 continue;
7544 sd_prev = sd;
7545 }
7546
7547 sd = sd_prev;
7548		/* sd == NULL? No load balance required in this domain */
7549 if (!sd)
7550 continue;
7551
7552 balanced &= rebalance_shares(sd, cpu);
7553 }
7554
7555 rcu_read_unlock();
7556
7557 unlock_doms_cur();
7558 put_online_cpus();
7559
7560 if (!balanced)
7561 timeout = sysctl_sched_min_bal_int_shares;
7562 else if (timeout < sysctl_sched_max_bal_int_shares)
7563 timeout *= 2;
7564
7565 msleep_interruptible(timeout);
7566 }
7567
7568 return 0;
7569}
7570#endif /* CONFIG_SMP */
7571
7572#ifdef CONFIG_FAIR_GROUP_SCHED 7371#ifdef CONFIG_FAIR_GROUP_SCHED
7573static void free_fair_sched_group(struct task_group *tg) 7372static void free_fair_sched_group(struct task_group *tg)
7574{ 7373{
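
Note: the load_balance_monitor thread removed in the hunk above throttled itself with a doubling sleep interval, dropping back to sysctl_sched_min_bal_int_shares (8 ms) whenever shares still had to be rebalanced and otherwise doubling up to sysctl_sched_max_bal_int_shares (128 ms). A standalone sketch of that backoff policy; the constants mirror the removed defaults and the driver loop is illustrative only.

#include <stdio.h>

#define MIN_INTERVAL_MS   8	/* removed sysctl_sched_min_bal_int_shares */
#define MAX_INTERVAL_MS 128	/* removed sysctl_sched_max_bal_int_shares */

static unsigned int next_interval(unsigned int cur, int balanced)
{
	if (!balanced)
		return MIN_INTERVAL_MS;		/* work left: poll quickly */
	if (cur < MAX_INTERVAL_MS)
		cur *= 2;			/* idle: back off */
	return cur;
}

int main(void)
{
	unsigned int t = MIN_INTERVAL_MS;
	int i;

	for (i = 0; i < 5; i++) {
		t = next_interval(t, 1);
		printf("sleep %u ms\n", t);	/* 16 32 64 128 128 */
	}
	return 0;
}
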
@@ -7817,47 +7616,46 @@ void sched_move_task(struct task_struct *tsk)
7817 running = task_current(rq, tsk); 7616 running = task_current(rq, tsk);
7818 on_rq = tsk->se.on_rq; 7617 on_rq = tsk->se.on_rq;
7819 7618
7820 if (on_rq) { 7619 if (on_rq)
7821 dequeue_task(rq, tsk, 0); 7620 dequeue_task(rq, tsk, 0);
7822 if (unlikely(running)) 7621 if (unlikely(running))
7823 tsk->sched_class->put_prev_task(rq, tsk); 7622 tsk->sched_class->put_prev_task(rq, tsk);
7824 }
7825 7623
7826 set_task_rq(tsk, task_cpu(tsk)); 7624 set_task_rq(tsk, task_cpu(tsk));
7827 7625
7828 if (on_rq) { 7626#ifdef CONFIG_FAIR_GROUP_SCHED
7829 if (unlikely(running)) 7627 if (tsk->sched_class->moved_group)
7830 tsk->sched_class->set_curr_task(rq); 7628 tsk->sched_class->moved_group(tsk);
7629#endif
7630
7631 if (unlikely(running))
7632 tsk->sched_class->set_curr_task(rq);
7633 if (on_rq)
7831 enqueue_task(rq, tsk, 0); 7634 enqueue_task(rq, tsk, 0);
7832 }
7833 7635
7834 task_rq_unlock(rq, &flags); 7636 task_rq_unlock(rq, &flags);
7835} 7637}
7836 7638
7837#ifdef CONFIG_FAIR_GROUP_SCHED 7639#ifdef CONFIG_FAIR_GROUP_SCHED
7838/* rq->lock to be locked by caller */
7839static void set_se_shares(struct sched_entity *se, unsigned long shares) 7640static void set_se_shares(struct sched_entity *se, unsigned long shares)
7840{ 7641{
7841 struct cfs_rq *cfs_rq = se->cfs_rq; 7642 struct cfs_rq *cfs_rq = se->cfs_rq;
7842 struct rq *rq = cfs_rq->rq; 7643 struct rq *rq = cfs_rq->rq;
7843 int on_rq; 7644 int on_rq;
7844 7645
7845 if (!shares) 7646 spin_lock_irq(&rq->lock);
7846 shares = MIN_GROUP_SHARES;
7847 7647
7848 on_rq = se->on_rq; 7648 on_rq = se->on_rq;
7849 if (on_rq) { 7649 if (on_rq)
7850 dequeue_entity(cfs_rq, se, 0); 7650 dequeue_entity(cfs_rq, se, 0);
7851 dec_cpu_load(rq, se->load.weight);
7852 }
7853 7651
7854 se->load.weight = shares; 7652 se->load.weight = shares;
7855 se->load.inv_weight = div64_64((1ULL<<32), shares); 7653 se->load.inv_weight = div64_64((1ULL<<32), shares);
7856 7654
7857 if (on_rq) { 7655 if (on_rq)
7858 enqueue_entity(cfs_rq, se, 0); 7656 enqueue_entity(cfs_rq, se, 0);
7859 inc_cpu_load(rq, se->load.weight); 7657
7860 } 7658 spin_unlock_irq(&rq->lock);
7861} 7659}
7862 7660
7863static DEFINE_MUTEX(shares_mutex); 7661static DEFINE_MUTEX(shares_mutex);
@@ -7867,18 +7665,18 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7867 int i; 7665 int i;
7868 unsigned long flags; 7666 unsigned long flags;
7869 7667
7668 /*
7669	 * A weight of 0 or 1 can cause arithmetic problems.
7670 * (The default weight is 1024 - so there's no practical
7671 * limitation from this.)
7672 */
7673 if (shares < 2)
7674 shares = 2;
7675
7870 mutex_lock(&shares_mutex); 7676 mutex_lock(&shares_mutex);
7871 if (tg->shares == shares) 7677 if (tg->shares == shares)
7872 goto done; 7678 goto done;
7873 7679
7874 if (shares < MIN_GROUP_SHARES)
7875 shares = MIN_GROUP_SHARES;
7876
7877 /*
7878 * Prevent any load balance activity (rebalance_shares,
7879 * load_balance_fair) from referring to this group first,
7880 * by taking it off the rq->leaf_cfs_rq_list on each cpu.
7881 */
7882 spin_lock_irqsave(&task_group_lock, flags); 7680 spin_lock_irqsave(&task_group_lock, flags);
7883 for_each_possible_cpu(i) 7681 for_each_possible_cpu(i)
7884 unregister_fair_sched_group(tg, i); 7682 unregister_fair_sched_group(tg, i);
@@ -7892,11 +7690,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7892 * w/o tripping rebalance_share or load_balance_fair. 7690 * w/o tripping rebalance_share or load_balance_fair.
7893 */ 7691 */
7894 tg->shares = shares; 7692 tg->shares = shares;
7895 for_each_possible_cpu(i) { 7693 for_each_possible_cpu(i)
7896 spin_lock_irq(&cpu_rq(i)->lock);
7897 set_se_shares(tg->se[i], shares); 7694 set_se_shares(tg->se[i], shares);
7898 spin_unlock_irq(&cpu_rq(i)->lock);
7899 }
7900 7695
7901 /* 7696 /*
7902 * Enable load balance activity on this group, by inserting it back on 7697 * Enable load balance activity on this group, by inserting it back on
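
Note: the new clamp above exists because group shares feed the entity's load weight, and the scheduler caches an inverse weight of 2^32/weight (see the div64_64() in set_se_shares() further up). A weight of 0 divides by zero and a weight of 1 no longer fits a 32-bit inverse, so anything below 2 is rounded up. A standalone illustration of that arithmetic; the sample weights are arbitrary.

#include <stdio.h>
#include <stdint.h>

static uint64_t inv_weight(uint32_t weight)
{
	/* mirrors se->load.inv_weight = div64_64(1ULL << 32, shares) */
	return (1ULL << 32) / weight;
}

int main(void)
{
	printf("weight 1024 -> inv_weight %llu\n",
	       (unsigned long long)inv_weight(1024));	/* 4194304 */
	printf("weight    2 -> inv_weight %llu\n",
	       (unsigned long long)inv_weight(2));	/* 2147483648 */
	printf("weight    1 -> inv_weight %llu\n",
	       (unsigned long long)inv_weight(1));	/* 2^32: overflows a u32 */
	/* weight 0 would be a division by zero */
	return 0;
}
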
@@ -7928,9 +7723,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
7928 if (runtime == RUNTIME_INF) 7723 if (runtime == RUNTIME_INF)
7929 return 1ULL << 16; 7724 return 1ULL << 16;
7930 7725
7931 runtime *= (1ULL << 16); 7726 return div64_64(runtime << 16, period);
7932 div64_64(runtime, period);
7933 return runtime;
7934} 7727}
7935 7728
7936static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 7729static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
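
Note: to_ratio() now returns the 16.16 fixed-point ratio directly; the old code scaled runtime and then discarded the div64_64() quotient. A quick standalone check of the arithmetic, using the usual 0.95 s of runtime per 1 s period as the example values:

#include <stdio.h>
#include <stdint.h>

static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	return (runtime << 16) / period;	/* 16.16 fixed point */
}

int main(void)
{
	/* 950000 us of runtime every 1000000 us */
	printf("%llu\n", (unsigned long long)to_ratio(1000000, 950000));
	/* prints 62259, i.e. roughly 0.95 * 65536 */
	return 0;
}
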
@@ -7954,25 +7747,40 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7954 return total + to_ratio(period, runtime) < global_ratio; 7747 return total + to_ratio(period, runtime) < global_ratio;
7955} 7748}
7956 7749
7750/* Must be called with tasklist_lock held */
7751static inline int tg_has_rt_tasks(struct task_group *tg)
7752{
7753 struct task_struct *g, *p;
7754 do_each_thread(g, p) {
7755 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
7756 return 1;
7757 } while_each_thread(g, p);
7758 return 0;
7759}
7760
7957int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 7761int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7958{ 7762{
7959 u64 rt_runtime, rt_period; 7763 u64 rt_runtime, rt_period;
7960 int err = 0; 7764 int err = 0;
7961 7765
7962 rt_period = sysctl_sched_rt_period * NSEC_PER_USEC; 7766 rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
7963 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 7767 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7964 if (rt_runtime_us == -1) 7768 if (rt_runtime_us == -1)
7965 rt_runtime = rt_period; 7769 rt_runtime = RUNTIME_INF;
7966 7770
7967 mutex_lock(&rt_constraints_mutex); 7771 mutex_lock(&rt_constraints_mutex);
7772 read_lock(&tasklist_lock);
7773 if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) {
7774 err = -EBUSY;
7775 goto unlock;
7776 }
7968 if (!__rt_schedulable(tg, rt_period, rt_runtime)) { 7777 if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
7969 err = -EINVAL; 7778 err = -EINVAL;
7970 goto unlock; 7779 goto unlock;
7971 } 7780 }
7972 if (rt_runtime_us == -1)
7973 rt_runtime = RUNTIME_INF;
7974 tg->rt_runtime = rt_runtime; 7781 tg->rt_runtime = rt_runtime;
7975 unlock: 7782 unlock:
7783 read_unlock(&tasklist_lock);
7976 mutex_unlock(&rt_constraints_mutex); 7784 mutex_unlock(&rt_constraints_mutex);
7977 7785
7978 return err; 7786 return err;
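
Note: the new tg_has_rt_tasks() check means a group's realtime runtime cannot be set to 0 while the group still contains realtime tasks; the write fails with -EBUSY. A hypothetical userspace illustration, assuming the cpu cgroup controller is mounted at /dev/cgroup and the group exposes a cpu.rt_runtime_us file (the path and file name here are assumptions, not taken from the patch).

#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* hypothetical path, see note above */
	FILE *f = fopen("/dev/cgroup/mygroup/cpu.rt_runtime_us", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "0\n");
	if (fclose(f) != 0)	/* expect EBUSY if the group has RT tasks */
		fprintf(stderr, "write rejected: %s\n", strerror(errno));
	return 0;
}
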
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6c091d6e159d..f2cc59080efa 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -175,8 +175,15 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
175 * Maintain a cache of leftmost tree entries (it is frequently 175 * Maintain a cache of leftmost tree entries (it is frequently
176 * used): 176 * used):
177 */ 177 */
178 if (leftmost) 178 if (leftmost) {
179 cfs_rq->rb_leftmost = &se->run_node; 179 cfs_rq->rb_leftmost = &se->run_node;
180 /*
181 * maintain cfs_rq->min_vruntime to be a monotonic increasing
182 * value tracking the leftmost vruntime in the tree.
183 */
184 cfs_rq->min_vruntime =
185 max_vruntime(cfs_rq->min_vruntime, se->vruntime);
186 }
180 187
181 rb_link_node(&se->run_node, parent, link); 188 rb_link_node(&se->run_node, parent, link);
182 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); 189 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
@@ -184,8 +191,24 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
184 191
185static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) 192static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
186{ 193{
187 if (cfs_rq->rb_leftmost == &se->run_node) 194 if (cfs_rq->rb_leftmost == &se->run_node) {
188 cfs_rq->rb_leftmost = rb_next(&se->run_node); 195 struct rb_node *next_node;
196 struct sched_entity *next;
197
198 next_node = rb_next(&se->run_node);
199 cfs_rq->rb_leftmost = next_node;
200
201 if (next_node) {
202 next = rb_entry(next_node,
203 struct sched_entity, run_node);
204 cfs_rq->min_vruntime =
205 max_vruntime(cfs_rq->min_vruntime,
206 next->vruntime);
207 }
208 }
209
210 if (cfs_rq->next == se)
211 cfs_rq->next = NULL;
189 212
190 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 213 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
191} 214}
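
Note: both hunks above keep cfs_rq->min_vruntime monotonically non-decreasing by folding the leftmost entity's vruntime in at enqueue/dequeue time instead of in __update_curr(). The max_vruntime()/min_vruntime() helpers used for this compare 64-bit vruntimes through a signed delta so the result stays correct across counter wraparound; a standalone sketch of that comparison (roughly the helper from sched_fair.c, simplified):

#include <stdio.h>
#include <stdint.h>

static uint64_t max_vruntime(uint64_t min_vruntime, uint64_t vruntime)
{
	int64_t delta = (int64_t)(vruntime - min_vruntime);

	if (delta > 0)		/* "later", even across u64 wraparound */
		min_vruntime = vruntime;
	return min_vruntime;
}

int main(void)
{
	uint64_t near_wrap = UINT64_MAX - 5;

	/* 3 lies a few ticks after UINT64_MAX - 5 once the clock wraps */
	printf("%llu\n", (unsigned long long)max_vruntime(near_wrap, 3));
	return 0;
}
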
@@ -202,17 +225,12 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
202 225
203static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 226static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
204{ 227{
205 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; 228 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
206 struct sched_entity *se = NULL;
207 struct rb_node *parent;
208 229
209 while (*link) { 230 if (!last)
210 parent = *link; 231 return NULL;
211 se = rb_entry(parent, struct sched_entity, run_node);
212 link = &parent->rb_right;
213 }
214 232
215 return se; 233 return rb_entry(last, struct sched_entity, run_node);
216} 234}
217 235
218/************************************************************** 236/**************************************************************
@@ -265,12 +283,8 @@ static u64 __sched_period(unsigned long nr_running)
265 */ 283 */
266static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) 284static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
267{ 285{
268 u64 slice = __sched_period(cfs_rq->nr_running); 286 return calc_delta_mine(__sched_period(cfs_rq->nr_running),
269 287 se->load.weight, &cfs_rq->load);
270 slice *= se->load.weight;
271 do_div(slice, cfs_rq->load.weight);
272
273 return slice;
274} 288}
275 289
276/* 290/*
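
Note: sched_slice() now goes through calc_delta_mine(), but the proportional split is unchanged: an entity's slice is the scheduling period scaled by its weight over the queue's total weight (calc_delta_mine just computes this with the cached inverse weight). A standalone version of the plain arithmetic, with made-up weights (1024 is the nice-0 weight):

#include <stdio.h>
#include <stdint.h>

static uint64_t slice_ns(uint64_t period_ns, unsigned long se_weight,
			 unsigned long rq_weight)
{
	return period_ns * se_weight / rq_weight;
}

int main(void)
{
	/* a 20 ms period shared by entities of weight 1024 and 2048 */
	printf("light: %llu ns\n",
	       (unsigned long long)slice_ns(20000000, 1024, 3072));
	printf("heavy: %llu ns\n",
	       (unsigned long long)slice_ns(20000000, 2048, 3072));
	return 0;
}
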
@@ -308,7 +322,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
308 unsigned long delta_exec) 322 unsigned long delta_exec)
309{ 323{
310 unsigned long delta_exec_weighted; 324 unsigned long delta_exec_weighted;
311 u64 vruntime;
312 325
313 schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); 326 schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
314 327
@@ -320,19 +333,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
320 &curr->load); 333 &curr->load);
321 } 334 }
322 curr->vruntime += delta_exec_weighted; 335 curr->vruntime += delta_exec_weighted;
323
324 /*
325 * maintain cfs_rq->min_vruntime to be a monotonic increasing
326 * value tracking the leftmost vruntime in the tree.
327 */
328 if (first_fair(cfs_rq)) {
329 vruntime = min_vruntime(curr->vruntime,
330 __pick_next_entity(cfs_rq)->vruntime);
331 } else
332 vruntime = curr->vruntime;
333
334 cfs_rq->min_vruntime =
335 max_vruntime(cfs_rq->min_vruntime, vruntime);
336} 336}
337 337
338static void update_curr(struct cfs_rq *cfs_rq) 338static void update_curr(struct cfs_rq *cfs_rq)
@@ -498,7 +498,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
498{ 498{
499 u64 vruntime; 499 u64 vruntime;
500 500
501 vruntime = cfs_rq->min_vruntime; 501 if (first_fair(cfs_rq)) {
502 vruntime = min_vruntime(cfs_rq->min_vruntime,
503 __pick_next_entity(cfs_rq)->vruntime);
504 } else
505 vruntime = cfs_rq->min_vruntime;
502 506
503 if (sched_feat(TREE_AVG)) { 507 if (sched_feat(TREE_AVG)) {
504 struct sched_entity *last = __pick_last_entity(cfs_rq); 508 struct sched_entity *last = __pick_last_entity(cfs_rq);
@@ -520,8 +524,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
520 524
521 if (!initial) { 525 if (!initial) {
522 /* sleeps up to a single latency don't count. */ 526 /* sleeps up to a single latency don't count. */
523 if (sched_feat(NEW_FAIR_SLEEPERS)) 527 if (sched_feat(NEW_FAIR_SLEEPERS)) {
524 vruntime -= sysctl_sched_latency; 528 vruntime -= calc_delta_fair(sysctl_sched_latency,
529 &cfs_rq->load);
530 }
525 531
526 /* ensure we never gain time by being placed backwards. */ 532 /* ensure we never gain time by being placed backwards. */
527 vruntime = max_vruntime(se->vruntime, vruntime); 533 vruntime = max_vruntime(se->vruntime, vruntime);
@@ -621,12 +627,32 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
621 se->prev_sum_exec_runtime = se->sum_exec_runtime; 627 se->prev_sum_exec_runtime = se->sum_exec_runtime;
622} 628}
623 629
630static struct sched_entity *
631pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
632{
633 s64 diff, gran;
634
635 if (!cfs_rq->next)
636 return se;
637
638 diff = cfs_rq->next->vruntime - se->vruntime;
639 if (diff < 0)
640 return se;
641
642 gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load);
643 if (diff > gran)
644 return se;
645
646 return cfs_rq->next;
647}
648
624static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 649static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
625{ 650{
626 struct sched_entity *se = NULL; 651 struct sched_entity *se = NULL;
627 652
628 if (first_fair(cfs_rq)) { 653 if (first_fair(cfs_rq)) {
629 se = __pick_next_entity(cfs_rq); 654 se = __pick_next_entity(cfs_rq);
655 se = pick_next(cfs_rq, se);
630 set_next_entity(cfs_rq, se); 656 set_next_entity(cfs_rq, se);
631 } 657 }
632 658
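
Note: pick_next() above prefers the cached wakeup target (cfs_rq->next, set in check_preempt_wakeup() below) over the leftmost entity, but only when its vruntime exceeds the leftmost one's by no more than the wakeup granularity. A standalone version of that decision; the vruntime and granularity numbers are made up.

#include <stdio.h>
#include <stdint.h>

/* return 1 if the wakeup buddy should run instead of the leftmost
 * entity: its vruntime must not be smaller than the leftmost one's
 * and must exceed it by at most the wakeup granularity */
static int prefer_buddy(int64_t buddy_vr, int64_t leftmost_vr, int64_t gran)
{
	int64_t diff = buddy_vr - leftmost_vr;

	return diff >= 0 && diff <= gran;
}

int main(void)
{
	printf("%d\n", prefer_buddy(1050, 1000, 100));	/* 1: close enough */
	printf("%d\n", prefer_buddy(1500, 1000, 100));	/* 0: too far behind */
	return 0;
}
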
@@ -732,8 +758,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
732 return se->parent; 758 return se->parent;
733} 759}
734 760
735#define GROUP_IMBALANCE_PCT 20
736
737#else /* CONFIG_FAIR_GROUP_SCHED */ 761#else /* CONFIG_FAIR_GROUP_SCHED */
738 762
739#define for_each_sched_entity(se) \ 763#define for_each_sched_entity(se) \
@@ -824,26 +848,15 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p)
824static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) 848static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
825{ 849{
826 struct cfs_rq *cfs_rq; 850 struct cfs_rq *cfs_rq;
827 struct sched_entity *se = &p->se, 851 struct sched_entity *se = &p->se;
828 *topse = NULL; /* Highest schedulable entity */
829 int incload = 1;
830 852
831 for_each_sched_entity(se) { 853 for_each_sched_entity(se) {
832 topse = se; 854 if (se->on_rq)
833 if (se->on_rq) {
834 incload = 0;
835 break; 855 break;
836 }
837 cfs_rq = cfs_rq_of(se); 856 cfs_rq = cfs_rq_of(se);
838 enqueue_entity(cfs_rq, se, wakeup); 857 enqueue_entity(cfs_rq, se, wakeup);
839 wakeup = 1; 858 wakeup = 1;
840 } 859 }
841 /* Increment cpu load if we just enqueued the first task of a group on
842 * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
843 * at the highest grouping level.
844 */
845 if (incload)
846 inc_cpu_load(rq, topse->load.weight);
847 860
848 hrtick_start_fair(rq, rq->curr); 861 hrtick_start_fair(rq, rq->curr);
849} 862}
@@ -856,28 +869,16 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
856static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) 869static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
857{ 870{
858 struct cfs_rq *cfs_rq; 871 struct cfs_rq *cfs_rq;
859 struct sched_entity *se = &p->se, 872 struct sched_entity *se = &p->se;
860 *topse = NULL; /* Highest schedulable entity */
861 int decload = 1;
862 873
863 for_each_sched_entity(se) { 874 for_each_sched_entity(se) {
864 topse = se;
865 cfs_rq = cfs_rq_of(se); 875 cfs_rq = cfs_rq_of(se);
866 dequeue_entity(cfs_rq, se, sleep); 876 dequeue_entity(cfs_rq, se, sleep);
867 /* Don't dequeue parent if it has other entities besides us */ 877 /* Don't dequeue parent if it has other entities besides us */
868 if (cfs_rq->load.weight) { 878 if (cfs_rq->load.weight)
869 if (parent_entity(se))
870 decload = 0;
871 break; 879 break;
872 }
873 sleep = 1; 880 sleep = 1;
874 } 881 }
875 /* Decrement cpu load if we just dequeued the last task of a group on
876 * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
877 * at the highest grouping level.
878 */
879 if (decload)
880 dec_cpu_load(rq, topse->load.weight);
881 882
882 hrtick_start_fair(rq, rq->curr); 883 hrtick_start_fair(rq, rq->curr);
883} 884}
@@ -1090,6 +1091,9 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1090 resched_task(curr); 1091 resched_task(curr);
1091 return; 1092 return;
1092 } 1093 }
1094
1095 cfs_rq_of(pse)->next = pse;
1096
1093 /* 1097 /*
1094 * Batch tasks do not preempt (their preemption is driven by 1098 * Batch tasks do not preempt (their preemption is driven by
1095 * the tick): 1099 * the tick):
@@ -1191,6 +1195,25 @@ static struct task_struct *load_balance_next_fair(void *arg)
1191 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); 1195 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
1192} 1196}
1193 1197
1198#ifdef CONFIG_FAIR_GROUP_SCHED
1199static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
1200{
1201 struct sched_entity *curr;
1202 struct task_struct *p;
1203
1204 if (!cfs_rq->nr_running || !first_fair(cfs_rq))
1205 return MAX_PRIO;
1206
1207 curr = cfs_rq->curr;
1208 if (!curr)
1209 curr = __pick_next_entity(cfs_rq);
1210
1211 p = task_of(curr);
1212
1213 return p->prio;
1214}
1215#endif
1216
1194static unsigned long 1217static unsigned long
1195load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1218load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1196 unsigned long max_load_move, 1219 unsigned long max_load_move,
@@ -1200,45 +1223,28 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1200 struct cfs_rq *busy_cfs_rq; 1223 struct cfs_rq *busy_cfs_rq;
1201 long rem_load_move = max_load_move; 1224 long rem_load_move = max_load_move;
1202 struct rq_iterator cfs_rq_iterator; 1225 struct rq_iterator cfs_rq_iterator;
1203 unsigned long load_moved;
1204 1226
1205 cfs_rq_iterator.start = load_balance_start_fair; 1227 cfs_rq_iterator.start = load_balance_start_fair;
1206 cfs_rq_iterator.next = load_balance_next_fair; 1228 cfs_rq_iterator.next = load_balance_next_fair;
1207 1229
1208 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 1230 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
1209#ifdef CONFIG_FAIR_GROUP_SCHED 1231#ifdef CONFIG_FAIR_GROUP_SCHED
1210 struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu]; 1232 struct cfs_rq *this_cfs_rq;
1211 unsigned long maxload, task_load, group_weight; 1233 long imbalance;
1212 unsigned long thisload, per_task_load; 1234 unsigned long maxload;
1213 struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu];
1214 1235
1215 task_load = busy_cfs_rq->load.weight; 1236 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
1216 group_weight = se->load.weight;
1217 1237
1218 /* 1238 imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
1219 * 'group_weight' is contributed by tasks of total weight 1239 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
1220 * 'task_load'. To move 'rem_load_move' worth of weight only, 1240 if (imbalance <= 0)
1221 * we need to move a maximum task load of:
1222 *
1223 * maxload = (remload / group_weight) * task_load;
1224 */
1225 maxload = (rem_load_move * task_load) / group_weight;
1226
1227 if (!maxload || !task_load)
1228 continue; 1241 continue;
1229 1242
1230 per_task_load = task_load / busy_cfs_rq->nr_running; 1243 /* Don't pull more than imbalance/2 */
1231 /* 1244 imbalance /= 2;
1232 * balance_tasks will try to forcibly move at least one task if 1245 maxload = min(rem_load_move, imbalance);
1233 * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if
1234 * maxload is less than GROUP_IMBALANCE_PCT% of the per_task_load.
1235 */
1236 if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load)
1237 continue;
1238 1246
1239 /* Disable priority-based load balance */ 1247 *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
1240 *this_best_prio = 0;
1241 thisload = this_cfs_rq->load.weight;
1242#else 1248#else
1243# define maxload rem_load_move 1249# define maxload rem_load_move
1244#endif 1250#endif
@@ -1247,33 +1253,11 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1247 * load_balance_[start|next]_fair iterators 1253 * load_balance_[start|next]_fair iterators
1248 */ 1254 */
1249 cfs_rq_iterator.arg = busy_cfs_rq; 1255 cfs_rq_iterator.arg = busy_cfs_rq;
1250 load_moved = balance_tasks(this_rq, this_cpu, busiest, 1256 rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
1251 maxload, sd, idle, all_pinned, 1257 maxload, sd, idle, all_pinned,
1252 this_best_prio, 1258 this_best_prio,
1253 &cfs_rq_iterator); 1259 &cfs_rq_iterator);
1254 1260
1255#ifdef CONFIG_FAIR_GROUP_SCHED
1256 /*
1257 * load_moved holds the task load that was moved. The
1258 * effective (group) weight moved would be:
1259 * load_moved_eff = load_moved/task_load * group_weight;
1260 */
1261 load_moved = (group_weight * load_moved) / task_load;
1262
1263 /* Adjust shares on both cpus to reflect load_moved */
1264 group_weight -= load_moved;
1265 set_se_shares(se, group_weight);
1266
1267 se = busy_cfs_rq->tg->se[this_cpu];
1268 if (!thisload)
1269 group_weight = load_moved;
1270 else
1271 group_weight = se->load.weight + load_moved;
1272 set_se_shares(se, group_weight);
1273#endif
1274
1275 rem_load_move -= load_moved;
1276
1277 if (rem_load_move <= 0) 1261 if (rem_load_move <= 0)
1278 break; 1262 break;
1279 } 1263 }
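
Note: the group load-balancing rule above became much simpler: compare the weight of the busiest group's cfs_rq with the local one, skip the group if the local side is already heavier, and otherwise pull at most half of the imbalance, capped by the load that still has to move. Standalone arithmetic with made-up weights:

#include <stdio.h>

static long max_pull(long busy_weight, long this_weight, long rem_load_move)
{
	long imbalance = busy_weight - this_weight;

	if (imbalance <= 0)
		return 0;		/* nothing to pull from this group */
	imbalance /= 2;
	return imbalance < rem_load_move ? imbalance : rem_load_move;
}

int main(void)
{
	printf("%ld\n", max_pull(3072, 1024, 4096));	/* -> 1024 */
	printf("%ld\n", max_pull(3072, 1024, 512));	/* -> 512  */
	printf("%ld\n", max_pull(1024, 2048, 4096));	/* -> 0    */
	return 0;
}
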
@@ -1403,6 +1387,16 @@ static void set_curr_task_fair(struct rq *rq)
1403 set_next_entity(cfs_rq_of(se), se); 1387 set_next_entity(cfs_rq_of(se), se);
1404} 1388}
1405 1389
1390#ifdef CONFIG_FAIR_GROUP_SCHED
1391static void moved_group_fair(struct task_struct *p)
1392{
1393 struct cfs_rq *cfs_rq = task_cfs_rq(p);
1394
1395 update_curr(cfs_rq);
1396 place_entity(cfs_rq, &p->se, 1);
1397}
1398#endif
1399
1406/* 1400/*
1407 * All the scheduling class methods: 1401 * All the scheduling class methods:
1408 */ 1402 */
@@ -1431,6 +1425,10 @@ static const struct sched_class fair_sched_class = {
1431 1425
1432 .prio_changed = prio_changed_fair, 1426 .prio_changed = prio_changed_fair,
1433 .switched_to = switched_to_fair, 1427 .switched_to = switched_to_fair,
1428
1429#ifdef CONFIG_FAIR_GROUP_SCHED
1430 .moved_group = moved_group_fair,
1431#endif
1434}; 1432};
1435 1433
1436#ifdef CONFIG_SCHED_DEBUG 1434#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f54792b175b2..0a6d2e516420 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -393,8 +393,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
393 */ 393 */
394 for_each_sched_rt_entity(rt_se) 394 for_each_sched_rt_entity(rt_se)
395 enqueue_rt_entity(rt_se); 395 enqueue_rt_entity(rt_se);
396
397 inc_cpu_load(rq, p->se.load.weight);
398} 396}
399 397
400static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 398static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -414,8 +412,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
414 if (rt_rq && rt_rq->rt_nr_running) 412 if (rt_rq && rt_rq->rt_nr_running)
415 enqueue_rt_entity(rt_se); 413 enqueue_rt_entity(rt_se);
416 } 414 }
417
418 dec_cpu_load(rq, p->se.load.weight);
419} 415}
420 416
421/* 417/*
@@ -1111,9 +1107,11 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p,
1111 pull_rt_task(rq); 1107 pull_rt_task(rq);
1112 /* 1108 /*
1113 * If there's a higher priority task waiting to run 1109 * If there's a higher priority task waiting to run
1114 * then reschedule. 1110 * then reschedule. Note, the above pull_rt_task
1111 * can release the rq lock and p could migrate.
1112 * Only reschedule if p is still on the same runqueue.
1115 */ 1113 */
1116 if (p->prio > rq->rt.highest_prio) 1114 if (p->prio > rq->rt.highest_prio && rq->curr == p)
1117 resched_task(p); 1115 resched_task(p);
1118#else 1116#else
1119 /* For UP simply resched on drop of prio */ 1117 /* For UP simply resched on drop of prio */
diff --git a/kernel/signal.c b/kernel/signal.c
index 84917fe507f7..6af1210092c3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1623,7 +1623,6 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1623 /* Let the debugger run. */ 1623 /* Let the debugger run. */
1624 __set_current_state(TASK_TRACED); 1624 __set_current_state(TASK_TRACED);
1625 spin_unlock_irq(&current->sighand->siglock); 1625 spin_unlock_irq(&current->sighand->siglock);
1626 try_to_freeze();
1627 read_lock(&tasklist_lock); 1626 read_lock(&tasklist_lock);
1628 if (!unlikely(killed) && may_ptrace_stop()) { 1627 if (!unlikely(killed) && may_ptrace_stop()) {
1629 do_notify_parent_cldstop(current, CLD_TRAPPED); 1628 do_notify_parent_cldstop(current, CLD_TRAPPED);
@@ -1641,6 +1640,13 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1641 } 1640 }
1642 1641
1643 /* 1642 /*
1643 * While in TASK_TRACED, we were considered "frozen enough".
1644 * Now that we woke up, it's crucial if we're supposed to be
1645 * frozen that we freeze now before running anything substantial.
1646 */
1647 try_to_freeze();
1648
1649 /*
1644 * We are back. Now reacquire the siglock before touching 1650 * We are back. Now reacquire the siglock before touching
1645 * last_siginfo, so that we are sure to have synchronized with 1651 * last_siginfo, so that we are sure to have synchronized with
1646 * any signal-sending on another CPU that wants to examine it. 1652 * any signal-sending on another CPU that wants to examine it.
@@ -1757,9 +1763,15 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
1757 sigset_t *mask = &current->blocked; 1763 sigset_t *mask = &current->blocked;
1758 int signr = 0; 1764 int signr = 0;
1759 1765
1766relock:
1767 /*
1768 * We'll jump back here after any time we were stopped in TASK_STOPPED.
1769 * While in TASK_STOPPED, we were considered "frozen enough".
1770 * Now that we woke up, it's crucial if we're supposed to be
1771 * frozen that we freeze now before running anything substantial.
1772 */
1760 try_to_freeze(); 1773 try_to_freeze();
1761 1774
1762relock:
1763 spin_lock_irq(&current->sighand->siglock); 1775 spin_lock_irq(&current->sighand->siglock);
1764 for (;;) { 1776 for (;;) {
1765 struct k_sigaction *ka; 1777 struct k_sigaction *ka;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 5b3aea5f471e..31e9f2a47928 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -313,6 +313,7 @@ void irq_exit(void)
313 /* Make sure that timer wheel updates are propagated */ 313 /* Make sure that timer wheel updates are propagated */
314 if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) 314 if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
315 tick_nohz_stop_sched_tick(); 315 tick_nohz_stop_sched_tick();
316 rcu_irq_exit();
316#endif 317#endif
317 preempt_enable_no_resched(); 318 preempt_enable_no_resched();
318} 319}
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 7c2da88db4ed..01b6522fd92b 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -216,26 +216,27 @@ static int watchdog(void *__bind_cpu)
216 /* initialize timestamp */ 216 /* initialize timestamp */
217 touch_softlockup_watchdog(); 217 touch_softlockup_watchdog();
218 218
219 set_current_state(TASK_INTERRUPTIBLE);
219 /* 220 /*
220 * Run briefly once per second to reset the softlockup timestamp. 221 * Run briefly once per second to reset the softlockup timestamp.
221 * If this gets delayed for more than 60 seconds then the 222 * If this gets delayed for more than 60 seconds then the
222 * debug-printout triggers in softlockup_tick(). 223 * debug-printout triggers in softlockup_tick().
223 */ 224 */
224 while (!kthread_should_stop()) { 225 while (!kthread_should_stop()) {
225 set_current_state(TASK_INTERRUPTIBLE);
226 touch_softlockup_watchdog(); 226 touch_softlockup_watchdog();
227 schedule(); 227 schedule();
228 228
229 if (kthread_should_stop()) 229 if (kthread_should_stop())
230 break; 230 break;
231 231
232 if (this_cpu != check_cpu) 232 if (this_cpu == check_cpu) {
233 continue; 233 if (sysctl_hung_task_timeout_secs)
234 234 check_hung_uninterruptible_tasks(this_cpu);
235 if (sysctl_hung_task_timeout_secs) 235 }
236 check_hung_uninterruptible_tasks(this_cpu);
237 236
237 set_current_state(TASK_INTERRUPTIBLE);
238 } 238 }
239 __set_current_state(TASK_RUNNING);
239 240
240 return 0; 241 return 0;
241} 242}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8b7e95411795..b2a2d6889bab 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -311,24 +311,6 @@ static struct ctl_table kern_table[] = {
311 .mode = 0644, 311 .mode = 0644,
312 .proc_handler = &proc_dointvec, 312 .proc_handler = &proc_dointvec,
313 }, 313 },
314#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
315 {
316 .ctl_name = CTL_UNNUMBERED,
317 .procname = "sched_min_bal_int_shares",
318 .data = &sysctl_sched_min_bal_int_shares,
319 .maxlen = sizeof(unsigned int),
320 .mode = 0644,
321 .proc_handler = &proc_dointvec,
322 },
323 {
324 .ctl_name = CTL_UNNUMBERED,
325 .procname = "sched_max_bal_int_shares",
326 .data = &sysctl_sched_max_bal_int_shares,
327 .maxlen = sizeof(unsigned int),
328 .mode = 0644,
329 .proc_handler = &proc_dointvec,
330 },
331#endif
332#endif 314#endif
333 { 315 {
334 .ctl_name = CTL_UNNUMBERED, 316 .ctl_name = CTL_UNNUMBERED,
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c88b5910e7ab..5fd9b9469770 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -42,12 +42,13 @@ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
42long time_freq; /* frequency offset (scaled ppm)*/ 42long time_freq; /* frequency offset (scaled ppm)*/
43static long time_reftime; /* time at last adjustment (s) */ 43static long time_reftime; /* time at last adjustment (s) */
44long time_adjust; 44long time_adjust;
45static long ntp_tick_adj;
45 46
46static void ntp_update_frequency(void) 47static void ntp_update_frequency(void)
47{ 48{
48 u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) 49 u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
49 << TICK_LENGTH_SHIFT; 50 << TICK_LENGTH_SHIFT;
50 second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; 51 second_length += (s64)ntp_tick_adj << TICK_LENGTH_SHIFT;
51 second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); 52 second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
52 53
53 tick_length_base = second_length; 54 tick_length_base = second_length;
@@ -342,14 +343,16 @@ int do_adjtimex(struct timex *txc)
342 freq_adj = shift_right(freq_adj, time_constant * 2 + 343 freq_adj = shift_right(freq_adj, time_constant * 2 +
343 (SHIFT_PLL + 2) * 2 - SHIFT_NSEC); 344 (SHIFT_PLL + 2) * 2 - SHIFT_NSEC);
344 if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { 345 if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
346 u64 utemp64;
345 temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL); 347 temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL);
346 if (time_offset < 0) { 348 if (time_offset < 0) {
347 temp64 = -temp64; 349 utemp64 = -temp64;
348 do_div(temp64, mtemp); 350 do_div(utemp64, mtemp);
349 freq_adj -= temp64; 351 freq_adj -= utemp64;
350 } else { 352 } else {
351 do_div(temp64, mtemp); 353 utemp64 = temp64;
352 freq_adj += temp64; 354 do_div(utemp64, mtemp);
355 freq_adj += utemp64;
353 } 356 }
354 } 357 }
355 freq_adj += time_freq; 358 freq_adj += time_freq;
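
Note: the new utemp64 temporary exists because do_div() performs an unsigned 64-bit division in place; the signed offset is therefore negated into an unsigned variable before dividing, and the sign is applied to freq_adj afterwards. A standalone illustration of what unsigned division does to a negative two's-complement value; the numbers are arbitrary.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int64_t offset = -1000000;	/* say, -1 ms in ns */
	uint32_t divisor = 7;

	/* wrong: the sign bit is reinterpreted as a huge magnitude */
	printf("as unsigned:  %llu\n",
	       (unsigned long long)((uint64_t)offset / divisor));

	/* right: divide the magnitude, apply the sign afterwards */
	printf("negate first: -%llu\n",
	       (unsigned long long)((uint64_t)-offset / divisor));
	return 0;
}
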
@@ -400,3 +403,11 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
400 notify_cmos_timer(); 403 notify_cmos_timer();
401 return(result); 404 return(result);
402} 405}
406
407static int __init ntp_tick_adj_setup(char *str)
408{
409 ntp_tick_adj = simple_strtol(str, NULL, 0);
410 return 1;
411}
412
413__setup("ntp_tick_adj=", ntp_tick_adj_setup);
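
Note: the last hunk adds an ntp_tick_adj= kernel command-line parameter in place of the compile-time CLOCK_TICK_ADJUST constant previously used in ntp_update_frequency(). It is parsed with simple_strtol() and base 0, so decimal, octal and hex all work; userspace strtol() behaves the same way (the sample values below are arbitrary).

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* e.g. booting with ntp_tick_adj=-26 or ntp_tick_adj=0x1a */
	printf("%ld\n", strtol("-26", NULL, 0));	/* -26 */
	printf("%ld\n", strtol("0x1a", NULL, 0));	/* 26 */
	return 0;
}
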
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index fa9bb73dbdb4..686da821d376 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -282,6 +282,7 @@ void tick_nohz_stop_sched_tick(void)
282 ts->idle_tick = ts->sched_timer.expires; 282 ts->idle_tick = ts->sched_timer.expires;
283 ts->tick_stopped = 1; 283 ts->tick_stopped = 1;
284 ts->idle_jiffies = last_jiffies; 284 ts->idle_jiffies = last_jiffies;
285 rcu_enter_nohz();
285 } 286 }
286 287
287 /* 288 /*
@@ -375,6 +376,8 @@ void tick_nohz_restart_sched_tick(void)
375 return; 376 return;
376 } 377 }
377 378
379 rcu_exit_nohz();
380
378 /* Update jiffies first */ 381 /* Update jiffies first */
379 select_nohz_load_balancer(0); 382 select_nohz_load_balancer(0);
380 now = ktime_get(); 383 now = ktime_get();
@@ -637,7 +640,7 @@ void tick_cancel_sched_timer(int cpu)
637 640
638 if (ts->sched_timer.base) 641 if (ts->sched_timer.base)
639 hrtimer_cancel(&ts->sched_timer); 642 hrtimer_cancel(&ts->sched_timer);
640 ts->tick_stopped = 0; 643
641 ts->nohz_mode = NOHZ_MODE_INACTIVE; 644 ts->nohz_mode = NOHZ_MODE_INACTIVE;
642} 645}
643#endif /* HIGH_RES_TIMERS */ 646#endif /* HIGH_RES_TIMERS */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 1af9fb050fe2..671af612b768 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -187,8 +187,7 @@ static void change_clocksource(void)
187 187
188 clock->error = 0; 188 clock->error = 0;
189 clock->xtime_nsec = 0; 189 clock->xtime_nsec = 0;
190 clocksource_calculate_interval(clock, 190 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
191 (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
192 191
193 tick_clock_notify(); 192 tick_clock_notify();
194 193
@@ -245,8 +244,7 @@ void __init timekeeping_init(void)
245 ntp_clear(); 244 ntp_clear();
246 245
247 clock = clocksource_get_next(); 246 clock = clocksource_get_next();
248 clocksource_calculate_interval(clock, 247 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
249 (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
250 clock->cycle_last = clocksource_read(clock); 248 clock->cycle_last = clocksource_read(clock);
251 249
252 xtime.tv_sec = sec; 250 xtime.tv_sec = sec;