Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.preempt    |  15
-rw-r--r--  kernel/audit.c            |  19
-rw-r--r--  kernel/auditsc.c          |   2
-rw-r--r--  kernel/cgroup.c           |   4
-rw-r--r--  kernel/cpuset.c           |   4
-rw-r--r--  kernel/exit.c             |  98
-rw-r--r--  kernel/kprobes.c          |  52
-rw-r--r--  kernel/lockdep.c          |   8
-rw-r--r--  kernel/marker.c           |   9
-rw-r--r--  kernel/module.c           |  24
-rw-r--r--  kernel/power/process.c    |  29
-rw-r--r--  kernel/printk.c           |   2
-rw-r--r--  kernel/rcupreempt.c       | 233
-rw-r--r--  kernel/res_counter.c      |   1
-rw-r--r--  kernel/sched.c            | 338
-rw-r--r--  kernel/sched_fair.c       | 142
-rw-r--r--  kernel/sched_rt.c         |  10
-rw-r--r--  kernel/signal.c           |  16
-rw-r--r--  kernel/softirq.c          |   1
-rw-r--r--  kernel/softlockup.c       |  13
-rw-r--r--  kernel/sysctl.c           |  18
-rw-r--r--  kernel/time/ntp.c         |  23
-rw-r--r--  kernel/time/tick-sched.c  |   5
-rw-r--r--  kernel/time/timekeeping.c |   6
24 files changed, 555 insertions, 517 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 0669b70fa6a3..9fdba03dc1fc 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,8 +52,23 @@ config PREEMPT | |||
52 | 52 | ||
53 | endchoice | 53 | endchoice |
54 | 54 | ||
55 | config PREEMPT_RCU | ||
56 | bool "Preemptible RCU" | ||
57 | depends on PREEMPT | ||
58 | default n | ||
59 | help | ||
60 | This option reduces the latency of the kernel by making certain | ||
61 | RCU sections preemptible. Normally RCU code is non-preemptible, if | ||
62 | this option is selected then read-only RCU sections become | ||
63 | preemptible. This helps latency, but may expose bugs due to | ||
64 | now-naive assumptions about each RCU read-side critical section | ||
65 | remaining on a given CPU through its execution. | ||
66 | |||
67 | Say N if you are unsure. | ||
68 | |||
55 | config RCU_TRACE | 69 | config RCU_TRACE |
56 | bool "Enable tracing for RCU - currently stats in debugfs" | 70 | bool "Enable tracing for RCU - currently stats in debugfs" |
71 | depends on PREEMPT_RCU | ||
57 | select DEBUG_FS | 72 | select DEBUG_FS |
58 | default y | 73 | default y |
59 | help | 74 | help |
diff --git a/kernel/audit.c b/kernel/audit.c
index 2eeea9a14240..10c4930c2bbf 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -170,7 +170,9 @@ void audit_panic(const char *message) | |||
170 | printk(KERN_ERR "audit: %s\n", message); | 170 | printk(KERN_ERR "audit: %s\n", message); |
171 | break; | 171 | break; |
172 | case AUDIT_FAIL_PANIC: | 172 | case AUDIT_FAIL_PANIC: |
173 | panic("audit: %s\n", message); | 173 | /* test audit_pid since printk is always losey, why bother? */ |
174 | if (audit_pid) | ||
175 | panic("audit: %s\n", message); | ||
174 | break; | 176 | break; |
175 | } | 177 | } |
176 | } | 178 | } |
@@ -352,6 +354,7 @@ static int kauditd_thread(void *dummy) | |||
352 | if (err < 0) { | 354 | if (err < 0) { |
353 | BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ | 355 | BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ |
354 | printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); | 356 | printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); |
357 | audit_log_lost("auditd dissapeared\n"); | ||
355 | audit_pid = 0; | 358 | audit_pid = 0; |
356 | } | 359 | } |
357 | } else { | 360 | } else { |
@@ -1350,17 +1353,19 @@ void audit_log_end(struct audit_buffer *ab) | |||
1350 | if (!audit_rate_check()) { | 1353 | if (!audit_rate_check()) { |
1351 | audit_log_lost("rate limit exceeded"); | 1354 | audit_log_lost("rate limit exceeded"); |
1352 | } else { | 1355 | } else { |
1356 | struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); | ||
1353 | if (audit_pid) { | 1357 | if (audit_pid) { |
1354 | struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); | ||
1355 | nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); | 1358 | nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); |
1356 | skb_queue_tail(&audit_skb_queue, ab->skb); | 1359 | skb_queue_tail(&audit_skb_queue, ab->skb); |
1357 | ab->skb = NULL; | 1360 | ab->skb = NULL; |
1358 | wake_up_interruptible(&kauditd_wait); | 1361 | wake_up_interruptible(&kauditd_wait); |
1359 | } else if (printk_ratelimit()) { | 1362 | } else if (nlh->nlmsg_type != AUDIT_EOE) { |
1360 | struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); | 1363 | if (printk_ratelimit()) { |
1361 | printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, ab->skb->data + NLMSG_SPACE(0)); | 1364 | printk(KERN_NOTICE "type=%d %s\n", |
1362 | } else { | 1365 | nlh->nlmsg_type, |
1363 | audit_log_lost("printk limit exceeded\n"); | 1366 | ab->skb->data + NLMSG_SPACE(0)); |
1367 | } else | ||
1368 | audit_log_lost("printk limit exceeded\n"); | ||
1364 | } | 1369 | } |
1365 | } | 1370 | } |
1366 | audit_buffer_free(ab); | 1371 | audit_buffer_free(ab); |
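A rough userspace sketch of the fallback logic the audit_log_end() hunk above implements, assuming made-up names (record_type, RECORD_EOE, daemon_connected, rate_ok) as stand-ins for the kernel's audit types: queue the record when a daemon is registered, otherwise print it, but never print a bare end-of-event terminator and honour a rate limit.

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define RECORD_EOE 1320            /* stand-in for AUDIT_EOE */

static bool daemon_connected;      /* stand-in for audit_pid != 0 */

static bool rate_ok(void)          /* crude 10 messages/second limiter */
{
    static time_t window;
    static int count;
    time_t now = time(NULL);

    if (now != window) {
        window = now;
        count = 0;
    }
    return ++count <= 10;
}

static void log_record(int record_type, const char *text)
{
    if (daemon_connected) {
        /* the real code does skb_queue_tail() + wake_up_interruptible() */
        printf("queued for daemon: type=%d %s\n", record_type, text);
    } else if (record_type != RECORD_EOE) {
        if (rate_ok())
            printf("type=%d %s\n", record_type, text);
        else
            fprintf(stderr, "record lost: printk limit exceeded\n");
    }
    /* a lone EOE terminator carries no data, so it is silently dropped */
}

int main(void)
{
    log_record(1300, "syscall audit record");
    log_record(RECORD_EOE, "");
    return 0;
}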
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 2087d6de67ea..782262e4107d 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1070,7 +1070,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, | |||
1070 | * so we can be sure nothing was lost. | 1070 | * so we can be sure nothing was lost. |
1071 | */ | 1071 | */ |
1072 | if ((i == 0) && (too_long)) | 1072 | if ((i == 0) && (too_long)) |
1073 | audit_log_format(*ab, "a%d_len=%ld ", arg_num, | 1073 | audit_log_format(*ab, "a%d_len=%zu ", arg_num, |
1074 | has_cntl ? 2*len : len); | 1074 | has_cntl ? 2*len : len); |
1075 | 1075 | ||
1076 | /* | 1076 | /* |
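A minimal illustration of the one-character fix above: a size_t must be printed with %zu, not %ld, or the output (and -Wformat) is wrong wherever the two types differ in width.

#include <stdio.h>
#include <string.h>

int main(void)
{
    size_t len = strlen("example argument");

    printf("a0_len=%zu\n", len);        /* correct, matches the fixed code */
    printf("a0_len=%ld\n", (long)len);  /* the old pattern needs an explicit cast */
    return 0;
}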
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d8abe996e009..e9c2fb01e89b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2232,7 +2232,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
2232 | 2232 | ||
2233 | mutex_lock(&cgroup_mutex); | 2233 | mutex_lock(&cgroup_mutex); |
2234 | 2234 | ||
2235 | cgrp->flags = 0; | ||
2236 | INIT_LIST_HEAD(&cgrp->sibling); | 2235 | INIT_LIST_HEAD(&cgrp->sibling); |
2237 | INIT_LIST_HEAD(&cgrp->children); | 2236 | INIT_LIST_HEAD(&cgrp->children); |
2238 | INIT_LIST_HEAD(&cgrp->css_sets); | 2237 | INIT_LIST_HEAD(&cgrp->css_sets); |
@@ -2242,6 +2241,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
2242 | cgrp->root = parent->root; | 2241 | cgrp->root = parent->root; |
2243 | cgrp->top_cgroup = parent->top_cgroup; | 2242 | cgrp->top_cgroup = parent->top_cgroup; |
2244 | 2243 | ||
2244 | if (notify_on_release(parent)) | ||
2245 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | ||
2246 | |||
2245 | for_each_subsys(root, ss) { | 2247 | for_each_subsys(root, ss) { |
2246 | struct cgroup_subsys_state *css = ss->create(ss, cgrp); | 2248 | struct cgroup_subsys_state *css = ss->create(ss, cgrp); |
2247 | if (IS_ERR(css)) { | 2249 | if (IS_ERR(css)) { |
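A small sketch of the flag-inheritance pattern added above, with struct group and GROUP_NOTIFY_ON_RELEASE as hypothetical stand-ins for struct cgroup and CGRP_NOTIFY_ON_RELEASE: a new child copies the parent's "notify on release" bit instead of starting with it unconditionally clear.

#include <stdio.h>

#define GROUP_NOTIFY_ON_RELEASE (1UL << 0)

struct group {
    unsigned long flags;
};

static void group_init(struct group *child, const struct group *parent)
{
    /* inherit only the notify-on-release bit from the parent */
    if (parent->flags & GROUP_NOTIFY_ON_RELEASE)
        child->flags |= GROUP_NOTIFY_ON_RELEASE;
}

int main(void)
{
    struct group parent = { .flags = GROUP_NOTIFY_ON_RELEASE };
    struct group child = { .flags = 0 };

    group_init(&child, &parent);
    printf("child notify_on_release: %lu\n",
           child.flags & GROUP_NOTIFY_ON_RELEASE ? 1UL : 0UL);
    return 0;
}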
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3e296ed81d4d..a1b61f414228 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -322,8 +322,8 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
322 | * Call without callback_mutex or task_lock() held. May be | 322 | * Call without callback_mutex or task_lock() held. May be |
323 | * called with or without cgroup_mutex held. Thanks in part to | 323 | * called with or without cgroup_mutex held. Thanks in part to |
324 | * 'the_top_cpuset_hack', the task's cpuset pointer will never | 324 | * 'the_top_cpuset_hack', the task's cpuset pointer will never |
325 | * be NULL. This routine also might acquire callback_mutex and | 325 | * be NULL. This routine also might acquire callback_mutex during |
326 | * current->mm->mmap_sem during call. | 326 | * call. |
327 | * | 327 | * |
328 | * Reading current->cpuset->mems_generation doesn't need task_lock | 328 | * Reading current->cpuset->mems_generation doesn't need task_lock |
329 | * to guard the current->cpuset derefence, because it is guarded | 329 | * to guard the current->cpuset derefence, because it is guarded |
diff --git a/kernel/exit.c b/kernel/exit.c
index 506a957b665a..53872bf993fa 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -214,20 +214,19 @@ struct pid *session_of_pgrp(struct pid *pgrp) | |||
214 | static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) | 214 | static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) |
215 | { | 215 | { |
216 | struct task_struct *p; | 216 | struct task_struct *p; |
217 | int ret = 1; | ||
218 | 217 | ||
219 | do_each_pid_task(pgrp, PIDTYPE_PGID, p) { | 218 | do_each_pid_task(pgrp, PIDTYPE_PGID, p) { |
220 | if (p == ignored_task | 219 | if ((p == ignored_task) || |
221 | || p->exit_state | 220 | (p->exit_state && thread_group_empty(p)) || |
222 | || is_global_init(p->real_parent)) | 221 | is_global_init(p->real_parent)) |
223 | continue; | 222 | continue; |
223 | |||
224 | if (task_pgrp(p->real_parent) != pgrp && | 224 | if (task_pgrp(p->real_parent) != pgrp && |
225 | task_session(p->real_parent) == task_session(p)) { | 225 | task_session(p->real_parent) == task_session(p)) |
226 | ret = 0; | 226 | return 0; |
227 | break; | ||
228 | } | ||
229 | } while_each_pid_task(pgrp, PIDTYPE_PGID, p); | 227 | } while_each_pid_task(pgrp, PIDTYPE_PGID, p); |
230 | return ret; /* (sighing) "Often!" */ | 228 | |
229 | return 1; | ||
231 | } | 230 | } |
232 | 231 | ||
233 | int is_current_pgrp_orphaned(void) | 232 | int is_current_pgrp_orphaned(void) |
@@ -255,6 +254,37 @@ static int has_stopped_jobs(struct pid *pgrp) | |||
255 | return retval; | 254 | return retval; |
256 | } | 255 | } |
257 | 256 | ||
257 | /* | ||
258 | * Check to see if any process groups have become orphaned as | ||
259 | * a result of our exiting, and if they have any stopped jobs, | ||
260 | * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) | ||
261 | */ | ||
262 | static void | ||
263 | kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) | ||
264 | { | ||
265 | struct pid *pgrp = task_pgrp(tsk); | ||
266 | struct task_struct *ignored_task = tsk; | ||
267 | |||
268 | if (!parent) | ||
269 | /* exit: our father is in a different pgrp than | ||
270 | * we are and we were the only connection outside. | ||
271 | */ | ||
272 | parent = tsk->real_parent; | ||
273 | else | ||
274 | /* reparent: our child is in a different pgrp than | ||
275 | * we are, and it was the only connection outside. | ||
276 | */ | ||
277 | ignored_task = NULL; | ||
278 | |||
279 | if (task_pgrp(parent) != pgrp && | ||
280 | task_session(parent) == task_session(tsk) && | ||
281 | will_become_orphaned_pgrp(pgrp, ignored_task) && | ||
282 | has_stopped_jobs(pgrp)) { | ||
283 | __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); | ||
284 | __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); | ||
285 | } | ||
286 | } | ||
287 | |||
258 | /** | 288 | /** |
259 | * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd | 289 | * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd |
260 | * | 290 | * |
@@ -635,22 +665,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) | |||
635 | p->exit_signal != -1 && thread_group_empty(p)) | 665 | p->exit_signal != -1 && thread_group_empty(p)) |
636 | do_notify_parent(p, p->exit_signal); | 666 | do_notify_parent(p, p->exit_signal); |
637 | 667 | ||
638 | /* | 668 | kill_orphaned_pgrp(p, father); |
639 | * process group orphan check | ||
640 | * Case ii: Our child is in a different pgrp | ||
641 | * than we are, and it was the only connection | ||
642 | * outside, so the child pgrp is now orphaned. | ||
643 | */ | ||
644 | if ((task_pgrp(p) != task_pgrp(father)) && | ||
645 | (task_session(p) == task_session(father))) { | ||
646 | struct pid *pgrp = task_pgrp(p); | ||
647 | |||
648 | if (will_become_orphaned_pgrp(pgrp, NULL) && | ||
649 | has_stopped_jobs(pgrp)) { | ||
650 | __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); | ||
651 | __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); | ||
652 | } | ||
653 | } | ||
654 | } | 669 | } |
655 | 670 | ||
656 | /* | 671 | /* |
@@ -735,11 +750,9 @@ static void forget_original_parent(struct task_struct *father) | |||
735 | * Send signals to all our closest relatives so that they know | 750 | * Send signals to all our closest relatives so that they know |
736 | * to properly mourn us.. | 751 | * to properly mourn us.. |
737 | */ | 752 | */ |
738 | static void exit_notify(struct task_struct *tsk) | 753 | static void exit_notify(struct task_struct *tsk, int group_dead) |
739 | { | 754 | { |
740 | int state; | 755 | int state; |
741 | struct task_struct *t; | ||
742 | struct pid *pgrp; | ||
743 | 756 | ||
744 | /* | 757 | /* |
745 | * This does two things: | 758 | * This does two things: |
@@ -753,25 +766,8 @@ static void exit_notify(struct task_struct *tsk) | |||
753 | exit_task_namespaces(tsk); | 766 | exit_task_namespaces(tsk); |
754 | 767 | ||
755 | write_lock_irq(&tasklist_lock); | 768 | write_lock_irq(&tasklist_lock); |
756 | /* | 769 | if (group_dead) |
757 | * Check to see if any process groups have become orphaned | 770 | kill_orphaned_pgrp(tsk->group_leader, NULL); |
758 | * as a result of our exiting, and if they have any stopped | ||
759 | * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) | ||
760 | * | ||
761 | * Case i: Our father is in a different pgrp than we are | ||
762 | * and we were the only connection outside, so our pgrp | ||
763 | * is about to become orphaned. | ||
764 | */ | ||
765 | t = tsk->real_parent; | ||
766 | |||
767 | pgrp = task_pgrp(tsk); | ||
768 | if ((task_pgrp(t) != pgrp) && | ||
769 | (task_session(t) == task_session(tsk)) && | ||
770 | will_become_orphaned_pgrp(pgrp, tsk) && | ||
771 | has_stopped_jobs(pgrp)) { | ||
772 | __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); | ||
773 | __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); | ||
774 | } | ||
775 | 771 | ||
776 | /* Let father know we died | 772 | /* Let father know we died |
777 | * | 773 | * |
@@ -788,8 +784,8 @@ static void exit_notify(struct task_struct *tsk) | |||
788 | * the same after a fork. | 784 | * the same after a fork. |
789 | */ | 785 | */ |
790 | if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 && | 786 | if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 && |
791 | ( tsk->parent_exec_id != t->self_exec_id || | 787 | (tsk->parent_exec_id != tsk->real_parent->self_exec_id || |
792 | tsk->self_exec_id != tsk->parent_exec_id) | 788 | tsk->self_exec_id != tsk->parent_exec_id) |
793 | && !capable(CAP_KILL)) | 789 | && !capable(CAP_KILL)) |
794 | tsk->exit_signal = SIGCHLD; | 790 | tsk->exit_signal = SIGCHLD; |
795 | 791 | ||
@@ -986,7 +982,7 @@ NORET_TYPE void do_exit(long code) | |||
986 | module_put(tsk->binfmt->module); | 982 | module_put(tsk->binfmt->module); |
987 | 983 | ||
988 | proc_exit_connector(tsk); | 984 | proc_exit_connector(tsk); |
989 | exit_notify(tsk); | 985 | exit_notify(tsk, group_dead); |
990 | #ifdef CONFIG_NUMA | 986 | #ifdef CONFIG_NUMA |
991 | mpol_free(tsk->mempolicy); | 987 | mpol_free(tsk->mempolicy); |
992 | tsk->mempolicy = NULL; | 988 | tsk->mempolicy = NULL; |
@@ -1382,7 +1378,7 @@ unlock_sig: | |||
1382 | if (!retval && infop) | 1378 | if (!retval && infop) |
1383 | retval = put_user(0, &infop->si_errno); | 1379 | retval = put_user(0, &infop->si_errno); |
1384 | if (!retval && infop) | 1380 | if (!retval && infop) |
1385 | retval = put_user(why, &infop->si_code); | 1381 | retval = put_user((short)why, &infop->si_code); |
1386 | if (!retval && infop) | 1382 | if (!retval && infop) |
1387 | retval = put_user(exit_code, &infop->si_status); | 1383 | retval = put_user(exit_code, &infop->si_status); |
1388 | if (!retval && infop) | 1384 | if (!retval && infop) |
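A sketch of the consolidated orphaned-process-group check introduced above, with the kernel predicates stubbed out. The point of the refactor is that the exit path and the reparent path now share one helper: exit passes parent == NULL (ignore the exiting task itself), reparent passes the new parent (ignore nothing). The struct and function names below are illustrative stand-ins, not kernel code.

#include <signal.h>
#include <stdbool.h>
#include <stdio.h>

struct task { int pgrp; int session; struct task *real_parent; };

static bool will_become_orphaned_pgrp(int pgrp, struct task *ignored)
{ (void)pgrp; (void)ignored; return true; }
static bool has_stopped_jobs(int pgrp)
{ (void)pgrp; return true; }
static void kill_pgrp_sig(int sig, int pgrp)
{ printf("signal %d -> pgrp %d\n", sig, pgrp); }

static void kill_orphaned_pgrp(struct task *tsk, struct task *parent)
{
    struct task *ignored_task = tsk;
    int pgrp = tsk->pgrp;

    if (!parent)
        parent = tsk->real_parent;   /* exit path */
    else
        ignored_task = NULL;         /* reparent path */

    if (parent->pgrp != pgrp &&
        parent->session == tsk->session &&
        will_become_orphaned_pgrp(pgrp, ignored_task) &&
        has_stopped_jobs(pgrp)) {
        kill_pgrp_sig(SIGHUP, pgrp);
        kill_pgrp_sig(SIGCONT, pgrp);
    }
}

int main(void)
{
    struct task parent = { .pgrp = 1, .session = 5, .real_parent = NULL };
    struct task child  = { .pgrp = 2, .session = 5, .real_parent = &parent };

    kill_orphaned_pgrp(&child, NULL);   /* as called from exit_notify() */
    return 0;
}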
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 7a86e6432338..fcfb580c3afc 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -498,27 +498,36 @@ static int __kprobes in_kprobes_functions(unsigned long addr) | |||
498 | return 0; | 498 | return 0; |
499 | } | 499 | } |
500 | 500 | ||
501 | /* | ||
502 | * If we have a symbol_name argument, look it up and add the offset field | ||
503 | * to it. This way, we can specify a relative address to a symbol. | ||
504 | */ | ||
505 | static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) | ||
506 | { | ||
507 | kprobe_opcode_t *addr = p->addr; | ||
508 | if (p->symbol_name) { | ||
509 | if (addr) | ||
510 | return NULL; | ||
511 | kprobe_lookup_name(p->symbol_name, addr); | ||
512 | } | ||
513 | |||
514 | if (!addr) | ||
515 | return NULL; | ||
516 | return (kprobe_opcode_t *)(((char *)addr) + p->offset); | ||
517 | } | ||
518 | |||
501 | static int __kprobes __register_kprobe(struct kprobe *p, | 519 | static int __kprobes __register_kprobe(struct kprobe *p, |
502 | unsigned long called_from) | 520 | unsigned long called_from) |
503 | { | 521 | { |
504 | int ret = 0; | 522 | int ret = 0; |
505 | struct kprobe *old_p; | 523 | struct kprobe *old_p; |
506 | struct module *probed_mod; | 524 | struct module *probed_mod; |
525 | kprobe_opcode_t *addr; | ||
507 | 526 | ||
508 | /* | 527 | addr = kprobe_addr(p); |
509 | * If we have a symbol_name argument look it up, | 528 | if (!addr) |
510 | * and add it to the address. That way the addr | ||
511 | * field can either be global or relative to a symbol. | ||
512 | */ | ||
513 | if (p->symbol_name) { | ||
514 | if (p->addr) | ||
515 | return -EINVAL; | ||
516 | kprobe_lookup_name(p->symbol_name, p->addr); | ||
517 | } | ||
518 | |||
519 | if (!p->addr) | ||
520 | return -EINVAL; | 529 | return -EINVAL; |
521 | p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset); | 530 | p->addr = addr; |
522 | 531 | ||
523 | if (!kernel_text_address((unsigned long) p->addr) || | 532 | if (!kernel_text_address((unsigned long) p->addr) || |
524 | in_kprobes_functions((unsigned long) p->addr)) | 533 | in_kprobes_functions((unsigned long) p->addr)) |
@@ -678,8 +687,7 @@ void __kprobes unregister_jprobe(struct jprobe *jp) | |||
678 | unregister_kprobe(&jp->kp); | 687 | unregister_kprobe(&jp->kp); |
679 | } | 688 | } |
680 | 689 | ||
681 | #ifdef ARCH_SUPPORTS_KRETPROBES | 690 | #ifdef CONFIG_KRETPROBES |
682 | |||
683 | /* | 691 | /* |
684 | * This kprobe pre_handler is registered with every kretprobe. When probe | 692 | * This kprobe pre_handler is registered with every kretprobe. When probe |
685 | * hits it will set up the return probe. | 693 | * hits it will set up the return probe. |
@@ -722,12 +730,12 @@ int __kprobes register_kretprobe(struct kretprobe *rp) | |||
722 | int ret = 0; | 730 | int ret = 0; |
723 | struct kretprobe_instance *inst; | 731 | struct kretprobe_instance *inst; |
724 | int i; | 732 | int i; |
725 | void *addr = rp->kp.addr; | 733 | void *addr; |
726 | 734 | ||
727 | if (kretprobe_blacklist_size) { | 735 | if (kretprobe_blacklist_size) { |
728 | if (addr == NULL) | 736 | addr = kprobe_addr(&rp->kp); |
729 | kprobe_lookup_name(rp->kp.symbol_name, addr); | 737 | if (!addr) |
730 | addr += rp->kp.offset; | 738 | return -EINVAL; |
731 | 739 | ||
732 | for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { | 740 | for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { |
733 | if (kretprobe_blacklist[i].addr == addr) | 741 | if (kretprobe_blacklist[i].addr == addr) |
@@ -769,8 +777,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp) | |||
769 | return ret; | 777 | return ret; |
770 | } | 778 | } |
771 | 779 | ||
772 | #else /* ARCH_SUPPORTS_KRETPROBES */ | 780 | #else /* CONFIG_KRETPROBES */ |
773 | |||
774 | int __kprobes register_kretprobe(struct kretprobe *rp) | 781 | int __kprobes register_kretprobe(struct kretprobe *rp) |
775 | { | 782 | { |
776 | return -ENOSYS; | 783 | return -ENOSYS; |
@@ -781,8 +788,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, | |||
781 | { | 788 | { |
782 | return 0; | 789 | return 0; |
783 | } | 790 | } |
784 | 791 | #endif /* CONFIG_KRETPROBES */ | |
785 | #endif /* ARCH_SUPPORTS_KRETPROBES */ | ||
786 | 792 | ||
787 | void __kprobes unregister_kretprobe(struct kretprobe *rp) | 793 | void __kprobes unregister_kretprobe(struct kretprobe *rp) |
788 | { | 794 | { |
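A userspace sketch of the kprobe_addr() factoring above: a probe may be given either a raw address or a symbol name plus offset, never both, and both register_kprobe() and register_kretprobe() can then share one resolver. lookup_symbol() and the tiny probe struct are illustrative stand-ins for kprobe_lookup_name() and struct kprobe.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct probe {
    void *addr;               /* raw address, or NULL */
    const char *symbol_name;  /* symbol, or NULL */
    unsigned int offset;      /* offset added to whichever base is used */
};

static void demo_function(void) { }

static void *lookup_symbol(const char *name)
{
    if (name && strcmp(name, "demo_function") == 0)
        return (void *)demo_function;
    return NULL;
}

static void *probe_addr(const struct probe *p)
{
    void *addr = p->addr;

    if (p->symbol_name) {
        if (addr)
            return NULL;              /* both given: invalid */
        addr = lookup_symbol(p->symbol_name);
    }
    if (!addr)
        return NULL;                  /* neither resolvable: invalid */
    return (char *)addr + p->offset;  /* apply the relative offset */
}

int main(void)
{
    struct probe p = { .addr = NULL, .symbol_name = "demo_function", .offset = 0 };

    printf("resolved to %p\n", probe_addr(&p));
    return 0;
}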
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 3574379f4d62..81a4e4a3f087 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -779,6 +779,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
779 | * parallel walking of the hash-list safe: | 779 | * parallel walking of the hash-list safe: |
780 | */ | 780 | */ |
781 | list_add_tail_rcu(&class->hash_entry, hash_head); | 781 | list_add_tail_rcu(&class->hash_entry, hash_head); |
782 | /* | ||
783 | * Add it to the global list of classes: | ||
784 | */ | ||
785 | list_add_tail_rcu(&class->lock_entry, &all_lock_classes); | ||
782 | 786 | ||
783 | if (verbose(class)) { | 787 | if (verbose(class)) { |
784 | graph_unlock(); | 788 | graph_unlock(); |
@@ -2282,10 +2286,6 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
2282 | return 0; | 2286 | return 0; |
2283 | break; | 2287 | break; |
2284 | case LOCK_USED: | 2288 | case LOCK_USED: |
2285 | /* | ||
2286 | * Add it to the global list of classes: | ||
2287 | */ | ||
2288 | list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes); | ||
2289 | debug_atomic_dec(&nr_unused_locks); | 2289 | debug_atomic_dec(&nr_unused_locks); |
2290 | break; | 2290 | break; |
2291 | default: | 2291 | default: |
diff --git a/kernel/marker.c b/kernel/marker.c
index 50effc01d9a2..48a4ea5afffd 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -698,14 +698,12 @@ int marker_probe_unregister(const char *name, | |||
698 | { | 698 | { |
699 | struct marker_entry *entry; | 699 | struct marker_entry *entry; |
700 | struct marker_probe_closure *old; | 700 | struct marker_probe_closure *old; |
701 | int ret = 0; | 701 | int ret = -ENOENT; |
702 | 702 | ||
703 | mutex_lock(&markers_mutex); | 703 | mutex_lock(&markers_mutex); |
704 | entry = get_marker(name); | 704 | entry = get_marker(name); |
705 | if (!entry) { | 705 | if (!entry) |
706 | ret = -ENOENT; | ||
707 | goto end; | 706 | goto end; |
708 | } | ||
709 | if (entry->rcu_pending) | 707 | if (entry->rcu_pending) |
710 | rcu_barrier(); | 708 | rcu_barrier(); |
711 | old = marker_entry_remove_probe(entry, probe, probe_private); | 709 | old = marker_entry_remove_probe(entry, probe, probe_private); |
@@ -713,12 +711,15 @@ int marker_probe_unregister(const char *name, | |||
713 | marker_update_probes(); /* may update entry */ | 711 | marker_update_probes(); /* may update entry */ |
714 | mutex_lock(&markers_mutex); | 712 | mutex_lock(&markers_mutex); |
715 | entry = get_marker(name); | 713 | entry = get_marker(name); |
714 | if (!entry) | ||
715 | goto end; | ||
716 | entry->oldptr = old; | 716 | entry->oldptr = old; |
717 | entry->rcu_pending = 1; | 717 | entry->rcu_pending = 1; |
718 | /* write rcu_pending before calling the RCU callback */ | 718 | /* write rcu_pending before calling the RCU callback */ |
719 | smp_wmb(); | 719 | smp_wmb(); |
720 | call_rcu(&entry->rcu, free_old_closure); | 720 | call_rcu(&entry->rcu, free_old_closure); |
721 | remove_marker(name); /* Ignore busy error message */ | 721 | remove_marker(name); /* Ignore busy error message */ |
722 | ret = 0; | ||
722 | end: | 723 | end: |
723 | mutex_unlock(&markers_mutex); | 724 | mutex_unlock(&markers_mutex); |
724 | return ret; | 725 | return ret; |
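The marker.c hunk above switches to an "assume failure, set success last" error pattern, which also covers the case where the entry vanishes while the mutex is dropped. A generic sketch of the pattern, with lookup(), do_work() and the lock as hypothetical placeholders:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void *lookup(const char *name) { (void)name; return NULL; }  /* may return NULL */
static void do_work(void *entry) { (void)entry; }

static int remove_entry(const char *name)
{
    int ret = -ENOENT;                 /* assume failure up front */
    void *entry;

    pthread_mutex_lock(&lock);
    entry = lookup(name);
    if (!entry)
        goto end;                      /* every early exit reports -ENOENT */

    do_work(entry);
    ret = 0;                           /* success is set in exactly one place */
end:
    pthread_mutex_unlock(&lock);
    return ret;
}

int main(void)
{
    printf("remove_entry() = %d\n", remove_entry("missing"));
    return 0;
}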
diff --git a/kernel/module.c b/kernel/module.c
index 901cd6ac2f11..5d437bffd8dc 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1933,8 +1933,15 @@ static struct module *load_module(void __user *umod, | |||
1933 | /* Set up license info based on the info section */ | 1933 | /* Set up license info based on the info section */ |
1934 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); | 1934 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); |
1935 | 1935 | ||
1936 | /* | ||
1937 | * ndiswrapper is under GPL by itself, but loads proprietary modules. | ||
1938 | * Don't use add_taint_module(), as it would prevent ndiswrapper from | ||
1939 | * using GPL-only symbols it needs. | ||
1940 | */ | ||
1936 | if (strcmp(mod->name, "ndiswrapper") == 0) | 1941 | if (strcmp(mod->name, "ndiswrapper") == 0) |
1937 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); | 1942 | add_taint(TAINT_PROPRIETARY_MODULE); |
1943 | |||
1944 | /* driverloader was caught wrongly pretending to be under GPL */ | ||
1938 | if (strcmp(mod->name, "driverloader") == 0) | 1945 | if (strcmp(mod->name, "driverloader") == 0) |
1939 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); | 1946 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); |
1940 | 1947 | ||
@@ -2171,10 +2178,20 @@ sys_init_module(void __user *umod, | |||
2171 | wake_up(&module_wq); | 2178 | wake_up(&module_wq); |
2172 | return ret; | 2179 | return ret; |
2173 | } | 2180 | } |
2181 | if (ret > 0) { | ||
2182 | printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, " | ||
2183 | "it should follow 0/-E convention\n" | ||
2184 | KERN_WARNING "%s: loading module anyway...\n", | ||
2185 | __func__, mod->name, ret, | ||
2186 | __func__); | ||
2187 | dump_stack(); | ||
2188 | } | ||
2174 | 2189 | ||
2175 | /* Now it's a first class citizen! */ | 2190 | /* Now it's a first class citizen! Wake up anyone waiting for it. */ |
2176 | mutex_lock(&module_mutex); | ||
2177 | mod->state = MODULE_STATE_LIVE; | 2191 | mod->state = MODULE_STATE_LIVE; |
2192 | wake_up(&module_wq); | ||
2193 | |||
2194 | mutex_lock(&module_mutex); | ||
2178 | /* Drop initial reference. */ | 2195 | /* Drop initial reference. */ |
2179 | module_put(mod); | 2196 | module_put(mod); |
2180 | unwind_remove_table(mod->unwind_info, 1); | 2197 | unwind_remove_table(mod->unwind_info, 1); |
@@ -2183,7 +2200,6 @@ sys_init_module(void __user *umod, | |||
2183 | mod->init_size = 0; | 2200 | mod->init_size = 0; |
2184 | mod->init_text_size = 0; | 2201 | mod->init_text_size = 0; |
2185 | mutex_unlock(&module_mutex); | 2202 | mutex_unlock(&module_mutex); |
2186 | wake_up(&module_wq); | ||
2187 | 2203 | ||
2188 | return 0; | 2204 | return 0; |
2189 | } | 2205 | } |
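A sketch of the return-value policing added to sys_init_module() above: an init routine is expected to follow the 0/-E convention, so a positive return is tolerated but warned about. broken_init() and load_one() are made-up illustrations, not kernel interfaces.

#include <stdio.h>

static int broken_init(void)
{
    return 1;    /* buggy: should be 0 on success or a negative errno */
}

static int load_one(const char *name, int (*init)(void))
{
    int ret = init();

    if (ret < 0)
        return ret;                    /* real failure: unwind and give up */
    if (ret > 0)
        fprintf(stderr,
                "%s: init suspiciously returned %d, it should follow the "
                "0/-E convention; loading anyway...\n", name, ret);

    /* the module would become MODULE_STATE_LIVE here */
    return 0;
}

int main(void)
{
    return load_one("example", broken_init);
}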
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 7c2118f9597f..f1d0b345c9ba 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -75,22 +75,15 @@ void refrigerator(void) | |||
75 | __set_current_state(save); | 75 | __set_current_state(save); |
76 | } | 76 | } |
77 | 77 | ||
78 | static void fake_signal_wake_up(struct task_struct *p, int resume) | 78 | static void fake_signal_wake_up(struct task_struct *p) |
79 | { | 79 | { |
80 | unsigned long flags; | 80 | unsigned long flags; |
81 | 81 | ||
82 | spin_lock_irqsave(&p->sighand->siglock, flags); | 82 | spin_lock_irqsave(&p->sighand->siglock, flags); |
83 | signal_wake_up(p, resume); | 83 | signal_wake_up(p, 0); |
84 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 84 | spin_unlock_irqrestore(&p->sighand->siglock, flags); |
85 | } | 85 | } |
86 | 86 | ||
87 | static void send_fake_signal(struct task_struct *p) | ||
88 | { | ||
89 | if (task_is_stopped(p)) | ||
90 | force_sig_specific(SIGSTOP, p); | ||
91 | fake_signal_wake_up(p, task_is_stopped(p)); | ||
92 | } | ||
93 | |||
94 | static int has_mm(struct task_struct *p) | 87 | static int has_mm(struct task_struct *p) |
95 | { | 88 | { |
96 | return (p->mm && !(p->flags & PF_BORROWED_MM)); | 89 | return (p->mm && !(p->flags & PF_BORROWED_MM)); |
@@ -121,7 +114,7 @@ static int freeze_task(struct task_struct *p, int with_mm_only) | |||
121 | if (freezing(p)) { | 114 | if (freezing(p)) { |
122 | if (has_mm(p)) { | 115 | if (has_mm(p)) { |
123 | if (!signal_pending(p)) | 116 | if (!signal_pending(p)) |
124 | fake_signal_wake_up(p, 0); | 117 | fake_signal_wake_up(p); |
125 | } else { | 118 | } else { |
126 | if (with_mm_only) | 119 | if (with_mm_only) |
127 | ret = 0; | 120 | ret = 0; |
@@ -135,7 +128,7 @@ static int freeze_task(struct task_struct *p, int with_mm_only) | |||
135 | } else { | 128 | } else { |
136 | if (has_mm(p)) { | 129 | if (has_mm(p)) { |
137 | set_freeze_flag(p); | 130 | set_freeze_flag(p); |
138 | send_fake_signal(p); | 131 | fake_signal_wake_up(p); |
139 | } else { | 132 | } else { |
140 | if (with_mm_only) { | 133 | if (with_mm_only) { |
141 | ret = 0; | 134 | ret = 0; |
@@ -182,15 +175,17 @@ static int try_to_freeze_tasks(int freeze_user_space) | |||
182 | if (frozen(p) || !freezeable(p)) | 175 | if (frozen(p) || !freezeable(p)) |
183 | continue; | 176 | continue; |
184 | 177 | ||
185 | if (task_is_traced(p) && frozen(p->parent)) { | ||
186 | cancel_freezing(p); | ||
187 | continue; | ||
188 | } | ||
189 | |||
190 | if (!freeze_task(p, freeze_user_space)) | 178 | if (!freeze_task(p, freeze_user_space)) |
191 | continue; | 179 | continue; |
192 | 180 | ||
193 | if (!freezer_should_skip(p)) | 181 | /* |
182 | * Now that we've done set_freeze_flag, don't | ||
183 | * perturb a task in TASK_STOPPED or TASK_TRACED. | ||
184 | * It is "frozen enough". If the task does wake | ||
185 | * up, it will immediately call try_to_freeze. | ||
186 | */ | ||
187 | if (!task_is_stopped_or_traced(p) && | ||
188 | !freezer_should_skip(p)) | ||
194 | todo++; | 189 | todo++; |
195 | } while_each_thread(g, p); | 190 | } while_each_thread(g, p); |
196 | read_unlock(&tasklist_lock); | 191 | read_unlock(&tasklist_lock); |
diff --git a/kernel/printk.c b/kernel/printk.c
index bee36100f110..9adc2a473e6e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -666,7 +666,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
666 | } | 666 | } |
667 | /* Emit the output into the temporary buffer */ | 667 | /* Emit the output into the temporary buffer */ |
668 | printed_len += vscnprintf(printk_buf + printed_len, | 668 | printed_len += vscnprintf(printk_buf + printed_len, |
669 | sizeof(printk_buf), fmt, args); | 669 | sizeof(printk_buf) - printed_len, fmt, args); |
670 | 670 | ||
671 | /* | 671 | /* |
672 | * Copy the output into log_buf. If the caller didn't provide | 672 | * Copy the output into log_buf. If the caller didn't provide |
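The one-line printk.c fix above is the classic "append into a buffer" mistake: when writing at an offset, the size argument must be the space that is left, not the total buffer size, or a later write can run past the end. A userspace equivalent using snprintf() (the short strings here are chosen so no truncation occurs):

#include <stdio.h>
#include <string.h>

int main(void)
{
    char buf[64];
    size_t len = 0;

    len += snprintf(buf + len, sizeof(buf) - len, "<%d>", 6);
    /* correct: the remaining space shrinks as the buffer fills */
    len += snprintf(buf + len, sizeof(buf) - len, "hello from vprintk\n");

    /* buggy form (what the old code did):
     *   len += snprintf(buf + len, sizeof(buf), "...");
     * which tells snprintf it may write up to sizeof(buf) bytes
     * starting at buf + len.
     */
    fputs(buf, stdout);
    printf("used %zu of %zu bytes\n", len, sizeof(buf));
    return 0;
}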
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 987cfb7ade89..e9517014b57c 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -23,6 +23,10 @@ | |||
23 | * to Suparna Bhattacharya for pushing me completely away | 23 | * to Suparna Bhattacharya for pushing me completely away |
24 | * from atomic instructions on the read side. | 24 | * from atomic instructions on the read side. |
25 | * | 25 | * |
26 | * - Added handling of Dynamic Ticks | ||
27 | * Copyright 2007 - Paul E. Mckenney <paulmck@us.ibm.com> | ||
28 | * - Steven Rostedt <srostedt@redhat.com> | ||
29 | * | ||
26 | * Papers: http://www.rdrop.com/users/paulmck/RCU | 30 | * Papers: http://www.rdrop.com/users/paulmck/RCU |
27 | * | 31 | * |
28 | * Design Document: http://lwn.net/Articles/253651/ | 32 | * Design Document: http://lwn.net/Articles/253651/ |
@@ -409,6 +413,212 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp) | |||
409 | } | 413 | } |
410 | } | 414 | } |
411 | 415 | ||
416 | #ifdef CONFIG_NO_HZ | ||
417 | |||
418 | DEFINE_PER_CPU(long, dynticks_progress_counter) = 1; | ||
419 | static DEFINE_PER_CPU(long, rcu_dyntick_snapshot); | ||
420 | static DEFINE_PER_CPU(int, rcu_update_flag); | ||
421 | |||
422 | /** | ||
423 | * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI. | ||
424 | * | ||
425 | * If the CPU was idle with dynamic ticks active, this updates the | ||
426 | * dynticks_progress_counter to let the RCU handling know that the | ||
427 | * CPU is active. | ||
428 | */ | ||
429 | void rcu_irq_enter(void) | ||
430 | { | ||
431 | int cpu = smp_processor_id(); | ||
432 | |||
433 | if (per_cpu(rcu_update_flag, cpu)) | ||
434 | per_cpu(rcu_update_flag, cpu)++; | ||
435 | |||
436 | /* | ||
437 | * Only update if we are coming from a stopped ticks mode | ||
438 | * (dynticks_progress_counter is even). | ||
439 | */ | ||
440 | if (!in_interrupt() && | ||
441 | (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) { | ||
442 | /* | ||
443 | * The following might seem like we could have a race | ||
444 | * with NMI/SMIs. But this really isn't a problem. | ||
445 | * Here we do a read/modify/write, and the race happens | ||
446 | * when an NMI/SMI comes in after the read and before | ||
447 | * the write. But NMI/SMIs will increment this counter | ||
448 | * twice before returning, so the zero bit will not | ||
449 | * be corrupted by the NMI/SMI which is the most important | ||
450 | * part. | ||
451 | * | ||
452 | * The only thing is that we would bring back the counter | ||
453 | * to a postion that it was in during the NMI/SMI. | ||
454 | * But the zero bit would be set, so the rest of the | ||
455 | * counter would again be ignored. | ||
456 | * | ||
457 | * On return from the IRQ, the counter may have the zero | ||
458 | * bit be 0 and the counter the same as the return from | ||
459 | * the NMI/SMI. If the state machine was so unlucky to | ||
460 | * see that, it still doesn't matter, since all | ||
461 | * RCU read-side critical sections on this CPU would | ||
462 | * have already completed. | ||
463 | */ | ||
464 | per_cpu(dynticks_progress_counter, cpu)++; | ||
465 | /* | ||
466 | * The following memory barrier ensures that any | ||
467 | * rcu_read_lock() primitives in the irq handler | ||
468 | * are seen by other CPUs to follow the above | ||
469 | * increment to dynticks_progress_counter. This is | ||
470 | * required in order for other CPUs to correctly | ||
471 | * determine when it is safe to advance the RCU | ||
472 | * grace-period state machine. | ||
473 | */ | ||
474 | smp_mb(); /* see above block comment. */ | ||
475 | /* | ||
476 | * Since we can't determine the dynamic tick mode from | ||
477 | * the dynticks_progress_counter after this routine, | ||
478 | * we use a second flag to acknowledge that we came | ||
479 | * from an idle state with ticks stopped. | ||
480 | */ | ||
481 | per_cpu(rcu_update_flag, cpu)++; | ||
482 | /* | ||
483 | * If we take an NMI/SMI now, they will also increment | ||
484 | * the rcu_update_flag, and will not update the | ||
485 | * dynticks_progress_counter on exit. That is for | ||
486 | * this IRQ to do. | ||
487 | */ | ||
488 | } | ||
489 | } | ||
490 | |||
491 | /** | ||
492 | * rcu_irq_exit - Called from exiting Hard irq context. | ||
493 | * | ||
494 | * If the CPU was idle with dynamic ticks active, update the | ||
495 | * dynticks_progress_counter to put let the RCU handling be | ||
496 | * aware that the CPU is going back to idle with no ticks. | ||
497 | */ | ||
498 | void rcu_irq_exit(void) | ||
499 | { | ||
500 | int cpu = smp_processor_id(); | ||
501 | |||
502 | /* | ||
503 | * rcu_update_flag is set if we interrupted the CPU | ||
504 | * when it was idle with ticks stopped. | ||
505 | * Once this occurs, we keep track of interrupt nesting | ||
506 | * because a NMI/SMI could also come in, and we still | ||
507 | * only want the IRQ that started the increment of the | ||
508 | * dynticks_progress_counter to be the one that modifies | ||
509 | * it on exit. | ||
510 | */ | ||
511 | if (per_cpu(rcu_update_flag, cpu)) { | ||
512 | if (--per_cpu(rcu_update_flag, cpu)) | ||
513 | return; | ||
514 | |||
515 | /* This must match the interrupt nesting */ | ||
516 | WARN_ON(in_interrupt()); | ||
517 | |||
518 | /* | ||
519 | * If an NMI/SMI happens now we are still | ||
520 | * protected by the dynticks_progress_counter being odd. | ||
521 | */ | ||
522 | |||
523 | /* | ||
524 | * The following memory barrier ensures that any | ||
525 | * rcu_read_unlock() primitives in the irq handler | ||
526 | * are seen by other CPUs to preceed the following | ||
527 | * increment to dynticks_progress_counter. This | ||
528 | * is required in order for other CPUs to determine | ||
529 | * when it is safe to advance the RCU grace-period | ||
530 | * state machine. | ||
531 | */ | ||
532 | smp_mb(); /* see above block comment. */ | ||
533 | per_cpu(dynticks_progress_counter, cpu)++; | ||
534 | WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1); | ||
535 | } | ||
536 | } | ||
537 | |||
538 | static void dyntick_save_progress_counter(int cpu) | ||
539 | { | ||
540 | per_cpu(rcu_dyntick_snapshot, cpu) = | ||
541 | per_cpu(dynticks_progress_counter, cpu); | ||
542 | } | ||
543 | |||
544 | static inline int | ||
545 | rcu_try_flip_waitack_needed(int cpu) | ||
546 | { | ||
547 | long curr; | ||
548 | long snap; | ||
549 | |||
550 | curr = per_cpu(dynticks_progress_counter, cpu); | ||
551 | snap = per_cpu(rcu_dyntick_snapshot, cpu); | ||
552 | smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ | ||
553 | |||
554 | /* | ||
555 | * If the CPU remained in dynticks mode for the entire time | ||
556 | * and didn't take any interrupts, NMIs, SMIs, or whatever, | ||
557 | * then it cannot be in the middle of an rcu_read_lock(), so | ||
558 | * the next rcu_read_lock() it executes must use the new value | ||
559 | * of the counter. So we can safely pretend that this CPU | ||
560 | * already acknowledged the counter. | ||
561 | */ | ||
562 | |||
563 | if ((curr == snap) && ((curr & 0x1) == 0)) | ||
564 | return 0; | ||
565 | |||
566 | /* | ||
567 | * If the CPU passed through or entered a dynticks idle phase with | ||
568 | * no active irq handlers, then, as above, we can safely pretend | ||
569 | * that this CPU already acknowledged the counter. | ||
570 | */ | ||
571 | |||
572 | if ((curr - snap) > 2 || (snap & 0x1) == 0) | ||
573 | return 0; | ||
574 | |||
575 | /* We need this CPU to explicitly acknowledge the counter flip. */ | ||
576 | |||
577 | return 1; | ||
578 | } | ||
579 | |||
580 | static inline int | ||
581 | rcu_try_flip_waitmb_needed(int cpu) | ||
582 | { | ||
583 | long curr; | ||
584 | long snap; | ||
585 | |||
586 | curr = per_cpu(dynticks_progress_counter, cpu); | ||
587 | snap = per_cpu(rcu_dyntick_snapshot, cpu); | ||
588 | smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ | ||
589 | |||
590 | /* | ||
591 | * If the CPU remained in dynticks mode for the entire time | ||
592 | * and didn't take any interrupts, NMIs, SMIs, or whatever, | ||
593 | * then it cannot have executed an RCU read-side critical section | ||
594 | * during that time, so there is no need for it to execute a | ||
595 | * memory barrier. | ||
596 | */ | ||
597 | |||
598 | if ((curr == snap) && ((curr & 0x1) == 0)) | ||
599 | return 0; | ||
600 | |||
601 | /* | ||
602 | * If the CPU either entered or exited an outermost interrupt, | ||
603 | * SMI, NMI, or whatever handler, then we know that it executed | ||
604 | * a memory barrier when doing so. So we don't need another one. | ||
605 | */ | ||
606 | if (curr != snap) | ||
607 | return 0; | ||
608 | |||
609 | /* We need the CPU to execute a memory barrier. */ | ||
610 | |||
611 | return 1; | ||
612 | } | ||
613 | |||
614 | #else /* !CONFIG_NO_HZ */ | ||
615 | |||
616 | # define dyntick_save_progress_counter(cpu) do { } while (0) | ||
617 | # define rcu_try_flip_waitack_needed(cpu) (1) | ||
618 | # define rcu_try_flip_waitmb_needed(cpu) (1) | ||
619 | |||
620 | #endif /* CONFIG_NO_HZ */ | ||
621 | |||
412 | /* | 622 | /* |
413 | * Get here when RCU is idle. Decide whether we need to | 623 | * Get here when RCU is idle. Decide whether we need to |
414 | * move out of idle state, and return non-zero if so. | 624 | * move out of idle state, and return non-zero if so. |
@@ -447,8 +657,10 @@ rcu_try_flip_idle(void) | |||
447 | 657 | ||
448 | /* Now ask each CPU for acknowledgement of the flip. */ | 658 | /* Now ask each CPU for acknowledgement of the flip. */ |
449 | 659 | ||
450 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | 660 | for_each_cpu_mask(cpu, rcu_cpu_online_map) { |
451 | per_cpu(rcu_flip_flag, cpu) = rcu_flipped; | 661 | per_cpu(rcu_flip_flag, cpu) = rcu_flipped; |
662 | dyntick_save_progress_counter(cpu); | ||
663 | } | ||
452 | 664 | ||
453 | return 1; | 665 | return 1; |
454 | } | 666 | } |
@@ -464,7 +676,8 @@ rcu_try_flip_waitack(void) | |||
464 | 676 | ||
465 | RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); | 677 | RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); |
466 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | 678 | for_each_cpu_mask(cpu, rcu_cpu_online_map) |
467 | if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { | 679 | if (rcu_try_flip_waitack_needed(cpu) && |
680 | per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { | ||
468 | RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); | 681 | RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); |
469 | return 0; | 682 | return 0; |
470 | } | 683 | } |
@@ -509,8 +722,10 @@ rcu_try_flip_waitzero(void) | |||
509 | smp_mb(); /* ^^^^^^^^^^^^ */ | 722 | smp_mb(); /* ^^^^^^^^^^^^ */ |
510 | 723 | ||
511 | /* Call for a memory barrier from each CPU. */ | 724 | /* Call for a memory barrier from each CPU. */ |
512 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | 725 | for_each_cpu_mask(cpu, rcu_cpu_online_map) { |
513 | per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; | 726 | per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; |
727 | dyntick_save_progress_counter(cpu); | ||
728 | } | ||
514 | 729 | ||
515 | RCU_TRACE_ME(rcupreempt_trace_try_flip_z2); | 730 | RCU_TRACE_ME(rcupreempt_trace_try_flip_z2); |
516 | return 1; | 731 | return 1; |
@@ -528,7 +743,8 @@ rcu_try_flip_waitmb(void) | |||
528 | 743 | ||
529 | RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); | 744 | RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); |
530 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | 745 | for_each_cpu_mask(cpu, rcu_cpu_online_map) |
531 | if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { | 746 | if (rcu_try_flip_waitmb_needed(cpu) && |
747 | per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { | ||
532 | RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); | 748 | RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); |
533 | return 0; | 749 | return 0; |
534 | } | 750 | } |
@@ -702,8 +918,9 @@ void rcu_offline_cpu(int cpu) | |||
702 | * fix. | 918 | * fix. |
703 | */ | 919 | */ |
704 | 920 | ||
921 | local_irq_save(flags); | ||
705 | rdp = RCU_DATA_ME(); | 922 | rdp = RCU_DATA_ME(); |
706 | spin_lock_irqsave(&rdp->lock, flags); | 923 | spin_lock(&rdp->lock); |
707 | *rdp->nexttail = list; | 924 | *rdp->nexttail = list; |
708 | if (list) | 925 | if (list) |
709 | rdp->nexttail = tail; | 926 | rdp->nexttail = tail; |
@@ -735,9 +952,11 @@ static void rcu_process_callbacks(struct softirq_action *unused) | |||
735 | { | 952 | { |
736 | unsigned long flags; | 953 | unsigned long flags; |
737 | struct rcu_head *next, *list; | 954 | struct rcu_head *next, *list; |
738 | struct rcu_data *rdp = RCU_DATA_ME(); | 955 | struct rcu_data *rdp; |
739 | 956 | ||
740 | spin_lock_irqsave(&rdp->lock, flags); | 957 | local_irq_save(flags); |
958 | rdp = RCU_DATA_ME(); | ||
959 | spin_lock(&rdp->lock); | ||
741 | list = rdp->donelist; | 960 | list = rdp->donelist; |
742 | if (list == NULL) { | 961 | if (list == NULL) { |
743 | spin_unlock_irqrestore(&rdp->lock, flags); | 962 | spin_unlock_irqrestore(&rdp->lock, flags); |
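A standalone, single-threaded sketch of the dynticks handshake the rcupreempt.c additions implement: each CPU keeps a counter that is even while it idles with the tick stopped and odd while it runs irq or process code. The grace-period machinery snapshots the counter and later compares; if the counter is unchanged and even, or has moved on past an idle snapshot, the CPU needs no explicit acknowledgement. The names below are simplified stand-ins for the per-CPU variables, and there is no real concurrency here.

#include <stdio.h>

static long progress_counter = 1;   /* odd: the "CPU" is active */
static long snapshot;

static void enter_idle(void)  { progress_counter++; }  /* becomes even */
static void leave_idle(void)  { progress_counter++; }  /* becomes odd again */

static void save_snapshot(void) { snapshot = progress_counter; }

static int ack_needed(void)
{
    long curr = progress_counter;
    long snap = snapshot;

    if (curr == snap && (curr & 0x1) == 0)
        return 0;   /* idle the whole time: no RCU reader possible */
    if (curr - snap > 2 || (snap & 0x1) == 0)
        return 0;   /* passed through or was in dynticks idle since the snapshot */
    return 1;       /* the CPU must acknowledge the counter flip itself */
}

int main(void)
{
    save_snapshot();
    printf("busy CPU, ack needed: %d\n", ack_needed());

    enter_idle();
    save_snapshot();
    printf("idle CPU, ack needed: %d\n", ack_needed());

    leave_idle();
    printf("woke after snapshot, ack needed: %d\n", ack_needed());
    return 0;
}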
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 16cbec2d5d60..efbfc0fc232f 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -113,6 +113,7 @@ ssize_t res_counter_write(struct res_counter *counter, int member, | |||
113 | 113 | ||
114 | ret = -EINVAL; | 114 | ret = -EINVAL; |
115 | 115 | ||
116 | strstrip(buf); | ||
116 | if (write_strategy) { | 117 | if (write_strategy) { |
117 | if (write_strategy(buf, &tmp)) { | 118 | if (write_strategy(buf, &tmp)) { |
118 | goto out_free; | 119 | goto out_free; |
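The res_counter.c hunk above strips whitespace from the user buffer before parsing, so that a shell redirect (which appends a newline) is accepted. A simplified userspace sketch of the same idea; trim() is a local helper that only strips trailing whitespace, not a libc function:

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void trim(char *s)
{
    size_t len = strlen(s);

    while (len && isspace((unsigned char)s[len - 1]))
        s[--len] = '\0';
}

int main(void)
{
    char buf[] = "4096\n";          /* what "echo 4096 > file" delivers */
    char *end;
    unsigned long val;

    trim(buf);
    val = strtoul(buf, &end, 10);
    if (*end != '\0') {
        fprintf(stderr, "invalid input\n");
        return 1;
    }
    printf("parsed %lu\n", val);
    return 0;
}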
diff --git a/kernel/sched.c b/kernel/sched.c
index b387a8de26a5..1cb53fb1fe3d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -174,41 +174,6 @@ struct task_group { | |||
174 | struct sched_entity **se; | 174 | struct sched_entity **se; |
175 | /* runqueue "owned" by this group on each cpu */ | 175 | /* runqueue "owned" by this group on each cpu */ |
176 | struct cfs_rq **cfs_rq; | 176 | struct cfs_rq **cfs_rq; |
177 | |||
178 | /* | ||
179 | * shares assigned to a task group governs how much of cpu bandwidth | ||
180 | * is allocated to the group. The more shares a group has, the more is | ||
181 | * the cpu bandwidth allocated to it. | ||
182 | * | ||
183 | * For ex, lets say that there are three task groups, A, B and C which | ||
184 | * have been assigned shares 1000, 2000 and 3000 respectively. Then, | ||
185 | * cpu bandwidth allocated by the scheduler to task groups A, B and C | ||
186 | * should be: | ||
187 | * | ||
188 | * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66% | ||
189 | * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33% | ||
190 | * Bw(C) = 3000/(1000+2000+3000) * 100 = 50% | ||
191 | * | ||
192 | * The weight assigned to a task group's schedulable entities on every | ||
193 | * cpu (task_group.se[a_cpu]->load.weight) is derived from the task | ||
194 | * group's shares. For ex: lets say that task group A has been | ||
195 | * assigned shares of 1000 and there are two CPUs in a system. Then, | ||
196 | * | ||
197 | * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000; | ||
198 | * | ||
199 | * Note: It's not necessary that each of a task's group schedulable | ||
200 | * entity have the same weight on all CPUs. If the group | ||
201 | * has 2 of its tasks on CPU0 and 1 task on CPU1, then a | ||
202 | * better distribution of weight could be: | ||
203 | * | ||
204 | * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333 | ||
205 | * tg_A->se[1]->load.weight = 1/2 * 2000 = 667 | ||
206 | * | ||
207 | * rebalance_shares() is responsible for distributing the shares of a | ||
208 | * task groups like this among the group's schedulable entities across | ||
209 | * cpus. | ||
210 | * | ||
211 | */ | ||
212 | unsigned long shares; | 177 | unsigned long shares; |
213 | #endif | 178 | #endif |
214 | 179 | ||
@@ -250,22 +215,12 @@ static DEFINE_SPINLOCK(task_group_lock); | |||
250 | static DEFINE_MUTEX(doms_cur_mutex); | 215 | static DEFINE_MUTEX(doms_cur_mutex); |
251 | 216 | ||
252 | #ifdef CONFIG_FAIR_GROUP_SCHED | 217 | #ifdef CONFIG_FAIR_GROUP_SCHED |
253 | #ifdef CONFIG_SMP | ||
254 | /* kernel thread that runs rebalance_shares() periodically */ | ||
255 | static struct task_struct *lb_monitor_task; | ||
256 | static int load_balance_monitor(void *unused); | ||
257 | #endif | ||
258 | |||
259 | static void set_se_shares(struct sched_entity *se, unsigned long shares); | ||
260 | |||
261 | #ifdef CONFIG_USER_SCHED | 218 | #ifdef CONFIG_USER_SCHED |
262 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) | 219 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) |
263 | #else | 220 | #else |
264 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | 221 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD |
265 | #endif | 222 | #endif |
266 | 223 | ||
267 | #define MIN_GROUP_SHARES 2 | ||
268 | |||
269 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; | 224 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; |
270 | #endif | 225 | #endif |
271 | 226 | ||
@@ -668,6 +623,8 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; | |||
668 | */ | 623 | */ |
669 | unsigned int sysctl_sched_rt_period = 1000000; | 624 | unsigned int sysctl_sched_rt_period = 1000000; |
670 | 625 | ||
626 | static __read_mostly int scheduler_running; | ||
627 | |||
671 | /* | 628 | /* |
672 | * part of the period that we allow rt tasks to run in us. | 629 | * part of the period that we allow rt tasks to run in us. |
673 | * default: 0.95s | 630 | * default: 0.95s |
@@ -689,14 +646,16 @@ unsigned long long cpu_clock(int cpu) | |||
689 | unsigned long flags; | 646 | unsigned long flags; |
690 | struct rq *rq; | 647 | struct rq *rq; |
691 | 648 | ||
692 | local_irq_save(flags); | ||
693 | rq = cpu_rq(cpu); | ||
694 | /* | 649 | /* |
695 | * Only call sched_clock() if the scheduler has already been | 650 | * Only call sched_clock() if the scheduler has already been |
696 | * initialized (some code might call cpu_clock() very early): | 651 | * initialized (some code might call cpu_clock() very early): |
697 | */ | 652 | */ |
698 | if (rq->idle) | 653 | if (unlikely(!scheduler_running)) |
699 | update_rq_clock(rq); | 654 | return 0; |
655 | |||
656 | local_irq_save(flags); | ||
657 | rq = cpu_rq(cpu); | ||
658 | update_rq_clock(rq); | ||
700 | now = rq->clock; | 659 | now = rq->clock; |
701 | local_irq_restore(flags); | 660 | local_irq_restore(flags); |
702 | 661 | ||
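A sketch of the early-boot guard the cpu_clock() hunk above introduces: instead of probing per-CPU state that may not be set up yet, test a simple "subsystem is initialized" flag and return a safe default. The names are illustrative stand-ins for scheduler_running and rq->clock.

#include <stdio.h>

static int scheduler_running;          /* set once initialization completes */
static unsigned long long fake_clock;  /* stands in for rq->clock */

static unsigned long long cpu_clock(int cpu)
{
    (void)cpu;
    if (!scheduler_running)
        return 0;                      /* very early callers get 0, not a crash */
    return ++fake_clock;
}

static void sched_init(void)
{
    fake_clock = 1000;
    scheduler_running = 1;             /* last step of initialization */
}

int main(void)
{
    printf("before init: %llu\n", cpu_clock(0));
    sched_init();
    printf("after init:  %llu\n", cpu_clock(0));
    return 0;
}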
@@ -1241,16 +1200,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime); | |||
1241 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | 1200 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} |
1242 | #endif | 1201 | #endif |
1243 | 1202 | ||
1244 | static inline void inc_cpu_load(struct rq *rq, unsigned long load) | ||
1245 | { | ||
1246 | update_load_add(&rq->load, load); | ||
1247 | } | ||
1248 | |||
1249 | static inline void dec_cpu_load(struct rq *rq, unsigned long load) | ||
1250 | { | ||
1251 | update_load_sub(&rq->load, load); | ||
1252 | } | ||
1253 | |||
1254 | #ifdef CONFIG_SMP | 1203 | #ifdef CONFIG_SMP |
1255 | static unsigned long source_load(int cpu, int type); | 1204 | static unsigned long source_load(int cpu, int type); |
1256 | static unsigned long target_load(int cpu, int type); | 1205 | static unsigned long target_load(int cpu, int type); |
@@ -1268,14 +1217,26 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | |||
1268 | 1217 | ||
1269 | #define sched_class_highest (&rt_sched_class) | 1218 | #define sched_class_highest (&rt_sched_class) |
1270 | 1219 | ||
1271 | static void inc_nr_running(struct rq *rq) | 1220 | static inline void inc_load(struct rq *rq, const struct task_struct *p) |
1221 | { | ||
1222 | update_load_add(&rq->load, p->se.load.weight); | ||
1223 | } | ||
1224 | |||
1225 | static inline void dec_load(struct rq *rq, const struct task_struct *p) | ||
1226 | { | ||
1227 | update_load_sub(&rq->load, p->se.load.weight); | ||
1228 | } | ||
1229 | |||
1230 | static void inc_nr_running(struct task_struct *p, struct rq *rq) | ||
1272 | { | 1231 | { |
1273 | rq->nr_running++; | 1232 | rq->nr_running++; |
1233 | inc_load(rq, p); | ||
1274 | } | 1234 | } |
1275 | 1235 | ||
1276 | static void dec_nr_running(struct rq *rq) | 1236 | static void dec_nr_running(struct task_struct *p, struct rq *rq) |
1277 | { | 1237 | { |
1278 | rq->nr_running--; | 1238 | rq->nr_running--; |
1239 | dec_load(rq, p); | ||
1279 | } | 1240 | } |
1280 | 1241 | ||
1281 | static void set_load_weight(struct task_struct *p) | 1242 | static void set_load_weight(struct task_struct *p) |
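A sketch of the accounting change above: the per-runqueue load is now adjusted inside the same helpers that adjust nr_running, so the two counters cannot drift apart. struct rq and struct task are trimmed stand-ins for the kernel structures.

#include <stdio.h>

struct task { unsigned long weight; };
struct rq   { unsigned int nr_running; unsigned long load; };

static void inc_nr_running(struct task *p, struct rq *rq)
{
    rq->nr_running++;
    rq->load += p->weight;     /* inc_load(rq, p) */
}

static void dec_nr_running(struct task *p, struct rq *rq)
{
    rq->nr_running--;
    rq->load -= p->weight;     /* dec_load(rq, p) */
}

int main(void)
{
    struct rq rq = { 0, 0 };
    struct task a = { .weight = 1024 }, b = { .weight = 335 };

    inc_nr_running(&a, &rq);
    inc_nr_running(&b, &rq);
    dec_nr_running(&a, &rq);
    printf("nr_running=%u load=%lu\n", rq.nr_running, rq.load);
    return 0;
}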
@@ -1367,7 +1328,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | |||
1367 | rq->nr_uninterruptible--; | 1328 | rq->nr_uninterruptible--; |
1368 | 1329 | ||
1369 | enqueue_task(rq, p, wakeup); | 1330 | enqueue_task(rq, p, wakeup); |
1370 | inc_nr_running(rq); | 1331 | inc_nr_running(p, rq); |
1371 | } | 1332 | } |
1372 | 1333 | ||
1373 | /* | 1334 | /* |
@@ -1379,7 +1340,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | |||
1379 | rq->nr_uninterruptible++; | 1340 | rq->nr_uninterruptible++; |
1380 | 1341 | ||
1381 | dequeue_task(rq, p, sleep); | 1342 | dequeue_task(rq, p, sleep); |
1382 | dec_nr_running(rq); | 1343 | dec_nr_running(p, rq); |
1383 | } | 1344 | } |
1384 | 1345 | ||
1385 | /** | 1346 | /** |
@@ -2019,7 +1980,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
2019 | * management (if any): | 1980 | * management (if any): |
2020 | */ | 1981 | */ |
2021 | p->sched_class->task_new(rq, p); | 1982 | p->sched_class->task_new(rq, p); |
2022 | inc_nr_running(rq); | 1983 | inc_nr_running(p, rq); |
2023 | } | 1984 | } |
2024 | check_preempt_curr(rq, p); | 1985 | check_preempt_curr(rq, p); |
2025 | #ifdef CONFIG_SMP | 1986 | #ifdef CONFIG_SMP |
@@ -3885,7 +3846,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev) | |||
3885 | asmlinkage void __sched schedule(void) | 3846 | asmlinkage void __sched schedule(void) |
3886 | { | 3847 | { |
3887 | struct task_struct *prev, *next; | 3848 | struct task_struct *prev, *next; |
3888 | long *switch_count; | 3849 | unsigned long *switch_count; |
3889 | struct rq *rq; | 3850 | struct rq *rq; |
3890 | int cpu; | 3851 | int cpu; |
3891 | 3852 | ||
@@ -4358,8 +4319,10 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4358 | goto out_unlock; | 4319 | goto out_unlock; |
4359 | } | 4320 | } |
4360 | on_rq = p->se.on_rq; | 4321 | on_rq = p->se.on_rq; |
4361 | if (on_rq) | 4322 | if (on_rq) { |
4362 | dequeue_task(rq, p, 0); | 4323 | dequeue_task(rq, p, 0); |
4324 | dec_load(rq, p); | ||
4325 | } | ||
4363 | 4326 | ||
4364 | p->static_prio = NICE_TO_PRIO(nice); | 4327 | p->static_prio = NICE_TO_PRIO(nice); |
4365 | set_load_weight(p); | 4328 | set_load_weight(p); |
@@ -4369,6 +4332,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4369 | 4332 | ||
4370 | if (on_rq) { | 4333 | if (on_rq) { |
4371 | enqueue_task(rq, p, 0); | 4334 | enqueue_task(rq, p, 0); |
4335 | inc_load(rq, p); | ||
4372 | /* | 4336 | /* |
4373 | * If the task increased its priority or is running and | 4337 | * If the task increased its priority or is running and |
4374 | * lowered its priority, then reschedule its CPU: | 4338 | * lowered its priority, then reschedule its CPU: |
@@ -4458,7 +4422,7 @@ int task_nice(const struct task_struct *p) | |||
4458 | { | 4422 | { |
4459 | return TASK_NICE(p); | 4423 | return TASK_NICE(p); |
4460 | } | 4424 | } |
4461 | EXPORT_SYMBOL_GPL(task_nice); | 4425 | EXPORT_SYMBOL(task_nice); |
4462 | 4426 | ||
4463 | /** | 4427 | /** |
4464 | * idle_cpu - is a given cpu idle currently? | 4428 | * idle_cpu - is a given cpu idle currently? |
@@ -5136,7 +5100,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | |||
5136 | time_slice = 0; | 5100 | time_slice = 0; |
5137 | if (p->policy == SCHED_RR) { | 5101 | if (p->policy == SCHED_RR) { |
5138 | time_slice = DEF_TIMESLICE; | 5102 | time_slice = DEF_TIMESLICE; |
5139 | } else { | 5103 | } else if (p->policy != SCHED_FIFO) { |
5140 | struct sched_entity *se = &p->se; | 5104 | struct sched_entity *se = &p->se; |
5141 | unsigned long flags; | 5105 | unsigned long flags; |
5142 | struct rq *rq; | 5106 | struct rq *rq; |
@@ -5917,7 +5881,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5917 | spin_unlock_irq(&rq->lock); | 5881 | spin_unlock_irq(&rq->lock); |
5918 | break; | 5882 | break; |
5919 | 5883 | ||
5920 | case CPU_DOWN_PREPARE: | 5884 | case CPU_DYING: |
5885 | case CPU_DYING_FROZEN: | ||
5921 | /* Update our root-domain */ | 5886 | /* Update our root-domain */ |
5922 | rq = cpu_rq(cpu); | 5887 | rq = cpu_rq(cpu); |
5923 | spin_lock_irqsave(&rq->lock, flags); | 5888 | spin_lock_irqsave(&rq->lock, flags); |
@@ -7083,21 +7048,6 @@ void __init sched_init_smp(void) | |||
7083 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) | 7048 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) |
7084 | BUG(); | 7049 | BUG(); |
7085 | sched_init_granularity(); | 7050 | sched_init_granularity(); |
7086 | |||
7087 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7088 | if (nr_cpu_ids == 1) | ||
7089 | return; | ||
7090 | |||
7091 | lb_monitor_task = kthread_create(load_balance_monitor, NULL, | ||
7092 | "group_balance"); | ||
7093 | if (!IS_ERR(lb_monitor_task)) { | ||
7094 | lb_monitor_task->flags |= PF_NOFREEZE; | ||
7095 | wake_up_process(lb_monitor_task); | ||
7096 | } else { | ||
7097 | printk(KERN_ERR "Could not create load balance monitor thread" | ||
7098 | "(error = %ld) \n", PTR_ERR(lb_monitor_task)); | ||
7099 | } | ||
7100 | #endif | ||
7101 | } | 7051 | } |
7102 | #else | 7052 | #else |
7103 | void __init sched_init_smp(void) | 7053 | void __init sched_init_smp(void) |
@@ -7284,6 +7234,8 @@ void __init sched_init(void) | |||
7284 | * During early bootup we pretend to be a normal task: | 7234 | * During early bootup we pretend to be a normal task: |
7285 | */ | 7235 | */ |
7286 | current->sched_class = &fair_sched_class; | 7236 | current->sched_class = &fair_sched_class; |
7237 | |||
7238 | scheduler_running = 1; | ||
7287 | } | 7239 | } |
7288 | 7240 | ||
7289 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 7241 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
@@ -7418,157 +7370,6 @@ void set_curr_task(int cpu, struct task_struct *p) | |||
7418 | 7370 | ||
7419 | #ifdef CONFIG_GROUP_SCHED | 7371 | #ifdef CONFIG_GROUP_SCHED |
7420 | 7372 | ||
7421 | #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP | ||
7422 | /* | ||
7423 | * distribute shares of all task groups among their schedulable entities, | ||
7424 | * to reflect load distribution across cpus. | ||
7425 | */ | ||
7426 | static int rebalance_shares(struct sched_domain *sd, int this_cpu) | ||
7427 | { | ||
7428 | struct cfs_rq *cfs_rq; | ||
7429 | struct rq *rq = cpu_rq(this_cpu); | ||
7430 | cpumask_t sdspan = sd->span; | ||
7431 | int balanced = 1; | ||
7432 | |||
7433 | /* Walk through all the task groups that we have */ | ||
7434 | for_each_leaf_cfs_rq(rq, cfs_rq) { | ||
7435 | int i; | ||
7436 | unsigned long total_load = 0, total_shares; | ||
7437 | struct task_group *tg = cfs_rq->tg; | ||
7438 | |||
7439 | /* Gather total task load of this group across cpus */ | ||
7440 | for_each_cpu_mask(i, sdspan) | ||
7441 | total_load += tg->cfs_rq[i]->load.weight; | ||
7442 | |||
7443 | /* Nothing to do if this group has no load */ | ||
7444 | if (!total_load) | ||
7445 | continue; | ||
7446 | |||
7447 | /* | ||
7448 | * tg->shares represents the number of cpu shares the task group | ||
7449 | * is eligible to hold on a single cpu. On N cpus, it is | ||
7450 | * eligible to hold (N * tg->shares) number of cpu shares. | ||
7451 | */ | ||
7452 | total_shares = tg->shares * cpus_weight(sdspan); | ||
7453 | |||
7454 | /* | ||
7455 | * redistribute total_shares across cpus as per the task load | ||
7456 | * distribution. | ||
7457 | */ | ||
7458 | for_each_cpu_mask(i, sdspan) { | ||
7459 | unsigned long local_load, local_shares; | ||
7460 | |||
7461 | local_load = tg->cfs_rq[i]->load.weight; | ||
7462 | local_shares = (local_load * total_shares) / total_load; | ||
7463 | if (!local_shares) | ||
7464 | local_shares = MIN_GROUP_SHARES; | ||
7465 | if (local_shares == tg->se[i]->load.weight) | ||
7466 | continue; | ||
7467 | |||
7468 | spin_lock_irq(&cpu_rq(i)->lock); | ||
7469 | set_se_shares(tg->se[i], local_shares); | ||
7470 | spin_unlock_irq(&cpu_rq(i)->lock); | ||
7471 | balanced = 0; | ||
7472 | } | ||
7473 | } | ||
7474 | |||
7475 | return balanced; | ||
7476 | } | ||
7477 | |||
7478 | /* | ||
7479 | * How frequently should we rebalance_shares() across cpus? | ||
7480 | * | ||
7481 | * The more frequently we rebalance shares, the more accurate is the fairness | ||
7482 | * of cpu bandwidth distribution between task groups. However higher frequency | ||
7483 | * also implies increased scheduling overhead. | ||
7484 | * | ||
7485 | * sysctl_sched_min_bal_int_shares represents the minimum interval between | ||
7486 | * consecutive calls to rebalance_shares() in the same sched domain. | ||
7487 | * | ||
7488 | * sysctl_sched_max_bal_int_shares represents the maximum interval between | ||
7489 | * consecutive calls to rebalance_shares() in the same sched domain. | ||
7490 | * | ||
7491 | * These settings allow for the appropriate trade-off between accuracy of | ||
7492 | * fairness and the associated overhead. | ||
7493 | * | ||
7494 | */ | ||
7495 | |||
7496 | /* default: 8ms, units: milliseconds */ | ||
7497 | const_debug unsigned int sysctl_sched_min_bal_int_shares = 8; | ||
7498 | |||
7499 | /* default: 128ms, units: milliseconds */ | ||
7500 | const_debug unsigned int sysctl_sched_max_bal_int_shares = 128; | ||
7501 | |||
7502 | /* kernel thread that runs rebalance_shares() periodically */ | ||
7503 | static int load_balance_monitor(void *unused) | ||
7504 | { | ||
7505 | unsigned int timeout = sysctl_sched_min_bal_int_shares; | ||
7506 | struct sched_param schedparm; | ||
7507 | int ret; | ||
7508 | |||
7509 | /* | ||
7510 | * We don't want this thread's execution to be limited by the shares | ||
7511 | * assigned to default group (init_task_group). Hence make it run | ||
7512 | * as a SCHED_RR RT task at the lowest priority. | ||
7513 | */ | ||
7514 | schedparm.sched_priority = 1; | ||
7515 | ret = sched_setscheduler(current, SCHED_RR, &schedparm); | ||
7516 | if (ret) | ||
7517 | printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance" | ||
7518 | " monitor thread (error = %d) \n", ret); | ||
7519 | |||
7520 | while (!kthread_should_stop()) { | ||
7521 | int i, cpu, balanced = 1; | ||
7522 | |||
7523 | /* Prevent cpus going down or coming up */ | ||
7524 | get_online_cpus(); | ||
7525 | /* lockout changes to doms_cur[] array */ | ||
7526 | lock_doms_cur(); | ||
7527 | /* | ||
7528 | * Enter a rcu read-side critical section to safely walk rq->sd | ||
7529 | * chain on various cpus and to walk task group list | ||
7530 | * (rq->leaf_cfs_rq_list) in rebalance_shares(). | ||
7531 | */ | ||
7532 | rcu_read_lock(); | ||
7533 | |||
7534 | for (i = 0; i < ndoms_cur; i++) { | ||
7535 | cpumask_t cpumap = doms_cur[i]; | ||
7536 | struct sched_domain *sd = NULL, *sd_prev = NULL; | ||
7537 | |||
7538 | cpu = first_cpu(cpumap); | ||
7539 | |||
7540 | /* Find the highest domain at which to balance shares */ | ||
7541 | for_each_domain(cpu, sd) { | ||
7542 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
7543 | continue; | ||
7544 | sd_prev = sd; | ||
7545 | } | ||
7546 | |||
7547 | sd = sd_prev; | ||
7548 | /* sd == NULL? No load balance reqd in this domain */ | ||
7549 | if (!sd) | ||
7550 | continue; | ||
7551 | |||
7552 | balanced &= rebalance_shares(sd, cpu); | ||
7553 | } | ||
7554 | |||
7555 | rcu_read_unlock(); | ||
7556 | |||
7557 | unlock_doms_cur(); | ||
7558 | put_online_cpus(); | ||
7559 | |||
7560 | if (!balanced) | ||
7561 | timeout = sysctl_sched_min_bal_int_shares; | ||
7562 | else if (timeout < sysctl_sched_max_bal_int_shares) | ||
7563 | timeout *= 2; | ||
7564 | |||
7565 | msleep_interruptible(timeout); | ||
7566 | } | ||
7567 | |||
7568 | return 0; | ||
7569 | } | ||
7570 | #endif /* CONFIG_SMP */ | ||
7571 | |||
7572 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7373 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7573 | static void free_fair_sched_group(struct task_group *tg) | 7374 | static void free_fair_sched_group(struct task_group *tg) |
7574 | { | 7375 | { |
@@ -7825,6 +7626,11 @@ void sched_move_task(struct task_struct *tsk) | |||
7825 | 7626 | ||
7826 | set_task_rq(tsk, task_cpu(tsk)); | 7627 | set_task_rq(tsk, task_cpu(tsk)); |
7827 | 7628 | ||
7629 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7630 | if (tsk->sched_class->moved_group) | ||
7631 | tsk->sched_class->moved_group(tsk); | ||
7632 | #endif | ||
7633 | |||
7828 | if (on_rq) { | 7634 | if (on_rq) { |
7829 | if (unlikely(running)) | 7635 | if (unlikely(running)) |
7830 | tsk->sched_class->set_curr_task(rq); | 7636 | tsk->sched_class->set_curr_task(rq); |
@@ -7835,29 +7641,25 @@ void sched_move_task(struct task_struct *tsk) | |||
7835 | } | 7641 | } |
7836 | 7642 | ||
7837 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7643 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7838 | /* rq->lock to be locked by caller */ | ||
7839 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | 7644 | static void set_se_shares(struct sched_entity *se, unsigned long shares) |
7840 | { | 7645 | { |
7841 | struct cfs_rq *cfs_rq = se->cfs_rq; | 7646 | struct cfs_rq *cfs_rq = se->cfs_rq; |
7842 | struct rq *rq = cfs_rq->rq; | 7647 | struct rq *rq = cfs_rq->rq; |
7843 | int on_rq; | 7648 | int on_rq; |
7844 | 7649 | ||
7845 | if (!shares) | 7650 | spin_lock_irq(&rq->lock); |
7846 | shares = MIN_GROUP_SHARES; | ||
7847 | 7651 | ||
7848 | on_rq = se->on_rq; | 7652 | on_rq = se->on_rq; |
7849 | if (on_rq) { | 7653 | if (on_rq) |
7850 | dequeue_entity(cfs_rq, se, 0); | 7654 | dequeue_entity(cfs_rq, se, 0); |
7851 | dec_cpu_load(rq, se->load.weight); | ||
7852 | } | ||
7853 | 7655 | ||
7854 | se->load.weight = shares; | 7656 | se->load.weight = shares; |
7855 | se->load.inv_weight = div64_64((1ULL<<32), shares); | 7657 | se->load.inv_weight = div64_64((1ULL<<32), shares); |
7856 | 7658 | ||
7857 | if (on_rq) { | 7659 | if (on_rq) |
7858 | enqueue_entity(cfs_rq, se, 0); | 7660 | enqueue_entity(cfs_rq, se, 0); |
7859 | inc_cpu_load(rq, se->load.weight); | 7661 | |
7860 | } | 7662 | spin_unlock_irq(&rq->lock); |
7861 | } | 7663 | } |
7862 | 7664 | ||
7863 | static DEFINE_MUTEX(shares_mutex); | 7665 | static DEFINE_MUTEX(shares_mutex); |
@@ -7867,18 +7669,18 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
7867 | int i; | 7669 | int i; |
7868 | unsigned long flags; | 7670 | unsigned long flags; |
7869 | 7671 | ||
7672 | /* | ||
7673 | * A weight of 0 or 1 can cause arithmetics problems. | ||
7674 | * (The default weight is 1024 - so there's no practical | ||
7675 | * limitation from this.) | ||
7676 | */ | ||
7677 | if (shares < 2) | ||
7678 | shares = 2; | ||
7679 | |||
7870 | mutex_lock(&shares_mutex); | 7680 | mutex_lock(&shares_mutex); |
7871 | if (tg->shares == shares) | 7681 | if (tg->shares == shares) |
7872 | goto done; | 7682 | goto done; |
7873 | 7683 | ||
7874 | if (shares < MIN_GROUP_SHARES) | ||
7875 | shares = MIN_GROUP_SHARES; | ||
7876 | |||
7877 | /* | ||
7878 | * Prevent any load balance activity (rebalance_shares, | ||
7879 | * load_balance_fair) from referring to this group first, | ||
7880 | * by taking it off the rq->leaf_cfs_rq_list on each cpu. | ||
7881 | */ | ||
7882 | spin_lock_irqsave(&task_group_lock, flags); | 7684 | spin_lock_irqsave(&task_group_lock, flags); |
7883 | for_each_possible_cpu(i) | 7685 | for_each_possible_cpu(i) |
7884 | unregister_fair_sched_group(tg, i); | 7686 | unregister_fair_sched_group(tg, i); |
@@ -7892,11 +7694,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
7892 | * w/o tripping rebalance_share or load_balance_fair. | 7694 | * w/o tripping rebalance_share or load_balance_fair. |
7893 | */ | 7695 | */ |
7894 | tg->shares = shares; | 7696 | tg->shares = shares; |
7895 | for_each_possible_cpu(i) { | 7697 | for_each_possible_cpu(i) |
7896 | spin_lock_irq(&cpu_rq(i)->lock); | ||
7897 | set_se_shares(tg->se[i], shares); | 7698 | set_se_shares(tg->se[i], shares); |
7898 | spin_unlock_irq(&cpu_rq(i)->lock); | ||
7899 | } | ||
7900 | 7699 | ||
7901 | /* | 7700 | /* |
7902 | * Enable load balance activity on this group, by inserting it back on | 7701 | * Enable load balance activity on this group, by inserting it back on |
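Editor's note on the two hunks above: the new lower bound of 2 on shares exists because set_se_shares() stores div64_64((1ULL<<32), shares) into load.inv_weight; shares == 0 would divide by zero, and shares == 1 yields 2^32, which no longer fits a 32-bit inv_weight field. A minimal userspace illustration of the truncation, assuming a 32-bit inv_weight as on 32-bit kernels of this era:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t shares_values[] = { 1, 2, 1024 };

	for (int i = 0; i < 3; i++) {
		uint64_t shares = shares_values[i];
		uint64_t full = (1ULL << 32) / shares;   /* what div64_64() computes */
		uint32_t stored = (uint32_t)full;        /* what a 32-bit inv_weight keeps */

		printf("shares=%4llu  full=%llu  stored=%u\n",
		       (unsigned long long)shares,
		       (unsigned long long)full, stored);
	}
	/* shares == 1 prints stored == 0: every later weight * inv_weight
	 * multiplication would collapse to zero, hence the clamp to 2. */
	return 0;
}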
@@ -7928,9 +7727,7 @@ static unsigned long to_ratio(u64 period, u64 runtime) | |||
7928 | if (runtime == RUNTIME_INF) | 7727 | if (runtime == RUNTIME_INF) |
7929 | return 1ULL << 16; | 7728 | return 1ULL << 16; |
7930 | 7729 | ||
7931 | runtime *= (1ULL << 16); | 7730 | return div64_64(runtime << 16, period); |
7932 | div64_64(runtime, period); | ||
7933 | return runtime; | ||
7934 | } | 7731 | } |
7935 | 7732 | ||
7936 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | 7733 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) |
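Editor's note: the old to_ratio() shifted runtime, called div64_64() and then threw the quotient away (div64_64() returns its result rather than dividing in place), so it effectively returned runtime << 16. The rewrite returns the quotient, i.e. the runtime/period ratio as a 16-bit fixed-point fraction, with 1 << 16 standing for full utilisation (the RUNTIME_INF case). A userspace sketch using plain 64-bit division in place of div64_64():

#include <stdio.h>
#include <stdint.h>

/* ratio of runtime to period as a 16-bit fixed-point fraction */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	return (runtime << 16) / period;
}

int main(void)
{
	/* defaults of the era: 1 s period, 950 ms runtime -> about 0.95 * 65536 */
	uint64_t period  = 1000000000ULL;   /* ns */
	uint64_t runtime =  950000000ULL;   /* ns */

	printf("ratio = %llu (full utilisation would be %u)\n",
	       (unsigned long long)to_ratio(period, runtime), 1u << 16);
	return 0;
}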
@@ -7954,25 +7751,40 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | |||
7954 | return total + to_ratio(period, runtime) < global_ratio; | 7751 | return total + to_ratio(period, runtime) < global_ratio; |
7955 | } | 7752 | } |
7956 | 7753 | ||
7754 | /* Must be called with tasklist_lock held */ | ||
7755 | static inline int tg_has_rt_tasks(struct task_group *tg) | ||
7756 | { | ||
7757 | struct task_struct *g, *p; | ||
7758 | do_each_thread(g, p) { | ||
7759 | if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) | ||
7760 | return 1; | ||
7761 | } while_each_thread(g, p); | ||
7762 | return 0; | ||
7763 | } | ||
7764 | |||
7957 | int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | 7765 | int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) |
7958 | { | 7766 | { |
7959 | u64 rt_runtime, rt_period; | 7767 | u64 rt_runtime, rt_period; |
7960 | int err = 0; | 7768 | int err = 0; |
7961 | 7769 | ||
7962 | rt_period = sysctl_sched_rt_period * NSEC_PER_USEC; | 7770 | rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; |
7963 | rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; | 7771 | rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; |
7964 | if (rt_runtime_us == -1) | 7772 | if (rt_runtime_us == -1) |
7965 | rt_runtime = rt_period; | 7773 | rt_runtime = RUNTIME_INF; |
7966 | 7774 | ||
7967 | mutex_lock(&rt_constraints_mutex); | 7775 | mutex_lock(&rt_constraints_mutex); |
7776 | read_lock(&tasklist_lock); | ||
7777 | if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) { | ||
7778 | err = -EBUSY; | ||
7779 | goto unlock; | ||
7780 | } | ||
7968 | if (!__rt_schedulable(tg, rt_period, rt_runtime)) { | 7781 | if (!__rt_schedulable(tg, rt_period, rt_runtime)) { |
7969 | err = -EINVAL; | 7782 | err = -EINVAL; |
7970 | goto unlock; | 7783 | goto unlock; |
7971 | } | 7784 | } |
7972 | if (rt_runtime_us == -1) | ||
7973 | rt_runtime = RUNTIME_INF; | ||
7974 | tg->rt_runtime = rt_runtime; | 7785 | tg->rt_runtime = rt_runtime; |
7975 | unlock: | 7786 | unlock: |
7787 | read_unlock(&tasklist_lock); | ||
7976 | mutex_unlock(&rt_constraints_mutex); | 7788 | mutex_unlock(&rt_constraints_mutex); |
7977 | 7789 | ||
7978 | return err; | 7790 | return err; |
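Editor's note: besides the new -EBUSY guard against removing runtime from a group that still holds RT tasks, the hunk adds a (u64) cast on sysctl_sched_rt_period. Without it the multiplication by NSEC_PER_USEC is done in 32-bit arithmetic and wraps for periods above roughly 4.29 seconds, and only the wrapped result is widened. A quick userspace demonstration, assuming the sysctl is a 32-bit unsigned int holding microseconds:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC 1000ULL

int main(void)
{
	unsigned int period_us = 10U * 1000000U;               /* 10 s in us */

	uint64_t wrong = period_us * 1000U;                     /* 32-bit multiply wraps */
	uint64_t right = (uint64_t)period_us * NSEC_PER_USEC;   /* widened first */

	printf("wrong = %llu ns\nright = %llu ns\n",
	       (unsigned long long)wrong, (unsigned long long)right);
	return 0;
}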
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6c091d6e159d..e2a530515619 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -202,17 +202,12 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) | |||
202 | 202 | ||
203 | static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) | 203 | static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) |
204 | { | 204 | { |
205 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; | 205 | struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); |
206 | struct sched_entity *se = NULL; | ||
207 | struct rb_node *parent; | ||
208 | 206 | ||
209 | while (*link) { | 207 | if (!last) |
210 | parent = *link; | 208 | return NULL; |
211 | se = rb_entry(parent, struct sched_entity, run_node); | ||
212 | link = &parent->rb_right; | ||
213 | } | ||
214 | 209 | ||
215 | return se; | 210 | return rb_entry(last, struct sched_entity, run_node); |
216 | } | 211 | } |
217 | 212 | ||
218 | /************************************************************** | 213 | /************************************************************** |
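Editor's note: the open-coded loop in __pick_last_entity() only ever followed rb_right, i.e. it walked to the rightmost node of the timeline, and rb_last() does exactly that walk inside the rbtree library. A userspace sketch of the same idea on a plain binary search tree (not the kernel's struct rb_node), where the rightmost node is by construction the largest key:

#include <stdio.h>
#include <stdlib.h>

struct node {
	int key;
	struct node *left, *right;
};

static struct node *insert(struct node *root, int key)
{
	if (!root) {
		struct node *n = calloc(1, sizeof(*n));
		n->key = key;
		return n;
	}
	if (key < root->key)
		root->left = insert(root->left, key);
	else
		root->right = insert(root->right, key);
	return root;
}

/* equivalent of rb_last(): keep going right until there is nothing to our right */
static struct node *last(struct node *root)
{
	if (!root)
		return NULL;
	while (root->right)
		root = root->right;
	return root;
}

int main(void)
{
	struct node *root = NULL;
	int keys[] = { 40, 10, 70, 55, 90 };

	for (int i = 0; i < 5; i++)
		root = insert(root, keys[i]);

	printf("last key = %d\n", last(root)->key);   /* prints 90, the maximum */
	return 0;
}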
@@ -732,8 +727,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) | |||
732 | return se->parent; | 727 | return se->parent; |
733 | } | 728 | } |
734 | 729 | ||
735 | #define GROUP_IMBALANCE_PCT 20 | ||
736 | |||
737 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 730 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
738 | 731 | ||
739 | #define for_each_sched_entity(se) \ | 732 | #define for_each_sched_entity(se) \ |
@@ -824,26 +817,15 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p) | |||
824 | static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) | 817 | static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) |
825 | { | 818 | { |
826 | struct cfs_rq *cfs_rq; | 819 | struct cfs_rq *cfs_rq; |
827 | struct sched_entity *se = &p->se, | 820 | struct sched_entity *se = &p->se; |
828 | *topse = NULL; /* Highest schedulable entity */ | ||
829 | int incload = 1; | ||
830 | 821 | ||
831 | for_each_sched_entity(se) { | 822 | for_each_sched_entity(se) { |
832 | topse = se; | 823 | if (se->on_rq) |
833 | if (se->on_rq) { | ||
834 | incload = 0; | ||
835 | break; | 824 | break; |
836 | } | ||
837 | cfs_rq = cfs_rq_of(se); | 825 | cfs_rq = cfs_rq_of(se); |
838 | enqueue_entity(cfs_rq, se, wakeup); | 826 | enqueue_entity(cfs_rq, se, wakeup); |
839 | wakeup = 1; | 827 | wakeup = 1; |
840 | } | 828 | } |
841 | /* Increment cpu load if we just enqueued the first task of a group on | ||
842 | * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs | ||
843 | * at the highest grouping level. | ||
844 | */ | ||
845 | if (incload) | ||
846 | inc_cpu_load(rq, topse->load.weight); | ||
847 | 829 | ||
848 | hrtick_start_fair(rq, rq->curr); | 830 | hrtick_start_fair(rq, rq->curr); |
849 | } | 831 | } |
@@ -856,28 +838,16 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) | |||
856 | static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) | 838 | static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) |
857 | { | 839 | { |
858 | struct cfs_rq *cfs_rq; | 840 | struct cfs_rq *cfs_rq; |
859 | struct sched_entity *se = &p->se, | 841 | struct sched_entity *se = &p->se; |
860 | *topse = NULL; /* Highest schedulable entity */ | ||
861 | int decload = 1; | ||
862 | 842 | ||
863 | for_each_sched_entity(se) { | 843 | for_each_sched_entity(se) { |
864 | topse = se; | ||
865 | cfs_rq = cfs_rq_of(se); | 844 | cfs_rq = cfs_rq_of(se); |
866 | dequeue_entity(cfs_rq, se, sleep); | 845 | dequeue_entity(cfs_rq, se, sleep); |
867 | /* Don't dequeue parent if it has other entities besides us */ | 846 | /* Don't dequeue parent if it has other entities besides us */ |
868 | if (cfs_rq->load.weight) { | 847 | if (cfs_rq->load.weight) |
869 | if (parent_entity(se)) | ||
870 | decload = 0; | ||
871 | break; | 848 | break; |
872 | } | ||
873 | sleep = 1; | 849 | sleep = 1; |
874 | } | 850 | } |
875 | /* Decrement cpu load if we just dequeued the last task of a group on | ||
876 | * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs | ||
877 | * at the highest grouping level. | ||
878 | */ | ||
879 | if (decload) | ||
880 | dec_cpu_load(rq, topse->load.weight); | ||
881 | 851 | ||
882 | hrtick_start_fair(rq, rq->curr); | 852 | hrtick_start_fair(rq, rq->curr); |
883 | } | 853 | } |
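Editor's note on the two hunks above: both paths now simply walk up the entity hierarchy, enqueue stopping at the first ancestor already on a runqueue and dequeue stopping at the first ancestor that still carries other load, with the explicit inc_cpu_load()/dec_cpu_load() bookkeeping for the topmost group gone. A small userspace model of the enqueue walk, with an invented entity struct standing in for struct sched_entity:

#include <stdio.h>
#include <stdbool.h>

struct entity {
	const char *name;
	bool on_rq;
	struct entity *parent;
};

/* walk towards the root, enqueueing until we meet an already-enqueued ancestor */
static void enqueue_hierarchy(struct entity *se)
{
	for (; se; se = se->parent) {
		if (se->on_rq)
			break;
		se->on_rq = true;
		printf("enqueued %s\n", se->name);
	}
}

int main(void)
{
	struct entity root  = { "root group",  true,  NULL   };
	struct entity group = { "child group", false, &root  };
	struct entity task  = { "task",        false, &group };

	enqueue_hierarchy(&task);   /* enqueues task and child group, stops at root */
	return 0;
}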
@@ -1191,6 +1161,25 @@ static struct task_struct *load_balance_next_fair(void *arg) | |||
1191 | return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); | 1161 | return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); |
1192 | } | 1162 | } |
1193 | 1163 | ||
1164 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1165 | static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) | ||
1166 | { | ||
1167 | struct sched_entity *curr; | ||
1168 | struct task_struct *p; | ||
1169 | |||
1170 | if (!cfs_rq->nr_running || !first_fair(cfs_rq)) | ||
1171 | return MAX_PRIO; | ||
1172 | |||
1173 | curr = cfs_rq->curr; | ||
1174 | if (!curr) | ||
1175 | curr = __pick_next_entity(cfs_rq); | ||
1176 | |||
1177 | p = task_of(curr); | ||
1178 | |||
1179 | return p->prio; | ||
1180 | } | ||
1181 | #endif | ||
1182 | |||
1194 | static unsigned long | 1183 | static unsigned long |
1195 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1184 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
1196 | unsigned long max_load_move, | 1185 | unsigned long max_load_move, |
@@ -1200,45 +1189,28 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1200 | struct cfs_rq *busy_cfs_rq; | 1189 | struct cfs_rq *busy_cfs_rq; |
1201 | long rem_load_move = max_load_move; | 1190 | long rem_load_move = max_load_move; |
1202 | struct rq_iterator cfs_rq_iterator; | 1191 | struct rq_iterator cfs_rq_iterator; |
1203 | unsigned long load_moved; | ||
1204 | 1192 | ||
1205 | cfs_rq_iterator.start = load_balance_start_fair; | 1193 | cfs_rq_iterator.start = load_balance_start_fair; |
1206 | cfs_rq_iterator.next = load_balance_next_fair; | 1194 | cfs_rq_iterator.next = load_balance_next_fair; |
1207 | 1195 | ||
1208 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { | 1196 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { |
1209 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1197 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1210 | struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu]; | 1198 | struct cfs_rq *this_cfs_rq; |
1211 | unsigned long maxload, task_load, group_weight; | 1199 | long imbalance; |
1212 | unsigned long thisload, per_task_load; | 1200 | unsigned long maxload; |
1213 | struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu]; | ||
1214 | |||
1215 | task_load = busy_cfs_rq->load.weight; | ||
1216 | group_weight = se->load.weight; | ||
1217 | 1201 | ||
1218 | /* | 1202 | this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); |
1219 | * 'group_weight' is contributed by tasks of total weight | ||
1220 | * 'task_load'. To move 'rem_load_move' worth of weight only, | ||
1221 | * we need to move a maximum task load of: | ||
1222 | * | ||
1223 | * maxload = (remload / group_weight) * task_load; | ||
1224 | */ | ||
1225 | maxload = (rem_load_move * task_load) / group_weight; | ||
1226 | 1203 | ||
1227 | if (!maxload || !task_load) | 1204 | imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; |
1205 | /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ | ||
1206 | if (imbalance <= 0) | ||
1228 | continue; | 1207 | continue; |
1229 | 1208 | ||
1230 | per_task_load = task_load / busy_cfs_rq->nr_running; | 1209 | /* Don't pull more than imbalance/2 */ |
1231 | /* | 1210 | imbalance /= 2; |
1232 | * balance_tasks will try to forcibly move atleast one task if | 1211 | maxload = min(rem_load_move, imbalance); |
1233 | * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if | ||
1234 | * maxload is less than GROUP_IMBALANCE_FUZZ% the per_task_load. | ||
1235 | */ | ||
1236 | if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load) | ||
1237 | continue; | ||
1238 | 1212 | ||
1239 | /* Disable priority-based load balance */ | 1213 | *this_best_prio = cfs_rq_best_prio(this_cfs_rq); |
1240 | *this_best_prio = 0; | ||
1241 | thisload = this_cfs_rq->load.weight; | ||
1242 | #else | 1214 | #else |
1243 | # define maxload rem_load_move | 1215 | # define maxload rem_load_move |
1244 | #endif | 1216 | #endif |
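Editor's note: the group case now sizes maxload from the raw load difference between the busiest cpu's cfs_rq and the local cfs_rq of the same group, pulls at most half of that difference, and never more than the load still left to move. Roughly, as a userspace helper mirroring the lines above:

#include <stdio.h>

/* how much weight to try to pull for one task group, per the hunk above */
static unsigned long group_maxload(unsigned long busiest_load,
				   unsigned long this_load,
				   unsigned long rem_load_move)
{
	long imbalance = (long)busiest_load - (long)this_load;

	if (imbalance <= 0)          /* we already carry at least as much: skip */
		return 0;

	imbalance /= 2;              /* don't pull more than half of the gap */
	return rem_load_move < (unsigned long)imbalance ?
	       rem_load_move : (unsigned long)imbalance;
}

int main(void)
{
	printf("%lu\n", group_maxload(3072, 1024, 4096));  /* -> 1024 */
	printf("%lu\n", group_maxload(1024, 2048,  512));  /* -> 0, nothing to pull */
	return 0;
}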
@@ -1247,33 +1219,11 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1247 | * load_balance_[start|next]_fair iterators | 1219 | * load_balance_[start|next]_fair iterators |
1248 | */ | 1220 | */ |
1249 | cfs_rq_iterator.arg = busy_cfs_rq; | 1221 | cfs_rq_iterator.arg = busy_cfs_rq; |
1250 | load_moved = balance_tasks(this_rq, this_cpu, busiest, | 1222 | rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, |
1251 | maxload, sd, idle, all_pinned, | 1223 | maxload, sd, idle, all_pinned, |
1252 | this_best_prio, | 1224 | this_best_prio, |
1253 | &cfs_rq_iterator); | 1225 | &cfs_rq_iterator); |
1254 | 1226 | ||
1255 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1256 | /* | ||
1257 | * load_moved holds the task load that was moved. The | ||
1258 | * effective (group) weight moved would be: | ||
1259 | * load_moved_eff = load_moved/task_load * group_weight; | ||
1260 | */ | ||
1261 | load_moved = (group_weight * load_moved) / task_load; | ||
1262 | |||
1263 | /* Adjust shares on both cpus to reflect load_moved */ | ||
1264 | group_weight -= load_moved; | ||
1265 | set_se_shares(se, group_weight); | ||
1266 | |||
1267 | se = busy_cfs_rq->tg->se[this_cpu]; | ||
1268 | if (!thisload) | ||
1269 | group_weight = load_moved; | ||
1270 | else | ||
1271 | group_weight = se->load.weight + load_moved; | ||
1272 | set_se_shares(se, group_weight); | ||
1273 | #endif | ||
1274 | |||
1275 | rem_load_move -= load_moved; | ||
1276 | |||
1277 | if (rem_load_move <= 0) | 1227 | if (rem_load_move <= 0) |
1278 | break; | 1228 | break; |
1279 | } | 1229 | } |
@@ -1403,6 +1353,16 @@ static void set_curr_task_fair(struct rq *rq) | |||
1403 | set_next_entity(cfs_rq_of(se), se); | 1353 | set_next_entity(cfs_rq_of(se), se); |
1404 | } | 1354 | } |
1405 | 1355 | ||
1356 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1357 | static void moved_group_fair(struct task_struct *p) | ||
1358 | { | ||
1359 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | ||
1360 | |||
1361 | update_curr(cfs_rq); | ||
1362 | place_entity(cfs_rq, &p->se, 1); | ||
1363 | } | ||
1364 | #endif | ||
1365 | |||
1406 | /* | 1366 | /* |
1407 | * All the scheduling class methods: | 1367 | * All the scheduling class methods: |
1408 | */ | 1368 | */ |
@@ -1431,6 +1391,10 @@ static const struct sched_class fair_sched_class = { | |||
1431 | 1391 | ||
1432 | .prio_changed = prio_changed_fair, | 1392 | .prio_changed = prio_changed_fair, |
1433 | .switched_to = switched_to_fair, | 1393 | .switched_to = switched_to_fair, |
1394 | |||
1395 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1396 | .moved_group = moved_group_fair, | ||
1397 | #endif | ||
1434 | }; | 1398 | }; |
1435 | 1399 | ||
1436 | #ifdef CONFIG_SCHED_DEBUG | 1400 | #ifdef CONFIG_SCHED_DEBUG |
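Editor's note: the new .moved_group method in fair_sched_class pairs with the NULL-checked call added to sched_move_task() in the kernel/sched.c hunk earlier. A self-contained userspace illustration of that pattern, an ops structure with an optional callback that callers probe before invoking (all names here are invented for the example):

#include <stdio.h>

struct task { const char *name; };

/* a cut-down "class" with one optional hook, in the spirit of sched_class */
struct ops {
	void (*moved_group)(struct task *t);   /* may be NULL */
};

static void fair_moved_group(struct task *t)
{
	printf("re-placing %s in its new group\n", t->name);
}

static const struct ops fair_ops = { .moved_group = fair_moved_group };
static const struct ops rt_ops   = { .moved_group = NULL };

static void move_task(struct task *t, const struct ops *class)
{
	/* mirror of sched_move_task(): only call the hook if the class has one */
	if (class->moved_group)
		class->moved_group(t);
}

int main(void)
{
	struct task t = { "worker" };

	move_task(&t, &fair_ops);   /* hook fires */
	move_task(&t, &rt_ops);     /* silently skipped */
	return 0;
}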
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f54792b175b2..0a6d2e516420 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -393,8 +393,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) | |||
393 | */ | 393 | */ |
394 | for_each_sched_rt_entity(rt_se) | 394 | for_each_sched_rt_entity(rt_se) |
395 | enqueue_rt_entity(rt_se); | 395 | enqueue_rt_entity(rt_se); |
396 | |||
397 | inc_cpu_load(rq, p->se.load.weight); | ||
398 | } | 396 | } |
399 | 397 | ||
400 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) | 398 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) |
@@ -414,8 +412,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) | |||
414 | if (rt_rq && rt_rq->rt_nr_running) | 412 | if (rt_rq && rt_rq->rt_nr_running) |
415 | enqueue_rt_entity(rt_se); | 413 | enqueue_rt_entity(rt_se); |
416 | } | 414 | } |
417 | |||
418 | dec_cpu_load(rq, p->se.load.weight); | ||
419 | } | 415 | } |
420 | 416 | ||
421 | /* | 417 | /* |
@@ -1111,9 +1107,11 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p, | |||
1111 | pull_rt_task(rq); | 1107 | pull_rt_task(rq); |
1112 | /* | 1108 | /* |
1113 | * If there's a higher priority task waiting to run | 1109 | * If there's a higher priority task waiting to run |
1114 | * then reschedule. | 1110 | * then reschedule. Note, the above pull_rt_task |
1111 | * can release the rq lock and p could migrate. | ||
1112 | * Only reschedule if p is still on the same runqueue. | ||
1115 | */ | 1113 | */ |
1116 | if (p->prio > rq->rt.highest_prio) | 1114 | if (p->prio > rq->rt.highest_prio && rq->curr == p) |
1117 | resched_task(p); | 1115 | resched_task(p); |
1118 | #else | 1116 | #else |
1119 | /* For UP simply resched on drop of prio */ | 1117 | /* For UP simply resched on drop of prio */ |
diff --git a/kernel/signal.c b/kernel/signal.c index 84917fe507f7..6af1210092c3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -1623,7 +1623,6 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) | |||
1623 | /* Let the debugger run. */ | 1623 | /* Let the debugger run. */ |
1624 | __set_current_state(TASK_TRACED); | 1624 | __set_current_state(TASK_TRACED); |
1625 | spin_unlock_irq(&current->sighand->siglock); | 1625 | spin_unlock_irq(&current->sighand->siglock); |
1626 | try_to_freeze(); | ||
1627 | read_lock(&tasklist_lock); | 1626 | read_lock(&tasklist_lock); |
1628 | if (!unlikely(killed) && may_ptrace_stop()) { | 1627 | if (!unlikely(killed) && may_ptrace_stop()) { |
1629 | do_notify_parent_cldstop(current, CLD_TRAPPED); | 1628 | do_notify_parent_cldstop(current, CLD_TRAPPED); |
@@ -1641,6 +1640,13 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) | |||
1641 | } | 1640 | } |
1642 | 1641 | ||
1643 | /* | 1642 | /* |
1643 | * While in TASK_TRACED, we were considered "frozen enough". | ||
1644 | * Now that we woke up, it's crucial if we're supposed to be | ||
1645 | * frozen that we freeze now before running anything substantial. | ||
1646 | */ | ||
1647 | try_to_freeze(); | ||
1648 | |||
1649 | /* | ||
1644 | * We are back. Now reacquire the siglock before touching | 1650 | * We are back. Now reacquire the siglock before touching |
1645 | * last_siginfo, so that we are sure to have synchronized with | 1651 | * last_siginfo, so that we are sure to have synchronized with |
1646 | * any signal-sending on another CPU that wants to examine it. | 1652 | * any signal-sending on another CPU that wants to examine it. |
@@ -1757,9 +1763,15 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, | |||
1757 | sigset_t *mask = ¤t->blocked; | 1763 | sigset_t *mask = ¤t->blocked; |
1758 | int signr = 0; | 1764 | int signr = 0; |
1759 | 1765 | ||
1766 | relock: | ||
1767 | /* | ||
1768 | * We'll jump back here after any time we were stopped in TASK_STOPPED. | ||
1769 | * While in TASK_STOPPED, we were considered "frozen enough". | ||
1770 | * Now that we woke up, it's crucial if we're supposed to be | ||
1771 | * frozen that we freeze now before running anything substantial. | ||
1772 | */ | ||
1760 | try_to_freeze(); | 1773 | try_to_freeze(); |
1761 | 1774 | ||
1762 | relock: | ||
1763 | spin_lock_irq(&current->sighand->siglock); | 1775 | spin_lock_irq(&current->sighand->siglock); |
1764 | for (;;) { | 1776 | for (;;) { |
1765 | struct k_sigaction *ka; | 1777 | struct k_sigaction *ka; |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 5b3aea5f471e..31e9f2a47928 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -313,6 +313,7 @@ void irq_exit(void) | |||
313 | /* Make sure that timer wheel updates are propagated */ | 313 | /* Make sure that timer wheel updates are propagated */ |
314 | if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) | 314 | if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) |
315 | tick_nohz_stop_sched_tick(); | 315 | tick_nohz_stop_sched_tick(); |
316 | rcu_irq_exit(); | ||
316 | #endif | 317 | #endif |
317 | preempt_enable_no_resched(); | 318 | preempt_enable_no_resched(); |
318 | } | 319 | } |
diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 7c2da88db4ed..01b6522fd92b 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c | |||
@@ -216,26 +216,27 @@ static int watchdog(void *__bind_cpu) | |||
216 | /* initialize timestamp */ | 216 | /* initialize timestamp */ |
217 | touch_softlockup_watchdog(); | 217 | touch_softlockup_watchdog(); |
218 | 218 | ||
219 | set_current_state(TASK_INTERRUPTIBLE); | ||
219 | /* | 220 | /* |
220 | * Run briefly once per second to reset the softlockup timestamp. | 221 | * Run briefly once per second to reset the softlockup timestamp. |
221 | * If this gets delayed for more than 60 seconds then the | 222 | * If this gets delayed for more than 60 seconds then the |
222 | * debug-printout triggers in softlockup_tick(). | 223 | * debug-printout triggers in softlockup_tick(). |
223 | */ | 224 | */ |
224 | while (!kthread_should_stop()) { | 225 | while (!kthread_should_stop()) { |
225 | set_current_state(TASK_INTERRUPTIBLE); | ||
226 | touch_softlockup_watchdog(); | 226 | touch_softlockup_watchdog(); |
227 | schedule(); | 227 | schedule(); |
228 | 228 | ||
229 | if (kthread_should_stop()) | 229 | if (kthread_should_stop()) |
230 | break; | 230 | break; |
231 | 231 | ||
232 | if (this_cpu != check_cpu) | 232 | if (this_cpu == check_cpu) { |
233 | continue; | 233 | if (sysctl_hung_task_timeout_secs) |
234 | 234 | check_hung_uninterruptible_tasks(this_cpu); | |
235 | if (sysctl_hung_task_timeout_secs) | 235 | } |
236 | check_hung_uninterruptible_tasks(this_cpu); | ||
237 | 236 | ||
237 | set_current_state(TASK_INTERRUPTIBLE); | ||
238 | } | 238 | } |
239 | __set_current_state(TASK_RUNNING); | ||
239 | 240 | ||
240 | return 0; | 241 | return 0; |
241 | } | 242 | } |
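Editor's note: moving set_current_state(TASK_INTERRUPTIBLE) in front of the kthread_should_stop() test is the usual kthread idiom: kthread_stop() sets the should-stop flag and then wakes the thread, so the thread has to mark itself sleeping before it looks at the flag, otherwise a wakeup arriving between the check and the state change can be lost. The diff above contains the full fix; reduced to its skeleton (a sketch of the idiom, not compilable on its own):

	set_current_state(TASK_INTERRUPTIBLE);
	while (!kthread_should_stop()) {
		touch_softlockup_watchdog();            /* the thread's actual work */
		schedule();                             /* sleep until the next wakeup */
		set_current_state(TASK_INTERRUPTIBLE);  /* re-arm before re-checking the flag */
	}
	__set_current_state(TASK_RUNNING);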
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8b7e95411795..b2a2d6889bab 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -311,24 +311,6 @@ static struct ctl_table kern_table[] = { | |||
311 | .mode = 0644, | 311 | .mode = 0644, |
312 | .proc_handler = &proc_dointvec, | 312 | .proc_handler = &proc_dointvec, |
313 | }, | 313 | }, |
314 | #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) | ||
315 | { | ||
316 | .ctl_name = CTL_UNNUMBERED, | ||
317 | .procname = "sched_min_bal_int_shares", | ||
318 | .data = &sysctl_sched_min_bal_int_shares, | ||
319 | .maxlen = sizeof(unsigned int), | ||
320 | .mode = 0644, | ||
321 | .proc_handler = &proc_dointvec, | ||
322 | }, | ||
323 | { | ||
324 | .ctl_name = CTL_UNNUMBERED, | ||
325 | .procname = "sched_max_bal_int_shares", | ||
326 | .data = &sysctl_sched_max_bal_int_shares, | ||
327 | .maxlen = sizeof(unsigned int), | ||
328 | .mode = 0644, | ||
329 | .proc_handler = &proc_dointvec, | ||
330 | }, | ||
331 | #endif | ||
332 | #endif | 314 | #endif |
333 | { | 315 | { |
334 | .ctl_name = CTL_UNNUMBERED, | 316 | .ctl_name = CTL_UNNUMBERED, |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index c88b5910e7ab..5fd9b9469770 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -42,12 +42,13 @@ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ | |||
42 | long time_freq; /* frequency offset (scaled ppm)*/ | 42 | long time_freq; /* frequency offset (scaled ppm)*/ |
43 | static long time_reftime; /* time at last adjustment (s) */ | 43 | static long time_reftime; /* time at last adjustment (s) */ |
44 | long time_adjust; | 44 | long time_adjust; |
45 | static long ntp_tick_adj; | ||
45 | 46 | ||
46 | static void ntp_update_frequency(void) | 47 | static void ntp_update_frequency(void) |
47 | { | 48 | { |
48 | u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) | 49 | u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) |
49 | << TICK_LENGTH_SHIFT; | 50 | << TICK_LENGTH_SHIFT; |
50 | second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; | 51 | second_length += (s64)ntp_tick_adj << TICK_LENGTH_SHIFT; |
51 | second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); | 52 | second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); |
52 | 53 | ||
53 | tick_length_base = second_length; | 54 | tick_length_base = second_length; |
@@ -342,14 +343,16 @@ int do_adjtimex(struct timex *txc) | |||
342 | freq_adj = shift_right(freq_adj, time_constant * 2 + | 343 | freq_adj = shift_right(freq_adj, time_constant * 2 + |
343 | (SHIFT_PLL + 2) * 2 - SHIFT_NSEC); | 344 | (SHIFT_PLL + 2) * 2 - SHIFT_NSEC); |
344 | if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { | 345 | if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { |
346 | u64 utemp64; | ||
345 | temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL); | 347 | temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL); |
346 | if (time_offset < 0) { | 348 | if (time_offset < 0) { |
347 | temp64 = -temp64; | 349 | utemp64 = -temp64; |
348 | do_div(temp64, mtemp); | 350 | do_div(utemp64, mtemp); |
349 | freq_adj -= temp64; | 351 | freq_adj -= utemp64; |
350 | } else { | 352 | } else { |
351 | do_div(temp64, mtemp); | 353 | utemp64 = temp64; |
352 | freq_adj += temp64; | 354 | do_div(utemp64, mtemp); |
355 | freq_adj += utemp64; | ||
353 | } | 356 | } |
354 | } | 357 | } |
355 | freq_adj += time_freq; | 358 | freq_adj += time_freq; |
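Editor's note: do_div() performs unsigned 64-bit division on its first argument, so a negative time_offset has to be negated into an unsigned temporary before dividing and the sign applied to freq_adj afterwards, which is what the new utemp64 provides. What plain unsigned division of a negative value would yield instead, shown in userspace:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int64_t offset = -3500;          /* a negative time offset */
	uint64_t mtemp = 7;

	uint64_t naive = (uint64_t)offset / mtemp;     /* negative value reinterpreted: garbage */
	uint64_t fixed = (uint64_t)(-offset) / mtemp;  /* negate first, divide, subtract later */

	printf("naive = %llu\n", (unsigned long long)naive);
	printf("fixed = %llu (to be subtracted from freq_adj)\n",
	       (unsigned long long)fixed);
	return 0;
}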
@@ -400,3 +403,11 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) | |||
400 | notify_cmos_timer(); | 403 | notify_cmos_timer(); |
401 | return(result); | 404 | return(result); |
402 | } | 405 | } |
406 | |||
407 | static int __init ntp_tick_adj_setup(char *str) | ||
408 | { | ||
409 | ntp_tick_adj = simple_strtol(str, NULL, 0); | ||
410 | return 1; | ||
411 | } | ||
412 | |||
413 | __setup("ntp_tick_adj=", ntp_tick_adj_setup); | ||
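Editor's note: the new ntp_tick_adj= boot parameter replaces the compile-time CLOCK_TICK_ADJUST constant used in ntp_update_frequency(). A userspace sketch of the same parsing, with strtol standing in for the kernel's simple_strtol and a hypothetical command-line string in place of the real boot command line:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static long ntp_tick_adj;

/* equivalent of the __setup("ntp_tick_adj=", ...) handler above */
static void ntp_tick_adj_setup(const char *str)
{
	ntp_tick_adj = strtol(str, NULL, 0);
}

int main(void)
{
	const char *cmdline = "ntp_tick_adj=-25";   /* hypothetical boot option */
	const char *prefix = "ntp_tick_adj=";
	const char *opt = strstr(cmdline, prefix);

	if (opt)
		ntp_tick_adj_setup(opt + strlen(prefix));

	printf("ntp_tick_adj = %ld\n", ntp_tick_adj);
	return 0;
}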
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index fa9bb73dbdb4..686da821d376 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -282,6 +282,7 @@ void tick_nohz_stop_sched_tick(void) | |||
282 | ts->idle_tick = ts->sched_timer.expires; | 282 | ts->idle_tick = ts->sched_timer.expires; |
283 | ts->tick_stopped = 1; | 283 | ts->tick_stopped = 1; |
284 | ts->idle_jiffies = last_jiffies; | 284 | ts->idle_jiffies = last_jiffies; |
285 | rcu_enter_nohz(); | ||
285 | } | 286 | } |
286 | 287 | ||
287 | /* | 288 | /* |
@@ -375,6 +376,8 @@ void tick_nohz_restart_sched_tick(void) | |||
375 | return; | 376 | return; |
376 | } | 377 | } |
377 | 378 | ||
379 | rcu_exit_nohz(); | ||
380 | |||
378 | /* Update jiffies first */ | 381 | /* Update jiffies first */ |
379 | select_nohz_load_balancer(0); | 382 | select_nohz_load_balancer(0); |
380 | now = ktime_get(); | 383 | now = ktime_get(); |
@@ -637,7 +640,7 @@ void tick_cancel_sched_timer(int cpu) | |||
637 | 640 | ||
638 | if (ts->sched_timer.base) | 641 | if (ts->sched_timer.base) |
639 | hrtimer_cancel(&ts->sched_timer); | 642 | hrtimer_cancel(&ts->sched_timer); |
640 | ts->tick_stopped = 0; | 643 | |
641 | ts->nohz_mode = NOHZ_MODE_INACTIVE; | 644 | ts->nohz_mode = NOHZ_MODE_INACTIVE; |
642 | } | 645 | } |
643 | #endif /* HIGH_RES_TIMERS */ | 646 | #endif /* HIGH_RES_TIMERS */ |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1af9fb050fe2..671af612b768 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -187,8 +187,7 @@ static void change_clocksource(void) | |||
187 | 187 | ||
188 | clock->error = 0; | 188 | clock->error = 0; |
189 | clock->xtime_nsec = 0; | 189 | clock->xtime_nsec = 0; |
190 | clocksource_calculate_interval(clock, | 190 | clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); |
191 | (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT)); | ||
192 | 191 | ||
193 | tick_clock_notify(); | 192 | tick_clock_notify(); |
194 | 193 | ||
@@ -245,8 +244,7 @@ void __init timekeeping_init(void) | |||
245 | ntp_clear(); | 244 | ntp_clear(); |
246 | 245 | ||
247 | clock = clocksource_get_next(); | 246 | clock = clocksource_get_next(); |
248 | clocksource_calculate_interval(clock, | 247 | clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); |
249 | (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT)); | ||
250 | clock->cycle_last = clocksource_read(clock); | 248 | clock->cycle_last = clocksource_read(clock); |
251 | 249 | ||
252 | xtime.tv_sec = sec; | 250 | xtime.tv_sec = sec; |