aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNick Piggin <nickpiggin@yahoo.com.au>2005-06-25 17:57:07 -0400
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-06-25 19:24:40 -0400
commit8102679447da7fcbcb5226ee0207c3a034bc6d5f (patch)
treebb1717150a94a02a44c3bafc9bf8969ef6045f89
parente0f364f4069f76a3613a797c388832822d179076 (diff)
[PATCH] sched: improve load balancing pinned tasks
John Hawkes explained the problem best: A large number of processes that are pinned to a single CPU results in every other CPU's load_balance() seeing this overloaded CPU as "busiest", yet move_tasks() never finds a task to pull-migrate. This condition occurs during module unload, but can also occur as a denial-of-service using sys_sched_setaffinity(). Several hundred CPUs performing this fruitless load_balance() will livelock on the busiest CPU's runqueue lock. A smaller number of CPUs will livelock if the pinned task count gets high. Expanding slightly on John's patch, this one attempts to work out whether the balancing failure has been due to too many tasks pinned on the runqueue. This allows it to be basically invisible to the regular blancing paths (ie. when there are no pinned tasks). We can use this extra knowledge to shut down the balancing faster, and ensure the migration threads don't start running which is another problem observed in the wild. Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--kernel/sched.c62
1 files changed, 39 insertions, 23 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 86be13ee5006..2794c79b9197 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1632,7 +1632,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1632 */ 1632 */
1633static inline 1633static inline
1634int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, 1634int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1635 struct sched_domain *sd, enum idle_type idle) 1635 struct sched_domain *sd, enum idle_type idle, int *all_pinned)
1636{ 1636{
1637 /* 1637 /*
1638 * We do not migrate tasks that are: 1638 * We do not migrate tasks that are:
@@ -1640,10 +1640,12 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1640 * 2) cannot be migrated to this CPU due to cpus_allowed, or 1640 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1641 * 3) are cache-hot on their current CPU. 1641 * 3) are cache-hot on their current CPU.
1642 */ 1642 */
1643 if (task_running(rq, p))
1644 return 0;
1645 if (!cpu_isset(this_cpu, p->cpus_allowed)) 1643 if (!cpu_isset(this_cpu, p->cpus_allowed))
1646 return 0; 1644 return 0;
1645 *all_pinned = 0;
1646
1647 if (task_running(rq, p))
1648 return 0;
1647 1649
1648 /* 1650 /*
1649 * Aggressive migration if: 1651 * Aggressive migration if:
@@ -1656,7 +1658,7 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1656 return 1; 1658 return 1;
1657 1659
1658 if (task_hot(p, rq->timestamp_last_tick, sd)) 1660 if (task_hot(p, rq->timestamp_last_tick, sd))
1659 return 0; 1661 return 0;
1660 return 1; 1662 return 1;
1661} 1663}
1662 1664
@@ -1669,16 +1671,18 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1669 */ 1671 */
1670static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, 1672static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1671 unsigned long max_nr_move, struct sched_domain *sd, 1673 unsigned long max_nr_move, struct sched_domain *sd,
1672 enum idle_type idle) 1674 enum idle_type idle, int *all_pinned)
1673{ 1675{
1674 prio_array_t *array, *dst_array; 1676 prio_array_t *array, *dst_array;
1675 struct list_head *head, *curr; 1677 struct list_head *head, *curr;
1676 int idx, pulled = 0; 1678 int idx, pulled = 0, pinned = 0;
1677 task_t *tmp; 1679 task_t *tmp;
1678 1680
1679 if (max_nr_move <= 0 || busiest->nr_running <= 1) 1681 if (max_nr_move == 0)
1680 goto out; 1682 goto out;
1681 1683
1684 pinned = 1;
1685
1682 /* 1686 /*
1683 * We first consider expired tasks. Those will likely not be 1687 * We first consider expired tasks. Those will likely not be
1684 * executed in the near future, and they are most likely to 1688 * executed in the near future, and they are most likely to
@@ -1717,7 +1721,7 @@ skip_queue:
1717 1721
1718 curr = curr->prev; 1722 curr = curr->prev;
1719 1723
1720 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { 1724 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
1721 if (curr != head) 1725 if (curr != head)
1722 goto skip_queue; 1726 goto skip_queue;
1723 idx++; 1727 idx++;
@@ -1746,6 +1750,9 @@ out:
1746 * inside pull_task(). 1750 * inside pull_task().
1747 */ 1751 */
1748 schedstat_add(sd, lb_gained[idle], pulled); 1752 schedstat_add(sd, lb_gained[idle], pulled);
1753
1754 if (all_pinned)
1755 *all_pinned = pinned;
1749 return pulled; 1756 return pulled;
1750} 1757}
1751 1758
@@ -1917,7 +1924,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
1917 struct sched_group *group; 1924 struct sched_group *group;
1918 runqueue_t *busiest; 1925 runqueue_t *busiest;
1919 unsigned long imbalance; 1926 unsigned long imbalance;
1920 int nr_moved; 1927 int nr_moved, all_pinned;
1928 int active_balance = 0;
1921 1929
1922 spin_lock(&this_rq->lock); 1930 spin_lock(&this_rq->lock);
1923 schedstat_inc(sd, lb_cnt[idle]); 1931 schedstat_inc(sd, lb_cnt[idle]);
@@ -1956,9 +1964,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
1956 */ 1964 */
1957 double_lock_balance(this_rq, busiest); 1965 double_lock_balance(this_rq, busiest);
1958 nr_moved = move_tasks(this_rq, this_cpu, busiest, 1966 nr_moved = move_tasks(this_rq, this_cpu, busiest,
1959 imbalance, sd, idle); 1967 imbalance, sd, idle,
1968 &all_pinned);
1960 spin_unlock(&busiest->lock); 1969 spin_unlock(&busiest->lock);
1970
1971 /* All tasks on this runqueue were pinned by CPU affinity */
1972 if (unlikely(all_pinned))
1973 goto out_balanced;
1961 } 1974 }
1975
1962 spin_unlock(&this_rq->lock); 1976 spin_unlock(&this_rq->lock);
1963 1977
1964 if (!nr_moved) { 1978 if (!nr_moved) {
@@ -1966,16 +1980,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
1966 sd->nr_balance_failed++; 1980 sd->nr_balance_failed++;
1967 1981
1968 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 1982 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
1969 int wake = 0;
1970 1983
1971 spin_lock(&busiest->lock); 1984 spin_lock(&busiest->lock);
1972 if (!busiest->active_balance) { 1985 if (!busiest->active_balance) {
1973 busiest->active_balance = 1; 1986 busiest->active_balance = 1;
1974 busiest->push_cpu = this_cpu; 1987 busiest->push_cpu = this_cpu;
1975 wake = 1; 1988 active_balance = 1;
1976 } 1989 }
1977 spin_unlock(&busiest->lock); 1990 spin_unlock(&busiest->lock);
1978 if (wake) 1991 if (active_balance)
1979 wake_up_process(busiest->migration_thread); 1992 wake_up_process(busiest->migration_thread);
1980 1993
1981 /* 1994 /*
@@ -1984,18 +1997,21 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
1984 */ 1997 */
1985 sd->nr_balance_failed = sd->cache_nice_tries; 1998 sd->nr_balance_failed = sd->cache_nice_tries;
1986 } 1999 }
1987 2000 } else
1988 /*
1989 * We were unbalanced, but unsuccessful in move_tasks(),
1990 * so bump the balance_interval to lessen the lock contention.
1991 */
1992 if (sd->balance_interval < sd->max_interval)
1993 sd->balance_interval++;
1994 } else {
1995 sd->nr_balance_failed = 0; 2001 sd->nr_balance_failed = 0;
1996 2002
2003 if (likely(!active_balance)) {
1997 /* We were unbalanced, so reset the balancing interval */ 2004 /* We were unbalanced, so reset the balancing interval */
1998 sd->balance_interval = sd->min_interval; 2005 sd->balance_interval = sd->min_interval;
2006 } else {
2007 /*
2008 * If we've begun active balancing, start to back off. This
2009 * case may not be covered by the all_pinned logic if there
2010 * is only 1 task on the busy runqueue (because we don't call
2011 * move_tasks).
2012 */
2013 if (sd->balance_interval < sd->max_interval)
2014 sd->balance_interval *= 2;
1999 } 2015 }
2000 2016
2001 return nr_moved; 2017 return nr_moved;
@@ -2047,7 +2063,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2047 2063
2048 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); 2064 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2049 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2065 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2050 imbalance, sd, NEWLY_IDLE); 2066 imbalance, sd, NEWLY_IDLE, NULL);
2051 if (!nr_moved) 2067 if (!nr_moved)
2052 schedstat_inc(sd, lb_failed[NEWLY_IDLE]); 2068 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2053 2069
@@ -2126,7 +2142,7 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
2126 /* move a task from busiest_rq to target_rq */ 2142 /* move a task from busiest_rq to target_rq */
2127 double_lock_balance(busiest_rq, target_rq); 2143 double_lock_balance(busiest_rq, target_rq);
2128 if (move_tasks(target_rq, cpu, busiest_rq, 2144 if (move_tasks(target_rq, cpu, busiest_rq,
2129 1, sd, SCHED_IDLE)) { 2145 1, sd, SCHED_IDLE, NULL)) {
2130 schedstat_inc(sd, alb_pushed); 2146 schedstat_inc(sd, alb_pushed);
2131 } else { 2147 } else {
2132 schedstat_inc(sd, alb_failed); 2148 schedstat_inc(sd, alb_failed);