Diffstat (limited to 'kernel')
 kernel/cpuset.c           | 25
 kernel/exit.c             |  8
 kernel/hrtimer.c          |  6
 kernel/posix-cpu-timers.c | 48
 kernel/power/main.c       |  2
 kernel/printk.c           | 28
 kernel/sched.c            | 62
 kernel/timer.c            | 16
 8 files changed, 107 insertions(+), 88 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 72248d1b9e3f..ab81fdd4572b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2231,19 +2231,25 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  * So only GFP_KERNEL allocations, if all nodes in the cpuset are
  * short of memory, might require taking the callback_mutex mutex.
  *
- * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
- * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
- * hardwall cpusets - no allocation on a node outside the cpuset is
- * allowed (unless in interrupt, of course).
- *
- * The second loop doesn't even call here for GFP_ATOMIC requests
- * (if the __alloc_pages() local variable 'wait' is set). That check
- * and the checks below have the combined affect in the second loop of
- * the __alloc_pages() routine that:
+ * The first call here from mm/page_alloc:get_page_from_freelist()
+ * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, so
+ * no allocation on a node outside the cpuset is allowed (unless in
+ * interrupt, of course).
+ *
+ * The second pass through get_page_from_freelist() doesn't even call
+ * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
+ * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
+ * in alloc_flags. That logic and the checks below have the combined
+ * affect that:
  * in_interrupt - any node ok (current task context irrelevant)
  * GFP_ATOMIC - any node ok
  * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok
  * GFP_USER - only nodes in current tasks mems allowed ok.
+ *
+ * Rule:
+ *    Don't call cpuset_zone_allowed() if you can't sleep, unless you
+ *    pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
+ *    the code that might scan up ancestor cpusets and sleep.
  **/
 
 int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
@@ -2255,6 +2261,7 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 	if (in_interrupt())
 		return 1;
 	node = z->zone_pgdat->node_id;
+	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
 	if (node_isset(node, current->mems_allowed))
 		return 1;
 	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
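The rule added to the comment block is now enforced at runtime by the might_sleep_if() above: callers that cannot sleep must pass __GFP_HARDWALL so that only the sleep-free mems_allowed check runs. A minimal caller sketch (the wrapper function below is hypothetical and not part of this patch; cpuset_zone_allowed() and the GFP flags are the interfaces being documented):

#include <linux/cpuset.h>
#include <linux/gfp.h>

/*
 * Hypothetical helper, for illustration only.  Passing __GFP_HARDWALL
 * keeps cpuset_zone_allowed() on the sleep-free path (a check of
 * current->mems_allowed) and away from the ancestor-cpuset scan,
 * which may take callback_mutex and sleep.
 */
static int zone_usable_in_atomic(struct zone *z)
{
	return cpuset_zone_allowed(z, GFP_ATOMIC | __GFP_HARDWALL);
}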
diff --git a/kernel/exit.c b/kernel/exit.c
index e95b93282210..e06d0c10a24e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -881,14 +881,6 @@ fastcall NORET_TYPE void do_exit(long code)
 
 	tsk->flags |= PF_EXITING;
 
-	/*
-	 * Make sure we don't try to process any timer firings
-	 * while we are already exiting.
-	 */
-	tsk->it_virt_expires = cputime_zero;
-	tsk->it_prof_expires = cputime_zero;
-	tsk->it_sched_expires = 0;
-
 	if (unlikely(in_atomic()))
 		printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
 			current->comm, current->pid,
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b7f0388bd71c..01fa2ae98a85 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -456,6 +456,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(hrtimer_start);
 
 /**
  * hrtimer_try_to_cancel - try to deactivate a timer
@@ -484,6 +485,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
 	return ret;
 
 }
+EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
 
 /**
  * hrtimer_cancel - cancel a timer and wait for the handler to finish.
@@ -504,6 +506,7 @@ int hrtimer_cancel(struct hrtimer *timer)
 		cpu_relax();
 	}
 }
+EXPORT_SYMBOL_GPL(hrtimer_cancel);
 
 /**
  * hrtimer_get_remaining - get remaining time for the timer
@@ -522,6 +525,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
 
 	return rem;
 }
+EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
 
 #ifdef CONFIG_NO_IDLE_HZ
 /**
@@ -580,6 +584,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 	timer->base = &bases[clock_id];
 	timer->node.rb_parent = HRTIMER_INACTIVE;
 }
+EXPORT_SYMBOL_GPL(hrtimer_init);
 
 /**
  * hrtimer_get_res - get the timer resolution for a clock
@@ -599,6 +604,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(hrtimer_get_res);
 
 /*
  * Expire the per base hrtimer-queue:
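These EXPORT_SYMBOL_GPL() additions make the core hrtimer interface usable from GPL modules for the first time. A minimal sketch of a module built on the exported calls, assuming the 2.6.16-era API (the module and its names are hypothetical; note the callback of this era returns int via the timer's function/data pair, and the relative mode is spelled HRTIMER_REL):

#include <linux/module.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer demo_timer;

/* fires once, one second after module load */
static int demo_timer_fn(void *data)
{
	printk(KERN_INFO "demo hrtimer fired\n");
	return HRTIMER_NORESTART;
}

static int __init demo_init(void)
{
	hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_REL);
	demo_timer.function = demo_timer_fn;
	demo_timer.data = NULL;
	hrtimer_start(&demo_timer, ktime_set(1, 0), HRTIMER_REL);
	return 0;
}

static void __exit demo_exit(void)
{
	hrtimer_cancel(&demo_timer);	/* waits for a running callback */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");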
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 520f6c59948d..d38d9ec3276c 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -555,9 +555,6 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
 	struct cpu_timer_list *next;
 	unsigned long i;
 
-	if (CPUCLOCK_PERTHREAD(timer->it_clock) && (p->flags & PF_EXITING))
-		return;
-
 	head = (CPUCLOCK_PERTHREAD(timer->it_clock) ?
 		p->cpu_timers : p->signal->cpu_timers);
 	head += CPUCLOCK_WHICH(timer->it_clock);
@@ -1173,6 +1170,9 @@ static void check_process_timers(struct task_struct *tsk,
 	}
 	t = tsk;
 	do {
+		if (unlikely(t->flags & PF_EXITING))
+			continue;
+
 		ticks = cputime_add(cputime_add(t->utime, t->stime),
 				    prof_left);
 		if (!cputime_eq(prof_expires, cputime_zero) &&
@@ -1193,11 +1193,7 @@ static void check_process_timers(struct task_struct *tsk,
 			     t->it_sched_expires > sched)) {
 			t->it_sched_expires = sched;
 		}
-
-		do {
-			t = next_thread(t);
-		} while (unlikely(t->flags & PF_EXITING));
-	} while (t != tsk);
+	} while ((t = next_thread(t)) != tsk);
 	}
 }
 
@@ -1289,30 +1285,30 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 
 #undef UNEXPIRED
 
-	BUG_ON(tsk->exit_state);
-
 	/*
 	 * Double-check with locks held.
 	 */
 	read_lock(&tasklist_lock);
-	spin_lock(&tsk->sighand->siglock);
+	if (likely(tsk->signal != NULL)) {
+		spin_lock(&tsk->sighand->siglock);
 
 	/*
 	 * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N]
 	 * all the timers that are firing, and put them on the firing list.
 	 */
 	check_thread_timers(tsk, &firing);
 	check_process_timers(tsk, &firing);
 
 	/*
 	 * We must release these locks before taking any timer's lock.
 	 * There is a potential race with timer deletion here, as the
 	 * siglock now protects our private firing list. We have set
 	 * the firing flag in each timer, so that a deletion attempt
 	 * that gets the timer lock before we do will give it up and
 	 * spin until we've taken care of that timer below.
 	 */
 	spin_unlock(&tsk->sighand->siglock);
+	}
 	read_unlock(&tasklist_lock);
 
 	/*
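The rewritten loop tail in check_process_timers() folds the PF_EXITING skip into the do/while itself. This is safe because continue in a do/while jumps to the controlling expression, so (t = next_thread(t)) != tsk still advances the cursor, and the loop cannot spin on exiting threads the way the old inner do/while could. A standalone user-space sketch of that control flow (three hypothetical threads on a circular list):

#include <stdio.h>

struct thread { int id; int exiting; struct thread *next; };

static struct thread *next_thread(struct thread *t)
{
	return t->next;
}

int main(void)
{
	struct thread c = { 3, 0, NULL }, b = { 2, 1, &c }, a = { 1, 0, &b };
	struct thread *tsk = &a, *t = tsk;

	c.next = &a;	/* close the circle, like a thread group list */
	do {
		if (t->exiting)
			continue;	/* skip, but still advance below */
		printf("accounting thread %d\n", t->id);
	} while ((t = next_thread(t)) != tsk);
	return 0;
}

This prints threads 1 and 3 and terminates; the exiting thread 2 is skipped without disturbing the loop's progress.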
diff --git a/kernel/power/main.c b/kernel/power/main.c
index a6d9ef46009e..0a907f0dc56b 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -86,6 +86,7 @@ static int suspend_prepare(suspend_state_t state)
 		goto Thaw;
 	}
 
+	suspend_console();
 	if ((error = device_suspend(PMSG_SUSPEND))) {
 		printk(KERN_ERR "Some devices failed to suspend\n");
 		goto Finish;
@@ -133,6 +134,7 @@ int suspend_enter(suspend_state_t state)
 static void suspend_finish(suspend_state_t state)
 {
 	device_resume();
+	resume_console();
 	thaw_processes();
 	enable_nonboot_cpus();
 	if (pm_ops && pm_ops->finish)
diff --git a/kernel/printk.c b/kernel/printk.c
index c056f3324432..19a955619294 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -67,6 +67,7 @@ EXPORT_SYMBOL(oops_in_progress);
  * driver system.
  */
 static DECLARE_MUTEX(console_sem);
+static DECLARE_MUTEX(secondary_console_sem);
 struct console *console_drivers;
 /*
  * This is used for debugging the mess that is the VT code by
@@ -76,7 +77,7 @@ struct console *console_drivers;
  * path in the console code where we end up in places I want
  * locked without the console sempahore held
  */
-static int console_locked;
+static int console_locked, console_suspended;
 
 /*
  * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
@@ -698,6 +699,23 @@ int __init add_preferred_console(char *name, int idx, char *options)
 }
 
 /**
+ * suspend_console - suspend the console subsystem
+ *
+ * This disables printk() while we go into suspend states
+ */
+void suspend_console(void)
+{
+	acquire_console_sem();
+	console_suspended = 1;
+}
+
+void resume_console(void)
+{
+	console_suspended = 0;
+	release_console_sem();
+}
+
+/**
  * acquire_console_sem - lock the console system for exclusive use.
  *
  * Acquires a semaphore which guarantees that the caller has
@@ -708,6 +726,10 @@ int __init add_preferred_console(char *name, int idx, char *options)
 void acquire_console_sem(void)
 {
 	BUG_ON(in_interrupt());
+	if (console_suspended) {
+		down(&secondary_console_sem);
+		return;
+	}
 	down(&console_sem);
 	console_locked = 1;
 	console_may_schedule = 1;
@@ -750,6 +772,10 @@ void release_console_sem(void)
 	unsigned long _con_start, _log_end;
 	unsigned long wake_klogd = 0;
 
+	if (console_suspended) {
+		up(&secondary_console_sem);
+		return;
+	}
 	for ( ; ; ) {
 		spin_lock_irqsave(&logbuf_lock, flags);
 		wake_klogd |= log_start - log_end;
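Taken together with the power/main.c hooks above, the mechanism is: suspend_console() acquires console_sem and holds it for the whole suspend, while the console_suspended flag diverts every later acquire/release pair onto secondary_console_sem, so would-be console users serialize among themselves without ever touching the suspended console hardware. A user-space analog of the pattern (POSIX semaphores; the names mirror the kernel's, the program itself is hypothetical):

#include <semaphore.h>
#include <stdio.h>

static sem_t console_sem, secondary_console_sem;
static int console_suspended;

static void acquire_console(void)
{
	if (console_suspended) {
		sem_wait(&secondary_console_sem);	/* diverted */
		return;
	}
	sem_wait(&console_sem);
}

static void release_console(void)
{
	if (console_suspended) {
		sem_post(&secondary_console_sem);
		return;
	}
	sem_post(&console_sem);	/* kernel version also flushes the log here */
}

static void suspend_console_analog(void)
{
	acquire_console();	/* hold console_sem across the suspend */
	console_suspended = 1;
}

static void resume_console_analog(void)
{
	console_suspended = 0;
	release_console();	/* drop console_sem, output resumes */
}

int main(void)
{
	sem_init(&console_sem, 0, 1);
	sem_init(&secondary_console_sem, 0, 1);
	suspend_console_analog();
	acquire_console();	/* does not deadlock: takes the secondary */
	printf("printk-like caller got the secondary semaphore\n");
	release_console();
	resume_console_analog();
	return 0;
}

Note that console_suspended is read without holding either semaphore; that is tolerable only because suspend and resume are serialized by the single suspend path, an assumption the analog inherits.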
diff --git a/kernel/sched.c b/kernel/sched.c
index 4c64f85698ae..c13f1bd2df7d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -665,55 +665,13 @@ static int effective_prio(task_t *p)
 }
 
 /*
- * We place interactive tasks back into the active array, if possible.
- *
- * To guarantee that this does not starve expired tasks we ignore the
- * interactivity of a task if the first expired task had to wait more
- * than a 'reasonable' amount of time. This deadline timeout is
- * load-dependent, as the frequency of array switched decreases with
- * increasing number of running tasks. We also ignore the interactivity
- * if a better static_prio task has expired, and switch periodically
- * regardless, to ensure that highly interactive tasks do not starve
- * the less fortunate for unreasonably long periods.
- */
-static inline int expired_starving(runqueue_t *rq)
-{
-	int limit;
-
-	/*
-	 * Arrays were recently switched, all is well
-	 */
-	if (!rq->expired_timestamp)
-		return 0;
-
-	limit = STARVATION_LIMIT * rq->nr_running;
-
-	/*
-	 * It's time to switch arrays
-	 */
-	if (jiffies - rq->expired_timestamp >= limit)
-		return 1;
-
-	/*
-	 * There's a better selection in the expired array
-	 */
-	if (rq->curr->static_prio > rq->best_expired_prio)
-		return 1;
-
-	/*
-	 * All is well
-	 */
-	return 0;
-}
-
-/*
  * __activate_task - move a task to the runqueue.
  */
 static void __activate_task(task_t *p, runqueue_t *rq)
 {
 	prio_array_t *target = rq->active;
 
-	if (unlikely(batch_task(p) || (expired_starving(rq) && !rt_task(p))))
+	if (batch_task(p))
 		target = rq->expired;
 	enqueue_task(p, target);
 	rq->nr_running++;
@@ -2532,6 +2490,22 @@ unsigned long long current_sched_time(const task_t *tsk)
 }
 
 /*
+ * We place interactive tasks back into the active array, if possible.
+ *
+ * To guarantee that this does not starve expired tasks we ignore the
+ * interactivity of a task if the first expired task had to wait more
+ * than a 'reasonable' amount of time. This deadline timeout is
+ * load-dependent, as the frequency of array switched decreases with
+ * increasing number of running tasks. We also ignore the interactivity
+ * if a better static_prio task has expired:
+ */
+#define EXPIRED_STARVING(rq) \
+	((STARVATION_LIMIT && ((rq)->expired_timestamp && \
+		(jiffies - (rq)->expired_timestamp >= \
+			STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
+			((rq)->curr->static_prio > (rq)->best_expired_prio))
+
+/*
  * Account user cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -2666,7 +2640,7 @@ void scheduler_tick(void)
 
 		if (!rq->expired_timestamp)
 			rq->expired_timestamp = jiffies;
-		if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
+		if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
 			enqueue_task(p, rq->expired);
 			if (p->static_prio < rq->best_expired_prio)
 				rq->best_expired_prio = p->static_prio;
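This backs out the inline expired_starving() in favor of the original EXPIRED_STARVING() macro, restoring the STARVATION_LIMIT guard and the + 1 in the deadline; __activate_task() also no longer consults it for non-batch tasks. A worked user-space check of the restored arithmetic (all numbers made up; STARVATION_LIMIT here is illustrative, not the kernel's derived value):

#include <stdio.h>

#define STARVATION_LIMIT	128	/* illustrative only */

static int expired_starving(unsigned long jiffies,
			    unsigned long expired_timestamp,
			    unsigned long nr_running,
			    int curr_static_prio, int best_expired_prio)
{
	if (STARVATION_LIMIT && expired_timestamp &&
	    jiffies - expired_timestamp >= STARVATION_LIMIT * nr_running + 1)
		return 1;
	return curr_static_prio > best_expired_prio;
}

int main(void)
{
	/* two runnable tasks: deadline is 128 * 2 + 1 = 257 ticks */
	printf("%d\n", expired_starving(1000, 800, 2, 120, 120)); /* 200 < 257: 0 */
	printf("%d\n", expired_starving(1100, 800, 2, 120, 120)); /* 300 >= 257: 1 */
	/* a better static_prio task has expired: starving regardless of time */
	printf("%d\n", expired_starving(1000, 800, 2, 125, 120)); /* 1 */
	return 0;
}

So the deadline scales with nr_running: a busier runqueue tolerates a longer wait before the expired array is forcibly drained.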
diff --git a/kernel/timer.c b/kernel/timer.c
index 67eaf0f54096..9e49deed468c 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -541,6 +541,22 @@ found:
 	}
 	spin_unlock(&base->lock);
 
+	/*
+	 * It can happen that other CPUs service timer IRQs and increment
+	 * jiffies, but we have not yet got a local timer tick to process
+	 * the timer wheels. In that case, the expiry time can be before
+	 * jiffies, but since the high-resolution timer here is relative to
+	 * jiffies, the default expression when high-resolution timers are
+	 * not active,
+	 *
+	 *   time_before(MAX_JIFFY_OFFSET + jiffies, expires)
+	 *
+	 * would falsely evaluate to true. If that is the case, just
+	 * return jiffies so that we can immediately fire the local timer
+	 */
+	if (time_before(expires, jiffies))
+		return jiffies;
+
 	if (time_before(hr_expires, expires))
 		return hr_expires;
 
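The new early return leans on the kernel's wrap-safe jiffies comparison. A standalone illustration of the false positive described in the comment, using the time_after()/time_before() definitions from <linux/jiffies.h> (the concrete numbers are arbitrary):

#include <stdio.h>

#define time_after(a, b)	((long)(b) - (long)(a) < 0)
#define time_before(a, b)	time_after(b, a)
#define MAX_JIFFY_OFFSET	((~0UL >> 1) - 1)

int main(void)
{
	unsigned long jiffies = 1000;
	unsigned long expires = 990;	/* expiry already in the past */
	unsigned long hr_expires = MAX_JIFFY_OFFSET + jiffies;

	/* the default hr_expires wrongly looks "before" the stale expiry */
	printf("%d\n", time_before(hr_expires, expires));	/* 1 */
	/* the new guard catches it and returns jiffies instead */
	printf("%d\n", time_before(expires, jiffies));		/* 1 */
	return 0;
}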