Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cpuset.c          35
-rw-r--r--  kernel/power/snapshot.c   4
-rw-r--r--  kernel/power/swsusp.c     4
-rw-r--r--  kernel/sched.c           13
-rw-r--r--  kernel/sysctl.c           2
-rw-r--r--  kernel/timer.c           39
6 files changed, 82 insertions(+), 15 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ba42b0a76961..12815d3f1a05 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1977,6 +1977,39 @@ void cpuset_fork(struct task_struct *child)
  * We don't need to task_lock() this reference to tsk->cpuset,
  * because tsk is already marked PF_EXITING, so attach_task() won't
  * mess with it, or task is a failed fork, never visible to attach_task.
+ *
+ * Hack:
+ *
+ * Set the exiting tasks cpuset to the root cpuset (top_cpuset).
+ *
+ * Don't leave a task unable to allocate memory, as that is an
+ * accident waiting to happen should someone add a callout in
+ * do_exit() after the cpuset_exit() call that might allocate.
+ * If a task tries to allocate memory with an invalid cpuset,
+ * it will oops in cpuset_update_task_memory_state().
+ *
+ * We call cpuset_exit() while the task is still competent to
+ * handle notify_on_release(), then leave the task attached to
+ * the root cpuset (top_cpuset) for the remainder of its exit.
+ *
+ * To do this properly, we would increment the reference count on
+ * top_cpuset, and near the very end of the kernel/exit.c do_exit()
+ * code we would add a second cpuset function call, to drop that
+ * reference. This would just create an unnecessary hot spot on
+ * the top_cpuset reference count, to no avail.
+ *
+ * Normally, holding a reference to a cpuset without bumping its
+ * count is unsafe. The cpuset could go away, or someone could
+ * attach us to a different cpuset, decrementing the count on
+ * the first cpuset that we never incremented. But in this case,
+ * top_cpuset isn't going away, and either task has PF_EXITING set,
+ * which wards off any attach_task() attempts, or task is a failed
+ * fork, never visible to attach_task.
+ *
+ * Another way to do this would be to set the cpuset pointer
+ * to NULL here, and check in cpuset_update_task_memory_state()
+ * for a NULL pointer. This hack avoids that NULL check, for no
+ * cost (other than this way too long comment ;).
  **/
 
 void cpuset_exit(struct task_struct *tsk)
@@ -1984,7 +2017,7 @@ void cpuset_exit(struct task_struct *tsk)
 	struct cpuset *cs;
 
 	cs = tsk->cpuset;
-	tsk->cpuset = NULL;
+	tsk->cpuset = &top_cpuset;	/* Hack - see comment above */
 
 	if (notify_on_release(cs)) {
 		char *pathbuf = NULL;
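
The comment block above weighs two alternatives: parking an exiting task on &top_cpuset versus setting tsk->cpuset to NULL and guarding every dereference on the allocation path. A minimal userspace sketch of the chosen approach; the struct layout and the names task_mems_generation and toy_cpuset_exit are illustrative stand-ins, not kernel APIs:

#include <stdio.h>

struct cpuset { int mems_generation; };
struct task { struct cpuset *cpuset; };

/* Stand-in for the kernel's top_cpuset: a file-scope object that can
 * never be freed, so pointers to it stay valid forever. */
static struct cpuset top_cpuset = { .mems_generation = 0 };

/* Consumer on a hot path: no NULL check needed, because an exiting
 * task is parked on &top_cpuset rather than NULL. */
static int task_mems_generation(struct task *tsk)
{
        return tsk->cpuset->mems_generation;    /* always a valid pointer */
}

static void toy_cpuset_exit(struct task *tsk)
{
        /* Detach from the real cpuset, but leave a valid pointer behind. */
        tsk->cpuset = &top_cpuset;
}

int main(void)
{
        struct cpuset cs = { .mems_generation = 42 };
        struct task tsk = { .cpuset = &cs };

        printf("%d\n", task_mems_generation(&tsk));     /* 42 */
        toy_cpuset_exit(&tsk);
        printf("%d\n", task_mems_generation(&tsk));     /* 0, never an oops */
        return 0;
}

Because the root object outlives every task, the consumer stays branch-free, which is the "no cost" the comment refers to.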
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 41f66365f0d8..8d5a5986d621 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -91,10 +91,8 @@ static int save_highmem_zone(struct zone *zone)
 		 * corrected eventually when the cases giving rise to this
 		 * are better understood.
 		 */
-		if (PageReserved(page)) {
-			printk("highmem reserved page?!\n");
+		if (PageReserved(page))
 			continue;
-		}
 		BUG_ON(PageNosave(page));
 		if (PageNosaveFree(page))
 			continue;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 4e90905f0e87..2d9d08f72f76 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -153,13 +153,11 @@ static int swsusp_swap_check(void) /* This is called before saving image */
 {
 	int i;
 
-	if (!swsusp_resume_device)
-		return -ENODEV;
 	spin_lock(&swap_lock);
 	for (i = 0; i < MAX_SWAPFILES; i++) {
 		if (!(swap_info[i].flags & SWP_WRITEOK))
 			continue;
-		if (is_resume_device(swap_info + i)) {
+		if (!swsusp_resume_device || is_resume_device(swap_info + i)) {
 			spin_unlock(&swap_lock);
 			root_swap = i;
 			return 0;
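
The net effect of this swsusp_swap_check() change is that a missing swsusp_resume_device no longer fails early with -ENODEV; the first writable swap file is accepted instead. A standalone sketch of that selection logic, with hypothetical names (pick_root_swap, swap_entry) standing in for the kernel's internals:

#include <stdio.h>

#define MAX_SWAPFILES 32
#define SWP_WRITEOK   0x2

struct swap_entry { unsigned int flags; int is_resume_dev; };

/* Toy model of the patched loop: with no resume device configured
 * (resume_device == 0), the first writable swap file wins. */
static int pick_root_swap(const struct swap_entry *tab, int resume_device)
{
        int i;

        for (i = 0; i < MAX_SWAPFILES; i++) {
                if (!(tab[i].flags & SWP_WRITEOK))
                        continue;
                if (!resume_device || tab[i].is_resume_dev)
                        return i;       /* root_swap in the kernel */
        }
        return -1;                      /* -ENODEV in the kernel */
}

int main(void)
{
        struct swap_entry tab[MAX_SWAPFILES] = {
                [1] = { SWP_WRITEOK, 0 },
                [3] = { SWP_WRITEOK, 1 },
        };

        printf("%d\n", pick_root_swap(tab, 0)); /* 1: first writable swap */
        printf("%d\n", pick_root_swap(tab, 1)); /* 3: the resume device  */
        return 0;
}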
diff --git a/kernel/sched.c b/kernel/sched.c
index 66d957227de9..12d291bf3379 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5058,7 +5058,18 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
 #define MAX_DOMAIN_DISTANCE 32
 
 static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
-	{ [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL };
+	{ [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
+/*
+ * Architectures may override the migration cost and thus avoid
+ * boot-time calibration. Unit is nanoseconds. Mostly useful for
+ * virtualized hardware:
+ */
+#ifdef CONFIG_DEFAULT_MIGRATION_COST
+		CONFIG_DEFAULT_MIGRATION_COST
+#else
+		-1LL
+#endif
+};
 
 /*
  * Allow override of migration cost - in units of microseconds.
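
The new initializer combines a GNU C range designator with a preprocessor default, so a configuration can pin every migration_cost[] slot to a fixed value while -1LL remains the "calibrate at boot" sentinel. A compilable sketch of the same pattern; DEFAULT_COST and MAX_DISTANCE are illustrative stand-ins for CONFIG_DEFAULT_MIGRATION_COST and MAX_DOMAIN_DISTANCE, and the range designator is a GNU extension, so build with gcc:

#include <stdio.h>

/* Define (e.g. via -DDEFAULT_COST=500000) to mimic an architecture
 * that supplies a default migration cost and skips calibration. */
/* #define DEFAULT_COST 500000 */

#define MAX_DISTANCE 32

/* Every slot gets the compile-time default; -1 means "calibrate". */
static long long cost[MAX_DISTANCE] = {
        [0 ... MAX_DISTANCE - 1] =
#ifdef DEFAULT_COST
                DEFAULT_COST
#else
                -1LL
#endif
};

int main(void)
{
        printf("cost[0] = %lld, cost[%d] = %lld\n",
               cost[0], MAX_DISTANCE - 1, cost[MAX_DISTANCE - 1]);
        return 0;
}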
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 71dd6f62efec..7654d55c47f5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -126,8 +126,6 @@ extern int sysctl_hz_timer;
 extern int acct_parm[];
 #endif
 
-int randomize_va_space = 1;
-
 static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t,
 		       ctl_table *, void **);
 static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
diff --git a/kernel/timer.c b/kernel/timer.c
index b9dad3994676..fe3a9a9f8328 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -717,12 +717,16 @@ static void second_overflow(void)
 #endif
 }
 
-/* in the NTP reference this is called "hardclock()" */
-static void update_wall_time_one_tick(void)
+/*
+ * Returns how many microseconds we need to add to xtime this tick
+ * in doing an adjustment requested with adjtime.
+ */
+static long adjtime_adjustment(void)
 {
-	long time_adjust_step, delta_nsec;
+	long time_adjust_step;
 
-	if ((time_adjust_step = time_adjust) != 0 ) {
+	time_adjust_step = time_adjust;
+	if (time_adjust_step) {
 		/*
 		 * We are doing an adjtime thing. Prepare time_adjust_step to
 		 * be within bounds. Note that a positive time_adjust means we
@@ -733,10 +737,19 @@ static void update_wall_time_one_tick(void)
 		 */
 		time_adjust_step = min(time_adjust_step, (long)tickadj);
 		time_adjust_step = max(time_adjust_step, (long)-tickadj);
+	}
+	return time_adjust_step;
+}
 
+/* in the NTP reference this is called "hardclock()" */
+static void update_wall_time_one_tick(void)
+{
+	long time_adjust_step, delta_nsec;
+
+	time_adjust_step = adjtime_adjustment();
+	if (time_adjust_step)
 		/* Reduce by this step the amount of time left */
 		time_adjust -= time_adjust_step;
-	}
 	delta_nsec = tick_nsec + time_adjust_step * 1000;
 	/*
 	 * Advance the phase, once it gets to one microsecond, then
@@ -759,6 +772,22 @@ static void update_wall_time_one_tick(void)
 }
 
 /*
+ * Return how long ticks are at the moment, that is, how much time
+ * update_wall_time_one_tick will add to xtime next time we call it
+ * (assuming no calls to do_adjtimex in the meantime).
+ * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10
+ * bits to the right of the binary point.
+ * This function has no side-effects.
+ */
+u64 current_tick_length(void)
+{
+	long delta_nsec;
+
+	delta_nsec = tick_nsec + adjtime_adjustment() * 1000;
+	return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj;
+}
+
+/*
  * Using a loop looks inefficient, but "ticks" is
  * usually just one (we shouldn't be losing ticks,
 * we're doing this this way mainly for interrupt
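
Taken together, the timer.c changes split the adjtime slewing step out of update_wall_time_one_tick() so that current_tick_length() can report the tick length without side effects, as fixed-point nanoseconds with SHIFT_SCALE-10 fractional bits. A userspace model of the arithmetic; the constants here (a HZ=100 tick of 10 ms, tickadj of 5 usec, SHIFT_SCALE of 22) are assumptions of this sketch, and the NTP phase term time_adj is held at zero:

#include <stdio.h>

#define SHIFT_SCALE 22                  /* assumed, per <linux/timex.h> */
static long tick_nsec  = 10000000;      /* 10 ms per tick at HZ=100 */
static long tickadj    = 5;             /* max adjtime slew, usec per tick */
static long time_adjust;                /* pending adjtime() offset, usec */
static long time_adj;                   /* NTP phase term, ignored here */

/* Model of adjtime_adjustment(): the slice of the pending adjtime()
 * offset folded into xtime this tick, clamped to +/- tickadj. */
static long adjtime_adjustment(void)
{
        long step = time_adjust;

        if (step > tickadj)
                step = tickadj;
        else if (step < -tickadj)
                step = -tickadj;
        return step;
}

/* Model of current_tick_length(): side-effect free, unlike
 * update_wall_time_one_tick(), which also consumes the step via
 * time_adjust -= step. */
static unsigned long long current_tick_length(void)
{
        long delta_nsec = tick_nsec + adjtime_adjustment() * 1000;

        return ((unsigned long long)delta_nsec << (SHIFT_SCALE - 10))
                + time_adj;
}

int main(void)
{
        time_adjust = 3000;     /* adjtime() asked us to gain 3000 usec */

        /* The tick is stretched by 5 usec: 10000000 + 5000 ns, reported
         * with 12 fractional bits (SHIFT_SCALE - 10 = 12). */
        printf("step = %ld usec\n", adjtime_adjustment());
        printf("tick = %llu fixed-point (= %llu ns)\n",
               current_tick_length(),
               current_tick_length() >> (SHIFT_SCALE - 10));
        return 0;
}

Calling current_tick_length() twice here is deliberate: because the helper has no side effects, both calls report the same stretched tick until someone actually consumes time_adjust.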