Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cpuset.c        35
-rw-r--r--  kernel/exit.c          14
-rw-r--r--  kernel/posix-timers.c   8
-rw-r--r--  kernel/sched.c         64
-rw-r--r--  kernel/signal.c         4
-rw-r--r--  kernel/sysctl.c         2
-rw-r--r--  kernel/workqueue.c      2
7 files changed, 86 insertions, 43 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 2a75e44e1a41..fe2f71f92ae0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1554,7 +1554,7 @@ struct ctr_struct {
  * when reading out p->cpuset, as we don't really care if it changes
  * on the next cycle, and we are not going to try to dereference it.
  */
-static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs)
+static int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs)
 {
 	int n = 0;
 	struct task_struct *g, *p;
@@ -2150,6 +2150,33 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 }
 
 /**
+ * cpuset_lock - lock out any changes to cpuset structures
+ *
+ * The out of memory (oom) code needs to lock down cpusets
+ * from being changed while it scans the tasklist looking for a
+ * task in an overlapping cpuset. Expose callback_sem via this
+ * cpuset_lock() routine, so the oom code can lock it, before
+ * locking the task list. The tasklist_lock is a spinlock, so
+ * must be taken inside callback_sem.
+ */
+
+void cpuset_lock(void)
+{
+	down(&callback_sem);
+}
+
+/**
+ * cpuset_unlock - release lock on cpuset changes
+ *
+ * Undo the lock taken in a previous cpuset_lock() call.
+ */
+
+void cpuset_unlock(void)
+{
+	up(&callback_sem);
+}
+
+/**
  * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
  * @p: pointer to task_struct of some other task.
  *
@@ -2158,7 +2185,7 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
  * determine if task @p's memory usage might impact the memory
  * available to the current task.
  *
- * Acquires callback_sem - not suitable for calling from a fast path.
+ * Call while holding callback_sem.
  **/
 
 int cpuset_excl_nodes_overlap(const struct task_struct *p)
@@ -2166,8 +2193,6 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
 	const struct cpuset *cs1, *cs2;	/* my and p's cpuset ancestors */
 	int overlap = 0;		/* do cpusets overlap? */
 
-	down(&callback_sem);
-
 	task_lock(current);
 	if (current->flags & PF_EXITING) {
 		task_unlock(current);
@@ -2186,8 +2211,6 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
 
 	overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
 done:
-	up(&callback_sem);
-
 	return overlap;
 }
 
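
Note: the kerneldoc added above fixes a lock order for callers: callback_sem (taken via cpuset_lock()) must be acquired before tasklist_lock, and cpuset_excl_nodes_overlap() now expects the semaphore to already be held. A minimal caller sketch, not part of this patch, assuming a made-up helper name pick_overlapping_task() for an oom-style tasklist scan:

/*
 * Sketch only -- illustrates the documented lock order: cpuset_lock()
 * (callback_sem) is taken first, then tasklist_lock, so that
 * cpuset_excl_nodes_overlap() can run without taking the semaphore
 * itself.  pick_overlapping_task() is a hypothetical name.
 */
#include <linux/cpuset.h>
#include <linux/sched.h>

static struct task_struct *pick_overlapping_task(void)
{
	struct task_struct *g, *p, *chosen = NULL;

	cpuset_lock();			/* callback_sem, outer lock */
	read_lock(&tasklist_lock);	/* spinlock, nests inside it */
	do_each_thread(g, p) {
		if (cpuset_excl_nodes_overlap(p)) {
			chosen = p;
			goto out;
		}
	} while_each_thread(g, p);
out:
	read_unlock(&tasklist_lock);
	cpuset_unlock();
	return chosen;
}
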
diff --git a/kernel/exit.c b/kernel/exit.c
index f8e609ff1893..93cee3671332 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -193,7 +193,7 @@ int is_orphaned_pgrp(int pgrp)
 	return retval;
 }
 
-static inline int has_stopped_jobs(int pgrp)
+static int has_stopped_jobs(int pgrp)
 {
 	int retval = 0;
 	struct task_struct *p;
@@ -230,7 +230,7 @@ static inline int has_stopped_jobs(int pgrp)
  *
  * NOTE that reparent_to_init() gives the caller full capabilities.
  */
-static inline void reparent_to_init(void)
+static void reparent_to_init(void)
 {
 	write_lock_irq(&tasklist_lock);
 
@@ -244,7 +244,9 @@ static inline void reparent_to_init(void)
 	/* Set the exit signal to SIGCHLD so we signal init on exit */
 	current->exit_signal = SIGCHLD;
 
-	if ((current->policy == SCHED_NORMAL) && (task_nice(current) < 0))
+	if ((current->policy == SCHED_NORMAL ||
+			current->policy == SCHED_BATCH)
+				&& (task_nice(current) < 0))
 		set_user_nice(current, 0);
 	/* cpus_allowed? */
 	/* rt_priority? */
@@ -367,7 +369,7 @@ void daemonize(const char *name, ...)
 
 EXPORT_SYMBOL(daemonize);
 
-static inline void close_files(struct files_struct * files)
+static void close_files(struct files_struct * files)
 {
 	int i, j;
 	struct fdtable *fdt;
@@ -541,7 +543,7 @@ static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_re
 	p->real_parent = reaper;
 }
 
-static inline void reparent_thread(task_t *p, task_t *father, int traced)
+static void reparent_thread(task_t *p, task_t *father, int traced)
 {
 	/* We don't want people slaying init. */
 	if (p->exit_signal != -1)
@@ -605,7 +607,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced)
  * group, and if no such member exists, give it to
  * the global child reaper process (ie "init")
  */
-static inline void forget_original_parent(struct task_struct * father,
+static void forget_original_parent(struct task_struct * father,
 					  struct list_head *to_release)
 {
 	struct task_struct *p, *reaper = father;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9e66e614862a..197208b3aa2a 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -192,7 +192,7 @@ static inline int common_clock_set(const clockid_t which_clock,
 	return do_sys_settimeofday(tp, NULL);
 }
 
-static inline int common_timer_create(struct k_itimer *new_timer)
+static int common_timer_create(struct k_itimer *new_timer)
 {
 	hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock);
 	new_timer->it.real.timer.data = new_timer;
@@ -361,7 +361,7 @@ static int posix_timer_fn(void *data)
 	return ret;
 }
 
-static inline struct task_struct * good_sigevent(sigevent_t * event)
+static struct task_struct * good_sigevent(sigevent_t * event)
 {
 	struct task_struct *rtn = current->group_leader;
 
@@ -687,7 +687,7 @@ sys_timer_getoverrun(timer_t timer_id)
 
 /* Set a POSIX.1b interval timer. */
 /* timr->it_lock is taken. */
-static inline int
+static int
 common_timer_set(struct k_itimer *timr, int flags,
 		 struct itimerspec *new_setting, struct itimerspec *old_setting)
 {
@@ -829,7 +829,7 @@ retry_delete:
 /*
  * return timer owned by the process, used by exit_itimers
  */
-static inline void itimer_delete(struct k_itimer *timer)
+static void itimer_delete(struct k_itimer *timer)
 {
 	unsigned long flags;
 
diff --git a/kernel/sched.c b/kernel/sched.c
index c9dec2aa1976..788ecce1e0e4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -521,7 +521,7 @@ static inline void sched_info_dequeued(task_t *t)
  * long it was waiting to run. We also note when it began so that we
  * can keep stats on how long its timeslice is.
  */
-static inline void sched_info_arrive(task_t *t)
+static void sched_info_arrive(task_t *t)
 {
 	unsigned long now = jiffies, diff = 0;
 	struct runqueue *rq = task_rq(t);
@@ -748,10 +748,14 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
 	unsigned long long __sleep_time = now - p->timestamp;
 	unsigned long sleep_time;
 
-	if (__sleep_time > NS_MAX_SLEEP_AVG)
-		sleep_time = NS_MAX_SLEEP_AVG;
-	else
-		sleep_time = (unsigned long)__sleep_time;
+	if (unlikely(p->policy == SCHED_BATCH))
+		sleep_time = 0;
+	else {
+		if (__sleep_time > NS_MAX_SLEEP_AVG)
+			sleep_time = NS_MAX_SLEEP_AVG;
+		else
+			sleep_time = (unsigned long)__sleep_time;
+	}
 
 	if (likely(sleep_time > 0)) {
 		/*
@@ -1003,7 +1007,7 @@ void kick_process(task_t *p)
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static inline unsigned long __source_load(int cpu, int type, enum idle_type idle)
+static unsigned long __source_load(int cpu, int type, enum idle_type idle)
 {
 	runqueue_t *rq = cpu_rq(cpu);
 	unsigned long running = rq->nr_running;
@@ -1866,7 +1870,7 @@ void sched_exec(void)
  * pull_task - move a task from a remote runqueue to the local runqueue.
  * Both runqueues must be locked.
  */
-static inline
+static
 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
 	       runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
@@ -1888,7 +1892,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
-static inline
+static
 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
 		     struct sched_domain *sd, enum idle_type idle,
 		     int *all_pinned)
@@ -2374,7 +2378,7 @@ out_balanced:
  * idle_balance is called by schedule() if this_cpu is about to become
  * idle. Attempts to pull tasks from other CPUs.
  */
-static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
+static void idle_balance(int this_cpu, runqueue_t *this_rq)
 {
 	struct sched_domain *sd;
 
@@ -2758,7 +2762,7 @@ static inline void wakeup_busy_runqueue(runqueue_t *rq)
 		resched_task(rq->idle);
 }
 
-static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 {
 	struct sched_domain *tmp, *sd = NULL;
 	cpumask_t sibling_map;
@@ -2812,7 +2816,7 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
 	return p->time_slice * (100 - sd->per_cpu_gain) / 100;
 }
 
-static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
 {
 	struct sched_domain *tmp, *sd = NULL;
 	cpumask_t sibling_map;
@@ -3560,7 +3564,7 @@ void set_user_nice(task_t *p, long nice)
 	 * The RT priorities are set via sched_setscheduler(), but we still
 	 * allow the 'normal' nice value to be set - but as expected
 	 * it wont have any effect on scheduling until the task is
-	 * not SCHED_NORMAL:
+	 * not SCHED_NORMAL/SCHED_BATCH:
 	 */
 	if (rt_task(p)) {
 		p->static_prio = NICE_TO_PRIO(nice);
@@ -3706,10 +3710,16 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
 	BUG_ON(p->array);
 	p->policy = policy;
 	p->rt_priority = prio;
-	if (policy != SCHED_NORMAL)
+	if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
 		p->prio = MAX_RT_PRIO-1 - p->rt_priority;
-	else
+	} else {
 		p->prio = p->static_prio;
+		/*
+		 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
+		 */
+		if (policy == SCHED_BATCH)
+			p->sleep_avg = 0;
+	}
 }
 
 /**
@@ -3733,29 +3743,35 @@ recheck:
 	if (policy < 0)
 		policy = oldpolicy = p->policy;
 	else if (policy != SCHED_FIFO && policy != SCHED_RR &&
-			policy != SCHED_NORMAL)
+			policy != SCHED_NORMAL && policy != SCHED_BATCH)
 		return -EINVAL;
 	/*
 	 * Valid priorities for SCHED_FIFO and SCHED_RR are
-	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
+	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
+	 * SCHED_BATCH is 0.
 	 */
 	if (param->sched_priority < 0 ||
 	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
 	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
 		return -EINVAL;
-	if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
+	if ((policy == SCHED_NORMAL || policy == SCHED_BATCH)
+					!= (param->sched_priority == 0))
 		return -EINVAL;
 
 	/*
 	 * Allow unprivileged RT tasks to decrease priority:
 	 */
 	if (!capable(CAP_SYS_NICE)) {
-		/* can't change policy */
-		if (policy != p->policy &&
-			!p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
+		/*
+		 * can't change policy, except between SCHED_NORMAL
+		 * and SCHED_BATCH:
+		 */
+		if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
+			(policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
+				!p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
 			return -EPERM;
 		/* can't increase priority */
-		if (policy != SCHED_NORMAL &&
+		if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
 			param->sched_priority > p->rt_priority &&
 			param->sched_priority >
 			p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
@@ -4233,6 +4249,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
 		ret = MAX_USER_RT_PRIO-1;
 		break;
 	case SCHED_NORMAL:
+	case SCHED_BATCH:
 		ret = 0;
 		break;
 	}
@@ -4256,6 +4273,7 @@ asmlinkage long sys_sched_get_priority_min(int policy)
 		ret = 1;
 		break;
 	case SCHED_NORMAL:
+	case SCHED_BATCH:
 		ret = 0;
 	}
 	return ret;
@@ -5990,7 +6008,7 @@ next_sg:
  * Detach sched domains from a group of cpus specified in cpu_map
  * These cpus will now be attached to the NULL domain
  */
-static inline void detach_destroy_domains(const cpumask_t *cpu_map)
+static void detach_destroy_domains(const cpumask_t *cpu_map)
 {
 	int i;
 
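
Note: the sched_setscheduler() checks added above require SCHED_BATCH to be requested with priority 0 and let an unprivileged task switch between SCHED_NORMAL and SCHED_BATCH. A minimal userspace sketch, not part of this patch; the local SCHED_BATCH define (value 3, matching the kernel header) is an assumption for systems whose libc headers predate the new policy:

/* Sketch: put the calling process into SCHED_BATCH. */
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

#ifndef SCHED_BATCH
#define SCHED_BATCH 3	/* assumed: kernel value for the new policy */
#endif

int main(void)
{
	struct sched_param param = { .sched_priority = 0 };	/* must be 0 */

	if (sched_setscheduler(0, SCHED_BATCH, &param) == -1) {
		fprintf(stderr, "sched_setscheduler: %s\n", strerror(errno));
		return 1;
	}
	printf("policy is now %d\n", sched_getscheduler(0));
	return 0;
}
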
diff --git a/kernel/signal.c b/kernel/signal.c
index 1da2e74beb97..5dafbd36d62e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -476,7 +476,7 @@ unblock_all_signals(void)
 	spin_unlock_irqrestore(&current->sighand->siglock, flags);
 }
 
-static inline int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
+static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
 {
 	struct sigqueue *q, *first = NULL;
 	int still_pending = 0;
@@ -1881,7 +1881,7 @@ do_signal_stop(int signr)
  * We return zero if we still hold the siglock and should look
  * for another signal without checking group_stop_count again.
  */
-static inline int handle_group_stop(void)
+static int handle_group_stop(void)
 {
 	int stop_count;
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 62d4d9566876..f5d69b6e29f5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -648,7 +648,7 @@ static ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
-#if defined(CONFIG_S390)
+#if defined(CONFIG_S390) && defined(CONFIG_SMP)
 	{
 		.ctl_name	= KERN_SPIN_RETRY,
 		.procname	= "spin_retry",
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 82c4fa70595c..b052e2c4c710 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -147,7 +147,7 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq,
 	return ret;
 }
 
-static inline void run_workqueue(struct cpu_workqueue_struct *cwq)
+static void run_workqueue(struct cpu_workqueue_struct *cwq)
 {
 	unsigned long flags;
 