author     Christoph Lameter <clameter@sgi.com>    2008-02-14 15:05:41 -0500
committer  Christoph Lameter <clameter@sgi.com>    2008-02-14 15:05:41 -0500
commit     c5974932c1e8514d3478573bb52beebeb2c786dd (patch)
tree       a204156fbb0036fb76e89ceffa15a30e90bc3f75 /kernel
parent     9e40ade04c45a46f6b3d647e0bdac1a32bfaa3a9 (diff)
parent     e760e716d47b48caf98da348368fd41b4a9b9e7e (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/marker.c       677
-rw-r--r--  kernel/module.c         7
-rw-r--r--  kernel/rcupdate.c       5
-rw-r--r--  kernel/rtmutex.c        5
-rw-r--r--  kernel/sched.c        494
-rw-r--r--  kernel/sched_rt.c     102
-rw-r--r--  kernel/signal.c         2
-rw-r--r--  kernel/sysctl.c        36
-rw-r--r--  kernel/timeconst.pl     2
-rw-r--r--  kernel/user.c          50
10 files changed, 1003 insertions, 377 deletions
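
A minimal sketch of how a probe would be written against the reworked marker interface in kernel/marker.c below: probes now receive their own private data, the call site's private data, the format string and a va_list prepared by marker_probe_cb(), and registration/unregistration pass the probe function and its private data explicitly. The marker name "subsystem_event", its format string and the probe body are invented for illustration; only the function signatures follow the diff itself.

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/marker.h>

/* New-style probe: probe private data, call-site data, format, va_list. */
static void probe_subsystem_event(void *probe_private, void *call_private,
				  const char *fmt, va_list *args)
{
	/* The argument types must match the marker's format string. */
	int value = va_arg(*args, int);

	printk(KERN_DEBUG "subsystem_event hit, value %d\n", value);
}

static int __init probe_init(void)
{
	/* probe_private must be NULL or a valid allocated pointer. */
	return marker_probe_register("subsystem_event", "value %d",
				     probe_subsystem_event, NULL);
}

static void __exit probe_exit(void)
{
	marker_probe_unregister("subsystem_event",
				probe_subsystem_event, NULL);
}

module_init(probe_init);
module_exit(probe_exit);
MODULE_LICENSE("GPL");
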
diff --git a/kernel/marker.c b/kernel/marker.c
index 5323cfaedbce..c4c2cd8b61f5 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -27,35 +27,42 @@
27extern struct marker __start___markers[]; 27extern struct marker __start___markers[];
28extern struct marker __stop___markers[]; 28extern struct marker __stop___markers[];
29 29
30/* Set to 1 to enable marker debug output */
31const int marker_debug;
32
30/* 33/*
31 * markers_mutex nests inside module_mutex. Markers mutex protects the builtin 34 * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
32 * and module markers, the hash table and deferred_sync. 35 * and module markers and the hash table.
33 */ 36 */
34static DEFINE_MUTEX(markers_mutex); 37static DEFINE_MUTEX(markers_mutex);
35 38
36/* 39/*
37 * Marker deferred synchronization.
38 * Upon marker probe_unregister, we delay call to synchronize_sched() to
39 * accelerate mass unregistration (only when there is no more reference to a
40 * given module do we call synchronize_sched()). However, we need to make sure
41 * every critical region has ended before we re-arm a marker that has been
42 * unregistered and then registered back with a different probe data.
43 */
44static int deferred_sync;
45
46/*
47 * Marker hash table, containing the active markers. 40 * Marker hash table, containing the active markers.
48 * Protected by module_mutex. 41 * Protected by module_mutex.
49 */ 42 */
50#define MARKER_HASH_BITS 6 43#define MARKER_HASH_BITS 6
51#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) 44#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
52 45
46/*
47 * Note about RCU :
48 * It is used to make sure every handler has finished using its private data
49 * between two consecutive operation (add or remove) on a given marker. It is
50 * also used to delay the free of multiple probes array until a quiescent state
51 * is reached.
52 * marker entries modifications are protected by the markers_mutex.
53 */
53struct marker_entry { 54struct marker_entry {
54 struct hlist_node hlist; 55 struct hlist_node hlist;
55 char *format; 56 char *format;
56 marker_probe_func *probe; 57 void (*call)(const struct marker *mdata, /* Probe wrapper */
57 void *private; 58 void *call_private, const char *fmt, ...);
59 struct marker_probe_closure single;
60 struct marker_probe_closure *multi;
58 int refcount; /* Number of times armed. 0 if disarmed. */ 61 int refcount; /* Number of times armed. 0 if disarmed. */
62 struct rcu_head rcu;
63 void *oldptr;
64 char rcu_pending:1;
65 char ptype:1;
59 char name[0]; /* Contains name'\0'format'\0' */ 66 char name[0]; /* Contains name'\0'format'\0' */
60}; 67};
61 68
@@ -63,7 +70,8 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
63 70
64/** 71/**
65 * __mark_empty_function - Empty probe callback 72 * __mark_empty_function - Empty probe callback
66 * @mdata: pointer of type const struct marker 73 * @probe_private: probe private data
74 * @call_private: call site private data
67 * @fmt: format string 75 * @fmt: format string
68 * @...: variable argument list 76 * @...: variable argument list
69 * 77 *
@@ -72,13 +80,267 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
72 * though the function pointer change and the marker enabling are two distinct 80 * though the function pointer change and the marker enabling are two distinct
73 * operations that modifies the execution flow of preemptible code. 81 * operations that modifies the execution flow of preemptible code.
74 */ 82 */
75void __mark_empty_function(const struct marker *mdata, void *private, 83void __mark_empty_function(void *probe_private, void *call_private,
76 const char *fmt, ...) 84 const char *fmt, va_list *args)
77{ 85{
78} 86}
79EXPORT_SYMBOL_GPL(__mark_empty_function); 87EXPORT_SYMBOL_GPL(__mark_empty_function);
80 88
81/* 89/*
90 * marker_probe_cb Callback that prepares the variable argument list for probes.
91 * @mdata: pointer of type struct marker
92 * @call_private: caller site private data
93 * @fmt: format string
94 * @...: Variable argument list.
95 *
96 * Since we do not use "typical" pointer based RCU in the 1 argument case, we
97 * need to put a full smp_rmb() in this branch. This is why we do not use
98 * rcu_dereference() for the pointer read.
99 */
100void marker_probe_cb(const struct marker *mdata, void *call_private,
101 const char *fmt, ...)
102{
103 va_list args;
104 char ptype;
105
106 /*
107 * disabling preemption to make sure the teardown of the callbacks can
108 * be done correctly when they are in modules and they insure RCU read
109 * coherency.
110 */
111 preempt_disable();
112 ptype = ACCESS_ONCE(mdata->ptype);
113 if (likely(!ptype)) {
114 marker_probe_func *func;
115 /* Must read the ptype before ptr. They are not data dependant,
116 * so we put an explicit smp_rmb() here. */
117 smp_rmb();
118 func = ACCESS_ONCE(mdata->single.func);
119 /* Must read the ptr before private data. They are not data
120 * dependant, so we put an explicit smp_rmb() here. */
121 smp_rmb();
122 va_start(args, fmt);
123 func(mdata->single.probe_private, call_private, fmt, &args);
124 va_end(args);
125 } else {
126 struct marker_probe_closure *multi;
127 int i;
128 /*
129 * multi points to an array, therefore accessing the array
130 * depends on reading multi. However, even in this case,
131 * we must insure that the pointer is read _before_ the array
132 * data. Same as rcu_dereference, but we need a full smp_rmb()
133 * in the fast path, so put the explicit barrier here.
134 */
135 smp_read_barrier_depends();
136 multi = ACCESS_ONCE(mdata->multi);
137 for (i = 0; multi[i].func; i++) {
138 va_start(args, fmt);
139 multi[i].func(multi[i].probe_private, call_private, fmt,
140 &args);
141 va_end(args);
142 }
143 }
144 preempt_enable();
145}
146EXPORT_SYMBOL_GPL(marker_probe_cb);
147
148/*
149 * marker_probe_cb Callback that does not prepare the variable argument list.
150 * @mdata: pointer of type struct marker
151 * @call_private: caller site private data
152 * @fmt: format string
153 * @...: Variable argument list.
154 *
155 * Should be connected to markers "MARK_NOARGS".
156 */
157void marker_probe_cb_noarg(const struct marker *mdata,
158 void *call_private, const char *fmt, ...)
159{
160 va_list args; /* not initialized */
161 char ptype;
162
163 preempt_disable();
164 ptype = ACCESS_ONCE(mdata->ptype);
165 if (likely(!ptype)) {
166 marker_probe_func *func;
167 /* Must read the ptype before ptr. They are not data dependant,
168 * so we put an explicit smp_rmb() here. */
169 smp_rmb();
170 func = ACCESS_ONCE(mdata->single.func);
171 /* Must read the ptr before private data. They are not data
172 * dependant, so we put an explicit smp_rmb() here. */
173 smp_rmb();
174 func(mdata->single.probe_private, call_private, fmt, &args);
175 } else {
176 struct marker_probe_closure *multi;
177 int i;
178 /*
179 * multi points to an array, therefore accessing the array
180 * depends on reading multi. However, even in this case,
181 * we must insure that the pointer is read _before_ the array
182 * data. Same as rcu_dereference, but we need a full smp_rmb()
183 * in the fast path, so put the explicit barrier here.
184 */
185 smp_read_barrier_depends();
186 multi = ACCESS_ONCE(mdata->multi);
187 for (i = 0; multi[i].func; i++)
188 multi[i].func(multi[i].probe_private, call_private, fmt,
189 &args);
190 }
191 preempt_enable();
192}
193EXPORT_SYMBOL_GPL(marker_probe_cb_noarg);
194
195static void free_old_closure(struct rcu_head *head)
196{
197 struct marker_entry *entry = container_of(head,
198 struct marker_entry, rcu);
199 kfree(entry->oldptr);
200 /* Make sure we free the data before setting the pending flag to 0 */
201 smp_wmb();
202 entry->rcu_pending = 0;
203}
204
205static void debug_print_probes(struct marker_entry *entry)
206{
207 int i;
208
209 if (!marker_debug)
210 return;
211
212 if (!entry->ptype) {
213 printk(KERN_DEBUG "Single probe : %p %p\n",
214 entry->single.func,
215 entry->single.probe_private);
216 } else {
217 for (i = 0; entry->multi[i].func; i++)
218 printk(KERN_DEBUG "Multi probe %d : %p %p\n", i,
219 entry->multi[i].func,
220 entry->multi[i].probe_private);
221 }
222}
223
224static struct marker_probe_closure *
225marker_entry_add_probe(struct marker_entry *entry,
226 marker_probe_func *probe, void *probe_private)
227{
228 int nr_probes = 0;
229 struct marker_probe_closure *old, *new;
230
231 WARN_ON(!probe);
232
233 debug_print_probes(entry);
234 old = entry->multi;
235 if (!entry->ptype) {
236 if (entry->single.func == probe &&
237 entry->single.probe_private == probe_private)
238 return ERR_PTR(-EBUSY);
239 if (entry->single.func == __mark_empty_function) {
240 /* 0 -> 1 probes */
241 entry->single.func = probe;
242 entry->single.probe_private = probe_private;
243 entry->refcount = 1;
244 entry->ptype = 0;
245 debug_print_probes(entry);
246 return NULL;
247 } else {
248 /* 1 -> 2 probes */
249 nr_probes = 1;
250 old = NULL;
251 }
252 } else {
253 /* (N -> N+1), (N != 0, 1) probes */
254 for (nr_probes = 0; old[nr_probes].func; nr_probes++)
255 if (old[nr_probes].func == probe
256 && old[nr_probes].probe_private
257 == probe_private)
258 return ERR_PTR(-EBUSY);
259 }
260 /* + 2 : one for new probe, one for NULL func */
261 new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure),
262 GFP_KERNEL);
263 if (new == NULL)
264 return ERR_PTR(-ENOMEM);
265 if (!old)
266 new[0] = entry->single;
267 else
268 memcpy(new, old,
269 nr_probes * sizeof(struct marker_probe_closure));
270 new[nr_probes].func = probe;
271 new[nr_probes].probe_private = probe_private;
272 entry->refcount = nr_probes + 1;
273 entry->multi = new;
274 entry->ptype = 1;
275 debug_print_probes(entry);
276 return old;
277}
278
279static struct marker_probe_closure *
280marker_entry_remove_probe(struct marker_entry *entry,
281 marker_probe_func *probe, void *probe_private)
282{
283 int nr_probes = 0, nr_del = 0, i;
284 struct marker_probe_closure *old, *new;
285
286 old = entry->multi;
287
288 debug_print_probes(entry);
289 if (!entry->ptype) {
290 /* 0 -> N is an error */
291 WARN_ON(entry->single.func == __mark_empty_function);
292 /* 1 -> 0 probes */
293 WARN_ON(probe && entry->single.func != probe);
294 WARN_ON(entry->single.probe_private != probe_private);
295 entry->single.func = __mark_empty_function;
296 entry->refcount = 0;
297 entry->ptype = 0;
298 debug_print_probes(entry);
299 return NULL;
300 } else {
301 /* (N -> M), (N > 1, M >= 0) probes */
302 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
303 if ((!probe || old[nr_probes].func == probe)
304 && old[nr_probes].probe_private
305 == probe_private)
306 nr_del++;
307 }
308 }
309
310 if (nr_probes - nr_del == 0) {
311 /* N -> 0, (N > 1) */
312 entry->single.func = __mark_empty_function;
313 entry->refcount = 0;
314 entry->ptype = 0;
315 } else if (nr_probes - nr_del == 1) {
316 /* N -> 1, (N > 1) */
317 for (i = 0; old[i].func; i++)
318 if ((probe && old[i].func != probe) ||
319 old[i].probe_private != probe_private)
320 entry->single = old[i];
321 entry->refcount = 1;
322 entry->ptype = 0;
323 } else {
324 int j = 0;
325 /* N -> M, (N > 1, M > 1) */
326 /* + 1 for NULL */
327 new = kzalloc((nr_probes - nr_del + 1)
328 * sizeof(struct marker_probe_closure), GFP_KERNEL);
329 if (new == NULL)
330 return ERR_PTR(-ENOMEM);
331 for (i = 0; old[i].func; i++)
332 if ((probe && old[i].func != probe) ||
333 old[i].probe_private != probe_private)
334 new[j++] = old[i];
335 entry->refcount = nr_probes - nr_del;
336 entry->ptype = 1;
337 entry->multi = new;
338 }
339 debug_print_probes(entry);
340 return old;
341}
342
343/*
82 * Get marker if the marker is present in the marker hash table. 344 * Get marker if the marker is present in the marker hash table.
83 * Must be called with markers_mutex held. 345 * Must be called with markers_mutex held.
84 * Returns NULL if not present. 346 * Returns NULL if not present.
@@ -102,8 +364,7 @@ static struct marker_entry *get_marker(const char *name)
102 * Add the marker to the marker hash table. Must be called with markers_mutex 364 * Add the marker to the marker hash table. Must be called with markers_mutex
103 * held. 365 * held.
104 */ 366 */
105static int add_marker(const char *name, const char *format, 367static struct marker_entry *add_marker(const char *name, const char *format)
106 marker_probe_func *probe, void *private)
107{ 368{
108 struct hlist_head *head; 369 struct hlist_head *head;
109 struct hlist_node *node; 370 struct hlist_node *node;
@@ -118,9 +379,8 @@ static int add_marker(const char *name, const char *format,
118 hlist_for_each_entry(e, node, head, hlist) { 379 hlist_for_each_entry(e, node, head, hlist) {
119 if (!strcmp(name, e->name)) { 380 if (!strcmp(name, e->name)) {
120 printk(KERN_NOTICE 381 printk(KERN_NOTICE
121 "Marker %s busy, probe %p already installed\n", 382 "Marker %s busy\n", name);
122 name, e->probe); 383 return ERR_PTR(-EBUSY); /* Already there */
123 return -EBUSY; /* Already there */
124 } 384 }
125 } 385 }
126 /* 386 /*
@@ -130,34 +390,42 @@ static int add_marker(const char *name, const char *format,
130 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, 390 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
131 GFP_KERNEL); 391 GFP_KERNEL);
132 if (!e) 392 if (!e)
133 return -ENOMEM; 393 return ERR_PTR(-ENOMEM);
134 memcpy(&e->name[0], name, name_len); 394 memcpy(&e->name[0], name, name_len);
135 if (format) { 395 if (format) {
136 e->format = &e->name[name_len]; 396 e->format = &e->name[name_len];
137 memcpy(e->format, format, format_len); 397 memcpy(e->format, format, format_len);
398 if (strcmp(e->format, MARK_NOARGS) == 0)
399 e->call = marker_probe_cb_noarg;
400 else
401 e->call = marker_probe_cb;
138 trace_mark(core_marker_format, "name %s format %s", 402 trace_mark(core_marker_format, "name %s format %s",
139 e->name, e->format); 403 e->name, e->format);
140 } else 404 } else {
141 e->format = NULL; 405 e->format = NULL;
142 e->probe = probe; 406 e->call = marker_probe_cb;
143 e->private = private; 407 }
408 e->single.func = __mark_empty_function;
409 e->single.probe_private = NULL;
410 e->multi = NULL;
411 e->ptype = 0;
144 e->refcount = 0; 412 e->refcount = 0;
413 e->rcu_pending = 0;
145 hlist_add_head(&e->hlist, head); 414 hlist_add_head(&e->hlist, head);
146 return 0; 415 return e;
147} 416}
148 417
149/* 418/*
150 * Remove the marker from the marker hash table. Must be called with mutex_lock 419 * Remove the marker from the marker hash table. Must be called with mutex_lock
151 * held. 420 * held.
152 */ 421 */
153static void *remove_marker(const char *name) 422static int remove_marker(const char *name)
154{ 423{
155 struct hlist_head *head; 424 struct hlist_head *head;
156 struct hlist_node *node; 425 struct hlist_node *node;
157 struct marker_entry *e; 426 struct marker_entry *e;
158 int found = 0; 427 int found = 0;
159 size_t len = strlen(name) + 1; 428 size_t len = strlen(name) + 1;
160 void *private = NULL;
161 u32 hash = jhash(name, len-1, 0); 429 u32 hash = jhash(name, len-1, 0);
162 430
163 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; 431 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
@@ -167,12 +435,16 @@ static void *remove_marker(const char *name)
167 break; 435 break;
168 } 436 }
169 } 437 }
170 if (found) { 438 if (!found)
171 private = e->private; 439 return -ENOENT;
172 hlist_del(&e->hlist); 440 if (e->single.func != __mark_empty_function)
173 kfree(e); 441 return -EBUSY;
174 } 442 hlist_del(&e->hlist);
175 return private; 443 /* Make sure the call_rcu has been executed */
444 if (e->rcu_pending)
445 rcu_barrier();
446 kfree(e);
447 return 0;
176} 448}
177 449
178/* 450/*
@@ -184,6 +456,7 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
184 size_t name_len = strlen((*entry)->name) + 1; 456 size_t name_len = strlen((*entry)->name) + 1;
185 size_t format_len = strlen(format) + 1; 457 size_t format_len = strlen(format) + 1;
186 458
459
187 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, 460 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
188 GFP_KERNEL); 461 GFP_KERNEL);
189 if (!e) 462 if (!e)
@@ -191,11 +464,20 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
191 memcpy(&e->name[0], (*entry)->name, name_len); 464 memcpy(&e->name[0], (*entry)->name, name_len);
192 e->format = &e->name[name_len]; 465 e->format = &e->name[name_len];
193 memcpy(e->format, format, format_len); 466 memcpy(e->format, format, format_len);
194 e->probe = (*entry)->probe; 467 if (strcmp(e->format, MARK_NOARGS) == 0)
195 e->private = (*entry)->private; 468 e->call = marker_probe_cb_noarg;
469 else
470 e->call = marker_probe_cb;
471 e->single = (*entry)->single;
472 e->multi = (*entry)->multi;
473 e->ptype = (*entry)->ptype;
196 e->refcount = (*entry)->refcount; 474 e->refcount = (*entry)->refcount;
475 e->rcu_pending = 0;
197 hlist_add_before(&e->hlist, &(*entry)->hlist); 476 hlist_add_before(&e->hlist, &(*entry)->hlist);
198 hlist_del(&(*entry)->hlist); 477 hlist_del(&(*entry)->hlist);
478 /* Make sure the call_rcu has been executed */
479 if ((*entry)->rcu_pending)
480 rcu_barrier();
199 kfree(*entry); 481 kfree(*entry);
200 *entry = e; 482 *entry = e;
201 trace_mark(core_marker_format, "name %s format %s", 483 trace_mark(core_marker_format, "name %s format %s",
@@ -206,7 +488,8 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
206/* 488/*
207 * Sets the probe callback corresponding to one marker. 489 * Sets the probe callback corresponding to one marker.
208 */ 490 */
209static int set_marker(struct marker_entry **entry, struct marker *elem) 491static int set_marker(struct marker_entry **entry, struct marker *elem,
492 int active)
210{ 493{
211 int ret; 494 int ret;
212 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 495 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
@@ -226,9 +509,43 @@ static int set_marker(struct marker_entry **entry, struct marker *elem)
226 if (ret) 509 if (ret)
227 return ret; 510 return ret;
228 } 511 }
229 elem->call = (*entry)->probe; 512
230 elem->private = (*entry)->private; 513 /*
231 elem->state = 1; 514 * probe_cb setup (statically known) is done here. It is
515 * asynchronous with the rest of execution, therefore we only
516 * pass from a "safe" callback (with argument) to an "unsafe"
517 * callback (does not set arguments).
518 */
519 elem->call = (*entry)->call;
520 /*
521 * Sanity check :
522 * We only update the single probe private data when the ptr is
523 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
524 */
525 WARN_ON(elem->single.func != __mark_empty_function
526 && elem->single.probe_private
527 != (*entry)->single.probe_private &&
528 !elem->ptype);
529 elem->single.probe_private = (*entry)->single.probe_private;
530 /*
531 * Make sure the private data is valid when we update the
532 * single probe ptr.
533 */
534 smp_wmb();
535 elem->single.func = (*entry)->single.func;
536 /*
537 * We also make sure that the new probe callbacks array is consistent
538 * before setting a pointer to it.
539 */
540 rcu_assign_pointer(elem->multi, (*entry)->multi);
541 /*
542 * Update the function or multi probe array pointer before setting the
543 * ptype.
544 */
545 smp_wmb();
546 elem->ptype = (*entry)->ptype;
547 elem->state = active;
548
232 return 0; 549 return 0;
233} 550}
234 551
@@ -240,8 +557,12 @@ static int set_marker(struct marker_entry **entry, struct marker *elem)
240 */ 557 */
241static void disable_marker(struct marker *elem) 558static void disable_marker(struct marker *elem)
242{ 559{
560 /* leave "call" as is. It is known statically. */
243 elem->state = 0; 561 elem->state = 0;
244 elem->call = __mark_empty_function; 562 elem->single.func = __mark_empty_function;
563 /* Update the function before setting the ptype */
564 smp_wmb();
565 elem->ptype = 0; /* single probe */
245 /* 566 /*
246 * Leave the private data and id there, because removal is racy and 567 * Leave the private data and id there, because removal is racy and
247 * should be done only after a synchronize_sched(). These are never used 568 * should be done only after a synchronize_sched(). These are never used
@@ -253,14 +574,11 @@ static void disable_marker(struct marker *elem)
253 * marker_update_probe_range - Update a probe range 574 * marker_update_probe_range - Update a probe range
254 * @begin: beginning of the range 575 * @begin: beginning of the range
255 * @end: end of the range 576 * @end: end of the range
256 * @probe_module: module address of the probe being updated
257 * @refcount: number of references left to the given probe_module (out)
258 * 577 *
259 * Updates the probe callback corresponding to a range of markers. 578 * Updates the probe callback corresponding to a range of markers.
260 */ 579 */
261void marker_update_probe_range(struct marker *begin, 580void marker_update_probe_range(struct marker *begin,
262 struct marker *end, struct module *probe_module, 581 struct marker *end)
263 int *refcount)
264{ 582{
265 struct marker *iter; 583 struct marker *iter;
266 struct marker_entry *mark_entry; 584 struct marker_entry *mark_entry;
@@ -268,15 +586,12 @@ void marker_update_probe_range(struct marker *begin,
268 mutex_lock(&markers_mutex); 586 mutex_lock(&markers_mutex);
269 for (iter = begin; iter < end; iter++) { 587 for (iter = begin; iter < end; iter++) {
270 mark_entry = get_marker(iter->name); 588 mark_entry = get_marker(iter->name);
271 if (mark_entry && mark_entry->refcount) { 589 if (mark_entry) {
272 set_marker(&mark_entry, iter); 590 set_marker(&mark_entry, iter,
591 !!mark_entry->refcount);
273 /* 592 /*
274 * ignore error, continue 593 * ignore error, continue
275 */ 594 */
276 if (probe_module)
277 if (probe_module ==
278 __module_text_address((unsigned long)mark_entry->probe))
279 (*refcount)++;
280 } else { 595 } else {
281 disable_marker(iter); 596 disable_marker(iter);
282 } 597 }
@@ -289,20 +604,27 @@ void marker_update_probe_range(struct marker *begin,
289 * Issues a synchronize_sched() when no reference to the module passed 604 * Issues a synchronize_sched() when no reference to the module passed
290 * as parameter is found in the probes so the probe module can be 605 * as parameter is found in the probes so the probe module can be
291 * safely unloaded from now on. 606 * safely unloaded from now on.
607 *
608 * Internal callback only changed before the first probe is connected to it.
609 * Single probe private data can only be changed on 0 -> 1 and 2 -> 1
610 * transitions. All other transitions will leave the old private data valid.
611 * This makes the non-atomicity of the callback/private data updates valid.
612 *
613 * "special case" updates :
614 * 0 -> 1 callback
615 * 1 -> 0 callback
616 * 1 -> 2 callbacks
617 * 2 -> 1 callbacks
618 * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates.
619 * Site effect : marker_set_format may delete the marker entry (creating a
620 * replacement).
292 */ 621 */
293static void marker_update_probes(struct module *probe_module) 622static void marker_update_probes(void)
294{ 623{
295 int refcount = 0;
296
297 /* Core kernel markers */ 624 /* Core kernel markers */
298 marker_update_probe_range(__start___markers, 625 marker_update_probe_range(__start___markers, __stop___markers);
299 __stop___markers, probe_module, &refcount);
300 /* Markers in modules. */ 626 /* Markers in modules. */
301 module_update_markers(probe_module, &refcount); 627 module_update_markers();
302 if (probe_module && refcount == 0) {
303 synchronize_sched();
304 deferred_sync = 0;
305 }
306} 628}
307 629
308/** 630/**
@@ -310,33 +632,49 @@ static void marker_update_probes(struct module *probe_module)
310 * @name: marker name 632 * @name: marker name
311 * @format: format string 633 * @format: format string
312 * @probe: probe handler 634 * @probe: probe handler
313 * @private: probe private data 635 * @probe_private: probe private data
314 * 636 *
315 * private data must be a valid allocated memory address, or NULL. 637 * private data must be a valid allocated memory address, or NULL.
316 * Returns 0 if ok, error value on error. 638 * Returns 0 if ok, error value on error.
639 * The probe address must at least be aligned on the architecture pointer size.
317 */ 640 */
318int marker_probe_register(const char *name, const char *format, 641int marker_probe_register(const char *name, const char *format,
319 marker_probe_func *probe, void *private) 642 marker_probe_func *probe, void *probe_private)
320{ 643{
321 struct marker_entry *entry; 644 struct marker_entry *entry;
322 int ret = 0; 645 int ret = 0;
646 struct marker_probe_closure *old;
323 647
324 mutex_lock(&markers_mutex); 648 mutex_lock(&markers_mutex);
325 entry = get_marker(name); 649 entry = get_marker(name);
326 if (entry && entry->refcount) { 650 if (!entry) {
327 ret = -EBUSY; 651 entry = add_marker(name, format);
328 goto end; 652 if (IS_ERR(entry)) {
329 } 653 ret = PTR_ERR(entry);
330 if (deferred_sync) { 654 goto end;
331 synchronize_sched(); 655 }
332 deferred_sync = 0;
333 } 656 }
334 ret = add_marker(name, format, probe, private); 657 /*
335 if (ret) 658 * If we detect that a call_rcu is pending for this marker,
659 * make sure it's executed now.
660 */
661 if (entry->rcu_pending)
662 rcu_barrier();
663 old = marker_entry_add_probe(entry, probe, probe_private);
664 if (IS_ERR(old)) {
665 ret = PTR_ERR(old);
336 goto end; 666 goto end;
667 }
337 mutex_unlock(&markers_mutex); 668 mutex_unlock(&markers_mutex);
338 marker_update_probes(NULL); 669 marker_update_probes(); /* may update entry */
339 return ret; 670 mutex_lock(&markers_mutex);
671 entry = get_marker(name);
672 WARN_ON(!entry);
673 entry->oldptr = old;
674 entry->rcu_pending = 1;
675 /* write rcu_pending before calling the RCU callback */
676 smp_wmb();
677 call_rcu(&entry->rcu, free_old_closure);
340end: 678end:
341 mutex_unlock(&markers_mutex); 679 mutex_unlock(&markers_mutex);
342 return ret; 680 return ret;
@@ -346,171 +684,166 @@ EXPORT_SYMBOL_GPL(marker_probe_register);
346/** 684/**
347 * marker_probe_unregister - Disconnect a probe from a marker 685 * marker_probe_unregister - Disconnect a probe from a marker
348 * @name: marker name 686 * @name: marker name
687 * @probe: probe function pointer
688 * @probe_private: probe private data
349 * 689 *
350 * Returns the private data given to marker_probe_register, or an ERR_PTR(). 690 * Returns the private data given to marker_probe_register, or an ERR_PTR().
691 * We do not need to call a synchronize_sched to make sure the probes have
692 * finished running before doing a module unload, because the module unload
693 * itself uses stop_machine(), which insures that every preempt disabled section
694 * have finished.
351 */ 695 */
352void *marker_probe_unregister(const char *name) 696int marker_probe_unregister(const char *name,
697 marker_probe_func *probe, void *probe_private)
353{ 698{
354 struct module *probe_module;
355 struct marker_entry *entry; 699 struct marker_entry *entry;
356 void *private; 700 struct marker_probe_closure *old;
701 int ret = 0;
357 702
358 mutex_lock(&markers_mutex); 703 mutex_lock(&markers_mutex);
359 entry = get_marker(name); 704 entry = get_marker(name);
360 if (!entry) { 705 if (!entry) {
361 private = ERR_PTR(-ENOENT); 706 ret = -ENOENT;
362 goto end; 707 goto end;
363 } 708 }
364 entry->refcount = 0; 709 if (entry->rcu_pending)
365 /* In what module is the probe handler ? */ 710 rcu_barrier();
366 probe_module = __module_text_address((unsigned long)entry->probe); 711 old = marker_entry_remove_probe(entry, probe, probe_private);
367 private = remove_marker(name);
368 deferred_sync = 1;
369 mutex_unlock(&markers_mutex); 712 mutex_unlock(&markers_mutex);
370 marker_update_probes(probe_module); 713 marker_update_probes(); /* may update entry */
371 return private; 714 mutex_lock(&markers_mutex);
715 entry = get_marker(name);
716 entry->oldptr = old;
717 entry->rcu_pending = 1;
718 /* write rcu_pending before calling the RCU callback */
719 smp_wmb();
720 call_rcu(&entry->rcu, free_old_closure);
721 remove_marker(name); /* Ignore busy error message */
372end: 722end:
373 mutex_unlock(&markers_mutex); 723 mutex_unlock(&markers_mutex);
374 return private; 724 return ret;
375} 725}
376EXPORT_SYMBOL_GPL(marker_probe_unregister); 726EXPORT_SYMBOL_GPL(marker_probe_unregister);
377 727
378/** 728static struct marker_entry *
379 * marker_probe_unregister_private_data - Disconnect a probe from a marker 729get_marker_from_private_data(marker_probe_func *probe, void *probe_private)
380 * @private: probe private data
381 *
382 * Unregister a marker by providing the registered private data.
383 * Returns the private data given to marker_probe_register, or an ERR_PTR().
384 */
385void *marker_probe_unregister_private_data(void *private)
386{ 730{
387 struct module *probe_module;
388 struct hlist_head *head;
389 struct hlist_node *node;
390 struct marker_entry *entry; 731 struct marker_entry *entry;
391 int found = 0;
392 unsigned int i; 732 unsigned int i;
733 struct hlist_head *head;
734 struct hlist_node *node;
393 735
394 mutex_lock(&markers_mutex);
395 for (i = 0; i < MARKER_TABLE_SIZE; i++) { 736 for (i = 0; i < MARKER_TABLE_SIZE; i++) {
396 head = &marker_table[i]; 737 head = &marker_table[i];
397 hlist_for_each_entry(entry, node, head, hlist) { 738 hlist_for_each_entry(entry, node, head, hlist) {
398 if (entry->private == private) { 739 if (!entry->ptype) {
399 found = 1; 740 if (entry->single.func == probe
400 goto iter_end; 741 && entry->single.probe_private
742 == probe_private)
743 return entry;
744 } else {
745 struct marker_probe_closure *closure;
746 closure = entry->multi;
747 for (i = 0; closure[i].func; i++) {
748 if (closure[i].func == probe &&
749 closure[i].probe_private
750 == probe_private)
751 return entry;
752 }
401 } 753 }
402 } 754 }
403 } 755 }
404iter_end: 756 return NULL;
405 if (!found) {
406 private = ERR_PTR(-ENOENT);
407 goto end;
408 }
409 entry->refcount = 0;
410 /* In what module is the probe handler ? */
411 probe_module = __module_text_address((unsigned long)entry->probe);
412 private = remove_marker(entry->name);
413 deferred_sync = 1;
414 mutex_unlock(&markers_mutex);
415 marker_update_probes(probe_module);
416 return private;
417end:
418 mutex_unlock(&markers_mutex);
419 return private;
420} 757}
421EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
422 758
423/** 759/**
424 * marker_arm - Arm a marker 760 * marker_probe_unregister_private_data - Disconnect a probe from a marker
425 * @name: marker name 761 * @probe: probe function
762 * @probe_private: probe private data
426 * 763 *
427 * Activate a marker. It keeps a reference count of the number of 764 * Unregister a probe by providing the registered private data.
428 * arming/disarming done. 765 * Only removes the first marker found in hash table.
429 * Returns 0 if ok, error value on error. 766 * Return 0 on success or error value.
767 * We do not need to call a synchronize_sched to make sure the probes have
768 * finished running before doing a module unload, because the module unload
769 * itself uses stop_machine(), which insures that every preempt disabled section
770 * have finished.
430 */ 771 */
431int marker_arm(const char *name) 772int marker_probe_unregister_private_data(marker_probe_func *probe,
773 void *probe_private)
432{ 774{
433 struct marker_entry *entry; 775 struct marker_entry *entry;
434 int ret = 0; 776 int ret = 0;
777 struct marker_probe_closure *old;
435 778
436 mutex_lock(&markers_mutex); 779 mutex_lock(&markers_mutex);
437 entry = get_marker(name); 780 entry = get_marker_from_private_data(probe, probe_private);
438 if (!entry) { 781 if (!entry) {
439 ret = -ENOENT; 782 ret = -ENOENT;
440 goto end; 783 goto end;
441 } 784 }
442 /* 785 if (entry->rcu_pending)
443 * Only need to update probes when refcount passes from 0 to 1. 786 rcu_barrier();
444 */ 787 old = marker_entry_remove_probe(entry, NULL, probe_private);
445 if (entry->refcount++)
446 goto end;
447end:
448 mutex_unlock(&markers_mutex); 788 mutex_unlock(&markers_mutex);
449 marker_update_probes(NULL); 789 marker_update_probes(); /* may update entry */
450 return ret;
451}
452EXPORT_SYMBOL_GPL(marker_arm);
453
454/**
455 * marker_disarm - Disarm a marker
456 * @name: marker name
457 *
458 * Disarm a marker. It keeps a reference count of the number of arming/disarming
459 * done.
460 * Returns 0 if ok, error value on error.
461 */
462int marker_disarm(const char *name)
463{
464 struct marker_entry *entry;
465 int ret = 0;
466
467 mutex_lock(&markers_mutex); 790 mutex_lock(&markers_mutex);
468 entry = get_marker(name); 791 entry = get_marker_from_private_data(probe, probe_private);
469 if (!entry) { 792 WARN_ON(!entry);
470 ret = -ENOENT; 793 entry->oldptr = old;
471 goto end; 794 entry->rcu_pending = 1;
472 } 795 /* write rcu_pending before calling the RCU callback */
473 /* 796 smp_wmb();
474 * Only permit decrement refcount if higher than 0. 797 call_rcu(&entry->rcu, free_old_closure);
475 * Do probe update only on 1 -> 0 transition. 798 remove_marker(entry->name); /* Ignore busy error message */
476 */
477 if (entry->refcount) {
478 if (--entry->refcount)
479 goto end;
480 } else {
481 ret = -EPERM;
482 goto end;
483 }
484end: 799end:
485 mutex_unlock(&markers_mutex); 800 mutex_unlock(&markers_mutex);
486 marker_update_probes(NULL);
487 return ret; 801 return ret;
488} 802}
489EXPORT_SYMBOL_GPL(marker_disarm); 803EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
490 804
491/** 805/**
492 * marker_get_private_data - Get a marker's probe private data 806 * marker_get_private_data - Get a marker's probe private data
493 * @name: marker name 807 * @name: marker name
808 * @probe: probe to match
809 * @num: get the nth matching probe's private data
494 * 810 *
811 * Returns the nth private data pointer (starting from 0) matching, or an
812 * ERR_PTR.
495 * Returns the private data pointer, or an ERR_PTR. 813 * Returns the private data pointer, or an ERR_PTR.
496 * The private data pointer should _only_ be dereferenced if the caller is the 814 * The private data pointer should _only_ be dereferenced if the caller is the
497 * owner of the data, or its content could vanish. This is mostly used to 815 * owner of the data, or its content could vanish. This is mostly used to
498 * confirm that a caller is the owner of a registered probe. 816 * confirm that a caller is the owner of a registered probe.
499 */ 817 */
500void *marker_get_private_data(const char *name) 818void *marker_get_private_data(const char *name, marker_probe_func *probe,
819 int num)
501{ 820{
502 struct hlist_head *head; 821 struct hlist_head *head;
503 struct hlist_node *node; 822 struct hlist_node *node;
504 struct marker_entry *e; 823 struct marker_entry *e;
505 size_t name_len = strlen(name) + 1; 824 size_t name_len = strlen(name) + 1;
506 u32 hash = jhash(name, name_len-1, 0); 825 u32 hash = jhash(name, name_len-1, 0);
507 int found = 0; 826 int i;
508 827
509 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; 828 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
510 hlist_for_each_entry(e, node, head, hlist) { 829 hlist_for_each_entry(e, node, head, hlist) {
511 if (!strcmp(name, e->name)) { 830 if (!strcmp(name, e->name)) {
512 found = 1; 831 if (!e->ptype) {
513 return e->private; 832 if (num == 0 && e->single.func == probe)
833 return e->single.probe_private;
834 else
835 break;
836 } else {
837 struct marker_probe_closure *closure;
838 int match = 0;
839 closure = e->multi;
840 for (i = 0; closure[i].func; i++) {
841 if (closure[i].func != probe)
842 continue;
843 if (match++ == num)
844 return closure[i].probe_private;
845 }
846 }
514 } 847 }
515 } 848 }
516 return ERR_PTR(-ENOENT); 849 return ERR_PTR(-ENOENT);
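
Worth noting from the marker.c hunks above: marker_entry_add_probe() only rejects an exact (probe, probe_private) duplicate with -EBUSY, so the same probe function can be attached several times with different private data, at which point the entry switches from the single closure to a NULL-terminated closure array that marker_probe_cb() walks. A hedged sketch of that multi-probe case, reusing the invented "subsystem_event" marker from the earlier example; the counters and function names are likewise hypothetical.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/marker.h>
#include <asm/atomic.h>

static atomic_t hits_a = ATOMIC_INIT(0);
static atomic_t hits_b = ATOMIC_INIT(0);

/* One probe body, attached twice with different private counters. */
static void probe_count(void *probe_private, void *call_private,
			const char *fmt, va_list *args)
{
	atomic_inc((atomic_t *)probe_private);
}

static int __init attach_probes(void)
{
	int ret;

	ret = marker_probe_register("subsystem_event", "value %d",
				    probe_count, &hits_a);
	if (ret)
		return ret;
	/*
	 * Same function, different probe_private: accepted, and the
	 * marker entry now holds a two-entry closure array. Only the
	 * exact (probe, probe_private) pair would return -EBUSY.
	 */
	ret = marker_probe_register("subsystem_event", "value %d",
				    probe_count, &hits_b);
	if (ret)
		marker_probe_unregister("subsystem_event",
					probe_count, &hits_a);
	return ret;
}

static void __exit detach_probes(void)
{
	marker_probe_unregister("subsystem_event", probe_count, &hits_a);
	marker_probe_unregister("subsystem_event", probe_count, &hits_b);
}

module_init(attach_probes);
module_exit(detach_probes);
MODULE_LICENSE("GPL");
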
diff --git a/kernel/module.c b/kernel/module.c
index 4202da97a1da..92595bad3812 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2038,7 +2038,7 @@ static struct module *load_module(void __user *umod,
2038#ifdef CONFIG_MARKERS 2038#ifdef CONFIG_MARKERS
2039 if (!mod->taints) 2039 if (!mod->taints)
2040 marker_update_probe_range(mod->markers, 2040 marker_update_probe_range(mod->markers,
2041 mod->markers + mod->num_markers, NULL, NULL); 2041 mod->markers + mod->num_markers);
2042#endif 2042#endif
2043 err = module_finalize(hdr, sechdrs, mod); 2043 err = module_finalize(hdr, sechdrs, mod);
2044 if (err < 0) 2044 if (err < 0)
@@ -2564,7 +2564,7 @@ EXPORT_SYMBOL(struct_module);
2564#endif 2564#endif
2565 2565
2566#ifdef CONFIG_MARKERS 2566#ifdef CONFIG_MARKERS
2567void module_update_markers(struct module *probe_module, int *refcount) 2567void module_update_markers(void)
2568{ 2568{
2569 struct module *mod; 2569 struct module *mod;
2570 2570
@@ -2572,8 +2572,7 @@ void module_update_markers(struct module *probe_module, int *refcount)
2572 list_for_each_entry(mod, &modules, list) 2572 list_for_each_entry(mod, &modules, list)
2573 if (!mod->taints) 2573 if (!mod->taints)
2574 marker_update_probe_range(mod->markers, 2574 marker_update_probe_range(mod->markers,
2575 mod->markers + mod->num_markers, 2575 mod->markers + mod->num_markers);
2576 probe_module, refcount);
2577 mutex_unlock(&module_mutex); 2576 mutex_unlock(&module_mutex);
2578} 2577}
2579#endif 2578#endif
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 760dfc233a00..c09605f8d16c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -56,7 +56,10 @@ static atomic_t rcu_barrier_cpu_count;
56static DEFINE_MUTEX(rcu_barrier_mutex); 56static DEFINE_MUTEX(rcu_barrier_mutex);
57static struct completion rcu_barrier_completion; 57static struct completion rcu_barrier_completion;
58 58
59/* Because of FASTCALL declaration of complete, we use this wrapper */ 59/*
60 * Awaken the corresponding synchronize_rcu() instance now that a
61 * grace period has elapsed.
62 */
60static void wakeme_after_rcu(struct rcu_head *head) 63static void wakeme_after_rcu(struct rcu_head *head)
61{ 64{
62 struct rcu_synchronize *rcu; 65 struct rcu_synchronize *rcu;
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 0deef71ff8d2..6522ae5b14a2 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -630,9 +630,12 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
630 set_current_state(state); 630 set_current_state(state);
631 631
632 /* Setup the timer, when timeout != NULL */ 632 /* Setup the timer, when timeout != NULL */
633 if (unlikely(timeout)) 633 if (unlikely(timeout)) {
634 hrtimer_start(&timeout->timer, timeout->timer.expires, 634 hrtimer_start(&timeout->timer, timeout->timer.expires,
635 HRTIMER_MODE_ABS); 635 HRTIMER_MODE_ABS);
636 if (!hrtimer_active(&timeout->timer))
637 timeout->task = NULL;
638 }
636 639
637 for (;;) { 640 for (;;) {
638 /* Try to acquire the lock: */ 641 /* Try to acquire the lock: */
diff --git a/kernel/sched.c b/kernel/sched.c
index 3eedd5260907..f28f19e65b59 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -155,7 +155,7 @@ struct rt_prio_array {
155 struct list_head queue[MAX_RT_PRIO]; 155 struct list_head queue[MAX_RT_PRIO];
156}; 156};
157 157
158#ifdef CONFIG_FAIR_GROUP_SCHED 158#ifdef CONFIG_GROUP_SCHED
159 159
160#include <linux/cgroup.h> 160#include <linux/cgroup.h>
161 161
@@ -165,19 +165,16 @@ static LIST_HEAD(task_groups);
165 165
166/* task group related information */ 166/* task group related information */
167struct task_group { 167struct task_group {
168#ifdef CONFIG_FAIR_CGROUP_SCHED 168#ifdef CONFIG_CGROUP_SCHED
169 struct cgroup_subsys_state css; 169 struct cgroup_subsys_state css;
170#endif 170#endif
171
172#ifdef CONFIG_FAIR_GROUP_SCHED
171 /* schedulable entities of this group on each cpu */ 173 /* schedulable entities of this group on each cpu */
172 struct sched_entity **se; 174 struct sched_entity **se;
173 /* runqueue "owned" by this group on each cpu */ 175 /* runqueue "owned" by this group on each cpu */
174 struct cfs_rq **cfs_rq; 176 struct cfs_rq **cfs_rq;
175 177
176 struct sched_rt_entity **rt_se;
177 struct rt_rq **rt_rq;
178
179 unsigned int rt_ratio;
180
181 /* 178 /*
182 * shares assigned to a task group governs how much of cpu bandwidth 179 * shares assigned to a task group governs how much of cpu bandwidth
183 * is allocated to the group. The more shares a group has, the more is 180 * is allocated to the group. The more shares a group has, the more is
@@ -213,33 +210,46 @@ struct task_group {
213 * 210 *
214 */ 211 */
215 unsigned long shares; 212 unsigned long shares;
213#endif
214
215#ifdef CONFIG_RT_GROUP_SCHED
216 struct sched_rt_entity **rt_se;
217 struct rt_rq **rt_rq;
218
219 u64 rt_runtime;
220#endif
216 221
217 struct rcu_head rcu; 222 struct rcu_head rcu;
218 struct list_head list; 223 struct list_head list;
219}; 224};
220 225
226#ifdef CONFIG_FAIR_GROUP_SCHED
221/* Default task group's sched entity on each cpu */ 227/* Default task group's sched entity on each cpu */
222static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 228static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
223/* Default task group's cfs_rq on each cpu */ 229/* Default task group's cfs_rq on each cpu */
224static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 230static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
225 231
226static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
227static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
228
229static struct sched_entity *init_sched_entity_p[NR_CPUS]; 232static struct sched_entity *init_sched_entity_p[NR_CPUS];
230static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; 233static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
234#endif
235
236#ifdef CONFIG_RT_GROUP_SCHED
237static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
238static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
231 239
232static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; 240static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS];
233static struct rt_rq *init_rt_rq_p[NR_CPUS]; 241static struct rt_rq *init_rt_rq_p[NR_CPUS];
242#endif
234 243
235/* task_group_mutex serializes add/remove of task groups and also changes to 244/* task_group_lock serializes add/remove of task groups and also changes to
236 * a task group's cpu shares. 245 * a task group's cpu shares.
237 */ 246 */
238static DEFINE_MUTEX(task_group_mutex); 247static DEFINE_SPINLOCK(task_group_lock);
239 248
240/* doms_cur_mutex serializes access to doms_cur[] array */ 249/* doms_cur_mutex serializes access to doms_cur[] array */
241static DEFINE_MUTEX(doms_cur_mutex); 250static DEFINE_MUTEX(doms_cur_mutex);
242 251
252#ifdef CONFIG_FAIR_GROUP_SCHED
243#ifdef CONFIG_SMP 253#ifdef CONFIG_SMP
244/* kernel thread that runs rebalance_shares() periodically */ 254/* kernel thread that runs rebalance_shares() periodically */
245static struct task_struct *lb_monitor_task; 255static struct task_struct *lb_monitor_task;
@@ -248,35 +258,40 @@ static int load_balance_monitor(void *unused);
248 258
249static void set_se_shares(struct sched_entity *se, unsigned long shares); 259static void set_se_shares(struct sched_entity *se, unsigned long shares);
250 260
261#ifdef CONFIG_USER_SCHED
262# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
263#else
264# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
265#endif
266
267#define MIN_GROUP_SHARES 2
268
269static int init_task_group_load = INIT_TASK_GROUP_LOAD;
270#endif
271
251/* Default task group. 272/* Default task group.
252 * Every task in system belong to this group at bootup. 273 * Every task in system belong to this group at bootup.
253 */ 274 */
254struct task_group init_task_group = { 275struct task_group init_task_group = {
276#ifdef CONFIG_FAIR_GROUP_SCHED
255 .se = init_sched_entity_p, 277 .se = init_sched_entity_p,
256 .cfs_rq = init_cfs_rq_p, 278 .cfs_rq = init_cfs_rq_p,
279#endif
257 280
281#ifdef CONFIG_RT_GROUP_SCHED
258 .rt_se = init_sched_rt_entity_p, 282 .rt_se = init_sched_rt_entity_p,
259 .rt_rq = init_rt_rq_p, 283 .rt_rq = init_rt_rq_p,
260};
261
262#ifdef CONFIG_FAIR_USER_SCHED
263# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
264#else
265# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
266#endif 284#endif
267 285};
268#define MIN_GROUP_SHARES 2
269
270static int init_task_group_load = INIT_TASK_GROUP_LOAD;
271 286
272/* return group to which a task belongs */ 287/* return group to which a task belongs */
273static inline struct task_group *task_group(struct task_struct *p) 288static inline struct task_group *task_group(struct task_struct *p)
274{ 289{
275 struct task_group *tg; 290 struct task_group *tg;
276 291
277#ifdef CONFIG_FAIR_USER_SCHED 292#ifdef CONFIG_USER_SCHED
278 tg = p->user->tg; 293 tg = p->user->tg;
279#elif defined(CONFIG_FAIR_CGROUP_SCHED) 294#elif defined(CONFIG_CGROUP_SCHED)
280 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 295 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
281 struct task_group, css); 296 struct task_group, css);
282#else 297#else
@@ -288,21 +303,15 @@ static inline struct task_group *task_group(struct task_struct *p)
288/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 303/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
289static inline void set_task_rq(struct task_struct *p, unsigned int cpu) 304static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
290{ 305{
306#ifdef CONFIG_FAIR_GROUP_SCHED
291 p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; 307 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
292 p->se.parent = task_group(p)->se[cpu]; 308 p->se.parent = task_group(p)->se[cpu];
309#endif
293 310
311#ifdef CONFIG_RT_GROUP_SCHED
294 p->rt.rt_rq = task_group(p)->rt_rq[cpu]; 312 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
295 p->rt.parent = task_group(p)->rt_se[cpu]; 313 p->rt.parent = task_group(p)->rt_se[cpu];
296} 314#endif
297
298static inline void lock_task_group_list(void)
299{
300 mutex_lock(&task_group_mutex);
301}
302
303static inline void unlock_task_group_list(void)
304{
305 mutex_unlock(&task_group_mutex);
306} 315}
307 316
308static inline void lock_doms_cur(void) 317static inline void lock_doms_cur(void)
@@ -318,12 +327,10 @@ static inline void unlock_doms_cur(void)
318#else 327#else
319 328
320static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 329static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
321static inline void lock_task_group_list(void) { }
322static inline void unlock_task_group_list(void) { }
323static inline void lock_doms_cur(void) { } 330static inline void lock_doms_cur(void) { }
324static inline void unlock_doms_cur(void) { } 331static inline void unlock_doms_cur(void) { }
325 332
326#endif /* CONFIG_FAIR_GROUP_SCHED */ 333#endif /* CONFIG_GROUP_SCHED */
327 334
328/* CFS-related fields in a runqueue */ 335/* CFS-related fields in a runqueue */
329struct cfs_rq { 336struct cfs_rq {
@@ -363,7 +370,7 @@ struct cfs_rq {
363struct rt_rq { 370struct rt_rq {
364 struct rt_prio_array active; 371 struct rt_prio_array active;
365 unsigned long rt_nr_running; 372 unsigned long rt_nr_running;
366#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED 373#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
367 int highest_prio; /* highest queued rt task prio */ 374 int highest_prio; /* highest queued rt task prio */
368#endif 375#endif
369#ifdef CONFIG_SMP 376#ifdef CONFIG_SMP
@@ -373,7 +380,9 @@ struct rt_rq {
373 int rt_throttled; 380 int rt_throttled;
374 u64 rt_time; 381 u64 rt_time;
375 382
376#ifdef CONFIG_FAIR_GROUP_SCHED 383#ifdef CONFIG_RT_GROUP_SCHED
384 unsigned long rt_nr_boosted;
385
377 struct rq *rq; 386 struct rq *rq;
378 struct list_head leaf_rt_rq_list; 387 struct list_head leaf_rt_rq_list;
379 struct task_group *tg; 388 struct task_group *tg;
@@ -447,6 +456,8 @@ struct rq {
447#ifdef CONFIG_FAIR_GROUP_SCHED 456#ifdef CONFIG_FAIR_GROUP_SCHED
448 /* list of leaf cfs_rq on this cpu: */ 457 /* list of leaf cfs_rq on this cpu: */
449 struct list_head leaf_cfs_rq_list; 458 struct list_head leaf_cfs_rq_list;
459#endif
460#ifdef CONFIG_RT_GROUP_SCHED
450 struct list_head leaf_rt_rq_list; 461 struct list_head leaf_rt_rq_list;
451#endif 462#endif
452 463
@@ -652,19 +663,21 @@ const_debug unsigned int sysctl_sched_features =
652const_debug unsigned int sysctl_sched_nr_migrate = 32; 663const_debug unsigned int sysctl_sched_nr_migrate = 32;
653 664
654/* 665/*
655 * period over which we measure -rt task cpu usage in ms. 666 * period over which we measure -rt task cpu usage in us.
656 * default: 1s 667 * default: 1s
657 */ 668 */
658const_debug unsigned int sysctl_sched_rt_period = 1000; 669unsigned int sysctl_sched_rt_period = 1000000;
659 670
660#define SCHED_RT_FRAC_SHIFT 16 671/*
661#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT) 672 * part of the period that we allow rt tasks to run in us.
673 * default: 0.95s
674 */
675int sysctl_sched_rt_runtime = 950000;
662 676
663/* 677/*
664 * ratio of time -rt tasks may consume. 678 * single value that denotes runtime == period, ie unlimited time.
665 * default: 95%
666 */ 679 */
667const_debug unsigned int sysctl_sched_rt_ratio = 62259; 680#define RUNTIME_INF ((u64)~0ULL)
668 681
669/* 682/*
670 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 683 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
@@ -4571,6 +4584,15 @@ recheck:
4571 return -EPERM; 4584 return -EPERM;
4572 } 4585 }
4573 4586
4587#ifdef CONFIG_RT_GROUP_SCHED
4588 /*
4589 * Do not allow realtime tasks into groups that have no runtime
4590 * assigned.
4591 */
4592 if (rt_policy(policy) && task_group(p)->rt_runtime == 0)
4593 return -EPERM;
4594#endif
4595
4574 retval = security_task_setscheduler(p, policy, param); 4596 retval = security_task_setscheduler(p, policy, param);
4575 if (retval) 4597 if (retval)
4576 return retval; 4598 return retval;
@@ -7112,7 +7134,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7112 /* delimiter for bitsearch: */ 7134 /* delimiter for bitsearch: */
7113 __set_bit(MAX_RT_PRIO, array->bitmap); 7135 __set_bit(MAX_RT_PRIO, array->bitmap);
7114 7136
7115#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED 7137#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
7116 rt_rq->highest_prio = MAX_RT_PRIO; 7138 rt_rq->highest_prio = MAX_RT_PRIO;
7117#endif 7139#endif
7118#ifdef CONFIG_SMP 7140#ifdef CONFIG_SMP
@@ -7123,7 +7145,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7123 rt_rq->rt_time = 0; 7145 rt_rq->rt_time = 0;
7124 rt_rq->rt_throttled = 0; 7146 rt_rq->rt_throttled = 0;
7125 7147
7126#ifdef CONFIG_FAIR_GROUP_SCHED 7148#ifdef CONFIG_RT_GROUP_SCHED
7149 rt_rq->rt_nr_boosted = 0;
7127 rt_rq->rq = rq; 7150 rt_rq->rq = rq;
7128#endif 7151#endif
7129} 7152}
@@ -7146,7 +7169,9 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
7146 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); 7169 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
7147 se->parent = NULL; 7170 se->parent = NULL;
7148} 7171}
7172#endif
7149 7173
7174#ifdef CONFIG_RT_GROUP_SCHED
7150static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, 7175static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
7151 struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, 7176 struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
7152 int cpu, int add) 7177 int cpu, int add)
@@ -7175,7 +7200,7 @@ void __init sched_init(void)
7175 init_defrootdomain(); 7200 init_defrootdomain();
7176#endif 7201#endif
7177 7202
7178#ifdef CONFIG_FAIR_GROUP_SCHED 7203#ifdef CONFIG_GROUP_SCHED
7179 list_add(&init_task_group.list, &task_groups); 7204 list_add(&init_task_group.list, &task_groups);
7180#endif 7205#endif
7181 7206
@@ -7196,7 +7221,10 @@ void __init sched_init(void)
7196 &per_cpu(init_cfs_rq, i), 7221 &per_cpu(init_cfs_rq, i),
7197 &per_cpu(init_sched_entity, i), i, 1); 7222 &per_cpu(init_sched_entity, i), i, 1);
7198 7223
7199 init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */ 7224#endif
7225#ifdef CONFIG_RT_GROUP_SCHED
7226 init_task_group.rt_runtime =
7227 sysctl_sched_rt_runtime * NSEC_PER_USEC;
7200 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7228 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7201 init_tg_rt_entry(rq, &init_task_group, 7229 init_tg_rt_entry(rq, &init_task_group,
7202 &per_cpu(init_rt_rq, i), 7230 &per_cpu(init_rt_rq, i),
@@ -7303,7 +7331,7 @@ void normalize_rt_tasks(void)
7303 unsigned long flags; 7331 unsigned long flags;
7304 struct rq *rq; 7332 struct rq *rq;
7305 7333
7306 read_lock_irq(&tasklist_lock); 7334 read_lock_irqsave(&tasklist_lock, flags);
7307 do_each_thread(g, p) { 7335 do_each_thread(g, p) {
7308 /* 7336 /*
7309 * Only normalize user tasks: 7337 * Only normalize user tasks:
@@ -7329,16 +7357,16 @@ void normalize_rt_tasks(void)
7329 continue; 7357 continue;
7330 } 7358 }
7331 7359
7332 spin_lock_irqsave(&p->pi_lock, flags); 7360 spin_lock(&p->pi_lock);
7333 rq = __task_rq_lock(p); 7361 rq = __task_rq_lock(p);
7334 7362
7335 normalize_task(rq, p); 7363 normalize_task(rq, p);
7336 7364
7337 __task_rq_unlock(rq); 7365 __task_rq_unlock(rq);
7338 spin_unlock_irqrestore(&p->pi_lock, flags); 7366 spin_unlock(&p->pi_lock);
7339 } while_each_thread(g, p); 7367 } while_each_thread(g, p);
7340 7368
7341 read_unlock_irq(&tasklist_lock); 7369 read_unlock_irqrestore(&tasklist_lock, flags);
7342} 7370}
7343 7371
7344#endif /* CONFIG_MAGIC_SYSRQ */ 7372#endif /* CONFIG_MAGIC_SYSRQ */
@@ -7387,9 +7415,9 @@ void set_curr_task(int cpu, struct task_struct *p)
7387 7415
7388#endif 7416#endif
7389 7417
7390#ifdef CONFIG_FAIR_GROUP_SCHED 7418#ifdef CONFIG_GROUP_SCHED
7391 7419
7392#ifdef CONFIG_SMP 7420#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
7393/* 7421/*
7394 * distribute shares of all task groups among their schedulable entities, 7422 * distribute shares of all task groups among their schedulable entities,
7395 * to reflect load distribution across cpus. 7423 * to reflect load distribution across cpus.
@@ -7540,7 +7568,8 @@ static int load_balance_monitor(void *unused)
7540} 7568}
7541#endif /* CONFIG_SMP */ 7569#endif /* CONFIG_SMP */
7542 7570
7543static void free_sched_group(struct task_group *tg) 7571#ifdef CONFIG_FAIR_GROUP_SCHED
7572static void free_fair_sched_group(struct task_group *tg)
7544{ 7573{
7545 int i; 7574 int i;
7546 7575
@@ -7549,49 +7578,27 @@ static void free_sched_group(struct task_group *tg)
7549 kfree(tg->cfs_rq[i]); 7578 kfree(tg->cfs_rq[i]);
7550 if (tg->se) 7579 if (tg->se)
7551 kfree(tg->se[i]); 7580 kfree(tg->se[i]);
7552 if (tg->rt_rq)
7553 kfree(tg->rt_rq[i]);
7554 if (tg->rt_se)
7555 kfree(tg->rt_se[i]);
7556 } 7581 }
7557 7582
7558 kfree(tg->cfs_rq); 7583 kfree(tg->cfs_rq);
7559 kfree(tg->se); 7584 kfree(tg->se);
7560 kfree(tg->rt_rq);
7561 kfree(tg->rt_se);
7562 kfree(tg);
7563} 7585}
7564 7586
7565/* allocate runqueue etc for a new task group */ 7587static int alloc_fair_sched_group(struct task_group *tg)
7566struct task_group *sched_create_group(void)
7567{ 7588{
7568 struct task_group *tg;
7569 struct cfs_rq *cfs_rq; 7589 struct cfs_rq *cfs_rq;
7570 struct sched_entity *se; 7590 struct sched_entity *se;
7571 struct rt_rq *rt_rq;
7572 struct sched_rt_entity *rt_se;
7573 struct rq *rq; 7591 struct rq *rq;
7574 int i; 7592 int i;
7575 7593
7576 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7577 if (!tg)
7578 return ERR_PTR(-ENOMEM);
7579
7580 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); 7594 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL);
7581 if (!tg->cfs_rq) 7595 if (!tg->cfs_rq)
7582 goto err; 7596 goto err;
7583 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); 7597 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
7584 if (!tg->se) 7598 if (!tg->se)
7585 goto err; 7599 goto err;
7586 tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
7587 if (!tg->rt_rq)
7588 goto err;
7589 tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
7590 if (!tg->rt_se)
7591 goto err;
7592 7600
7593 tg->shares = NICE_0_LOAD; 7601 tg->shares = NICE_0_LOAD;
7594 tg->rt_ratio = 0; /* XXX */
7595 7602
7596 for_each_possible_cpu(i) { 7603 for_each_possible_cpu(i) {
7597 rq = cpu_rq(i); 7604 rq = cpu_rq(i);
@@ -7606,6 +7613,79 @@ struct task_group *sched_create_group(void)
7606 if (!se) 7613 if (!se)
7607 goto err; 7614 goto err;
7608 7615
7616 init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
7617 }
7618
7619 return 1;
7620
7621 err:
7622 return 0;
7623}
7624
7625static inline void register_fair_sched_group(struct task_group *tg, int cpu)
7626{
7627 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
7628 &cpu_rq(cpu)->leaf_cfs_rq_list);
7629}
7630
7631static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
7632{
7633 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
7634}
7635#else
7636static inline void free_fair_sched_group(struct task_group *tg)
7637{
7638}
7639
7640static inline int alloc_fair_sched_group(struct task_group *tg)
7641{
7642 return 1;
7643}
7644
7645static inline void register_fair_sched_group(struct task_group *tg, int cpu)
7646{
7647}
7648
7649static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
7650{
7651}
7652#endif
7653
7654#ifdef CONFIG_RT_GROUP_SCHED
7655static void free_rt_sched_group(struct task_group *tg)
7656{
7657 int i;
7658
7659 for_each_possible_cpu(i) {
7660 if (tg->rt_rq)
7661 kfree(tg->rt_rq[i]);
7662 if (tg->rt_se)
7663 kfree(tg->rt_se[i]);
7664 }
7665
7666 kfree(tg->rt_rq);
7667 kfree(tg->rt_se);
7668}
7669
7670static int alloc_rt_sched_group(struct task_group *tg)
7671{
7672 struct rt_rq *rt_rq;
7673 struct sched_rt_entity *rt_se;
7674 struct rq *rq;
7675 int i;
7676
7677 tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
7678 if (!tg->rt_rq)
7679 goto err;
7680 tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
7681 if (!tg->rt_se)
7682 goto err;
7683
7684 tg->rt_runtime = 0;
7685
7686 for_each_possible_cpu(i) {
7687 rq = cpu_rq(i);
7688
7609 rt_rq = kmalloc_node(sizeof(struct rt_rq), 7689 rt_rq = kmalloc_node(sizeof(struct rt_rq),
7610 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 7690 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7611 if (!rt_rq) 7691 if (!rt_rq)
@@ -7616,20 +7696,75 @@ struct task_group *sched_create_group(void)
7616 if (!rt_se) 7696 if (!rt_se)
7617 goto err; 7697 goto err;
7618 7698
7619 init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
7620 init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); 7699 init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
7621 } 7700 }
7622 7701
7623 lock_task_group_list(); 7702 return 1;
7703
7704 err:
7705 return 0;
7706}
7707
7708static inline void register_rt_sched_group(struct task_group *tg, int cpu)
7709{
7710 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
7711 &cpu_rq(cpu)->leaf_rt_rq_list);
7712}
7713
7714static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
7715{
7716 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
7717}
7718#else
7719static inline void free_rt_sched_group(struct task_group *tg)
7720{
7721}
7722
7723static inline int alloc_rt_sched_group(struct task_group *tg)
7724{
7725 return 1;
7726}
7727
7728static inline void register_rt_sched_group(struct task_group *tg, int cpu)
7729{
7730}
7731
7732static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
7733{
7734}
7735#endif
7736
7737static void free_sched_group(struct task_group *tg)
7738{
7739 free_fair_sched_group(tg);
7740 free_rt_sched_group(tg);
7741 kfree(tg);
7742}
7743
7744/* allocate runqueue etc for a new task group */
7745struct task_group *sched_create_group(void)
7746{
7747 struct task_group *tg;
7748 unsigned long flags;
7749 int i;
7750
7751 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7752 if (!tg)
7753 return ERR_PTR(-ENOMEM);
7754
7755 if (!alloc_fair_sched_group(tg))
7756 goto err;
7757
7758 if (!alloc_rt_sched_group(tg))
7759 goto err;
7760
7761 spin_lock_irqsave(&task_group_lock, flags);
7624 for_each_possible_cpu(i) { 7762 for_each_possible_cpu(i) {
7625 rq = cpu_rq(i); 7763 register_fair_sched_group(tg, i);
7626 cfs_rq = tg->cfs_rq[i]; 7764 register_rt_sched_group(tg, i);
7627 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7628 rt_rq = tg->rt_rq[i];
7629 list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7630 } 7765 }
7631 list_add_rcu(&tg->list, &task_groups); 7766 list_add_rcu(&tg->list, &task_groups);
7632 unlock_task_group_list(); 7767 spin_unlock_irqrestore(&task_group_lock, flags);
7633 7768
7634 return tg; 7769 return tg;
7635 7770
@@ -7648,21 +7783,16 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
7648/* Destroy runqueue etc associated with a task group */ 7783/* Destroy runqueue etc associated with a task group */
7649void sched_destroy_group(struct task_group *tg) 7784void sched_destroy_group(struct task_group *tg)
7650{ 7785{
7651 struct cfs_rq *cfs_rq = NULL; 7786 unsigned long flags;
7652 struct rt_rq *rt_rq = NULL;
7653 int i; 7787 int i;
7654 7788
7655 lock_task_group_list(); 7789 spin_lock_irqsave(&task_group_lock, flags);
7656 for_each_possible_cpu(i) { 7790 for_each_possible_cpu(i) {
7657 cfs_rq = tg->cfs_rq[i]; 7791 unregister_fair_sched_group(tg, i);
7658 list_del_rcu(&cfs_rq->leaf_cfs_rq_list); 7792 unregister_rt_sched_group(tg, i);
7659 rt_rq = tg->rt_rq[i];
7660 list_del_rcu(&rt_rq->leaf_rt_rq_list);
7661 } 7793 }
7662 list_del_rcu(&tg->list); 7794 list_del_rcu(&tg->list);
7663 unlock_task_group_list(); 7795 spin_unlock_irqrestore(&task_group_lock, flags);
7664
7665 BUG_ON(!cfs_rq);
7666 7796
7667 /* wait for possible concurrent references to cfs_rqs complete */ 7797 /* wait for possible concurrent references to cfs_rqs complete */
7668 call_rcu(&tg->rcu, free_sched_group_rcu); 7798 call_rcu(&tg->rcu, free_sched_group_rcu);
@@ -7703,6 +7833,7 @@ void sched_move_task(struct task_struct *tsk)
7703 task_rq_unlock(rq, &flags); 7833 task_rq_unlock(rq, &flags);
7704} 7834}
7705 7835
7836#ifdef CONFIG_FAIR_GROUP_SCHED
7706/* rq->lock to be locked by caller */ 7837/* rq->lock to be locked by caller */
7707static void set_se_shares(struct sched_entity *se, unsigned long shares) 7838static void set_se_shares(struct sched_entity *se, unsigned long shares)
7708{ 7839{
@@ -7728,13 +7859,14 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
7728 } 7859 }
7729} 7860}
7730 7861
7862static DEFINE_MUTEX(shares_mutex);
7863
7731int sched_group_set_shares(struct task_group *tg, unsigned long shares) 7864int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7732{ 7865{
7733 int i; 7866 int i;
7734 struct cfs_rq *cfs_rq; 7867 unsigned long flags;
7735 struct rq *rq;
7736 7868
7737 lock_task_group_list(); 7869 mutex_lock(&shares_mutex);
7738 if (tg->shares == shares) 7870 if (tg->shares == shares)
7739 goto done; 7871 goto done;
7740 7872
@@ -7746,10 +7878,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7746 * load_balance_fair) from referring to this group first, 7878 * load_balance_fair) from referring to this group first,
7747 * by taking it off the rq->leaf_cfs_rq_list on each cpu. 7879 * by taking it off the rq->leaf_cfs_rq_list on each cpu.
7748 */ 7880 */
7749 for_each_possible_cpu(i) { 7881 spin_lock_irqsave(&task_group_lock, flags);
7750 cfs_rq = tg->cfs_rq[i]; 7882 for_each_possible_cpu(i)
7751 list_del_rcu(&cfs_rq->leaf_cfs_rq_list); 7883 unregister_fair_sched_group(tg, i);
7752 } 7884 spin_unlock_irqrestore(&task_group_lock, flags);
7753 7885
7754 /* wait for any ongoing reference to this group to finish */ 7886 /* wait for any ongoing reference to this group to finish */
7755 synchronize_sched(); 7887 synchronize_sched();
@@ -7769,13 +7901,12 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7769 * Enable load balance activity on this group, by inserting it back on 7901 * Enable load balance activity on this group, by inserting it back on
7770 * each cpu's rq->leaf_cfs_rq_list. 7902 * each cpu's rq->leaf_cfs_rq_list.
7771 */ 7903 */
7772 for_each_possible_cpu(i) { 7904 spin_lock_irqsave(&task_group_lock, flags);
7773 rq = cpu_rq(i); 7905 for_each_possible_cpu(i)
7774 cfs_rq = tg->cfs_rq[i]; 7906 register_fair_sched_group(tg, i);
7775 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 7907 spin_unlock_irqrestore(&task_group_lock, flags);
7776 }
7777done: 7908done:
7778 unlock_task_group_list(); 7909 mutex_unlock(&shares_mutex);
7779 return 0; 7910 return 0;
7780} 7911}
7781 7912
@@ -7783,35 +7914,84 @@ unsigned long sched_group_shares(struct task_group *tg)
7783{ 7914{
7784 return tg->shares; 7915 return tg->shares;
7785} 7916}
7917#endif
7786 7918
7919#ifdef CONFIG_RT_GROUP_SCHED
7787/* 7920/*
7788 * Ensure the total rt_ratio <= sysctl_sched_rt_ratio 7921 * Ensure that the real time constraints are schedulable.
7789 */ 7922 */
7790int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio) 7923static DEFINE_MUTEX(rt_constraints_mutex);
7924
7925static unsigned long to_ratio(u64 period, u64 runtime)
7926{
7927 if (runtime == RUNTIME_INF)
7928 return 1ULL << 16;
7929
7930 runtime *= (1ULL << 16);
7931 div64_64(runtime, period);
7932 return runtime;
7933}
7934
7935static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7791{ 7936{
7792 struct task_group *tgi; 7937 struct task_group *tgi;
7793 unsigned long total = 0; 7938 unsigned long total = 0;
7939 unsigned long global_ratio =
7940 to_ratio(sysctl_sched_rt_period,
7941 sysctl_sched_rt_runtime < 0 ?
7942 RUNTIME_INF : sysctl_sched_rt_runtime);
7794 7943
7795 rcu_read_lock(); 7944 rcu_read_lock();
7796 list_for_each_entry_rcu(tgi, &task_groups, list) 7945 list_for_each_entry_rcu(tgi, &task_groups, list) {
7797 total += tgi->rt_ratio; 7946 if (tgi == tg)
7798 rcu_read_unlock(); 7947 continue;
7799 7948
7800 if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio) 7949 total += to_ratio(period, tgi->rt_runtime);
7801 return -EINVAL; 7950 }
7951 rcu_read_unlock();
7802 7952
7803 tg->rt_ratio = rt_ratio; 7953 return total + to_ratio(period, runtime) < global_ratio;
7804 return 0;
7805} 7954}
7806 7955
7807unsigned long sched_group_rt_ratio(struct task_group *tg) 7956int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7808{ 7957{
7809 return tg->rt_ratio; 7958 u64 rt_runtime, rt_period;
7959 int err = 0;
7960
7961 rt_period = sysctl_sched_rt_period * NSEC_PER_USEC;
7962 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7963 if (rt_runtime_us == -1)
7964 rt_runtime = rt_period;
7965
7966 mutex_lock(&rt_constraints_mutex);
7967 if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
7968 err = -EINVAL;
7969 goto unlock;
7970 }
7971 if (rt_runtime_us == -1)
7972 rt_runtime = RUNTIME_INF;
7973 tg->rt_runtime = rt_runtime;
7974 unlock:
7975 mutex_unlock(&rt_constraints_mutex);
7976
7977 return err;
7810} 7978}
7811 7979
7812#endif /* CONFIG_FAIR_GROUP_SCHED */ 7980long sched_group_rt_runtime(struct task_group *tg)
7981{
7982 u64 rt_runtime_us;
7983
7984 if (tg->rt_runtime == RUNTIME_INF)
7985 return -1;
7986
7987 rt_runtime_us = tg->rt_runtime;
7988 do_div(rt_runtime_us, NSEC_PER_USEC);
7989 return rt_runtime_us;
7990}
7991#endif
7992#endif /* CONFIG_GROUP_SCHED */
7813 7993
7814#ifdef CONFIG_FAIR_CGROUP_SCHED 7994#ifdef CONFIG_CGROUP_SCHED
7815 7995
7816/* return corresponding task_group object of a cgroup */ 7996/* return corresponding task_group object of a cgroup */
7817static inline struct task_group *cgroup_tg(struct cgroup *cgrp) 7997static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
@@ -7857,9 +8037,15 @@ static int
7857cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 8037cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7858 struct task_struct *tsk) 8038 struct task_struct *tsk)
7859{ 8039{
8040#ifdef CONFIG_RT_GROUP_SCHED
8041 /* Don't accept realtime tasks when there is no way for them to run */
8042 if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0)
8043 return -EINVAL;
8044#else
7860 /* We don't support RT-tasks being in separate groups */ 8045 /* We don't support RT-tasks being in separate groups */
7861 if (tsk->sched_class != &fair_sched_class) 8046 if (tsk->sched_class != &fair_sched_class)
7862 return -EINVAL; 8047 return -EINVAL;
8048#endif
7863 8049
7864 return 0; 8050 return 0;
7865} 8051}
@@ -7871,6 +8057,7 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7871 sched_move_task(tsk); 8057 sched_move_task(tsk);
7872} 8058}
7873 8059
8060#ifdef CONFIG_FAIR_GROUP_SCHED
7874static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype, 8061static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7875 u64 shareval) 8062 u64 shareval)
7876{ 8063{
@@ -7883,31 +8070,70 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
7883 8070
7884 return (u64) tg->shares; 8071 return (u64) tg->shares;
7885} 8072}
8073#endif
7886 8074
7887static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype, 8075#ifdef CONFIG_RT_GROUP_SCHED
7888 u64 rt_ratio_val) 8076static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
8077 struct file *file,
8078 const char __user *userbuf,
8079 size_t nbytes, loff_t *unused_ppos)
7889{ 8080{
7890 return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val); 8081 char buffer[64];
8082 int retval = 0;
8083 s64 val;
8084 char *end;
8085
8086 if (!nbytes)
8087 return -EINVAL;
8088 if (nbytes >= sizeof(buffer))
8089 return -E2BIG;
8090 if (copy_from_user(buffer, userbuf, nbytes))
8091 return -EFAULT;
8092
8093 buffer[nbytes] = 0; /* nul-terminate */
8094
8095 /* strip newline if necessary */
8096 if (nbytes && (buffer[nbytes-1] == '\n'))
8097 buffer[nbytes-1] = 0;
8098 val = simple_strtoll(buffer, &end, 0);
8099 if (*end)
8100 return -EINVAL;
8101
8102 /* Pass to subsystem */
8103 retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
8104 if (!retval)
8105 retval = nbytes;
8106 return retval;
7891} 8107}
7892 8108
7893static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft) 8109static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft,
8110 struct file *file,
8111 char __user *buf, size_t nbytes,
8112 loff_t *ppos)
7894{ 8113{
7895 struct task_group *tg = cgroup_tg(cgrp); 8114 char tmp[64];
8115 long val = sched_group_rt_runtime(cgroup_tg(cgrp));
8116 int len = sprintf(tmp, "%ld\n", val);
7896 8117
7897 return (u64) tg->rt_ratio; 8118 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
7898} 8119}
8120#endif
7899 8121
7900static struct cftype cpu_files[] = { 8122static struct cftype cpu_files[] = {
8123#ifdef CONFIG_FAIR_GROUP_SCHED
7901 { 8124 {
7902 .name = "shares", 8125 .name = "shares",
7903 .read_uint = cpu_shares_read_uint, 8126 .read_uint = cpu_shares_read_uint,
7904 .write_uint = cpu_shares_write_uint, 8127 .write_uint = cpu_shares_write_uint,
7905 }, 8128 },
8129#endif
8130#ifdef CONFIG_RT_GROUP_SCHED
7906 { 8131 {
7907 .name = "rt_ratio", 8132 .name = "rt_runtime_us",
7908 .read_uint = cpu_rt_ratio_read_uint, 8133 .read = cpu_rt_runtime_read,
7909 .write_uint = cpu_rt_ratio_write_uint, 8134 .write = cpu_rt_runtime_write,
7910 }, 8135 },
8136#endif
7911}; 8137};
7912 8138
7913static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 8139static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -7926,7 +8152,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7926 .early_init = 1, 8152 .early_init = 1,
7927}; 8153};
7928 8154
7929#endif /* CONFIG_FAIR_CGROUP_SCHED */ 8155#endif /* CONFIG_CGROUP_SCHED */
7930 8156
7931#ifdef CONFIG_CGROUP_CPUACCT 8157#ifdef CONFIG_CGROUP_CPUACCT
7932 8158
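
The sched.c hunks above replace the per-group rt_ratio with an absolute rt_runtime and gate any change through __rt_schedulable(), which sums every group's runtime/period ratio in 16-bit fixed point (to_ratio()) and compares the total against the global budget derived from sysctl_sched_rt_period and sysctl_sched_rt_runtime. What follows is a minimal user-space sketch of that admission arithmetic, not the kernel code itself: plain 64-bit division stands in for div64_64(), a flat array replaces the RCU-protected task_groups list, and struct group_sketch, rt_schedulable() and the numeric budgets are invented names and values for illustration only.

#include <stdint.h>
#include <stdio.h>

#define RUNTIME_INF	((uint64_t)~0ULL)

/* 16-bit fixed-point ratio of runtime to period, in the spirit of to_ratio() */
static unsigned long to_ratio(uint64_t period, uint64_t runtime)
{
	if (runtime == RUNTIME_INF)
		return 1UL << 16;

	return (unsigned long)((runtime << 16) / period);
}

/* hypothetical stand-in for a task_group's RT budget */
struct group_sketch {
	uint64_t rt_runtime;	/* ns per period, or RUNTIME_INF */
};

/*
 * Admission test modelled on __rt_schedulable(): the candidate group's new
 * budget plus every other group's existing budget must stay below the
 * global ratio.  'self' is skipped, mirroring the "tgi == tg" check.
 */
static int rt_schedulable(struct group_sketch *grp, int nr, int self,
			  uint64_t period, uint64_t new_runtime,
			  uint64_t glob_period, int64_t glob_runtime)
{
	unsigned long total = 0;
	unsigned long global = to_ratio(glob_period,
			glob_runtime < 0 ? RUNTIME_INF : (uint64_t)glob_runtime);
	int i;

	for (i = 0; i < nr; i++) {
		if (i == self)
			continue;
		total += to_ratio(period, grp[i].rt_runtime);
	}

	return total + to_ratio(period, new_runtime) < global;
}

int main(void)
{
	/* two groups holding 100ms and 200ms of RT time per 1s period */
	struct group_sketch grp[2] = { { 100000000ULL }, { 200000000ULL } };

	/* may group 0 grow to 500ms, given a 950ms global budget? */
	printf("%s\n", rt_schedulable(grp, 2, 0,
				      1000000000ULL, 500000000ULL,
				      1000000000ULL, 950000000LL)
			? "admitted" : "rejected");
	return 0;
}

Because to_ratio() only cares about the quotient, units cancel within each call; that is why sched_group_set_rt_runtime() can feed nanosecond group budgets into the same check that uses the microsecond sysctls for the global ratio.
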
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 274b40d7bef2..f54792b175b2 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -55,14 +55,14 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se)
55 return !list_empty(&rt_se->run_list); 55 return !list_empty(&rt_se->run_list);
56} 56}
57 57
58#ifdef CONFIG_FAIR_GROUP_SCHED 58#ifdef CONFIG_RT_GROUP_SCHED
59 59
60static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) 60static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
61{ 61{
62 if (!rt_rq->tg) 62 if (!rt_rq->tg)
63 return SCHED_RT_FRAC; 63 return RUNTIME_INF;
64 64
65 return rt_rq->tg->rt_ratio; 65 return rt_rq->tg->rt_runtime;
66} 66}
67 67
68#define for_each_leaf_rt_rq(rt_rq, rq) \ 68#define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -89,7 +89,7 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
89static void enqueue_rt_entity(struct sched_rt_entity *rt_se); 89static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
90static void dequeue_rt_entity(struct sched_rt_entity *rt_se); 90static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
91 91
92static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) 92static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
93{ 93{
94 struct sched_rt_entity *rt_se = rt_rq->rt_se; 94 struct sched_rt_entity *rt_se = rt_rq->rt_se;
95 95
@@ -102,7 +102,7 @@ static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
102 } 102 }
103} 103}
104 104
105static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) 105static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
106{ 106{
107 struct sched_rt_entity *rt_se = rt_rq->rt_se; 107 struct sched_rt_entity *rt_se = rt_rq->rt_se;
108 108
@@ -110,11 +110,31 @@ static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
110 dequeue_rt_entity(rt_se); 110 dequeue_rt_entity(rt_se);
111} 111}
112 112
113static inline int rt_rq_throttled(struct rt_rq *rt_rq)
114{
115 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
116}
117
118static int rt_se_boosted(struct sched_rt_entity *rt_se)
119{
120 struct rt_rq *rt_rq = group_rt_rq(rt_se);
121 struct task_struct *p;
122
123 if (rt_rq)
124 return !!rt_rq->rt_nr_boosted;
125
126 p = rt_task_of(rt_se);
127 return p->prio != p->normal_prio;
128}
129
113#else 130#else
114 131
115static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) 132static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
116{ 133{
117 return sysctl_sched_rt_ratio; 134 if (sysctl_sched_rt_runtime == -1)
135 return RUNTIME_INF;
136
137 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
118} 138}
119 139
120#define for_each_leaf_rt_rq(rt_rq, rq) \ 140#define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -141,19 +161,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
141 return NULL; 161 return NULL;
142} 162}
143 163
144static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) 164static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
145{ 165{
146} 166}
147 167
148static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) 168static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
149{ 169{
150} 170}
151 171
172static inline int rt_rq_throttled(struct rt_rq *rt_rq)
173{
174 return rt_rq->rt_throttled;
175}
152#endif 176#endif
153 177
154static inline int rt_se_prio(struct sched_rt_entity *rt_se) 178static inline int rt_se_prio(struct sched_rt_entity *rt_se)
155{ 179{
156#ifdef CONFIG_FAIR_GROUP_SCHED 180#ifdef CONFIG_RT_GROUP_SCHED
157 struct rt_rq *rt_rq = group_rt_rq(rt_se); 181 struct rt_rq *rt_rq = group_rt_rq(rt_se);
158 182
159 if (rt_rq) 183 if (rt_rq)
@@ -163,28 +187,26 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
163 return rt_task_of(rt_se)->prio; 187 return rt_task_of(rt_se)->prio;
164} 188}
165 189
166static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq) 190static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
167{ 191{
168 unsigned int rt_ratio = sched_rt_ratio(rt_rq); 192 u64 runtime = sched_rt_runtime(rt_rq);
169 u64 period, ratio;
170 193
171 if (rt_ratio == SCHED_RT_FRAC) 194 if (runtime == RUNTIME_INF)
172 return 0; 195 return 0;
173 196
174 if (rt_rq->rt_throttled) 197 if (rt_rq->rt_throttled)
175 return 1; 198 return rt_rq_throttled(rt_rq);
176
177 period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
178 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
179 199
180 if (rt_rq->rt_time > ratio) { 200 if (rt_rq->rt_time > runtime) {
181 struct rq *rq = rq_of_rt_rq(rt_rq); 201 struct rq *rq = rq_of_rt_rq(rt_rq);
182 202
183 rq->rt_throttled = 1; 203 rq->rt_throttled = 1;
184 rt_rq->rt_throttled = 1; 204 rt_rq->rt_throttled = 1;
185 205
186 sched_rt_ratio_dequeue(rt_rq); 206 if (rt_rq_throttled(rt_rq)) {
187 return 1; 207 sched_rt_rq_dequeue(rt_rq);
208 return 1;
209 }
188 } 210 }
189 211
190 return 0; 212 return 0;
@@ -196,17 +218,16 @@ static void update_sched_rt_period(struct rq *rq)
196 u64 period; 218 u64 period;
197 219
198 while (rq->clock > rq->rt_period_expire) { 220 while (rq->clock > rq->rt_period_expire) {
199 period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; 221 period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
200 rq->rt_period_expire += period; 222 rq->rt_period_expire += period;
201 223
202 for_each_leaf_rt_rq(rt_rq, rq) { 224 for_each_leaf_rt_rq(rt_rq, rq) {
203 unsigned long rt_ratio = sched_rt_ratio(rt_rq); 225 u64 runtime = sched_rt_runtime(rt_rq);
204 u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
205 226
206 rt_rq->rt_time -= min(rt_rq->rt_time, ratio); 227 rt_rq->rt_time -= min(rt_rq->rt_time, runtime);
207 if (rt_rq->rt_throttled) { 228 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
208 rt_rq->rt_throttled = 0; 229 rt_rq->rt_throttled = 0;
209 sched_rt_ratio_enqueue(rt_rq); 230 sched_rt_rq_enqueue(rt_rq);
210 } 231 }
211 } 232 }
212 233
@@ -239,12 +260,7 @@ static void update_curr_rt(struct rq *rq)
239 cpuacct_charge(curr, delta_exec); 260 cpuacct_charge(curr, delta_exec);
240 261
241 rt_rq->rt_time += delta_exec; 262 rt_rq->rt_time += delta_exec;
242 /* 263 if (sched_rt_runtime_exceeded(rt_rq))
243 * might make it a tad more accurate:
244 *
245 * update_sched_rt_period(rq);
246 */
247 if (sched_rt_ratio_exceeded(rt_rq))
248 resched_task(curr); 264 resched_task(curr);
249} 265}
250 266
@@ -253,7 +269,7 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
253{ 269{
254 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 270 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
255 rt_rq->rt_nr_running++; 271 rt_rq->rt_nr_running++;
256#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED 272#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
257 if (rt_se_prio(rt_se) < rt_rq->highest_prio) 273 if (rt_se_prio(rt_se) < rt_rq->highest_prio)
258 rt_rq->highest_prio = rt_se_prio(rt_se); 274 rt_rq->highest_prio = rt_se_prio(rt_se);
259#endif 275#endif
@@ -265,6 +281,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
265 281
266 update_rt_migration(rq_of_rt_rq(rt_rq)); 282 update_rt_migration(rq_of_rt_rq(rt_rq));
267#endif 283#endif
284#ifdef CONFIG_RT_GROUP_SCHED
285 if (rt_se_boosted(rt_se))
286 rt_rq->rt_nr_boosted++;
287#endif
268} 288}
269 289
270static inline 290static inline
@@ -273,7 +293,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
273 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 293 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
274 WARN_ON(!rt_rq->rt_nr_running); 294 WARN_ON(!rt_rq->rt_nr_running);
275 rt_rq->rt_nr_running--; 295 rt_rq->rt_nr_running--;
276#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED 296#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
277 if (rt_rq->rt_nr_running) { 297 if (rt_rq->rt_nr_running) {
278 struct rt_prio_array *array; 298 struct rt_prio_array *array;
279 299
@@ -295,6 +315,12 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
295 315
296 update_rt_migration(rq_of_rt_rq(rt_rq)); 316 update_rt_migration(rq_of_rt_rq(rt_rq));
297#endif /* CONFIG_SMP */ 317#endif /* CONFIG_SMP */
318#ifdef CONFIG_RT_GROUP_SCHED
319 if (rt_se_boosted(rt_se))
320 rt_rq->rt_nr_boosted--;
321
322 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
323#endif
298} 324}
299 325
300static void enqueue_rt_entity(struct sched_rt_entity *rt_se) 326static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
@@ -303,7 +329,7 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
303 struct rt_prio_array *array = &rt_rq->active; 329 struct rt_prio_array *array = &rt_rq->active;
304 struct rt_rq *group_rq = group_rt_rq(rt_se); 330 struct rt_rq *group_rq = group_rt_rq(rt_se);
305 331
306 if (group_rq && group_rq->rt_throttled) 332 if (group_rq && rt_rq_throttled(group_rq))
307 return; 333 return;
308 334
309 list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); 335 list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
@@ -496,7 +522,7 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
496 if (unlikely(!rt_rq->rt_nr_running)) 522 if (unlikely(!rt_rq->rt_nr_running))
497 return NULL; 523 return NULL;
498 524
499 if (sched_rt_ratio_exceeded(rt_rq)) 525 if (rt_rq_throttled(rt_rq))
500 return NULL; 526 return NULL;
501 527
502 do { 528 do {
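
In sched_rt.c the throttle now works on absolute time: update_curr_rt() accumulates rt_time, sched_rt_runtime_exceeded() throttles the rt_rq once rt_time passes sched_rt_runtime() (unless a PI-boosted entity keeps it runnable, per rt_rq_throttled()), and update_sched_rt_period() refunds one period's worth of budget. Below is a self-contained sketch of that accounting loop; struct rt_rq_sketch and the helper names are invented, side effects such as dequeueing and re-enqueueing the group entity are left out, and the 950 ms budget is just an example value.

#include <stdint.h>
#include <stdio.h>

#define RUNTIME_INF	((uint64_t)~0ULL)

/* hypothetical flattened view of the rt_rq fields the diff manipulates */
struct rt_rq_sketch {
	uint64_t rt_time;	/* RT execution consumed this period (ns) */
	uint64_t rt_runtime;	/* budget per period (ns), or RUNTIME_INF */
	int	 rt_throttled;
	int	 rt_nr_boosted;	/* PI-boosted entities queued here */
};

/* like rt_rq_throttled(): a boosted entity keeps running even over budget */
static int throttled(struct rt_rq_sketch *rt_rq)
{
	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
}

/* like sched_rt_runtime_exceeded(), minus the dequeue of the group entity */
static int runtime_exceeded(struct rt_rq_sketch *rt_rq, uint64_t delta_exec)
{
	rt_rq->rt_time += delta_exec;

	if (rt_rq->rt_runtime == RUNTIME_INF)
		return 0;
	if (rt_rq->rt_time > rt_rq->rt_runtime)
		rt_rq->rt_throttled = 1;

	return throttled(rt_rq);
}

/* like update_sched_rt_period(): refund one period's worth of runtime */
static void period_tick(struct rt_rq_sketch *rt_rq)
{
	uint64_t runtime = rt_rq->rt_runtime;
	uint64_t pay = rt_rq->rt_time < runtime ? rt_rq->rt_time : runtime;

	rt_rq->rt_time -= pay;
	if (rt_rq->rt_throttled && rt_rq->rt_time < runtime)
		rt_rq->rt_throttled = 0;
}

int main(void)
{
	struct rt_rq_sketch rq = { 0, 950000000ULL, 0, 0 };	/* 950ms/period */

	printf("after 400ms: throttled=%d\n", runtime_exceeded(&rq, 400000000ULL));
	printf("after 600ms more: throttled=%d\n", runtime_exceeded(&rq, 600000000ULL));
	period_tick(&rq);
	printf("after period refresh: throttled=%d\n", throttled(&rq));
	return 0;
}
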
diff --git a/kernel/signal.c b/kernel/signal.c
index 2c1f08defac2..84917fe507f7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -972,7 +972,7 @@ void zap_other_threads(struct task_struct *p)
972 } 972 }
973} 973}
974 974
975int fastcall __fatal_signal_pending(struct task_struct *tsk) 975int __fatal_signal_pending(struct task_struct *tsk)
976{ 976{
977 return sigismember(&tsk->pending.signal, SIGKILL); 977 return sigismember(&tsk->pending.signal, SIGKILL);
978} 978}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d41ef6b4cf72..8b7e95411795 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -311,22 +311,6 @@ static struct ctl_table kern_table[] = {
311 .mode = 0644, 311 .mode = 0644,
312 .proc_handler = &proc_dointvec, 312 .proc_handler = &proc_dointvec,
313 }, 313 },
314 {
315 .ctl_name = CTL_UNNUMBERED,
316 .procname = "sched_rt_period_ms",
317 .data = &sysctl_sched_rt_period,
318 .maxlen = sizeof(unsigned int),
319 .mode = 0644,
320 .proc_handler = &proc_dointvec,
321 },
322 {
323 .ctl_name = CTL_UNNUMBERED,
324 .procname = "sched_rt_ratio",
325 .data = &sysctl_sched_rt_ratio,
326 .maxlen = sizeof(unsigned int),
327 .mode = 0644,
328 .proc_handler = &proc_dointvec,
329 },
330#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) 314#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
331 { 315 {
332 .ctl_name = CTL_UNNUMBERED, 316 .ctl_name = CTL_UNNUMBERED,
@@ -348,6 +332,22 @@ static struct ctl_table kern_table[] = {
348#endif 332#endif
349 { 333 {
350 .ctl_name = CTL_UNNUMBERED, 334 .ctl_name = CTL_UNNUMBERED,
335 .procname = "sched_rt_period_us",
336 .data = &sysctl_sched_rt_period,
337 .maxlen = sizeof(unsigned int),
338 .mode = 0644,
339 .proc_handler = &proc_dointvec,
340 },
341 {
342 .ctl_name = CTL_UNNUMBERED,
343 .procname = "sched_rt_runtime_us",
344 .data = &sysctl_sched_rt_runtime,
345 .maxlen = sizeof(int),
346 .mode = 0644,
347 .proc_handler = &proc_dointvec,
348 },
349 {
350 .ctl_name = CTL_UNNUMBERED,
351 .procname = "sched_compat_yield", 351 .procname = "sched_compat_yield",
352 .data = &sysctl_sched_compat_yield, 352 .data = &sysctl_sched_compat_yield,
353 .maxlen = sizeof(unsigned int), 353 .maxlen = sizeof(unsigned int),
@@ -978,8 +978,8 @@ static struct ctl_table vm_table[] = {
978 { 978 {
979 .ctl_name = CTL_UNNUMBERED, 979 .ctl_name = CTL_UNNUMBERED,
980 .procname = "nr_overcommit_hugepages", 980 .procname = "nr_overcommit_hugepages",
981 .data = &nr_overcommit_huge_pages, 981 .data = &sysctl_overcommit_huge_pages,
982 .maxlen = sizeof(nr_overcommit_huge_pages), 982 .maxlen = sizeof(sysctl_overcommit_huge_pages),
983 .mode = 0644, 983 .mode = 0644,
984 .proc_handler = &hugetlb_overcommit_handler, 984 .proc_handler = &hugetlb_overcommit_handler,
985 }, 985 },
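
The sysctl hunk swaps the old sched_rt_period_ms/sched_rt_ratio knobs for sched_rt_period_us and sched_rt_runtime_us, both plain integers handled by proc_dointvec, with a negative runtime meaning unlimited (see sched_rt_runtime() above). For reference, a new CTL_UNNUMBERED integer knob of this era follows the same shape; the variable and procname below are hypothetical and not part of this patch.

#include <linux/sysctl.h>

/* hypothetical tunable, surfacing as /proc/sys/kernel/my_budget_us */
static int sysctl_my_budget_us = 500000;

static struct ctl_table my_kern_table[] = {
	{
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "my_budget_us",
		.data		= &sysctl_my_budget_us,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }	/* table terminator; hooked up at init time,
				 * e.g. via register_sysctl_table() (not shown) */
};
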
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
index 62b1287932ed..41468035473c 100644
--- a/kernel/timeconst.pl
+++ b/kernel/timeconst.pl
@@ -339,7 +339,7 @@ sub output($@)
339 print "\n"; 339 print "\n";
340 340
341 foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ', 341 foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ',
342 'USEC_TO_HZ','HZ_TO_USEC') { 342 'HZ_TO_USEC','USEC_TO_HZ') {
343 foreach $bit (32, 64) { 343 foreach $bit (32, 64) {
344 foreach $suf ('MUL', 'ADJ', 'SHR') { 344 foreach $suf ('MUL', 'ADJ', 'SHR') {
345 printf "#define %-23s %s\n", 345 printf "#define %-23s %s\n",
diff --git a/kernel/user.c b/kernel/user.c
index 7d7900c5a1fd..7132022a040c 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -57,7 +57,7 @@ struct user_struct root_user = {
57 .uid_keyring = &root_user_keyring, 57 .uid_keyring = &root_user_keyring,
58 .session_keyring = &root_session_keyring, 58 .session_keyring = &root_session_keyring,
59#endif 59#endif
60#ifdef CONFIG_FAIR_USER_SCHED 60#ifdef CONFIG_USER_SCHED
61 .tg = &init_task_group, 61 .tg = &init_task_group,
62#endif 62#endif
63}; 63};
@@ -90,7 +90,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
90 return NULL; 90 return NULL;
91} 91}
92 92
93#ifdef CONFIG_FAIR_USER_SCHED 93#ifdef CONFIG_USER_SCHED
94 94
95static void sched_destroy_user(struct user_struct *up) 95static void sched_destroy_user(struct user_struct *up)
96{ 96{
@@ -113,15 +113,15 @@ static void sched_switch_user(struct task_struct *p)
113 sched_move_task(p); 113 sched_move_task(p);
114} 114}
115 115
116#else /* CONFIG_FAIR_USER_SCHED */ 116#else /* CONFIG_USER_SCHED */
117 117
118static void sched_destroy_user(struct user_struct *up) { } 118static void sched_destroy_user(struct user_struct *up) { }
119static int sched_create_user(struct user_struct *up) { return 0; } 119static int sched_create_user(struct user_struct *up) { return 0; }
120static void sched_switch_user(struct task_struct *p) { } 120static void sched_switch_user(struct task_struct *p) { }
121 121
122#endif /* CONFIG_FAIR_USER_SCHED */ 122#endif /* CONFIG_USER_SCHED */
123 123
124#if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS) 124#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
125 125
126static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ 126static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
127static DEFINE_MUTEX(uids_mutex); 127static DEFINE_MUTEX(uids_mutex);
@@ -137,6 +137,7 @@ static inline void uids_mutex_unlock(void)
137} 137}
138 138
139/* uid directory attributes */ 139/* uid directory attributes */
140#ifdef CONFIG_FAIR_GROUP_SCHED
140static ssize_t cpu_shares_show(struct kobject *kobj, 141static ssize_t cpu_shares_show(struct kobject *kobj,
141 struct kobj_attribute *attr, 142 struct kobj_attribute *attr,
142 char *buf) 143 char *buf)
@@ -163,10 +164,45 @@ static ssize_t cpu_shares_store(struct kobject *kobj,
163 164
164static struct kobj_attribute cpu_share_attr = 165static struct kobj_attribute cpu_share_attr =
165 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store); 166 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
167#endif
168
169#ifdef CONFIG_RT_GROUP_SCHED
170static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
171 struct kobj_attribute *attr,
172 char *buf)
173{
174 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
175
176 return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg));
177}
178
179static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
180 struct kobj_attribute *attr,
181 const char *buf, size_t size)
182{
183 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
184 unsigned long rt_runtime;
185 int rc;
186
187 sscanf(buf, "%lu", &rt_runtime);
188
189 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
190
191 return (rc ? rc : size);
192}
193
194static struct kobj_attribute cpu_rt_runtime_attr =
195 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
196#endif
166 197
167/* default attributes per uid directory */ 198/* default attributes per uid directory */
168static struct attribute *uids_attributes[] = { 199static struct attribute *uids_attributes[] = {
200#ifdef CONFIG_FAIR_GROUP_SCHED
169 &cpu_share_attr.attr, 201 &cpu_share_attr.attr,
202#endif
203#ifdef CONFIG_RT_GROUP_SCHED
204 &cpu_rt_runtime_attr.attr,
205#endif
170 NULL 206 NULL
171}; 207};
172 208
@@ -269,7 +305,7 @@ static inline void free_user(struct user_struct *up, unsigned long flags)
269 schedule_work(&up->work); 305 schedule_work(&up->work);
270} 306}
271 307
272#else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */ 308#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
273 309
274int uids_sysfs_init(void) { return 0; } 310int uids_sysfs_init(void) { return 0; }
275static inline int uids_user_create(struct user_struct *up) { return 0; } 311static inline int uids_user_create(struct user_struct *up) { return 0; }
@@ -373,7 +409,7 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
373 spin_lock_irq(&uidhash_lock); 409 spin_lock_irq(&uidhash_lock);
374 up = uid_hash_find(uid, hashent); 410 up = uid_hash_find(uid, hashent);
375 if (up) { 411 if (up) {
376 /* This case is not possible when CONFIG_FAIR_USER_SCHED 412 /* This case is not possible when CONFIG_USER_SCHED
377 * is defined, since we serialize alloc_uid() using 413 * is defined, since we serialize alloc_uid() using
378 * uids_mutex. Hence no need to call 414 * uids_mutex. Hence no need to call
379 * sched_destroy_user() or remove_user_sysfs_dir(). 415 * sched_destroy_user() or remove_user_sysfs_dir().
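
Finally, the user.c hunks add a per-uid cpu_rt_runtime attribute alongside cpu_share in the /sys/kernel/uids/ directory mentioned in the comment above; its store handler parses the value with sscanf("%lu") and forwards it to sched_group_set_rt_runtime(). A hedged user-space sketch of exercising that file follows — the uid, the value written, and the assumption that the per-uid directory already exists are illustrative only, and writing requires root.

#include <stdio.h>

int main(void)
{
	/* assumes CONFIG_RT_GROUP_SCHED and that a task of uid 1000 exists,
	 * so user.c has created /sys/kernel/uids/1000/ */
	const char *path = "/sys/kernel/uids/1000/cpu_rt_runtime";
	char buf[64];
	FILE *f;

	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("current rt runtime: %s", buf);
	fclose(f);

	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "500000\n");	/* arbitrary example budget */
	fclose(f);

	return 0;
}
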