author     Christoph Lameter <clameter@sgi.com>    2008-02-14 15:05:41 -0500
committer  Christoph Lameter <clameter@sgi.com>    2008-02-14 15:05:41 -0500
commit     c5974932c1e8514d3478573bb52beebeb2c786dd (patch)
tree       a204156fbb0036fb76e89ceffa15a30e90bc3f75 /kernel
parent     9e40ade04c45a46f6b3d647e0bdac1a32bfaa3a9 (diff)
parent     e760e716d47b48caf98da348368fd41b4a9b9e7e (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/marker.c       677
-rw-r--r--  kernel/module.c         7
-rw-r--r--  kernel/rcupdate.c       5
-rw-r--r--  kernel/rtmutex.c        5
-rw-r--r--  kernel/sched.c        494
-rw-r--r--  kernel/sched_rt.c     102
-rw-r--r--  kernel/signal.c         2
-rw-r--r--  kernel/sysctl.c        36
-rw-r--r--  kernel/timeconst.pl     2
-rw-r--r--  kernel/user.c          50
10 files changed, 1003 insertions, 377 deletions
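
A minimal sketch of how a probe would be written against the reworked marker interface in kernel/marker.c below: probes now receive their own private data, the call site's private data, the format string and a va_list prepared by marker_probe_cb(), and registration/unregistration pass the probe function and its private data explicitly. The marker name "subsystem_event", its format string and the probe body are invented for illustration; only the function signatures follow the diff itself.

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/marker.h>

/* New-style probe: probe private data, call-site data, format, va_list. */
static void probe_subsystem_event(void *probe_private, void *call_private,
				  const char *fmt, va_list *args)
{
	/* The argument types must match the marker's format string. */
	int value = va_arg(*args, int);

	printk(KERN_DEBUG "subsystem_event hit, value %d\n", value);
}

static int __init probe_init(void)
{
	/* probe_private must be NULL or a valid allocated pointer. */
	return marker_probe_register("subsystem_event", "value %d",
				     probe_subsystem_event, NULL);
}

static void __exit probe_exit(void)
{
	marker_probe_unregister("subsystem_event",
				probe_subsystem_event, NULL);
}

module_init(probe_init);
module_exit(probe_exit);
MODULE_LICENSE("GPL");
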
diff --git a/kernel/marker.c b/kernel/marker.c
index 5323cfaedbce..c4c2cd8b61f5 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -27,35 +27,42 @@
27extern struct marker __start___markers[]; 27extern struct marker __start___markers[];
28extern struct marker __stop___markers[]; 28extern struct marker __stop___markers[];
29 29
30/* Set to 1 to enable marker debug output */
31const int marker_debug;
32
30/* 33/*
31 * markers_mutex nests inside module_mutex. Markers mutex protects the builtin 34 * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
32 * and module markers, the hash table and deferred_sync. 35 * and module markers and the hash table.
33 */ 36 */
34static DEFINE_MUTEX(markers_mutex); 37static DEFINE_MUTEX(markers_mutex);
35 38
36/* 39/*
37 * Marker deferred synchronization.
38 * Upon marker probe_unregister, we delay call to synchronize_sched() to
39 * accelerate mass unregistration (only when there is no more reference to a
40 * given module do we call synchronize_sched()). However, we need to make sure
41 * every critical region has ended before we re-arm a marker that has been
42 * unregistered and then registered back with a different probe data.
43 */
44static int deferred_sync;
45
46/*
47 * Marker hash table, containing the active markers. 40 * Marker hash table, containing the active markers.
48 * Protected by module_mutex. 41 * Protected by module_mutex.
49 */ 42 */
50#define MARKER_HASH_BITS 6 43#define MARKER_HASH_BITS 6
51#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) 44#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
52 45
46/*
47 * Note about RCU :
48 * It is used to make sure every handler has finished using its private data
49 * between two consecutive operation (add or remove) on a given marker. It is
50 * also used to delay the free of multiple probes array until a quiescent state
51 * is reached.
52 * marker entries modifications are protected by the markers_mutex.
53 */
53struct marker_entry { 54struct marker_entry {
54 struct hlist_node hlist; 55 struct hlist_node hlist;
55 char *format; 56 char *format;
56 marker_probe_func *probe; 57 void (*call)(const struct marker *mdata, /* Probe wrapper */
57 void *private; 58 void *call_private, const char *fmt, ...);
59 struct marker_probe_closure single;
60 struct marker_probe_closure *multi;
58 int refcount; /* Number of times armed. 0 if disarmed. */ 61 int refcount; /* Number of times armed. 0 if disarmed. */
62 struct rcu_head rcu;
63 void *oldptr;
64 char rcu_pending:1;
65 char ptype:1;
59 char name[0]; /* Contains name'\0'format'\0' */ 66 char name[0]; /* Contains name'\0'format'\0' */
60}; 67};
61 68
@@ -63,7 +70,8 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
63 70
64/** 71/**
65 * __mark_empty_function - Empty probe callback 72 * __mark_empty_function - Empty probe callback
66 * @mdata: pointer of type const struct marker 73 * @probe_private: probe private data
74 * @call_private: call site private data
67 * @fmt: format string 75 * @fmt: format string
68 * @...: variable argument list 76 * @...: variable argument list
69 * 77 *
@@ -72,13 +80,267 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
72 * though the function pointer change and the marker enabling are two distinct 80 * though the function pointer change and the marker enabling are two distinct
73 * operations that modifies the execution flow of preemptible code. 81 * operations that modifies the execution flow of preemptible code.
74 */ 82 */
75void __mark_empty_function(const struct marker *mdata, void *private, 83void __mark_empty_function(void *probe_private, void *call_private,
76 const char *fmt, ...) 84 const char *fmt, va_list *args)
77{ 85{
78} 86}
79EXPORT_SYMBOL_GPL(__mark_empty_function); 87EXPORT_SYMBOL_GPL(__mark_empty_function);
80 88
81/* 89/*
90 * marker_probe_cb Callback that prepares the variable argument list for probes.
91 * @mdata: pointer of type struct marker
92 * @call_private: caller site private data
93 * @fmt: format string
94 * @...: Variable argument list.
95 *
96 * Since we do not use "typical" pointer based RCU in the 1 argument case, we
97 * need to put a full smp_rmb() in this branch. This is why we do not use
98 * rcu_dereference() for the pointer read.
99 */
100void marker_probe_cb(const struct marker *mdata, void *call_private,
101 const char *fmt, ...)
102{
103 va_list args;
104 char ptype;
105
106 /*
107 * disabling preemption to make sure the teardown of the callbacks can
108 * be done correctly when they are in modules and they insure RCU read
109 * coherency.
110 */
111 preempt_disable();
112 ptype = ACCESS_ONCE(mdata->ptype);
113 if (likely(!ptype)) {
114 marker_probe_func *func;
115 /* Must read the ptype before ptr. They are not data dependant,
116 * so we put an explicit smp_rmb() here. */
117 smp_rmb();
118 func = ACCESS_ONCE(mdata->single.func);
119 /* Must read the ptr before private data. They are not data
120 * dependant, so we put an explicit smp_rmb() here. */
121 smp_rmb();
122 va_start(args, fmt);
123 func(mdata->single.probe_private, call_private, fmt, &args);
124 va_end(args);
125 } else {
126 struct marker_probe_closure *multi;
127 int i;
128 /*
129 * multi points to an array, therefore accessing the array
130 * depends on reading multi. However, even in this case,
131 * we must insure that the pointer is read _before_ the array
132 * data. Same as rcu_dereference, but we need a full smp_rmb()
133 * in the fast path, so put the explicit barrier here.
134 */
135 smp_read_barrier_depends();
136 multi = ACCESS_ONCE(mdata->multi);
137 for (i = 0; multi[i].func; i++) {
138 va_start(args, fmt);
139 multi[i].func(multi[i].probe_private, call_private, fmt,
140 &args);
141 va_end(args);
142 }
143 }
144 preempt_enable();
145}
146EXPORT_SYMBOL_GPL(marker_probe_cb);
147
148/*
149 * marker_probe_cb Callback that does not prepare the variable argument list.
150 * @mdata: pointer of type struct marker
151 * @call_private: caller site private data
152 * @fmt: format string
153 * @...: Variable argument list.
154 *
155 * Should be connected to markers "MARK_NOARGS".
156 */
157void marker_probe_cb_noarg(const struct marker *mdata,
158 void *call_private, const char *fmt, ...)
159{
160 va_list args; /* not initialized */
161 char ptype;
162
163 preempt_disable();
164 ptype = ACCESS_ONCE(mdata->ptype);
165 if (likely(!ptype)) {
166 marker_probe_func *func;
167 /* Must read the ptype before ptr. They are not data dependant,
168 * so we put an explicit smp_rmb() here. */
169 smp_rmb();
170 func = ACCESS_ONCE(mdata->single.func);
171 /* Must read the ptr before private data. They are not data
172 * dependant, so we put an explicit smp_rmb() here. */
173 smp_rmb();
174 func(mdata->single.probe_private, call_private, fmt, &args);
175 } else {
176 struct marker_probe_closure *multi;
177 int i;
178 /*
179 * multi points to an array, therefore accessing the array
180 * depends on reading multi. However, even in this case,
181 * we must insure that the pointer is read _before_ the array
182 * data. Same as rcu_dereference, but we need a full smp_rmb()
183 * in the fast path, so put the explicit barrier here.
184 */
185 smp_read_barrier_depends();
186 multi = ACCESS_ONCE(mdata->multi);
187 for (i = 0; multi[i].func; i++)
188 multi[i].func(multi[i].probe_private, call_private, fmt,
189 &args);
190 }
191 preempt_enable();
192}
193EXPORT_SYMBOL_GPL(marker_probe_cb_noarg);
194
195static void free_old_closure(struct rcu_head *head)
196{
197 struct marker_entry *entry = container_of(head,
198 struct marker_entry, rcu);
199 kfree(entry->oldptr);
200 /* Make sure we free the data before setting the pending flag to 0 */
201 smp_wmb();
202 entry->rcu_pending = 0;
203}
204
205static void debug_print_probes(struct marker_entry *entry)
206{
207 int i;
208
209 if (!marker_debug)
210 return;
211
212 if (!entry->ptype) {
213 printk(KERN_DEBUG "Single probe : %p %p\n",
214 entry->single.func,
215 entry->single.probe_private);
216 } else {
217 for (i = 0; entry->multi[i].func; i++)
218 printk(KERN_DEBUG "Multi probe %d : %p %p\n", i,
219 entry->multi[i].func,
220 entry->multi[i].probe_private);
221 }
222}
223
224static struct marker_probe_closure *
225marker_entry_add_probe(struct marker_entry *entry,
226 marker_probe_func *probe, void *probe_private)
227{
228 int nr_probes = 0;
229 struct marker_probe_closure *old, *new;
230
231 WARN_ON(!probe);
232
233 debug_print_probes(entry);
234 old = entry->multi;
235 if (!entry->ptype) {
236 if (entry->single.func == probe &&
237 entry->single.probe_private == probe_private)
238 return ERR_PTR(-EBUSY);
239 if (entry->single.func == __mark_empty_function) {
240 /* 0 -> 1 probes */
241 entry->single.func = probe;
242 entry->single.probe_private = probe_private;
243 entry->refcount = 1;
244 entry->ptype = 0;
245 debug_print_probes(entry);
246 return NULL;
247 } else {
248 /* 1 -> 2 probes */
249 nr_probes = 1;
250 old = NULL;
251 }
252 } else {
253 /* (N -> N+1), (N != 0, 1) probes */
254 for (nr_probes = 0; old[nr_probes].func; nr_probes++)
255 if (old[nr_probes].func == probe
256 && old[nr_probes].probe_private
257 == probe_private)
258 return ERR_PTR(-EBUSY);
259 }
260 /* + 2 : one for new probe, one for NULL func */
261 new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure),
262 GFP_KERNEL);
263 if (new == NULL)
264 return ERR_PTR(-ENOMEM);
265 if (!old)
266 new[0] = entry->single;
267 else
268 memcpy(new, old,
269 nr_probes * sizeof(struct marker_probe_closure));
270 new[nr_probes].func = probe;
271 new[nr_probes].probe_private = probe_private;
272 entry->refcount = nr_probes + 1;
273 entry->multi = new;
274 entry->ptype = 1;
275 debug_print_probes(entry);
276 return old;
277}
278
279static struct marker_probe_closure *
280marker_entry_remove_probe(struct marker_entry *entry,
281 marker_probe_func *probe, void *probe_private)
282{
283 int nr_probes = 0, nr_del = 0, i;
284 struct marker_probe_closure *old, *new;
285
286 old = entry->multi;
287
288 debug_print_probes(entry);
289 if (!entry->ptype) {
290 /* 0 -> N is an error */
291 WARN_ON(entry->single.func == __mark_empty_function);
292 /* 1 -> 0 probes */
293 WARN_ON(probe && entry->single.func != probe);
294 WARN_ON(entry->single.probe_private != probe_private);
295 entry->single.func = __mark_empty_function;
296 entry->refcount = 0;
297 entry->ptype = 0;
298 debug_print_probes(entry);
299 return NULL;
300 } else {
301 /* (N -> M), (N > 1, M >= 0) probes */
302 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
303 if ((!probe || old[nr_probes].func == probe)
304 && old[nr_probes].probe_private
305 == probe_private)
306 nr_del++;
307 }
308 }
309
310 if (nr_probes - nr_del == 0) {
311 /* N -> 0, (N > 1) */
312 entry->single.func = __mark_empty_function;
313 entry->refcount = 0;
314 entry->ptype = 0;
315 } else if (nr_probes - nr_del == 1) {
316 /* N -> 1, (N > 1) */
317 for (i = 0; old[i].func; i++)
318 if ((probe && old[i].func != probe) ||
319 old[i].probe_private != probe_private)
320 entry->single = old[i];
321 entry->refcount = 1;
322 entry->ptype = 0;
323 } else {
324 int j = 0;
325 /* N -> M, (N > 1, M > 1) */
326 /* + 1 for NULL */
327 new = kzalloc((nr_probes - nr_del + 1)
328 * sizeof(struct marker_probe_closure), GFP_KERNEL);
329 if (new == NULL)
330 return ERR_PTR(-ENOMEM);
331 for (i = 0; old[i].func; i++)
332 if ((probe && old[i].func != probe) ||
333 old[i].probe_private != probe_private)
334 new[j++] = old[i];
335 entry->refcount = nr_probes - nr_del;
336 entry->ptype = 1;
337 entry->multi = new;
338 }
339 debug_print_probes(entry);
340 return old;
341}
342
343/*
82 * Get marker if the marker is present in the marker hash table. 344 * Get marker if the marker is present in the marker hash table.
83 * Must be called with markers_mutex held. 345 * Must be called with markers_mutex held.
84 * Returns NULL if not present. 346 * Returns NULL if not present.
@@ -102,8 +364,7 @@ static struct marker_entry *get_marker(const char *name)
102 * Add the marker to the marker hash table. Must be called with markers_mutex 364 * Add the marker to the marker hash table. Must be called with markers_mutex
103 * held. 365 * held.
104 */ 366 */
105static int add_marker(const char *name, const char *format, 367static struct marker_entry *add_marker(const char *name, const char *format)
106 marker_probe_func *probe, void *private)
107{ 368{
108 struct hlist_head *head; 369 struct hlist_head *head;
109 struct hlist_node *node; 370 struct hlist_node *node;
@@ -118,9 +379,8 @@ static int add_marker(const char *name, const char *format,
118 hlist_for_each_entry(e, node, head, hlist) { 379 hlist_for_each_entry(e, node, head, hlist) {
119 if (!strcmp(name, e->name)) { 380 if (!strcmp(name, e->name)) {
120 printk(KERN_NOTICE 381 printk(KERN_NOTICE
121 "Marker %s busy, probe %p already installed\n", 382 "Marker %s busy\n", name);
122 name, e->probe); 383 return ERR_PTR(-EBUSY); /* Already there */
123 return -EBUSY; /* Already there */
124 } 384 }
125 } 385 }
126 /* 386 /*
@@ -130,34 +390,42 @@ static int add_marker(const char *name, const char *format,
130 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, 390 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
131 GFP_KERNEL); 391 GFP_KERNEL);
132 if (!e) 392 if (!e)
133 return -ENOMEM; 393 return ERR_PTR(-ENOMEM);
134 memcpy(&e->name[0], name, name_len); 394 memcpy(&e->name[0], name, name_len);
135 if (format) { 395 if (format) {
136 e->format = &e->name[name_len]; 396 e->format = &e->name[name_len];
137 memcpy(e->format, format, format_len); 397 memcpy(e->format, format, format_len);
398 if (strcmp(e->format, MARK_NOARGS) == 0)
399 e->call = marker_probe_cb_noarg;
400 else
401 e->call = marker_probe_cb;
138 trace_mark(core_marker_format, "name %s format %s", 402 trace_mark(core_marker_format, "name %s format %s",
139 e->name, e->format); 403 e->name, e->format);
140 } else 404 } else {
141 e->format = NULL; 405 e->format = NULL;
142 e->probe = probe; 406 e->call = marker_probe_cb;
143 e->private = private; 407 }
408 e->single.func = __mark_empty_function;
409 e->single.probe_private = NULL;
410 e->multi = NULL;
411 e->ptype = 0;
144 e->refcount = 0; 412 e->refcount = 0;
413 e->rcu_pending = 0;
145 hlist_add_head(&e->hlist, head); 414 hlist_add_head(&e->hlist, head);
146 return 0; 415 return e;
147} 416}
148 417
149/* 418/*
150 * Remove the marker from the marker hash table. Must be called with mutex_lock 419 * Remove the marker from the marker hash table. Must be called with mutex_lock
151 * held. 420 * held.
152 */ 421 */
153static void *remove_marker(const char *name) 422static int remove_marker(const char *name)
154{ 423{
155 struct hlist_head *head; 424 struct hlist_head *head;
156 struct hlist_node *node; 425 struct hlist_node *node;
157 struct marker_entry *e; 426 struct marker_entry *e;
158 int found = 0; 427 int found = 0;
159 size_t len = strlen(name) + 1; 428 size_t len = strlen(name) + 1;
160 void *private = NULL;
161 u32 hash = jhash(name, len-1, 0); 429 u32 hash = jhash(name, len-1, 0);
162 430
163 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; 431 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
@@ -167,12 +435,16 @@ static void *remove_marker(const char *name)
167 break; 435 break;
168 } 436 }
169 } 437 }
170 if (found) { 438 if (!found)
171 private = e->private; 439 return -ENOENT;
172 hlist_del(&e->hlist); 440 if (e->single.func != __mark_empty_function)
173 kfree(e); 441 return -EBUSY;
174 } 442 hlist_del(&e->hlist);
175 return private; 443 /* Make sure the call_rcu has been executed */
444 if (e->rcu_pending)
445 rcu_barrier();
446 kfree(e);
447 return 0;
176} 448}
177 449
178/* 450/*
@@ -184,6 +456,7 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
184 size_t name_len = strlen((*entry)->name) + 1; 456 size_t name_len = strlen((*entry)->name) + 1;
185 size_t format_len = strlen(format) + 1; 457 size_t format_len = strlen(format) + 1;
186 458
459
187 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, 460 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
188 GFP_KERNEL); 461 GFP_KERNEL);
189 if (!e) 462 if (!e)
@@ -191,11 +464,20 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
191 memcpy(&e->name[0], (*entry)->name, name_len); 464 memcpy(&e->name[0], (*entry)->name, name_len);
192 e->format = &e->name[name_len]; 465 e->format = &e->name[name_len];
193 memcpy(e->format, format, format_len); 466 memcpy(e->format, format, format_len);
194 e->probe = (*entry)->probe; 467 if (strcmp(e->format, MARK_NOARGS) == 0)
195 e->private = (*entry)->private; 468 e->call = marker_probe_cb_noarg;
469 else
470 e->call = marker_probe_cb;
471 e->single = (*entry)->single;
472 e->multi = (*entry)->multi;
473 e->ptype = (*entry)->ptype;
196 e->refcount = (*entry)->refcount; 474 e->refcount = (*entry)->refcount;
475 e->rcu_pending = 0;
197 hlist_add_before(&e->hlist, &(*entry)->hlist); 476 hlist_add_before(&e->hlist, &(*entry)->hlist);
198 hlist_del(&(*entry)->hlist); 477 hlist_del(&(*entry)->hlist);
478 /* Make sure the call_rcu has been executed */
479 if ((*entry)->rcu_pending)
480 rcu_barrier();
199 kfree(*entry); 481 kfree(*entry);
200 *entry = e; 482 *entry = e;
201 trace_mark(core_marker_format, "name %s format %s", 483 trace_mark(core_marker_format, "name %s format %s",
@@ -206,7 +488,8 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
206/* 488/*
207 * Sets the probe callback corresponding to one marker. 489 * Sets the probe callback corresponding to one marker.
208 */ 490 */
209static int set_marker(struct marker_entry **entry, struct marker *elem) 491static int set_marker(struct marker_entry **entry, struct marker *elem,
492 int active)
210{ 493{
211 int ret; 494 int ret;
212 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 495 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
@@ -226,9 +509,43 @@ static int set_marker(struct marker_entry **entry, struct marker *elem)
226 if (ret) 509 if (ret)
227 return ret; 510 return ret;
228 } 511 }
229 elem->call = (*entry)->probe; 512
230 elem->private = (*entry)->private; 513 /*
231 elem->state = 1; 514 * probe_cb setup (statically known) is done here. It is
515 * asynchronous with the rest of execution, therefore we only
516 * pass from a "safe" callback (with argument) to an "unsafe"
517 * callback (does not set arguments).
518 */
519 elem->call = (*entry)->call;
520 /*
521 * Sanity check :
522 * We only update the single probe private data when the ptr is
523 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
524 */
525 WARN_ON(elem->single.func != __mark_empty_function
526 && elem->single.probe_private
527 != (*entry)->single.probe_private &&
528 !elem->ptype);
529 elem->single.probe_private = (*entry)->single.probe_private;
530 /*
531 * Make sure the private data is valid when we update the
532 * single probe ptr.
533 */
534 smp_wmb();
535 elem->single.func = (*entry)->single.func;
536 /*
537 * We also make sure that the new probe callbacks array is consistent
538 * before setting a pointer to it.
539 */
540 rcu_assign_pointer(elem->multi, (*entry)->multi);
541 /*
542 * Update the function or multi probe array pointer before setting the
543 * ptype.
544 */
545 smp_wmb();
546 elem->ptype = (*entry)->ptype;
547 elem->state = active;
548
232 return 0; 549 return 0;
233} 550}
234 551
@@ -240,8 +557,12 @@ static int set_marker(struct marker_entry **entry, struct marker *elem)
240 */ 557 */
241static void disable_marker(struct marker *elem) 558static void disable_marker(struct marker *elem)
242{ 559{
560 /* leave "call" as is. It is known statically. */
243 elem->state = 0; 561 elem->state = 0;
244 elem->call = __mark_empty_function; 562 elem->single.func = __mark_empty_function;
563 /* Update the function before setting the ptype */
564 smp_wmb();
565 elem->ptype = 0; /* single probe */
245 /* 566 /*
246 * Leave the private data and id there, because removal is racy and 567 * Leave the private data and id there, because removal is racy and
247 * should be done only after a synchronize_sched(). These are never used 568 * should be done only after a synchronize_sched(). These are never used
@@ -253,14 +574,11 @@ static void disable_marker(struct marker *elem)
253 * marker_update_probe_range - Update a probe range 574 * marker_update_probe_range - Update a probe range
254 * @begin: beginning of the range 575 * @begin: beginning of the range
255 * @end: end of the range 576 * @end: end of the range
256 * @probe_module: module address of the probe being updated
257 * @refcount: number of references left to the given probe_module (out)
258 * 577 *
259 * Updates the probe callback corresponding to a range of markers. 578 * Updates the probe callback corresponding to a range of markers.
260 */ 579 */
261void marker_update_probe_range(struct marker *begin, 580void marker_update_probe_range(struct marker *begin,
262 struct marker *end, struct module *probe_module, 581 struct marker *end)
263 int *refcount)
264{ 582{
265 struct marker *iter; 583 struct marker *iter;
266 struct marker_entry *mark_entry; 584 struct marker_entry *mark_entry;
@@ -268,15 +586,12 @@ void marker_update_probe_range(struct marker *begin,
268 mutex_lock(&markers_mutex); 586 mutex_lock(&markers_mutex);
269 for (iter = begin; iter < end; iter++) { 587 for (iter = begin; iter < end; iter++) {
270 mark_entry = get_marker(iter->name); 588 mark_entry = get_marker(iter->name);
271 if (mark_entry && mark_entry->refcount) { 589 if (mark_entry) {
272 set_marker(&mark_entry, iter); 590 set_marker(&mark_entry, iter,
591 !!mark_entry->refcount);
273 /* 592 /*
274 * ignore error, continue 593 * ignore error, continue
275 */ 594 */
276 if (probe_module)
277 if (probe_module ==
278 __module_text_address((unsigned long)mark_entry->probe))
279 (*refcount)++;
280 } else { 595 } else {
281 disable_marker(iter); 596 disable_marker(iter);
282 } 597 }
@@ -289,20 +604,27 @@ void marker_update_probe_range(struct marker *begin,
289 * Issues a synchronize_sched() when no reference to the module passed 604 * Issues a synchronize_sched() when no reference to the module passed
290 * as parameter is found in the probes so the probe module can be 605 * as parameter is found in the probes so the probe module can be
291 * safely unloaded from now on. 606 * safely unloaded from now on.
607 *
608 * Internal callback only changed before the first probe is connected to it.
609 * Single probe private data can only be changed on 0 -> 1 and 2 -> 1
610 * transitions. All other transitions will leave the old private data valid.
611 * This makes the non-atomicity of the callback/private data updates valid.
612 *
613 * "special case" updates :
614 * 0 -> 1 callback
615 * 1 -> 0 callback
616 * 1 -> 2 callbacks
617 * 2 -> 1 callbacks
618 * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates.
619 * Site effect : marker_set_format may delete the marker entry (creating a
620 * replacement).
292 */ 621 */
293static void marker_update_probes(struct module *probe_module) 622static void marker_update_probes(void)
294{ 623{
295 int refcount = 0;
296
297 /* Core kernel markers */ 624 /* Core kernel markers */
298 marker_update_probe_range(__start___markers, 625 marker_update_probe_range(__start___markers, __stop___markers);
299 __stop___markers, probe_module, &refcount);
300 /* Markers in modules. */ 626 /* Markers in modules. */
301 module_update_markers(probe_module, &refcount); 627 module_update_markers();
302 if (probe_module && refcount == 0) {
303 synchronize_sched();
304 deferred_sync = 0;
305 }
306} 628}
307 629
308/** 630/**
@@ -310,33 +632,49 @@ static void marker_update_probes(struct module *probe_module)
310 * @name: marker name 632 * @name: marker name
311 * @format: format string 633 * @format: format string
312 * @probe: probe handler 634 * @probe: probe handler
313 * @private: probe private data 635 * @probe_private: probe private data
314 * 636 *
315 * private data must be a valid allocated memory address, or NULL. 637 * private data must be a valid allocated memory address, or NULL.
316 * Returns 0 if ok, error value on error. 638 * Returns 0 if ok, error value on error.
639 * The probe address must at least be aligned on the architecture pointer size.
317 */ 640 */
318int marker_probe_register(const char *name, const char *format, 641int marker_probe_register(const char *name, const char *format,
319 marker_probe_func *probe, void *private) 642 marker_probe_func *probe, void *probe_private)
320{ 643{
321 struct marker_entry *entry; 644 struct marker_entry *entry;
322 int ret = 0; 645 int ret = 0;
646 struct marker_probe_closure *old;
323 647
324 mutex_lock(&markers_mutex); 648 mutex_lock(&markers_mutex);
325 entry = get_marker(name); 649 entry = get_marker(name);
326 if (entry && entry->refcount) { 650 if (!entry) {
327 ret = -EBUSY; 651 entry = add_marker(name, format);
328 goto end; 652 if (IS_ERR(entry)) {
329 } 653 ret = PTR_ERR(entry);
330 if (deferred_sync) { 654 goto end;
331 synchronize_sched(); 655 }
332 deferred_sync = 0;
333 } 656 }
334 ret = add_marker(name, format, probe, private); 657 /*
335 if (ret) 658 * If we detect that a call_rcu is pending for this marker,
659 * make sure it's executed now.
660 */
661 if (entry->rcu_pending)
662 rcu_barrier();
663 old = marker_entry_add_probe(entry, probe, probe_private);
664 if (IS_ERR(old)) {
665 ret = PTR_ERR(old);
336 goto end; 666 goto end;
667 }
337 mutex_unlock(&markers_mutex); 668 mutex_unlock(&markers_mutex);
338 marker_update_probes(NULL); 669 marker_update_probes(); /* may update entry */
339 return ret; 670 mutex_lock(&markers_mutex);
671 entry = get_marker(name);
672 WARN_ON(!entry);
673 entry->oldptr = old;
674 entry->rcu_pending = 1;
675 /* write rcu_pending before calling the RCU callback */
676 smp_wmb();
677 call_rcu(&entry->rcu, free_old_closure);
340end: 678end:
341 mutex_unlock(&markers_mutex); 679 mutex_unlock(&markers_mutex);
342 return ret; 680 return ret;
@@ -346,171 +684,166 @@ EXPORT_SYMBOL_GPL(marker_probe_register);
346/** 684/**
347 * marker_probe_unregister - Disconnect a probe from a marker 685 * marker_probe_unregister - Disconnect a probe from a marker
348 * @name: marker name 686 * @name: marker name
687 * @probe: probe function pointer
688 * @probe_private: probe private data
349 * 689 *
350 * Returns the private data given to marker_probe_register, or an ERR_PTR(). 690 * Returns the private data given to marker_probe_register, or an ERR_PTR().
691 * We do not need to call a synchronize_sched to make sure the probes have
692 * finished running before doing a module unload, because the module unload
693 * itself uses stop_machine(), which insures that every preempt disabled section
694 * have finished.
351 */ 695 */
352void *marker_probe_unregister(const char *name) 696int marker_probe_unregister(const char *name,
697 marker_probe_func *probe, void *probe_private)
353{ 698{
354 struct module *probe_module;
355 struct marker_entry *entry; 699 struct marker_entry *entry;
356 void *private; 700 struct marker_probe_closure *old;
701 int ret = 0;
357 702
358 mutex_lock(&markers_mutex); 703 mutex_lock(&markers_mutex);
359 entry = get_marker(name); 704 entry = get_marker(name);
360 if (!entry) { 705 if (!entry) {
361 private = ERR_PTR(-ENOENT); 706 ret = -ENOENT;
362 goto end; 707 goto end;
363 } 708 }
364 entry->refcount = 0; 709 if (entry->rcu_pending)
365 /* In what module is the probe handler ? */ 710 rcu_barrier();
366 probe_module = __module_text_address((unsigned long)entry->probe); 711 old = marker_entry_remove_probe(entry, probe, probe_private);
367 private = remove_marker(name);
368 deferred_sync = 1;
369 mutex_unlock(&markers_mutex); 712 mutex_unlock(&markers_mutex);
370 marker_update_probes(probe_module); 713 marker_update_probes(); /* may update entry */
371 return private; 714 mutex_lock(&markers_mutex);
715 entry = get_marker(name);
716 entry->oldptr = old;
717 entry->rcu_pending = 1;
718 /* write rcu_pending before calling the RCU callback */
719 smp_wmb();
720 call_rcu(&entry->rcu, free_old_closure);
721 remove_marker(name); /* Ignore busy error message */
372end: 722end:
373 mutex_unlock(&markers_mutex); 723 mutex_unlock(&markers_mutex);
374 return private; 724 return ret;
375} 725}
376EXPORT_SYMBOL_GPL(marker_probe_unregister); 726EXPORT_SYMBOL_GPL(marker_probe_unregister);
377 727
378/** 728static struct marker_entry *
379 * marker_probe_unregister_private_data - Disconnect a probe from a marker 729get_marker_from_private_data(marker_probe_func *probe, void *probe_private)
380 * @private: probe private data
381 *
382 * Unregister a marker by providing the registered private data.
383 * Returns the private data given to marker_probe_register, or an ERR_PTR().
384 */
385void *marker_probe_unregister_private_data(void *private)
386{ 730{
387 struct module *probe_module;
388 struct hlist_head *head;
389 struct hlist_node *node;
390 struct marker_entry *entry; 731 struct marker_entry *entry;
391 int found = 0;
392 unsigned int i; 732 unsigned int i;
733 struct hlist_head *head;
734 struct hlist_node *node;
393 735
394 mutex_lock(&markers_mutex);
395 for (i = 0; i < MARKER_TABLE_SIZE; i++) { 736 for (i = 0; i < MARKER_TABLE_SIZE; i++) {
396 head = &marker_table[i]; 737 head = &marker_table[i];
397 hlist_for_each_entry(entry, node, head, hlist) { 738 hlist_for_each_entry(entry, node, head, hlist) {
398 if (entry->private == private) { 739 if (!entry->ptype) {
399 found = 1; 740 if (entry->single.func == probe
400 goto iter_end; 741 && entry->single.probe_private
742 == probe_private)
743 return entry;
744 } else {
745 struct marker_probe_closure *closure;
746 closure = entry->multi;
747 for (i = 0; closure[i].func; i++) {
748 if (closure[i].func == probe &&
749 closure[i].probe_private
750 == probe_private)
751 return entry;
752 }
401 } 753 }
402 } 754 }
403 } 755 }
404iter_end: 756 return NULL;
405 if (!found) {
406 private = ERR_PTR(-ENOENT);
407 goto end;
408 }
409 entry->refcount = 0;
410 /* In what module is the probe handler ? */
411 probe_module = __module_text_address((unsigned long)entry->probe);
412 private = remove_marker(entry->name);
413 deferred_sync = 1;
414 mutex_unlock(&markers_mutex);
415 marker_update_probes(probe_module);
416 return private;
417end:
418 mutex_unlock(&markers_mutex);
419 return private;
420} 757}
421EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
422 758
423/** 759/**
424 * marker_arm - Arm a marker 760 * marker_probe_unregister_private_data - Disconnect a probe from a marker
425 * @name: marker name 761 * @probe: probe function
762 * @probe_private: probe private data
426 * 763 *
427 * Activate a marker. It keeps a reference count of the number of 764 * Unregister a probe by providing the registered private data.
428 * arming/disarming done. 765 * Only removes the first marker found in hash table.
429 * Returns 0 if ok, error value on error. 766 * Return 0 on success or error value.
767 * We do not need to call a synchronize_sched to make sure the probes have
768 * finished running before doing a module unload, because the module unload
769 * itself uses stop_machine(), which insures that every preempt disabled section
770 * have finished.
430 */ 771 */
431int marker_arm(const char *name) 772int marker_probe_unregister_private_data(marker_probe_func *probe,
773 void *probe_private)
432{ 774{
433 struct marker_entry *entry; 775 struct marker_entry *entry;
434 int ret = 0; 776 int ret = 0;
777 struct marker_probe_closure *old;
435 778
436 mutex_lock(&markers_mutex); 779 mutex_lock(&markers_mutex);
437 entry = get_marker(name); 780 entry = get_marker_from_private_data(probe, probe_private);
438 if (!entry) { 781 if (!entry) {
439 ret = -ENOENT; 782 ret = -ENOENT;
440 goto end; 783 goto end;
441 } 784 }
442 /* 785 if (entry->rcu_pending)
443 * Only need to update probes when refcount passes from 0 to 1. 786 rcu_barrier();
444 */ 787 old = marker_entry_remove_probe(entry, NULL, probe_private);
445 if (entry->refcount++)
446 goto end;
447end:
448 mutex_unlock(&markers_mutex); 788 mutex_unlock(&markers_mutex);
449 marker_update_probes(NULL); 789 marker_update_probes(); /* may update entry */
450 return ret;
451}
452EXPORT_SYMBOL_GPL(marker_arm);
453
454/**
455 * marker_disarm - Disarm a marker
456 * @name: marker name
457 *
458 * Disarm a marker. It keeps a reference count of the number of arming/disarming
459 * done.
460 * Returns 0 if ok, error value on error.
461 */
462int marker_disarm(const char *name)
463{
464 struct marker_entry *entry;
465 int ret = 0;
466
467 mutex_lock(&markers_mutex); 790 mutex_lock(&markers_mutex);
468 entry = get_marker(name); 791 entry = get_marker_from_private_data(probe, probe_private);
469 if (!entry) { 792 WARN_ON(!entry);
470 ret = -ENOENT; 793 entry->oldptr = old;
471 goto end; 794 entry->rcu_pending = 1;
472 } 795 /* write rcu_pending before calling the RCU callback */
473 /* 796 smp_wmb();
474 * Only permit decrement refcount if higher than 0. 797 call_rcu(&entry->rcu, free_old_closure);
475 * Do probe update only on 1 -> 0 transition. 798 remove_marker(entry->name); /* Ignore busy error message */
476 */
477 if (entry->refcount) {
478 if (--entry->refcount)
479 goto end;
480 } else {
481 ret = -EPERM;
482 goto end;
483 }
484end: 799end:
485 mutex_unlock(&markers_mutex); 800 mutex_unlock(&markers_mutex);
486 marker_update_probes(NULL);
487 return ret; 801 return ret;
488} 802}
489EXPORT_SYMBOL_GPL(marker_disarm); 803EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
490 804
491/** 805/**
492 * marker_get_private_data - Get a marker's probe private data 806 * marker_get_private_data - Get a marker's probe private data
493 * @name: marker name 807 * @name: marker name
808 * @probe: probe to match
809 * @num: get the nth matching probe's private data
494 * 810 *
811 * Returns the nth private data pointer (starting from 0) matching, or an
812 * ERR_PTR.
495 * Returns the private data pointer, or an ERR_PTR. 813 * Returns the private data pointer, or an ERR_PTR.
496 * The private data pointer should _only_ be dereferenced if the caller is the 814 * The private data pointer should _only_ be dereferenced if the caller is the
497 * owner of the data, or its content could vanish. This is mostly used to 815 * owner of the data, or its content could vanish. This is mostly used to
498 * confirm that a caller is the owner of a registered probe. 816 * confirm that a caller is the owner of a registered probe.
499 */ 817 */
500void *marker_get_private_data(const char *name) 818void *marker_get_private_data(const char *name, marker_probe_func *probe,
819 int num)
501{ 820{
502 struct hlist_head *head; 821 struct hlist_head *head;
503 struct hlist_node *node; 822 struct hlist_node *node;
504 struct marker_entry *e; 823 struct marker_entry *e;
505 size_t name_len = strlen(name) + 1; 824 size_t name_len = strlen(name) + 1;
506 u32 hash = jhash(name, name_len-1, 0); 825 u32 hash = jhash(name, name_len-1, 0);
507 int found = 0; 826 int i;
508 827
509 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; 828 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
510 hlist_for_each_entry(e, node, head, hlist) { 829 hlist_for_each_entry(e, node, head, hlist) {
511 if (!strcmp(name, e->name)) { 830 if (!strcmp(name, e->name)) {
512 found = 1; 831 if (!e->ptype) {
513 return e->private; 832 if (num == 0 && e->single.func == probe)
833 return e->single.probe_private;
834 else
835 break;
836 } else {
837 struct marker_probe_closure *closure;
838 int match = 0;
839 closure = e->multi;
840 for (i = 0; closure[i].func; i++) {
841 if (closure[i].func != probe)
842 continue;
843 if (match++ == num)
844 return closure[i].probe_private;
845 }
846 }
514 } 847 }
515 } 848 }
516 return ERR_PTR(-ENOENT); 849 return ERR_PTR(-ENOENT);
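
Worth noting from the marker.c hunks above: marker_entry_add_probe() only rejects an exact (probe, probe_private) duplicate with -EBUSY, so the same probe function can be attached several times with different private data, at which point the entry switches from the single closure to a NULL-terminated closure array that marker_probe_cb() walks. A hedged sketch of that multi-probe case, reusing the invented "subsystem_event" marker from the earlier example; the counters and function names are likewise hypothetical.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/marker.h>
#include <asm/atomic.h>

static atomic_t hits_a = ATOMIC_INIT(0);
static atomic_t hits_b = ATOMIC_INIT(0);

/* One probe body, attached twice with different private counters. */
static void probe_count(void *probe_private, void *call_private,
			const char *fmt, va_list *args)
{
	atomic_inc((atomic_t *)probe_private);
}

static int __init attach_probes(void)
{
	int ret;

	ret = marker_probe_register("subsystem_event", "value %d",
				    probe_count, &hits_a);
	if (ret)
		return ret;
	/*
	 * Same function, different probe_private: accepted, and the
	 * marker entry now holds a two-entry closure array. Only the
	 * exact (probe, probe_private) pair would return -EBUSY.
	 */
	ret = marker_probe_register("subsystem_event", "value %d",
				    probe_count, &hits_b);
	if (ret)
		marker_probe_unregister("subsystem_event",
					probe_count, &hits_a);
	return ret;
}

static void __exit detach_probes(void)
{
	marker_probe_unregister("subsystem_event", probe_count, &hits_a);
	marker_probe_unregister("subsystem_event", probe_count, &hits_b);
}

module_init(attach_probes);
module_exit(detach_probes);
MODULE_LICENSE("GPL");
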
diff --git a/kernel/module.c b/kernel/module.c
index 4202da97a1da..92595bad3812 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2038,7 +2038,7 @@ static struct module *load_module(void __user *umod,
2038#ifdef CONFIG_MARKERS 2038#ifdef CONFIG_MARKERS
2039 if (!mod->taints) 2039 if (!mod->taints)
2040 marker_update_probe_range(mod->markers, 2040 marker_update_probe_range(mod->markers,
2041 mod->markers + mod->num_markers, NULL, NULL); 2041 mod->markers + mod->num_markers);
2042#endif 2042#endif
2043 err = module_finalize(hdr, sechdrs, mod); 2043 err = module_finalize(hdr, sechdrs, mod);
2044 if (err < 0) 2044 if (err < 0)
@@ -2564,7 +2564,7 @@ EXPORT_SYMBOL(struct_module);
2564#endif 2564#endif
2565 2565
2566#ifdef CONFIG_MARKERS 2566#ifdef CONFIG_MARKERS
2567void module_update_markers(struct module *probe_module, int *refcount) 2567void module_update_markers(void)
2568{ 2568{
2569 struct module *mod; 2569 struct module *mod;
2570 2570
@@ -2572,8 +2572,7 @@ void module_update_markers(struct module *probe_module, int *refcount)
2572 list_for_each_entry(mod, &modules, list) 2572 list_for_each_entry(mod, &modules, list)
2573 if (!mod->taints) 2573 if (!mod->taints)
2574 marker_update_probe_range(mod->markers, 2574 marker_update_probe_range(mod->markers,
2575 mod->markers + mod->num_markers, 2575 mod->markers + mod->num_markers);
2576 probe_module, refcount);
2577 mutex_unlock(&module_mutex); 2576 mutex_unlock(&module_mutex);
2578} 2577}
2579#endif 2578#endif
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 760dfc233a00..c09605f8d16c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -56,7 +56,10 @@ static atomic_t rcu_barrier_cpu_count;
56static DEFINE_MUTEX(rcu_barrier_mutex); 56static DEFINE_MUTEX(rcu_barrier_mutex);
57static struct completion rcu_barrier_completion; 57static struct completion rcu_barrier_completion;
58 58
59/* Because of FASTCALL declaration of complete, we use this wrapper */ 59/*
60 * Awaken the corresponding synchronize_rcu() instance now that a
61 * grace period has elapsed.
62 */
60static void wakeme_after_rcu(struct rcu_head *head) 63static void wakeme_after_rcu(struct rcu_head *head)
61{ 64{
62 struct rcu_synchronize *rcu; 65 struct rcu_synchronize *rcu;
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 0deef71ff8d2..6522ae5b14a2 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -630,9 +630,12 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
630 set_current_state(state); 630 set_current_state(state);
631 631
632 /* Setup the timer, when timeout != NULL */ 632 /* Setup the timer, when timeout != NULL */
633 if (unlikely(timeout)) 633 if (unlikely(timeout)) {
634 hrtimer_start(&timeout->timer, timeout->timer.expires, 634 hrtimer_start(&timeout->timer, timeout->timer.expires,
635 HRTIMER_MODE_ABS); 635 HRTIMER_MODE_ABS);
636 if (!hrtimer_active(&timeout->timer))
637 timeout->task = NULL;
638 }
636 639
637 for (;;) { 640 for (;;) {
638 /* Try to acquire the lock: */ 641 /* Try to acquire the lock: */
diff --git a/kernel/sched.c b/kernel/sched.c
index 3eedd5260907..f28f19e65b59 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -155,7 +155,7 @@ struct rt_prio_array {
155 struct list_head queue[MAX_RT_PRIO]; 155 struct list_head queue[MAX_RT_PRIO];
156}; 156};
157 157
158#ifdef CONFIG_FAIR_GROUP_SCHED 158#ifdef CONFIG_GROUP_SCHED
159 159
160#include <linux/cgroup.h> 160#include <linux/cgroup.h>
161 161
@@ -165,19 +165,16 @@ static LIST_HEAD(task_groups);
165 165
166/* task group related information */ 166/* task group related information */
167struct task_group { 167struct task_group {
168#ifdef CONFIG_FAIR_CGROUP_SCHED 168#ifdef CONFIG_CGROUP_SCHED
169 struct cgroup_subsys_state css; 169 struct cgroup_subsys_state css;
170#endif 170#endif
171
172#ifdef CONFIG_FAIR_GROUP_SCHED
171 /* schedulable entities of this group on each cpu */ 173 /* schedulable entities of this group on each cpu */
172 struct sched_entity **se; 174 struct sched_entity **se;
173 /* runqueue "owned" by this group on each cpu */ 175 /* runqueue "owned" by this group on each cpu */
174 struct cfs_rq **cfs_rq; 176 struct cfs_rq **cfs_rq;
175 177
176 struct sched_rt_entity **rt_se;
177 struct rt_rq **rt_rq;
178
179 unsigned int rt_ratio;
180
181 /* 178 /*
182 * shares assigned to a task group governs how much of cpu bandwidth 179 * shares assigned to a task group governs how much of cpu bandwidth
183 * is allocated to the group. The more shares a group has, the more is 180 * is allocated to the group. The more shares a group has, the more is
@@ -213,33 +210,46 @@ struct task_group {
213 * 210 *
214 */ 211 */
215 unsigned long shares; 212 unsigned long shares;
213#endif
214
215#ifdef CONFIG_RT_GROUP_SCHED
216 struct sched_rt_entity **rt_se;
217 struct rt_rq **rt_rq;
218
219 u64 rt_runtime;
220#endif
216 221
217 struct rcu_head rcu; 222 struct rcu_head rcu;
218 struct list_head list; 223 struct list_head list;
219}; 224};
220 225
226#ifdef CONFIG_FAIR_GROUP_SCHED
221/* Default task group's sched entity on each cpu */ 227/* Default task group's sched entity on each cpu */
222static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 228static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
223/* Default task group's cfs_rq on each cpu */ 229/* Default task group's cfs_rq on each cpu */
224static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 230static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
225 231
226static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
227static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
228
229static struct sched_entity *init_sched_entity_p[NR_CPUS]; 232static struct sched_entity *init_sched_entity_p[NR_CPUS];
230static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; 233static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
234#endif
235
236#ifdef CONFIG_RT_GROUP_SCHED
237static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
238static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
231 239
232static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; 240static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS];
233static struct rt_rq *init_rt_rq_p[NR_CPUS]; 241static struct rt_rq *init_rt_rq_p[NR_CPUS];
242#endif
234 243
235/* task_group_mutex serializes add/remove of task groups and also changes to 244/* task_group_lock serializes add/remove of task groups and also changes to
236 * a task group's cpu shares. 245 * a task group's cpu shares.
237 */ 246 */
238static DEFINE_MUTEX(task_group_mutex); 247static DEFINE_SPINLOCK(task_group_lock);
239 248
240/* doms_cur_mutex serializes access to doms_cur[] array */ 249/* doms_cur_mutex serializes access to doms_cur[] array */
241static DEFINE_MUTEX(doms_cur_mutex); 250static DEFINE_MUTEX(doms_cur_mutex);
242 251
252#ifdef CONFIG_FAIR_GROUP_SCHED
243#ifdef CONFIG_SMP 253#ifdef CONFIG_SMP
244/* kernel thread that runs rebalance_shares() periodically */ 254/* kernel thread that runs rebalance_shares() periodically */
245static struct task_struct *lb_monitor_task; 255static struct task_struct *lb_monitor_task;
@@ -248,35 +258,40 @@ static int load_balance_monitor(void *unused);
248 258
249static void set_se_shares(struct sched_entity *se, unsigned long shares); 259static void set_se_shares(struct sched_entity *se, unsigned long shares);
250 260
261#ifdef CONFIG_USER_SCHED
262# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
263#else
264# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
265#endif
266
267#define MIN_GROUP_SHARES 2
268
269static int init_task_group_load = INIT_TASK_GROUP_LOAD;
270#endif
271
251/* Default task group. 272/* Default task group.
252 * Every task in system belong to this group at bootup. 273 * Every task in system belong to this group at bootup.
253 */ 274 */
254struct task_group init_task_group = { 275struct task_group init_task_group = {
276#ifdef CONFIG_FAIR_GROUP_SCHED
255 .se = init_sched_entity_p, 277 .se = init_sched_entity_p,
256 .cfs_rq = init_cfs_rq_p, 278 .cfs_rq = init_cfs_rq_p,
279#endif
257 280
281#ifdef CONFIG_RT_GROUP_SCHED
258 .rt_se = init_sched_rt_entity_p, 282 .rt_se = init_sched_rt_entity_p,
259 .rt_rq = init_rt_rq_p, 283 .rt_rq = init_rt_rq_p,
260};
261
262#ifdef CONFIG_FAIR_USER_SCHED
263# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
264#else
265# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
266#endif 284#endif
267 285};
268#define MIN_GROUP_SHARES 2
269
270static int init_task_group_load = INIT_TASK_GROUP_LOAD;
271 286
272/* return group to which a task belongs */ 287/* return group to which a task belongs */
273static inline struct task_group *task_group(struct task_struct *p) 288static inline struct task_group *task_group(struct task_struct *p)
274{ 289{
275 struct task_group *tg; 290 struct task_group *tg;
276 291
277#ifdef CONFIG_FAIR_USER_SCHED 292#ifdef CONFIG_USER_SCHED
278 tg = p->user->tg; 293 tg = p->user->tg;
279#elif defined(CONFIG_FAIR_CGROUP_SCHED) 294#elif defined(CONFIG_CGROUP_SCHED)
280 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 295 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
281 struct task_group, css); 296 struct task_group, css);
282#else 297#else
@@ -288,21 +303,15 @@ static inline struct task_group *task_group(struct task_struct *p)
288/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 303/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
289static inline void set_task_rq(struct task_struct *p, unsigned int cpu) 304static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
290{ 305{
306#ifdef CONFIG_FAIR_GROUP_SCHED
291 p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; 307 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
292 p->se.parent = task_group(p)->se[cpu]; 308 p->se.parent = task_group(p)->se[cpu];
309#endif
293 310
311#ifdef CONFIG_RT_GROUP_SCHED
294 p->rt.rt_rq = task_group(p)->rt_rq[cpu]; 312 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
295 p->rt.parent = task_group(p)->rt_se[cpu]; 313 p->rt.parent = task_group(p)->rt_se[cpu];
296} 314#endif
297
298static inline void lock_task_group_list(void)
299{
300 mutex_lock(&task_group_mutex);
301}
302
303static inline void unlock_task_group_list(void)
304{
305 mutex_unlock(&task_group_mutex);
306} 315}
307 316
308static inline void lock_doms_cur(void) 317static inline void lock_doms_cur(void)
@@ -318,12 +327,10 @@ static inline void unlock_doms_cur(void)
318#else 327#else
319 328
320static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 329static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
321static inline void lock_task_group_list(void) { }
322static inline void unlock_task_group_list(void) { }
323static inline void lock_doms_cur(void) { } 330static inline void lock_doms_cur(void) { }
324static inline void unlock_doms_cur(void) { } 331static inline void unlock_doms_cur(void) { }
325 332
326#endif /* CONFIG_FAIR_GROUP_SCHED */ 333#endif /* CONFIG_GROUP_SCHED */
327 334
328/* CFS-related fields in a runqueue */ 335/* CFS-related fields in a runqueue */
329struct cfs_rq { 336struct cfs_rq {
@@ -363,7 +370,7 @@ struct cfs_rq {
363struct rt_rq { 370struct rt_rq {
364 struct rt_prio_array active; 371 struct rt_prio_array active;
365 unsigned long rt_nr_running; 372 unsigned long rt_nr_running;
366#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED 373#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
367 int highest_prio; /* highest queued rt task prio */ 374 int highest_prio; /* highest queued rt task prio */
368#endif 375#endif
369#ifdef CONFIG_SMP 376#ifdef CONFIG_SMP
@@ -373,7 +380,9 @@ struct rt_rq {
373 int rt_throttled; 380 int rt_throttled;
374 u64 rt_time; 381 u64 rt_time;
375 382
376#ifdef CONFIG_FAIR_GROUP_SCHED 383#ifdef CONFIG_RT_GROUP_SCHED
384 unsigned long rt_nr_boosted;
385
377 struct rq *rq; 386 struct rq *rq;
378 struct list_head leaf_rt_rq_list; 387 struct list_head leaf_rt_rq_list;
379 struct task_group *tg; 388 struct task_group *tg;
@@ -447,6 +456,8 @@ struct rq {
447#ifdef CONFIG_FAIR_GROUP_SCHED 456#ifdef CONFIG_FAIR_GROUP_SCHED
448 /* list of leaf cfs_rq on this cpu: */ 457 /* list of leaf cfs_rq on this cpu: */
449 struct list_head leaf_cfs_rq_list; 458 struct list_head leaf_cfs_rq_list;
459#endif
460#ifdef CONFIG_RT_GROUP_SCHED
450 struct list_head leaf_rt_rq_list; 461 struct list_head leaf_rt_rq_list;
451#endif 462#endif
452 463
@@ -652,19 +663,21 @@ const_debug unsigned int sysctl_sched_features =
652const_debug unsigned int sysctl_sched_nr_migrate = 32; 663const_debug unsigned int sysctl_sched_nr_migrate = 32;
653 664
654/* 665/*
655 * period over which we measure -rt task cpu usage in ms. 666 * period over which we measure -rt task cpu usage in us.
656 * default: 1s 667 * default: 1s
657 */ 668 */
658const_debug unsigned int sysctl_sched_rt_period = 1000; 669unsigned int sysctl_sched_rt_period = 1000000;
659 670
660#define SCHED_RT_FRAC_SHIFT 16 671/*
661#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT) 672 * part of the period that we allow rt tasks to run in us.
673 * default: 0.95s
674 */
675int sysctl_sched_rt_runtime = 950000;
662 676
663/* 677/*
664 * ratio of time -rt tasks may consume. 678 * single value that denotes runtime == period, ie unlimited time.
665 * default: 95%
666 */ 679 */
667const_debug unsigned int sysctl_sched_rt_ratio = 62259; 680#define RUNTIME_INF ((u64)~0ULL)
668 681
669/* 682/*
670 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 683 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
@@ -4571,6 +4584,15 @@ recheck:
4571 return -EPERM; 4584 return -EPERM;
4572 } 4585 }
4573 4586
4587#ifdef CONFIG_RT_GROUP_SCHED
4588 /*
4589 * Do not allow realtime tasks into groups that have no runtime
4590 * assigned.
4591 */
4592 if (rt_policy(policy) && task_group(p)->rt_runtime == 0)
4593 return -EPERM;
4594#endif
4595
4574 retval = security_task_setscheduler(p, policy, param); 4596 retval = security_task_setscheduler(p, policy, param);
4575 if (retval) 4597 if (retval)
4576 return retval; 4598 return retval;
@@ -7112,7 +7134,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7112 /* delimiter for bitsearch: */ 7134 /* delimiter for bitsearch: */
7113 __set_bit(MAX_RT_PRIO, array->bitmap); 7135 __set_bit(MAX_RT_PRIO, array->bitmap);
7114 7136
7115#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED 7137#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
7116 rt_rq->highest_prio = MAX_RT_PRIO; 7138 rt_rq->highest_prio = MAX_RT_PRIO;
7117#endif 7139#endif
7118#ifdef CONFIG_SMP 7140#ifdef CONFIG_SMP
@@ -7123,7 +7145,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7123 rt_rq->rt_time = 0; 7145 rt_rq->rt_time = 0;
7124 rt_rq->rt_throttled = 0; 7146 rt_rq->rt_throttled = 0;
7125 7147
7126#ifdef CONFIG_FAIR_GROUP_SCHED 7148#ifdef CONFIG_RT_GROUP_SCHED
7149 rt_rq->rt_nr_boosted = 0;
7127 rt_rq->rq = rq; 7150 rt_rq->rq = rq;
7128#endif 7151#endif
7129} 7152}
@@ -7146,7 +7169,9 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
7146 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); 7169 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
7147 se->parent = NULL; 7170 se->parent = NULL;
7148} 7171}
7172#endif
7149 7173
7174#ifdef CONFIG_RT_GROUP_SCHED
7150static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, 7175static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
7151 struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, 7176 struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
7152 int cpu, int add) 7177 int cpu, int add)
@@ -7175,7 +7200,7 @@ void __init sched_init(void)
7175 init_defrootdomain(); 7200 init_defrootdomain();
7176#endif 7201#endif
7177 7202
7178#ifdef CONFIG_FAIR_GROUP_SCHED 7203#ifdef CONFIG_GROUP_SCHED
7179 list_add(&init_task_group.list, &task_groups); 7204 list_add(&init_task_group.list, &task_groups);
7180#endif 7205#endif
7181 7206
@@ -7196,7 +7221,10 @@ void __init sched_init(void)
7196 &per_cpu(init_cfs_rq, i), 7221 &per_cpu(init_cfs_rq, i),
7197 &per_cpu(init_sched_entity, i), i, 1); 7222 &per_cpu(init_sched_entity, i), i, 1);
7198 7223
7199 init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */ 7224#endif
7225#ifdef CONFIG_RT_GROUP_SCHED
7226 init_task_group.rt_runtime =
7227 sysctl_sched_rt_runtime * NSEC_PER_USEC;
7200 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7228 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7201 init_tg_rt_entry(rq, &init_task_group, 7229 init_tg_rt_entry(rq, &init_task_group,
7202 &per_cpu(init_rt_rq, i), 7230 &per_cpu(init_rt_rq, i),
@@ -7303,7 +7331,7 @@ void normalize_rt_tasks(void)
7303 unsigned long flags; 7331 unsigned long flags;
7304 struct rq *rq; 7332 struct rq *rq;
7305 7333
7306 read_lock_irq(&tasklist_lock); 7334 read_lock_irqsave(&tasklist_lock, flags);
7307 do_each_thread(g, p) { 7335 do_each_thread(g, p) {
7308 /* 7336 /*
7309 * Only normalize user tasks: 7337 * Only normalize user tasks:
@@ -7329,16 +7357,16 @@ void normalize_rt_tasks(void)
7329 continue; 7357 continue;
7330 } 7358 }
7331 7359
7332 spin_lock_irqsave(&p->pi_lock, flags); 7360 spin_lock(&p->pi_lock);
7333 rq = __task_rq_lock(p); 7361 rq = __task_rq_lock(p);
7334 7362
7335 normalize_task(rq, p); 7363 normalize_task(rq, p);
7336 7364
7337 __task_rq_unlock(rq); 7365 __task_rq_unlock(rq);
7338 spin_unlock_irqrestore(&p->pi_lock, flags); 7366 spin_unlock(&p->pi_lock);
7339 } while_each_thread(g, p); 7367 } while_each_thread(g, p);
7340 7368
7341 read_unlock_irq(&tasklist_lock); 7369 read_unlock_irqrestore(&tasklist_lock, flags);
7342} 7370}
7343 7371
7344#endif /* CONFIG_MAGIC_SYSRQ */ 7372#endif /* CONFIG_MAGIC_SYSRQ */
@@ -7387,9 +7415,9 @@ void set_curr_task(int cpu, struct task_struct *p)
7387 7415
7388#endif 7416#endif
7389 7417
7390#ifdef CONFIG_FAIR_GROUP_SCHED 7418#ifdef CONFIG_GROUP_SCHED
7391 7419
7392#ifdef CONFIG_SMP 7420#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
7393/* 7421/*
7394 * distribute shares of all task groups among their schedulable entities, 7422 * distribute shares of all task groups among their schedulable entities,
7395 * to reflect load distribution across cpus. 7423 * to reflect load distribution across cpus.
@@ -7540,7 +7568,8 @@ static int load_balance_monitor(void *unused)
7540} 7568}
7541#endif /* CONFIG_SMP */ 7569#endif /* CONFIG_SMP */
7542 7570
7543static void free_sched_group(struct task_group *tg) 7571#ifdef CONFIG_FAIR_GROUP_SCHED
7572static void free_fair_sched_group(struct task_group *tg)
7544{ 7573{
7545 int i; 7574 int i;
7546 7575
@@ -7549,49 +7578,27 @@ static void free_sched_group(struct task_group *tg)
7549 kfree(tg->cfs_rq[i]); 7578 kfree(tg->cfs_rq[i]);
7550 if (tg->se) 7579 if (tg->se)
7551 kfree(tg->se[i]); 7580 kfree(tg->se[i]);
7552 if (tg->rt_rq)
7553 kfree(tg->rt_rq[i]);
7554 if (tg->rt_se)
7555 kfree(tg->rt_se[i]);
7556 } 7581 }
7557 7582
7558 kfree(tg->cfs_rq); 7583 kfree(tg->cfs_rq);
7559 kfree(tg->se); 7584 kfree(tg->se);
7560 kfree(tg->rt_rq);
7561 kfree(tg->rt_se);
7562 kfree(tg);
7563} 7585}
7564 7586
7565/* allocate runqueue etc for a new task group */ 7587static int alloc_fair_sched_group(struct task_group *tg)
7566struct task_group *sched_create_group(void)
7567{ 7588{
7568 struct task_group *tg;
7569 struct cfs_rq *cfs_rq; 7589 struct cfs_rq *cfs_rq;
7570 struct sched_entity *se; 7590 struct sched_entity *se;
7571 struct rt_rq *rt_rq;
7572 struct sched_rt_entity *rt_se;
7573 struct rq *rq; 7591 struct rq *rq;
7574 int i; 7592 int i;
7575 7593
7576 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7577 if (!tg)
7578 return ERR_PTR(-ENOMEM);
7579
7580 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); 7594 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL);
7581 if (!tg->cfs_rq) 7595 if (!tg->cfs_rq)
7582 goto err; 7596 goto err;
7583 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); 7597 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
7584 if (!tg->se) 7598 if (!tg->se)
7585 goto err; 7599 goto err;
7586 tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
7587 if (!tg->rt_rq)
7588 goto err;
7589 tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
7590 if (!tg->rt_se)
7591 goto err;
7592 7600
7593 tg->shares = NICE_0_LOAD; 7601 tg->shares = NICE_0_LOAD;
7594 tg->rt_ratio = 0; /* XXX */
7595 7602
7596 for_each_possible_cpu(i) { 7603 for_each_possible_cpu(i) {
7597 rq = cpu_rq(i); 7604 rq = cpu_rq(i);
@@ -7606,6 +7613,79 @@ struct task_group *sched_create_group(void)
7606 if (!se) 7613 if (!se)
7607 goto err; 7614 goto err;
7608 7615
7616 init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
7617 }
7618
7619 return 1;
7620
7621 err:
7622 return 0;
7623}
7624
7625static inline void register_fair_sched_group(struct task_group *tg, int cpu)
7626{
7627 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
7628 &cpu_rq(cpu)->leaf_cfs_rq_list);
7629}
7630
7631static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
7632{
7633 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
7634}
7635#else
7636static inline void free_fair_sched_group(struct task_group *tg)
7637{
7638}
7639
7640static inline int alloc_fair_sched_group(struct task_group *tg)
7641{
7642 return 1;
7643}
7644
7645static inline void register_fair_sched_group(struct task_group *tg, int cpu)
7646{
7647}
7648
7649static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
7650{
7651}
7652#endif
7653
7654#ifdef CONFIG_RT_GROUP_SCHED
7655static void free_rt_sched_group(struct task_group *tg)
7656{
7657 int i;
7658
7659 for_each_possible_cpu(i) {
7660 if (tg->rt_rq)
7661 kfree(tg->rt_rq[i]);
7662 if (tg->rt_se)
7663 kfree(tg->rt_se[i]);
7664 }
7665
7666 kfree(tg->rt_rq);
7667 kfree(tg->rt_se);
7668}
7669
7670static int alloc_rt_sched_group(struct task_group *tg)
7671{
7672 struct rt_rq *rt_rq;
7673 struct sched_rt_entity *rt_se;
7674 struct rq *rq;
7675 int i;
7676
7677 tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
7678 if (!tg->rt_rq)
7679 goto err;
7680 tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
7681 if (!tg->rt_se)
7682 goto err;
7683
7684 tg->rt_runtime = 0;
7685
7686 for_each_possible_cpu(i) {
7687 rq = cpu_rq(i);
7688
7609 rt_rq = kmalloc_node(sizeof(struct rt_rq), 7689 rt_rq = kmalloc_node(sizeof(struct rt_rq),
7610 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 7690 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7611 if (!rt_rq) 7691 if (!rt_rq)
@@ -7616,20 +7696,75 @@ struct task_group *sched_create_group(void)
7616 if (!rt_se) 7696 if (!rt_se)
7617 goto err; 7697 goto err;
7618 7698
7619 init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
7620 init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); 7699 init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
7621 } 7700 }
7622 7701
7623 lock_task_group_list(); 7702 return 1;
7703
7704 err:
7705 return 0;
7706}
7707
7708static inline void register_rt_sched_group(struct task_group *tg, int cpu)
7709{
7710 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
7711 &cpu_rq(cpu)->leaf_rt_rq_list);
7712}
7713
7714static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
7715{
7716 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
7717}
7718#else
7719static inline void free_rt_sched_group(struct task_group *tg)
7720{
7721}
7722
7723static inline int alloc_rt_sched_group(struct task_group *tg)
7724{
7725 return 1;
7726}
7727
7728static inline void register_rt_sched_group(struct task_group *tg, int cpu)
7729{
7730}
7731
7732static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
7733{
7734}
7735#endif
7736
7737static void free_sched_group(struct task_group *tg)
7738{
7739 free_fair_sched_group(tg);
7740 free_rt_sched_group(tg);
7741 kfree(tg);
7742}
7743
7744/* allocate runqueue etc for a new task group */
7745struct task_group *sched_create_group(void)
7746{
7747 struct task_group *tg;
7748 unsigned long flags;
7749 int i;
7750
7751 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7752 if (!tg)
7753 return ERR_PTR(-ENOMEM);
7754
7755 if (!alloc_fair_sched_group(tg))
7756 goto err;
7757
7758 if (!alloc_rt_sched_group(tg))
7759 goto err;
7760
7761 spin_lock_irqsave(&task_group_lock, flags);
7624 for_each_possible_cpu(i) { 7762 for_each_possible_cpu(i) {
7625 rq = cpu_rq(i); 7763 register_fair_sched_group(tg, i);
7626 cfs_rq = tg->cfs_rq[i]; 7764 register_rt_sched_group(tg, i);
7627 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7628 rt_rq = tg->rt_rq[i];
7629 list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7630 } 7765 }
7631 list_add_rcu(&tg->list, &task_groups); 7766 list_add_rcu(&tg->list, &task_groups);
7632 unlock_task_group_list(); 7767 spin_unlock_irqrestore(&task_group_lock, flags);
7633 7768
7634 return tg; 7769 return tg;
7635 7770
@@ -7648,21 +7783,16 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
7648/* Destroy runqueue etc associated with a task group */ 7783/* Destroy runqueue etc associated with a task group */
7649void sched_destroy_group(struct task_group *tg) 7784void sched_destroy_group(struct task_group *tg)
7650{ 7785{
7651 struct cfs_rq *cfs_rq = NULL; 7786 unsigned long flags;
7652 struct rt_rq *rt_rq = NULL;
7653 int i; 7787 int i;
7654 7788
7655 lock_task_group_list(); 7789 spin_lock_irqsave(&task_group_lock, flags);
7656 for_each_possible_cpu(i) { 7790 for_each_possible_cpu(i) {
7657 cfs_rq = tg->cfs_rq[i]; 7791 unregister_fair_sched_group(tg, i);
7658 list_del_rcu(&cfs_rq->leaf_cfs_rq_list); 7792 unregister_rt_sched_group(tg, i);
7659 rt_rq = tg->rt_rq[i];
7660 list_del_rcu(&rt_rq->leaf_rt_rq_list);
7661 } 7793 }
7662 list_del_rcu(&tg->list); 7794 list_del_rcu(&tg->list);
7663 unlock_task_group_list(); 7795 spin_unlock_irqrestore(&task_group_lock, flags);
7664
7665 BUG_ON(!cfs_rq);
7666 7796
7667 /* wait for possible concurrent references to cfs_rqs complete */ 7797 /* wait for possible concurrent references to cfs_rqs complete */
7668 call_rcu(&tg->rcu, free_sched_group_rcu); 7798 call_rcu(&tg->rcu, free_sched_group_rcu);
@@ -7703,6 +7833,7 @@ void sched_move_task(struct task_struct *tsk)
7703 task_rq_unlock(rq, &flags); 7833 task_rq_unlock(rq, &flags);
7704} 7834}
7705 7835
7836#ifdef CONFIG_FAIR_GROUP_SCHED
7706/* rq->lock to be locked by caller */ 7837/* rq->lock to be locked by caller */
7707static void set_se_shares(struct sched_entity *se, unsigned long shares) 7838static void set_se_shares(struct sched_entity *se, unsigned long shares)
7708{ 7839{
@@ -7728,13 +7859,14 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
7728 } 7859 }
7729} 7860}
7730 7861
7862static DEFINE_MUTEX(shares_mutex);
7863
7731int sched_group_set_shares(struct task_group *tg, unsigned long shares) 7864int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7732{ 7865{
7733 int i; 7866 int i;
7734 struct cfs_rq *cfs_rq; 7867 unsigned long flags;
7735 struct rq *rq;
7736 7868
7737 lock_task_group_list(); 7869 mutex_lock(&shares_mutex);
7738 if (tg->shares == shares) 7870 if (tg->shares == shares)
7739 goto done; 7871 goto done;
7740 7872
@@ -7746,10 +7878,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7746 * load_balance_fair) from referring to this group first, 7878 * load_balance_fair) from referring to this group first,
7747 * by taking it off the rq->leaf_cfs_rq_list on each cpu. 7879 * by taking it off the rq->leaf_cfs_rq_list on each cpu.
7748 */ 7880 */
7749 for_each_possible_cpu(i) { 7881 spin_lock_irqsave(&task_group_lock, flags);
7750 cfs_rq = tg->cfs_rq[i]; 7882 for_each_possible_cpu(i)
7751 list_del_rcu(&cfs_rq->leaf_cfs_rq_list); 7883 unregister_fair_sched_group(tg, i);
7752 } 7884 spin_unlock_irqrestore(&task_group_lock, flags);
7753 7885
7754 /* wait for any ongoing reference to this group to finish */ 7886 /* wait for any ongoing reference to this group to finish */
7755 synchronize_sched(); 7887 synchronize_sched();
@@ -7769,13 +7901,12 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7769 * Enable load balance activity on this group, by inserting it back on 7901 * Enable load balance activity on this group, by inserting it back on
7770 * each cpu's rq->leaf_cfs_rq_list. 7902 * each cpu's rq->leaf_cfs_rq_list.
7771 */ 7903 */
7772 for_each_possible_cpu(i) { 7904 spin_lock_irqsave(&task_group_lock, flags);
7773 rq = cpu_rq(i); 7905 for_each_possible_cpu(i)
7774 cfs_rq = tg->cfs_rq[i]; 7906 register_fair_sched_group(tg, i);
7775 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 7907 spin_unlock_irqrestore(&task_group_lock, flags);
7776 }
7777done: 7908done:
7778 unlock_task_group_list(); 7909 mutex_unlock(&shares_mutex);
7779 return 0; 7910 return 0;
7780} 7911}
7781 7912
@@ -7783,35 +7914,84 @@ unsigned long sched_group_shares(struct task_group *tg)
7783{ 7914{
7784 return tg->shares; 7915 return tg->shares;
7785} 7916}
7917#endif
7786 7918
7919#ifdef CONFIG_RT_GROUP_SCHED
7787/* 7920/*
7788 * Ensure the total rt_ratio <= sysctl_sched_rt_ratio 7921 * Ensure that the real time constraints are schedulable.
7789 */ 7922 */
7790int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio) 7923static DEFINE_MUTEX(rt_constraints_mutex);
7924
7925static unsigned long to_ratio(u64 period, u64 runtime)
7926{
7927 if (runtime == RUNTIME_INF)
7928 return 1ULL << 16;
7929
7930 runtime *= (1ULL << 16);
7931 div64_64(runtime, period);
7932 return runtime;
7933}
7934
7935static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7791{ 7936{
7792 struct task_group *tgi; 7937 struct task_group *tgi;
7793 unsigned long total = 0; 7938 unsigned long total = 0;
7939 unsigned long global_ratio =
7940 to_ratio(sysctl_sched_rt_period,
7941 sysctl_sched_rt_runtime < 0 ?
7942 RUNTIME_INF : sysctl_sched_rt_runtime);
7794 7943
7795 rcu_read_lock(); 7944 rcu_read_lock();
7796 list_for_each_entry_rcu(tgi, &task_groups, list) 7945 list_for_each_entry_rcu(tgi, &task_groups, list) {
7797 total += tgi->rt_ratio; 7946 if (tgi == tg)
7798 rcu_read_unlock(); 7947 continue;
7799 7948
7800 if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio) 7949 total += to_ratio(period, tgi->rt_runtime);
7801 return -EINVAL; 7950 }
7951 rcu_read_unlock();
7802 7952
7803 tg->rt_ratio = rt_ratio; 7953 return total + to_ratio(period, runtime) < global_ratio;
7804 return 0;
7805} 7954}
7806 7955
7807unsigned long sched_group_rt_ratio(struct task_group *tg) 7956int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7808{ 7957{
7809 return tg->rt_ratio; 7958 u64 rt_runtime, rt_period;
7959 int err = 0;
7960
7961 rt_period = sysctl_sched_rt_period * NSEC_PER_USEC;
7962 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7963 if (rt_runtime_us == -1)
7964 rt_runtime = rt_period;
7965
7966 mutex_lock(&rt_constraints_mutex);
7967 if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
7968 err = -EINVAL;
7969 goto unlock;
7970 }
7971 if (rt_runtime_us == -1)
7972 rt_runtime = RUNTIME_INF;
7973 tg->rt_runtime = rt_runtime;
7974 unlock:
7975 mutex_unlock(&rt_constraints_mutex);
7976
7977 return err;
7810} 7978}
7811 7979
7812#endif /* CONFIG_FAIR_GROUP_SCHED */ 7980long sched_group_rt_runtime(struct task_group *tg)
7981{
7982 u64 rt_runtime_us;
7983
7984 if (tg->rt_runtime == RUNTIME_INF)
7985 return -1;
7986
7987 rt_runtime_us = tg->rt_runtime;
7988 do_div(rt_runtime_us, NSEC_PER_USEC);
7989 return rt_runtime_us;
7990}
7991#endif
7992#endif /* CONFIG_GROUP_SCHED */
7813 7993
7814#ifdef CONFIG_FAIR_CGROUP_SCHED 7994#ifdef CONFIG_CGROUP_SCHED
7815 7995
7816/* return corresponding task_group object of a cgroup */ 7996/* return corresponding task_group object of a cgroup */
7817static inline struct task_group *cgroup_tg(struct cgroup *cgrp) 7997static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
@@ -7857,9 +8037,15 @@ static int
7857cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 8037cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7858 struct task_struct *tsk) 8038 struct task_struct *tsk)
7859{ 8039{
8040#ifdef CONFIG_RT_GROUP_SCHED
8041 /* Don't accept realtime tasks when there is no way for them to run */
8042 if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0)
8043 return -EINVAL;
8044#else
7860 /* We don't support RT-tasks being in separate groups */ 8045 /* We don't support RT-tasks being in separate groups */
7861 if (tsk->sched_class != &fair_sched_class) 8046 if (tsk->sched_class != &fair_sched_class)
7862 return -EINVAL; 8047 return -EINVAL;
8048#endif
7863 8049
7864 return 0; 8050 return 0;
7865} 8051}
@@ -7871,6 +8057,7 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7871 sched_move_task(tsk); 8057 sched_move_task(tsk);
7872} 8058}
7873 8059
8060#ifdef CONFIG_FAIR_GROUP_SCHED
7874static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype, 8061static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7875 u64 shareval) 8062 u64 shareval)
7876{ 8063{
@@ -7883,31 +8070,70 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
7883 8070
7884 return (u64) tg->shares; 8071 return (u64) tg->shares;
7885} 8072}
8073#endif
7886 8074
7887static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype, 8075#ifdef CONFIG_RT_GROUP_SCHED
7888 u64 rt_ratio_val) 8076static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
8077 struct file *file,
8078 const char __user *userbuf,
8079 size_t nbytes, loff_t *unused_ppos)
7889{ 8080{
7890 return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val); 8081 char buffer[64];
8082 int retval = 0;
8083 s64 val;
8084 char *end;
8085
8086 if (!nbytes)
8087 return -EINVAL;
8088 if (nbytes >= sizeof(buffer))
8089 return -E2BIG;
8090 if (copy_from_user(buffer, userbuf, nbytes))
8091 return -EFAULT;
8092
8093 buffer[nbytes] = 0; /* nul-terminate */
8094
8095 /* strip newline if necessary */
8096 if (nbytes && (buffer[nbytes-1] == '\n'))
8097 buffer[nbytes-1] = 0;
8098 val = simple_strtoll(buffer, &end, 0);
8099 if (*end)
8100 return -EINVAL;
8101
8102 /* Pass to subsystem */
8103 retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
8104 if (!retval)
8105 retval = nbytes;
8106 return retval;
7891} 8107}
7892 8108
7893static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft) 8109static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft,
8110 struct file *file,
8111 char __user *buf, size_t nbytes,
8112 loff_t *ppos)
7894{ 8113{
7895 struct task_group *tg = cgroup_tg(cgrp); 8114 char tmp[64];
8115 long val = sched_group_rt_runtime(cgroup_tg(cgrp));
8116 int len = sprintf(tmp, "%ld\n", val);
7896 8117
7897 return (u64) tg->rt_ratio; 8118 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
7898} 8119}
8120#endif
7899 8121
7900static struct cftype cpu_files[] = { 8122static struct cftype cpu_files[] = {
8123#ifdef CONFIG_FAIR_GROUP_SCHED
7901 { 8124 {
7902 .name = "shares", 8125 .name = "shares",
7903 .read_uint = cpu_shares_read_uint, 8126 .read_uint = cpu_shares_read_uint,
7904 .write_uint = cpu_shares_write_uint, 8127 .write_uint = cpu_shares_write_uint,
7905 }, 8128 },
8129#endif
8130#ifdef CONFIG_RT_GROUP_SCHED
7906 { 8131 {
7907 .name = "rt_ratio", 8132 .name = "rt_runtime_us",
7908 .read_uint = cpu_rt_ratio_read_uint, 8133 .read = cpu_rt_runtime_read,
7909 .write_uint = cpu_rt_ratio_write_uint, 8134 .write = cpu_rt_runtime_write,
7910 }, 8135 },
8136#endif
7911}; 8137};
7912 8138
7913static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 8139static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -7926,7 +8152,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7926 .early_init = 1, 8152 .early_init = 1,
7927}; 8153};
7928 8154
7929#endif /* CONFIG_FAIR_CGROUP_SCHED */ 8155#endif /* CONFIG_CGROUP_SCHED */
7930 8156
7931#ifdef CONFIG_CGROUP_CPUACCT 8157#ifdef CONFIG_CGROUP_CPUACCT
7932 8158
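
The sched.c hunks above replace the per-group rt_ratio with an absolute rt_runtime and gate any change through __rt_schedulable(), which sums every group's runtime/period ratio in 16-bit fixed point (to_ratio()) and compares the total against the global budget derived from sysctl_sched_rt_period and sysctl_sched_rt_runtime. What follows is a minimal user-space sketch of that admission arithmetic, not the kernel code itself: plain 64-bit division stands in for div64_64(), a flat array replaces the RCU-protected task_groups list, and struct group_sketch, rt_schedulable() and the numeric budgets are invented names and values for illustration only.

#include <stdint.h>
#include <stdio.h>

#define RUNTIME_INF	((uint64_t)~0ULL)

/* 16-bit fixed-point ratio of runtime to period, in the spirit of to_ratio() */
static unsigned long to_ratio(uint64_t period, uint64_t runtime)
{
	if (runtime == RUNTIME_INF)
		return 1UL << 16;

	return (unsigned long)((runtime << 16) / period);
}

/* hypothetical stand-in for a task_group's RT budget */
struct group_sketch {
	uint64_t rt_runtime;	/* ns per period, or RUNTIME_INF */
};

/*
 * Admission test modelled on __rt_schedulable(): the candidate group's new
 * budget plus every other group's existing budget must stay below the
 * global ratio.  'self' is skipped, mirroring the "tgi == tg" check.
 */
static int rt_schedulable(struct group_sketch *grp, int nr, int self,
			  uint64_t period, uint64_t new_runtime,
			  uint64_t glob_period, int64_t glob_runtime)
{
	unsigned long total = 0;
	unsigned long global = to_ratio(glob_period,
			glob_runtime < 0 ? RUNTIME_INF : (uint64_t)glob_runtime);
	int i;

	for (i = 0; i < nr; i++) {
		if (i == self)
			continue;
		total += to_ratio(period, grp[i].rt_runtime);
	}

	return total + to_ratio(period, new_runtime) < global;
}

int main(void)
{
	/* two groups holding 100ms and 200ms of RT time per 1s period */
	struct group_sketch grp[2] = { { 100000000ULL }, { 200000000ULL } };

	/* may group 0 grow to 500ms, given a 950ms global budget? */
	printf("%s\n", rt_schedulable(grp, 2, 0,
				      1000000000ULL, 500000000ULL,
				      1000000000ULL, 950000000LL)
			? "admitted" : "rejected");
	return 0;
}

Because to_ratio() only cares about the quotient, units cancel within each call; that is why sched_group_set_rt_runtime() can feed nanosecond group budgets into the same check that uses the microsecond sysctls for the global ratio.
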
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 274b40d7bef2..f54792b175b2 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -55,14 +55,14 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se)
55 return !list_empty(&rt_se->run_list); 55 return !list_empty(&rt_se->run_list);
56} 56}
57 57
58#ifdef CONFIG_FAIR_GROUP_SCHED 58#ifdef CONFIG_RT_GROUP_SCHED
59 59
60static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) 60static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
61{ 61{
62 if (!rt_rq->tg) 62 if (!rt_rq->tg)
63 return SCHED_RT_FRAC; 63 return RUNTIME_INF;
64 64
65 return rt_rq->tg->rt_ratio; 65 return rt_rq->tg->rt_runtime;
66} 66}
67 67
68#define for_each_leaf_rt_rq(rt_rq, rq) \ 68#define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -89,7 +89,7 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
89static void enqueue_rt_entity(struct sched_rt_entity *rt_se); 89static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
90static void dequeue_rt_entity(struct sched_rt_entity *rt_se); 90static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
91 91
92static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) 92static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
93{ 93{
94 struct sched_rt_entity *rt_se = rt_rq->rt_se; 94 struct sched_rt_entity *rt_se = rt_rq->rt_se;
95 95
@@ -102,7 +102,7 @@ static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
102 } 102 }
103} 103}
104 104
105static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) 105static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
106{ 106{
107 struct sched_rt_entity *rt_se = rt_rq->rt_se; 107 struct sched_rt_entity *rt_se = rt_rq->rt_se;
108 108
@@ -110,11 +110,31 @@ static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
110 dequeue_rt_entity(rt_se); 110 dequeue_rt_entity(rt_se);
111} 111}
112 112
113static inline int rt_rq_throttled(struct rt_rq *rt_rq)
114{
115 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
116}
117
118static int rt_se_boosted(struct sched_rt_entity *rt_se)
119{
120 struct rt_rq *rt_rq = group_rt_rq(rt_se);
121 struct task_struct *p;
122
123 if (rt_rq)
124 return !!rt_rq->rt_nr_boosted;
125
126 p = rt_task_of(rt_se);
127 return p->prio != p->normal_prio;
128}
129
113#else 130#else
114 131
115static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) 132static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
116{ 133{
117 return sysctl_sched_rt_ratio; 134 if (sysctl_sched_rt_runtime == -1)
135 return RUNTIME_INF;
136
137 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
118} 138}
119 139
120#define for_each_leaf_rt_rq(rt_rq, rq) \ 140#define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -141,19 +161,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
141 return NULL; 161 return NULL;
142} 162}
143 163
144static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) 164static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
145{ 165{
146} 166}
147 167
148static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) 168static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
149{ 169{
150} 170}
151 171
172static inline int rt_rq_throttled(struct rt_rq *rt_rq)
173{
174 return rt_rq->rt_throttled;
175}
152#endif 176#endif
153 177
154static inline int rt_se_prio(struct sched_rt_entity *rt_se) 178static inline int rt_se_prio(struct sched_rt_entity *rt_se)
155{ 179{
156#ifdef CONFIG_FAIR_GROUP_SCHED 180#ifdef CONFIG_RT_GROUP_SCHED
157 struct rt_rq *rt_rq = group_rt_rq(rt_se); 181 struct rt_rq *rt_rq = group_rt_rq(rt_se);
158 182
159 if (rt_rq) 183 if (rt_rq)
@@ -163,28 +187,26 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
163 return rt_task_of(rt_se)->prio; 187 return rt_task_of(rt_se)->prio;
164} 188}
165 189
166static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq) 190static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
167{ 191{
168 unsigned int rt_ratio = sched_rt_ratio(rt_rq); 192 u64 runtime = sched_rt_runtime(rt_rq);
169 u64 period, ratio;
170 193
171 if (rt_ratio == SCHED_RT_FRAC) 194 if (runtime == RUNTIME_INF)
172 return 0; 195 return 0;
173 196
174 if (rt_rq->rt_throttled) 197 if (rt_rq->rt_throttled)
175 return 1; 198 return rt_rq_throttled(rt_rq);
176
177 period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
178 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
179 199
180 if (rt_rq->rt_time > ratio) { 200 if (rt_rq->rt_time > runtime) {
181 struct rq *rq = rq_of_rt_rq(rt_rq); 201 struct rq *rq = rq_of_rt_rq(rt_rq);
182 202
183 rq->rt_throttled = 1; 203 rq->rt_throttled = 1;
184 rt_rq->rt_throttled = 1; 204 rt_rq->rt_throttled = 1;
185 205
186 sched_rt_ratio_dequeue(rt_rq); 206 if (rt_rq_throttled(rt_rq)) {
187 return 1; 207 sched_rt_rq_dequeue(rt_rq);
208 return 1;
209 }
188 } 210 }
189 211
190 return 0; 212 return 0;
@@ -196,17 +218,16 @@ static void update_sched_rt_period(struct rq *rq)
196 u64 period; 218 u64 period;
197 219
198 while (rq->clock > rq->rt_period_expire) { 220 while (rq->clock > rq->rt_period_expire) {
199 period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; 221 period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
200 rq->rt_period_expire += period; 222 rq->rt_period_expire += period;
201 223
202 for_each_leaf_rt_rq(rt_rq, rq) { 224 for_each_leaf_rt_rq(rt_rq, rq) {
203 unsigned long rt_ratio = sched_rt_ratio(rt_rq); 225 u64 runtime = sched_rt_runtime(rt_rq);
204 u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
205 226
206 rt_rq->rt_time -= min(rt_rq->rt_time, ratio); 227 rt_rq->rt_time -= min(rt_rq->rt_time, runtime);
207 if (rt_rq->rt_throttled) { 228 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
208 rt_rq->rt_throttled = 0; 229 rt_rq->rt_throttled = 0;
209 sched_rt_ratio_enqueue(rt_rq); 230 sched_rt_rq_enqueue(rt_rq);
210 } 231 }
211 } 232 }
212 233
@@ -239,12 +260,7 @@ static void update_curr_rt(struct rq *rq)
239 cpuacct_charge(curr, delta_exec); 260 cpuacct_charge(curr, delta_exec);
240 261
241 rt_rq->rt_time += delta_exec; 262 rt_rq->rt_time += delta_exec;
242 /* 263 if (sched_rt_runtime_exceeded(rt_rq))
243 * might make it a tad more accurate:
244 *
245 * update_sched_rt_period(rq);
246 */
247 if (sched_rt_ratio_exceeded(rt_rq))
248 resched_task(curr); 264 resched_task(curr);
249} 265}
250 266
@@ -253,7 +269,7 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
253{ 269{
254 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 270 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
255 rt_rq->rt_nr_running++; 271 rt_rq->rt_nr_running++;
256#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED 272#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
257 if (rt_se_prio(rt_se) < rt_rq->highest_prio) 273 if (rt_se_prio(rt_se) < rt_rq->highest_prio)
258 rt_rq->highest_prio = rt_se_prio(rt_se); 274 rt_rq->highest_prio = rt_se_prio(rt_se);
259#endif 275#endif
@@ -265,6 +281,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
265 281
266 update_rt_migration(rq_of_rt_rq(rt_rq)); 282 update_rt_migration(rq_of_rt_rq(rt_rq));
267#endif 283#endif
284#ifdef CONFIG_RT_GROUP_SCHED
285 if (rt_se_boosted(rt_se))
286 rt_rq->rt_nr_boosted++;
287#endif
268} 288}
269 289
270static inline 290static inline
@@ -273,7 +293,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
273 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 293 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
274 WARN_ON(!rt_rq->rt_nr_running); 294 WARN_ON(!rt_rq->rt_nr_running);
275 rt_rq->rt_nr_running--; 295 rt_rq->rt_nr_running--;
276#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED 296#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
277 if (rt_rq->rt_nr_running) { 297 if (rt_rq->rt_nr_running) {
278 struct rt_prio_array *array; 298 struct rt_prio_array *array;
279 299
@@ -295,6 +315,12 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
295 315
296 update_rt_migration(rq_of_rt_rq(rt_rq)); 316 update_rt_migration(rq_of_rt_rq(rt_rq));
297#endif /* CONFIG_SMP */ 317#endif /* CONFIG_SMP */
318#ifdef CONFIG_RT_GROUP_SCHED
319 if (rt_se_boosted(rt_se))
320 rt_rq->rt_nr_boosted--;
321
322 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
323#endif
298} 324}
299 325
300static void enqueue_rt_entity(struct sched_rt_entity *rt_se) 326static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
@@ -303,7 +329,7 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
303 struct rt_prio_array *array = &rt_rq->active; 329 struct rt_prio_array *array = &rt_rq->active;
304 struct rt_rq *group_rq = group_rt_rq(rt_se); 330 struct rt_rq *group_rq = group_rt_rq(rt_se);
305 331
306 if (group_rq && group_rq->rt_throttled) 332 if (group_rq && rt_rq_throttled(group_rq))
307 return; 333 return;
308 334
309 list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); 335 list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
@@ -496,7 +522,7 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
496 if (unlikely(!rt_rq->rt_nr_running)) 522 if (unlikely(!rt_rq->rt_nr_running))
497 return NULL; 523 return NULL;
498 524
499 if (sched_rt_ratio_exceeded(rt_rq)) 525 if (rt_rq_throttled(rt_rq))
500 return NULL; 526 return NULL;
501 527
502 do { 528 do {
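
In sched_rt.c the throttle now works on absolute time: update_curr_rt() accumulates rt_time, sched_rt_runtime_exceeded() throttles the rt_rq once rt_time passes sched_rt_runtime() (unless a PI-boosted entity keeps it runnable, per rt_rq_throttled()), and update_sched_rt_period() refunds one period's worth of budget. Below is a self-contained sketch of that accounting loop; struct rt_rq_sketch and the helper names are invented, side effects such as dequeueing and re-enqueueing the group entity are left out, and the 950 ms budget is just an example value.

#include <stdint.h>
#include <stdio.h>

#define RUNTIME_INF	((uint64_t)~0ULL)

/* hypothetical flattened view of the rt_rq fields the diff manipulates */
struct rt_rq_sketch {
	uint64_t rt_time;	/* RT execution consumed this period (ns) */
	uint64_t rt_runtime;	/* budget per period (ns), or RUNTIME_INF */
	int	 rt_throttled;
	int	 rt_nr_boosted;	/* PI-boosted entities queued here */
};

/* like rt_rq_throttled(): a boosted entity keeps running even over budget */
static int throttled(struct rt_rq_sketch *rt_rq)
{
	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
}

/* like sched_rt_runtime_exceeded(), minus the dequeue of the group entity */
static int runtime_exceeded(struct rt_rq_sketch *rt_rq, uint64_t delta_exec)
{
	rt_rq->rt_time += delta_exec;

	if (rt_rq->rt_runtime == RUNTIME_INF)
		return 0;
	if (rt_rq->rt_time > rt_rq->rt_runtime)
		rt_rq->rt_throttled = 1;

	return throttled(rt_rq);
}

/* like update_sched_rt_period(): refund one period's worth of runtime */
static void period_tick(struct rt_rq_sketch *rt_rq)
{
	uint64_t runtime = rt_rq->rt_runtime;
	uint64_t pay = rt_rq->rt_time < runtime ? rt_rq->rt_time : runtime;

	rt_rq->rt_time -= pay;
	if (rt_rq->rt_throttled && rt_rq->rt_time < runtime)
		rt_rq->rt_throttled = 0;
}

int main(void)
{
	struct rt_rq_sketch rq = { 0, 950000000ULL, 0, 0 };	/* 950ms/period */

	printf("after 400ms: throttled=%d\n", runtime_exceeded(&rq, 400000000ULL));
	printf("after 600ms more: throttled=%d\n", runtime_exceeded(&rq, 600000000ULL));
	period_tick(&rq);
	printf("after period refresh: throttled=%d\n", throttled(&rq));
	return 0;
}
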
diff --git a/kernel/signal.c b/kernel/signal.c
index 2c1f08defac2..84917fe507f7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -972,7 +972,7 @@ void zap_other_threads(struct task_struct *p)
972 } 972 }
973} 973}
974 974
975int fastcall __fatal_signal_pending(struct task_struct *tsk) 975int __fatal_signal_pending(struct task_struct *tsk)
976{ 976{
977 return sigismember(&tsk->pending.signal, SIGKILL); 977 return sigismember(&tsk->pending.signal, SIGKILL);
978} 978}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d41ef6b4cf72..8b7e95411795 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -311,22 +311,6 @@ static struct ctl_table kern_table[] = {
311 .mode = 0644, 311 .mode = 0644,
312 .proc_handler = &proc_dointvec, 312 .proc_handler = &proc_dointvec,
313 }, 313 },
314 {
315 .ctl_name = CTL_UNNUMBERED,
316 .procname = "sched_rt_period_ms",
317 .data = &sysctl_sched_rt_period,
318 .maxlen = sizeof(unsigned int),
319 .mode = 0644,
320 .proc_handler = &proc_dointvec,
321 },
322 {
323 .ctl_name = CTL_UNNUMBERED,
324 .procname = "sched_rt_ratio",
325 .data = &sysctl_sched_rt_ratio,
326 .maxlen = sizeof(unsigned int),
327 .mode = 0644,
328 .proc_handler = &proc_dointvec,
329 },
330#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) 314#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
331 { 315 {
332 .ctl_name = CTL_UNNUMBERED, 316 .ctl_name = CTL_UNNUMBERED,
@@ -348,6 +332,22 @@ static struct ctl_table kern_table[] = {
348#endif 332#endif
349 { 333 {
350 .ctl_name = CTL_UNNUMBERED, 334 .ctl_name = CTL_UNNUMBERED,
335 .procname = "sched_rt_period_us",
336 .data = &sysctl_sched_rt_period,
337 .maxlen = sizeof(unsigned int),
338 .mode = 0644,
339 .proc_handler = &proc_dointvec,
340 },
341 {
342 .ctl_name = CTL_UNNUMBERED,
343 .procname = "sched_rt_runtime_us",
344 .data = &sysctl_sched_rt_runtime,
345 .maxlen = sizeof(int),
346 .mode = 0644,
347 .proc_handler = &proc_dointvec,
348 },
349 {
350 .ctl_name = CTL_UNNUMBERED,
351 .procname = "sched_compat_yield", 351 .procname = "sched_compat_yield",
352 .data = &sysctl_sched_compat_yield, 352 .data = &sysctl_sched_compat_yield,
353 .maxlen = sizeof(unsigned int), 353 .maxlen = sizeof(unsigned int),
@@ -978,8 +978,8 @@ static struct ctl_table vm_table[] = {
978 { 978 {
979 .ctl_name = CTL_UNNUMBERED, 979 .ctl_name = CTL_UNNUMBERED,
980 .procname = "nr_overcommit_hugepages", 980 .procname = "nr_overcommit_hugepages",
981 .data = &nr_overcommit_huge_pages, 981 .data = &sysctl_overcommit_huge_pages,
982 .maxlen = sizeof(nr_overcommit_huge_pages), 982 .maxlen = sizeof(sysctl_overcommit_huge_pages),
983 .mode = 0644, 983 .mode = 0644,
984 .proc_handler = &hugetlb_overcommit_handler, 984 .proc_handler = &hugetlb_overcommit_handler,
985 }, 985 },
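
The sysctl hunk swaps the old sched_rt_period_ms/sched_rt_ratio knobs for sched_rt_period_us and sched_rt_runtime_us, both plain integers handled by proc_dointvec, with a negative runtime meaning unlimited (see sched_rt_runtime() above). For reference, a new CTL_UNNUMBERED integer knob of this era follows the same shape; the variable and procname below are hypothetical and not part of this patch.

#include <linux/sysctl.h>

/* hypothetical tunable, surfacing as /proc/sys/kernel/my_budget_us */
static int sysctl_my_budget_us = 500000;

static struct ctl_table my_kern_table[] = {
	{
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "my_budget_us",
		.data		= &sysctl_my_budget_us,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }	/* table terminator; hooked up at init time,
				 * e.g. via register_sysctl_table() (not shown) */
};
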
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
index 62b1287932ed..41468035473c 100644
--- a/kernel/timeconst.pl
+++ b/kernel/timeconst.pl
@@ -339,7 +339,7 @@ sub output($@)
339 print "\n"; 339 print "\n";
340 340
341 foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ', 341 foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ',
342 'USEC_TO_HZ','HZ_TO_USEC') { 342 'HZ_TO_USEC','USEC_TO_HZ') {
343 foreach $bit (32, 64) { 343 foreach $bit (32, 64) {
344 foreach $suf ('MUL', 'ADJ', 'SHR') { 344 foreach $suf ('MUL', 'ADJ', 'SHR') {
345 printf "#define %-23s %s\n", 345 printf "#define %-23s %s\n",
diff --git a/kernel/user.c b/kernel/user.c
index 7d7900c5a1fd..7132022a040c 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -57,7 +57,7 @@ struct user_struct root_user = {
57 .uid_keyring = &root_user_keyring, 57 .uid_keyring = &root_user_keyring,
58 .session_keyring = &root_session_keyring, 58 .session_keyring = &root_session_keyring,
59#endif 59#endif
60#ifdef CONFIG_FAIR_USER_SCHED 60#ifdef CONFIG_USER_SCHED
61 .tg = &init_task_group, 61 .tg = &init_task_group,
62#endif 62#endif
63}; 63};
@@ -90,7 +90,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
90 return NULL; 90 return NULL;
91} 91}
92 92
93#ifdef CONFIG_FAIR_USER_SCHED 93#ifdef CONFIG_USER_SCHED
94 94
95static void sched_destroy_user(struct user_struct *up) 95static void sched_destroy_user(struct user_struct *up)
96{ 96{
@@ -113,15 +113,15 @@ static void sched_switch_user(struct task_struct *p)
113 sched_move_task(p); 113 sched_move_task(p);
114} 114}
115 115
116#else /* CONFIG_FAIR_USER_SCHED */ 116#else /* CONFIG_USER_SCHED */
117 117
118static void sched_destroy_user(struct user_struct *up) { } 118static void sched_destroy_user(struct user_struct *up) { }
119static int sched_create_user(struct user_struct *up) { return 0; } 119static int sched_create_user(struct user_struct *up) { return 0; }
120static void sched_switch_user(struct task_struct *p) { } 120static void sched_switch_user(struct task_struct *p) { }
121 121
122#endif /* CONFIG_FAIR_USER_SCHED */ 122#endif /* CONFIG_USER_SCHED */
123 123
124#if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS) 124#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
125 125
126static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ 126static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
127static DEFINE_MUTEX(uids_mutex); 127static DEFINE_MUTEX(uids_mutex);
@@ -137,6 +137,7 @@ static inline void uids_mutex_unlock(void)
137} 137}
138 138
139/* uid directory attributes */ 139/* uid directory attributes */
140#ifdef CONFIG_FAIR_GROUP_SCHED
140static ssize_t cpu_shares_show(struct kobject *kobj, 141static ssize_t cpu_shares_show(struct kobject *kobj,
141 struct kobj_attribute *attr, 142 struct kobj_attribute *attr,
142 char *buf) 143 char *buf)
@@ -163,10 +164,45 @@ static ssize_t cpu_shares_store(struct kobject *kobj,
163 164
164static struct kobj_attribute cpu_share_attr = 165static struct kobj_attribute cpu_share_attr =
165 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store); 166 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
167#endif
168
169#ifdef CONFIG_RT_GROUP_SCHED
170static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
171 struct kobj_attribute *attr,
172 char *buf)
173{
174 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
175
176 return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg));
177}
178
179static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
180 struct kobj_attribute *attr,
181 const char *buf, size_t size)
182{
183 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
184 unsigned long rt_runtime;
185 int rc;
186
187 sscanf(buf, "%lu", &rt_runtime);
188
189 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
190
191 return (rc ? rc : size);
192}
193
194static struct kobj_attribute cpu_rt_runtime_attr =
195 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
196#endif
166 197
167/* default attributes per uid directory */ 198/* default attributes per uid directory */
168static struct attribute *uids_attributes[] = { 199static struct attribute *uids_attributes[] = {
200#ifdef CONFIG_FAIR_GROUP_SCHED
169 &cpu_share_attr.attr, 201 &cpu_share_attr.attr,
202#endif
203#ifdef CONFIG_RT_GROUP_SCHED
204 &cpu_rt_runtime_attr.attr,
205#endif
170 NULL 206 NULL
171}; 207};
172 208
@@ -269,7 +305,7 @@ static inline void free_user(struct user_struct *up, unsigned long flags)
269 schedule_work(&up->work); 305 schedule_work(&up->work);
270} 306}
271 307
272#else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */ 308#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
273 309
274int uids_sysfs_init(void) { return 0; } 310int uids_sysfs_init(void) { return 0; }
275static inline int uids_user_create(struct user_struct *up) { return 0; } 311static inline int uids_user_create(struct user_struct *up) { return 0; }
@@ -373,7 +409,7 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
373 spin_lock_irq(&uidhash_lock); 409 spin_lock_irq(&uidhash_lock);
374 up = uid_hash_find(uid, hashent); 410 up = uid_hash_find(uid, hashent);
375 if (up) { 411 if (up) {
376 /* This case is not possible when CONFIG_FAIR_USER_SCHED 412 /* This case is not possible when CONFIG_USER_SCHED
377 * is defined, since we serialize alloc_uid() using 413 * is defined, since we serialize alloc_uid() using
378 * uids_mutex. Hence no need to call 414 * uids_mutex. Hence no need to call
379 * sched_destroy_user() or remove_user_sysfs_dir(). 415 * sched_destroy_user() or remove_user_sysfs_dir().
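
Finally, the user.c hunks add a per-uid cpu_rt_runtime attribute alongside cpu_share in the /sys/kernel/uids/ directory mentioned in the comment above; its store handler parses the value with sscanf("%lu") and forwards it to sched_group_set_rt_runtime(). A hedged user-space sketch of exercising that file follows — the uid, the value written, and the assumption that the per-uid directory already exists are illustrative only, and writing requires root.

#include <stdio.h>

int main(void)
{
	/* assumes CONFIG_RT_GROUP_SCHED and that a task of uid 1000 exists,
	 * so user.c has created /sys/kernel/uids/1000/ */
	const char *path = "/sys/kernel/uids/1000/cpu_rt_runtime";
	char buf[64];
	FILE *f;

	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("current rt runtime: %s", buf);
	fclose(f);

	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "500000\n");	/* arbitrary example budget */
	fclose(f);

	return 0;
}
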