aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2010-03-01 03:28:53 -0500
committerIngo Molnar <mingo@elte.hu>2010-03-01 03:28:58 -0500
commite2f4699ac15fe36de1288505bc6e6e5a8603ab1b (patch)
tree8078d3ff21eaa0a0ed6e446ac94f3681e831cad1 /kernel
parent1883c79a57a5fe25309007590cccb1b2782c41b2 (diff)
parent30ff056c42c665b9ea535d8515890857ae382540 (diff)
Merge branch 'linus' into core/rcu
Merge reason: Backmerge latest upstream to queue up dependent fix in the scheduler. Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/kprobes.c34
-rw-r--r--kernel/ksysfs.c8
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/padata.c690
-rw-r--r--kernel/perf_event.c627
-rw-r--r--kernel/power/Kconfig19
-rw-r--r--kernel/power/main.c31
-rw-r--r--kernel/power/snapshot.c4
-rw-r--r--kernel/power/swap.c4
-rw-r--r--kernel/power/swsusp.c58
-rw-r--r--kernel/power/user.c23
-rw-r--r--kernel/ptrace.c88
-rw-r--r--kernel/resource.c57
-rw-r--r--kernel/sched.c2197
-rw-r--r--kernel/sched_cpupri.c4
-rw-r--r--kernel/sched_fair.c1699
-rw-r--r--kernel/sched_idletask.c23
-rw-r--r--kernel/sched_rt.c54
-rw-r--r--kernel/smp.c8
-rw-r--r--kernel/sys.c5
-rw-r--r--kernel/trace/Kconfig11
-rw-r--r--kernel/trace/Makefile4
-rw-r--r--kernel/trace/ftrace.c105
-rw-r--r--kernel/trace/trace.c144
-rw-r--r--kernel/trace/trace.h6
-rw-r--r--kernel/trace/trace_branch.c19
-rw-r--r--kernel/trace/trace_event_profile.c52
-rw-r--r--kernel/trace/trace_events.c81
-rw-r--r--kernel/trace/trace_events_filter.c4
-rw-r--r--kernel/trace/trace_export.c87
-rw-r--r--kernel/trace/trace_functions_graph.c78
-rw-r--r--kernel/trace/trace_kprobe.c304
-rw-r--r--kernel/trace/trace_syscalls.c189
-rw-r--r--kernel/user.c305
35 files changed, 3685 insertions, 3340 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 864ff75d65f2..6aebdeb2aa34 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
100obj-$(CONFIG_PERF_EVENTS) += perf_event.o 100obj-$(CONFIG_PERF_EVENTS) += perf_event.o
101obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 101obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
102obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 102obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
103obj-$(CONFIG_PADATA) += padata.o
103 104
104ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 105ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
105# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 106# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b7df302a0204..ccec774c716d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -44,6 +44,7 @@
44#include <linux/debugfs.h> 44#include <linux/debugfs.h>
45#include <linux/kdebug.h> 45#include <linux/kdebug.h>
46#include <linux/memory.h> 46#include <linux/memory.h>
47#include <linux/ftrace.h>
47 48
48#include <asm-generic/sections.h> 49#include <asm-generic/sections.h>
49#include <asm/cacheflush.h> 50#include <asm/cacheflush.h>
@@ -93,6 +94,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
93 {"native_get_debugreg",}, 94 {"native_get_debugreg",},
94 {"irq_entries_start",}, 95 {"irq_entries_start",},
95 {"common_interrupt",}, 96 {"common_interrupt",},
97 {"mcount",}, /* mcount can be called from everywhere */
96 {NULL} /* Terminator */ 98 {NULL} /* Terminator */
97}; 99};
98 100
@@ -124,30 +126,6 @@ static LIST_HEAD(kprobe_insn_pages);
124static int kprobe_garbage_slots; 126static int kprobe_garbage_slots;
125static int collect_garbage_slots(void); 127static int collect_garbage_slots(void);
126 128
127static int __kprobes check_safety(void)
128{
129 int ret = 0;
130#if defined(CONFIG_PREEMPT) && defined(CONFIG_FREEZER)
131 ret = freeze_processes();
132 if (ret == 0) {
133 struct task_struct *p, *q;
134 do_each_thread(p, q) {
135 if (p != current && p->state == TASK_RUNNING &&
136 p->pid != 0) {
137 printk("Check failed: %s is running\n",p->comm);
138 ret = -1;
139 goto loop_end;
140 }
141 } while_each_thread(p, q);
142 }
143loop_end:
144 thaw_processes();
145#else
146 synchronize_sched();
147#endif
148 return ret;
149}
150
151/** 129/**
152 * __get_insn_slot() - Find a slot on an executable page for an instruction. 130 * __get_insn_slot() - Find a slot on an executable page for an instruction.
153 * We allocate an executable page if there's no room on existing ones. 131 * We allocate an executable page if there's no room on existing ones.
@@ -235,9 +213,8 @@ static int __kprobes collect_garbage_slots(void)
235{ 213{
236 struct kprobe_insn_page *kip, *next; 214 struct kprobe_insn_page *kip, *next;
237 215
238 /* Ensure no-one is preepmted on the garbages */ 216 /* Ensure no-one is interrupted on the garbages */
239 if (check_safety()) 217 synchronize_sched();
240 return -EAGAIN;
241 218
242 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { 219 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) {
243 int i; 220 int i;
@@ -728,7 +705,8 @@ int __kprobes register_kprobe(struct kprobe *p)
728 705
729 preempt_disable(); 706 preempt_disable();
730 if (!kernel_text_address((unsigned long) p->addr) || 707 if (!kernel_text_address((unsigned long) p->addr) ||
731 in_kprobes_functions((unsigned long) p->addr)) { 708 in_kprobes_functions((unsigned long) p->addr) ||
709 ftrace_text_reserved(p->addr, p->addr)) {
732 preempt_enable(); 710 preempt_enable();
733 return -EINVAL; 711 return -EINVAL;
734 } 712 }
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 3feaf5a74514..6b1ccc3f0205 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -197,16 +197,8 @@ static int __init ksysfs_init(void)
197 goto group_exit; 197 goto group_exit;
198 } 198 }
199 199
200 /* create the /sys/kernel/uids/ directory */
201 error = uids_sysfs_init();
202 if (error)
203 goto notes_exit;
204
205 return 0; 200 return 0;
206 201
207notes_exit:
208 if (notes_size > 0)
209 sysfs_remove_bin_file(kernel_kobj, &notes_attr);
210group_exit: 202group_exit:
211 sysfs_remove_group(kernel_kobj, &kernel_attr_group); 203 sysfs_remove_group(kernel_kobj, &kernel_attr_group);
212kset_exit: 204kset_exit:
diff --git a/kernel/kthread.c b/kernel/kthread.c
index fbb6222fe7e0..82ed0ea15194 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -101,7 +101,7 @@ static void create_kthread(struct kthread_create_info *create)
101 * 101 *
102 * Description: This helper function creates and names a kernel 102 * Description: This helper function creates and names a kernel
103 * thread. The thread will be stopped: use wake_up_process() to start 103 * thread. The thread will be stopped: use wake_up_process() to start
104 * it. See also kthread_run(), kthread_create_on_cpu(). 104 * it. See also kthread_run().
105 * 105 *
106 * When woken, the thread will run @threadfn() with @data as its 106 * When woken, the thread will run @threadfn() with @data as its
107 * argument. @threadfn() can either call do_exit() directly if it is a 107 * argument. @threadfn() can either call do_exit() directly if it is a
diff --git a/kernel/padata.c b/kernel/padata.c
new file mode 100644
index 000000000000..6f9bcb8313d6
--- /dev/null
+++ b/kernel/padata.c
@@ -0,0 +1,690 @@
1/*
2 * padata.c - generic interface to process data streams in parallel
3 *
4 * Copyright (C) 2008, 2009 secunet Security Networks AG
5 * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
19 */
20
21#include <linux/module.h>
22#include <linux/cpumask.h>
23#include <linux/err.h>
24#include <linux/cpu.h>
25#include <linux/padata.h>
26#include <linux/mutex.h>
27#include <linux/sched.h>
28#include <linux/rcupdate.h>
29
/*
 * Both limits are parenthesized so they behave as a single value inside
 * larger expressions: without the parentheses "MAX_SEQ_NR / n" would
 * expand to "INT_MAX - NR_CPUS / n".
 */
#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
#define MAX_OBJ_NUM (10000 * NR_CPUS)
32
33static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
34{
35 int cpu, target_cpu;
36
37 target_cpu = cpumask_first(pd->cpumask);
38 for (cpu = 0; cpu < cpu_index; cpu++)
39 target_cpu = cpumask_next(target_cpu, pd->cpumask);
40
41 return target_cpu;
42}
43
44static int padata_cpu_hash(struct padata_priv *padata)
45{
46 int cpu_index;
47 struct parallel_data *pd;
48
49 pd = padata->pd;
50
51 /*
52 * Hash the sequence numbers to the cpus by taking
53 * seq_nr mod. number of cpus in use.
54 */
55 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask);
56
57 return padata_index_to_cpu(pd, cpu_index);
58}
59
60static void padata_parallel_worker(struct work_struct *work)
61{
62 struct padata_queue *queue;
63 struct parallel_data *pd;
64 struct padata_instance *pinst;
65 LIST_HEAD(local_list);
66
67 local_bh_disable();
68 queue = container_of(work, struct padata_queue, pwork);
69 pd = queue->pd;
70 pinst = pd->pinst;
71
72 spin_lock(&queue->parallel.lock);
73 list_replace_init(&queue->parallel.list, &local_list);
74 spin_unlock(&queue->parallel.lock);
75
76 while (!list_empty(&local_list)) {
77 struct padata_priv *padata;
78
79 padata = list_entry(local_list.next,
80 struct padata_priv, list);
81
82 list_del_init(&padata->list);
83
84 padata->parallel(padata);
85 }
86
87 local_bh_enable();
88}
89
90/*
91 * padata_do_parallel - padata parallelization function
92 *
93 * @pinst: padata instance
94 * @padata: object to be parallelized
95 * @cb_cpu: cpu the serialization callback function will run on,
96 * must be in the cpumask of padata.
97 *
98 * The parallelization callback function will run with BHs off.
99 * Note: Every object which is parallelized by padata_do_parallel
100 * must be seen by padata_do_serial.
101 */
102int padata_do_parallel(struct padata_instance *pinst,
103 struct padata_priv *padata, int cb_cpu)
104{
105 int target_cpu, err;
106 struct padata_queue *queue;
107 struct parallel_data *pd;
108
109 rcu_read_lock_bh();
110
111 pd = rcu_dereference(pinst->pd);
112
113 err = 0;
114 if (!(pinst->flags & PADATA_INIT))
115 goto out;
116
117 err = -EBUSY;
118 if ((pinst->flags & PADATA_RESET))
119 goto out;
120
121 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
122 goto out;
123
124 err = -EINVAL;
125 if (!cpumask_test_cpu(cb_cpu, pd->cpumask))
126 goto out;
127
128 err = -EINPROGRESS;
129 atomic_inc(&pd->refcnt);
130 padata->pd = pd;
131 padata->cb_cpu = cb_cpu;
132
133 if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr))
134 atomic_set(&pd->seq_nr, -1);
135
136 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
137
138 target_cpu = padata_cpu_hash(padata);
139 queue = per_cpu_ptr(pd->queue, target_cpu);
140
141 spin_lock(&queue->parallel.lock);
142 list_add_tail(&padata->list, &queue->parallel.list);
143 spin_unlock(&queue->parallel.lock);
144
145 queue_work_on(target_cpu, pinst->wq, &queue->pwork);
146
147out:
148 rcu_read_unlock_bh();
149
150 return err;
151}
152EXPORT_SYMBOL(padata_do_parallel);
153
/*
 * padata_get_next - pick the next object to pass on to serialization
 *
 * @pd: parallel_data whose per-cpu reorder lists are inspected
 *
 * For every cpu in pd->cpumask the sequence number expected next on that
 * queue is computed, and the queue with the lowest such number is selected
 * (a queue that has not wrapped around wins over one that has).
 *
 * Returns
 *   - the object removed from the selected queue's reorder list, if that
 *     list is not empty;
 *   - NULL if the reorder lists of all cpus are empty;
 *   - ERR_PTR(-ENODATA) if the selected list is empty and the selected
 *     sequence number modulo the cpu count equals that queue's cpu_index;
 *   - ERR_PTR(-EINPROGRESS) otherwise.
 */
154static struct padata_priv *padata_get_next(struct parallel_data *pd)
155{
156 int cpu, num_cpus, empty, calc_seq_nr;
157 int seq_nr, next_nr, overrun, next_overrun;
158 struct padata_queue *queue, *next_queue;
159 struct padata_priv *padata;
160 struct padata_list *reorder;
161
162 empty = 0;
/* next_nr < 0 means "no candidate queue selected yet" */
163 next_nr = -1;
164 next_overrun = 0;
165 next_queue = NULL;
166
167 num_cpus = cpumask_weight(pd->cpumask);
168
169 for_each_cpu(cpu, pd->cpumask) {
170 queue = per_cpu_ptr(pd->queue, cpu);
171 reorder = &queue->reorder;
172
173 /*
174 * Calculate the seq_nr of the object that should be
175 * next in this queue.
176 */
177 overrun = 0;
178 calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
179 + queue->cpu_index;
180
/* Past the highest sequence number: wrap around and remember it */
181 if (unlikely(calc_seq_nr > pd->max_seq_nr)) {
182 calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1;
183 overrun = 1;
184 }
185
186 if (!list_empty(&reorder->list)) {
187 padata = list_entry(reorder->list.next,
188 struct padata_priv, list);
189
190 seq_nr = padata->seq_nr;
/* The head of the list must carry exactly the expected number */
191 BUG_ON(calc_seq_nr != seq_nr);
192 } else {
/* Nothing queued here: use the expected number and count the empty list */
193 seq_nr = calc_seq_nr;
194 empty++;
195 }
196
/* Keep the lowest number seen; prefer a queue that has not wrapped */
197 if (next_nr < 0 || seq_nr < next_nr
198 || (next_overrun && !overrun)) {
199 next_nr = seq_nr;
200 next_overrun = overrun;
201 next_queue = queue;
202 }
203 }
204
205 padata = NULL;
206
/* All reorder lists are empty: nothing to hand out */
207 if (empty == num_cpus)
208 goto out;
209
210 reorder = &next_queue->reorder;
211
212 if (!list_empty(&reorder->list)) {
213 padata = list_entry(reorder->list.next,
214 struct padata_priv, list);
215
/* The chosen object is past the wrap-around: restart all per-queue counters */
216 if (unlikely(next_overrun)) {
217 for_each_cpu(cpu, pd->cpumask) {
218 queue = per_cpu_ptr(pd->queue, cpu);
219 atomic_set(&queue->num_obj, 0);
220 }
221 }
222
223 spin_lock(&reorder->lock);
224 list_del_init(&padata->list);
225 atomic_dec(&pd->reorder_objects);
226 spin_unlock(&reorder->lock);
227
/* One more object has left this queue; this feeds the next calc_seq_nr */
228 atomic_inc(&next_queue->num_obj);
229
230 goto out;
231 }
232
/* The selected queue is empty; tell the caller which kind of "not yet" this is */
233 if (next_nr % num_cpus == next_queue->cpu_index) {
234 padata = ERR_PTR(-ENODATA);
235 goto out;
236 }
237
238 padata = ERR_PTR(-EINPROGRESS);
239out:
240 return padata;
241}
242
/*
 * padata_reorder - move objects that are next in sequence to the serial stage
 *
 * @pd: parallel_data to process
 *
 * Only one context works on a parallel_data at a time: if pd->lock cannot
 * be taken the function returns immediately. Each object returned by
 * padata_get_next() is appended to the serial list of its callback cpu and
 * the serial work is queued on that cpu.
 */
243static void padata_reorder(struct parallel_data *pd)
244{
245 struct padata_priv *padata;
246 struct padata_queue *queue;
247 struct padata_instance *pinst = pd->pinst;
248
249try_again:
/* Somebody else holds the lock and is already reordering: leave it to them */
250 if (!spin_trylock_bh(&pd->lock))
251 goto out;
252
253 while (1) {
254 padata = padata_get_next(pd);
255
/* NULL or -EINPROGRESS: stop here, then re-check reorder_objects below */
256 if (!padata || PTR_ERR(padata) == -EINPROGRESS)
257 break;
258
/* -ENODATA: drop the lock and return without retrying */
259 if (PTR_ERR(padata) == -ENODATA) {
260 spin_unlock_bh(&pd->lock);
261 goto out;
262 }
263
264 queue = per_cpu_ptr(pd->queue, padata->cb_cpu);
265
266 spin_lock(&queue->serial.lock);
267 list_add_tail(&padata->list, &queue->serial.list);
268 spin_unlock(&queue->serial.lock);
269
270 queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork);
271 }
272
273 spin_unlock_bh(&pd->lock);
274
/* Objects are still counted as waiting for reorder: try once more */
275 if (atomic_read(&pd->reorder_objects))
276 goto try_again;
277
278out:
279 return;
280}
281
282static void padata_serial_worker(struct work_struct *work)
283{
284 struct padata_queue *queue;
285 struct parallel_data *pd;
286 LIST_HEAD(local_list);
287
288 local_bh_disable();
289 queue = container_of(work, struct padata_queue, swork);
290 pd = queue->pd;
291
292 spin_lock(&queue->serial.lock);
293 list_replace_init(&queue->serial.list, &local_list);
294 spin_unlock(&queue->serial.lock);
295
296 while (!list_empty(&local_list)) {
297 struct padata_priv *padata;
298
299 padata = list_entry(local_list.next,
300 struct padata_priv, list);
301
302 list_del_init(&padata->list);
303
304 padata->serial(padata);
305 atomic_dec(&pd->refcnt);
306 }
307 local_bh_enable();
308}
309
310/*
311 * padata_do_serial - padata serialization function
312 *
313 * @padata: object to be serialized.
314 *
315 * padata_do_serial must be called for every parallelized object.
316 * The serialization callback function will run with BHs off.
317 */
318void padata_do_serial(struct padata_priv *padata)
319{
320 int cpu;
321 struct padata_queue *queue;
322 struct parallel_data *pd;
323
324 pd = padata->pd;
325
326 cpu = get_cpu();
327 queue = per_cpu_ptr(pd->queue, cpu);
328
329 spin_lock(&queue->reorder.lock);
330 atomic_inc(&pd->reorder_objects);
331 list_add_tail(&padata->list, &queue->reorder.list);
332 spin_unlock(&queue->reorder.lock);
333
334 put_cpu();
335
336 padata_reorder(pd);
337}
338EXPORT_SYMBOL(padata_do_serial);
339
340static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
341 const struct cpumask *cpumask)
342{
343 int cpu, cpu_index, num_cpus;
344 struct padata_queue *queue;
345 struct parallel_data *pd;
346
347 cpu_index = 0;
348
349 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
350 if (!pd)
351 goto err;
352
353 pd->queue = alloc_percpu(struct padata_queue);
354 if (!pd->queue)
355 goto err_free_pd;
356
357 if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL))
358 goto err_free_queue;
359
360 for_each_possible_cpu(cpu) {
361 queue = per_cpu_ptr(pd->queue, cpu);
362
363 queue->pd = pd;
364
365 if (cpumask_test_cpu(cpu, cpumask)
366 && cpumask_test_cpu(cpu, cpu_active_mask)) {
367 queue->cpu_index = cpu_index;
368 cpu_index++;
369 } else
370 queue->cpu_index = -1;
371
372 INIT_LIST_HEAD(&queue->reorder.list);
373 INIT_LIST_HEAD(&queue->parallel.list);
374 INIT_LIST_HEAD(&queue->serial.list);
375 spin_lock_init(&queue->reorder.lock);
376 spin_lock_init(&queue->parallel.lock);
377 spin_lock_init(&queue->serial.lock);
378
379 INIT_WORK(&queue->pwork, padata_parallel_worker);
380 INIT_WORK(&queue->swork, padata_serial_worker);
381 atomic_set(&queue->num_obj, 0);
382 }
383
384 cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
385
386 num_cpus = cpumask_weight(pd->cpumask);
387 pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1;
388
389 atomic_set(&pd->seq_nr, -1);
390 atomic_set(&pd->reorder_objects, 0);
391 atomic_set(&pd->refcnt, 0);
392 pd->pinst = pinst;
393 spin_lock_init(&pd->lock);
394
395 return pd;
396
397err_free_queue:
398 free_percpu(pd->queue);
399err_free_pd:
400 kfree(pd);
401err:
402 return NULL;
403}
404
405static void padata_free_pd(struct parallel_data *pd)
406{
407 free_cpumask_var(pd->cpumask);
408 free_percpu(pd->queue);
409 kfree(pd);
410}
411
/*
 * padata_replace - install a new parallel_data and dispose of the old one
 *
 * @pinst: padata instance to update
 * @pd_new: replacement parallel_data
 *
 * PADATA_RESET is set for the duration of the switch; padata_do_parallel()
 * returns -EBUSY while it is set. The old parallel_data is freed only after
 * its refcnt has dropped to zero and the workqueue has been flushed.
 */
412static void padata_replace(struct padata_instance *pinst,
413 struct parallel_data *pd_new)
414{
415 struct parallel_data *pd_old = pinst->pd;
416
417 pinst->flags |= PADATA_RESET;
418
419 rcu_assign_pointer(pinst->pd, pd_new);
420
/*
 * NOTE(review): the reader in padata_do_parallel() uses
 * rcu_read_lock_bh(); confirm that synchronize_rcu() waits for that
 * kind of read-side section.
 */
421 synchronize_rcu();
422
/* Wait until every object still accounted to the old pd has been serialized */
423 while (atomic_read(&pd_old->refcnt) != 0)
424 yield();
425
426 flush_workqueue(pinst->wq);
427
428 padata_free_pd(pd_old);
429
430 pinst->flags &= ~PADATA_RESET;
431}
432
433/*
434 * padata_set_cpumask - set the cpumask that padata should use
435 *
436 * @pinst: padata instance
437 * @cpumask: the cpumask to use
438 */
439int padata_set_cpumask(struct padata_instance *pinst,
440 cpumask_var_t cpumask)
441{
442 struct parallel_data *pd;
443 int err = 0;
444
445 might_sleep();
446
447 mutex_lock(&pinst->lock);
448
449 pd = padata_alloc_pd(pinst, cpumask);
450 if (!pd) {
451 err = -ENOMEM;
452 goto out;
453 }
454
455 cpumask_copy(pinst->cpumask, cpumask);
456
457 padata_replace(pinst, pd);
458
459out:
460 mutex_unlock(&pinst->lock);
461
462 return err;
463}
464EXPORT_SYMBOL(padata_set_cpumask);
465
466static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
467{
468 struct parallel_data *pd;
469
470 if (cpumask_test_cpu(cpu, cpu_active_mask)) {
471 pd = padata_alloc_pd(pinst, pinst->cpumask);
472 if (!pd)
473 return -ENOMEM;
474
475 padata_replace(pinst, pd);
476 }
477
478 return 0;
479}
480
481/*
482 * padata_add_cpu - add a cpu to the padata cpumask
483 *
484 * @pinst: padata instance
485 * @cpu: cpu to add
486 */
487int padata_add_cpu(struct padata_instance *pinst, int cpu)
488{
489 int err;
490
491 might_sleep();
492
493 mutex_lock(&pinst->lock);
494
495 cpumask_set_cpu(cpu, pinst->cpumask);
496 err = __padata_add_cpu(pinst, cpu);
497
498 mutex_unlock(&pinst->lock);
499
500 return err;
501}
502EXPORT_SYMBOL(padata_add_cpu);
503
504static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
505{
506 struct parallel_data *pd;
507
508 if (cpumask_test_cpu(cpu, cpu_online_mask)) {
509 pd = padata_alloc_pd(pinst, pinst->cpumask);
510 if (!pd)
511 return -ENOMEM;
512
513 padata_replace(pinst, pd);
514 }
515
516 return 0;
517}
518
519/*
520 * padata_remove_cpu - remove a cpu from the padata cpumask
521 *
522 * @pinst: padata instance
523 * @cpu: cpu to remove
524 */
525int padata_remove_cpu(struct padata_instance *pinst, int cpu)
526{
527 int err;
528
529 might_sleep();
530
531 mutex_lock(&pinst->lock);
532
533 cpumask_clear_cpu(cpu, pinst->cpumask);
534 err = __padata_remove_cpu(pinst, cpu);
535
536 mutex_unlock(&pinst->lock);
537
538 return err;
539}
540EXPORT_SYMBOL(padata_remove_cpu);
541
542/*
543 * padata_start - start the parallel processing
544 *
545 * @pinst: padata instance to start
546 */
547void padata_start(struct padata_instance *pinst)
548{
549 might_sleep();
550
551 mutex_lock(&pinst->lock);
552 pinst->flags |= PADATA_INIT;
553 mutex_unlock(&pinst->lock);
554}
555EXPORT_SYMBOL(padata_start);
556
557/*
558 * padata_stop - stop the parallel processing
559 *
560 * @pinst: padata instance to stop
561 */
562void padata_stop(struct padata_instance *pinst)
563{
564 might_sleep();
565
566 mutex_lock(&pinst->lock);
567 pinst->flags &= ~PADATA_INIT;
568 mutex_unlock(&pinst->lock);
569}
570EXPORT_SYMBOL(padata_stop);
571
572static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
573 unsigned long action, void *hcpu)
574{
575 int err;
576 struct padata_instance *pinst;
577 int cpu = (unsigned long)hcpu;
578
579 pinst = container_of(nfb, struct padata_instance, cpu_notifier);
580
581 switch (action) {
582 case CPU_ONLINE:
583 case CPU_ONLINE_FROZEN:
584 if (!cpumask_test_cpu(cpu, pinst->cpumask))
585 break;
586 mutex_lock(&pinst->lock);
587 err = __padata_add_cpu(pinst, cpu);
588 mutex_unlock(&pinst->lock);
589 if (err)
590 return NOTIFY_BAD;
591 break;
592
593 case CPU_DOWN_PREPARE:
594 case CPU_DOWN_PREPARE_FROZEN:
595 if (!cpumask_test_cpu(cpu, pinst->cpumask))
596 break;
597 mutex_lock(&pinst->lock);
598 err = __padata_remove_cpu(pinst, cpu);
599 mutex_unlock(&pinst->lock);
600 if (err)
601 return NOTIFY_BAD;
602 break;
603
604 case CPU_UP_CANCELED:
605 case CPU_UP_CANCELED_FROZEN:
606 if (!cpumask_test_cpu(cpu, pinst->cpumask))
607 break;
608 mutex_lock(&pinst->lock);
609 __padata_remove_cpu(pinst, cpu);
610 mutex_unlock(&pinst->lock);
611
612 case CPU_DOWN_FAILED:
613 case CPU_DOWN_FAILED_FROZEN:
614 if (!cpumask_test_cpu(cpu, pinst->cpumask))
615 break;
616 mutex_lock(&pinst->lock);
617 __padata_add_cpu(pinst, cpu);
618 mutex_unlock(&pinst->lock);
619 }
620
621 return NOTIFY_OK;
622}
623
624/*
625 * padata_alloc - allocate and initialize a padata instance
626 *
627 * @cpumask: cpumask that padata uses for parallelization
628 * @wq: workqueue to use for the allocated padata instance
629 */
630struct padata_instance *padata_alloc(const struct cpumask *cpumask,
631 struct workqueue_struct *wq)
632{
633 int err;
634 struct padata_instance *pinst;
635 struct parallel_data *pd;
636
637 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
638 if (!pinst)
639 goto err;
640
641 pd = padata_alloc_pd(pinst, cpumask);
642 if (!pd)
643 goto err_free_inst;
644
645 rcu_assign_pointer(pinst->pd, pd);
646
647 pinst->wq = wq;
648
649 cpumask_copy(pinst->cpumask, cpumask);
650
651 pinst->flags = 0;
652
653 pinst->cpu_notifier.notifier_call = padata_cpu_callback;
654 pinst->cpu_notifier.priority = 0;
655 err = register_hotcpu_notifier(&pinst->cpu_notifier);
656 if (err)
657 goto err_free_pd;
658
659 mutex_init(&pinst->lock);
660
661 return pinst;
662
663err_free_pd:
664 padata_free_pd(pd);
665err_free_inst:
666 kfree(pinst);
667err:
668 return NULL;
669}
670EXPORT_SYMBOL(padata_alloc);
671
672/*
673 * padata_free - free a padata instance
674 *
675 * @ padata_inst: padata instance to free
676 */
677void padata_free(struct padata_instance *pinst)
678{
679 padata_stop(pinst);
680
681 synchronize_rcu();
682
683 while (atomic_read(&pinst->pd->refcnt) != 0)
684 yield();
685
686 unregister_hotcpu_notifier(&pinst->cpu_notifier);
687 padata_free_pd(pinst->pd);
688 kfree(pinst);
689}
690EXPORT_SYMBOL(padata_free);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 2ae7409bf38f..a661e7991865 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -98,11 +98,12 @@ void __weak hw_perf_enable(void) { barrier(); }
98 98
99void __weak hw_perf_event_setup(int cpu) { barrier(); } 99void __weak hw_perf_event_setup(int cpu) { barrier(); }
100void __weak hw_perf_event_setup_online(int cpu) { barrier(); } 100void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
101void __weak hw_perf_event_setup_offline(int cpu) { barrier(); }
101 102
102int __weak 103int __weak
103hw_perf_group_sched_in(struct perf_event *group_leader, 104hw_perf_group_sched_in(struct perf_event *group_leader,
104 struct perf_cpu_context *cpuctx, 105 struct perf_cpu_context *cpuctx,
105 struct perf_event_context *ctx, int cpu) 106 struct perf_event_context *ctx)
106{ 107{
107 return 0; 108 return 0;
108} 109}
@@ -248,7 +249,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
248 249
249static inline u64 perf_clock(void) 250static inline u64 perf_clock(void)
250{ 251{
251 return cpu_clock(smp_processor_id()); 252 return cpu_clock(raw_smp_processor_id());
252} 253}
253 254
254/* 255/*
@@ -289,6 +290,15 @@ static void update_event_times(struct perf_event *event)
289 event->total_time_running = run_end - event->tstamp_running; 290 event->total_time_running = run_end - event->tstamp_running;
290} 291}
291 292
293static struct list_head *
294ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
295{
296 if (event->attr.pinned)
297 return &ctx->pinned_groups;
298 else
299 return &ctx->flexible_groups;
300}
301
292/* 302/*
293 * Add a event from the lists for its context. 303 * Add a event from the lists for its context.
294 * Must be called with ctx->mutex and ctx->lock held. 304 * Must be called with ctx->mutex and ctx->lock held.
@@ -303,9 +313,19 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
303 * add it straight to the context's event list, or to the group 313 * add it straight to the context's event list, or to the group
304 * leader's sibling list: 314 * leader's sibling list:
305 */ 315 */
306 if (group_leader == event) 316 if (group_leader == event) {
307 list_add_tail(&event->group_entry, &ctx->group_list); 317 struct list_head *list;
308 else { 318
319 if (is_software_event(event))
320 event->group_flags |= PERF_GROUP_SOFTWARE;
321
322 list = ctx_group_list(event, ctx);
323 list_add_tail(&event->group_entry, list);
324 } else {
325 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
326 !is_software_event(event))
327 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
328
309 list_add_tail(&event->group_entry, &group_leader->sibling_list); 329 list_add_tail(&event->group_entry, &group_leader->sibling_list);
310 group_leader->nr_siblings++; 330 group_leader->nr_siblings++;
311 } 331 }
@@ -355,9 +375,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
355 * to the context list directly: 375 * to the context list directly:
356 */ 376 */
357 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { 377 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
378 struct list_head *list;
358 379
359 list_move_tail(&sibling->group_entry, &ctx->group_list); 380 list = ctx_group_list(event, ctx);
381 list_move_tail(&sibling->group_entry, list);
360 sibling->group_leader = sibling; 382 sibling->group_leader = sibling;
383
384 /* Inherit group flags from the previous leader */
385 sibling->group_flags = event->group_flags;
361 } 386 }
362} 387}
363 388
@@ -608,14 +633,13 @@ void perf_event_disable(struct perf_event *event)
608static int 633static int
609event_sched_in(struct perf_event *event, 634event_sched_in(struct perf_event *event,
610 struct perf_cpu_context *cpuctx, 635 struct perf_cpu_context *cpuctx,
611 struct perf_event_context *ctx, 636 struct perf_event_context *ctx)
612 int cpu)
613{ 637{
614 if (event->state <= PERF_EVENT_STATE_OFF) 638 if (event->state <= PERF_EVENT_STATE_OFF)
615 return 0; 639 return 0;
616 640
617 event->state = PERF_EVENT_STATE_ACTIVE; 641 event->state = PERF_EVENT_STATE_ACTIVE;
618 event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ 642 event->oncpu = smp_processor_id();
619 /* 643 /*
620 * The new state must be visible before we turn it on in the hardware: 644 * The new state must be visible before we turn it on in the hardware:
621 */ 645 */
@@ -642,8 +666,7 @@ event_sched_in(struct perf_event *event,
642static int 666static int
643group_sched_in(struct perf_event *group_event, 667group_sched_in(struct perf_event *group_event,
644 struct perf_cpu_context *cpuctx, 668 struct perf_cpu_context *cpuctx,
645 struct perf_event_context *ctx, 669 struct perf_event_context *ctx)
646 int cpu)
647{ 670{
648 struct perf_event *event, *partial_group; 671 struct perf_event *event, *partial_group;
649 int ret; 672 int ret;
@@ -651,18 +674,18 @@ group_sched_in(struct perf_event *group_event,
651 if (group_event->state == PERF_EVENT_STATE_OFF) 674 if (group_event->state == PERF_EVENT_STATE_OFF)
652 return 0; 675 return 0;
653 676
654 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); 677 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx);
655 if (ret) 678 if (ret)
656 return ret < 0 ? ret : 0; 679 return ret < 0 ? ret : 0;
657 680
658 if (event_sched_in(group_event, cpuctx, ctx, cpu)) 681 if (event_sched_in(group_event, cpuctx, ctx))
659 return -EAGAIN; 682 return -EAGAIN;
660 683
661 /* 684 /*
662 * Schedule in siblings as one group (if any): 685 * Schedule in siblings as one group (if any):
663 */ 686 */
664 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 687 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
665 if (event_sched_in(event, cpuctx, ctx, cpu)) { 688 if (event_sched_in(event, cpuctx, ctx)) {
666 partial_group = event; 689 partial_group = event;
667 goto group_error; 690 goto group_error;
668 } 691 }
@@ -686,24 +709,6 @@ group_error:
686} 709}
687 710
688/* 711/*
689 * Return 1 for a group consisting entirely of software events,
690 * 0 if the group contains any hardware events.
691 */
692static int is_software_only_group(struct perf_event *leader)
693{
694 struct perf_event *event;
695
696 if (!is_software_event(leader))
697 return 0;
698
699 list_for_each_entry(event, &leader->sibling_list, group_entry)
700 if (!is_software_event(event))
701 return 0;
702
703 return 1;
704}
705
706/*
707 * Work out whether we can put this event group on the CPU now. 712 * Work out whether we can put this event group on the CPU now.
708 */ 713 */
709static int group_can_go_on(struct perf_event *event, 714static int group_can_go_on(struct perf_event *event,
@@ -713,7 +718,7 @@ static int group_can_go_on(struct perf_event *event,
713 /* 718 /*
714 * Groups consisting entirely of software events can always go on. 719 * Groups consisting entirely of software events can always go on.
715 */ 720 */
716 if (is_software_only_group(event)) 721 if (event->group_flags & PERF_GROUP_SOFTWARE)
717 return 1; 722 return 1;
718 /* 723 /*
719 * If an exclusive group is already on, no other hardware 724 * If an exclusive group is already on, no other hardware
@@ -754,7 +759,6 @@ static void __perf_install_in_context(void *info)
754 struct perf_event *event = info; 759 struct perf_event *event = info;
755 struct perf_event_context *ctx = event->ctx; 760 struct perf_event_context *ctx = event->ctx;
756 struct perf_event *leader = event->group_leader; 761 struct perf_event *leader = event->group_leader;
757 int cpu = smp_processor_id();
758 int err; 762 int err;
759 763
760 /* 764 /*
@@ -801,7 +805,7 @@ static void __perf_install_in_context(void *info)
801 if (!group_can_go_on(event, cpuctx, 1)) 805 if (!group_can_go_on(event, cpuctx, 1))
802 err = -EEXIST; 806 err = -EEXIST;
803 else 807 else
804 err = event_sched_in(event, cpuctx, ctx, cpu); 808 err = event_sched_in(event, cpuctx, ctx);
805 809
806 if (err) { 810 if (err) {
807 /* 811 /*
@@ -943,11 +947,9 @@ static void __perf_event_enable(void *info)
943 } else { 947 } else {
944 perf_disable(); 948 perf_disable();
945 if (event == leader) 949 if (event == leader)
946 err = group_sched_in(event, cpuctx, ctx, 950 err = group_sched_in(event, cpuctx, ctx);
947 smp_processor_id());
948 else 951 else
949 err = event_sched_in(event, cpuctx, ctx, 952 err = event_sched_in(event, cpuctx, ctx);
950 smp_processor_id());
951 perf_enable(); 953 perf_enable();
952 } 954 }
953 955
@@ -1043,8 +1045,15 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1043 return 0; 1045 return 0;
1044} 1046}
1045 1047
1046void __perf_event_sched_out(struct perf_event_context *ctx, 1048enum event_type_t {
1047 struct perf_cpu_context *cpuctx) 1049 EVENT_FLEXIBLE = 0x1,
1050 EVENT_PINNED = 0x2,
1051 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
1052};
1053
1054static void ctx_sched_out(struct perf_event_context *ctx,
1055 struct perf_cpu_context *cpuctx,
1056 enum event_type_t event_type)
1048{ 1057{
1049 struct perf_event *event; 1058 struct perf_event *event;
1050 1059
@@ -1055,10 +1064,18 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
1055 update_context_time(ctx); 1064 update_context_time(ctx);
1056 1065
1057 perf_disable(); 1066 perf_disable();
1058 if (ctx->nr_active) { 1067 if (!ctx->nr_active)
1059 list_for_each_entry(event, &ctx->group_list, group_entry) 1068 goto out_enable;
1069
1070 if (event_type & EVENT_PINNED)
1071 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1060 group_sched_out(event, cpuctx, ctx); 1072 group_sched_out(event, cpuctx, ctx);
1061 } 1073
1074 if (event_type & EVENT_FLEXIBLE)
1075 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1076 group_sched_out(event, cpuctx, ctx);
1077
1078 out_enable:
1062 perf_enable(); 1079 perf_enable();
1063 out: 1080 out:
1064 raw_spin_unlock(&ctx->lock); 1081 raw_spin_unlock(&ctx->lock);
@@ -1170,9 +1187,9 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1170 * not restart the event. 1187 * not restart the event.
1171 */ 1188 */
1172void perf_event_task_sched_out(struct task_struct *task, 1189void perf_event_task_sched_out(struct task_struct *task,
1173 struct task_struct *next, int cpu) 1190 struct task_struct *next)
1174{ 1191{
1175 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 1192 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1176 struct perf_event_context *ctx = task->perf_event_ctxp; 1193 struct perf_event_context *ctx = task->perf_event_ctxp;
1177 struct perf_event_context *next_ctx; 1194 struct perf_event_context *next_ctx;
1178 struct perf_event_context *parent; 1195 struct perf_event_context *parent;
@@ -1220,15 +1237,13 @@ void perf_event_task_sched_out(struct task_struct *task,
1220 rcu_read_unlock(); 1237 rcu_read_unlock();
1221 1238
1222 if (do_switch) { 1239 if (do_switch) {
1223 __perf_event_sched_out(ctx, cpuctx); 1240 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1224 cpuctx->task_ctx = NULL; 1241 cpuctx->task_ctx = NULL;
1225 } 1242 }
1226} 1243}
1227 1244
1228/* 1245static void task_ctx_sched_out(struct perf_event_context *ctx,
1229 * Called with IRQs disabled 1246 enum event_type_t event_type)
1230 */
1231static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1232{ 1247{
1233 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1248 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1234 1249
@@ -1238,47 +1253,41 @@ static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1238 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 1253 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1239 return; 1254 return;
1240 1255
1241 __perf_event_sched_out(ctx, cpuctx); 1256 ctx_sched_out(ctx, cpuctx, event_type);
1242 cpuctx->task_ctx = NULL; 1257 cpuctx->task_ctx = NULL;
1243} 1258}
1244 1259
1245/* 1260/*
1246 * Called with IRQs disabled 1261 * Called with IRQs disabled
1247 */ 1262 */
1248static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) 1263static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1264{
1265 task_ctx_sched_out(ctx, EVENT_ALL);
1266}
1267
1268/*
1269 * Called with IRQs disabled
1270 */
1271static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1272 enum event_type_t event_type)
1249{ 1273{
1250 __perf_event_sched_out(&cpuctx->ctx, cpuctx); 1274 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
1251} 1275}
1252 1276
1253static void 1277static void
1254__perf_event_sched_in(struct perf_event_context *ctx, 1278ctx_pinned_sched_in(struct perf_event_context *ctx,
1255 struct perf_cpu_context *cpuctx, int cpu) 1279 struct perf_cpu_context *cpuctx)
1256{ 1280{
1257 struct perf_event *event; 1281 struct perf_event *event;
1258 int can_add_hw = 1;
1259
1260 raw_spin_lock(&ctx->lock);
1261 ctx->is_active = 1;
1262 if (likely(!ctx->nr_events))
1263 goto out;
1264 1282
1265 ctx->timestamp = perf_clock(); 1283 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1266 1284 if (event->state <= PERF_EVENT_STATE_OFF)
1267 perf_disable();
1268
1269 /*
1270 * First go through the list and put on any pinned groups
1271 * in order to give them the best chance of going on.
1272 */
1273 list_for_each_entry(event, &ctx->group_list, group_entry) {
1274 if (event->state <= PERF_EVENT_STATE_OFF ||
1275 !event->attr.pinned)
1276 continue; 1285 continue;
1277 if (event->cpu != -1 && event->cpu != cpu) 1286 if (event->cpu != -1 && event->cpu != smp_processor_id())
1278 continue; 1287 continue;
1279 1288
1280 if (group_can_go_on(event, cpuctx, 1)) 1289 if (group_can_go_on(event, cpuctx, 1))
1281 group_sched_in(event, cpuctx, ctx, cpu); 1290 group_sched_in(event, cpuctx, ctx);
1282 1291
1283 /* 1292 /*
1284 * If this pinned group hasn't been scheduled, 1293 * If this pinned group hasn't been scheduled,
@@ -1289,32 +1298,83 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1289 event->state = PERF_EVENT_STATE_ERROR; 1298 event->state = PERF_EVENT_STATE_ERROR;
1290 } 1299 }
1291 } 1300 }
1301}
1292 1302
1293 list_for_each_entry(event, &ctx->group_list, group_entry) { 1303static void
1294 /* 1304ctx_flexible_sched_in(struct perf_event_context *ctx,
1295 * Ignore events in OFF or ERROR state, and 1305 struct perf_cpu_context *cpuctx)
1296 * ignore pinned events since we did them already. 1306{
1297 */ 1307 struct perf_event *event;
1298 if (event->state <= PERF_EVENT_STATE_OFF || 1308 int can_add_hw = 1;
1299 event->attr.pinned)
1300 continue;
1301 1309
1310 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1311 /* Ignore events in OFF or ERROR state */
1312 if (event->state <= PERF_EVENT_STATE_OFF)
1313 continue;
1302 /* 1314 /*
1303 * Listen to the 'cpu' scheduling filter constraint 1315 * Listen to the 'cpu' scheduling filter constraint
1304 * of events: 1316 * of events:
1305 */ 1317 */
1306 if (event->cpu != -1 && event->cpu != cpu) 1318 if (event->cpu != -1 && event->cpu != smp_processor_id())
1307 continue; 1319 continue;
1308 1320
1309 if (group_can_go_on(event, cpuctx, can_add_hw)) 1321 if (group_can_go_on(event, cpuctx, can_add_hw))
1310 if (group_sched_in(event, cpuctx, ctx, cpu)) 1322 if (group_sched_in(event, cpuctx, ctx))
1311 can_add_hw = 0; 1323 can_add_hw = 0;
1312 } 1324 }
1325}
1326
1327static void
1328ctx_sched_in(struct perf_event_context *ctx,
1329 struct perf_cpu_context *cpuctx,
1330 enum event_type_t event_type)
1331{
1332 raw_spin_lock(&ctx->lock);
1333 ctx->is_active = 1;
1334 if (likely(!ctx->nr_events))
1335 goto out;
1336
1337 ctx->timestamp = perf_clock();
1338
1339 perf_disable();
1340
1341 /*
1342 * First go through the list and put on any pinned groups
1343 * in order to give them the best chance of going on.
1344 */
1345 if (event_type & EVENT_PINNED)
1346 ctx_pinned_sched_in(ctx, cpuctx);
1347
1348 /* Then walk through the lower prio flexible groups */
1349 if (event_type & EVENT_FLEXIBLE)
1350 ctx_flexible_sched_in(ctx, cpuctx);
1351
1313 perf_enable(); 1352 perf_enable();
1314 out: 1353 out:
1315 raw_spin_unlock(&ctx->lock); 1354 raw_spin_unlock(&ctx->lock);
1316} 1355}
1317 1356
1357static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1358 enum event_type_t event_type)
1359{
1360 struct perf_event_context *ctx = &cpuctx->ctx;
1361
1362 ctx_sched_in(ctx, cpuctx, event_type);
1363}
1364
1365static void task_ctx_sched_in(struct task_struct *task,
1366 enum event_type_t event_type)
1367{
1368 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1369 struct perf_event_context *ctx = task->perf_event_ctxp;
1370
1371 if (likely(!ctx))
1372 return;
1373 if (cpuctx->task_ctx == ctx)
1374 return;
1375 ctx_sched_in(ctx, cpuctx, event_type);
1376 cpuctx->task_ctx = ctx;
1377}
1318/* 1378/*
1319 * Called from scheduler to add the events of the current task 1379 * Called from scheduler to add the events of the current task
1320 * with interrupts disabled. 1380 * with interrupts disabled.
@@ -1326,38 +1386,128 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1326 * accessing the event control register. If a NMI hits, then it will 1386 * accessing the event control register. If a NMI hits, then it will
1327 * keep the event running. 1387 * keep the event running.
1328 */ 1388 */
1329void perf_event_task_sched_in(struct task_struct *task, int cpu) 1389void perf_event_task_sched_in(struct task_struct *task)
1330{ 1390{
1331 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 1391 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1332 struct perf_event_context *ctx = task->perf_event_ctxp; 1392 struct perf_event_context *ctx = task->perf_event_ctxp;
1333 1393
1334 if (likely(!ctx)) 1394 if (likely(!ctx))
1335 return; 1395 return;
1396
1336 if (cpuctx->task_ctx == ctx) 1397 if (cpuctx->task_ctx == ctx)
1337 return; 1398 return;
1338 __perf_event_sched_in(ctx, cpuctx, cpu); 1399
1400 /*
1401 * We want to keep the following priority order:
1402 * cpu pinned (that don't need to move), task pinned,
1403 * cpu flexible, task flexible.
1404 */
1405 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1406
1407 ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
1408 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1409 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
1410
1339 cpuctx->task_ctx = ctx; 1411 cpuctx->task_ctx = ctx;
1340} 1412}
1341 1413
1342static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) 1414#define MAX_INTERRUPTS (~0ULL)
1415
1416static void perf_log_throttle(struct perf_event *event, int enable);
1417
1418static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1343{ 1419{
1344 struct perf_event_context *ctx = &cpuctx->ctx; 1420 u64 frequency = event->attr.sample_freq;
1421 u64 sec = NSEC_PER_SEC;
1422 u64 divisor, dividend;
1423
1424 int count_fls, nsec_fls, frequency_fls, sec_fls;
1425
1426 count_fls = fls64(count);
1427 nsec_fls = fls64(nsec);
1428 frequency_fls = fls64(frequency);
1429 sec_fls = 30;
1345 1430
1346 __perf_event_sched_in(ctx, cpuctx, cpu); 1431 /*
1432 * We got @count in @nsec, with a target of sample_freq HZ
1433 * the target period becomes:
1434 *
1435 * @count * 10^9
1436 * period = -------------------
1437 * @nsec * sample_freq
1438 *
1439 */
1440
1441 /*
1442 * Reduce accuracy by one bit such that @a and @b converge
1443 * to a similar magnitude.
1444 */
1445#define REDUCE_FLS(a, b) \
1446do { \
1447 if (a##_fls > b##_fls) { \
1448 a >>= 1; \
1449 a##_fls--; \
1450 } else { \
1451 b >>= 1; \
1452 b##_fls--; \
1453 } \
1454} while (0)
1455
1456 /*
1457 * Reduce accuracy until either term fits in a u64, then proceed with
1458 * the other, so that finally we can do a u64/u64 division.
1459 */
1460 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
1461 REDUCE_FLS(nsec, frequency);
1462 REDUCE_FLS(sec, count);
1463 }
1464
1465 if (count_fls + sec_fls > 64) {
1466 divisor = nsec * frequency;
1467
1468 while (count_fls + sec_fls > 64) {
1469 REDUCE_FLS(count, sec);
1470 divisor >>= 1;
1471 }
1472
1473 dividend = count * sec;
1474 } else {
1475 dividend = count * sec;
1476
1477 while (nsec_fls + frequency_fls > 64) {
1478 REDUCE_FLS(nsec, frequency);
1479 dividend >>= 1;
1480 }
1481
1482 divisor = nsec * frequency;
1483 }
1484
1485 return div64_u64(dividend, divisor);
1347} 1486}
1348 1487
1349#define MAX_INTERRUPTS (~0ULL) 1488static void perf_event_stop(struct perf_event *event)
1489{
1490 if (!event->pmu->stop)
1491 return event->pmu->disable(event);
1350 1492
1351static void perf_log_throttle(struct perf_event *event, int enable); 1493 return event->pmu->stop(event);
1494}
1495
1496static int perf_event_start(struct perf_event *event)
1497{
1498 if (!event->pmu->start)
1499 return event->pmu->enable(event);
1352 1500
1353static void perf_adjust_period(struct perf_event *event, u64 events) 1501 return event->pmu->start(event);
1502}
1503
1504static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1354{ 1505{
1355 struct hw_perf_event *hwc = &event->hw; 1506 struct hw_perf_event *hwc = &event->hw;
1356 u64 period, sample_period; 1507 u64 period, sample_period;
1357 s64 delta; 1508 s64 delta;
1358 1509
1359 events *= hwc->sample_period; 1510 period = perf_calculate_period(event, nsec, count);
1360 period = div64_u64(events, event->attr.sample_freq);
1361 1511
1362 delta = (s64)(period - hwc->sample_period); 1512 delta = (s64)(period - hwc->sample_period);
1363 delta = (delta + 7) / 8; /* low pass filter */ 1513 delta = (delta + 7) / 8; /* low pass filter */
@@ -1368,13 +1518,22 @@ static void perf_adjust_period(struct perf_event *event, u64 events)
1368 sample_period = 1; 1518 sample_period = 1;
1369 1519
1370 hwc->sample_period = sample_period; 1520 hwc->sample_period = sample_period;
1521
1522 if (atomic64_read(&hwc->period_left) > 8*sample_period) {
1523 perf_disable();
1524 perf_event_stop(event);
1525 atomic64_set(&hwc->period_left, 0);
1526 perf_event_start(event);
1527 perf_enable();
1528 }
1371} 1529}
1372 1530
1373static void perf_ctx_adjust_freq(struct perf_event_context *ctx) 1531static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1374{ 1532{
1375 struct perf_event *event; 1533 struct perf_event *event;
1376 struct hw_perf_event *hwc; 1534 struct hw_perf_event *hwc;
1377 u64 interrupts, freq; 1535 u64 interrupts, now;
1536 s64 delta;
1378 1537
1379 raw_spin_lock(&ctx->lock); 1538 raw_spin_lock(&ctx->lock);
1380 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 1539 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
@@ -1395,44 +1554,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1395 if (interrupts == MAX_INTERRUPTS) { 1554 if (interrupts == MAX_INTERRUPTS) {
1396 perf_log_throttle(event, 1); 1555 perf_log_throttle(event, 1);
1397 event->pmu->unthrottle(event); 1556 event->pmu->unthrottle(event);
1398 interrupts = 2*sysctl_perf_event_sample_rate/HZ;
1399 } 1557 }
1400 1558
1401 if (!event->attr.freq || !event->attr.sample_freq) 1559 if (!event->attr.freq || !event->attr.sample_freq)
1402 continue; 1560 continue;
1403 1561
1404 /* 1562 event->pmu->read(event);
1405 * if the specified freq < HZ then we need to skip ticks 1563 now = atomic64_read(&event->count);
1406 */ 1564 delta = now - hwc->freq_count_stamp;
1407 if (event->attr.sample_freq < HZ) { 1565 hwc->freq_count_stamp = now;
1408 freq = event->attr.sample_freq;
1409
1410 hwc->freq_count += freq;
1411 hwc->freq_interrupts += interrupts;
1412
1413 if (hwc->freq_count < HZ)
1414 continue;
1415
1416 interrupts = hwc->freq_interrupts;
1417 hwc->freq_interrupts = 0;
1418 hwc->freq_count -= HZ;
1419 } else
1420 freq = HZ;
1421
1422 perf_adjust_period(event, freq * interrupts);
1423 1566
1424 /* 1567 if (delta > 0)
1425 * In order to avoid being stalled by an (accidental) huge 1568 perf_adjust_period(event, TICK_NSEC, delta);
1426 * sample period, force reset the sample period if we didn't
1427 * get any events in this freq period.
1428 */
1429 if (!interrupts) {
1430 perf_disable();
1431 event->pmu->disable(event);
1432 atomic64_set(&hwc->period_left, 0);
1433 event->pmu->enable(event);
1434 perf_enable();
1435 }
1436 } 1569 }
1437 raw_spin_unlock(&ctx->lock); 1570 raw_spin_unlock(&ctx->lock);
1438} 1571}
@@ -1442,26 +1575,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1442 */ 1575 */
1443static void rotate_ctx(struct perf_event_context *ctx) 1576static void rotate_ctx(struct perf_event_context *ctx)
1444{ 1577{
1445 struct perf_event *event;
1446
1447 if (!ctx->nr_events) 1578 if (!ctx->nr_events)
1448 return; 1579 return;
1449 1580
1450 raw_spin_lock(&ctx->lock); 1581 raw_spin_lock(&ctx->lock);
1451 /* 1582
1452 * Rotate the first entry last (works just fine for group events too): 1583 /* Rotate the first entry last of non-pinned groups */
1453 */ 1584 list_rotate_left(&ctx->flexible_groups);
1454 perf_disable();
1455 list_for_each_entry(event, &ctx->group_list, group_entry) {
1456 list_move_tail(&event->group_entry, &ctx->group_list);
1457 break;
1458 }
1459 perf_enable();
1460 1585
1461 raw_spin_unlock(&ctx->lock); 1586 raw_spin_unlock(&ctx->lock);
1462} 1587}
1463 1588
1464void perf_event_task_tick(struct task_struct *curr, int cpu) 1589void perf_event_task_tick(struct task_struct *curr)
1465{ 1590{
1466 struct perf_cpu_context *cpuctx; 1591 struct perf_cpu_context *cpuctx;
1467 struct perf_event_context *ctx; 1592 struct perf_event_context *ctx;
@@ -1469,24 +1594,43 @@ void perf_event_task_tick(struct task_struct *curr, int cpu)
1469 if (!atomic_read(&nr_events)) 1594 if (!atomic_read(&nr_events))
1470 return; 1595 return;
1471 1596
1472 cpuctx = &per_cpu(perf_cpu_context, cpu); 1597 cpuctx = &__get_cpu_var(perf_cpu_context);
1473 ctx = curr->perf_event_ctxp; 1598 ctx = curr->perf_event_ctxp;
1474 1599
1600 perf_disable();
1601
1475 perf_ctx_adjust_freq(&cpuctx->ctx); 1602 perf_ctx_adjust_freq(&cpuctx->ctx);
1476 if (ctx) 1603 if (ctx)
1477 perf_ctx_adjust_freq(ctx); 1604 perf_ctx_adjust_freq(ctx);
1478 1605
1479 perf_event_cpu_sched_out(cpuctx); 1606 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1480 if (ctx) 1607 if (ctx)
1481 __perf_event_task_sched_out(ctx); 1608 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
1482 1609
1483 rotate_ctx(&cpuctx->ctx); 1610 rotate_ctx(&cpuctx->ctx);
1484 if (ctx) 1611 if (ctx)
1485 rotate_ctx(ctx); 1612 rotate_ctx(ctx);
1486 1613
1487 perf_event_cpu_sched_in(cpuctx, cpu); 1614 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1488 if (ctx) 1615 if (ctx)
1489 perf_event_task_sched_in(curr, cpu); 1616 task_ctx_sched_in(curr, EVENT_FLEXIBLE);
1617
1618 perf_enable();
1619}
1620
1621static int event_enable_on_exec(struct perf_event *event,
1622 struct perf_event_context *ctx)
1623{
1624 if (!event->attr.enable_on_exec)
1625 return 0;
1626
1627 event->attr.enable_on_exec = 0;
1628 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1629 return 0;
1630
1631 __perf_event_mark_enabled(event, ctx);
1632
1633 return 1;
1490} 1634}
1491 1635
1492/* 1636/*
@@ -1499,6 +1643,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1499 struct perf_event *event; 1643 struct perf_event *event;
1500 unsigned long flags; 1644 unsigned long flags;
1501 int enabled = 0; 1645 int enabled = 0;
1646 int ret;
1502 1647
1503 local_irq_save(flags); 1648 local_irq_save(flags);
1504 ctx = task->perf_event_ctxp; 1649 ctx = task->perf_event_ctxp;
@@ -1509,14 +1654,16 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1509 1654
1510 raw_spin_lock(&ctx->lock); 1655 raw_spin_lock(&ctx->lock);
1511 1656
1512 list_for_each_entry(event, &ctx->group_list, group_entry) { 1657 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1513 if (!event->attr.enable_on_exec) 1658 ret = event_enable_on_exec(event, ctx);
1514 continue; 1659 if (ret)
1515 event->attr.enable_on_exec = 0; 1660 enabled = 1;
1516 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1661 }
1517 continue; 1662
1518 __perf_event_mark_enabled(event, ctx); 1663 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1519 enabled = 1; 1664 ret = event_enable_on_exec(event, ctx);
1665 if (ret)
1666 enabled = 1;
1520 } 1667 }
1521 1668
1522 /* 1669 /*
@@ -1527,7 +1674,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1527 1674
1528 raw_spin_unlock(&ctx->lock); 1675 raw_spin_unlock(&ctx->lock);
1529 1676
1530 perf_event_task_sched_in(task, smp_processor_id()); 1677 perf_event_task_sched_in(task);
1531 out: 1678 out:
1532 local_irq_restore(flags); 1679 local_irq_restore(flags);
1533} 1680}
@@ -1590,7 +1737,8 @@ __perf_event_init_context(struct perf_event_context *ctx,
1590{ 1737{
1591 raw_spin_lock_init(&ctx->lock); 1738 raw_spin_lock_init(&ctx->lock);
1592 mutex_init(&ctx->mutex); 1739 mutex_init(&ctx->mutex);
1593 INIT_LIST_HEAD(&ctx->group_list); 1740 INIT_LIST_HEAD(&ctx->pinned_groups);
1741 INIT_LIST_HEAD(&ctx->flexible_groups);
1594 INIT_LIST_HEAD(&ctx->event_list); 1742 INIT_LIST_HEAD(&ctx->event_list);
1595 atomic_set(&ctx->refcount, 1); 1743 atomic_set(&ctx->refcount, 1);
1596 ctx->task = task; 1744 ctx->task = task;
@@ -3608,7 +3756,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
3608 /* .tid */ 3756 /* .tid */
3609 .start = vma->vm_start, 3757 .start = vma->vm_start,
3610 .len = vma->vm_end - vma->vm_start, 3758 .len = vma->vm_end - vma->vm_start,
3611 .pgoff = vma->vm_pgoff, 3759 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
3612 }, 3760 },
3613 }; 3761 };
3614 3762
@@ -3688,12 +3836,12 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3688 3836
3689 if (event->attr.freq) { 3837 if (event->attr.freq) {
3690 u64 now = perf_clock(); 3838 u64 now = perf_clock();
3691 s64 delta = now - hwc->freq_stamp; 3839 s64 delta = now - hwc->freq_time_stamp;
3692 3840
3693 hwc->freq_stamp = now; 3841 hwc->freq_time_stamp = now;
3694 3842
3695 if (delta > 0 && delta < TICK_NSEC) 3843 if (delta > 0 && delta < 2*TICK_NSEC)
3696 perf_adjust_period(event, NSEC_PER_SEC / (int)delta); 3844 perf_adjust_period(event, delta, hwc->last_period);
3697 } 3845 }
3698 3846
3699 /* 3847 /*
@@ -4184,7 +4332,7 @@ static const struct pmu perf_ops_task_clock = {
4184 .read = task_clock_perf_event_read, 4332 .read = task_clock_perf_event_read,
4185}; 4333};
4186 4334
4187#ifdef CONFIG_EVENT_PROFILE 4335#ifdef CONFIG_EVENT_TRACING
4188 4336
4189void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4337void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4190 int entry_size) 4338 int entry_size)
@@ -4289,7 +4437,7 @@ static void perf_event_free_filter(struct perf_event *event)
4289{ 4437{
4290} 4438}
4291 4439
4292#endif /* CONFIG_EVENT_PROFILE */ 4440#endif /* CONFIG_EVENT_TRACING */
4293 4441
4294#ifdef CONFIG_HAVE_HW_BREAKPOINT 4442#ifdef CONFIG_HAVE_HW_BREAKPOINT
4295static void bp_perf_event_destroy(struct perf_event *event) 4443static void bp_perf_event_destroy(struct perf_event *event)
@@ -4870,8 +5018,15 @@ inherit_event(struct perf_event *parent_event,
4870 else 5018 else
4871 child_event->state = PERF_EVENT_STATE_OFF; 5019 child_event->state = PERF_EVENT_STATE_OFF;
4872 5020
4873 if (parent_event->attr.freq) 5021 if (parent_event->attr.freq) {
4874 child_event->hw.sample_period = parent_event->hw.sample_period; 5022 u64 sample_period = parent_event->hw.sample_period;
5023 struct hw_perf_event *hwc = &child_event->hw;
5024
5025 hwc->sample_period = sample_period;
5026 hwc->last_period = sample_period;
5027
5028 atomic64_set(&hwc->period_left, sample_period);
5029 }
4875 5030
4876 child_event->overflow_handler = parent_event->overflow_handler; 5031 child_event->overflow_handler = parent_event->overflow_handler;
4877 5032
@@ -5039,7 +5194,11 @@ void perf_event_exit_task(struct task_struct *child)
5039 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); 5194 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
5040 5195
5041again: 5196again:
5042 list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, 5197 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
5198 group_entry)
5199 __perf_event_exit_task(child_event, child_ctx, child);
5200
5201 list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
5043 group_entry) 5202 group_entry)
5044 __perf_event_exit_task(child_event, child_ctx, child); 5203 __perf_event_exit_task(child_event, child_ctx, child);
5045 5204
@@ -5048,7 +5207,8 @@ again:
5048 * its siblings to the list, but we obtained 'tmp' before that which 5207 * its siblings to the list, but we obtained 'tmp' before that which
5049 * will still point to the list head terminating the iteration. 5208 * will still point to the list head terminating the iteration.
5050 */ 5209 */
5051 if (!list_empty(&child_ctx->group_list)) 5210 if (!list_empty(&child_ctx->pinned_groups) ||
5211 !list_empty(&child_ctx->flexible_groups))
5052 goto again; 5212 goto again;
5053 5213
5054 mutex_unlock(&child_ctx->mutex); 5214 mutex_unlock(&child_ctx->mutex);
@@ -5056,6 +5216,24 @@ again:
5056 put_ctx(child_ctx); 5216 put_ctx(child_ctx);
5057} 5217}
5058 5218
5219static void perf_free_event(struct perf_event *event,
5220 struct perf_event_context *ctx)
5221{
5222 struct perf_event *parent = event->parent;
5223
5224 if (WARN_ON_ONCE(!parent))
5225 return;
5226
5227 mutex_lock(&parent->child_mutex);
5228 list_del_init(&event->child_list);
5229 mutex_unlock(&parent->child_mutex);
5230
5231 fput(parent->filp);
5232
5233 list_del_event(event, ctx);
5234 free_event(event);
5235}
5236
5059/* 5237/*
5060 * free an unexposed, unused context as created by inheritance by 5238 * free an unexposed, unused context as created by inheritance by
5061 * init_task below, used by fork() in case of fail. 5239 * init_task below, used by fork() in case of fail.
@@ -5070,36 +5248,70 @@ void perf_event_free_task(struct task_struct *task)
5070 5248
5071 mutex_lock(&ctx->mutex); 5249 mutex_lock(&ctx->mutex);
5072again: 5250again:
5073 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { 5251 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5074 struct perf_event *parent = event->parent; 5252 perf_free_event(event, ctx);
5075 5253
5076 if (WARN_ON_ONCE(!parent)) 5254 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
5077 continue; 5255 group_entry)
5256 perf_free_event(event, ctx);
5078 5257
5079 mutex_lock(&parent->child_mutex); 5258 if (!list_empty(&ctx->pinned_groups) ||
5080 list_del_init(&event->child_list); 5259 !list_empty(&ctx->flexible_groups))
5081 mutex_unlock(&parent->child_mutex); 5260 goto again;
5082 5261
5083 fput(parent->filp); 5262 mutex_unlock(&ctx->mutex);
5084 5263
5085 list_del_event(event, ctx); 5264 put_ctx(ctx);
5086 free_event(event); 5265}
5266
5267static int
5268inherit_task_group(struct perf_event *event, struct task_struct *parent,
5269 struct perf_event_context *parent_ctx,
5270 struct task_struct *child,
5271 int *inherited_all)
5272{
5273 int ret;
5274 struct perf_event_context *child_ctx = child->perf_event_ctxp;
5275
5276 if (!event->attr.inherit) {
5277 *inherited_all = 0;
5278 return 0;
5087 } 5279 }
5088 5280
5089 if (!list_empty(&ctx->group_list)) 5281 if (!child_ctx) {
5090 goto again; 5282 /*
5283 * This is executed from the parent task context, so
5284 * inherit events that have been marked for cloning.
5285 * First allocate and initialize a context for the
5286 * child.
5287 */
5091 5288
5092 mutex_unlock(&ctx->mutex); 5289 child_ctx = kzalloc(sizeof(struct perf_event_context),
5290 GFP_KERNEL);
5291 if (!child_ctx)
5292 return -ENOMEM;
5093 5293
5094 put_ctx(ctx); 5294 __perf_event_init_context(child_ctx, child);
5295 child->perf_event_ctxp = child_ctx;
5296 get_task_struct(child);
5297 }
5298
5299 ret = inherit_group(event, parent, parent_ctx,
5300 child, child_ctx);
5301
5302 if (ret)
5303 *inherited_all = 0;
5304
5305 return ret;
5095} 5306}
5096 5307
5308
5097/* 5309/*
5098 * Initialize the perf_event context in task_struct 5310 * Initialize the perf_event context in task_struct
5099 */ 5311 */
5100int perf_event_init_task(struct task_struct *child) 5312int perf_event_init_task(struct task_struct *child)
5101{ 5313{
5102 struct perf_event_context *child_ctx = NULL, *parent_ctx; 5314 struct perf_event_context *child_ctx, *parent_ctx;
5103 struct perf_event_context *cloned_ctx; 5315 struct perf_event_context *cloned_ctx;
5104 struct perf_event *event; 5316 struct perf_event *event;
5105 struct task_struct *parent = current; 5317 struct task_struct *parent = current;
@@ -5137,41 +5349,22 @@ int perf_event_init_task(struct task_struct *child)
5137 * We dont have to disable NMIs - we are only looking at 5349 * We dont have to disable NMIs - we are only looking at
5138 * the list, not manipulating it: 5350 * the list, not manipulating it:
5139 */ 5351 */
5140 list_for_each_entry(event, &parent_ctx->group_list, group_entry) { 5352 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
5141 5353 ret = inherit_task_group(event, parent, parent_ctx, child,
5142 if (!event->attr.inherit) { 5354 &inherited_all);
5143 inherited_all = 0; 5355 if (ret)
5144 continue; 5356 break;
5145 } 5357 }
5146
5147 if (!child->perf_event_ctxp) {
5148 /*
5149 * This is executed from the parent task context, so
5150 * inherit events that have been marked for cloning.
5151 * First allocate and initialize a context for the
5152 * child.
5153 */
5154
5155 child_ctx = kzalloc(sizeof(struct perf_event_context),
5156 GFP_KERNEL);
5157 if (!child_ctx) {
5158 ret = -ENOMEM;
5159 break;
5160 }
5161
5162 __perf_event_init_context(child_ctx, child);
5163 child->perf_event_ctxp = child_ctx;
5164 get_task_struct(child);
5165 }
5166 5358
5167 ret = inherit_group(event, parent, parent_ctx, 5359 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
5168 child, child_ctx); 5360 ret = inherit_task_group(event, parent, parent_ctx, child,
5169 if (ret) { 5361 &inherited_all);
5170 inherited_all = 0; 5362 if (ret)
5171 break; 5363 break;
5172 }
5173 } 5364 }
5174 5365
5366 child_ctx = child->perf_event_ctxp;
5367
5175 if (child_ctx && inherited_all) { 5368 if (child_ctx && inherited_all) {
5176 /* 5369 /*
5177 * Mark the child context as a clone of the parent 5370 * Mark the child context as a clone of the parent
@@ -5220,7 +5413,9 @@ static void __perf_event_exit_cpu(void *info)
5220 struct perf_event_context *ctx = &cpuctx->ctx; 5413 struct perf_event_context *ctx = &cpuctx->ctx;
5221 struct perf_event *event, *tmp; 5414 struct perf_event *event, *tmp;
5222 5415
5223 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) 5416 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5417 __perf_event_remove_from_context(event);
5418 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
5224 __perf_event_remove_from_context(event); 5419 __perf_event_remove_from_context(event);
5225} 5420}
5226static void perf_event_exit_cpu(int cpu) 5421static void perf_event_exit_cpu(int cpu)
@@ -5258,6 +5453,10 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5258 perf_event_exit_cpu(cpu); 5453 perf_event_exit_cpu(cpu);
5259 break; 5454 break;
5260 5455
5456 case CPU_DEAD:
5457 hw_perf_event_setup_offline(cpu);
5458 break;
5459
5261 default: 5460 default:
5262 break; 5461 break;
5263 } 5462 }
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 91e09d3b2eb2..5c36ea9d55d2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,6 +27,15 @@ config PM_DEBUG
27 code. This is helpful when debugging and reporting PM bugs, like 27 code. This is helpful when debugging and reporting PM bugs, like
28 suspend support. 28 suspend support.
29 29
30config PM_ADVANCED_DEBUG
31 bool "Extra PM attributes in sysfs for low-level debugging/testing"
32 depends on PM_DEBUG
33 default n
34 ---help---
35 Add extra sysfs attributes allowing one to access some Power Management
36 fields of device objects from user space. If you are not a kernel
37 developer interested in debugging/testing Power Management, say "no".
38
30config PM_VERBOSE 39config PM_VERBOSE
31 bool "Verbose Power Management debugging" 40 bool "Verbose Power Management debugging"
32 depends on PM_DEBUG 41 depends on PM_DEBUG
@@ -85,6 +94,11 @@ config PM_SLEEP
85 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE 94 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
86 default y 95 default y
87 96
97config PM_SLEEP_ADVANCED_DEBUG
98 bool
99 depends on PM_ADVANCED_DEBUG
100 default n
101
88config SUSPEND 102config SUSPEND
89 bool "Suspend to RAM and standby" 103 bool "Suspend to RAM and standby"
90 depends on PM && ARCH_SUSPEND_POSSIBLE 104 depends on PM && ARCH_SUSPEND_POSSIBLE
@@ -222,3 +236,8 @@ config PM_RUNTIME
222 and the bus type drivers of the buses the devices are on are 236 and the bus type drivers of the buses the devices are on are
223 responsible for the actual handling of the autosuspend requests and 237 responsible for the actual handling of the autosuspend requests and
224 wake-up events. 238 wake-up events.
239
240config PM_OPS
241 bool
242 depends on PM_SLEEP || PM_RUNTIME
243 default y
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0998c7139053..b58800b21fc0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -44,6 +44,32 @@ int pm_notifier_call_chain(unsigned long val)
44 == NOTIFY_BAD) ? -EINVAL : 0; 44 == NOTIFY_BAD) ? -EINVAL : 0;
45} 45}
46 46
47/* If set, devices may be suspended and resumed asynchronously. */
48int pm_async_enabled = 1;
49
50static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr,
51 char *buf)
52{
53 return sprintf(buf, "%d\n", pm_async_enabled);
54}
55
56static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
57 const char *buf, size_t n)
58{
59 unsigned long val;
60
61 if (strict_strtoul(buf, 10, &val))
62 return -EINVAL;
63
64 if (val > 1)
65 return -EINVAL;
66
67 pm_async_enabled = val;
68 return n;
69}
70
71power_attr(pm_async);
72
47#ifdef CONFIG_PM_DEBUG 73#ifdef CONFIG_PM_DEBUG
48int pm_test_level = TEST_NONE; 74int pm_test_level = TEST_NONE;
49 75
@@ -208,9 +234,12 @@ static struct attribute * g[] = {
208#ifdef CONFIG_PM_TRACE 234#ifdef CONFIG_PM_TRACE
209 &pm_trace_attr.attr, 235 &pm_trace_attr.attr,
210#endif 236#endif
211#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG) 237#ifdef CONFIG_PM_SLEEP
238 &pm_async_attr.attr,
239#ifdef CONFIG_PM_DEBUG
212 &pm_test_attr.attr, 240 &pm_test_attr.attr,
213#endif 241#endif
242#endif
214 NULL, 243 NULL,
215}; 244};
216 245
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 36cb168e4330..830cadecbdfc 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1181,7 +1181,7 @@ static void free_unnecessary_pages(void)
1181 1181
1182 memory_bm_position_reset(&copy_bm); 1182 memory_bm_position_reset(&copy_bm);
1183 1183
1184 while (to_free_normal > 0 && to_free_highmem > 0) { 1184 while (to_free_normal > 0 || to_free_highmem > 0) {
1185 unsigned long pfn = memory_bm_next_pfn(&copy_bm); 1185 unsigned long pfn = memory_bm_next_pfn(&copy_bm);
1186 struct page *page = pfn_to_page(pfn); 1186 struct page *page = pfn_to_page(pfn);
1187 1187
@@ -1500,7 +1500,7 @@ asmlinkage int swsusp_save(void)
1500{ 1500{
1501 unsigned int nr_pages, nr_highmem; 1501 unsigned int nr_pages, nr_highmem;
1502 1502
1503 printk(KERN_INFO "PM: Creating hibernation image: \n"); 1503 printk(KERN_INFO "PM: Creating hibernation image:\n");
1504 1504
1505 drain_local_pages(NULL); 1505 drain_local_pages(NULL);
1506 nr_pages = count_data_pages(); 1506 nr_pages = count_data_pages();
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 09b2b0ae9e9d..1d575733d4e1 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -657,10 +657,6 @@ int swsusp_read(unsigned int *flags_p)
657 struct swsusp_info *header; 657 struct swsusp_info *header;
658 658
659 *flags_p = swsusp_header->flags; 659 *flags_p = swsusp_header->flags;
660 if (IS_ERR(resume_bdev)) {
661 pr_debug("PM: Image device not initialised\n");
662 return PTR_ERR(resume_bdev);
663 }
664 660
665 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 661 memset(&snapshot, 0, sizeof(struct snapshot_handle));
666 error = snapshot_write_next(&snapshot, PAGE_SIZE); 662 error = snapshot_write_next(&snapshot, PAGE_SIZE);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
deleted file mode 100644
index 5b3601bd1893..000000000000
--- a/kernel/power/swsusp.c
+++ /dev/null
@@ -1,58 +0,0 @@
1/*
2 * linux/kernel/power/swsusp.c
3 *
4 * This file provides code to write suspend image to swap and read it back.
5 *
6 * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
8 *
9 * This file is released under the GPLv2.
10 *
11 * I'd like to thank the following people for their work:
12 *
13 * Pavel Machek <pavel@ucw.cz>:
14 * Modifications, defectiveness pointing, being with me at the very beginning,
15 * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
16 *
17 * Steve Doddi <dirk@loth.demon.co.uk>:
18 * Support the possibility of hardware state restoring.
19 *
20 * Raph <grey.havens@earthling.net>:
21 * Support for preserving states of network devices and virtual console
22 * (including X and svgatextmode)
23 *
24 * Kurt Garloff <garloff@suse.de>:
25 * Straightened the critical function in order to prevent compilers from
26 * playing tricks with local variables.
27 *
28 * Andreas Mohr <a.mohr@mailto.de>
29 *
30 * Alex Badea <vampire@go.ro>:
31 * Fixed runaway init
32 *
33 * Rafael J. Wysocki <rjw@sisk.pl>
34 * Reworked the freeing of memory and the handling of swap
35 *
36 * More state savers are welcome. Especially for the scsi layer...
37 *
38 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
39 */
40
41#include <linux/mm.h>
42#include <linux/suspend.h>
43#include <linux/spinlock.h>
44#include <linux/kernel.h>
45#include <linux/major.h>
46#include <linux/swap.h>
47#include <linux/pm.h>
48#include <linux/swapops.h>
49#include <linux/bootmem.h>
50#include <linux/syscalls.h>
51#include <linux/highmem.h>
52#include <linux/time.h>
53#include <linux/rbtree.h>
54#include <linux/io.h>
55
56#include "power.h"
57
58int in_suspend __nosavedata = 0;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index bf0014d6a5f0..4d2289626a84 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -195,6 +195,15 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
195 return res; 195 return res;
196} 196}
197 197
198static void snapshot_deprecated_ioctl(unsigned int cmd)
199{
200 if (printk_ratelimit())
201 printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will "
202 "be removed soon, update your suspend-to-disk "
203 "utilities\n",
204 __builtin_return_address(0), cmd);
205}
206
198static long snapshot_ioctl(struct file *filp, unsigned int cmd, 207static long snapshot_ioctl(struct file *filp, unsigned int cmd,
199 unsigned long arg) 208 unsigned long arg)
200{ 209{
@@ -246,8 +255,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
246 data->frozen = 0; 255 data->frozen = 0;
247 break; 256 break;
248 257
249 case SNAPSHOT_CREATE_IMAGE:
250 case SNAPSHOT_ATOMIC_SNAPSHOT: 258 case SNAPSHOT_ATOMIC_SNAPSHOT:
259 snapshot_deprecated_ioctl(cmd);
260 case SNAPSHOT_CREATE_IMAGE:
251 if (data->mode != O_RDONLY || !data->frozen || data->ready) { 261 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
252 error = -EPERM; 262 error = -EPERM;
253 break; 263 break;
@@ -275,8 +285,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
275 data->ready = 0; 285 data->ready = 0;
276 break; 286 break;
277 287
278 case SNAPSHOT_PREF_IMAGE_SIZE:
279 case SNAPSHOT_SET_IMAGE_SIZE: 288 case SNAPSHOT_SET_IMAGE_SIZE:
289 snapshot_deprecated_ioctl(cmd);
290 case SNAPSHOT_PREF_IMAGE_SIZE:
280 image_size = arg; 291 image_size = arg;
281 break; 292 break;
282 293
@@ -290,15 +301,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
290 error = put_user(size, (loff_t __user *)arg); 301 error = put_user(size, (loff_t __user *)arg);
291 break; 302 break;
292 303
293 case SNAPSHOT_AVAIL_SWAP_SIZE:
294 case SNAPSHOT_AVAIL_SWAP: 304 case SNAPSHOT_AVAIL_SWAP:
305 snapshot_deprecated_ioctl(cmd);
306 case SNAPSHOT_AVAIL_SWAP_SIZE:
295 size = count_swap_pages(data->swap, 1); 307 size = count_swap_pages(data->swap, 1);
296 size <<= PAGE_SHIFT; 308 size <<= PAGE_SHIFT;
297 error = put_user(size, (loff_t __user *)arg); 309 error = put_user(size, (loff_t __user *)arg);
298 break; 310 break;
299 311
300 case SNAPSHOT_ALLOC_SWAP_PAGE:
301 case SNAPSHOT_GET_SWAP_PAGE: 312 case SNAPSHOT_GET_SWAP_PAGE:
313 snapshot_deprecated_ioctl(cmd);
314 case SNAPSHOT_ALLOC_SWAP_PAGE:
302 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { 315 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
303 error = -ENODEV; 316 error = -ENODEV;
304 break; 317 break;
@@ -321,6 +334,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
321 break; 334 break;
322 335
323 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ 336 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
337 snapshot_deprecated_ioctl(cmd);
324 if (!swsusp_swap_in_use()) { 338 if (!swsusp_swap_in_use()) {
325 /* 339 /*
326 * User space encodes device types as two-byte values, 340 * User space encodes device types as two-byte values,
@@ -362,6 +376,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
362 break; 376 break;
363 377
364 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ 378 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
379 snapshot_deprecated_ioctl(cmd);
365 error = -EINVAL; 380 error = -EINVAL;
366 381
367 switch (arg) { 382 switch (arg) {
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 23bd09cd042e..42ad8ae729a0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -22,6 +22,7 @@
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/regset.h>
25 26
26 27
27/* 28/*
@@ -511,6 +512,47 @@ static int ptrace_resume(struct task_struct *child, long request, long data)
511 return 0; 512 return 0;
512} 513}
513 514
515#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
516
517static const struct user_regset *
518find_regset(const struct user_regset_view *view, unsigned int type)
519{
520 const struct user_regset *regset;
521 int n;
522
523 for (n = 0; n < view->n; ++n) {
524 regset = view->regsets + n;
525 if (regset->core_note_type == type)
526 return regset;
527 }
528
529 return NULL;
530}
531
532static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
533 struct iovec *kiov)
534{
535 const struct user_regset_view *view = task_user_regset_view(task);
536 const struct user_regset *regset = find_regset(view, type);
537 int regset_no;
538
539 if (!regset || (kiov->iov_len % regset->size) != 0)
540 return -EINVAL;
541
542 regset_no = regset - view->regsets;
543 kiov->iov_len = min(kiov->iov_len,
544 (__kernel_size_t) (regset->n * regset->size));
545
546 if (req == PTRACE_GETREGSET)
547 return copy_regset_to_user(task, view, regset_no, 0,
548 kiov->iov_len, kiov->iov_base);
549 else
550 return copy_regset_from_user(task, view, regset_no, 0,
551 kiov->iov_len, kiov->iov_base);
552}
553
554#endif
555
514int ptrace_request(struct task_struct *child, long request, 556int ptrace_request(struct task_struct *child, long request,
515 long addr, long data) 557 long addr, long data)
516{ 558{
@@ -573,6 +615,26 @@ int ptrace_request(struct task_struct *child, long request,
573 return 0; 615 return 0;
574 return ptrace_resume(child, request, SIGKILL); 616 return ptrace_resume(child, request, SIGKILL);
575 617
618#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
619 case PTRACE_GETREGSET:
620 case PTRACE_SETREGSET:
621 {
622 struct iovec kiov;
623 struct iovec __user *uiov = (struct iovec __user *) data;
624
625 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
626 return -EFAULT;
627
628 if (__get_user(kiov.iov_base, &uiov->iov_base) ||
629 __get_user(kiov.iov_len, &uiov->iov_len))
630 return -EFAULT;
631
632 ret = ptrace_regset(child, request, addr, &kiov);
633 if (!ret)
634 ret = __put_user(kiov.iov_len, &uiov->iov_len);
635 break;
636 }
637#endif
576 default: 638 default:
577 break; 639 break;
578 } 640 }
@@ -711,6 +773,32 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
711 else 773 else
712 ret = ptrace_setsiginfo(child, &siginfo); 774 ret = ptrace_setsiginfo(child, &siginfo);
713 break; 775 break;
776#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
777 case PTRACE_GETREGSET:
778 case PTRACE_SETREGSET:
779 {
780 struct iovec kiov;
781 struct compat_iovec __user *uiov =
782 (struct compat_iovec __user *) datap;
783 compat_uptr_t ptr;
784 compat_size_t len;
785
786 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
787 return -EFAULT;
788
789 if (__get_user(ptr, &uiov->iov_base) ||
790 __get_user(len, &uiov->iov_len))
791 return -EFAULT;
792
793 kiov.iov_base = compat_ptr(ptr);
794 kiov.iov_len = len;
795
796 ret = ptrace_regset(child, request, addr, &kiov);
797 if (!ret)
798 ret = __put_user(kiov.iov_len, &uiov->iov_len);
799 break;
800 }
801#endif
714 802
715 default: 803 default:
716 ret = ptrace_request(child, request, addr, data); 804 ret = ptrace_request(child, request, addr, data);
diff --git a/kernel/resource.c b/kernel/resource.c
index af96c1e4b54b..4e9d87fd7bc5 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -188,6 +188,36 @@ static int __release_resource(struct resource *old)
188 return -EINVAL; 188 return -EINVAL;
189} 189}
190 190
191static void __release_child_resources(struct resource *r)
192{
193 struct resource *tmp, *p;
194 resource_size_t size;
195
196 p = r->child;
197 r->child = NULL;
198 while (p) {
199 tmp = p;
200 p = p->sibling;
201
202 tmp->parent = NULL;
203 tmp->sibling = NULL;
204 __release_child_resources(tmp);
205
206 printk(KERN_DEBUG "release child resource %pR\n", tmp);
207 /* need to restore size, and keep flags */
208 size = resource_size(tmp);
209 tmp->start = 0;
210 tmp->end = size - 1;
211 }
212}
213
214void release_child_resources(struct resource *r)
215{
216 write_lock(&resource_lock);
217 __release_child_resources(r);
218 write_unlock(&resource_lock);
219}
220
191/** 221/**
192 * request_resource - request and reserve an I/O or memory resource 222 * request_resource - request and reserve an I/O or memory resource
193 * @root: root resource descriptor 223 * @root: root resource descriptor
@@ -297,14 +327,29 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
297 327
298#endif 328#endif
299 329
330static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
331{
332 return 1;
333}
334/*
335 * This generic page_is_ram() returns true if specified address is
336 * registered as "System RAM" in iomem_resource list.
337 */
338int __weak page_is_ram(unsigned long pfn)
339{
340 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
341}
342
300/* 343/*
301 * Find empty slot in the resource tree given range and alignment. 344 * Find empty slot in the resource tree given range and alignment.
302 */ 345 */
303static int find_resource(struct resource *root, struct resource *new, 346static int find_resource(struct resource *root, struct resource *new,
304 resource_size_t size, resource_size_t min, 347 resource_size_t size, resource_size_t min,
305 resource_size_t max, resource_size_t align, 348 resource_size_t max, resource_size_t align,
306 void (*alignf)(void *, struct resource *, 349 resource_size_t (*alignf)(void *,
307 resource_size_t, resource_size_t), 350 const struct resource *,
351 resource_size_t,
352 resource_size_t),
308 void *alignf_data) 353 void *alignf_data)
309{ 354{
310 struct resource *this = root->child; 355 struct resource *this = root->child;
@@ -330,7 +375,7 @@ static int find_resource(struct resource *root, struct resource *new,
330 tmp.end = max; 375 tmp.end = max;
331 tmp.start = ALIGN(tmp.start, align); 376 tmp.start = ALIGN(tmp.start, align);
332 if (alignf) 377 if (alignf)
333 alignf(alignf_data, &tmp, size, align); 378 tmp.start = alignf(alignf_data, &tmp, size, align);
334 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { 379 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) {
335 new->start = tmp.start; 380 new->start = tmp.start;
336 new->end = tmp.start + size - 1; 381 new->end = tmp.start + size - 1;
@@ -358,8 +403,10 @@ static int find_resource(struct resource *root, struct resource *new,
358int allocate_resource(struct resource *root, struct resource *new, 403int allocate_resource(struct resource *root, struct resource *new,
359 resource_size_t size, resource_size_t min, 404 resource_size_t size, resource_size_t min,
360 resource_size_t max, resource_size_t align, 405 resource_size_t max, resource_size_t align,
361 void (*alignf)(void *, struct resource *, 406 resource_size_t (*alignf)(void *,
362 resource_size_t, resource_size_t), 407 const struct resource *,
408 resource_size_t,
409 resource_size_t),
363 void *alignf_data) 410 void *alignf_data)
364{ 411{
365 int err; 412 int err;
diff --git a/kernel/sched.c b/kernel/sched.c
index 3218f5213717..6a212c97f523 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -233,7 +233,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
233 */ 233 */
234static DEFINE_MUTEX(sched_domains_mutex); 234static DEFINE_MUTEX(sched_domains_mutex);
235 235
236#ifdef CONFIG_GROUP_SCHED 236#ifdef CONFIG_CGROUP_SCHED
237 237
238#include <linux/cgroup.h> 238#include <linux/cgroup.h>
239 239
@@ -243,13 +243,7 @@ static LIST_HEAD(task_groups);
243 243
244/* task group related information */ 244/* task group related information */
245struct task_group { 245struct task_group {
246#ifdef CONFIG_CGROUP_SCHED
247 struct cgroup_subsys_state css; 246 struct cgroup_subsys_state css;
248#endif
249
250#ifdef CONFIG_USER_SCHED
251 uid_t uid;
252#endif
253 247
254#ifdef CONFIG_FAIR_GROUP_SCHED 248#ifdef CONFIG_FAIR_GROUP_SCHED
255 /* schedulable entities of this group on each cpu */ 249 /* schedulable entities of this group on each cpu */
@@ -274,35 +268,7 @@ struct task_group {
274 struct list_head children; 268 struct list_head children;
275}; 269};
276 270
277#ifdef CONFIG_USER_SCHED
278
279/* Helper function to pass uid information to create_sched_user() */
280void set_tg_uid(struct user_struct *user)
281{
282 user->tg->uid = user->uid;
283}
284
285/*
286 * Root task group.
287 * Every UID task group (including init_task_group aka UID-0) will
288 * be a child to this group.
289 */
290struct task_group root_task_group;
291
292#ifdef CONFIG_FAIR_GROUP_SCHED
293/* Default task group's sched entity on each cpu */
294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
295/* Default task group's cfs_rq on each cpu */
296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
297#endif /* CONFIG_FAIR_GROUP_SCHED */
298
299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
302#endif /* CONFIG_RT_GROUP_SCHED */
303#else /* !CONFIG_USER_SCHED */
304#define root_task_group init_task_group 271#define root_task_group init_task_group
305#endif /* CONFIG_USER_SCHED */
306 272
307/* task_group_lock serializes add/remove of task groups and also changes to 273/* task_group_lock serializes add/remove of task groups and also changes to
308 * a task group's cpu shares. 274 * a task group's cpu shares.
@@ -318,11 +284,7 @@ static int root_task_group_empty(void)
318} 284}
319#endif 285#endif
320 286
321#ifdef CONFIG_USER_SCHED
322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
323#else /* !CONFIG_USER_SCHED */
324# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 287# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
325#endif /* CONFIG_USER_SCHED */
326 288
327/* 289/*
328 * A weight of 0 or 1 can cause arithmetics problems. 290 * A weight of 0 or 1 can cause arithmetics problems.
@@ -348,11 +310,7 @@ static inline struct task_group *task_group(struct task_struct *p)
348{ 310{
349 struct task_group *tg; 311 struct task_group *tg;
350 312
351#ifdef CONFIG_USER_SCHED 313#ifdef CONFIG_CGROUP_SCHED
352 rcu_read_lock();
353 tg = __task_cred(p)->user->tg;
354 rcu_read_unlock();
355#elif defined(CONFIG_CGROUP_SCHED)
356 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 314 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
357 struct task_group, css); 315 struct task_group, css);
358#else 316#else
@@ -383,7 +341,7 @@ static inline struct task_group *task_group(struct task_struct *p)
383 return NULL; 341 return NULL;
384} 342}
385 343
386#endif /* CONFIG_GROUP_SCHED */ 344#endif /* CONFIG_CGROUP_SCHED */
387 345
388/* CFS-related fields in a runqueue */ 346/* CFS-related fields in a runqueue */
389struct cfs_rq { 347struct cfs_rq {
@@ -478,7 +436,6 @@ struct rt_rq {
478 struct rq *rq; 436 struct rq *rq;
479 struct list_head leaf_rt_rq_list; 437 struct list_head leaf_rt_rq_list;
480 struct task_group *tg; 438 struct task_group *tg;
481 struct sched_rt_entity *rt_se;
482#endif 439#endif
483}; 440};
484 441
@@ -946,16 +903,33 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
946#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 903#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
947 904
948/* 905/*
906 * Check whether the task is waking, we use this to synchronize against
907 * ttwu() so that task_cpu() reports a stable number.
908 *
909 * We need to make an exception for PF_STARTING tasks because the fork
910 * path might require task_rq_lock() to work, eg. it can call
911 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
912 */
913static inline int task_is_waking(struct task_struct *p)
914{
915 return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
916}
917
918/*
949 * __task_rq_lock - lock the runqueue a given task resides on. 919 * __task_rq_lock - lock the runqueue a given task resides on.
950 * Must be called interrupts disabled. 920 * Must be called interrupts disabled.
951 */ 921 */
952static inline struct rq *__task_rq_lock(struct task_struct *p) 922static inline struct rq *__task_rq_lock(struct task_struct *p)
953 __acquires(rq->lock) 923 __acquires(rq->lock)
954{ 924{
925 struct rq *rq;
926
955 for (;;) { 927 for (;;) {
956 struct rq *rq = task_rq(p); 928 while (task_is_waking(p))
929 cpu_relax();
930 rq = task_rq(p);
957 raw_spin_lock(&rq->lock); 931 raw_spin_lock(&rq->lock);
958 if (likely(rq == task_rq(p))) 932 if (likely(rq == task_rq(p) && !task_is_waking(p)))
959 return rq; 933 return rq;
960 raw_spin_unlock(&rq->lock); 934 raw_spin_unlock(&rq->lock);
961 } 935 }
@@ -972,10 +946,12 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
972 struct rq *rq; 946 struct rq *rq;
973 947
974 for (;;) { 948 for (;;) {
949 while (task_is_waking(p))
950 cpu_relax();
975 local_irq_save(*flags); 951 local_irq_save(*flags);
976 rq = task_rq(p); 952 rq = task_rq(p);
977 raw_spin_lock(&rq->lock); 953 raw_spin_lock(&rq->lock);
978 if (likely(rq == task_rq(p))) 954 if (likely(rq == task_rq(p) && !task_is_waking(p)))
979 return rq; 955 return rq;
980 raw_spin_unlock_irqrestore(&rq->lock, *flags); 956 raw_spin_unlock_irqrestore(&rq->lock, *flags);
981 } 957 }
@@ -1395,32 +1371,6 @@ static const u32 prio_to_wmult[40] = {
1395 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1371 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1396}; 1372};
1397 1373
1398static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1399
1400/*
1401 * runqueue iterator, to support SMP load-balancing between different
1402 * scheduling classes, without having to expose their internal data
1403 * structures to the load-balancing proper:
1404 */
1405struct rq_iterator {
1406 void *arg;
1407 struct task_struct *(*start)(void *);
1408 struct task_struct *(*next)(void *);
1409};
1410
1411#ifdef CONFIG_SMP
1412static unsigned long
1413balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1414 unsigned long max_load_move, struct sched_domain *sd,
1415 enum cpu_idle_type idle, int *all_pinned,
1416 int *this_best_prio, struct rq_iterator *iterator);
1417
1418static int
1419iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1420 struct sched_domain *sd, enum cpu_idle_type idle,
1421 struct rq_iterator *iterator);
1422#endif
1423
1424/* Time spent by the tasks of the cpu accounting group executing in ... */ 1374/* Time spent by the tasks of the cpu accounting group executing in ... */
1425enum cpuacct_stat_index { 1375enum cpuacct_stat_index {
1426 CPUACCT_STAT_USER, /* ... user mode */ 1376 CPUACCT_STAT_USER, /* ... user mode */
@@ -1706,16 +1656,6 @@ static void update_shares(struct sched_domain *sd)
1706 } 1656 }
1707} 1657}
1708 1658
1709static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1710{
1711 if (root_task_group_empty())
1712 return;
1713
1714 raw_spin_unlock(&rq->lock);
1715 update_shares(sd);
1716 raw_spin_lock(&rq->lock);
1717}
1718
1719static void update_h_load(long cpu) 1659static void update_h_load(long cpu)
1720{ 1660{
1721 if (root_task_group_empty()) 1661 if (root_task_group_empty())
@@ -1730,10 +1670,6 @@ static inline void update_shares(struct sched_domain *sd)
1730{ 1670{
1731} 1671}
1732 1672
1733static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1734{
1735}
1736
1737#endif 1673#endif
1738 1674
1739#ifdef CONFIG_PREEMPT 1675#ifdef CONFIG_PREEMPT
@@ -1810,6 +1746,51 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1810 raw_spin_unlock(&busiest->lock); 1746 raw_spin_unlock(&busiest->lock);
1811 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1747 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1812} 1748}
1749
1750/*
1751 * double_rq_lock - safely lock two runqueues
1752 *
1753 * Note this does not disable interrupts like task_rq_lock,
1754 * you need to do so manually before calling.
1755 */
1756static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1757 __acquires(rq1->lock)
1758 __acquires(rq2->lock)
1759{
1760 BUG_ON(!irqs_disabled());
1761 if (rq1 == rq2) {
1762 raw_spin_lock(&rq1->lock);
1763 __acquire(rq2->lock); /* Fake it out ;) */
1764 } else {
1765 if (rq1 < rq2) {
1766 raw_spin_lock(&rq1->lock);
1767 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1768 } else {
1769 raw_spin_lock(&rq2->lock);
1770 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1771 }
1772 }
1773 update_rq_clock(rq1);
1774 update_rq_clock(rq2);
1775}
1776
1777/*
1778 * double_rq_unlock - safely unlock two runqueues
1779 *
1780 * Note this does not restore interrupts like task_rq_unlock,
1781 * you need to do so manually after calling.
1782 */
1783static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1784 __releases(rq1->lock)
1785 __releases(rq2->lock)
1786{
1787 raw_spin_unlock(&rq1->lock);
1788 if (rq1 != rq2)
1789 raw_spin_unlock(&rq2->lock);
1790 else
1791 __release(rq2->lock);
1792}
1793
1813#endif 1794#endif
1814 1795
1815#ifdef CONFIG_FAIR_GROUP_SCHED 1796#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1839,18 +1820,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1839#endif 1820#endif
1840} 1821}
1841 1822
1842#include "sched_stats.h" 1823static const struct sched_class rt_sched_class;
1843#include "sched_idletask.c"
1844#include "sched_fair.c"
1845#include "sched_rt.c"
1846#ifdef CONFIG_SCHED_DEBUG
1847# include "sched_debug.c"
1848#endif
1849 1824
1850#define sched_class_highest (&rt_sched_class) 1825#define sched_class_highest (&rt_sched_class)
1851#define for_each_class(class) \ 1826#define for_each_class(class) \
1852 for (class = sched_class_highest; class; class = class->next) 1827 for (class = sched_class_highest; class; class = class->next)
1853 1828
1829#include "sched_stats.h"
1830
1854static void inc_nr_running(struct rq *rq) 1831static void inc_nr_running(struct rq *rq)
1855{ 1832{
1856 rq->nr_running++; 1833 rq->nr_running++;
@@ -1888,13 +1865,14 @@ static void update_avg(u64 *avg, u64 sample)
1888 *avg += diff >> 3; 1865 *avg += diff >> 3;
1889} 1866}
1890 1867
1891static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1868static void
1869enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1892{ 1870{
1893 if (wakeup) 1871 if (wakeup)
1894 p->se.start_runtime = p->se.sum_exec_runtime; 1872 p->se.start_runtime = p->se.sum_exec_runtime;
1895 1873
1896 sched_info_queued(p); 1874 sched_info_queued(p);
1897 p->sched_class->enqueue_task(rq, p, wakeup); 1875 p->sched_class->enqueue_task(rq, p, wakeup, head);
1898 p->se.on_rq = 1; 1876 p->se.on_rq = 1;
1899} 1877}
1900 1878
@@ -1917,6 +1895,37 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1917} 1895}
1918 1896
1919/* 1897/*
1898 * activate_task - move a task to the runqueue.
1899 */
1900static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1901{
1902 if (task_contributes_to_load(p))
1903 rq->nr_uninterruptible--;
1904
1905 enqueue_task(rq, p, wakeup, false);
1906 inc_nr_running(rq);
1907}
1908
1909/*
1910 * deactivate_task - remove a task from the runqueue.
1911 */
1912static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1913{
1914 if (task_contributes_to_load(p))
1915 rq->nr_uninterruptible++;
1916
1917 dequeue_task(rq, p, sleep);
1918 dec_nr_running(rq);
1919}
1920
1921#include "sched_idletask.c"
1922#include "sched_fair.c"
1923#include "sched_rt.c"
1924#ifdef CONFIG_SCHED_DEBUG
1925# include "sched_debug.c"
1926#endif
1927
1928/*
1920 * __normal_prio - return the priority that is based on the static prio 1929 * __normal_prio - return the priority that is based on the static prio
1921 */ 1930 */
1922static inline int __normal_prio(struct task_struct *p) 1931static inline int __normal_prio(struct task_struct *p)
@@ -1962,30 +1971,6 @@ static int effective_prio(struct task_struct *p)
1962 return p->prio; 1971 return p->prio;
1963} 1972}
1964 1973
1965/*
1966 * activate_task - move a task to the runqueue.
1967 */
1968static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1969{
1970 if (task_contributes_to_load(p))
1971 rq->nr_uninterruptible--;
1972
1973 enqueue_task(rq, p, wakeup);
1974 inc_nr_running(rq);
1975}
1976
1977/*
1978 * deactivate_task - remove a task from the runqueue.
1979 */
1980static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1981{
1982 if (task_contributes_to_load(p))
1983 rq->nr_uninterruptible++;
1984
1985 dequeue_task(rq, p, sleep);
1986 dec_nr_running(rq);
1987}
1988
1989/** 1974/**
1990 * task_curr - is this task currently executing on a CPU? 1975 * task_curr - is this task currently executing on a CPU?
1991 * @p: the task in question. 1976 * @p: the task in question.
@@ -2413,14 +2398,27 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2413 __task_rq_unlock(rq); 2398 __task_rq_unlock(rq);
2414 2399
2415 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2400 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2416 if (cpu != orig_cpu) 2401 if (cpu != orig_cpu) {
2402 /*
2403 * Since we migrate the task without holding any rq->lock,
2404 * we need to be careful with task_rq_lock(), since that
2405 * might end up locking an invalid rq.
2406 */
2417 set_task_cpu(p, cpu); 2407 set_task_cpu(p, cpu);
2408 }
2418 2409
2419 rq = __task_rq_lock(p); 2410 rq = cpu_rq(cpu);
2411 raw_spin_lock(&rq->lock);
2420 update_rq_clock(rq); 2412 update_rq_clock(rq);
2421 2413
2414 /*
2415 * We migrated the task without holding either rq->lock, however
2416 * since the task is not on the task list itself, nobody else
2417 * will try and migrate the task, hence the rq should match the
2418 * cpu we just moved it to.
2419 */
2420 WARN_ON(task_cpu(p) != cpu);
2422 WARN_ON(p->state != TASK_WAKING); 2421 WARN_ON(p->state != TASK_WAKING);
2423 cpu = task_cpu(p);
2424 2422
2425#ifdef CONFIG_SCHEDSTATS 2423#ifdef CONFIG_SCHEDSTATS
2426 schedstat_inc(rq, ttwu_count); 2424 schedstat_inc(rq, ttwu_count);
@@ -2668,7 +2666,13 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2668 set_task_cpu(p, cpu); 2666 set_task_cpu(p, cpu);
2669#endif 2667#endif
2670 2668
2671 rq = task_rq_lock(p, &flags); 2669 /*
2670 * Since the task is not on the rq and we still have TASK_WAKING set
2671 * nobody else will migrate this task.
2672 */
2673 rq = cpu_rq(cpu);
2674 raw_spin_lock_irqsave(&rq->lock, flags);
2675
2672 BUG_ON(p->state != TASK_WAKING); 2676 BUG_ON(p->state != TASK_WAKING);
2673 p->state = TASK_RUNNING; 2677 p->state = TASK_RUNNING;
2674 update_rq_clock(rq); 2678 update_rq_clock(rq);
@@ -2799,7 +2803,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2799 */ 2803 */
2800 prev_state = prev->state; 2804 prev_state = prev->state;
2801 finish_arch_switch(prev); 2805 finish_arch_switch(prev);
2802 perf_event_task_sched_in(current, cpu_of(rq)); 2806#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2807 local_irq_disable();
2808#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2809 perf_event_task_sched_in(current);
2810#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2811 local_irq_enable();
2812#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2803 finish_lock_switch(rq, prev); 2813 finish_lock_switch(rq, prev);
2804 2814
2805 fire_sched_in_preempt_notifiers(current); 2815 fire_sched_in_preempt_notifiers(current);
@@ -3104,50 +3114,6 @@ static void update_cpu_load(struct rq *this_rq)
3104#ifdef CONFIG_SMP 3114#ifdef CONFIG_SMP
3105 3115
3106/* 3116/*
3107 * double_rq_lock - safely lock two runqueues
3108 *
3109 * Note this does not disable interrupts like task_rq_lock,
3110 * you need to do so manually before calling.
3111 */
3112static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3113 __acquires(rq1->lock)
3114 __acquires(rq2->lock)
3115{
3116 BUG_ON(!irqs_disabled());
3117 if (rq1 == rq2) {
3118 raw_spin_lock(&rq1->lock);
3119 __acquire(rq2->lock); /* Fake it out ;) */
3120 } else {
3121 if (rq1 < rq2) {
3122 raw_spin_lock(&rq1->lock);
3123 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3124 } else {
3125 raw_spin_lock(&rq2->lock);
3126 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3127 }
3128 }
3129 update_rq_clock(rq1);
3130 update_rq_clock(rq2);
3131}
3132
3133/*
3134 * double_rq_unlock - safely unlock two runqueues
3135 *
3136 * Note this does not restore interrupts like task_rq_unlock,
3137 * you need to do so manually after calling.
3138 */
3139static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3140 __releases(rq1->lock)
3141 __releases(rq2->lock)
3142{
3143 raw_spin_unlock(&rq1->lock);
3144 if (rq1 != rq2)
3145 raw_spin_unlock(&rq2->lock);
3146 else
3147 __release(rq2->lock);
3148}
3149
3150/*
3151 * sched_exec - execve() is a valuable balancing opportunity, because at 3117 * sched_exec - execve() is a valuable balancing opportunity, because at
3152 * this point the task has the smallest effective memory and cache footprint. 3118 * this point the task has the smallest effective memory and cache footprint.
3153 */ 3119 */
@@ -3195,1771 +3161,6 @@ again:
3195 task_rq_unlock(rq, &flags); 3161 task_rq_unlock(rq, &flags);
3196} 3162}
3197 3163
3198/*
3199 * pull_task - move a task from a remote runqueue to the local runqueue.
3200 * Both runqueues must be locked.
3201 */
3202static void pull_task(struct rq *src_rq, struct task_struct *p,
3203 struct rq *this_rq, int this_cpu)
3204{
3205 deactivate_task(src_rq, p, 0);
3206 set_task_cpu(p, this_cpu);
3207 activate_task(this_rq, p, 0);
3208 check_preempt_curr(this_rq, p, 0);
3209}
3210
3211/*
3212 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3213 */
3214static
3215int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3216 struct sched_domain *sd, enum cpu_idle_type idle,
3217 int *all_pinned)
3218{
3219 int tsk_cache_hot = 0;
3220 /*
3221 * We do not migrate tasks that are:
3222 * 1) running (obviously), or
3223 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3224 * 3) are cache-hot on their current CPU.
3225 */
3226 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3227 schedstat_inc(p, se.nr_failed_migrations_affine);
3228 return 0;
3229 }
3230 *all_pinned = 0;
3231
3232 if (task_running(rq, p)) {
3233 schedstat_inc(p, se.nr_failed_migrations_running);
3234 return 0;
3235 }
3236
3237 /*
3238 * Aggressive migration if:
3239 * 1) task is cache cold, or
3240 * 2) too many balance attempts have failed.
3241 */
3242
3243 tsk_cache_hot = task_hot(p, rq->clock, sd);
3244 if (!tsk_cache_hot ||
3245 sd->nr_balance_failed > sd->cache_nice_tries) {
3246#ifdef CONFIG_SCHEDSTATS
3247 if (tsk_cache_hot) {
3248 schedstat_inc(sd, lb_hot_gained[idle]);
3249 schedstat_inc(p, se.nr_forced_migrations);
3250 }
3251#endif
3252 return 1;
3253 }
3254
3255 if (tsk_cache_hot) {
3256 schedstat_inc(p, se.nr_failed_migrations_hot);
3257 return 0;
3258 }
3259 return 1;
3260}
3261
3262static unsigned long
3263balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3264 unsigned long max_load_move, struct sched_domain *sd,
3265 enum cpu_idle_type idle, int *all_pinned,
3266 int *this_best_prio, struct rq_iterator *iterator)
3267{
3268 int loops = 0, pulled = 0, pinned = 0;
3269 struct task_struct *p;
3270 long rem_load_move = max_load_move;
3271
3272 if (max_load_move == 0)
3273 goto out;
3274
3275 pinned = 1;
3276
3277 /*
3278 * Start the load-balancing iterator:
3279 */
3280 p = iterator->start(iterator->arg);
3281next:
3282 if (!p || loops++ > sysctl_sched_nr_migrate)
3283 goto out;
3284
3285 if ((p->se.load.weight >> 1) > rem_load_move ||
3286 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3287 p = iterator->next(iterator->arg);
3288 goto next;
3289 }
3290
3291 pull_task(busiest, p, this_rq, this_cpu);
3292 pulled++;
3293 rem_load_move -= p->se.load.weight;
3294
3295#ifdef CONFIG_PREEMPT
3296 /*
3297 * NEWIDLE balancing is a source of latency, so preemptible kernels
3298 * will stop after the first task is pulled to minimize the critical
3299 * section.
3300 */
3301 if (idle == CPU_NEWLY_IDLE)
3302 goto out;
3303#endif
3304
3305 /*
3306 * We only want to steal up to the prescribed amount of weighted load.
3307 */
3308 if (rem_load_move > 0) {
3309 if (p->prio < *this_best_prio)
3310 *this_best_prio = p->prio;
3311 p = iterator->next(iterator->arg);
3312 goto next;
3313 }
3314out:
3315 /*
3316 * Right now, this is one of only two places pull_task() is called,
3317 * so we can safely collect pull_task() stats here rather than
3318 * inside pull_task().
3319 */
3320 schedstat_add(sd, lb_gained[idle], pulled);
3321
3322 if (all_pinned)
3323 *all_pinned = pinned;
3324
3325 return max_load_move - rem_load_move;
3326}
3327
3328/*
3329 * move_tasks tries to move up to max_load_move weighted load from busiest to
3330 * this_rq, as part of a balancing operation within domain "sd".
3331 * Returns 1 if successful and 0 otherwise.
3332 *
3333 * Called with both runqueues locked.
3334 */
3335static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3336 unsigned long max_load_move,
3337 struct sched_domain *sd, enum cpu_idle_type idle,
3338 int *all_pinned)
3339{
3340 const struct sched_class *class = sched_class_highest;
3341 unsigned long total_load_moved = 0;
3342 int this_best_prio = this_rq->curr->prio;
3343
3344 do {
3345 total_load_moved +=
3346 class->load_balance(this_rq, this_cpu, busiest,
3347 max_load_move - total_load_moved,
3348 sd, idle, all_pinned, &this_best_prio);
3349 class = class->next;
3350
3351#ifdef CONFIG_PREEMPT
3352 /*
3353 * NEWIDLE balancing is a source of latency, so preemptible
3354 * kernels will stop after the first task is pulled to minimize
3355 * the critical section.
3356 */
3357 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3358 break;
3359#endif
3360 } while (class && max_load_move > total_load_moved);
3361
3362 return total_load_moved > 0;
3363}
3364
3365static int
3366iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3367 struct sched_domain *sd, enum cpu_idle_type idle,
3368 struct rq_iterator *iterator)
3369{
3370 struct task_struct *p = iterator->start(iterator->arg);
3371 int pinned = 0;
3372
3373 while (p) {
3374 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3375 pull_task(busiest, p, this_rq, this_cpu);
3376 /*
3377 * Right now, this is only the second place pull_task()
3378 * is called, so we can safely collect pull_task()
3379 * stats here rather than inside pull_task().
3380 */
3381 schedstat_inc(sd, lb_gained[idle]);
3382
3383 return 1;
3384 }
3385 p = iterator->next(iterator->arg);
3386 }
3387
3388 return 0;
3389}
3390
3391/*
3392 * move_one_task tries to move exactly one task from busiest to this_rq, as
3393 * part of active balancing operations within "domain".
3394 * Returns 1 if successful and 0 otherwise.
3395 *
3396 * Called with both runqueues locked.
3397 */
3398static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3399 struct sched_domain *sd, enum cpu_idle_type idle)
3400{
3401 const struct sched_class *class;
3402
3403 for_each_class(class) {
3404 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3405 return 1;
3406 }
3407
3408 return 0;
3409}
3410/********** Helpers for find_busiest_group ************************/
3411/*
3412 * sd_lb_stats - Structure to store the statistics of a sched_domain
3413 * during load balancing.
3414 */
3415struct sd_lb_stats {
3416 struct sched_group *busiest; /* Busiest group in this sd */
3417 struct sched_group *this; /* Local group in this sd */
3418 unsigned long total_load; /* Total load of all groups in sd */
3419 unsigned long total_pwr; /* Total power of all groups in sd */
3420 unsigned long avg_load; /* Average load across all groups in sd */
3421
3422 /** Statistics of this group */
3423 unsigned long this_load;
3424 unsigned long this_load_per_task;
3425 unsigned long this_nr_running;
3426
3427 /* Statistics of the busiest group */
3428 unsigned long max_load;
3429 unsigned long busiest_load_per_task;
3430 unsigned long busiest_nr_running;
3431
3432 int group_imb; /* Is there imbalance in this sd */
3433#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3434 int power_savings_balance; /* Is powersave balance needed for this sd */
3435 struct sched_group *group_min; /* Least loaded group in sd */
3436 struct sched_group *group_leader; /* Group which relieves group_min */
3437 unsigned long min_load_per_task; /* load_per_task in group_min */
3438 unsigned long leader_nr_running; /* Nr running of group_leader */
3439 unsigned long min_nr_running; /* Nr running of group_min */
3440#endif
3441};
3442
3443/*
3444 * sg_lb_stats - stats of a sched_group required for load_balancing
3445 */
3446struct sg_lb_stats {
3447 unsigned long avg_load; /*Avg load across the CPUs of the group */
3448 unsigned long group_load; /* Total load over the CPUs of the group */
3449 unsigned long sum_nr_running; /* Nr tasks running in the group */
3450 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3451 unsigned long group_capacity;
3452 int group_imb; /* Is there an imbalance in the group ? */
3453};
3454
3455/**
3456 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3457 * @group: The group whose first cpu is to be returned.
3458 */
3459static inline unsigned int group_first_cpu(struct sched_group *group)
3460{
3461 return cpumask_first(sched_group_cpus(group));
3462}
3463
3464/**
3465 * get_sd_load_idx - Obtain the load index for a given sched domain.
3466 * @sd: The sched_domain whose load_idx is to be obtained.
3467 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
3468 */
3469static inline int get_sd_load_idx(struct sched_domain *sd,
3470 enum cpu_idle_type idle)
3471{
3472 int load_idx;
3473
3474 switch (idle) {
3475 case CPU_NOT_IDLE:
3476 load_idx = sd->busy_idx;
3477 break;
3478
3479 case CPU_NEWLY_IDLE:
3480 load_idx = sd->newidle_idx;
3481 break;
3482 default:
3483 load_idx = sd->idle_idx;
3484 break;
3485 }
3486
3487 return load_idx;
3488}
3489
3490
3491#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3492/**
3493 * init_sd_power_savings_stats - Initialize power savings statistics for
3494 * the given sched_domain, during load balancing.
3495 *
3496 * @sd: Sched domain whose power-savings statistics are to be initialized.
3497 * @sds: Variable containing the statistics for sd.
3498 * @idle: Idle status of the CPU at which we're performing load-balancing.
3499 */
3500static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3501 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3502{
3503 /*
3504 * Busy processors will not participate in power savings
3505 * balance.
3506 */
3507 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3508 sds->power_savings_balance = 0;
3509 else {
3510 sds->power_savings_balance = 1;
3511 sds->min_nr_running = ULONG_MAX;
3512 sds->leader_nr_running = 0;
3513 }
3514}
3515
3516/**
3517 * update_sd_power_savings_stats - Update the power saving stats for a
3518 * sched_domain while performing load balancing.
3519 *
3520 * @group: sched_group belonging to the sched_domain under consideration.
3521 * @sds: Variable containing the statistics of the sched_domain
3522 * @local_group: Does group contain the CPU for which we're performing
3523 * load balancing ?
3524 * @sgs: Variable containing the statistics of the group.
3525 */
3526static inline void update_sd_power_savings_stats(struct sched_group *group,
3527 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3528{
3529
3530 if (!sds->power_savings_balance)
3531 return;
3532
3533 /*
3534 * If the local group is idle or completely loaded
3535 * no need to do power savings balance at this domain
3536 */
3537 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3538 !sds->this_nr_running))
3539 sds->power_savings_balance = 0;
3540
3541 /*
3542 * If a group is already running at full capacity or idle,
3543 * don't include that group in power savings calculations
3544 */
3545 if (!sds->power_savings_balance ||
3546 sgs->sum_nr_running >= sgs->group_capacity ||
3547 !sgs->sum_nr_running)
3548 return;
3549
3550 /*
3551 * Calculate the group which has the least non-idle load.
3552 * This is the group from where we need to pick up the load
3553 * for saving power
3554 */
3555 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3556 (sgs->sum_nr_running == sds->min_nr_running &&
3557 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3558 sds->group_min = group;
3559 sds->min_nr_running = sgs->sum_nr_running;
3560 sds->min_load_per_task = sgs->sum_weighted_load /
3561 sgs->sum_nr_running;
3562 }
3563
3564 /*
3565 * Calculate the group which is almost near its
3566 * capacity but still has some space to pick up some load
3567 * from other group and save more power
3568 */
3569 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3570 return;
3571
3572 if (sgs->sum_nr_running > sds->leader_nr_running ||
3573 (sgs->sum_nr_running == sds->leader_nr_running &&
3574 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3575 sds->group_leader = group;
3576 sds->leader_nr_running = sgs->sum_nr_running;
3577 }
3578}
3579
3580/**
3581 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3582 * @sds: Variable containing the statistics of the sched_domain
3583 * under consideration.
3584 * @this_cpu: Cpu at which we're currently performing load-balancing.
3585 * @imbalance: Variable to store the imbalance.
3586 *
3587 * Description:
3588 * Check if we have potential to perform some power-savings balance.
3589 * If yes, set the busiest group to be the least loaded group in the
3590 * sched_domain, so that it's CPUs can be put to idle.
3591 *
3592 * Returns 1 if there is potential to perform power-savings balance.
3593 * Else returns 0.
3594 */
3595static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3596 int this_cpu, unsigned long *imbalance)
3597{
3598 if (!sds->power_savings_balance)
3599 return 0;
3600
3601 if (sds->this != sds->group_leader ||
3602 sds->group_leader == sds->group_min)
3603 return 0;
3604
3605 *imbalance = sds->min_load_per_task;
3606 sds->busiest = sds->group_min;
3607
3608 return 1;
3609
3610}
3611#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3612static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3613 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3614{
3615 return;
3616}
3617
3618static inline void update_sd_power_savings_stats(struct sched_group *group,
3619 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3620{
3621 return;
3622}
3623
3624static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3625 int this_cpu, unsigned long *imbalance)
3626{
3627 return 0;
3628}
3629#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3630
3631
3632unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3633{
3634 return SCHED_LOAD_SCALE;
3635}
3636
3637unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3638{
3639 return default_scale_freq_power(sd, cpu);
3640}
3641
3642unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3643{
3644 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3645 unsigned long smt_gain = sd->smt_gain;
3646
3647 smt_gain /= weight;
3648
3649 return smt_gain;
3650}
3651
3652unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3653{
3654 return default_scale_smt_power(sd, cpu);
3655}
3656
3657unsigned long scale_rt_power(int cpu)
3658{
3659 struct rq *rq = cpu_rq(cpu);
3660 u64 total, available;
3661
3662 sched_avg_update(rq);
3663
3664 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3665 available = total - rq->rt_avg;
3666
3667 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3668 total = SCHED_LOAD_SCALE;
3669
3670 total >>= SCHED_LOAD_SHIFT;
3671
3672 return div_u64(available, total);
3673}
3674
3675static void update_cpu_power(struct sched_domain *sd, int cpu)
3676{
3677 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3678 unsigned long power = SCHED_LOAD_SCALE;
3679 struct sched_group *sdg = sd->groups;
3680
3681 if (sched_feat(ARCH_POWER))
3682 power *= arch_scale_freq_power(sd, cpu);
3683 else
3684 power *= default_scale_freq_power(sd, cpu);
3685
3686 power >>= SCHED_LOAD_SHIFT;
3687
3688 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3689 if (sched_feat(ARCH_POWER))
3690 power *= arch_scale_smt_power(sd, cpu);
3691 else
3692 power *= default_scale_smt_power(sd, cpu);
3693
3694 power >>= SCHED_LOAD_SHIFT;
3695 }
3696
3697 power *= scale_rt_power(cpu);
3698 power >>= SCHED_LOAD_SHIFT;
3699
3700 if (!power)
3701 power = 1;
3702
3703 sdg->cpu_power = power;
3704}
3705
3706static void update_group_power(struct sched_domain *sd, int cpu)
3707{
3708 struct sched_domain *child = sd->child;
3709 struct sched_group *group, *sdg = sd->groups;
3710 unsigned long power;
3711
3712 if (!child) {
3713 update_cpu_power(sd, cpu);
3714 return;
3715 }
3716
3717 power = 0;
3718
3719 group = child->groups;
3720 do {
3721 power += group->cpu_power;
3722 group = group->next;
3723 } while (group != child->groups);
3724
3725 sdg->cpu_power = power;
3726}
3727
3728/**
3729 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3730 * @sd: The sched_domain whose statistics are to be updated.
3731 * @group: sched_group whose statistics are to be updated.
3732 * @this_cpu: Cpu for which load balance is currently performed.
3733 * @idle: Idle status of this_cpu
3734 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3735 * @sd_idle: Idle status of the sched_domain containing group.
3736 * @local_group: Does group contain this_cpu.
3737 * @cpus: Set of cpus considered for load balancing.
3738 * @balance: Should we balance.
3739 * @sgs: variable to hold the statistics for this group.
3740 */
3741static inline void update_sg_lb_stats(struct sched_domain *sd,
3742 struct sched_group *group, int this_cpu,
3743 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3744 int local_group, const struct cpumask *cpus,
3745 int *balance, struct sg_lb_stats *sgs)
3746{
3747 unsigned long load, max_cpu_load, min_cpu_load;
3748 int i;
3749 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3750 unsigned long sum_avg_load_per_task;
3751 unsigned long avg_load_per_task;
3752
3753 if (local_group) {
3754 balance_cpu = group_first_cpu(group);
3755 if (balance_cpu == this_cpu)
3756 update_group_power(sd, this_cpu);
3757 }
3758
3759 /* Tally up the load of all CPUs in the group */
3760 sum_avg_load_per_task = avg_load_per_task = 0;
3761 max_cpu_load = 0;
3762 min_cpu_load = ~0UL;
3763
3764 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3765 struct rq *rq = cpu_rq(i);
3766
3767 if (*sd_idle && rq->nr_running)
3768 *sd_idle = 0;
3769
3770 /* Bias balancing toward cpus of our domain */
3771 if (local_group) {
3772 if (idle_cpu(i) && !first_idle_cpu) {
3773 first_idle_cpu = 1;
3774 balance_cpu = i;
3775 }
3776
3777 load = target_load(i, load_idx);
3778 } else {
3779 load = source_load(i, load_idx);
3780 if (load > max_cpu_load)
3781 max_cpu_load = load;
3782 if (min_cpu_load > load)
3783 min_cpu_load = load;
3784 }
3785
3786 sgs->group_load += load;
3787 sgs->sum_nr_running += rq->nr_running;
3788 sgs->sum_weighted_load += weighted_cpuload(i);
3789
3790 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3791 }
3792
3793 /*
3794 * First idle cpu or the first cpu(busiest) in this sched group
3795 * is eligible for doing load balancing at this and above
3796 * domains. In the newly idle case, we will allow all the cpu's
3797 * to do the newly idle load balance.
3798 */
3799 if (idle != CPU_NEWLY_IDLE && local_group &&
3800 balance_cpu != this_cpu && balance) {
3801 *balance = 0;
3802 return;
3803 }
3804
3805 /* Adjust by relative CPU power of the group */
3806 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3807
3808
3809 /*
3810 * Consider the group unbalanced when the imbalance is larger
3811 * than the average weight of two tasks.
3812 *
3813 * APZ: with cgroup the avg task weight can vary wildly and
3814 * might not be a suitable number - should we keep a
3815 * normalized nr_running number somewhere that negates
3816 * the hierarchy?
3817 */
3818 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3819 group->cpu_power;
3820
3821 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3822 sgs->group_imb = 1;
3823
3824 sgs->group_capacity =
3825 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3826}
3827
3828/**
3829 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
3830 * @sd: sched_domain whose statistics are to be updated.
3831 * @this_cpu: Cpu for which load balance is currently performed.
3832 * @idle: Idle status of this_cpu
3833 * @sd_idle: Idle status of the sched_domain containing group.
3834 * @cpus: Set of cpus considered for load balancing.
3835 * @balance: Should we balance.
3836 * @sds: variable to hold the statistics for this sched_domain.
3837 */
3838static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3839 enum cpu_idle_type idle, int *sd_idle,
3840 const struct cpumask *cpus, int *balance,
3841 struct sd_lb_stats *sds)
3842{
3843 struct sched_domain *child = sd->child;
3844 struct sched_group *group = sd->groups;
3845 struct sg_lb_stats sgs;
3846 int load_idx, prefer_sibling = 0;
3847
3848 if (child && child->flags & SD_PREFER_SIBLING)
3849 prefer_sibling = 1;
3850
3851 init_sd_power_savings_stats(sd, sds, idle);
3852 load_idx = get_sd_load_idx(sd, idle);
3853
3854 do {
3855 int local_group;
3856
3857 local_group = cpumask_test_cpu(this_cpu,
3858 sched_group_cpus(group));
3859 memset(&sgs, 0, sizeof(sgs));
3860 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3861 local_group, cpus, balance, &sgs);
3862
3863 if (local_group && balance && !(*balance))
3864 return;
3865
3866 sds->total_load += sgs.group_load;
3867 sds->total_pwr += group->cpu_power;
3868
3869 /*
3870 * In case the child domain prefers tasks go to siblings
3871 * first, lower the group capacity to one so that we'll try
3872 * and move all the excess tasks away.
3873 */
3874 if (prefer_sibling)
3875 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3876
3877 if (local_group) {
3878 sds->this_load = sgs.avg_load;
3879 sds->this = group;
3880 sds->this_nr_running = sgs.sum_nr_running;
3881 sds->this_load_per_task = sgs.sum_weighted_load;
3882 } else if (sgs.avg_load > sds->max_load &&
3883 (sgs.sum_nr_running > sgs.group_capacity ||
3884 sgs.group_imb)) {
3885 sds->max_load = sgs.avg_load;
3886 sds->busiest = group;
3887 sds->busiest_nr_running = sgs.sum_nr_running;
3888 sds->busiest_load_per_task = sgs.sum_weighted_load;
3889 sds->group_imb = sgs.group_imb;
3890 }
3891
3892 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3893 group = group->next;
3894 } while (group != sd->groups);
3895}
3896
3897/**
3898 * fix_small_imbalance - Calculate the minor imbalance that exists
3899 * amongst the groups of a sched_domain, during
3900 * load balancing.
3901 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3902 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3903 * @imbalance: Variable to store the imbalance.
3904 */
3905static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3906 int this_cpu, unsigned long *imbalance)
3907{
3908 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3909 unsigned int imbn = 2;
3910
3911 if (sds->this_nr_running) {
3912 sds->this_load_per_task /= sds->this_nr_running;
3913 if (sds->busiest_load_per_task >
3914 sds->this_load_per_task)
3915 imbn = 1;
3916 } else
3917 sds->this_load_per_task =
3918 cpu_avg_load_per_task(this_cpu);
3919
3920 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3921 sds->busiest_load_per_task * imbn) {
3922 *imbalance = sds->busiest_load_per_task;
3923 return;
3924 }
3925
3926 /*
3927 * OK, we don't have enough imbalance to justify moving tasks,
3928 * however we may be able to increase total CPU power used by
3929 * moving them.
3930 */
3931
3932 pwr_now += sds->busiest->cpu_power *
3933 min(sds->busiest_load_per_task, sds->max_load);
3934 pwr_now += sds->this->cpu_power *
3935 min(sds->this_load_per_task, sds->this_load);
3936 pwr_now /= SCHED_LOAD_SCALE;
3937
3938 /* Amount of load we'd subtract */
3939 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3940 sds->busiest->cpu_power;
3941 if (sds->max_load > tmp)
3942 pwr_move += sds->busiest->cpu_power *
3943 min(sds->busiest_load_per_task, sds->max_load - tmp);
3944
3945 /* Amount of load we'd add */
3946 if (sds->max_load * sds->busiest->cpu_power <
3947 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3948 tmp = (sds->max_load * sds->busiest->cpu_power) /
3949 sds->this->cpu_power;
3950 else
3951 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3952 sds->this->cpu_power;
3953 pwr_move += sds->this->cpu_power *
3954 min(sds->this_load_per_task, sds->this_load + tmp);
3955 pwr_move /= SCHED_LOAD_SCALE;
3956
3957 /* Move if we gain throughput */
3958 if (pwr_move > pwr_now)
3959 *imbalance = sds->busiest_load_per_task;
3960}
3961
3962/**
3963 * calculate_imbalance - Calculate the amount of imbalance present within the
3964 * groups of a given sched_domain during load balance.
3965 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3966 * @this_cpu: Cpu for which currently load balance is being performed.
3967 * @imbalance: The variable to store the imbalance.
3968 */
3969static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3970 unsigned long *imbalance)
3971{
3972 unsigned long max_pull;
3973 /*
3974 * In the presence of smp nice balancing, certain scenarios can have
3975 * max load less than avg load(as we skip the groups at or below
3976 * its cpu_power, while calculating max_load..)
3977 */
3978 if (sds->max_load < sds->avg_load) {
3979 *imbalance = 0;
3980 return fix_small_imbalance(sds, this_cpu, imbalance);
3981 }
3982
3983 /* Don't want to pull so many tasks that a group would go idle */
3984 max_pull = min(sds->max_load - sds->avg_load,
3985 sds->max_load - sds->busiest_load_per_task);
3986
3987 /* How much load to actually move to equalise the imbalance */
3988 *imbalance = min(max_pull * sds->busiest->cpu_power,
3989 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3990 / SCHED_LOAD_SCALE;
3991
3992 /*
3993 * if *imbalance is less than the average load per runnable task
3994 * there is no gaurantee that any tasks will be moved so we'll have
3995 * a think about bumping its value to force at least one task to be
3996 * moved
3997 */
3998 if (*imbalance < sds->busiest_load_per_task)
3999 return fix_small_imbalance(sds, this_cpu, imbalance);
4000
4001}
4002/******* find_busiest_group() helpers end here *********************/
4003
4004/**
4005 * find_busiest_group - Returns the busiest group within the sched_domain
4006 * if there is an imbalance. If there isn't an imbalance, and
4007 * the user has opted for power-savings, it returns a group whose
4008 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
4009 * such a group exists.
4010 *
4011 * Also calculates the amount of weighted load which should be moved
4012 * to restore balance.
4013 *
4014 * @sd: The sched_domain whose busiest group is to be returned.
4015 * @this_cpu: The cpu for which load balancing is currently being performed.
4016 * @imbalance: Variable which stores amount of weighted load which should
4017 * be moved to restore balance/put a group to idle.
4018 * @idle: The idle status of this_cpu.
4019 * @sd_idle: The idleness of sd
4020 * @cpus: The set of CPUs under consideration for load-balancing.
4021 * @balance: Pointer to a variable indicating if this_cpu
4022 * is the appropriate cpu to perform load balancing at this_level.
4023 *
4024 * Returns: - the busiest group if imbalance exists.
4025 * - If no imbalance and user has opted for power-savings balance,
4026 * return the least loaded group whose CPUs can be
4027 * put to idle by rebalancing its tasks onto our group.
4028 */
4029static struct sched_group *
4030find_busiest_group(struct sched_domain *sd, int this_cpu,
4031 unsigned long *imbalance, enum cpu_idle_type idle,
4032 int *sd_idle, const struct cpumask *cpus, int *balance)
4033{
4034 struct sd_lb_stats sds;
4035
4036 memset(&sds, 0, sizeof(sds));
4037
4038 /*
4039 * Compute the various statistics relavent for load balancing at
4040 * this level.
4041 */
4042 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
4043 balance, &sds);
4044
4045 /* Cases where imbalance does not exist from POV of this_cpu */
4046 /* 1) this_cpu is not the appropriate cpu to perform load balancing
4047 * at this level.
4048 * 2) There is no busy sibling group to pull from.
4049 * 3) This group is the busiest group.
4050 * 4) This group is more busy than the avg busieness at this
4051 * sched_domain.
4052 * 5) The imbalance is within the specified limit.
4053 * 6) Any rebalance would lead to ping-pong
4054 */
4055 if (balance && !(*balance))
4056 goto ret;
4057
4058 if (!sds.busiest || sds.busiest_nr_running == 0)
4059 goto out_balanced;
4060
4061 if (sds.this_load >= sds.max_load)
4062 goto out_balanced;
4063
4064 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
4065
4066 if (sds.this_load >= sds.avg_load)
4067 goto out_balanced;
4068
4069 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
4070 goto out_balanced;
4071
4072 sds.busiest_load_per_task /= sds.busiest_nr_running;
4073 if (sds.group_imb)
4074 sds.busiest_load_per_task =
4075 min(sds.busiest_load_per_task, sds.avg_load);
4076
4077 /*
4078 * We're trying to get all the cpus to the average_load, so we don't
4079 * want to push ourselves above the average load, nor do we wish to
4080 * reduce the max loaded cpu below the average load, as either of these
4081 * actions would just result in more rebalancing later, and ping-pong
4082 * tasks around. Thus we look for the minimum possible imbalance.
4083 * Negative imbalances (*we* are more loaded than anyone else) will
4084 * be counted as no imbalance for these purposes -- we can't fix that
4085 * by pulling tasks to us. Be careful of negative numbers as they'll
4086 * appear as very large values with unsigned longs.
4087 */
4088 if (sds.max_load <= sds.busiest_load_per_task)
4089 goto out_balanced;
4090
4091 /* Looks like there is an imbalance. Compute it */
4092 calculate_imbalance(&sds, this_cpu, imbalance);
4093 return sds.busiest;
4094
4095out_balanced:
4096 /*
4097 * There is no obvious imbalance. But check if we can do some balancing
4098 * to save power.
4099 */
4100 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4101 return sds.busiest;
4102ret:
4103 *imbalance = 0;
4104 return NULL;
4105}
4106
4107/*
4108 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4109 */
4110static struct rq *
4111find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4112 unsigned long imbalance, const struct cpumask *cpus)
4113{
4114 struct rq *busiest = NULL, *rq;
4115 unsigned long max_load = 0;
4116 int i;
4117
4118 for_each_cpu(i, sched_group_cpus(group)) {
4119 unsigned long power = power_of(i);
4120 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4121 unsigned long wl;
4122
4123 if (!cpumask_test_cpu(i, cpus))
4124 continue;
4125
4126 rq = cpu_rq(i);
4127 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4128 wl /= power;
4129
4130 if (capacity && rq->nr_running == 1 && wl > imbalance)
4131 continue;
4132
4133 if (wl > max_load) {
4134 max_load = wl;
4135 busiest = rq;
4136 }
4137 }
4138
4139 return busiest;
4140}
4141
4142/*
4143 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
4144 * so long as it is large enough.
4145 */
4146#define MAX_PINNED_INTERVAL 512
4147
4148/* Working cpumask for load_balance and load_balance_newidle. */
4149static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4150
4151/*
4152 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4153 * tasks if there is an imbalance.
4154 */
4155static int load_balance(int this_cpu, struct rq *this_rq,
4156 struct sched_domain *sd, enum cpu_idle_type idle,
4157 int *balance)
4158{
4159 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4160 struct sched_group *group;
4161 unsigned long imbalance;
4162 struct rq *busiest;
4163 unsigned long flags;
4164 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4165
4166 cpumask_copy(cpus, cpu_active_mask);
4167
4168 /*
4169 * When power savings policy is enabled for the parent domain, idle
4170 * sibling can pick up load irrespective of busy siblings. In this case,
4171 * let the state of idle sibling percolate up as CPU_IDLE, instead of
4172 * portraying it as CPU_NOT_IDLE.
4173 */
4174 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4175 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4176 sd_idle = 1;
4177
4178 schedstat_inc(sd, lb_count[idle]);
4179
4180redo:
4181 update_shares(sd);
4182 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4183 cpus, balance);
4184
4185 if (*balance == 0)
4186 goto out_balanced;
4187
4188 if (!group) {
4189 schedstat_inc(sd, lb_nobusyg[idle]);
4190 goto out_balanced;
4191 }
4192
4193 busiest = find_busiest_queue(group, idle, imbalance, cpus);
4194 if (!busiest) {
4195 schedstat_inc(sd, lb_nobusyq[idle]);
4196 goto out_balanced;
4197 }
4198
4199 BUG_ON(busiest == this_rq);
4200
4201 schedstat_add(sd, lb_imbalance[idle], imbalance);
4202
4203 ld_moved = 0;
4204 if (busiest->nr_running > 1) {
4205 /*
4206 * Attempt to move tasks. If find_busiest_group has found
4207 * an imbalance but busiest->nr_running <= 1, the group is
4208 * still unbalanced. ld_moved simply stays zero, so it is
4209 * correctly treated as an imbalance.
4210 */
4211 local_irq_save(flags);
4212 double_rq_lock(this_rq, busiest);
4213 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4214 imbalance, sd, idle, &all_pinned);
4215 double_rq_unlock(this_rq, busiest);
4216 local_irq_restore(flags);
4217
4218 /*
4219 * some other cpu did the load balance for us.
4220 */
4221 if (ld_moved && this_cpu != smp_processor_id())
4222 resched_cpu(this_cpu);
4223
4224 /* All tasks on this runqueue were pinned by CPU affinity */
4225 if (unlikely(all_pinned)) {
4226 cpumask_clear_cpu(cpu_of(busiest), cpus);
4227 if (!cpumask_empty(cpus))
4228 goto redo;
4229 goto out_balanced;
4230 }
4231 }
4232
4233 if (!ld_moved) {
4234 schedstat_inc(sd, lb_failed[idle]);
4235 sd->nr_balance_failed++;
4236
4237 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4238
4239 raw_spin_lock_irqsave(&busiest->lock, flags);
4240
4241 /* don't kick the migration_thread, if the curr
4242 * task on busiest cpu can't be moved to this_cpu
4243 */
4244 if (!cpumask_test_cpu(this_cpu,
4245 &busiest->curr->cpus_allowed)) {
4246 raw_spin_unlock_irqrestore(&busiest->lock,
4247 flags);
4248 all_pinned = 1;
4249 goto out_one_pinned;
4250 }
4251
4252 if (!busiest->active_balance) {
4253 busiest->active_balance = 1;
4254 busiest->push_cpu = this_cpu;
4255 active_balance = 1;
4256 }
4257 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4258 if (active_balance)
4259 wake_up_process(busiest->migration_thread);
4260
4261 /*
4262 * We've kicked active balancing, reset the failure
4263 * counter.
4264 */
4265 sd->nr_balance_failed = sd->cache_nice_tries+1;
4266 }
4267 } else
4268 sd->nr_balance_failed = 0;
4269
4270 if (likely(!active_balance)) {
4271 /* We were unbalanced, so reset the balancing interval */
4272 sd->balance_interval = sd->min_interval;
4273 } else {
4274 /*
4275 * If we've begun active balancing, start to back off. This
4276 * case may not be covered by the all_pinned logic if there
4277 * is only 1 task on the busy runqueue (because we don't call
4278 * move_tasks).
4279 */
4280 if (sd->balance_interval < sd->max_interval)
4281 sd->balance_interval *= 2;
4282 }
4283
4284 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4285 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4286 ld_moved = -1;
4287
4288 goto out;
4289
4290out_balanced:
4291 schedstat_inc(sd, lb_balanced[idle]);
4292
4293 sd->nr_balance_failed = 0;
4294
4295out_one_pinned:
4296 /* tune up the balancing interval */
4297 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4298 (sd->balance_interval < sd->max_interval))
4299 sd->balance_interval *= 2;
4300
4301 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4302 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4303 ld_moved = -1;
4304 else
4305 ld_moved = 0;
4306out:
4307 if (ld_moved)
4308 update_shares(sd);
4309 return ld_moved;
4310}
4311
4312/*
4313 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4314 * tasks if there is an imbalance.
4315 *
4316 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4317 * this_rq is locked.
4318 */
4319static int
4320load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4321{
4322 struct sched_group *group;
4323 struct rq *busiest = NULL;
4324 unsigned long imbalance;
4325 int ld_moved = 0;
4326 int sd_idle = 0;
4327 int all_pinned = 0;
4328 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4329
4330 cpumask_copy(cpus, cpu_active_mask);
4331
4332 /*
4333 * When power savings policy is enabled for the parent domain, idle
4334 * sibling can pick up load irrespective of busy siblings. In this case,
4335 * let the state of idle sibling percolate up as IDLE, instead of
4336 * portraying it as CPU_NOT_IDLE.
4337 */
4338 if (sd->flags & SD_SHARE_CPUPOWER &&
4339 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4340 sd_idle = 1;
4341
4342 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4343redo:
4344 update_shares_locked(this_rq, sd);
4345 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4346 &sd_idle, cpus, NULL);
4347 if (!group) {
4348 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4349 goto out_balanced;
4350 }
4351
4352 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4353 if (!busiest) {
4354 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4355 goto out_balanced;
4356 }
4357
4358 BUG_ON(busiest == this_rq);
4359
4360 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4361
4362 ld_moved = 0;
4363 if (busiest->nr_running > 1) {
4364 /* Attempt to move tasks */
4365 double_lock_balance(this_rq, busiest);
4366 /* this_rq->clock is already updated */
4367 update_rq_clock(busiest);
4368 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4369 imbalance, sd, CPU_NEWLY_IDLE,
4370 &all_pinned);
4371 double_unlock_balance(this_rq, busiest);
4372
4373 if (unlikely(all_pinned)) {
4374 cpumask_clear_cpu(cpu_of(busiest), cpus);
4375 if (!cpumask_empty(cpus))
4376 goto redo;
4377 }
4378 }
4379
4380 if (!ld_moved) {
4381 int active_balance = 0;
4382
4383 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4384 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4385 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4386 return -1;
4387
4388 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4389 return -1;
4390
4391 if (sd->nr_balance_failed++ < 2)
4392 return -1;
4393
4394 /*
4395 * The only task running in a non-idle cpu can be moved to this
4396 * cpu in an attempt to completely freeup the other CPU
4397 * package. The same method used to move task in load_balance()
4398 * have been extended for load_balance_newidle() to speedup
4399 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
4400 *
4401 * The package power saving logic comes from
4402 * find_busiest_group(). If there are no imbalance, then
4403 * f_b_g() will return NULL. However when sched_mc={1,2} then
4404 * f_b_g() will select a group from which a running task may be
4405 * pulled to this cpu in order to make the other package idle.
4406 * If there is no opportunity to make a package idle and if
4407 * there are no imbalance, then f_b_g() will return NULL and no
4408 * action will be taken in load_balance_newidle().
4409 *
4410 * Under normal task pull operation due to imbalance, there
4411 * will be more than one task in the source run queue and
4412 * move_tasks() will succeed. ld_moved will be true and this
4413 * active balance code will not be triggered.
4414 */
4415
4416 /* Lock busiest in correct order while this_rq is held */
4417 double_lock_balance(this_rq, busiest);
4418
4419 /*
4420 * don't kick the migration_thread, if the curr
4421 * task on busiest cpu can't be moved to this_cpu
4422 */
4423 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4424 double_unlock_balance(this_rq, busiest);
4425 all_pinned = 1;
4426 return ld_moved;
4427 }
4428
4429 if (!busiest->active_balance) {
4430 busiest->active_balance = 1;
4431 busiest->push_cpu = this_cpu;
4432 active_balance = 1;
4433 }
4434
4435 double_unlock_balance(this_rq, busiest);
4436 /*
4437 * Should not call ttwu while holding a rq->lock
4438 */
4439 raw_spin_unlock(&this_rq->lock);
4440 if (active_balance)
4441 wake_up_process(busiest->migration_thread);
4442 raw_spin_lock(&this_rq->lock);
4443
4444 } else
4445 sd->nr_balance_failed = 0;
4446
4447 update_shares_locked(this_rq, sd);
4448 return ld_moved;
4449
4450out_balanced:
4451 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4452 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4453 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4454 return -1;
4455 sd->nr_balance_failed = 0;
4456
4457 return 0;
4458}
4459
4460/*
4461 * idle_balance is called by schedule() if this_cpu is about to become
4462 * idle. Attempts to pull tasks from other CPUs.
4463 */
4464static void idle_balance(int this_cpu, struct rq *this_rq)
4465{
4466 struct sched_domain *sd;
4467 int pulled_task = 0;
4468 unsigned long next_balance = jiffies + HZ;
4469
4470 this_rq->idle_stamp = this_rq->clock;
4471
4472 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4473 return;
4474
4475 for_each_domain(this_cpu, sd) {
4476 unsigned long interval;
4477
4478 if (!(sd->flags & SD_LOAD_BALANCE))
4479 continue;
4480
4481 if (sd->flags & SD_BALANCE_NEWIDLE)
4482 /* If we've pulled tasks over stop searching: */
4483 pulled_task = load_balance_newidle(this_cpu, this_rq,
4484 sd);
4485
4486 interval = msecs_to_jiffies(sd->balance_interval);
4487 if (time_after(next_balance, sd->last_balance + interval))
4488 next_balance = sd->last_balance + interval;
4489 if (pulled_task) {
4490 this_rq->idle_stamp = 0;
4491 break;
4492 }
4493 }
4494 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4495 /*
4496 * We are going idle. next_balance may be set based on
4497 * a busy processor. So reset next_balance.
4498 */
4499 this_rq->next_balance = next_balance;
4500 }
4501}
4502
4503/*
4504 * active_load_balance is run by migration threads. It pushes running tasks
4505 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4506 * running on each physical CPU where possible, and avoids physical /
4507 * logical imbalances.
4508 *
4509 * Called with busiest_rq locked.
4510 */
4511static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4512{
4513 int target_cpu = busiest_rq->push_cpu;
4514 struct sched_domain *sd;
4515 struct rq *target_rq;
4516
4517 /* Is there any task to move? */
4518 if (busiest_rq->nr_running <= 1)
4519 return;
4520
4521 target_rq = cpu_rq(target_cpu);
4522
4523 /*
4524 * This condition is "impossible", if it occurs
4525 * we need to fix it. Originally reported by
4526 * Bjorn Helgaas on a 128-cpu setup.
4527 */
4528 BUG_ON(busiest_rq == target_rq);
4529
4530 /* move a task from busiest_rq to target_rq */
4531 double_lock_balance(busiest_rq, target_rq);
4532 update_rq_clock(busiest_rq);
4533 update_rq_clock(target_rq);
4534
4535 /* Search for an sd spanning us and the target CPU. */
4536 for_each_domain(target_cpu, sd) {
4537 if ((sd->flags & SD_LOAD_BALANCE) &&
4538 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4539 break;
4540 }
4541
4542 if (likely(sd)) {
4543 schedstat_inc(sd, alb_count);
4544
4545 if (move_one_task(target_rq, target_cpu, busiest_rq,
4546 sd, CPU_IDLE))
4547 schedstat_inc(sd, alb_pushed);
4548 else
4549 schedstat_inc(sd, alb_failed);
4550 }
4551 double_unlock_balance(busiest_rq, target_rq);
4552}
4553
4554#ifdef CONFIG_NO_HZ
4555static struct {
4556 atomic_t load_balancer;
4557 cpumask_var_t cpu_mask;
4558 cpumask_var_t ilb_grp_nohz_mask;
4559} nohz ____cacheline_aligned = {
4560 .load_balancer = ATOMIC_INIT(-1),
4561};
4562
4563int get_nohz_load_balancer(void)
4564{
4565 return atomic_read(&nohz.load_balancer);
4566}
4567
4568#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4569/**
4570 * lowest_flag_domain - Return lowest sched_domain containing flag.
4571 * @cpu: The cpu whose lowest level of sched domain is to
4572 * be returned.
4573 * @flag: The flag to check for the lowest sched_domain
4574 * for the given cpu.
4575 *
4576 * Returns the lowest sched_domain of a cpu which contains the given flag.
4577 */
4578static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4579{
4580 struct sched_domain *sd;
4581
4582 for_each_domain(cpu, sd)
4583 if (sd && (sd->flags & flag))
4584 break;
4585
4586 return sd;
4587}
4588
/**
 * for_each_flag_domain - Iterates over sched_domains containing the flag.
 * @cpu:	The cpu whose domains we're iterating over.
 * @sd:		Iteration cursor; holds the current sched_domain of @cpu.
 * @flag:	The flag used to filter the sched_domains to be iterated.
 *
 * Starts at lowest_flag_domain(@cpu, @flag) and follows ->parent links for
 * as long as the current domain has @flag set, i.e. from the lowest matching
 * sched_domain upwards.
 *
 * Every macro argument is parenthesized so that a compound expression such
 * as "A | B" passed as @flag is masked as a whole instead of being split up
 * by operator precedence.
 */
#define for_each_flag_domain(cpu, sd, flag)				\
	for ((sd) = lowest_flag_domain((cpu), (flag));			\
	     ((sd) && ((sd)->flags & (flag)));				\
	     (sd) = (sd)->parent)
4602
4603/**
4604 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4605 * @ilb_group: group to be checked for semi-idleness
4606 *
4607 * Returns: 1 if the group is semi-idle. 0 otherwise.
4608 *
4609 * We define a sched_group to be semi idle if it has atleast one idle-CPU
4610 * and atleast one non-idle CPU. This helper function checks if the given
4611 * sched_group is semi-idle or not.
4612 */
4613static inline int is_semi_idle_group(struct sched_group *ilb_group)
4614{
4615 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4616 sched_group_cpus(ilb_group));
4617
4618 /*
4619 * A sched_group is semi-idle when it has atleast one busy cpu
4620 * and atleast one idle cpu.
4621 */
4622 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4623 return 0;
4624
4625 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4626 return 0;
4627
4628 return 1;
4629}
4630/**
4631 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4632 * @cpu: The cpu which is nominating a new idle_load_balancer.
4633 *
4634 * Returns: Returns the id of the idle load balancer if it exists,
4635 * Else, returns >= nr_cpu_ids.
4636 *
4637 * This algorithm picks the idle load balancer such that it belongs to a
4638 * semi-idle powersavings sched_domain. The idea is to try and avoid
4639 * completely idle packages/cores just for the purpose of idle load balancing
4640 * when there are other idle cpu's which are better suited for that job.
4641 */
4642static int find_new_ilb(int cpu)
4643{
4644 struct sched_domain *sd;
4645 struct sched_group *ilb_group;
4646
4647 /*
4648 * Have idle load balancer selection from semi-idle packages only
4649 * when power-aware load balancing is enabled
4650 */
4651 if (!(sched_smt_power_savings || sched_mc_power_savings))
4652 goto out_done;
4653
4654 /*
4655 * Optimize for the case when we have no idle CPUs or only one
4656 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4657 */
4658 if (cpumask_weight(nohz.cpu_mask) < 2)
4659 goto out_done;
4660
4661 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4662 ilb_group = sd->groups;
4663
4664 do {
4665 if (is_semi_idle_group(ilb_group))
4666 return cpumask_first(nohz.ilb_grp_nohz_mask);
4667
4668 ilb_group = ilb_group->next;
4669
4670 } while (ilb_group != sd->groups);
4671 }
4672
4673out_done:
4674 return cpumask_first(nohz.cpu_mask);
4675}
4676#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4677static inline int find_new_ilb(int call_cpu)
4678{
4679 return cpumask_first(nohz.cpu_mask);
4680}
4681#endif
4682
4683/*
4684 * This routine will try to nominate the ilb (idle load balancing)
4685 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
4686 * load balancing on behalf of all those cpus. If all the cpus in the system
4687 * go into this tickless mode, then there will be no ilb owner (as there is
4688 * no need for one) and all the cpus will sleep till the next wakeup event
4689 * arrives...
4690 *
4691 * For the ilb owner, tick is not stopped. And this tick will be used
4692 * for idle load balancing. ilb owner will still be part of
4693 * nohz.cpu_mask..
4694 *
4695 * While stopping the tick, this cpu will become the ilb owner if there
4696 * is no other owner. And will be the owner till that cpu becomes busy
4697 * or if all cpus in the system stop their ticks at which point
4698 * there is no need for ilb owner.
4699 *
4700 * When the ilb owner becomes busy, it nominates another owner, during the
4701 * next busy scheduler_tick()
4702 */
4703int select_nohz_load_balancer(int stop_tick)
4704{
4705 int cpu = smp_processor_id();
4706
4707 if (stop_tick) {
4708 cpu_rq(cpu)->in_nohz_recently = 1;
4709
4710 if (!cpu_active(cpu)) {
4711 if (atomic_read(&nohz.load_balancer) != cpu)
4712 return 0;
4713
4714 /*
4715 * If we are going offline and still the leader,
4716 * give up!
4717 */
4718 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4719 BUG();
4720
4721 return 0;
4722 }
4723
4724 cpumask_set_cpu(cpu, nohz.cpu_mask);
4725
4726 /* time for ilb owner also to sleep */
4727 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4728 if (atomic_read(&nohz.load_balancer) == cpu)
4729 atomic_set(&nohz.load_balancer, -1);
4730 return 0;
4731 }
4732
4733 if (atomic_read(&nohz.load_balancer) == -1) {
4734 /* make me the ilb owner */
4735 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4736 return 1;
4737 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4738 int new_ilb;
4739
4740 if (!(sched_smt_power_savings ||
4741 sched_mc_power_savings))
4742 return 1;
4743 /*
4744 * Check to see if there is a more power-efficient
4745 * ilb.
4746 */
4747 new_ilb = find_new_ilb(cpu);
4748 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4749 atomic_set(&nohz.load_balancer, -1);
4750 resched_cpu(new_ilb);
4751 return 0;
4752 }
4753 return 1;
4754 }
4755 } else {
4756 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4757 return 0;
4758
4759 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4760
4761 if (atomic_read(&nohz.load_balancer) == cpu)
4762 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4763 BUG();
4764 }
4765 return 0;
4766}
4767#endif
4768
4769static DEFINE_SPINLOCK(balancing);
4770
4771/*
4772 * It checks each scheduling domain to see if it is due to be balanced,
4773 * and initiates a balancing operation if so.
4774 *
4775 * Balancing parameters are set up in arch_init_sched_domains.
4776 */
4777static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4778{
4779 int balance = 1;
4780 struct rq *rq = cpu_rq(cpu);
4781 unsigned long interval;
4782 struct sched_domain *sd;
4783 /* Earliest time when we have to do rebalance again */
4784 unsigned long next_balance = jiffies + 60*HZ;
4785 int update_next_balance = 0;
4786 int need_serialize;
4787
4788 for_each_domain(cpu, sd) {
4789 if (!(sd->flags & SD_LOAD_BALANCE))
4790 continue;
4791
4792 interval = sd->balance_interval;
4793 if (idle != CPU_IDLE)
4794 interval *= sd->busy_factor;
4795
4796 /* scale ms to jiffies */
4797 interval = msecs_to_jiffies(interval);
4798 if (unlikely(!interval))
4799 interval = 1;
4800 if (interval > HZ*NR_CPUS/10)
4801 interval = HZ*NR_CPUS/10;
4802
4803 need_serialize = sd->flags & SD_SERIALIZE;
4804
4805 if (need_serialize) {
4806 if (!spin_trylock(&balancing))
4807 goto out;
4808 }
4809
4810 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4811 if (load_balance(cpu, rq, sd, idle, &balance)) {
4812 /*
4813 * We've pulled tasks over so either we're no
4814 * longer idle, or one of our SMT siblings is
4815 * not idle.
4816 */
4817 idle = CPU_NOT_IDLE;
4818 }
4819 sd->last_balance = jiffies;
4820 }
4821 if (need_serialize)
4822 spin_unlock(&balancing);
4823out:
4824 if (time_after(next_balance, sd->last_balance + interval)) {
4825 next_balance = sd->last_balance + interval;
4826 update_next_balance = 1;
4827 }
4828
4829 /*
4830 * Stop the load balance at this level. There is another
4831 * CPU in our sched group which is doing load balancing more
4832 * actively.
4833 */
4834 if (!balance)
4835 break;
4836 }
4837
4838 /*
4839 * next_balance will be updated only when there is a need.
4840 * When the cpu is attached to null domain for ex, it will not be
4841 * updated.
4842 */
4843 if (likely(update_next_balance))
4844 rq->next_balance = next_balance;
4845}
4846
4847/*
4848 * run_rebalance_domains is triggered when needed from the scheduler tick.
4849 * In CONFIG_NO_HZ case, the idle load balance owner will do the
4850 * rebalancing for all the cpus for whom scheduler ticks are stopped.
4851 */
4852static void run_rebalance_domains(struct softirq_action *h)
4853{
4854 int this_cpu = smp_processor_id();
4855 struct rq *this_rq = cpu_rq(this_cpu);
4856 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4857 CPU_IDLE : CPU_NOT_IDLE;
4858
4859 rebalance_domains(this_cpu, idle);
4860
4861#ifdef CONFIG_NO_HZ
4862 /*
4863 * If this cpu is the owner for idle load balancing, then do the
4864 * balancing on behalf of the other idle cpus whose ticks are
4865 * stopped.
4866 */
4867 if (this_rq->idle_at_tick &&
4868 atomic_read(&nohz.load_balancer) == this_cpu) {
4869 struct rq *rq;
4870 int balance_cpu;
4871
4872 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4873 if (balance_cpu == this_cpu)
4874 continue;
4875
4876 /*
4877 * If this cpu gets work to do, stop the load balancing
4878 * work being done for other cpus. Next load
4879 * balancing owner will pick it up.
4880 */
4881 if (need_resched())
4882 break;
4883
4884 rebalance_domains(balance_cpu, CPU_IDLE);
4885
4886 rq = cpu_rq(balance_cpu);
4887 if (time_after(this_rq->next_balance, rq->next_balance))
4888 this_rq->next_balance = rq->next_balance;
4889 }
4890 }
4891#endif
4892}
4893
4894static inline int on_null_domain(int cpu)
4895{
4896 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
4897}
4898
4899/*
4900 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4901 *
4902 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4903 * idle load balancing owner or decide to stop the periodic load balancing,
4904 * if the whole system is idle.
4905 */
4906static inline void trigger_load_balance(struct rq *rq, int cpu)
4907{
4908#ifdef CONFIG_NO_HZ
4909 /*
4910 * If we were in the nohz mode recently and busy at the current
4911 * scheduler tick, then check if we need to nominate new idle
4912 * load balancer.
4913 */
4914 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4915 rq->in_nohz_recently = 0;
4916
4917 if (atomic_read(&nohz.load_balancer) == cpu) {
4918 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4919 atomic_set(&nohz.load_balancer, -1);
4920 }
4921
4922 if (atomic_read(&nohz.load_balancer) == -1) {
4923 int ilb = find_new_ilb(cpu);
4924
4925 if (ilb < nr_cpu_ids)
4926 resched_cpu(ilb);
4927 }
4928 }
4929
4930 /*
4931 * If this cpu is idle and doing idle load balancing for all the
4932 * cpus with ticks stopped, is it time for that to stop?
4933 */
4934 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4935 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4936 resched_cpu(cpu);
4937 return;
4938 }
4939
4940 /*
4941 * If this cpu is idle and the idle load balancing is done by
4942 * someone else, then no need raise the SCHED_SOFTIRQ
4943 */
4944 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4945 cpumask_test_cpu(cpu, nohz.cpu_mask))
4946 return;
4947#endif
4948 /* Don't need to rebalance while attached to NULL domain */
4949 if (time_after_eq(jiffies, rq->next_balance) &&
4950 likely(!on_null_domain(cpu)))
4951 raise_softirq(SCHED_SOFTIRQ);
4952}
4953
4954#else /* CONFIG_SMP */
4955
/*
 * On UP there are no other cpus to balance against, so idle balancing is a
 * no-op.
 */
static inline void idle_balance(int cpu, struct rq *rq)
{
	/* Intentionally empty. */
}
4962
4963#endif 3164#endif
4964 3165
4965DEFINE_PER_CPU(struct kernel_stat, kstat); 3166DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -5314,7 +3515,7 @@ void scheduler_tick(void)
5314 curr->sched_class->task_tick(rq, curr, 0); 3515 curr->sched_class->task_tick(rq, curr, 0);
5315 raw_spin_unlock(&rq->lock); 3516 raw_spin_unlock(&rq->lock);
5316 3517
5317 perf_event_task_tick(curr, cpu); 3518 perf_event_task_tick(curr);
5318 3519
5319#ifdef CONFIG_SMP 3520#ifdef CONFIG_SMP
5320 rq->idle_at_tick = idle_cpu(cpu); 3521 rq->idle_at_tick = idle_cpu(cpu);
@@ -5528,7 +3729,7 @@ need_resched_nonpreemptible:
5528 3729
5529 if (likely(prev != next)) { 3730 if (likely(prev != next)) {
5530 sched_info_switch(prev, next); 3731 sched_info_switch(prev, next);
5531 perf_event_task_sched_out(prev, next, cpu); 3732 perf_event_task_sched_out(prev, next);
5532 3733
5533 rq->nr_switches++; 3734 rq->nr_switches++;
5534 rq->curr = next; 3735 rq->curr = next;
@@ -6059,7 +4260,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6059 unsigned long flags; 4260 unsigned long flags;
6060 int oldprio, on_rq, running; 4261 int oldprio, on_rq, running;
6061 struct rq *rq; 4262 struct rq *rq;
6062 const struct sched_class *prev_class = p->sched_class; 4263 const struct sched_class *prev_class;
6063 4264
6064 BUG_ON(prio < 0 || prio > MAX_PRIO); 4265 BUG_ON(prio < 0 || prio > MAX_PRIO);
6065 4266
@@ -6067,6 +4268,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6067 update_rq_clock(rq); 4268 update_rq_clock(rq);
6068 4269
6069 oldprio = p->prio; 4270 oldprio = p->prio;
4271 prev_class = p->sched_class;
6070 on_rq = p->se.on_rq; 4272 on_rq = p->se.on_rq;
6071 running = task_current(rq, p); 4273 running = task_current(rq, p);
6072 if (on_rq) 4274 if (on_rq)
@@ -6084,7 +4286,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6084 if (running) 4286 if (running)
6085 p->sched_class->set_curr_task(rq); 4287 p->sched_class->set_curr_task(rq);
6086 if (on_rq) { 4288 if (on_rq) {
6087 enqueue_task(rq, p, 0); 4289 enqueue_task(rq, p, 0, oldprio < prio);
6088 4290
6089 check_class_changed(rq, p, prev_class, oldprio, running); 4291 check_class_changed(rq, p, prev_class, oldprio, running);
6090 } 4292 }
@@ -6128,7 +4330,7 @@ void set_user_nice(struct task_struct *p, long nice)
6128 delta = p->prio - old_prio; 4330 delta = p->prio - old_prio;
6129 4331
6130 if (on_rq) { 4332 if (on_rq) {
6131 enqueue_task(rq, p, 0); 4333 enqueue_task(rq, p, 0, false);
6132 /* 4334 /*
6133 * If the task increased its priority or is running and 4335 * If the task increased its priority or is running and
6134 * lowered its priority, then reschedule its CPU: 4336 * lowered its priority, then reschedule its CPU:
@@ -6286,7 +4488,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6286{ 4488{
6287 int retval, oldprio, oldpolicy = -1, on_rq, running; 4489 int retval, oldprio, oldpolicy = -1, on_rq, running;
6288 unsigned long flags; 4490 unsigned long flags;
6289 const struct sched_class *prev_class = p->sched_class; 4491 const struct sched_class *prev_class;
6290 struct rq *rq; 4492 struct rq *rq;
6291 int reset_on_fork; 4493 int reset_on_fork;
6292 4494
@@ -6400,6 +4602,7 @@ recheck:
6400 p->sched_reset_on_fork = reset_on_fork; 4602 p->sched_reset_on_fork = reset_on_fork;
6401 4603
6402 oldprio = p->prio; 4604 oldprio = p->prio;
4605 prev_class = p->sched_class;
6403 __setscheduler(rq, p, policy, param->sched_priority); 4606 __setscheduler(rq, p, policy, param->sched_priority);
6404 4607
6405 if (running) 4608 if (running)
@@ -7150,27 +5353,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7150 struct rq *rq; 5353 struct rq *rq;
7151 int ret = 0; 5354 int ret = 0;
7152 5355
7153 /*
7154 * Since we rely on wake-ups to migrate sleeping tasks, don't change
7155 * the ->cpus_allowed mask from under waking tasks, which would be
7156 * possible when we change rq->lock in ttwu(), so synchronize against
7157 * TASK_WAKING to avoid that.
7158 *
7159 * Make an exception for freshly cloned tasks, since cpuset namespaces
7160 * might move the task about, we have to validate the target in
7161 * wake_up_new_task() anyway since the cpu might have gone away.
7162 */
7163again:
7164 while (p->state == TASK_WAKING && !(p->flags & PF_STARTING))
7165 cpu_relax();
7166
7167 rq = task_rq_lock(p, &flags); 5356 rq = task_rq_lock(p, &flags);
7168 5357
7169 if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) {
7170 task_rq_unlock(rq, &flags);
7171 goto again;
7172 }
7173
7174 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5358 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7175 ret = -EINVAL; 5359 ret = -EINVAL;
7176 goto out; 5360 goto out;
@@ -9457,7 +7641,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
9457 tg->rt_rq[cpu] = rt_rq; 7641 tg->rt_rq[cpu] = rt_rq;
9458 init_rt_rq(rt_rq, rq); 7642 init_rt_rq(rt_rq, rq);
9459 rt_rq->tg = tg; 7643 rt_rq->tg = tg;
9460 rt_rq->rt_se = rt_se;
9461 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7644 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
9462 if (add) 7645 if (add)
9463 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7646 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@ -9488,9 +7671,6 @@ void __init sched_init(void)
9488#ifdef CONFIG_RT_GROUP_SCHED 7671#ifdef CONFIG_RT_GROUP_SCHED
9489 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7672 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
9490#endif 7673#endif
9491#ifdef CONFIG_USER_SCHED
9492 alloc_size *= 2;
9493#endif
9494#ifdef CONFIG_CPUMASK_OFFSTACK 7674#ifdef CONFIG_CPUMASK_OFFSTACK
9495 alloc_size += num_possible_cpus() * cpumask_size(); 7675 alloc_size += num_possible_cpus() * cpumask_size();
9496#endif 7676#endif
@@ -9504,13 +7684,6 @@ void __init sched_init(void)
9504 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7684 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
9505 ptr += nr_cpu_ids * sizeof(void **); 7685 ptr += nr_cpu_ids * sizeof(void **);
9506 7686
9507#ifdef CONFIG_USER_SCHED
9508 root_task_group.se = (struct sched_entity **)ptr;
9509 ptr += nr_cpu_ids * sizeof(void **);
9510
9511 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9512 ptr += nr_cpu_ids * sizeof(void **);
9513#endif /* CONFIG_USER_SCHED */
9514#endif /* CONFIG_FAIR_GROUP_SCHED */ 7687#endif /* CONFIG_FAIR_GROUP_SCHED */
9515#ifdef CONFIG_RT_GROUP_SCHED 7688#ifdef CONFIG_RT_GROUP_SCHED
9516 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7689 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -9519,13 +7692,6 @@ void __init sched_init(void)
9519 init_task_group.rt_rq = (struct rt_rq **)ptr; 7692 init_task_group.rt_rq = (struct rt_rq **)ptr;
9520 ptr += nr_cpu_ids * sizeof(void **); 7693 ptr += nr_cpu_ids * sizeof(void **);
9521 7694
9522#ifdef CONFIG_USER_SCHED
9523 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9524 ptr += nr_cpu_ids * sizeof(void **);
9525
9526 root_task_group.rt_rq = (struct rt_rq **)ptr;
9527 ptr += nr_cpu_ids * sizeof(void **);
9528#endif /* CONFIG_USER_SCHED */
9529#endif /* CONFIG_RT_GROUP_SCHED */ 7695#endif /* CONFIG_RT_GROUP_SCHED */
9530#ifdef CONFIG_CPUMASK_OFFSTACK 7696#ifdef CONFIG_CPUMASK_OFFSTACK
9531 for_each_possible_cpu(i) { 7697 for_each_possible_cpu(i) {
@@ -9545,22 +7711,13 @@ void __init sched_init(void)
9545#ifdef CONFIG_RT_GROUP_SCHED 7711#ifdef CONFIG_RT_GROUP_SCHED
9546 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7712 init_rt_bandwidth(&init_task_group.rt_bandwidth,
9547 global_rt_period(), global_rt_runtime()); 7713 global_rt_period(), global_rt_runtime());
9548#ifdef CONFIG_USER_SCHED
9549 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9550 global_rt_period(), RUNTIME_INF);
9551#endif /* CONFIG_USER_SCHED */
9552#endif /* CONFIG_RT_GROUP_SCHED */ 7714#endif /* CONFIG_RT_GROUP_SCHED */
9553 7715
9554#ifdef CONFIG_GROUP_SCHED 7716#ifdef CONFIG_CGROUP_SCHED
9555 list_add(&init_task_group.list, &task_groups); 7717 list_add(&init_task_group.list, &task_groups);
9556 INIT_LIST_HEAD(&init_task_group.children); 7718 INIT_LIST_HEAD(&init_task_group.children);
9557 7719
9558#ifdef CONFIG_USER_SCHED 7720#endif /* CONFIG_CGROUP_SCHED */
9559 INIT_LIST_HEAD(&root_task_group.children);
9560 init_task_group.parent = &root_task_group;
9561 list_add(&init_task_group.siblings, &root_task_group.children);
9562#endif /* CONFIG_USER_SCHED */
9563#endif /* CONFIG_GROUP_SCHED */
9564 7721
9565#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 7722#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9566 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 7723 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
@@ -9600,25 +7757,6 @@ void __init sched_init(void)
9600 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7757 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
9601 */ 7758 */
9602 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7759 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
9603#elif defined CONFIG_USER_SCHED
9604 root_task_group.shares = NICE_0_LOAD;
9605 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
9606 /*
9607 * In case of task-groups formed thr' the user id of tasks,
9608 * init_task_group represents tasks belonging to root user.
9609 * Hence it forms a sibling of all subsequent groups formed.
9610 * In this case, init_task_group gets only a fraction of overall
9611 * system cpu resource, based on the weight assigned to root
9612 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9613 * by letting tasks of init_task_group sit in a separate cfs_rq
9614 * (init_tg_cfs_rq) and having one entity represent this group of
9615 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9616 */
9617 init_tg_cfs_entry(&init_task_group,
9618 &per_cpu(init_tg_cfs_rq, i),
9619 &per_cpu(init_sched_entity, i), i, 1,
9620 root_task_group.se[i]);
9621
9622#endif 7760#endif
9623#endif /* CONFIG_FAIR_GROUP_SCHED */ 7761#endif /* CONFIG_FAIR_GROUP_SCHED */
9624 7762
@@ -9627,12 +7765,6 @@ void __init sched_init(void)
9627 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7765 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
9628#ifdef CONFIG_CGROUP_SCHED 7766#ifdef CONFIG_CGROUP_SCHED
9629 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7767 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
9630#elif defined CONFIG_USER_SCHED
9631 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9632 init_tg_rt_entry(&init_task_group,
9633 &per_cpu(init_rt_rq_var, i),
9634 &per_cpu(init_sched_rt_entity, i), i, 1,
9635 root_task_group.rt_se[i]);
9636#endif 7768#endif
9637#endif 7769#endif
9638 7770
@@ -9717,7 +7849,7 @@ static inline int preempt_count_equals(int preempt_offset)
9717 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7849 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9718} 7850}
9719 7851
9720void __might_sleep(char *file, int line, int preempt_offset) 7852void __might_sleep(const char *file, int line, int preempt_offset)
9721{ 7853{
9722#ifdef in_atomic 7854#ifdef in_atomic
9723 static unsigned long prev_jiffy; /* ratelimiting */ 7855 static unsigned long prev_jiffy; /* ratelimiting */
@@ -10028,7 +8160,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
10028} 8160}
10029#endif /* CONFIG_RT_GROUP_SCHED */ 8161#endif /* CONFIG_RT_GROUP_SCHED */
10030 8162
10031#ifdef CONFIG_GROUP_SCHED 8163#ifdef CONFIG_CGROUP_SCHED
10032static void free_sched_group(struct task_group *tg) 8164static void free_sched_group(struct task_group *tg)
10033{ 8165{
10034 free_fair_sched_group(tg); 8166 free_fair_sched_group(tg);
@@ -10133,11 +8265,11 @@ void sched_move_task(struct task_struct *tsk)
10133 if (unlikely(running)) 8265 if (unlikely(running))
10134 tsk->sched_class->set_curr_task(rq); 8266 tsk->sched_class->set_curr_task(rq);
10135 if (on_rq) 8267 if (on_rq)
10136 enqueue_task(rq, tsk, 0); 8268 enqueue_task(rq, tsk, 0, false);
10137 8269
10138 task_rq_unlock(rq, &flags); 8270 task_rq_unlock(rq, &flags);
10139} 8271}
10140#endif /* CONFIG_GROUP_SCHED */ 8272#endif /* CONFIG_CGROUP_SCHED */
10141 8273
10142#ifdef CONFIG_FAIR_GROUP_SCHED 8274#ifdef CONFIG_FAIR_GROUP_SCHED
10143static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8275static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -10279,13 +8411,6 @@ static int tg_schedulable(struct task_group *tg, void *data)
10279 runtime = d->rt_runtime; 8411 runtime = d->rt_runtime;
10280 } 8412 }
10281 8413
10282#ifdef CONFIG_USER_SCHED
10283 if (tg == &root_task_group) {
10284 period = global_rt_period();
10285 runtime = global_rt_runtime();
10286 }
10287#endif
10288
10289 /* 8414 /*
10290 * Cannot have more runtime than the period. 8415 * Cannot have more runtime than the period.
10291 */ 8416 */
@@ -10905,12 +9030,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10905} 9030}
10906 9031
10907/* 9032/*
9033 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9034 * in cputime_t units. As a result, cpuacct_update_stats calls
9035 * percpu_counter_add with values large enough to always overflow the
9036 * per cpu batch limit causing bad SMP scalability.
9037 *
9038 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9039 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9040 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9041 */
9042#ifdef CONFIG_SMP
9043#define CPUACCT_BATCH \
9044 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9045#else
9046#define CPUACCT_BATCH 0
9047#endif
9048
9049/*
10908 * Charge the system/user time to the task's accounting group. 9050 * Charge the system/user time to the task's accounting group.
10909 */ 9051 */
10910static void cpuacct_update_stats(struct task_struct *tsk, 9052static void cpuacct_update_stats(struct task_struct *tsk,
10911 enum cpuacct_stat_index idx, cputime_t val) 9053 enum cpuacct_stat_index idx, cputime_t val)
10912{ 9054{
10913 struct cpuacct *ca; 9055 struct cpuacct *ca;
9056 int batch = CPUACCT_BATCH;
10914 9057
10915 if (unlikely(!cpuacct_subsys.active)) 9058 if (unlikely(!cpuacct_subsys.active))
10916 return; 9059 return;
@@ -10919,7 +9062,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
10919 ca = task_ca(tsk); 9062 ca = task_ca(tsk);
10920 9063
10921 do { 9064 do {
10922 percpu_counter_add(&ca->cpustat[idx], val); 9065 __percpu_counter_add(&ca->cpustat[idx], val, batch);
10923 ca = ca->parent; 9066 ca = ca->parent;
10924 } while (ca); 9067 } while (ca);
10925 rcu_read_unlock(); 9068 rcu_read_unlock();
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 597b33099dfa..eeb3506c4834 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -47,9 +47,7 @@ static int convert_prio(int prio)
47} 47}
48 48
49#define for_each_cpupri_active(array, idx) \ 49#define for_each_cpupri_active(array, idx) \
50 for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ 50 for_each_bit(idx, array, CPUPRI_NR_PRIORITIES)
51 idx < CPUPRI_NR_PRIORITIES; \
52 idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
53 51
54/** 52/**
55 * cpupri_find - find the best (lowest-pri) CPU in the system 53 * cpupri_find - find the best (lowest-pri) CPU in the system
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 8fe7ee81c552..3e1fd96c6cf9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1053,7 +1053,8 @@ static inline void hrtick_update(struct rq *rq)
1053 * increased. Here we update the fair scheduling stats and 1053 * increased. Here we update the fair scheduling stats and
1054 * then put the task into the rbtree: 1054 * then put the task into the rbtree:
1055 */ 1055 */
1056static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) 1056static void
1057enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1057{ 1058{
1058 struct cfs_rq *cfs_rq; 1059 struct cfs_rq *cfs_rq;
1059 struct sched_entity *se = &p->se; 1060 struct sched_entity *se = &p->se;
@@ -1815,57 +1816,164 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1815 */ 1816 */
1816 1817
1817/* 1818/*
1818 * Load-balancing iterator. Note: while the runqueue stays locked 1819 * pull_task - move a task from a remote runqueue to the local runqueue.
1819 * during the whole iteration, the current task might be 1820 * Both runqueues must be locked.
1820 * dequeued so the iterator has to be dequeue-safe. Here we
1821 * achieve that by always pre-iterating before returning
1822 * the current task:
1823 */ 1821 */
1824static struct task_struct * 1822static void pull_task(struct rq *src_rq, struct task_struct *p,
1825__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) 1823 struct rq *this_rq, int this_cpu)
1826{ 1824{
1827 struct task_struct *p = NULL; 1825 deactivate_task(src_rq, p, 0);
1828 struct sched_entity *se; 1826 set_task_cpu(p, this_cpu);
1827 activate_task(this_rq, p, 0);
1828 check_preempt_curr(this_rq, p, 0);
1829}
1829 1830
1830 if (next == &cfs_rq->tasks) 1831/*
1831 return NULL; 1832 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1833 */
1834static
1835int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1836 struct sched_domain *sd, enum cpu_idle_type idle,
1837 int *all_pinned)
1838{
1839 int tsk_cache_hot = 0;
1840 /*
1841 * We do not migrate tasks that are:
1842 * 1) running (obviously), or
1843 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1844 * 3) are cache-hot on their current CPU.
1845 */
1846 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
1847 schedstat_inc(p, se.nr_failed_migrations_affine);
1848 return 0;
1849 }
1850 *all_pinned = 0;
1832 1851
1833 se = list_entry(next, struct sched_entity, group_node); 1852 if (task_running(rq, p)) {
1834 p = task_of(se); 1853 schedstat_inc(p, se.nr_failed_migrations_running);
1835 cfs_rq->balance_iterator = next->next; 1854 return 0;
1855 }
1836 1856
1837 return p; 1857 /*
1838} 1858 * Aggressive migration if:
1859 * 1) task is cache cold, or
1860 * 2) too many balance attempts have failed.
1861 */
1839 1862
1840static struct task_struct *load_balance_start_fair(void *arg) 1863 tsk_cache_hot = task_hot(p, rq->clock, sd);
1841{ 1864 if (!tsk_cache_hot ||
1842 struct cfs_rq *cfs_rq = arg; 1865 sd->nr_balance_failed > sd->cache_nice_tries) {
1866#ifdef CONFIG_SCHEDSTATS
1867 if (tsk_cache_hot) {
1868 schedstat_inc(sd, lb_hot_gained[idle]);
1869 schedstat_inc(p, se.nr_forced_migrations);
1870 }
1871#endif
1872 return 1;
1873 }
1843 1874
1844 return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); 1875 if (tsk_cache_hot) {
1876 schedstat_inc(p, se.nr_failed_migrations_hot);
1877 return 0;
1878 }
1879 return 1;
1845} 1880}
1846 1881
1847static struct task_struct *load_balance_next_fair(void *arg) 1882/*
1883 * move_one_task tries to move exactly one task from busiest to this_rq, as
1884 * part of active balancing operations within "domain".
1885 * Returns 1 if successful and 0 otherwise.
1886 *
1887 * Called with both runqueues locked.
1888 */
1889static int
1890move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1891 struct sched_domain *sd, enum cpu_idle_type idle)
1848{ 1892{
1849 struct cfs_rq *cfs_rq = arg; 1893 struct task_struct *p, *n;
1894 struct cfs_rq *cfs_rq;
1895 int pinned = 0;
1896
1897 for_each_leaf_cfs_rq(busiest, cfs_rq) {
1898 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
1899
1900 if (!can_migrate_task(p, busiest, this_cpu,
1901 sd, idle, &pinned))
1902 continue;
1850 1903
1851 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); 1904 pull_task(busiest, p, this_rq, this_cpu);
1905 /*
1906 * Right now, this is only the second place pull_task()
1907 * is called, so we can safely collect pull_task()
1908 * stats here rather than inside pull_task().
1909 */
1910 schedstat_inc(sd, lb_gained[idle]);
1911 return 1;
1912 }
1913 }
1914
1915 return 0;
1852} 1916}
1853 1917
1854static unsigned long 1918static unsigned long
1855__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1919balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1856 unsigned long max_load_move, struct sched_domain *sd, 1920 unsigned long max_load_move, struct sched_domain *sd,
1857 enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, 1921 enum cpu_idle_type idle, int *all_pinned,
1858 struct cfs_rq *cfs_rq) 1922 int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
1859{ 1923{
1860 struct rq_iterator cfs_rq_iterator; 1924 int loops = 0, pulled = 0, pinned = 0;
1925 long rem_load_move = max_load_move;
1926 struct task_struct *p, *n;
1861 1927
1862 cfs_rq_iterator.start = load_balance_start_fair; 1928 if (max_load_move == 0)
1863 cfs_rq_iterator.next = load_balance_next_fair; 1929 goto out;
1864 cfs_rq_iterator.arg = cfs_rq;
1865 1930
1866 return balance_tasks(this_rq, this_cpu, busiest, 1931 pinned = 1;
1867 max_load_move, sd, idle, all_pinned, 1932
1868 this_best_prio, &cfs_rq_iterator); 1933 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
1934 if (loops++ > sysctl_sched_nr_migrate)
1935 break;
1936
1937 if ((p->se.load.weight >> 1) > rem_load_move ||
1938 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
1939 continue;
1940
1941 pull_task(busiest, p, this_rq, this_cpu);
1942 pulled++;
1943 rem_load_move -= p->se.load.weight;
1944
1945#ifdef CONFIG_PREEMPT
1946 /*
1947 * NEWIDLE balancing is a source of latency, so preemptible
1948 * kernels will stop after the first task is pulled to minimize
1949 * the critical section.
1950 */
1951 if (idle == CPU_NEWLY_IDLE)
1952 break;
1953#endif
1954
1955 /*
1956 * We only want to steal up to the prescribed amount of
1957 * weighted load.
1958 */
1959 if (rem_load_move <= 0)
1960 break;
1961
1962 if (p->prio < *this_best_prio)
1963 *this_best_prio = p->prio;
1964 }
1965out:
1966 /*
1967 * Right now, this is one of only two places pull_task() is called,
1968 * so we can safely collect pull_task() stats here rather than
1969 * inside pull_task().
1970 */
1971 schedstat_add(sd, lb_gained[idle], pulled);
1972
1973 if (all_pinned)
1974 *all_pinned = pinned;
1975
1976 return max_load_move - rem_load_move;
1869} 1977}
1870 1978
1871#ifdef CONFIG_FAIR_GROUP_SCHED 1979#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1897,9 +2005,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1897 rem_load = (u64)rem_load_move * busiest_weight; 2005 rem_load = (u64)rem_load_move * busiest_weight;
1898 rem_load = div_u64(rem_load, busiest_h_load + 1); 2006 rem_load = div_u64(rem_load, busiest_h_load + 1);
1899 2007
1900 moved_load = __load_balance_fair(this_rq, this_cpu, busiest, 2008 moved_load = balance_tasks(this_rq, this_cpu, busiest,
1901 rem_load, sd, idle, all_pinned, this_best_prio, 2009 rem_load, sd, idle, all_pinned, this_best_prio,
1902 tg->cfs_rq[busiest_cpu]); 2010 busiest_cfs_rq);
1903 2011
1904 if (!moved_load) 2012 if (!moved_load)
1905 continue; 2013 continue;
@@ -1922,35 +2030,1509 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1922 struct sched_domain *sd, enum cpu_idle_type idle, 2030 struct sched_domain *sd, enum cpu_idle_type idle,
1923 int *all_pinned, int *this_best_prio) 2031 int *all_pinned, int *this_best_prio)
1924{ 2032{
1925 return __load_balance_fair(this_rq, this_cpu, busiest, 2033 return balance_tasks(this_rq, this_cpu, busiest,
1926 max_load_move, sd, idle, all_pinned, 2034 max_load_move, sd, idle, all_pinned,
1927 this_best_prio, &busiest->cfs); 2035 this_best_prio, &busiest->cfs);
1928} 2036}
1929#endif 2037#endif
1930 2038
1931static int 2039/*
1932move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2040 * move_tasks tries to move up to max_load_move weighted load from busiest to
1933 struct sched_domain *sd, enum cpu_idle_type idle) 2041 * this_rq, as part of a balancing operation within domain "sd".
2042 * Returns 1 if successful and 0 otherwise.
2043 *
2044 * Called with both runqueues locked.
2045 */
2046static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2047 unsigned long max_load_move,
2048 struct sched_domain *sd, enum cpu_idle_type idle,
2049 int *all_pinned)
1934{ 2050{
1935 struct cfs_rq *busy_cfs_rq; 2051 unsigned long total_load_moved = 0, load_moved;
1936 struct rq_iterator cfs_rq_iterator; 2052 int this_best_prio = this_rq->curr->prio;
1937 2053
1938 cfs_rq_iterator.start = load_balance_start_fair; 2054 do {
1939 cfs_rq_iterator.next = load_balance_next_fair; 2055 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2056 max_load_move - total_load_moved,
2057 sd, idle, all_pinned, &this_best_prio);
1940 2058
1941 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 2059 total_load_moved += load_moved;
2060
2061#ifdef CONFIG_PREEMPT
1942 /* 2062 /*
1943 * pass busy_cfs_rq argument into 2063 * NEWIDLE balancing is a source of latency, so preemptible
1944 * load_balance_[start|next]_fair iterators 2064 * kernels will stop after the first task is pulled to minimize
2065 * the critical section.
1945 */ 2066 */
1946 cfs_rq_iterator.arg = busy_cfs_rq; 2067 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
1947 if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, 2068 break;
1948 &cfs_rq_iterator)) 2069
1949 return 1; 2070 if (raw_spin_is_contended(&this_rq->lock) ||
2071 raw_spin_is_contended(&busiest->lock))
2072 break;
2073#endif
2074 } while (load_moved && max_load_move > total_load_moved);
2075
2076 return total_load_moved > 0;
2077}
2078
2079/********** Helpers for find_busiest_group ************************/
2080/*
2081 * sd_lb_stats - Structure to store the statistics of a sched_domain
2082 * during load balancing.
2083 */
2084struct sd_lb_stats {
2085 struct sched_group *busiest; /* Busiest group in this sd */
2086 struct sched_group *this; /* Local group in this sd */
2087 unsigned long total_load; /* Total load of all groups in sd */
2088 unsigned long total_pwr; /* Total power of all groups in sd */
2089 unsigned long avg_load; /* Average load across all groups in sd */
2090
2091 /* Statistics of this group */
2092 unsigned long this_load;
2093 unsigned long this_load_per_task;
2094 unsigned long this_nr_running;
2095
2096 /* Statistics of the busiest group */
2097 unsigned long max_load;
2098 unsigned long busiest_load_per_task;
2099 unsigned long busiest_nr_running;
2100 unsigned long busiest_group_capacity;
2101
2102 int group_imb; /* Is there imbalance in this sd */
2103#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2104 int power_savings_balance; /* Is powersave balance needed for this sd */
2105 struct sched_group *group_min; /* Least loaded group in sd */
2106 struct sched_group *group_leader; /* Group which relieves group_min */
2107 unsigned long min_load_per_task; /* load_per_task in group_min */
2108 unsigned long leader_nr_running; /* Nr running of group_leader */
2109 unsigned long min_nr_running; /* Nr running of group_min */
2110#endif
2111};
2112
2113/*
2114 * sg_lb_stats - stats of a sched_group required for load_balancing
2115 */
2116struct sg_lb_stats {
2117 unsigned long avg_load; /*Avg load across the CPUs of the group */
2118 unsigned long group_load; /* Total load over the CPUs of the group */
2119 unsigned long sum_nr_running; /* Nr tasks running in the group */
2120 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2121 unsigned long group_capacity;
2122 int group_imb; /* Is there an imbalance in the group ? */
2123};
2124
2125/**
2126 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
2127 * @group: The group whose first cpu is to be returned.
2128 */
2129static inline unsigned int group_first_cpu(struct sched_group *group)
2130{
2131 return cpumask_first(sched_group_cpus(group));
2132}
2133
2134/**
2135 * get_sd_load_idx - Obtain the load index for a given sched domain.
2136 * @sd: The sched_domain whose load_idx is to be obtained.
2137 * @idle: The Idle status of the CPU for whose sd load_idx is obtained.
2138 */
2139static inline int get_sd_load_idx(struct sched_domain *sd,
2140 enum cpu_idle_type idle)
2141{
2142 int load_idx;
2143
2144 switch (idle) {
2145 case CPU_NOT_IDLE:
2146 load_idx = sd->busy_idx;
2147 break;
2148
2149 case CPU_NEWLY_IDLE:
2150 load_idx = sd->newidle_idx;
2151 break;
2152 default:
2153 load_idx = sd->idle_idx;
2154 break;
1950 } 2155 }
1951 2156
2157 return load_idx;
2158}
2159
2160
2161#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2162/**
2163 * init_sd_power_savings_stats - Initialize power savings statistics for
2164 * the given sched_domain, during load balancing.
2165 *
2166 * @sd: Sched domain whose power-savings statistics are to be initialized.
2167 * @sds: Variable containing the statistics for sd.
2168 * @idle: Idle status of the CPU at which we're performing load-balancing.
2169 */
2170static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2171 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2172{
2173 /*
2174 * Busy processors will not participate in power savings
2175 * balance.
2176 */
2177 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2178 sds->power_savings_balance = 0;
2179 else {
2180 sds->power_savings_balance = 1;
2181 sds->min_nr_running = ULONG_MAX;
2182 sds->leader_nr_running = 0;
2183 }
2184}
2185
2186/**
2187 * update_sd_power_savings_stats - Update the power saving stats for a
2188 * sched_domain while performing load balancing.
2189 *
2190 * @group: sched_group belonging to the sched_domain under consideration.
2191 * @sds: Variable containing the statistics of the sched_domain
2192 * @local_group: Does group contain the CPU for which we're performing
2193 * load balancing ?
2194 * @sgs: Variable containing the statistics of the group.
2195 */
2196static inline void update_sd_power_savings_stats(struct sched_group *group,
2197 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2198{
2199
2200 if (!sds->power_savings_balance)
2201 return;
2202
2203 /*
2204 * If the local group is idle or completely loaded
2205 * no need to do power savings balance at this domain
2206 */
2207 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
2208 !sds->this_nr_running))
2209 sds->power_savings_balance = 0;
2210
2211 /*
2212 * If a group is already running at full capacity or idle,
2213 * don't include that group in power savings calculations
2214 */
2215 if (!sds->power_savings_balance ||
2216 sgs->sum_nr_running >= sgs->group_capacity ||
2217 !sgs->sum_nr_running)
2218 return;
2219
2220 /*
2221 * Calculate the group which has the least non-idle load.
2222 * This is the group from where we need to pick up the load
2223 * for saving power
2224 */
2225 if ((sgs->sum_nr_running < sds->min_nr_running) ||
2226 (sgs->sum_nr_running == sds->min_nr_running &&
2227 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
2228 sds->group_min = group;
2229 sds->min_nr_running = sgs->sum_nr_running;
2230 sds->min_load_per_task = sgs->sum_weighted_load /
2231 sgs->sum_nr_running;
2232 }
2233
2234 /*
2235 * Calculate the group which is almost near its
2236 * capacity but still has some space to pick up some load
2237 * from other group and save more power
2238 */
2239 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
2240 return;
2241
2242 if (sgs->sum_nr_running > sds->leader_nr_running ||
2243 (sgs->sum_nr_running == sds->leader_nr_running &&
2244 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
2245 sds->group_leader = group;
2246 sds->leader_nr_running = sgs->sum_nr_running;
2247 }
2248}
2249
2250/**
2251 * check_power_save_busiest_group - see if there is potential for some power-savings balance
2252 * @sds: Variable containing the statistics of the sched_domain
2253 * under consideration.
2254 * @this_cpu: Cpu at which we're currently performing load-balancing.
2255 * @imbalance: Variable to store the imbalance.
2256 *
2257 * Description:
2258 * Check if we have potential to perform some power-savings balance.
2259 * If yes, set the busiest group to be the least loaded group in the
2260 * sched_domain, so that its CPUs can be put to idle.
2261 *
2262 * Returns 1 if there is potential to perform power-savings balance.
2263 * Else returns 0.
2264 */
2265static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2266 int this_cpu, unsigned long *imbalance)
2267{
2268 if (!sds->power_savings_balance)
2269 return 0;
2270
2271 if (sds->this != sds->group_leader ||
2272 sds->group_leader == sds->group_min)
2273 return 0;
2274
2275 *imbalance = sds->min_load_per_task;
2276 sds->busiest = sds->group_min;
2277
2278 return 1;
2279
2280}
2281#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2282static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2283 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2284{
2285 return;
2286}
2287
2288static inline void update_sd_power_savings_stats(struct sched_group *group,
2289 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2290{
2291 return;
2292}
2293
2294static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2295 int this_cpu, unsigned long *imbalance)
2296{
1952 return 0; 2297 return 0;
1953} 2298}
2299#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2300
2301
2302unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
2303{
2304 return SCHED_LOAD_SCALE;
2305}
2306
2307unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2308{
2309 return default_scale_freq_power(sd, cpu);
2310}
2311
2312unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2313{
2314 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2315 unsigned long smt_gain = sd->smt_gain;
2316
2317 smt_gain /= weight;
2318
2319 return smt_gain;
2320}
2321
2322unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
2323{
2324 return default_scale_smt_power(sd, cpu);
2325}
2326
2327unsigned long scale_rt_power(int cpu)
2328{
2329 struct rq *rq = cpu_rq(cpu);
2330 u64 total, available;
2331
2332 sched_avg_update(rq);
2333
2334 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2335 available = total - rq->rt_avg;
2336
2337 if (unlikely((s64)total < SCHED_LOAD_SCALE))
2338 total = SCHED_LOAD_SCALE;
2339
2340 total >>= SCHED_LOAD_SHIFT;
2341
2342 return div_u64(available, total);
2343}
2344
2345static void update_cpu_power(struct sched_domain *sd, int cpu)
2346{
2347 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2348 unsigned long power = SCHED_LOAD_SCALE;
2349 struct sched_group *sdg = sd->groups;
2350
2351 if (sched_feat(ARCH_POWER))
2352 power *= arch_scale_freq_power(sd, cpu);
2353 else
2354 power *= default_scale_freq_power(sd, cpu);
2355
2356 power >>= SCHED_LOAD_SHIFT;
2357
2358 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2359 if (sched_feat(ARCH_POWER))
2360 power *= arch_scale_smt_power(sd, cpu);
2361 else
2362 power *= default_scale_smt_power(sd, cpu);
2363
2364 power >>= SCHED_LOAD_SHIFT;
2365 }
2366
2367 power *= scale_rt_power(cpu);
2368 power >>= SCHED_LOAD_SHIFT;
2369
2370 if (!power)
2371 power = 1;
2372
2373 sdg->cpu_power = power;
2374}
2375
2376static void update_group_power(struct sched_domain *sd, int cpu)
2377{
2378 struct sched_domain *child = sd->child;
2379 struct sched_group *group, *sdg = sd->groups;
2380 unsigned long power;
2381
2382 if (!child) {
2383 update_cpu_power(sd, cpu);
2384 return;
2385 }
2386
2387 power = 0;
2388
2389 group = child->groups;
2390 do {
2391 power += group->cpu_power;
2392 group = group->next;
2393 } while (group != child->groups);
2394
2395 sdg->cpu_power = power;
2396}
2397
2398/**
2399 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
2400 * @sd: The sched_domain whose statistics are to be updated.
2401 * @group: sched_group whose statistics are to be updated.
2402 * @this_cpu: Cpu for which load balance is currently performed.
2403 * @idle: Idle status of this_cpu
2404 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2405 * @sd_idle: Idle status of the sched_domain containing group.
2406 * @local_group: Does group contain this_cpu.
2407 * @cpus: Set of cpus considered for load balancing.
2408 * @balance: Should we balance.
2409 * @sgs: variable to hold the statistics for this group.
2410 */
2411static inline void update_sg_lb_stats(struct sched_domain *sd,
2412 struct sched_group *group, int this_cpu,
2413 enum cpu_idle_type idle, int load_idx, int *sd_idle,
2414 int local_group, const struct cpumask *cpus,
2415 int *balance, struct sg_lb_stats *sgs)
2416{
2417 unsigned long load, max_cpu_load, min_cpu_load;
2418 int i;
2419 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2420 unsigned long avg_load_per_task = 0;
2421
2422 if (local_group)
2423 balance_cpu = group_first_cpu(group);
2424
2425 /* Tally up the load of all CPUs in the group */
2426 max_cpu_load = 0;
2427 min_cpu_load = ~0UL;
2428
2429 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2430 struct rq *rq = cpu_rq(i);
2431
2432 if (*sd_idle && rq->nr_running)
2433 *sd_idle = 0;
2434
2435 /* Bias balancing toward cpus of our domain */
2436 if (local_group) {
2437 if (idle_cpu(i) && !first_idle_cpu) {
2438 first_idle_cpu = 1;
2439 balance_cpu = i;
2440 }
2441
2442 load = target_load(i, load_idx);
2443 } else {
2444 load = source_load(i, load_idx);
2445 if (load > max_cpu_load)
2446 max_cpu_load = load;
2447 if (min_cpu_load > load)
2448 min_cpu_load = load;
2449 }
2450
2451 sgs->group_load += load;
2452 sgs->sum_nr_running += rq->nr_running;
2453 sgs->sum_weighted_load += weighted_cpuload(i);
2454
2455 }
2456
2457 /*
2458 * First idle cpu or the first cpu(busiest) in this sched group
2459 * is eligible for doing load balancing at this and above
2460 * domains. In the newly idle case, we will allow all the cpu's
2461 * to do the newly idle load balance.
2462 */
2463 if (idle != CPU_NEWLY_IDLE && local_group &&
2464 balance_cpu != this_cpu) {
2465 *balance = 0;
2466 return;
2467 }
2468
2469 update_group_power(sd, this_cpu);
2470
2471 /* Adjust by relative CPU power of the group */
2472 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2473
2474 /*
2475 * Consider the group unbalanced when the imbalance is larger
2476 * than the average weight of two tasks.
2477 *
2478 * APZ: with cgroup the avg task weight can vary wildly and
2479 * might not be a suitable number - should we keep a
2480 * normalized nr_running number somewhere that negates
2481 * the hierarchy?
2482 */
2483 if (sgs->sum_nr_running)
2484 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2485
2486 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
2487 sgs->group_imb = 1;
2488
2489 sgs->group_capacity =
2490 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2491}
2492
2493/**
2494 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
2495 * @sd: sched_domain whose statistics are to be updated.
2496 * @this_cpu: Cpu for which load balance is currently performed.
2497 * @idle: Idle status of this_cpu
2498 * @sd_idle: Idle status of the sched_domain containing group.
2499 * @cpus: Set of cpus considered for load balancing.
2500 * @balance: Should we balance.
2501 * @sds: variable to hold the statistics for this sched_domain.
2502 */
2503static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2504 enum cpu_idle_type idle, int *sd_idle,
2505 const struct cpumask *cpus, int *balance,
2506 struct sd_lb_stats *sds)
2507{
2508 struct sched_domain *child = sd->child;
2509 struct sched_group *group = sd->groups;
2510 struct sg_lb_stats sgs;
2511 int load_idx, prefer_sibling = 0;
2512
2513 if (child && child->flags & SD_PREFER_SIBLING)
2514 prefer_sibling = 1;
2515
2516 init_sd_power_savings_stats(sd, sds, idle);
2517 load_idx = get_sd_load_idx(sd, idle);
2518
2519 do {
2520 int local_group;
2521
2522 local_group = cpumask_test_cpu(this_cpu,
2523 sched_group_cpus(group));
2524 memset(&sgs, 0, sizeof(sgs));
2525 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
2526 local_group, cpus, balance, &sgs);
2527
2528 if (local_group && !(*balance))
2529 return;
2530
2531 sds->total_load += sgs.group_load;
2532 sds->total_pwr += group->cpu_power;
2533
2534 /*
2535 * In case the child domain prefers tasks go to siblings
2536 * first, lower the group capacity to one so that we'll try
2537 * and move all the excess tasks away.
2538 */
2539 if (prefer_sibling)
2540 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2541
2542 if (local_group) {
2543 sds->this_load = sgs.avg_load;
2544 sds->this = group;
2545 sds->this_nr_running = sgs.sum_nr_running;
2546 sds->this_load_per_task = sgs.sum_weighted_load;
2547 } else if (sgs.avg_load > sds->max_load &&
2548 (sgs.sum_nr_running > sgs.group_capacity ||
2549 sgs.group_imb)) {
2550 sds->max_load = sgs.avg_load;
2551 sds->busiest = group;
2552 sds->busiest_nr_running = sgs.sum_nr_running;
2553 sds->busiest_group_capacity = sgs.group_capacity;
2554 sds->busiest_load_per_task = sgs.sum_weighted_load;
2555 sds->group_imb = sgs.group_imb;
2556 }
2557
2558 update_sd_power_savings_stats(group, sds, local_group, &sgs);
2559 group = group->next;
2560 } while (group != sd->groups);
2561}
2562
2563/**
2564 * fix_small_imbalance - Calculate the minor imbalance that exists
2565 * amongst the groups of a sched_domain, during
2566 * load balancing.
2567 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
2568 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
2569 * @imbalance: Variable to store the imbalance.
2570 */
2571static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2572 int this_cpu, unsigned long *imbalance)
2573{
2574 unsigned long tmp, pwr_now = 0, pwr_move = 0;
2575 unsigned int imbn = 2;
2576 unsigned long scaled_busy_load_per_task;
2577
2578 if (sds->this_nr_running) {
2579 sds->this_load_per_task /= sds->this_nr_running;
2580 if (sds->busiest_load_per_task >
2581 sds->this_load_per_task)
2582 imbn = 1;
2583 } else
2584 sds->this_load_per_task =
2585 cpu_avg_load_per_task(this_cpu);
2586
2587 scaled_busy_load_per_task = sds->busiest_load_per_task
2588 * SCHED_LOAD_SCALE;
2589 scaled_busy_load_per_task /= sds->busiest->cpu_power;
2590
2591 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
2592 (scaled_busy_load_per_task * imbn)) {
2593 *imbalance = sds->busiest_load_per_task;
2594 return;
2595 }
2596
2597 /*
2598 * OK, we don't have enough imbalance to justify moving tasks,
2599 * however we may be able to increase total CPU power used by
2600 * moving them.
2601 */
2602
2603 pwr_now += sds->busiest->cpu_power *
2604 min(sds->busiest_load_per_task, sds->max_load);
2605 pwr_now += sds->this->cpu_power *
2606 min(sds->this_load_per_task, sds->this_load);
2607 pwr_now /= SCHED_LOAD_SCALE;
2608
2609 /* Amount of load we'd subtract */
2610 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2611 sds->busiest->cpu_power;
2612 if (sds->max_load > tmp)
2613 pwr_move += sds->busiest->cpu_power *
2614 min(sds->busiest_load_per_task, sds->max_load - tmp);
2615
2616 /* Amount of load we'd add */
2617 if (sds->max_load * sds->busiest->cpu_power <
2618 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
2619 tmp = (sds->max_load * sds->busiest->cpu_power) /
2620 sds->this->cpu_power;
2621 else
2622 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2623 sds->this->cpu_power;
2624 pwr_move += sds->this->cpu_power *
2625 min(sds->this_load_per_task, sds->this_load + tmp);
2626 pwr_move /= SCHED_LOAD_SCALE;
2627
2628 /* Move if we gain throughput */
2629 if (pwr_move > pwr_now)
2630 *imbalance = sds->busiest_load_per_task;
2631}
2632
2633/**
2634 * calculate_imbalance - Calculate the amount of imbalance present within the
2635 * groups of a given sched_domain during load balance.
2636 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
2637 * @this_cpu: Cpu for which currently load balance is being performed.
2638 * @imbalance: The variable to store the imbalance.
2639 */
2640static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2641 unsigned long *imbalance)
2642{
2643 unsigned long max_pull, load_above_capacity = ~0UL;
2644
2645 sds->busiest_load_per_task /= sds->busiest_nr_running;
2646 if (sds->group_imb) {
2647 sds->busiest_load_per_task =
2648 min(sds->busiest_load_per_task, sds->avg_load);
2649 }
2650
2651 /*
2652 * In the presence of smp nice balancing, certain scenarios can have
2653 * max load less than avg load(as we skip the groups at or below
2654 * its cpu_power, while calculating max_load..)
2655 */
2656 if (sds->max_load < sds->avg_load) {
2657 *imbalance = 0;
2658 return fix_small_imbalance(sds, this_cpu, imbalance);
2659 }
2660
2661 if (!sds->group_imb) {
2662 /*
2663 * Don't want to pull so many tasks that a group would go idle.
2664 */
2665 load_above_capacity = (sds->busiest_nr_running -
2666 sds->busiest_group_capacity);
2667
2668 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
2669
2670 load_above_capacity /= sds->busiest->cpu_power;
2671 }
2672
2673 /*
2674 * We're trying to get all the cpus to the average_load, so we don't
2675 * want to push ourselves above the average load, nor do we wish to
2676 * reduce the max loaded cpu below the average load. At the same time,
2677 * we also don't want to reduce the group load below the group capacity
2678 * (so that we can implement power-savings policies etc). Thus we look
2679 * for the minimum possible imbalance.
2680 * Be careful of negative numbers as they'll appear as very large values
2681 * with unsigned longs.
2682 */
2683 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
2684
2685 /* How much load to actually move to equalise the imbalance */
2686 *imbalance = min(max_pull * sds->busiest->cpu_power,
2687 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
2688 / SCHED_LOAD_SCALE;
2689
2690 /*
2691 * if *imbalance is less than the average load per runnable task
2692 * there is no gaurantee that any tasks will be moved so we'll have
2693 * a think about bumping its value to force at least one task to be
2694 * moved
2695 */
2696 if (*imbalance < sds->busiest_load_per_task)
2697 return fix_small_imbalance(sds, this_cpu, imbalance);
2698
2699}
2700/******* find_busiest_group() helpers end here *********************/
2701
2702/**
2703 * find_busiest_group - Returns the busiest group within the sched_domain
2704 * if there is an imbalance. If there isn't an imbalance, and
2705 * the user has opted for power-savings, it returns a group whose
2706 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
2707 * such a group exists.
2708 *
2709 * Also calculates the amount of weighted load which should be moved
2710 * to restore balance.
2711 *
2712 * @sd: The sched_domain whose busiest group is to be returned.
2713 * @this_cpu: The cpu for which load balancing is currently being performed.
2714 * @imbalance: Variable which stores amount of weighted load which should
2715 * be moved to restore balance/put a group to idle.
2716 * @idle: The idle status of this_cpu.
2717 * @sd_idle: The idleness of sd
2718 * @cpus: The set of CPUs under consideration for load-balancing.
2719 * @balance: Pointer to a variable indicating if this_cpu
2720 * is the appropriate cpu to perform load balancing at this_level.
2721 *
2722 * Returns: - the busiest group if imbalance exists.
2723 * - If no imbalance and user has opted for power-savings balance,
2724 * return the least loaded group whose CPUs can be
2725 * put to idle by rebalancing its tasks onto our group.
2726 */
2727static struct sched_group *
2728find_busiest_group(struct sched_domain *sd, int this_cpu,
2729 unsigned long *imbalance, enum cpu_idle_type idle,
2730 int *sd_idle, const struct cpumask *cpus, int *balance)
2731{
2732 struct sd_lb_stats sds;
2733
2734 memset(&sds, 0, sizeof(sds));
2735
2736 /*
2737 * Compute the various statistics relavent for load balancing at
2738 * this level.
2739 */
2740 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
2741 balance, &sds);
2742
2743 /* Cases where imbalance does not exist from POV of this_cpu */
2744 /* 1) this_cpu is not the appropriate cpu to perform load balancing
2745 * at this level.
2746 * 2) There is no busy sibling group to pull from.
2747 * 3) This group is the busiest group.
2748 * 4) This group is more busy than the avg busieness at this
2749 * sched_domain.
2750 * 5) The imbalance is within the specified limit.
2751 */
2752 if (!(*balance))
2753 goto ret;
2754
2755 if (!sds.busiest || sds.busiest_nr_running == 0)
2756 goto out_balanced;
2757
2758 if (sds.this_load >= sds.max_load)
2759 goto out_balanced;
2760
2761 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
2762
2763 if (sds.this_load >= sds.avg_load)
2764 goto out_balanced;
2765
2766 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2767 goto out_balanced;
2768
2769 /* Looks like there is an imbalance. Compute it */
2770 calculate_imbalance(&sds, this_cpu, imbalance);
2771 return sds.busiest;
2772
2773out_balanced:
2774 /*
2775 * There is no obvious imbalance. But check if we can do some balancing
2776 * to save power.
2777 */
2778 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
2779 return sds.busiest;
2780ret:
2781 *imbalance = 0;
2782 return NULL;
2783}
2784
2785/*
2786 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2787 */
2788static struct rq *
2789find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2790 unsigned long imbalance, const struct cpumask *cpus)
2791{
2792 struct rq *busiest = NULL, *rq;
2793 unsigned long max_load = 0;
2794 int i;
2795
2796 for_each_cpu(i, sched_group_cpus(group)) {
2797 unsigned long power = power_of(i);
2798 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
2799 unsigned long wl;
2800
2801 if (!cpumask_test_cpu(i, cpus))
2802 continue;
2803
2804 rq = cpu_rq(i);
2805 wl = weighted_cpuload(i);
2806
2807 /*
2808 * When comparing with imbalance, use weighted_cpuload()
2809 * which is not scaled with the cpu power.
2810 */
2811 if (capacity && rq->nr_running == 1 && wl > imbalance)
2812 continue;
2813
2814 /*
2815 * For the load comparisons with the other cpu's, consider
2816 * the weighted_cpuload() scaled with the cpu power, so that
2817 * the load can be moved away from the cpu that is potentially
2818 * running at a lower capacity.
2819 */
2820 wl = (wl * SCHED_LOAD_SCALE) / power;
2821
2822 if (wl > max_load) {
2823 max_load = wl;
2824 busiest = rq;
2825 }
2826 }
2827
2828 return busiest;
2829}
2830
2831/*
2832 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
2833 * so long as it is large enough.
2834 */
2835#define MAX_PINNED_INTERVAL 512
2836
2837/* Working cpumask for load_balance and load_balance_newidle. */
2838static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2839
2840static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
2841{
2842 if (idle == CPU_NEWLY_IDLE) {
2843 /*
2844 * The only task running in a non-idle cpu can be moved to this
2845 * cpu in an attempt to completely freeup the other CPU
2846 * package.
2847 *
2848 * The package power saving logic comes from
2849 * find_busiest_group(). If there are no imbalance, then
2850 * f_b_g() will return NULL. However when sched_mc={1,2} then
2851 * f_b_g() will select a group from which a running task may be
2852 * pulled to this cpu in order to make the other package idle.
2853 * If there is no opportunity to make a package idle and if
2854 * there are no imbalance, then f_b_g() will return NULL and no
2855 * action will be taken in load_balance_newidle().
2856 *
2857 * Under normal task pull operation due to imbalance, there
2858 * will be more than one task in the source run queue and
2859 * move_tasks() will succeed. ld_moved will be true and this
2860 * active balance code will not be triggered.
2861 */
2862 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2863 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2864 return 0;
2865
2866 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
2867 return 0;
2868 }
2869
2870 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
2871}
2872
2873/*
2874 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2875 * tasks if there is an imbalance.
2876 */
2877static int load_balance(int this_cpu, struct rq *this_rq,
2878 struct sched_domain *sd, enum cpu_idle_type idle,
2879 int *balance)
2880{
2881 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2882 struct sched_group *group;
2883 unsigned long imbalance;
2884 struct rq *busiest;
2885 unsigned long flags;
2886 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
2887
2888 cpumask_copy(cpus, cpu_active_mask);
2889
2890 /*
2891 * When power savings policy is enabled for the parent domain, idle
2892 * sibling can pick up load irrespective of busy siblings. In this case,
2893 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2894 * portraying it as CPU_NOT_IDLE.
2895 */
2896 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2897 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2898 sd_idle = 1;
2899
2900 schedstat_inc(sd, lb_count[idle]);
2901
2902redo:
2903 update_shares(sd);
2904 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2905 cpus, balance);
2906
2907 if (*balance == 0)
2908 goto out_balanced;
2909
2910 if (!group) {
2911 schedstat_inc(sd, lb_nobusyg[idle]);
2912 goto out_balanced;
2913 }
2914
2915 busiest = find_busiest_queue(group, idle, imbalance, cpus);
2916 if (!busiest) {
2917 schedstat_inc(sd, lb_nobusyq[idle]);
2918 goto out_balanced;
2919 }
2920
2921 BUG_ON(busiest == this_rq);
2922
2923 schedstat_add(sd, lb_imbalance[idle], imbalance);
2924
2925 ld_moved = 0;
2926 if (busiest->nr_running > 1) {
2927 /*
2928 * Attempt to move tasks. If find_busiest_group has found
2929 * an imbalance but busiest->nr_running <= 1, the group is
2930 * still unbalanced. ld_moved simply stays zero, so it is
2931 * correctly treated as an imbalance.
2932 */
2933 local_irq_save(flags);
2934 double_rq_lock(this_rq, busiest);
2935 ld_moved = move_tasks(this_rq, this_cpu, busiest,
2936 imbalance, sd, idle, &all_pinned);
2937 double_rq_unlock(this_rq, busiest);
2938 local_irq_restore(flags);
2939
2940 /*
2941 * some other cpu did the load balance for us.
2942 */
2943 if (ld_moved && this_cpu != smp_processor_id())
2944 resched_cpu(this_cpu);
2945
2946 /* All tasks on this runqueue were pinned by CPU affinity */
2947 if (unlikely(all_pinned)) {
2948 cpumask_clear_cpu(cpu_of(busiest), cpus);
2949 if (!cpumask_empty(cpus))
2950 goto redo;
2951 goto out_balanced;
2952 }
2953 }
2954
2955 if (!ld_moved) {
2956 schedstat_inc(sd, lb_failed[idle]);
2957 sd->nr_balance_failed++;
2958
2959 if (need_active_balance(sd, sd_idle, idle)) {
2960 raw_spin_lock_irqsave(&busiest->lock, flags);
2961
2962 /* don't kick the migration_thread, if the curr
2963 * task on busiest cpu can't be moved to this_cpu
2964 */
2965 if (!cpumask_test_cpu(this_cpu,
2966 &busiest->curr->cpus_allowed)) {
2967 raw_spin_unlock_irqrestore(&busiest->lock,
2968 flags);
2969 all_pinned = 1;
2970 goto out_one_pinned;
2971 }
2972
2973 if (!busiest->active_balance) {
2974 busiest->active_balance = 1;
2975 busiest->push_cpu = this_cpu;
2976 active_balance = 1;
2977 }
2978 raw_spin_unlock_irqrestore(&busiest->lock, flags);
2979 if (active_balance)
2980 wake_up_process(busiest->migration_thread);
2981
2982 /*
2983 * We've kicked active balancing, reset the failure
2984 * counter.
2985 */
2986 sd->nr_balance_failed = sd->cache_nice_tries+1;
2987 }
2988 } else
2989 sd->nr_balance_failed = 0;
2990
2991 if (likely(!active_balance)) {
2992 /* We were unbalanced, so reset the balancing interval */
2993 sd->balance_interval = sd->min_interval;
2994 } else {
2995 /*
2996 * If we've begun active balancing, start to back off. This
2997 * case may not be covered by the all_pinned logic if there
2998 * is only 1 task on the busy runqueue (because we don't call
2999 * move_tasks).
3000 */
3001 if (sd->balance_interval < sd->max_interval)
3002 sd->balance_interval *= 2;
3003 }
3004
3005 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3006 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3007 ld_moved = -1;
3008
3009 goto out;
3010
3011out_balanced:
3012 schedstat_inc(sd, lb_balanced[idle]);
3013
3014 sd->nr_balance_failed = 0;
3015
3016out_one_pinned:
3017 /* tune up the balancing interval */
3018 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
3019 (sd->balance_interval < sd->max_interval))
3020 sd->balance_interval *= 2;
3021
3022 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3023 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3024 ld_moved = -1;
3025 else
3026 ld_moved = 0;
3027out:
3028 if (ld_moved)
3029 update_shares(sd);
3030 return ld_moved;
3031}
3032
3033/*
3034 * idle_balance is called by schedule() if this_cpu is about to become
3035 * idle. Attempts to pull tasks from other CPUs.
3036 */
3037static void idle_balance(int this_cpu, struct rq *this_rq)
3038{
3039 struct sched_domain *sd;
3040 int pulled_task = 0;
3041 unsigned long next_balance = jiffies + HZ;
3042
3043 this_rq->idle_stamp = this_rq->clock;
3044
3045 if (this_rq->avg_idle < sysctl_sched_migration_cost)
3046 return;
3047
3048 /*
3049 * Drop the rq->lock, but keep IRQ/preempt disabled.
3050 */
3051 raw_spin_unlock(&this_rq->lock);
3052
3053 for_each_domain(this_cpu, sd) {
3054 unsigned long interval;
3055 int balance = 1;
3056
3057 if (!(sd->flags & SD_LOAD_BALANCE))
3058 continue;
3059
3060 if (sd->flags & SD_BALANCE_NEWIDLE) {
3061 /* If we've pulled tasks over stop searching: */
3062 pulled_task = load_balance(this_cpu, this_rq,
3063 sd, CPU_NEWLY_IDLE, &balance);
3064 }
3065
3066 interval = msecs_to_jiffies(sd->balance_interval);
3067 if (time_after(next_balance, sd->last_balance + interval))
3068 next_balance = sd->last_balance + interval;
3069 if (pulled_task) {
3070 this_rq->idle_stamp = 0;
3071 break;
3072 }
3073 }
3074
3075 raw_spin_lock(&this_rq->lock);
3076
3077 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
3078 /*
3079 * We are going idle. next_balance may be set based on
3080 * a busy processor. So reset next_balance.
3081 */
3082 this_rq->next_balance = next_balance;
3083 }
3084}
3085
3086/*
3087 * active_load_balance is run by migration threads. It pushes running tasks
3088 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
3089 * running on each physical CPU where possible, and avoids physical /
3090 * logical imbalances.
3091 *
3092 * Called with busiest_rq locked.
3093 */
3094static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3095{
3096 int target_cpu = busiest_rq->push_cpu;
3097 struct sched_domain *sd;
3098 struct rq *target_rq;
3099
3100 /* Is there any task to move? */
3101 if (busiest_rq->nr_running <= 1)
3102 return;
3103
3104 target_rq = cpu_rq(target_cpu);
3105
3106 /*
3107 * This condition is "impossible", if it occurs
3108 * we need to fix it. Originally reported by
3109 * Bjorn Helgaas on a 128-cpu setup.
3110 */
3111 BUG_ON(busiest_rq == target_rq);
3112
3113 /* move a task from busiest_rq to target_rq */
3114 double_lock_balance(busiest_rq, target_rq);
3115 update_rq_clock(busiest_rq);
3116 update_rq_clock(target_rq);
3117
3118 /* Search for an sd spanning us and the target CPU. */
3119 for_each_domain(target_cpu, sd) {
3120 if ((sd->flags & SD_LOAD_BALANCE) &&
3121 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3122 break;
3123 }
3124
3125 if (likely(sd)) {
3126 schedstat_inc(sd, alb_count);
3127
3128 if (move_one_task(target_rq, target_cpu, busiest_rq,
3129 sd, CPU_IDLE))
3130 schedstat_inc(sd, alb_pushed);
3131 else
3132 schedstat_inc(sd, alb_failed);
3133 }
3134 double_unlock_balance(busiest_rq, target_rq);
3135}
3136
3137#ifdef CONFIG_NO_HZ
3138static struct {
3139 atomic_t load_balancer;
3140 cpumask_var_t cpu_mask;
3141 cpumask_var_t ilb_grp_nohz_mask;
3142} nohz ____cacheline_aligned = {
3143 .load_balancer = ATOMIC_INIT(-1),
3144};
3145
3146int get_nohz_load_balancer(void)
3147{
3148 return atomic_read(&nohz.load_balancer);
3149}
3150
3151#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3152/**
3153 * lowest_flag_domain - Return lowest sched_domain containing flag.
3154 * @cpu: The cpu whose lowest level of sched domain is to
3155 * be returned.
3156 * @flag: The flag to check for the lowest sched_domain
3157 * for the given cpu.
3158 *
3159 * Returns the lowest sched_domain of a cpu which contains the given flag.
3160 */
3161static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3162{
3163 struct sched_domain *sd;
3164
3165 for_each_domain(cpu, sd)
3166 if (sd && (sd->flags & flag))
3167 break;
3168
3169 return sd;
3170}
3171
/**
 * for_each_flag_domain - Iterates over sched_domains containing the flag.
 * @cpu:	The cpu whose domains we're iterating over.
 * @sd:		variable holding the value of the power_savings_sd
 *		for cpu.
 * @flag:	The flag to filter the sched_domains to be iterated.
 *
 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
 * set, starting from the lowest sched_domain to the highest.  Iteration
 * stops at the first ancestor that does not have the flag set.
 *
 * The macro arguments are parenthesized so that compound expressions
 * (e.g. FLAG_A | FLAG_B) are combined with '&' as the caller intends.
 */
#define for_each_flag_domain(cpu, sd, flag) \
	for ((sd) = lowest_flag_domain((cpu), (flag)); \
		((sd) && ((sd)->flags & (flag))); (sd) = (sd)->parent)
3185
3186/**
3187 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
3188 * @ilb_group: group to be checked for semi-idleness
3189 *
3190 * Returns: 1 if the group is semi-idle. 0 otherwise.
3191 *
3192 * We define a sched_group to be semi idle if it has atleast one idle-CPU
3193 * and atleast one non-idle CPU. This helper function checks if the given
3194 * sched_group is semi-idle or not.
3195 */
3196static inline int is_semi_idle_group(struct sched_group *ilb_group)
3197{
3198 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
3199 sched_group_cpus(ilb_group));
3200
3201 /*
3202 * A sched_group is semi-idle when it has atleast one busy cpu
3203 * and atleast one idle cpu.
3204 */
3205 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
3206 return 0;
3207
3208 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
3209 return 0;
3210
3211 return 1;
3212}
3213/**
3214 * find_new_ilb - Finds the optimum idle load balancer for nomination.
3215 * @cpu: The cpu which is nominating a new idle_load_balancer.
3216 *
3217 * Returns: Returns the id of the idle load balancer if it exists,
3218 * Else, returns >= nr_cpu_ids.
3219 *
3220 * This algorithm picks the idle load balancer such that it belongs to a
3221 * semi-idle powersavings sched_domain. The idea is to try and avoid
3222 * completely idle packages/cores just for the purpose of idle load balancing
3223 * when there are other idle cpu's which are better suited for that job.
3224 */
3225static int find_new_ilb(int cpu)
3226{
3227 struct sched_domain *sd;
3228 struct sched_group *ilb_group;
3229
3230 /*
3231 * Have idle load balancer selection from semi-idle packages only
3232 * when power-aware load balancing is enabled
3233 */
3234 if (!(sched_smt_power_savings || sched_mc_power_savings))
3235 goto out_done;
3236
3237 /*
3238 * Optimize for the case when we have no idle CPUs or only one
3239 * idle CPU. Don't walk the sched_domain hierarchy in such cases
3240 */
3241 if (cpumask_weight(nohz.cpu_mask) < 2)
3242 goto out_done;
3243
3244 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3245 ilb_group = sd->groups;
3246
3247 do {
3248 if (is_semi_idle_group(ilb_group))
3249 return cpumask_first(nohz.ilb_grp_nohz_mask);
3250
3251 ilb_group = ilb_group->next;
3252
3253 } while (ilb_group != sd->groups);
3254 }
3255
3256out_done:
3257 return cpumask_first(nohz.cpu_mask);
3258}
3259#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3260static inline int find_new_ilb(int call_cpu)
3261{
3262 return cpumask_first(nohz.cpu_mask);
3263}
3264#endif
3265
3266/*
3267 * This routine will try to nominate the ilb (idle load balancing)
3268 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
3269 * load balancing on behalf of all those cpus. If all the cpus in the system
3270 * go into this tickless mode, then there will be no ilb owner (as there is
3271 * no need for one) and all the cpus will sleep till the next wakeup event
3272 * arrives...
3273 *
3274 * For the ilb owner, tick is not stopped. And this tick will be used
3275 * for idle load balancing. ilb owner will still be part of
3276 * nohz.cpu_mask..
3277 *
3278 * While stopping the tick, this cpu will become the ilb owner if there
3279 * is no other owner. And will be the owner till that cpu becomes busy
3280 * or if all cpus in the system stop their ticks at which point
3281 * there is no need for ilb owner.
3282 *
3283 * When the ilb owner becomes busy, it nominates another owner, during the
3284 * next busy scheduler_tick()
3285 */
3286int select_nohz_load_balancer(int stop_tick)
3287{
3288 int cpu = smp_processor_id();
3289
3290 if (stop_tick) {
3291 cpu_rq(cpu)->in_nohz_recently = 1;
3292
3293 if (!cpu_active(cpu)) {
3294 if (atomic_read(&nohz.load_balancer) != cpu)
3295 return 0;
3296
3297 /*
3298 * If we are going offline and still the leader,
3299 * give up!
3300 */
3301 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3302 BUG();
3303
3304 return 0;
3305 }
3306
3307 cpumask_set_cpu(cpu, nohz.cpu_mask);
3308
3309 /* time for ilb owner also to sleep */
3310 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
3311 if (atomic_read(&nohz.load_balancer) == cpu)
3312 atomic_set(&nohz.load_balancer, -1);
3313 return 0;
3314 }
3315
3316 if (atomic_read(&nohz.load_balancer) == -1) {
3317 /* make me the ilb owner */
3318 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3319 return 1;
3320 } else if (atomic_read(&nohz.load_balancer) == cpu) {
3321 int new_ilb;
3322
3323 if (!(sched_smt_power_savings ||
3324 sched_mc_power_savings))
3325 return 1;
3326 /*
3327 * Check to see if there is a more power-efficient
3328 * ilb.
3329 */
3330 new_ilb = find_new_ilb(cpu);
3331 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3332 atomic_set(&nohz.load_balancer, -1);
3333 resched_cpu(new_ilb);
3334 return 0;
3335 }
3336 return 1;
3337 }
3338 } else {
3339 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
3340 return 0;
3341
3342 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3343
3344 if (atomic_read(&nohz.load_balancer) == cpu)
3345 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3346 BUG();
3347 }
3348 return 0;
3349}
3350#endif
3351
3352static DEFINE_SPINLOCK(balancing);
3353
3354/*
3355 * It checks each scheduling domain to see if it is due to be balanced,
3356 * and initiates a balancing operation if so.
3357 *
3358 * Balancing parameters are set up in arch_init_sched_domains.
3359 */
3360static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3361{
3362 int balance = 1;
3363 struct rq *rq = cpu_rq(cpu);
3364 unsigned long interval;
3365 struct sched_domain *sd;
3366 /* Earliest time when we have to do rebalance again */
3367 unsigned long next_balance = jiffies + 60*HZ;
3368 int update_next_balance = 0;
3369 int need_serialize;
3370
3371 for_each_domain(cpu, sd) {
3372 if (!(sd->flags & SD_LOAD_BALANCE))
3373 continue;
3374
3375 interval = sd->balance_interval;
3376 if (idle != CPU_IDLE)
3377 interval *= sd->busy_factor;
3378
3379 /* scale ms to jiffies */
3380 interval = msecs_to_jiffies(interval);
3381 if (unlikely(!interval))
3382 interval = 1;
3383 if (interval > HZ*NR_CPUS/10)
3384 interval = HZ*NR_CPUS/10;
3385
3386 need_serialize = sd->flags & SD_SERIALIZE;
3387
3388 if (need_serialize) {
3389 if (!spin_trylock(&balancing))
3390 goto out;
3391 }
3392
3393 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3394 if (load_balance(cpu, rq, sd, idle, &balance)) {
3395 /*
3396 * We've pulled tasks over so either we're no
3397 * longer idle, or one of our SMT siblings is
3398 * not idle.
3399 */
3400 idle = CPU_NOT_IDLE;
3401 }
3402 sd->last_balance = jiffies;
3403 }
3404 if (need_serialize)
3405 spin_unlock(&balancing);
3406out:
3407 if (time_after(next_balance, sd->last_balance + interval)) {
3408 next_balance = sd->last_balance + interval;
3409 update_next_balance = 1;
3410 }
3411
3412 /*
3413 * Stop the load balance at this level. There is another
3414 * CPU in our sched group which is doing load balancing more
3415 * actively.
3416 */
3417 if (!balance)
3418 break;
3419 }
3420
3421 /*
3422 * next_balance will be updated only when there is a need.
3423 * When the cpu is attached to null domain for ex, it will not be
3424 * updated.
3425 */
3426 if (likely(update_next_balance))
3427 rq->next_balance = next_balance;
3428}
3429
3430/*
3431 * run_rebalance_domains is triggered when needed from the scheduler tick.
3432 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3433 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3434 */
3435static void run_rebalance_domains(struct softirq_action *h)
3436{
3437 int this_cpu = smp_processor_id();
3438 struct rq *this_rq = cpu_rq(this_cpu);
3439 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3440 CPU_IDLE : CPU_NOT_IDLE;
3441
3442 rebalance_domains(this_cpu, idle);
3443
3444#ifdef CONFIG_NO_HZ
3445 /*
3446 * If this cpu is the owner for idle load balancing, then do the
3447 * balancing on behalf of the other idle cpus whose ticks are
3448 * stopped.
3449 */
3450 if (this_rq->idle_at_tick &&
3451 atomic_read(&nohz.load_balancer) == this_cpu) {
3452 struct rq *rq;
3453 int balance_cpu;
3454
3455 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3456 if (balance_cpu == this_cpu)
3457 continue;
3458
3459 /*
3460 * If this cpu gets work to do, stop the load balancing
3461 * work being done for other cpus. Next load
3462 * balancing owner will pick it up.
3463 */
3464 if (need_resched())
3465 break;
3466
3467 rebalance_domains(balance_cpu, CPU_IDLE);
3468
3469 rq = cpu_rq(balance_cpu);
3470 if (time_after(this_rq->next_balance, rq->next_balance))
3471 this_rq->next_balance = rq->next_balance;
3472 }
3473 }
3474#endif
3475}
3476
3477static inline int on_null_domain(int cpu)
3478{
3479 return !rcu_dereference(cpu_rq(cpu)->sd);
3480}
3481
3482/*
3483 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3484 *
3485 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3486 * idle load balancing owner or decide to stop the periodic load balancing,
3487 * if the whole system is idle.
3488 */
3489static inline void trigger_load_balance(struct rq *rq, int cpu)
3490{
3491#ifdef CONFIG_NO_HZ
3492 /*
3493 * If we were in the nohz mode recently and busy at the current
3494 * scheduler tick, then check if we need to nominate new idle
3495 * load balancer.
3496 */
3497 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3498 rq->in_nohz_recently = 0;
3499
3500 if (atomic_read(&nohz.load_balancer) == cpu) {
3501 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3502 atomic_set(&nohz.load_balancer, -1);
3503 }
3504
3505 if (atomic_read(&nohz.load_balancer) == -1) {
3506 int ilb = find_new_ilb(cpu);
3507
3508 if (ilb < nr_cpu_ids)
3509 resched_cpu(ilb);
3510 }
3511 }
3512
3513 /*
3514 * If this cpu is idle and doing idle load balancing for all the
3515 * cpus with ticks stopped, is it time for that to stop?
3516 */
3517 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3518 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3519 resched_cpu(cpu);
3520 return;
3521 }
3522
3523 /*
3524 * If this cpu is idle and the idle load balancing is done by
3525 * someone else, then no need raise the SCHED_SOFTIRQ
3526 */
3527 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3528 cpumask_test_cpu(cpu, nohz.cpu_mask))
3529 return;
3530#endif
3531 /* Don't need to rebalance while attached to NULL domain */
3532 if (time_after_eq(jiffies, rq->next_balance) &&
3533 likely(!on_null_domain(cpu)))
3534 raise_softirq(SCHED_SOFTIRQ);
3535}
1954 3536
1955static void rq_online_fair(struct rq *rq) 3537static void rq_online_fair(struct rq *rq)
1956{ 3538{
@@ -1962,6 +3544,15 @@ static void rq_offline_fair(struct rq *rq)
1962 update_sysctl(); 3544 update_sysctl();
1963} 3545}
1964 3546
3547#else /* CONFIG_SMP */
3548
3549/*
3550 * on UP we do not need to balance between CPUs:
3551 */
3552static inline void idle_balance(int cpu, struct rq *rq)
3553{
3554}
3555
1965#endif /* CONFIG_SMP */ 3556#endif /* CONFIG_SMP */
1966 3557
1967/* 3558/*
@@ -2076,7 +3667,7 @@ static void moved_group_fair(struct task_struct *p, int on_rq)
2076} 3667}
2077#endif 3668#endif
2078 3669
2079unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 3670static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
2080{ 3671{
2081 struct sched_entity *se = &task->se; 3672 struct sched_entity *se = &task->se;
2082 unsigned int rr_interval = 0; 3673 unsigned int rr_interval = 0;
@@ -2108,8 +3699,6 @@ static const struct sched_class fair_sched_class = {
2108#ifdef CONFIG_SMP 3699#ifdef CONFIG_SMP
2109 .select_task_rq = select_task_rq_fair, 3700 .select_task_rq = select_task_rq_fair,
2110 3701
2111 .load_balance = load_balance_fair,
2112 .move_one_task = move_one_task_fair,
2113 .rq_online = rq_online_fair, 3702 .rq_online = rq_online_fair,
2114 .rq_offline = rq_offline_fair, 3703 .rq_offline = rq_offline_fair,
2115 3704
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 5f93b570d383..a8a6d8a50947 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -44,24 +44,6 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
44{ 44{
45} 45}
46 46
47#ifdef CONFIG_SMP
48static unsigned long
49load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
50 unsigned long max_load_move,
51 struct sched_domain *sd, enum cpu_idle_type idle,
52 int *all_pinned, int *this_best_prio)
53{
54 return 0;
55}
56
57static int
58move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
59 struct sched_domain *sd, enum cpu_idle_type idle)
60{
61 return 0;
62}
63#endif
64
65static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 47static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
66{ 48{
67} 49}
@@ -97,7 +79,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 79 check_preempt_curr(rq, p, 0);
98} 80}
99 81
100unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) 82static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
101{ 83{
102 return 0; 84 return 0;
103} 85}
@@ -119,9 +101,6 @@ static const struct sched_class idle_sched_class = {
119 101
120#ifdef CONFIG_SMP 102#ifdef CONFIG_SMP
121 .select_task_rq = select_task_rq_idle, 103 .select_task_rq = select_task_rq_idle,
122
123 .load_balance = load_balance_idle,
124 .move_one_task = move_one_task_idle,
125#endif 104#endif
126 105
127 .set_curr_task = set_curr_task_idle, 106 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f48328ac216f..bf3e38fdbe6d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -194,17 +194,20 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
194 return rt_se->my_q; 194 return rt_se->my_q;
195} 195}
196 196
197static void enqueue_rt_entity(struct sched_rt_entity *rt_se); 197static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
198static void dequeue_rt_entity(struct sched_rt_entity *rt_se); 198static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
199 199
200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
201{ 201{
202 int this_cpu = smp_processor_id();
202 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 203 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
203 struct sched_rt_entity *rt_se = rt_rq->rt_se; 204 struct sched_rt_entity *rt_se;
205
206 rt_se = rt_rq->tg->rt_se[this_cpu];
204 207
205 if (rt_rq->rt_nr_running) { 208 if (rt_rq->rt_nr_running) {
206 if (rt_se && !on_rt_rq(rt_se)) 209 if (rt_se && !on_rt_rq(rt_se))
207 enqueue_rt_entity(rt_se); 210 enqueue_rt_entity(rt_se, false);
208 if (rt_rq->highest_prio.curr < curr->prio) 211 if (rt_rq->highest_prio.curr < curr->prio)
209 resched_task(curr); 212 resched_task(curr);
210 } 213 }
@@ -212,7 +215,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
212 215
213static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 216static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
214{ 217{
215 struct sched_rt_entity *rt_se = rt_rq->rt_se; 218 int this_cpu = smp_processor_id();
219 struct sched_rt_entity *rt_se;
220
221 rt_se = rt_rq->tg->rt_se[this_cpu];
216 222
217 if (rt_se && on_rt_rq(rt_se)) 223 if (rt_se && on_rt_rq(rt_se))
218 dequeue_rt_entity(rt_se); 224 dequeue_rt_entity(rt_se);
@@ -803,7 +809,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
803 dec_rt_group(rt_se, rt_rq); 809 dec_rt_group(rt_se, rt_rq);
804} 810}
805 811
806static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) 812static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
807{ 813{
808 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 814 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
809 struct rt_prio_array *array = &rt_rq->active; 815 struct rt_prio_array *array = &rt_rq->active;
@@ -819,7 +825,10 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
819 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
820 return; 826 return;
821 827
822 list_add_tail(&rt_se->run_list, queue); 828 if (head)
829 list_add(&rt_se->run_list, queue);
830 else
831 list_add_tail(&rt_se->run_list, queue);
823 __set_bit(rt_se_prio(rt_se), array->bitmap); 832 __set_bit(rt_se_prio(rt_se), array->bitmap);
824 833
825 inc_rt_tasks(rt_se, rt_rq); 834 inc_rt_tasks(rt_se, rt_rq);
@@ -856,11 +865,11 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
856 } 865 }
857} 866}
858 867
859static void enqueue_rt_entity(struct sched_rt_entity *rt_se) 868static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
860{ 869{
861 dequeue_rt_stack(rt_se); 870 dequeue_rt_stack(rt_se);
862 for_each_sched_rt_entity(rt_se) 871 for_each_sched_rt_entity(rt_se)
863 __enqueue_rt_entity(rt_se); 872 __enqueue_rt_entity(rt_se, head);
864} 873}
865 874
866static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 875static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
@@ -871,21 +880,22 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
871 struct rt_rq *rt_rq = group_rt_rq(rt_se); 880 struct rt_rq *rt_rq = group_rt_rq(rt_se);
872 881
873 if (rt_rq && rt_rq->rt_nr_running) 882 if (rt_rq && rt_rq->rt_nr_running)
874 __enqueue_rt_entity(rt_se); 883 __enqueue_rt_entity(rt_se, false);
875 } 884 }
876} 885}
877 886
878/* 887/*
879 * Adding/removing a task to/from a priority array: 888 * Adding/removing a task to/from a priority array:
880 */ 889 */
881static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) 890static void
891enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head)
882{ 892{
883 struct sched_rt_entity *rt_se = &p->rt; 893 struct sched_rt_entity *rt_se = &p->rt;
884 894
885 if (wakeup) 895 if (wakeup)
886 rt_se->timeout = 0; 896 rt_se->timeout = 0;
887 897
888 enqueue_rt_entity(rt_se); 898 enqueue_rt_entity(rt_se, head);
889 899
890 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
891 enqueue_pushable_task(rq, p); 901 enqueue_pushable_task(rq, p);
@@ -1481,24 +1491,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1481 push_rt_tasks(rq); 1491 push_rt_tasks(rq);
1482} 1492}
1483 1493
1484static unsigned long
1485load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1486 unsigned long max_load_move,
1487 struct sched_domain *sd, enum cpu_idle_type idle,
1488 int *all_pinned, int *this_best_prio)
1489{
1490 /* don't touch RT tasks */
1491 return 0;
1492}
1493
1494static int
1495move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1496 struct sched_domain *sd, enum cpu_idle_type idle)
1497{
1498 /* don't touch RT tasks */
1499 return 0;
1500}
1501
1502static void set_cpus_allowed_rt(struct task_struct *p, 1494static void set_cpus_allowed_rt(struct task_struct *p,
1503 const struct cpumask *new_mask) 1495 const struct cpumask *new_mask)
1504{ 1496{
@@ -1721,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq)
1721 dequeue_pushable_task(rq, p); 1713 dequeue_pushable_task(rq, p);
1722} 1714}
1723 1715
1724unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) 1716static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1725{ 1717{
1726 /* 1718 /*
1727 * Time slice is 0 for SCHED_FIFO tasks 1719 * Time slice is 0 for SCHED_FIFO tasks
@@ -1746,8 +1738,6 @@ static const struct sched_class rt_sched_class = {
1746#ifdef CONFIG_SMP 1738#ifdef CONFIG_SMP
1747 .select_task_rq = select_task_rq_rt, 1739 .select_task_rq = select_task_rq_rt,
1748 1740
1749 .load_balance = load_balance_rt,
1750 .move_one_task = move_one_task_rt,
1751 .set_cpus_allowed = set_cpus_allowed_rt, 1741 .set_cpus_allowed = set_cpus_allowed_rt,
1752 .rq_online = rq_online_rt, 1742 .rq_online = rq_online_rt,
1753 .rq_offline = rq_offline_rt, 1743 .rq_offline = rq_offline_rt,
diff --git a/kernel/smp.c b/kernel/smp.c
index f10408422444..9867b6bfefce 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -12,8 +12,6 @@
12#include <linux/smp.h> 12#include <linux/smp.h>
13#include <linux/cpu.h> 13#include <linux/cpu.h>
14 14
15static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
16
17static struct { 15static struct {
18 struct list_head queue; 16 struct list_head queue;
19 raw_spinlock_t lock; 17 raw_spinlock_t lock;
@@ -33,12 +31,14 @@ struct call_function_data {
33 cpumask_var_t cpumask; 31 cpumask_var_t cpumask;
34}; 32};
35 33
34static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
35
36struct call_single_queue { 36struct call_single_queue {
37 struct list_head list; 37 struct list_head list;
38 raw_spinlock_t lock; 38 raw_spinlock_t lock;
39}; 39};
40 40
41static DEFINE_PER_CPU(struct call_function_data, cfd_data); 41static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue);
42 42
43static int 43static int
44hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) 44hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -256,7 +256,7 @@ void generic_smp_call_function_single_interrupt(void)
256 } 256 }
257} 257}
258 258
259static DEFINE_PER_CPU(struct call_single_data, csd_data); 259static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
260 260
261/* 261/*
262 * smp_call_function_single - Run a function on a specific CPU 262 * smp_call_function_single - Run a function on a specific CPU
diff --git a/kernel/sys.c b/kernel/sys.c
index 18bde979f346..877fe4f8e05e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -571,11 +571,6 @@ static int set_user(struct cred *new)
571 if (!new_user) 571 if (!new_user)
572 return -EAGAIN; 572 return -EAGAIN;
573 573
574 if (!task_can_switch_user(new_user, current)) {
575 free_uid(new_user);
576 return -EINVAL;
577 }
578
579 if (atomic_read(&new_user->processes) >= 574 if (atomic_read(&new_user->processes) >=
580 current->signal->rlim[RLIMIT_NPROC].rlim_cur && 575 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
581 new_user != INIT_USER) { 576 new_user != INIT_USER) {
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 60e2ce0181ee..13e13d428cd3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -328,15 +328,6 @@ config BRANCH_TRACER
328 328
329 Say N if unsure. 329 Say N if unsure.
330 330
331config POWER_TRACER
332 bool "Trace power consumption behavior"
333 depends on X86
334 select GENERIC_TRACER
335 help
336 This tracer helps developers to analyze and optimize the kernel's
337 power management decisions, specifically the C-state and P-state
338 behavior.
339
340config KSYM_TRACER 331config KSYM_TRACER
341 bool "Trace read and write access on kernel memory locations" 332 bool "Trace read and write access on kernel memory locations"
342 depends on HAVE_HW_BREAKPOINT 333 depends on HAVE_HW_BREAKPOINT
@@ -449,7 +440,7 @@ config BLK_DEV_IO_TRACE
449 440
450config KPROBE_EVENT 441config KPROBE_EVENT
451 depends on KPROBES 442 depends on KPROBES
452 depends on X86 443 depends on HAVE_REGS_AND_STACK_ACCESS_API
453 bool "Enable kprobes-based dynamic events" 444 bool "Enable kprobes-based dynamic events"
454 select TRACING 445 select TRACING
455 default y 446 default y
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index cd9ecd89ec77..d00c6fe23f54 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -51,7 +51,9 @@ endif
51obj-$(CONFIG_EVENT_TRACING) += trace_events.o 51obj-$(CONFIG_EVENT_TRACING) += trace_events.o
52obj-$(CONFIG_EVENT_TRACING) += trace_export.o 52obj-$(CONFIG_EVENT_TRACING) += trace_export.o
53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 54ifeq ($(CONFIG_PERF_EVENTS),y)
55obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o
56endif
55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 57obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 58obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o 59obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1e6640f80454..83783579378f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,7 +22,6 @@
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/kprobes.h>
26#include <linux/ftrace.h> 25#include <linux/ftrace.h>
27#include <linux/sysctl.h> 26#include <linux/sysctl.h>
28#include <linux/ctype.h> 27#include <linux/ctype.h>
@@ -898,36 +897,6 @@ static struct dyn_ftrace *ftrace_free_records;
898 } \ 897 } \
899 } 898 }
900 899
901#ifdef CONFIG_KPROBES
902
903static int frozen_record_count;
904
905static inline void freeze_record(struct dyn_ftrace *rec)
906{
907 if (!(rec->flags & FTRACE_FL_FROZEN)) {
908 rec->flags |= FTRACE_FL_FROZEN;
909 frozen_record_count++;
910 }
911}
912
913static inline void unfreeze_record(struct dyn_ftrace *rec)
914{
915 if (rec->flags & FTRACE_FL_FROZEN) {
916 rec->flags &= ~FTRACE_FL_FROZEN;
917 frozen_record_count--;
918 }
919}
920
921static inline int record_frozen(struct dyn_ftrace *rec)
922{
923 return rec->flags & FTRACE_FL_FROZEN;
924}
925#else
926# define freeze_record(rec) ({ 0; })
927# define unfreeze_record(rec) ({ 0; })
928# define record_frozen(rec) ({ 0; })
929#endif /* CONFIG_KPROBES */
930
931static void ftrace_free_rec(struct dyn_ftrace *rec) 900static void ftrace_free_rec(struct dyn_ftrace *rec)
932{ 901{
933 rec->freelist = ftrace_free_records; 902 rec->freelist = ftrace_free_records;
@@ -1025,6 +994,21 @@ static void ftrace_bug(int failed, unsigned long ip)
1025} 994}
1026 995
1027 996
997/* Return 1 if the address range is reserved for ftrace */
998int ftrace_text_reserved(void *start, void *end)
999{
1000 struct dyn_ftrace *rec;
1001 struct ftrace_page *pg;
1002
1003 do_for_each_ftrace_rec(pg, rec) {
1004 if (rec->ip <= (unsigned long)end &&
1005 rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start)
1006 return 1;
1007 } while_for_each_ftrace_rec();
1008 return 0;
1009}
1010
1011
1028static int 1012static int
1029__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1013__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1030{ 1014{
@@ -1076,14 +1060,6 @@ static void ftrace_replace_code(int enable)
1076 !(rec->flags & FTRACE_FL_CONVERTED)) 1060 !(rec->flags & FTRACE_FL_CONVERTED))
1077 continue; 1061 continue;
1078 1062
1079 /* ignore updates to this record's mcount site */
1080 if (get_kprobe((void *)rec->ip)) {
1081 freeze_record(rec);
1082 continue;
1083 } else {
1084 unfreeze_record(rec);
1085 }
1086
1087 failed = __ftrace_replace_code(rec, enable); 1063 failed = __ftrace_replace_code(rec, enable);
1088 if (failed) { 1064 if (failed) {
1089 rec->flags |= FTRACE_FL_FAILED; 1065 rec->flags |= FTRACE_FL_FAILED;
@@ -2426,6 +2402,7 @@ static const struct file_operations ftrace_notrace_fops = {
2426static DEFINE_MUTEX(graph_lock); 2402static DEFINE_MUTEX(graph_lock);
2427 2403
2428int ftrace_graph_count; 2404int ftrace_graph_count;
2405int ftrace_graph_filter_enabled;
2429unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 2406unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2430 2407
2431static void * 2408static void *
@@ -2448,7 +2425,7 @@ static void *g_start(struct seq_file *m, loff_t *pos)
2448 mutex_lock(&graph_lock); 2425 mutex_lock(&graph_lock);
2449 2426
2450 /* Nothing, tell g_show to print all functions are enabled */ 2427 /* Nothing, tell g_show to print all functions are enabled */
2451 if (!ftrace_graph_count && !*pos) 2428 if (!ftrace_graph_filter_enabled && !*pos)
2452 return (void *)1; 2429 return (void *)1;
2453 2430
2454 return __g_next(m, pos); 2431 return __g_next(m, pos);
@@ -2494,6 +2471,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2494 mutex_lock(&graph_lock); 2471 mutex_lock(&graph_lock);
2495 if ((file->f_mode & FMODE_WRITE) && 2472 if ((file->f_mode & FMODE_WRITE) &&
2496 (file->f_flags & O_TRUNC)) { 2473 (file->f_flags & O_TRUNC)) {
2474 ftrace_graph_filter_enabled = 0;
2497 ftrace_graph_count = 0; 2475 ftrace_graph_count = 0;
2498 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2476 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
2499 } 2477 }
@@ -2519,7 +2497,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2519 struct dyn_ftrace *rec; 2497 struct dyn_ftrace *rec;
2520 struct ftrace_page *pg; 2498 struct ftrace_page *pg;
2521 int search_len; 2499 int search_len;
2522 int found = 0; 2500 int fail = 1;
2523 int type, not; 2501 int type, not;
2524 char *search; 2502 char *search;
2525 bool exists; 2503 bool exists;
@@ -2530,37 +2508,51 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2530 2508
2531 /* decode regex */ 2509 /* decode regex */
2532 type = filter_parse_regex(buffer, strlen(buffer), &search, &not); 2510 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2533 if (not) 2511 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
2534 return -EINVAL; 2512 return -EBUSY;
2535 2513
2536 search_len = strlen(search); 2514 search_len = strlen(search);
2537 2515
2538 mutex_lock(&ftrace_lock); 2516 mutex_lock(&ftrace_lock);
2539 do_for_each_ftrace_rec(pg, rec) { 2517 do_for_each_ftrace_rec(pg, rec) {
2540 2518
2541 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
2542 break;
2543
2544 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) 2519 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
2545 continue; 2520 continue;
2546 2521
2547 if (ftrace_match_record(rec, search, search_len, type)) { 2522 if (ftrace_match_record(rec, search, search_len, type)) {
2548 /* ensure it is not already in the array */ 2523 /* if it is in the array */
2549 exists = false; 2524 exists = false;
2550 for (i = 0; i < *idx; i++) 2525 for (i = 0; i < *idx; i++) {
2551 if (array[i] == rec->ip) { 2526 if (array[i] == rec->ip) {
2552 exists = true; 2527 exists = true;
2553 break; 2528 break;
2554 } 2529 }
2555 if (!exists) 2530 }
2556 array[(*idx)++] = rec->ip; 2531
2557 found = 1; 2532 if (!not) {
2533 fail = 0;
2534 if (!exists) {
2535 array[(*idx)++] = rec->ip;
2536 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
2537 goto out;
2538 }
2539 } else {
2540 if (exists) {
2541 array[i] = array[--(*idx)];
2542 array[*idx] = 0;
2543 fail = 0;
2544 }
2545 }
2558 } 2546 }
2559 } while_for_each_ftrace_rec(); 2547 } while_for_each_ftrace_rec();
2560 2548out:
2561 mutex_unlock(&ftrace_lock); 2549 mutex_unlock(&ftrace_lock);
2562 2550
2563 return found ? 0 : -EINVAL; 2551 if (fail)
2552 return -EINVAL;
2553
2554 ftrace_graph_filter_enabled = 1;
2555 return 0;
2564} 2556}
2565 2557
2566static ssize_t 2558static ssize_t
@@ -2570,16 +2562,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2570 struct trace_parser parser; 2562 struct trace_parser parser;
2571 ssize_t read, ret; 2563 ssize_t read, ret;
2572 2564
2573 if (!cnt || cnt < 0) 2565 if (!cnt)
2574 return 0; 2566 return 0;
2575 2567
2576 mutex_lock(&graph_lock); 2568 mutex_lock(&graph_lock);
2577 2569
2578 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
2579 ret = -EBUSY;
2580 goto out_unlock;
2581 }
2582
2583 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { 2570 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
2584 ret = -ENOMEM; 2571 ret = -ENOMEM;
2585 goto out_unlock; 2572 goto out_unlock;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index eac6875cb990..032c57ca6502 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -32,6 +32,7 @@
32#include <linux/splice.h> 32#include <linux/splice.h>
33#include <linux/kdebug.h> 33#include <linux/kdebug.h>
34#include <linux/string.h> 34#include <linux/string.h>
35#include <linux/rwsem.h>
35#include <linux/ctype.h> 36#include <linux/ctype.h>
36#include <linux/init.h> 37#include <linux/init.h>
37#include <linux/poll.h> 38#include <linux/poll.h>
@@ -102,9 +103,6 @@ static inline void ftrace_enable_cpu(void)
102 103
103static cpumask_var_t __read_mostly tracing_buffer_mask; 104static cpumask_var_t __read_mostly tracing_buffer_mask;
104 105
105/* Define which cpu buffers are currently read in trace_pipe */
106static cpumask_var_t tracing_reader_cpumask;
107
108#define for_each_tracing_cpu(cpu) \ 106#define for_each_tracing_cpu(cpu) \
109 for_each_cpu(cpu, tracing_buffer_mask) 107 for_each_cpu(cpu, tracing_buffer_mask)
110 108
@@ -243,12 +241,91 @@ static struct tracer *current_trace __read_mostly;
243 241
244/* 242/*
245 * trace_types_lock is used to protect the trace_types list. 243 * trace_types_lock is used to protect the trace_types list.
246 * This lock is also used to keep user access serialized.
247 * Accesses from userspace will grab this lock while userspace
248 * activities happen inside the kernel.
249 */ 244 */
250static DEFINE_MUTEX(trace_types_lock); 245static DEFINE_MUTEX(trace_types_lock);
251 246
247/*
248 * serialize the access of the ring buffer
249 *
250 * The ring buffer serializes readers, but that is only low-level protection.
251 * The validity of the events (as returned by ring_buffer_peek() etc.)
252 * is not protected by the ring buffer.
253 *
254 * The content of events may become garbage if we allow another process to
255 * consume these events concurrently:
256 * A) the page of the consumed events may become a normal page
257 * (not a reader page) in the ring buffer, and this page will be rewritten
258 * by the event producer.
259 * B) the page of the consumed events may become a page for splice_read,
260 * and this page will be returned to the system.
261 *
262 * These primitives allow multiple processes to access different cpu ring
263 * buffers concurrently.
264 *
265 * These primitives don't distinguish read-only and read-consume access.
266 * Multiple read-only accesses are also serialized.
267 */
268
269#ifdef CONFIG_SMP
270static DECLARE_RWSEM(all_cpu_access_lock);
271static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
272
273static inline void trace_access_lock(int cpu)
274{
275 if (cpu == TRACE_PIPE_ALL_CPU) {
276 /* gain it for accessing the whole ring buffer. */
277 down_write(&all_cpu_access_lock);
278 } else {
279 /* gain it for accessing a cpu ring buffer. */
280
281 /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */
282 down_read(&all_cpu_access_lock);
283
284 /* Secondly block other access to this @cpu ring buffer. */
285 mutex_lock(&per_cpu(cpu_access_lock, cpu));
286 }
287}
288
289static inline void trace_access_unlock(int cpu)
290{
291 if (cpu == TRACE_PIPE_ALL_CPU) {
292 up_write(&all_cpu_access_lock);
293 } else {
294 mutex_unlock(&per_cpu(cpu_access_lock, cpu));
295 up_read(&all_cpu_access_lock);
296 }
297}
298
299static inline void trace_access_lock_init(void)
300{
301 int cpu;
302
303 for_each_possible_cpu(cpu)
304 mutex_init(&per_cpu(cpu_access_lock, cpu));
305}
306
307#else
308
309static DEFINE_MUTEX(access_lock);
310
311static inline void trace_access_lock(int cpu)
312{
313 (void)cpu;
314 mutex_lock(&access_lock);
315}
316
317static inline void trace_access_unlock(int cpu)
318{
319 (void)cpu;
320 mutex_unlock(&access_lock);
321}
322
323static inline void trace_access_lock_init(void)
324{
325}
326
327#endif
328
252/* trace_wait is a waitqueue for tasks blocked on trace_poll */ 329/* trace_wait is a waitqueue for tasks blocked on trace_poll */
253static DECLARE_WAIT_QUEUE_HEAD(trace_wait); 330static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
254 331
@@ -1320,8 +1397,10 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1320 entry->fmt = fmt; 1397 entry->fmt = fmt;
1321 1398
1322 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1399 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1323 if (!filter_check_discard(call, entry, buffer, event)) 1400 if (!filter_check_discard(call, entry, buffer, event)) {
1324 ring_buffer_unlock_commit(buffer, event); 1401 ring_buffer_unlock_commit(buffer, event);
1402 ftrace_trace_stack(buffer, flags, 6, pc);
1403 }
1325 1404
1326out_unlock: 1405out_unlock:
1327 arch_spin_unlock(&trace_buf_lock); 1406 arch_spin_unlock(&trace_buf_lock);
@@ -1394,8 +1473,10 @@ int trace_array_vprintk(struct trace_array *tr,
1394 1473
1395 memcpy(&entry->buf, trace_buf, len); 1474 memcpy(&entry->buf, trace_buf, len);
1396 entry->buf[len] = '\0'; 1475 entry->buf[len] = '\0';
1397 if (!filter_check_discard(call, entry, buffer, event)) 1476 if (!filter_check_discard(call, entry, buffer, event)) {
1398 ring_buffer_unlock_commit(buffer, event); 1477 ring_buffer_unlock_commit(buffer, event);
1478 ftrace_trace_stack(buffer, irq_flags, 6, pc);
1479 }
1399 1480
1400 out_unlock: 1481 out_unlock:
1401 arch_spin_unlock(&trace_buf_lock); 1482 arch_spin_unlock(&trace_buf_lock);
@@ -1585,12 +1666,6 @@ static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1585} 1666}
1586 1667
1587/* 1668/*
1588 * No necessary locking here. The worst thing which can
1589 * happen is loosing events consumed at the same time
1590 * by a trace_pipe reader.
1591 * Other than that, we don't risk to crash the ring buffer
1592 * because it serializes the readers.
1593 *
1594 * The current tracer is copied to avoid a global locking 1669 * The current tracer is copied to avoid a global locking
1595 * all around. 1670 * all around.
1596 */ 1671 */
@@ -1645,12 +1720,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1645 } 1720 }
1646 1721
1647 trace_event_read_lock(); 1722 trace_event_read_lock();
1723 trace_access_lock(cpu_file);
1648 return p; 1724 return p;
1649} 1725}
1650 1726
1651static void s_stop(struct seq_file *m, void *p) 1727static void s_stop(struct seq_file *m, void *p)
1652{ 1728{
1729 struct trace_iterator *iter = m->private;
1730
1653 atomic_dec(&trace_record_cmdline_disabled); 1731 atomic_dec(&trace_record_cmdline_disabled);
1732 trace_access_unlock(iter->cpu_file);
1654 trace_event_read_unlock(); 1733 trace_event_read_unlock();
1655} 1734}
1656 1735
@@ -2841,22 +2920,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
2841 2920
2842 mutex_lock(&trace_types_lock); 2921 mutex_lock(&trace_types_lock);
2843 2922
2844 /* We only allow one reader per cpu */
2845 if (cpu_file == TRACE_PIPE_ALL_CPU) {
2846 if (!cpumask_empty(tracing_reader_cpumask)) {
2847 ret = -EBUSY;
2848 goto out;
2849 }
2850 cpumask_setall(tracing_reader_cpumask);
2851 } else {
2852 if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask))
2853 cpumask_set_cpu(cpu_file, tracing_reader_cpumask);
2854 else {
2855 ret = -EBUSY;
2856 goto out;
2857 }
2858 }
2859
2860 /* create a buffer to store the information to pass to userspace */ 2923 /* create a buffer to store the information to pass to userspace */
2861 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2924 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2862 if (!iter) { 2925 if (!iter) {
@@ -2912,12 +2975,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
2912 2975
2913 mutex_lock(&trace_types_lock); 2976 mutex_lock(&trace_types_lock);
2914 2977
2915 if (iter->cpu_file == TRACE_PIPE_ALL_CPU)
2916 cpumask_clear(tracing_reader_cpumask);
2917 else
2918 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
2919
2920
2921 if (iter->trace->pipe_close) 2978 if (iter->trace->pipe_close)
2922 iter->trace->pipe_close(iter); 2979 iter->trace->pipe_close(iter);
2923 2980
@@ -3079,6 +3136,7 @@ waitagain:
3079 iter->pos = -1; 3136 iter->pos = -1;
3080 3137
3081 trace_event_read_lock(); 3138 trace_event_read_lock();
3139 trace_access_lock(iter->cpu_file);
3082 while (find_next_entry_inc(iter) != NULL) { 3140 while (find_next_entry_inc(iter) != NULL) {
3083 enum print_line_t ret; 3141 enum print_line_t ret;
3084 int len = iter->seq.len; 3142 int len = iter->seq.len;
@@ -3095,6 +3153,7 @@ waitagain:
3095 if (iter->seq.len >= cnt) 3153 if (iter->seq.len >= cnt)
3096 break; 3154 break;
3097 } 3155 }
3156 trace_access_unlock(iter->cpu_file);
3098 trace_event_read_unlock(); 3157 trace_event_read_unlock();
3099 3158
3100 /* Now copy what we have to the user */ 3159 /* Now copy what we have to the user */
@@ -3220,6 +3279,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3220 } 3279 }
3221 3280
3222 trace_event_read_lock(); 3281 trace_event_read_lock();
3282 trace_access_lock(iter->cpu_file);
3223 3283
3224 /* Fill as many pages as possible. */ 3284 /* Fill as many pages as possible. */
3225 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { 3285 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
@@ -3243,6 +3303,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3243 trace_seq_init(&iter->seq); 3303 trace_seq_init(&iter->seq);
3244 } 3304 }
3245 3305
3306 trace_access_unlock(iter->cpu_file);
3246 trace_event_read_unlock(); 3307 trace_event_read_unlock();
3247 mutex_unlock(&iter->mutex); 3308 mutex_unlock(&iter->mutex);
3248 3309
@@ -3544,10 +3605,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3544 3605
3545 info->read = 0; 3606 info->read = 0;
3546 3607
3608 trace_access_lock(info->cpu);
3547 ret = ring_buffer_read_page(info->tr->buffer, 3609 ret = ring_buffer_read_page(info->tr->buffer,
3548 &info->spare, 3610 &info->spare,
3549 count, 3611 count,
3550 info->cpu, 0); 3612 info->cpu, 0);
3613 trace_access_unlock(info->cpu);
3551 if (ret < 0) 3614 if (ret < 0)
3552 return 0; 3615 return 0;
3553 3616
@@ -3675,6 +3738,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3675 len &= PAGE_MASK; 3738 len &= PAGE_MASK;
3676 } 3739 }
3677 3740
3741 trace_access_lock(info->cpu);
3678 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3742 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3679 3743
3680 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { 3744 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
@@ -3722,6 +3786,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3722 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3786 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3723 } 3787 }
3724 3788
3789 trace_access_unlock(info->cpu);
3725 spd.nr_pages = i; 3790 spd.nr_pages = i;
3726 3791
3727 /* did we read anything? */ 3792 /* did we read anything? */
@@ -4158,6 +4223,8 @@ static __init int tracer_init_debugfs(void)
4158 struct dentry *d_tracer; 4223 struct dentry *d_tracer;
4159 int cpu; 4224 int cpu;
4160 4225
4226 trace_access_lock_init();
4227
4161 d_tracer = tracing_init_dentry(); 4228 d_tracer = tracing_init_dentry();
4162 4229
4163 trace_create_file("tracing_enabled", 0644, d_tracer, 4230 trace_create_file("tracing_enabled", 0644, d_tracer,
@@ -4392,9 +4459,6 @@ __init static int tracer_alloc_buffers(void)
4392 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 4459 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
4393 goto out_free_buffer_mask; 4460 goto out_free_buffer_mask;
4394 4461
4395 if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
4396 goto out_free_tracing_cpumask;
4397
4398 /* To save memory, keep the ring buffer size to its minimum */ 4462 /* To save memory, keep the ring buffer size to its minimum */
4399 if (ring_buffer_expanded) 4463 if (ring_buffer_expanded)
4400 ring_buf_size = trace_buf_size; 4464 ring_buf_size = trace_buf_size;
@@ -4452,8 +4516,6 @@ __init static int tracer_alloc_buffers(void)
4452 return 0; 4516 return 0;
4453 4517
4454out_free_cpumask: 4518out_free_cpumask:
4455 free_cpumask_var(tracing_reader_cpumask);
4456out_free_tracing_cpumask:
4457 free_cpumask_var(tracing_cpumask); 4519 free_cpumask_var(tracing_cpumask);
4458out_free_buffer_mask: 4520out_free_buffer_mask:
4459 free_cpumask_var(tracing_buffer_mask); 4521 free_cpumask_var(tracing_buffer_mask);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4df6a77eb196..fd05bcaf91b0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -497,6 +497,7 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
497#ifdef CONFIG_DYNAMIC_FTRACE 497#ifdef CONFIG_DYNAMIC_FTRACE
498/* TODO: make this variable */ 498/* TODO: make this variable */
499#define FTRACE_GRAPH_MAX_FUNCS 32 499#define FTRACE_GRAPH_MAX_FUNCS 32
500extern int ftrace_graph_filter_enabled;
500extern int ftrace_graph_count; 501extern int ftrace_graph_count;
501extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; 502extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
502 503
@@ -504,7 +505,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
504{ 505{
505 int i; 506 int i;
506 507
507 if (!ftrace_graph_count || test_tsk_trace_graph(current)) 508 if (!ftrace_graph_filter_enabled)
508 return 1; 509 return 1;
509 510
510 for (i = 0; i < ftrace_graph_count; i++) { 511 for (i = 0; i < ftrace_graph_count; i++) {
@@ -791,7 +792,8 @@ extern const char *__stop___trace_bprintk_fmt[];
791 792
792#undef FTRACE_ENTRY 793#undef FTRACE_ENTRY
793#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ 794#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
794 extern struct ftrace_event_call event_##call; 795 extern struct ftrace_event_call \
796 __attribute__((__aligned__(4))) event_##call;
795#undef FTRACE_ENTRY_DUP 797#undef FTRACE_ENTRY_DUP
796#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ 798#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
797 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 799 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 4a194f08f88c..b9bc4d470177 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -307,8 +307,23 @@ static int annotated_branch_stat_cmp(void *p1, void *p2)
307 return -1; 307 return -1;
308 if (percent_a > percent_b) 308 if (percent_a > percent_b)
309 return 1; 309 return 1;
310 else 310
311 return 0; 311 if (a->incorrect < b->incorrect)
312 return -1;
313 if (a->incorrect > b->incorrect)
314 return 1;
315
316 /*
317 * Since the above shows worse (incorrect) cases
318 * first, we continue that by showing best (correct)
319 * cases last.
320 */
321 if (a->correct > b->correct)
322 return -1;
323 if (a->correct < b->correct)
324 return 1;
325
326 return 0;
312} 327}
313 328
314static struct tracer_stat annotated_branch_stats = { 329static struct tracer_stat annotated_branch_stats = {
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 9e25573242cf..f0d693005075 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -6,14 +6,12 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/kprobes.h>
9#include "trace.h" 10#include "trace.h"
10 11
11 12
12char *perf_trace_buf; 13static char *perf_trace_buf;
13EXPORT_SYMBOL_GPL(perf_trace_buf); 14static char *perf_trace_buf_nmi;
14
15char *perf_trace_buf_nmi;
16EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
17 15
18typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; 16typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
19 17
@@ -120,3 +118,47 @@ void ftrace_profile_disable(int event_id)
120 } 118 }
121 mutex_unlock(&event_mutex); 119 mutex_unlock(&event_mutex);
122} 120}
121
122__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
123 int *rctxp, unsigned long *irq_flags)
124{
125 struct trace_entry *entry;
126 char *trace_buf, *raw_data;
127 int pc, cpu;
128
129 pc = preempt_count();
130
131 /* Protect the per cpu buffer, begin the rcu read side */
132 local_irq_save(*irq_flags);
133
134 *rctxp = perf_swevent_get_recursion_context();
135 if (*rctxp < 0)
136 goto err_recursion;
137
138 cpu = smp_processor_id();
139
140 if (in_nmi())
141 trace_buf = rcu_dereference(perf_trace_buf_nmi);
142 else
143 trace_buf = rcu_dereference(perf_trace_buf);
144
145 if (!trace_buf)
146 goto err;
147
148 raw_data = per_cpu_ptr(trace_buf, cpu);
149
150 /* zero the dead bytes from align to not leak stack to user */
151 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
152
153 entry = (struct trace_entry *)raw_data;
154 tracing_generic_entry_update(entry, *irq_flags, pc);
155 entry->type = type;
156
157 return raw_data;
158err:
159 perf_swevent_put_recursion_context(*rctxp);
160err_recursion:
161 local_irq_restore(*irq_flags);
162 return NULL;
163}
164EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 189b09baf4fb..3f972ad98d04 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -60,10 +60,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
60 return 0; 60 return 0;
61 61
62err: 62err:
63 if (field) { 63 if (field)
64 kfree(field->name); 64 kfree(field->name);
65 kfree(field->type);
66 }
67 kfree(field); 65 kfree(field);
68 66
69 return -ENOMEM; 67 return -ENOMEM;
@@ -520,41 +518,16 @@ out:
520 return ret; 518 return ret;
521} 519}
522 520
523extern char *__bad_type_size(void);
524
525#undef FIELD
526#define FIELD(type, name) \
527 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
528 #type, "common_" #name, offsetof(typeof(field), name), \
529 sizeof(field.name), is_signed_type(type)
530
531static int trace_write_header(struct trace_seq *s)
532{
533 struct trace_entry field;
534
535 /* struct trace_entry */
536 return trace_seq_printf(s,
537 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
538 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
539 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
540 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
541 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
542 "\n",
543 FIELD(unsigned short, type),
544 FIELD(unsigned char, flags),
545 FIELD(unsigned char, preempt_count),
546 FIELD(int, pid),
547 FIELD(int, lock_depth));
548}
549
550static ssize_t 521static ssize_t
551event_format_read(struct file *filp, char __user *ubuf, size_t cnt, 522event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
552 loff_t *ppos) 523 loff_t *ppos)
553{ 524{
554 struct ftrace_event_call *call = filp->private_data; 525 struct ftrace_event_call *call = filp->private_data;
526 struct ftrace_event_field *field;
555 struct trace_seq *s; 527 struct trace_seq *s;
528 int common_field_count = 5;
556 char *buf; 529 char *buf;
557 int r; 530 int r = 0;
558 531
559 if (*ppos) 532 if (*ppos)
560 return 0; 533 return 0;
@@ -565,14 +538,48 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
565 538
566 trace_seq_init(s); 539 trace_seq_init(s);
567 540
568 /* If any of the first writes fail, so will the show_format. */
569
570 trace_seq_printf(s, "name: %s\n", call->name); 541 trace_seq_printf(s, "name: %s\n", call->name);
571 trace_seq_printf(s, "ID: %d\n", call->id); 542 trace_seq_printf(s, "ID: %d\n", call->id);
572 trace_seq_printf(s, "format:\n"); 543 trace_seq_printf(s, "format:\n");
573 trace_write_header(s);
574 544
575 r = call->show_format(call, s); 545 list_for_each_entry_reverse(field, &call->fields, link) {
546 /*
547 * Smartly shows the array type(except dynamic array).
548 * Normal:
549 * field:TYPE VAR
550 * If TYPE := TYPE[LEN], it is shown:
551 * field:TYPE VAR[LEN]
552 */
553 const char *array_descriptor = strchr(field->type, '[');
554
555 if (!strncmp(field->type, "__data_loc", 10))
556 array_descriptor = NULL;
557
558 if (!array_descriptor) {
559 r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;"
560 "\tsize:%u;\tsigned:%d;\n",
561 field->type, field->name, field->offset,
562 field->size, !!field->is_signed);
563 } else {
564 r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;"
565 "\tsize:%u;\tsigned:%d;\n",
566 (int)(array_descriptor - field->type),
567 field->type, field->name,
568 array_descriptor, field->offset,
569 field->size, !!field->is_signed);
570 }
571
572 if (--common_field_count == 0)
573 r = trace_seq_printf(s, "\n");
574
575 if (!r)
576 break;
577 }
578
579 if (r)
580 r = trace_seq_printf(s, "\nprint fmt: %s\n",
581 call->print_fmt);
582
576 if (!r) { 583 if (!r) {
577 /* 584 /*
578 * ug! The format output is bigger than a PAGE!! 585 * ug! The format output is bigger than a PAGE!!
@@ -948,10 +955,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
948 filter); 955 filter);
949 } 956 }
950 957
951 /* A trace may not want to export its format */
952 if (!call->show_format)
953 return 0;
954
955 trace_create_file("format", 0444, call->dir, call, 958 trace_create_file("format", 0444, call->dir, call,
956 format); 959 format);
957 960
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e42af9aad69f..4615f62a04f1 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1371,7 +1371,7 @@ out_unlock:
1371 return err; 1371 return err;
1372} 1372}
1373 1373
1374#ifdef CONFIG_EVENT_PROFILE 1374#ifdef CONFIG_PERF_EVENTS
1375 1375
1376void ftrace_profile_free_filter(struct perf_event *event) 1376void ftrace_profile_free_filter(struct perf_event *event)
1377{ 1377{
@@ -1439,5 +1439,5 @@ out_unlock:
1439 return err; 1439 return err;
1440} 1440}
1441 1441
1442#endif /* CONFIG_EVENT_PROFILE */ 1442#endif /* CONFIG_PERF_EVENTS */
1443 1443
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d4fa5dc1ee4e..e091f64ba6ce 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -62,78 +62,6 @@ static void __always_unused ____ftrace_check_##name(void) \
62 62
63#include "trace_entries.h" 63#include "trace_entries.h"
64 64
65
66#undef __field
67#define __field(type, item) \
68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
69 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
70 offsetof(typeof(field), item), \
71 sizeof(field.item), is_signed_type(type)); \
72 if (!ret) \
73 return 0;
74
75#undef __field_desc
76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item), \
81 is_signed_type(type)); \
82 if (!ret) \
83 return 0;
84
85#undef __array
86#define __array(type, item, len) \
87 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
88 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
89 offsetof(typeof(field), item), \
90 sizeof(field.item), is_signed_type(type)); \
91 if (!ret) \
92 return 0;
93
94#undef __array_desc
95#define __array_desc(type, container, item, len) \
96 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
97 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
98 offsetof(typeof(field), container.item), \
99 sizeof(field.container.item), \
100 is_signed_type(type)); \
101 if (!ret) \
102 return 0;
103
104#undef __dynamic_array
105#define __dynamic_array(type, item) \
106 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
107 "offset:%zu;\tsize:0;\tsigned:%u;\n", \
108 offsetof(typeof(field), item), \
109 is_signed_type(type)); \
110 if (!ret) \
111 return 0;
112
113#undef F_printk
114#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
115
116#undef __entry
117#define __entry REC
118
119#undef FTRACE_ENTRY
120#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
121static int \
122ftrace_format_##name(struct ftrace_event_call *unused, \
123 struct trace_seq *s) \
124{ \
125 struct struct_name field __attribute__((unused)); \
126 int ret = 0; \
127 \
128 tstruct; \
129 \
130 trace_seq_printf(s, "\nprint fmt: " print); \
131 \
132 return ret; \
133}
134
135#include "trace_entries.h"
136
137#undef __field 65#undef __field
138#define __field(type, item) \ 66#define __field(type, item) \
139 ret = trace_define_field(event_call, #type, #item, \ 67 ret = trace_define_field(event_call, #type, #item, \
@@ -175,7 +103,12 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
175 return ret; 103 return ret;
176 104
177#undef __dynamic_array 105#undef __dynamic_array
178#define __dynamic_array(type, item) 106#define __dynamic_array(type, item) \
107 ret = trace_define_field(event_call, #type, #item, \
108 offsetof(typeof(field), item), \
109 0, is_signed_type(type), FILTER_OTHER);\
110 if (ret) \
111 return ret;
179 112
180#undef FTRACE_ENTRY 113#undef FTRACE_ENTRY
181#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ 114#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
@@ -198,6 +131,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
198 return 0; 131 return 0;
199} 132}
200 133
134#undef __entry
135#define __entry REC
136
201#undef __field 137#undef __field
202#define __field(type, item) 138#define __field(type, item)
203 139
@@ -213,6 +149,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
213#undef __dynamic_array 149#undef __dynamic_array
214#define __dynamic_array(type, item) 150#define __dynamic_array(type, item)
215 151
152#undef F_printk
153#define F_printk(fmt, args...) #fmt ", " __stringify(args)
154
216#undef FTRACE_ENTRY 155#undef FTRACE_ENTRY
217#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ 156#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
218 \ 157 \
@@ -223,7 +162,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
223 .id = type, \ 162 .id = type, \
224 .system = __stringify(TRACE_SYSTEM), \ 163 .system = __stringify(TRACE_SYSTEM), \
225 .raw_init = ftrace_raw_init_event, \ 164 .raw_init = ftrace_raw_init_event, \
226 .show_format = ftrace_format_##call, \ 165 .print_fmt = print, \
227 .define_fields = ftrace_define_fields_##call, \ 166 .define_fields = ftrace_define_fields_##call, \
228}; \ 167}; \
229 168
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b1342c5d37cf..e998a824e9db 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -18,6 +18,7 @@ struct fgraph_cpu_data {
18 pid_t last_pid; 18 pid_t last_pid;
19 int depth; 19 int depth;
20 int ignore; 20 int ignore;
21 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
21}; 22};
22 23
23struct fgraph_data { 24struct fgraph_data {
@@ -212,13 +213,11 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
212 int cpu; 213 int cpu;
213 int pc; 214 int pc;
214 215
215 if (unlikely(!tr))
216 return 0;
217
218 if (!ftrace_trace_task(current)) 216 if (!ftrace_trace_task(current))
219 return 0; 217 return 0;
220 218
221 if (!ftrace_graph_addr(trace->func)) 219 /* trace it when it is-nested-in or is a function enabled. */
220 if (!(trace->depth || ftrace_graph_addr(trace->func)))
222 return 0; 221 return 0;
223 222
224 local_irq_save(flags); 223 local_irq_save(flags);
@@ -231,9 +230,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
231 } else { 230 } else {
232 ret = 0; 231 ret = 0;
233 } 232 }
234 /* Only do the atomic if it is not already set */
235 if (!test_tsk_trace_graph(current))
236 set_tsk_trace_graph(current);
237 233
238 atomic_dec(&data->disabled); 234 atomic_dec(&data->disabled);
239 local_irq_restore(flags); 235 local_irq_restore(flags);
@@ -281,17 +277,24 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
281 pc = preempt_count(); 277 pc = preempt_count();
282 __trace_graph_return(tr, trace, flags, pc); 278 __trace_graph_return(tr, trace, flags, pc);
283 } 279 }
284 if (!trace->depth)
285 clear_tsk_trace_graph(current);
286 atomic_dec(&data->disabled); 280 atomic_dec(&data->disabled);
287 local_irq_restore(flags); 281 local_irq_restore(flags);
288} 282}
289 283
284void set_graph_array(struct trace_array *tr)
285{
286 graph_array = tr;
287
288 /* Make graph_array visible before we start tracing */
289
290 smp_mb();
291}
292
290static int graph_trace_init(struct trace_array *tr) 293static int graph_trace_init(struct trace_array *tr)
291{ 294{
292 int ret; 295 int ret;
293 296
294 graph_array = tr; 297 set_graph_array(tr);
295 ret = register_ftrace_graph(&trace_graph_return, 298 ret = register_ftrace_graph(&trace_graph_return,
296 &trace_graph_entry); 299 &trace_graph_entry);
297 if (ret) 300 if (ret)
@@ -301,11 +304,6 @@ static int graph_trace_init(struct trace_array *tr)
301 return 0; 304 return 0;
302} 305}
303 306
304void set_graph_array(struct trace_array *tr)
305{
306 graph_array = tr;
307}
308
309static void graph_trace_reset(struct trace_array *tr) 307static void graph_trace_reset(struct trace_array *tr)
310{ 308{
311 tracing_stop_cmdline_record(); 309 tracing_stop_cmdline_record();
@@ -673,15 +671,21 @@ print_graph_entry_leaf(struct trace_iterator *iter,
673 duration = graph_ret->rettime - graph_ret->calltime; 671 duration = graph_ret->rettime - graph_ret->calltime;
674 672
675 if (data) { 673 if (data) {
674 struct fgraph_cpu_data *cpu_data;
676 int cpu = iter->cpu; 675 int cpu = iter->cpu;
677 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 676
677 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
678 678
679 /* 679 /*
680 * Comments display at + 1 to depth. Since 680 * Comments display at + 1 to depth. Since
681 * this is a leaf function, keep the comments 681 * this is a leaf function, keep the comments
682 * equal to this depth. 682 * equal to this depth.
683 */ 683 */
684 *depth = call->depth - 1; 684 cpu_data->depth = call->depth - 1;
685
686 /* No need to keep this function around for this depth */
687 if (call->depth < FTRACE_RETFUNC_DEPTH)
688 cpu_data->enter_funcs[call->depth] = 0;
685 } 689 }
686 690
687 /* Overhead */ 691 /* Overhead */
@@ -721,10 +725,15 @@ print_graph_entry_nested(struct trace_iterator *iter,
721 int i; 725 int i;
722 726
723 if (data) { 727 if (data) {
728 struct fgraph_cpu_data *cpu_data;
724 int cpu = iter->cpu; 729 int cpu = iter->cpu;
725 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
726 730
727 *depth = call->depth; 731 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
732 cpu_data->depth = call->depth;
733
734 /* Save this function pointer to see if the exit matches */
735 if (call->depth < FTRACE_RETFUNC_DEPTH)
736 cpu_data->enter_funcs[call->depth] = call->func;
728 } 737 }
729 738
730 /* No overhead */ 739 /* No overhead */
@@ -854,19 +863,28 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
854 struct fgraph_data *data = iter->private; 863 struct fgraph_data *data = iter->private;
855 pid_t pid = ent->pid; 864 pid_t pid = ent->pid;
856 int cpu = iter->cpu; 865 int cpu = iter->cpu;
866 int func_match = 1;
857 int ret; 867 int ret;
858 int i; 868 int i;
859 869
860 if (data) { 870 if (data) {
871 struct fgraph_cpu_data *cpu_data;
861 int cpu = iter->cpu; 872 int cpu = iter->cpu;
862 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 873
874 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
863 875
864 /* 876 /*
865 * Comments display at + 1 to depth. This is the 877 * Comments display at + 1 to depth. This is the
866 * return from a function, we now want the comments 878 * return from a function, we now want the comments
867 * to display at the same level of the bracket. 879 * to display at the same level of the bracket.
868 */ 880 */
869 *depth = trace->depth - 1; 881 cpu_data->depth = trace->depth - 1;
882
883 if (trace->depth < FTRACE_RETFUNC_DEPTH) {
884 if (cpu_data->enter_funcs[trace->depth] != trace->func)
885 func_match = 0;
886 cpu_data->enter_funcs[trace->depth] = 0;
887 }
870 } 888 }
871 889
872 if (print_graph_prologue(iter, s, 0, 0)) 890 if (print_graph_prologue(iter, s, 0, 0))
@@ -891,9 +909,21 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
891 return TRACE_TYPE_PARTIAL_LINE; 909 return TRACE_TYPE_PARTIAL_LINE;
892 } 910 }
893 911
894 ret = trace_seq_printf(s, "}\n"); 912 /*
895 if (!ret) 913 * If the return function does not have a matching entry,
896 return TRACE_TYPE_PARTIAL_LINE; 914 * then the entry was lost. Instead of just printing
915 * the '}' and letting the user guess what function this
916 * belongs to, write out the function name.
917 */
918 if (func_match) {
919 ret = trace_seq_printf(s, "}\n");
920 if (!ret)
921 return TRACE_TYPE_PARTIAL_LINE;
922 } else {
923 ret = trace_seq_printf(s, "} (%ps)\n", (void *)trace->func);
924 if (!ret)
925 return TRACE_TYPE_PARTIAL_LINE;
926 }
897 927
898 /* Overrun */ 928 /* Overrun */
899 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { 929 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 50b1b8239806..505c92273b1a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -91,11 +91,6 @@ static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
91 return retval; 91 return retval;
92} 92}
93 93
94static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num)
95{
96 return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num));
97}
98
99static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, 94static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
100 void *dummy) 95 void *dummy)
101{ 96{
@@ -231,9 +226,7 @@ static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
231{ 226{
232 int ret = -EINVAL; 227 int ret = -EINVAL;
233 228
234 if (ff->func == fetch_argument) 229 if (ff->func == fetch_register) {
235 ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data);
236 else if (ff->func == fetch_register) {
237 const char *name; 230 const char *name;
238 name = regs_query_register_name((unsigned int)((long)ff->data)); 231 name = regs_query_register_name((unsigned int)((long)ff->data));
239 ret = snprintf(buf, n, "%%%s", name); 232 ret = snprintf(buf, n, "%%%s", name);
@@ -489,14 +482,6 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
489 } 482 }
490 } else 483 } else
491 ret = -EINVAL; 484 ret = -EINVAL;
492 } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) {
493 ret = strict_strtoul(arg + 3, 10, &param);
494 if (ret || param > PARAM_MAX_ARGS)
495 ret = -EINVAL;
496 else {
497 ff->func = fetch_argument;
498 ff->data = (void *)param;
499 }
500 } else 485 } else
501 ret = -EINVAL; 486 ret = -EINVAL;
502 return ret; 487 return ret;
@@ -611,7 +596,6 @@ static int create_trace_probe(int argc, char **argv)
611 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] 596 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
612 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] 597 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
613 * Fetch args: 598 * Fetch args:
614 * $argN : fetch Nth of function argument. (N:0-)
615 * $retval : fetch return value 599 * $retval : fetch return value
616 * $stack : fetch stack address 600 * $stack : fetch stack address
617 * $stackN : fetch Nth of stack (N:0-) 601 * $stackN : fetch Nth of stack (N:0-)
@@ -651,12 +635,12 @@ static int create_trace_probe(int argc, char **argv)
651 event = strchr(group, '/') + 1; 635 event = strchr(group, '/') + 1;
652 event[-1] = '\0'; 636 event[-1] = '\0';
653 if (strlen(group) == 0) { 637 if (strlen(group) == 0) {
654 pr_info("Group name is not specifiled\n"); 638 pr_info("Group name is not specified\n");
655 return -EINVAL; 639 return -EINVAL;
656 } 640 }
657 } 641 }
658 if (strlen(event) == 0) { 642 if (strlen(event) == 0) {
659 pr_info("Event name is not specifiled\n"); 643 pr_info("Event name is not specified\n");
660 return -EINVAL; 644 return -EINVAL;
661 } 645 }
662 } 646 }
@@ -958,7 +942,7 @@ static const struct file_operations kprobe_profile_ops = {
958}; 942};
959 943
960/* Kprobe handler */ 944/* Kprobe handler */
961static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) 945static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
962{ 946{
963 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 947 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
964 struct kprobe_trace_entry *entry; 948 struct kprobe_trace_entry *entry;
@@ -978,7 +962,7 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
978 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 962 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
979 irq_flags, pc); 963 irq_flags, pc);
980 if (!event) 964 if (!event)
981 return 0; 965 return;
982 966
983 entry = ring_buffer_event_data(event); 967 entry = ring_buffer_event_data(event);
984 entry->nargs = tp->nr_args; 968 entry->nargs = tp->nr_args;
@@ -988,11 +972,10 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
988 972
989 if (!filter_current_check_discard(buffer, call, entry, event)) 973 if (!filter_current_check_discard(buffer, call, entry, event))
990 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 974 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
991 return 0;
992} 975}
993 976
994/* Kretprobe handler */ 977/* Kretprobe handler */
995static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, 978static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
996 struct pt_regs *regs) 979 struct pt_regs *regs)
997{ 980{
998 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 981 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -1011,7 +994,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
1011 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 994 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
1012 irq_flags, pc); 995 irq_flags, pc);
1013 if (!event) 996 if (!event)
1014 return 0; 997 return;
1015 998
1016 entry = ring_buffer_event_data(event); 999 entry = ring_buffer_event_data(event);
1017 entry->nargs = tp->nr_args; 1000 entry->nargs = tp->nr_args;
@@ -1022,8 +1005,6 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
1022 1005
1023 if (!filter_current_check_discard(buffer, call, entry, event)) 1006 if (!filter_current_check_discard(buffer, call, entry, event))
1024 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1007 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
1025
1026 return 0;
1027} 1008}
1028 1009
1029/* Event entry printers */ 1010/* Event entry printers */
@@ -1174,213 +1155,123 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1174 return 0; 1155 return 0;
1175} 1156}
1176 1157
1177static int __probe_event_show_format(struct trace_seq *s, 1158static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1178 struct trace_probe *tp, const char *fmt,
1179 const char *arg)
1180{ 1159{
1181 int i; 1160 int i;
1161 int pos = 0;
1182 1162
1183 /* Show format */ 1163 const char *fmt, *arg;
1184 if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
1185 return 0;
1186 1164
1187 for (i = 0; i < tp->nr_args; i++) 1165 if (!probe_is_return(tp)) {
1188 if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) 1166 fmt = "(%lx)";
1189 return 0; 1167 arg = "REC->" FIELD_STRING_IP;
1168 } else {
1169 fmt = "(%lx <- %lx)";
1170 arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
1171 }
1190 1172
1191 if (!trace_seq_printf(s, "\", %s", arg)) 1173 /* When len=0, we just calculate the needed length */
1192 return 0; 1174#define LEN_OR_ZERO (len ? len - pos : 0)
1193 1175
1194 for (i = 0; i < tp->nr_args; i++) 1176 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
1195 if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name))
1196 return 0;
1197
1198 return trace_seq_puts(s, "\n");
1199}
1200 1177
1201#undef SHOW_FIELD 1178 for (i = 0; i < tp->nr_args; i++) {
1202#define SHOW_FIELD(type, item, name) \ 1179 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx",
1203 do { \ 1180 tp->args[i].name);
1204 ret = trace_seq_printf(s, "\tfield:" #type " %s;\t" \ 1181 }
1205 "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\
1206 (unsigned int)offsetof(typeof(field), item),\
1207 (unsigned int)sizeof(type), \
1208 is_signed_type(type)); \
1209 if (!ret) \
1210 return 0; \
1211 } while (0)
1212 1182
1213static int kprobe_event_show_format(struct ftrace_event_call *call, 1183 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
1214 struct trace_seq *s)
1215{
1216 struct kprobe_trace_entry field __attribute__((unused));
1217 int ret, i;
1218 struct trace_probe *tp = (struct trace_probe *)call->data;
1219 1184
1220 SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP); 1185 for (i = 0; i < tp->nr_args; i++) {
1221 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); 1186 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
1187 tp->args[i].name);
1188 }
1222 1189
1223 /* Show fields */ 1190#undef LEN_OR_ZERO
1224 for (i = 0; i < tp->nr_args; i++)
1225 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1226 trace_seq_puts(s, "\n");
1227 1191
1228 return __probe_event_show_format(s, tp, "(%lx)", 1192 /* return the length of print_fmt */
1229 "REC->" FIELD_STRING_IP); 1193 return pos;
1230} 1194}
1231 1195
1232static int kretprobe_event_show_format(struct ftrace_event_call *call, 1196static int set_print_fmt(struct trace_probe *tp)
1233 struct trace_seq *s)
1234{ 1197{
1235 struct kretprobe_trace_entry field __attribute__((unused)); 1198 int len;
1236 int ret, i; 1199 char *print_fmt;
1237 struct trace_probe *tp = (struct trace_probe *)call->data;
1238 1200
1239 SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); 1201 /* First: called with 0 length to calculate the needed length */
1240 SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); 1202 len = __set_print_fmt(tp, NULL, 0);
1241 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); 1203 print_fmt = kmalloc(len + 1, GFP_KERNEL);
1204 if (!print_fmt)
1205 return -ENOMEM;
1242 1206
1243 /* Show fields */ 1207 /* Second: actually write the @print_fmt */
1244 for (i = 0; i < tp->nr_args; i++) 1208 __set_print_fmt(tp, print_fmt, len + 1);
1245 SHOW_FIELD(unsigned long, args[i], tp->args[i].name); 1209 tp->call.print_fmt = print_fmt;
1246 trace_seq_puts(s, "\n");
1247 1210
1248 return __probe_event_show_format(s, tp, "(%lx <- %lx)", 1211 return 0;
1249 "REC->" FIELD_STRING_FUNC
1250 ", REC->" FIELD_STRING_RETIP);
1251} 1212}
1252 1213
1253#ifdef CONFIG_EVENT_PROFILE 1214#ifdef CONFIG_PERF_EVENTS
1254 1215
1255/* Kprobe profile handler */ 1216/* Kprobe profile handler */
1256static __kprobes int kprobe_profile_func(struct kprobe *kp, 1217static __kprobes void kprobe_profile_func(struct kprobe *kp,
1257 struct pt_regs *regs) 1218 struct pt_regs *regs)
1258{ 1219{
1259 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1220 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1260 struct ftrace_event_call *call = &tp->call; 1221 struct ftrace_event_call *call = &tp->call;
1261 struct kprobe_trace_entry *entry; 1222 struct kprobe_trace_entry *entry;
1262 struct trace_entry *ent; 1223 int size, __size, i;
1263 int size, __size, i, pc, __cpu;
1264 unsigned long irq_flags; 1224 unsigned long irq_flags;
1265 char *trace_buf;
1266 char *raw_data;
1267 int rctx; 1225 int rctx;
1268 1226
1269 pc = preempt_count();
1270 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1227 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
1271 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1228 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1272 size -= sizeof(u32); 1229 size -= sizeof(u32);
1273 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1230 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1274 "profile buffer not large enough")) 1231 "profile buffer not large enough"))
1275 return 0; 1232 return;
1276
1277 /*
1278 * Protect the non nmi buffer
1279 * This also protects the rcu read side
1280 */
1281 local_irq_save(irq_flags);
1282
1283 rctx = perf_swevent_get_recursion_context();
1284 if (rctx < 0)
1285 goto end_recursion;
1286
1287 __cpu = smp_processor_id();
1288
1289 if (in_nmi())
1290 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1291 else
1292 trace_buf = rcu_dereference(perf_trace_buf);
1293 1233
1294 if (!trace_buf) 1234 entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
1295 goto end; 1235 if (!entry)
1296 1236 return;
1297 raw_data = per_cpu_ptr(trace_buf, __cpu);
1298
1299 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1300 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1301 entry = (struct kprobe_trace_entry *)raw_data;
1302 ent = &entry->ent;
1303 1237
1304 tracing_generic_entry_update(ent, irq_flags, pc);
1305 ent->type = call->id;
1306 entry->nargs = tp->nr_args; 1238 entry->nargs = tp->nr_args;
1307 entry->ip = (unsigned long)kp->addr; 1239 entry->ip = (unsigned long)kp->addr;
1308 for (i = 0; i < tp->nr_args; i++) 1240 for (i = 0; i < tp->nr_args; i++)
1309 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1241 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1310 perf_tp_event(call->id, entry->ip, 1, entry, size);
1311
1312end:
1313 perf_swevent_put_recursion_context(rctx);
1314end_recursion:
1315 local_irq_restore(irq_flags);
1316 1242
1317 return 0; 1243 ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags);
1318} 1244}
1319 1245
1320/* Kretprobe profile handler */ 1246/* Kretprobe profile handler */
1321static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, 1247static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
1322 struct pt_regs *regs) 1248 struct pt_regs *regs)
1323{ 1249{
1324 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1250 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1325 struct ftrace_event_call *call = &tp->call; 1251 struct ftrace_event_call *call = &tp->call;
1326 struct kretprobe_trace_entry *entry; 1252 struct kretprobe_trace_entry *entry;
1327 struct trace_entry *ent; 1253 int size, __size, i;
1328 int size, __size, i, pc, __cpu;
1329 unsigned long irq_flags; 1254 unsigned long irq_flags;
1330 char *trace_buf;
1331 char *raw_data;
1332 int rctx; 1255 int rctx;
1333 1256
1334 pc = preempt_count();
1335 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1257 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
1336 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1258 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1337 size -= sizeof(u32); 1259 size -= sizeof(u32);
1338 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1260 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1339 "profile buffer not large enough")) 1261 "profile buffer not large enough"))
1340 return 0; 1262 return;
1341
1342 /*
1343 * Protect the non nmi buffer
1344 * This also protects the rcu read side
1345 */
1346 local_irq_save(irq_flags);
1347
1348 rctx = perf_swevent_get_recursion_context();
1349 if (rctx < 0)
1350 goto end_recursion;
1351
1352 __cpu = smp_processor_id();
1353 1263
1354 if (in_nmi()) 1264 entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
1355 trace_buf = rcu_dereference(perf_trace_buf_nmi); 1265 if (!entry)
1356 else 1266 return;
1357 trace_buf = rcu_dereference(perf_trace_buf);
1358
1359 if (!trace_buf)
1360 goto end;
1361
1362 raw_data = per_cpu_ptr(trace_buf, __cpu);
1363
1364 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1365 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1366 entry = (struct kretprobe_trace_entry *)raw_data;
1367 ent = &entry->ent;
1368 1267
1369 tracing_generic_entry_update(ent, irq_flags, pc);
1370 ent->type = call->id;
1371 entry->nargs = tp->nr_args; 1268 entry->nargs = tp->nr_args;
1372 entry->func = (unsigned long)tp->rp.kp.addr; 1269 entry->func = (unsigned long)tp->rp.kp.addr;
1373 entry->ret_ip = (unsigned long)ri->ret_addr; 1270 entry->ret_ip = (unsigned long)ri->ret_addr;
1374 for (i = 0; i < tp->nr_args; i++) 1271 for (i = 0; i < tp->nr_args; i++)
1375 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1272 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1376 perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
1377
1378end:
1379 perf_swevent_put_recursion_context(rctx);
1380end_recursion:
1381 local_irq_restore(irq_flags);
1382 1273
1383 return 0; 1274 ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags);
1384} 1275}
1385 1276
1386static int probe_profile_enable(struct ftrace_event_call *call) 1277static int probe_profile_enable(struct ftrace_event_call *call)
@@ -1408,7 +1299,7 @@ static void probe_profile_disable(struct ftrace_event_call *call)
1408 disable_kprobe(&tp->rp.kp); 1299 disable_kprobe(&tp->rp.kp);
1409 } 1300 }
1410} 1301}
1411#endif /* CONFIG_EVENT_PROFILE */ 1302#endif /* CONFIG_PERF_EVENTS */
1412 1303
1413 1304
1414static __kprobes 1305static __kprobes
@@ -1418,10 +1309,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1418 1309
1419 if (tp->flags & TP_FLAG_TRACE) 1310 if (tp->flags & TP_FLAG_TRACE)
1420 kprobe_trace_func(kp, regs); 1311 kprobe_trace_func(kp, regs);
1421#ifdef CONFIG_EVENT_PROFILE 1312#ifdef CONFIG_PERF_EVENTS
1422 if (tp->flags & TP_FLAG_PROFILE) 1313 if (tp->flags & TP_FLAG_PROFILE)
1423 kprobe_profile_func(kp, regs); 1314 kprobe_profile_func(kp, regs);
1424#endif /* CONFIG_EVENT_PROFILE */ 1315#endif
1425 return 0; /* We don't tweek kernel, so just return 0 */ 1316 return 0; /* We don't tweek kernel, so just return 0 */
1426} 1317}
1427 1318
@@ -1432,10 +1323,10 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1432 1323
1433 if (tp->flags & TP_FLAG_TRACE) 1324 if (tp->flags & TP_FLAG_TRACE)
1434 kretprobe_trace_func(ri, regs); 1325 kretprobe_trace_func(ri, regs);
1435#ifdef CONFIG_EVENT_PROFILE 1326#ifdef CONFIG_PERF_EVENTS
1436 if (tp->flags & TP_FLAG_PROFILE) 1327 if (tp->flags & TP_FLAG_PROFILE)
1437 kretprobe_profile_func(ri, regs); 1328 kretprobe_profile_func(ri, regs);
1438#endif /* CONFIG_EVENT_PROFILE */ 1329#endif
1439 return 0; /* We don't tweek kernel, so just return 0 */ 1330 return 0; /* We don't tweek kernel, so just return 0 */
1440} 1331}
1441 1332
@@ -1448,23 +1339,25 @@ static int register_probe_event(struct trace_probe *tp)
1448 if (probe_is_return(tp)) { 1339 if (probe_is_return(tp)) {
1449 tp->event.trace = print_kretprobe_event; 1340 tp->event.trace = print_kretprobe_event;
1450 call->raw_init = probe_event_raw_init; 1341 call->raw_init = probe_event_raw_init;
1451 call->show_format = kretprobe_event_show_format;
1452 call->define_fields = kretprobe_event_define_fields; 1342 call->define_fields = kretprobe_event_define_fields;
1453 } else { 1343 } else {
1454 tp->event.trace = print_kprobe_event; 1344 tp->event.trace = print_kprobe_event;
1455 call->raw_init = probe_event_raw_init; 1345 call->raw_init = probe_event_raw_init;
1456 call->show_format = kprobe_event_show_format;
1457 call->define_fields = kprobe_event_define_fields; 1346 call->define_fields = kprobe_event_define_fields;
1458 } 1347 }
1348 if (set_print_fmt(tp) < 0)
1349 return -ENOMEM;
1459 call->event = &tp->event; 1350 call->event = &tp->event;
1460 call->id = register_ftrace_event(&tp->event); 1351 call->id = register_ftrace_event(&tp->event);
1461 if (!call->id) 1352 if (!call->id) {
1353 kfree(call->print_fmt);
1462 return -ENODEV; 1354 return -ENODEV;
1355 }
1463 call->enabled = 0; 1356 call->enabled = 0;
1464 call->regfunc = probe_event_enable; 1357 call->regfunc = probe_event_enable;
1465 call->unregfunc = probe_event_disable; 1358 call->unregfunc = probe_event_disable;
1466 1359
1467#ifdef CONFIG_EVENT_PROFILE 1360#ifdef CONFIG_PERF_EVENTS
1468 call->profile_enable = probe_profile_enable; 1361 call->profile_enable = probe_profile_enable;
1469 call->profile_disable = probe_profile_disable; 1362 call->profile_disable = probe_profile_disable;
1470#endif 1363#endif
@@ -1472,6 +1365,7 @@ static int register_probe_event(struct trace_probe *tp)
1472 ret = trace_add_event_call(call); 1365 ret = trace_add_event_call(call);
1473 if (ret) { 1366 if (ret) {
1474 pr_info("Failed to register kprobe event: %s\n", call->name); 1367 pr_info("Failed to register kprobe event: %s\n", call->name);
1368 kfree(call->print_fmt);
1475 unregister_ftrace_event(&tp->event); 1369 unregister_ftrace_event(&tp->event);
1476 } 1370 }
1477 return ret; 1371 return ret;
@@ -1481,6 +1375,7 @@ static void unregister_probe_event(struct trace_probe *tp)
1481{ 1375{
1482 /* tp->event is unregistered in trace_remove_event_call() */ 1376 /* tp->event is unregistered in trace_remove_event_call() */
1483 trace_remove_event_call(&tp->call); 1377 trace_remove_event_call(&tp->call);
1378 kfree(tp->call.print_fmt);
1484} 1379}
1485 1380
1486/* Make a debugfs interface for controling probe points */ 1381/* Make a debugfs interface for controling probe points */
@@ -1523,28 +1418,67 @@ static int kprobe_trace_selftest_target(int a1, int a2, int a3,
1523 1418
1524static __init int kprobe_trace_self_tests_init(void) 1419static __init int kprobe_trace_self_tests_init(void)
1525{ 1420{
1526 int ret; 1421 int ret, warn = 0;
1527 int (*target)(int, int, int, int, int, int); 1422 int (*target)(int, int, int, int, int, int);
1423 struct trace_probe *tp;
1528 1424
1529 target = kprobe_trace_selftest_target; 1425 target = kprobe_trace_selftest_target;
1530 1426
1531 pr_info("Testing kprobe tracing: "); 1427 pr_info("Testing kprobe tracing: ");
1532 1428
1533 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " 1429 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
1534 "$arg1 $arg2 $arg3 $arg4 $stack $stack0"); 1430 "$stack $stack0 +0($stack)");
1535 if (WARN_ON_ONCE(ret)) 1431 if (WARN_ON_ONCE(ret)) {
1536 pr_warning("error enabling function entry\n"); 1432 pr_warning("error on probing function entry.\n");
1433 warn++;
1434 } else {
1435 /* Enable trace point */
1436 tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM);
1437 if (WARN_ON_ONCE(tp == NULL)) {
1438 pr_warning("error on getting new probe.\n");
1439 warn++;
1440 } else
1441 probe_event_enable(&tp->call);
1442 }
1537 1443
1538 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " 1444 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
1539 "$retval"); 1445 "$retval");
1540 if (WARN_ON_ONCE(ret)) 1446 if (WARN_ON_ONCE(ret)) {
1541 pr_warning("error enabling function return\n"); 1447 pr_warning("error on probing function return.\n");
1448 warn++;
1449 } else {
1450 /* Enable trace point */
1451 tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM);
1452 if (WARN_ON_ONCE(tp == NULL)) {
1453 pr_warning("error on getting new probe.\n");
1454 warn++;
1455 } else
1456 probe_event_enable(&tp->call);
1457 }
1458
1459 if (warn)
1460 goto end;
1542 1461
1543 ret = target(1, 2, 3, 4, 5, 6); 1462 ret = target(1, 2, 3, 4, 5, 6);
1544 1463
1545 cleanup_all_probes(); 1464 ret = command_trace_probe("-:testprobe");
1465 if (WARN_ON_ONCE(ret)) {
1466 pr_warning("error on deleting a probe.\n");
1467 warn++;
1468 }
1546 1469
1547 pr_cont("OK\n"); 1470 ret = command_trace_probe("-:testprobe2");
1471 if (WARN_ON_ONCE(ret)) {
1472 pr_warning("error on deleting a probe.\n");
1473 warn++;
1474 }
1475
1476end:
1477 cleanup_all_probes();
1478 if (warn)
1479 pr_cont("NG: Some tests are failed. Please check them.\n");
1480 else
1481 pr_cont("OK\n");
1548 return 0; 1482 return 0;
1549} 1483}
1550 1484
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 75289f372dd2..cba47d7935cc 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -143,70 +143,65 @@ extern char *__bad_type_size(void);
143 #type, #name, offsetof(typeof(trace), name), \ 143 #type, #name, offsetof(typeof(trace), name), \
144 sizeof(trace.name), is_signed_type(type) 144 sizeof(trace.name), is_signed_type(type)
145 145
146int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) 146static
147int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
147{ 148{
148 int i; 149 int i;
149 int ret; 150 int pos = 0;
150 struct syscall_metadata *entry = call->data;
151 struct syscall_trace_enter trace;
152 int offset = offsetof(struct syscall_trace_enter, args);
153 151
154 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" 152 /* When len=0, we just calculate the needed length */
155 "\tsigned:%u;\n", 153#define LEN_OR_ZERO (len ? len - pos : 0)
156 SYSCALL_FIELD(int, nr));
157 if (!ret)
158 return 0;
159 154
155 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
160 for (i = 0; i < entry->nb_args; i++) { 156 for (i = 0; i < entry->nb_args; i++) {
161 ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], 157 pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
162 entry->args[i]); 158 entry->args[i], sizeof(unsigned long),
163 if (!ret) 159 i == entry->nb_args - 1 ? "" : ", ");
164 return 0;
165 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;"
166 "\tsigned:%u;\n", offset,
167 sizeof(unsigned long),
168 is_signed_type(unsigned long));
169 if (!ret)
170 return 0;
171 offset += sizeof(unsigned long);
172 } 160 }
161 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
173 162
174 trace_seq_puts(s, "\nprint fmt: \"");
175 for (i = 0; i < entry->nb_args; i++) { 163 for (i = 0; i < entry->nb_args; i++) {
176 ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], 164 pos += snprintf(buf + pos, LEN_OR_ZERO,
177 sizeof(unsigned long), 165 ", ((unsigned long)(REC->%s))", entry->args[i]);
178 i == entry->nb_args - 1 ? "" : ", ");
179 if (!ret)
180 return 0;
181 } 166 }
182 trace_seq_putc(s, '"');
183 167
184 for (i = 0; i < entry->nb_args; i++) { 168#undef LEN_OR_ZERO
185 ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))",
186 entry->args[i]);
187 if (!ret)
188 return 0;
189 }
190 169
191 return trace_seq_putc(s, '\n'); 170 /* return the length of print_fmt */
171 return pos;
192} 172}
193 173
194int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) 174static int set_syscall_print_fmt(struct ftrace_event_call *call)
195{ 175{
196 int ret; 176 char *print_fmt;
197 struct syscall_trace_exit trace; 177 int len;
178 struct syscall_metadata *entry = call->data;
198 179
199 ret = trace_seq_printf(s, 180 if (entry->enter_event != call) {
200 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" 181 call->print_fmt = "\"0x%lx\", REC->ret";
201 "\tsigned:%u;\n"
202 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
203 "\tsigned:%u;\n",
204 SYSCALL_FIELD(int, nr),
205 SYSCALL_FIELD(long, ret));
206 if (!ret)
207 return 0; 182 return 0;
183 }
184
185 /* First: called with 0 length to calculate the needed length */
186 len = __set_enter_print_fmt(entry, NULL, 0);
187
188 print_fmt = kmalloc(len + 1, GFP_KERNEL);
189 if (!print_fmt)
190 return -ENOMEM;
191
192 /* Second: actually write the @print_fmt */
193 __set_enter_print_fmt(entry, print_fmt, len + 1);
194 call->print_fmt = print_fmt;
208 195
209 return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); 196 return 0;
197}
198
199static void free_syscall_print_fmt(struct ftrace_event_call *call)
200{
201 struct syscall_metadata *entry = call->data;
202
203 if (entry->enter_event == call)
204 kfree(call->print_fmt);
210} 205}
211 206
212int syscall_enter_define_fields(struct ftrace_event_call *call) 207int syscall_enter_define_fields(struct ftrace_event_call *call)
@@ -386,12 +381,22 @@ int init_syscall_trace(struct ftrace_event_call *call)
386{ 381{
387 int id; 382 int id;
388 383
389 id = register_ftrace_event(call->event); 384 if (set_syscall_print_fmt(call) < 0)
390 if (!id) 385 return -ENOMEM;
391 return -ENODEV; 386
392 call->id = id; 387 id = trace_event_raw_init(call);
393 INIT_LIST_HEAD(&call->fields); 388
394 return 0; 389 if (id < 0) {
390 free_syscall_print_fmt(call);
391 return id;
392 }
393
394 return id;
395}
396
397unsigned long __init arch_syscall_addr(int nr)
398{
399 return (unsigned long)sys_call_table[nr];
395} 400}
396 401
397int __init init_ftrace_syscalls(void) 402int __init init_ftrace_syscalls(void)
@@ -421,7 +426,7 @@ int __init init_ftrace_syscalls(void)
421} 426}
422core_initcall(init_ftrace_syscalls); 427core_initcall(init_ftrace_syscalls);
423 428
424#ifdef CONFIG_EVENT_PROFILE 429#ifdef CONFIG_PERF_EVENTS
425 430
426static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); 431static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
427static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); 432static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
@@ -433,12 +438,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
433 struct syscall_metadata *sys_data; 438 struct syscall_metadata *sys_data;
434 struct syscall_trace_enter *rec; 439 struct syscall_trace_enter *rec;
435 unsigned long flags; 440 unsigned long flags;
436 char *trace_buf;
437 char *raw_data;
438 int syscall_nr; 441 int syscall_nr;
439 int rctx; 442 int rctx;
440 int size; 443 int size;
441 int cpu;
442 444
443 syscall_nr = syscall_get_nr(current, regs); 445 syscall_nr = syscall_get_nr(current, regs);
444 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) 446 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -457,37 +459,15 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
457 "profile buffer not large enough")) 459 "profile buffer not large enough"))
458 return; 460 return;
459 461
460 /* Protect the per cpu buffer, begin the rcu read side */ 462 rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size,
461 local_irq_save(flags); 463 sys_data->enter_event->id, &rctx, &flags);
462 464 if (!rec)
463 rctx = perf_swevent_get_recursion_context(); 465 return;
464 if (rctx < 0)
465 goto end_recursion;
466
467 cpu = smp_processor_id();
468
469 trace_buf = rcu_dereference(perf_trace_buf);
470
471 if (!trace_buf)
472 goto end;
473
474 raw_data = per_cpu_ptr(trace_buf, cpu);
475
476 /* zero the dead bytes from align to not leak stack to user */
477 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
478 466
479 rec = (struct syscall_trace_enter *) raw_data;
480 tracing_generic_entry_update(&rec->ent, 0, 0);
481 rec->ent.type = sys_data->enter_event->id;
482 rec->nr = syscall_nr; 467 rec->nr = syscall_nr;
483 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 468 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
484 (unsigned long *)&rec->args); 469 (unsigned long *)&rec->args);
485 perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size); 470 ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
486
487end:
488 perf_swevent_put_recursion_context(rctx);
489end_recursion:
490 local_irq_restore(flags);
491} 471}
492 472
493int prof_sysenter_enable(struct ftrace_event_call *call) 473int prof_sysenter_enable(struct ftrace_event_call *call)
@@ -531,11 +511,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
531 struct syscall_trace_exit *rec; 511 struct syscall_trace_exit *rec;
532 unsigned long flags; 512 unsigned long flags;
533 int syscall_nr; 513 int syscall_nr;
534 char *trace_buf;
535 char *raw_data;
536 int rctx; 514 int rctx;
537 int size; 515 int size;
538 int cpu;
539 516
540 syscall_nr = syscall_get_nr(current, regs); 517 syscall_nr = syscall_get_nr(current, regs);
541 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) 518 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -557,38 +534,15 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
557 "exit event has grown above profile buffer size")) 534 "exit event has grown above profile buffer size"))
558 return; 535 return;
559 536
560 /* Protect the per cpu buffer, begin the rcu read side */ 537 rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size,
561 local_irq_save(flags); 538 sys_data->exit_event->id, &rctx, &flags);
562 539 if (!rec)
563 rctx = perf_swevent_get_recursion_context(); 540 return;
564 if (rctx < 0)
565 goto end_recursion;
566
567 cpu = smp_processor_id();
568
569 trace_buf = rcu_dereference(perf_trace_buf);
570
571 if (!trace_buf)
572 goto end;
573
574 raw_data = per_cpu_ptr(trace_buf, cpu);
575
576 /* zero the dead bytes from align to not leak stack to user */
577 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
578
579 rec = (struct syscall_trace_exit *)raw_data;
580 541
581 tracing_generic_entry_update(&rec->ent, 0, 0);
582 rec->ent.type = sys_data->exit_event->id;
583 rec->nr = syscall_nr; 542 rec->nr = syscall_nr;
584 rec->ret = syscall_get_return_value(current, regs); 543 rec->ret = syscall_get_return_value(current, regs);
585 544
586 perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size); 545 ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
587
588end:
589 perf_swevent_put_recursion_context(rctx);
590end_recursion:
591 local_irq_restore(flags);
592} 546}
593 547
594int prof_sysexit_enable(struct ftrace_event_call *call) 548int prof_sysexit_enable(struct ftrace_event_call *call)
@@ -603,7 +557,7 @@ int prof_sysexit_enable(struct ftrace_event_call *call)
603 ret = register_trace_sys_exit(prof_syscall_exit); 557 ret = register_trace_sys_exit(prof_syscall_exit);
604 if (ret) { 558 if (ret) {
605 pr_info("event trace: Could not activate" 559 pr_info("event trace: Could not activate"
606 "syscall entry trace point"); 560 "syscall exit trace point");
607 } else { 561 } else {
608 set_bit(num, enabled_prof_exit_syscalls); 562 set_bit(num, enabled_prof_exit_syscalls);
609 sys_prof_refcount_exit++; 563 sys_prof_refcount_exit++;
@@ -626,6 +580,5 @@ void prof_sysexit_disable(struct ftrace_event_call *call)
626 mutex_unlock(&syscall_trace_lock); 580 mutex_unlock(&syscall_trace_lock);
627} 581}
628 582
629#endif 583#endif /* CONFIG_PERF_EVENTS */
630
631 584
diff --git a/kernel/user.c b/kernel/user.c
index 46d0165ca70c..766467b3bcb7 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -56,9 +56,6 @@ struct user_struct root_user = {
56 .sigpending = ATOMIC_INIT(0), 56 .sigpending = ATOMIC_INIT(0),
57 .locked_shm = 0, 57 .locked_shm = 0,
58 .user_ns = &init_user_ns, 58 .user_ns = &init_user_ns,
59#ifdef CONFIG_USER_SCHED
60 .tg = &init_task_group,
61#endif
62}; 59};
63 60
64/* 61/*
@@ -75,268 +72,6 @@ static void uid_hash_remove(struct user_struct *up)
75 put_user_ns(up->user_ns); 72 put_user_ns(up->user_ns);
76} 73}
77 74
78#ifdef CONFIG_USER_SCHED
79
80static void sched_destroy_user(struct user_struct *up)
81{
82 sched_destroy_group(up->tg);
83}
84
85static int sched_create_user(struct user_struct *up)
86{
87 int rc = 0;
88
89 up->tg = sched_create_group(&root_task_group);
90 if (IS_ERR(up->tg))
91 rc = -ENOMEM;
92
93 set_tg_uid(up);
94
95 return rc;
96}
97
98#else /* CONFIG_USER_SCHED */
99
100static void sched_destroy_user(struct user_struct *up) { }
101static int sched_create_user(struct user_struct *up) { return 0; }
102
103#endif /* CONFIG_USER_SCHED */
104
105#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
106
107static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
108{
109 struct user_struct *user;
110 struct hlist_node *h;
111
112 hlist_for_each_entry(user, h, hashent, uidhash_node) {
113 if (user->uid == uid) {
114 /* possibly resurrect an "almost deleted" object */
115 if (atomic_inc_return(&user->__count) == 1)
116 cancel_delayed_work(&user->work);
117 return user;
118 }
119 }
120
121 return NULL;
122}
123
124static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
125static DEFINE_MUTEX(uids_mutex);
126
127static inline void uids_mutex_lock(void)
128{
129 mutex_lock(&uids_mutex);
130}
131
132static inline void uids_mutex_unlock(void)
133{
134 mutex_unlock(&uids_mutex);
135}
136
137/* uid directory attributes */
138#ifdef CONFIG_FAIR_GROUP_SCHED
139static ssize_t cpu_shares_show(struct kobject *kobj,
140 struct kobj_attribute *attr,
141 char *buf)
142{
143 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
144
145 return sprintf(buf, "%lu\n", sched_group_shares(up->tg));
146}
147
148static ssize_t cpu_shares_store(struct kobject *kobj,
149 struct kobj_attribute *attr,
150 const char *buf, size_t size)
151{
152 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
153 unsigned long shares;
154 int rc;
155
156 sscanf(buf, "%lu", &shares);
157
158 rc = sched_group_set_shares(up->tg, shares);
159
160 return (rc ? rc : size);
161}
162
163static struct kobj_attribute cpu_share_attr =
164 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
165#endif
166
167#ifdef CONFIG_RT_GROUP_SCHED
168static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
169 struct kobj_attribute *attr,
170 char *buf)
171{
172 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
173
174 return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
175}
176
177static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
178 struct kobj_attribute *attr,
179 const char *buf, size_t size)
180{
181 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
182 unsigned long rt_runtime;
183 int rc;
184
185 sscanf(buf, "%ld", &rt_runtime);
186
187 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
188
189 return (rc ? rc : size);
190}
191
192static struct kobj_attribute cpu_rt_runtime_attr =
193 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
194
195static ssize_t cpu_rt_period_show(struct kobject *kobj,
196 struct kobj_attribute *attr,
197 char *buf)
198{
199 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
200
201 return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg));
202}
203
204static ssize_t cpu_rt_period_store(struct kobject *kobj,
205 struct kobj_attribute *attr,
206 const char *buf, size_t size)
207{
208 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
209 unsigned long rt_period;
210 int rc;
211
212 sscanf(buf, "%lu", &rt_period);
213
214 rc = sched_group_set_rt_period(up->tg, rt_period);
215
216 return (rc ? rc : size);
217}
218
219static struct kobj_attribute cpu_rt_period_attr =
220 __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store);
221#endif
222
223/* default attributes per uid directory */
224static struct attribute *uids_attributes[] = {
225#ifdef CONFIG_FAIR_GROUP_SCHED
226 &cpu_share_attr.attr,
227#endif
228#ifdef CONFIG_RT_GROUP_SCHED
229 &cpu_rt_runtime_attr.attr,
230 &cpu_rt_period_attr.attr,
231#endif
232 NULL
233};
234
235/* the lifetime of user_struct is not managed by the core (now) */
236static void uids_release(struct kobject *kobj)
237{
238 return;
239}
240
241static struct kobj_type uids_ktype = {
242 .sysfs_ops = &kobj_sysfs_ops,
243 .default_attrs = uids_attributes,
244 .release = uids_release,
245};
246
247/*
248 * Create /sys/kernel/uids/<uid>/cpu_share file for this user
249 * We do not create this file for users in a user namespace (until
250 * sysfs tagging is implemented).
251 *
252 * See Documentation/scheduler/sched-design-CFS.txt for ramifications.
253 */
254static int uids_user_create(struct user_struct *up)
255{
256 struct kobject *kobj = &up->kobj;
257 int error;
258
259 memset(kobj, 0, sizeof(struct kobject));
260 if (up->user_ns != &init_user_ns)
261 return 0;
262 kobj->kset = uids_kset;
263 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
264 if (error) {
265 kobject_put(kobj);
266 goto done;
267 }
268
269 kobject_uevent(kobj, KOBJ_ADD);
270done:
271 return error;
272}
273
274/* create these entries in sysfs:
275 * "/sys/kernel/uids" directory
276 * "/sys/kernel/uids/0" directory (for root user)
277 * "/sys/kernel/uids/0/cpu_share" file (for root user)
278 */
279int __init uids_sysfs_init(void)
280{
281 uids_kset = kset_create_and_add("uids", NULL, kernel_kobj);
282 if (!uids_kset)
283 return -ENOMEM;
284
285 return uids_user_create(&root_user);
286}
287
288/* delayed work function to remove sysfs directory for a user and free up
289 * corresponding structures.
290 */
291static void cleanup_user_struct(struct work_struct *w)
292{
293 struct user_struct *up = container_of(w, struct user_struct, work.work);
294 unsigned long flags;
295 int remove_user = 0;
296
297 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
298 * atomic.
299 */
300 uids_mutex_lock();
301
302 spin_lock_irqsave(&uidhash_lock, flags);
303 if (atomic_read(&up->__count) == 0) {
304 uid_hash_remove(up);
305 remove_user = 1;
306 }
307 spin_unlock_irqrestore(&uidhash_lock, flags);
308
309 if (!remove_user)
310 goto done;
311
312 if (up->user_ns == &init_user_ns) {
313 kobject_uevent(&up->kobj, KOBJ_REMOVE);
314 kobject_del(&up->kobj);
315 kobject_put(&up->kobj);
316 }
317
318 sched_destroy_user(up);
319 key_put(up->uid_keyring);
320 key_put(up->session_keyring);
321 kmem_cache_free(uid_cachep, up);
322
323done:
324 uids_mutex_unlock();
325}
326
327/* IRQs are disabled and uidhash_lock is held upon function entry.
328 * IRQ state (as stored in flags) is restored and uidhash_lock released
329 * upon function exit.
330 */
331static void free_user(struct user_struct *up, unsigned long flags)
332{
333 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
334 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
335 spin_unlock_irqrestore(&uidhash_lock, flags);
336}
337
338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
339
340static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) 75static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
341{ 76{
342 struct user_struct *user; 77 struct user_struct *user;
@@ -352,11 +87,6 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
352 return NULL; 87 return NULL;
353} 88}
354 89
/*
 * No-op versions used in the #else branch of the
 * CONFIG_USER_SCHED && CONFIG_SYSFS conditional.
 */
int uids_sysfs_init(void)
{
	return 0;
}

static inline int uids_user_create(struct user_struct *up)
{
	return 0;
}

static inline void uids_mutex_lock(void)
{
}

static inline void uids_mutex_unlock(void)
{
}
359
360/* IRQs are disabled and uidhash_lock is held upon function entry. 90/* IRQs are disabled and uidhash_lock is held upon function entry.
361 * IRQ state (as stored in flags) is restored and uidhash_lock released 91 * IRQ state (as stored in flags) is restored and uidhash_lock released
362 * upon function exit. 92 * upon function exit.
@@ -365,32 +95,11 @@ static void free_user(struct user_struct *up, unsigned long flags)
365{ 95{
366 uid_hash_remove(up); 96 uid_hash_remove(up);
367 spin_unlock_irqrestore(&uidhash_lock, flags); 97 spin_unlock_irqrestore(&uidhash_lock, flags);
368 sched_destroy_user(up);
369 key_put(up->uid_keyring); 98 key_put(up->uid_keyring);
370 key_put(up->session_keyring); 99 key_put(up->session_keyring);
371 kmem_cache_free(uid_cachep, up); 100 kmem_cache_free(uid_cachep, up);
372} 101}
373 102
374#endif
375
376#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
377/*
378 * We need to check if a setuid can take place. This function should be called
379 * before successfully completing the setuid.
380 */
381int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
382{
383
384 return sched_rt_can_attach(up->tg, tsk);
385
386}
387#else
/*
 * Variant for the #else branch of the conditional: neither argument is
 * examined and the function unconditionally returns 1.
 */
int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
{
	(void)up;
	(void)tsk;

	return 1;
}
392#endif
393
394/* 103/*
395 * Locate the user_struct for the passed UID. If found, take a ref on it. The 104 * Locate the user_struct for the passed UID. If found, take a ref on it. The
396 * caller must undo that ref with free_uid(). 105 * caller must undo that ref with free_uid().
@@ -431,8 +140,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
431 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() 140 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
432 * atomic. 141 * atomic.
433 */ 142 */
434 uids_mutex_lock();
435
436 spin_lock_irq(&uidhash_lock); 143 spin_lock_irq(&uidhash_lock);
437 up = uid_hash_find(uid, hashent); 144 up = uid_hash_find(uid, hashent);
438 spin_unlock_irq(&uidhash_lock); 145 spin_unlock_irq(&uidhash_lock);
@@ -445,14 +152,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
445 new->uid = uid; 152 new->uid = uid;
446 atomic_set(&new->__count, 1); 153 atomic_set(&new->__count, 1);
447 154
448 if (sched_create_user(new) < 0)
449 goto out_free_user;
450
451 new->user_ns = get_user_ns(ns); 155 new->user_ns = get_user_ns(ns);
452 156
453 if (uids_user_create(new))
454 goto out_destoy_sched;
455
456 /* 157 /*
457 * Before adding this, check whether we raced 158 * Before adding this, check whether we raced
458 * on adding the same user already.. 159 * on adding the same user already..
@@ -475,17 +176,11 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
475 spin_unlock_irq(&uidhash_lock); 176 spin_unlock_irq(&uidhash_lock);
476 } 177 }
477 178
478 uids_mutex_unlock();
479
480 return up; 179 return up;
481 180
482out_destoy_sched:
483 sched_destroy_user(new);
484 put_user_ns(new->user_ns); 181 put_user_ns(new->user_ns);
485out_free_user:
486 kmem_cache_free(uid_cachep, new); 182 kmem_cache_free(uid_cachep, new);
487out_unlock: 183out_unlock:
488 uids_mutex_unlock();
489 return NULL; 184 return NULL;
490} 185}
491 186