aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/cgroup.c2
-rw-r--r--kernel/cpuset.c4
-rw-r--r--kernel/early_res.c590
-rw-r--r--kernel/futex.c66
-rw-r--r--kernel/futex_compat.c2
-rw-r--r--kernel/hrtimer.c13
-rw-r--r--kernel/hung_task.c4
-rw-r--r--kernel/irq/Kconfig53
-rw-r--r--kernel/irq/Makefile3
-rw-r--r--kernel/irq/autoprobe.c15
-rw-r--r--kernel/irq/chip.c378
-rw-r--r--kernel/irq/dummychip.c68
-rw-r--r--kernel/irq/handle.c341
-rw-r--r--kernel/irq/internals.h39
-rw-r--r--kernel/irq/irqdesc.c395
-rw-r--r--kernel/irq/manage.c87
-rw-r--r--kernel/irq/migration.c12
-rw-r--r--kernel/irq/numa_migrate.c120
-rw-r--r--kernel/irq/proc.c26
-rw-r--r--kernel/irq/resend.c5
-rw-r--r--kernel/irq/spurious.c8
-rw-r--r--kernel/lockdep.c51
-rw-r--r--kernel/perf_event.c98
-rw-r--r--kernel/pid.c3
-rw-r--r--kernel/power/Kconfig17
-rw-r--r--kernel/power/hibernate.c25
-rw-r--r--kernel/power/main.c29
-rw-r--r--kernel/power/power.h10
-rw-r--r--kernel/power/process.c11
-rw-r--r--kernel/power/snapshot.c13
-rw-r--r--kernel/power/swap.c300
-rw-r--r--kernel/printk.c4
-rw-r--r--kernel/rcupdate.c8
-rw-r--r--kernel/rcutiny.c33
-rw-r--r--kernel/rcutiny_plugin.h582
-rw-r--r--kernel/rcutorture.c17
-rw-r--r--kernel/rcutree.c92
-rw-r--r--kernel/rcutree.h20
-rw-r--r--kernel/rcutree_plugin.h47
-rw-r--r--kernel/rcutree_trace.c12
-rw-r--r--kernel/sched.c307
-rw-r--r--kernel/sched_fair.c81
-rw-r--r--kernel/sched_features.h5
-rw-r--r--kernel/sched_rt.c40
-rw-r--r--kernel/sched_stoptask.c108
-rw-r--r--kernel/signal.c8
-rw-r--r--kernel/softirq.c73
-rw-r--r--kernel/srcu.c2
-rw-r--r--kernel/stop_machine.c8
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/sysctl.c2
-rw-r--r--kernel/sysctl_check.c9
-rw-r--r--kernel/time/ntp.c14
-rw-r--r--kernel/trace/Kconfig2
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/watchdog.c2
57 files changed, 2587 insertions, 1682 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 4d9bf5f8531f..0b5ff083fa22 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,7 +11,6 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o range.o jump_label.o 13 async.o range.o jump_label.o
14obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
15obj-y += groups.o 14obj-y += groups.o
16 15
17ifdef CONFIG_FUNCTION_TRACER 16ifdef CONFIG_FUNCTION_TRACER
@@ -87,6 +86,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o
87obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o 86obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
88obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o 87obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
89obj-$(CONFIG_TINY_RCU) += rcutiny.o 88obj-$(CONFIG_TINY_RCU) += rcutiny.o
89obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
90obj-$(CONFIG_RELAY) += relay.o 90obj-$(CONFIG_RELAY) += relay.o
91obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 91obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
92obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 92obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c9483d8f6140..291ba3d04bea 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -138,7 +138,7 @@ struct css_id {
138 * is called after synchronize_rcu(). But for safe use, css_is_removed() 138 * is called after synchronize_rcu(). But for safe use, css_is_removed()
139 * css_tryget() should be used for avoiding race. 139 * css_tryget() should be used for avoiding race.
140 */ 140 */
141 struct cgroup_subsys_state *css; 141 struct cgroup_subsys_state __rcu *css;
142 /* 142 /*
143 * ID of this css. 143 * ID of this css.
144 */ 144 */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b23c0979bbe7..51b143e2a07a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1397,7 +1397,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1397 if (tsk->flags & PF_THREAD_BOUND) 1397 if (tsk->flags & PF_THREAD_BOUND)
1398 return -EINVAL; 1398 return -EINVAL;
1399 1399
1400 ret = security_task_setscheduler(tsk, 0, NULL); 1400 ret = security_task_setscheduler(tsk);
1401 if (ret) 1401 if (ret)
1402 return ret; 1402 return ret;
1403 if (threadgroup) { 1403 if (threadgroup) {
@@ -1405,7 +1405,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1405 1405
1406 rcu_read_lock(); 1406 rcu_read_lock();
1407 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 1407 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1408 ret = security_task_setscheduler(c, 0, NULL); 1408 ret = security_task_setscheduler(c);
1409 if (ret) { 1409 if (ret) {
1410 rcu_read_unlock(); 1410 rcu_read_unlock();
1411 return ret; 1411 return ret;
diff --git a/kernel/early_res.c b/kernel/early_res.c
deleted file mode 100644
index 7bfae887f211..000000000000
--- a/kernel/early_res.c
+++ /dev/null
@@ -1,590 +0,0 @@
1/*
2 * early_res, could be used to replace bootmem
3 */
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/init.h>
7#include <linux/bootmem.h>
8#include <linux/mm.h>
9#include <linux/early_res.h>
10#include <linux/slab.h>
11#include <linux/kmemleak.h>
12
13/*
14 * Early reserved memory areas.
15 */
16/*
17 * need to make sure this one is bigger enough before
18 * find_fw_memmap_area could be used
19 */
20#define MAX_EARLY_RES_X 32
21
22struct early_res {
23 u64 start, end;
24 char name[15];
25 char overlap_ok;
26};
27static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata;
28
29static int max_early_res __initdata = MAX_EARLY_RES_X;
30static struct early_res *early_res __initdata = &early_res_x[0];
31static int early_res_count __initdata;
32
33static int __init find_overlapped_early(u64 start, u64 end)
34{
35 int i;
36 struct early_res *r;
37
38 for (i = 0; i < max_early_res && early_res[i].end; i++) {
39 r = &early_res[i];
40 if (end > r->start && start < r->end)
41 break;
42 }
43
44 return i;
45}
46
47/*
48 * Drop the i-th range from the early reservation map,
49 * by copying any higher ranges down one over it, and
50 * clearing what had been the last slot.
51 */
52static void __init drop_range(int i)
53{
54 int j;
55
56 for (j = i + 1; j < max_early_res && early_res[j].end; j++)
57 ;
58
59 memmove(&early_res[i], &early_res[i + 1],
60 (j - 1 - i) * sizeof(struct early_res));
61
62 early_res[j - 1].end = 0;
63 early_res_count--;
64}
65
66static void __init drop_range_partial(int i, u64 start, u64 end)
67{
68 u64 common_start, common_end;
69 u64 old_start, old_end;
70
71 old_start = early_res[i].start;
72 old_end = early_res[i].end;
73 common_start = max(old_start, start);
74 common_end = min(old_end, end);
75
76 /* no overlap ? */
77 if (common_start >= common_end)
78 return;
79
80 if (old_start < common_start) {
81 /* make head segment */
82 early_res[i].end = common_start;
83 if (old_end > common_end) {
84 char name[15];
85
86 /*
87 * Save a local copy of the name, since the
88 * early_res array could get resized inside
89 * reserve_early_without_check() ->
90 * __check_and_double_early_res(), which would
91 * make the current name pointer invalid.
92 */
93 strncpy(name, early_res[i].name,
94 sizeof(early_res[i].name) - 1);
95 /* add another for left over on tail */
96 reserve_early_without_check(common_end, old_end, name);
97 }
98 return;
99 } else {
100 if (old_end > common_end) {
101 /* reuse the entry for tail left */
102 early_res[i].start = common_end;
103 return;
104 }
105 /* all covered */
106 drop_range(i);
107 }
108}
109
110/*
111 * Split any existing ranges that:
112 * 1) are marked 'overlap_ok', and
113 * 2) overlap with the stated range [start, end)
114 * into whatever portion (if any) of the existing range is entirely
115 * below or entirely above the stated range. Drop the portion
116 * of the existing range that overlaps with the stated range,
117 * which will allow the caller of this routine to then add that
118 * stated range without conflicting with any existing range.
119 */
120static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
121{
122 int i;
123 struct early_res *r;
124 u64 lower_start, lower_end;
125 u64 upper_start, upper_end;
126 char name[15];
127
128 for (i = 0; i < max_early_res && early_res[i].end; i++) {
129 r = &early_res[i];
130
131 /* Continue past non-overlapping ranges */
132 if (end <= r->start || start >= r->end)
133 continue;
134
135 /*
136 * Leave non-ok overlaps as is; let caller
137 * panic "Overlapping early reservations"
138 * when it hits this overlap.
139 */
140 if (!r->overlap_ok)
141 return;
142
143 /*
144 * We have an ok overlap. We will drop it from the early
145 * reservation map, and add back in any non-overlapping
146 * portions (lower or upper) as separate, overlap_ok,
147 * non-overlapping ranges.
148 */
149
150 /* 1. Note any non-overlapping (lower or upper) ranges. */
151 strncpy(name, r->name, sizeof(name) - 1);
152
153 lower_start = lower_end = 0;
154 upper_start = upper_end = 0;
155 if (r->start < start) {
156 lower_start = r->start;
157 lower_end = start;
158 }
159 if (r->end > end) {
160 upper_start = end;
161 upper_end = r->end;
162 }
163
164 /* 2. Drop the original ok overlapping range */
165 drop_range(i);
166
167 i--; /* resume for-loop on copied down entry */
168
169 /* 3. Add back in any non-overlapping ranges. */
170 if (lower_end)
171 reserve_early_overlap_ok(lower_start, lower_end, name);
172 if (upper_end)
173 reserve_early_overlap_ok(upper_start, upper_end, name);
174 }
175}
176
177static void __init __reserve_early(u64 start, u64 end, char *name,
178 int overlap_ok)
179{
180 int i;
181 struct early_res *r;
182
183 i = find_overlapped_early(start, end);
184 if (i >= max_early_res)
185 panic("Too many early reservations");
186 r = &early_res[i];
187 if (r->end)
188 panic("Overlapping early reservations "
189 "%llx-%llx %s to %llx-%llx %s\n",
190 start, end - 1, name ? name : "", r->start,
191 r->end - 1, r->name);
192 r->start = start;
193 r->end = end;
194 r->overlap_ok = overlap_ok;
195 if (name)
196 strncpy(r->name, name, sizeof(r->name) - 1);
197 early_res_count++;
198}
199
200/*
201 * A few early reservtations come here.
202 *
203 * The 'overlap_ok' in the name of this routine does -not- mean it
204 * is ok for these reservations to overlap an earlier reservation.
205 * Rather it means that it is ok for subsequent reservations to
206 * overlap this one.
207 *
208 * Use this entry point to reserve early ranges when you are doing
209 * so out of "Paranoia", reserving perhaps more memory than you need,
210 * just in case, and don't mind a subsequent overlapping reservation
211 * that is known to be needed.
212 *
213 * The drop_overlaps_that_are_ok() call here isn't really needed.
214 * It would be needed if we had two colliding 'overlap_ok'
215 * reservations, so that the second such would not panic on the
216 * overlap with the first. We don't have any such as of this
217 * writing, but might as well tolerate such if it happens in
218 * the future.
219 */
220void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
221{
222 drop_overlaps_that_are_ok(start, end);
223 __reserve_early(start, end, name, 1);
224}
225
226static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end)
227{
228 u64 start, end, size, mem;
229 struct early_res *new;
230
231 /* do we have enough slots left ? */
232 if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
233 return;
234
235 /* double it */
236 mem = -1ULL;
237 size = sizeof(struct early_res) * max_early_res * 2;
238 if (early_res == early_res_x)
239 start = 0;
240 else
241 start = early_res[0].end;
242 end = ex_start;
243 if (start + size < end)
244 mem = find_fw_memmap_area(start, end, size,
245 sizeof(struct early_res));
246 if (mem == -1ULL) {
247 start = ex_end;
248 end = get_max_mapped();
249 if (start + size < end)
250 mem = find_fw_memmap_area(start, end, size,
251 sizeof(struct early_res));
252 }
253 if (mem == -1ULL)
254 panic("can not find more space for early_res array");
255
256 new = __va(mem);
257 /* save the first one for own */
258 new[0].start = mem;
259 new[0].end = mem + size;
260 new[0].overlap_ok = 0;
261 /* copy old to new */
262 if (early_res == early_res_x) {
263 memcpy(&new[1], &early_res[0],
264 sizeof(struct early_res) * max_early_res);
265 memset(&new[max_early_res+1], 0,
266 sizeof(struct early_res) * (max_early_res - 1));
267 early_res_count++;
268 } else {
269 memcpy(&new[1], &early_res[1],
270 sizeof(struct early_res) * (max_early_res - 1));
271 memset(&new[max_early_res], 0,
272 sizeof(struct early_res) * max_early_res);
273 }
274 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
275 early_res = new;
276 max_early_res *= 2;
277 printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n",
278 max_early_res, mem, mem + size - 1);
279}
280
281/*
282 * Most early reservations come here.
283 *
284 * We first have drop_overlaps_that_are_ok() drop any pre-existing
285 * 'overlap_ok' ranges, so that we can then reserve this memory
286 * range without risk of panic'ing on an overlapping overlap_ok
287 * early reservation.
288 */
289void __init reserve_early(u64 start, u64 end, char *name)
290{
291 if (start >= end)
292 return;
293
294 __check_and_double_early_res(start, end);
295
296 drop_overlaps_that_are_ok(start, end);
297 __reserve_early(start, end, name, 0);
298}
299
300void __init reserve_early_without_check(u64 start, u64 end, char *name)
301{
302 struct early_res *r;
303
304 if (start >= end)
305 return;
306
307 __check_and_double_early_res(start, end);
308
309 r = &early_res[early_res_count];
310
311 r->start = start;
312 r->end = end;
313 r->overlap_ok = 0;
314 if (name)
315 strncpy(r->name, name, sizeof(r->name) - 1);
316 early_res_count++;
317}
318
319void __init free_early(u64 start, u64 end)
320{
321 struct early_res *r;
322 int i;
323
324 kmemleak_free_part(__va(start), end - start);
325
326 i = find_overlapped_early(start, end);
327 r = &early_res[i];
328 if (i >= max_early_res || r->end != end || r->start != start)
329 panic("free_early on not reserved area: %llx-%llx!",
330 start, end - 1);
331
332 drop_range(i);
333}
334
335void __init free_early_partial(u64 start, u64 end)
336{
337 struct early_res *r;
338 int i;
339
340 kmemleak_free_part(__va(start), end - start);
341
342 if (start == end)
343 return;
344
345 if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end))
346 return;
347
348try_next:
349 i = find_overlapped_early(start, end);
350 if (i >= max_early_res)
351 return;
352
353 r = &early_res[i];
354 /* hole ? */
355 if (r->end >= end && r->start <= start) {
356 drop_range_partial(i, start, end);
357 return;
358 }
359
360 drop_range_partial(i, start, end);
361 goto try_next;
362}
363
364#ifdef CONFIG_NO_BOOTMEM
365static void __init subtract_early_res(struct range *range, int az)
366{
367 int i, count;
368 u64 final_start, final_end;
369 int idx = 0;
370
371 count = 0;
372 for (i = 0; i < max_early_res && early_res[i].end; i++)
373 count++;
374
375 /* need to skip first one ?*/
376 if (early_res != early_res_x)
377 idx = 1;
378
379#define DEBUG_PRINT_EARLY_RES 1
380
381#if DEBUG_PRINT_EARLY_RES
382 printk(KERN_INFO "Subtract (%d early reservations)\n", count);
383#endif
384 for (i = idx; i < count; i++) {
385 struct early_res *r = &early_res[i];
386#if DEBUG_PRINT_EARLY_RES
387 printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i,
388 r->start, r->end, r->name);
389#endif
390 final_start = PFN_DOWN(r->start);
391 final_end = PFN_UP(r->end);
392 if (final_start >= final_end)
393 continue;
394 subtract_range(range, az, final_start, final_end);
395 }
396
397}
398
399int __init get_free_all_memory_range(struct range **rangep, int nodeid)
400{
401 int i, count;
402 u64 start = 0, end;
403 u64 size;
404 u64 mem;
405 struct range *range;
406 int nr_range;
407
408 count = 0;
409 for (i = 0; i < max_early_res && early_res[i].end; i++)
410 count++;
411
412 count *= 2;
413
414 size = sizeof(struct range) * count;
415 end = get_max_mapped();
416#ifdef MAX_DMA32_PFN
417 if (end > (MAX_DMA32_PFN << PAGE_SHIFT))
418 start = MAX_DMA32_PFN << PAGE_SHIFT;
419#endif
420 mem = find_fw_memmap_area(start, end, size, sizeof(struct range));
421 if (mem == -1ULL)
422 panic("can not find more space for range free");
423
424 range = __va(mem);
425 /* use early_node_map[] and early_res to get range array at first */
426 memset(range, 0, size);
427 nr_range = 0;
428
429 /* need to go over early_node_map to find out good range for node */
430 nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
431#ifdef CONFIG_X86_32
432 subtract_range(range, count, max_low_pfn, -1ULL);
433#endif
434 subtract_early_res(range, count);
435 nr_range = clean_sort_range(range, count);
436
437 /* need to clear it ? */
438 if (nodeid == MAX_NUMNODES) {
439 memset(&early_res[0], 0,
440 sizeof(struct early_res) * max_early_res);
441 early_res = NULL;
442 max_early_res = 0;
443 }
444
445 *rangep = range;
446 return nr_range;
447}
448#else
449void __init early_res_to_bootmem(u64 start, u64 end)
450{
451 int i, count;
452 u64 final_start, final_end;
453 int idx = 0;
454
455 count = 0;
456 for (i = 0; i < max_early_res && early_res[i].end; i++)
457 count++;
458
459 /* need to skip first one ?*/
460 if (early_res != early_res_x)
461 idx = 1;
462
463 printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
464 count - idx, max_early_res, start, end);
465 for (i = idx; i < count; i++) {
466 struct early_res *r = &early_res[i];
467 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
468 r->start, r->end, r->name);
469 final_start = max(start, r->start);
470 final_end = min(end, r->end);
471 if (final_start >= final_end) {
472 printk(KERN_CONT "\n");
473 continue;
474 }
475 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
476 final_start, final_end);
477 reserve_bootmem_generic(final_start, final_end - final_start,
478 BOOTMEM_DEFAULT);
479 }
480 /* clear them */
481 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
482 early_res = NULL;
483 max_early_res = 0;
484 early_res_count = 0;
485}
486#endif
487
488/* Check for already reserved areas */
489static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
490{
491 int i;
492 u64 addr = *addrp;
493 int changed = 0;
494 struct early_res *r;
495again:
496 i = find_overlapped_early(addr, addr + size);
497 r = &early_res[i];
498 if (i < max_early_res && r->end) {
499 *addrp = addr = round_up(r->end, align);
500 changed = 1;
501 goto again;
502 }
503 return changed;
504}
505
506/* Check for already reserved areas */
507static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
508{
509 int i;
510 u64 addr = *addrp, last;
511 u64 size = *sizep;
512 int changed = 0;
513again:
514 last = addr + size;
515 for (i = 0; i < max_early_res && early_res[i].end; i++) {
516 struct early_res *r = &early_res[i];
517 if (last > r->start && addr < r->start) {
518 size = r->start - addr;
519 changed = 1;
520 goto again;
521 }
522 if (last > r->end && addr < r->end) {
523 addr = round_up(r->end, align);
524 size = last - addr;
525 changed = 1;
526 goto again;
527 }
528 if (last <= r->end && addr >= r->start) {
529 (*sizep)++;
530 return 0;
531 }
532 }
533 if (changed) {
534 *addrp = addr;
535 *sizep = size;
536 }
537 return changed;
538}
539
540/*
541 * Find a free area with specified alignment in a specific range.
542 * only with the area.between start to end is active range from early_node_map
543 * so they are good as RAM
544 */
545u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
546 u64 size, u64 align)
547{
548 u64 addr, last;
549
550 addr = round_up(ei_start, align);
551 if (addr < start)
552 addr = round_up(start, align);
553 if (addr >= ei_last)
554 goto out;
555 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
556 ;
557 last = addr + size;
558 if (last > ei_last)
559 goto out;
560 if (last > end)
561 goto out;
562
563 return addr;
564
565out:
566 return -1ULL;
567}
568
569u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start,
570 u64 *sizep, u64 align)
571{
572 u64 addr, last;
573
574 addr = round_up(ei_start, align);
575 if (addr < start)
576 addr = round_up(start, align);
577 if (addr >= ei_last)
578 goto out;
579 *sizep = ei_last - addr;
580 while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last)
581 ;
582 last = addr + *sizep;
583 if (last > ei_last)
584 goto out;
585
586 return addr;
587
588out:
589 return -1ULL;
590}
diff --git a/kernel/futex.c b/kernel/futex.c
index 6a3a5fa1526d..a118bf160e0b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -91,6 +91,7 @@ struct futex_pi_state {
91 91
92/** 92/**
93 * struct futex_q - The hashed futex queue entry, one per waiting task 93 * struct futex_q - The hashed futex queue entry, one per waiting task
94 * @list: priority-sorted list of tasks waiting on this futex
94 * @task: the task waiting on the futex 95 * @task: the task waiting on the futex
95 * @lock_ptr: the hash bucket lock 96 * @lock_ptr: the hash bucket lock
96 * @key: the key the futex is hashed on 97 * @key: the key the futex is hashed on
@@ -104,7 +105,7 @@ struct futex_pi_state {
104 * 105 *
105 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 106 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
106 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 107 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
107 * The order of wakup is always to make the first condition true, then 108 * The order of wakeup is always to make the first condition true, then
108 * the second. 109 * the second.
109 * 110 *
110 * PI futexes are typically woken before they are removed from the hash list via 111 * PI futexes are typically woken before they are removed from the hash list via
@@ -295,7 +296,7 @@ void put_futex_key(int fshared, union futex_key *key)
295 * Slow path to fixup the fault we just took in the atomic write 296 * Slow path to fixup the fault we just took in the atomic write
296 * access to @uaddr. 297 * access to @uaddr.
297 * 298 *
298 * We have no generic implementation of a non destructive write to the 299 * We have no generic implementation of a non-destructive write to the
299 * user address. We know that we faulted in the atomic pagefault 300 * user address. We know that we faulted in the atomic pagefault
300 * disabled section so we can as well avoid the #PF overhead by 301 * disabled section so we can as well avoid the #PF overhead by
301 * calling get_user_pages() right away. 302 * calling get_user_pages() right away.
@@ -515,7 +516,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
515 */ 516 */
516 pi_state = this->pi_state; 517 pi_state = this->pi_state;
517 /* 518 /*
518 * Userspace might have messed up non PI and PI futexes 519 * Userspace might have messed up non-PI and PI futexes
519 */ 520 */
520 if (unlikely(!pi_state)) 521 if (unlikely(!pi_state))
521 return -EINVAL; 522 return -EINVAL;
@@ -736,8 +737,8 @@ static void wake_futex(struct futex_q *q)
736 737
737 /* 738 /*
738 * We set q->lock_ptr = NULL _before_ we wake up the task. If 739 * We set q->lock_ptr = NULL _before_ we wake up the task. If
739 * a non futex wake up happens on another CPU then the task 740 * a non-futex wake up happens on another CPU then the task
740 * might exit and p would dereference a non existing task 741 * might exit and p would dereference a non-existing task
741 * struct. Prevent this by holding a reference on p across the 742 * struct. Prevent this by holding a reference on p across the
742 * wake up. 743 * wake up.
743 */ 744 */
@@ -1131,11 +1132,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1131 1132
1132/** 1133/**
1133 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 1134 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1134 * uaddr1: source futex user address 1135 * @uaddr1: source futex user address
1135 * uaddr2: target futex user address 1136 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
1136 * nr_wake: number of waiters to wake (must be 1 for requeue_pi) 1137 * @uaddr2: target futex user address
1137 * nr_requeue: number of waiters to requeue (0-INT_MAX) 1138 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
1138 * requeue_pi: if we are attempting to requeue from a non-pi futex to a 1139 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1140 * @cmpval: @uaddr1 expected value (or %NULL)
1141 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1139 * pi futex (pi to pi requeue is not supported) 1142 * pi futex (pi to pi requeue is not supported)
1140 * 1143 *
1141 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire 1144 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
@@ -1360,10 +1363,10 @@ out:
1360 1363
1361/* The key must be already stored in q->key. */ 1364/* The key must be already stored in q->key. */
1362static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) 1365static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1366 __acquires(&hb->lock)
1363{ 1367{
1364 struct futex_hash_bucket *hb; 1368 struct futex_hash_bucket *hb;
1365 1369
1366 get_futex_key_refs(&q->key);
1367 hb = hash_futex(&q->key); 1370 hb = hash_futex(&q->key);
1368 q->lock_ptr = &hb->lock; 1371 q->lock_ptr = &hb->lock;
1369 1372
@@ -1373,9 +1376,9 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1373 1376
1374static inline void 1377static inline void
1375queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) 1378queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1379 __releases(&hb->lock)
1376{ 1380{
1377 spin_unlock(&hb->lock); 1381 spin_unlock(&hb->lock);
1378 drop_futex_key_refs(&q->key);
1379} 1382}
1380 1383
1381/** 1384/**
@@ -1391,6 +1394,7 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1391 * an example). 1394 * an example).
1392 */ 1395 */
1393static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 1396static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1397 __releases(&hb->lock)
1394{ 1398{
1395 int prio; 1399 int prio;
1396 1400
@@ -1471,6 +1475,7 @@ retry:
1471 * and dropped here. 1475 * and dropped here.
1472 */ 1476 */
1473static void unqueue_me_pi(struct futex_q *q) 1477static void unqueue_me_pi(struct futex_q *q)
1478 __releases(q->lock_ptr)
1474{ 1479{
1475 WARN_ON(plist_node_empty(&q->list)); 1480 WARN_ON(plist_node_empty(&q->list));
1476 plist_del(&q->list, &q->list.plist); 1481 plist_del(&q->list, &q->list.plist);
@@ -1480,8 +1485,6 @@ static void unqueue_me_pi(struct futex_q *q)
1480 q->pi_state = NULL; 1485 q->pi_state = NULL;
1481 1486
1482 spin_unlock(q->lock_ptr); 1487 spin_unlock(q->lock_ptr);
1483
1484 drop_futex_key_refs(&q->key);
1485} 1488}
1486 1489
1487/* 1490/*
@@ -1812,7 +1815,10 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1812 } 1815 }
1813 1816
1814retry: 1817retry:
1815 /* Prepare to wait on uaddr. */ 1818 /*
1819 * Prepare to wait on uaddr. On success, holds hb lock and increments
1820 * q.key refs.
1821 */
1816 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 1822 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
1817 if (ret) 1823 if (ret)
1818 goto out; 1824 goto out;
@@ -1822,28 +1828,27 @@ retry:
1822 1828
1823 /* If we were woken (and unqueued), we succeeded, whatever. */ 1829 /* If we were woken (and unqueued), we succeeded, whatever. */
1824 ret = 0; 1830 ret = 0;
1831 /* unqueue_me() drops q.key ref */
1825 if (!unqueue_me(&q)) 1832 if (!unqueue_me(&q))
1826 goto out_put_key; 1833 goto out;
1827 ret = -ETIMEDOUT; 1834 ret = -ETIMEDOUT;
1828 if (to && !to->task) 1835 if (to && !to->task)
1829 goto out_put_key; 1836 goto out;
1830 1837
1831 /* 1838 /*
1832 * We expect signal_pending(current), but we might be the 1839 * We expect signal_pending(current), but we might be the
1833 * victim of a spurious wakeup as well. 1840 * victim of a spurious wakeup as well.
1834 */ 1841 */
1835 if (!signal_pending(current)) { 1842 if (!signal_pending(current))
1836 put_futex_key(fshared, &q.key);
1837 goto retry; 1843 goto retry;
1838 }
1839 1844
1840 ret = -ERESTARTSYS; 1845 ret = -ERESTARTSYS;
1841 if (!abs_time) 1846 if (!abs_time)
1842 goto out_put_key; 1847 goto out;
1843 1848
1844 restart = &current_thread_info()->restart_block; 1849 restart = &current_thread_info()->restart_block;
1845 restart->fn = futex_wait_restart; 1850 restart->fn = futex_wait_restart;
1846 restart->futex.uaddr = (u32 *)uaddr; 1851 restart->futex.uaddr = uaddr;
1847 restart->futex.val = val; 1852 restart->futex.val = val;
1848 restart->futex.time = abs_time->tv64; 1853 restart->futex.time = abs_time->tv64;
1849 restart->futex.bitset = bitset; 1854 restart->futex.bitset = bitset;
@@ -1856,8 +1861,6 @@ retry:
1856 1861
1857 ret = -ERESTART_RESTARTBLOCK; 1862 ret = -ERESTART_RESTARTBLOCK;
1858 1863
1859out_put_key:
1860 put_futex_key(fshared, &q.key);
1861out: 1864out:
1862 if (to) { 1865 if (to) {
1863 hrtimer_cancel(&to->timer); 1866 hrtimer_cancel(&to->timer);
@@ -1869,7 +1872,7 @@ out:
1869 1872
1870static long futex_wait_restart(struct restart_block *restart) 1873static long futex_wait_restart(struct restart_block *restart)
1871{ 1874{
1872 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; 1875 u32 __user *uaddr = restart->futex.uaddr;
1873 int fshared = 0; 1876 int fshared = 0;
1874 ktime_t t, *tp = NULL; 1877 ktime_t t, *tp = NULL;
1875 1878
@@ -2236,7 +2239,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2236 q.rt_waiter = &rt_waiter; 2239 q.rt_waiter = &rt_waiter;
2237 q.requeue_pi_key = &key2; 2240 q.requeue_pi_key = &key2;
2238 2241
2239 /* Prepare to wait on uaddr. */ 2242 /*
2243 * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2244 * count.
2245 */
2240 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 2246 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
2241 if (ret) 2247 if (ret)
2242 goto out_key2; 2248 goto out_key2;
@@ -2254,7 +2260,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2254 * In order for us to be here, we know our q.key == key2, and since 2260 * In order for us to be here, we know our q.key == key2, and since
2255 * we took the hb->lock above, we also know that futex_requeue() has 2261 * we took the hb->lock above, we also know that futex_requeue() has
2256 * completed and we no longer have to concern ourselves with a wakeup 2262 * completed and we no longer have to concern ourselves with a wakeup
2257 * race with the atomic proxy lock acquition by the requeue code. 2263 * race with the atomic proxy lock acquisition by the requeue code. The
2264 * futex_requeue dropped our key1 reference and incremented our key2
2265 * reference count.
2258 */ 2266 */
2259 2267
2260 /* Check if the requeue code acquired the second futex for us. */ 2268 /* Check if the requeue code acquired the second futex for us. */
@@ -2458,7 +2466,7 @@ retry:
2458 */ 2466 */
2459static inline int fetch_robust_entry(struct robust_list __user **entry, 2467static inline int fetch_robust_entry(struct robust_list __user **entry,
2460 struct robust_list __user * __user *head, 2468 struct robust_list __user * __user *head,
2461 int *pi) 2469 unsigned int *pi)
2462{ 2470{
2463 unsigned long uentry; 2471 unsigned long uentry;
2464 2472
@@ -2647,7 +2655,7 @@ static int __init futex_init(void)
2647 * of the complex code paths. Also we want to prevent 2655 * of the complex code paths. Also we want to prevent
2648 * registration of robust lists in that case. NULL is 2656 * registration of robust lists in that case. NULL is
2649 * guaranteed to fault and we get -EFAULT on functional 2657 * guaranteed to fault and we get -EFAULT on functional
2650 * implementation, the non functional ones will return 2658 * implementation, the non-functional ones will return
2651 * -ENOSYS. 2659 * -ENOSYS.
2652 */ 2660 */
2653 curval = cmpxchg_futex_value_locked(NULL, 0, 0); 2661 curval = cmpxchg_futex_value_locked(NULL, 0, 0);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d49afb2395e5..06da4dfc339b 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -19,7 +19,7 @@
19 */ 19 */
20static inline int 20static inline int
21fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, 21fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
22 compat_uptr_t __user *head, int *pi) 22 compat_uptr_t __user *head, unsigned int *pi)
23{ 23{
24 if (get_user(*uentry, head)) 24 if (get_user(*uentry, head))
25 return -EFAULT; 25 return -EFAULT;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 1decafbb6b1a..72206cf5c6cf 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -931,6 +931,7 @@ static inline int
931remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) 931remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
932{ 932{
933 if (hrtimer_is_queued(timer)) { 933 if (hrtimer_is_queued(timer)) {
934 unsigned long state;
934 int reprogram; 935 int reprogram;
935 936
936 /* 937 /*
@@ -944,8 +945,13 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
944 debug_deactivate(timer); 945 debug_deactivate(timer);
945 timer_stats_hrtimer_clear_start_info(timer); 946 timer_stats_hrtimer_clear_start_info(timer);
946 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); 947 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
947 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 948 /*
948 reprogram); 949 * We must preserve the CALLBACK state flag here,
950 * otherwise we could move the timer base in
951 * switch_hrtimer_base.
952 */
953 state = timer->state & HRTIMER_STATE_CALLBACK;
954 __remove_hrtimer(timer, base, state, reprogram);
949 return 1; 955 return 1;
950 } 956 }
951 return 0; 957 return 0;
@@ -1231,6 +1237,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1231 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); 1237 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1232 enqueue_hrtimer(timer, base); 1238 enqueue_hrtimer(timer, base);
1233 } 1239 }
1240
1241 WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
1242
1234 timer->state &= ~HRTIMER_STATE_CALLBACK; 1243 timer->state &= ~HRTIMER_STATE_CALLBACK;
1235} 1244}
1236 1245
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 0c642d51aac2..53ead174da2f 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -98,7 +98,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
98 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" 98 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
99 " disables this message.\n"); 99 " disables this message.\n");
100 sched_show_task(t); 100 sched_show_task(t);
101 __debug_show_held_locks(t); 101 debug_show_held_locks(t);
102 102
103 touch_nmi_watchdog(); 103 touch_nmi_watchdog();
104 104
@@ -111,7 +111,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
111 * periodically exit the critical section and enter a new one. 111 * periodically exit the critical section and enter a new one.
112 * 112 *
113 * For preemptible RCU it is sufficient to call rcu_read_unlock in order 113 * For preemptible RCU it is sufficient to call rcu_read_unlock in order
114 * exit the grace period. For classic RCU, a reschedule is required. 114 * to exit the grace period. For classic RCU, a reschedule is required.
115 */ 115 */
116static void rcu_lock_break(struct task_struct *g, struct task_struct *t) 116static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
117{ 117{
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
new file mode 100644
index 000000000000..31d766bf5d2e
--- /dev/null
+++ b/kernel/irq/Kconfig
@@ -0,0 +1,53 @@
1config HAVE_GENERIC_HARDIRQS
2 def_bool n
3
4if HAVE_GENERIC_HARDIRQS
5menu "IRQ subsystem"
6#
7# Interrupt subsystem related configuration options
8#
9config GENERIC_HARDIRQS
10 def_bool y
11
12config GENERIC_HARDIRQS_NO__DO_IRQ
13 def_bool y
14
15# Select this to disable the deprecated stuff
16config GENERIC_HARDIRQS_NO_DEPRECATED
17 def_bool n
18
19# Options selectable by the architecture code
20config HAVE_SPARSE_IRQ
21 def_bool n
22
23config GENERIC_IRQ_PROBE
24 def_bool n
25
26config GENERIC_PENDING_IRQ
27 def_bool n
28
29config AUTO_IRQ_AFFINITY
30 def_bool n
31
32config IRQ_PER_CPU
33 def_bool n
34
35config HARDIRQS_SW_RESEND
36 def_bool n
37
38config SPARSE_IRQ
39 bool "Support sparse irq numbering"
40 depends on HAVE_SPARSE_IRQ
41 ---help---
42
43 Sparse irq numbering is useful for distro kernels that want
44 to define a high CONFIG_NR_CPUS value but still want to have
45 low kernel memory footprint on smaller machines.
46
47 ( Sparse irqs can also be beneficial on NUMA boxes, as they spread
48 out the interrupt descriptors in a more NUMA-friendly way. )
49
50 If you don't know what to do here, say N.
51
52endmenu
53endif
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 7d047808419d..54329cd7b3ee 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,7 +1,6 @@
1 1
2obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o 2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
6obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o
7obj-$(CONFIG_PM_SLEEP) += pm.o 6obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 2295a31ef110..505798f86c36 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -57,9 +57,10 @@ unsigned long probe_irq_on(void)
57 * Some chips need to know about probing in 57 * Some chips need to know about probing in
58 * progress: 58 * progress:
59 */ 59 */
60 if (desc->chip->set_type) 60 if (desc->irq_data.chip->irq_set_type)
61 desc->chip->set_type(i, IRQ_TYPE_PROBE); 61 desc->irq_data.chip->irq_set_type(&desc->irq_data,
62 desc->chip->startup(i); 62 IRQ_TYPE_PROBE);
63 desc->irq_data.chip->irq_startup(&desc->irq_data);
63 } 64 }
64 raw_spin_unlock_irq(&desc->lock); 65 raw_spin_unlock_irq(&desc->lock);
65 } 66 }
@@ -76,7 +77,7 @@ unsigned long probe_irq_on(void)
76 raw_spin_lock_irq(&desc->lock); 77 raw_spin_lock_irq(&desc->lock);
77 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 78 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
78 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 79 desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
79 if (desc->chip->startup(i)) 80 if (desc->irq_data.chip->irq_startup(&desc->irq_data))
80 desc->status |= IRQ_PENDING; 81 desc->status |= IRQ_PENDING;
81 } 82 }
82 raw_spin_unlock_irq(&desc->lock); 83 raw_spin_unlock_irq(&desc->lock);
@@ -98,7 +99,7 @@ unsigned long probe_irq_on(void)
98 /* It triggered already - consider it spurious. */ 99 /* It triggered already - consider it spurious. */
99 if (!(status & IRQ_WAITING)) { 100 if (!(status & IRQ_WAITING)) {
100 desc->status = status & ~IRQ_AUTODETECT; 101 desc->status = status & ~IRQ_AUTODETECT;
101 desc->chip->shutdown(i); 102 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
102 } else 103 } else
103 if (i < 32) 104 if (i < 32)
104 mask |= 1 << i; 105 mask |= 1 << i;
@@ -137,7 +138,7 @@ unsigned int probe_irq_mask(unsigned long val)
137 mask |= 1 << i; 138 mask |= 1 << i;
138 139
139 desc->status = status & ~IRQ_AUTODETECT; 140 desc->status = status & ~IRQ_AUTODETECT;
140 desc->chip->shutdown(i); 141 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
141 } 142 }
142 raw_spin_unlock_irq(&desc->lock); 143 raw_spin_unlock_irq(&desc->lock);
143 } 144 }
@@ -181,7 +182,7 @@ int probe_irq_off(unsigned long val)
181 nr_of_irqs++; 182 nr_of_irqs++;
182 } 183 }
183 desc->status = status & ~IRQ_AUTODETECT; 184 desc->status = status & ~IRQ_AUTODETECT;
184 desc->chip->shutdown(i); 185 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
185 } 186 }
186 raw_spin_unlock_irq(&desc->lock); 187 raw_spin_unlock_irq(&desc->lock);
187 } 188 }
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index b7091d5ca2f8..baa5c4acad83 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -18,108 +18,6 @@
18 18
19#include "internals.h" 19#include "internals.h"
20 20
21static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
22{
23 struct irq_desc *desc;
24 unsigned long flags;
25
26 desc = irq_to_desc(irq);
27 if (!desc) {
28 WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
29 return;
30 }
31
32 /* Ensure we don't have left over values from a previous use of this irq */
33 raw_spin_lock_irqsave(&desc->lock, flags);
34 desc->status = IRQ_DISABLED;
35 desc->chip = &no_irq_chip;
36 desc->handle_irq = handle_bad_irq;
37 desc->depth = 1;
38 desc->msi_desc = NULL;
39 desc->handler_data = NULL;
40 if (!keep_chip_data)
41 desc->chip_data = NULL;
42 desc->action = NULL;
43 desc->irq_count = 0;
44 desc->irqs_unhandled = 0;
45#ifdef CONFIG_SMP
46 cpumask_setall(desc->affinity);
47#ifdef CONFIG_GENERIC_PENDING_IRQ
48 cpumask_clear(desc->pending_mask);
49#endif
50#endif
51 raw_spin_unlock_irqrestore(&desc->lock, flags);
52}
53
54/**
55 * dynamic_irq_init - initialize a dynamically allocated irq
56 * @irq: irq number to initialize
57 */
58void dynamic_irq_init(unsigned int irq)
59{
60 dynamic_irq_init_x(irq, false);
61}
62
63/**
64 * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
65 * @irq: irq number to initialize
66 *
67 * does not set irq_to_desc(irq)->chip_data to NULL
68 */
69void dynamic_irq_init_keep_chip_data(unsigned int irq)
70{
71 dynamic_irq_init_x(irq, true);
72}
73
74static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
75{
76 struct irq_desc *desc = irq_to_desc(irq);
77 unsigned long flags;
78
79 if (!desc) {
80 WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
81 return;
82 }
83
84 raw_spin_lock_irqsave(&desc->lock, flags);
85 if (desc->action) {
86 raw_spin_unlock_irqrestore(&desc->lock, flags);
87 WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
88 irq);
89 return;
90 }
91 desc->msi_desc = NULL;
92 desc->handler_data = NULL;
93 if (!keep_chip_data)
94 desc->chip_data = NULL;
95 desc->handle_irq = handle_bad_irq;
96 desc->chip = &no_irq_chip;
97 desc->name = NULL;
98 clear_kstat_irqs(desc);
99 raw_spin_unlock_irqrestore(&desc->lock, flags);
100}
101
102/**
103 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
104 * @irq: irq number to initialize
105 */
106void dynamic_irq_cleanup(unsigned int irq)
107{
108 dynamic_irq_cleanup_x(irq, false);
109}
110
111/**
112 * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
113 * @irq: irq number to initialize
114 *
115 * does not set irq_to_desc(irq)->chip_data to NULL
116 */
117void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
118{
119 dynamic_irq_cleanup_x(irq, true);
120}
121
122
123/** 21/**
124 * set_irq_chip - set the irq chip for an irq 22 * set_irq_chip - set the irq chip for an irq
125 * @irq: irq number 23 * @irq: irq number
@@ -140,7 +38,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
140 38
141 raw_spin_lock_irqsave(&desc->lock, flags); 39 raw_spin_lock_irqsave(&desc->lock, flags);
142 irq_chip_set_defaults(chip); 40 irq_chip_set_defaults(chip);
143 desc->chip = chip; 41 desc->irq_data.chip = chip;
144 raw_spin_unlock_irqrestore(&desc->lock, flags); 42 raw_spin_unlock_irqrestore(&desc->lock, flags);
145 43
146 return 0; 44 return 0;
@@ -193,7 +91,7 @@ int set_irq_data(unsigned int irq, void *data)
193 } 91 }
194 92
195 raw_spin_lock_irqsave(&desc->lock, flags); 93 raw_spin_lock_irqsave(&desc->lock, flags);
196 desc->handler_data = data; 94 desc->irq_data.handler_data = data;
197 raw_spin_unlock_irqrestore(&desc->lock, flags); 95 raw_spin_unlock_irqrestore(&desc->lock, flags);
198 return 0; 96 return 0;
199} 97}
@@ -218,7 +116,7 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
218 } 116 }
219 117
220 raw_spin_lock_irqsave(&desc->lock, flags); 118 raw_spin_lock_irqsave(&desc->lock, flags);
221 desc->msi_desc = entry; 119 desc->irq_data.msi_desc = entry;
222 if (entry) 120 if (entry)
223 entry->irq = irq; 121 entry->irq = irq;
224 raw_spin_unlock_irqrestore(&desc->lock, flags); 122 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -243,19 +141,27 @@ int set_irq_chip_data(unsigned int irq, void *data)
243 return -EINVAL; 141 return -EINVAL;
244 } 142 }
245 143
246 if (!desc->chip) { 144 if (!desc->irq_data.chip) {
247 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); 145 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
248 return -EINVAL; 146 return -EINVAL;
249 } 147 }
250 148
251 raw_spin_lock_irqsave(&desc->lock, flags); 149 raw_spin_lock_irqsave(&desc->lock, flags);
252 desc->chip_data = data; 150 desc->irq_data.chip_data = data;
253 raw_spin_unlock_irqrestore(&desc->lock, flags); 151 raw_spin_unlock_irqrestore(&desc->lock, flags);
254 152
255 return 0; 153 return 0;
256} 154}
257EXPORT_SYMBOL(set_irq_chip_data); 155EXPORT_SYMBOL(set_irq_chip_data);
258 156
157struct irq_data *irq_get_irq_data(unsigned int irq)
158{
159 struct irq_desc *desc = irq_to_desc(irq);
160
161 return desc ? &desc->irq_data : NULL;
162}
163EXPORT_SYMBOL_GPL(irq_get_irq_data);
164
259/** 165/**
260 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq 166 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
261 * 167 *
@@ -287,93 +193,216 @@ EXPORT_SYMBOL_GPL(set_irq_nested_thread);
287/* 193/*
288 * default enable function 194 * default enable function
289 */ 195 */
290static void default_enable(unsigned int irq) 196static void default_enable(struct irq_data *data)
291{ 197{
292 struct irq_desc *desc = irq_to_desc(irq); 198 struct irq_desc *desc = irq_data_to_desc(data);
293 199
294 desc->chip->unmask(irq); 200 desc->irq_data.chip->irq_unmask(&desc->irq_data);
295 desc->status &= ~IRQ_MASKED; 201 desc->status &= ~IRQ_MASKED;
296} 202}
297 203
298/* 204/*
299 * default disable function 205 * default disable function
300 */ 206 */
301static void default_disable(unsigned int irq) 207static void default_disable(struct irq_data *data)
302{ 208{
303} 209}
304 210
305/* 211/*
306 * default startup function 212 * default startup function
307 */ 213 */
308static unsigned int default_startup(unsigned int irq) 214static unsigned int default_startup(struct irq_data *data)
309{ 215{
310 struct irq_desc *desc = irq_to_desc(irq); 216 struct irq_desc *desc = irq_data_to_desc(data);
311 217
312 desc->chip->enable(irq); 218 desc->irq_data.chip->irq_enable(data);
313 return 0; 219 return 0;
314} 220}
315 221
316/* 222/*
317 * default shutdown function 223 * default shutdown function
318 */ 224 */
319static void default_shutdown(unsigned int irq) 225static void default_shutdown(struct irq_data *data)
320{ 226{
321 struct irq_desc *desc = irq_to_desc(irq); 227 struct irq_desc *desc = irq_data_to_desc(data);
322 228
323 desc->chip->mask(irq); 229 desc->irq_data.chip->irq_mask(&desc->irq_data);
324 desc->status |= IRQ_MASKED; 230 desc->status |= IRQ_MASKED;
325} 231}
326 232
233#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
234/* Temporary migration helpers */
235static void compat_irq_mask(struct irq_data *data)
236{
237 data->chip->mask(data->irq);
238}
239
240static void compat_irq_unmask(struct irq_data *data)
241{
242 data->chip->unmask(data->irq);
243}
244
245static void compat_irq_ack(struct irq_data *data)
246{
247 data->chip->ack(data->irq);
248}
249
250static void compat_irq_mask_ack(struct irq_data *data)
251{
252 data->chip->mask_ack(data->irq);
253}
254
255static void compat_irq_eoi(struct irq_data *data)
256{
257 data->chip->eoi(data->irq);
258}
259
260static void compat_irq_enable(struct irq_data *data)
261{
262 data->chip->enable(data->irq);
263}
264
265static void compat_irq_disable(struct irq_data *data)
266{
267 data->chip->disable(data->irq);
268}
269
270static void compat_irq_shutdown(struct irq_data *data)
271{
272 data->chip->shutdown(data->irq);
273}
274
275static unsigned int compat_irq_startup(struct irq_data *data)
276{
277 return data->chip->startup(data->irq);
278}
279
280static int compat_irq_set_affinity(struct irq_data *data,
281 const struct cpumask *dest, bool force)
282{
283 return data->chip->set_affinity(data->irq, dest);
284}
285
286static int compat_irq_set_type(struct irq_data *data, unsigned int type)
287{
288 return data->chip->set_type(data->irq, type);
289}
290
291static int compat_irq_set_wake(struct irq_data *data, unsigned int on)
292{
293 return data->chip->set_wake(data->irq, on);
294}
295
296static int compat_irq_retrigger(struct irq_data *data)
297{
298 return data->chip->retrigger(data->irq);
299}
300
301static void compat_bus_lock(struct irq_data *data)
302{
303 data->chip->bus_lock(data->irq);
304}
305
306static void compat_bus_sync_unlock(struct irq_data *data)
307{
308 data->chip->bus_sync_unlock(data->irq);
309}
310#endif
311
327/* 312/*
328 * Fixup enable/disable function pointers 313 * Fixup enable/disable function pointers
329 */ 314 */
330void irq_chip_set_defaults(struct irq_chip *chip) 315void irq_chip_set_defaults(struct irq_chip *chip)
331{ 316{
332 if (!chip->enable) 317#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
333 chip->enable = default_enable;
334 if (!chip->disable)
335 chip->disable = default_disable;
336 if (!chip->startup)
337 chip->startup = default_startup;
338 /* 318 /*
339 * We use chip->disable, when the user provided its own. When 319 * Compat fixup functions need to be before we set the
340 * we have default_disable set for chip->disable, then we need 320 * defaults for enable/disable/startup/shutdown
321 */
322 if (chip->enable)
323 chip->irq_enable = compat_irq_enable;
324 if (chip->disable)
325 chip->irq_disable = compat_irq_disable;
326 if (chip->shutdown)
327 chip->irq_shutdown = compat_irq_shutdown;
328 if (chip->startup)
329 chip->irq_startup = compat_irq_startup;
330#endif
331 /*
332 * The real defaults
333 */
334 if (!chip->irq_enable)
335 chip->irq_enable = default_enable;
336 if (!chip->irq_disable)
337 chip->irq_disable = default_disable;
338 if (!chip->irq_startup)
339 chip->irq_startup = default_startup;
340 /*
341 * We use chip->irq_disable, when the user provided its own. When
342 * we have default_disable set for chip->irq_disable, then we need
341 * to use default_shutdown, otherwise the irq line is not 343 * to use default_shutdown, otherwise the irq line is not
342 * disabled on free_irq(): 344 * disabled on free_irq():
343 */ 345 */
344 if (!chip->shutdown) 346 if (!chip->irq_shutdown)
345 chip->shutdown = chip->disable != default_disable ? 347 chip->irq_shutdown = chip->irq_disable != default_disable ?
346 chip->disable : default_shutdown; 348 chip->irq_disable : default_shutdown;
347 if (!chip->name) 349
348 chip->name = chip->typename; 350#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
349 if (!chip->end) 351 if (!chip->end)
350 chip->end = dummy_irq_chip.end; 352 chip->end = dummy_irq_chip.end;
353
354 /*
355 * Now fix up the remaining compat handlers
356 */
357 if (chip->bus_lock)
358 chip->irq_bus_lock = compat_bus_lock;
359 if (chip->bus_sync_unlock)
360 chip->irq_bus_sync_unlock = compat_bus_sync_unlock;
361 if (chip->mask)
362 chip->irq_mask = compat_irq_mask;
363 if (chip->unmask)
364 chip->irq_unmask = compat_irq_unmask;
365 if (chip->ack)
366 chip->irq_ack = compat_irq_ack;
367 if (chip->mask_ack)
368 chip->irq_mask_ack = compat_irq_mask_ack;
369 if (chip->eoi)
370 chip->irq_eoi = compat_irq_eoi;
371 if (chip->set_affinity)
372 chip->irq_set_affinity = compat_irq_set_affinity;
373 if (chip->set_type)
374 chip->irq_set_type = compat_irq_set_type;
375 if (chip->set_wake)
376 chip->irq_set_wake = compat_irq_set_wake;
377 if (chip->retrigger)
378 chip->irq_retrigger = compat_irq_retrigger;
379#endif
351} 380}
352 381
353static inline void mask_ack_irq(struct irq_desc *desc, int irq) 382static inline void mask_ack_irq(struct irq_desc *desc)
354{ 383{
355 if (desc->chip->mask_ack) 384 if (desc->irq_data.chip->irq_mask_ack)
356 desc->chip->mask_ack(irq); 385 desc->irq_data.chip->irq_mask_ack(&desc->irq_data);
357 else { 386 else {
358 desc->chip->mask(irq); 387 desc->irq_data.chip->irq_mask(&desc->irq_data);
359 if (desc->chip->ack) 388 if (desc->irq_data.chip->irq_ack)
360 desc->chip->ack(irq); 389 desc->irq_data.chip->irq_ack(&desc->irq_data);
361 } 390 }
362 desc->status |= IRQ_MASKED; 391 desc->status |= IRQ_MASKED;
363} 392}
364 393
365static inline void mask_irq(struct irq_desc *desc, int irq) 394static inline void mask_irq(struct irq_desc *desc)
366{ 395{
367 if (desc->chip->mask) { 396 if (desc->irq_data.chip->irq_mask) {
368 desc->chip->mask(irq); 397 desc->irq_data.chip->irq_mask(&desc->irq_data);
369 desc->status |= IRQ_MASKED; 398 desc->status |= IRQ_MASKED;
370 } 399 }
371} 400}
372 401
373static inline void unmask_irq(struct irq_desc *desc, int irq) 402static inline void unmask_irq(struct irq_desc *desc)
374{ 403{
375 if (desc->chip->unmask) { 404 if (desc->irq_data.chip->irq_unmask) {
376 desc->chip->unmask(irq); 405 desc->irq_data.chip->irq_unmask(&desc->irq_data);
377 desc->status &= ~IRQ_MASKED; 406 desc->status &= ~IRQ_MASKED;
378 } 407 }
379} 408}
@@ -476,7 +505,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
476 irqreturn_t action_ret; 505 irqreturn_t action_ret;
477 506
478 raw_spin_lock(&desc->lock); 507 raw_spin_lock(&desc->lock);
479 mask_ack_irq(desc, irq); 508 mask_ack_irq(desc);
480 509
481 if (unlikely(desc->status & IRQ_INPROGRESS)) 510 if (unlikely(desc->status & IRQ_INPROGRESS))
482 goto out_unlock; 511 goto out_unlock;
@@ -502,7 +531,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
502 desc->status &= ~IRQ_INPROGRESS; 531 desc->status &= ~IRQ_INPROGRESS;
503 532
504 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) 533 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
505 unmask_irq(desc, irq); 534 unmask_irq(desc);
506out_unlock: 535out_unlock:
507 raw_spin_unlock(&desc->lock); 536 raw_spin_unlock(&desc->lock);
508} 537}
@@ -539,7 +568,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
539 action = desc->action; 568 action = desc->action;
540 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 569 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
541 desc->status |= IRQ_PENDING; 570 desc->status |= IRQ_PENDING;
542 mask_irq(desc, irq); 571 mask_irq(desc);
543 goto out; 572 goto out;
544 } 573 }
545 574
@@ -554,7 +583,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
554 raw_spin_lock(&desc->lock); 583 raw_spin_lock(&desc->lock);
555 desc->status &= ~IRQ_INPROGRESS; 584 desc->status &= ~IRQ_INPROGRESS;
556out: 585out:
557 desc->chip->eoi(irq); 586 desc->irq_data.chip->irq_eoi(&desc->irq_data);
558 587
559 raw_spin_unlock(&desc->lock); 588 raw_spin_unlock(&desc->lock);
560} 589}
@@ -590,14 +619,13 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
590 if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || 619 if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
591 !desc->action)) { 620 !desc->action)) {
592 desc->status |= (IRQ_PENDING | IRQ_MASKED); 621 desc->status |= (IRQ_PENDING | IRQ_MASKED);
593 mask_ack_irq(desc, irq); 622 mask_ack_irq(desc);
594 goto out_unlock; 623 goto out_unlock;
595 } 624 }
596 kstat_incr_irqs_this_cpu(irq, desc); 625 kstat_incr_irqs_this_cpu(irq, desc);
597 626
598 /* Start handling the irq */ 627 /* Start handling the irq */
599 if (desc->chip->ack) 628 desc->irq_data.chip->irq_ack(&desc->irq_data);
600 desc->chip->ack(irq);
601 629
602 /* Mark the IRQ currently in progress.*/ 630 /* Mark the IRQ currently in progress.*/
603 desc->status |= IRQ_INPROGRESS; 631 desc->status |= IRQ_INPROGRESS;
@@ -607,7 +635,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
607 irqreturn_t action_ret; 635 irqreturn_t action_ret;
608 636
609 if (unlikely(!action)) { 637 if (unlikely(!action)) {
610 mask_irq(desc, irq); 638 mask_irq(desc);
611 goto out_unlock; 639 goto out_unlock;
612 } 640 }
613 641
@@ -619,7 +647,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
619 if (unlikely((desc->status & 647 if (unlikely((desc->status &
620 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == 648 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
621 (IRQ_PENDING | IRQ_MASKED))) { 649 (IRQ_PENDING | IRQ_MASKED))) {
622 unmask_irq(desc, irq); 650 unmask_irq(desc);
623 } 651 }
624 652
625 desc->status &= ~IRQ_PENDING; 653 desc->status &= ~IRQ_PENDING;
@@ -650,15 +678,15 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
650 678
651 kstat_incr_irqs_this_cpu(irq, desc); 679 kstat_incr_irqs_this_cpu(irq, desc);
652 680
653 if (desc->chip->ack) 681 if (desc->irq_data.chip->irq_ack)
654 desc->chip->ack(irq); 682 desc->irq_data.chip->irq_ack(&desc->irq_data);
655 683
656 action_ret = handle_IRQ_event(irq, desc->action); 684 action_ret = handle_IRQ_event(irq, desc->action);
657 if (!noirqdebug) 685 if (!noirqdebug)
658 note_interrupt(irq, desc, action_ret); 686 note_interrupt(irq, desc, action_ret);
659 687
660 if (desc->chip->eoi) 688 if (desc->irq_data.chip->irq_eoi)
661 desc->chip->eoi(irq); 689 desc->irq_data.chip->irq_eoi(&desc->irq_data);
662} 690}
663 691
664void 692void
@@ -676,7 +704,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
676 704
677 if (!handle) 705 if (!handle)
678 handle = handle_bad_irq; 706 handle = handle_bad_irq;
679 else if (desc->chip == &no_irq_chip) { 707 else if (desc->irq_data.chip == &no_irq_chip) {
680 printk(KERN_WARNING "Trying to install %sinterrupt handler " 708 printk(KERN_WARNING "Trying to install %sinterrupt handler "
681 "for IRQ%d\n", is_chained ? "chained " : "", irq); 709 "for IRQ%d\n", is_chained ? "chained " : "", irq);
682 /* 710 /*
@@ -686,16 +714,16 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
686 * prevent us to setup the interrupt at all. Switch it to 714 * prevent us to setup the interrupt at all. Switch it to
687 * dummy_irq_chip for easy transition. 715 * dummy_irq_chip for easy transition.
688 */ 716 */
689 desc->chip = &dummy_irq_chip; 717 desc->irq_data.chip = &dummy_irq_chip;
690 } 718 }
691 719
692 chip_bus_lock(irq, desc); 720 chip_bus_lock(desc);
693 raw_spin_lock_irqsave(&desc->lock, flags); 721 raw_spin_lock_irqsave(&desc->lock, flags);
694 722
695 /* Uninstall? */ 723 /* Uninstall? */
696 if (handle == handle_bad_irq) { 724 if (handle == handle_bad_irq) {
697 if (desc->chip != &no_irq_chip) 725 if (desc->irq_data.chip != &no_irq_chip)
698 mask_ack_irq(desc, irq); 726 mask_ack_irq(desc);
699 desc->status |= IRQ_DISABLED; 727 desc->status |= IRQ_DISABLED;
700 desc->depth = 1; 728 desc->depth = 1;
701 } 729 }
@@ -706,10 +734,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
706 desc->status &= ~IRQ_DISABLED; 734 desc->status &= ~IRQ_DISABLED;
707 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; 735 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
708 desc->depth = 0; 736 desc->depth = 0;
709 desc->chip->startup(irq); 737 desc->irq_data.chip->irq_startup(&desc->irq_data);
710 } 738 }
711 raw_spin_unlock_irqrestore(&desc->lock, flags); 739 raw_spin_unlock_irqrestore(&desc->lock, flags);
712 chip_bus_sync_unlock(irq, desc); 740 chip_bus_sync_unlock(desc);
713} 741}
714EXPORT_SYMBOL_GPL(__set_irq_handler); 742EXPORT_SYMBOL_GPL(__set_irq_handler);
715 743
@@ -729,32 +757,20 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
729 __set_irq_handler(irq, handle, 0, name); 757 __set_irq_handler(irq, handle, 0, name);
730} 758}
731 759
732void set_irq_noprobe(unsigned int irq) 760void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
733{ 761{
734 struct irq_desc *desc = irq_to_desc(irq); 762 struct irq_desc *desc = irq_to_desc(irq);
735 unsigned long flags; 763 unsigned long flags;
736 764
737 if (!desc) { 765 if (!desc)
738 printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq);
739 return; 766 return;
740 }
741
742 raw_spin_lock_irqsave(&desc->lock, flags);
743 desc->status |= IRQ_NOPROBE;
744 raw_spin_unlock_irqrestore(&desc->lock, flags);
745}
746
747void set_irq_probe(unsigned int irq)
748{
749 struct irq_desc *desc = irq_to_desc(irq);
750 unsigned long flags;
751 767
752 if (!desc) { 768 /* Sanitize flags */
753 printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); 769 set &= IRQF_MODIFY_MASK;
754 return; 770 clr &= IRQF_MODIFY_MASK;
755 }
756 771
757 raw_spin_lock_irqsave(&desc->lock, flags); 772 raw_spin_lock_irqsave(&desc->lock, flags);
758 desc->status &= ~IRQ_NOPROBE; 773 desc->status &= ~clr;
774 desc->status |= set;
759 raw_spin_unlock_irqrestore(&desc->lock, flags); 775 raw_spin_unlock_irqrestore(&desc->lock, flags);
760} 776}
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
new file mode 100644
index 000000000000..20dc5474947e
--- /dev/null
+++ b/kernel/irq/dummychip.c
@@ -0,0 +1,68 @@
1/*
2 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
3 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
4 *
5 * This file contains the dummy interrupt chip implementation
6 */
7#include <linux/interrupt.h>
8#include <linux/irq.h>
9
10#include "internals.h"
11
12/*
13 * What should we do if we get a hw irq event on an illegal vector?
14 * Each architecture has to answer this themself.
15 */
16static void ack_bad(struct irq_data *data)
17{
18 struct irq_desc *desc = irq_data_to_desc(data);
19
20 print_irq_desc(data->irq, desc);
21 ack_bad_irq(data->irq);
22}
23
24/*
25 * NOP functions
26 */
27static void noop(struct irq_data *data) { }
28
29static unsigned int noop_ret(struct irq_data *data)
30{
31 return 0;
32}
33
34#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
35static void compat_noop(unsigned int irq) { }
36#define END_INIT .end = compat_noop
37#else
38#define END_INIT
39#endif
40
41/*
42 * Generic no controller implementation
43 */
44struct irq_chip no_irq_chip = {
45 .name = "none",
46 .irq_startup = noop_ret,
47 .irq_shutdown = noop,
48 .irq_enable = noop,
49 .irq_disable = noop,
50 .irq_ack = ack_bad,
51 END_INIT
52};
53
54/*
55 * Generic dummy implementation which can be used for
56 * real dumb interrupt sources
57 */
58struct irq_chip dummy_irq_chip = {
59 .name = "dummy",
60 .irq_startup = noop_ret,
61 .irq_shutdown = noop,
62 .irq_enable = noop,
63 .irq_disable = noop,
64 .irq_ack = noop,
65 .irq_mask = noop,
66 .irq_unmask = noop,
67 END_INIT
68};
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 27e5c6911223..e2347eb63306 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,24 +11,15 @@
11 */ 11 */
12 12
13#include <linux/irq.h> 13#include <linux/irq.h>
14#include <linux/sched.h>
15#include <linux/slab.h>
16#include <linux/module.h>
17#include <linux/random.h> 14#include <linux/random.h>
15#include <linux/sched.h>
18#include <linux/interrupt.h> 16#include <linux/interrupt.h>
19#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
20#include <linux/rculist.h> 18
21#include <linux/hash.h>
22#include <linux/radix-tree.h>
23#include <trace/events/irq.h> 19#include <trace/events/irq.h>
24 20
25#include "internals.h" 21#include "internals.h"
26 22
27/*
28 * lockdep: we want to handle all irq_desc locks as a single lock-class:
29 */
30struct lock_class_key irq_desc_lock_class;
31
32/** 23/**
33 * handle_bad_irq - handle spurious and unhandled irqs 24 * handle_bad_irq - handle spurious and unhandled irqs
34 * @irq: the interrupt number 25 * @irq: the interrupt number
@@ -43,304 +34,6 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
43 ack_bad_irq(irq); 34 ack_bad_irq(irq);
44} 35}
45 36
46#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
47static void __init init_irq_default_affinity(void)
48{
49 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
50 cpumask_setall(irq_default_affinity);
51}
52#else
53static void __init init_irq_default_affinity(void)
54{
55}
56#endif
57
58/*
59 * Linux has a controller-independent interrupt architecture.
60 * Every controller has a 'controller-template', that is used
61 * by the main code to do the right thing. Each driver-visible
62 * interrupt source is transparently wired to the appropriate
63 * controller. Thus drivers need not be aware of the
64 * interrupt-controller.
65 *
66 * The code is designed to be easily extended with new/different
67 * interrupt controllers, without having to do assembly magic or
68 * having to touch the generic code.
69 *
70 * Controller mappings for all interrupt sources:
71 */
72int nr_irqs = NR_IRQS;
73EXPORT_SYMBOL_GPL(nr_irqs);
74
75#ifdef CONFIG_SPARSE_IRQ
76
77static struct irq_desc irq_desc_init = {
78 .irq = -1,
79 .status = IRQ_DISABLED,
80 .chip = &no_irq_chip,
81 .handle_irq = handle_bad_irq,
82 .depth = 1,
83 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
84};
85
86void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
87{
88 void *ptr;
89
90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
91 GFP_ATOMIC, node);
92
93 /*
94 * don't overwite if can not get new one
95 * init_copy_kstat_irqs() could still use old one
96 */
97 if (ptr) {
98 printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node);
99 desc->kstat_irqs = ptr;
100 }
101}
102
103static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
104{
105 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
106
107 raw_spin_lock_init(&desc->lock);
108 desc->irq = irq;
109#ifdef CONFIG_SMP
110 desc->node = node;
111#endif
112 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
113 init_kstat_irqs(desc, node, nr_cpu_ids);
114 if (!desc->kstat_irqs) {
115 printk(KERN_ERR "can not alloc kstat_irqs\n");
116 BUG_ON(1);
117 }
118 if (!alloc_desc_masks(desc, node, false)) {
119 printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
120 BUG_ON(1);
121 }
122 init_desc_masks(desc);
123 arch_init_chip_data(desc, node);
124}
125
126/*
127 * Protect the sparse_irqs:
128 */
129DEFINE_RAW_SPINLOCK(sparse_irq_lock);
130
131static RADIX_TREE(irq_desc_tree, GFP_ATOMIC);
132
133static void set_irq_desc(unsigned int irq, struct irq_desc *desc)
134{
135 radix_tree_insert(&irq_desc_tree, irq, desc);
136}
137
138struct irq_desc *irq_to_desc(unsigned int irq)
139{
140 return radix_tree_lookup(&irq_desc_tree, irq);
141}
142
143void replace_irq_desc(unsigned int irq, struct irq_desc *desc)
144{
145 void **ptr;
146
147 ptr = radix_tree_lookup_slot(&irq_desc_tree, irq);
148 if (ptr)
149 radix_tree_replace_slot(ptr, desc);
150}
151
152static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
153 [0 ... NR_IRQS_LEGACY-1] = {
154 .irq = -1,
155 .status = IRQ_DISABLED,
156 .chip = &no_irq_chip,
157 .handle_irq = handle_bad_irq,
158 .depth = 1,
159 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
160 }
161};
162
163static unsigned int *kstat_irqs_legacy;
164
165int __init early_irq_init(void)
166{
167 struct irq_desc *desc;
168 int legacy_count;
169 int node;
170 int i;
171
172 init_irq_default_affinity();
173
174 /* initialize nr_irqs based on nr_cpu_ids */
175 arch_probe_nr_irqs();
176 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
177
178 desc = irq_desc_legacy;
179 legacy_count = ARRAY_SIZE(irq_desc_legacy);
180 node = first_online_node;
181
182 /* allocate based on nr_cpu_ids */
183 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
184 sizeof(int), GFP_NOWAIT, node);
185
186 for (i = 0; i < legacy_count; i++) {
187 desc[i].irq = i;
188#ifdef CONFIG_SMP
189 desc[i].node = node;
190#endif
191 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
192 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
193 alloc_desc_masks(&desc[i], node, true);
194 init_desc_masks(&desc[i]);
195 set_irq_desc(i, &desc[i]);
196 }
197
198 return arch_early_irq_init();
199}
200
201struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
202{
203 struct irq_desc *desc;
204 unsigned long flags;
205
206 if (irq >= nr_irqs) {
207 WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
208 irq, nr_irqs);
209 return NULL;
210 }
211
212 desc = irq_to_desc(irq);
213 if (desc)
214 return desc;
215
216 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
217
218 /* We have to check it to avoid races with another CPU */
219 desc = irq_to_desc(irq);
220 if (desc)
221 goto out_unlock;
222
223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
224
225 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
226 if (!desc) {
227 printk(KERN_ERR "can not alloc irq_desc\n");
228 BUG_ON(1);
229 }
230 init_one_irq_desc(irq, desc, node);
231
232 set_irq_desc(irq, desc);
233
234out_unlock:
235 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
236
237 return desc;
238}
239
240#else /* !CONFIG_SPARSE_IRQ */
241
242struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
243 [0 ... NR_IRQS-1] = {
244 .status = IRQ_DISABLED,
245 .chip = &no_irq_chip,
246 .handle_irq = handle_bad_irq,
247 .depth = 1,
248 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
249 }
250};
251
252static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
253int __init early_irq_init(void)
254{
255 struct irq_desc *desc;
256 int count;
257 int i;
258
259 init_irq_default_affinity();
260
261 printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
262
263 desc = irq_desc;
264 count = ARRAY_SIZE(irq_desc);
265
266 for (i = 0; i < count; i++) {
267 desc[i].irq = i;
268 alloc_desc_masks(&desc[i], 0, true);
269 init_desc_masks(&desc[i]);
270 desc[i].kstat_irqs = kstat_irqs_all[i];
271 }
272 return arch_early_irq_init();
273}
274
275struct irq_desc *irq_to_desc(unsigned int irq)
276{
277 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
278}
279
280struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
281{
282 return irq_to_desc(irq);
283}
284#endif /* !CONFIG_SPARSE_IRQ */
285
286void clear_kstat_irqs(struct irq_desc *desc)
287{
288 memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
289}
290
291/*
292 * What should we do if we get a hw irq event on an illegal vector?
293 * Each architecture has to answer this themself.
294 */
295static void ack_bad(unsigned int irq)
296{
297 struct irq_desc *desc = irq_to_desc(irq);
298
299 print_irq_desc(irq, desc);
300 ack_bad_irq(irq);
301}
302
303/*
304 * NOP functions
305 */
306static void noop(unsigned int irq)
307{
308}
309
310static unsigned int noop_ret(unsigned int irq)
311{
312 return 0;
313}
314
315/*
316 * Generic no controller implementation
317 */
318struct irq_chip no_irq_chip = {
319 .name = "none",
320 .startup = noop_ret,
321 .shutdown = noop,
322 .enable = noop,
323 .disable = noop,
324 .ack = ack_bad,
325 .end = noop,
326};
327
328/*
329 * Generic dummy implementation which can be used for
330 * real dumb interrupt sources
331 */
332struct irq_chip dummy_irq_chip = {
333 .name = "dummy",
334 .startup = noop_ret,
335 .shutdown = noop,
336 .enable = noop,
337 .disable = noop,
338 .ack = noop,
339 .mask = noop,
340 .unmask = noop,
341 .end = noop,
342};
343
344/* 37/*
345 * Special, empty irq handler: 38 * Special, empty irq handler:
346 */ 39 */
@@ -457,20 +150,20 @@ unsigned int __do_IRQ(unsigned int irq)
457 /* 150 /*
458 * No locking required for CPU-local interrupts: 151 * No locking required for CPU-local interrupts:
459 */ 152 */
460 if (desc->chip->ack) 153 if (desc->irq_data.chip->ack)
461 desc->chip->ack(irq); 154 desc->irq_data.chip->ack(irq);
462 if (likely(!(desc->status & IRQ_DISABLED))) { 155 if (likely(!(desc->status & IRQ_DISABLED))) {
463 action_ret = handle_IRQ_event(irq, desc->action); 156 action_ret = handle_IRQ_event(irq, desc->action);
464 if (!noirqdebug) 157 if (!noirqdebug)
465 note_interrupt(irq, desc, action_ret); 158 note_interrupt(irq, desc, action_ret);
466 } 159 }
467 desc->chip->end(irq); 160 desc->irq_data.chip->end(irq);
468 return 1; 161 return 1;
469 } 162 }
470 163
471 raw_spin_lock(&desc->lock); 164 raw_spin_lock(&desc->lock);
472 if (desc->chip->ack) 165 if (desc->irq_data.chip->ack)
473 desc->chip->ack(irq); 166 desc->irq_data.chip->ack(irq);
474 /* 167 /*
475 * REPLAY is when Linux resends an IRQ that was dropped earlier 168 * REPLAY is when Linux resends an IRQ that was dropped earlier
476 * WAITING is used by probe to mark irqs that are being tested 169 * WAITING is used by probe to mark irqs that are being tested
@@ -530,27 +223,9 @@ out:
530 * The ->end() handler has to deal with interrupts which got 223 * The ->end() handler has to deal with interrupts which got
531 * disabled while the handler was running. 224 * disabled while the handler was running.
532 */ 225 */
533 desc->chip->end(irq); 226 desc->irq_data.chip->end(irq);
534 raw_spin_unlock(&desc->lock); 227 raw_spin_unlock(&desc->lock);
535 228
536 return 1; 229 return 1;
537} 230}
538#endif 231#endif
539
540void early_init_irq_lock_class(void)
541{
542 struct irq_desc *desc;
543 int i;
544
545 for_each_irq_desc(i, desc) {
546 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
547 }
548}
549
550unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
551{
552 struct irq_desc *desc = irq_to_desc(irq);
553 return desc ? desc->kstat_irqs[cpu] : 0;
554}
555EXPORT_SYMBOL(kstat_irqs_cpu);
556
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index c63f3bc88f0b..4571ae7e085a 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -1,9 +1,12 @@
1/* 1/*
2 * IRQ subsystem internal functions and variables: 2 * IRQ subsystem internal functions and variables:
3 */ 3 */
4#include <linux/irqdesc.h>
4 5
5extern int noirqdebug; 6extern int noirqdebug;
6 7
8#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
9
7/* Set default functions for irq_chip structures: */ 10/* Set default functions for irq_chip structures: */
8extern void irq_chip_set_defaults(struct irq_chip *chip); 11extern void irq_chip_set_defaults(struct irq_chip *chip);
9 12
@@ -15,21 +18,19 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
15extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); 18extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
16extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 19extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
17 20
18extern struct lock_class_key irq_desc_lock_class;
19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 21extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
20extern void clear_kstat_irqs(struct irq_desc *desc);
21extern raw_spinlock_t sparse_irq_lock;
22 22
23#ifdef CONFIG_SPARSE_IRQ 23/* Resending of interrupts :*/
24void replace_irq_desc(unsigned int irq, struct irq_desc *desc); 24void check_irq_resend(struct irq_desc *desc, unsigned int irq);
25#endif
26 25
27#ifdef CONFIG_PROC_FS 26#ifdef CONFIG_PROC_FS
28extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); 27extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
28extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc);
29extern void register_handler_proc(unsigned int irq, struct irqaction *action); 29extern void register_handler_proc(unsigned int irq, struct irqaction *action);
30extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); 30extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
31#else 31#else
32static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } 32static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { }
33static inline void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { }
33static inline void register_handler_proc(unsigned int irq, 34static inline void register_handler_proc(unsigned int irq,
34 struct irqaction *action) { } 35 struct irqaction *action) { }
35static inline void unregister_handler_proc(unsigned int irq, 36static inline void unregister_handler_proc(unsigned int irq,
@@ -40,17 +41,27 @@ extern int irq_select_affinity_usr(unsigned int irq);
40 41
41extern void irq_set_thread_affinity(struct irq_desc *desc); 42extern void irq_set_thread_affinity(struct irq_desc *desc);
42 43
44#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
45static inline void irq_end(unsigned int irq, struct irq_desc *desc)
46{
47 if (desc->irq_data.chip && desc->irq_data.chip->end)
48 desc->irq_data.chip->end(irq);
49}
50#else
/* CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED: irq_end() compiles to nothing. */
static inline void irq_end(unsigned int irq, struct irq_desc *desc)
{
}
52#endif
53
43/* Inline functions for support of irq chips on slow busses */ 54/* Inline functions for support of irq chips on slow busses */
44static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc) 55static inline void chip_bus_lock(struct irq_desc *desc)
45{ 56{
46 if (unlikely(desc->chip->bus_lock)) 57 if (unlikely(desc->irq_data.chip->irq_bus_lock))
47 desc->chip->bus_lock(irq); 58 desc->irq_data.chip->irq_bus_lock(&desc->irq_data);
48} 59}
49 60
50static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc) 61static inline void chip_bus_sync_unlock(struct irq_desc *desc)
51{ 62{
52 if (unlikely(desc->chip->bus_sync_unlock)) 63 if (unlikely(desc->irq_data.chip->irq_bus_sync_unlock))
53 desc->chip->bus_sync_unlock(irq); 64 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
54} 65}
55 66
56/* 67/*
@@ -67,8 +78,8 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
67 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); 78 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
68 printk("->handle_irq(): %p, ", desc->handle_irq); 79 printk("->handle_irq(): %p, ", desc->handle_irq);
69 print_symbol("%s\n", (unsigned long)desc->handle_irq); 80 print_symbol("%s\n", (unsigned long)desc->handle_irq);
70 printk("->chip(): %p, ", desc->chip); 81 printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
71 print_symbol("%s\n", (unsigned long)desc->chip); 82 print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
72 printk("->action(): %p\n", desc->action); 83 printk("->action(): %p\n", desc->action);
73 if (desc->action) { 84 if (desc->action) {
74 printk("->action->handler(): %p, ", desc->action->handler); 85 printk("->action->handler(): %p, ", desc->action->handler);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
new file mode 100644
index 000000000000..9d917ff72675
--- /dev/null
+++ b/kernel/irq/irqdesc.c
@@ -0,0 +1,395 @@
1/*
2 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
3 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
4 *
5 * This file contains the interrupt descriptor management code
6 *
7 * Detailed information is available in Documentation/DocBook/genericirq
8 *
9 */
10#include <linux/irq.h>
11#include <linux/slab.h>
12#include <linux/module.h>
13#include <linux/interrupt.h>
14#include <linux/kernel_stat.h>
15#include <linux/radix-tree.h>
16#include <linux/bitmap.h>
17
18#include "internals.h"
19
20/*
21 * lockdep: we want to handle all irq_desc locks as a single lock-class:
22 */
23static struct lock_class_key irq_desc_lock_class;
24
25#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
26static void __init init_irq_default_affinity(void)
27{
28 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
29 cpumask_setall(irq_default_affinity);
30}
31#else
32static void __init init_irq_default_affinity(void)
33{
34}
35#endif
36
37#ifdef CONFIG_SMP
38static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
39{
40 if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node))
41 return -ENOMEM;
42
43#ifdef CONFIG_GENERIC_PENDING_IRQ
44 if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
45 free_cpumask_var(desc->irq_data.affinity);
46 return -ENOMEM;
47 }
48#endif
49 return 0;
50}
51
52static void desc_smp_init(struct irq_desc *desc, int node)
53{
54 desc->irq_data.node = node;
55 cpumask_copy(desc->irq_data.affinity, irq_default_affinity);
56#ifdef CONFIG_GENERIC_PENDING_IRQ
57 cpumask_clear(desc->pending_mask);
58#endif
59}
60
61static inline int desc_node(struct irq_desc *desc)
62{
63 return desc->irq_data.node;
64}
65
66#else
67static inline int
68alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
69static inline void desc_smp_init(struct irq_desc *desc, int node) { }
70static inline int desc_node(struct irq_desc *desc) { return 0; }
71#endif
72
73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
74{
75 desc->irq_data.irq = irq;
76 desc->irq_data.chip = &no_irq_chip;
77 desc->irq_data.chip_data = NULL;
78 desc->irq_data.handler_data = NULL;
79 desc->irq_data.msi_desc = NULL;
80 desc->status = IRQ_DEFAULT_INIT_FLAGS;
81 desc->handle_irq = handle_bad_irq;
82 desc->depth = 1;
83 desc->irq_count = 0;
84 desc->irqs_unhandled = 0;
85 desc->name = NULL;
86 memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
87 desc_smp_init(desc, node);
88}
89
90int nr_irqs = NR_IRQS;
91EXPORT_SYMBOL_GPL(nr_irqs);
92
93static DEFINE_MUTEX(sparse_irq_lock);
94static DECLARE_BITMAP(allocated_irqs, NR_IRQS);
95
96#ifdef CONFIG_SPARSE_IRQ
97
98static RADIX_TREE(irq_desc_tree, GFP_KERNEL);
99
100static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
101{
102 radix_tree_insert(&irq_desc_tree, irq, desc);
103}
104
105struct irq_desc *irq_to_desc(unsigned int irq)
106{
107 return radix_tree_lookup(&irq_desc_tree, irq);
108}
109
110static void delete_irq_desc(unsigned int irq)
111{
112 radix_tree_delete(&irq_desc_tree, irq);
113}
114
115#ifdef CONFIG_SMP
116static void free_masks(struct irq_desc *desc)
117{
118#ifdef CONFIG_GENERIC_PENDING_IRQ
119 free_cpumask_var(desc->pending_mask);
120#endif
121 free_cpumask_var(desc->irq_data.affinity);
122}
123#else
/* !CONFIG_SMP: there are no masks to release. */
static inline void free_masks(struct irq_desc *desc)
{
}
125#endif
126
127static struct irq_desc *alloc_desc(int irq, int node)
128{
129 struct irq_desc *desc;
130 gfp_t gfp = GFP_KERNEL;
131
132 desc = kzalloc_node(sizeof(*desc), gfp, node);
133 if (!desc)
134 return NULL;
135 /* allocate based on nr_cpu_ids */
136 desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs),
137 gfp, node);
138 if (!desc->kstat_irqs)
139 goto err_desc;
140
141 if (alloc_masks(desc, gfp, node))
142 goto err_kstat;
143
144 raw_spin_lock_init(&desc->lock);
145 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
146
147 desc_set_defaults(irq, desc, node);
148
149 return desc;
150
151err_kstat:
152 kfree(desc->kstat_irqs);
153err_desc:
154 kfree(desc);
155 return NULL;
156}
157
158static void free_desc(unsigned int irq)
159{
160 struct irq_desc *desc = irq_to_desc(irq);
161
162 unregister_irq_proc(irq, desc);
163
164 mutex_lock(&sparse_irq_lock);
165 delete_irq_desc(irq);
166 mutex_unlock(&sparse_irq_lock);
167
168 free_masks(desc);
169 kfree(desc->kstat_irqs);
170 kfree(desc);
171}
172
173static int alloc_descs(unsigned int start, unsigned int cnt, int node)
174{
175 struct irq_desc *desc;
176 int i;
177
178 for (i = 0; i < cnt; i++) {
179 desc = alloc_desc(start + i, node);
180 if (!desc)
181 goto err;
182 mutex_lock(&sparse_irq_lock);
183 irq_insert_desc(start + i, desc);
184 mutex_unlock(&sparse_irq_lock);
185 }
186 return start;
187
188err:
189 for (i--; i >= 0; i--)
190 free_desc(start + i);
191
192 mutex_lock(&sparse_irq_lock);
193 bitmap_clear(allocated_irqs, start, cnt);
194 mutex_unlock(&sparse_irq_lock);
195 return -ENOMEM;
196}
197
198struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
199{
200 int res = irq_alloc_descs(irq, irq, 1, node);
201
202 if (res == -EEXIST || res == irq)
203 return irq_to_desc(irq);
204 return NULL;
205}
206
207int __init early_irq_init(void)
208{
209 int i, initcnt, node = first_online_node;
210 struct irq_desc *desc;
211
212 init_irq_default_affinity();
213
214 /* Let arch update nr_irqs and return the nr of preallocated irqs */
215 initcnt = arch_probe_nr_irqs();
216 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt);
217
218 for (i = 0; i < initcnt; i++) {
219 desc = alloc_desc(i, node);
220 set_bit(i, allocated_irqs);
221 irq_insert_desc(i, desc);
222 }
223 return arch_early_irq_init();
224}
225
226#else /* !CONFIG_SPARSE_IRQ */
227
228struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
229 [0 ... NR_IRQS-1] = {
230 .status = IRQ_DEFAULT_INIT_FLAGS,
231 .handle_irq = handle_bad_irq,
232 .depth = 1,
233 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
234 }
235};
236
237static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
238int __init early_irq_init(void)
239{
240 int count, i, node = first_online_node;
241 struct irq_desc *desc;
242
243 init_irq_default_affinity();
244
245 printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
246
247 desc = irq_desc;
248 count = ARRAY_SIZE(irq_desc);
249
250 for (i = 0; i < count; i++) {
251 desc[i].irq_data.irq = i;
252 desc[i].irq_data.chip = &no_irq_chip;
253 desc[i].kstat_irqs = kstat_irqs_all[i];
254 alloc_masks(desc + i, GFP_KERNEL, node);
255 desc_smp_init(desc + i, node);
256 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
257 }
258 return arch_early_irq_init();
259}
260
261struct irq_desc *irq_to_desc(unsigned int irq)
262{
263 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
264}
265
/* Static table: nothing to allocate, @node is unused; plain lookup. */
struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
{
	struct irq_desc *desc = irq_to_desc(irq);

	return desc;
}
270
/* Static table: "freeing" resets the descriptor via dynamic_irq_cleanup(). */
static void
free_desc(unsigned int irq)
{
	dynamic_irq_cleanup(irq);
}
275
/* Static table: descriptors already exist, so just report @start. */
static inline int
alloc_descs(unsigned int start, unsigned int cnt, int node)
{
	return start;
}
280#endif /* !CONFIG_SPARSE_IRQ */
281
282/* Dynamic interrupt handling */
283
284/**
285 * irq_free_descs - free irq descriptors
286 * @from: Start of descriptor range
287 * @cnt: Number of consecutive irqs to free
288 */
289void irq_free_descs(unsigned int from, unsigned int cnt)
290{
291 int i;
292
293 if (from >= nr_irqs || (from + cnt) > nr_irqs)
294 return;
295
296 for (i = 0; i < cnt; i++)
297 free_desc(from + i);
298
299 mutex_lock(&sparse_irq_lock);
300 bitmap_clear(allocated_irqs, from, cnt);
301 mutex_unlock(&sparse_irq_lock);
302}
303
304/**
305 * irq_alloc_descs - allocate and initialize a range of irq descriptors
306 * @irq: Allocate for specific irq number if irq >= 0
307 * @from: Start the search from this irq number
308 * @cnt: Number of consecutive irqs to allocate.
309 * @node: Preferred node on which the irq descriptor should be allocated
310 *
311 * Returns the first irq number or error code
312 */
313int __ref
314irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
315{
316 int start, ret;
317
318 if (!cnt)
319 return -EINVAL;
320
321 mutex_lock(&sparse_irq_lock);
322
323 start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0);
324 ret = -EEXIST;
325 if (irq >=0 && start != irq)
326 goto err;
327
328 ret = -ENOMEM;
329 if (start >= nr_irqs)
330 goto err;
331
332 bitmap_set(allocated_irqs, start, cnt);
333 mutex_unlock(&sparse_irq_lock);
334 return alloc_descs(start, cnt, node);
335
336err:
337 mutex_unlock(&sparse_irq_lock);
338 return ret;
339}
340
341/**
342 * irq_reserve_irqs - mark irqs allocated
343 * @from: mark from irq number
344 * @cnt: number of irqs to mark
345 *
346 * Returns 0 on success or an appropriate error code
347 */
348int irq_reserve_irqs(unsigned int from, unsigned int cnt)
349{
350 unsigned int start;
351 int ret = 0;
352
353 if (!cnt || (from + cnt) > nr_irqs)
354 return -EINVAL;
355
356 mutex_lock(&sparse_irq_lock);
357 start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0);
358 if (start == from)
359 bitmap_set(allocated_irqs, start, cnt);
360 else
361 ret = -EEXIST;
362 mutex_unlock(&sparse_irq_lock);
363 return ret;
364}
365
366/**
367 * irq_get_next_irq - get next allocated irq number
368 * @offset: where to start the search
369 *
370 * Returns next irq number after offset or nr_irqs if none is found.
371 */
372unsigned int irq_get_next_irq(unsigned int offset)
373{
374 return find_next_bit(allocated_irqs, nr_irqs, offset);
375}
376
377/**
378 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
379 * @irq: irq number to initialize
380 */
381void dynamic_irq_cleanup(unsigned int irq)
382{
383 struct irq_desc *desc = irq_to_desc(irq);
384 unsigned long flags;
385
386 raw_spin_lock_irqsave(&desc->lock, flags);
387 desc_set_defaults(irq, desc, desc_node(desc));
388 raw_spin_unlock_irqrestore(&desc->lock, flags);
389}
390
391unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
392{
393 struct irq_desc *desc = irq_to_desc(irq);
394 return desc ? desc->kstat_irqs[cpu] : 0;
395}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c3003e9d91a3..644e8d5fa367 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq)
73{ 73{
74 struct irq_desc *desc = irq_to_desc(irq); 74 struct irq_desc *desc = irq_to_desc(irq);
75 75
76 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || 76 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip ||
77 !desc->chip->set_affinity) 77 !desc->irq_data.chip->irq_set_affinity)
78 return 0; 78 return 0;
79 79
80 return 1; 80 return 1;
@@ -109,17 +109,18 @@ void irq_set_thread_affinity(struct irq_desc *desc)
109int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) 109int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
110{ 110{
111 struct irq_desc *desc = irq_to_desc(irq); 111 struct irq_desc *desc = irq_to_desc(irq);
112 struct irq_chip *chip = desc->irq_data.chip;
112 unsigned long flags; 113 unsigned long flags;
113 114
114 if (!desc->chip->set_affinity) 115 if (!chip->irq_set_affinity)
115 return -EINVAL; 116 return -EINVAL;
116 117
117 raw_spin_lock_irqsave(&desc->lock, flags); 118 raw_spin_lock_irqsave(&desc->lock, flags);
118 119
119#ifdef CONFIG_GENERIC_PENDING_IRQ 120#ifdef CONFIG_GENERIC_PENDING_IRQ
120 if (desc->status & IRQ_MOVE_PCNTXT) { 121 if (desc->status & IRQ_MOVE_PCNTXT) {
121 if (!desc->chip->set_affinity(irq, cpumask)) { 122 if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
122 cpumask_copy(desc->affinity, cpumask); 123 cpumask_copy(desc->irq_data.affinity, cpumask);
123 irq_set_thread_affinity(desc); 124 irq_set_thread_affinity(desc);
124 } 125 }
125 } 126 }
@@ -128,8 +129,8 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
128 cpumask_copy(desc->pending_mask, cpumask); 129 cpumask_copy(desc->pending_mask, cpumask);
129 } 130 }
130#else 131#else
131 if (!desc->chip->set_affinity(irq, cpumask)) { 132 if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
132 cpumask_copy(desc->affinity, cpumask); 133 cpumask_copy(desc->irq_data.affinity, cpumask);
133 irq_set_thread_affinity(desc); 134 irq_set_thread_affinity(desc);
134 } 135 }
135#endif 136#endif
@@ -168,16 +169,16 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
168 * one of the targets is online. 169 * one of the targets is online.
169 */ 170 */
170 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { 171 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
171 if (cpumask_any_and(desc->affinity, cpu_online_mask) 172 if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask)
172 < nr_cpu_ids) 173 < nr_cpu_ids)
173 goto set_affinity; 174 goto set_affinity;
174 else 175 else
175 desc->status &= ~IRQ_AFFINITY_SET; 176 desc->status &= ~IRQ_AFFINITY_SET;
176 } 177 }
177 178
178 cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity); 179 cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity);
179set_affinity: 180set_affinity:
180 desc->chip->set_affinity(irq, desc->affinity); 181 desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false);
181 182
182 return 0; 183 return 0;
183} 184}
@@ -223,7 +224,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
223 224
224 if (!desc->depth++) { 225 if (!desc->depth++) {
225 desc->status |= IRQ_DISABLED; 226 desc->status |= IRQ_DISABLED;
226 desc->chip->disable(irq); 227 desc->irq_data.chip->irq_disable(&desc->irq_data);
227 } 228 }
228} 229}
229 230
@@ -246,11 +247,11 @@ void disable_irq_nosync(unsigned int irq)
246 if (!desc) 247 if (!desc)
247 return; 248 return;
248 249
249 chip_bus_lock(irq, desc); 250 chip_bus_lock(desc);
250 raw_spin_lock_irqsave(&desc->lock, flags); 251 raw_spin_lock_irqsave(&desc->lock, flags);
251 __disable_irq(desc, irq, false); 252 __disable_irq(desc, irq, false);
252 raw_spin_unlock_irqrestore(&desc->lock, flags); 253 raw_spin_unlock_irqrestore(&desc->lock, flags);
253 chip_bus_sync_unlock(irq, desc); 254 chip_bus_sync_unlock(desc);
254} 255}
255EXPORT_SYMBOL(disable_irq_nosync); 256EXPORT_SYMBOL(disable_irq_nosync);
256 257
@@ -313,7 +314,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
313 * IRQ line is re-enabled. 314 * IRQ line is re-enabled.
314 * 315 *
315 * This function may be called from IRQ context only when 316 * This function may be called from IRQ context only when
316 * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! 317 * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
317 */ 318 */
318void enable_irq(unsigned int irq) 319void enable_irq(unsigned int irq)
319{ 320{
@@ -323,11 +324,11 @@ void enable_irq(unsigned int irq)
323 if (!desc) 324 if (!desc)
324 return; 325 return;
325 326
326 chip_bus_lock(irq, desc); 327 chip_bus_lock(desc);
327 raw_spin_lock_irqsave(&desc->lock, flags); 328 raw_spin_lock_irqsave(&desc->lock, flags);
328 __enable_irq(desc, irq, false); 329 __enable_irq(desc, irq, false);
329 raw_spin_unlock_irqrestore(&desc->lock, flags); 330 raw_spin_unlock_irqrestore(&desc->lock, flags);
330 chip_bus_sync_unlock(irq, desc); 331 chip_bus_sync_unlock(desc);
331} 332}
332EXPORT_SYMBOL(enable_irq); 333EXPORT_SYMBOL(enable_irq);
333 334
@@ -336,8 +337,8 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
336 struct irq_desc *desc = irq_to_desc(irq); 337 struct irq_desc *desc = irq_to_desc(irq);
337 int ret = -ENXIO; 338 int ret = -ENXIO;
338 339
339 if (desc->chip->set_wake) 340 if (desc->irq_data.chip->irq_set_wake)
340 ret = desc->chip->set_wake(irq, on); 341 ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on);
341 342
342 return ret; 343 return ret;
343} 344}
@@ -429,12 +430,12 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc)
429} 430}
430 431
431int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 432int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
432 unsigned long flags) 433 unsigned long flags)
433{ 434{
434 int ret; 435 int ret;
435 struct irq_chip *chip = desc->chip; 436 struct irq_chip *chip = desc->irq_data.chip;
436 437
437 if (!chip || !chip->set_type) { 438 if (!chip || !chip->irq_set_type) {
438 /* 439 /*
439 * IRQF_TRIGGER_* but the PIC does not support multiple 440 * IRQF_TRIGGER_* but the PIC does not support multiple
440 * flow-types? 441 * flow-types?
@@ -445,11 +446,11 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
445 } 446 }
446 447
447 /* caller masked out all except trigger mode flags */ 448 /* caller masked out all except trigger mode flags */
448 ret = chip->set_type(irq, flags); 449 ret = chip->irq_set_type(&desc->irq_data, flags);
449 450
450 if (ret) 451 if (ret)
451 pr_err("setting trigger mode %d for irq %u failed (%pF)\n", 452 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
452 (int)flags, irq, chip->set_type); 453 flags, irq, chip->irq_set_type);
453 else { 454 else {
454 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) 455 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
455 flags |= IRQ_LEVEL; 456 flags |= IRQ_LEVEL;
@@ -457,8 +458,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
457 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); 458 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
458 desc->status |= flags; 459 desc->status |= flags;
459 460
460 if (chip != desc->chip) 461 if (chip != desc->irq_data.chip)
461 irq_chip_set_defaults(desc->chip); 462 irq_chip_set_defaults(desc->irq_data.chip);
462 } 463 }
463 464
464 return ret; 465 return ret;
@@ -507,7 +508,7 @@ static int irq_wait_for_interrupt(struct irqaction *action)
507static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 508static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
508{ 509{
509again: 510again:
510 chip_bus_lock(irq, desc); 511 chip_bus_lock(desc);
511 raw_spin_lock_irq(&desc->lock); 512 raw_spin_lock_irq(&desc->lock);
512 513
513 /* 514 /*
@@ -521,17 +522,17 @@ again:
521 */ 522 */
522 if (unlikely(desc->status & IRQ_INPROGRESS)) { 523 if (unlikely(desc->status & IRQ_INPROGRESS)) {
523 raw_spin_unlock_irq(&desc->lock); 524 raw_spin_unlock_irq(&desc->lock);
524 chip_bus_sync_unlock(irq, desc); 525 chip_bus_sync_unlock(desc);
525 cpu_relax(); 526 cpu_relax();
526 goto again; 527 goto again;
527 } 528 }
528 529
529 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 530 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
530 desc->status &= ~IRQ_MASKED; 531 desc->status &= ~IRQ_MASKED;
531 desc->chip->unmask(irq); 532 desc->irq_data.chip->irq_unmask(&desc->irq_data);
532 } 533 }
533 raw_spin_unlock_irq(&desc->lock); 534 raw_spin_unlock_irq(&desc->lock);
534 chip_bus_sync_unlock(irq, desc); 535 chip_bus_sync_unlock(desc);
535} 536}
536 537
537#ifdef CONFIG_SMP 538#ifdef CONFIG_SMP
@@ -556,7 +557,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
556 } 557 }
557 558
558 raw_spin_lock_irq(&desc->lock); 559 raw_spin_lock_irq(&desc->lock);
559 cpumask_copy(mask, desc->affinity); 560 cpumask_copy(mask, desc->irq_data.affinity);
560 raw_spin_unlock_irq(&desc->lock); 561 raw_spin_unlock_irq(&desc->lock);
561 562
562 set_cpus_allowed_ptr(current, mask); 563 set_cpus_allowed_ptr(current, mask);
@@ -657,7 +658,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
657 if (!desc) 658 if (!desc)
658 return -EINVAL; 659 return -EINVAL;
659 660
660 if (desc->chip == &no_irq_chip) 661 if (desc->irq_data.chip == &no_irq_chip)
661 return -ENOSYS; 662 return -ENOSYS;
662 /* 663 /*
663 * Some drivers like serial.c use request_irq() heavily, 664 * Some drivers like serial.c use request_irq() heavily,
@@ -752,7 +753,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
752 } 753 }
753 754
754 if (!shared) { 755 if (!shared) {
755 irq_chip_set_defaults(desc->chip); 756 irq_chip_set_defaults(desc->irq_data.chip);
756 757
757 init_waitqueue_head(&desc->wait_for_threads); 758 init_waitqueue_head(&desc->wait_for_threads);
758 759
@@ -779,7 +780,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
779 if (!(desc->status & IRQ_NOAUTOEN)) { 780 if (!(desc->status & IRQ_NOAUTOEN)) {
780 desc->depth = 0; 781 desc->depth = 0;
781 desc->status &= ~IRQ_DISABLED; 782 desc->status &= ~IRQ_DISABLED;
782 desc->chip->startup(irq); 783 desc->irq_data.chip->irq_startup(&desc->irq_data);
783 } else 784 } else
784 /* Undo nested disables: */ 785 /* Undo nested disables: */
785 desc->depth = 1; 786 desc->depth = 1;
@@ -912,17 +913,17 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
912 913
913 /* Currently used only by UML, might disappear one day: */ 914 /* Currently used only by UML, might disappear one day: */
914#ifdef CONFIG_IRQ_RELEASE_METHOD 915#ifdef CONFIG_IRQ_RELEASE_METHOD
915 if (desc->chip->release) 916 if (desc->irq_data.chip->release)
916 desc->chip->release(irq, dev_id); 917 desc->irq_data.chip->release(irq, dev_id);
917#endif 918#endif
918 919
919 /* If this was the last handler, shut down the IRQ line: */ 920 /* If this was the last handler, shut down the IRQ line: */
920 if (!desc->action) { 921 if (!desc->action) {
921 desc->status |= IRQ_DISABLED; 922 desc->status |= IRQ_DISABLED;
922 if (desc->chip->shutdown) 923 if (desc->irq_data.chip->irq_shutdown)
923 desc->chip->shutdown(irq); 924 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
924 else 925 else
925 desc->chip->disable(irq); 926 desc->irq_data.chip->irq_disable(&desc->irq_data);
926 } 927 }
927 928
928#ifdef CONFIG_SMP 929#ifdef CONFIG_SMP
@@ -997,9 +998,9 @@ void free_irq(unsigned int irq, void *dev_id)
997 if (!desc) 998 if (!desc)
998 return; 999 return;
999 1000
1000 chip_bus_lock(irq, desc); 1001 chip_bus_lock(desc);
1001 kfree(__free_irq(irq, dev_id)); 1002 kfree(__free_irq(irq, dev_id));
1002 chip_bus_sync_unlock(irq, desc); 1003 chip_bus_sync_unlock(desc);
1003} 1004}
1004EXPORT_SYMBOL(free_irq); 1005EXPORT_SYMBOL(free_irq);
1005 1006
@@ -1086,9 +1087,9 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1086 action->name = devname; 1087 action->name = devname;
1087 action->dev_id = dev_id; 1088 action->dev_id = dev_id;
1088 1089
1089 chip_bus_lock(irq, desc); 1090 chip_bus_lock(desc);
1090 retval = __setup_irq(irq, desc, action); 1091 retval = __setup_irq(irq, desc, action);
1091 chip_bus_sync_unlock(irq, desc); 1092 chip_bus_sync_unlock(desc);
1092 1093
1093 if (retval) 1094 if (retval)
1094 kfree(action); 1095 kfree(action);
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 241962280836..1d2541940480 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -7,6 +7,7 @@
7void move_masked_irq(int irq) 7void move_masked_irq(int irq)
8{ 8{
9 struct irq_desc *desc = irq_to_desc(irq); 9 struct irq_desc *desc = irq_to_desc(irq);
10 struct irq_chip *chip = desc->irq_data.chip;
10 11
11 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 12 if (likely(!(desc->status & IRQ_MOVE_PENDING)))
12 return; 13 return;
@@ -24,7 +25,7 @@ void move_masked_irq(int irq)
24 if (unlikely(cpumask_empty(desc->pending_mask))) 25 if (unlikely(cpumask_empty(desc->pending_mask)))
25 return; 26 return;
26 27
27 if (!desc->chip->set_affinity) 28 if (!chip->irq_set_affinity)
28 return; 29 return;
29 30
30 assert_raw_spin_locked(&desc->lock); 31 assert_raw_spin_locked(&desc->lock);
@@ -43,8 +44,9 @@ void move_masked_irq(int irq)
43 */ 44 */
44 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) 45 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
45 < nr_cpu_ids)) 46 < nr_cpu_ids))
46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) { 47 if (!chip->irq_set_affinity(&desc->irq_data,
47 cpumask_copy(desc->affinity, desc->pending_mask); 48 desc->pending_mask, false)) {
49 cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
48 irq_set_thread_affinity(desc); 50 irq_set_thread_affinity(desc);
49 } 51 }
50 52
@@ -61,8 +63,8 @@ void move_native_irq(int irq)
61 if (unlikely(desc->status & IRQ_DISABLED)) 63 if (unlikely(desc->status & IRQ_DISABLED))
62 return; 64 return;
63 65
64 desc->chip->mask(irq); 66 desc->irq_data.chip->irq_mask(&desc->irq_data);
65 move_masked_irq(irq); 67 move_masked_irq(irq);
66 desc->chip->unmask(irq); 68 desc->irq_data.chip->irq_unmask(&desc->irq_data);
67} 69}
68 70
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
deleted file mode 100644
index 65d3845665ac..000000000000
--- a/kernel/irq/numa_migrate.c
+++ /dev/null
@@ -1,120 +0,0 @@
1/*
2 * NUMA irq-desc migration code
3 *
4 * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to
5 * the new "home node" of the IRQ.
6 */
7
8#include <linux/irq.h>
9#include <linux/slab.h>
10#include <linux/module.h>
11#include <linux/random.h>
12#include <linux/interrupt.h>
13#include <linux/kernel_stat.h>
14
15#include "internals.h"
16
17static void init_copy_kstat_irqs(struct irq_desc *old_desc,
18 struct irq_desc *desc,
19 int node, int nr)
20{
21 init_kstat_irqs(desc, node, nr);
22
23 if (desc->kstat_irqs != old_desc->kstat_irqs)
24 memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
25 nr * sizeof(*desc->kstat_irqs));
26}
27
28static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
29{
30 if (old_desc->kstat_irqs == desc->kstat_irqs)
31 return;
32
33 kfree(old_desc->kstat_irqs);
34 old_desc->kstat_irqs = NULL;
35}
36
37static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
38 struct irq_desc *desc, int node)
39{
40 memcpy(desc, old_desc, sizeof(struct irq_desc));
41 if (!alloc_desc_masks(desc, node, false)) {
42 printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
43 "for migration.\n", irq);
44 return false;
45 }
46 raw_spin_lock_init(&desc->lock);
47 desc->node = node;
48 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
49 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
50 init_copy_desc_masks(old_desc, desc);
51 arch_init_copy_chip_data(old_desc, desc, node);
52 return true;
53}
54
55static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
56{
57 free_kstat_irqs(old_desc, desc);
58 free_desc_masks(old_desc, desc);
59 arch_free_chip_data(old_desc, desc);
60}
61
62static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
63 int node)
64{
65 struct irq_desc *desc;
66 unsigned int irq;
67 unsigned long flags;
68
69 irq = old_desc->irq;
70
71 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
72
73 /* We have to check it to avoid races with another CPU */
74 desc = irq_to_desc(irq);
75
76 if (desc && old_desc != desc)
77 goto out_unlock;
78
79 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
80 if (!desc) {
81 printk(KERN_ERR "irq %d: can not get new irq_desc "
82 "for migration.\n", irq);
83 /* still use old one */
84 desc = old_desc;
85 goto out_unlock;
86 }
87 if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {
88 /* still use old one */
89 kfree(desc);
90 desc = old_desc;
91 goto out_unlock;
92 }
93
94 replace_irq_desc(irq, desc);
95 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
96
97 /* free the old one */
98 free_one_irq_desc(old_desc, desc);
99 kfree(old_desc);
100
101 return desc;
102
103out_unlock:
104 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
105
106 return desc;
107}
108
109struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
110{
111 /* those static or target node is -1, do not move them */
112 if (desc->irq < NR_IRQS_LEGACY || node == -1)
113 return desc;
114
115 if (desc->node != node)
116 desc = __real_move_irq_desc(desc, node);
117
118 return desc;
119}
120
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 09a2ee540bd2..01b1d3a88983 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -21,7 +21,7 @@ static struct proc_dir_entry *root_irq_dir;
21static int irq_affinity_proc_show(struct seq_file *m, void *v) 21static int irq_affinity_proc_show(struct seq_file *m, void *v)
22{ 22{
23 struct irq_desc *desc = irq_to_desc((long)m->private); 23 struct irq_desc *desc = irq_to_desc((long)m->private);
24 const struct cpumask *mask = desc->affinity; 24 const struct cpumask *mask = desc->irq_data.affinity;
25 25
26#ifdef CONFIG_GENERIC_PENDING_IRQ 26#ifdef CONFIG_GENERIC_PENDING_IRQ
27 if (desc->status & IRQ_MOVE_PENDING) 27 if (desc->status & IRQ_MOVE_PENDING)
@@ -65,7 +65,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
65 cpumask_var_t new_value; 65 cpumask_var_t new_value;
66 int err; 66 int err;
67 67
68 if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || 68 if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity ||
69 irq_balancing_disabled(irq)) 69 irq_balancing_disabled(irq))
70 return -EIO; 70 return -EIO;
71 71
@@ -185,7 +185,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v)
185{ 185{
186 struct irq_desc *desc = irq_to_desc((long) m->private); 186 struct irq_desc *desc = irq_to_desc((long) m->private);
187 187
188 seq_printf(m, "%d\n", desc->node); 188 seq_printf(m, "%d\n", desc->irq_data.node);
189 return 0; 189 return 0;
190} 190}
191 191
@@ -269,7 +269,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
269{ 269{
270 char name [MAX_NAMELEN]; 270 char name [MAX_NAMELEN];
271 271
272 if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) 272 if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir)
273 return; 273 return;
274 274
275 memset(name, 0, MAX_NAMELEN); 275 memset(name, 0, MAX_NAMELEN);
@@ -297,6 +297,24 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
297 &irq_spurious_proc_fops, (void *)(long)irq); 297 &irq_spurious_proc_fops, (void *)(long)irq);
298} 298}
299 299
300void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
301{
302 char name [MAX_NAMELEN];
303
304 if (!root_irq_dir || !desc->dir)
305 return;
306#ifdef CONFIG_SMP
307 remove_proc_entry("smp_affinity", desc->dir);
308 remove_proc_entry("affinity_hint", desc->dir);
309 remove_proc_entry("node", desc->dir);
310#endif
311 remove_proc_entry("spurious", desc->dir);
312
313 memset(name, 0, MAX_NAMELEN);
314 sprintf(name, "%u", irq);
315 remove_proc_entry(name, root_irq_dir);
316}
317
300#undef MAX_NAMELEN 318#undef MAX_NAMELEN
301 319
302void unregister_handler_proc(unsigned int irq, struct irqaction *action) 320void unregister_handler_proc(unsigned int irq, struct irqaction *action)
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 090c3763f3a2..891115a929aa 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -60,7 +60,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
60 /* 60 /*
61 * Make sure the interrupt is enabled, before resending it: 61 * Make sure the interrupt is enabled, before resending it:
62 */ 62 */
63 desc->chip->enable(irq); 63 desc->irq_data.chip->irq_enable(&desc->irq_data);
64 64
65 /* 65 /*
66 * We do not resend level type interrupts. Level type 66 * We do not resend level type interrupts. Level type
@@ -70,7 +70,8 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; 71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
72 72
73 if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) { 73 if (!desc->irq_data.chip->irq_retrigger ||
74 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
74#ifdef CONFIG_HARDIRQS_SW_RESEND 75#ifdef CONFIG_HARDIRQS_SW_RESEND
75 /* Set it pending and activate the softirq: */ 76 /* Set it pending and activate the softirq: */
76 set_bit(irq, irqs_resend); 77 set_bit(irq, irqs_resend);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 89fb90ae534f..3089d3b9d5f3 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -14,6 +14,8 @@
14#include <linux/moduleparam.h> 14#include <linux/moduleparam.h>
15#include <linux/timer.h> 15#include <linux/timer.h>
16 16
17#include "internals.h"
18
17static int irqfixup __read_mostly; 19static int irqfixup __read_mostly;
18 20
19#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) 21#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
@@ -78,8 +80,8 @@ static int try_one_irq(int irq, struct irq_desc *desc)
78 * If we did actual work for the real IRQ line we must let the 80 * If we did actual work for the real IRQ line we must let the
79 * IRQ controller clean up too 81 * IRQ controller clean up too
80 */ 82 */
81 if (work && desc->chip && desc->chip->end) 83 if (work)
82 desc->chip->end(irq); 84 irq_end(irq, desc);
83 raw_spin_unlock(&desc->lock); 85 raw_spin_unlock(&desc->lock);
84 86
85 return ok; 87 return ok;
@@ -254,7 +256,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
254 printk(KERN_EMERG "Disabling IRQ #%d\n", irq); 256 printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
255 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; 257 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
256 desc->depth++; 258 desc->depth++;
257 desc->chip->disable(irq); 259 desc->irq_data.chip->irq_disable(&desc->irq_data);
258 260
259 mod_timer(&poll_spurious_irq_timer, 261 mod_timer(&poll_spurious_irq_timer,
260 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 262 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f2852a510232..42ba65dff7d9 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -639,6 +639,16 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
639 } 639 }
640#endif 640#endif
641 641
642 if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
643 debug_locks_off();
644 printk(KERN_ERR
645 "BUG: looking up invalid subclass: %u\n", subclass);
646 printk(KERN_ERR
647 "turning off the locking correctness validator.\n");
648 dump_stack();
649 return NULL;
650 }
651
642 /* 652 /*
643 * Static locks do not have their class-keys yet - for them the key 653 * Static locks do not have their class-keys yet - for them the key
644 * is the lock object itself: 654 * is the lock object itself:
@@ -774,7 +784,9 @@ out_unlock_set:
774 raw_local_irq_restore(flags); 784 raw_local_irq_restore(flags);
775 785
776 if (!subclass || force) 786 if (!subclass || force)
777 lock->class_cache = class; 787 lock->class_cache[0] = class;
788 else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
789 lock->class_cache[subclass] = class;
778 790
779 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) 791 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
780 return NULL; 792 return NULL;
@@ -2679,7 +2691,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2679void lockdep_init_map(struct lockdep_map *lock, const char *name, 2691void lockdep_init_map(struct lockdep_map *lock, const char *name,
2680 struct lock_class_key *key, int subclass) 2692 struct lock_class_key *key, int subclass)
2681{ 2693{
2682 lock->class_cache = NULL; 2694 int i;
2695
2696 for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
2697 lock->class_cache[i] = NULL;
2698
2683#ifdef CONFIG_LOCK_STAT 2699#ifdef CONFIG_LOCK_STAT
2684 lock->cpu = raw_smp_processor_id(); 2700 lock->cpu = raw_smp_processor_id();
2685#endif 2701#endif
@@ -2739,21 +2755,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2739 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2755 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2740 return 0; 2756 return 0;
2741 2757
2742 if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
2743 debug_locks_off();
2744 printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
2745 printk("turning off the locking correctness validator.\n");
2746 dump_stack();
2747 return 0;
2748 }
2749
2750 if (lock->key == &__lockdep_no_validate__) 2758 if (lock->key == &__lockdep_no_validate__)
2751 check = 1; 2759 check = 1;
2752 2760
2753 if (!subclass) 2761 if (subclass < NR_LOCKDEP_CACHING_CLASSES)
2754 class = lock->class_cache; 2762 class = lock->class_cache[subclass];
2755 /* 2763 /*
2756 * Not cached yet or subclass? 2764 * Not cached?
2757 */ 2765 */
2758 if (unlikely(!class)) { 2766 if (unlikely(!class)) {
2759 class = register_lock_class(lock, subclass, 0); 2767 class = register_lock_class(lock, subclass, 0);
@@ -2918,7 +2926,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
2918 return 1; 2926 return 1;
2919 2927
2920 if (hlock->references) { 2928 if (hlock->references) {
2921 struct lock_class *class = lock->class_cache; 2929 struct lock_class *class = lock->class_cache[0];
2922 2930
2923 if (!class) 2931 if (!class)
2924 class = look_up_lock_class(lock, 0); 2932 class = look_up_lock_class(lock, 0);
@@ -3559,7 +3567,12 @@ void lockdep_reset_lock(struct lockdep_map *lock)
3559 if (list_empty(head)) 3567 if (list_empty(head))
3560 continue; 3568 continue;
3561 list_for_each_entry_safe(class, next, head, hash_entry) { 3569 list_for_each_entry_safe(class, next, head, hash_entry) {
3562 if (unlikely(class == lock->class_cache)) { 3570 int match = 0;
3571
3572 for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
3573 match |= class == lock->class_cache[j];
3574
3575 if (unlikely(match)) {
3563 if (debug_locks_off_graph_unlock()) 3576 if (debug_locks_off_graph_unlock())
3564 WARN_ON(1); 3577 WARN_ON(1);
3565 goto out_restore; 3578 goto out_restore;
@@ -3775,7 +3788,7 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks);
3775 * Careful: only use this function if you are sure that 3788 * Careful: only use this function if you are sure that
3776 * the task cannot run in parallel! 3789 * the task cannot run in parallel!
3777 */ 3790 */
3778void __debug_show_held_locks(struct task_struct *task) 3791void debug_show_held_locks(struct task_struct *task)
3779{ 3792{
3780 if (unlikely(!debug_locks)) { 3793 if (unlikely(!debug_locks)) {
3781 printk("INFO: lockdep is turned off.\n"); 3794 printk("INFO: lockdep is turned off.\n");
@@ -3783,12 +3796,6 @@ void __debug_show_held_locks(struct task_struct *task)
3783 } 3796 }
3784 lockdep_print_held_locks(task); 3797 lockdep_print_held_locks(task);
3785} 3798}
3786EXPORT_SYMBOL_GPL(__debug_show_held_locks);
3787
3788void debug_show_held_locks(struct task_struct *task)
3789{
3790 __debug_show_held_locks(task);
3791}
3792EXPORT_SYMBOL_GPL(debug_show_held_locks); 3799EXPORT_SYMBOL_GPL(debug_show_held_locks);
3793 3800
3794void lockdep_sys_exit(void) 3801void lockdep_sys_exit(void)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 05ecf6f7c672..517d827f4982 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -417,8 +417,8 @@ event_filter_match(struct perf_event *event)
417 return event->cpu == -1 || event->cpu == smp_processor_id(); 417 return event->cpu == -1 || event->cpu == smp_processor_id();
418} 418}
419 419
420static int 420static void
421__event_sched_out(struct perf_event *event, 421event_sched_out(struct perf_event *event,
422 struct perf_cpu_context *cpuctx, 422 struct perf_cpu_context *cpuctx,
423 struct perf_event_context *ctx) 423 struct perf_event_context *ctx)
424{ 424{
@@ -437,13 +437,14 @@ __event_sched_out(struct perf_event *event,
437 } 437 }
438 438
439 if (event->state != PERF_EVENT_STATE_ACTIVE) 439 if (event->state != PERF_EVENT_STATE_ACTIVE)
440 return 0; 440 return;
441 441
442 event->state = PERF_EVENT_STATE_INACTIVE; 442 event->state = PERF_EVENT_STATE_INACTIVE;
443 if (event->pending_disable) { 443 if (event->pending_disable) {
444 event->pending_disable = 0; 444 event->pending_disable = 0;
445 event->state = PERF_EVENT_STATE_OFF; 445 event->state = PERF_EVENT_STATE_OFF;
446 } 446 }
447 event->tstamp_stopped = ctx->time;
447 event->pmu->del(event, 0); 448 event->pmu->del(event, 0);
448 event->oncpu = -1; 449 event->oncpu = -1;
449 450
@@ -452,19 +453,6 @@ __event_sched_out(struct perf_event *event,
452 ctx->nr_active--; 453 ctx->nr_active--;
453 if (event->attr.exclusive || !cpuctx->active_oncpu) 454 if (event->attr.exclusive || !cpuctx->active_oncpu)
454 cpuctx->exclusive = 0; 455 cpuctx->exclusive = 0;
455 return 1;
456}
457
458static void
459event_sched_out(struct perf_event *event,
460 struct perf_cpu_context *cpuctx,
461 struct perf_event_context *ctx)
462{
463 int ret;
464
465 ret = __event_sched_out(event, cpuctx, ctx);
466 if (ret)
467 event->tstamp_stopped = ctx->time;
468} 456}
469 457
470static void 458static void
@@ -664,7 +652,7 @@ retry:
664} 652}
665 653
666static int 654static int
667__event_sched_in(struct perf_event *event, 655event_sched_in(struct perf_event *event,
668 struct perf_cpu_context *cpuctx, 656 struct perf_cpu_context *cpuctx,
669 struct perf_event_context *ctx) 657 struct perf_event_context *ctx)
670{ 658{
@@ -684,6 +672,8 @@ __event_sched_in(struct perf_event *event,
684 return -EAGAIN; 672 return -EAGAIN;
685 } 673 }
686 674
675 event->tstamp_running += ctx->time - event->tstamp_stopped;
676
687 if (!is_software_event(event)) 677 if (!is_software_event(event))
688 cpuctx->active_oncpu++; 678 cpuctx->active_oncpu++;
689 ctx->nr_active++; 679 ctx->nr_active++;
@@ -694,35 +684,6 @@ __event_sched_in(struct perf_event *event,
694 return 0; 684 return 0;
695} 685}
696 686
697static inline int
698event_sched_in(struct perf_event *event,
699 struct perf_cpu_context *cpuctx,
700 struct perf_event_context *ctx)
701{
702 int ret = __event_sched_in(event, cpuctx, ctx);
703 if (ret)
704 return ret;
705 event->tstamp_running += ctx->time - event->tstamp_stopped;
706 return 0;
707}
708
709static void
710group_commit_event_sched_in(struct perf_event *group_event,
711 struct perf_cpu_context *cpuctx,
712 struct perf_event_context *ctx)
713{
714 struct perf_event *event;
715 u64 now = ctx->time;
716
717 group_event->tstamp_running += now - group_event->tstamp_stopped;
718 /*
719 * Schedule in siblings as one group (if any):
720 */
721 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
722 event->tstamp_running += now - event->tstamp_stopped;
723 }
724}
725
726static int 687static int
727group_sched_in(struct perf_event *group_event, 688group_sched_in(struct perf_event *group_event,
728 struct perf_cpu_context *cpuctx, 689 struct perf_cpu_context *cpuctx,
@@ -730,19 +691,15 @@ group_sched_in(struct perf_event *group_event,
730{ 691{
731 struct perf_event *event, *partial_group = NULL; 692 struct perf_event *event, *partial_group = NULL;
732 struct pmu *pmu = group_event->pmu; 693 struct pmu *pmu = group_event->pmu;
694 u64 now = ctx->time;
695 bool simulate = false;
733 696
734 if (group_event->state == PERF_EVENT_STATE_OFF) 697 if (group_event->state == PERF_EVENT_STATE_OFF)
735 return 0; 698 return 0;
736 699
737 pmu->start_txn(pmu); 700 pmu->start_txn(pmu);
738 701
739 /* 702 if (event_sched_in(group_event, cpuctx, ctx)) {
740 * use __event_sched_in() to delay updating tstamp_running
741 * until the transaction is committed. In case of failure
742 * we will keep an unmodified tstamp_running which is a
743 * requirement to get correct timing information
744 */
745 if (__event_sched_in(group_event, cpuctx, ctx)) {
746 pmu->cancel_txn(pmu); 703 pmu->cancel_txn(pmu);
747 return -EAGAIN; 704 return -EAGAIN;
748 } 705 }
@@ -751,31 +708,42 @@ group_sched_in(struct perf_event *group_event,
751 * Schedule in siblings as one group (if any): 708 * Schedule in siblings as one group (if any):
752 */ 709 */
753 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 710 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
754 if (__event_sched_in(event, cpuctx, ctx)) { 711 if (event_sched_in(event, cpuctx, ctx)) {
755 partial_group = event; 712 partial_group = event;
756 goto group_error; 713 goto group_error;
757 } 714 }
758 } 715 }
759 716
760 if (!pmu->commit_txn(pmu)) { 717 if (!pmu->commit_txn(pmu))
761 /* commit tstamp_running */
762 group_commit_event_sched_in(group_event, cpuctx, ctx);
763 return 0; 718 return 0;
764 } 719
765group_error: 720group_error:
766 /* 721 /*
767 * Groups can be scheduled in as one unit only, so undo any 722 * Groups can be scheduled in as one unit only, so undo any
768 * partial group before returning: 723 * partial group before returning:
724 * The events up to the failed event are scheduled out normally,
725 * tstamp_stopped will be updated.
769 * 726 *
770 * use __event_sched_out() to avoid updating tstamp_stopped 727 * The failed events and the remaining siblings need to have
771 * because the event never actually ran 728 * their timings updated as if they had gone thru event_sched_in()
729 * and event_sched_out(). This is required to get consistent timings
730 * across the group. This also takes care of the case where the group
731 * could never be scheduled by ensuring tstamp_stopped is set to mark
732 * the time the event was actually stopped, such that time delta
733 * calculation in update_event_times() is correct.
772 */ 734 */
773 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 735 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
774 if (event == partial_group) 736 if (event == partial_group)
775 break; 737 simulate = true;
776 __event_sched_out(event, cpuctx, ctx); 738
739 if (simulate) {
740 event->tstamp_running += now - event->tstamp_stopped;
741 event->tstamp_stopped = now;
742 } else {
743 event_sched_out(event, cpuctx, ctx);
744 }
777 } 745 }
778 __event_sched_out(group_event, cpuctx, ctx); 746 event_sched_out(group_event, cpuctx, ctx);
779 747
780 pmu->cancel_txn(pmu); 748 pmu->cancel_txn(pmu);
781 749
@@ -2509,15 +2477,13 @@ static void perf_event_for_each(struct perf_event *event,
2509static int perf_event_period(struct perf_event *event, u64 __user *arg) 2477static int perf_event_period(struct perf_event *event, u64 __user *arg)
2510{ 2478{
2511 struct perf_event_context *ctx = event->ctx; 2479 struct perf_event_context *ctx = event->ctx;
2512 unsigned long size;
2513 int ret = 0; 2480 int ret = 0;
2514 u64 value; 2481 u64 value;
2515 2482
2516 if (!event->attr.sample_period) 2483 if (!event->attr.sample_period)
2517 return -EINVAL; 2484 return -EINVAL;
2518 2485
2519 size = copy_from_user(&value, arg, sizeof(value)); 2486 if (copy_from_user(&value, arg, sizeof(value)))
2520 if (size != sizeof(value))
2521 return -EFAULT; 2487 return -EFAULT;
2522 2488
2523 if (!value) 2489 if (!value)
diff --git a/kernel/pid.c b/kernel/pid.c
index d55c6fb8d087..39b65b69584f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -401,7 +401,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
401 struct task_struct *result = NULL; 401 struct task_struct *result = NULL;
402 if (pid) { 402 if (pid) {
403 struct hlist_node *first; 403 struct hlist_node *first;
404 first = rcu_dereference_check(pid->tasks[type].first, 404 first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
405 rcu_read_lock_held() || 405 rcu_read_lock_held() ||
406 lockdep_tasklist_lock_is_held()); 406 lockdep_tasklist_lock_is_held());
407 if (first) 407 if (first)
@@ -416,6 +416,7 @@ EXPORT_SYMBOL(pid_task);
416 */ 416 */
417struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 417struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
418{ 418{
419 rcu_lockdep_assert(rcu_read_lock_held());
419 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); 420 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
420} 421}
421 422
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ca6066a6952e..29bff6117abc 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -86,6 +86,7 @@ config PM_SLEEP_SMP
86 depends on SMP 86 depends on SMP
87 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE 87 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
88 depends on PM_SLEEP 88 depends on PM_SLEEP
89 select HOTPLUG
89 select HOTPLUG_CPU 90 select HOTPLUG_CPU
90 default y 91 default y
91 92
@@ -137,6 +138,8 @@ config SUSPEND_FREEZER
137config HIBERNATION 138config HIBERNATION
138 bool "Hibernation (aka 'suspend to disk')" 139 bool "Hibernation (aka 'suspend to disk')"
139 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 140 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
141 select LZO_COMPRESS
142 select LZO_DECOMPRESS
140 select SUSPEND_NVS if HAS_IOMEM 143 select SUSPEND_NVS if HAS_IOMEM
141 ---help--- 144 ---help---
142 Enable the suspend to disk (STD) functionality, which is usually 145 Enable the suspend to disk (STD) functionality, which is usually
@@ -242,3 +245,17 @@ config PM_OPS
242 bool 245 bool
243 depends on PM_SLEEP || PM_RUNTIME 246 depends on PM_SLEEP || PM_RUNTIME
244 default y 247 default y
248
249config PM_OPP
250 bool "Operating Performance Point (OPP) Layer library"
251 depends on PM
252 ---help---
253 SOCs have a standard set of tuples consisting of frequency and
254 voltage pairs that the device will support per voltage domain. This
255 is called Operating Performance Point or OPP. The actual definitions
256 of OPP varies over silicon within the same family of devices.
257
258 OPP layer organizes the data internally using device pointers
259 representing individual voltage domains and provides SOC
260 implementations a ready to use framework to manage OPPs.
261 For more information, read <file:Documentation/power/opp.txt>
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8dc31e02ae12..657272e91d0a 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -29,6 +29,7 @@
29#include "power.h" 29#include "power.h"
30 30
31 31
32static int nocompress = 0;
32static int noresume = 0; 33static int noresume = 0;
33static char resume_file[256] = CONFIG_PM_STD_PARTITION; 34static char resume_file[256] = CONFIG_PM_STD_PARTITION;
34dev_t swsusp_resume_device; 35dev_t swsusp_resume_device;
@@ -638,6 +639,8 @@ int hibernate(void)
638 639
639 if (hibernation_mode == HIBERNATION_PLATFORM) 640 if (hibernation_mode == HIBERNATION_PLATFORM)
640 flags |= SF_PLATFORM_MODE; 641 flags |= SF_PLATFORM_MODE;
642 if (nocompress)
643 flags |= SF_NOCOMPRESS_MODE;
641 pr_debug("PM: writing image.\n"); 644 pr_debug("PM: writing image.\n");
642 error = swsusp_write(flags); 645 error = swsusp_write(flags);
643 swsusp_free(); 646 swsusp_free();
@@ -705,7 +708,7 @@ static int software_resume(void)
705 goto Unlock; 708 goto Unlock;
706 } 709 }
707 710
708 pr_debug("PM: Checking image partition %s\n", resume_file); 711 pr_debug("PM: Checking hibernation image partition %s\n", resume_file);
709 712
710 /* Check if the device is there */ 713 /* Check if the device is there */
711 swsusp_resume_device = name_to_dev_t(resume_file); 714 swsusp_resume_device = name_to_dev_t(resume_file);
@@ -730,10 +733,10 @@ static int software_resume(void)
730 } 733 }
731 734
732 Check_image: 735 Check_image:
733 pr_debug("PM: Resume from partition %d:%d\n", 736 pr_debug("PM: Hibernation image partition %d:%d present\n",
734 MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); 737 MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
735 738
736 pr_debug("PM: Checking hibernation image.\n"); 739 pr_debug("PM: Looking for hibernation image.\n");
737 error = swsusp_check(); 740 error = swsusp_check();
738 if (error) 741 if (error)
739 goto Unlock; 742 goto Unlock;
@@ -765,14 +768,14 @@ static int software_resume(void)
765 goto Done; 768 goto Done;
766 } 769 }
767 770
768 pr_debug("PM: Reading hibernation image.\n"); 771 pr_debug("PM: Loading hibernation image.\n");
769 772
770 error = swsusp_read(&flags); 773 error = swsusp_read(&flags);
771 swsusp_close(FMODE_READ); 774 swsusp_close(FMODE_READ);
772 if (!error) 775 if (!error)
773 hibernation_restore(flags & SF_PLATFORM_MODE); 776 hibernation_restore(flags & SF_PLATFORM_MODE);
774 777
775 printk(KERN_ERR "PM: Restore failed, recovering.\n"); 778 printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
776 swsusp_free(); 779 swsusp_free();
777 thaw_processes(); 780 thaw_processes();
778 Done: 781 Done:
@@ -785,7 +788,7 @@ static int software_resume(void)
785 /* For success case, the suspend path will release the lock */ 788 /* For success case, the suspend path will release the lock */
786 Unlock: 789 Unlock:
787 mutex_unlock(&pm_mutex); 790 mutex_unlock(&pm_mutex);
788 pr_debug("PM: Resume from disk failed.\n"); 791 pr_debug("PM: Hibernation image not present or could not be loaded.\n");
789 return error; 792 return error;
790close_finish: 793close_finish:
791 swsusp_close(FMODE_READ); 794 swsusp_close(FMODE_READ);
@@ -1004,6 +1007,15 @@ static int __init resume_offset_setup(char *str)
1004 return 1; 1007 return 1;
1005} 1008}
1006 1009
1010static int __init hibernate_setup(char *str)
1011{
1012 if (!strncmp(str, "noresume", 8))
1013 noresume = 1;
1014 else if (!strncmp(str, "nocompress", 10))
1015 nocompress = 1;
1016 return 1;
1017}
1018
1007static int __init noresume_setup(char *str) 1019static int __init noresume_setup(char *str)
1008{ 1020{
1009 noresume = 1; 1021 noresume = 1;
@@ -1013,3 +1025,4 @@ static int __init noresume_setup(char *str)
1013__setup("noresume", noresume_setup); 1025__setup("noresume", noresume_setup);
1014__setup("resume_offset=", resume_offset_setup); 1026__setup("resume_offset=", resume_offset_setup);
1015__setup("resume=", resume_setup); 1027__setup("resume=", resume_setup);
1028__setup("hibernate=", hibernate_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 62b0bc6e4983..7b5db6a8561e 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -237,18 +237,18 @@ static ssize_t wakeup_count_show(struct kobject *kobj,
237 struct kobj_attribute *attr, 237 struct kobj_attribute *attr,
238 char *buf) 238 char *buf)
239{ 239{
240 unsigned long val; 240 unsigned int val;
241 241
242 return pm_get_wakeup_count(&val) ? sprintf(buf, "%lu\n", val) : -EINTR; 242 return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR;
243} 243}
244 244
245static ssize_t wakeup_count_store(struct kobject *kobj, 245static ssize_t wakeup_count_store(struct kobject *kobj,
246 struct kobj_attribute *attr, 246 struct kobj_attribute *attr,
247 const char *buf, size_t n) 247 const char *buf, size_t n)
248{ 248{
249 unsigned long val; 249 unsigned int val;
250 250
251 if (sscanf(buf, "%lu", &val) == 1) { 251 if (sscanf(buf, "%u", &val) == 1) {
252 if (pm_save_wakeup_count(val)) 252 if (pm_save_wakeup_count(val))
253 return n; 253 return n;
254 } 254 }
@@ -281,12 +281,30 @@ pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr,
281} 281}
282 282
283power_attr(pm_trace); 283power_attr(pm_trace);
284
285static ssize_t pm_trace_dev_match_show(struct kobject *kobj,
286 struct kobj_attribute *attr,
287 char *buf)
288{
289 return show_trace_dev_match(buf, PAGE_SIZE);
290}
291
292static ssize_t
293pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr,
294 const char *buf, size_t n)
295{
296 return -EINVAL;
297}
298
299power_attr(pm_trace_dev_match);
300
284#endif /* CONFIG_PM_TRACE */ 301#endif /* CONFIG_PM_TRACE */
285 302
286static struct attribute * g[] = { 303static struct attribute * g[] = {
287 &state_attr.attr, 304 &state_attr.attr,
288#ifdef CONFIG_PM_TRACE 305#ifdef CONFIG_PM_TRACE
289 &pm_trace_attr.attr, 306 &pm_trace_attr.attr,
307 &pm_trace_dev_match_attr.attr,
290#endif 308#endif
291#ifdef CONFIG_PM_SLEEP 309#ifdef CONFIG_PM_SLEEP
292 &pm_async_attr.attr, 310 &pm_async_attr.attr,
@@ -308,7 +326,7 @@ EXPORT_SYMBOL_GPL(pm_wq);
308 326
309static int __init pm_start_workqueue(void) 327static int __init pm_start_workqueue(void)
310{ 328{
311 pm_wq = create_freezeable_workqueue("pm"); 329 pm_wq = alloc_workqueue("pm", WQ_FREEZEABLE, 0);
312 330
313 return pm_wq ? 0 : -ENOMEM; 331 return pm_wq ? 0 : -ENOMEM;
314} 332}
@@ -321,6 +339,7 @@ static int __init pm_init(void)
321 int error = pm_start_workqueue(); 339 int error = pm_start_workqueue();
322 if (error) 340 if (error)
323 return error; 341 return error;
342 hibernate_image_size_init();
324 power_kobj = kobject_create_and_add("power", NULL); 343 power_kobj = kobject_create_and_add("power", NULL);
325 if (!power_kobj) 344 if (!power_kobj)
326 return -ENOMEM; 345 return -ENOMEM;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 006270fe382d..03634be55f62 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -14,6 +14,9 @@ struct swsusp_info {
14} __attribute__((aligned(PAGE_SIZE))); 14} __attribute__((aligned(PAGE_SIZE)));
15 15
16#ifdef CONFIG_HIBERNATION 16#ifdef CONFIG_HIBERNATION
17/* kernel/power/snapshot.c */
18extern void __init hibernate_image_size_init(void);
19
17#ifdef CONFIG_ARCH_HIBERNATION_HEADER 20#ifdef CONFIG_ARCH_HIBERNATION_HEADER
18/* Maximum size of architecture specific data in a hibernation header */ 21/* Maximum size of architecture specific data in a hibernation header */
19#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4) 22#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4)
@@ -49,7 +52,11 @@ static inline char *check_image_kernel(struct swsusp_info *info)
49extern int hibernation_snapshot(int platform_mode); 52extern int hibernation_snapshot(int platform_mode);
50extern int hibernation_restore(int platform_mode); 53extern int hibernation_restore(int platform_mode);
51extern int hibernation_platform_enter(void); 54extern int hibernation_platform_enter(void);
52#endif 55
56#else /* !CONFIG_HIBERNATION */
57
58static inline void hibernate_image_size_init(void) {}
59#endif /* !CONFIG_HIBERNATION */
53 60
54extern int pfn_is_nosave(unsigned long); 61extern int pfn_is_nosave(unsigned long);
55 62
@@ -134,6 +141,7 @@ extern int swsusp_swap_in_use(void);
134 * the image header. 141 * the image header.
135 */ 142 */
136#define SF_PLATFORM_MODE 1 143#define SF_PLATFORM_MODE 1
144#define SF_NOCOMPRESS_MODE 2
137 145
138/* kernel/power/hibernate.c */ 146/* kernel/power/hibernate.c */
139extern int swsusp_check(void); 147extern int swsusp_check(void);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 028a99598f49..e50b4c1b2a0f 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -40,6 +40,7 @@ static int try_to_freeze_tasks(bool sig_only)
40 struct timeval start, end; 40 struct timeval start, end;
41 u64 elapsed_csecs64; 41 u64 elapsed_csecs64;
42 unsigned int elapsed_csecs; 42 unsigned int elapsed_csecs;
43 bool wakeup = false;
43 44
44 do_gettimeofday(&start); 45 do_gettimeofday(&start);
45 46
@@ -78,6 +79,11 @@ static int try_to_freeze_tasks(bool sig_only)
78 if (!todo || time_after(jiffies, end_time)) 79 if (!todo || time_after(jiffies, end_time))
79 break; 80 break;
80 81
82 if (!pm_check_wakeup_events()) {
83 wakeup = true;
84 break;
85 }
86
81 /* 87 /*
82 * We need to retry, but first give the freezing tasks some 88 * We need to retry, but first give the freezing tasks some
83 * time to enter the refrigerator. 89 * time to enter the refrigerator.
@@ -97,8 +103,9 @@ static int try_to_freeze_tasks(bool sig_only)
97 * but it cleans up leftover PF_FREEZE requests. 103 * but it cleans up leftover PF_FREEZE requests.
98 */ 104 */
99 printk("\n"); 105 printk("\n");
100 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " 106 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds "
101 "(%d tasks refusing to freeze, wq_busy=%d):\n", 107 "(%d tasks refusing to freeze, wq_busy=%d):\n",
108 wakeup ? "aborted" : "failed",
102 elapsed_csecs / 100, elapsed_csecs % 100, 109 elapsed_csecs / 100, elapsed_csecs % 100,
103 todo - wq_busy, wq_busy); 110 todo - wq_busy, wq_busy);
104 111
@@ -107,7 +114,7 @@ static int try_to_freeze_tasks(bool sig_only)
107 read_lock(&tasklist_lock); 114 read_lock(&tasklist_lock);
108 do_each_thread(g, p) { 115 do_each_thread(g, p) {
109 task_lock(p); 116 task_lock(p);
110 if (freezing(p) && !freezer_should_skip(p)) 117 if (!wakeup && freezing(p) && !freezer_should_skip(p))
111 sched_show_task(p); 118 sched_show_task(p);
112 cancel_freezing(p); 119 cancel_freezing(p);
113 task_unlock(p); 120 task_unlock(p);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d3f795f01bbc..ac7eb109f196 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -46,7 +46,12 @@ static void swsusp_unset_page_forbidden(struct page *);
46 * size will not exceed N bytes, but if that is impossible, it will 46 * size will not exceed N bytes, but if that is impossible, it will
47 * try to create the smallest image possible. 47 * try to create the smallest image possible.
48 */ 48 */
49unsigned long image_size = 500 * 1024 * 1024; 49unsigned long image_size;
50
51void __init hibernate_image_size_init(void)
52{
53 image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
54}
50 55
51/* List of PBEs needed for restoring the pages that were allocated before 56/* List of PBEs needed for restoring the pages that were allocated before
52 * the suspend and included in the suspend image, but have also been 57 * the suspend and included in the suspend image, but have also been
@@ -1318,12 +1323,14 @@ int hibernate_preallocate_memory(void)
1318 1323
1319 /* Compute the maximum number of saveable pages to leave in memory. */ 1324 /* Compute the maximum number of saveable pages to leave in memory. */
1320 max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; 1325 max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES;
1326 /* Compute the desired number of image pages specified by image_size. */
1321 size = DIV_ROUND_UP(image_size, PAGE_SIZE); 1327 size = DIV_ROUND_UP(image_size, PAGE_SIZE);
1322 if (size > max_size) 1328 if (size > max_size)
1323 size = max_size; 1329 size = max_size;
1324 /* 1330 /*
1325 * If the maximum is not less than the current number of saveable pages 1331 * If the desired number of image pages is at least as large as the
1326 * in memory, allocate page frames for the image and we're done. 1332 * current number of saveable pages in memory, allocate page frames for
1333 * the image and we're done.
1327 */ 1334 */
1328 if (size >= saveable) { 1335 if (size >= saveable) {
1329 pages = preallocate_image_highmem(save_highmem); 1336 pages = preallocate_image_highmem(save_highmem);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index e6a5bdf61a37..916eaa790399 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -24,10 +24,12 @@
24#include <linux/swapops.h> 24#include <linux/swapops.h>
25#include <linux/pm.h> 25#include <linux/pm.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/lzo.h>
28#include <linux/vmalloc.h>
27 29
28#include "power.h" 30#include "power.h"
29 31
30#define SWSUSP_SIG "S1SUSPEND" 32#define HIBERNATE_SIG "LINHIB0001"
31 33
32/* 34/*
33 * The swap map is a data structure used for keeping track of each page 35 * The swap map is a data structure used for keeping track of each page
@@ -193,7 +195,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
193 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || 195 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
194 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { 196 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
195 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); 197 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
196 memcpy(swsusp_header->sig,SWSUSP_SIG, 10); 198 memcpy(swsusp_header->sig, HIBERNATE_SIG, 10);
197 swsusp_header->image = handle->first_sector; 199 swsusp_header->image = handle->first_sector;
198 swsusp_header->flags = flags; 200 swsusp_header->flags = flags;
199 error = hib_bio_write_page(swsusp_resume_block, 201 error = hib_bio_write_page(swsusp_resume_block,
@@ -357,6 +359,18 @@ static int swap_writer_finish(struct swap_map_handle *handle,
357 return error; 359 return error;
358} 360}
359 361
362/* We need to remember how much compressed data we need to read. */
363#define LZO_HEADER sizeof(size_t)
364
365/* Number of pages/bytes we'll compress at one time. */
366#define LZO_UNC_PAGES 32
367#define LZO_UNC_SIZE (LZO_UNC_PAGES * PAGE_SIZE)
368
369/* Number of pages/bytes we need for compressed data (worst case). */
370#define LZO_CMP_PAGES DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \
371 LZO_HEADER, PAGE_SIZE)
372#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE)
373
360/** 374/**
361 * save_image - save the suspend image data 375 * save_image - save the suspend image data
362 */ 376 */
@@ -404,6 +418,137 @@ static int save_image(struct swap_map_handle *handle,
404 return ret; 418 return ret;
405} 419}
406 420
421
422/**
423 * save_image_lzo - Save the suspend image data compressed with LZO.
424 * @handle: Swap map handle to use for saving the image.
425 * @snapshot: Image to read data from.
426 * @nr_to_write: Number of pages to save.
427 */
428static int save_image_lzo(struct swap_map_handle *handle,
429 struct snapshot_handle *snapshot,
430 unsigned int nr_to_write)
431{
432 unsigned int m;
433 int ret = 0;
434 int nr_pages;
435 int err2;
436 struct bio *bio;
437 struct timeval start;
438 struct timeval stop;
439 size_t off, unc_len, cmp_len;
440 unsigned char *unc, *cmp, *wrk, *page;
441
442 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
443 if (!page) {
444 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
445 return -ENOMEM;
446 }
447
448 wrk = vmalloc(LZO1X_1_MEM_COMPRESS);
449 if (!wrk) {
450 printk(KERN_ERR "PM: Failed to allocate LZO workspace\n");
451 free_page((unsigned long)page);
452 return -ENOMEM;
453 }
454
455 unc = vmalloc(LZO_UNC_SIZE);
456 if (!unc) {
457 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
458 vfree(wrk);
459 free_page((unsigned long)page);
460 return -ENOMEM;
461 }
462
463 cmp = vmalloc(LZO_CMP_SIZE);
464 if (!cmp) {
465 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
466 vfree(unc);
467 vfree(wrk);
468 free_page((unsigned long)page);
469 return -ENOMEM;
470 }
471
472 printk(KERN_INFO
473 "PM: Compressing and saving image data (%u pages) ... ",
474 nr_to_write);
475 m = nr_to_write / 100;
476 if (!m)
477 m = 1;
478 nr_pages = 0;
479 bio = NULL;
480 do_gettimeofday(&start);
481 for (;;) {
482 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
483 ret = snapshot_read_next(snapshot);
484 if (ret < 0)
485 goto out_finish;
486
487 if (!ret)
488 break;
489
490 memcpy(unc + off, data_of(*snapshot), PAGE_SIZE);
491
492 if (!(nr_pages % m))
493 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
494 nr_pages++;
495 }
496
497 if (!off)
498 break;
499
500 unc_len = off;
501 ret = lzo1x_1_compress(unc, unc_len,
502 cmp + LZO_HEADER, &cmp_len, wrk);
503 if (ret < 0) {
504 printk(KERN_ERR "PM: LZO compression failed\n");
505 break;
506 }
507
508 if (unlikely(!cmp_len ||
509 cmp_len > lzo1x_worst_compress(unc_len))) {
510 printk(KERN_ERR "PM: Invalid LZO compressed length\n");
511 ret = -1;
512 break;
513 }
514
515 *(size_t *)cmp = cmp_len;
516
517 /*
518 * Given we are writing one page at a time to disk, we copy
519 * that much from the buffer, although the last bit will likely
520 * be smaller than full page. This is OK - we saved the length
521 * of the compressed data, so any garbage at the end will be
522 * discarded when we read it.
523 */
524 for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) {
525 memcpy(page, cmp + off, PAGE_SIZE);
526
527 ret = swap_write_page(handle, page, &bio);
528 if (ret)
529 goto out_finish;
530 }
531 }
532
533out_finish:
534 err2 = hib_wait_on_bio_chain(&bio);
535 do_gettimeofday(&stop);
536 if (!ret)
537 ret = err2;
538 if (!ret)
539 printk(KERN_CONT "\b\b\b\bdone\n");
540 else
541 printk(KERN_CONT "\n");
542 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
543
544 vfree(cmp);
545 vfree(unc);
546 vfree(wrk);
547 free_page((unsigned long)page);
548
549 return ret;
550}
551
407/** 552/**
408 * enough_swap - Make sure we have enough swap to save the image. 553 * enough_swap - Make sure we have enough swap to save the image.
409 * 554 *
@@ -411,12 +556,16 @@ static int save_image(struct swap_map_handle *handle,
411 * space available from the resume partition. 556 * space available from the resume partition.
412 */ 557 */
413 558
414static int enough_swap(unsigned int nr_pages) 559static int enough_swap(unsigned int nr_pages, unsigned int flags)
415{ 560{
416 unsigned int free_swap = count_swap_pages(root_swap, 1); 561 unsigned int free_swap = count_swap_pages(root_swap, 1);
562 unsigned int required;
417 563
418 pr_debug("PM: Free swap pages: %u\n", free_swap); 564 pr_debug("PM: Free swap pages: %u\n", free_swap);
419 return free_swap > nr_pages + PAGES_FOR_IO; 565
566 required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ?
567 nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1);
568 return free_swap > required;
420} 569}
421 570
422/** 571/**
@@ -443,7 +592,7 @@ int swsusp_write(unsigned int flags)
443 printk(KERN_ERR "PM: Cannot get swap writer\n"); 592 printk(KERN_ERR "PM: Cannot get swap writer\n");
444 return error; 593 return error;
445 } 594 }
446 if (!enough_swap(pages)) { 595 if (!enough_swap(pages, flags)) {
447 printk(KERN_ERR "PM: Not enough free swap\n"); 596 printk(KERN_ERR "PM: Not enough free swap\n");
448 error = -ENOSPC; 597 error = -ENOSPC;
449 goto out_finish; 598 goto out_finish;
@@ -458,8 +607,11 @@ int swsusp_write(unsigned int flags)
458 } 607 }
459 header = (struct swsusp_info *)data_of(snapshot); 608 header = (struct swsusp_info *)data_of(snapshot);
460 error = swap_write_page(&handle, header, NULL); 609 error = swap_write_page(&handle, header, NULL);
461 if (!error) 610 if (!error) {
462 error = save_image(&handle, &snapshot, pages - 1); 611 error = (flags & SF_NOCOMPRESS_MODE) ?
612 save_image(&handle, &snapshot, pages - 1) :
613 save_image_lzo(&handle, &snapshot, pages - 1);
614 }
463out_finish: 615out_finish:
464 error = swap_writer_finish(&handle, flags, error); 616 error = swap_writer_finish(&handle, flags, error);
465 return error; 617 return error;
@@ -590,6 +742,127 @@ static int load_image(struct swap_map_handle *handle,
590} 742}
591 743
592/** 744/**
745 * load_image_lzo - Load compressed image data and decompress them with LZO.
746 * @handle: Swap map handle to use for loading data.
747 * @snapshot: Image to copy uncompressed data into.
748 * @nr_to_read: Number of pages to load.
749 */
750static int load_image_lzo(struct swap_map_handle *handle,
751 struct snapshot_handle *snapshot,
752 unsigned int nr_to_read)
753{
754 unsigned int m;
755 int error = 0;
756 struct timeval start;
757 struct timeval stop;
758 unsigned nr_pages;
759 size_t off, unc_len, cmp_len;
760 unsigned char *unc, *cmp, *page;
761
762 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
763 if (!page) {
764 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
765 return -ENOMEM;
766 }
767
768 unc = vmalloc(LZO_UNC_SIZE);
769 if (!unc) {
770 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
771 free_page((unsigned long)page);
772 return -ENOMEM;
773 }
774
775 cmp = vmalloc(LZO_CMP_SIZE);
776 if (!cmp) {
777 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
778 vfree(unc);
779 free_page((unsigned long)page);
780 return -ENOMEM;
781 }
782
783 printk(KERN_INFO
784 "PM: Loading and decompressing image data (%u pages) ... ",
785 nr_to_read);
786 m = nr_to_read / 100;
787 if (!m)
788 m = 1;
789 nr_pages = 0;
790 do_gettimeofday(&start);
791
792 error = snapshot_write_next(snapshot);
793 if (error <= 0)
794 goto out_finish;
795
796 for (;;) {
797 error = swap_read_page(handle, page, NULL); /* sync */
798 if (error)
799 break;
800
801 cmp_len = *(size_t *)page;
802 if (unlikely(!cmp_len ||
803 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) {
804 printk(KERN_ERR "PM: Invalid LZO compressed length\n");
805 error = -1;
806 break;
807 }
808
809 memcpy(cmp, page, PAGE_SIZE);
810 for (off = PAGE_SIZE; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) {
811 error = swap_read_page(handle, page, NULL); /* sync */
812 if (error)
813 goto out_finish;
814
815 memcpy(cmp + off, page, PAGE_SIZE);
816 }
817
818 unc_len = LZO_UNC_SIZE;
819 error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len,
820 unc, &unc_len);
821 if (error < 0) {
822 printk(KERN_ERR "PM: LZO decompression failed\n");
823 break;
824 }
825
826 if (unlikely(!unc_len ||
827 unc_len > LZO_UNC_SIZE ||
828 unc_len & (PAGE_SIZE - 1))) {
829 printk(KERN_ERR "PM: Invalid LZO uncompressed length\n");
830 error = -1;
831 break;
832 }
833
834 for (off = 0; off < unc_len; off += PAGE_SIZE) {
835 memcpy(data_of(*snapshot), unc + off, PAGE_SIZE);
836
837 if (!(nr_pages % m))
838 printk("\b\b\b\b%3d%%", nr_pages / m);
839 nr_pages++;
840
841 error = snapshot_write_next(snapshot);
842 if (error <= 0)
843 goto out_finish;
844 }
845 }
846
847out_finish:
848 do_gettimeofday(&stop);
849 if (!error) {
850 printk("\b\b\b\bdone\n");
851 snapshot_write_finalize(snapshot);
852 if (!snapshot_image_loaded(snapshot))
853 error = -ENODATA;
854 } else
855 printk("\n");
856 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
857
858 vfree(cmp);
859 vfree(unc);
860 free_page((unsigned long)page);
861
862 return error;
863}
864
865/**
593 * swsusp_read - read the hibernation image. 866 * swsusp_read - read the hibernation image.
594 * @flags_p: flags passed by the "frozen" kernel in the image header should 867 * @flags_p: flags passed by the "frozen" kernel in the image header should
595 * be written into this memory location 868 * be written into this memory location
@@ -612,8 +885,11 @@ int swsusp_read(unsigned int *flags_p)
612 goto end; 885 goto end;
613 if (!error) 886 if (!error)
614 error = swap_read_page(&handle, header, NULL); 887 error = swap_read_page(&handle, header, NULL);
615 if (!error) 888 if (!error) {
616 error = load_image(&handle, &snapshot, header->pages - 1); 889 error = (*flags_p & SF_NOCOMPRESS_MODE) ?
890 load_image(&handle, &snapshot, header->pages - 1) :
891 load_image_lzo(&handle, &snapshot, header->pages - 1);
892 }
617 swap_reader_finish(&handle); 893 swap_reader_finish(&handle);
618end: 894end:
619 if (!error) 895 if (!error)
@@ -640,7 +916,7 @@ int swsusp_check(void)
640 if (error) 916 if (error)
641 goto put; 917 goto put;
642 918
643 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { 919 if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
644 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); 920 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
645 /* Reset swap signature now */ 921 /* Reset swap signature now */
646 error = hib_bio_write_page(swsusp_resume_block, 922 error = hib_bio_write_page(swsusp_resume_block,
@@ -653,13 +929,13 @@ put:
653 if (error) 929 if (error)
654 blkdev_put(hib_resume_bdev, FMODE_READ); 930 blkdev_put(hib_resume_bdev, FMODE_READ);
655 else 931 else
656 pr_debug("PM: Signature found, resuming\n"); 932 pr_debug("PM: Image signature found, resuming\n");
657 } else { 933 } else {
658 error = PTR_ERR(hib_resume_bdev); 934 error = PTR_ERR(hib_resume_bdev);
659 } 935 }
660 936
661 if (error) 937 if (error)
662 pr_debug("PM: Error %d checking image file\n", error); 938 pr_debug("PM: Image not found (code %d)\n", error);
663 939
664 return error; 940 return error;
665} 941}
diff --git a/kernel/printk.c b/kernel/printk.c
index 8fe465ac008a..2531017795f6 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -85,7 +85,7 @@ EXPORT_SYMBOL(oops_in_progress);
85 * provides serialisation for access to the entire console 85 * provides serialisation for access to the entire console
86 * driver system. 86 * driver system.
87 */ 87 */
88static DECLARE_MUTEX(console_sem); 88static DEFINE_SEMAPHORE(console_sem);
89struct console *console_drivers; 89struct console *console_drivers;
90EXPORT_SYMBOL_GPL(console_drivers); 90EXPORT_SYMBOL_GPL(console_drivers);
91 91
@@ -556,7 +556,7 @@ static void zap_locks(void)
556 /* If a crash is occurring, make sure we can't deadlock */ 556 /* If a crash is occurring, make sure we can't deadlock */
557 spin_lock_init(&logbuf_lock); 557 spin_lock_init(&logbuf_lock);
558 /* And make sure that we print immediately */ 558 /* And make sure that we print immediately */
559 init_MUTEX(&console_sem); 559 sema_init(&console_sem, 1);
560} 560}
561 561
562#if defined(CONFIG_PRINTK_TIME) 562#if defined(CONFIG_PRINTK_TIME)
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 4d169835fb36..a23a57a976d1 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -73,12 +73,14 @@ int debug_lockdep_rcu_enabled(void)
73EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); 73EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
74 74
75/** 75/**
76 * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? 76 * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
77 * 77 *
78 * Check for bottom half being disabled, which covers both the 78 * Check for bottom half being disabled, which covers both the
79 * CONFIG_PROVE_RCU and not cases. Note that if someone uses 79 * CONFIG_PROVE_RCU and not cases. Note that if someone uses
80 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) 80 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
81 * will show the situation. 81 * will show the situation. This is useful for debug checks in functions
82 * that require that they be called within an RCU read-side critical
83 * section.
82 * 84 *
83 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. 85 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
84 */ 86 */
@@ -86,7 +88,7 @@ int rcu_read_lock_bh_held(void)
86{ 88{
87 if (!debug_lockdep_rcu_enabled()) 89 if (!debug_lockdep_rcu_enabled())
88 return 1; 90 return 1;
89 return in_softirq(); 91 return in_softirq() || irqs_disabled();
90} 92}
91EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); 93EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
92 94
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 196ec02f8be0..d806735342ac 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -59,6 +59,14 @@ int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active); 59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61 61
62/* Forward declarations for rcutiny_plugin.h. */
63static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
64static void __call_rcu(struct rcu_head *head,
65 void (*func)(struct rcu_head *rcu),
66 struct rcu_ctrlblk *rcp);
67
68#include "rcutiny_plugin.h"
69
62#ifdef CONFIG_NO_HZ 70#ifdef CONFIG_NO_HZ
63 71
64static long rcu_dynticks_nesting = 1; 72static long rcu_dynticks_nesting = 1;
@@ -140,6 +148,7 @@ void rcu_check_callbacks(int cpu, int user)
140 rcu_sched_qs(cpu); 148 rcu_sched_qs(cpu);
141 else if (!in_softirq()) 149 else if (!in_softirq())
142 rcu_bh_qs(cpu); 150 rcu_bh_qs(cpu);
151 rcu_preempt_check_callbacks();
143} 152}
144 153
145/* 154/*
@@ -162,6 +171,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
162 *rcp->donetail = NULL; 171 *rcp->donetail = NULL;
163 if (rcp->curtail == rcp->donetail) 172 if (rcp->curtail == rcp->donetail)
164 rcp->curtail = &rcp->rcucblist; 173 rcp->curtail = &rcp->rcucblist;
174 rcu_preempt_remove_callbacks(rcp);
165 rcp->donetail = &rcp->rcucblist; 175 rcp->donetail = &rcp->rcucblist;
166 local_irq_restore(flags); 176 local_irq_restore(flags);
167 177
@@ -182,6 +192,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
182{ 192{
183 __rcu_process_callbacks(&rcu_sched_ctrlblk); 193 __rcu_process_callbacks(&rcu_sched_ctrlblk);
184 __rcu_process_callbacks(&rcu_bh_ctrlblk); 194 __rcu_process_callbacks(&rcu_bh_ctrlblk);
195 rcu_preempt_process_callbacks();
185} 196}
186 197
187/* 198/*
@@ -223,15 +234,15 @@ static void __call_rcu(struct rcu_head *head,
223} 234}
224 235
225/* 236/*
226 * Post an RCU callback to be invoked after the end of an RCU grace 237 * Post an RCU callback to be invoked after the end of an RCU-sched grace
227 * period. But since we have but one CPU, that would be after any 238 * period. But since we have but one CPU, that would be after any
228 * quiescent state. 239 * quiescent state.
229 */ 240 */
230void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 241void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
231{ 242{
232 __call_rcu(head, func, &rcu_sched_ctrlblk); 243 __call_rcu(head, func, &rcu_sched_ctrlblk);
233} 244}
234EXPORT_SYMBOL_GPL(call_rcu); 245EXPORT_SYMBOL_GPL(call_rcu_sched);
235 246
236/* 247/*
237 * Post an RCU bottom-half callback to be invoked after any subsequent 248 * Post an RCU bottom-half callback to be invoked after any subsequent
@@ -243,20 +254,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
243} 254}
244EXPORT_SYMBOL_GPL(call_rcu_bh); 255EXPORT_SYMBOL_GPL(call_rcu_bh);
245 256
246void rcu_barrier(void)
247{
248 struct rcu_synchronize rcu;
249
250 init_rcu_head_on_stack(&rcu.head);
251 init_completion(&rcu.completion);
252 /* Will wake me after RCU finished. */
253 call_rcu(&rcu.head, wakeme_after_rcu);
254 /* Wait for it. */
255 wait_for_completion(&rcu.completion);
256 destroy_rcu_head_on_stack(&rcu.head);
257}
258EXPORT_SYMBOL_GPL(rcu_barrier);
259
260void rcu_barrier_bh(void) 257void rcu_barrier_bh(void)
261{ 258{
262 struct rcu_synchronize rcu; 259 struct rcu_synchronize rcu;
@@ -289,5 +286,3 @@ void __init rcu_init(void)
289{ 286{
290 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 287 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
291} 288}
292
293#include "rcutiny_plugin.h"
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index d223a92bc742..6ceca4f745ff 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version) 2 * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
3 * Internal non-public definitions that provide either classic 3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics. 4 * or preemptible semantics.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -17,11 +17,587 @@
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 * 19 *
20 * Copyright IBM Corporation, 2009 20 * Copyright (c) 2010 Linaro
21 * 21 *
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */ 23 */
24 24
25#ifdef CONFIG_TINY_PREEMPT_RCU
26
27#include <linux/delay.h>
28
29/* Global control variables for preemptible RCU. */
30struct rcu_preempt_ctrlblk {
31 struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */
32 struct rcu_head **nexttail;
33 /* Tasks blocked in a preemptible RCU */
34 /* read-side critical section while a */
35 /* preemptible-RCU grace period is in */
36 /* progress must wait for a later grace */
37 /* period. This pointer points to the */
38 /* ->next pointer of the last task that */
39 /* must wait for a later grace period, or */
40 /* to &->rcb.rcucblist if there is no */
41 /* such task. */
42 struct list_head blkd_tasks;
43 /* Tasks blocked in RCU read-side critical */
44 /* section. Tasks are placed at the head */
45 /* of this list and age towards the tail. */
46 struct list_head *gp_tasks;
47 /* Pointer to the first task blocking the */
48 /* current grace period, or NULL if there */
49 /* is no such task. */
50 struct list_head *exp_tasks;
51 /* Pointer to first task blocking the */
52 /* current expedited grace period, or NULL */
53 /* if there is no such task. If there */
54 /* is no current expedited grace period, */
55 /* then there cannot be any such task. */
56 u8 gpnum; /* Current grace period. */
57 u8 gpcpu; /* Last grace period blocked by the CPU. */
58 u8 completed; /* Last grace period completed. */
59 /* If all three are equal, RCU is idle. */
60};
61
62static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
63 .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist,
64 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
65 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
66 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
67};
68
69static int rcu_preempted_readers_exp(void);
70static void rcu_report_exp_done(void);
71
72/*
73 * Return true if the CPU has not yet responded to the current grace period.
74 */
75static int rcu_cpu_blocking_cur_gp(void)
76{
77 return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum;
78}
79
80/*
81 * Check for a running RCU reader. Because there is only one CPU,
82 * there can be but one running RCU reader at a time. ;-)
83 */
84static int rcu_preempt_running_reader(void)
85{
86 return current->rcu_read_lock_nesting;
87}
88
89/*
90 * Check for preempted RCU readers blocking any grace period.
91 * If the caller needs a reliable answer, it must disable hard irqs.
92 */
93static int rcu_preempt_blocked_readers_any(void)
94{
95 return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks);
96}
97
98/*
99 * Check for preempted RCU readers blocking the current grace period.
100 * If the caller needs a reliable answer, it must disable hard irqs.
101 */
102static int rcu_preempt_blocked_readers_cgp(void)
103{
104 return rcu_preempt_ctrlblk.gp_tasks != NULL;
105}
106
107/*
108 * Return true if another preemptible-RCU grace period is needed.
109 */
110static int rcu_preempt_needs_another_gp(void)
111{
112 return *rcu_preempt_ctrlblk.rcb.curtail != NULL;
113}
114
115/*
116 * Return true if a preemptible-RCU grace period is in progress.
117 * The caller must disable hardirqs.
118 */
119static int rcu_preempt_gp_in_progress(void)
120{
121 return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum;
122}
123
124/*
125 * Record a preemptible-RCU quiescent state for the specified CPU. Note
126 * that this just means that the task currently running on the CPU is
127 * in a quiescent state. There might be any number of tasks blocked
128 * while in an RCU read-side critical section.
129 *
130 * Unlike the other rcu_*_qs() functions, callers to this function
131 * must disable irqs in order to protect the assignment to
132 * ->rcu_read_unlock_special.
133 *
134 * Because this is a single-CPU implementation, the only way a grace
135 * period can end is if the CPU is in a quiescent state. The reason is
136 * that a blocked preemptible-RCU reader can exit its critical section
137 * only if the CPU is running it at the time. Therefore, when the
138 * last task blocking the current grace period exits its RCU read-side
139 * critical section, neither the CPU nor blocked tasks will be stopping
140 * the current grace period. (In contrast, SMP implementations
141 * might have CPUs running in RCU read-side critical sections that
142 * block later grace periods -- but this is not possible given only
143 * one CPU.)
144 */
145static void rcu_preempt_cpu_qs(void)
146{
147 /* Record both CPU and task as having responded to current GP. */
148 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
149 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
150
151 /*
152 * If there is no GP, or if blocked readers are still blocking GP,
153 * then there is nothing more to do.
154 */
155 if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp())
156 return;
157
158 /* Advance callbacks. */
159 rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum;
160 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail;
161 rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail;
162
163 /* If there are no blocked readers, next GP is done instantly. */
164 if (!rcu_preempt_blocked_readers_any())
165 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
166
167 /* If there are done callbacks, make RCU_SOFTIRQ process them. */
168 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
169 raise_softirq(RCU_SOFTIRQ);
170}
171
172/*
173 * Start a new RCU grace period if warranted. Hard irqs must be disabled.
174 */
175static void rcu_preempt_start_gp(void)
176{
177 if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) {
178
179 /* Official start of GP. */
180 rcu_preempt_ctrlblk.gpnum++;
181
182 /* Any blocked RCU readers block new GP. */
183 if (rcu_preempt_blocked_readers_any())
184 rcu_preempt_ctrlblk.gp_tasks =
185 rcu_preempt_ctrlblk.blkd_tasks.next;
186
187 /* If there is no running reader, CPU is done with GP. */
188 if (!rcu_preempt_running_reader())
189 rcu_preempt_cpu_qs();
190 }
191}
192
193/*
194 * We have entered the scheduler, and the current task might soon be
195 * context-switched away from. If this task is in an RCU read-side
196 * critical section, we will no longer be able to rely on the CPU to
197 * record that fact, so we enqueue the task on the blkd_tasks list.
198 * If the task started after the current grace period began, as recorded
199 * by ->gpcpu, we enqueue at the beginning of the list. Otherwise
200 * before the element referenced by ->gp_tasks (or at the tail if
201 * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element.
202 * The task will dequeue itself when it exits the outermost enclosing
203 * RCU read-side critical section. Therefore, the current grace period
204 * cannot be permitted to complete until the ->gp_tasks pointer becomes
205 * NULL.
206 *
207 * Caller must disable preemption.
208 */
209void rcu_preempt_note_context_switch(void)
210{
211 struct task_struct *t = current;
212 unsigned long flags;
213
214 local_irq_save(flags); /* must exclude scheduler_tick(). */
215 if (rcu_preempt_running_reader() &&
216 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
217
218 /* Possibly blocking in an RCU read-side critical section. */
219 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
220
221 /*
222 * If this CPU has already checked in, then this task
223 * will hold up the next grace period rather than the
224 * current grace period. Queue the task accordingly.
225 * If the task is queued for the current grace period
226 * (i.e., this CPU has not yet passed through a quiescent
227 * state for the current grace period), then as long
228 * as that task remains queued, the current grace period
229 * cannot end.
230 */
231 list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
232 if (rcu_cpu_blocking_cur_gp())
233 rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
234 }
235
236 /*
237 * Either we were not in an RCU read-side critical section to
238 * begin with, or we have now recorded that critical section
239 * globally. Either way, we can now note a quiescent state
240 * for this CPU. Again, if we were in an RCU read-side critical
241 * section, and if that critical section was blocking the current
242 * grace period, then the fact that the task has been enqueued
243 * means that current grace period continues to be blocked.
244 */
245 rcu_preempt_cpu_qs();
246 local_irq_restore(flags);
247}
248
249/*
250 * Tiny-preemptible RCU implementation for rcu_read_lock().
251 * Just increment ->rcu_read_lock_nesting, shared state will be updated
252 * if we block.
253 */
254void __rcu_read_lock(void)
255{
256 current->rcu_read_lock_nesting++;
257 barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */
258}
259EXPORT_SYMBOL_GPL(__rcu_read_lock);
260
261/*
262 * Handle special cases during rcu_read_unlock(), such as needing to
263 * notify RCU core processing or task having blocked during the RCU
264 * read-side critical section.
265 */
266static void rcu_read_unlock_special(struct task_struct *t)
267{
268 int empty;
269 int empty_exp;
270 unsigned long flags;
271 struct list_head *np;
272 int special;
273
274 /*
275 * NMI handlers cannot block and cannot safely manipulate state.
276 * They therefore cannot possibly be special, so just leave.
277 */
278 if (in_nmi())
279 return;
280
281 local_irq_save(flags);
282
283 /*
284 * If RCU core is waiting for this CPU to exit critical section,
285 * let it know that we have done so.
286 */
287 special = t->rcu_read_unlock_special;
288 if (special & RCU_READ_UNLOCK_NEED_QS)
289 rcu_preempt_cpu_qs();
290
291 /* Hardware IRQ handlers cannot block. */
292 if (in_irq()) {
293 local_irq_restore(flags);
294 return;
295 }
296
297 /* Clean up if blocked during RCU read-side critical section. */
298 if (special & RCU_READ_UNLOCK_BLOCKED) {
299 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
300
301 /*
302 * Remove this task from the ->blkd_tasks list and adjust
303 * any pointers that might have been referencing it.
304 */
305 empty = !rcu_preempt_blocked_readers_cgp();
306 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
307 np = t->rcu_node_entry.next;
308 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
309 np = NULL;
310 list_del(&t->rcu_node_entry);
311 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
312 rcu_preempt_ctrlblk.gp_tasks = np;
313 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
314 rcu_preempt_ctrlblk.exp_tasks = np;
315 INIT_LIST_HEAD(&t->rcu_node_entry);
316
317 /*
318 * If this was the last task on the current list, and if
319 * we aren't waiting on the CPU, report the quiescent state
320 * and start a new grace period if needed.
321 */
322 if (!empty && !rcu_preempt_blocked_readers_cgp()) {
323 rcu_preempt_cpu_qs();
324 rcu_preempt_start_gp();
325 }
326
327 /*
328 * If this was the last task on the expedited lists,
329 * then we need wake up the waiting task.
330 */
331 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
332 rcu_report_exp_done();
333 }
334 local_irq_restore(flags);
335}
336
337/*
338 * Tiny-preemptible RCU implementation for rcu_read_unlock().
339 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
340 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
341 * invoke rcu_read_unlock_special() to clean up after a context switch
342 * in an RCU read-side critical section and other special cases.
343 */
344void __rcu_read_unlock(void)
345{
346 struct task_struct *t = current;
347
348 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
349 --t->rcu_read_lock_nesting;
350 barrier(); /* decrement before load of ->rcu_read_unlock_special */
351 if (t->rcu_read_lock_nesting == 0 &&
352 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
353 rcu_read_unlock_special(t);
354#ifdef CONFIG_PROVE_LOCKING
355 WARN_ON_ONCE(t->rcu_read_lock_nesting < 0);
356#endif /* #ifdef CONFIG_PROVE_LOCKING */
357}
358EXPORT_SYMBOL_GPL(__rcu_read_unlock);
359
360/*
361 * Check for a quiescent state from the current CPU. When a task blocks,
362 * the task is recorded in the rcu_preempt_ctrlblk structure, which is
363 * checked elsewhere. This is called from the scheduling-clock interrupt.
364 *
365 * Caller must disable hard irqs.
366 */
367static void rcu_preempt_check_callbacks(void)
368{
369 struct task_struct *t = current;
370
371 if (rcu_preempt_gp_in_progress() &&
372 (!rcu_preempt_running_reader() ||
373 !rcu_cpu_blocking_cur_gp()))
374 rcu_preempt_cpu_qs();
375 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
376 rcu_preempt_ctrlblk.rcb.donetail)
377 raise_softirq(RCU_SOFTIRQ);
378 if (rcu_preempt_gp_in_progress() &&
379 rcu_cpu_blocking_cur_gp() &&
380 rcu_preempt_running_reader())
381 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
382}
383
384/*
385 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
386 * update, so this is invoked from __rcu_process_callbacks() to
387 * handle that case. Of course, it is invoked for all flavors of
388 * RCU, but RCU callbacks can appear only on one of the lists, and
389 * neither ->nexttail nor ->donetail can possibly be NULL, so there
390 * is no need for an explicit check.
391 */
392static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
393{
394 if (rcu_preempt_ctrlblk.nexttail == rcp->donetail)
395 rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist;
396}
397
398/*
399 * Process callbacks for preemptible RCU.
400 */
401static void rcu_preempt_process_callbacks(void)
402{
403 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
404}
405
406/*
407 * Queue a preemptible-RCU callback for invocation after a grace period.
408 */
409void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
410{
411 unsigned long flags;
412
413 debug_rcu_head_queue(head);
414 head->func = func;
415 head->next = NULL;
416
417 local_irq_save(flags);
418 *rcu_preempt_ctrlblk.nexttail = head;
419 rcu_preempt_ctrlblk.nexttail = &head->next;
420 rcu_preempt_start_gp(); /* checks to see if GP needed. */
421 local_irq_restore(flags);
422}
423EXPORT_SYMBOL_GPL(call_rcu);
424
425void rcu_barrier(void)
426{
427 struct rcu_synchronize rcu;
428
429 init_rcu_head_on_stack(&rcu.head);
430 init_completion(&rcu.completion);
431 /* Will wake me after RCU finished. */
432 call_rcu(&rcu.head, wakeme_after_rcu);
433 /* Wait for it. */
434 wait_for_completion(&rcu.completion);
435 destroy_rcu_head_on_stack(&rcu.head);
436}
437EXPORT_SYMBOL_GPL(rcu_barrier);
438
439/*
440 * synchronize_rcu - wait until a grace period has elapsed.
441 *
442 * Control will return to the caller some time after a full grace
443 * period has elapsed, in other words after all currently executing RCU
444 * read-side critical sections have completed. RCU read-side critical
445 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
446 * and may be nested.
447 */
448void synchronize_rcu(void)
449{
450#ifdef CONFIG_DEBUG_LOCK_ALLOC
451 if (!rcu_scheduler_active)
452 return;
453#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
454
455 WARN_ON_ONCE(rcu_preempt_running_reader());
456 if (!rcu_preempt_blocked_readers_any())
457 return;
458
459 /* Once we get past the fastpath checks, same code as rcu_barrier(). */
460 rcu_barrier();
461}
462EXPORT_SYMBOL_GPL(synchronize_rcu);
463
464static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
465static unsigned long sync_rcu_preempt_exp_count;
466static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
467
468/*
469 * Return non-zero if there are any tasks in RCU read-side critical
470 * sections blocking the current preemptible-RCU expedited grace period.
471 * If there is no preemptible-RCU expedited grace period currently in
472 * progress, returns zero unconditionally.
473 */
474static int rcu_preempted_readers_exp(void)
475{
476 return rcu_preempt_ctrlblk.exp_tasks != NULL;
477}
478
479/*
480 * Report the exit from RCU read-side critical section for the last task
481 * that queued itself during or before the current expedited preemptible-RCU
482 * grace period.
483 */
484static void rcu_report_exp_done(void)
485{
486 wake_up(&sync_rcu_preempt_exp_wq);
487}
488
489/*
490 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
491 * is to rely on the fact that there is but one CPU, and that it is
492 * illegal for a task to invoke synchronize_rcu_expedited() while in a
493 * preemptible-RCU read-side critical section. Therefore, any such
494 * critical sections must correspond to blocked tasks, which must therefore
495 * be on the ->blkd_tasks list. So just record the current head of the
496 * list in the ->exp_tasks pointer, and wait for all tasks including and
497 * after the task pointed to by ->exp_tasks to drain.
498 */
499void synchronize_rcu_expedited(void)
500{
501 unsigned long flags;
502 struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk;
503 unsigned long snap;
504
505 barrier(); /* ensure prior action seen before grace period. */
506
507 WARN_ON_ONCE(rcu_preempt_running_reader());
508
509 /*
510 * Acquire lock so that there is only one preemptible RCU grace
511 * period in flight. Of course, if someone does the expedited
512 * grace period for us while we are acquiring the lock, just leave.
513 */
514 snap = sync_rcu_preempt_exp_count + 1;
515 mutex_lock(&sync_rcu_preempt_exp_mutex);
516 if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count))
517 goto unlock_mb_ret; /* Others did our work for us. */
518
519 local_irq_save(flags);
520
521 /*
522 * All RCU readers have to already be on blkd_tasks because
523 * we cannot legally be executing in an RCU read-side critical
524 * section.
525 */
526
527 /* Snapshot current head of ->blkd_tasks list. */
528 rpcp->exp_tasks = rpcp->blkd_tasks.next;
529 if (rpcp->exp_tasks == &rpcp->blkd_tasks)
530 rpcp->exp_tasks = NULL;
531 local_irq_restore(flags);
532
533 /* Wait for tail of ->blkd_tasks list to drain. */
534 if (rcu_preempted_readers_exp())
535 wait_event(sync_rcu_preempt_exp_wq,
536 !rcu_preempted_readers_exp());
537
538 /* Clean up and exit. */
539 barrier(); /* ensure expedited GP seen before counter increment. */
540 sync_rcu_preempt_exp_count++;
541unlock_mb_ret:
542 mutex_unlock(&sync_rcu_preempt_exp_mutex);
543 barrier(); /* ensure subsequent action seen after grace period. */
544}
545EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
546
547/*
548 * Does preemptible RCU need the CPU to stay out of dynticks mode?
549 */
550int rcu_preempt_needs_cpu(void)
551{
552 if (!rcu_preempt_running_reader())
553 rcu_preempt_cpu_qs();
554 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
555}
556
557/*
558 * Check for a task exiting while in a preemptible-RCU read-side
559 * critical section, clean up if so. No need to issue warnings,
560 * as debug_check_no_locks_held() already does this if lockdep
561 * is enabled.
562 */
563void exit_rcu(void)
564{
565 struct task_struct *t = current;
566
567 if (t->rcu_read_lock_nesting == 0)
568 return;
569 t->rcu_read_lock_nesting = 1;
570 rcu_read_unlock();
571}
572
573#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
574
575/*
576 * Because preemptible RCU does not exist, it never has any callbacks
577 * to check.
578 */
579static void rcu_preempt_check_callbacks(void)
580{
581}
582
583/*
584 * Because preemptible RCU does not exist, it never has any callbacks
585 * to remove.
586 */
587static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
588{
589}
590
591/*
592 * Because preemptible RCU does not exist, it never has any callbacks
593 * to process.
594 */
595static void rcu_preempt_process_callbacks(void)
596{
597}
598
599#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
600
25#ifdef CONFIG_DEBUG_LOCK_ALLOC 601#ifdef CONFIG_DEBUG_LOCK_ALLOC
26 602
27#include <linux/kernel_stat.h> 603#include <linux/kernel_stat.h>
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 2e2726d790b9..9d8e8fb2515f 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -120,7 +120,7 @@ struct rcu_torture {
120}; 120};
121 121
122static LIST_HEAD(rcu_torture_freelist); 122static LIST_HEAD(rcu_torture_freelist);
123static struct rcu_torture *rcu_torture_current; 123static struct rcu_torture __rcu *rcu_torture_current;
124static long rcu_torture_current_version; 124static long rcu_torture_current_version;
125static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 125static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
126static DEFINE_SPINLOCK(rcu_torture_lock); 126static DEFINE_SPINLOCK(rcu_torture_lock);
@@ -153,8 +153,10 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
153#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ 153#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */
154#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ 154#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */
155static int fullstop = FULLSTOP_RMMOD; 155static int fullstop = FULLSTOP_RMMOD;
156DEFINE_MUTEX(fullstop_mutex); /* Protect fullstop transitions and spawning */ 156/*
157 /* of kthreads. */ 157 * Protect fullstop transitions and spawning of kthreads.
158 */
159static DEFINE_MUTEX(fullstop_mutex);
158 160
159/* 161/*
160 * Detect and respond to a system shutdown. 162 * Detect and respond to a system shutdown.
@@ -303,6 +305,10 @@ static void rcu_read_delay(struct rcu_random_state *rrsp)
303 mdelay(longdelay_ms); 305 mdelay(longdelay_ms);
304 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) 306 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
305 udelay(shortdelay_us); 307 udelay(shortdelay_us);
308#ifdef CONFIG_PREEMPT
309 if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000)))
310 preempt_schedule(); /* No QS if preempt_disable() in effect */
311#endif
306} 312}
307 313
308static void rcu_torture_read_unlock(int idx) __releases(RCU) 314static void rcu_torture_read_unlock(int idx) __releases(RCU)
@@ -536,6 +542,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
536 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); 542 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick);
537 if (!delay) 543 if (!delay)
538 schedule_timeout_interruptible(longdelay); 544 schedule_timeout_interruptible(longdelay);
545 else
546 rcu_read_delay(rrsp);
539} 547}
540 548
541static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) 549static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
@@ -731,7 +739,8 @@ rcu_torture_writer(void *arg)
731 continue; 739 continue;
732 rp->rtort_pipe_count = 0; 740 rp->rtort_pipe_count = 0;
733 udelay(rcu_random(&rand) & 0x3ff); 741 udelay(rcu_random(&rand) & 0x3ff);
734 old_rp = rcu_torture_current; 742 old_rp = rcu_dereference_check(rcu_torture_current,
743 current == writer_task);
735 rp->rtort_mbtest = 1; 744 rp->rtort_mbtest = 1;
736 rcu_assign_pointer(rcu_torture_current, rp); 745 rcu_assign_pointer(rcu_torture_current, rp);
737 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ 746 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d5bc43976c5a..ccdc04c47981 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -143,6 +143,11 @@ module_param(blimit, int, 0);
143module_param(qhimark, int, 0); 143module_param(qhimark, int, 0);
144module_param(qlowmark, int, 0); 144module_param(qlowmark, int, 0);
145 145
146#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
147int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT;
148module_param(rcu_cpu_stall_suppress, int, 0644);
149#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
150
146static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 151static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
147static int rcu_pending(int cpu); 152static int rcu_pending(int cpu);
148 153
@@ -450,7 +455,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
450 455
451#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 456#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
452 457
453int rcu_cpu_stall_panicking __read_mostly; 458int rcu_cpu_stall_suppress __read_mostly;
454 459
455static void record_gp_stall_check_time(struct rcu_state *rsp) 460static void record_gp_stall_check_time(struct rcu_state *rsp)
456{ 461{
@@ -482,8 +487,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
482 rcu_print_task_stall(rnp); 487 rcu_print_task_stall(rnp);
483 raw_spin_unlock_irqrestore(&rnp->lock, flags); 488 raw_spin_unlock_irqrestore(&rnp->lock, flags);
484 489
485 /* OK, time to rat on our buddy... */ 490 /*
486 491 * OK, time to rat on our buddy...
492 * See Documentation/RCU/stallwarn.txt for info on how to debug
493 * RCU CPU stall warnings.
494 */
487 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", 495 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
488 rsp->name); 496 rsp->name);
489 rcu_for_each_leaf_node(rsp, rnp) { 497 rcu_for_each_leaf_node(rsp, rnp) {
@@ -512,6 +520,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
512 unsigned long flags; 520 unsigned long flags;
513 struct rcu_node *rnp = rcu_get_root(rsp); 521 struct rcu_node *rnp = rcu_get_root(rsp);
514 522
523 /*
524 * OK, time to rat on ourselves...
525 * See Documentation/RCU/stallwarn.txt for info on how to debug
526 * RCU CPU stall warnings.
527 */
515 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", 528 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
516 rsp->name, smp_processor_id(), jiffies - rsp->gp_start); 529 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
517 trigger_all_cpu_backtrace(); 530 trigger_all_cpu_backtrace();
@@ -530,11 +543,11 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
530 long delta; 543 long delta;
531 struct rcu_node *rnp; 544 struct rcu_node *rnp;
532 545
533 if (rcu_cpu_stall_panicking) 546 if (rcu_cpu_stall_suppress)
534 return; 547 return;
535 delta = jiffies - rsp->jiffies_stall; 548 delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
536 rnp = rdp->mynode; 549 rnp = rdp->mynode;
537 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { 550 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) {
538 551
539 /* We haven't checked in, so go dump stack. */ 552 /* We haven't checked in, so go dump stack. */
540 print_cpu_stall(rsp); 553 print_cpu_stall(rsp);
@@ -548,10 +561,26 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
548 561
549static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) 562static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
550{ 563{
551 rcu_cpu_stall_panicking = 1; 564 rcu_cpu_stall_suppress = 1;
552 return NOTIFY_DONE; 565 return NOTIFY_DONE;
553} 566}
554 567
568/**
569 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
570 *
571 * Set the stall-warning timeout way off into the future, thus preventing
572 * any RCU CPU stall-warning messages from appearing in the current set of
573 * RCU grace periods.
574 *
575 * The caller must disable hard irqs.
576 */
577void rcu_cpu_stall_reset(void)
578{
579 rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
580 rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
581 rcu_preempt_stall_reset();
582}
583
555static struct notifier_block rcu_panic_block = { 584static struct notifier_block rcu_panic_block = {
556 .notifier_call = rcu_panic, 585 .notifier_call = rcu_panic,
557}; 586};
@@ -571,6 +600,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
571{ 600{
572} 601}
573 602
603void rcu_cpu_stall_reset(void)
604{
605}
606
574static void __init check_cpu_stall_init(void) 607static void __init check_cpu_stall_init(void)
575{ 608{
576} 609}
@@ -712,7 +745,7 @@ static void
712rcu_start_gp(struct rcu_state *rsp, unsigned long flags) 745rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
713 __releases(rcu_get_root(rsp)->lock) 746 __releases(rcu_get_root(rsp)->lock)
714{ 747{
715 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 748 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
716 struct rcu_node *rnp = rcu_get_root(rsp); 749 struct rcu_node *rnp = rcu_get_root(rsp);
717 750
718 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { 751 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
@@ -960,7 +993,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
960static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 993static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
961{ 994{
962 int i; 995 int i;
963 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 996 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
964 997
965 if (rdp->nxtlist == NULL) 998 if (rdp->nxtlist == NULL)
966 return; /* irqs disabled, so comparison is stable. */ 999 return; /* irqs disabled, so comparison is stable. */
@@ -971,6 +1004,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
971 for (i = 0; i < RCU_NEXT_SIZE; i++) 1004 for (i = 0; i < RCU_NEXT_SIZE; i++)
972 rdp->nxttail[i] = &rdp->nxtlist; 1005 rdp->nxttail[i] = &rdp->nxtlist;
973 rsp->orphan_qlen += rdp->qlen; 1006 rsp->orphan_qlen += rdp->qlen;
1007 rdp->n_cbs_orphaned += rdp->qlen;
974 rdp->qlen = 0; 1008 rdp->qlen = 0;
975 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1009 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
976} 1010}
@@ -984,7 +1018,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
984 struct rcu_data *rdp; 1018 struct rcu_data *rdp;
985 1019
986 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1020 raw_spin_lock_irqsave(&rsp->onofflock, flags);
987 rdp = rsp->rda[smp_processor_id()]; 1021 rdp = this_cpu_ptr(rsp->rda);
988 if (rsp->orphan_cbs_list == NULL) { 1022 if (rsp->orphan_cbs_list == NULL) {
989 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 1023 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
990 return; 1024 return;
@@ -992,6 +1026,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
992 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; 1026 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
993 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; 1027 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
994 rdp->qlen += rsp->orphan_qlen; 1028 rdp->qlen += rsp->orphan_qlen;
1029 rdp->n_cbs_adopted += rsp->orphan_qlen;
995 rsp->orphan_cbs_list = NULL; 1030 rsp->orphan_cbs_list = NULL;
996 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; 1031 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
997 rsp->orphan_qlen = 0; 1032 rsp->orphan_qlen = 0;
@@ -1007,7 +1042,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1007 unsigned long flags; 1042 unsigned long flags;
1008 unsigned long mask; 1043 unsigned long mask;
1009 int need_report = 0; 1044 int need_report = 0;
1010 struct rcu_data *rdp = rsp->rda[cpu]; 1045 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1011 struct rcu_node *rnp; 1046 struct rcu_node *rnp;
1012 1047
1013 /* Exclude any attempts to start a new grace period. */ 1048 /* Exclude any attempts to start a new grace period. */
@@ -1123,6 +1158,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1123 1158
1124 /* Update count, and requeue any remaining callbacks. */ 1159 /* Update count, and requeue any remaining callbacks. */
1125 rdp->qlen -= count; 1160 rdp->qlen -= count;
1161 rdp->n_cbs_invoked += count;
1126 if (list != NULL) { 1162 if (list != NULL) {
1127 *tail = rdp->nxtlist; 1163 *tail = rdp->nxtlist;
1128 rdp->nxtlist = list; 1164 rdp->nxtlist = list;
@@ -1226,7 +1262,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1226 cpu = rnp->grplo; 1262 cpu = rnp->grplo;
1227 bit = 1; 1263 bit = 1;
1228 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { 1264 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
1229 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1265 if ((rnp->qsmask & bit) != 0 &&
1266 f(per_cpu_ptr(rsp->rda, cpu)))
1230 mask |= bit; 1267 mask |= bit;
1231 } 1268 }
1232 if (mask != 0) { 1269 if (mask != 0) {
@@ -1402,7 +1439,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1402 * a quiescent state betweentimes. 1439 * a quiescent state betweentimes.
1403 */ 1440 */
1404 local_irq_save(flags); 1441 local_irq_save(flags);
1405 rdp = rsp->rda[smp_processor_id()]; 1442 rdp = this_cpu_ptr(rsp->rda);
1406 rcu_process_gp_end(rsp, rdp); 1443 rcu_process_gp_end(rsp, rdp);
1407 check_for_new_grace_period(rsp, rdp); 1444 check_for_new_grace_period(rsp, rdp);
1408 1445
@@ -1701,7 +1738,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1701{ 1738{
1702 unsigned long flags; 1739 unsigned long flags;
1703 int i; 1740 int i;
1704 struct rcu_data *rdp = rsp->rda[cpu]; 1741 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1705 struct rcu_node *rnp = rcu_get_root(rsp); 1742 struct rcu_node *rnp = rcu_get_root(rsp);
1706 1743
1707 /* Set up local state, ensuring consistent view of global state. */ 1744 /* Set up local state, ensuring consistent view of global state. */
@@ -1729,7 +1766,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1729{ 1766{
1730 unsigned long flags; 1767 unsigned long flags;
1731 unsigned long mask; 1768 unsigned long mask;
1732 struct rcu_data *rdp = rsp->rda[cpu]; 1769 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1733 struct rcu_node *rnp = rcu_get_root(rsp); 1770 struct rcu_node *rnp = rcu_get_root(rsp);
1734 1771
1735 /* Set up local state, ensuring consistent view of global state. */ 1772 /* Set up local state, ensuring consistent view of global state. */
@@ -1865,7 +1902,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1865/* 1902/*
1866 * Helper function for rcu_init() that initializes one rcu_state structure. 1903 * Helper function for rcu_init() that initializes one rcu_state structure.
1867 */ 1904 */
1868static void __init rcu_init_one(struct rcu_state *rsp) 1905static void __init rcu_init_one(struct rcu_state *rsp,
1906 struct rcu_data __percpu *rda)
1869{ 1907{
1870 static char *buf[] = { "rcu_node_level_0", 1908 static char *buf[] = { "rcu_node_level_0",
1871 "rcu_node_level_1", 1909 "rcu_node_level_1",
@@ -1918,37 +1956,23 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1918 } 1956 }
1919 } 1957 }
1920 1958
1959 rsp->rda = rda;
1921 rnp = rsp->level[NUM_RCU_LVLS - 1]; 1960 rnp = rsp->level[NUM_RCU_LVLS - 1];
1922 for_each_possible_cpu(i) { 1961 for_each_possible_cpu(i) {
1923 while (i > rnp->grphi) 1962 while (i > rnp->grphi)
1924 rnp++; 1963 rnp++;
1925 rsp->rda[i]->mynode = rnp; 1964 per_cpu_ptr(rsp->rda, i)->mynode = rnp;
1926 rcu_boot_init_percpu_data(i, rsp); 1965 rcu_boot_init_percpu_data(i, rsp);
1927 } 1966 }
1928} 1967}
1929 1968
1930/*
1931 * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used
1932 * nowhere else! Assigns leaf node pointers into each CPU's rcu_data
1933 * structure.
1934 */
1935#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1936do { \
1937 int i; \
1938 \
1939 for_each_possible_cpu(i) { \
1940 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1941 } \
1942 rcu_init_one(rsp); \
1943} while (0)
1944
1945void __init rcu_init(void) 1969void __init rcu_init(void)
1946{ 1970{
1947 int cpu; 1971 int cpu;
1948 1972
1949 rcu_bootup_announce(); 1973 rcu_bootup_announce();
1950 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 1974 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
1951 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 1975 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
1952 __rcu_init_preempt(); 1976 __rcu_init_preempt();
1953 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1977 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1954 1978
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 14c040b18ed0..91d4170c5c13 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -202,6 +202,9 @@ struct rcu_data {
202 long qlen; /* # of queued callbacks */ 202 long qlen; /* # of queued callbacks */
203 long qlen_last_fqs_check; 203 long qlen_last_fqs_check;
204 /* qlen at last check for QS forcing */ 204 /* qlen at last check for QS forcing */
205 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
206 unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */
207 unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */
205 unsigned long n_force_qs_snap; 208 unsigned long n_force_qs_snap;
206 /* did other CPU force QS recently? */ 209 /* did other CPU force QS recently? */
207 long blimit; /* Upper limit on a processed batch */ 210 long blimit; /* Upper limit on a processed batch */
@@ -254,19 +257,23 @@ struct rcu_data {
254#define RCU_STALL_DELAY_DELTA 0 257#define RCU_STALL_DELAY_DELTA 0
255#endif 258#endif
256 259
257#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA) 260#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
261 RCU_STALL_DELAY_DELTA)
258 /* for rsp->jiffies_stall */ 262 /* for rsp->jiffies_stall */
259#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA) 263#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
260 /* for rsp->jiffies_stall */ 264 /* for rsp->jiffies_stall */
261#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 265#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
262 /* to take at least one */ 266 /* to take at least one */
263 /* scheduling clock irq */ 267 /* scheduling clock irq */
264 /* before ratting on them. */ 268 /* before ratting on them. */
265 269
266#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 270#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE
271#define RCU_CPU_STALL_SUPPRESS_INIT 0
272#else
273#define RCU_CPU_STALL_SUPPRESS_INIT 1
274#endif
267 275
268#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) 276#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
269#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
270 277
271/* 278/*
272 * RCU global state, including node hierarchy. This hierarchy is 279 * RCU global state, including node hierarchy. This hierarchy is
@@ -283,7 +290,7 @@ struct rcu_state {
283 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ 290 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
284 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ 291 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
285 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ 292 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
286 struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */ 293 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
287 294
288 /* The following fields are guarded by the root rcu_node's lock. */ 295 /* The following fields are guarded by the root rcu_node's lock. */
289 296
@@ -365,6 +372,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
365#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 372#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
366static void rcu_print_detail_task_stall(struct rcu_state *rsp); 373static void rcu_print_detail_task_stall(struct rcu_state *rsp);
367static void rcu_print_task_stall(struct rcu_node *rnp); 374static void rcu_print_task_stall(struct rcu_node *rnp);
375static void rcu_preempt_stall_reset(void);
368#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 376#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
369static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 377static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
370#ifdef CONFIG_HOTPLUG_CPU 378#ifdef CONFIG_HOTPLUG_CPU
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 0e4f420245d9..71a4147473f9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -57,7 +57,7 @@ static void __init rcu_bootup_announce_oddness(void)
57 printk(KERN_INFO 57 printk(KERN_INFO
58 "\tRCU-based detection of stalled CPUs is disabled.\n"); 58 "\tRCU-based detection of stalled CPUs is disabled.\n");
59#endif 59#endif
60#ifndef CONFIG_RCU_CPU_STALL_VERBOSE 60#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
61 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); 61 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
62#endif 62#endif
63#if NUM_RCU_LVL_4 != 0 63#if NUM_RCU_LVL_4 != 0
@@ -154,7 +154,7 @@ static void rcu_preempt_note_context_switch(int cpu)
154 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 154 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
155 155
156 /* Possibly blocking in an RCU read-side critical section. */ 156 /* Possibly blocking in an RCU read-side critical section. */
157 rdp = rcu_preempt_state.rda[cpu]; 157 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
158 rnp = rdp->mynode; 158 rnp = rdp->mynode;
159 raw_spin_lock_irqsave(&rnp->lock, flags); 159 raw_spin_lock_irqsave(&rnp->lock, flags);
160 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 160 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -201,7 +201,7 @@ static void rcu_preempt_note_context_switch(int cpu)
201 */ 201 */
202void __rcu_read_lock(void) 202void __rcu_read_lock(void)
203{ 203{
204 ACCESS_ONCE(current->rcu_read_lock_nesting)++; 204 current->rcu_read_lock_nesting++;
205 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ 205 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
206} 206}
207EXPORT_SYMBOL_GPL(__rcu_read_lock); 207EXPORT_SYMBOL_GPL(__rcu_read_lock);
@@ -344,7 +344,9 @@ void __rcu_read_unlock(void)
344 struct task_struct *t = current; 344 struct task_struct *t = current;
345 345
346 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ 346 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
347 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && 347 --t->rcu_read_lock_nesting;
348 barrier(); /* decrement before load of ->rcu_read_unlock_special */
349 if (t->rcu_read_lock_nesting == 0 &&
348 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 350 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
349 rcu_read_unlock_special(t); 351 rcu_read_unlock_special(t);
350#ifdef CONFIG_PROVE_LOCKING 352#ifdef CONFIG_PROVE_LOCKING
@@ -417,6 +419,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
417 } 419 }
418} 420}
419 421
422/*
423 * Suppress preemptible RCU's CPU stall warnings by pushing the
424 * time of the next stall-warning message comfortably far into the
425 * future.
426 */
427static void rcu_preempt_stall_reset(void)
428{
429 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
430}
431
420#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 432#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
421 433
422/* 434/*
@@ -546,9 +558,11 @@ EXPORT_SYMBOL_GPL(call_rcu);
546 * 558 *
547 * Control will return to the caller some time after a full grace 559 * Control will return to the caller some time after a full grace
548 * period has elapsed, in other words after all currently executing RCU 560 * period has elapsed, in other words after all currently executing RCU
549 * read-side critical sections have completed. RCU read-side critical 561 * read-side critical sections have completed. Note, however, that
550 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 562 * upon return from synchronize_rcu(), the caller might well be executing
551 * and may be nested. 563 * concurrently with new RCU read-side critical sections that began while
564 * synchronize_rcu() was waiting. RCU read-side critical sections are
565 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
552 */ 566 */
553void synchronize_rcu(void) 567void synchronize_rcu(void)
554{ 568{
@@ -771,7 +785,7 @@ static void rcu_preempt_send_cbs_to_orphanage(void)
771 */ 785 */
772static void __init __rcu_init_preempt(void) 786static void __init __rcu_init_preempt(void)
773{ 787{
774 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); 788 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
775} 789}
776 790
777/* 791/*
@@ -865,6 +879,14 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
865{ 879{
866} 880}
867 881
882/*
883 * Because preemptible RCU does not exist, there is no need to suppress
884 * its CPU stall warnings.
885 */
886static void rcu_preempt_stall_reset(void)
887{
888}
889
868#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 890#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
869 891
870/* 892/*
@@ -919,15 +941,6 @@ static void rcu_preempt_process_callbacks(void)
919} 941}
920 942
921/* 943/*
922 * In classic RCU, call_rcu() is just call_rcu_sched().
923 */
924void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
925{
926 call_rcu_sched(head, func);
927}
928EXPORT_SYMBOL_GPL(call_rcu);
929
930/*
931 * Wait for an rcu-preempt grace period, but make it happen quickly. 944 * Wait for an rcu-preempt grace period, but make it happen quickly.
932 * But because preemptable RCU does not exist, map to rcu-sched. 945 * But because preemptable RCU does not exist, map to rcu-sched.
933 */ 946 */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 36c95b45738e..d15430b9d122 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -64,7 +64,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
64 rdp->dynticks_fqs); 64 rdp->dynticks_fqs);
65#endif /* #ifdef CONFIG_NO_HZ */ 65#endif /* #ifdef CONFIG_NO_HZ */
66 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 66 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
67 seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit); 67 seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit);
68 seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
69 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
68} 70}
69 71
70#define PRINT_RCU_DATA(name, func, m) \ 72#define PRINT_RCU_DATA(name, func, m) \
@@ -119,7 +121,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
119 rdp->dynticks_fqs); 121 rdp->dynticks_fqs);
120#endif /* #ifdef CONFIG_NO_HZ */ 122#endif /* #ifdef CONFIG_NO_HZ */
121 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 123 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
122 seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit); 124 seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit);
125 seq_printf(m, ",%lu,%lu,%lu\n",
126 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
123} 127}
124 128
125static int show_rcudata_csv(struct seq_file *m, void *unused) 129static int show_rcudata_csv(struct seq_file *m, void *unused)
@@ -128,7 +132,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
128#ifdef CONFIG_NO_HZ 132#ifdef CONFIG_NO_HZ
129 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); 133 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
130#endif /* #ifdef CONFIG_NO_HZ */ 134#endif /* #ifdef CONFIG_NO_HZ */
131 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); 135 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n");
132#ifdef CONFIG_TREE_PREEMPT_RCU 136#ifdef CONFIG_TREE_PREEMPT_RCU
133 seq_puts(m, "\"rcu_preempt:\"\n"); 137 seq_puts(m, "\"rcu_preempt:\"\n");
134 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); 138 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
@@ -262,7 +266,7 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
262 struct rcu_data *rdp; 266 struct rcu_data *rdp;
263 267
264 for_each_possible_cpu(cpu) { 268 for_each_possible_cpu(cpu) {
265 rdp = rsp->rda[cpu]; 269 rdp = per_cpu_ptr(rsp->rda, cpu);
266 if (rdp->beenonline) 270 if (rdp->beenonline)
267 print_one_rcu_pending(m, rdp); 271 print_one_rcu_pending(m, rdp);
268 } 272 }
diff --git a/kernel/sched.c b/kernel/sched.c
index c0d2067f3e0d..d42992bccdfa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -426,9 +426,7 @@ struct root_domain {
426 */ 426 */
427 cpumask_var_t rto_mask; 427 cpumask_var_t rto_mask;
428 atomic_t rto_count; 428 atomic_t rto_count;
429#ifdef CONFIG_SMP
430 struct cpupri cpupri; 429 struct cpupri cpupri;
431#endif
432}; 430};
433 431
434/* 432/*
@@ -437,7 +435,7 @@ struct root_domain {
437 */ 435 */
438static struct root_domain def_root_domain; 436static struct root_domain def_root_domain;
439 437
440#endif 438#endif /* CONFIG_SMP */
441 439
442/* 440/*
443 * This is the main, per-CPU runqueue data structure. 441 * This is the main, per-CPU runqueue data structure.
@@ -488,11 +486,12 @@ struct rq {
488 */ 486 */
489 unsigned long nr_uninterruptible; 487 unsigned long nr_uninterruptible;
490 488
491 struct task_struct *curr, *idle; 489 struct task_struct *curr, *idle, *stop;
492 unsigned long next_balance; 490 unsigned long next_balance;
493 struct mm_struct *prev_mm; 491 struct mm_struct *prev_mm;
494 492
495 u64 clock; 493 u64 clock;
494 u64 clock_task;
496 495
497 atomic_t nr_iowait; 496 atomic_t nr_iowait;
498 497
@@ -520,6 +519,10 @@ struct rq {
520 u64 avg_idle; 519 u64 avg_idle;
521#endif 520#endif
522 521
522#ifdef CONFIG_IRQ_TIME_ACCOUNTING
523 u64 prev_irq_time;
524#endif
525
523 /* calc_load related fields */ 526 /* calc_load related fields */
524 unsigned long calc_load_update; 527 unsigned long calc_load_update;
525 long calc_load_active; 528 long calc_load_active;
@@ -643,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p)
643 646
644#endif /* CONFIG_CGROUP_SCHED */ 647#endif /* CONFIG_CGROUP_SCHED */
645 648
649static u64 irq_time_cpu(int cpu);
650static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
651
646inline void update_rq_clock(struct rq *rq) 652inline void update_rq_clock(struct rq *rq)
647{ 653{
648 if (!rq->skip_clock_update) 654 if (!rq->skip_clock_update) {
649 rq->clock = sched_clock_cpu(cpu_of(rq)); 655 int cpu = cpu_of(rq);
656 u64 irq_time;
657
658 rq->clock = sched_clock_cpu(cpu);
659 irq_time = irq_time_cpu(cpu);
660 if (rq->clock - irq_time > rq->clock_task)
661 rq->clock_task = rq->clock - irq_time;
662
663 sched_irq_time_avg_update(rq, irq_time);
664 }
650} 665}
651 666
652/* 667/*
@@ -723,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
723 size_t cnt, loff_t *ppos) 738 size_t cnt, loff_t *ppos)
724{ 739{
725 char buf[64]; 740 char buf[64];
726 char *cmp = buf; 741 char *cmp;
727 int neg = 0; 742 int neg = 0;
728 int i; 743 int i;
729 744
@@ -734,6 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
734 return -EFAULT; 749 return -EFAULT;
735 750
736 buf[cnt] = 0; 751 buf[cnt] = 0;
752 cmp = strstrip(buf);
737 753
738 if (strncmp(buf, "NO_", 3) == 0) { 754 if (strncmp(buf, "NO_", 3) == 0) {
739 neg = 1; 755 neg = 1;
@@ -741,9 +757,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
741 } 757 }
742 758
743 for (i = 0; sched_feat_names[i]; i++) { 759 for (i = 0; sched_feat_names[i]; i++) {
744 int len = strlen(sched_feat_names[i]); 760 if (strcmp(cmp, sched_feat_names[i]) == 0) {
745
746 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
747 if (neg) 761 if (neg)
748 sysctl_sched_features &= ~(1UL << i); 762 sysctl_sched_features &= ~(1UL << i);
749 else 763 else
@@ -1840,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1840 1854
1841static const struct sched_class rt_sched_class; 1855static const struct sched_class rt_sched_class;
1842 1856
1843#define sched_class_highest (&rt_sched_class) 1857#define sched_class_highest (&stop_sched_class)
1844#define for_each_class(class) \ 1858#define for_each_class(class) \
1845 for (class = sched_class_highest; class; class = class->next) 1859 for (class = sched_class_highest; class; class = class->next)
1846 1860
@@ -1858,12 +1872,6 @@ static void dec_nr_running(struct rq *rq)
1858 1872
1859static void set_load_weight(struct task_struct *p) 1873static void set_load_weight(struct task_struct *p)
1860{ 1874{
1861 if (task_has_rt_policy(p)) {
1862 p->se.load.weight = 0;
1863 p->se.load.inv_weight = WMULT_CONST;
1864 return;
1865 }
1866
1867 /* 1875 /*
1868 * SCHED_IDLE tasks get minimal weight: 1876 * SCHED_IDLE tasks get minimal weight:
1869 */ 1877 */
@@ -1917,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1917 dec_nr_running(rq); 1925 dec_nr_running(rq);
1918} 1926}
1919 1927
1928#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1929
1930/*
1931 * There are no locks covering percpu hardirq/softirq time.
1932 * They are only modified in account_system_vtime, on corresponding CPU
1933 * with interrupts disabled. So, writes are safe.
1934 * They are read and saved off onto struct rq in update_rq_clock().
1935 * This may result in other CPU reading this CPU's irq time and can
1936 * race with irq/account_system_vtime on this CPU. We would either get old
1937 * or new value (or semi updated value on 32 bit) with a side effect of
1938 * accounting a slice of irq time to wrong task when irq is in progress
1939 * while we read rq->clock. That is a worthy compromise in place of having
1940 * locks on each irq in account_system_time.
1941 */
1942static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1943static DEFINE_PER_CPU(u64, cpu_softirq_time);
1944
1945static DEFINE_PER_CPU(u64, irq_start_time);
1946static int sched_clock_irqtime;
1947
1948void enable_sched_clock_irqtime(void)
1949{
1950 sched_clock_irqtime = 1;
1951}
1952
1953void disable_sched_clock_irqtime(void)
1954{
1955 sched_clock_irqtime = 0;
1956}
1957
1958static u64 irq_time_cpu(int cpu)
1959{
1960 if (!sched_clock_irqtime)
1961 return 0;
1962
1963 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1964}
1965
1966void account_system_vtime(struct task_struct *curr)
1967{
1968 unsigned long flags;
1969 int cpu;
1970 u64 now, delta;
1971
1972 if (!sched_clock_irqtime)
1973 return;
1974
1975 local_irq_save(flags);
1976
1977 cpu = smp_processor_id();
1978 now = sched_clock_cpu(cpu);
1979 delta = now - per_cpu(irq_start_time, cpu);
1980 per_cpu(irq_start_time, cpu) = now;
1981 /*
1982 * We do not account for softirq time from ksoftirqd here.
1983 * We want to continue accounting softirq time to ksoftirqd thread
1984 * in that case, so as not to confuse scheduler with a special task
1985 * that do not consume any time, but still wants to run.
1986 */
1987 if (hardirq_count())
1988 per_cpu(cpu_hardirq_time, cpu) += delta;
1989 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
1990 per_cpu(cpu_softirq_time, cpu) += delta;
1991
1992 local_irq_restore(flags);
1993}
1994EXPORT_SYMBOL_GPL(account_system_vtime);
1995
1996static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
1997{
1998 if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
1999 u64 delta_irq = curr_irq_time - rq->prev_irq_time;
2000 rq->prev_irq_time = curr_irq_time;
2001 sched_rt_avg_update(rq, delta_irq);
2002 }
2003}
2004
2005#else
2006
2007static u64 irq_time_cpu(int cpu)
2008{
2009 return 0;
2010}
2011
2012static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
2013
2014#endif
2015
1920#include "sched_idletask.c" 2016#include "sched_idletask.c"
1921#include "sched_fair.c" 2017#include "sched_fair.c"
1922#include "sched_rt.c" 2018#include "sched_rt.c"
2019#include "sched_stoptask.c"
1923#ifdef CONFIG_SCHED_DEBUG 2020#ifdef CONFIG_SCHED_DEBUG
1924# include "sched_debug.c" 2021# include "sched_debug.c"
1925#endif 2022#endif
1926 2023
2024void sched_set_stop_task(int cpu, struct task_struct *stop)
2025{
2026 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2027 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2028
2029 if (stop) {
2030 /*
2031 * Make it appear like a SCHED_FIFO task, its something
2032 * userspace knows about and won't get confused about.
2033 *
2034 * Also, it will make PI more or less work without too
2035 * much confusion -- but then, stop work should not
2036 * rely on PI working anyway.
2037 */
2038 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2039
2040 stop->sched_class = &stop_sched_class;
2041 }
2042
2043 cpu_rq(cpu)->stop = stop;
2044
2045 if (old_stop) {
2046 /*
2047 * Reset it back to a normal scheduling class so that
2048 * it can die in pieces.
2049 */
2050 old_stop->sched_class = &rt_sched_class;
2051 }
2052}
2053
1927/* 2054/*
1928 * __normal_prio - return the priority that is based on the static prio 2055 * __normal_prio - return the priority that is based on the static prio
1929 */ 2056 */
@@ -2003,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2003 if (p->sched_class != &fair_sched_class) 2130 if (p->sched_class != &fair_sched_class)
2004 return 0; 2131 return 0;
2005 2132
2133 if (unlikely(p->policy == SCHED_IDLE))
2134 return 0;
2135
2006 /* 2136 /*
2007 * Buddy candidates are cache hot: 2137 * Buddy candidates are cache hot:
2008 */ 2138 */
@@ -2852,14 +2982,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2852 */ 2982 */
2853 arch_start_context_switch(prev); 2983 arch_start_context_switch(prev);
2854 2984
2855 if (likely(!mm)) { 2985 if (!mm) {
2856 next->active_mm = oldmm; 2986 next->active_mm = oldmm;
2857 atomic_inc(&oldmm->mm_count); 2987 atomic_inc(&oldmm->mm_count);
2858 enter_lazy_tlb(oldmm, next); 2988 enter_lazy_tlb(oldmm, next);
2859 } else 2989 } else
2860 switch_mm(oldmm, mm, next); 2990 switch_mm(oldmm, mm, next);
2861 2991
2862 if (likely(!prev->mm)) { 2992 if (!prev->mm) {
2863 prev->active_mm = NULL; 2993 prev->active_mm = NULL;
2864 rq->prev_mm = oldmm; 2994 rq->prev_mm = oldmm;
2865 } 2995 }
@@ -3248,7 +3378,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3248 3378
3249 if (task_current(rq, p)) { 3379 if (task_current(rq, p)) {
3250 update_rq_clock(rq); 3380 update_rq_clock(rq);
3251 ns = rq->clock - p->se.exec_start; 3381 ns = rq->clock_task - p->se.exec_start;
3252 if ((s64)ns < 0) 3382 if ((s64)ns < 0)
3253 ns = 0; 3383 ns = 0;
3254 } 3384 }
@@ -3397,7 +3527,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3397 tmp = cputime_to_cputime64(cputime); 3527 tmp = cputime_to_cputime64(cputime);
3398 if (hardirq_count() - hardirq_offset) 3528 if (hardirq_count() - hardirq_offset)
3399 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3529 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3400 else if (softirq_count()) 3530 else if (in_serving_softirq())
3401 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3531 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3402 else 3532 else
3403 cpustat->system = cputime64_add(cpustat->system, tmp); 3533 cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -3723,17 +3853,13 @@ pick_next_task(struct rq *rq)
3723 return p; 3853 return p;
3724 } 3854 }
3725 3855
3726 class = sched_class_highest; 3856 for_each_class(class) {
3727 for ( ; ; ) {
3728 p = class->pick_next_task(rq); 3857 p = class->pick_next_task(rq);
3729 if (p) 3858 if (p)
3730 return p; 3859 return p;
3731 /*
3732 * Will never be NULL as the idle class always
3733 * returns a non-NULL p:
3734 */
3735 class = class->next;
3736 } 3860 }
3861
3862 BUG(); /* the idle class will always have a runnable task */
3737} 3863}
3738 3864
3739/* 3865/*
@@ -4358,6 +4484,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4358 4484
4359 rq = task_rq_lock(p, &flags); 4485 rq = task_rq_lock(p, &flags);
4360 4486
4487 trace_sched_pi_setprio(p, prio);
4361 oldprio = p->prio; 4488 oldprio = p->prio;
4362 prev_class = p->sched_class; 4489 prev_class = p->sched_class;
4363 on_rq = p->se.on_rq; 4490 on_rq = p->se.on_rq;
@@ -4645,7 +4772,7 @@ recheck:
4645 } 4772 }
4646 4773
4647 if (user) { 4774 if (user) {
4648 retval = security_task_setscheduler(p, policy, param); 4775 retval = security_task_setscheduler(p);
4649 if (retval) 4776 if (retval)
4650 return retval; 4777 return retval;
4651 } 4778 }
@@ -4661,6 +4788,15 @@ recheck:
4661 */ 4788 */
4662 rq = __task_rq_lock(p); 4789 rq = __task_rq_lock(p);
4663 4790
4791 /*
4792 * Changing the policy of the stop threads its a very bad idea
4793 */
4794 if (p == rq->stop) {
4795 __task_rq_unlock(rq);
4796 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4797 return -EINVAL;
4798 }
4799
4664#ifdef CONFIG_RT_GROUP_SCHED 4800#ifdef CONFIG_RT_GROUP_SCHED
4665 if (user) { 4801 if (user) {
4666 /* 4802 /*
@@ -4887,13 +5023,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4887 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5023 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
4888 goto out_unlock; 5024 goto out_unlock;
4889 5025
4890 retval = security_task_setscheduler(p, 0, NULL); 5026 retval = security_task_setscheduler(p);
4891 if (retval) 5027 if (retval)
4892 goto out_unlock; 5028 goto out_unlock;
4893 5029
4894 cpuset_cpus_allowed(p, cpus_allowed); 5030 cpuset_cpus_allowed(p, cpus_allowed);
4895 cpumask_and(new_mask, in_mask, cpus_allowed); 5031 cpumask_and(new_mask, in_mask, cpus_allowed);
4896 again: 5032again:
4897 retval = set_cpus_allowed_ptr(p, new_mask); 5033 retval = set_cpus_allowed_ptr(p, new_mask);
4898 5034
4899 if (!retval) { 5035 if (!retval) {
@@ -5337,7 +5473,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5337 idle->se.exec_start = sched_clock(); 5473 idle->se.exec_start = sched_clock();
5338 5474
5339 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 5475 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
5476 /*
5477 * We're having a chicken and egg problem, even though we are
5478 * holding rq->lock, the cpu isn't yet set to this cpu so the
5479 * lockdep check in task_group() will fail.
5480 *
5481 * Similar case to sched_fork(). / Alternatively we could
5482 * use task_rq_lock() here and obtain the other rq->lock.
5483 *
5484 * Silence PROVE_RCU
5485 */
5486 rcu_read_lock();
5340 __set_task_cpu(idle, cpu); 5487 __set_task_cpu(idle, cpu);
5488 rcu_read_unlock();
5341 5489
5342 rq->curr = rq->idle = idle; 5490 rq->curr = rq->idle = idle;
5343#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5491#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -6514,6 +6662,7 @@ struct s_data {
6514 cpumask_var_t nodemask; 6662 cpumask_var_t nodemask;
6515 cpumask_var_t this_sibling_map; 6663 cpumask_var_t this_sibling_map;
6516 cpumask_var_t this_core_map; 6664 cpumask_var_t this_core_map;
6665 cpumask_var_t this_book_map;
6517 cpumask_var_t send_covered; 6666 cpumask_var_t send_covered;
6518 cpumask_var_t tmpmask; 6667 cpumask_var_t tmpmask;
6519 struct sched_group **sched_group_nodes; 6668 struct sched_group **sched_group_nodes;
@@ -6525,6 +6674,7 @@ enum s_alloc {
6525 sa_rootdomain, 6674 sa_rootdomain,
6526 sa_tmpmask, 6675 sa_tmpmask,
6527 sa_send_covered, 6676 sa_send_covered,
6677 sa_this_book_map,
6528 sa_this_core_map, 6678 sa_this_core_map,
6529 sa_this_sibling_map, 6679 sa_this_sibling_map,
6530 sa_nodemask, 6680 sa_nodemask,
@@ -6560,31 +6710,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
6560#ifdef CONFIG_SCHED_MC 6710#ifdef CONFIG_SCHED_MC
6561static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6711static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
6562static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); 6712static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6563#endif /* CONFIG_SCHED_MC */
6564 6713
6565#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6566static int 6714static int
6567cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6715cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6568 struct sched_group **sg, struct cpumask *mask) 6716 struct sched_group **sg, struct cpumask *mask)
6569{ 6717{
6570 int group; 6718 int group;
6571 6719#ifdef CONFIG_SCHED_SMT
6572 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 6720 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6573 group = cpumask_first(mask); 6721 group = cpumask_first(mask);
6722#else
6723 group = cpu;
6724#endif
6574 if (sg) 6725 if (sg)
6575 *sg = &per_cpu(sched_group_core, group).sg; 6726 *sg = &per_cpu(sched_group_core, group).sg;
6576 return group; 6727 return group;
6577} 6728}
6578#elif defined(CONFIG_SCHED_MC) 6729#endif /* CONFIG_SCHED_MC */
6730
6731/*
6732 * book sched-domains:
6733 */
6734#ifdef CONFIG_SCHED_BOOK
6735static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
6736static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
6737
6579static int 6738static int
6580cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6739cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
6581 struct sched_group **sg, struct cpumask *unused) 6740 struct sched_group **sg, struct cpumask *mask)
6582{ 6741{
6742 int group = cpu;
6743#ifdef CONFIG_SCHED_MC
6744 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6745 group = cpumask_first(mask);
6746#elif defined(CONFIG_SCHED_SMT)
6747 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6748 group = cpumask_first(mask);
6749#endif
6583 if (sg) 6750 if (sg)
6584 *sg = &per_cpu(sched_group_core, cpu).sg; 6751 *sg = &per_cpu(sched_group_book, group).sg;
6585 return cpu; 6752 return group;
6586} 6753}
6587#endif 6754#endif /* CONFIG_SCHED_BOOK */
6588 6755
6589static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6756static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
6590static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6757static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
@@ -6594,7 +6761,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
6594 struct sched_group **sg, struct cpumask *mask) 6761 struct sched_group **sg, struct cpumask *mask)
6595{ 6762{
6596 int group; 6763 int group;
6597#ifdef CONFIG_SCHED_MC 6764#ifdef CONFIG_SCHED_BOOK
6765 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
6766 group = cpumask_first(mask);
6767#elif defined(CONFIG_SCHED_MC)
6598 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 6768 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6599 group = cpumask_first(mask); 6769 group = cpumask_first(mask);
6600#elif defined(CONFIG_SCHED_SMT) 6770#elif defined(CONFIG_SCHED_SMT)
@@ -6855,6 +7025,9 @@ SD_INIT_FUNC(CPU)
6855#ifdef CONFIG_SCHED_MC 7025#ifdef CONFIG_SCHED_MC
6856 SD_INIT_FUNC(MC) 7026 SD_INIT_FUNC(MC)
6857#endif 7027#endif
7028#ifdef CONFIG_SCHED_BOOK
7029 SD_INIT_FUNC(BOOK)
7030#endif
6858 7031
6859static int default_relax_domain_level = -1; 7032static int default_relax_domain_level = -1;
6860 7033
@@ -6904,6 +7077,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6904 free_cpumask_var(d->tmpmask); /* fall through */ 7077 free_cpumask_var(d->tmpmask); /* fall through */
6905 case sa_send_covered: 7078 case sa_send_covered:
6906 free_cpumask_var(d->send_covered); /* fall through */ 7079 free_cpumask_var(d->send_covered); /* fall through */
7080 case sa_this_book_map:
7081 free_cpumask_var(d->this_book_map); /* fall through */
6907 case sa_this_core_map: 7082 case sa_this_core_map:
6908 free_cpumask_var(d->this_core_map); /* fall through */ 7083 free_cpumask_var(d->this_core_map); /* fall through */
6909 case sa_this_sibling_map: 7084 case sa_this_sibling_map:
@@ -6950,8 +7125,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
6950 return sa_nodemask; 7125 return sa_nodemask;
6951 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) 7126 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
6952 return sa_this_sibling_map; 7127 return sa_this_sibling_map;
6953 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) 7128 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
6954 return sa_this_core_map; 7129 return sa_this_core_map;
7130 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7131 return sa_this_book_map;
6955 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) 7132 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
6956 return sa_send_covered; 7133 return sa_send_covered;
6957 d->rd = alloc_rootdomain(); 7134 d->rd = alloc_rootdomain();
@@ -7009,6 +7186,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
7009 return sd; 7186 return sd;
7010} 7187}
7011 7188
7189static struct sched_domain *__build_book_sched_domain(struct s_data *d,
7190 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7191 struct sched_domain *parent, int i)
7192{
7193 struct sched_domain *sd = parent;
7194#ifdef CONFIG_SCHED_BOOK
7195 sd = &per_cpu(book_domains, i).sd;
7196 SD_INIT(sd, BOOK);
7197 set_domain_attribute(sd, attr);
7198 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7199 sd->parent = parent;
7200 parent->child = sd;
7201 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7202#endif
7203 return sd;
7204}
7205
7012static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7206static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
7013 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7207 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7014 struct sched_domain *parent, int i) 7208 struct sched_domain *parent, int i)
@@ -7066,6 +7260,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
7066 d->send_covered, d->tmpmask); 7260 d->send_covered, d->tmpmask);
7067 break; 7261 break;
7068#endif 7262#endif
7263#ifdef CONFIG_SCHED_BOOK
7264 case SD_LV_BOOK: /* set up book groups */
7265 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7266 if (cpu == cpumask_first(d->this_book_map))
7267 init_sched_build_groups(d->this_book_map, cpu_map,
7268 &cpu_to_book_group,
7269 d->send_covered, d->tmpmask);
7270 break;
7271#endif
7069 case SD_LV_CPU: /* set up physical groups */ 7272 case SD_LV_CPU: /* set up physical groups */
7070 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); 7273 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
7071 if (!cpumask_empty(d->nodemask)) 7274 if (!cpumask_empty(d->nodemask))
@@ -7113,12 +7316,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7113 7316
7114 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7317 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
7115 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7318 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
7319 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
7116 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); 7320 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7117 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); 7321 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7118 } 7322 }
7119 7323
7120 for_each_cpu(i, cpu_map) { 7324 for_each_cpu(i, cpu_map) {
7121 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 7325 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7326 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
7122 build_sched_groups(&d, SD_LV_MC, cpu_map, i); 7327 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
7123 } 7328 }
7124 7329
@@ -7149,6 +7354,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7149 init_sched_groups_power(i, sd); 7354 init_sched_groups_power(i, sd);
7150 } 7355 }
7151#endif 7356#endif
7357#ifdef CONFIG_SCHED_BOOK
7358 for_each_cpu(i, cpu_map) {
7359 sd = &per_cpu(book_domains, i).sd;
7360 init_sched_groups_power(i, sd);
7361 }
7362#endif
7152 7363
7153 for_each_cpu(i, cpu_map) { 7364 for_each_cpu(i, cpu_map) {
7154 sd = &per_cpu(phys_domains, i).sd; 7365 sd = &per_cpu(phys_domains, i).sd;
@@ -7174,6 +7385,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7174 sd = &per_cpu(cpu_domains, i).sd; 7385 sd = &per_cpu(cpu_domains, i).sd;
7175#elif defined(CONFIG_SCHED_MC) 7386#elif defined(CONFIG_SCHED_MC)
7176 sd = &per_cpu(core_domains, i).sd; 7387 sd = &per_cpu(core_domains, i).sd;
7388#elif defined(CONFIG_SCHED_BOOK)
7389 sd = &per_cpu(book_domains, i).sd;
7177#else 7390#else
7178 sd = &per_cpu(phys_domains, i).sd; 7391 sd = &per_cpu(phys_domains, i).sd;
7179#endif 7392#endif
@@ -8078,9 +8291,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8078 8291
8079 return 1; 8292 return 1;
8080 8293
8081 err_free_rq: 8294err_free_rq:
8082 kfree(cfs_rq); 8295 kfree(cfs_rq);
8083 err: 8296err:
8084 return 0; 8297 return 0;
8085} 8298}
8086 8299
@@ -8168,9 +8381,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8168 8381
8169 return 1; 8382 return 1;
8170 8383
8171 err_free_rq: 8384err_free_rq:
8172 kfree(rt_rq); 8385 kfree(rt_rq);
8173 err: 8386err:
8174 return 0; 8387 return 0;
8175} 8388}
8176 8389
@@ -8528,7 +8741,7 @@ static int tg_set_bandwidth(struct task_group *tg,
8528 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8741 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8529 } 8742 }
8530 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8743 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8531 unlock: 8744unlock:
8532 read_unlock(&tasklist_lock); 8745 read_unlock(&tasklist_lock);
8533 mutex_unlock(&rt_constraints_mutex); 8746 mutex_unlock(&rt_constraints_mutex);
8534 8747
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index db3f674ca49d..933f3d1b62ea 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -25,7 +25,7 @@
25 25
26/* 26/*
27 * Targeted preemption latency for CPU-bound tasks: 27 * Targeted preemption latency for CPU-bound tasks:
28 * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds) 28 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
29 * 29 *
30 * NOTE: this latency value is not the same as the concept of 30 * NOTE: this latency value is not the same as the concept of
31 * 'timeslice length' - timeslices in CFS are of variable length 31 * 'timeslice length' - timeslices in CFS are of variable length
@@ -52,7 +52,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
52 52
53/* 53/*
54 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
55 * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
56 */ 56 */
57unsigned int sysctl_sched_min_granularity = 750000ULL; 57unsigned int sysctl_sched_min_granularity = 750000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; 58unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
@@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
519static void update_curr(struct cfs_rq *cfs_rq) 519static void update_curr(struct cfs_rq *cfs_rq)
520{ 520{
521 struct sched_entity *curr = cfs_rq->curr; 521 struct sched_entity *curr = cfs_rq->curr;
522 u64 now = rq_of(cfs_rq)->clock; 522 u64 now = rq_of(cfs_rq)->clock_task;
523 unsigned long delta_exec; 523 unsigned long delta_exec;
524 524
525 if (unlikely(!curr)) 525 if (unlikely(!curr))
@@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
602 /* 602 /*
603 * We are starting a new run period: 603 * We are starting a new run period:
604 */ 604 */
605 se->exec_start = rq_of(cfs_rq)->clock; 605 se->exec_start = rq_of(cfs_rq)->clock_task;
606} 606}
607 607
608/************************************************** 608/**************************************************
@@ -1764,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
1764 set_task_cpu(p, this_cpu); 1764 set_task_cpu(p, this_cpu);
1765 activate_task(this_rq, p, 0); 1765 activate_task(this_rq, p, 0);
1766 check_preempt_curr(this_rq, p, 0); 1766 check_preempt_curr(this_rq, p, 0);
1767
1768 /* re-arm NEWIDLE balancing when moving tasks */
1769 src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
1770 this_rq->idle_stamp = 0;
1767} 1771}
1768 1772
1769/* 1773/*
@@ -1798,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1798 * 2) too many balance attempts have failed. 1802 * 2) too many balance attempts have failed.
1799 */ 1803 */
1800 1804
1801 tsk_cache_hot = task_hot(p, rq->clock, sd); 1805 tsk_cache_hot = task_hot(p, rq->clock_task, sd);
1802 if (!tsk_cache_hot || 1806 if (!tsk_cache_hot ||
1803 sd->nr_balance_failed > sd->cache_nice_tries) { 1807 sd->nr_balance_failed > sd->cache_nice_tries) {
1804#ifdef CONFIG_SCHEDSTATS 1808#ifdef CONFIG_SCHEDSTATS
@@ -2030,12 +2034,14 @@ struct sd_lb_stats {
2030 unsigned long this_load; 2034 unsigned long this_load;
2031 unsigned long this_load_per_task; 2035 unsigned long this_load_per_task;
2032 unsigned long this_nr_running; 2036 unsigned long this_nr_running;
2037 unsigned long this_has_capacity;
2033 2038
2034 /* Statistics of the busiest group */ 2039 /* Statistics of the busiest group */
2035 unsigned long max_load; 2040 unsigned long max_load;
2036 unsigned long busiest_load_per_task; 2041 unsigned long busiest_load_per_task;
2037 unsigned long busiest_nr_running; 2042 unsigned long busiest_nr_running;
2038 unsigned long busiest_group_capacity; 2043 unsigned long busiest_group_capacity;
2044 unsigned long busiest_has_capacity;
2039 2045
2040 int group_imb; /* Is there imbalance in this sd */ 2046 int group_imb; /* Is there imbalance in this sd */
2041#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2047#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2058,6 +2064,7 @@ struct sg_lb_stats {
2058 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 2064 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2059 unsigned long group_capacity; 2065 unsigned long group_capacity;
2060 int group_imb; /* Is there an imbalance in the group ? */ 2066 int group_imb; /* Is there an imbalance in the group ? */
2067 int group_has_capacity; /* Is there extra capacity in the group? */
2061}; 2068};
2062 2069
2063/** 2070/**
@@ -2268,7 +2275,13 @@ unsigned long scale_rt_power(int cpu)
2268 u64 total, available; 2275 u64 total, available;
2269 2276
2270 total = sched_avg_period() + (rq->clock - rq->age_stamp); 2277 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2271 available = total - rq->rt_avg; 2278
2279 if (unlikely(total < rq->rt_avg)) {
2280 /* Ensures that power won't end up being negative */
2281 available = 0;
2282 } else {
2283 available = total - rq->rt_avg;
2284 }
2272 2285
2273 if (unlikely((s64)total < SCHED_LOAD_SCALE)) 2286 if (unlikely((s64)total < SCHED_LOAD_SCALE))
2274 total = SCHED_LOAD_SCALE; 2287 total = SCHED_LOAD_SCALE;
@@ -2378,7 +2391,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2378 int local_group, const struct cpumask *cpus, 2391 int local_group, const struct cpumask *cpus,
2379 int *balance, struct sg_lb_stats *sgs) 2392 int *balance, struct sg_lb_stats *sgs)
2380{ 2393{
2381 unsigned long load, max_cpu_load, min_cpu_load; 2394 unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
2382 int i; 2395 int i;
2383 unsigned int balance_cpu = -1, first_idle_cpu = 0; 2396 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2384 unsigned long avg_load_per_task = 0; 2397 unsigned long avg_load_per_task = 0;
@@ -2389,6 +2402,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2389 /* Tally up the load of all CPUs in the group */ 2402 /* Tally up the load of all CPUs in the group */
2390 max_cpu_load = 0; 2403 max_cpu_load = 0;
2391 min_cpu_load = ~0UL; 2404 min_cpu_load = ~0UL;
2405 max_nr_running = 0;
2392 2406
2393 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 2407 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2394 struct rq *rq = cpu_rq(i); 2408 struct rq *rq = cpu_rq(i);
@@ -2406,8 +2420,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2406 load = target_load(i, load_idx); 2420 load = target_load(i, load_idx);
2407 } else { 2421 } else {
2408 load = source_load(i, load_idx); 2422 load = source_load(i, load_idx);
2409 if (load > max_cpu_load) 2423 if (load > max_cpu_load) {
2410 max_cpu_load = load; 2424 max_cpu_load = load;
2425 max_nr_running = rq->nr_running;
2426 }
2411 if (min_cpu_load > load) 2427 if (min_cpu_load > load)
2412 min_cpu_load = load; 2428 min_cpu_load = load;
2413 } 2429 }
@@ -2447,13 +2463,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2447 if (sgs->sum_nr_running) 2463 if (sgs->sum_nr_running)
2448 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 2464 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2449 2465
2450 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 2466 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
2451 sgs->group_imb = 1; 2467 sgs->group_imb = 1;
2452 2468
2453 sgs->group_capacity = 2469 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2454 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2455 if (!sgs->group_capacity) 2470 if (!sgs->group_capacity)
2456 sgs->group_capacity = fix_small_capacity(sd, group); 2471 sgs->group_capacity = fix_small_capacity(sd, group);
2472
2473 if (sgs->group_capacity > sgs->sum_nr_running)
2474 sgs->group_has_capacity = 1;
2457} 2475}
2458 2476
2459/** 2477/**
@@ -2542,9 +2560,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2542 /* 2560 /*
2543 * In case the child domain prefers tasks go to siblings 2561 * In case the child domain prefers tasks go to siblings
2544 * first, lower the sg capacity to one so that we'll try 2562 * first, lower the sg capacity to one so that we'll try
2545 * and move all the excess tasks away. 2563 * and move all the excess tasks away. We lower the capacity
2564 * of a group only if the local group has the capacity to fit
2565 * these excess tasks, i.e. nr_running < group_capacity. The
2566 * extra check prevents the case where you always pull from the
2567 * heaviest group when it is already under-utilized (possible
2568 * when a large weight task outweighs the tasks on the system).
2546 */ 2569 */
2547 if (prefer_sibling) 2570 if (prefer_sibling && !local_group && sds->this_has_capacity)
2548 sgs.group_capacity = min(sgs.group_capacity, 1UL); 2571 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2549 2572
2550 if (local_group) { 2573 if (local_group) {
@@ -2552,12 +2575,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2552 sds->this = sg; 2575 sds->this = sg;
2553 sds->this_nr_running = sgs.sum_nr_running; 2576 sds->this_nr_running = sgs.sum_nr_running;
2554 sds->this_load_per_task = sgs.sum_weighted_load; 2577 sds->this_load_per_task = sgs.sum_weighted_load;
2578 sds->this_has_capacity = sgs.group_has_capacity;
2555 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 2579 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2556 sds->max_load = sgs.avg_load; 2580 sds->max_load = sgs.avg_load;
2557 sds->busiest = sg; 2581 sds->busiest = sg;
2558 sds->busiest_nr_running = sgs.sum_nr_running; 2582 sds->busiest_nr_running = sgs.sum_nr_running;
2559 sds->busiest_group_capacity = sgs.group_capacity; 2583 sds->busiest_group_capacity = sgs.group_capacity;
2560 sds->busiest_load_per_task = sgs.sum_weighted_load; 2584 sds->busiest_load_per_task = sgs.sum_weighted_load;
2585 sds->busiest_has_capacity = sgs.group_has_capacity;
2561 sds->group_imb = sgs.group_imb; 2586 sds->group_imb = sgs.group_imb;
2562 } 2587 }
2563 2588
@@ -2754,6 +2779,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2754 return fix_small_imbalance(sds, this_cpu, imbalance); 2779 return fix_small_imbalance(sds, this_cpu, imbalance);
2755 2780
2756} 2781}
2782
2757/******* find_busiest_group() helpers end here *********************/ 2783/******* find_busiest_group() helpers end here *********************/
2758 2784
2759/** 2785/**
@@ -2805,6 +2831,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2805 * 4) This group is more busy than the avg busyness at this 2831 * 4) This group is more busy than the avg busyness at this
2806 * sched_domain. 2832 * sched_domain.
2807 * 5) The imbalance is within the specified limit. 2833 * 5) The imbalance is within the specified limit.
2834 *
2835 * Note: when doing newidle balance, if the local group has excess
2836 * capacity (i.e. nr_running < group_capacity) and the busiest group
2837 * does not have any capacity, we force a load balance to pull tasks
2838 * to the local group. In this case, we skip past checks 3, 4 and 5.
2808 */ 2839 */
2809 if (!(*balance)) 2840 if (!(*balance))
2810 goto ret; 2841 goto ret;
@@ -2816,6 +2847,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2816 if (!sds.busiest || sds.busiest_nr_running == 0) 2847 if (!sds.busiest || sds.busiest_nr_running == 0)
2817 goto out_balanced; 2848 goto out_balanced;
2818 2849
2850 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
2851 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
2852 !sds.busiest_has_capacity)
2853 goto force_balance;
2854
2819 if (sds.this_load >= sds.max_load) 2855 if (sds.this_load >= sds.max_load)
2820 goto out_balanced; 2856 goto out_balanced;
2821 2857
@@ -2827,6 +2863,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2827 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 2863 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2828 goto out_balanced; 2864 goto out_balanced;
2829 2865
2866force_balance:
2830 /* Looks like there is an imbalance. Compute it */ 2867 /* Looks like there is an imbalance. Compute it */
2831 calculate_imbalance(&sds, this_cpu, imbalance); 2868 calculate_imbalance(&sds, this_cpu, imbalance);
2832 return sds.busiest; 2869 return sds.busiest;
@@ -3031,7 +3068,14 @@ redo:
3031 3068
3032 if (!ld_moved) { 3069 if (!ld_moved) {
3033 schedstat_inc(sd, lb_failed[idle]); 3070 schedstat_inc(sd, lb_failed[idle]);
3034 sd->nr_balance_failed++; 3071 /*
3072 * Increment the failure counter only on periodic balance.
3073 * We do not want newidle balance, which can be very
3074 * frequent, to pollute the failure counter, causing
3075 * excessive cache_hot migrations and active balances.
3076 */
3077 if (idle != CPU_NEWLY_IDLE)
3078 sd->nr_balance_failed++;
3035 3079
3036 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), 3080 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
3037 this_cpu)) { 3081 this_cpu)) {
@@ -3153,10 +3197,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3153 interval = msecs_to_jiffies(sd->balance_interval); 3197 interval = msecs_to_jiffies(sd->balance_interval);
3154 if (time_after(next_balance, sd->last_balance + interval)) 3198 if (time_after(next_balance, sd->last_balance + interval))
3155 next_balance = sd->last_balance + interval; 3199 next_balance = sd->last_balance + interval;
3156 if (pulled_task) { 3200 if (pulled_task)
3157 this_rq->idle_stamp = 0;
3158 break; 3201 break;
3159 }
3160 } 3202 }
3161 3203
3162 raw_spin_lock(&this_rq->lock); 3204 raw_spin_lock(&this_rq->lock);
@@ -3751,8 +3793,11 @@ static void task_fork_fair(struct task_struct *p)
3751 3793
3752 update_rq_clock(rq); 3794 update_rq_clock(rq);
3753 3795
3754 if (unlikely(task_cpu(p) != this_cpu)) 3796 if (unlikely(task_cpu(p) != this_cpu)) {
3797 rcu_read_lock();
3755 __set_task_cpu(p, this_cpu); 3798 __set_task_cpu(p, this_cpu);
3799 rcu_read_unlock();
3800 }
3756 3801
3757 update_curr(cfs_rq); 3802 update_curr(cfs_rq);
3758 3803
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 83c66e8ad3ee..185f920ec1a2 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,3 +61,8 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1)
61 * release the lock. Decreases scheduling overhead. 61 * release the lock. Decreases scheduling overhead.
62 */ 62 */
63SCHED_FEAT(OWNER_SPIN, 1) 63SCHED_FEAT(OWNER_SPIN, 1)
64
65/*
66 * Decrement CPU power based on irq activity
67 */
68SCHED_FEAT(NONIRQ_POWER, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d10c80ebb67a..bea7d79f7e9c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -609,7 +609,7 @@ static void update_curr_rt(struct rq *rq)
609 if (!task_has_rt_policy(curr)) 609 if (!task_has_rt_policy(curr))
610 return; 610 return;
611 611
612 delta_exec = rq->clock - curr->se.exec_start; 612 delta_exec = rq->clock_task - curr->se.exec_start;
613 if (unlikely((s64)delta_exec < 0)) 613 if (unlikely((s64)delta_exec < 0))
614 delta_exec = 0; 614 delta_exec = 0;
615 615
@@ -618,7 +618,7 @@ static void update_curr_rt(struct rq *rq)
618 curr->se.sum_exec_runtime += delta_exec; 618 curr->se.sum_exec_runtime += delta_exec;
619 account_group_exec_runtime(curr, delta_exec); 619 account_group_exec_runtime(curr, delta_exec);
620 620
621 curr->se.exec_start = rq->clock; 621 curr->se.exec_start = rq->clock_task;
622 cpuacct_charge(curr, delta_exec); 622 cpuacct_charge(curr, delta_exec);
623 623
624 sched_rt_avg_update(rq, delta_exec); 624 sched_rt_avg_update(rq, delta_exec);
@@ -960,18 +960,19 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
960 * runqueue. Otherwise simply start this RT task 960 * runqueue. Otherwise simply start this RT task
961 * on its current runqueue. 961 * on its current runqueue.
962 * 962 *
963 * We want to avoid overloading runqueues. Even if 963 * We want to avoid overloading runqueues. If the woken
964 * the RT task is of higher priority than the current RT task. 964 * task is a higher priority, then it will stay on this CPU
965 * RT tasks behave differently than other tasks. If 965 * and the lower prio task should be moved to another CPU.
966 * one gets preempted, we try to push it off to another queue. 966 * Even though this will probably make the lower prio task
967 * So trying to keep a preempting RT task on the same 967 * lose its cache, we do not want to bounce a higher task
968 * cache hot CPU will force the running RT task to 968 * around just because it gave up its CPU, perhaps for a
969 * a cold CPU. So we waste all the cache for the lower 969 * lock?
970 * RT task in hopes of saving some of a RT task 970 *
971 * that is just being woken and probably will have 971 * For equal prio tasks, we just let the scheduler sort it out.
972 * cold cache anyway.
973 */ 972 */
974 if (unlikely(rt_task(rq->curr)) && 973 if (unlikely(rt_task(rq->curr)) &&
974 (rq->curr->rt.nr_cpus_allowed < 2 ||
975 rq->curr->prio < p->prio) &&
975 (p->rt.nr_cpus_allowed > 1)) { 976 (p->rt.nr_cpus_allowed > 1)) {
976 int cpu = find_lowest_rq(p); 977 int cpu = find_lowest_rq(p);
977 978
@@ -1074,7 +1075,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1074 } while (rt_rq); 1075 } while (rt_rq);
1075 1076
1076 p = rt_task_of(rt_se); 1077 p = rt_task_of(rt_se);
1077 p->se.exec_start = rq->clock; 1078 p->se.exec_start = rq->clock_task;
1078 1079
1079 return p; 1080 return p;
1080} 1081}
@@ -1139,7 +1140,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1139 for_each_leaf_rt_rq(rt_rq, rq) { 1140 for_each_leaf_rt_rq(rt_rq, rq) {
1140 array = &rt_rq->active; 1141 array = &rt_rq->active;
1141 idx = sched_find_first_bit(array->bitmap); 1142 idx = sched_find_first_bit(array->bitmap);
1142 next_idx: 1143next_idx:
1143 if (idx >= MAX_RT_PRIO) 1144 if (idx >= MAX_RT_PRIO)
1144 continue; 1145 continue;
1145 if (next && next->prio < idx) 1146 if (next && next->prio < idx)
@@ -1315,7 +1316,7 @@ static int push_rt_task(struct rq *rq)
1315 if (!next_task) 1316 if (!next_task)
1316 return 0; 1317 return 0;
1317 1318
1318 retry: 1319retry:
1319 if (unlikely(next_task == rq->curr)) { 1320 if (unlikely(next_task == rq->curr)) {
1320 WARN_ON(1); 1321 WARN_ON(1);
1321 return 0; 1322 return 0;
@@ -1463,7 +1464,7 @@ static int pull_rt_task(struct rq *this_rq)
1463 * but possible) 1464 * but possible)
1464 */ 1465 */
1465 } 1466 }
1466 skip: 1467skip:
1467 double_unlock_balance(this_rq, src_rq); 1468 double_unlock_balance(this_rq, src_rq);
1468 } 1469 }
1469 1470
@@ -1491,7 +1492,10 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1491 if (!task_running(rq, p) && 1492 if (!task_running(rq, p) &&
1492 !test_tsk_need_resched(rq->curr) && 1493 !test_tsk_need_resched(rq->curr) &&
1493 has_pushable_tasks(rq) && 1494 has_pushable_tasks(rq) &&
1494 p->rt.nr_cpus_allowed > 1) 1495 p->rt.nr_cpus_allowed > 1 &&
1496 rt_task(rq->curr) &&
1497 (rq->curr->rt.nr_cpus_allowed < 2 ||
1498 rq->curr->prio < p->prio))
1495 push_rt_tasks(rq); 1499 push_rt_tasks(rq);
1496} 1500}
1497 1501
@@ -1709,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq)
1709{ 1713{
1710 struct task_struct *p = rq->curr; 1714 struct task_struct *p = rq->curr;
1711 1715
1712 p->se.exec_start = rq->clock; 1716 p->se.exec_start = rq->clock_task;
1713 1717
1714 /* The running task is never eligible for pushing */ 1718 /* The running task is never eligible for pushing */
1715 dequeue_pushable_task(rq, p); 1719 dequeue_pushable_task(rq, p);
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
new file mode 100644
index 000000000000..45bddc0c1048
--- /dev/null
+++ b/kernel/sched_stoptask.c
@@ -0,0 +1,108 @@
1/*
2 * stop-task scheduling class.
3 *
4 * The stop task is the highest priority task in the system, it preempts
5 * everything and will be preempted by nothing.
6 *
7 * See kernel/stop_machine.c
8 */
9
10#ifdef CONFIG_SMP
11static int
12select_task_rq_stop(struct rq *rq, struct task_struct *p,
13 int sd_flag, int flags)
14{
15 return task_cpu(p); /* stop tasks never migrate */
16}
17#endif /* CONFIG_SMP */
18
19static void
20check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
21{
22 resched_task(rq->curr); /* we preempt everything */
23}
24
25static struct task_struct *pick_next_task_stop(struct rq *rq)
26{
27 struct task_struct *stop = rq->stop;
28
29 if (stop && stop->state == TASK_RUNNING)
30 return stop;
31
32 return NULL;
33}
34
35static void
36enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
37{
38}
39
40static void
41dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
42{
43}
44
45static void yield_task_stop(struct rq *rq)
46{
47 BUG(); /* the stop task should never yield, it's pointless. */
48}
49
50static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
51{
52}
53
54static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
55{
56}
57
58static void set_curr_task_stop(struct rq *rq)
59{
60}
61
62static void switched_to_stop(struct rq *rq, struct task_struct *p,
63 int running)
64{
65 BUG(); /* it's impossible to change to this class */
66}
67
68static void prio_changed_stop(struct rq *rq, struct task_struct *p,
69 int oldprio, int running)
70{
71 BUG(); /* how!?, what priority? */
72}
73
74static unsigned int
75get_rr_interval_stop(struct rq *rq, struct task_struct *task)
76{
77 return 0;
78}
79
80/*
81 * Simple, special scheduling class for the per-CPU stop tasks:
82 */
83static const struct sched_class stop_sched_class = {
84 .next = &rt_sched_class,
85
86 .enqueue_task = enqueue_task_stop,
87 .dequeue_task = dequeue_task_stop,
88 .yield_task = yield_task_stop,
89
90 .check_preempt_curr = check_preempt_curr_stop,
91
92 .pick_next_task = pick_next_task_stop,
93 .put_prev_task = put_prev_task_stop,
94
95#ifdef CONFIG_SMP
96 .select_task_rq = select_task_rq_stop,
97#endif
98
99 .set_curr_task = set_curr_task_stop,
100 .task_tick = task_tick_stop,
101
102 .get_rr_interval = get_rr_interval_stop,
103
104 .prio_changed = prio_changed_stop,
105 .switched_to = switched_to_stop,
106
107 /* no .task_new for stop tasks */
108};
diff --git a/kernel/signal.c b/kernel/signal.c
index bded65187780..919562c3d6b7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2215,6 +2215,14 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2215#ifdef __ARCH_SI_TRAPNO 2215#ifdef __ARCH_SI_TRAPNO
2216 err |= __put_user(from->si_trapno, &to->si_trapno); 2216 err |= __put_user(from->si_trapno, &to->si_trapno);
2217#endif 2217#endif
2218#ifdef BUS_MCEERR_AO
2219 /*
2220 * Other callers might not initialize the si_lsb field,
2221 * so check explicitly for the right codes here.
2222 */
2223 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
2224 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
2225#endif
2218 break; 2226 break;
2219 case __SI_CHLD: 2227 case __SI_CHLD:
2220 err |= __put_user(from->si_pid, &to->si_pid); 2228 err |= __put_user(from->si_pid, &to->si_pid);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 07b4f1b1a73a..fc978889b194 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -77,11 +77,21 @@ void wakeup_softirqd(void)
77} 77}
78 78
79/* 79/*
80 * preempt_count and SOFTIRQ_OFFSET usage:
81 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
82 * softirq processing.
83 * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
84 * on local_bh_disable or local_bh_enable.
85 * This lets us distinguish between whether we are currently processing
86 * softirq and whether we just have bh disabled.
87 */
88
89/*
80 * This one is for softirq.c-internal use, 90 * This one is for softirq.c-internal use,
81 * where hardirqs are disabled legitimately: 91 * where hardirqs are disabled legitimately:
82 */ 92 */
83#ifdef CONFIG_TRACE_IRQFLAGS 93#ifdef CONFIG_TRACE_IRQFLAGS
84static void __local_bh_disable(unsigned long ip) 94static void __local_bh_disable(unsigned long ip, unsigned int cnt)
85{ 95{
86 unsigned long flags; 96 unsigned long flags;
87 97
@@ -95,32 +105,43 @@ static void __local_bh_disable(unsigned long ip)
95 * We must manually increment preempt_count here and manually 105 * We must manually increment preempt_count here and manually
96 * call the trace_preempt_off later. 106 * call the trace_preempt_off later.
97 */ 107 */
98 preempt_count() += SOFTIRQ_OFFSET; 108 preempt_count() += cnt;
99 /* 109 /*
100 * Were softirqs turned off above: 110 * Were softirqs turned off above:
101 */ 111 */
102 if (softirq_count() == SOFTIRQ_OFFSET) 112 if (softirq_count() == cnt)
103 trace_softirqs_off(ip); 113 trace_softirqs_off(ip);
104 raw_local_irq_restore(flags); 114 raw_local_irq_restore(flags);
105 115
106 if (preempt_count() == SOFTIRQ_OFFSET) 116 if (preempt_count() == cnt)
107 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 117 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
108} 118}
109#else /* !CONFIG_TRACE_IRQFLAGS */ 119#else /* !CONFIG_TRACE_IRQFLAGS */
110static inline void __local_bh_disable(unsigned long ip) 120static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
111{ 121{
112 add_preempt_count(SOFTIRQ_OFFSET); 122 add_preempt_count(cnt);
113 barrier(); 123 barrier();
114} 124}
115#endif /* CONFIG_TRACE_IRQFLAGS */ 125#endif /* CONFIG_TRACE_IRQFLAGS */
116 126
117void local_bh_disable(void) 127void local_bh_disable(void)
118{ 128{
119 __local_bh_disable((unsigned long)__builtin_return_address(0)); 129 __local_bh_disable((unsigned long)__builtin_return_address(0),
130 SOFTIRQ_DISABLE_OFFSET);
120} 131}
121 132
122EXPORT_SYMBOL(local_bh_disable); 133EXPORT_SYMBOL(local_bh_disable);
123 134
135static void __local_bh_enable(unsigned int cnt)
136{
137 WARN_ON_ONCE(in_irq());
138 WARN_ON_ONCE(!irqs_disabled());
139
140 if (softirq_count() == cnt)
141 trace_softirqs_on((unsigned long)__builtin_return_address(0));
142 sub_preempt_count(cnt);
143}
144
124/* 145/*
125 * Special-case - softirqs can safely be enabled in 146 * Special-case - softirqs can safely be enabled in
126 * cond_resched_softirq(), or by __do_softirq(), 147 * cond_resched_softirq(), or by __do_softirq(),
@@ -128,12 +149,7 @@ EXPORT_SYMBOL(local_bh_disable);
128 */ 149 */
129void _local_bh_enable(void) 150void _local_bh_enable(void)
130{ 151{
131 WARN_ON_ONCE(in_irq()); 152 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
132 WARN_ON_ONCE(!irqs_disabled());
133
134 if (softirq_count() == SOFTIRQ_OFFSET)
135 trace_softirqs_on((unsigned long)__builtin_return_address(0));
136 sub_preempt_count(SOFTIRQ_OFFSET);
137} 153}
138 154
139EXPORT_SYMBOL(_local_bh_enable); 155EXPORT_SYMBOL(_local_bh_enable);
@@ -147,13 +163,13 @@ static inline void _local_bh_enable_ip(unsigned long ip)
147 /* 163 /*
148 * Are softirqs going to be turned on now: 164 * Are softirqs going to be turned on now:
149 */ 165 */
150 if (softirq_count() == SOFTIRQ_OFFSET) 166 if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
151 trace_softirqs_on(ip); 167 trace_softirqs_on(ip);
152 /* 168 /*
153 * Keep preemption disabled until we are done with 169 * Keep preemption disabled until we are done with
154 * softirq processing: 170 * softirq processing:
155 */ 171 */
156 sub_preempt_count(SOFTIRQ_OFFSET - 1); 172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
157 173
158 if (unlikely(!in_interrupt() && local_softirq_pending())) 174 if (unlikely(!in_interrupt() && local_softirq_pending()))
159 do_softirq(); 175 do_softirq();
@@ -198,7 +214,8 @@ asmlinkage void __do_softirq(void)
198 pending = local_softirq_pending(); 214 pending = local_softirq_pending();
199 account_system_vtime(current); 215 account_system_vtime(current);
200 216
201 __local_bh_disable((unsigned long)__builtin_return_address(0)); 217 __local_bh_disable((unsigned long)__builtin_return_address(0),
218 SOFTIRQ_OFFSET);
202 lockdep_softirq_enter(); 219 lockdep_softirq_enter();
203 220
204 cpu = smp_processor_id(); 221 cpu = smp_processor_id();
@@ -245,7 +262,7 @@ restart:
245 lockdep_softirq_exit(); 262 lockdep_softirq_exit();
246 263
247 account_system_vtime(current); 264 account_system_vtime(current);
248 _local_bh_enable(); 265 __local_bh_enable(SOFTIRQ_OFFSET);
249} 266}
250 267
251#ifndef __ARCH_HAS_DO_SOFTIRQ 268#ifndef __ARCH_HAS_DO_SOFTIRQ
@@ -279,10 +296,16 @@ void irq_enter(void)
279 296
280 rcu_irq_enter(); 297 rcu_irq_enter();
281 if (idle_cpu(cpu) && !in_interrupt()) { 298 if (idle_cpu(cpu) && !in_interrupt()) {
282 __irq_enter(); 299 /*
300 * Prevent raise_softirq from needlessly waking up ksoftirqd
301 * here, as softirq will be serviced on return from interrupt.
302 */
303 local_bh_disable();
283 tick_check_idle(cpu); 304 tick_check_idle(cpu);
284 } else 305 _local_bh_enable();
285 __irq_enter(); 306 }
307
308 __irq_enter();
286} 309}
287 310
288#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 311#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
@@ -696,6 +719,7 @@ static int run_ksoftirqd(void * __bind_cpu)
696{ 719{
697 set_current_state(TASK_INTERRUPTIBLE); 720 set_current_state(TASK_INTERRUPTIBLE);
698 721
722 current->flags |= PF_KSOFTIRQD;
699 while (!kthread_should_stop()) { 723 while (!kthread_should_stop()) {
700 preempt_disable(); 724 preempt_disable();
701 if (!local_softirq_pending()) { 725 if (!local_softirq_pending()) {
@@ -886,17 +910,14 @@ int __init __weak early_irq_init(void)
886 return 0; 910 return 0;
887} 911}
888 912
913#ifdef CONFIG_GENERIC_HARDIRQS
889int __init __weak arch_probe_nr_irqs(void) 914int __init __weak arch_probe_nr_irqs(void)
890{ 915{
891 return 0; 916 return NR_IRQS_LEGACY;
892} 917}
893 918
894int __init __weak arch_early_irq_init(void) 919int __init __weak arch_early_irq_init(void)
895{ 920{
896 return 0; 921 return 0;
897} 922}
898 923#endif
899int __weak arch_init_chip_data(struct irq_desc *desc, int node)
900{
901 return 0;
902}
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2980da3fd509..c71e07500536 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -46,11 +46,9 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
46int __init_srcu_struct(struct srcu_struct *sp, const char *name, 46int __init_srcu_struct(struct srcu_struct *sp, const char *name,
47 struct lock_class_key *key) 47 struct lock_class_key *key)
48{ 48{
49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50 /* Don't re-initialize a lock while it is held. */ 49 /* Don't re-initialize a lock while it is held. */
51 debug_check_no_locks_freed((void *)sp, sizeof(*sp)); 50 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
52 lockdep_init_map(&sp->dep_map, name, key, 0); 51 lockdep_init_map(&sp->dep_map, name, key, 0);
53#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
54 return init_srcu_struct_fields(sp); 52 return init_srcu_struct_fields(sp);
55} 53}
56EXPORT_SYMBOL_GPL(__init_srcu_struct); 54EXPORT_SYMBOL_GPL(__init_srcu_struct);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 4372ccb25127..090c28812ce1 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -287,11 +287,12 @@ repeat:
287 goto repeat; 287 goto repeat;
288} 288}
289 289
290extern void sched_set_stop_task(int cpu, struct task_struct *stop);
291
290/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ 292/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
291static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, 293static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
292 unsigned long action, void *hcpu) 294 unsigned long action, void *hcpu)
293{ 295{
294 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
295 unsigned int cpu = (unsigned long)hcpu; 296 unsigned int cpu = (unsigned long)hcpu;
296 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 297 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
297 struct task_struct *p; 298 struct task_struct *p;
@@ -304,13 +305,13 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
304 cpu); 305 cpu);
305 if (IS_ERR(p)) 306 if (IS_ERR(p))
306 return NOTIFY_BAD; 307 return NOTIFY_BAD;
307 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
308 get_task_struct(p); 308 get_task_struct(p);
309 kthread_bind(p, cpu);
310 sched_set_stop_task(cpu, p);
309 stopper->thread = p; 311 stopper->thread = p;
310 break; 312 break;
311 313
312 case CPU_ONLINE: 314 case CPU_ONLINE:
313 kthread_bind(stopper->thread, cpu);
314 /* strictly unnecessary, as first user will wake it */ 315 /* strictly unnecessary, as first user will wake it */
315 wake_up_process(stopper->thread); 316 wake_up_process(stopper->thread);
316 /* mark enabled */ 317 /* mark enabled */
@@ -325,6 +326,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
325 { 326 {
326 struct cpu_stop_work *work; 327 struct cpu_stop_work *work;
327 328
329 sched_set_stop_task(cpu, NULL);
328 /* kill the stopper */ 330 /* kill the stopper */
329 kthread_stop(stopper->thread); 331 kthread_stop(stopper->thread);
330 /* drain remaining works */ 332 /* drain remaining works */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index bad369ec5403..c782fe9924c7 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -50,6 +50,7 @@ cond_syscall(compat_sys_sendmsg);
50cond_syscall(sys_recvmsg); 50cond_syscall(sys_recvmsg);
51cond_syscall(sys_recvmmsg); 51cond_syscall(sys_recvmmsg);
52cond_syscall(compat_sys_recvmsg); 52cond_syscall(compat_sys_recvmsg);
53cond_syscall(compat_sys_recv);
53cond_syscall(compat_sys_recvfrom); 54cond_syscall(compat_sys_recvfrom);
54cond_syscall(compat_sys_recvmmsg); 55cond_syscall(compat_sys_recvmmsg);
55cond_syscall(sys_socketcall); 56cond_syscall(sys_socketcall);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f88552c6d227..3a45c224770f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2485,7 +2485,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2485 kbuf[left] = 0; 2485 kbuf[left] = 0;
2486 } 2486 }
2487 2487
2488 for (; left && vleft--; i++, min++, max++, first=0) { 2488 for (; left && vleft--; i++, first = 0) {
2489 unsigned long val; 2489 unsigned long val;
2490 2490
2491 if (write) { 2491 if (write) {
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 04cdcf72c827..10b90d8a03c4 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -143,15 +143,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
143 if (!table->maxlen) 143 if (!table->maxlen)
144 set_fail(&fail, table, "No maxlen"); 144 set_fail(&fail, table, "No maxlen");
145 } 145 }
146 if ((table->proc_handler == proc_doulongvec_minmax) ||
147 (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
148 if (table->maxlen > sizeof (unsigned long)) {
149 if (!table->extra1)
150 set_fail(&fail, table, "No min");
151 if (!table->extra2)
152 set_fail(&fail, table, "No max");
153 }
154 }
155#ifdef CONFIG_PROC_SYSCTL 146#ifdef CONFIG_PROC_SYSCTL
156 if (table->procname && !table->proc_handler) 147 if (table->procname && !table->proc_handler)
157 set_fail(&fail, table, "No proc_handler"); 148 set_fail(&fail, table, "No proc_handler");
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c63116863a80..d2321891538f 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -149,10 +149,18 @@ static void ntp_update_offset(long offset)
149 time_reftime = get_seconds(); 149 time_reftime = get_seconds();
150 150
151 offset64 = offset; 151 offset64 = offset;
152 freq_adj = (offset64 * secs) << 152 freq_adj = ntp_update_offset_fll(offset64, secs);
153 (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
154 153
155 freq_adj += ntp_update_offset_fll(offset64, secs); 154 /*
155 * Clamp update interval to reduce PLL gain with low
156 * sampling rate (e.g. intermittent network connection)
157 * to avoid instability.
158 */
159 if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant)))
160 secs = 1 << (SHIFT_PLL + 1 + time_constant);
161
162 freq_adj += (offset64 * secs) <<
163 (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
156 164
157 freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); 165 freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED);
158 166
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e550d2eda1df..e04b8bcdef88 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -126,7 +126,7 @@ if FTRACE
126config FUNCTION_TRACER 126config FUNCTION_TRACER
127 bool "Kernel Function Tracer" 127 bool "Kernel Function Tracer"
128 depends on HAVE_FUNCTION_TRACER 128 depends on HAVE_FUNCTION_TRACER
129 select FRAME_POINTER 129 select FRAME_POINTER if (!ARM_UNWIND)
130 select KALLSYMS 130 select KALLSYMS
131 select GENERIC_TRACER 131 select GENERIC_TRACER
132 select CONTEXT_SWITCH_TRACER 132 select CONTEXT_SWITCH_TRACER
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 4e2f03410377..c5a632a669e1 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -405,7 +405,7 @@ static inline int test_time_stamp(u64 delta)
405#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) 405#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
406 406
407/* Max number of timestamps that can fit on a page */ 407/* Max number of timestamps that can fit on a page */
408#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP) 408#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND)
409 409
410int ring_buffer_print_page_header(struct trace_seq *s) 410int ring_buffer_print_page_header(struct trace_seq *s)
411{ 411{
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index dc8e16824b51..bafba687a6d8 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -196,7 +196,7 @@ static struct perf_event_attr wd_hw_attr = {
196}; 196};
197 197
198/* Callback function for perf event subsystem */ 198/* Callback function for perf event subsystem */
199void watchdog_overflow_callback(struct perf_event *event, int nmi, 199static void watchdog_overflow_callback(struct perf_event *event, int nmi,
200 struct perf_sample_data *data, 200 struct perf_sample_data *data,
201 struct pt_regs *regs) 201 struct pt_regs *regs)
202{ 202{