author		Ingo Molnar <mingo@elte.hu>	2011-01-09 04:42:21 -0500
committer	Ingo Molnar <mingo@elte.hu>	2011-01-09 04:42:21 -0500
commit		4385428a477559b26736cc3c80d8b68f31126c71 (patch)
tree		8eb0cbc78e79c368687fa13a1e0674ae537f830f
parent		047a3772feaae8e43d81d790f3d3f80dae8ae676 (diff)
parent		2d75af2f2a7a6103a6d539a492fe81deacabde44 (diff)
Merge branch 'tip/perf/core' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-2.6-trace into perf/urgent
-rw-r--r--	Documentation/RCU/trace.txt | 144
-rw-r--r--	Documentation/dontdiff | 26
-rw-r--r--	Documentation/kernel-docs.txt | 27
-rw-r--r--	Documentation/kernel-parameters.txt | 11
-rw-r--r--	Documentation/x86/boot.txt | 1
-rw-r--r--	MAINTAINERS | 16
-rw-r--r--	arch/Kconfig | 3
-rw-r--r--	arch/s390/Kconfig | 1
-rw-r--r--	arch/s390/include/asm/mutex.h | 2
-rw-r--r--	arch/x86/Kconfig | 41
-rw-r--r--	arch/x86/Kconfig.debug | 11
-rw-r--r--	arch/x86/boot/compressed/head_64.S | 2
-rw-r--r--	arch/x86/include/asm/alternative.h | 1
-rw-r--r--	arch/x86/include/asm/amd_nb.h | 49
-rw-r--r--	arch/x86/include/asm/apic.h | 1
-rw-r--r--	arch/x86/include/asm/apicdef.h | 1
-rw-r--r--	arch/x86/include/asm/bootparam.h | 1
-rw-r--r--	arch/x86/include/asm/fixmap.h | 4
-rw-r--r--	arch/x86/include/asm/i387.h | 24
-rw-r--r--	arch/x86/include/asm/io_apic.h | 8
-rw-r--r--	arch/x86/include/asm/mce.h | 3
-rw-r--r--	arch/x86/include/asm/microcode.h | 6
-rw-r--r--	arch/x86/include/asm/mpspec.h | 31
-rw-r--r--	arch/x86/include/asm/mpspec_def.h | 7
-rw-r--r--	arch/x86/include/asm/mrst-vrtc.h | 9
-rw-r--r--	arch/x86/include/asm/mrst.h | 14
-rw-r--r--	arch/x86/include/asm/msr-index.h | 12
-rw-r--r--	arch/x86/include/asm/paravirt.h | 2
-rw-r--r--	arch/x86/include/asm/pci.h | 1
-rw-r--r--	arch/x86/include/asm/setup.h | 6
-rw-r--r--	arch/x86/include/asm/uv/uv_bau.h | 9
-rw-r--r--	arch/x86/kernel/Makefile | 1
-rw-r--r--	arch/x86/kernel/acpi/boot.c | 11
-rw-r--r--	arch/x86/kernel/alternative.c | 3
-rw-r--r--	arch/x86/kernel/amd_nb.c | 135
-rw-r--r--	arch/x86/kernel/apb_timer.c | 1
-rw-r--r--	arch/x86/kernel/aperture_64.c | 10
-rw-r--r--	arch/x86/kernel/apic/apic.c | 104
-rw-r--r--	arch/x86/kernel/apic/io_apic.c | 37
-rw-r--r--	arch/x86/kernel/apic/x2apic_uv_x.c | 61
-rw-r--r--	arch/x86/kernel/cpu/intel_cacheinfo.c | 147
-rw-r--r--	arch/x86/kernel/cpu/mcheck/mce_amd.c | 135
-rw-r--r--	arch/x86/kernel/cpu/mcheck/therm_throt.c | 40
-rw-r--r--	arch/x86/kernel/early_printk.c | 3
-rw-r--r--	arch/x86/kernel/ftrace.c | 3
-rw-r--r--	arch/x86/kernel/head32.c | 3
-rw-r--r--	arch/x86/kernel/head_32.S | 83
-rw-r--r--	arch/x86/kernel/microcode_amd.c | 34
-rw-r--r--	arch/x86/kernel/pci-gart_64.c | 34
-rw-r--r--	arch/x86/kernel/reboot_fixups_32.c | 16
-rw-r--r--	arch/x86/kernel/setup.c | 13
-rw-r--r--	arch/x86/kernel/smpboot.c | 14
-rw-r--r--	arch/x86/kernel/trampoline_64.S | 2
-rw-r--r--	arch/x86/kernel/tsc.c | 96
-rw-r--r--	arch/x86/kernel/verify_cpu.S (renamed from arch/x86/kernel/verify_cpu_64.S) | 49
-rw-r--r--	arch/x86/kernel/vmlinux.lds.S | 8
-rw-r--r--	arch/x86/lguest/i386_head.S | 105
-rw-r--r--	arch/x86/mm/Makefile | 2
-rw-r--r--	arch/x86/mm/amdtopology_64.c (renamed from arch/x86/mm/k8topology_64.c) | 12
-rw-r--r--	arch/x86/mm/init.c | 3
-rw-r--r--	arch/x86/mm/init_32.c | 20
-rw-r--r--	arch/x86/mm/numa_64.c | 22
-rw-r--r--	arch/x86/mm/pageattr.c | 33
-rw-r--r--	arch/x86/mm/setup_nx.c | 2
-rw-r--r--	arch/x86/mm/srat_32.c | 1
-rw-r--r--	arch/x86/mm/srat_64.c | 10
-rw-r--r--	arch/x86/oprofile/op_model_amd.c | 1
-rw-r--r--	arch/x86/pci/Makefile | 1
-rw-r--r--	arch/x86/pci/ce4100.c | 315
-rw-r--r--	arch/x86/pci/pcbios.c | 23
-rw-r--r--	arch/x86/platform/Makefile | 2
-rw-r--r--	arch/x86/platform/ce4100/Makefile | 1
-rw-r--r--	arch/x86/platform/ce4100/ce4100.c | 132
-rw-r--r--	arch/x86/platform/iris/Makefile | 1
-rw-r--r--	arch/x86/platform/iris/iris.c | 91
-rw-r--r--	arch/x86/platform/mrst/Makefile | 2
-rw-r--r--	arch/x86/platform/mrst/early_printk_mrst.c (renamed from arch/x86/kernel/early_printk_mrst.c) | 0
-rw-r--r--	arch/x86/platform/mrst/mrst.c | 546
-rw-r--r--	arch/x86/platform/mrst/vrtc.c | 165
-rw-r--r--	arch/x86/platform/sfi/sfi.c | 4
-rw-r--r--	arch/x86/platform/uv/tlb_uv.c | 22
-rw-r--r--	arch/x86/platform/visws/visws_quirks.c | 2
-rw-r--r--	drivers/acpi/numa.c | 14
-rw-r--r--	drivers/char/agp/amd64-agp.c | 33
-rw-r--r--	drivers/edac/amd64_edac.c | 4
-rw-r--r--	drivers/platform/x86/intel_scu_ipc.c | 5
-rw-r--r--	drivers/rtc/Kconfig | 12
-rw-r--r--	drivers/rtc/Makefile | 1
-rw-r--r--	drivers/rtc/rtc-mrst.c | 582
-rw-r--r--	fs/gfs2/bmap.c | 11
-rw-r--r--	fs/gfs2/glock.c | 71
-rw-r--r--	fs/gfs2/glock.h | 28
-rw-r--r--	fs/gfs2/glops.c | 1
-rw-r--r--	fs/gfs2/incore.h | 12
-rw-r--r--	fs/gfs2/inode.c | 9
-rw-r--r--	fs/gfs2/lock_dlm.c | 15
-rw-r--r--	fs/gfs2/ops_inode.c | 18
-rw-r--r--	fs/gfs2/quota.c | 13
-rw-r--r--	fs/gfs2/rgrp.c | 57
-rw-r--r--	fs/gfs2/rgrp.h | 1
-rw-r--r--	fs/gfs2/xattr.c | 23
-rw-r--r--	fs/proc/base.c | 79
-rw-r--r--	include/linux/completion.h | 8
-rw-r--r--	include/linux/dynamic_debug.h | 18
-rw-r--r--	include/linux/hrtimer.h | 33
-rw-r--r--	include/linux/init_task.h | 18
-rw-r--r--	include/linux/interrupt.h | 6
-rw-r--r--	include/linux/module.h | 11
-rw-r--r--	include/linux/mutex.h | 4
-rw-r--r--	include/linux/rculist.h | 5
-rw-r--r--	include/linux/rcupdate.h | 4
-rw-r--r--	include/linux/rcutiny.h | 13
-rw-r--r--	include/linux/rcutree.h | 2
-rw-r--r--	include/linux/sched.h | 47
-rw-r--r--	include/linux/sfi.h | 8
-rw-r--r--	include/linux/timer.h | 32
-rw-r--r--	include/linux/timerqueue.h | 50
-rw-r--r--	include/linux/tracepoint.h | 4
-rw-r--r--	include/linux/workqueue.h | 8
-rw-r--r--	include/trace/define_trace.h | 10
-rw-r--r--	include/trace/events/skb.h | 4
-rw-r--r--	init/Kconfig | 68
-rw-r--r--	kernel/Makefile | 1
-rw-r--r--	kernel/cpu.c | 29
-rw-r--r--	kernel/fork.c | 7
-rw-r--r--	kernel/futex.c | 235
-rw-r--r--	kernel/hrtimer.c | 83
-rw-r--r--	kernel/irq/manage.c | 4
-rw-r--r--	kernel/kthread.c | 2
-rw-r--r--	kernel/lockdep_proc.c | 16
-rw-r--r--	kernel/module.c | 171
-rw-r--r--	kernel/mutex.c | 2
-rw-r--r--	kernel/posix-timers.c | 10
-rw-r--r--	kernel/printk.c | 8
-rw-r--r--	kernel/rcutiny.c | 105
-rw-r--r--	kernel/rcutiny_plugin.h | 433
-rw-r--r--	kernel/rcutorture.c | 270
-rw-r--r--	kernel/rcutree.c | 156
-rw-r--r--	kernel/rcutree.h | 61
-rw-r--r--	kernel/rcutree_plugin.h | 135
-rw-r--r--	kernel/rcutree_trace.c | 12
-rw-r--r--	kernel/sched.c | 638
-rw-r--r--	kernel/sched_autogroup.c | 238
-rw-r--r--	kernel/sched_autogroup.h | 32
-rw-r--r--	kernel/sched_clock.c | 2
-rw-r--r--	kernel/sched_debug.c | 91
-rw-r--r--	kernel/sched_fair.c | 322
-rw-r--r--	kernel/sched_features.h | 2
-rw-r--r--	kernel/sched_rt.c | 24
-rw-r--r--	kernel/softirq.c | 4
-rw-r--r--	kernel/srcu.c | 8
-rw-r--r--	kernel/sys.c | 4
-rw-r--r--	kernel/sysctl.c | 37
-rw-r--r--	kernel/time/timecompare.c | 5
-rw-r--r--	kernel/time/timekeeping.c | 9
-rw-r--r--	kernel/time/timer_list.c | 8
-rw-r--r--	kernel/timer.c | 50
-rw-r--r--	kernel/trace/Makefile | 2
-rw-r--r--	kernel/trace/trace.c | 6
-rw-r--r--	kernel/trace/trace_selftest.c | 2
-rw-r--r--	kernel/watchdog.c | 2
-rw-r--r--	lib/Makefile | 2
-rw-r--r--	lib/dynamic_debug.c | 9
-rw-r--r--	lib/timerqueue.c | 107
-rwxr-xr-x	scripts/kernel-doc | 102
165 files changed, 5763 insertions(+), 2060 deletions(-)
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index a851118775d8..6a8c73f55b80 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -1,18 +1,22 @@
 CONFIG_RCU_TRACE debugfs Files and Formats


-The rcutree implementation of RCU provides debugfs trace output that
-summarizes counters and state.  This information is useful for debugging
-RCU itself, and can sometimes also help to debug abuses of RCU.
-The following sections describe the debugfs files and formats.
+The rcutree and rcutiny implementations of RCU provide debugfs trace
+output that summarizes counters and state.  This information is useful for
+debugging RCU itself, and can sometimes also help to debug abuses of RCU.
+The following sections describe the debugfs files and formats, first
+for rcutree and next for rcutiny.


-Hierarchical RCU debugfs Files and Formats
+CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats

-This implementation of RCU provides three debugfs files under the
+These implementations of RCU provide five debugfs files under the
 top-level directory RCU: rcu/rcudata (which displays fields in struct
-rcu_data), rcu/rcugp (which displays grace-period counters), and
-rcu/rcuhier (which displays the struct rcu_node hierarchy).
+rcu_data), rcu/rcudata.csv (which is a .csv spreadsheet version of
+rcu/rcudata), rcu/rcugp (which displays grace-period counters),
+rcu/rcuhier (which displays the struct rcu_node hierarchy), and
+rcu/rcu_pending (which displays counts of the reasons that the
+rcu_pending() function decided that there was core RCU work to do).

 The output of "cat rcu/rcudata" looks as follows:

@@ -130,7 +134,8 @@ o "ci" is the number of RCU callbacks that have been invoked for
	been registered in absence of CPU-hotplug activity.

 o "co" is the number of RCU callbacks that have been orphaned due to
-	this CPU going offline.
+	this CPU going offline.  These orphaned callbacks have been moved
+	to an arbitrarily chosen online CPU.

 o "ca" is the number of RCU callbacks that have been adopted due to
	other CPUs going offline.  Note that ci+co-ca+ql is the number of
@@ -168,12 +173,12 @@ o "gpnum" is the number of grace periods that have started.  It is

 The output of "cat rcu/rcuhier" looks as follows, with very long lines:

-c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 oqlen=0
+c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6
 1/1 .>. 0:127 ^0
 3/3 .>. 0:35 ^0    0/0 .>. 36:71 ^1    0/0 .>. 72:107 ^2    0/0 .>. 108:127 ^3
 3/3f .>. 0:5 ^0    2/3 .>. 6:11 ^1    0/0 .>. 12:17 ^2    0/0 .>. 18:23 ^3    0/0 .>. 24:29 ^4    0/0 .>. 30:35 ^5    0/0 .>. 36:41 ^0    0/0 .>. 42:47 ^1    0/0 .>. 48:53 ^2    0/0 .>. 54:59 ^3    0/0 .>. 60:65 ^4    0/0 .>. 66:71 ^5    0/0 .>. 72:77 ^0    0/0 .>. 78:83 ^1    0/0 .>. 84:89 ^2    0/0 .>. 90:95 ^3    0/0 .>. 96:101 ^4    0/0 .>. 102:107 ^5    0/0 .>. 108:113 ^0    0/0 .>. 114:119 ^1    0/0 .>. 120:125 ^2    0/0 .>. 126:127 ^3
 rcu_bh:
-c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 oqlen=0
+c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0
 0/1 .>. 0:127 ^0
 0/3 .>. 0:35 ^0    0/0 .>. 36:71 ^1    0/0 .>. 72:107 ^2    0/0 .>. 108:127 ^3
 0/3f .>. 0:5 ^0    0/3 .>. 6:11 ^1    0/0 .>. 12:17 ^2    0/0 .>. 18:23 ^3    0/0 .>. 24:29 ^4    0/0 .>. 30:35 ^5    0/0 .>. 36:41 ^0    0/0 .>. 42:47 ^1    0/0 .>. 48:53 ^2    0/0 .>. 54:59 ^3    0/0 .>. 60:65 ^4    0/0 .>. 66:71 ^5    0/0 .>. 72:77 ^0    0/0 .>. 78:83 ^1    0/0 .>. 84:89 ^2    0/0 .>. 90:95 ^3    0/0 .>. 96:101 ^4    0/0 .>. 102:107 ^5    0/0 .>. 108:113 ^0    0/0 .>. 114:119 ^1    0/0 .>. 120:125 ^2    0/0 .>. 126:127 ^3
@@ -212,11 +217,6 @@ o "fqlh" is the number of calls to force_quiescent_state() that
	exited immediately (without even being counted in nfqs above)
	due to contention on ->fqslock.

-o "oqlen" is the number of callbacks on the "orphan" callback
-	list.  RCU callbacks are placed on this list by CPUs going
-	offline, and are "adopted" either by the CPU helping the outgoing
-	CPU or by the next rcu_barrier*() call, whichever comes first.
-
 o Each element of the form "1/1 0:127 ^0" represents one struct
	rcu_node.  Each line represents one level of the hierarchy, from
	root to leaves.  It is best to think of the rcu_data structures
@@ -326,3 +326,115 @@ o "nn" is the number of times that this CPU needed nothing.  Alert
	readers will note that the rcu "nn" number for a given CPU very
	closely matches the rcu_bh "np" number for that same CPU.  This
	is due to short-circuit evaluation in rcu_pending().
+
+
+CONFIG_TINY_RCU and CONFIG_TINY_PREEMPT_RCU debugfs Files and Formats
+
+These implementations of RCU provide a single debugfs file under the
+top-level directory RCU, namely rcu/rcudata, which displays fields in
+rcu_bh_ctrlblk, rcu_sched_ctrlblk and, for CONFIG_TINY_PREEMPT_RCU,
+rcu_preempt_ctrlblk.
+
+The output of "cat rcu/rcudata" is as follows:
+
+rcu_preempt: qlen=24 gp=1097669 g197/p197/c197 tasks=...
+             ttb=. btg=no ntb=184 neb=0 nnb=183 j=01f7 bt=0274
+             normal balk: nt=1097669 gt=0 bt=371 b=0 ny=25073378 nos=0
+             exp balk: bt=0 nos=0
+rcu_sched: qlen: 0
+rcu_bh: qlen: 0
+
+This is split into rcu_preempt, rcu_sched, and rcu_bh sections, with the
+rcu_preempt section appearing only in CONFIG_TINY_PREEMPT_RCU builds.
+The last three lines of the rcu_preempt section appear only in
+CONFIG_RCU_BOOST kernel builds.  The fields are as follows:
+
+o	"qlen" is the number of RCU callbacks currently waiting either
+	for an RCU grace period or waiting to be invoked.  This is the
+	only field present for rcu_sched and rcu_bh, due to the
+	short-circuiting of grace period in those two cases.
+
+o	"gp" is the number of grace periods that have completed.
+
+o	"g197/p197/c197" displays the grace-period state, with the
+	"g" number being the number of grace periods that have started
+	(mod 256), the "p" number being the number of grace periods
+	that the CPU has responded to (also mod 256), and the "c"
+	number being the number of grace periods that have completed
+	(once again mod 256).
+
+	Why have both "gp" and "g"?  Because the data flowing into
+	"gp" is only present in a CONFIG_RCU_TRACE kernel.
+
+o	"tasks" is a set of bits.  The first bit is "T" if there are
+	currently tasks that have recently blocked within an RCU
+	read-side critical section, the second bit is "N" if any of the
+	aforementioned tasks are blocking the current RCU grace period,
+	and the third bit is "E" if any of the aforementioned tasks are
+	blocking the current expedited grace period.  Each bit is "."
+	if the corresponding condition does not hold.
+
+o	"ttb" is a single bit.  It is "B" if any of the blocked tasks
+	need to be priority boosted and "." otherwise.
+
+o	"btg" indicates whether boosting has been carried out during
+	the current grace period, with "exp" indicating that boosting
+	is in progress for an expedited grace period, "no" indicating
+	that boosting has not yet started for a normal grace period,
+	"begun" indicating that boosting has begun for a normal grace
+	period, and "done" indicating that boosting has completed for
+	a normal grace period.
+
+o	"ntb" is the total number of tasks subjected to RCU priority boosting
+	periods since boot.
+
+o	"neb" is the number of expedited grace periods that have had
+	to resort to RCU priority boosting since boot.
+
+o	"nnb" is the number of normal grace periods that have had
+	to resort to RCU priority boosting since boot.
+
+o	"j" is the low-order 12 bits of the jiffies counter in hexadecimal.
+
+o	"bt" is the low-order 12 bits of the value that the jiffies counter
+	will have at the next time that boosting is scheduled to begin.
+
+o	In the line beginning with "normal balk", the fields are as follows:
+
+	o	"nt" is the number of times that the system balked from
+		boosting because there were no blocked tasks to boost.
+		Note that the system will balk from boosting even if the
+		grace period is overdue when the currently running task
+		is looping within an RCU read-side critical section.
+		There is no point in boosting in this case, because
+		boosting a running task won't make it run any faster.
+
+	o	"gt" is the number of times that the system balked
+		from boosting because, although there were blocked tasks,
+		none of them were preventing the current grace period
+		from completing.
+
+	o	"bt" is the number of times that the system balked
+		from boosting because boosting was already in progress.
+
+	o	"b" is the number of times that the system balked from
+		boosting because boosting had already completed for
+		the grace period in question.
+
+	o	"ny" is the number of times that the system balked from
+		boosting because it was not yet time to start boosting
+		the grace period in question.
+
+	o	"nos" is the number of times that the system balked from
+		boosting for inexplicable ("not otherwise specified")
+		reasons.  This can actually happen due to races involving
+		increments of the jiffies counter.
+
+o	In the line beginning with "exp balk", the fields are as follows:
+
+	o	"bt" is the number of times that the system balked from
+		boosting because there were no blocked tasks to boost.
+
+	o	"nos" is the number of times that the system balked from
+		boosting for inexplicable ("not otherwise specified")
+		reasons.
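The rcutiny rcu/rcudata file above is plain text meant for "cat". For completeness, a minimal userspace sketch of the same read in C, assuming debugfs is mounted at the conventional /sys/kernel/debug location:

#include <stdio.h>

/* Minimal sketch: dump the TINY_RCU counters described above. */
int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/rcu/rcudata", "r");
	char buf[4096];
	size_t n;

	if (!f) {
		perror("rcu/rcudata");	/* CONFIG_RCU_TRACE may be off */
		return 1;
	}
	n = fread(buf, 1, sizeof(buf) - 1, f);
	buf[n] = '\0';
	fputs(buf, stdout);		/* e.g. "rcu_sched: qlen: 0" */
	fclose(f);
	return 0;
}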
diff --git a/Documentation/dontdiff b/Documentation/dontdiff
index d9bcffd59433..470d3dba1a69 100644
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@@ -62,6 +62,10 @@ aic7*reg_print.c*
 aic7*seq.h*
 aicasm
 aicdb.h*
+altivec1.c
+altivec2.c
+altivec4.c
+altivec8.c
 asm-offsets.h
 asm_offsets.h
 autoconf.h*
@@ -76,6 +80,7 @@ btfixupprep
 build
 bvmlinux
 bzImage*
+capflags.c
 classlist.h*
 comp*.log
 compile.h*
@@ -94,6 +99,7 @@ devlist.h*
 docproc
 elf2ecoff
 elfconfig.h*
+evergreen_reg_safe.h
 fixdep
 flask.h
 fore200e_mkfirm
@@ -108,9 +114,16 @@ genksyms
 *_gray256.c
 ihex2fw
 ikconfig.h*
+inat-tables.c
 initramfs_data.cpio
 initramfs_data.cpio.gz
 initramfs_list
+int16.c
+int1.c
+int2.c
+int32.c
+int4.c
+int8.c
 kallsyms
 kconfig
 keywords.c
@@ -140,6 +153,7 @@ mkprep
 mktables
 mktree
 modpost
+modules.builtin
 modules.order
 modversions.h*
 ncscope.*
@@ -153,14 +167,23 @@ pca200e.bin
 pca200e_ecd.bin2
 piggy.gz
 piggyback
+piggy.S
 pnmtologo
 ppc_defs.h*
 pss_boot.h
 qconf
+r100_reg_safe.h
+r200_reg_safe.h
+r300_reg_safe.h
+r420_reg_safe.h
+r600_reg_safe.h
 raid6altivec*.c
 raid6int*.c
 raid6tables.c
 relocs
+rn50_reg_safe.h
+rs600_reg_safe.h
+rv515_reg_safe.h
 series
 setup
 setup.bin
@@ -169,6 +192,7 @@ sImage
 sm_tbl*
 split-include
 syscalltab.h
+tables.c
 tags
 tftpboot.img
 timeconst.h
@@ -190,6 +214,7 @@ vmlinux
 vmlinux-*
 vmlinux.aout
 vmlinux.lds
+voffset.h
 vsyscall.lds
 vsyscall_32.lds
 wanxlfw.inc
@@ -200,3 +225,4 @@ wakeup.elf
 wakeup.lds
 zImage*
 zconf.hash.c
+zoffset.h
diff --git a/Documentation/kernel-docs.txt b/Documentation/kernel-docs.txt
index 715eaaf1519d..9a8674629a07 100644
--- a/Documentation/kernel-docs.txt
+++ b/Documentation/kernel-docs.txt
@@ -537,7 +537,7 @@
       Notes: Further information in
       http://www.oreilly.com/catalog/linuxdrive2/

-     * Title: "Linux Device Drivers, 3nd Edition"
+     * Title: "Linux Device Drivers, 3rd Edition"
       Authors: Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman
       Publisher: O'Reilly & Associates.
       Date: 2005.
@@ -592,14 +592,6 @@
       Pages: 600.
       ISBN: 0-13-101908-2

-     * Title: "The Design and Implementation of the 4.4 BSD UNIX
-       Operating System"
-       Author: Marshall Kirk McKusick, Keith Bostic, Michael J. Karels,
-       John S. Quarterman.
-       Publisher: Addison-Wesley.
-       Date: 1996.
-       ISBN: 0-201-54979-4
-
      * Title: "Programming for the real world - POSIX.4"
       Author: Bill O. Gallmeister.
       Publisher: O'Reilly & Associates, Inc..
@@ -610,28 +602,13 @@
       POSIX.  Good reference.

      * Title: "UNIX Systems for Modern Architectures: Symmetric
-       Multiprocesssing and Caching for Kernel Programmers"
+       Multiprocessing and Caching for Kernel Programmers"
       Author: Curt Schimmel.
       Publisher: Addison Wesley.
       Date: June, 1994.
       Pages: 432.
       ISBN: 0-201-63338-8

-     * Title: "The Design and Implementation of the 4.3 BSD UNIX
-       Operating System"
-       Author: Samuel J. Leffler, Marshall Kirk McKusick, Michael J.
-       Karels, John S. Quarterman.
-       Publisher: Addison-Wesley.
-       Date: 1989 (reprinted with corrections on October, 1990).
-       ISBN: 0-201-06196-1
-
-     * Title: "The Design of the UNIX Operating System"
-       Author: Maurice J. Bach.
-       Publisher: Prentice Hall.
-       Date: 1986.
-       Pages: 471.
-       ISBN: 0-13-201757-1
-
      MISCELLANEOUS:

      * Name: linux/Documentation
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 992cda68fa63..f3dc951e949f 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1614,6 +1614,8 @@ and is between 256 and 4096 characters. It is defined in the file
	noapic		[SMP,APIC] Tells the kernel to not make use of any
			IOAPICs that may be present in the system.

+	noautogroup	Disable scheduler automatic task group creation.
+
	nobats		[PPC] Do not use BATs for mapping kernel lowmem
			on "Classic" PPC cores.

@@ -2459,12 +2461,13 @@ and is between 256 and 4096 characters. It is defined in the file
			to facilitate early boot debugging.
			See also Documentation/trace/events.txt

-	tsc=		Disable clocksource-must-verify flag for TSC.
+	tsc=		Disable clocksource stability checks for TSC.
			Format: <string>
			[x86] reliable: mark tsc clocksource as reliable, this
-			disables clocksource verification at runtime.
-			Used to enable high-resolution timer mode on older
-			hardware, and in virtualized environment.
+			disables clocksource verification at runtime, as well
+			as the stability checks done at bootup.  Used to enable
+			high-resolution timer mode on older hardware, and in
+			virtualized environment.
			[x86] noirqtime: Do not use TSC to do irq accounting.
			Used to run time disable IRQ_TIME_ACCOUNTING on any
			platforms where RDTSC is slow and this accounting
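As a concrete example of the reliable flag, a guest whose hypervisor guarantees a stable TSC can force high-resolution timer mode on by appending the option to its kernel command line (the surrounding options here are placeholders):

	root=/dev/sda1 ro quiet tsc=reliable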
diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt
index 30b43e1b2697..bdeb81ccb5f6 100644
--- a/Documentation/x86/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -600,6 +600,7 @@ Protocol:	2.07+
   0x00000001	lguest
   0x00000002	Xen
   0x00000003	Moorestown MID
+  0x00000004	CE4100 TV Platform

 Field name:	hardware_subarch_data
 Type:		write (subarch-dependent)
diff --git a/MAINTAINERS b/MAINTAINERS
index b1dda78a1e75..c5c7292daba0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2812,6 +2812,10 @@ M:	Thomas Gleixner <tglx@linutronix.de>
 S:	Maintained
 F:	Documentation/timers/
 F:	kernel/hrtimer.c
+F:	kernel/time/clockevents.c
+F:	kernel/time/tick*.*
+F:	kernel/time/timer_*.c
+F:	include/linux/clockevents.h
 F:	include/linux/hrtimer.h

 HIGH-SPEED SCC DRIVER FOR AX.25
@@ -5142,6 +5146,18 @@ L:	alsa-devel@alsa-project.org (moderated for non-subscribers)
 S:	Supported
 F:	sound/soc/s3c24xx

+TIMEKEEPING, NTP
+M:	John Stultz <johnstul@us.ibm.com>
+M:	Thomas Gleixner <tglx@linutronix.de>
+S:	Supported
+F:	include/linux/clocksource.h
+F:	include/linux/time.h
+F:	include/linux/timex.h
+F:	include/linux/timekeeping.h
+F:	kernel/time/clocksource.c
+F:	kernel/time/time*.c
+F:	kernel/time/ntp.c
+
 TLG2300 VIDEO4LINUX-2 DRIVER
 M:	Huang Shijie <shijie8@gmail.com>
 M:	Kang Yong <kangyong@telegent.com>
diff --git a/arch/Kconfig b/arch/Kconfig
index 8bf0fa652eb6..f78c2be4242b 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -175,4 +175,7 @@ config HAVE_PERF_EVENTS_NMI
 config HAVE_ARCH_JUMP_LABEL
	bool

+config HAVE_ARCH_MUTEX_CPU_RELAX
+	bool
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index e0b98e71ff47..6c6d7b339aae 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -99,6 +99,7 @@ config S390
	select HAVE_KERNEL_LZMA
	select HAVE_KERNEL_LZO
	select HAVE_GET_USER_PAGES_FAST
+	select HAVE_ARCH_MUTEX_CPU_RELAX
	select ARCH_INLINE_SPIN_TRYLOCK
	select ARCH_INLINE_SPIN_TRYLOCK_BH
	select ARCH_INLINE_SPIN_LOCK
diff --git a/arch/s390/include/asm/mutex.h b/arch/s390/include/asm/mutex.h
index 458c1f7fbc18..688271f5f2e4 100644
--- a/arch/s390/include/asm/mutex.h
+++ b/arch/s390/include/asm/mutex.h
@@ -7,3 +7,5 @@
  */

 #include <asm-generic/mutex-dec.h>
+
+#define arch_mutex_cpu_relax()	barrier()
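This override feeds the adaptive mutex spin: kernel/mutex.c calls arch_mutex_cpu_relax() while busy-waiting on the lock owner, so s390 can substitute a plain barrier() for its hypervisor-yielding cpu_relax(). A condensed sketch of the shape of that loop, with owner tracking and lockdep hooks omitted:

/* Condensed from the optimistic-spin loop in kernel/mutex.c.  Returns
 * true if the spin acquired the lock, false if the caller should fall
 * back to sleeping on the wait list. */
static bool mutex_spin_sketch(struct mutex *lock)
{
	for (;;) {
		if (atomic_cmpxchg(&lock->count, 1, 0) == 1)
			return true;		/* acquired the mutex */
		if (need_resched())
			return false;		/* stop spinning, sleep */

		/*
		 * Plain cpu_relax() unless the architecture selects
		 * HAVE_ARCH_MUTEX_CPU_RELAX; s390's cpu_relax() yields
		 * to the hypervisor, far too expensive once per spin
		 * iteration, hence the barrier() override above.
		 */
		arch_mutex_cpu_relax();
	}
}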
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e330da21b84f..b6fccb07123e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -377,6 +377,18 @@ config X86_ELAN

	  If unsure, choose "PC-compatible" instead.

+config X86_INTEL_CE
+	bool "CE4100 TV platform"
+	depends on PCI
+	depends on PCI_GODIRECT
+	depends on X86_32
+	depends on X86_EXTENDED_PLATFORM
+	select X86_REBOOTFIXUPS
+	---help---
+	  Select for the Intel CE media processor (CE4100) SOC.
+	  This option compiles in support for the CE4100 SOC for set-top
+	  boxes and media devices.
+
 config X86_MRST
	bool "Moorestown MID platform"
	depends on PCI
@@ -385,6 +397,10 @@ config X86_MRST
	depends on X86_EXTENDED_PLATFORM
	depends on X86_IO_APIC
	select APB_TIMER
+	select I2C
+	select SPI
+	select INTEL_SCU_IPC
+	select X86_PLATFORM_DEVICES
	---help---
	  Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin
	  Internet Device(MID) platform. Moorestown consists of two chips:
@@ -466,6 +482,19 @@ config X86_ES7000
	  Support for Unisys ES7000 systems. Say 'Y' here if this kernel is
	  supposed to run on an IA32-based Unisys ES7000 system.

+config X86_32_IRIS
+	tristate "Eurobraille/Iris poweroff module"
+	depends on X86_32
+	---help---
+	  The Iris machines from EuroBraille do not have APM or ACPI support
+	  to shut themselves down properly.  A special I/O sequence is
+	  needed to do so, which is what this module does at
+	  kernel shutdown.
+
+	  This is only for Iris machines from EuroBraille.
+
+	  If unused, say N.
+
 config SCHED_OMIT_FRAME_POINTER
	def_bool y
	prompt "Single-depth WCHAN output"
@@ -1141,16 +1170,16 @@ config NUMA
 comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
	depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI)

-config K8_NUMA
+config AMD_NUMA
	def_bool y
	prompt "Old style AMD Opteron NUMA detection"
	depends on X86_64 && NUMA && PCI
	---help---
-	  Enable K8 NUMA node topology detection.  You should say Y here if
-	  you have a multi processor AMD K8 system. This uses an old
-	  method to read the NUMA configuration directly from the builtin
-	  Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
-	  instead, which also takes priority if both are compiled in.
+	  Enable AMD NUMA node topology detection.  You should say Y here if
+	  you have a multi processor AMD system. This uses an old method to
+	  read the NUMA configuration directly from the builtin Northbridge
+	  of Opteron. It is recommended to use X86_64_ACPI_NUMA instead,
+	  which also takes priority if both are compiled in.

 config X86_64_ACPI_NUMA
	def_bool y
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index b59ee765414e..45143bbcfe5e 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -117,6 +117,17 @@ config DEBUG_RODATA_TEST
	  feature as well as for the change_page_attr() infrastructure.
	  If in doubt, say "N"

+config DEBUG_SET_MODULE_RONX
+	bool "Set loadable kernel module data as NX and text as RO"
+	depends on MODULES
+	---help---
+	  This option helps catch unintended modifications to loadable
+	  kernel module's text and read-only data. It also prevents execution
+	  of module data. Such protection may interfere with run-time code
+	  patching and dynamic kernel tracing - and they might also protect
+	  against certain classes of kernel exploits.
+	  If in doubt, say "N".
+
 config DEBUG_NX_TEST
	tristate "Testcase for the NX non-executable stack feature"
	depends on DEBUG_KERNEL && m
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 52f85a196fa0..35af09d13dc1 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -182,7 +182,7 @@ no_longmode:
	hlt
	jmp     1b

-#include "../../kernel/verify_cpu_64.S"
+#include "../../kernel/verify_cpu.S"

	/*
	 * Be careful here startup_64 needs to be at a predictable
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 4a2adaa9aefc..13009d1af99a 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -66,6 +66,7 @@ extern void alternatives_smp_module_add(struct module *mod, char *name,
 extern void alternatives_smp_module_del(struct module *mod);
 extern void alternatives_smp_switch(int smp);
 extern int alternatives_text_reserved(void *start, void *end);
+extern bool skip_smp_alternatives;
 #else
 static inline void alternatives_smp_module_add(struct module *mod, char *name,
					       void *locks, void *locks_end,
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index c8517f81b21e..6aee50d655d1 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -3,36 +3,53 @@

 #include <linux/pci.h>

-extern struct pci_device_id k8_nb_ids[];
+extern struct pci_device_id amd_nb_misc_ids[];
 struct bootnode;

-extern int early_is_k8_nb(u32 value);
-extern int cache_k8_northbridges(void);
-extern void k8_flush_garts(void);
-extern int k8_get_nodes(struct bootnode *nodes);
-extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
-extern int k8_scan_nodes(void);
+extern int early_is_amd_nb(u32 value);
+extern int amd_cache_northbridges(void);
+extern void amd_flush_garts(void);
+extern int amd_get_nodes(struct bootnode *nodes);
+extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn);
+extern int amd_scan_nodes(void);

-struct k8_northbridge_info {
+struct amd_northbridge {
+	struct pci_dev *misc;
+};
+
+struct amd_northbridge_info {
	u16 num;
-	u8 gart_supported;
-	struct pci_dev **nb_misc;
+	u64 flags;
+	struct amd_northbridge *nb;
 };
-extern struct k8_northbridge_info k8_northbridges;
+extern struct amd_northbridge_info amd_northbridges;
+
+#define AMD_NB_GART			0x1
+#define AMD_NB_L3_INDEX_DISABLE		0x2

 #ifdef CONFIG_AMD_NB

-static inline struct pci_dev *node_to_k8_nb_misc(int node)
+static inline int amd_nb_num(void)
 {
-	return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL;
+	return amd_northbridges.num;
 }

-#else
+static inline int amd_nb_has_feature(int feature)
+{
+	return ((amd_northbridges.flags & feature) == feature);
+}

-static inline struct pci_dev *node_to_k8_nb_misc(int node)
+static inline struct amd_northbridge *node_to_amd_nb(int node)
 {
-	return NULL;
+	return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
 }
+
+#else
+
+#define amd_nb_num(x)		0
+#define amd_nb_has_feature(x)	false
+#define node_to_amd_nb(x)	NULL
+
 #endif


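With the rename, the northbridge API also becomes capability-based: callers test a feature flag instead of peeking at the old gart_supported byte. A hedged sketch of the intended call pattern, modeled on the GART flush path in arch/x86/kernel/amd_nb.c (the function name below is illustrative):

/* Illustrative only: walk every AMD northbridge and program its PCI
 * "misc" function, but only when a GART is actually present. */
static void flush_all_garts_sketch(void)
{
	int i;

	if (!amd_nb_has_feature(AMD_NB_GART))
		return;

	for (i = 0; i < amd_nb_num(); i++) {
		struct pci_dev *misc = node_to_amd_nb(i)->misc;

		/* 0x9c is the flush-control register the real code pokes */
		pci_write_config_dword(misc, 0x9c, flush_words[i] | 1);
	}
}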
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index f6ce0bda3b98..cf12007796db 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -238,6 +238,7 @@ extern void setup_boot_APIC_clock(void);
 extern void setup_secondary_APIC_clock(void);
 extern int APIC_init_uniprocessor(void);
 extern void enable_NMI_through_LVT0(void);
+extern int apic_force_enable(void);

 /*
  * On 32bit this is mach-xxx local
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index a859ca461fb0..47a30ff8e517 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -145,6 +145,7 @@

 #ifdef CONFIG_X86_32
 # define MAX_IO_APICS 64
+# define MAX_LOCAL_APIC 256
 #else
 # define MAX_IO_APICS 128
 # define MAX_LOCAL_APIC 32768
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index 8e6218550e77..c8bfe63a06de 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -124,6 +124,7 @@ enum {
	X86_SUBARCH_LGUEST,
	X86_SUBARCH_XEN,
	X86_SUBARCH_MRST,
+	X86_SUBARCH_CE4100,
	X86_NR_SUBARCHS,
 };

diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 9479a037419f..0141b234406f 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -117,6 +117,10 @@ enum fixed_addresses {
	FIX_TEXT_POKE1,	/* reserve 2 pages for text_poke() */
	FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
	__end_of_permanent_fixed_addresses,
+
+#ifdef CONFIG_X86_MRST
+	FIX_LNW_VRTC,
+#endif
	/*
	 * 256 temporary boot-time mappings, used by early_ioremap(),
	 * before ioremap() is functional.
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 4aa2bb3b242a..ef328901c802 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -93,6 +93,17 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
	int err;

	/* See comment in fxsave() below. */
+#ifdef CONFIG_AS_FXSAVEQ
+	asm volatile("1:  fxrstorq %[fx]\n\t"
+		     "2:\n"
+		     ".section .fixup,\"ax\"\n"
+		     "3:  movl $-1,%[err]\n"
+		     "    jmp  2b\n"
+		     ".previous\n"
+		     _ASM_EXTABLE(1b, 3b)
+		     : [err] "=r" (err)
+		     : [fx] "m" (*fx), "0" (0));
+#else
	asm volatile("1:  rex64/fxrstor (%[fx])\n\t"
		     "2:\n"
		     ".section .fixup,\"ax\"\n"
@@ -102,6 +113,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
		     _ASM_EXTABLE(1b, 3b)
		     : [err] "=r" (err)
		     : [fx] "R" (fx), "m" (*fx), "0" (0));
+#endif
	return err;
 }

@@ -119,6 +131,17 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
		return -EFAULT;

	/* See comment in fxsave() below. */
+#ifdef CONFIG_AS_FXSAVEQ
+	asm volatile("1:  fxsaveq %[fx]\n\t"
+		     "2:\n"
+		     ".section .fixup,\"ax\"\n"
+		     "3:  movl $-1,%[err]\n"
+		     "    jmp  2b\n"
+		     ".previous\n"
+		     _ASM_EXTABLE(1b, 3b)
+		     : [err] "=r" (err), [fx] "=m" (*fx)
+		     : "0" (0));
+#else
	asm volatile("1:  rex64/fxsave (%[fx])\n\t"
		     "2:\n"
		     ".section .fixup,\"ax\"\n"
@@ -128,6 +151,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
		     _ASM_EXTABLE(1b, 3b)
		     : [err] "=r" (err), "=m" (*fx)
		     : [fx] "R" (fx), "0" (0));
+#endif
	if (unlikely(err) &&
	    __clear_user(fx, sizeof(struct i387_fxsave_struct)))
		err = -EFAULT;
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index a6b28d017c2f..0c5ca4e30d7b 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -159,7 +159,7 @@ struct io_apic_irq_attr;
 extern int io_apic_set_pci_routing(struct device *dev, int irq,
		 struct io_apic_irq_attr *irq_attr);
 void setup_IO_APIC_irq_extra(u32 gsi);
-extern void ioapic_init_mappings(void);
+extern void ioapic_and_gsi_init(void);
 extern void ioapic_insert_resources(void);

 extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
@@ -168,10 +168,9 @@ extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
 extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
 extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);

-extern void probe_nr_irqs_gsi(void);
 extern int get_nr_irqs_gsi(void);
-
 extern void setup_ioapic_ids_from_mpc(void);
+extern void setup_ioapic_ids_from_mpc_nocheck(void);

 struct mp_ioapic_gsi{
	u32 gsi_base;
@@ -189,9 +188,8 @@ extern void __init pre_init_apic_IRQ0(void);
 #define io_apic_assign_pci_irqs 0
 #define setup_ioapic_ids_from_mpc x86_init_noop
 static const int timer_through_8259 = 0;
-static inline void ioapic_init_mappings(void)	{ }
+static inline void ioapic_and_gsi_init(void) { }
 static inline void ioapic_insert_resources(void) { }
-static inline void probe_nr_irqs_gsi(void) { }
 #define gsi_top (NR_IRQS_LEGACY)
 static inline int mp_find_ioapic(u32 gsi) { return 0; }

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index c62c13cb9788..eb16e94ae04f 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -223,6 +223,9 @@ void intel_init_thermal(struct cpuinfo_x86 *c);

 void mce_log_therm_throt_event(__u64 status);

+/* Interrupt Handler for core thermal thresholds */
+extern int (*platform_thermal_notify)(__u64 msr_val);
+
 #ifdef CONFIG_X86_THERMAL_VECTOR
 extern void mcheck_intel_therm_init(void);
 #else
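A hedged sketch of a consumer: a platform driver assigns the pointer at init time and fields the notification when the threshold status bits (added to msr-index.h later in this merge) are set. All names below are made up for illustration:

/* Hypothetical module hooking the new thermal-threshold callback. */
static int example_thermal_notify(__u64 msr_val)
{
	if (msr_val & THERM_STATUS_THRESHOLD0)
		pr_info("core crossed thermal threshold 0\n");
	if (msr_val & THERM_STATUS_THRESHOLD1)
		pr_info("core crossed thermal threshold 1\n");
	return 0;
}

static int __init example_thermal_init(void)
{
	platform_thermal_notify = example_thermal_notify;
	return 0;
}

static void __exit example_thermal_exit(void)
{
	platform_thermal_notify = NULL;	/* unhook before unloading */
}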
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index ef51b501e22a..24215072d0e1 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -48,6 +48,12 @@ static inline struct microcode_ops * __init init_intel_microcode(void)

 #ifdef CONFIG_MICROCODE_AMD
 extern struct microcode_ops * __init init_amd_microcode(void);
+
+static inline void get_ucode_data(void *to, const u8 *from, size_t n)
+{
+	memcpy(to, from, n);
+}
+
 #else
 static inline struct microcode_ops * __init init_amd_microcode(void)
 {
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index c82868e9f905..0c90dd9f0505 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -5,8 +5,9 @@

 #include <asm/mpspec_def.h>
 #include <asm/x86_init.h>
+#include <asm/apicdef.h>

-extern int apic_version[MAX_APICS];
+extern int apic_version[];
 extern int pic_mode;

 #ifdef CONFIG_X86_32
@@ -107,7 +108,7 @@ extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
			   int active_high_low);
 #endif /* CONFIG_ACPI */

-#define PHYSID_ARRAY_SIZE	BITS_TO_LONGS(MAX_APICS)
+#define PHYSID_ARRAY_SIZE	BITS_TO_LONGS(MAX_LOCAL_APIC)

 struct physid_mask {
	unsigned long mask[PHYSID_ARRAY_SIZE];
@@ -122,31 +123,31 @@ typedef struct physid_mask physid_mask_t;
	test_and_set_bit(physid, (map).mask)

 #define physids_and(dst, src1, src2) \
-	bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
+	bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)

 #define physids_or(dst, src1, src2) \
-	bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
+	bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)

 #define physids_clear(map) \
-	bitmap_zero((map).mask, MAX_APICS)
+	bitmap_zero((map).mask, MAX_LOCAL_APIC)

 #define physids_complement(dst, src) \
-	bitmap_complement((dst).mask, (src).mask, MAX_APICS)
+	bitmap_complement((dst).mask, (src).mask, MAX_LOCAL_APIC)

 #define physids_empty(map) \
-	bitmap_empty((map).mask, MAX_APICS)
+	bitmap_empty((map).mask, MAX_LOCAL_APIC)

 #define physids_equal(map1, map2) \
-	bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
+	bitmap_equal((map1).mask, (map2).mask, MAX_LOCAL_APIC)

 #define physids_weight(map) \
-	bitmap_weight((map).mask, MAX_APICS)
+	bitmap_weight((map).mask, MAX_LOCAL_APIC)

 #define physids_shift_right(d, s, n) \
-	bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS)
+	bitmap_shift_right((d).mask, (s).mask, n, MAX_LOCAL_APIC)

 #define physids_shift_left(d, s, n) \
-	bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
+	bitmap_shift_left((d).mask, (s).mask, n, MAX_LOCAL_APIC)

 static inline unsigned long physids_coerce(physid_mask_t *map)
 {
@@ -159,14 +160,6 @@ static inline void physids_promote(unsigned long physids, physid_mask_t *map)
	map->mask[0] = physids;
 }

-/* Note: will create very large stack frames if physid_mask_t is big */
-#define physid_mask_of_physid(physid)					\
-	({								\
-		physid_mask_t __physid_mask = PHYSID_MASK_NONE;		\
-		physid_set(physid, __physid_mask);			\
-		__physid_mask;						\
-	})
-
 static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map)
 {
	physids_clear(*map);
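The dropped physid_mask_of_physid() built the whole mask by value, which is why its note warned about stack frames: with MAX_LOCAL_APIC at 32768 the bitmap is 4 KB. A short before/after sketch of the calling convention:

	physid_mask_t map;

	/* Removed, by-value style: copies a potentially 4 KB object. */
	/* map = physid_mask_of_physid(8); */

	/* Surviving style: clear and set in place through a pointer. */
	physid_set_mask_of_physid(8, &map);	/* only APIC id 8 is set */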
diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h
index 4a7f96d7c188..c0a955a9a087 100644
--- a/arch/x86/include/asm/mpspec_def.h
+++ b/arch/x86/include/asm/mpspec_def.h
@@ -15,13 +15,6 @@

 #ifdef CONFIG_X86_32
 # define MAX_MPC_ENTRY 1024
-# define MAX_APICS      256
-#else
-# if NR_CPUS <= 255
-#  define MAX_APICS     255
-# else
-#  define MAX_APICS   32768
-# endif
 #endif

 /* Intel MP Floating Pointer Structure */
diff --git a/arch/x86/include/asm/mrst-vrtc.h b/arch/x86/include/asm/mrst-vrtc.h
new file mode 100644
index 000000000000..73668abdbedf
--- /dev/null
+++ b/arch/x86/include/asm/mrst-vrtc.h
@@ -0,0 +1,9 @@
+#ifndef _MRST_VRTC_H
+#define _MRST_VRTC_H
+
+extern unsigned char vrtc_cmos_read(unsigned char reg);
+extern void vrtc_cmos_write(unsigned char val, unsigned char reg);
+extern unsigned long vrtc_get_time(void);
+extern int vrtc_set_mmss(unsigned long nowtime);
+
+#endif
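These accessors mirror the classic CMOS RTC interface, just routed through the Moorestown virtual RTC page rather than port I/O. A minimal sketch of a caller, reusing the standard register offsets from <linux/mc146818rtc.h>:

#include <linux/mc146818rtc.h>	/* RTC_SECONDS, RTC_MINUTES, ... */
#include <asm/mrst-vrtc.h>

/* Sketch: peek at the current minute:second pair in the vRTC. */
static void vrtc_peek_time(void)
{
	unsigned char sec = vrtc_cmos_read(RTC_SECONDS);
	unsigned char min = vrtc_cmos_read(RTC_MINUTES);

	pr_info("vRTC says %02u:%02u\n", min, sec);
}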
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
index 4a711a684b17..719f00b28ff5 100644
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -14,7 +14,9 @@
 #include <linux/sfi.h>

 extern int pci_mrst_init(void);
-int __init sfi_parse_mrtc(struct sfi_table_header *table);
+extern int __init sfi_parse_mrtc(struct sfi_table_header *table);
+extern int sfi_mrtc_num;
+extern struct sfi_rtc_table_entry sfi_mrtc_array[];

 /*
  * Medfield is the follow-up of Moorestown, it combines two chip solution into
@@ -50,4 +52,14 @@ extern void mrst_early_console_init(void);

 extern struct console early_hsu_console;
 extern void hsu_early_console_init(void);
+
+extern void intel_scu_devices_create(void);
+extern void intel_scu_devices_destroy(void);
+
+/* VRTC timer */
+#define MRST_VRTC_MAP_SZ	(1024)
+/*#define MRST_VRTC_PGOFFSET	(0xc00) */
+
+extern void mrst_rtc_init(void);
+
 #endif /* _ASM_X86_MRST_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 86030f63ba02..4d0dfa0d998e 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -257,6 +257,18 @@
 #define PACKAGE_THERM_INT_LOW_ENABLE		(1 << 1)
 #define PACKAGE_THERM_INT_PLN_ENABLE		(1 << 24)

+/* Thermal Thresholds Support */
+#define THERM_INT_THRESHOLD0_ENABLE	(1 << 15)
+#define THERM_SHIFT_THRESHOLD0		8
+#define THERM_MASK_THRESHOLD0		(0x7f << THERM_SHIFT_THRESHOLD0)
+#define THERM_INT_THRESHOLD1_ENABLE	(1 << 23)
+#define THERM_SHIFT_THRESHOLD1		16
+#define THERM_MASK_THRESHOLD1		(0x7f << THERM_SHIFT_THRESHOLD1)
+#define THERM_STATUS_THRESHOLD0		(1 << 6)
+#define THERM_LOG_THRESHOLD0		(1 << 7)
+#define THERM_STATUS_THRESHOLD1		(1 << 8)
+#define THERM_LOG_THRESHOLD1		(1 << 9)
+
 /* MISC_ENABLE bits: architectural */
 #define MSR_IA32_MISC_ENABLE_FAST_STRING	(1ULL << 0)
 #define MSR_IA32_MISC_ENABLE_TCC		(1ULL << 1)
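Putting the new masks together, arming thermal threshold 0 amounts to a read-modify-write of MSR_IA32_THERM_INTERRUPT. A sketch, assuming the hypothetical 'thresh' argument is already in the 7-bit encoding the mask expects:

/* Sketch: arm thermal threshold 0 and enable its interrupt. */
static void arm_thermal_threshold0(unsigned int thresh)
{
	u64 val;

	rdmsrl(MSR_IA32_THERM_INTERRUPT, val);
	val &= ~THERM_MASK_THRESHOLD0;
	val |= ((u64)thresh << THERM_SHIFT_THRESHOLD0) & THERM_MASK_THRESHOLD0;
	val |= THERM_INT_THRESHOLD0_ENABLE;
	wrmsrl(MSR_IA32_THERM_INTERRUPT, val);
}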
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index ef9975812c77..7709c12431b8 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -112,7 +112,7 @@ static inline void arch_safe_halt(void)

 static inline void halt(void)
 {
-	PVOP_VCALL0(pv_irq_ops.safe_halt);
+	PVOP_VCALL0(pv_irq_ops.halt);
 }

 static inline void wbinvd(void)
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index ca0437c714b2..676129229630 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -65,6 +65,7 @@ extern unsigned long pci_mem_start;

 #define PCIBIOS_MIN_CARDBUS_IO	0x4000

+extern int pcibios_enabled;
 void pcibios_config_init(void);
 struct pci_bus *pcibios_scan_root(int bus);

diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index d6763b139a84..db8aa19a08a2 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -53,6 +53,12 @@ extern void x86_mrst_early_setup(void);
 static inline void x86_mrst_early_setup(void) { }
 #endif

+#ifdef CONFIG_X86_INTEL_CE
+extern void x86_ce4100_early_setup(void);
+#else
+static inline void x86_ce4100_early_setup(void) { }
+#endif
+
 #ifndef _SETUP

 /*
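The stub pattern lets the 32-bit boot path call the hook unconditionally; only a CONFIG_X86_INTEL_CE kernel gets a real body. Dispatch happens off the bootparam subarch field, roughly as in the head32.c change elsewhere in this merge (condensed sketch):

	/* Early boot: pick the platform hook the boot loader asked for. */
	switch (boot_params.hdr.hardware_subarch) {
	case X86_SUBARCH_MRST:
		x86_mrst_early_setup();
		break;
	case X86_SUBARCH_CE4100:
		x86_ce4100_early_setup();	/* stub unless X86_INTEL_CE */
		break;
	default:
		break;
	}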
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 42d412fd8b02..ce1d54c8a433 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -26,20 +26,22 @@
  * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512,
  * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on.
  *
- * We will use 31 sets, one for sending BAU messages from each of the 32
+ * We will use one set for sending BAU messages from each of the
  * cpu's on the uvhub.
  *
  * TLB shootdown will use the first of the 8 descriptors of each set.
  * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set).
  */

+#define MAX_CPUS_PER_UVHUB		64
+#define MAX_CPUS_PER_SOCKET		32
+#define UV_ADP_SIZE			64 /* hardware-provided max. */
+#define UV_CPUS_PER_ACT_STATUS		32 /* hardware-provided max. */
 #define UV_ITEMS_PER_DESCRIPTOR		8
 /* the 'throttle' to prevent the hardware stay-busy bug */
 #define MAX_BAU_CONCURRENT		3
-#define UV_CPUS_PER_ACT_STATUS		32
 #define UV_ACT_STATUS_MASK		0x3
 #define UV_ACT_STATUS_SIZE		2
-#define UV_ADP_SIZE			32
 #define UV_DISTRIBUTION_SIZE		256
 #define UV_SW_ACK_NPENDING		8
 #define UV_NET_ENDPOINT_INTD		0x38
@@ -100,7 +102,6 @@
  * number of destination side software ack resources
  */
 #define DEST_NUM_RESOURCES		8
-#define MAX_CPUS_PER_NODE		32
 /*
  * completion statuses for sending a TLB flush message
  */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 1e994754d323..34244b2cd880 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -85,7 +85,6 @@ obj-$(CONFIG_DOUBLEFAULT) 	+= doublefault_32.o
 obj-$(CONFIG_KGDB)		+= kgdb.o
 obj-$(CONFIG_VM86)		+= vm86_32.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
-obj-$(CONFIG_EARLY_PRINTK_MRST)	+= early_printk_mrst.o

 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_APB_TIMER)		+= apb_timer.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 71232b941b6c..17c8090fabd4 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -198,6 +198,11 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
 {
 	unsigned int ver = 0;
 
+	if (id >= (MAX_LOCAL_APIC-1)) {
+		printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
+		return;
+	}
+
 	if (!enabled) {
 		++disabled_cpus;
 		return;
@@ -910,13 +915,13 @@ static int __init acpi_parse_madt_lapic_entries(void)
 	acpi_register_lapic_address(acpi_lapic_addr);
 
 	count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
-				      acpi_parse_sapic, MAX_APICS);
+				      acpi_parse_sapic, MAX_LOCAL_APIC);
 
 	if (!count) {
 		x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
-					acpi_parse_x2apic, MAX_APICS);
+					acpi_parse_x2apic, MAX_LOCAL_APIC);
 		count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
-					acpi_parse_lapic, MAX_APICS);
+					acpi_parse_lapic, MAX_LOCAL_APIC);
 	}
 	if (!count && !x2count) {
 		printk(KERN_ERR PREFIX "No LAPIC entries present\n");
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 553d0b0d639b..123608531c8f 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -353,6 +353,7 @@ void __init_or_module alternatives_smp_module_del(struct module *mod)
 	mutex_unlock(&smp_alt);
 }
 
+bool skip_smp_alternatives;
 void alternatives_smp_switch(int smp)
 {
 	struct smp_alt_module *mod;
@@ -368,7 +369,7 @@ void alternatives_smp_switch(int smp)
 	printk("lockdep: fixing up alternatives.\n");
 #endif
 
-	if (noreplace_smp || smp_alt_once)
+	if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
 		return;
 	BUG_ON(!smp && (num_online_cpus() > 1));
 
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 8f6463d8ed0d..affacb5e0065 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -12,95 +12,116 @@
 
 static u32 *flush_words;
 
-struct pci_device_id k8_nb_ids[] = {
+struct pci_device_id amd_nb_misc_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) },
 	{}
 };
-EXPORT_SYMBOL(k8_nb_ids);
+EXPORT_SYMBOL(amd_nb_misc_ids);
 
-struct k8_northbridge_info k8_northbridges;
-EXPORT_SYMBOL(k8_northbridges);
+struct amd_northbridge_info amd_northbridges;
+EXPORT_SYMBOL(amd_northbridges);
 
-static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
+static struct pci_dev *next_northbridge(struct pci_dev *dev,
+					struct pci_device_id *ids)
 {
 	do {
 		dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
 		if (!dev)
 			break;
-	} while (!pci_match_id(&k8_nb_ids[0], dev));
+	} while (!pci_match_id(ids, dev));
 	return dev;
 }
 
-int cache_k8_northbridges(void)
+int amd_cache_northbridges(void)
 {
-	int i;
-	struct pci_dev *dev;
+	int i = 0;
+	struct amd_northbridge *nb;
+	struct pci_dev *misc;
 
-	if (k8_northbridges.num)
+	if (amd_nb_num())
 		return 0;
 
-	dev = NULL;
-	while ((dev = next_k8_northbridge(dev)) != NULL)
-		k8_northbridges.num++;
+	misc = NULL;
+	while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL)
+		i++;
 
-	/* some CPU families (e.g. family 0x11) do not support GART */
-	if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
-	    boot_cpu_data.x86 == 0x15)
-		k8_northbridges.gart_supported = 1;
+	if (i == 0)
+		return 0;
 
-	k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) *
-					  sizeof(void *), GFP_KERNEL);
-	if (!k8_northbridges.nb_misc)
+	nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL);
+	if (!nb)
 		return -ENOMEM;
 
-	if (!k8_northbridges.num) {
-		k8_northbridges.nb_misc[0] = NULL;
-		return 0;
-	}
+	amd_northbridges.nb = nb;
+	amd_northbridges.num = i;
 
-	if (k8_northbridges.gart_supported) {
-		flush_words = kmalloc(k8_northbridges.num * sizeof(u32),
-				      GFP_KERNEL);
-		if (!flush_words) {
-			kfree(k8_northbridges.nb_misc);
-			return -ENOMEM;
-		}
-	}
+	misc = NULL;
+	for (i = 0; i != amd_nb_num(); i++) {
+		node_to_amd_nb(i)->misc = misc =
+			next_northbridge(misc, amd_nb_misc_ids);
+	}
+
+	/* some CPU families (e.g. family 0x11) do not support GART */
+	if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
+	    boot_cpu_data.x86 == 0x15)
+		amd_northbridges.flags |= AMD_NB_GART;
+
+	/*
+	 * Some CPU families support L3 Cache Index Disable. There are some
+	 * limitations because of E382 and E388 on family 0x10.
+	 */
+	if (boot_cpu_data.x86 == 0x10 &&
+	    boot_cpu_data.x86_model >= 0x8 &&
+	    (boot_cpu_data.x86_model > 0x9 ||
+	     boot_cpu_data.x86_mask >= 0x1))
+		amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
 
-	dev = NULL;
-	i = 0;
-	while ((dev = next_k8_northbridge(dev)) != NULL) {
-		k8_northbridges.nb_misc[i] = dev;
-		if (k8_northbridges.gart_supported)
-			pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
-	}
-	k8_northbridges.nb_misc[i] = NULL;
 	return 0;
 }
-EXPORT_SYMBOL_GPL(cache_k8_northbridges);
+EXPORT_SYMBOL_GPL(amd_cache_northbridges);
 
 /* Ignores subdevice/subvendor but as far as I can figure out
    they're useless anyways */
-int __init early_is_k8_nb(u32 device)
+int __init early_is_amd_nb(u32 device)
 {
 	struct pci_device_id *id;
 	u32 vendor = device & 0xffff;
 	device >>= 16;
-	for (id = k8_nb_ids; id->vendor; id++)
+	for (id = amd_nb_misc_ids; id->vendor; id++)
 		if (vendor == id->vendor && device == id->device)
 			return 1;
 	return 0;
 }
 
-void k8_flush_garts(void)
+int amd_cache_gart(void)
+{
+	int i;
+
+	if (!amd_nb_has_feature(AMD_NB_GART))
+		return 0;
+
+	flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL);
+	if (!flush_words) {
+		amd_northbridges.flags &= ~AMD_NB_GART;
+		return -ENOMEM;
+	}
+
+	for (i = 0; i != amd_nb_num(); i++)
+		pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c,
+				      &flush_words[i]);
+
+	return 0;
+}
+
+void amd_flush_garts(void)
 {
 	int flushed, i;
 	unsigned long flags;
 	static DEFINE_SPINLOCK(gart_lock);
 
-	if (!k8_northbridges.gart_supported)
+	if (!amd_nb_has_feature(AMD_NB_GART))
 		return;
 
 	/* Avoid races between AGP and IOMMU. In theory it's not needed
@@ -109,16 +130,16 @@ void k8_flush_garts(void)
 	   that it doesn't matter to serialize more. -AK */
 	spin_lock_irqsave(&gart_lock, flags);
 	flushed = 0;
-	for (i = 0; i < k8_northbridges.num; i++) {
-		pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c,
-				       flush_words[i]|1);
+	for (i = 0; i < amd_nb_num(); i++) {
+		pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c,
+				       flush_words[i] | 1);
 		flushed++;
 	}
-	for (i = 0; i < k8_northbridges.num; i++) {
+	for (i = 0; i < amd_nb_num(); i++) {
 		u32 w;
 		/* Make sure the hardware actually executed the flush*/
 		for (;;) {
-			pci_read_config_dword(k8_northbridges.nb_misc[i],
+			pci_read_config_dword(node_to_amd_nb(i)->misc,
 					      0x9c, &w);
 			if (!(w & 1))
 				break;
@@ -129,19 +150,23 @@ void k8_flush_garts(void)
 	if (!flushed)
 		printk("nothing to flush?\n");
 }
-EXPORT_SYMBOL_GPL(k8_flush_garts);
+EXPORT_SYMBOL_GPL(amd_flush_garts);
 
-static __init int init_k8_nbs(void)
+static __init int init_amd_nbs(void)
 {
 	int err = 0;
 
-	err = cache_k8_northbridges();
+	err = amd_cache_northbridges();
 
 	if (err < 0)
-		printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n");
+		printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n");
+
+	if (amd_cache_gart() < 0)
+		printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, "
+		       "GART support disabled.\n");
 
 	return err;
 }
 
 /* This has to go after the PCI subsystem */
-fs_initcall(init_k8_nbs);
+fs_initcall(init_amd_nbs);
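
The accessors the new amd_nb.c code leans on — amd_nb_num(), amd_nb_has_feature() and node_to_amd_nb() — live in arch/x86/include/asm/amd_nb.h, which this merge also touches but whose hunk is not shown here. A minimal sketch of the shape their call sites imply; this is an assumption inferred from the uses above, not the verbatim header:

#include <linux/types.h>
#include <linux/pci.h>

/* Sketch only -- inferred from the call sites in amd_nb.c above. */
struct amd_northbridge {
	struct pci_dev *misc;		/* PCI function 3 of the NB */
};

struct amd_northbridge_info {
	u16 num;			/* number of northbridges found */
	u64 flags;			/* AMD_NB_GART, AMD_NB_L3_INDEX_DISABLE */
	struct amd_northbridge *nb;	/* array indexed by node */
};
extern struct amd_northbridge_info amd_northbridges;

static inline int amd_nb_num(void)
{
	return amd_northbridges.num;
}

static inline int amd_nb_has_feature(int feature)
{
	return (amd_northbridges.flags & feature) == feature;
}

static inline struct amd_northbridge *node_to_amd_nb(int node)
{
	return node < amd_nb_num() ? &amd_northbridges.nb[node] : NULL;
}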
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 92543c73cf8e..7c9ab59653e8 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -315,6 +315,7 @@ static void apbt_setup_irq(struct apbt_dev *adev)
 
 	if (system_state == SYSTEM_BOOTING) {
 		irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
+		irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
 		/* APB timer irqs are set up as mp_irqs, timer is edge type */
 		__set_irq_handler(adev->irq, handle_edge_irq, 0, "edge");
 		if (request_irq(adev->irq, apbt_interrupt_handler,
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index b3a16e8f0703..dcd7c83e1659 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -206,7 +206,7 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
  * Do an PCI bus scan by hand because we're running before the PCI
  * subsystem.
  *
- * All K8 AGP bridges are AGPv3 compliant, so we can do this scan
+ * All AMD AGP bridges are AGPv3 compliant, so we can do this scan
  * generically. It's probably overkill to always scan all slots because
  * the AGP bridges should be always an own bus on the HT hierarchy,
  * but do it here for future safety.
@@ -303,7 +303,7 @@ void __init early_gart_iommu_check(void)
 		dev_limit = bus_dev_ranges[i].dev_limit;
 
 		for (slot = dev_base; slot < dev_limit; slot++) {
-			if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+			if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
 				continue;
 
 			ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
@@ -358,7 +358,7 @@ void __init early_gart_iommu_check(void)
 		dev_limit = bus_dev_ranges[i].dev_limit;
 
 		for (slot = dev_base; slot < dev_limit; slot++) {
-			if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+			if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
 				continue;
 
 			ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
@@ -400,7 +400,7 @@ int __init gart_iommu_hole_init(void)
 		dev_limit = bus_dev_ranges[i].dev_limit;
 
 		for (slot = dev_base; slot < dev_limit; slot++) {
-			if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+			if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
 				continue;
 
 			iommu_detected = 1;
@@ -518,7 +518,7 @@ out:
 		dev_base = bus_dev_ranges[i].dev_base;
 		dev_limit = bus_dev_ranges[i].dev_limit;
 		for (slot = dev_base; slot < dev_limit; slot++) {
-			if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+			if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
 				continue;
 
 			write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index fb7657822aad..879999a5230f 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -431,17 +431,18 @@ int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
 	reserved = reserve_eilvt_offset(offset, new);
 
 	if (reserved != new) {
-		pr_err(FW_BUG "cpu %d, try to setup vector 0x%x, but "
-		       "vector 0x%x was already reserved by another core, "
-		       "APIC%lX=0x%x\n",
-		       smp_processor_id(), new, reserved, reg, old);
+		pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
+		       "vector 0x%x, but the register is already in use for "
+		       "vector 0x%x on another cpu\n",
+		       smp_processor_id(), reg, offset, new, reserved);
 		return -EINVAL;
 	}
 
 	if (!eilvt_entry_is_changeable(old, new)) {
-		pr_err(FW_BUG "cpu %d, try to setup vector 0x%x but "
-		       "register already in use, APIC%lX=0x%x\n",
-		       smp_processor_id(), new, reg, old);
+		pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
+		       "vector 0x%x, but the register is already in use for "
+		       "vector 0x%x on this cpu\n",
+		       smp_processor_id(), reg, offset, new, old);
 		return -EBUSY;
 	}
 
@@ -1532,13 +1533,60 @@ static int __init detect_init_APIC(void)
 	return 0;
 }
 #else
+
+static int apic_verify(void)
+{
+	u32 features, h, l;
+
+	/*
+	 * The APIC feature bit should now be enabled
+	 * in `cpuid'
+	 */
+	features = cpuid_edx(1);
+	if (!(features & (1 << X86_FEATURE_APIC))) {
+		pr_warning("Could not enable APIC!\n");
+		return -1;
+	}
+	set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+	/* The BIOS may have set up the APIC at some other address */
+	rdmsr(MSR_IA32_APICBASE, l, h);
+	if (l & MSR_IA32_APICBASE_ENABLE)
+		mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
+
+	pr_info("Found and enabled local APIC!\n");
+	return 0;
+}
+
+int apic_force_enable(void)
+{
+	u32 h, l;
+
+	if (disable_apic)
+		return -1;
+
+	/*
+	 * Some BIOSes disable the local APIC in the APIC_BASE
+	 * MSR. This can only be done in software for Intel P6 or later
+	 * and AMD K7 (Model > 1) or later.
+	 */
+	rdmsr(MSR_IA32_APICBASE, l, h);
+	if (!(l & MSR_IA32_APICBASE_ENABLE)) {
+		pr_info("Local APIC disabled by BIOS -- reenabling.\n");
+		l &= ~MSR_IA32_APICBASE_BASE;
+		l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
+		wrmsr(MSR_IA32_APICBASE, l, h);
+		enabled_via_apicbase = 1;
+	}
+	return apic_verify();
+}
+
 /*
  * Detect and initialize APIC
  */
 static int __init detect_init_APIC(void)
 {
-	u32 h, l, features;
-
 	/* Disabled by kernel option? */
 	if (disable_apic)
 		return -1;
@@ -1568,38 +1616,12 @@ static int __init detect_init_APIC(void)
 		       "you can enable it with \"lapic\"\n");
 			return -1;
 		}
-	/*
-	 * Some BIOSes disable the local APIC in the APIC_BASE
-	 * MSR. This can only be done in software for Intel P6 or later
-	 * and AMD K7 (Model > 1) or later.
-	 */
-	rdmsr(MSR_IA32_APICBASE, l, h);
-	if (!(l & MSR_IA32_APICBASE_ENABLE)) {
-		pr_info("Local APIC disabled by BIOS -- reenabling.\n");
-		l &= ~MSR_IA32_APICBASE_BASE;
-		l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
-		wrmsr(MSR_IA32_APICBASE, l, h);
-		enabled_via_apicbase = 1;
-	}
-	}
-	/*
-	 * The APIC feature bit should now be enabled
-	 * in `cpuid'
-	 */
-	features = cpuid_edx(1);
-	if (!(features & (1 << X86_FEATURE_APIC))) {
-		pr_warning("Could not enable APIC!\n");
-		return -1;
+		if (apic_force_enable())
+			return -1;
+	} else {
+		if (apic_verify())
+			return -1;
 	}
-	set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
-	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
-	/* The BIOS may have set up the APIC at some other address */
-	rdmsr(MSR_IA32_APICBASE, l, h);
-	if (l & MSR_IA32_APICBASE_ENABLE)
-		mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
-
-	pr_info("Found and enabled local APIC!\n");
 
 	apic_pm_activate();
 
@@ -1687,7 +1709,7 @@ void __init init_apic_mappings(void)
  * This initializes the IO-APIC and APIC hardware if this is
  * a UP kernel.
  */
-int apic_version[MAX_APICS];
+int apic_version[MAX_LOCAL_APIC];
 
 int __init APIC_init_uniprocessor(void)
 {
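
The detect_init_APIC() rework above pulls the MSR re-enable path out into apic_force_enable() and the CPUID/MSR verification into apic_verify(), so either can be reused outside boot-time detection. A hedged restatement of the resulting control flow; the two helper names are real, the wrapper below is illustrative only:

/* Illustrative sketch mirroring the branch structure now in
 * detect_init_APIC(); not kernel code. */
static int __init detect_flow_sketch(int has_apic, int force_enable)
{
	if (!has_apic) {
		if (!force_enable)
			return -1;		/* APIC stays disabled */
		return apic_force_enable();	/* re-enable via APICBASE MSR,
						   then apic_verify() inside */
	}
	return apic_verify();			/* already on: check the CPUID
						   bit, record mp_lapic_addr */
}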
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 16c2db8750a2..f6cd5b410770 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1933,8 +1933,7 @@ void disable_IO_APIC(void)
  *
  * by Matt Domsch <Matt_Domsch@dell.com>  Tue Dec 21 12:25:05 CST 1999
  */
-
-void __init setup_ioapic_ids_from_mpc(void)
+void __init setup_ioapic_ids_from_mpc_nocheck(void)
 {
 	union IO_APIC_reg_00 reg_00;
 	physid_mask_t phys_id_present_map;
@@ -1943,15 +1942,6 @@ void __init setup_ioapic_ids_from_mpc(void)
 	unsigned char old_id;
 	unsigned long flags;
 
-	if (acpi_ioapic)
-		return;
-	/*
-	 * Don't check I/O APIC IDs for xAPIC systems. They have
-	 * no meaning without the serial APIC bus.
-	 */
-	if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
-		|| APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
-		return;
 	/*
 	 * This is broken; anything with a real cpu count has to
 	 * circumvent this idiocy regardless.
@@ -2005,7 +1995,6 @@ void __init setup_ioapic_ids_from_mpc(void)
 		physids_or(phys_id_present_map, phys_id_present_map, tmp);
 	}
 
-
 	/*
 	 * We need to adjust the IRQ routing table
 	 * if the ID changed.
@@ -2041,6 +2030,21 @@ void __init setup_ioapic_ids_from_mpc(void)
 		apic_printk(APIC_VERBOSE, " ok.\n");
 	}
 }
+
+void __init setup_ioapic_ids_from_mpc(void)
+{
+
+	if (acpi_ioapic)
+		return;
+	/*
+	 * Don't check I/O APIC IDs for xAPIC systems. They have
+	 * no meaning without the serial APIC bus.
+	 */
+	if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+		|| APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+		return;
+	setup_ioapic_ids_from_mpc_nocheck();
+}
 #endif
 
 int no_timer_check __initdata;
@@ -3593,7 +3597,7 @@ int __init io_apic_get_redir_entries (int ioapic)
 	return reg_01.bits.entries + 1;
 }
 
-void __init probe_nr_irqs_gsi(void)
+static void __init probe_nr_irqs_gsi(void)
 {
 	int nr;
 
@@ -3910,7 +3914,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
 	return res;
 }
 
-void __init ioapic_init_mappings(void)
+void __init ioapic_and_gsi_init(void)
 {
 	unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
 	struct resource *ioapic_res;
@@ -3948,6 +3952,8 @@ fake_ioapic_page:
 		ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
 		ioapic_res++;
 	}
+
+	probe_nr_irqs_gsi();
 }
 
 void __init ioapic_insert_resources(void)
@@ -4057,7 +4063,8 @@ void __init pre_init_apic_IRQ0(void)
 
 	printk(KERN_INFO "Early APIC setup for system timer0\n");
 #ifndef CONFIG_SMP
-	phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
+	physid_set_mask_of_physid(boot_cpu_physical_apicid,
+				  &phys_cpu_present_map);
 #endif
 	/* Make sure the irq descriptor is set up */
 	cfg = alloc_irq_and_cfg_at(0, 0);
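
The split of setup_ioapic_ids_from_mpc() above leaves the ACPI and xAPIC early-returns in a thin policy wrapper and moves the actual ID reprogramming into setup_ioapic_ids_from_mpc_nocheck(). The point of the _nocheck entry is that platform code which knows those checks do not apply on its board (the CE4100 setup added elsewhere in this merge is the plausible user) can call the worker directly. A hedged sketch; the caller below is hypothetical:

#include <linux/init.h>

extern void setup_ioapic_ids_from_mpc_nocheck(void);

/* Hypothetical platform quirk: force I/O APIC ID reprogramming even
 * though acpi_ioapic is set, by bypassing the policy wrapper. */
static void __init example_platform_ioapic_fixup(void)
{
	setup_ioapic_ids_from_mpc_nocheck();
}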
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 927902d90fe6..936613e77113 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -48,6 +48,16 @@ unsigned int uv_apicid_hibits;
 EXPORT_SYMBOL_GPL(uv_apicid_hibits);
 static DEFINE_SPINLOCK(uv_nmi_lock);
 
+static unsigned long __init uv_early_read_mmr(unsigned long addr)
+{
+	unsigned long val, *mmr;
+
+	mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr));
+	val = *mmr;
+	early_iounmap(mmr, sizeof(*mmr));
+	return val;
+}
+
 static inline bool is_GRU_range(u64 start, u64 end)
 {
 	return start >= gru_start_paddr && end <= gru_end_paddr;
@@ -58,28 +68,24 @@ static bool uv_is_untracked_pat_range(u64 start, u64 end)
 	return is_ISA_range(start, end) || is_GRU_range(start, end);
 }
 
-static int early_get_nodeid(void)
+static int __init early_get_pnodeid(void)
 {
 	union uvh_node_id_u node_id;
-	unsigned long *mmr;
-
-	mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
-	node_id.v = *mmr;
-	early_iounmap(mmr, sizeof(*mmr));
+	union uvh_rh_gam_config_mmr_u m_n_config;
+	int pnode;
 
 	/* Currently, all blades have same revision number */
+	node_id.v = uv_early_read_mmr(UVH_NODE_ID);
+	m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR);
 	uv_min_hub_revision_id = node_id.s.revision;
 
-	return node_id.s.node_id;
+	pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1);
+	return pnode;
 }
 
 static void __init early_get_apic_pnode_shift(void)
 {
-	unsigned long *mmr;
-
-	mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_APICID, sizeof(*mmr));
-	uvh_apicid.v = *mmr;
-	early_iounmap(mmr, sizeof(*mmr));
+	uvh_apicid.v = uv_early_read_mmr(UVH_APICID);
 	if (!uvh_apicid.v)
 		/*
 		 * Old bios, use default value
@@ -95,21 +101,17 @@ static void __init early_get_apic_pnode_shift(void)
 static void __init uv_set_apicid_hibit(void)
 {
 	union uvh_lb_target_physical_apic_id_mask_u apicid_mask;
-	unsigned long *mmr;
 
-	mmr = early_ioremap(UV_LOCAL_MMR_BASE |
-		UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK, sizeof(*mmr));
-	apicid_mask.v = *mmr;
-	early_iounmap(mmr, sizeof(*mmr));
+	apicid_mask.v = uv_early_read_mmr(UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK);
 	uv_apicid_hibits = apicid_mask.s.bit_enables & UV_APICID_HIBIT_MASK;
 }
 
 static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
-	int nodeid;
+	int pnodeid;
 
 	if (!strcmp(oem_id, "SGI")) {
-		nodeid = early_get_nodeid();
+		pnodeid = early_get_pnodeid();
 		early_get_apic_pnode_shift();
 		x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
 		x86_platform.nmi_init = uv_nmi_init;
@@ -119,7 +121,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 		uv_system_type = UV_X2APIC;
 	else if (!strcmp(oem_table_id, "UVH")) {
 		__get_cpu_var(x2apic_extra_bits) =
-			nodeid << (uvh_apicid.s.pnode_shift - 1);
+			pnodeid << uvh_apicid.s.pnode_shift;
 		uv_system_type = UV_NON_UNIQUE_APIC;
 		uv_set_apicid_hibit();
 		return 1;
@@ -682,27 +684,32 @@ void uv_nmi_init(void)
 void __init uv_system_init(void)
 {
 	union uvh_rh_gam_config_mmr_u  m_n_config;
+	union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
 	union uvh_node_id_u node_id;
 	unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
-	int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
+	int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val, n_io;
 	int gnode_extra, max_pnode = 0;
 	unsigned long mmr_base, present, paddr;
-	unsigned short pnode_mask;
+	unsigned short pnode_mask, pnode_io_mask;
 
 	map_low_mmrs();
 
 	m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR );
 	m_val = m_n_config.s.m_skt;
 	n_val = m_n_config.s.n_skt;
+	mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
+	n_io = mmioh.s.n_io;
 	mmr_base =
 	    uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
 	    ~UV_MMR_ENABLE;
 	pnode_mask = (1 << n_val) - 1;
+	pnode_io_mask = (1 << n_io) - 1;
+
 	node_id.v = uv_read_local_mmr(UVH_NODE_ID);
 	gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
 	gnode_upper = ((unsigned long)gnode_extra << m_val);
-	printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n",
-			n_val, m_val, gnode_upper, gnode_extra);
+	printk(KERN_INFO "UV: N %d, M %d, N_IO: %d, gnode_upper 0x%lx, gnode_extra 0x%x, pnode_mask 0x%x, pnode_io_mask 0x%x\n",
+			n_val, m_val, n_io, gnode_upper, gnode_extra, pnode_mask, pnode_io_mask);
 
 	printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
 
@@ -735,7 +742,7 @@ void __init uv_system_init(void)
 		for (j = 0; j < 64; j++) {
 			if (!test_bit(j, &present))
 				continue;
-			pnode = (i * 64 + j);
+			pnode = (i * 64 + j) & pnode_mask;
 			uv_blade_info[blade].pnode = pnode;
 			uv_blade_info[blade].nr_possible_cpus = 0;
 			uv_blade_info[blade].nr_online_cpus = 0;
@@ -756,6 +763,7 @@ void __init uv_system_init(void)
 		/*
 		 * apic_pnode_shift must be set before calling uv_apicid_to_pnode();
 		 */
+		uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
 		uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift;
 		pnode = uv_apicid_to_pnode(apicid);
 		blade = boot_pnode_to_blade(pnode);
@@ -772,7 +780,6 @@ void __init uv_system_init(void)
 		uv_cpu_hub_info(cpu)->numa_blade_id = blade;
 		uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
 		uv_cpu_hub_info(cpu)->pnode = pnode;
-		uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
 		uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1;
 		uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
 		uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
@@ -796,7 +803,7 @@ void __init uv_system_init(void)
 
 	map_gru_high(max_pnode);
 	map_mmr_high(max_pnode);
-	map_mmioh_high(max_pnode);
+	map_mmioh_high(max_pnode & pnode_io_mask);
 
 	uv_cpu_init();
 	uv_scir_register_cpu_notifier();
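
For the pnode math introduced in early_get_pnodeid() above: the node_id field is shifted right once (its low bit is not part of the pnode) and masked to the n_skt-wide field read from the config MMR. A standalone worked example with made-up register values:

#include <stdio.h>

int main(void)
{
	unsigned long node_id = 0x2a;	/* hypothetical UVH_NODE_ID.node_id */
	int n_skt = 4;			/* hypothetical CONFIG_MMR n_skt */
	int pnode = (node_id >> 1) & ((1 << n_skt) - 1);

	/* (0x2a >> 1) = 0x15; 0x15 & 0xf = 5 */
	printf("pnode = %d\n", pnode);
	return 0;
}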
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 17ad03366211..9ecf81f9b90f 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -149,8 +149,7 @@ union _cpuid4_leaf_ecx {
 };
 
 struct amd_l3_cache {
-	struct	 pci_dev *dev;
-	bool	 can_disable;
+	struct	 amd_northbridge *nb;
 	unsigned indices;
 	u8	 subcaches[4];
 };
@@ -311,14 +310,12 @@ struct _cache_attr {
 /*
  * L3 cache descriptors
  */
-static struct amd_l3_cache **__cpuinitdata l3_caches;
-
 static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
 {
 	unsigned int sc0, sc1, sc2, sc3;
 	u32 val = 0;
 
-	pci_read_config_dword(l3->dev, 0x1C4, &val);
+	pci_read_config_dword(l3->nb->misc, 0x1C4, &val);
 
 	/* calculate subcache sizes */
 	l3->subcaches[0] = sc0 = !(val & BIT(0));
@@ -330,47 +327,14 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
 	l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
 }
 
-static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
-{
-	struct amd_l3_cache *l3;
-	struct pci_dev *dev = node_to_k8_nb_misc(node);
-
-	l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
-	if (!l3) {
-		printk(KERN_WARNING "Error allocating L3 struct\n");
-		return NULL;
-	}
-
-	l3->dev = dev;
-
-	amd_calc_l3_indices(l3);
-
-	return l3;
-}
-
-static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
-					   int index)
+static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf,
+					int index)
 {
+	static struct amd_l3_cache *__cpuinitdata l3_caches;
 	int node;
 
-	if (boot_cpu_data.x86 != 0x10)
-		return;
-
-	if (index < 3)
-		return;
-
-	/* see errata #382 and #388 */
-	if (boot_cpu_data.x86_model < 0x8)
-		return;
-
-	if ((boot_cpu_data.x86_model == 0x8 ||
-	     boot_cpu_data.x86_model == 0x9)
-		&&
-	     boot_cpu_data.x86_mask < 0x1)
-		return;
-
-	/* not in virtualized environments */
-	if (k8_northbridges.num == 0)
+	/* only for L3, and not in virtualized environments */
+	if (index < 3 || amd_nb_num() == 0)
 		return;
 
 	/*
@@ -378,7 +342,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
 	 * never freed but this is done only on shutdown so it doesn't matter.
 	 */
 	if (!l3_caches) {
-		int size = k8_northbridges.num * sizeof(struct amd_l3_cache *);
+		int size = amd_nb_num() * sizeof(struct amd_l3_cache);
 
 		l3_caches = kzalloc(size, GFP_ATOMIC);
 		if (!l3_caches)
@@ -387,14 +351,12 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
 
 	node = amd_get_nb_id(smp_processor_id());
 
-	if (!l3_caches[node]) {
-		l3_caches[node] = amd_init_l3_cache(node);
-		l3_caches[node]->can_disable = true;
+	if (!l3_caches[node].nb) {
+		l3_caches[node].nb = node_to_amd_nb(node);
+		amd_calc_l3_indices(&l3_caches[node]);
 	}
 
-	WARN_ON(!l3_caches[node]);
-
-	this_leaf->l3 = l3_caches[node];
+	this_leaf->l3 = &l3_caches[node];
 }
 
 /*
@@ -408,7 +370,7 @@ int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot)
 {
 	unsigned int reg = 0;
 
-	pci_read_config_dword(l3->dev, 0x1BC + slot * 4, &reg);
+	pci_read_config_dword(l3->nb->misc, 0x1BC + slot * 4, &reg);
 
 	/* check whether this slot is activated already */
 	if (reg & (3UL << 30))
@@ -422,7 +384,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
 {
 	int index;
 
-	if (!this_leaf->l3 || !this_leaf->l3->can_disable)
+	if (!this_leaf->l3 ||
+	    !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
 		return -EINVAL;
 
 	index = amd_get_l3_disable_slot(this_leaf->l3, slot);
@@ -457,7 +420,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
 		if (!l3->subcaches[i])
 			continue;
 
-		pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
+		pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg);
 
 		/*
 		 * We need to WBINVD on a core on the node containing the L3
@@ -467,7 +430,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
 		wbinvd_on_cpu(cpu);
 
 		reg |= BIT(31);
-		pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
+		pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg);
 	}
 }
 
@@ -524,7 +487,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (!this_leaf->l3 || !this_leaf->l3->can_disable)
+	if (!this_leaf->l3 ||
+	    !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
 		return -EINVAL;
 
 	cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
@@ -545,7 +509,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
 #define STORE_CACHE_DISABLE(slot)					\
 static ssize_t								\
 store_cache_disable_##slot(struct _cpuid4_info *this_leaf,		\
-			    const char *buf, size_t count)		\
+			   const char *buf, size_t count)		\
 {									\
 	return store_cache_disable(this_leaf, buf, count, slot);	\
 }
@@ -558,10 +522,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
 		show_cache_disable_1, store_cache_disable_1);
 
 #else	/* CONFIG_AMD_NB */
-static void __cpuinit
-amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
-{
-};
+#define amd_init_l3_cache(x, y)
 #endif /* CONFIG_AMD_NB */
 
 static int
@@ -575,7 +536,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
 
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
 		amd_cpuid4(index, &eax, &ebx, &ecx);
-		amd_check_l3_disable(this_leaf, index);
+		amd_init_l3_cache(this_leaf, index);
 	} else {
 		cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
 	}
@@ -983,30 +944,48 @@ define_one_ro(size);
 define_one_ro(shared_cpu_map);
 define_one_ro(shared_cpu_list);
 
-#define DEFAULT_SYSFS_CACHE_ATTRS	\
-	&type.attr,			\
-	&level.attr,			\
-	&coherency_line_size.attr,	\
-	&physical_line_partition.attr,	\
-	&ways_of_associativity.attr,	\
-	&number_of_sets.attr,		\
-	&size.attr,			\
-	&shared_cpu_map.attr,		\
-	&shared_cpu_list.attr
-
 static struct attribute *default_attrs[] = {
-	DEFAULT_SYSFS_CACHE_ATTRS,
+	&type.attr,
+	&level.attr,
+	&coherency_line_size.attr,
+	&physical_line_partition.attr,
+	&ways_of_associativity.attr,
+	&number_of_sets.attr,
+	&size.attr,
+	&shared_cpu_map.attr,
+	&shared_cpu_list.attr,
 	NULL
 };
 
-static struct attribute *default_l3_attrs[] = {
-	DEFAULT_SYSFS_CACHE_ATTRS,
 #ifdef CONFIG_AMD_NB
-	&cache_disable_0.attr,
-	&cache_disable_1.attr,
+static struct attribute ** __cpuinit amd_l3_attrs(void)
+{
+	static struct attribute **attrs;
+	int n;
+
+	if (attrs)
+		return attrs;
+
+	n = sizeof (default_attrs) / sizeof (struct attribute *);
+
+	if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+		n += 2;
+
+	attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL);
+	if (attrs == NULL)
+		return attrs = default_attrs;
+
+	for (n = 0; default_attrs[n]; n++)
+		attrs[n] = default_attrs[n];
+
+	if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
+		attrs[n++] = &cache_disable_0.attr;
+		attrs[n++] = &cache_disable_1.attr;
+	}
+
+	return attrs;
+}
 #endif
-	NULL
-};
 
 static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
 {
@@ -1117,11 +1096,11 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
 
 		this_leaf = CPUID4_INFO_IDX(cpu, i);
 
-		if (this_leaf->l3 && this_leaf->l3->can_disable)
-			ktype_cache.default_attrs = default_l3_attrs;
-		else
-			ktype_cache.default_attrs = default_attrs;
-
+		ktype_cache.default_attrs = default_attrs;
+#ifdef CONFIG_AMD_NB
+		if (this_leaf->l3)
+			ktype_cache.default_attrs = amd_l3_attrs();
+#endif
 		retval = kobject_init_and_add(&(this_object->kobj),
 					      &ktype_cache,
 					      per_cpu(ici_cache_kobject, cpu),
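
The sysfs change above replaces two parallel static attribute arrays with amd_l3_attrs(), which builds one NULL-terminated list at first use and appends the two cache_disable_* entries only when the northbridge advertises AMD_NB_L3_INDEX_DISABLE. A userspace analogue of the same build-once pattern, names hypothetical (the kernel version uses kzalloc and struct attribute):

#include <stdlib.h>

/* Build-once, optionally-extended, NULL-terminated pointer list. */
static const char **build_attr_list(const char **base, size_t nbase,
				    const char **extra, size_t nextra,
				    int want_extra)
{
	static const char **cached;
	size_t n, i;

	if (cached)			/* built once, then reused */
		return cached;

	n = nbase + (want_extra ? nextra : 0);
	cached = calloc(n + 1, sizeof(*cached));	/* +1 for NULL */
	if (!cached)
		return base;		/* fall back to the plain set */

	for (i = 0; i < nbase; i++)
		cached[i] = base[i];
	if (want_extra)
		for (i = 0; i < nextra; i++)
			cached[nbase + i] = extra[i];
	return cached;
}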
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 80c482382d5c..5bf2fac52aca 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -31,8 +31,6 @@
 #include <asm/mce.h>
 #include <asm/msr.h>
 
-#define PFX               "mce_threshold: "
-#define VERSION           "version 1.1.1"
 #define NR_BANKS          6
 #define NR_BLOCKS         9
 #define THRESHOLD_MAX     0xFFF
@@ -59,12 +57,6 @@ struct threshold_block {
 	struct list_head miscj;
 };
 
-/* defaults used early on boot */
-static struct threshold_block threshold_defaults = {
-	.interrupt_enable	= 0,
-	.threshold_limit	= THRESHOLD_MAX,
-};
-
 struct threshold_bank {
 	struct kobject *kobj;
 	struct threshold_block *blocks;
@@ -89,50 +81,101 @@ static void amd_threshold_interrupt(void);
 struct thresh_restart {
 	struct threshold_block	*b;
 	int			reset;
+	int			set_lvt_off;
+	int			lvt_off;
 	u16			old_limit;
 };
 
+static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
+{
+	int msr = (hi & MASK_LVTOFF_HI) >> 20;
+
+	if (apic < 0) {
+		pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
+		       "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
+		       b->bank, b->block, b->address, hi, lo);
+		return 0;
+	}
+
+	if (apic != msr) {
+		pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
+		       "for bank %d, block %d (MSR%08X=0x%x%08x)\n",
+		       b->cpu, apic, b->bank, b->block, b->address, hi, lo);
+		return 0;
+	}
+
+	return 1;
+};
+
 /* must be called with correct cpu affinity */
 /* Called via smp_call_function_single() */
 static void threshold_restart_bank(void *_tr)
 {
 	struct thresh_restart *tr = _tr;
-	u32 mci_misc_hi, mci_misc_lo;
+	u32 hi, lo;
 
-	rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
+	rdmsr(tr->b->address, lo, hi);
 
-	if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
+	if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
 		tr->reset = 1;	/* limit cannot be lower than err count */
 
 	if (tr->reset) {		/* reset err count and overflow bit */
-		mci_misc_hi =
-		    (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
+		hi =
+		    (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
 		    (THRESHOLD_MAX - tr->b->threshold_limit);
 	} else if (tr->old_limit) {	/* change limit w/o reset */
-		int new_count = (mci_misc_hi & THRESHOLD_MAX) +
+		int new_count = (hi & THRESHOLD_MAX) +
 		    (tr->old_limit - tr->b->threshold_limit);
 
-		mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
+		hi = (hi & ~MASK_ERR_COUNT_HI) |
 		    (new_count & THRESHOLD_MAX);
 	}
 
+	if (tr->set_lvt_off) {
+		if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
+			/* set new lvt offset */
+			hi &= ~MASK_LVTOFF_HI;
+			hi |= tr->lvt_off << 20;
+		}
+	}
+
 	tr->b->interrupt_enable ?
-	    (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
-	    (mci_misc_hi &= ~MASK_INT_TYPE_HI);
+	    (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
+	    (hi &= ~MASK_INT_TYPE_HI);
 
-	mci_misc_hi |= MASK_COUNT_EN_HI;
-	wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
+	hi |= MASK_COUNT_EN_HI;
+	wrmsr(tr->b->address, lo, hi);
+}
+
+static void mce_threshold_block_init(struct threshold_block *b, int offset)
+{
+	struct thresh_restart tr = {
+		.b		= b,
+		.set_lvt_off	= 1,
+		.lvt_off	= offset,
+	};
+
+	b->threshold_limit	= THRESHOLD_MAX;
+	threshold_restart_bank(&tr);
+};
+
+static int setup_APIC_mce(int reserved, int new)
+{
+	if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
+					      APIC_EILVT_MSG_FIX, 0))
+		return new;
+
+	return reserved;
 }
 
 /* cpu init entry point, called from mce.c with preempt off */
 void mce_amd_feature_init(struct cpuinfo_x86 *c)
 {
+	struct threshold_block b;
 	unsigned int cpu = smp_processor_id();
 	u32 low = 0, high = 0, address = 0;
 	unsigned int bank, block;
-	struct thresh_restart tr;
-	int lvt_off = -1;
-	u8 offset;
+	int offset = -1;
 
 	for (bank = 0; bank < NR_BANKS; ++bank) {
 		for (block = 0; block < NR_BLOCKS; ++block) {
@@ -163,39 +206,16 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
 			if (shared_bank[bank] && c->cpu_core_id)
 				break;
 #endif
-			offset = (high & MASK_LVTOFF_HI) >> 20;
-			if (lvt_off < 0) {
-				if (setup_APIC_eilvt(offset,
-						     THRESHOLD_APIC_VECTOR,
-						     APIC_EILVT_MSG_FIX, 0)) {
-					pr_err(FW_BUG "cpu %d, failed to "
-					       "setup threshold interrupt "
-					       "for bank %d, block %d "
-					       "(MSR%08X=0x%x%08x)",
-					       smp_processor_id(), bank, block,
-					       address, high, low);
-					continue;
-				}
-				lvt_off = offset;
-			} else if (lvt_off != offset) {
-				pr_err(FW_BUG "cpu %d, invalid threshold "
-				       "interrupt offset %d for bank %d,"
-				       "block %d (MSR%08X=0x%x%08x)",
-				       smp_processor_id(), lvt_off, bank,
-				       block, address, high, low);
-				continue;
-			}
-
-			high &= ~MASK_LVTOFF_HI;
-			high |= lvt_off << 20;
-			wrmsr(address, low, high);
+			offset = setup_APIC_mce(offset,
+						(high & MASK_LVTOFF_HI) >> 20);
 
-			threshold_defaults.address = address;
-			tr.b = &threshold_defaults;
-			tr.reset = 0;
-			tr.old_limit = 0;
-			threshold_restart_bank(&tr);
+			memset(&b, 0, sizeof(b));
+			b.cpu		= cpu;
+			b.bank		= bank;
+			b.block		= block;
+			b.address	= address;
 
+			mce_threshold_block_init(&b, offset);
 			mce_threshold_vector = amd_threshold_interrupt;
 		}
 	}
@@ -298,9 +318,8 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
 
 	b->interrupt_enable = !!new;
 
+	memset(&tr, 0, sizeof(tr));
 	tr.b		= b;
-	tr.reset	= 0;
-	tr.old_limit	= 0;
 
 	smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
 
@@ -321,10 +340,10 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
 	if (new < 1)
 		new = 1;
 
+	memset(&tr, 0, sizeof(tr));
 	tr.old_limit = b->threshold_limit;
 	b->threshold_limit = new;
 	tr.b = b;
-	tr.reset = 0;
 
 	smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
 
@@ -603,9 +622,9 @@ static __cpuinit int threshold_create_device(unsigned int cpu)
 			continue;
 		err = threshold_create_bank(cpu, bank);
 		if (err)
-			goto out;
+			return err;
 	}
-out:
+
 	return err;
 }
 
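
The mce_amd.c rework above collapses the old first-offset/mismatch branching into setup_APIC_mce(): the first LVT offset that programs cleanly is latched, every later bank/block reuses it, and lvt_off_valid() reports firmware bugs when the MSR disagrees. A standalone demo of the latching idiom (userspace sketch; setup_APIC_eilvt() is stubbed out as always succeeding):

#include <stdio.h>

/* Stand-in for setup_APIC_eilvt(): pretend programming always succeeds. */
static int setup_eilvt_ok(int new) { (void)new; return 1; }

/* Mirrors setup_APIC_mce(): keep the first offset that programs cleanly. */
static int setup_mce_offset(int reserved, int new)
{
	if (reserved < 0 && setup_eilvt_ok(new))
		return new;
	return reserved;
}

int main(void)
{
	int offset = -1;
	int from_msr[] = { 1, 1, 1 };	/* offsets read back per block */

	for (int i = 0; i < 3; i++)
		offset = setup_mce_offset(offset, from_msr[i]);

	printf("latched LVT offset: %d\n", offset);	/* prints 1 */
	return 0;
}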
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 4b683267eca5..e12246ff5aa6 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -53,8 +53,13 @@ struct thermal_state {
 	struct _thermal_state core_power_limit;
 	struct _thermal_state package_throttle;
 	struct _thermal_state package_power_limit;
+	struct _thermal_state core_thresh0;
+	struct _thermal_state core_thresh1;
 };
 
+/* Callback to handle core threshold interrupts */
+int (*platform_thermal_notify)(__u64 msr_val);
+
 static DEFINE_PER_CPU(struct thermal_state, thermal_state);
 
 static atomic_t therm_throt_en = ATOMIC_INIT(0);
@@ -200,6 +205,22 @@ static int therm_throt_process(bool new_event, int event, int level)
 	return 0;
 }
 
+static int thresh_event_valid(int event)
+{
+	struct _thermal_state *state;
+	unsigned int this_cpu = smp_processor_id();
+	struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
+	u64 now = get_jiffies_64();
+
+	state = (event == 0) ? &pstate->core_thresh0 : &pstate->core_thresh1;
+
+	if (time_before64(now, state->next_check))
+		return 0;
+
+	state->next_check = now + CHECK_INTERVAL;
+	return 1;
+}
+
 #ifdef CONFIG_SYSFS
 /* Add/Remove thermal_throttle interface for CPU device: */
 static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
@@ -313,6 +334,22 @@ device_initcall(thermal_throttle_init_device);
 #define PACKAGE_THROTTLED	((__u64)2 << 62)
 #define PACKAGE_POWER_LIMIT	((__u64)3 << 62)
 
+static void notify_thresholds(__u64 msr_val)
+{
+	/* check whether the interrupt handler is defined;
+	 * otherwise simply return
+	 */
+	if (!platform_thermal_notify)
+		return;
+
+	/* lower threshold reached */
+	if ((msr_val & THERM_LOG_THRESHOLD0) && thresh_event_valid(0))
+		platform_thermal_notify(msr_val);
+	/* higher threshold reached */
+	if ((msr_val & THERM_LOG_THRESHOLD1) && thresh_event_valid(1))
+		platform_thermal_notify(msr_val);
+}
+
 /* Thermal transition interrupt handler */
 static void intel_thermal_interrupt(void)
 {
@@ -321,6 +358,9 @@ static void intel_thermal_interrupt(void)
 
 	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
 
+	/* Check for violation of core thermal thresholds*/
+	notify_thresholds(msr_val);
+
 	if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
 				THERMAL_THROTTLING_EVENT,
 				CORE_LEVEL) != 0)
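
therm_throt.c now exposes a platform_thermal_notify hook that fires from the thermal interrupt when a core threshold bit is logged, rate-limited by thresh_event_valid(). A minimal sketch of a consumer; the module and handler names are hypothetical, only the hook itself comes from this diff:

#include <linux/module.h>
#include <linux/types.h>

extern int (*platform_thermal_notify)(__u64 msr_val);

/* Hypothetical handler: just log the raw THERM_STATUS value. */
static int example_thermal_notify(__u64 msr_val)
{
	pr_info("core thermal threshold event, msr=0x%llx\n",
		(unsigned long long)msr_val);
	return 0;
}

static int __init example_thermal_init(void)
{
	platform_thermal_notify = example_thermal_notify;
	return 0;
}

static void __exit example_thermal_exit(void)
{
	platform_thermal_notify = NULL;
}

module_init(example_thermal_init);
module_exit(example_thermal_exit);
MODULE_LICENSE("GPL");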
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 4572f25f9325..cd28a350f7f9 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -240,7 +240,7 @@ static int __init setup_early_printk(char *buf)
 	if (!strncmp(buf, "xen", 3))
 		early_console_register(&xenboot_console, keep);
 #endif
-#ifdef CONFIG_X86_MRST_EARLY_PRINTK
+#ifdef CONFIG_EARLY_PRINTK_MRST
 	if (!strncmp(buf, "mrst", 4)) {
 		mrst_early_console_init();
 		early_console_register(&early_mrst_console, keep);
@@ -250,7 +250,6 @@ static int __init setup_early_printk(char *buf)
 			hsu_early_console_init();
 			early_console_register(&early_hsu_console, keep);
 		}
-
 #endif
 		buf++;
 	}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 3afb33f14d2d..298448656b60 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -19,6 +19,7 @@
 #include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/list.h>
+#include <linux/module.h>
 
 #include <trace/syscall.h>
 
@@ -49,6 +50,7 @@ static DEFINE_PER_CPU(int, save_modifying_code);
 int ftrace_arch_code_modify_prepare(void)
 {
 	set_kernel_text_rw();
+	set_all_modules_text_rw();
 	modifying_code = 1;
 	return 0;
 }
@@ -56,6 +58,7 @@ int ftrace_arch_code_modify_prepare(void)
 int ftrace_arch_code_modify_post_process(void)
 {
 	modifying_code = 0;
+	set_all_modules_text_ro();
 	set_kernel_text_ro();
 	return 0;
 }
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 763310165fa0..7f138b3c3c52 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -61,6 +61,9 @@ void __init i386_start_kernel(void)
 	case X86_SUBARCH_MRST:
 		x86_mrst_early_setup();
 		break;
+	case X86_SUBARCH_CE4100:
+		x86_ce4100_early_setup();
+		break;
 	default:
 		i386_default_early_setup();
 		break;
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index c0dbd9ac24f0..9f54b209c378 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -139,39 +139,6 @@ ENTRY(startup_32)
139 movl %eax, pa(olpc_ofw_pgd) 139 movl %eax, pa(olpc_ofw_pgd)
140#endif 140#endif
141 141
142#ifdef CONFIG_PARAVIRT
143	/* This can only trip for a broken bootloader... */
144 cmpw $0x207, pa(boot_params + BP_version)
145 jb default_entry
146
147 /* Paravirt-compatible boot parameters. Look to see what architecture
148 we're booting under. */
149 movl pa(boot_params + BP_hardware_subarch), %eax
150 cmpl $num_subarch_entries, %eax
151 jae bad_subarch
152
153 movl pa(subarch_entries)(,%eax,4), %eax
154 subl $__PAGE_OFFSET, %eax
155 jmp *%eax
156
157bad_subarch:
158WEAK(lguest_entry)
159WEAK(xen_entry)
160 /* Unknown implementation; there's really
161 nothing we can do at this point. */
162 ud2a
163
164 __INITDATA
165
166subarch_entries:
167 .long default_entry /* normal x86/PC */
168 .long lguest_entry /* lguest hypervisor */
169 .long xen_entry /* Xen hypervisor */
170 .long default_entry /* Moorestown MID */
171num_subarch_entries = (. - subarch_entries) / 4
172.previous
173#endif /* CONFIG_PARAVIRT */
174
175/* 142/*
176 * Initialize page tables. This creates a PDE and a set of page 143 * Initialize page tables. This creates a PDE and a set of page
177 * tables, which are located immediately beyond __brk_base. The variable 144 * tables, which are located immediately beyond __brk_base. The variable
@@ -181,7 +148,6 @@ num_subarch_entries = (. - subarch_entries) / 4
181 * 148 *
182 * Note that the stack is not yet set up! 149 * Note that the stack is not yet set up!
183 */ 150 */
184default_entry:
185#ifdef CONFIG_X86_PAE 151#ifdef CONFIG_X86_PAE
186 152
187 /* 153 /*
@@ -261,7 +227,42 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
261 movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax 227 movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
262 movl %eax,pa(initial_page_table+0xffc) 228 movl %eax,pa(initial_page_table+0xffc)
263#endif 229#endif
264 jmp 3f 230
231#ifdef CONFIG_PARAVIRT
232	/* This can only trip for a broken bootloader... */
233 cmpw $0x207, pa(boot_params + BP_version)
234 jb default_entry
235
236 /* Paravirt-compatible boot parameters. Look to see what architecture
237 we're booting under. */
238 movl pa(boot_params + BP_hardware_subarch), %eax
239 cmpl $num_subarch_entries, %eax
240 jae bad_subarch
241
242 movl pa(subarch_entries)(,%eax,4), %eax
243 subl $__PAGE_OFFSET, %eax
244 jmp *%eax
245
246bad_subarch:
247WEAK(lguest_entry)
248WEAK(xen_entry)
249 /* Unknown implementation; there's really
250 nothing we can do at this point. */
251 ud2a
252
253 __INITDATA
254
255subarch_entries:
256 .long default_entry /* normal x86/PC */
257 .long lguest_entry /* lguest hypervisor */
258 .long xen_entry /* Xen hypervisor */
259 .long default_entry /* Moorestown MID */
260num_subarch_entries = (. - subarch_entries) / 4
261.previous
262#else
263 jmp default_entry
264#endif /* CONFIG_PARAVIRT */
265
265/* 266/*
266 * Non-boot CPU entry point; entered from trampoline.S 267 * Non-boot CPU entry point; entered from trampoline.S
267 * We can't lgdt here, because lgdt itself uses a data segment, but 268 * We can't lgdt here, because lgdt itself uses a data segment, but
@@ -282,7 +283,7 @@ ENTRY(startup_32_smp)
282 movl %eax,%fs 283 movl %eax,%fs
283 movl %eax,%gs 284 movl %eax,%gs
284#endif /* CONFIG_SMP */ 285#endif /* CONFIG_SMP */
2853: 286default_entry:
286 287
287/* 288/*
288 * New page tables may be in 4Mbyte page mode and may 289 * New page tables may be in 4Mbyte page mode and may
@@ -316,6 +317,10 @@ ENTRY(startup_32_smp)
316 subl $0x80000001, %eax 317 subl $0x80000001, %eax
317 cmpl $(0x8000ffff-0x80000001), %eax 318 cmpl $(0x8000ffff-0x80000001), %eax
318 ja 6f 319 ja 6f
320
321 /* Clear bogus XD_DISABLE bits */
322 call verify_cpu
323
319 mov $0x80000001, %eax 324 mov $0x80000001, %eax
320 cpuid 325 cpuid
321 /* Execute Disable bit supported? */ 326 /* Execute Disable bit supported? */
@@ -611,6 +616,8 @@ ignore_int:
611#endif 616#endif
612 iret 617 iret
613 618
619#include "verify_cpu.S"
620
614 __REFDATA 621 __REFDATA
615.align 4 622.align 4
616ENTRY(initial_code) 623ENTRY(initial_code)
@@ -622,13 +629,13 @@ ENTRY(initial_code)
622__PAGE_ALIGNED_BSS 629__PAGE_ALIGNED_BSS
623 .align PAGE_SIZE_asm 630 .align PAGE_SIZE_asm
624#ifdef CONFIG_X86_PAE 631#ifdef CONFIG_X86_PAE
625ENTRY(initial_pg_pmd) 632initial_pg_pmd:
626 .fill 1024*KPMDS,4,0 633 .fill 1024*KPMDS,4,0
627#else 634#else
628ENTRY(initial_page_table) 635ENTRY(initial_page_table)
629 .fill 1024,4,0 636 .fill 1024,4,0
630#endif 637#endif
631ENTRY(initial_pg_fixmap) 638initial_pg_fixmap:
632 .fill 1024,4,0 639 .fill 1024,4,0
633ENTRY(empty_zero_page) 640ENTRY(empty_zero_page)
634 .fill 4096,1,0 641 .fill 4096,1,0
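Note: moving the paravirt subarch dispatch below the page-table setup means every subarch entry now runs with the early page tables already built, which is what lets lguest drop its copied page-table code later in this diff. For readability, a C rendering of what the assembly dispatch does; the *_entry_c names are purely illustrative:

	/* illustrative C rendering of the assembly dispatch above */
	static void default_entry_c(void), lguest_entry_c(void), xen_entry_c(void);

	static void (*const subarch_entries_c[])(void) = {
		default_entry_c,	/* 0: normal x86/PC     */
		lguest_entry_c,		/* 1: lguest hypervisor */
		xen_entry_c,		/* 2: Xen hypervisor    */
		default_entry_c,	/* 3: Moorestown MID    */
	};

	static void dispatch_subarch(const struct boot_params *bp)
	{
		u32 subarch = bp->hdr.hardware_subarch;

		/* old bootloader or out-of-range subarch: take the default path */
		if (bp->hdr.version < 0x0207 ||
		    subarch >= ARRAY_SIZE(subarch_entries_c))
			default_entry_c();
		else
			subarch_entries_c[subarch]();
	}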
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index ce0cb4721c9a..0fe6d1a66c38 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -155,12 +155,6 @@ static int apply_microcode_amd(int cpu)
155 return 0; 155 return 0;
156} 156}
157 157
158static int get_ucode_data(void *to, const u8 *from, size_t n)
159{
160 memcpy(to, from, n);
161 return 0;
162}
163
164static void * 158static void *
165get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) 159get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
166{ 160{
@@ -168,8 +162,7 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
168 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; 162 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
169 void *mc; 163 void *mc;
170 164
171 if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR)) 165 get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR);
172 return NULL;
173 166
174 if (section_hdr[0] != UCODE_UCODE_TYPE) { 167 if (section_hdr[0] != UCODE_UCODE_TYPE) {
175 pr_err("error: invalid type field in container file section header\n"); 168 pr_err("error: invalid type field in container file section header\n");
@@ -183,16 +176,13 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
183 return NULL; 176 return NULL;
184 } 177 }
185 178
186 mc = vmalloc(UCODE_MAX_SIZE); 179 mc = vzalloc(UCODE_MAX_SIZE);
187 if (mc) { 180 if (!mc)
188 memset(mc, 0, UCODE_MAX_SIZE); 181 return NULL;
189 if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, 182
190 total_size)) { 183 get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size);
191 vfree(mc); 184 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
192 mc = NULL; 185
193 } else
194 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
195 }
196 return mc; 186 return mc;
197} 187}
198 188
@@ -202,8 +192,7 @@ static int install_equiv_cpu_table(const u8 *buf)
202 unsigned int *buf_pos = (unsigned int *)container_hdr; 192 unsigned int *buf_pos = (unsigned int *)container_hdr;
203 unsigned long size; 193 unsigned long size;
204 194
205 if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE)) 195 get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE);
206 return 0;
207 196
208 size = buf_pos[2]; 197 size = buf_pos[2];
209 198
@@ -219,10 +208,7 @@ static int install_equiv_cpu_table(const u8 *buf)
219 } 208 }
220 209
221 buf += UCODE_CONTAINER_HEADER_SIZE; 210 buf += UCODE_CONTAINER_HEADER_SIZE;
222 if (get_ucode_data(equiv_cpu_table, buf, size)) { 211 get_ucode_data(equiv_cpu_table, buf, size);
223 vfree(equiv_cpu_table);
224 return 0;
225 }
226 212
227 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ 213 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
228} 214}
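Note: get_ucode_data() disappears here while its callers remain, so the helper has presumably moved into a header (asm/microcode.h is the natural home) as a trivial inline, something like:

	/* assumed new form of the helper, now that error returns are gone */
	static inline void get_ucode_data(void *to, const u8 *from, size_t n)
	{
		memcpy(to, from, n);
	}

Since memcpy() cannot fail, dropping the error-return plumbing also lets the vmalloc()+memset() pair above collapse into vzalloc() with a simple early return.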
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index ba0f0ca9f280..c01ffa5b9b87 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -143,7 +143,7 @@ static void flush_gart(void)
143 143
144 spin_lock_irqsave(&iommu_bitmap_lock, flags); 144 spin_lock_irqsave(&iommu_bitmap_lock, flags);
145 if (need_flush) { 145 if (need_flush) {
146 k8_flush_garts(); 146 amd_flush_garts();
147 need_flush = false; 147 need_flush = false;
148 } 148 }
149 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 149 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
@@ -561,17 +561,17 @@ static void enable_gart_translations(void)
561{ 561{
562 int i; 562 int i;
563 563
564 if (!k8_northbridges.gart_supported) 564 if (!amd_nb_has_feature(AMD_NB_GART))
565 return; 565 return;
566 566
567 for (i = 0; i < k8_northbridges.num; i++) { 567 for (i = 0; i < amd_nb_num(); i++) {
568 struct pci_dev *dev = k8_northbridges.nb_misc[i]; 568 struct pci_dev *dev = node_to_amd_nb(i)->misc;
569 569
570 enable_gart_translation(dev, __pa(agp_gatt_table)); 570 enable_gart_translation(dev, __pa(agp_gatt_table));
571 } 571 }
572 572
573 /* Flush the GART-TLB to remove stale entries */ 573 /* Flush the GART-TLB to remove stale entries */
574 k8_flush_garts(); 574 amd_flush_garts();
575} 575}
576 576
577/* 577/*
@@ -596,13 +596,13 @@ static void gart_fixup_northbridges(struct sys_device *dev)
596 if (!fix_up_north_bridges) 596 if (!fix_up_north_bridges)
597 return; 597 return;
598 598
599 if (!k8_northbridges.gart_supported) 599 if (!amd_nb_has_feature(AMD_NB_GART))
600 return; 600 return;
601 601
602 pr_info("PCI-DMA: Restoring GART aperture settings\n"); 602 pr_info("PCI-DMA: Restoring GART aperture settings\n");
603 603
604 for (i = 0; i < k8_northbridges.num; i++) { 604 for (i = 0; i < amd_nb_num(); i++) {
605 struct pci_dev *dev = k8_northbridges.nb_misc[i]; 605 struct pci_dev *dev = node_to_amd_nb(i)->misc;
606 606
607 /* 607 /*
608 * Don't enable translations just yet. That is the next 608 * Don't enable translations just yet. That is the next
@@ -644,7 +644,7 @@ static struct sys_device device_gart = {
644 * Private Northbridge GATT initialization in case we cannot use the 644 * Private Northbridge GATT initialization in case we cannot use the
645 * AGP driver for some reason. 645 * AGP driver for some reason.
646 */ 646 */
647static __init int init_k8_gatt(struct agp_kern_info *info) 647static __init int init_amd_gatt(struct agp_kern_info *info)
648{ 648{
649 unsigned aper_size, gatt_size, new_aper_size; 649 unsigned aper_size, gatt_size, new_aper_size;
650 unsigned aper_base, new_aper_base; 650 unsigned aper_base, new_aper_base;
@@ -656,8 +656,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
656 656
657 aper_size = aper_base = info->aper_size = 0; 657 aper_size = aper_base = info->aper_size = 0;
658 dev = NULL; 658 dev = NULL;
659 for (i = 0; i < k8_northbridges.num; i++) { 659 for (i = 0; i < amd_nb_num(); i++) {
660 dev = k8_northbridges.nb_misc[i]; 660 dev = node_to_amd_nb(i)->misc;
661 new_aper_base = read_aperture(dev, &new_aper_size); 661 new_aper_base = read_aperture(dev, &new_aper_size);
662 if (!new_aper_base) 662 if (!new_aper_base)
663 goto nommu; 663 goto nommu;
@@ -725,13 +725,13 @@ static void gart_iommu_shutdown(void)
725 if (!no_agp) 725 if (!no_agp)
726 return; 726 return;
727 727
728 if (!k8_northbridges.gart_supported) 728 if (!amd_nb_has_feature(AMD_NB_GART))
729 return; 729 return;
730 730
731 for (i = 0; i < k8_northbridges.num; i++) { 731 for (i = 0; i < amd_nb_num(); i++) {
732 u32 ctl; 732 u32 ctl;
733 733
734 dev = k8_northbridges.nb_misc[i]; 734 dev = node_to_amd_nb(i)->misc;
735 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); 735 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
736 736
737 ctl &= ~GARTEN; 737 ctl &= ~GARTEN;
@@ -749,14 +749,14 @@ int __init gart_iommu_init(void)
749 unsigned long scratch; 749 unsigned long scratch;
750 long i; 750 long i;
751 751
752 if (!k8_northbridges.gart_supported) 752 if (!amd_nb_has_feature(AMD_NB_GART))
753 return 0; 753 return 0;
754 754
755#ifndef CONFIG_AGP_AMD64 755#ifndef CONFIG_AGP_AMD64
756 no_agp = 1; 756 no_agp = 1;
757#else 757#else
758 /* Makefile puts PCI initialization via subsys_initcall first. */ 758 /* Makefile puts PCI initialization via subsys_initcall first. */
759 /* Add other K8 AGP bridge drivers here */ 759 /* Add other AMD AGP bridge drivers here */
760 no_agp = no_agp || 760 no_agp = no_agp ||
761 (agp_amd64_init() < 0) || 761 (agp_amd64_init() < 0) ||
762 (agp_copy_info(agp_bridge, &info) < 0); 762 (agp_copy_info(agp_bridge, &info) < 0);
@@ -765,7 +765,7 @@ int __init gart_iommu_init(void)
765 if (no_iommu || 765 if (no_iommu ||
766 (!force_iommu && max_pfn <= MAX_DMA32_PFN) || 766 (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
767 !gart_iommu_aperture || 767 !gart_iommu_aperture ||
768 (no_agp && init_k8_gatt(&info) < 0)) { 768 (no_agp && init_amd_gatt(&info) < 0)) {
769 if (max_pfn > MAX_DMA32_PFN) { 769 if (max_pfn > MAX_DMA32_PFN) {
770 pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); 770 pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
771 pr_warning("falling back to iommu=soft.\n"); 771 pr_warning("falling back to iommu=soft.\n");
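Note: the open-coded k8_northbridges globals give way to accessor functions. The iteration idiom now used throughout this file looks like this; for_each_gart_nb() is a hypothetical wrapper, while the accessors are the real new API shown in the hunks above:

	/* hypothetical wrapper around the new amd_nb accessors */
	static void for_each_gart_nb(void (*fn)(struct pci_dev *misc))
	{
		int i;

		if (!amd_nb_has_feature(AMD_NB_GART))
			return;

		for (i = 0; i < amd_nb_num(); i++)
			fn(node_to_amd_nb(i)->misc);
	}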
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index fda313ebbb03..c8e41e90f59c 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -43,17 +43,33 @@ static void rdc321x_reset(struct pci_dev *dev)
43 outb(1, 0x92); 43 outb(1, 0x92);
44} 44}
45 45
46static void ce4100_reset(struct pci_dev *dev)
47{
48 int i;
49
50 for (i = 0; i < 10; i++) {
51 outb(0x2, 0xcf9);
52 udelay(50);
53 }
54}
55
46struct device_fixup { 56struct device_fixup {
47 unsigned int vendor; 57 unsigned int vendor;
48 unsigned int device; 58 unsigned int device;
49 void (*reboot_fixup)(struct pci_dev *); 59 void (*reboot_fixup)(struct pci_dev *);
50}; 60};
51 61
62/*
63 * PCI ids solely used for fixups_table go here
64 */
65#define PCI_DEVICE_ID_INTEL_CE4100 0x0708
66
52static const struct device_fixup fixups_table[] = { 67static const struct device_fixup fixups_table[] = {
53{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, 68{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
54{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, 69{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
55{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset }, 70{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset },
56{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset }, 71{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset },
72{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CE4100, ce4100_reset },
57}; 73};
58 74
59/* 75/*
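Note: for context, fixups_table is consumed at reboot time by probing for each listed device; roughly (a sketch of the walk this file performs, not part of the patch):

	/* sketch: how fixups_table is walked at reboot time */
	static void run_reboot_fixups(void)
	{
		const struct device_fixup *fix;
		struct pci_dev *dev;
		size_t i;

		for (i = 0; i < ARRAY_SIZE(fixups_table); i++) {
			fix = &fixups_table[i];
			dev = pci_get_device(fix->vendor, fix->device, NULL);
			if (dev) {
				fix->reboot_fixup(dev);	/* e.g. ce4100_reset() */
				pci_dev_put(dev);
			}
		}
	}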
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index a0f52af256a0..d3cfe26c0252 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -705,7 +705,7 @@ static u64 __init get_max_mapped(void)
705void __init setup_arch(char **cmdline_p) 705void __init setup_arch(char **cmdline_p)
706{ 706{
707 int acpi = 0; 707 int acpi = 0;
708 int k8 = 0; 708 int amd = 0;
709 unsigned long flags; 709 unsigned long flags;
710 710
711#ifdef CONFIG_X86_32 711#ifdef CONFIG_X86_32
@@ -991,12 +991,12 @@ void __init setup_arch(char **cmdline_p)
991 acpi = acpi_numa_init(); 991 acpi = acpi_numa_init();
992#endif 992#endif
993 993
994#ifdef CONFIG_K8_NUMA 994#ifdef CONFIG_AMD_NUMA
995 if (!acpi) 995 if (!acpi)
996 k8 = !k8_numa_init(0, max_pfn); 996 amd = !amd_numa_init(0, max_pfn);
997#endif 997#endif
998 998
999 initmem_init(0, max_pfn, acpi, k8); 999 initmem_init(0, max_pfn, acpi, amd);
1000 memblock_find_dma_reserve(); 1000 memblock_find_dma_reserve();
1001 dma32_reserve_bootmem(); 1001 dma32_reserve_bootmem();
1002 1002
@@ -1045,10 +1045,7 @@ void __init setup_arch(char **cmdline_p)
1045#endif 1045#endif
1046 1046
1047 init_apic_mappings(); 1047 init_apic_mappings();
1048 ioapic_init_mappings(); 1048 ioapic_and_gsi_init();
1049
1050 /* need to wait for io_apic is mapped */
1051 probe_nr_irqs_gsi();
1052 1049
1053 kvm_guest_init(); 1050 kvm_guest_init();
1054 1051
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 68f61ac632e1..ee886fe10ef4 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1161,6 +1161,20 @@ out:
1161 preempt_enable(); 1161 preempt_enable();
1162} 1162}
1163 1163
1164void arch_disable_nonboot_cpus_begin(void)
1165{
1166 /*
1167 * Avoid the smp alternatives switch during the disable_nonboot_cpus().
1168	 * In the suspend path, we will be back in SMP mode shortly anyway.
1169 */
1170 skip_smp_alternatives = true;
1171}
1172
1173void arch_disable_nonboot_cpus_end(void)
1174{
1175 skip_smp_alternatives = false;
1176}
1177
1164void arch_enable_nonboot_cpus_begin(void) 1178void arch_enable_nonboot_cpus_begin(void)
1165{ 1179{
1166 set_mtrr_aps_delayed_init(); 1180 set_mtrr_aps_delayed_init();
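Note: these two functions override weak no-op hooks that the generic disable_nonboot_cpus() path calls around CPU teardown. The bracketing, schematically (a sketch assuming the generic weak-default pattern, not the literal kernel/cpu.c code):

	/* weak defaults, overridden by the x86 versions above */
	void __weak arch_disable_nonboot_cpus_begin(void) { }
	void __weak arch_disable_nonboot_cpus_end(void) { }

	static int disable_nonboot_cpus_sketch(void)
	{
		int cpu, error = 0;

		arch_disable_nonboot_cpus_begin();	/* x86: skip SMP alternatives */
		for_each_online_cpu(cpu) {
			if (cpu == 0)
				continue;
			error = cpu_down(cpu);
			if (error)
				break;
		}
		arch_disable_nonboot_cpus_end();	/* x86: allow switching again */

		return error;
	}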
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index 3af2dff58b21..075d130efcf9 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -127,7 +127,7 @@ startup_64:
127no_longmode: 127no_longmode:
128 hlt 128 hlt
129 jmp no_longmode 129 jmp no_longmode
130#include "verify_cpu_64.S" 130#include "verify_cpu.S"
131 131
132 # Careful these need to be in the same 64K segment as the above; 132 # Careful these need to be in the same 64K segment as the above;
133tidt: 133tidt:
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 0c40d8b72416..356a0d455cf9 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -872,6 +872,9 @@ __cpuinit int unsynchronized_tsc(void)
872 872
873 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 873 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
874 return 0; 874 return 0;
875
876 if (tsc_clocksource_reliable)
877 return 0;
875 /* 878 /*
876 * Intel systems are normally all synchronized. 879 * Intel systems are normally all synchronized.
877 * Exceptions must mark TSC as unstable: 880 * Exceptions must mark TSC as unstable:
@@ -879,14 +882,92 @@ __cpuinit int unsynchronized_tsc(void)
879 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { 882 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
880 /* assume multi socket systems are not synchronized: */ 883 /* assume multi socket systems are not synchronized: */
881 if (num_possible_cpus() > 1) 884 if (num_possible_cpus() > 1)
882 tsc_unstable = 1; 885 return 1;
883 } 886 }
884 887
885 return tsc_unstable; 888 return 0;
889}
890
891
892static void tsc_refine_calibration_work(struct work_struct *work);
893static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
894/**
895 * tsc_refine_calibration_work - Further refine tsc freq calibration
896 * @work: ignored.
897 *
898 * This function uses delayed work over a period of a
899 * second to further refine the TSC freq value. Since this is
900 * timer based, instead of loop based, we don't block the boot
901 * process while this longer calibration is done.
902 *
903 * If there are any calibration anomalies (too many SMIs, etc.),
904 * or the refined calibration is off by more than 1% from the fast
905 * early calibration, we throw out the new calibration and use the
906 * early calibration.
907 */
908static void tsc_refine_calibration_work(struct work_struct *work)
909{
910 static u64 tsc_start = -1, ref_start;
911 static int hpet;
912 u64 tsc_stop, ref_stop, delta;
913 unsigned long freq;
914
915 /* Don't bother refining TSC on unstable systems */
916 if (check_tsc_unstable())
917 goto out;
918
919 /*
920 * Since the work is started early in boot, we may be
921 * delayed the first time we expire. So set the workqueue
922 * again once we know timers are working.
923 */
924 if (tsc_start == -1) {
925 /*
926 * Only set hpet once, to avoid mixing hardware
927 * if the hpet becomes enabled later.
928 */
929 hpet = is_hpet_enabled();
930 schedule_delayed_work(&tsc_irqwork, HZ);
931 tsc_start = tsc_read_refs(&ref_start, hpet);
932 return;
933 }
934
935 tsc_stop = tsc_read_refs(&ref_stop, hpet);
936
937	/* hpet or pmtimer available? */
938 if (!hpet && !ref_start && !ref_stop)
939 goto out;
940
941	/* Check whether the sampling was disturbed by an SMI */
942 if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX)
943 goto out;
944
945 delta = tsc_stop - tsc_start;
946 delta *= 1000000LL;
947 if (hpet)
948 freq = calc_hpet_ref(delta, ref_start, ref_stop);
949 else
950 freq = calc_pmtimer_ref(delta, ref_start, ref_stop);
951
952 /* Make sure we're within 1% */
953 if (abs(tsc_khz - freq) > tsc_khz/100)
954 goto out;
955
956 tsc_khz = freq;
957 printk(KERN_INFO "Refined TSC clocksource calibration: "
958 "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000,
959 (unsigned long)tsc_khz % 1000);
960
961out:
962 clocksource_register_khz(&clocksource_tsc, tsc_khz);
886} 963}
887 964
888static void __init init_tsc_clocksource(void) 965
966static int __init init_tsc_clocksource(void)
889{ 967{
968 if (!cpu_has_tsc || tsc_disabled > 0)
969 return 0;
970
890 if (tsc_clocksource_reliable) 971 if (tsc_clocksource_reliable)
891 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; 972 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
892	/* lower the rating if we already know it's unstable: */ 973	/* lower the rating if we already know it's unstable: */
@@ -894,8 +975,14 @@ static void __init init_tsc_clocksource(void)
894 clocksource_tsc.rating = 0; 975 clocksource_tsc.rating = 0;
895 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; 976 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
896 } 977 }
897 clocksource_register_khz(&clocksource_tsc, tsc_khz); 978 schedule_delayed_work(&tsc_irqwork, 0);
979 return 0;
898} 980}
981/*
982 * We use device_initcall here, to ensure we run after the hpet
983 * is fully initialized, which may occur at fs_initcall time.
984 */
985device_initcall(init_tsc_clocksource);
899 986
900void __init tsc_init(void) 987void __init tsc_init(void)
901{ 988{
@@ -949,6 +1036,5 @@ void __init tsc_init(void)
949 mark_tsc_unstable("TSCs unsynchronized"); 1036 mark_tsc_unstable("TSCs unsynchronized");
950 1037
951 check_system_tsc_reliable(); 1038 check_system_tsc_reliable();
952 init_tsc_clocksource();
953} 1039}
954 1040
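Note: the refinement work re-arms itself once: the first expiry only records a start sample, the second computes the refined frequency. Stripped of the x86 specifics, the two-phase pattern is (read_counter() and finish_calibration() are placeholders, not real kernel API):

	extern u64 read_counter(void);			/* placeholder sample source */
	extern void finish_calibration(u64 start, u64 stop);	/* placeholder */

	static void refine_fn(struct work_struct *work);
	static DECLARE_DELAYED_WORK(refine_work, refine_fn);

	static void refine_fn(struct work_struct *work)
	{
		static u64 start = (u64)-1;

		if (start == (u64)-1) {
			/* phase 1: record a sample, re-arm for one more period */
			start = read_counter();
			schedule_delayed_work(&refine_work, HZ);
			return;
		}
		/* phase 2: a full period has elapsed, finish the calibration */
		finish_calibration(start, read_counter());
	}

Registering init_tsc_clocksource() as a device_initcall, rather than calling it from tsc_init(), is what guarantees the HPET, which may only come up at fs_initcall time, is available as a reference.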
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu.S
index 56a8c2a867d9..0edefc19a113 100644
--- a/arch/x86/kernel/verify_cpu_64.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -7,6 +7,7 @@
7 * Copyright (c) 2007 Andi Kleen (ak@suse.de) 7 * Copyright (c) 2007 Andi Kleen (ak@suse.de)
8 * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com) 8 * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com)
9 * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com) 9 * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com)
10 * Copyright (c) 2010 Kees Cook (kees.cook@canonical.com)
10 * 11 *
11 * This source code is licensed under the GNU General Public License, 12 * This source code is licensed under the GNU General Public License,
12 * Version 2. See the file COPYING for more details. 13 * Version 2. See the file COPYING for more details.
@@ -14,18 +15,17 @@
14 * This is common code that verifies whether the CPU supports 15 * This is common code that verifies whether the CPU supports
15 * long mode and SSE. It is not called directly; instead, this 16 * long mode and SSE. It is not called directly; instead, this
16 * file is included at various places and compiled in that context. 17 * file is included at various places and compiled in that context.
17 * Following are the current usage. 18 * This file is expected to run in 32bit code. Currently:
18 * 19 *
19 * This file is included by both 16bit and 32bit code. 20 * arch/x86/boot/compressed/head_64.S: Boot cpu verification
21 * arch/x86/kernel/trampoline_64.S: secondary processor verification
22 * arch/x86/kernel/head_32.S: processor startup
20 * 23 *
21 * arch/x86_64/boot/setup.S : Boot cpu verification (16bit) 24 * verify_cpu returns the status of longmode and SSE in register %eax.
22 * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit)
23 * arch/x86_64/kernel/trampoline.S: secondary processor verfication (16bit)
24 * arch/x86_64/kernel/acpi/wakeup.S:Verfication at resume (16bit)
25 *
26 * verify_cpu, returns the status of cpu check in register %eax.
27 * 0: Success 1: Failure 25 * 0: Success 1: Failure
28 * 26 *
27 * On Intel, the XD_DISABLE flag will be cleared as a side-effect.
28 *
29 * The caller needs to check for the error code and take the action 29 * The caller needs to check for the error code and take the action
30 * appropriately. Either display a message or halt. 30 * appropriately. Either display a message or halt.
31 */ 31 */
@@ -62,8 +62,41 @@ verify_cpu:
62 cmpl $0x444d4163,%ecx 62 cmpl $0x444d4163,%ecx
63 jnz verify_cpu_noamd 63 jnz verify_cpu_noamd
64 mov $1,%di # cpu is from AMD 64 mov $1,%di # cpu is from AMD
65 jmp verify_cpu_check
65 66
66verify_cpu_noamd: 67verify_cpu_noamd:
68 cmpl $0x756e6547,%ebx # GenuineIntel?
69 jnz verify_cpu_check
70 cmpl $0x49656e69,%edx
71 jnz verify_cpu_check
72 cmpl $0x6c65746e,%ecx
73 jnz verify_cpu_check
74
75	# only touch MSR_IA32_MISC_ENABLE when:
76 # family > 6 || (family == 6 && model >= 0xd)
77 movl $0x1, %eax # check CPU family and model
78 cpuid
79 movl %eax, %ecx
80
81 andl $0x0ff00f00, %eax # mask family and extended family
82 shrl $8, %eax
83 cmpl $6, %eax
84 ja verify_cpu_clear_xd # family > 6, ok
85 jb verify_cpu_check # family < 6, skip
86
87 andl $0x000f00f0, %ecx # mask model and extended model
88 shrl $4, %ecx
89 cmpl $0xd, %ecx
90 jb verify_cpu_check # family == 6, model < 0xd, skip
91
92verify_cpu_clear_xd:
93 movl $MSR_IA32_MISC_ENABLE, %ecx
94 rdmsr
95 btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE
96 jnc verify_cpu_check # only write MSR if bit was changed
97 wrmsr
98
99verify_cpu_check:
67 movl $0x1,%eax # Does the cpu have what it takes 100 movl $0x1,%eax # Does the cpu have what it takes
68 cpuid 101 cpuid
69 andl $REQUIRED_MASK0,%edx 102 andl $REQUIRED_MASK0,%edx
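Note: after an rdmsr of MSR_IA32_MISC_ENABLE, %edx holds bits 63:32, so btrl $2, %edx clears bit 34 of the MSR, the XD_DISABLE bit some BIOSes set. A C restatement, illustrative only since the real check must run this early in assembly, and assuming the MSR_IA32_MISC_ENABLE_XD_DISABLE define (bit 34) referenced in the comment above:

	/* C restatement of the XD_DISABLE clearing done in verify_cpu */
	static void clear_xd_disable(void)
	{
		u64 misc;

		rdmsrl(MSR_IA32_MISC_ENABLE, misc);
		if (misc & MSR_IA32_MISC_ENABLE_XD_DISABLE) {
			misc &= ~MSR_IA32_MISC_ENABLE_XD_DISABLE;
			wrmsrl(MSR_IA32_MISC_ENABLE, misc);	/* NX usable again */
		}
	}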
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index e03530aebfd0..bf4700755184 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -69,7 +69,7 @@ jiffies_64 = jiffies;
69 69
70PHDRS { 70PHDRS {
71 text PT_LOAD FLAGS(5); /* R_E */ 71 text PT_LOAD FLAGS(5); /* R_E */
72 data PT_LOAD FLAGS(7); /* RWE */ 72 data PT_LOAD FLAGS(6); /* RW_ */
73#ifdef CONFIG_X86_64 73#ifdef CONFIG_X86_64
74 user PT_LOAD FLAGS(5); /* R_E */ 74 user PT_LOAD FLAGS(5); /* R_E */
75#ifdef CONFIG_SMP 75#ifdef CONFIG_SMP
@@ -116,6 +116,10 @@ SECTIONS
116 116
117 EXCEPTION_TABLE(16) :text = 0x9090 117 EXCEPTION_TABLE(16) :text = 0x9090
118 118
119#if defined(CONFIG_DEBUG_RODATA)
119#if defined(CONFIG_DEBUG_RODATA) 120 /* .text should occupy a whole number of pages */
121 . = ALIGN(PAGE_SIZE);
122#endif
119 X64_ALIGN_DEBUG_RODATA_BEGIN 123 X64_ALIGN_DEBUG_RODATA_BEGIN
120 RO_DATA(PAGE_SIZE) 124 RO_DATA(PAGE_SIZE)
121 X64_ALIGN_DEBUG_RODATA_END 125 X64_ALIGN_DEBUG_RODATA_END
@@ -335,7 +339,7 @@ SECTIONS
335 __bss_start = .; 339 __bss_start = .;
336 *(.bss..page_aligned) 340 *(.bss..page_aligned)
337 *(.bss) 341 *(.bss)
338 . = ALIGN(4); 342 . = ALIGN(PAGE_SIZE);
339 __bss_stop = .; 343 __bss_stop = .;
340 } 344 }
341 345
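Note: the FLAGS() values in PHDRS are raw ELF p_flags bit masks (PF_X=1, PF_W=2, PF_R=4), so changing the data segment from 7 to 6 drops exactly the execute bit. A userspace check of the arithmetic:

	#include <assert.h>
	#include <elf.h>	/* PF_X = 1, PF_W = 2, PF_R = 4 */

	int main(void)
	{
		assert((PF_R | PF_X) == 5);		/* text: R_E       */
		assert((PF_R | PF_W) == 6);		/* data: RW_ (new) */
		assert((PF_R | PF_W | PF_X) == 7);	/* data: RWE (old) */
		return 0;
	}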
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index e7d5382ef263..4f420c2f2d55 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -4,7 +4,6 @@
4#include <asm/asm-offsets.h> 4#include <asm/asm-offsets.h>
5#include <asm/thread_info.h> 5#include <asm/thread_info.h>
6#include <asm/processor-flags.h> 6#include <asm/processor-flags.h>
7#include <asm/pgtable.h>
8 7
9/*G:020 8/*G:020
10 * Our story starts with the kernel booting into startup_32 in 9 * Our story starts with the kernel booting into startup_32 in
@@ -38,113 +37,9 @@ ENTRY(lguest_entry)
38 /* Set up the initial stack so we can run C code. */ 37 /* Set up the initial stack so we can run C code. */
39 movl $(init_thread_union+THREAD_SIZE),%esp 38 movl $(init_thread_union+THREAD_SIZE),%esp
40 39
41 call init_pagetables
42
43 /* Jumps are relative: we're running __PAGE_OFFSET too low. */ 40 /* Jumps are relative: we're running __PAGE_OFFSET too low. */
44 jmp lguest_init+__PAGE_OFFSET 41 jmp lguest_init+__PAGE_OFFSET
45 42
46/*
47 * Initialize page tables. This creates a PDE and a set of page
48 * tables, which are located immediately beyond __brk_base. The variable
49 * _brk_end is set up to point to the first "safe" location.
50 * Mappings are created both at virtual address 0 (identity mapping)
51 * and PAGE_OFFSET for up to _end.
52 *
53 * FIXME: This code is taken verbatim from arch/x86/kernel/head_32.S: they
54 * don't have a stack at this point, so we can't just use call and ret.
55 */
56init_pagetables:
57#if PTRS_PER_PMD > 1
58#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
59#else
60#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
61#endif
62#define pa(X) ((X) - __PAGE_OFFSET)
63
64/* Enough space to fit pagetables for the low memory linear map */
65MAPPING_BEYOND_END = \
66 PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT
67#ifdef CONFIG_X86_PAE
68
69 /*
70 * In PAE mode initial_page_table is statically defined to contain
71 * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3
72 * entries). The identity mapping is handled by pointing two PGD entries
73 * to the first kernel PMD.
74 *
75 * Note the upper half of each PMD or PTE are always zero at this stage.
76 */
77
78#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
79
80 xorl %ebx,%ebx /* %ebx is kept at zero */
81
82 movl $pa(__brk_base), %edi
83 movl $pa(initial_pg_pmd), %edx
84 movl $PTE_IDENT_ATTR, %eax
8510:
86 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
87 movl %ecx,(%edx) /* Store PMD entry */
88 /* Upper half already zero */
89 addl $8,%edx
90 movl $512,%ecx
9111:
92 stosl
93 xchgl %eax,%ebx
94 stosl
95 xchgl %eax,%ebx
96 addl $0x1000,%eax
97 loop 11b
98
99 /*
100 * End condition: we must map up to the end + MAPPING_BEYOND_END.
101 */
102 movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
103 cmpl %ebp,%eax
104 jb 10b
1051:
106 addl $__PAGE_OFFSET, %edi
107 movl %edi, pa(_brk_end)
108 shrl $12, %eax
109 movl %eax, pa(max_pfn_mapped)
110
111 /* Do early initialization of the fixmap area */
112 movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
113 movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
114#else /* Not PAE */
115
116page_pde_offset = (__PAGE_OFFSET >> 20);
117
118 movl $pa(__brk_base), %edi
119 movl $pa(initial_page_table), %edx
120 movl $PTE_IDENT_ATTR, %eax
12110:
122 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
123 movl %ecx,(%edx) /* Store identity PDE entry */
124 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
125 addl $4,%edx
126 movl $1024, %ecx
12711:
128 stosl
129 addl $0x1000,%eax
130 loop 11b
131 /*
132 * End condition: we must map up to the end + MAPPING_BEYOND_END.
133 */
134 movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
135 cmpl %ebp,%eax
136 jb 10b
137 addl $__PAGE_OFFSET, %edi
138 movl %edi, pa(_brk_end)
139 shrl $12, %eax
140 movl %eax, pa(max_pfn_mapped)
141
142 /* Do early initialization of the fixmap area */
143 movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
144 movl %eax,pa(initial_page_table+0xffc)
145#endif
146 ret
147
148/*G:055 43/*G:055
149 * We create a macro which puts the assembler code between lgstart_ and lgend_ 44 * We create a macro which puts the assembler code between lgstart_ and lgend_
150 * markers. These templates are put in the .text section: they can't be 45 * markers. These templates are put in the .text section: they can't be
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 55543397a8a7..09df2f9a3d69 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -23,7 +23,7 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
23obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o 23obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
24 24
25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o 25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
26obj-$(CONFIG_K8_NUMA) += k8topology_64.o 26obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o
27obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o 27obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
28 28
29obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 29obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/amdtopology_64.c
index 804a3b6c6e14..51fae9cfdecb 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * AMD K8 NUMA support. 2 * AMD NUMA support.
3 * Discover the memory map and associated nodes. 3 * Discover the memory map and associated nodes.
4 * 4 *
5 * This version reads it directly from the K8 northbridge. 5 * This version reads it directly from the AMD northbridge.
6 * 6 *
7 * Copyright 2002,2003 Andi Kleen, SuSE Labs. 7 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
8 */ 8 */
@@ -57,7 +57,7 @@ static __init void early_get_boot_cpu_id(void)
57{ 57{
58 /* 58 /*
59 * need to get the APIC ID of the BSP so we can use that to 59 * need to get the APIC ID of the BSP so we can use that to
60 * create apicid_to_node in k8_scan_nodes() 60 * create apicid_to_node in amd_scan_nodes()
61 */ 61 */
62#ifdef CONFIG_X86_MPPARSE 62#ifdef CONFIG_X86_MPPARSE
63 /* 63 /*
@@ -69,7 +69,7 @@ static __init void early_get_boot_cpu_id(void)
69 early_init_lapic_mapping(); 69 early_init_lapic_mapping();
70} 70}
71 71
72int __init k8_get_nodes(struct bootnode *physnodes) 72int __init amd_get_nodes(struct bootnode *physnodes)
73{ 73{
74 int i; 74 int i;
75 int ret = 0; 75 int ret = 0;
@@ -82,7 +82,7 @@ int __init k8_get_nodes(struct bootnode *physnodes)
82 return ret; 82 return ret;
83} 83}
84 84
85int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn) 85int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
86{ 86{
87 unsigned long start = PFN_PHYS(start_pfn); 87 unsigned long start = PFN_PHYS(start_pfn);
88 unsigned long end = PFN_PHYS(end_pfn); 88 unsigned long end = PFN_PHYS(end_pfn);
@@ -194,7 +194,7 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
194 return 0; 194 return 0;
195} 195}
196 196
197int __init k8_scan_nodes(void) 197int __init amd_scan_nodes(void)
198{ 198{
199 unsigned int bits; 199 unsigned int bits;
200 unsigned int cores; 200 unsigned int cores;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index c0e28a13de7d..947f42abe820 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -364,8 +364,9 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
364 /* 364 /*
365	 * We just marked the kernel text read-only above; now that 365	 * We just marked the kernel text read-only above; now that
366	 * we are going to free part of it, we need to make that 366	 * we are going to free part of it, we need to make that
367 * writeable first. 367 * writeable and non-executable first.
368 */ 368 */
369 set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
369 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); 370 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
370 371
371 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); 372 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 0e969f9f401b..f89b5bb4e93f 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -226,7 +226,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
226 226
227static inline int is_kernel_text(unsigned long addr) 227static inline int is_kernel_text(unsigned long addr)
228{ 228{
229 if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) 229 if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
230 return 1; 230 return 1;
231 return 0; 231 return 0;
232} 232}
@@ -912,6 +912,23 @@ void set_kernel_text_ro(void)
912 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 912 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
913} 913}
914 914
915static void mark_nxdata_nx(void)
916{
917 /*
918	 * When this is called, init has already been executed and released,
919	 * so everything past _etext should be NX.
920 */
921 unsigned long start = PFN_ALIGN(_etext);
922 /*
923	 * This range ends at the is_kernel_text() upper limit, rounded up to a huge page since huge pages may map this area:
924 */
925 unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
926
927 if (__supported_pte_mask & _PAGE_NX)
928 printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10);
929 set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT);
930}
931
915void mark_rodata_ro(void) 932void mark_rodata_ro(void)
916{ 933{
917 unsigned long start = PFN_ALIGN(_text); 934 unsigned long start = PFN_ALIGN(_text);
@@ -946,6 +963,7 @@ void mark_rodata_ro(void)
946 printk(KERN_INFO "Testing CPA: write protecting again\n"); 963 printk(KERN_INFO "Testing CPA: write protecting again\n");
947 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 964 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
948#endif 965#endif
966 mark_nxdata_nx();
949} 967}
950#endif 968#endif
951 969
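Note: the range mark_nxdata_nx() protects runs from the first page after text to the end of the (now freed) init area, rounded up to a huge-page boundary because that region may be mapped with huge pages. As a hypothetical helper, the page count handed to set_pages_nx() is:

	/* hypothetical helper restating the range computed above */
	static unsigned long nxdata_pages(void)
	{
		unsigned long start = PFN_ALIGN((unsigned long)_etext);
		unsigned long end = ((unsigned long)__init_end + HPAGE_SIZE)
				    & HPAGE_MASK;	/* round up to a huge page */

		return (end - start) >> PAGE_SHIFT;	/* pages for set_pages_nx() */
	}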
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 7ffc9b727efd..7762a517d69d 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -264,7 +264,7 @@ static struct bootnode physnodes[MAX_NUMNODES] __initdata;
264static char *cmdline __initdata; 264static char *cmdline __initdata;
265 265
266static int __init setup_physnodes(unsigned long start, unsigned long end, 266static int __init setup_physnodes(unsigned long start, unsigned long end,
267 int acpi, int k8) 267 int acpi, int amd)
268{ 268{
269 int nr_nodes = 0; 269 int nr_nodes = 0;
270 int ret = 0; 270 int ret = 0;
@@ -274,13 +274,13 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
274 if (acpi) 274 if (acpi)
275 nr_nodes = acpi_get_nodes(physnodes); 275 nr_nodes = acpi_get_nodes(physnodes);
276#endif 276#endif
277#ifdef CONFIG_K8_NUMA 277#ifdef CONFIG_AMD_NUMA
278 if (k8) 278 if (amd)
279 nr_nodes = k8_get_nodes(physnodes); 279 nr_nodes = amd_get_nodes(physnodes);
280#endif 280#endif
281 /* 281 /*
282 * Basic sanity checking on the physical node map: there may be errors 282 * Basic sanity checking on the physical node map: there may be errors
283 * if the SRAT or K8 incorrectly reported the topology or the mem= 283 * if the SRAT or AMD code incorrectly reported the topology or the mem=
284 * kernel parameter is used. 284 * kernel parameter is used.
285 */ 285 */
286 for (i = 0; i < nr_nodes; i++) { 286 for (i = 0; i < nr_nodes; i++) {
@@ -549,7 +549,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
549 * numa=fake command-line option. 549 * numa=fake command-line option.
550 */ 550 */
551static int __init numa_emulation(unsigned long start_pfn, 551static int __init numa_emulation(unsigned long start_pfn,
552 unsigned long last_pfn, int acpi, int k8) 552 unsigned long last_pfn, int acpi, int amd)
553{ 553{
554 u64 addr = start_pfn << PAGE_SHIFT; 554 u64 addr = start_pfn << PAGE_SHIFT;
555 u64 max_addr = last_pfn << PAGE_SHIFT; 555 u64 max_addr = last_pfn << PAGE_SHIFT;
@@ -557,7 +557,7 @@ static int __init numa_emulation(unsigned long start_pfn,
557 int num_nodes; 557 int num_nodes;
558 int i; 558 int i;
559 559
560 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); 560 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, amd);
561 /* 561 /*
562 * If the numa=fake command-line contains an 'M' or 'G', it represents 562 * If the numa=fake command-line contains an 'M' or 'G', it represents
563 * the fixed node size. Otherwise, if it is just a single number N, 563 * the fixed node size. Otherwise, if it is just a single number N,
@@ -602,7 +602,7 @@ static int __init numa_emulation(unsigned long start_pfn,
602#endif /* CONFIG_NUMA_EMU */ 602#endif /* CONFIG_NUMA_EMU */
603 603
604void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, 604void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
605 int acpi, int k8) 605 int acpi, int amd)
606{ 606{
607 int i; 607 int i;
608 608
@@ -610,7 +610,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
610 nodes_clear(node_online_map); 610 nodes_clear(node_online_map);
611 611
612#ifdef CONFIG_NUMA_EMU 612#ifdef CONFIG_NUMA_EMU
613 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8)) 613 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd))
614 return; 614 return;
615 nodes_clear(node_possible_map); 615 nodes_clear(node_possible_map);
616 nodes_clear(node_online_map); 616 nodes_clear(node_online_map);
@@ -624,8 +624,8 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
624 nodes_clear(node_online_map); 624 nodes_clear(node_online_map);
625#endif 625#endif
626 626
627#ifdef CONFIG_K8_NUMA 627#ifdef CONFIG_AMD_NUMA
628 if (!numa_off && k8 && !k8_scan_nodes()) 628 if (!numa_off && amd && !amd_scan_nodes())
629 return; 629 return;
630 nodes_clear(node_possible_map); 630 nodes_clear(node_possible_map);
631 nodes_clear(node_online_map); 631 nodes_clear(node_online_map);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 532e7933d606..8b830ca14ac4 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -13,6 +13,7 @@
13#include <linux/pfn.h> 13#include <linux/pfn.h>
14#include <linux/percpu.h> 14#include <linux/percpu.h>
15#include <linux/gfp.h> 15#include <linux/gfp.h>
16#include <linux/pci.h>
16 17
17#include <asm/e820.h> 18#include <asm/e820.h>
18#include <asm/processor.h> 19#include <asm/processor.h>
@@ -255,13 +256,16 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
255 unsigned long pfn) 256 unsigned long pfn)
256{ 257{
257 pgprot_t forbidden = __pgprot(0); 258 pgprot_t forbidden = __pgprot(0);
259 pgprot_t required = __pgprot(0);
258 260
259 /* 261 /*
260 * The BIOS area between 640k and 1Mb needs to be executable for 262 * The BIOS area between 640k and 1Mb needs to be executable for
261 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. 263 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
262 */ 264 */
263 if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) 265#ifdef CONFIG_PCI_BIOS
266 if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
264 pgprot_val(forbidden) |= _PAGE_NX; 267 pgprot_val(forbidden) |= _PAGE_NX;
268#endif
265 269
266 /* 270 /*
267 * The kernel text needs to be executable for obvious reasons 271 * The kernel text needs to be executable for obvious reasons
@@ -278,6 +282,12 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
278 if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, 282 if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
279 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) 283 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
280 pgprot_val(forbidden) |= _PAGE_RW; 284 pgprot_val(forbidden) |= _PAGE_RW;
285 /*
286 * .data and .bss should always be writable.
287 */
288 if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
289 within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
290 pgprot_val(required) |= _PAGE_RW;
281 291
282#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) 292#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
283 /* 293 /*
@@ -317,6 +327,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
317#endif 327#endif
318 328
319 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); 329 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
330 prot = __pgprot(pgprot_val(prot) | pgprot_val(required));
320 331
321 return prot; 332 return prot;
322} 333}
@@ -393,7 +404,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
393{ 404{
394 unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; 405 unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
395 pte_t new_pte, old_pte, *tmp; 406 pte_t new_pte, old_pte, *tmp;
396 pgprot_t old_prot, new_prot; 407 pgprot_t old_prot, new_prot, req_prot;
397 int i, do_split = 1; 408 int i, do_split = 1;
398 unsigned int level; 409 unsigned int level;
399 410
@@ -438,10 +449,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
438 * We are safe now. Check whether the new pgprot is the same: 449 * We are safe now. Check whether the new pgprot is the same:
439 */ 450 */
440 old_pte = *kpte; 451 old_pte = *kpte;
441 old_prot = new_prot = pte_pgprot(old_pte); 452 old_prot = new_prot = req_prot = pte_pgprot(old_pte);
442 453
443 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); 454 pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
444 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); 455 pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
445 456
446 /* 457 /*
447 * old_pte points to the large page base address. So we need 458 * old_pte points to the large page base address. So we need
@@ -450,17 +461,17 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
450 pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); 461 pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
451 cpa->pfn = pfn; 462 cpa->pfn = pfn;
452 463
453 new_prot = static_protections(new_prot, address, pfn); 464 new_prot = static_protections(req_prot, address, pfn);
454 465
455 /* 466 /*
456 * We need to check the full range, whether 467 * We need to check the full range, whether
457	 * static_protections() requires a different pgprot for one of 468	 * static_protections() requires a different pgprot for one of
458 * the pages in the range we try to preserve: 469 * the pages in the range we try to preserve:
459 */ 470 */
460 addr = address + PAGE_SIZE; 471 addr = address & pmask;
461 pfn++; 472 pfn = pte_pfn(old_pte);
462 for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) { 473 for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
463 pgprot_t chk_prot = static_protections(new_prot, addr, pfn); 474 pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
464 475
465 if (pgprot_val(chk_prot) != pgprot_val(new_prot)) 476 if (pgprot_val(chk_prot) != pgprot_val(new_prot))
466 goto out_unlock; 477 goto out_unlock;
@@ -483,7 +494,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
483 * that we limited the number of possible pages already to 494 * that we limited the number of possible pages already to
484 * the number of pages in the large page. 495 * the number of pages in the large page.
485 */ 496 */
486 if (address == (nextpage_addr - psize) && cpa->numpages == numpages) { 497 if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
487 /* 498 /*
488 * The address is aligned and the number of pages 499 * The address is aligned and the number of pages
489 * covers the full page. 500 * covers the full page.
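Note: static_protections() now builds two masks and applies them in opposite directions: forbidden bits are stripped, required bits are forced on, so .data/.bss can never lose _PAGE_RW even if a caller asks for that. The combination at the end of the function is plain mask algebra:

	/* hypothetical helper restating the final mask combination */
	static pgprot_t apply_static_masks(pgprot_t prot, pgprot_t forbidden,
					   pgprot_t required)
	{
		prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
		prot = __pgprot(pgprot_val(prot) | pgprot_val(required));
		return prot;
	}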
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index a3250aa34086..410531d3c292 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -41,7 +41,7 @@ void __init x86_report_nx(void)
41{ 41{
42 if (!cpu_has_nx) { 42 if (!cpu_has_nx) {
43 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " 43 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
44 "missing in CPU or disabled in BIOS!\n"); 44 "missing in CPU!\n");
45 } else { 45 } else {
46#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 46#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
47 if (disable_nx) { 47 if (disable_nx) {
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index a17dffd136c1..f16434568a51 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -92,6 +92,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
92 /* mark this node as "seen" in node bitmap */ 92 /* mark this node as "seen" in node bitmap */
93 BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); 93 BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
94 94
95 /* don't need to check apic_id here, because it is always 8 bits */
95 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; 96 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
96 97
97 printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", 98 printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index a35cb9d8b060..171a0aacb99a 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -134,6 +134,10 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
134 } 134 }
135 135
136 apic_id = pa->apic_id; 136 apic_id = pa->apic_id;
137 if (apic_id >= MAX_LOCAL_APIC) {
138 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
139 return;
140 }
137 apicid_to_node[apic_id] = node; 141 apicid_to_node[apic_id] = node;
138 node_set(node, cpu_nodes_parsed); 142 node_set(node, cpu_nodes_parsed);
139 acpi_numa = 1; 143 acpi_numa = 1;
@@ -168,6 +172,12 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
168 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; 172 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
169 else 173 else
170 apic_id = pa->apic_id; 174 apic_id = pa->apic_id;
175
176 if (apic_id >= MAX_LOCAL_APIC) {
177 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
178 return;
179 }
180
171 apicid_to_node[apic_id] = node; 181 apicid_to_node[apic_id] = node;
172 node_set(node, cpu_nodes_parsed); 182 node_set(node, cpu_nodes_parsed);
173 acpi_numa = 1; 183 acpi_numa = 1;
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 51104b33fd51..c3b8e24f2b16 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -610,6 +610,7 @@ static int force_ibs_eilvt_setup(void)
610 ret = setup_ibs_ctl(i); 610 ret = setup_ibs_ctl(i);
611 if (ret) 611 if (ret)
612 return ret; 612 return ret;
613 pr_err(FW_BUG "using offset %d for IBS interrupts\n", i);
613 return 0; 614 return 0;
614 } 615 }
615 616
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index effd96e33f16..6b8759f7634e 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_PCI_OLPC) += olpc.o
7obj-$(CONFIG_PCI_XEN) += xen.o 7obj-$(CONFIG_PCI_XEN) += xen.o
8 8
9obj-y += fixup.o 9obj-y += fixup.o
10obj-$(CONFIG_X86_INTEL_CE) += ce4100.o
10obj-$(CONFIG_ACPI) += acpi.o 11obj-$(CONFIG_ACPI) += acpi.o
11obj-y += legacy.o irq.o 12obj-y += legacy.o irq.o
12 13
diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c
new file mode 100644
index 000000000000..85b68ef5e809
--- /dev/null
+++ b/arch/x86/pci/ce4100.c
@@ -0,0 +1,315 @@
1/*
2 * GPL LICENSE SUMMARY
3 *
4 * Copyright(c) 2010 Intel Corporation. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of version 2 of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
18 * The full GNU General Public License is included in this distribution
19 * in the file called LICENSE.GPL.
20 *
21 * Contact Information:
22 * Intel Corporation
23 * 2200 Mission College Blvd.
24 * Santa Clara, CA 97052
25 *
26 * This provides access methods for PCI registers that misbehave on
27 * the CE4100. Each register can be assigned a private init, read and
28 * write routine. The exception to this is the bridge device. The
29 * bridge device is the only device on bus zero (0) that requires any
30 * fixup, so it is a special case at the moment.
31 */
32
33#include <linux/kernel.h>
34#include <linux/pci.h>
35#include <linux/init.h>
36
37#include <asm/pci_x86.h>
38
39struct sim_reg {
40 u32 value;
41 u32 mask;
42};
43
44struct sim_dev_reg {
45 int dev_func;
46 int reg;
47 void (*init)(struct sim_dev_reg *reg);
48 void (*read)(struct sim_dev_reg *reg, u32 *value);
49 void (*write)(struct sim_dev_reg *reg, u32 value);
50 struct sim_reg sim_reg;
51};
52
53struct sim_reg_op {
54 void (*init)(struct sim_dev_reg *reg);
55 void (*read)(struct sim_dev_reg *reg, u32 value);
56 void (*write)(struct sim_dev_reg *reg, u32 value);
57};
58
59#define MB (1024 * 1024)
60#define KB (1024)
61#define SIZE_TO_MASK(size) (~(size - 1))
62
63#define DEFINE_REG(device, func, offset, size, init_op, read_op, write_op)\
64{ PCI_DEVFN(device, func), offset, init_op, read_op, write_op,\
65 {0, SIZE_TO_MASK(size)} },
66
67static void reg_init(struct sim_dev_reg *reg)
68{
69 pci_direct_conf1.read(0, 1, reg->dev_func, reg->reg, 4,
70 &reg->sim_reg.value);
71}
72
73static void reg_read(struct sim_dev_reg *reg, u32 *value)
74{
75 unsigned long flags;
76
77 raw_spin_lock_irqsave(&pci_config_lock, flags);
78 *value = reg->sim_reg.value;
79 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
80}
81
82static void reg_write(struct sim_dev_reg *reg, u32 value)
83{
84 unsigned long flags;
85
86 raw_spin_lock_irqsave(&pci_config_lock, flags);
87 reg->sim_reg.value = (value & reg->sim_reg.mask) |
88 (reg->sim_reg.value & ~reg->sim_reg.mask);
89 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
90}
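Note: SIZE_TO_MASK(size) is the classic BAR mask (~(size - 1)): reg_write() lets writes through only in the masked bits, so BAR sizing behaves as it does on real hardware. A sketch of the probe sequence, assuming a 32-bit memory BAR with a size-aligned base (illustrative only; real code would save and restore the BAR):

	/* sketch: BAR sizing against a simulated register */
	static u32 probe_bar_size(struct sim_dev_reg *reg)
	{
		u32 probe;

		reg_write(reg, 0xFFFFFFFF);	/* sizing write, as PCI does  */
		reg_read(reg, &probe);		/* read-only bits kept intact */

		return ~(probe & ~0xFu) + 1;	/* decode size from the mask  */
	}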
91
92static void sata_reg_init(struct sim_dev_reg *reg)
93{
94 pci_direct_conf1.read(0, 1, PCI_DEVFN(14, 0), 0x10, 4,
95 &reg->sim_reg.value);
96 reg->sim_reg.value += 0x400;
97}
98
99static void ehci_reg_read(struct sim_dev_reg *reg, u32 *value)
100{
101 reg_read(reg, value);
102 if (*value != reg->sim_reg.mask)
103 *value |= 0x100;
104}
105
106void sata_revid_init(struct sim_dev_reg *reg)
107{
108 reg->sim_reg.value = 0x01060100;
109 reg->sim_reg.mask = 0;
110}
111
112static void sata_revid_read(struct sim_dev_reg *reg, u32 *value)
113{
114 reg_read(reg, value);
115}
116
117static struct sim_dev_reg bus1_fixups[] = {
118 DEFINE_REG(2, 0, 0x10, (16*MB), reg_init, reg_read, reg_write)
119 DEFINE_REG(2, 0, 0x14, (256), reg_init, reg_read, reg_write)
120 DEFINE_REG(2, 1, 0x10, (64*KB), reg_init, reg_read, reg_write)
121 DEFINE_REG(3, 0, 0x10, (64*KB), reg_init, reg_read, reg_write)
122 DEFINE_REG(4, 0, 0x10, (128*KB), reg_init, reg_read, reg_write)
123 DEFINE_REG(4, 1, 0x10, (128*KB), reg_init, reg_read, reg_write)
124 DEFINE_REG(6, 0, 0x10, (512*KB), reg_init, reg_read, reg_write)
125 DEFINE_REG(6, 1, 0x10, (512*KB), reg_init, reg_read, reg_write)
126 DEFINE_REG(6, 2, 0x10, (64*KB), reg_init, reg_read, reg_write)
127 DEFINE_REG(8, 0, 0x10, (1*MB), reg_init, reg_read, reg_write)
128 DEFINE_REG(8, 1, 0x10, (64*KB), reg_init, reg_read, reg_write)
129 DEFINE_REG(8, 2, 0x10, (64*KB), reg_init, reg_read, reg_write)
130 DEFINE_REG(9, 0, 0x10 , (1*MB), reg_init, reg_read, reg_write)
131 DEFINE_REG(9, 0, 0x14, (64*KB), reg_init, reg_read, reg_write)
132 DEFINE_REG(10, 0, 0x10, (256), reg_init, reg_read, reg_write)
133 DEFINE_REG(10, 0, 0x14, (256*MB), reg_init, reg_read, reg_write)
134 DEFINE_REG(11, 0, 0x10, (256), reg_init, reg_read, reg_write)
135 DEFINE_REG(11, 0, 0x14, (256), reg_init, reg_read, reg_write)
136 DEFINE_REG(11, 1, 0x10, (256), reg_init, reg_read, reg_write)
137 DEFINE_REG(11, 2, 0x10, (256), reg_init, reg_read, reg_write)
138 DEFINE_REG(11, 2, 0x14, (256), reg_init, reg_read, reg_write)
139 DEFINE_REG(11, 2, 0x18, (256), reg_init, reg_read, reg_write)
140 DEFINE_REG(11, 3, 0x10, (256), reg_init, reg_read, reg_write)
141 DEFINE_REG(11, 3, 0x14, (256), reg_init, reg_read, reg_write)
142 DEFINE_REG(11, 4, 0x10, (256), reg_init, reg_read, reg_write)
143 DEFINE_REG(11, 5, 0x10, (64*KB), reg_init, reg_read, reg_write)
144 DEFINE_REG(11, 6, 0x10, (256), reg_init, reg_read, reg_write)
145 DEFINE_REG(11, 7, 0x10, (64*KB), reg_init, reg_read, reg_write)
146 DEFINE_REG(12, 0, 0x10, (128*KB), reg_init, reg_read, reg_write)
147 DEFINE_REG(12, 0, 0x14, (256), reg_init, reg_read, reg_write)
148 DEFINE_REG(12, 1, 0x10, (1024), reg_init, reg_read, reg_write)
149 DEFINE_REG(13, 0, 0x10, (32*KB), reg_init, ehci_reg_read, reg_write)
150 DEFINE_REG(13, 1, 0x10, (32*KB), reg_init, ehci_reg_read, reg_write)
151 DEFINE_REG(14, 0, 0x8, 0, sata_revid_init, sata_revid_read, 0)
152 DEFINE_REG(14, 0, 0x10, 0, reg_init, reg_read, reg_write)
153 DEFINE_REG(14, 0, 0x14, 0, reg_init, reg_read, reg_write)
154 DEFINE_REG(14, 0, 0x18, 0, reg_init, reg_read, reg_write)
155 DEFINE_REG(14, 0, 0x1C, 0, reg_init, reg_read, reg_write)
156 DEFINE_REG(14, 0, 0x20, 0, reg_init, reg_read, reg_write)
157 DEFINE_REG(14, 0, 0x24, (0x200), sata_reg_init, reg_read, reg_write)
158 DEFINE_REG(15, 0, 0x10, (64*KB), reg_init, reg_read, reg_write)
159 DEFINE_REG(15, 0, 0x14, (64*KB), reg_init, reg_read, reg_write)
160 DEFINE_REG(16, 0, 0x10, (64*KB), reg_init, reg_read, reg_write)
161 DEFINE_REG(16, 0, 0x14, (64*MB), reg_init, reg_read, reg_write)
162 DEFINE_REG(16, 0, 0x18, (64*MB), reg_init, reg_read, reg_write)
163 DEFINE_REG(17, 0, 0x10, (128*KB), reg_init, reg_read, reg_write)
164 DEFINE_REG(18, 0, 0x10, (1*KB), reg_init, reg_read, reg_write)
165};
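
DEFINE_REG itself is defined earlier in ce4100.c, outside this hunk. A plausible expansion, inferred from how the sim_dev_reg fields (.dev_func, .reg, .init, .read, .write, .sim_reg) are used in this file; the SIZE_TO_MASK derivation of the writable-bit mask from the BAR size is an assumption:

/* Sketch only; the real macro lives earlier in this file. */
#define SIZE_TO_MASK(size) (~((size) - 1))

#define DEFINE_REG(dev, func, offset, size, init_op, read_op, write_op) \
{ \
	.dev_func = PCI_DEVFN(dev, func), \
	.reg      = (offset), \
	.init     = (init_op), \
	.read     = (read_op), \
	.write    = (write_op), \
	.sim_reg  = { .value = 0, .mask = SIZE_TO_MASK(size) }, \
},

Note the trailing comma in the sketch: it is what lets the table above list entries without separating commas.
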
166
167static void __init init_sim_regs(void)
168{
169 int i;
170
171 for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
172 if (bus1_fixups[i].init)
173 bus1_fixups[i].init(&bus1_fixups[i]);
174 }
175}
176
177static inline void extract_bytes(u32 *value, int reg, int len)
178{
179 uint32_t mask;
180
181 *value >>= ((reg & 3) * 8);
182 mask = 0xFFFFFFFF >> ((4 - len) * 8);
183 *value &= mask;
184}
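
extract_bytes() narrows a dword-aligned config read down to the bytes the caller asked for. For instance, a 1-byte read at offset 0x11, where the aligned read at 0x10 returned 0xAABBCCDD (illustrative values only):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t value = 0xAABBCCDD;	/* dword read at (0x11 & ~3) == 0x10 */
	int reg = 0x11, len = 1;

	value >>= (reg & 3) * 8;		/* 0x00AABBCC */
	value &= 0xFFFFFFFF >> ((4 - len) * 8);	/* keep one byte: 0xCC */

	printf("0x%02X\n", (unsigned)value);	/* prints 0xCC */
	return 0;
}
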
185
186int bridge_read(unsigned int devfn, int reg, int len, u32 *value)
187{
188 u32 av_bridge_base, av_bridge_limit;
189 int retval = 0;
190
191 switch (reg) {
192 /* Make BARs appear to not request any memory. */
193 case PCI_BASE_ADDRESS_0:
194 case PCI_BASE_ADDRESS_0 + 1:
195 case PCI_BASE_ADDRESS_0 + 2:
196 case PCI_BASE_ADDRESS_0 + 3:
197 *value = 0;
198 break;
199
200 /* The subordinate bus number register is hardwired to zero
201 * and read-only, so simulate it instead.
202 */
203 case PCI_PRIMARY_BUS:
204 if (len == 4)
205 *value = 0x00010100;
206 break;
207
208 case PCI_SUBORDINATE_BUS:
209 *value = 1;
210 break;
211
212 case PCI_MEMORY_BASE:
213 case PCI_MEMORY_LIMIT:
214 /* Get the A/V bridge base address. */
215 pci_direct_conf1.read(0, 0, devfn,
216 PCI_BASE_ADDRESS_0, 4, &av_bridge_base);
217
218 av_bridge_limit = av_bridge_base + (512*MB - 1);
219 av_bridge_limit >>= 16;
220 av_bridge_limit &= 0xFFF0;
221
222 av_bridge_base >>= 16;
223 av_bridge_base &= 0xFFF0;
224
225 if (reg == PCI_MEMORY_LIMIT)
226 *value = av_bridge_limit;
227 else if (len == 2)
228 *value = av_bridge_base;
229 else
230 *value = (av_bridge_limit << 16) | av_bridge_base;
231 break;
232 /* Make the prefetchable memory limit smaller than the base,
233 * so no prefetchable memory space is claimed.
234 */
235 case PCI_PREF_MEMORY_BASE:
236 *value = 0xFFF0;
237 break;
238 case PCI_PREF_MEMORY_LIMIT:
239 *value = 0x0;
240 break;
241 /* Make the IO limit smaller than the IO base, so no IO space is claimed. */
242 case PCI_IO_BASE:
243 *value = 0xF0;
244 break;
245 case PCI_IO_LIMIT:
246 *value = 0;
247 break;
248 default:
249 retval = 1;
250 }
251 return retval;
252}
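
A worked trace of the PCI_MEMORY_BASE/PCI_MEMORY_LIMIT fabrication above, assuming a hypothetical A/V bridge BAR of 0xA0000000:

#include <stdio.h>

#define MB (1024*1024)

int main(void)
{
	/* Hypothetical A/V bridge BAR, for illustration only. */
	unsigned int base = 0xA0000000, limit;

	limit = base + (512*MB - 1);	/* 0xBFFFFFFF */
	limit = (limit >> 16) & 0xFFF0;	/* 0xBFF0 */
	base  = (base  >> 16) & 0xFFF0;	/* 0xA000 */

	/* A 4-byte read of PCI_MEMORY_BASE returns limit in the high
	 * word and base in the low word: a 512MB window at 0xA0000000. */
	printf("0x%08X\n", (limit << 16) | base);	/* 0xBFF0A000 */
	return 0;
}
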
253
254static int ce4100_conf_read(unsigned int seg, unsigned int bus,
255 unsigned int devfn, int reg, int len, u32 *value)
256{
257 int i, retval = 1;
258
259 if (bus == 1) {
260 for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
261 if (bus1_fixups[i].dev_func == devfn &&
262 bus1_fixups[i].reg == (reg & ~3) &&
263 bus1_fixups[i].read) {
264 bus1_fixups[i].read(&(bus1_fixups[i]),
265 value);
266 extract_bytes(value, reg, len);
267 return 0;
268 }
269 }
270 }
271
272 if (bus == 0 && (PCI_DEVFN(1, 0) == devfn) &&
273 !bridge_read(devfn, reg, len, value))
274 return 0;
275
276 return pci_direct_conf1.read(seg, bus, devfn, reg, len, value);
277}
278
279static int ce4100_conf_write(unsigned int seg, unsigned int bus,
280 unsigned int devfn, int reg, int len, u32 value)
281{
282 int i;
283
284 if (bus == 1) {
285 for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
286 if (bus1_fixups[i].dev_func == devfn &&
287 bus1_fixups[i].reg == (reg & ~3) &&
288 bus1_fixups[i].write) {
289 bus1_fixups[i].write(&(bus1_fixups[i]),
290 value);
291 return 0;
292 }
293 }
294 }
295
296 /* Discard writes to A/V bridge BAR. */
297 if (bus == 0 && PCI_DEVFN(1, 0) == devfn &&
298 ((reg & ~3) == PCI_BASE_ADDRESS_0))
299 return 0;
300
301 return pci_direct_conf1.write(seg, bus, devfn, reg, len, value);
302}
303
304struct pci_raw_ops ce4100_pci_conf = {
305 .read = ce4100_conf_read,
306 .write = ce4100_conf_write,
307};
308
309static int __init ce4100_pci_init(void)
310{
311 init_sim_regs();
312 raw_pci_ops = &ce4100_pci_conf;
313 return 0;
314}
315subsys_initcall(ce4100_pci_init);
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 2492d165096a..a5f7d0d63de0 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -9,6 +9,7 @@
9#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#include <asm/pci_x86.h> 10#include <asm/pci_x86.h>
11#include <asm/pci-functions.h> 11#include <asm/pci-functions.h>
12#include <asm/cacheflush.h>
12 13
13/* BIOS32 signature: "_32_" */ 14/* BIOS32 signature: "_32_" */
14#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24)) 15#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24))
@@ -25,6 +26,27 @@
25#define PCIBIOS_HW_TYPE1_SPEC 0x10 26#define PCIBIOS_HW_TYPE1_SPEC 0x10
26#define PCIBIOS_HW_TYPE2_SPEC 0x20 27#define PCIBIOS_HW_TYPE2_SPEC 0x20
27 28
29int pcibios_enabled;
30
31/* According to the BIOS specification at:
32 * http://members.datafast.net.au/dft0802/specs/bios21.pdf, we could
33 * restrict the x zone to some pages and make it ro. But this may be
34 * broken on some BIOSes, and is complex to handle with
35 * static_protections. We could make the 0xe0000-0x100000 range rox,
36 * but this can break some ISA mappings.
37 *
38 * So we leave an rw and x hole when pcibios is used. This shouldn't
39 * happen on a modern system with mmconfig, and if you don't want it
40 * you can disable pcibios...
41 */
42static inline void set_bios_x(void)
43{
44 pcibios_enabled = 1;
45 set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT);
46 if (__supported_pte_mask & _PAGE_NX)
47 printk(KERN_INFO "PCI: PCI BIOS area is rw and x. Use pci=nobios if you want it NX.\n");
48}
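
For scale: with the conventional BIOS_BEGIN/BIOS_END of 0xa0000/0x100000 (assumed here; the constants come from the asm headers), the set_memory_x() call above marks 96 pages executable:

#include <stdio.h>

#define BIOS_BEGIN 0x000a0000	/* assumed values, from asm headers */
#define BIOS_END   0x00100000
#define PAGE_SHIFT 12

int main(void)
{
	printf("%ld pages\n", (long)((BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT));
	return 0;	/* prints: 96 pages */
}
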
49
28/* 50/*
29 * This is the standard structure used to identify the entry point 51 * This is the standard structure used to identify the entry point
30 * to the BIOS32 Service Directory, as documented in 52 * to the BIOS32 Service Directory, as documented in
@@ -332,6 +354,7 @@ static struct pci_raw_ops * __devinit pci_find_bios(void)
332 DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n", 354 DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n",
333 bios32_entry); 355 bios32_entry);
334 bios32_indirect.address = bios32_entry + PAGE_OFFSET; 356 bios32_indirect.address = bios32_entry + PAGE_OFFSET;
357 set_bios_x();
335 if (check_pcibios()) 358 if (check_pcibios())
336 return &pci_bios_access; 359 return &pci_bios_access;
337 } 360 }
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
index 7bf70b812fa2..021eee91c056 100644
--- a/arch/x86/platform/Makefile
+++ b/arch/x86/platform/Makefile
@@ -1,5 +1,7 @@
1# Platform specific code goes here 1# Platform specific code goes here
2obj-y += ce4100/
2obj-y += efi/ 3obj-y += efi/
4obj-y += iris/
3obj-y += mrst/ 5obj-y += mrst/
4obj-y += olpc/ 6obj-y += olpc/
5obj-y += scx200/ 7obj-y += scx200/
diff --git a/arch/x86/platform/ce4100/Makefile b/arch/x86/platform/ce4100/Makefile
new file mode 100644
index 000000000000..91fc92971d94
--- /dev/null
+++ b/arch/x86/platform/ce4100/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_X86_INTEL_CE) += ce4100.o
diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c
new file mode 100644
index 000000000000..d2c0d51a7178
--- /dev/null
+++ b/arch/x86/platform/ce4100/ce4100.c
@@ -0,0 +1,132 @@
1/*
2 * Intel CE4100 platform specific setup code
3 *
4 * (C) Copyright 2010 Intel Corporation
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; version 2
9 * of the License.
10 */
11#include <linux/init.h>
12#include <linux/kernel.h>
13#include <linux/irq.h>
14#include <linux/module.h>
15#include <linux/serial_reg.h>
16#include <linux/serial_8250.h>
17
18#include <asm/setup.h>
19#include <asm/io.h>
20
21static int ce4100_i8042_detect(void)
22{
23 return 0;
24}
25
26static void __init sdv_find_smp_config(void)
27{
28}
29
30#ifdef CONFIG_SERIAL_8250
31
32
33static unsigned int mem_serial_in(struct uart_port *p, int offset)
34{
35 offset = offset << p->regshift;
36 return readl(p->membase + offset);
37}
38
39/*
40 * Under some conditions the UART Tx interrupt is not raised, and
41 * serial transmission then hangs. This is a silicon issue that has
42 * not been root caused. The workaround checks the UART_LSR_THRE and
43 * UART_LSR_TEMT bits of the LSR register in the interrupt handler to
44 * see whether at least one of them is set; if so, the transmit
45 * request is processed anyway. Without the workaround, transmission
46 * may hang. This workaround is for errata number 9 of the B step.
47 */
48
49static unsigned int ce4100_mem_serial_in(struct uart_port *p, int offset)
50{
51 unsigned int ret, ier, lsr;
52
53 if (offset == UART_IIR) {
54 offset = offset << p->regshift;
55 ret = readl(p->membase + offset);
56 if (ret & UART_IIR_NO_INT) {
57 /* see if the TX interrupt should really have been set */
58 ier = mem_serial_in(p, UART_IER);
59 /* see if the UART's XMIT interrupt is enabled */
60 if (ier & UART_IER_THRI) {
61 lsr = mem_serial_in(p, UART_LSR);
62 /* now check to see if the UART should be
63 * generating an interrupt (but isn't) */
64 if (lsr & (UART_LSR_THRE | UART_LSR_TEMT))
65 ret &= ~UART_IIR_NO_INT;
66 }
67 }
68 } else
69 ret = mem_serial_in(p, offset);
70 return ret;
71}
72
73static void ce4100_mem_serial_out(struct uart_port *p, int offset, int value)
74{
75 offset = offset << p->regshift;
76 writel(value, p->membase + offset);
77}
78
79static void ce4100_serial_fixup(int port, struct uart_port *up,
80 unsigned short *capabilites)
81{
82#ifdef CONFIG_EARLY_PRINTK
83 /*
84 * Override the legacy port configuration that comes from
85 * asm/serial.h. Using the ioport driver and then switching to
86 * the PCI memory-mapped driver hangs the IOAPIC.
87 */
88 if (up->iotype != UPIO_MEM32) {
89 up->uartclk = 14745600;
90 up->mapbase = 0xdffe0200;
91 set_fixmap_nocache(FIX_EARLYCON_MEM_BASE,
92 up->mapbase & PAGE_MASK);
93 up->membase =
94 (void __iomem *)__fix_to_virt(FIX_EARLYCON_MEM_BASE);
95 up->membase += up->mapbase & ~PAGE_MASK;
96 up->iotype = UPIO_MEM32;
97 up->regshift = 2;
98 }
99#endif
100 up->iobase = 0;
101 up->serial_in = ce4100_mem_serial_in;
102 up->serial_out = ce4100_mem_serial_out;
103
104 *capabilites |= (1 << 12);
105}
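
The fixmap arithmetic in the early-printk branch splits the UART's MMIO address into a page-aligned part, mapped through the fixmap slot, and an in-page offset that carries over into the virtual address. A standalone sketch using the mapbase above; the fixmap virtual address is a made-up stand-in:

#include <stdio.h>

#define PAGE_MASK 0xFFFFF000UL	/* 4KB pages, as on x86 */

int main(void)
{
	unsigned long mapbase  = 0xdffe0200;	/* physical UART base     */
	unsigned long fix_virt = 0xffc00000;	/* hypothetical fixmap VA */

	/* Page 0xdffe0000 is mapped at fix_virt; the low bits carry over: */
	unsigned long membase = fix_virt + (mapbase & ~PAGE_MASK);

	printf("0x%lx\n", membase);	/* prints 0xffc00200 */
	return 0;
}
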
106
107static __init void sdv_serial_fixup(void)
108{
109 serial8250_set_isa_configurator(ce4100_serial_fixup);
110}
111
112#else
113static inline void sdv_serial_fixup(void) { }
114#endif
115
116static void __init sdv_arch_setup(void)
117{
118 sdv_serial_fixup();
119}
120
121/*
122 * CE4100 specific x86_init function overrides and early setup
123 * calls.
124 */
125void __init x86_ce4100_early_setup(void)
126{
127 x86_init.oem.arch_setup = sdv_arch_setup;
128 x86_platform.i8042_detect = ce4100_i8042_detect;
129 x86_init.resources.probe_roms = x86_init_noop;
130 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
131 x86_init.mpparse.find_smp_config = sdv_find_smp_config;
132}
diff --git a/arch/x86/platform/iris/Makefile b/arch/x86/platform/iris/Makefile
new file mode 100644
index 000000000000..db921983a102
--- /dev/null
+++ b/arch/x86/platform/iris/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_X86_32_IRIS) += iris.o
diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c
new file mode 100644
index 000000000000..1ba7f5ed8c9b
--- /dev/null
+++ b/arch/x86/platform/iris/iris.c
@@ -0,0 +1,91 @@
1/*
2 * Eurobraille/Iris power off support.
3 *
4 * Eurobraille's Iris machine is a PC with no APM or ACPI support.
5 * It is shutdown by a special I/O sequence which this module provides.
6 *
7 * Copyright (C) Shérab <Sebastien.Hinderer@ens-lyon.org>
8 *
9 * This program is free software ; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation ; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY ; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with the program ; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 */
23
24#include <linux/moduleparam.h>
25#include <linux/module.h>
26#include <linux/kernel.h>
27#include <linux/errno.h>
28#include <linux/delay.h>
29#include <linux/init.h>
30#include <linux/pm.h>
31#include <asm/io.h>
32
33#define IRIS_GIO_BASE 0x340
34#define IRIS_GIO_INPUT IRIS_GIO_BASE
35#define IRIS_GIO_OUTPUT (IRIS_GIO_BASE + 1)
36#define IRIS_GIO_PULSE 0x80 /* First byte to send */
37#define IRIS_GIO_REST 0x00 /* Second byte to send */
38#define IRIS_GIO_NODEV 0xff /* Likely not an Iris */
39
40MODULE_LICENSE("GPL");
41MODULE_AUTHOR("Sébastien Hinderer <Sebastien.Hinderer@ens-lyon.org>");
42MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille");
43MODULE_SUPPORTED_DEVICE("Eurobraille/Iris");
44
45static int force;
46
47module_param(force, bool, 0);
48MODULE_PARM_DESC(force, "Set to one to force poweroff handler installation.");
49
50static void (*old_pm_power_off)(void);
51
52static void iris_power_off(void)
53{
54 outb(IRIS_GIO_PULSE, IRIS_GIO_OUTPUT);
55 msleep(850);
56 outb(IRIS_GIO_REST, IRIS_GIO_OUTPUT);
57}
58
59/*
60 * Before installing the power_off handler, try to make sure the OS is
61 * running on an Iris. Since Iris does not support DMI, this is done
62 * by reading its input port and seeing whether the read value is
63 * meaningful.
64 */
65static int iris_init(void)
66{
67 unsigned char status;
68 if (force != 1) {
69 printk(KERN_ERR "The force parameter has not been set to 1 so the Iris poweroff handler will not be installed.\n");
70 return -ENODEV;
71 }
72 status = inb(IRIS_GIO_INPUT);
73 if (status == IRIS_GIO_NODEV) {
74 printk(KERN_ERR "This machine does not seem to be an Iris. Power_off handler not installed.\n");
75 return -ENODEV;
76 }
77 old_pm_power_off = pm_power_off;
78 pm_power_off = &iris_power_off;
79 printk(KERN_INFO "Iris power_off handler installed.\n");
80
81 return 0;
82}
83
84static void iris_exit(void)
85{
86 pm_power_off = old_pm_power_off;
87 printk(KERN_INFO "Iris power_off handler uninstalled.\n");
88}
89
90module_init(iris_init);
91module_exit(iris_exit);
diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile
index efbbc552fa95..f61ccdd49341 100644
--- a/arch/x86/platform/mrst/Makefile
+++ b/arch/x86/platform/mrst/Makefile
@@ -1 +1,3 @@
1obj-$(CONFIG_X86_MRST) += mrst.o 1obj-$(CONFIG_X86_MRST) += mrst.o
2obj-$(CONFIG_X86_MRST) += vrtc.o
3obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o
diff --git a/arch/x86/kernel/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c
index 65df603622b2..65df603622b2 100644
--- a/arch/x86/kernel/early_printk_mrst.c
+++ b/arch/x86/platform/mrst/early_printk_mrst.c
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index 79ae68154e87..fee0b4914e07 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -9,9 +9,19 @@
9 * as published by the Free Software Foundation; version 2 9 * as published by the Free Software Foundation; version 2
10 * of the License. 10 * of the License.
11 */ 11 */
12
13#define pr_fmt(fmt) "mrst: " fmt
14
12#include <linux/init.h> 15#include <linux/init.h>
13#include <linux/kernel.h> 16#include <linux/kernel.h>
14#include <linux/sfi.h> 17#include <linux/sfi.h>
18#include <linux/intel_pmic_gpio.h>
19#include <linux/spi/spi.h>
20#include <linux/i2c.h>
21#include <linux/i2c/pca953x.h>
22#include <linux/gpio_keys.h>
23#include <linux/input.h>
24#include <linux/platform_device.h>
15#include <linux/irq.h> 25#include <linux/irq.h>
16#include <linux/module.h> 26#include <linux/module.h>
17 27
@@ -23,7 +33,9 @@
23#include <asm/mrst.h> 33#include <asm/mrst.h>
24#include <asm/io.h> 34#include <asm/io.h>
25#include <asm/i8259.h> 35#include <asm/i8259.h>
36#include <asm/intel_scu_ipc.h>
26#include <asm/apb_timer.h> 37#include <asm/apb_timer.h>
38#include <asm/reboot.h>
27 39
28/* 40/*
29 * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, 41 * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock,
@@ -102,10 +114,10 @@ static int __init sfi_parse_mtmr(struct sfi_table_header *table)
102 memcpy(sfi_mtimer_array, pentry, totallen); 114 memcpy(sfi_mtimer_array, pentry, totallen);
103 } 115 }
104 116
105 printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num); 117 pr_debug("SFI MTIMER info (num = %d):\n", sfi_mtimer_num);
106 pentry = sfi_mtimer_array; 118 pentry = sfi_mtimer_array;
107 for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) { 119 for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) {
108 printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz," 120 pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz,"
109 " irq = %d\n", totallen, (u32)pentry->phys_addr, 121 " irq = %d\n", totallen, (u32)pentry->phys_addr,
110 pentry->freq_hz, pentry->irq); 122 pentry->freq_hz, pentry->irq);
111 if (!pentry->irq) 123 if (!pentry->irq)
@@ -176,14 +188,14 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
176 memcpy(sfi_mrtc_array, pentry, totallen); 188 memcpy(sfi_mrtc_array, pentry, totallen);
177 } 189 }
178 190
179 printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num); 191 pr_debug("SFI RTC info (num = %d):\n", sfi_mrtc_num);
180 pentry = sfi_mrtc_array; 192 pentry = sfi_mrtc_array;
181 for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) { 193 for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
182 printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n", 194 pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n",
183 totallen, (u32)pentry->phys_addr, pentry->irq); 195 totallen, (u32)pentry->phys_addr, pentry->irq);
184 mp_irq.type = MP_IOAPIC; 196 mp_irq.type = MP_IOAPIC;
185 mp_irq.irqtype = mp_INT; 197 mp_irq.irqtype = mp_INT;
186 mp_irq.irqflag = 0; 198 mp_irq.irqflag = 0xf; /* level trigger and active low */
187 mp_irq.srcbus = 0; 199 mp_irq.srcbus = 0;
188 mp_irq.srcbusirq = pentry->irq; /* IRQ */ 200 mp_irq.srcbusirq = pentry->irq; /* IRQ */
189 mp_irq.dstapic = MP_APIC_ALL; 201 mp_irq.dstapic = MP_APIC_ALL;
@@ -209,6 +221,7 @@ static unsigned long __init mrst_calibrate_tsc(void)
209 221
210void __init mrst_time_init(void) 222void __init mrst_time_init(void)
211{ 223{
224 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
212 switch (mrst_timer_options) { 225 switch (mrst_timer_options) {
213 case MRST_TIMER_APBT_ONLY: 226 case MRST_TIMER_APBT_ONLY:
214 break; 227 break;
@@ -224,16 +237,10 @@ void __init mrst_time_init(void)
224 return; 237 return;
225 } 238 }
226 /* we need at least one APB timer */ 239 /* we need at least one APB timer */
227 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
228 pre_init_apic_IRQ0(); 240 pre_init_apic_IRQ0();
229 apbt_time_init(); 241 apbt_time_init();
230} 242}
231 243
232void __init mrst_rtc_init(void)
233{
234 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
235}
236
237void __cpuinit mrst_arch_setup(void) 244void __cpuinit mrst_arch_setup(void)
238{ 245{
239 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) 246 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
@@ -256,6 +263,17 @@ static int mrst_i8042_detect(void)
256 return 0; 263 return 0;
257} 264}
258 265
266/* Reboot and power off are handled by the SCU on a MID device */
267static void mrst_power_off(void)
268{
269 intel_scu_ipc_simple_command(0xf1, 1);
270}
271
272static void mrst_reboot(void)
273{
274 intel_scu_ipc_simple_command(0xf1, 0);
275}
276
259/* 277/*
260 * Moorestown specific x86_init function overrides and early setup 278 * Moorestown specific x86_init function overrides and early setup
261 * calls. 279 * calls.
@@ -281,6 +299,10 @@ void __init x86_mrst_early_setup(void)
281 299
282 legacy_pic = &null_legacy_pic; 300 legacy_pic = &null_legacy_pic;
283 301
302 /* Moorestown specific power_off/restart method */
303 pm_power_off = mrst_power_off;
304 machine_ops.emergency_restart = mrst_reboot;
305
284 /* Avoid searching for BIOS MP tables */ 306 /* Avoid searching for BIOS MP tables */
285 x86_init.mpparse.find_smp_config = x86_init_noop; 307 x86_init.mpparse.find_smp_config = x86_init_noop;
286 x86_init.mpparse.get_smp_config = x86_init_uint_noop; 308 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
@@ -309,3 +331,505 @@ static inline int __init setup_x86_mrst_timer(char *arg)
309 return 0; 331 return 0;
310} 332}
311__setup("x86_mrst_timer=", setup_x86_mrst_timer); 333__setup("x86_mrst_timer=", setup_x86_mrst_timer);
334
335/*
336 * Parse the GPIO table first, since the DEVS table needs it to map
337 * pin names to actual pins.
338 */
339static struct sfi_gpio_table_entry *gpio_table;
340static int gpio_num_entry;
341
342static int __init sfi_parse_gpio(struct sfi_table_header *table)
343{
344 struct sfi_table_simple *sb;
345 struct sfi_gpio_table_entry *pentry;
346 int num, i;
347
348 if (gpio_table)
349 return 0;
350 sb = (struct sfi_table_simple *)table;
351 num = SFI_GET_NUM_ENTRIES(sb, struct sfi_gpio_table_entry);
352 pentry = (struct sfi_gpio_table_entry *)sb->pentry;
353
354 gpio_table = (struct sfi_gpio_table_entry *)
355 kmalloc(num * sizeof(*pentry), GFP_KERNEL);
356 if (!gpio_table)
357 return -1;
358 memcpy(gpio_table, pentry, num * sizeof(*pentry));
359 gpio_num_entry = num;
360
361 pr_debug("GPIO pin info:\n");
362 for (i = 0; i < num; i++, pentry++)
363 pr_debug("info[%2d]: controller = %16.16s, pin_name = %16.16s,"
364 " pin = %d\n", i,
365 pentry->controller_name,
366 pentry->pin_name,
367 pentry->pin_no);
368 return 0;
369}
370
371static int get_gpio_by_name(const char *name)
372{
373 struct sfi_gpio_table_entry *pentry = gpio_table;
374 int i;
375
376 if (!pentry)
377 return -1;
378 for (i = 0; i < gpio_num_entry; i++, pentry++) {
379 if (!strncmp(name, pentry->pin_name, SFI_NAME_LEN))
380 return pentry->pin_no;
381 }
382 return -1;
383}
384
385/*
386 * This array defines the platform data of the devices that the IAFW
387 * exports through the SFI "DEVS" table; we use name and type to match
388 * a device to its platform data.
389 */
390struct devs_id {
391 char name[SFI_NAME_LEN + 1];
392 u8 type;
393 u8 delay;
394 void *(*get_platform_data)(void *info);
395};
396
397/* the offset for the mapping of global gpio pin to irq */
398#define MRST_IRQ_OFFSET 0x100
399
400static void __init *pmic_gpio_platform_data(void *info)
401{
402 static struct intel_pmic_gpio_platform_data pmic_gpio_pdata;
403 int gpio_base = get_gpio_by_name("pmic_gpio_base");
404
405 if (gpio_base == -1)
406 gpio_base = 64;
407 pmic_gpio_pdata.gpio_base = gpio_base;
408 pmic_gpio_pdata.irq_base = gpio_base + MRST_IRQ_OFFSET;
409 pmic_gpio_pdata.gpiointr = 0xffffeff8;
410
411 return &pmic_gpio_pdata;
412}
413
414static void __init *max3111_platform_data(void *info)
415{
416 struct spi_board_info *spi_info = info;
417 int intr = get_gpio_by_name("max3111_int");
418
419 if (intr == -1)
420 return NULL;
421 spi_info->irq = intr + MRST_IRQ_OFFSET;
422 return NULL;
423}
424
425/* we have multiple max7315s on the board ... */
426#define MAX7315_NUM 2
427static void __init *max7315_platform_data(void *info)
428{
429 static struct pca953x_platform_data max7315_pdata[MAX7315_NUM];
430 static int nr;
431 struct pca953x_platform_data *max7315 = &max7315_pdata[nr];
432 struct i2c_board_info *i2c_info = info;
433 int gpio_base, intr;
434 char base_pin_name[SFI_NAME_LEN + 1];
435 char intr_pin_name[SFI_NAME_LEN + 1];
436
437 if (nr == MAX7315_NUM) {
438 pr_err("too many max7315s, we only support %d\n",
439 MAX7315_NUM);
440 return NULL;
441 }
442 /* we have several max7315s on the board; we only need to load
443 * several instances of the same pca953x driver to cover them
444 */
445 strcpy(i2c_info->type, "max7315");
446 if (nr++) {
447 sprintf(base_pin_name, "max7315_%d_base", nr);
448 sprintf(intr_pin_name, "max7315_%d_int", nr);
449 } else {
450 strcpy(base_pin_name, "max7315_base");
451 strcpy(intr_pin_name, "max7315_int");
452 }
453
454 gpio_base = get_gpio_by_name(base_pin_name);
455 intr = get_gpio_by_name(intr_pin_name);
456
457 if (gpio_base == -1)
458 return NULL;
459 max7315->gpio_base = gpio_base;
460 if (intr != -1) {
461 i2c_info->irq = intr + MRST_IRQ_OFFSET;
462 max7315->irq_base = gpio_base + MRST_IRQ_OFFSET;
463 } else {
464 i2c_info->irq = -1;
465 max7315->irq_base = -1;
466 }
467 return max7315;
468}
469
470static void __init *emc1403_platform_data(void *info)
471{
472 static short intr2nd_pdata;
473 struct i2c_board_info *i2c_info = info;
474 int intr = get_gpio_by_name("thermal_int");
475 int intr2nd = get_gpio_by_name("thermal_alert");
476
477 if (intr == -1 || intr2nd == -1)
478 return NULL;
479
480 i2c_info->irq = intr + MRST_IRQ_OFFSET;
481 intr2nd_pdata = intr2nd + MRST_IRQ_OFFSET;
482
483 return &intr2nd_pdata;
484}
485
486static void __init *lis331dl_platform_data(void *info)
487{
488 static short intr2nd_pdata;
489 struct i2c_board_info *i2c_info = info;
490 int intr = get_gpio_by_name("accel_int");
491 int intr2nd = get_gpio_by_name("accel_2");
492
493 if (intr == -1 || intr2nd == -1)
494 return NULL;
495
496 i2c_info->irq = intr + MRST_IRQ_OFFSET;
497 intr2nd_pdata = intr2nd + MRST_IRQ_OFFSET;
498
499 return &intr2nd_pdata;
500}
501
502static void __init *no_platform_data(void *info)
503{
504 return NULL;
505}
506
507static const struct devs_id __initconst device_ids[] = {
508 {"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data},
509 {"spi_max3111", SFI_DEV_TYPE_SPI, 0, &max3111_platform_data},
510 {"i2c_max7315", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data},
511 {"i2c_max7315_2", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data},
512 {"emc1403", SFI_DEV_TYPE_I2C, 1, &emc1403_platform_data},
513 {"i2c_accel", SFI_DEV_TYPE_I2C, 0, &lis331dl_platform_data},
514 {"pmic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data},
515 {"msic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data},
516 {},
517};
518
519#define MAX_IPCDEVS 24
520static struct platform_device *ipc_devs[MAX_IPCDEVS];
521static int ipc_next_dev;
522
523#define MAX_SCU_SPI 24
524static struct spi_board_info *spi_devs[MAX_SCU_SPI];
525static int spi_next_dev;
526
527#define MAX_SCU_I2C 24
528static struct i2c_board_info *i2c_devs[MAX_SCU_I2C];
529static int i2c_bus[MAX_SCU_I2C];
530static int i2c_next_dev;
531
532static void __init intel_scu_device_register(struct platform_device *pdev)
533{
534 if (ipc_next_dev == MAX_IPCDEVS)
535 pr_err("too many SCU IPC devices\n");
536 else
537 ipc_devs[ipc_next_dev++] = pdev;
538}
539
540static void __init intel_scu_spi_device_register(struct spi_board_info *sdev)
541{
542 struct spi_board_info *new_dev;
543
544 if (spi_next_dev == MAX_SCU_SPI) {
545 pr_err("too many SCU SPI devices\n");
546 return;
547 }
548
549 new_dev = kzalloc(sizeof(*sdev), GFP_KERNEL);
550 if (!new_dev) {
551 pr_err("failed to alloc mem for delayed spi dev %s\n",
552 sdev->modalias);
553 return;
554 }
555 memcpy(new_dev, sdev, sizeof(*sdev));
556
557 spi_devs[spi_next_dev++] = new_dev;
558}
559
560static void __init intel_scu_i2c_device_register(int bus,
561 struct i2c_board_info *idev)
562{
563 struct i2c_board_info *new_dev;
564
565 if (i2c_next_dev == MAX_SCU_I2C) {
566 pr_err("too many SCU I2C devices\n");
567 return;
568 }
569
570 new_dev = kzalloc(sizeof(*idev), GFP_KERNEL);
571 if (!new_dev) {
572 pr_err("failed to alloc mem for delayed i2c dev %s\n",
573 idev->type);
574 return;
575 }
576 memcpy(new_dev, idev, sizeof(*idev));
577
578 i2c_bus[i2c_next_dev] = bus;
579 i2c_devs[i2c_next_dev++] = new_dev;
580}
581
582/* Called by IPC driver */
583void intel_scu_devices_create(void)
584{
585 int i;
586
587 for (i = 0; i < ipc_next_dev; i++)
588 platform_device_add(ipc_devs[i]);
589
590 for (i = 0; i < spi_next_dev; i++)
591 spi_register_board_info(spi_devs[i], 1);
592
593 for (i = 0; i < i2c_next_dev; i++) {
594 struct i2c_adapter *adapter;
595 struct i2c_client *client;
596
597 adapter = i2c_get_adapter(i2c_bus[i]);
598 if (adapter) {
599 client = i2c_new_device(adapter, i2c_devs[i]);
600 if (!client)
601 pr_err("can't create i2c device %s\n",
602 i2c_devs[i]->type);
603 } else
604 i2c_register_board_info(i2c_bus[i], i2c_devs[i], 1);
605 }
606}
607EXPORT_SYMBOL_GPL(intel_scu_devices_create);
608
609/* Called by IPC driver */
610void intel_scu_devices_destroy(void)
611{
612 int i;
613
614 for (i = 0; i < ipc_next_dev; i++)
615 platform_device_del(ipc_devs[i]);
616}
617EXPORT_SYMBOL_GPL(intel_scu_devices_destroy);
618
619static void __init install_irq_resource(struct platform_device *pdev, int irq)
620{
621 /* Single threaded */
622 static struct resource __initdata res = {
623 .name = "IRQ",
624 .flags = IORESOURCE_IRQ,
625 };
626 res.start = irq;
627 platform_device_add_resources(pdev, &res, 1);
628}
629
630static void __init sfi_handle_ipc_dev(struct platform_device *pdev)
631{
632 const struct devs_id *dev = device_ids;
633 void *pdata = NULL;
634
635 while (dev->name[0]) {
636 if (dev->type == SFI_DEV_TYPE_IPC &&
637 !strncmp(dev->name, pdev->name, SFI_NAME_LEN)) {
638 pdata = dev->get_platform_data(pdev);
639 break;
640 }
641 dev++;
642 }
643 pdev->dev.platform_data = pdata;
644 intel_scu_device_register(pdev);
645}
646
647static void __init sfi_handle_spi_dev(struct spi_board_info *spi_info)
648{
649 const struct devs_id *dev = device_ids;
650 void *pdata = NULL;
651
652 while (dev->name[0]) {
653 if (dev->type == SFI_DEV_TYPE_SPI &&
654 !strncmp(dev->name, spi_info->modalias, SFI_NAME_LEN)) {
655 pdata = dev->get_platform_data(spi_info);
656 break;
657 }
658 dev++;
659 }
660 spi_info->platform_data = pdata;
661 if (dev->delay)
662 intel_scu_spi_device_register(spi_info);
663 else
664 spi_register_board_info(spi_info, 1);
665}
666
667static void __init sfi_handle_i2c_dev(int bus, struct i2c_board_info *i2c_info)
668{
669 const struct devs_id *dev = device_ids;
670 void *pdata = NULL;
671
672 while (dev->name[0]) {
673 if (dev->type == SFI_DEV_TYPE_I2C &&
674 !strncmp(dev->name, i2c_info->type, SFI_NAME_LEN)) {
675 pdata = dev->get_platform_data(i2c_info);
676 break;
677 }
678 dev++;
679 }
680 i2c_info->platform_data = pdata;
681
682 if (dev->delay)
683 intel_scu_i2c_device_register(bus, i2c_info);
684 else
685 i2c_register_board_info(bus, i2c_info, 1);
686}
687
688
689static int __init sfi_parse_devs(struct sfi_table_header *table)
690{
691 struct sfi_table_simple *sb;
692 struct sfi_device_table_entry *pentry;
693 struct spi_board_info spi_info;
694 struct i2c_board_info i2c_info;
695 struct platform_device *pdev;
696 int num, i, bus;
697 int ioapic;
698 struct io_apic_irq_attr irq_attr;
699
700 sb = (struct sfi_table_simple *)table;
701 num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry);
702 pentry = (struct sfi_device_table_entry *)sb->pentry;
703
704 for (i = 0; i < num; i++, pentry++) {
705 if (pentry->irq != (u8)0xff) { /* native RTE case */
706 /* these SPI2 devices are not exposed to the system as PCI
707 * devices, but they have separate RTE entries in the IOAPIC,
708 * so we have to enable them one by one here
709 */
710 ioapic = mp_find_ioapic(pentry->irq);
711 irq_attr.ioapic = ioapic;
712 irq_attr.ioapic_pin = pentry->irq;
713 irq_attr.trigger = 1;
714 irq_attr.polarity = 1;
715 io_apic_set_pci_routing(NULL, pentry->irq, &irq_attr);
716 }
717 switch (pentry->type) {
718 case SFI_DEV_TYPE_IPC:
719 /* ID as IRQ is a hack that will go away */
720 pdev = platform_device_alloc(pentry->name, pentry->irq);
721 if (pdev == NULL) {
722 pr_err("out of memory for SFI platform device '%s'.\n",
723 pentry->name);
724 continue;
725 }
726 install_irq_resource(pdev, pentry->irq);
727 pr_debug("info[%2d]: IPC bus, name = %16.16s, "
728 "irq = 0x%2x\n", i, pentry->name, pentry->irq);
729 sfi_handle_ipc_dev(pdev);
730 break;
731 case SFI_DEV_TYPE_SPI:
732 memset(&spi_info, 0, sizeof(spi_info));
733 strncpy(spi_info.modalias, pentry->name, SFI_NAME_LEN);
734 spi_info.irq = pentry->irq;
735 spi_info.bus_num = pentry->host_num;
736 spi_info.chip_select = pentry->addr;
737 spi_info.max_speed_hz = pentry->max_freq;
738 pr_debug("info[%2d]: SPI bus = %d, name = %16.16s, "
739 "irq = 0x%2x, max_freq = %d, cs = %d\n", i,
740 spi_info.bus_num,
741 spi_info.modalias,
742 spi_info.irq,
743 spi_info.max_speed_hz,
744 spi_info.chip_select);
745 sfi_handle_spi_dev(&spi_info);
746 break;
747 case SFI_DEV_TYPE_I2C:
748 memset(&i2c_info, 0, sizeof(i2c_info));
749 bus = pentry->host_num;
750 strncpy(i2c_info.type, pentry->name, SFI_NAME_LEN);
751 i2c_info.irq = pentry->irq;
752 i2c_info.addr = pentry->addr;
753 pr_debug("info[%2d]: I2C bus = %d, name = %16.16s, "
754 "irq = 0x%2x, addr = 0x%x\n", i, bus,
755 i2c_info.type,
756 i2c_info.irq,
757 i2c_info.addr);
758 sfi_handle_i2c_dev(bus, &i2c_info);
759 break;
760 case SFI_DEV_TYPE_UART:
761 case SFI_DEV_TYPE_HSI:
762 default:
763 ;
764 }
765 }
766 return 0;
767}
768
769static int __init mrst_platform_init(void)
770{
771 sfi_table_parse(SFI_SIG_GPIO, NULL, NULL, sfi_parse_gpio);
772 sfi_table_parse(SFI_SIG_DEVS, NULL, NULL, sfi_parse_devs);
773 return 0;
774}
775arch_initcall(mrst_platform_init);
776
777/*
778 * We will look these buttons up in the SFI GPIO table (by name)
779 * and register them dynamically. Please add all possible buttons
780 * here; the list is shrunk to those whose GPIO is actually found.
781 */
782static struct gpio_keys_button gpio_button[] = {
783 {KEY_POWER, -1, 1, "power_btn", EV_KEY, 0, 3000},
784 {KEY_PROG1, -1, 1, "prog_btn1", EV_KEY, 0, 20},
785 {KEY_PROG2, -1, 1, "prog_btn2", EV_KEY, 0, 20},
786 {SW_LID, -1, 1, "lid_switch", EV_SW, 0, 20},
787 {KEY_VOLUMEUP, -1, 1, "vol_up", EV_KEY, 0, 20},
788 {KEY_VOLUMEDOWN, -1, 1, "vol_down", EV_KEY, 0, 20},
789 {KEY_CAMERA, -1, 1, "camera_full", EV_KEY, 0, 20},
790 {KEY_CAMERA_FOCUS, -1, 1, "camera_half", EV_KEY, 0, 20},
791 {SW_KEYPAD_SLIDE, -1, 1, "MagSw1", EV_SW, 0, 20},
792 {SW_KEYPAD_SLIDE, -1, 1, "MagSw2", EV_SW, 0, 20},
793};
794
795static struct gpio_keys_platform_data mrst_gpio_keys = {
796 .buttons = gpio_button,
797 .rep = 1,
798 .nbuttons = -1, /* will fill it after search */
799};
800
801static struct platform_device pb_device = {
802 .name = "gpio-keys",
803 .id = -1,
804 .dev = {
805 .platform_data = &mrst_gpio_keys,
806 },
807};
808
809/*
810 * Drop the buttons whose GPIO was not found and register the
811 * gpio button device if any remain.
812 */
813static int __init pb_keys_init(void)
814{
815 struct gpio_keys_button *gb = gpio_button;
816 int i, num, good = 0;
817
818 num = ARRAY_SIZE(gpio_button);
819 for (i = 0; i < num; i++) {
820 gb[i].gpio = get_gpio_by_name(gb[i].desc);
821 if (gb[i].gpio == -1)
822 continue;
823
824 if (i != good)
825 gb[good] = gb[i];
826 good++;
827 }
828
829 if (good) {
830 mrst_gpio_keys.nbuttons = good;
831 return platform_device_register(&pb_device);
832 }
833 return 0;
834}
835late_initcall(pb_keys_init);
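
pb_keys_init() compacts gpio_button[] in place, keeping only the entries whose GPIO lookup succeeded. The same pattern on hypothetical data:

#include <stdio.h>

int main(void)
{
	/* -1 marks a button whose GPIO lookup failed (hypothetical data). */
	int gpio[] = { 12, -1, 7, -1, 3 };
	int i, good = 0, num = sizeof(gpio) / sizeof(gpio[0]);

	for (i = 0; i < num; i++) {
		if (gpio[i] == -1)
			continue;
		if (i != good)
			gpio[good] = gpio[i];
		good++;
	}

	for (i = 0; i < good; i++)
		printf("%d ", gpio[i]);	/* prints: 12 7 3 */
	printf("\n");
	return 0;
}
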
diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c
new file mode 100644
index 000000000000..32cd7edd71a0
--- /dev/null
+++ b/arch/x86/platform/mrst/vrtc.c
@@ -0,0 +1,165 @@
1/*
2 * vrtc.c: Driver for virtual RTC device on Intel MID platform
3 *
4 * (C) Copyright 2009 Intel Corporation
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; version 2
9 * of the License.
10 *
11 * Note:
12 * VRTC is emulated by system controller firmware, the real HW
13 * RTC is located in the PMIC device. SCU FW shadows PMIC RTC
14 * in a memory mapped IO space that is visible to the host IA
15 * processor.
16 *
17 * This driver is based on RTC CMOS driver.
18 */
19
20#include <linux/kernel.h>
21#include <linux/init.h>
22#include <linux/sfi.h>
23#include <linux/platform_device.h>
24
25#include <asm/mrst.h>
26#include <asm/mrst-vrtc.h>
27#include <asm/time.h>
28#include <asm/fixmap.h>
29
30static unsigned char __iomem *vrtc_virt_base;
31
32unsigned char vrtc_cmos_read(unsigned char reg)
33{
34 unsigned char retval;
35
36 /* vRTC's registers range from 0x0 to 0xD */
37 if (reg > 0xd || !vrtc_virt_base)
38 return 0xff;
39
40 lock_cmos_prefix(reg);
41 retval = __raw_readb(vrtc_virt_base + (reg << 2));
42 lock_cmos_suffix(reg);
43 return retval;
44}
45EXPORT_SYMBOL_GPL(vrtc_cmos_read);
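
The (reg << 2) in vrtc_cmos_read()/vrtc_cmos_write() reflects how the SCU firmware lays out its shadow page: each 8-bit CMOS register occupies its own 32-bit slot, so register N sits at byte offset 4*N. A quick illustration, using the standard MC146818 register indices:

#include <stdio.h>

/* MC146818 register indices, as in the CMOS RTC convention. */
#define RTC_SECONDS 0
#define RTC_MINUTES 2
#define RTC_HOURS   4

int main(void)
{
	/* One 8-bit register per 32-bit slot: byte offset = reg << 2. */
	printf("SECONDS at +%d, MINUTES at +%d, HOURS at +%d\n",
	       RTC_SECONDS << 2, RTC_MINUTES << 2, RTC_HOURS << 2);
	return 0;	/* SECONDS at +0, MINUTES at +8, HOURS at +16 */
}
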
46
47void vrtc_cmos_write(unsigned char val, unsigned char reg)
48{
49 if (reg > 0xd || !vrtc_virt_base)
50 return;
51
52 lock_cmos_prefix(reg);
53 __raw_writeb(val, vrtc_virt_base + (reg << 2));
54 lock_cmos_suffix(reg);
55}
56EXPORT_SYMBOL_GPL(vrtc_cmos_write);
57
58unsigned long vrtc_get_time(void)
59{
60 u8 sec, min, hour, mday, mon;
61 u32 year;
62
63 while ((vrtc_cmos_read(RTC_FREQ_SELECT) & RTC_UIP))
64 cpu_relax();
65
66 sec = vrtc_cmos_read(RTC_SECONDS);
67 min = vrtc_cmos_read(RTC_MINUTES);
68 hour = vrtc_cmos_read(RTC_HOURS);
69 mday = vrtc_cmos_read(RTC_DAY_OF_MONTH);
70 mon = vrtc_cmos_read(RTC_MONTH);
71 year = vrtc_cmos_read(RTC_YEAR);
72
73 /* vRTC YEAR reg contains the offset to 1960 */
74 year += 1960;
75
76 printk(KERN_INFO "vRTC: sec: %d min: %d hour: %d day: %d "
77 "mon: %d year: %d\n", sec, min, hour, mday, mon, year);
78
79 return mktime(year, mon, mday, hour, min, sec);
80}
81
82/* Only care about the minutes and seconds */
83int vrtc_set_mmss(unsigned long nowtime)
84{
85 int real_sec, real_min;
86 int vrtc_min;
87
88 vrtc_min = vrtc_cmos_read(RTC_MINUTES);
89
90 real_sec = nowtime % 60;
91 real_min = nowtime / 60;
92 if (((abs(real_min - vrtc_min) + 15)/30) & 1)
93 real_min += 30;
94 real_min %= 60;
95
96 vrtc_cmos_write(real_sec, RTC_SECONDS);
97 vrtc_cmos_write(real_min, RTC_MINUTES);
98 return 0;
99}
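
The rounding test in vrtc_set_mmss() fires when the wall-clock minute and the vRTC minute differ by 15..44 (mod 60), i.e. when the RTC appears offset by roughly half an hour (as with half-hour time zones), and then shifts the minute to be written by 30. A standalone trace with hypothetical values:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	long nowtime  = 1293840000L + 37 * 60 + 12;	/* hypothetical epoch time */
	int  vrtc_min = 5;			/* minute the vRTC shows */
	int  real_sec = nowtime % 60;		/* 12 */
	long real_min = nowtime / 60;

	/* Fires when the two minute values differ by 15..44 (mod 60): */
	if (((labs(real_min - vrtc_min) + 15) / 30) & 1)
		real_min += 30;
	real_min %= 60;

	printf("write %02ld:%02d\n", real_min, real_sec);	/* write 07:12 */
	return 0;
}
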
100
101void __init mrst_rtc_init(void)
102{
103 unsigned long rtc_paddr;
104 void __iomem *virt_base;
105
106 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
107 if (!sfi_mrtc_num)
108 return;
109
110 rtc_paddr = sfi_mrtc_array[0].phys_addr;
111
112 /* vRTC's register address may not be page aligned */
113 set_fixmap_nocache(FIX_LNW_VRTC, rtc_paddr);
114
115 virt_base = (void __iomem *)__fix_to_virt(FIX_LNW_VRTC);
116 virt_base += rtc_paddr & ~PAGE_MASK;
117 vrtc_virt_base = virt_base;
118
119 x86_platform.get_wallclock = vrtc_get_time;
120 x86_platform.set_wallclock = vrtc_set_mmss;
121}
122
123/*
124 * The Moorestown platform has a memory mapped virtual RTC device that emulates
125 * the programming interface of the RTC.
126 */
127
128static struct resource vrtc_resources[] = {
129 [0] = {
130 .flags = IORESOURCE_MEM,
131 },
132 [1] = {
133 .flags = IORESOURCE_IRQ,
134 }
135};
136
137static struct platform_device vrtc_device = {
138 .name = "rtc_mrst",
139 .id = -1,
140 .resource = vrtc_resources,
141 .num_resources = ARRAY_SIZE(vrtc_resources),
142};
143
144/* Register the RTC device if appropriate */
145static int __init mrst_device_create(void)
146{
147 /* No Moorestown, no device */
148 if (!mrst_identify_cpu())
149 return -ENODEV;
150 /* No RTC, no device */
151 if (!sfi_mrtc_num)
152 return -ENODEV;
153
154 /* iomem resource */
155 vrtc_resources[0].start = sfi_mrtc_array[0].phys_addr;
156 vrtc_resources[0].end = sfi_mrtc_array[0].phys_addr +
157 MRST_VRTC_MAP_SZ;
158 /* irq resource */
159 vrtc_resources[1].start = sfi_mrtc_array[0].irq;
160 vrtc_resources[1].end = sfi_mrtc_array[0].irq;
161
162 return platform_device_register(&vrtc_device);
163}
164
165module_init(mrst_device_create);
diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c
index dd4c281ffe57..ca54875ac795 100644
--- a/arch/x86/platform/sfi/sfi.c
+++ b/arch/x86/platform/sfi/sfi.c
@@ -48,9 +48,9 @@ static void __init mp_sfi_register_lapic_address(unsigned long address)
48/* All CPUs enumerated by SFI must be present and enabled */ 48/* All CPUs enumerated by SFI must be present and enabled */
49static void __cpuinit mp_sfi_register_lapic(u8 id) 49static void __cpuinit mp_sfi_register_lapic(u8 id)
50{ 50{
51 if (MAX_APICS - id <= 0) { 51 if (MAX_LOCAL_APIC - id <= 0) {
52 pr_warning("Processor #%d invalid (max %d)\n", 52 pr_warning("Processor #%d invalid (max %d)\n",
53 id, MAX_APICS); 53 id, MAX_LOCAL_APIC);
54 return; 54 return;
55 } 55 }
56 56
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index ba9caa808a9c..df58e9cad96a 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1341,7 +1341,7 @@ uv_activation_descriptor_init(int node, int pnode)
1341 1341
1342 /* 1342 /*
1343 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) 1343 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
1344 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub 1344 * per cpu; and one per cpu on the uvhub (UV_ADP_SIZE)
1345 */ 1345 */
1346 bau_desc = kmalloc_node(sizeof(struct bau_desc) * UV_ADP_SIZE 1346 bau_desc = kmalloc_node(sizeof(struct bau_desc) * UV_ADP_SIZE
1347 * UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); 1347 * UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
@@ -1490,7 +1490,7 @@ calculate_destination_timeout(void)
1490/* 1490/*
1491 * initialize the bau_control structure for each cpu 1491 * initialize the bau_control structure for each cpu
1492 */ 1492 */
1493static void __init uv_init_per_cpu(int nuvhubs) 1493static int __init uv_init_per_cpu(int nuvhubs)
1494{ 1494{
1495 int i; 1495 int i;
1496 int cpu; 1496 int cpu;
@@ -1507,7 +1507,7 @@ static void __init uv_init_per_cpu(int nuvhubs)
1507 struct bau_control *smaster = NULL; 1507 struct bau_control *smaster = NULL;
1508 struct socket_desc { 1508 struct socket_desc {
1509 short num_cpus; 1509 short num_cpus;
1510 short cpu_number[16]; 1510 short cpu_number[MAX_CPUS_PER_SOCKET];
1511 }; 1511 };
1512 struct uvhub_desc { 1512 struct uvhub_desc {
1513 unsigned short socket_mask; 1513 unsigned short socket_mask;
@@ -1540,6 +1540,10 @@ static void __init uv_init_per_cpu(int nuvhubs)
1540 sdp = &bdp->socket[socket]; 1540 sdp = &bdp->socket[socket];
1541 sdp->cpu_number[sdp->num_cpus] = cpu; 1541 sdp->cpu_number[sdp->num_cpus] = cpu;
1542 sdp->num_cpus++; 1542 sdp->num_cpus++;
1543 if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) {
1544 printk(KERN_EMERG "%d cpus per socket invalid\n", sdp->num_cpus);
1545 return 1;
1546 }
1543 } 1547 }
1544 for (uvhub = 0; uvhub < nuvhubs; uvhub++) { 1548 for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
1545 if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8)))) 1549 if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
@@ -1570,6 +1574,12 @@ static void __init uv_init_per_cpu(int nuvhubs)
1570 bcp->uvhub_master = hmaster; 1574 bcp->uvhub_master = hmaster;
1571 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)-> 1575 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->
1572 blade_processor_id; 1576 blade_processor_id;
1577 if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
1578 printk(KERN_EMERG
1579 "%d cpus per uvhub invalid\n",
1580 bcp->uvhub_cpu);
1581 return 1;
1582 }
1573 } 1583 }
1574nextsocket: 1584nextsocket:
1575 socket++; 1585 socket++;
@@ -1595,6 +1605,7 @@ nextsocket:
1595 bcp->congested_reps = congested_reps; 1605 bcp->congested_reps = congested_reps;
1596 bcp->congested_period = congested_period; 1606 bcp->congested_period = congested_period;
1597 } 1607 }
1608 return 0;
1598} 1609}
1599 1610
1600/* 1611/*
@@ -1625,7 +1636,10 @@ static int __init uv_bau_init(void)
1625 spin_lock_init(&disable_lock); 1636 spin_lock_init(&disable_lock);
1626 congested_cycles = microsec_2_cycles(congested_response_us); 1637 congested_cycles = microsec_2_cycles(congested_response_us);
1627 1638
1628 uv_init_per_cpu(nuvhubs); 1639 if (uv_init_per_cpu(nuvhubs)) {
1640 nobau = 1;
1641 return 0;
1642 }
1629 1643
1630 uv_partition_base_pnode = 0x7fffffff; 1644 uv_partition_base_pnode = 0x7fffffff;
1631 for (uvhub = 0; uvhub < nuvhubs; uvhub++) 1645 for (uvhub = 0; uvhub < nuvhubs; uvhub++)
diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c
index 3371bd053b89..632037671746 100644
--- a/arch/x86/platform/visws/visws_quirks.c
+++ b/arch/x86/platform/visws/visws_quirks.c
@@ -171,7 +171,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
171 ver = m->apicver; 171 ver = m->apicver;
172 if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) { 172 if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) {
173 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", 173 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
174 m->apicid, MAX_APICS); 174 m->apicid, MAX_LOCAL_APIC);
175 return; 175 return;
176 } 176 }
177 177
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 5718566e00f9..d9926afec110 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -275,13 +275,23 @@ acpi_table_parse_srat(enum acpi_srat_type id,
275int __init acpi_numa_init(void) 275int __init acpi_numa_init(void)
276{ 276{
277 int ret = 0; 277 int ret = 0;
278 int nr_cpu_entries = nr_cpu_ids;
279
280#ifdef CONFIG_X86
281 /*
282 * Should not limit number with cpu num that is from NR_CPUS or nr_cpus=
283 * SRAT cpu entries could have different order with that in MADT.
284 * So go over all cpu entries in SRAT to get apicid to node mapping.
285 */
286 nr_cpu_entries = MAX_LOCAL_APIC;
287#endif
278 288
279 /* SRAT: Static Resource Affinity Table */ 289 /* SRAT: Static Resource Affinity Table */
280 if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { 290 if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
281 acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY, 291 acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY,
282 acpi_parse_x2apic_affinity, nr_cpu_ids); 292 acpi_parse_x2apic_affinity, nr_cpu_entries);
283 acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, 293 acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
284 acpi_parse_processor_affinity, nr_cpu_ids); 294 acpi_parse_processor_affinity, nr_cpu_entries);
285 ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, 295 ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
286 acpi_parse_memory_affinity, 296 acpi_parse_memory_affinity,
287 NR_NODE_MEMBLKS); 297 NR_NODE_MEMBLKS);
diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c
index 42396df55556..9252e85706ef 100644
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -38,7 +38,7 @@ static int agp_bridges_found;
38 38
39static void amd64_tlbflush(struct agp_memory *temp) 39static void amd64_tlbflush(struct agp_memory *temp)
40{ 40{
41 k8_flush_garts(); 41 amd_flush_garts();
42} 42}
43 43
44static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type) 44static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type)
@@ -124,7 +124,7 @@ static int amd64_fetch_size(void)
124 u32 temp; 124 u32 temp;
125 struct aper_size_info_32 *values; 125 struct aper_size_info_32 *values;
126 126
127 dev = k8_northbridges.nb_misc[0]; 127 dev = node_to_amd_nb(0)->misc;
128 if (dev==NULL) 128 if (dev==NULL)
129 return 0; 129 return 0;
130 130
@@ -181,16 +181,15 @@ static int amd_8151_configure(void)
181 unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real); 181 unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real);
182 int i; 182 int i;
183 183
184 if (!k8_northbridges.gart_supported) 184 if (!amd_nb_has_feature(AMD_NB_GART))
185 return 0; 185 return 0;
186 186
187 /* Configure AGP regs in each x86-64 host bridge. */ 187 /* Configure AGP regs in each x86-64 host bridge. */
188 for (i = 0; i < k8_northbridges.num; i++) { 188 for (i = 0; i < amd_nb_num(); i++) {
189 agp_bridge->gart_bus_addr = 189 agp_bridge->gart_bus_addr =
190 amd64_configure(k8_northbridges.nb_misc[i], 190 amd64_configure(node_to_amd_nb(i)->misc, gatt_bus);
191 gatt_bus);
192 } 191 }
193 k8_flush_garts(); 192 amd_flush_garts();
194 return 0; 193 return 0;
195} 194}
196 195
@@ -200,11 +199,11 @@ static void amd64_cleanup(void)
200 u32 tmp; 199 u32 tmp;
201 int i; 200 int i;
202 201
203 if (!k8_northbridges.gart_supported) 202 if (!amd_nb_has_feature(AMD_NB_GART))
204 return; 203 return;
205 204
206 for (i = 0; i < k8_northbridges.num; i++) { 205 for (i = 0; i < amd_nb_num(); i++) {
207 struct pci_dev *dev = k8_northbridges.nb_misc[i]; 206 struct pci_dev *dev = node_to_amd_nb(i)->misc;
208 /* disable gart translation */ 207 /* disable gart translation */
209 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &tmp); 208 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &tmp);
210 tmp &= ~GARTEN; 209 tmp &= ~GARTEN;
@@ -331,15 +330,15 @@ static __devinit int cache_nbs(struct pci_dev *pdev, u32 cap_ptr)
331{ 330{
332 int i; 331 int i;
333 332
334 if (cache_k8_northbridges() < 0) 333 if (amd_cache_northbridges() < 0)
335 return -ENODEV; 334 return -ENODEV;
336 335
337 if (!k8_northbridges.gart_supported) 336 if (!amd_nb_has_feature(AMD_NB_GART))
338 return -ENODEV; 337 return -ENODEV;
339 338
340 i = 0; 339 i = 0;
341 for (i = 0; i < k8_northbridges.num; i++) { 340 for (i = 0; i < amd_nb_num(); i++) {
342 struct pci_dev *dev = k8_northbridges.nb_misc[i]; 341 struct pci_dev *dev = node_to_amd_nb(i)->misc;
343 if (fix_northbridge(dev, pdev, cap_ptr) < 0) { 342 if (fix_northbridge(dev, pdev, cap_ptr) < 0) {
344 dev_err(&dev->dev, "no usable aperture found\n"); 343 dev_err(&dev->dev, "no usable aperture found\n");
345#ifdef __x86_64__ 344#ifdef __x86_64__
@@ -416,7 +415,7 @@ static int __devinit uli_agp_init(struct pci_dev *pdev)
416 } 415 }
417 416
418 /* shadow x86-64 registers into ULi registers */ 417 /* shadow x86-64 registers into ULi registers */
419 pci_read_config_dword (k8_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE, 418 pci_read_config_dword (node_to_amd_nb(0)->misc, AMD64_GARTAPERTUREBASE,
420 &httfea); 419 &httfea);
421 420
422 /* if x86-64 aperture base is beyond 4G, exit here */ 421 /* if x86-64 aperture base is beyond 4G, exit here */
@@ -484,7 +483,7 @@ static int nforce3_agp_init(struct pci_dev *pdev)
484 pci_write_config_dword(dev1, NVIDIA_X86_64_1_APSIZE, tmp); 483 pci_write_config_dword(dev1, NVIDIA_X86_64_1_APSIZE, tmp);
485 484
486 /* shadow x86-64 registers into NVIDIA registers */ 485 /* shadow x86-64 registers into NVIDIA registers */
487 pci_read_config_dword (k8_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE, 486 pci_read_config_dword (node_to_amd_nb(0)->misc, AMD64_GARTAPERTUREBASE,
488 &apbase); 487 &apbase);
489 488
490 /* if x86-64 aperture base is beyond 4G, exit here */ 489 /* if x86-64 aperture base is beyond 4G, exit here */
@@ -778,7 +777,7 @@ int __init agp_amd64_init(void)
778 } 777 }
779 778
780 /* First check that we have at least one AMD64 NB */ 779 /* First check that we have at least one AMD64 NB */
781 if (!pci_dev_present(k8_nb_ids)) 780 if (!pci_dev_present(amd_nb_misc_ids))
782 return -ENODEV; 781 return -ENODEV;
783 782
784 /* Look for any AGP bridge */ 783 /* Look for any AGP bridge */
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index eca9ba193e94..df211181fca4 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2917,7 +2917,7 @@ static int __init amd64_edac_init(void)
2917 2917
2918 opstate_init(); 2918 opstate_init();
2919 2919
2920 if (cache_k8_northbridges() < 0) 2920 if (amd_cache_northbridges() < 0)
2921 goto err_ret; 2921 goto err_ret;
2922 2922
2923 msrs = msrs_alloc(); 2923 msrs = msrs_alloc();
@@ -2934,7 +2934,7 @@ static int __init amd64_edac_init(void)
2934 * to finish initialization of the MC instances. 2934 * to finish initialization of the MC instances.
2935 */ 2935 */
2936 err = -ENODEV; 2936 err = -ENODEV;
2937 for (nb = 0; nb < k8_northbridges.num; nb++) { 2937 for (nb = 0; nb < amd_nb_num(); nb++) {
2938 if (!pvt_lookup[nb]) 2938 if (!pvt_lookup[nb])
2939 continue; 2939 continue;
2940 2940
diff --git a/drivers/platform/x86/intel_scu_ipc.c b/drivers/platform/x86/intel_scu_ipc.c
index 41a9e34899ac..ca35b0ce944a 100644
--- a/drivers/platform/x86/intel_scu_ipc.c
+++ b/drivers/platform/x86/intel_scu_ipc.c
@@ -26,6 +26,7 @@
26#include <linux/sfi.h> 26#include <linux/sfi.h>
27#include <asm/mrst.h> 27#include <asm/mrst.h>
28#include <asm/intel_scu_ipc.h> 28#include <asm/intel_scu_ipc.h>
29#include <asm/mrst.h>
29 30
30/* IPC defines the following message types */ 31/* IPC defines the following message types */
31#define IPCMSG_WATCHDOG_TIMER 0xF8 /* Set Kernel Watchdog Threshold */ 32#define IPCMSG_WATCHDOG_TIMER 0xF8 /* Set Kernel Watchdog Threshold */
@@ -699,6 +700,9 @@ static int ipc_probe(struct pci_dev *dev, const struct pci_device_id *id)
699 iounmap(ipcdev.ipc_base); 700 iounmap(ipcdev.ipc_base);
700 return -ENOMEM; 701 return -ENOMEM;
701 } 702 }
703
704 intel_scu_devices_create();
705
702 return 0; 706 return 0;
703} 707}
704 708
@@ -720,6 +724,7 @@ static void ipc_remove(struct pci_dev *pdev)
720 iounmap(ipcdev.ipc_base); 724 iounmap(ipcdev.ipc_base);
721 iounmap(ipcdev.i2c_base); 725 iounmap(ipcdev.i2c_base);
722 ipcdev.pdev = NULL; 726 ipcdev.pdev = NULL;
727 intel_scu_devices_destroy();
723} 728}
724 729
725static const struct pci_device_id pci_ids[] = { 730static const struct pci_device_id pci_ids[] = {
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 2883428d5ac8..4941cade319f 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -463,6 +463,18 @@ config RTC_DRV_CMOS
463 This driver can also be built as a module. If so, the module 463 This driver can also be built as a module. If so, the module
464 will be called rtc-cmos. 464 will be called rtc-cmos.
465 465
466config RTC_DRV_VRTC
467 tristate "Virtual RTC for Moorestown platforms"
468 depends on X86_MRST
469 default y if X86_MRST
470
471 help
472 Say "yes" here to get direct support for the real time clock
473 found on Moorestown platforms. The VRTC is an emulated RTC that
474 derives its clock source from a real RTC in the PMIC. The MC146818
475 style programming interface is mostly preserved, but any
476 updates are done via IPC calls to the system controller FW.
477
466config RTC_DRV_DS1216 478config RTC_DRV_DS1216
467 tristate "Dallas DS1216" 479 tristate "Dallas DS1216"
468 depends on SNI_RM 480 depends on SNI_RM
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index 4c2832df4697..2afdaf3ff986 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_RTC_DRV_CMOS) += rtc-cmos.o
30obj-$(CONFIG_RTC_DRV_COH901331) += rtc-coh901331.o 30obj-$(CONFIG_RTC_DRV_COH901331) += rtc-coh901331.o
31obj-$(CONFIG_RTC_DRV_DAVINCI) += rtc-davinci.o 31obj-$(CONFIG_RTC_DRV_DAVINCI) += rtc-davinci.o
32obj-$(CONFIG_RTC_DRV_DM355EVM) += rtc-dm355evm.o 32obj-$(CONFIG_RTC_DRV_DM355EVM) += rtc-dm355evm.o
33obj-$(CONFIG_RTC_DRV_VRTC) += rtc-mrst.o
33obj-$(CONFIG_RTC_DRV_DS1216) += rtc-ds1216.o 34obj-$(CONFIG_RTC_DRV_DS1216) += rtc-ds1216.o
34obj-$(CONFIG_RTC_DRV_DS1286) += rtc-ds1286.o 35obj-$(CONFIG_RTC_DRV_DS1286) += rtc-ds1286.o
35obj-$(CONFIG_RTC_DRV_DS1302) += rtc-ds1302.o 36obj-$(CONFIG_RTC_DRV_DS1302) += rtc-ds1302.o
diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c
new file mode 100644
index 000000000000..bcd0cf63eb16
--- /dev/null
+++ b/drivers/rtc/rtc-mrst.c
@@ -0,0 +1,582 @@
1/*
2 * rtc-mrst.c: Driver for Moorestown virtual RTC
3 *
4 * (C) Copyright 2009 Intel Corporation
5 * Author: Jacob Pan (jacob.jun.pan@intel.com)
6 * Feng Tang (feng.tang@intel.com)
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; version 2
11 * of the License.
12 *
13 * Note:
14 * VRTC is emulated by system controller firmware, the real HW
15 * RTC is located in the PMIC device. SCU FW shadows PMIC RTC
16 * in a memory mapped IO space that is visible to the host IA
17 * processor.
18 *
19 * This driver is based upon drivers/rtc/rtc-cmos.c
20 */
21
22/*
23 * Note:
24 * * vRTC only supports binary mode and 24H mode
25 * * vRTC only supports PIE and AIE, no UIE; its PIE fires only
26 * at 23:59:59 every day, with no support for adjustable frequency
27 * * Alarm function is also limited to hr/min/sec.
28 */
29
30#include <linux/mod_devicetable.h>
31#include <linux/platform_device.h>
32#include <linux/interrupt.h>
33#include <linux/spinlock.h>
34#include <linux/kernel.h>
35#include <linux/module.h>
36#include <linux/init.h>
37#include <linux/sfi.h>
38
39#include <asm-generic/rtc.h>
40#include <asm/intel_scu_ipc.h>
41#include <asm/mrst.h>
42#include <asm/mrst-vrtc.h>
43
44struct mrst_rtc {
45 struct rtc_device *rtc;
46 struct device *dev;
47 int irq;
48 struct resource *iomem;
49
50 u8 enabled_wake;
51 u8 suspend_ctrl;
52};
53
54static const char driver_name[] = "rtc_mrst";
55
56#define RTC_IRQMASK (RTC_PF | RTC_AF)
57
58static inline int is_intr(u8 rtc_intr)
59{
60 if (!(rtc_intr & RTC_IRQF))
61 return 0;
62 return rtc_intr & RTC_IRQMASK;
63}
64
65/*
66 * rtc_time's year holds the offset from 1900, but vRTC's YEAR
67 * register can't be programmed to a value larger than 0x64, so this
68 * driver uses 1960 (not 1970, the UNIX epoch) as the base and does
69 * the translation at read/write time.
70 *
71 * Why not just use 1970 as the offset? Because a base of 1960 keeps
72 * the leap-year handling consistent between the vRTC and low-level
73 * physical RTC devices.
74 */
75static int mrst_read_time(struct device *dev, struct rtc_time *time)
76{
77 unsigned long flags;
78
79 if (rtc_is_updating())
80 mdelay(20);
81
82 spin_lock_irqsave(&rtc_lock, flags);
83 time->tm_sec = vrtc_cmos_read(RTC_SECONDS);
84 time->tm_min = vrtc_cmos_read(RTC_MINUTES);
85 time->tm_hour = vrtc_cmos_read(RTC_HOURS);
86 time->tm_mday = vrtc_cmos_read(RTC_DAY_OF_MONTH);
87 time->tm_mon = vrtc_cmos_read(RTC_MONTH);
88 time->tm_year = vrtc_cmos_read(RTC_YEAR);
89 spin_unlock_irqrestore(&rtc_lock, flags);
90
91	/* Adjust for the 1960 base (tm_year counts from 1900) */
92 time->tm_year += 60;
93 time->tm_mon--;
94 return RTC_24H;
95}
96
97static int mrst_set_time(struct device *dev, struct rtc_time *time)
98{
99 int ret;
100 unsigned long flags;
101 unsigned char mon, day, hrs, min, sec;
102 unsigned int yrs;
103
104 yrs = time->tm_year;
105 mon = time->tm_mon + 1; /* tm_mon starts at zero */
106 day = time->tm_mday;
107 hrs = time->tm_hour;
108 min = time->tm_min;
109 sec = time->tm_sec;
110
111 if (yrs < 70 || yrs > 138)
112 return -EINVAL;
113 yrs -= 60;
114
115 spin_lock_irqsave(&rtc_lock, flags);
116
117 vrtc_cmos_write(yrs, RTC_YEAR);
118 vrtc_cmos_write(mon, RTC_MONTH);
119 vrtc_cmos_write(day, RTC_DAY_OF_MONTH);
120 vrtc_cmos_write(hrs, RTC_HOURS);
121 vrtc_cmos_write(min, RTC_MINUTES);
122 vrtc_cmos_write(sec, RTC_SECONDS);
123
124 spin_unlock_irqrestore(&rtc_lock, flags);
125
126 ret = intel_scu_ipc_simple_command(IPCMSG_VRTC, IPC_CMD_VRTC_SETTIME);
127 return ret;
128}
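
A note on the year arithmetic in mrst_read_time()/mrst_set_time(): the driver simply adds or subtracts 60 at the register boundary, and since 1960 is divisible by 4, register values that are multiples of 4 line up with real leap years, presumably the consistency the comment above refers to. A minimal user-space sketch of the translation, illustrative only and not part of the driver:

	#include <stdio.h>

	/*
	 * Sketch of the vRTC year translation: tm_year counts from 1900,
	 * the vRTC YEAR register counts from 1960.
	 */
	static int tm_year_to_vrtc(int tm_year)
	{
		if (tm_year < 70 || tm_year > 138)	/* 1970..2038, as in mrst_set_time() */
			return -1;
		return tm_year - 60;			/* e.g. 111 (2011) -> 51 */
	}

	static int vrtc_to_tm_year(int reg)
	{
		return reg + 60;			/* e.g. 51 -> 111 (2011) */
	}

	int main(void)
	{
		printf("%d %d\n", tm_year_to_vrtc(111), vrtc_to_tm_year(51));
		return 0;
	}
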
129
130static int mrst_read_alarm(struct device *dev, struct rtc_wkalrm *t)
131{
132 struct mrst_rtc *mrst = dev_get_drvdata(dev);
133 unsigned char rtc_control;
134
135 if (mrst->irq <= 0)
136 return -EIO;
137
138	/* Basic alarms only support hour, minute, and second fields.
139 * Some also support day and month, for alarms up to a year in
140 * the future.
141 */
142 t->time.tm_mday = -1;
143 t->time.tm_mon = -1;
144 t->time.tm_year = -1;
145
146 /* vRTC only supports binary mode */
147 spin_lock_irq(&rtc_lock);
148 t->time.tm_sec = vrtc_cmos_read(RTC_SECONDS_ALARM);
149 t->time.tm_min = vrtc_cmos_read(RTC_MINUTES_ALARM);
150 t->time.tm_hour = vrtc_cmos_read(RTC_HOURS_ALARM);
151
152 rtc_control = vrtc_cmos_read(RTC_CONTROL);
153 spin_unlock_irq(&rtc_lock);
154
155 t->enabled = !!(rtc_control & RTC_AIE);
156 t->pending = 0;
157
158 return 0;
159}
160
161static void mrst_checkintr(struct mrst_rtc *mrst, unsigned char rtc_control)
162{
163 unsigned char rtc_intr;
164
165 /*
166 * NOTE after changing RTC_xIE bits we always read INTR_FLAGS;
167 * allegedly some older rtcs need that to handle irqs properly
168 */
169 rtc_intr = vrtc_cmos_read(RTC_INTR_FLAGS);
170 rtc_intr &= (rtc_control & RTC_IRQMASK) | RTC_IRQF;
171 if (is_intr(rtc_intr))
172 rtc_update_irq(mrst->rtc, 1, rtc_intr);
173}
174
175static void mrst_irq_enable(struct mrst_rtc *mrst, unsigned char mask)
176{
177 unsigned char rtc_control;
178
179 /*
180 * Flush any pending IRQ status, notably for update irqs,
181 * before we enable new IRQs
182 */
183 rtc_control = vrtc_cmos_read(RTC_CONTROL);
184 mrst_checkintr(mrst, rtc_control);
185
186 rtc_control |= mask;
187 vrtc_cmos_write(rtc_control, RTC_CONTROL);
188
189 mrst_checkintr(mrst, rtc_control);
190}
191
192static void mrst_irq_disable(struct mrst_rtc *mrst, unsigned char mask)
193{
194 unsigned char rtc_control;
195
196 rtc_control = vrtc_cmos_read(RTC_CONTROL);
197 rtc_control &= ~mask;
198 vrtc_cmos_write(rtc_control, RTC_CONTROL);
199 mrst_checkintr(mrst, rtc_control);
200}
201
202static int mrst_set_alarm(struct device *dev, struct rtc_wkalrm *t)
203{
204 struct mrst_rtc *mrst = dev_get_drvdata(dev);
205 unsigned char hrs, min, sec;
206 int ret = 0;
207
208 if (!mrst->irq)
209 return -EIO;
210
211 hrs = t->time.tm_hour;
212 min = t->time.tm_min;
213 sec = t->time.tm_sec;
214
215 spin_lock_irq(&rtc_lock);
216 /* Next rtc irq must not be from previous alarm setting */
217 mrst_irq_disable(mrst, RTC_AIE);
218
219 /* Update alarm */
220 vrtc_cmos_write(hrs, RTC_HOURS_ALARM);
221 vrtc_cmos_write(min, RTC_MINUTES_ALARM);
222 vrtc_cmos_write(sec, RTC_SECONDS_ALARM);
223
224 spin_unlock_irq(&rtc_lock);
225
226 ret = intel_scu_ipc_simple_command(IPCMSG_VRTC, IPC_CMD_VRTC_SETALARM);
227 if (ret)
228 return ret;
229
230 spin_lock_irq(&rtc_lock);
231 if (t->enabled)
232 mrst_irq_enable(mrst, RTC_AIE);
233
234 spin_unlock_irq(&rtc_lock);
235
236 return 0;
237}
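
From userspace the alarm path above is reached through the generic RTC character device; RTC_WKALM_SET carries both the alarm time and the enabled flag, matching the t->enabled handling in mrst_set_alarm(). A minimal sketch (the "/dev/rtc0" node name is an assumption):

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/rtc.h>

	/*
	 * Arm an hour/minute/second alarm through the generic RTC chardev;
	 * the rtc class routes this to mrst_set_alarm().
	 */
	int main(void)
	{
		struct rtc_wkalrm alm = { 0 };
		int fd = open("/dev/rtc0", O_RDWR);

		if (fd < 0)
			return 1;
		alm.enabled = 1;	/* becomes t->enabled above */
		alm.time.tm_hour = 23;
		alm.time.tm_min = 59;
		alm.time.tm_sec = 0;	/* day/month/year are ignored by the vRTC */
		if (ioctl(fd, RTC_WKALM_SET, &alm) < 0)
			perror("RTC_WKALM_SET");
		close(fd);
		return 0;
	}
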
238
239static int mrst_irq_set_state(struct device *dev, int enabled)
240{
241 struct mrst_rtc *mrst = dev_get_drvdata(dev);
242 unsigned long flags;
243
244 if (!mrst->irq)
245 return -ENXIO;
246
247 spin_lock_irqsave(&rtc_lock, flags);
248
249 if (enabled)
250 mrst_irq_enable(mrst, RTC_PIE);
251 else
252 mrst_irq_disable(mrst, RTC_PIE);
253
254 spin_unlock_irqrestore(&rtc_lock, flags);
255 return 0;
256}
257
258#if defined(CONFIG_RTC_INTF_DEV) || defined(CONFIG_RTC_INTF_DEV_MODULE)
259
260/* Currently, the vRTC doesn't support UIE ON/OFF */
261static int
262mrst_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
263{
264 struct mrst_rtc *mrst = dev_get_drvdata(dev);
265 unsigned long flags;
266
267 switch (cmd) {
268 case RTC_AIE_OFF:
269 case RTC_AIE_ON:
270 if (!mrst->irq)
271 return -EINVAL;
272 break;
273 default:
274 /* PIE ON/OFF is handled by mrst_irq_set_state() */
275 return -ENOIOCTLCMD;
276 }
277
278 spin_lock_irqsave(&rtc_lock, flags);
279 switch (cmd) {
280 case RTC_AIE_OFF: /* alarm off */
281 mrst_irq_disable(mrst, RTC_AIE);
282 break;
283 case RTC_AIE_ON: /* alarm on */
284 mrst_irq_enable(mrst, RTC_AIE);
285 break;
286 }
287 spin_unlock_irqrestore(&rtc_lock, flags);
288 return 0;
289}
290
291#else
292#define mrst_rtc_ioctl NULL
293#endif
294
295#if defined(CONFIG_RTC_INTF_PROC) || defined(CONFIG_RTC_INTF_PROC_MODULE)
296
297static int mrst_procfs(struct device *dev, struct seq_file *seq)
298{
299 unsigned char rtc_control, valid;
300
301 spin_lock_irq(&rtc_lock);
302 rtc_control = vrtc_cmos_read(RTC_CONTROL);
303 valid = vrtc_cmos_read(RTC_VALID);
304 spin_unlock_irq(&rtc_lock);
305
306 return seq_printf(seq,
307 "periodic_IRQ\t: %s\n"
308 "alarm\t\t: %s\n"
309 "BCD\t\t: no\n"
310 "periodic_freq\t: daily (not adjustable)\n",
311 (rtc_control & RTC_PIE) ? "on" : "off",
312 (rtc_control & RTC_AIE) ? "on" : "off");
313}
314
315#else
316#define mrst_procfs NULL
317#endif
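
For reference, the format string above yields /proc output of roughly this shape; the on/off values track RTC_PIE and RTC_AIE in the control register:

	periodic_IRQ	: off
	alarm		: on
	BCD		: no
	periodic_freq	: daily (not adjustable)
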
318
319static const struct rtc_class_ops mrst_rtc_ops = {
320 .ioctl = mrst_rtc_ioctl,
321 .read_time = mrst_read_time,
322 .set_time = mrst_set_time,
323 .read_alarm = mrst_read_alarm,
324 .set_alarm = mrst_set_alarm,
325 .proc = mrst_procfs,
326 .irq_set_state = mrst_irq_set_state,
327};
328
329static struct mrst_rtc mrst_rtc;
330
331/*
332 * When the vRTC IRQ is captured by the SCU FW, the FW clears the AIE
333 * bit in Reg B, so there is no need for this driver to clear it
334 */
335static irqreturn_t mrst_rtc_irq(int irq, void *p)
336{
337 u8 irqstat;
338
339 spin_lock(&rtc_lock);
340 /* This read will clear all IRQ flags inside Reg C */
341 irqstat = vrtc_cmos_read(RTC_INTR_FLAGS);
342 spin_unlock(&rtc_lock);
343
344 irqstat &= RTC_IRQMASK | RTC_IRQF;
345 if (is_intr(irqstat)) {
346 rtc_update_irq(p, 1, irqstat);
347 return IRQ_HANDLED;
348 }
349 return IRQ_NONE;
350}
351
352static int __init
353vrtc_mrst_do_probe(struct device *dev, struct resource *iomem, int rtc_irq)
354{
355 int retval = 0;
356 unsigned char rtc_control;
357
358 /* There can be only one ... */
359 if (mrst_rtc.dev)
360 return -EBUSY;
361
362 if (!iomem)
363 return -ENODEV;
364
365 iomem = request_mem_region(iomem->start,
366 iomem->end + 1 - iomem->start,
367 driver_name);
368 if (!iomem) {
369 dev_dbg(dev, "i/o mem already in use.\n");
370 return -EBUSY;
371 }
372
373 mrst_rtc.irq = rtc_irq;
374 mrst_rtc.iomem = iomem;
375
376 mrst_rtc.rtc = rtc_device_register(driver_name, dev,
377 &mrst_rtc_ops, THIS_MODULE);
378 if (IS_ERR(mrst_rtc.rtc)) {
379 retval = PTR_ERR(mrst_rtc.rtc);
380 goto cleanup0;
381 }
382
383 mrst_rtc.dev = dev;
384 dev_set_drvdata(dev, &mrst_rtc);
385 rename_region(iomem, dev_name(&mrst_rtc.rtc->dev));
386
387 spin_lock_irq(&rtc_lock);
388 mrst_irq_disable(&mrst_rtc, RTC_PIE | RTC_AIE);
389 rtc_control = vrtc_cmos_read(RTC_CONTROL);
390 spin_unlock_irq(&rtc_lock);
391
392 if (!(rtc_control & RTC_24H) || (rtc_control & (RTC_DM_BINARY)))
393 dev_dbg(dev, "TODO: support more than 24-hr BCD mode\n");
394
395 if (rtc_irq) {
396 retval = request_irq(rtc_irq, mrst_rtc_irq,
397 IRQF_DISABLED, dev_name(&mrst_rtc.rtc->dev),
398 mrst_rtc.rtc);
399 if (retval < 0) {
400 dev_dbg(dev, "IRQ %d is already in use, err %d\n",
401 rtc_irq, retval);
402 goto cleanup1;
403 }
404 }
405 dev_dbg(dev, "initialised\n");
406 return 0;
407
408cleanup1:
409 mrst_rtc.dev = NULL;
410 rtc_device_unregister(mrst_rtc.rtc);
411cleanup0:
412 release_region(iomem->start, iomem->end + 1 - iomem->start);
413 dev_err(dev, "rtc-mrst: unable to initialise\n");
414 return retval;
415}
416
417static void rtc_mrst_do_shutdown(void)
418{
419 spin_lock_irq(&rtc_lock);
420 mrst_irq_disable(&mrst_rtc, RTC_IRQMASK);
421 spin_unlock_irq(&rtc_lock);
422}
423
424static void __exit rtc_mrst_do_remove(struct device *dev)
425{
426 struct mrst_rtc *mrst = dev_get_drvdata(dev);
427 struct resource *iomem;
428
429 rtc_mrst_do_shutdown();
430
431 if (mrst->irq)
432 free_irq(mrst->irq, mrst->rtc);
433
434 rtc_device_unregister(mrst->rtc);
435 mrst->rtc = NULL;
436
437 iomem = mrst->iomem;
438 release_region(iomem->start, iomem->end + 1 - iomem->start);
439 mrst->iomem = NULL;
440
441 mrst->dev = NULL;
442 dev_set_drvdata(dev, NULL);
443}
444
445#ifdef CONFIG_PM
446static int mrst_suspend(struct device *dev, pm_message_t mesg)
447{
448 struct mrst_rtc *mrst = dev_get_drvdata(dev);
449 unsigned char tmp;
450
451 /* Only the alarm might be a wakeup event source */
452 spin_lock_irq(&rtc_lock);
453 mrst->suspend_ctrl = tmp = vrtc_cmos_read(RTC_CONTROL);
454 if (tmp & (RTC_PIE | RTC_AIE)) {
455 unsigned char mask;
456
457 if (device_may_wakeup(dev))
458 mask = RTC_IRQMASK & ~RTC_AIE;
459 else
460 mask = RTC_IRQMASK;
461 tmp &= ~mask;
462 vrtc_cmos_write(tmp, RTC_CONTROL);
463
464 mrst_checkintr(mrst, tmp);
465 }
466 spin_unlock_irq(&rtc_lock);
467
468 if (tmp & RTC_AIE) {
469 mrst->enabled_wake = 1;
470 enable_irq_wake(mrst->irq);
471 }
472
473 dev_dbg(&mrst_rtc.rtc->dev, "suspend%s, ctrl %02x\n",
474 (tmp & RTC_AIE) ? ", alarm may wake" : "",
475 tmp);
476
477 return 0;
478}
479
480/*
481 * We want RTC alarms to wake us from the deep power saving state
482 */
483static inline int mrst_poweroff(struct device *dev)
484{
485 return mrst_suspend(dev, PMSG_HIBERNATE);
486}
487
488static int mrst_resume(struct device *dev)
489{
490 struct mrst_rtc *mrst = dev_get_drvdata(dev);
491 unsigned char tmp = mrst->suspend_ctrl;
492
493 /* Re-enable any irqs previously active */
494 if (tmp & RTC_IRQMASK) {
495 unsigned char mask;
496
497 if (mrst->enabled_wake) {
498 disable_irq_wake(mrst->irq);
499 mrst->enabled_wake = 0;
500 }
501
502 spin_lock_irq(&rtc_lock);
503 do {
504 vrtc_cmos_write(tmp, RTC_CONTROL);
505
506 mask = vrtc_cmos_read(RTC_INTR_FLAGS);
507 mask &= (tmp & RTC_IRQMASK) | RTC_IRQF;
508 if (!is_intr(mask))
509 break;
510
511 rtc_update_irq(mrst->rtc, 1, mask);
512 tmp &= ~RTC_AIE;
513 } while (mask & RTC_AIE);
514 spin_unlock_irq(&rtc_lock);
515 }
516
517 dev_dbg(&mrst_rtc.rtc->dev, "resume, ctrl %02x\n", tmp);
518
519 return 0;
520}
521
522#else
523#define mrst_suspend NULL
524#define mrst_resume NULL
525
526static inline int mrst_poweroff(struct device *dev)
527{
528 return -ENOSYS;
529}
530
531#endif
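
Whether the alarm stays armed across suspend hinges on device_may_wakeup() above, which userspace steers through the device's "power/wakeup" sysfs attribute. A sketch; the sysfs path is hypothetical, since the real one depends on where the platform device is registered:

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>

	/* Mark the vRTC as a wakeup source before suspending. */
	int main(void)
	{
		int fd = open("/sys/devices/platform/rtc_mrst/power/wakeup", O_WRONLY);

		if (fd < 0)
			return 1;
		if (write(fd, "enabled", 7) != 7)
			perror("write");
		close(fd);
		return 0;
	}
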
532
533static int __init vrtc_mrst_platform_probe(struct platform_device *pdev)
534{
535 return vrtc_mrst_do_probe(&pdev->dev,
536 platform_get_resource(pdev, IORESOURCE_MEM, 0),
537 platform_get_irq(pdev, 0));
538}
539
540static int __exit vrtc_mrst_platform_remove(struct platform_device *pdev)
541{
542 rtc_mrst_do_remove(&pdev->dev);
543 return 0;
544}
545
546static void vrtc_mrst_platform_shutdown(struct platform_device *pdev)
547{
548 if (system_state == SYSTEM_POWER_OFF && !mrst_poweroff(&pdev->dev))
549 return;
550
551 rtc_mrst_do_shutdown();
552}
553
554MODULE_ALIAS("platform:vrtc_mrst");
555
556static struct platform_driver vrtc_mrst_platform_driver = {
557 .probe = vrtc_mrst_platform_probe,
558 .remove = __exit_p(vrtc_mrst_platform_remove),
559 .shutdown = vrtc_mrst_platform_shutdown,
560 .driver = {
561 .name = (char *) driver_name,
562 .suspend = mrst_suspend,
563 .resume = mrst_resume,
564 }
565};
566
567static int __init vrtc_mrst_init(void)
568{
569 return platform_driver_register(&vrtc_mrst_platform_driver);
570}
571
572static void __exit vrtc_mrst_exit(void)
573{
574 platform_driver_unregister(&vrtc_mrst_platform_driver);
575}
576
577module_init(vrtc_mrst_init);
578module_exit(vrtc_mrst_exit);
579
580MODULE_AUTHOR("Jacob Pan; Feng Tang");
581MODULE_DESCRIPTION("Driver for Moorestown virtual RTC");
582MODULE_LICENSE("GPL");
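
The probe above expects a MEM resource and an IRQ from a matching platform device. Note that platform-bus matching goes by the driver name ("rtc_mrst"), while the MODULE_ALIAS above says "vrtc_mrst"; the alias only matters for module autoloading. A hypothetical sketch of the platform-side registration, with placeholder base address and IRQ (on real hardware these come from the SFI tables):

	#include <linux/kernel.h>
	#include <linux/init.h>
	#include <linux/ioport.h>
	#include <linux/platform_device.h>

	/* Placeholder values only; not the real Moorestown layout. */
	static struct resource vrtc_resources[] = {
		{
			.start	= 0xff000000,		/* assumed vRTC MMIO base */
			.end	= 0xff000000 + 1024 - 1,
			.flags	= IORESOURCE_MEM,
		},
		{
			.start	= 8,			/* assumed vRTC IRQ */
			.end	= 8,
			.flags	= IORESOURCE_IRQ,
		},
	};

	static struct platform_device vrtc_device = {
		.name		= "rtc_mrst",		/* must match driver_name */
		.id		= -1,
		.resource	= vrtc_resources,
		.num_resources	= ARRAY_SIZE(vrtc_resources),
	};

	static int __init vrtc_register(void)
	{
		return platform_device_register(&vrtc_device);
	}
	device_initcall(vrtc_register);
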
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 5476c066d4ee..3c4039d5eef1 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -763,7 +763,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
763 int metadata; 763 int metadata;
764 unsigned int revokes = 0; 764 unsigned int revokes = 0;
765 int x; 765 int x;
766 int error; 766 int error = 0;
767 767
768 if (!*top) 768 if (!*top)
769 sm->sm_first = 0; 769 sm->sm_first = 0;
@@ -780,7 +780,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
780 if (metadata) 780 if (metadata)
781 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; 781 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
782 782
783 error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); 783 if (ip != GFS2_I(sdp->sd_rindex))
784 error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
785 else if (!sdp->sd_rgrps)
786 error = gfs2_ri_update(ip);
787
784 if (error) 788 if (error)
785 return error; 789 return error;
786 790
@@ -879,7 +883,8 @@ out_rg_gunlock:
879out_rlist: 883out_rlist:
880 gfs2_rlist_free(&rlist); 884 gfs2_rlist_free(&rlist);
881out: 885out:
882 gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); 886 if (ip != GFS2_I(sdp->sd_rindex))
887 gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
883 return error; 888 return error;
884} 889}
885 890
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index f92c17704169..08a8beb152e6 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -541,21 +541,6 @@ out_locked:
541 spin_unlock(&gl->gl_spin); 541 spin_unlock(&gl->gl_spin);
542} 542}
543 543
544static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
545 unsigned int req_state,
546 unsigned int flags)
547{
548 int ret = LM_OUT_ERROR;
549
550 if (!sdp->sd_lockstruct.ls_ops->lm_lock)
551 return req_state == LM_ST_UNLOCKED ? 0 : req_state;
552
553 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
554 ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
555 req_state, flags);
556 return ret;
557}
558
559/** 544/**
560 * do_xmote - Calls the DLM to change the state of a lock 545 * do_xmote - Calls the DLM to change the state of a lock
561 * @gl: The lock state 546 * @gl: The lock state
@@ -575,13 +560,14 @@ __acquires(&gl->gl_spin)
575 560
576 lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | 561 lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
577 LM_FLAG_PRIORITY); 562 LM_FLAG_PRIORITY);
578 BUG_ON(gl->gl_state == target); 563 GLOCK_BUG_ON(gl, gl->gl_state == target);
579 BUG_ON(gl->gl_state == gl->gl_target); 564 GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target);
580 if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && 565 if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
581 glops->go_inval) { 566 glops->go_inval) {
582 set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); 567 set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
583 do_error(gl, 0); /* Fail queued try locks */ 568 do_error(gl, 0); /* Fail queued try locks */
584 } 569 }
570 gl->gl_req = target;
585 spin_unlock(&gl->gl_spin); 571 spin_unlock(&gl->gl_spin);
586 if (glops->go_xmote_th) 572 if (glops->go_xmote_th)
587 glops->go_xmote_th(gl); 573 glops->go_xmote_th(gl);
@@ -594,15 +580,17 @@ __acquires(&gl->gl_spin)
594 gl->gl_state == LM_ST_DEFERRED) && 580 gl->gl_state == LM_ST_DEFERRED) &&
595 !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) 581 !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
596 lck_flags |= LM_FLAG_TRY_1CB; 582 lck_flags |= LM_FLAG_TRY_1CB;
597 ret = gfs2_lm_lock(sdp, gl, target, lck_flags);
598 583
599 if (!(ret & LM_OUT_ASYNC)) { 584 if (sdp->sd_lockstruct.ls_ops->lm_lock) {
600 finish_xmote(gl, ret); 585 /* lock_dlm */
586 ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
587 GLOCK_BUG_ON(gl, ret);
588 } else { /* lock_nolock */
589 finish_xmote(gl, target);
601 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 590 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
602 gfs2_glock_put(gl); 591 gfs2_glock_put(gl);
603 } else {
604 GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
605 } 592 }
593
606 spin_lock(&gl->gl_spin); 594 spin_lock(&gl->gl_spin);
607} 595}
608 596
@@ -951,17 +939,22 @@ int gfs2_glock_wait(struct gfs2_holder *gh)
951 939
952void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) 940void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
953{ 941{
942 struct va_format vaf;
954 va_list args; 943 va_list args;
955 944
956 va_start(args, fmt); 945 va_start(args, fmt);
946
957 if (seq) { 947 if (seq) {
958 struct gfs2_glock_iter *gi = seq->private; 948 struct gfs2_glock_iter *gi = seq->private;
959 vsprintf(gi->string, fmt, args); 949 vsprintf(gi->string, fmt, args);
960 seq_printf(seq, gi->string); 950 seq_printf(seq, gi->string);
961 } else { 951 } else {
962 printk(KERN_ERR " "); 952 vaf.fmt = fmt;
963 vprintk(fmt, args); 953 vaf.va = &args;
954
955 printk(KERN_ERR " %pV", &vaf);
964 } 956 }
957
965 va_end(args); 958 va_end(args);
966} 959}
967 960
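
The gfs2_print_dbg() change switches the printk path to the kernel's %pV extension: a struct va_format wraps the format string and the va_list so printk() expands the whole pack in one call, instead of an unbounded vsprintf() into a fixed buffer. The generic pattern, independent of GFS2:

	#include <linux/kernel.h>

	/* Generic %pV pattern: wrap fmt + va_list and let printk() expand it. */
	static void example_vprint(const char *prefix, const char *fmt, ...)
	{
		struct va_format vaf;
		va_list args;

		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;
		printk(KERN_DEBUG "%s: %pV", prefix, &vaf);
		va_end(args);
	}
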
@@ -1361,24 +1354,28 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl)
1361 * @gl: Pointer to the glock 1354 * @gl: Pointer to the glock
1362 * @ret: The return value from the dlm 1355 * @ret: The return value from the dlm
1363 * 1356 *
1357 * The gl_reply field is under the gl_spin lock so that it is ok
1358 * to use a bitfield shared with other glock state fields.
1364 */ 1359 */
1365 1360
1366void gfs2_glock_complete(struct gfs2_glock *gl, int ret) 1361void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
1367{ 1362{
1368 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; 1363 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
1369 1364
1365 spin_lock(&gl->gl_spin);
1370 gl->gl_reply = ret; 1366 gl->gl_reply = ret;
1371 1367
1372 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { 1368 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
1373 spin_lock(&gl->gl_spin);
1374 if (gfs2_should_freeze(gl)) { 1369 if (gfs2_should_freeze(gl)) {
1375 set_bit(GLF_FROZEN, &gl->gl_flags); 1370 set_bit(GLF_FROZEN, &gl->gl_flags);
1376 spin_unlock(&gl->gl_spin); 1371 spin_unlock(&gl->gl_spin);
1377 return; 1372 return;
1378 } 1373 }
1379 spin_unlock(&gl->gl_spin);
1380 } 1374 }
1375
1376 spin_unlock(&gl->gl_spin);
1381 set_bit(GLF_REPLY_PENDING, &gl->gl_flags); 1377 set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
1378 smp_wmb();
1382 gfs2_glock_hold(gl); 1379 gfs2_glock_hold(gl);
1383 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 1380 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1384 gfs2_glock_put(gl); 1381 gfs2_glock_put(gl);
@@ -1626,18 +1623,17 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
1626static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) 1623static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
1627{ 1624{
1628 struct task_struct *gh_owner = NULL; 1625 struct task_struct *gh_owner = NULL;
1629 char buffer[KSYM_SYMBOL_LEN];
1630 char flags_buf[32]; 1626 char flags_buf[32];
1631 1627
1632 sprint_symbol(buffer, gh->gh_ip);
1633 if (gh->gh_owner_pid) 1628 if (gh->gh_owner_pid)
1634 gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); 1629 gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
1635 gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n", 1630 gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
1636 state2str(gh->gh_state), 1631 state2str(gh->gh_state),
1637 hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), 1632 hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
1638 gh->gh_error, 1633 gh->gh_error,
1639 gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, 1634 gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
1640 gh_owner ? gh_owner->comm : "(ended)", buffer); 1635 gh_owner ? gh_owner->comm : "(ended)",
1636 (void *)gh->gh_ip);
1641 return 0; 1637 return 0;
1642} 1638}
1643 1639
@@ -1782,12 +1778,13 @@ int __init gfs2_glock_init(void)
1782 } 1778 }
1783#endif 1779#endif
1784 1780
1785 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER | 1781 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
1786 WQ_HIGHPRI | WQ_FREEZEABLE, 0); 1782 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
1787 if (IS_ERR(glock_workqueue)) 1783 if (IS_ERR(glock_workqueue))
1788 return PTR_ERR(glock_workqueue); 1784 return PTR_ERR(glock_workqueue);
1789 gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER | 1785 gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
1790 WQ_FREEZEABLE, 0); 1786 WQ_MEM_RECLAIM | WQ_FREEZEABLE,
1787 0);
1791 if (IS_ERR(gfs2_delete_workqueue)) { 1788 if (IS_ERR(gfs2_delete_workqueue)) {
1792 destroy_workqueue(glock_workqueue); 1789 destroy_workqueue(glock_workqueue);
1793 return PTR_ERR(gfs2_delete_workqueue); 1790 return PTR_ERR(gfs2_delete_workqueue);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index db1c26d6d220..691851ceb615 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -87,11 +87,10 @@ enum {
87#define GL_ASYNC 0x00000040 87#define GL_ASYNC 0x00000040
88#define GL_EXACT 0x00000080 88#define GL_EXACT 0x00000080
89#define GL_SKIP 0x00000100 89#define GL_SKIP 0x00000100
90#define GL_ATIME 0x00000200
91#define GL_NOCACHE 0x00000400 90#define GL_NOCACHE 0x00000400
92 91
93/* 92/*
94 * lm_lock() and lm_async_cb return flags 93 * lm_async_cb return flags
95 * 94 *
96 * LM_OUT_ST_MASK 95 * LM_OUT_ST_MASK
97 * Masks the lower two bits of lock state in the returned value. 96 * Masks the lower two bits of lock state in the returned value.
@@ -99,15 +98,11 @@ enum {
99 * LM_OUT_CANCELED 98 * LM_OUT_CANCELED
100 * The lock request was canceled. 99 * The lock request was canceled.
101 * 100 *
102 * LM_OUT_ASYNC
103 * The result of the request will be returned in an LM_CB_ASYNC callback.
104 *
105 */ 101 */
106 102
107#define LM_OUT_ST_MASK 0x00000003 103#define LM_OUT_ST_MASK 0x00000003
108#define LM_OUT_CANCELED 0x00000008 104#define LM_OUT_CANCELED 0x00000008
109#define LM_OUT_ASYNC 0x00000080 105#define LM_OUT_ERROR 0x00000004
110#define LM_OUT_ERROR 0x00000100
111 106
112/* 107/*
113 * lm_recovery_done() messages 108 * lm_recovery_done() messages
@@ -124,25 +119,12 @@ struct lm_lockops {
124 void (*lm_unmount) (struct gfs2_sbd *sdp); 119 void (*lm_unmount) (struct gfs2_sbd *sdp);
125 void (*lm_withdraw) (struct gfs2_sbd *sdp); 120 void (*lm_withdraw) (struct gfs2_sbd *sdp);
126 void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl); 121 void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl);
127 unsigned int (*lm_lock) (struct gfs2_glock *gl, 122 int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
128 unsigned int req_state, unsigned int flags); 123 unsigned int flags);
129 void (*lm_cancel) (struct gfs2_glock *gl); 124 void (*lm_cancel) (struct gfs2_glock *gl);
130 const match_table_t *lm_tokens; 125 const match_table_t *lm_tokens;
131}; 126};
132 127
133#define LM_FLAG_TRY 0x00000001
134#define LM_FLAG_TRY_1CB 0x00000002
135#define LM_FLAG_NOEXP 0x00000004
136#define LM_FLAG_ANY 0x00000008
137#define LM_FLAG_PRIORITY 0x00000010
138
139#define GL_ASYNC 0x00000040
140#define GL_EXACT 0x00000080
141#define GL_SKIP 0x00000100
142#define GL_NOCACHE 0x00000400
143
144#define GLR_TRYFAILED 13
145
146extern struct workqueue_struct *gfs2_delete_workqueue; 128extern struct workqueue_struct *gfs2_delete_workqueue;
147static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) 129static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
148{ 130{
@@ -212,6 +194,8 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
212int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); 194int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
213void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); 195void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
214void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); 196void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
197
198__attribute__ ((format(printf, 2, 3)))
215void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); 199void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
216 200
217/** 201/**
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 0d149dcc04e5..263561bf1a50 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -325,7 +325,6 @@ static void trans_go_sync(struct gfs2_glock *gl)
325 325
326 if (gl->gl_state != LM_ST_UNLOCKED && 326 if (gl->gl_state != LM_ST_UNLOCKED &&
327 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { 327 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
328 flush_workqueue(gfs2_delete_workqueue);
329 gfs2_meta_syncfs(sdp); 328 gfs2_meta_syncfs(sdp);
330 gfs2_log_shutdown(sdp); 329 gfs2_log_shutdown(sdp);
331 } 330 }
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 764fbb49efc8..8d3d2b4a0a7d 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -207,12 +207,14 @@ struct gfs2_glock {
207 207
208 spinlock_t gl_spin; 208 spinlock_t gl_spin;
209 209
210 unsigned int gl_state; 210 /* State fields protected by gl_spin */
211 unsigned int gl_target; 211 unsigned int gl_state:2, /* Current state */
212 unsigned int gl_reply; 212 gl_target:2, /* Target state */
213 gl_demote_state:2, /* State requested by remote node */
214 gl_req:2, /* State in last dlm request */
215 gl_reply:8; /* Last reply from the dlm */
216
213 unsigned int gl_hash; 217 unsigned int gl_hash;
214 unsigned int gl_req;
215 unsigned int gl_demote_state; /* state requested by remote node */
216 unsigned long gl_demote_time; /* time of first demote request */ 218 unsigned long gl_demote_time; /* time of first demote request */
217 struct list_head gl_holders; 219 struct list_head gl_holders;
218 220
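
Packing gl_state, gl_target, gl_demote_state, gl_req and gl_reply into one word shrinks the glock, but C gives no atomicity between adjacent bitfields: updating one field is a read-modify-write of the whole word, so it can clobber a concurrent update of a neighbour. That is why every writer must hold gl_spin, per the comment above. A generic illustration of the hazard, with assumed names:

	#include <linux/spinlock.h>

	struct packed_state {
		unsigned int a:2;
		unsigned int b:2;	/* shares a machine word with 'a' */
	};

	static struct packed_state s;
	static DEFINE_SPINLOCK(s_lock);

	/*
	 * Writing either field compiles to a load/mask/store of the whole
	 * word, so two unlocked writers can silently undo each other; the
	 * glock equivalent is why gl_spin must cover every write to the
	 * packed state fields.
	 */
	static void set_a(unsigned int v)
	{
		spin_lock(&s_lock);
		s.a = v;
		spin_unlock(&s_lock);
	}
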
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index e1213f7f9217..14e682dbe8bf 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -916,17 +916,8 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
916 if (error) 916 if (error)
917 return error; 917 return error;
918 918
919 if ((attr->ia_valid & ATTR_SIZE) &&
920 attr->ia_size != i_size_read(inode)) {
921 error = vmtruncate(inode, attr->ia_size);
922 if (error)
923 return error;
924 }
925
926 setattr_copy(inode, attr); 919 setattr_copy(inode, attr);
927 mark_inode_dirty(inode); 920 mark_inode_dirty(inode);
928
929 gfs2_assert_warn(GFS2_SB(inode), !error);
930 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 921 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
931 gfs2_dinode_out(ip, dibh->b_data); 922 gfs2_dinode_out(ip, dibh->b_data);
932 brelse(dibh); 923 brelse(dibh);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 1c09425b45fd..6e493aee28f8 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -146,15 +146,13 @@ static u32 make_flags(const u32 lkid, const unsigned int gfs_flags,
146 return lkf; 146 return lkf;
147} 147}
148 148
149static unsigned int gdlm_lock(struct gfs2_glock *gl, 149static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
150 unsigned int req_state, unsigned int flags) 150 unsigned int flags)
151{ 151{
152 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; 152 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
153 int error;
154 int req; 153 int req;
155 u32 lkf; 154 u32 lkf;
156 155
157 gl->gl_req = req_state;
158 req = make_mode(req_state); 156 req = make_mode(req_state);
159 lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req); 157 lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req);
160 158
@@ -162,13 +160,8 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl,
162 * Submit the actual lock request. 160 * Submit the actual lock request.
163 */ 161 */
164 162
165 error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname, 163 return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
166 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); 164 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
167 if (error == -EAGAIN)
168 return 0;
169 if (error)
170 return LM_OUT_ERROR;
171 return LM_OUT_ASYNC;
172} 165}
173 166
174static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) 167static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 12cbea7502c2..1db6b7343229 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1069,7 +1069,6 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
1069{ 1069{
1070 struct gfs2_inode *ip = GFS2_I(inode); 1070 struct gfs2_inode *ip = GFS2_I(inode);
1071 struct gfs2_sbd *sdp = GFS2_SB(inode); 1071 struct gfs2_sbd *sdp = GFS2_SB(inode);
1072 struct buffer_head *dibh;
1073 u32 ouid, ogid, nuid, ngid; 1072 u32 ouid, ogid, nuid, ngid;
1074 int error; 1073 int error;
1075 1074
@@ -1100,25 +1099,10 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
1100 if (error) 1099 if (error)
1101 goto out_gunlock_q; 1100 goto out_gunlock_q;
1102 1101
1103 error = gfs2_meta_inode_buffer(ip, &dibh); 1102 error = gfs2_setattr_simple(ip, attr);
1104 if (error) 1103 if (error)
1105 goto out_end_trans; 1104 goto out_end_trans;
1106 1105
1107 if ((attr->ia_valid & ATTR_SIZE) &&
1108 attr->ia_size != i_size_read(inode)) {
1109 int error;
1110
1111 error = vmtruncate(inode, attr->ia_size);
1112 gfs2_assert_warn(sdp, !error);
1113 }
1114
1115 setattr_copy(inode, attr);
1116 mark_inode_dirty(inode);
1117
1118 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1119 gfs2_dinode_out(ip, dibh->b_data);
1120 brelse(dibh);
1121
1122 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { 1106 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
1123 u64 blocks = gfs2_get_inode_blocks(&ip->i_inode); 1107 u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
1124 gfs2_quota_change(ip, -blocks, ouid, ogid); 1108 gfs2_quota_change(ip, -blocks, ouid, ogid);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index f606baf9ba72..a689901963de 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -666,6 +666,10 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
666 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); 666 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
667 qd->qd_qb.qb_limit = qp->qu_limit; 667 qd->qd_qb.qb_limit = qp->qu_limit;
668 } 668 }
669 if (fdq->d_fieldmask & FS_DQ_BCOUNT) {
670 qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift);
671 qd->qd_qb.qb_value = qp->qu_value;
672 }
669 } 673 }
670 674
671 /* Write the quota into the quota file on disk */ 675 /* Write the quota into the quota file on disk */
@@ -1509,7 +1513,7 @@ out:
1509} 1513}
1510 1514
1511/* GFS2 only supports a subset of the XFS fields */ 1515/* GFS2 only supports a subset of the XFS fields */
1512#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD) 1516#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT)
1513 1517
1514static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, 1518static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1515 struct fs_disk_quota *fdq) 1519 struct fs_disk_quota *fdq)
@@ -1569,9 +1573,15 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1569 if ((fdq->d_fieldmask & FS_DQ_BSOFT) && 1573 if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
1570 ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) 1574 ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn)))
1571 fdq->d_fieldmask ^= FS_DQ_BSOFT; 1575 fdq->d_fieldmask ^= FS_DQ_BSOFT;
1576
1572 if ((fdq->d_fieldmask & FS_DQ_BHARD) && 1577 if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
1573 ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) 1578 ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit)))
1574 fdq->d_fieldmask ^= FS_DQ_BHARD; 1579 fdq->d_fieldmask ^= FS_DQ_BHARD;
1580
1581 if ((fdq->d_fieldmask & FS_DQ_BCOUNT) &&
1582 ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value)))
1583 fdq->d_fieldmask ^= FS_DQ_BCOUNT;
1584
1575 if (fdq->d_fieldmask == 0) 1585 if (fdq->d_fieldmask == 0)
1576 goto out_i; 1586 goto out_i;
1577 1587
@@ -1620,4 +1630,3 @@ const struct quotactl_ops gfs2_quotactl_ops = {
1620 .get_dqblk = gfs2_get_dqblk, 1630 .get_dqblk = gfs2_get_dqblk,
1621 .set_dqblk = gfs2_set_dqblk, 1631 .set_dqblk = gfs2_set_dqblk,
1622}; 1632};
1623
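
Accepting FS_DQ_BCOUNT lets a management tool write back the accounted block usage itself, not just the limits, e.g. after recomputing usage offline. A hypothetical user-space sketch; the header location and flag spellings vary between quota header generations:

	#include <sys/quota.h>
	#include <xfs/xqm.h>	/* fs_disk_quota etc.; availability varies */

	/*
	 * Write back recomputed block usage for a user. d_bcount is in
	 * 512-byte basic blocks; device and uid are placeholders.
	 */
	static int set_usage(const char *dev, int uid, unsigned long long bb)
	{
		struct fs_disk_quota d = { 0 };

		d.d_version = FS_DQUOT_VERSION;
		d.d_id = uid;
		d.d_flags = FS_USER_QUOTA;	/* XFS_USER_QUOTA in older headers */
		d.d_fieldmask = FS_DQ_BCOUNT;
		d.d_bcount = bb;
		return quotactl(QCMD(Q_XSETQLIM, USRQUOTA), dev, uid, (void *)&d);
	}
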
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 33c8407b876f..7293ea27020c 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
500 for (rgrps = 0;; rgrps++) { 500 for (rgrps = 0;; rgrps++) {
501 loff_t pos = rgrps * sizeof(struct gfs2_rindex); 501 loff_t pos = rgrps * sizeof(struct gfs2_rindex);
502 502
503 if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode)) 503 if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
504 break; 504 break;
505 error = gfs2_internal_read(ip, &ra_state, buf, &pos, 505 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
506 sizeof(struct gfs2_rindex)); 506 sizeof(struct gfs2_rindex));
@@ -583,7 +583,7 @@ static int read_rindex_entry(struct gfs2_inode *ip,
583 * Returns: 0 on successful update, error code otherwise 583 * Returns: 0 on successful update, error code otherwise
584 */ 584 */
585 585
586static int gfs2_ri_update(struct gfs2_inode *ip) 586int gfs2_ri_update(struct gfs2_inode *ip)
587{ 587{
588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
589 struct inode *inode = &ip->i_inode; 589 struct inode *inode = &ip->i_inode;
@@ -614,46 +614,6 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
614} 614}
615 615
616/** 616/**
617 * gfs2_ri_update_special - Pull in a new resource index from the disk
618 *
619 * This is a special version that's safe to call from gfs2_inplace_reserve_i.
620 * In this case we know that we don't have any resource groups in memory yet.
621 *
622 * @ip: pointer to the rindex inode
623 *
624 * Returns: 0 on successful update, error code otherwise
625 */
626static int gfs2_ri_update_special(struct gfs2_inode *ip)
627{
628 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
629 struct inode *inode = &ip->i_inode;
630 struct file_ra_state ra_state;
631 struct gfs2_rgrpd *rgd;
632 unsigned int max_data = 0;
633 int error;
634
635 file_ra_state_init(&ra_state, inode->i_mapping);
636 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
637 /* Ignore partials */
638 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
639 i_size_read(inode))
640 break;
641 error = read_rindex_entry(ip, &ra_state);
642 if (error) {
643 clear_rgrpdi(sdp);
644 return error;
645 }
646 }
647 list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
648 if (rgd->rd_data > max_data)
649 max_data = rgd->rd_data;
650 sdp->sd_max_rg_data = max_data;
651
652 sdp->sd_rindex_uptodate = 1;
653 return 0;
654}
655
656/**
657 * gfs2_rindex_hold - Grab a lock on the rindex 617 * gfs2_rindex_hold - Grab a lock on the rindex
658 * @sdp: The GFS2 superblock 618 * @sdp: The GFS2 superblock
659 * @ri_gh: the glock holder 619 * @ri_gh: the glock holder
@@ -1226,16 +1186,25 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
1226 error = gfs2_rindex_hold(sdp, &al->al_ri_gh); 1186 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
1227 else if (!sdp->sd_rgrps) /* We may not have the rindex read 1187 else if (!sdp->sd_rgrps) /* We may not have the rindex read
1228 in, so: */ 1188 in, so: */
1229 error = gfs2_ri_update_special(ip); 1189 error = gfs2_ri_update(ip);
1230 if (error) 1190 if (error)
1231 return error; 1191 return error;
1232 } 1192 }
1233 1193
1194try_again:
1234 do { 1195 do {
1235 error = get_local_rgrp(ip, &last_unlinked); 1196 error = get_local_rgrp(ip, &last_unlinked);
1236 /* If there is no space, flushing the log may release some */ 1197 /* If there is no space, flushing the log may release some */
1237 if (error) 1198 if (error) {
1199 if (ip == GFS2_I(sdp->sd_rindex) &&
1200 !sdp->sd_rindex_uptodate) {
1201 error = gfs2_ri_update(ip);
1202 if (error)
1203 return error;
1204 goto try_again;
1205 }
1238 gfs2_log_flush(sdp, NULL); 1206 gfs2_log_flush(sdp, NULL);
1207 }
1239 } while (error && tries++ < 3); 1208 } while (error && tries++ < 3);
1240 1209
1241 if (error) { 1210 if (error) {
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 0e35c0466f9a..50c2bb04369c 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -48,6 +48,7 @@ extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
48 48
49extern void gfs2_inplace_release(struct gfs2_inode *ip); 49extern void gfs2_inplace_release(struct gfs2_inode *ip);
50 50
51extern int gfs2_ri_update(struct gfs2_inode *ip);
51extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); 52extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
52extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); 53extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
53 54
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 30b58f07c8a6..439b61c03262 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1296,10 +1296,8 @@ fail:
1296 1296
1297int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) 1297int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
1298{ 1298{
1299 struct inode *inode = &ip->i_inode;
1300 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1299 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1301 struct gfs2_ea_location el; 1300 struct gfs2_ea_location el;
1302 struct buffer_head *dibh;
1303 int error; 1301 int error;
1304 1302
1305 error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el); 1303 error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
@@ -1321,26 +1319,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
1321 if (error) 1319 if (error)
1322 return error; 1320 return error;
1323 1321
1324 error = gfs2_meta_inode_buffer(ip, &dibh); 1322 error = gfs2_setattr_simple(ip, attr);
1325 if (error)
1326 goto out_trans_end;
1327
1328 if ((attr->ia_valid & ATTR_SIZE) &&
1329 attr->ia_size != i_size_read(inode)) {
1330 int error;
1331
1332 error = vmtruncate(inode, attr->ia_size);
1333 gfs2_assert_warn(GFS2_SB(inode), !error);
1334 }
1335
1336 setattr_copy(inode, attr);
1337 mark_inode_dirty(inode);
1338
1339 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1340 gfs2_dinode_out(ip, dibh->b_data);
1341 brelse(dibh);
1342
1343out_trans_end:
1344 gfs2_trans_end(sdp); 1323 gfs2_trans_end(sdp);
1345 return error; 1324 return error;
1346} 1325}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 182845147fe4..08cba2c3b612 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1407,6 +1407,82 @@ static const struct file_operations proc_pid_sched_operations = {
1407 1407
1408#endif 1408#endif
1409 1409
1410#ifdef CONFIG_SCHED_AUTOGROUP
1411/*
1412 * Print out autogroup related information:
1413 */
1414static int sched_autogroup_show(struct seq_file *m, void *v)
1415{
1416 struct inode *inode = m->private;
1417 struct task_struct *p;
1418
1419 p = get_proc_task(inode);
1420 if (!p)
1421 return -ESRCH;
1422 proc_sched_autogroup_show_task(p, m);
1423
1424 put_task_struct(p);
1425
1426 return 0;
1427}
1428
1429static ssize_t
1430sched_autogroup_write(struct file *file, const char __user *buf,
1431 size_t count, loff_t *offset)
1432{
1433 struct inode *inode = file->f_path.dentry->d_inode;
1434 struct task_struct *p;
1435 char buffer[PROC_NUMBUF];
1436 long nice;
1437 int err;
1438
1439 memset(buffer, 0, sizeof(buffer));
1440 if (count > sizeof(buffer) - 1)
1441 count = sizeof(buffer) - 1;
1442 if (copy_from_user(buffer, buf, count))
1443 return -EFAULT;
1444
1445 err = strict_strtol(strstrip(buffer), 0, &nice);
1446 if (err)
1447 return -EINVAL;
1448
1449 p = get_proc_task(inode);
1450 if (!p)
1451 return -ESRCH;
1452
1453 err = nice;
1454 err = proc_sched_autogroup_set_nice(p, &err);
1455 if (err)
1456 count = err;
1457
1458 put_task_struct(p);
1459
1460 return count;
1461}
1462
1463static int sched_autogroup_open(struct inode *inode, struct file *filp)
1464{
1465 int ret;
1466
1467 ret = single_open(filp, sched_autogroup_show, NULL);
1468 if (!ret) {
1469 struct seq_file *m = filp->private_data;
1470
1471 m->private = inode;
1472 }
1473 return ret;
1474}
1475
1476static const struct file_operations proc_pid_sched_autogroup_operations = {
1477 .open = sched_autogroup_open,
1478 .read = seq_read,
1479 .write = sched_autogroup_write,
1480 .llseek = seq_lseek,
1481 .release = single_release,
1482};
1483
1484#endif /* CONFIG_SCHED_AUTOGROUP */
1485
1410static ssize_t comm_write(struct file *file, const char __user *buf, 1486static ssize_t comm_write(struct file *file, const char __user *buf,
1411 size_t count, loff_t *offset) 1487 size_t count, loff_t *offset)
1412{ 1488{
@@ -2733,6 +2809,9 @@ static const struct pid_entry tgid_base_stuff[] = {
2733#ifdef CONFIG_SCHED_DEBUG 2809#ifdef CONFIG_SCHED_DEBUG
2734 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2810 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2735#endif 2811#endif
2812#ifdef CONFIG_SCHED_AUTOGROUP
2813 REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
2814#endif
2736 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 2815 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2737#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2816#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2738 INF("syscall", S_IRUSR, proc_pid_syscall), 2817 INF("syscall", S_IRUSR, proc_pid_syscall),
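
The new entry gives each task a /proc/<pid>/autogroup file: reading shows the task's autogroup via proc_sched_autogroup_show_task(), and writing a nice value (parsed with strict_strtol() above) adjusts the weight of the whole group. A user-space sketch, with a placeholder PID:

	#include <stdio.h>

	/*
	 * Adjust the autogroup nice value of PID 1234; requires
	 * CONFIG_SCHED_AUTOGROUP and suitable permissions.
	 */
	int main(void)
	{
		FILE *f = fopen("/proc/1234/autogroup", "w");

		if (!f)
			return 1;
		fprintf(f, "10\n");	/* deprioritize the whole autogroup */
		return fclose(f) ? 1 : 0;
	}
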
diff --git a/include/linux/completion.h b/include/linux/completion.h
index 36d57f74cd01..51494e6b5548 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -81,10 +81,10 @@ extern int wait_for_completion_interruptible(struct completion *x);
81extern int wait_for_completion_killable(struct completion *x); 81extern int wait_for_completion_killable(struct completion *x);
82extern unsigned long wait_for_completion_timeout(struct completion *x, 82extern unsigned long wait_for_completion_timeout(struct completion *x,
83 unsigned long timeout); 83 unsigned long timeout);
84extern unsigned long wait_for_completion_interruptible_timeout( 84extern long wait_for_completion_interruptible_timeout(
85 struct completion *x, unsigned long timeout); 85 struct completion *x, unsigned long timeout);
86extern unsigned long wait_for_completion_killable_timeout( 86extern long wait_for_completion_killable_timeout(
87 struct completion *x, unsigned long timeout); 87 struct completion *x, unsigned long timeout);
88extern bool try_wait_for_completion(struct completion *x); 88extern bool try_wait_for_completion(struct completion *x);
89extern bool completion_done(struct completion *x); 89extern bool completion_done(struct completion *x);
90 90
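
The return type change matters because these calls return a negative errno when interrupted; with an unsigned long return, a `ret < 0` check could never fire. The typical caller pattern under the corrected signature:

	#include <linux/completion.h>
	#include <linux/errno.h>
	#include <linux/jiffies.h>

	static int wait_for_device(struct completion *done)
	{
		long ret;

		ret = wait_for_completion_interruptible_timeout(done,
						msecs_to_jiffies(100));
		if (ret < 0)		/* -ERESTARTSYS: interrupted by a signal */
			return ret;
		if (ret == 0)		/* timed out */
			return -ETIMEDOUT;
		return 0;		/* completed; ret was the jiffies left */
	}
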
diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h
index a90b3892074a..1c70028f81f9 100644
--- a/include/linux/dynamic_debug.h
+++ b/include/linux/dynamic_debug.h
@@ -44,34 +44,24 @@ int ddebug_add_module(struct _ddebug *tab, unsigned int n,
44extern int ddebug_remove_module(const char *mod_name); 44extern int ddebug_remove_module(const char *mod_name);
45 45
46#define dynamic_pr_debug(fmt, ...) do { \ 46#define dynamic_pr_debug(fmt, ...) do { \
47 __label__ do_printk; \
48 __label__ out; \
49 static struct _ddebug descriptor \ 47 static struct _ddebug descriptor \
50 __used \ 48 __used \
51 __attribute__((section("__verbose"), aligned(8))) = \ 49 __attribute__((section("__verbose"), aligned(8))) = \
52 { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \ 50 { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \
53 _DPRINTK_FLAGS_DEFAULT }; \ 51 _DPRINTK_FLAGS_DEFAULT }; \
54 JUMP_LABEL(&descriptor.enabled, do_printk); \ 52 if (unlikely(descriptor.enabled)) \
55 goto out; \ 53 printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \
56do_printk: \
57 printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \
58out: ; \
59 } while (0) 54 } while (0)
60 55
61 56
62#define dynamic_dev_dbg(dev, fmt, ...) do { \ 57#define dynamic_dev_dbg(dev, fmt, ...) do { \
63 __label__ do_printk; \
64 __label__ out; \
65 static struct _ddebug descriptor \ 58 static struct _ddebug descriptor \
66 __used \ 59 __used \
67 __attribute__((section("__verbose"), aligned(8))) = \ 60 __attribute__((section("__verbose"), aligned(8))) = \
68 { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \ 61 { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \
69 _DPRINTK_FLAGS_DEFAULT }; \ 62 _DPRINTK_FLAGS_DEFAULT }; \
70 JUMP_LABEL(&descriptor.enabled, do_printk); \ 63 if (unlikely(descriptor.enabled)) \
71 goto out; \ 64 dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \
72do_printk: \
73 dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \
74out: ; \
75 } while (0) 65 } while (0)
76 66
77#else 67#else
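
This hunk reverts the jump-label experiment at the call sites to a plain unlikely(descriptor.enabled) test; the per-callsite _ddebug descriptors and the runtime control path are unchanged. A sketch of a module-side consumer, assuming debugfs is mounted:

	#include <linux/module.h>
	#include <linux/kernel.h>

	/*
	 * A pr_debug() built with -DDEBUG or CONFIG_DYNAMIC_DEBUG expands
	 * to dynamic_pr_debug() above; with dynamic debug the site can
	 * then be enabled at runtime, e.g.:
	 *   echo 'module demo +p' > <debugfs>/dynamic_debug/control
	 */
	static int __init demo_init(void)
	{
		pr_debug("demo loaded\n");	/* one static _ddebug descriptor */
		return 0;
	}
	module_init(demo_init);
	MODULE_LICENSE("GPL");
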
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index fd0c1b857d3d..330586ffffbb 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -22,7 +22,7 @@
22#include <linux/wait.h> 22#include <linux/wait.h>
23#include <linux/percpu.h> 23#include <linux/percpu.h>
24#include <linux/timer.h> 24#include <linux/timer.h>
25 25#include <linux/timerqueue.h>
26 26
27struct hrtimer_clock_base; 27struct hrtimer_clock_base;
28struct hrtimer_cpu_base; 28struct hrtimer_cpu_base;
@@ -79,8 +79,8 @@ enum hrtimer_restart {
79 79
80/** 80/**
81 * struct hrtimer - the basic hrtimer structure 81 * struct hrtimer - the basic hrtimer structure
82 * @node: red black tree node for time ordered insertion 82 * @node: timerqueue node, which also manages node.expires,
83 * @_expires: the absolute expiry time in the hrtimers internal 83 * the absolute expiry time in the hrtimers internal
84 * representation. The time is related to the clock on 84 * representation. The time is related to the clock on
85 * which the timer is based. Is setup by adding 85 * which the timer is based. Is setup by adding
86 * slack to the _softexpires value. For non range timers 86 * slack to the _softexpires value. For non range timers
@@ -101,8 +101,7 @@ enum hrtimer_restart {
101 * The hrtimer structure must be initialized by hrtimer_init() 101 * The hrtimer structure must be initialized by hrtimer_init()
102 */ 102 */
103struct hrtimer { 103struct hrtimer {
104 struct rb_node node; 104 struct timerqueue_node node;
105 ktime_t _expires;
106 ktime_t _softexpires; 105 ktime_t _softexpires;
107 enum hrtimer_restart (*function)(struct hrtimer *); 106 enum hrtimer_restart (*function)(struct hrtimer *);
108 struct hrtimer_clock_base *base; 107 struct hrtimer_clock_base *base;
@@ -141,8 +140,7 @@ struct hrtimer_sleeper {
141struct hrtimer_clock_base { 140struct hrtimer_clock_base {
142 struct hrtimer_cpu_base *cpu_base; 141 struct hrtimer_cpu_base *cpu_base;
143 clockid_t index; 142 clockid_t index;
144 struct rb_root active; 143 struct timerqueue_head active;
145 struct rb_node *first;
146 ktime_t resolution; 144 ktime_t resolution;
147 ktime_t (*get_time)(void); 145 ktime_t (*get_time)(void);
148 ktime_t softirq_time; 146 ktime_t softirq_time;
@@ -158,7 +156,6 @@ struct hrtimer_clock_base {
158 * @lock: lock protecting the base and associated clock bases 156 * @lock: lock protecting the base and associated clock bases
159 * and timers 157 * and timers
160 * @clock_base: array of clock bases for this cpu 158 * @clock_base: array of clock bases for this cpu
161 * @curr_timer: the timer which is executing a callback right now
162 * @expires_next: absolute time of the next event which was scheduled 159 * @expires_next: absolute time of the next event which was scheduled
163 * via clock_set_next_event() 160 * via clock_set_next_event()
164 * @hres_active: State of high resolution mode 161 * @hres_active: State of high resolution mode
@@ -184,43 +181,43 @@ struct hrtimer_cpu_base {
184 181
185static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) 182static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
186{ 183{
187 timer->_expires = time; 184 timer->node.expires = time;
188 timer->_softexpires = time; 185 timer->_softexpires = time;
189} 186}
190 187
191static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta) 188static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta)
192{ 189{
193 timer->_softexpires = time; 190 timer->_softexpires = time;
194 timer->_expires = ktime_add_safe(time, delta); 191 timer->node.expires = ktime_add_safe(time, delta);
195} 192}
196 193
197static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta) 194static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta)
198{ 195{
199 timer->_softexpires = time; 196 timer->_softexpires = time;
200 timer->_expires = ktime_add_safe(time, ns_to_ktime(delta)); 197 timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta));
201} 198}
202 199
203static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64) 200static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64)
204{ 201{
205 timer->_expires.tv64 = tv64; 202 timer->node.expires.tv64 = tv64;
206 timer->_softexpires.tv64 = tv64; 203 timer->_softexpires.tv64 = tv64;
207} 204}
208 205
209static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time) 206static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)
210{ 207{
211 timer->_expires = ktime_add_safe(timer->_expires, time); 208 timer->node.expires = ktime_add_safe(timer->node.expires, time);
212 timer->_softexpires = ktime_add_safe(timer->_softexpires, time); 209 timer->_softexpires = ktime_add_safe(timer->_softexpires, time);
213} 210}
214 211
215static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns) 212static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns)
216{ 213{
217 timer->_expires = ktime_add_ns(timer->_expires, ns); 214 timer->node.expires = ktime_add_ns(timer->node.expires, ns);
218 timer->_softexpires = ktime_add_ns(timer->_softexpires, ns); 215 timer->_softexpires = ktime_add_ns(timer->_softexpires, ns);
219} 216}
220 217
221static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer) 218static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
222{ 219{
223 return timer->_expires; 220 return timer->node.expires;
224} 221}
225 222
226static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer) 223static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer)
@@ -230,7 +227,7 @@ static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer)
230 227
231static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer) 228static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer)
232{ 229{
233 return timer->_expires.tv64; 230 return timer->node.expires.tv64;
234} 231}
235static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer) 232static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer)
236{ 233{
@@ -239,12 +236,12 @@ static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer)
239 236
240static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) 237static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer)
241{ 238{
242 return ktime_to_ns(timer->_expires); 239 return ktime_to_ns(timer->node.expires);
243} 240}
244 241
245static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) 242static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer)
246{ 243{
247 return ktime_sub(timer->_expires, timer->base->get_time()); 244 return ktime_sub(timer->node.expires, timer->base->get_time());
248} 245}
249 246
250#ifdef CONFIG_HIGH_RES_TIMERS 247#ifdef CONFIG_HIGH_RES_TIMERS
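
The conversion replaces the open-coded rb_root plus first pointer with the shared timerqueue, which keeps the earliest-expiring node cached, and the timer's expiry moves from _expires into node.expires. A usage sketch of that API; return-value details are elided since they changed over time:

	#include <linux/ktime.h>
	#include <linux/timerqueue.h>

	static struct timerqueue_head queue;	/* set up via timerqueue_init_head() */

	/*
	 * Insert a node expiring 'ns' nanoseconds from 'now' and peek at
	 * the soonest deadline, mirroring what the hrtimer bases now do
	 * with node.expires.
	 */
	static void enqueue_deadline(struct timerqueue_node *node,
				     ktime_t now, u64 ns)
	{
		timerqueue_init(node);
		node->expires = ktime_add_ns(now, ns);
		timerqueue_add(&queue, node);
	}

	static ktime_t next_deadline(void)
	{
		struct timerqueue_node *next = timerqueue_getnext(&queue);

		return next ? next->expires : (ktime_t){ .tv64 = KTIME_MAX };
	}
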
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1f8c06ce0fa6..caa151fbebb7 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -12,6 +12,13 @@
12#include <linux/securebits.h> 12#include <linux/securebits.h>
13#include <net/net_namespace.h> 13#include <net/net_namespace.h>
14 14
15#ifdef CONFIG_SMP
16# define INIT_PUSHABLE_TASKS(tsk) \
17 .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO),
18#else
19# define INIT_PUSHABLE_TASKS(tsk)
20#endif
21
15extern struct files_struct init_files; 22extern struct files_struct init_files;
16extern struct fs_struct init_fs; 23extern struct fs_struct init_fs;
17 24
@@ -83,6 +90,12 @@ extern struct group_info init_groups;
83 */ 90 */
84# define CAP_INIT_BSET CAP_FULL_SET 91# define CAP_INIT_BSET CAP_FULL_SET
85 92
93#ifdef CONFIG_RCU_BOOST
94#define INIT_TASK_RCU_BOOST() \
95 .rcu_boost_mutex = NULL,
96#else
97#define INIT_TASK_RCU_BOOST()
98#endif
86#ifdef CONFIG_TREE_PREEMPT_RCU 99#ifdef CONFIG_TREE_PREEMPT_RCU
87#define INIT_TASK_RCU_TREE_PREEMPT() \ 100#define INIT_TASK_RCU_TREE_PREEMPT() \
88 .rcu_blocked_node = NULL, 101 .rcu_blocked_node = NULL,
@@ -94,7 +107,8 @@ extern struct group_info init_groups;
94 .rcu_read_lock_nesting = 0, \ 107 .rcu_read_lock_nesting = 0, \
95 .rcu_read_unlock_special = 0, \ 108 .rcu_read_unlock_special = 0, \
96 .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \ 109 .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \
97 INIT_TASK_RCU_TREE_PREEMPT() 110 INIT_TASK_RCU_TREE_PREEMPT() \
111 INIT_TASK_RCU_BOOST()
98#else 112#else
99#define INIT_TASK_RCU_PREEMPT(tsk) 113#define INIT_TASK_RCU_PREEMPT(tsk)
100#endif 114#endif
@@ -137,7 +151,7 @@ extern struct cred init_cred;
137 .nr_cpus_allowed = NR_CPUS, \ 151 .nr_cpus_allowed = NR_CPUS, \
138 }, \ 152 }, \
139 .tasks = LIST_HEAD_INIT(tsk.tasks), \ 153 .tasks = LIST_HEAD_INIT(tsk.tasks), \
140 .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \ 154 INIT_PUSHABLE_TASKS(tsk) \
141 .ptraced = LIST_HEAD_INIT(tsk.ptraced), \ 155 .ptraced = LIST_HEAD_INIT(tsk.ptraced), \
142 .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \ 156 .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \
143 .real_parent = &tsk, \ 157 .real_parent = &tsk, \
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 79d0c4f6d071..55e0d4253e49 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -114,15 +114,15 @@ typedef irqreturn_t (*irq_handler_t)(int, void *);
114struct irqaction { 114struct irqaction {
115 irq_handler_t handler; 115 irq_handler_t handler;
116 unsigned long flags; 116 unsigned long flags;
117 const char *name;
118 void *dev_id; 117 void *dev_id;
119 struct irqaction *next; 118 struct irqaction *next;
120 int irq; 119 int irq;
121 struct proc_dir_entry *dir;
122 irq_handler_t thread_fn; 120 irq_handler_t thread_fn;
123 struct task_struct *thread; 121 struct task_struct *thread;
124 unsigned long thread_flags; 122 unsigned long thread_flags;
125}; 123 const char *name;
124 struct proc_dir_entry *dir;
125} ____cacheline_internodealigned_in_smp;
126 126
127extern irqreturn_t no_action(int cpl, void *dev_id); 127extern irqreturn_t no_action(int cpl, void *dev_id);
128 128
diff --git a/include/linux/module.h b/include/linux/module.h
index 7575bbbdf2a2..8b17fd8c790d 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -308,6 +308,9 @@ struct module
308 /* The size of the executable code in each section. */ 308 /* The size of the executable code in each section. */
309 unsigned int init_text_size, core_text_size; 309 unsigned int init_text_size, core_text_size;
310 310
311 /* Size of RO sections of the module (text+rodata) */
312 unsigned int init_ro_size, core_ro_size;
313
311 /* Arch-specific module values */ 314 /* Arch-specific module values */
312 struct mod_arch_specific arch; 315 struct mod_arch_specific arch;
313 316
@@ -672,7 +675,6 @@ static inline int module_get_iter_tracepoints(struct tracepoint_iter *iter)
672{ 675{
673 return 0; 676 return 0;
674} 677}
675
676#endif /* CONFIG_MODULES */ 678#endif /* CONFIG_MODULES */
677 679
678#ifdef CONFIG_SYSFS 680#ifdef CONFIG_SYSFS
@@ -687,6 +689,13 @@ extern int module_sysfs_initialized;
687 689
688#define __MODULE_STRING(x) __stringify(x) 690#define __MODULE_STRING(x) __stringify(x)
689 691
692#ifdef CONFIG_DEBUG_SET_MODULE_RONX
693extern void set_all_modules_text_rw(void);
694extern void set_all_modules_text_ro(void);
695#else
696static inline void set_all_modules_text_rw(void) { }
697static inline void set_all_modules_text_ro(void) { }
698#endif
690 699
691#ifdef CONFIG_GENERIC_BUG 700#ifdef CONFIG_GENERIC_BUG
692void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *, 701void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *,
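The CONFIG_DEBUG_SET_MODULE_RONX helpers declared above let code patching temporarily lift the read-only protection on module text. A minimal sketch of the intended call pattern (the patching step in the middle is a hypothetical placeholder, not a real API):

	set_all_modules_text_rw();
	/* rewrite instructions in module .text here, e.g. ftrace patching */
	set_all_modules_text_ro();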
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index f363bc8fdc74..94b48bd40dd7 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -160,4 +160,8 @@ extern int mutex_trylock(struct mutex *lock);
160extern void mutex_unlock(struct mutex *lock); 160extern void mutex_unlock(struct mutex *lock);
161extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); 161extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
162 162
163#ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX
164#define arch_mutex_cpu_relax() cpu_relax()
165#endif
166
163#endif 167#endif
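Architectures that select CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX supply their own arch_mutex_cpu_relax() from asm/mutex.h; everyone else falls through to plain cpu_relax() via the fallback above. A sketch of such an override, along the lines of the s390 change carried in this merge (illustrative, not a verbatim copy of that file):

	/* asm/mutex.h: cpu_relax() is too expensive in the mutex spin loop */
	#define arch_mutex_cpu_relax()	barrier()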
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index f31ef61f1c65..2dea94fc4402 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -241,11 +241,6 @@ static inline void list_splice_init_rcu(struct list_head *list,
241#define list_first_entry_rcu(ptr, type, member) \ 241#define list_first_entry_rcu(ptr, type, member) \
242 list_entry_rcu((ptr)->next, type, member) 242 list_entry_rcu((ptr)->next, type, member)
243 243
244#define __list_for_each_rcu(pos, head) \
245 for (pos = rcu_dereference_raw(list_next_rcu(head)); \
246 pos != (head); \
247 pos = rcu_dereference_raw(list_next_rcu((pos)))
248
249/** 244/**
250 * list_for_each_entry_rcu - iterate over rcu list of given type 245 * list_for_each_entry_rcu - iterate over rcu list of given type
251 * @pos: the type * to use as a loop cursor. 246 * @pos: the type * to use as a loop cursor.
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 03cda7bed985..af5614856285 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -47,6 +47,8 @@
47extern int rcutorture_runnable; /* for sysctl */ 47extern int rcutorture_runnable; /* for sysctl */
48#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ 48#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
49 49
50#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))
51#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))
50#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) 52#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
51#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) 53#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
52 54
@@ -66,7 +68,6 @@ extern void call_rcu_sched(struct rcu_head *head,
66extern void synchronize_sched(void); 68extern void synchronize_sched(void);
67extern void rcu_barrier_bh(void); 69extern void rcu_barrier_bh(void);
68extern void rcu_barrier_sched(void); 70extern void rcu_barrier_sched(void);
69extern void synchronize_sched_expedited(void);
70extern int sched_expedited_torture_stats(char *page); 71extern int sched_expedited_torture_stats(char *page);
71 72
72static inline void __rcu_read_lock_bh(void) 73static inline void __rcu_read_lock_bh(void)
@@ -118,7 +119,6 @@ static inline int rcu_preempt_depth(void)
118#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ 119#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
119 120
120/* Internal to kernel */ 121/* Internal to kernel */
121extern void rcu_init(void);
122extern void rcu_sched_qs(int cpu); 122extern void rcu_sched_qs(int cpu);
123extern void rcu_bh_qs(int cpu); 123extern void rcu_bh_qs(int cpu);
124extern void rcu_check_callbacks(int cpu, int user); 124extern void rcu_check_callbacks(int cpu, int user);
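The new UINT_CMP_GE()/UINT_CMP_LT() macros mirror the existing ULONG variants: they compare free-running counters modulo 2^32, so the answer stays correct across wraparound. A worked example (both operands are compile-time constants, so the checks below could sit in any function as BUILD_BUG_ON() assertions):

	/* 0x00000001 - 0xffffffff == 2 (mod 2^32), which is <= UINT_MAX/2,
	 * so 1 is treated as "ahead of" 0xffffffff despite being smaller. */
	BUILD_BUG_ON(!UINT_CMP_GE(0x00000001U, 0xffffffffU));
	BUILD_BUG_ON(!UINT_CMP_LT(0xffffffffU, 0x00000001U));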
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 13877cb93a60..30ebd7c8d874 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -27,7 +27,9 @@
27 27
28#include <linux/cache.h> 28#include <linux/cache.h>
29 29
30#define rcu_init_sched() do { } while (0) 30static inline void rcu_init(void)
31{
32}
31 33
32#ifdef CONFIG_TINY_RCU 34#ifdef CONFIG_TINY_RCU
33 35
@@ -58,6 +60,11 @@ static inline void synchronize_rcu_bh_expedited(void)
58 synchronize_sched(); 60 synchronize_sched();
59} 61}
60 62
63static inline void synchronize_sched_expedited(void)
64{
65 synchronize_sched();
66}
67
61#ifdef CONFIG_TINY_RCU 68#ifdef CONFIG_TINY_RCU
62 69
63static inline void rcu_preempt_note_context_switch(void) 70static inline void rcu_preempt_note_context_switch(void)
@@ -125,16 +132,12 @@ static inline void rcu_cpu_stall_reset(void)
125} 132}
126 133
127#ifdef CONFIG_DEBUG_LOCK_ALLOC 134#ifdef CONFIG_DEBUG_LOCK_ALLOC
128
129extern int rcu_scheduler_active __read_mostly; 135extern int rcu_scheduler_active __read_mostly;
130extern void rcu_scheduler_starting(void); 136extern void rcu_scheduler_starting(void);
131
132#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 137#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
133
134static inline void rcu_scheduler_starting(void) 138static inline void rcu_scheduler_starting(void)
135{ 139{
136} 140}
137
138#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 141#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
139 142
140#endif /* __LINUX_RCUTINY_H */ 143#endif /* __LINUX_RCUTINY_H */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 95518e628794..3a933482734a 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -30,6 +30,7 @@
30#ifndef __LINUX_RCUTREE_H 30#ifndef __LINUX_RCUTREE_H
31#define __LINUX_RCUTREE_H 31#define __LINUX_RCUTREE_H
32 32
33extern void rcu_init(void);
33extern void rcu_note_context_switch(int cpu); 34extern void rcu_note_context_switch(int cpu);
34extern int rcu_needs_cpu(int cpu); 35extern int rcu_needs_cpu(int cpu);
35extern void rcu_cpu_stall_reset(void); 36extern void rcu_cpu_stall_reset(void);
@@ -47,6 +48,7 @@ static inline void exit_rcu(void)
47#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 48#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
48 49
49extern void synchronize_rcu_bh(void); 50extern void synchronize_rcu_bh(void);
51extern void synchronize_sched_expedited(void);
50extern void synchronize_rcu_expedited(void); 52extern void synchronize_rcu_expedited(void);
51 53
52static inline void synchronize_rcu_bh_expedited(void) 54static inline void synchronize_rcu_bh_expedited(void)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a99d735db3df..777cd01e240e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -513,6 +513,8 @@ struct thread_group_cputimer {
513 spinlock_t lock; 513 spinlock_t lock;
514}; 514};
515 515
516struct autogroup;
517
516/* 518/*
517 * NOTE! "signal_struct" does not have its own 519 * NOTE! "signal_struct" does not have its own
518 * locking, because a shared signal_struct always 520 * locking, because a shared signal_struct always
@@ -580,6 +582,9 @@ struct signal_struct {
580 582
581 struct tty_struct *tty; /* NULL if no tty */ 583 struct tty_struct *tty; /* NULL if no tty */
582 584
585#ifdef CONFIG_SCHED_AUTOGROUP
586 struct autogroup *autogroup;
587#endif
583 /* 588 /*
584 * Cumulative resource counters for dead threads in the group, 589 * Cumulative resource counters for dead threads in the group,
585 * and for reaped dead child processes forked by this group. 590 * and for reaped dead child processes forked by this group.
@@ -1233,13 +1238,18 @@ struct task_struct {
1233#ifdef CONFIG_TREE_PREEMPT_RCU 1238#ifdef CONFIG_TREE_PREEMPT_RCU
1234 struct rcu_node *rcu_blocked_node; 1239 struct rcu_node *rcu_blocked_node;
1235#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 1240#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1241#ifdef CONFIG_RCU_BOOST
1242 struct rt_mutex *rcu_boost_mutex;
1243#endif /* #ifdef CONFIG_RCU_BOOST */
1236 1244
1237#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1245#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1238 struct sched_info sched_info; 1246 struct sched_info sched_info;
1239#endif 1247#endif
1240 1248
1241 struct list_head tasks; 1249 struct list_head tasks;
1250#ifdef CONFIG_SMP
1242 struct plist_node pushable_tasks; 1251 struct plist_node pushable_tasks;
1252#endif
1243 1253
1244 struct mm_struct *mm, *active_mm; 1254 struct mm_struct *mm, *active_mm;
1245#if defined(SPLIT_RSS_COUNTING) 1255#if defined(SPLIT_RSS_COUNTING)
@@ -1763,7 +1773,8 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
1763#ifdef CONFIG_PREEMPT_RCU 1773#ifdef CONFIG_PREEMPT_RCU
1764 1774
1765#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ 1775#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
1766#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */ 1776#define RCU_READ_UNLOCK_BOOSTED (1 << 1) /* boosted while in RCU read-side. */
1777#define RCU_READ_UNLOCK_NEED_QS (1 << 2) /* RCU core needs CPU response. */
1767 1778
1768static inline void rcu_copy_process(struct task_struct *p) 1779static inline void rcu_copy_process(struct task_struct *p)
1769{ 1780{
@@ -1771,7 +1782,10 @@ static inline void rcu_copy_process(struct task_struct *p)
1771 p->rcu_read_unlock_special = 0; 1782 p->rcu_read_unlock_special = 0;
1772#ifdef CONFIG_TREE_PREEMPT_RCU 1783#ifdef CONFIG_TREE_PREEMPT_RCU
1773 p->rcu_blocked_node = NULL; 1784 p->rcu_blocked_node = NULL;
1774#endif 1785#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1786#ifdef CONFIG_RCU_BOOST
1787 p->rcu_boost_mutex = NULL;
1788#endif /* #ifdef CONFIG_RCU_BOOST */
1775 INIT_LIST_HEAD(&p->rcu_node_entry); 1789 INIT_LIST_HEAD(&p->rcu_node_entry);
1776} 1790}
1777 1791
@@ -1876,14 +1890,11 @@ extern void sched_clock_idle_sleep_event(void);
1876extern void sched_clock_idle_wakeup_event(u64 delta_ns); 1890extern void sched_clock_idle_wakeup_event(u64 delta_ns);
1877 1891
1878#ifdef CONFIG_HOTPLUG_CPU 1892#ifdef CONFIG_HOTPLUG_CPU
1879extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p);
1880extern void idle_task_exit(void); 1893extern void idle_task_exit(void);
1881#else 1894#else
1882static inline void idle_task_exit(void) {} 1895static inline void idle_task_exit(void) {}
1883#endif 1896#endif
1884 1897
1885extern void sched_idle_next(void);
1886
1887#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 1898#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
1888extern void wake_up_idle_cpu(int cpu); 1899extern void wake_up_idle_cpu(int cpu);
1889#else 1900#else
@@ -1893,8 +1904,6 @@ static inline void wake_up_idle_cpu(int cpu) { }
1893extern unsigned int sysctl_sched_latency; 1904extern unsigned int sysctl_sched_latency;
1894extern unsigned int sysctl_sched_min_granularity; 1905extern unsigned int sysctl_sched_min_granularity;
1895extern unsigned int sysctl_sched_wakeup_granularity; 1906extern unsigned int sysctl_sched_wakeup_granularity;
1896extern unsigned int sysctl_sched_shares_ratelimit;
1897extern unsigned int sysctl_sched_shares_thresh;
1898extern unsigned int sysctl_sched_child_runs_first; 1907extern unsigned int sysctl_sched_child_runs_first;
1899 1908
1900enum sched_tunable_scaling { 1909enum sched_tunable_scaling {
@@ -1910,6 +1919,7 @@ extern unsigned int sysctl_sched_migration_cost;
1910extern unsigned int sysctl_sched_nr_migrate; 1919extern unsigned int sysctl_sched_nr_migrate;
1911extern unsigned int sysctl_sched_time_avg; 1920extern unsigned int sysctl_sched_time_avg;
1912extern unsigned int sysctl_timer_migration; 1921extern unsigned int sysctl_timer_migration;
1922extern unsigned int sysctl_sched_shares_window;
1913 1923
1914int sched_proc_update_handler(struct ctl_table *table, int write, 1924int sched_proc_update_handler(struct ctl_table *table, int write,
1915 void __user *buffer, size_t *length, 1925 void __user *buffer, size_t *length,
@@ -1935,6 +1945,24 @@ int sched_rt_handler(struct ctl_table *table, int write,
1935 1945
1936extern unsigned int sysctl_sched_compat_yield; 1946extern unsigned int sysctl_sched_compat_yield;
1937 1947
1948#ifdef CONFIG_SCHED_AUTOGROUP
1949extern unsigned int sysctl_sched_autogroup_enabled;
1950
1951extern void sched_autogroup_create_attach(struct task_struct *p);
1952extern void sched_autogroup_detach(struct task_struct *p);
1953extern void sched_autogroup_fork(struct signal_struct *sig);
1954extern void sched_autogroup_exit(struct signal_struct *sig);
1955#ifdef CONFIG_PROC_FS
1956extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m);
1957extern int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice);
1958#endif
1959#else
1960static inline void sched_autogroup_create_attach(struct task_struct *p) { }
1961static inline void sched_autogroup_detach(struct task_struct *p) { }
1962static inline void sched_autogroup_fork(struct signal_struct *sig) { }
1963static inline void sched_autogroup_exit(struct signal_struct *sig) { }
1964#endif
1965
1938#ifdef CONFIG_RT_MUTEXES 1966#ifdef CONFIG_RT_MUTEXES
1939extern int rt_mutex_getprio(struct task_struct *p); 1967extern int rt_mutex_getprio(struct task_struct *p);
1940extern void rt_mutex_setprio(struct task_struct *p, int prio); 1968extern void rt_mutex_setprio(struct task_struct *p, int prio);
@@ -1953,9 +1981,10 @@ extern int task_nice(const struct task_struct *p);
1953extern int can_nice(const struct task_struct *p, const int nice); 1981extern int can_nice(const struct task_struct *p, const int nice);
1954extern int task_curr(const struct task_struct *p); 1982extern int task_curr(const struct task_struct *p);
1955extern int idle_cpu(int cpu); 1983extern int idle_cpu(int cpu);
1956extern int sched_setscheduler(struct task_struct *, int, struct sched_param *); 1984extern int sched_setscheduler(struct task_struct *, int,
1985 const struct sched_param *);
1957extern int sched_setscheduler_nocheck(struct task_struct *, int, 1986extern int sched_setscheduler_nocheck(struct task_struct *, int,
1958 struct sched_param *); 1987 const struct sched_param *);
1959extern struct task_struct *idle_task(int cpu); 1988extern struct task_struct *idle_task(int cpu);
1960extern struct task_struct *curr_task(int cpu); 1989extern struct task_struct *curr_task(int cpu);
1961extern void set_curr_task(int cpu, struct task_struct *p); 1990extern void set_curr_task(int cpu, struct task_struct *p);
diff --git a/include/linux/sfi.h b/include/linux/sfi.h
index 7f770c638e99..fe817918b30e 100644
--- a/include/linux/sfi.h
+++ b/include/linux/sfi.h
@@ -77,6 +77,8 @@
77#define SFI_OEM_ID_SIZE 6 77#define SFI_OEM_ID_SIZE 6
78#define SFI_OEM_TABLE_ID_SIZE 8 78#define SFI_OEM_TABLE_ID_SIZE 8
79 79
80#define SFI_NAME_LEN 16
81
80#define SFI_SYST_SEARCH_BEGIN 0x000E0000 82#define SFI_SYST_SEARCH_BEGIN 0x000E0000
81#define SFI_SYST_SEARCH_END 0x000FFFFF 83#define SFI_SYST_SEARCH_END 0x000FFFFF
82 84
@@ -156,13 +158,13 @@ struct sfi_device_table_entry {
156 u16 addr; 158 u16 addr;
157 u8 irq; 159 u8 irq;
158 u32 max_freq; 160 u32 max_freq;
159 char name[16]; 161 char name[SFI_NAME_LEN];
160} __packed; 162} __packed;
161 163
162struct sfi_gpio_table_entry { 164struct sfi_gpio_table_entry {
163 char controller_name[16]; 165 char controller_name[SFI_NAME_LEN];
164 u16 pin_no; 166 u16 pin_no;
165 char pin_name[16]; 167 char pin_name[SFI_NAME_LEN];
166} __packed; 168} __packed;
167 169
168typedef int (*sfi_table_handler) (struct sfi_table_header *table); 170typedef int (*sfi_table_handler) (struct sfi_table_header *table);
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 38cf093ef62c..6abd9138beda 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -24,9 +24,9 @@ struct timer_list {
24 int slack; 24 int slack;
25 25
26#ifdef CONFIG_TIMER_STATS 26#ifdef CONFIG_TIMER_STATS
27 int start_pid;
27 void *start_site; 28 void *start_site;
28 char start_comm[16]; 29 char start_comm[16];
29 int start_pid;
30#endif 30#endif
31#ifdef CONFIG_LOCKDEP 31#ifdef CONFIG_LOCKDEP
32 struct lockdep_map lockdep_map; 32 struct lockdep_map lockdep_map;
@@ -48,12 +48,38 @@ extern struct tvec_base boot_tvec_bases;
48#define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn) 48#define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn)
49#endif 49#endif
50 50
51/*
52 * Note that all tvec_bases are 2 byte aligned and lower bit of
53 * base in timer_list is guaranteed to be zero. Use the LSB to
54 * indicate whether the timer is deferrable.
55 *
56 * A deferrable timer will work normally when the system is busy, but
57 * will not cause a CPU to come out of idle just to service it; instead,
58 * the timer will be serviced when the CPU eventually wakes up with a
59 * subsequent non-deferrable timer.
60 */
61#define TBASE_DEFERRABLE_FLAG (0x1)
62
51#define TIMER_INITIALIZER(_function, _expires, _data) { \ 63#define TIMER_INITIALIZER(_function, _expires, _data) { \
52 .entry = { .prev = TIMER_ENTRY_STATIC }, \ 64 .entry = { .prev = TIMER_ENTRY_STATIC }, \
53 .function = (_function), \ 65 .function = (_function), \
54 .expires = (_expires), \ 66 .expires = (_expires), \
55 .data = (_data), \ 67 .data = (_data), \
56 .base = &boot_tvec_bases, \ 68 .base = &boot_tvec_bases, \
69 .slack = -1, \
70 __TIMER_LOCKDEP_MAP_INITIALIZER( \
71 __FILE__ ":" __stringify(__LINE__)) \
72 }
73
74#define TBASE_MAKE_DEFERRED(ptr) ((struct tvec_base *) \
75 ((unsigned char *)(ptr) + TBASE_DEFERRABLE_FLAG))
76
77#define TIMER_DEFERRED_INITIALIZER(_function, _expires, _data) {\
78 .entry = { .prev = TIMER_ENTRY_STATIC }, \
79 .function = (_function), \
80 .expires = (_expires), \
81 .data = (_data), \
82 .base = TBASE_MAKE_DEFERRED(&boot_tvec_bases), \
57 __TIMER_LOCKDEP_MAP_INITIALIZER( \ 83 __TIMER_LOCKDEP_MAP_INITIALIZER( \
58 __FILE__ ":" __stringify(__LINE__)) \ 84 __FILE__ ":" __stringify(__LINE__)) \
59 } 85 }
@@ -248,11 +274,11 @@ static inline void timer_stats_timer_clear_start_info(struct timer_list *timer)
248 274
249extern void add_timer(struct timer_list *timer); 275extern void add_timer(struct timer_list *timer);
250 276
277extern int try_to_del_timer_sync(struct timer_list *timer);
278
251#ifdef CONFIG_SMP 279#ifdef CONFIG_SMP
252 extern int try_to_del_timer_sync(struct timer_list *timer);
253 extern int del_timer_sync(struct timer_list *timer); 280 extern int del_timer_sync(struct timer_list *timer);
254#else 281#else
255# define try_to_del_timer_sync(t) del_timer(t)
256# define del_timer_sync(t) del_timer(t) 282# define del_timer_sync(t) del_timer(t)
257#endif 283#endif
258 284
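Given the LSB encoding described in the comment above, any consumer of timer->base must strip TBASE_DEFERRABLE_FLAG before dereferencing the pointer. A minimal sketch of the decode helpers (the names are illustrative; the actual helpers live in kernel/timer.c):

	static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
	{
		return (unsigned long)base & TBASE_DEFERRABLE_FLAG;
	}

	static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
	{
		return (struct tvec_base *)((unsigned long)base &
					    ~TBASE_DEFERRABLE_FLAG);
	}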
diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h
new file mode 100644
index 000000000000..d24aabaca474
--- /dev/null
+++ b/include/linux/timerqueue.h
@@ -0,0 +1,50 @@
1#ifndef _LINUX_TIMERQUEUE_H
2#define _LINUX_TIMERQUEUE_H
3
4#include <linux/rbtree.h>
5#include <linux/ktime.h>
6
7
8struct timerqueue_node {
9 struct rb_node node;
10 ktime_t expires;
11};
12
13struct timerqueue_head {
14 struct rb_root head;
15 struct timerqueue_node *next;
16};
17
18
19extern void timerqueue_add(struct timerqueue_head *head,
20 struct timerqueue_node *node);
21extern void timerqueue_del(struct timerqueue_head *head,
22 struct timerqueue_node *node);
23extern struct timerqueue_node *timerqueue_iterate_next(
24 struct timerqueue_node *node);
25
26/**
27 * timerqueue_getnext - Returns the timer with the earliest expiration time
28 *
29 * @head: head of timerqueue
30 *
31 * Returns a pointer to the timer node that has the
32 * earliest expiration time.
33 */
34static inline
35struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head)
36{
37 return head->next;
38}
39
40static inline void timerqueue_init(struct timerqueue_node *node)
41{
42 RB_CLEAR_NODE(&node->node);
43}
44
45static inline void timerqueue_init_head(struct timerqueue_head *head)
46{
47 head->head = RB_ROOT;
48 head->next = NULL;
49}
50#endif /* _LINUX_TIMERQUEUE_H */
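A minimal usage sketch of the new interface, assuming a caller-defined structure that embeds a timerqueue_node (struct my_timer, my_queue, and the helpers are hypothetical; my_queue needs a one-time timerqueue_init_head() before use):

	struct my_timer {
		struct timerqueue_node node;
		void (*fn)(struct my_timer *);		/* payload */
	};

	static struct timerqueue_head my_queue;

	static void my_timer_start(struct my_timer *t, ktime_t expires)
	{
		timerqueue_init(&t->node);
		t->node.expires = expires;
		timerqueue_add(&my_queue, &t->node);
	}

	static struct my_timer *my_timer_peek(void)
	{
		/* head->next caches the earliest deadline, so this is O(1) */
		struct timerqueue_node *next = timerqueue_getnext(&my_queue);

		return next ? container_of(next, struct my_timer, node) : NULL;
	}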
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index d3e4f87e95c0..c6814616653b 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -32,7 +32,7 @@ struct tracepoint {
32 int state; /* State. */ 32 int state; /* State. */
33 void (*regfunc)(void); 33 void (*regfunc)(void);
34 void (*unregfunc)(void); 34 void (*unregfunc)(void);
35 struct tracepoint_func *funcs; 35 struct tracepoint_func __rcu *funcs;
36} __attribute__((aligned(32))); /* 36} __attribute__((aligned(32))); /*
37 * Aligned on 32 bytes because it is 37 * Aligned on 32 bytes because it is
38 * globally visible and gcc happily 38 * globally visible and gcc happily
@@ -326,7 +326,7 @@ do_trace: \
326 * memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); 326 * memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
327 * __entry->next_pid = next->pid; 327 * __entry->next_pid = next->pid;
328 * __entry->next_prio = next->prio; 328 * __entry->next_prio = next->prio;
329 * ) 329 * ),
330 * 330 *
331 * * 331 * *
332 * * Formatted output of a trace record via TP_printk(). 332 * * Formatted output of a trace record via TP_printk().
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 0c0771f06bfa..bd257fee6031 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -127,12 +127,20 @@ struct execute_work {
127 .timer = TIMER_INITIALIZER(NULL, 0, 0), \ 127 .timer = TIMER_INITIALIZER(NULL, 0, 0), \
128 } 128 }
129 129
130#define __DEFERRED_WORK_INITIALIZER(n, f) { \
131 .work = __WORK_INITIALIZER((n).work, (f)), \
132 .timer = TIMER_DEFERRED_INITIALIZER(NULL, 0, 0), \
133 }
134
130#define DECLARE_WORK(n, f) \ 135#define DECLARE_WORK(n, f) \
131 struct work_struct n = __WORK_INITIALIZER(n, f) 136 struct work_struct n = __WORK_INITIALIZER(n, f)
132 137
133#define DECLARE_DELAYED_WORK(n, f) \ 138#define DECLARE_DELAYED_WORK(n, f) \
134 struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f) 139 struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f)
135 140
141#define DECLARE_DEFERRED_WORK(n, f) \
142 struct delayed_work n = __DEFERRED_WORK_INITIALIZER(n, f)
143
136/* 144/*
137 * initialize a work item's function pointer 145 * initialize a work item's function pointer
138 */ 146 */
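DECLARE_DEFERRED_WORK() builds on the TIMER_DEFERRED_INITIALIZER() added to timer.h earlier in this merge: the resulting delayed work runs on schedule while the system is busy, but will not pull an idle CPU out of idle by itself. A short usage sketch (poll_fn and the one-second interval are illustrative):

	static void poll_fn(struct work_struct *work);
	static DECLARE_DEFERRED_WORK(poll_work, poll_fn);

	static void start_polling(void)
	{
		/* fires ~HZ jiffies from now, or at the next natural wakeup */
		schedule_delayed_work(&poll_work, HZ);
	}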
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index b0b4eb24d592..da39b22636f7 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -21,6 +21,16 @@
21#undef CREATE_TRACE_POINTS 21#undef CREATE_TRACE_POINTS
22 22
23#include <linux/stringify.h> 23#include <linux/stringify.h>
24/*
25 * module.h includes tracepoints, and because ftrace.h
26 * pulls in module.h:
27 * trace/ftrace.h -> linux/ftrace_event.h -> linux/perf_event.h ->
28 * linux/ftrace.h -> linux/module.h
29 * we must include module.h here before we play with any of
30 * the TRACE_EVENT() macros, otherwise the tracepoints included
31 * by module.h may break the build.
32 */
33#include <linux/module.h>
24 34
25#undef TRACE_EVENT 35#undef TRACE_EVENT
26#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ 36#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h
index 75ce9d500d8e..f10293c41b1e 100644
--- a/include/trace/events/skb.h
+++ b/include/trace/events/skb.h
@@ -25,9 +25,7 @@ TRACE_EVENT(kfree_skb,
25 25
26 TP_fast_assign( 26 TP_fast_assign(
27 __entry->skbaddr = skb; 27 __entry->skbaddr = skb;
28 if (skb) { 28 __entry->protocol = ntohs(skb->protocol);
29 __entry->protocol = ntohs(skb->protocol);
30 }
31 __entry->location = location; 29 __entry->location = location;
32 ), 30 ),
33 31
diff --git a/init/Kconfig b/init/Kconfig
index c9728992a776..8dfd094e6875 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -393,7 +393,6 @@ config PREEMPT_RCU
393 393
394config RCU_TRACE 394config RCU_TRACE
395 bool "Enable tracing for RCU" 395 bool "Enable tracing for RCU"
396 depends on TREE_RCU || TREE_PREEMPT_RCU
397 help 396 help
398 This option provides tracing in RCU which presents stats 397 This option provides tracing in RCU which presents stats
399 in debugfs for debugging RCU implementation. 398 in debugfs for debugging RCU implementation.
@@ -459,6 +458,60 @@ config TREE_RCU_TRACE
459 TREE_PREEMPT_RCU implementations, permitting Makefile to 458 TREE_PREEMPT_RCU implementations, permitting Makefile to
460 trivially select kernel/rcutree_trace.c. 459 trivially select kernel/rcutree_trace.c.
461 460
461config RCU_BOOST
462 bool "Enable RCU priority boosting"
463 depends on RT_MUTEXES && TINY_PREEMPT_RCU
464 default n
465 help
466 This option boosts the priority of preempted RCU readers that
467 block the current preemptible RCU grace period for too long.
468 This option also prevents heavy loads from blocking RCU
469 callback invocation for all flavors of RCU.
470
471 Say Y here if you are working with real-time apps or heavy loads.
472 Say N here if you are unsure.
473
474config RCU_BOOST_PRIO
475 int "Real-time priority to boost RCU readers to"
476 range 1 99
477 depends on RCU_BOOST
478 default 1
479 help
480 This option specifies the real-time priority to which preempted
481 RCU readers are to be boosted. If you are working with CPU-bound
482 real-time applications, you should specify a priority higher than
483 the highest-priority CPU-bound application.
484
485 Specify the real-time priority, or take the default if unsure.
486
487config RCU_BOOST_DELAY
488 int "Milliseconds to delay boosting after RCU grace-period start"
489 range 0 3000
490 depends on RCU_BOOST
491 default 500
492 help
493 This option specifies the time to wait after the beginning of
494 a given grace period before priority-boosting preempted RCU
495 readers blocking that grace period. Note that any RCU reader
496 blocking an expedited RCU grace period is boosted immediately.
497
498 Accept the default if unsure.
499
500config SRCU_SYNCHRONIZE_DELAY
501 int "Microseconds to delay before waiting for readers"
502 range 0 20
503 default 10
504 help
505 This option controls how long SRCU delays before entering its
506 loop waiting on SRCU readers. The purpose of this loop is
507 to avoid the unconditional context-switch penalty that would
508 otherwise be incurred if there were an active SRCU reader,
509 in a manner similar to adaptive locking schemes. This should
510 be set to be a bit longer than the common-case SRCU read-side
511 critical-section overhead.
512
513 Accept the default if unsure.
514
462endmenu # "RCU Subsystem" 515endmenu # "RCU Subsystem"
463 516
464config IKCONFIG 517config IKCONFIG
@@ -741,6 +794,19 @@ config NET_NS
741 794
742endif # NAMESPACES 795endif # NAMESPACES
743 796
797config SCHED_AUTOGROUP
798 bool "Automatic process group scheduling"
799 select EVENTFD
800 select CGROUPS
801 select CGROUP_SCHED
802 select FAIR_GROUP_SCHED
803 help
804 This option optimizes the scheduler for common desktop workloads by
805 automatically creating and populating task groups. This separation
806 of workloads isolates aggressive CPU burners (like build jobs) from
807 desktop applications. Task group autogeneration is currently based
808 upon the task's session.
809
744config MM_OWNER 810config MM_OWNER
745 bool 811 bool
746 812
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b5ff083fa22..e0f2831634b4 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
100obj-$(CONFIG_TRACING) += trace/ 100obj-$(CONFIG_TRACING) += trace/
101obj-$(CONFIG_X86_DS) += trace/ 101obj-$(CONFIG_X86_DS) += trace/
102obj-$(CONFIG_RING_BUFFER) += trace/ 102obj-$(CONFIG_RING_BUFFER) += trace/
103obj-$(CONFIG_TRACEPOINTS) += trace/
103obj-$(CONFIG_SMP) += sched_cpupri.o 104obj-$(CONFIG_SMP) += sched_cpupri.o
104obj-$(CONFIG_IRQ_WORK) += irq_work.o 105obj-$(CONFIG_IRQ_WORK) += irq_work.o
105obj-$(CONFIG_PERF_EVENTS) += perf_event.o 106obj-$(CONFIG_PERF_EVENTS) += perf_event.o
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f6e726f18491..156cc5556140 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu)
189} 189}
190 190
191struct take_cpu_down_param { 191struct take_cpu_down_param {
192 struct task_struct *caller;
193 unsigned long mod; 192 unsigned long mod;
194 void *hcpu; 193 void *hcpu;
195}; 194};
@@ -198,7 +197,6 @@ struct take_cpu_down_param {
198static int __ref take_cpu_down(void *_param) 197static int __ref take_cpu_down(void *_param)
199{ 198{
200 struct take_cpu_down_param *param = _param; 199 struct take_cpu_down_param *param = _param;
201 unsigned int cpu = (unsigned long)param->hcpu;
202 int err; 200 int err;
203 201
204 /* Ensure this CPU doesn't handle any more interrupts. */ 202 /* Ensure this CPU doesn't handle any more interrupts. */
@@ -208,11 +206,6 @@ static int __ref take_cpu_down(void *_param)
208 206
209 cpu_notify(CPU_DYING | param->mod, param->hcpu); 207 cpu_notify(CPU_DYING | param->mod, param->hcpu);
210 208
211 if (task_cpu(param->caller) == cpu)
212 move_task_off_dead_cpu(cpu, param->caller);
213 /* Force idle task to run as soon as we yield: it should
214 immediately notice cpu is offline and die quickly. */
215 sched_idle_next();
216 return 0; 209 return 0;
217} 210}
218 211
@@ -223,7 +216,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
223 void *hcpu = (void *)(long)cpu; 216 void *hcpu = (void *)(long)cpu;
224 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 217 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
225 struct take_cpu_down_param tcd_param = { 218 struct take_cpu_down_param tcd_param = {
226 .caller = current,
227 .mod = mod, 219 .mod = mod,
228 .hcpu = hcpu, 220 .hcpu = hcpu,
229 }; 221 };
@@ -253,9 +245,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
253 } 245 }
254 BUG_ON(cpu_online(cpu)); 246 BUG_ON(cpu_online(cpu));
255 247
256 /* Wait for it to sleep (leaving idle task). */ 248 /*
249 * The migration_call() CPU_DYING callback will have removed all
250 * runnable tasks from the cpu, there's only the idle task left now
251 * that the migration thread is done doing the stop_machine thing.
252 *
253 * Wait for the stop thread to go away.
254 */
257 while (!idle_cpu(cpu)) 255 while (!idle_cpu(cpu))
258 yield(); 256 cpu_relax();
259 257
260 /* This actually kills the CPU. */ 258 /* This actually kills the CPU. */
261 __cpu_die(cpu); 259 __cpu_die(cpu);
@@ -386,6 +384,14 @@ out:
386#ifdef CONFIG_PM_SLEEP_SMP 384#ifdef CONFIG_PM_SLEEP_SMP
387static cpumask_var_t frozen_cpus; 385static cpumask_var_t frozen_cpus;
388 386
387void __weak arch_disable_nonboot_cpus_begin(void)
388{
389}
390
391void __weak arch_disable_nonboot_cpus_end(void)
392{
393}
394
389int disable_nonboot_cpus(void) 395int disable_nonboot_cpus(void)
390{ 396{
391 int cpu, first_cpu, error = 0; 397 int cpu, first_cpu, error = 0;
@@ -397,6 +403,7 @@ int disable_nonboot_cpus(void)
397 * with the userspace trying to use the CPU hotplug at the same time 403 * with the userspace trying to use the CPU hotplug at the same time
398 */ 404 */
399 cpumask_clear(frozen_cpus); 405 cpumask_clear(frozen_cpus);
406 arch_disable_nonboot_cpus_begin();
400 407
401 printk("Disabling non-boot CPUs ...\n"); 408 printk("Disabling non-boot CPUs ...\n");
402 for_each_online_cpu(cpu) { 409 for_each_online_cpu(cpu) {
@@ -412,6 +419,8 @@ int disable_nonboot_cpus(void)
412 } 419 }
413 } 420 }
414 421
422 arch_disable_nonboot_cpus_end();
423
415 if (!error) { 424 if (!error) {
416 BUG_ON(num_online_cpus() > 1); 425 BUG_ON(num_online_cpus() > 1);
417 /* Make sure the CPUs won't be enabled by someone else */ 426 /* Make sure the CPUs won't be enabled by someone else */
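The two __weak stubs added above give an architecture a hook around suspend-time CPU offlining without any Kconfig plumbing: defining the strong symbol is enough. A hypothetical override (the body is a placeholder):

	void arch_disable_nonboot_cpus_begin(void)
	{
		/* e.g. pin the boot CPU's frequency before siblings go down */
	}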
diff --git a/kernel/fork.c b/kernel/fork.c
index 5447dc7defa9..7d164e25b0f0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -174,8 +174,10 @@ static inline void free_signal_struct(struct signal_struct *sig)
174 174
175static inline void put_signal_struct(struct signal_struct *sig) 175static inline void put_signal_struct(struct signal_struct *sig)
176{ 176{
177 if (atomic_dec_and_test(&sig->sigcnt)) 177 if (atomic_dec_and_test(&sig->sigcnt)) {
178 sched_autogroup_exit(sig);
178 free_signal_struct(sig); 179 free_signal_struct(sig);
180 }
179} 181}
180 182
181void __put_task_struct(struct task_struct *tsk) 183void __put_task_struct(struct task_struct *tsk)
@@ -905,6 +907,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
905 posix_cpu_timers_init_group(sig); 907 posix_cpu_timers_init_group(sig);
906 908
907 tty_audit_fork(sig); 909 tty_audit_fork(sig);
910 sched_autogroup_fork(sig);
908 911
909 sig->oom_adj = current->signal->oom_adj; 912 sig->oom_adj = current->signal->oom_adj;
910 sig->oom_score_adj = current->signal->oom_score_adj; 913 sig->oom_score_adj = current->signal->oom_score_adj;
@@ -1315,7 +1318,7 @@ bad_fork_cleanup_mm:
1315 } 1318 }
1316bad_fork_cleanup_signal: 1319bad_fork_cleanup_signal:
1317 if (!(clone_flags & CLONE_THREAD)) 1320 if (!(clone_flags & CLONE_THREAD))
1318 free_signal_struct(p->signal); 1321 put_signal_struct(p->signal);
1319bad_fork_cleanup_sighand: 1322bad_fork_cleanup_sighand:
1320 __cleanup_sighand(p->sighand); 1323 __cleanup_sighand(p->sighand);
1321bad_fork_cleanup_fs: 1324bad_fork_cleanup_fs:
diff --git a/kernel/futex.c b/kernel/futex.c
index 40a8777a27d0..3019b92e6917 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -69,6 +69,14 @@ int __read_mostly futex_cmpxchg_enabled;
69#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 69#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
70 70
71/* 71/*
72 * Futex flags used to encode options to functions and preserve them across
73 * restarts.
74 */
75#define FLAGS_SHARED 0x01
76#define FLAGS_CLOCKRT 0x02
77#define FLAGS_HAS_TIMEOUT 0x04
78
79/*
72 * Priority Inheritance state: 80 * Priority Inheritance state:
73 */ 81 */
74struct futex_pi_state { 82struct futex_pi_state {
@@ -123,6 +131,12 @@ struct futex_q {
123 u32 bitset; 131 u32 bitset;
124}; 132};
125 133
134static const struct futex_q futex_q_init = {
135 /* list gets initialized in queue_me() */
136 .key = FUTEX_KEY_INIT,
137 .bitset = FUTEX_BITSET_MATCH_ANY
138};
139
126/* 140/*
127 * Hash buckets are shared by all the futex_keys that hash to the same 141 * Hash buckets are shared by all the futex_keys that hash to the same
128 * location. Each key may have multiple futex_q structures, one for each task 142 * location. Each key may have multiple futex_q structures, one for each task
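With the FLAGS_* values hoisted to the top of the file, the shared and clock-realtime properties travel down the call chain as a single word instead of separate fshared/clockrt parameters. A sketch of how a dispatcher would build that word from the futex op (illustrative; the exact construction belongs in do_futex()):

	unsigned int flags = 0;

	if (!(op & FUTEX_PRIVATE_FLAG))
		flags |= FLAGS_SHARED;
	if (op & FUTEX_CLOCK_REALTIME)
		flags |= FLAGS_CLOCKRT;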
@@ -283,8 +297,7 @@ again:
283 return 0; 297 return 0;
284} 298}
285 299
286static inline 300static inline void put_futex_key(union futex_key *key)
287void put_futex_key(int fshared, union futex_key *key)
288{ 301{
289 drop_futex_key_refs(key); 302 drop_futex_key_refs(key);
290} 303}
@@ -870,7 +883,8 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
870/* 883/*
871 * Wake up waiters matching bitset queued on this futex (uaddr). 884 * Wake up waiters matching bitset queued on this futex (uaddr).
872 */ 885 */
873static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) 886static int
887futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
874{ 888{
875 struct futex_hash_bucket *hb; 889 struct futex_hash_bucket *hb;
876 struct futex_q *this, *next; 890 struct futex_q *this, *next;
@@ -881,7 +895,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
881 if (!bitset) 895 if (!bitset)
882 return -EINVAL; 896 return -EINVAL;
883 897
884 ret = get_futex_key(uaddr, fshared, &key); 898 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
885 if (unlikely(ret != 0)) 899 if (unlikely(ret != 0))
886 goto out; 900 goto out;
887 901
@@ -907,7 +921,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
907 } 921 }
908 922
909 spin_unlock(&hb->lock); 923 spin_unlock(&hb->lock);
910 put_futex_key(fshared, &key); 924 put_futex_key(&key);
911out: 925out:
912 return ret; 926 return ret;
913} 927}
@@ -917,7 +931,7 @@ out:
917 * to this virtual address: 931 * to this virtual address:
918 */ 932 */
919static int 933static int
920futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, 934futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
921 int nr_wake, int nr_wake2, int op) 935 int nr_wake, int nr_wake2, int op)
922{ 936{
923 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 937 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
@@ -927,10 +941,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
927 int ret, op_ret; 941 int ret, op_ret;
928 942
929retry: 943retry:
930 ret = get_futex_key(uaddr1, fshared, &key1); 944 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
931 if (unlikely(ret != 0)) 945 if (unlikely(ret != 0))
932 goto out; 946 goto out;
933 ret = get_futex_key(uaddr2, fshared, &key2); 947 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
934 if (unlikely(ret != 0)) 948 if (unlikely(ret != 0))
935 goto out_put_key1; 949 goto out_put_key1;
936 950
@@ -962,11 +976,11 @@ retry_private:
962 if (ret) 976 if (ret)
963 goto out_put_keys; 977 goto out_put_keys;
964 978
965 if (!fshared) 979 if (!(flags & FLAGS_SHARED))
966 goto retry_private; 980 goto retry_private;
967 981
968 put_futex_key(fshared, &key2); 982 put_futex_key(&key2);
969 put_futex_key(fshared, &key1); 983 put_futex_key(&key1);
970 goto retry; 984 goto retry;
971 } 985 }
972 986
@@ -996,9 +1010,9 @@ retry_private:
996 1010
997 double_unlock_hb(hb1, hb2); 1011 double_unlock_hb(hb1, hb2);
998out_put_keys: 1012out_put_keys:
999 put_futex_key(fshared, &key2); 1013 put_futex_key(&key2);
1000out_put_key1: 1014out_put_key1:
1001 put_futex_key(fshared, &key1); 1015 put_futex_key(&key1);
1002out: 1016out:
1003 return ret; 1017 return ret;
1004} 1018}
@@ -1133,13 +1147,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1133/** 1147/**
1134 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 1148 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1135 * @uaddr1: source futex user address 1149 * @uaddr1: source futex user address
1136 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 1150 * @flags: futex flags (FLAGS_SHARED, etc.)
1137 * @uaddr2: target futex user address 1151 * @uaddr2: target futex user address
1138 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) 1152 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
1139 * @nr_requeue: number of waiters to requeue (0-INT_MAX) 1153 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1140 * @cmpval: @uaddr1 expected value (or %NULL) 1154 * @cmpval: @uaddr1 expected value (or %NULL)
1141 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a 1155 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1142 * pi futex (pi to pi requeue is not supported) 1156 * pi futex (pi to pi requeue is not supported)
1143 * 1157 *
1144 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire 1158 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1145 * uaddr2 atomically on behalf of the top waiter. 1159 * uaddr2 atomically on behalf of the top waiter.
@@ -1148,9 +1162,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1148 * >=0 - on success, the number of tasks requeued or woken 1162 * >=0 - on success, the number of tasks requeued or woken
1149 * <0 - on error 1163 * <0 - on error
1150 */ 1164 */
1151static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, 1165static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1152 int nr_wake, int nr_requeue, u32 *cmpval, 1166 u32 __user *uaddr2, int nr_wake, int nr_requeue,
1153 int requeue_pi) 1167 u32 *cmpval, int requeue_pi)
1154{ 1168{
1155 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1169 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1156 int drop_count = 0, task_count = 0, ret; 1170 int drop_count = 0, task_count = 0, ret;
@@ -1191,10 +1205,10 @@ retry:
1191 pi_state = NULL; 1205 pi_state = NULL;
1192 } 1206 }
1193 1207
1194 ret = get_futex_key(uaddr1, fshared, &key1); 1208 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
1195 if (unlikely(ret != 0)) 1209 if (unlikely(ret != 0))
1196 goto out; 1210 goto out;
1197 ret = get_futex_key(uaddr2, fshared, &key2); 1211 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
1198 if (unlikely(ret != 0)) 1212 if (unlikely(ret != 0))
1199 goto out_put_key1; 1213 goto out_put_key1;
1200 1214
@@ -1216,11 +1230,11 @@ retry_private:
1216 if (ret) 1230 if (ret)
1217 goto out_put_keys; 1231 goto out_put_keys;
1218 1232
1219 if (!fshared) 1233 if (!(flags & FLAGS_SHARED))
1220 goto retry_private; 1234 goto retry_private;
1221 1235
1222 put_futex_key(fshared, &key2); 1236 put_futex_key(&key2);
1223 put_futex_key(fshared, &key1); 1237 put_futex_key(&key1);
1224 goto retry; 1238 goto retry;
1225 } 1239 }
1226 if (curval != *cmpval) { 1240 if (curval != *cmpval) {
@@ -1260,8 +1274,8 @@ retry_private:
1260 break; 1274 break;
1261 case -EFAULT: 1275 case -EFAULT:
1262 double_unlock_hb(hb1, hb2); 1276 double_unlock_hb(hb1, hb2);
1263 put_futex_key(fshared, &key2); 1277 put_futex_key(&key2);
1264 put_futex_key(fshared, &key1); 1278 put_futex_key(&key1);
1265 ret = fault_in_user_writeable(uaddr2); 1279 ret = fault_in_user_writeable(uaddr2);
1266 if (!ret) 1280 if (!ret)
1267 goto retry; 1281 goto retry;
@@ -1269,8 +1283,8 @@ retry_private:
1269 case -EAGAIN: 1283 case -EAGAIN:
1270 /* The owner was exiting, try again. */ 1284 /* The owner was exiting, try again. */
1271 double_unlock_hb(hb1, hb2); 1285 double_unlock_hb(hb1, hb2);
1272 put_futex_key(fshared, &key2); 1286 put_futex_key(&key2);
1273 put_futex_key(fshared, &key1); 1287 put_futex_key(&key1);
1274 cond_resched(); 1288 cond_resched();
1275 goto retry; 1289 goto retry;
1276 default: 1290 default:
@@ -1352,9 +1366,9 @@ out_unlock:
1352 drop_futex_key_refs(&key1); 1366 drop_futex_key_refs(&key1);
1353 1367
1354out_put_keys: 1368out_put_keys:
1355 put_futex_key(fshared, &key2); 1369 put_futex_key(&key2);
1356out_put_key1: 1370out_put_key1:
1357 put_futex_key(fshared, &key1); 1371 put_futex_key(&key1);
1358out: 1372out:
1359 if (pi_state != NULL) 1373 if (pi_state != NULL)
1360 free_pi_state(pi_state); 1374 free_pi_state(pi_state);
@@ -1494,7 +1508,7 @@ static void unqueue_me_pi(struct futex_q *q)
1494 * private futexes. 1508 * private futexes.
1495 */ 1509 */
1496static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1510static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1497 struct task_struct *newowner, int fshared) 1511 struct task_struct *newowner)
1498{ 1512{
1499 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1513 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1500 struct futex_pi_state *pi_state = q->pi_state; 1514 struct futex_pi_state *pi_state = q->pi_state;
@@ -1587,20 +1601,11 @@ handle_fault:
1587 goto retry; 1601 goto retry;
1588} 1602}
1589 1603
1590/*
1591 * In case we must use restart_block to restart a futex_wait,
1592 * we encode in the 'flags' shared capability
1593 */
1594#define FLAGS_SHARED 0x01
1595#define FLAGS_CLOCKRT 0x02
1596#define FLAGS_HAS_TIMEOUT 0x04
1597
1598static long futex_wait_restart(struct restart_block *restart); 1604static long futex_wait_restart(struct restart_block *restart);
1599 1605
1600/** 1606/**
1601 * fixup_owner() - Post lock pi_state and corner case management 1607 * fixup_owner() - Post lock pi_state and corner case management
1602 * @uaddr: user address of the futex 1608 * @uaddr: user address of the futex
1603 * @fshared: whether the futex is shared (1) or not (0)
1604 * @q: futex_q (contains pi_state and access to the rt_mutex) 1609 * @q: futex_q (contains pi_state and access to the rt_mutex)
1605 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) 1610 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
1606 * 1611 *
@@ -1613,8 +1618,7 @@ static long futex_wait_restart(struct restart_block *restart);
1613 * 0 - success, lock not taken 1618 * 0 - success, lock not taken
1614 * <0 - on error (-EFAULT) 1619 * <0 - on error (-EFAULT)
1615 */ 1620 */
1616static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, 1621static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
1617 int locked)
1618{ 1622{
1619 struct task_struct *owner; 1623 struct task_struct *owner;
1620 int ret = 0; 1624 int ret = 0;
@@ -1625,7 +1629,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
1625 * did a lock-steal - fix up the PI-state in that case: 1629 * did a lock-steal - fix up the PI-state in that case:
1626 */ 1630 */
1627 if (q->pi_state->owner != current) 1631 if (q->pi_state->owner != current)
1628 ret = fixup_pi_state_owner(uaddr, q, current, fshared); 1632 ret = fixup_pi_state_owner(uaddr, q, current);
1629 goto out; 1633 goto out;
1630 } 1634 }
1631 1635
@@ -1652,7 +1656,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
1652 * lock. Fix the state up. 1656 * lock. Fix the state up.
1653 */ 1657 */
1654 owner = rt_mutex_owner(&q->pi_state->pi_mutex); 1658 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1655 ret = fixup_pi_state_owner(uaddr, q, owner, fshared); 1659 ret = fixup_pi_state_owner(uaddr, q, owner);
1656 goto out; 1660 goto out;
1657 } 1661 }
1658 1662
@@ -1715,7 +1719,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1715 * futex_wait_setup() - Prepare to wait on a futex 1719 * futex_wait_setup() - Prepare to wait on a futex
1716 * @uaddr: the futex userspace address 1720 * @uaddr: the futex userspace address
1717 * @val: the expected value 1721 * @val: the expected value
1718 * @fshared: whether the futex is shared (1) or not (0) 1722 * @flags: futex flags (FLAGS_SHARED, etc.)
1719 * @q: the associated futex_q 1723 * @q: the associated futex_q
1720 * @hb: storage for hash_bucket pointer to be returned to caller 1724 * @hb: storage for hash_bucket pointer to be returned to caller
1721 * 1725 *
@@ -1728,7 +1732,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1728 * 0 - uaddr contains val and hb has been locked 1732 * 0 - uaddr contains val and hb has been locked
1729 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked 1733 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1730 */ 1734 */
1731static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, 1735static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1732 struct futex_q *q, struct futex_hash_bucket **hb) 1736 struct futex_q *q, struct futex_hash_bucket **hb)
1733{ 1737{
1734 u32 uval; 1738 u32 uval;
@@ -1752,8 +1756,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
1752 * rare, but normal. 1756 * rare, but normal.
1753 */ 1757 */
1754retry: 1758retry:
1755 q->key = FUTEX_KEY_INIT; 1759 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
1756 ret = get_futex_key(uaddr, fshared, &q->key);
1757 if (unlikely(ret != 0)) 1760 if (unlikely(ret != 0))
1758 return ret; 1761 return ret;
1759 1762
@@ -1769,10 +1772,10 @@ retry_private:
1769 if (ret) 1772 if (ret)
1770 goto out; 1773 goto out;
1771 1774
1772 if (!fshared) 1775 if (!(flags & FLAGS_SHARED))
1773 goto retry_private; 1776 goto retry_private;
1774 1777
1775 put_futex_key(fshared, &q->key); 1778 put_futex_key(&q->key);
1776 goto retry; 1779 goto retry;
1777 } 1780 }
1778 1781
@@ -1783,32 +1786,29 @@ retry_private:
1783 1786
1784out: 1787out:
1785 if (ret) 1788 if (ret)
1786 put_futex_key(fshared, &q->key); 1789 put_futex_key(&q->key);
1787 return ret; 1790 return ret;
1788} 1791}
1789 1792
1790static int futex_wait(u32 __user *uaddr, int fshared, 1793static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
1791 u32 val, ktime_t *abs_time, u32 bitset, int clockrt) 1794 ktime_t *abs_time, u32 bitset)
1792{ 1795{
1793 struct hrtimer_sleeper timeout, *to = NULL; 1796 struct hrtimer_sleeper timeout, *to = NULL;
1794 struct restart_block *restart; 1797 struct restart_block *restart;
1795 struct futex_hash_bucket *hb; 1798 struct futex_hash_bucket *hb;
1796 struct futex_q q; 1799 struct futex_q q = futex_q_init;
1797 int ret; 1800 int ret;
1798 1801
1799 if (!bitset) 1802 if (!bitset)
1800 return -EINVAL; 1803 return -EINVAL;
1801
1802 q.pi_state = NULL;
1803 q.bitset = bitset; 1804 q.bitset = bitset;
1804 q.rt_waiter = NULL;
1805 q.requeue_pi_key = NULL;
1806 1805
1807 if (abs_time) { 1806 if (abs_time) {
1808 to = &timeout; 1807 to = &timeout;
1809 1808
1810 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : 1809 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
1811 CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 1810 CLOCK_REALTIME : CLOCK_MONOTONIC,
1811 HRTIMER_MODE_ABS);
1812 hrtimer_init_sleeper(to, current); 1812 hrtimer_init_sleeper(to, current);
1813 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 1813 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
1814 current->timer_slack_ns); 1814 current->timer_slack_ns);
@@ -1819,7 +1819,7 @@ retry:
1819 * Prepare to wait on uaddr. On success, holds hb lock and increments 1819 * Prepare to wait on uaddr. On success, holds hb lock and increments
1820 * q.key refs. 1820 * q.key refs.
1821 */ 1821 */
1822 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 1822 ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
1823 if (ret) 1823 if (ret)
1824 goto out; 1824 goto out;
1825 1825
@@ -1852,12 +1852,7 @@ retry:
1852 restart->futex.val = val; 1852 restart->futex.val = val;
1853 restart->futex.time = abs_time->tv64; 1853 restart->futex.time = abs_time->tv64;
1854 restart->futex.bitset = bitset; 1854 restart->futex.bitset = bitset;
1855 restart->futex.flags = FLAGS_HAS_TIMEOUT; 1855 restart->futex.flags = flags;
1856
1857 if (fshared)
1858 restart->futex.flags |= FLAGS_SHARED;
1859 if (clockrt)
1860 restart->futex.flags |= FLAGS_CLOCKRT;
1861 1856
1862 ret = -ERESTART_RESTARTBLOCK; 1857 ret = -ERESTART_RESTARTBLOCK;
1863 1858
@@ -1873,7 +1868,6 @@ out:
1873static long futex_wait_restart(struct restart_block *restart) 1868static long futex_wait_restart(struct restart_block *restart)
1874{ 1869{
1875 u32 __user *uaddr = restart->futex.uaddr; 1870 u32 __user *uaddr = restart->futex.uaddr;
1876 int fshared = 0;
1877 ktime_t t, *tp = NULL; 1871 ktime_t t, *tp = NULL;
1878 1872
1879 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { 1873 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
@@ -1881,11 +1875,9 @@ static long futex_wait_restart(struct restart_block *restart)
1881 tp = &t; 1875 tp = &t;
1882 } 1876 }
1883 restart->fn = do_no_restart_syscall; 1877 restart->fn = do_no_restart_syscall;
1884 if (restart->futex.flags & FLAGS_SHARED) 1878
1885 fshared = 1; 1879 return (long)futex_wait(uaddr, restart->futex.flags,
1886 return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, 1880 restart->futex.val, tp, restart->futex.bitset);
1887 restart->futex.bitset,
1888 restart->futex.flags & FLAGS_CLOCKRT);
1889} 1881}
1890 1882
1891 1883
@@ -1895,12 +1887,12 @@ static long futex_wait_restart(struct restart_block *restart)
1895 * if there are waiters then it will block, it does PI, etc. (Due to 1887 * if there are waiters then it will block, it does PI, etc. (Due to
1896 * races the kernel might see a 0 value of the futex too.) 1888 * races the kernel might see a 0 value of the futex too.)
1897 */ 1889 */
1898static int futex_lock_pi(u32 __user *uaddr, int fshared, 1890static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
1899 int detect, ktime_t *time, int trylock) 1891 ktime_t *time, int trylock)
1900{ 1892{
1901 struct hrtimer_sleeper timeout, *to = NULL; 1893 struct hrtimer_sleeper timeout, *to = NULL;
1902 struct futex_hash_bucket *hb; 1894 struct futex_hash_bucket *hb;
1903 struct futex_q q; 1895 struct futex_q q = futex_q_init;
1904 int res, ret; 1896 int res, ret;
1905 1897
1906 if (refill_pi_state_cache()) 1898 if (refill_pi_state_cache())
@@ -1914,12 +1906,8 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1914 hrtimer_set_expires(&to->timer, *time); 1906 hrtimer_set_expires(&to->timer, *time);
1915 } 1907 }
1916 1908
1917 q.pi_state = NULL;
1918 q.rt_waiter = NULL;
1919 q.requeue_pi_key = NULL;
1920retry: 1909retry:
1921 q.key = FUTEX_KEY_INIT; 1910 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key);
1922 ret = get_futex_key(uaddr, fshared, &q.key);
1923 if (unlikely(ret != 0)) 1911 if (unlikely(ret != 0))
1924 goto out; 1912 goto out;
1925 1913
@@ -1941,7 +1929,7 @@ retry_private:
1941 * exit to complete. 1929 * exit to complete.
1942 */ 1930 */
1943 queue_unlock(&q, hb); 1931 queue_unlock(&q, hb);
1944 put_futex_key(fshared, &q.key); 1932 put_futex_key(&q.key);
1945 cond_resched(); 1933 cond_resched();
1946 goto retry; 1934 goto retry;
1947 default: 1935 default:
@@ -1971,7 +1959,7 @@ retry_private:
1971 * Fixup the pi_state owner and possibly acquire the lock if we 1959 * Fixup the pi_state owner and possibly acquire the lock if we
1972 * haven't already. 1960 * haven't already.
1973 */ 1961 */
1974 res = fixup_owner(uaddr, fshared, &q, !ret); 1962 res = fixup_owner(uaddr, &q, !ret);
1975 /* 1963 /*
1976 * If fixup_owner() returned an error, propagate that. If it acquired 1964 * If fixup_owner() returned an error, propagate that. If it acquired
1977 * the lock, clear our -ETIMEDOUT or -EINTR. 1965 * the lock, clear our -ETIMEDOUT or -EINTR.
@@ -1995,7 +1983,7 @@ out_unlock_put_key:
1995 queue_unlock(&q, hb); 1983 queue_unlock(&q, hb);
1996 1984
1997out_put_key: 1985out_put_key:
1998 put_futex_key(fshared, &q.key); 1986 put_futex_key(&q.key);
1999out: 1987out:
2000 if (to) 1988 if (to)
2001 destroy_hrtimer_on_stack(&to->timer); 1989 destroy_hrtimer_on_stack(&to->timer);
@@ -2008,10 +1996,10 @@ uaddr_faulted:
2008 if (ret) 1996 if (ret)
2009 goto out_put_key; 1997 goto out_put_key;
2010 1998
2011 if (!fshared) 1999 if (!(flags & FLAGS_SHARED))
2012 goto retry_private; 2000 goto retry_private;
2013 2001
2014 put_futex_key(fshared, &q.key); 2002 put_futex_key(&q.key);
2015 goto retry; 2003 goto retry;
2016} 2004}
2017 2005
@@ -2020,7 +2008,7 @@ uaddr_faulted:
2020 * This is the in-kernel slowpath: we look up the PI state (if any), 2008 * This is the in-kernel slowpath: we look up the PI state (if any),
2021 * and do the rt-mutex unlock. 2009 * and do the rt-mutex unlock.
2022 */ 2010 */
2023static int futex_unlock_pi(u32 __user *uaddr, int fshared) 2011static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2024{ 2012{
2025 struct futex_hash_bucket *hb; 2013 struct futex_hash_bucket *hb;
2026 struct futex_q *this, *next; 2014 struct futex_q *this, *next;
@@ -2038,7 +2026,7 @@ retry:
2038 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 2026 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
2039 return -EPERM; 2027 return -EPERM;
2040 2028
2041 ret = get_futex_key(uaddr, fshared, &key); 2029 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
2042 if (unlikely(ret != 0)) 2030 if (unlikely(ret != 0))
2043 goto out; 2031 goto out;
2044 2032
@@ -2093,14 +2081,14 @@ retry:
2093 2081
2094out_unlock: 2082out_unlock:
2095 spin_unlock(&hb->lock); 2083 spin_unlock(&hb->lock);
2096 put_futex_key(fshared, &key); 2084 put_futex_key(&key);
2097 2085
2098out: 2086out:
2099 return ret; 2087 return ret;
2100 2088
2101pi_faulted: 2089pi_faulted:
2102 spin_unlock(&hb->lock); 2090 spin_unlock(&hb->lock);
2103 put_futex_key(fshared, &key); 2091 put_futex_key(&key);
2104 2092
2105 ret = fault_in_user_writeable(uaddr); 2093 ret = fault_in_user_writeable(uaddr);
2106 if (!ret) 2094 if (!ret)
@@ -2160,7 +2148,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2160/** 2148/**
2161 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 2149 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2162 * @uaddr: the futex we initially wait on (non-pi) 2150 * @uaddr: the futex we initially wait on (non-pi)
2163 * @fshared: whether the futexes are shared (1) or not (0). They must be 2151 * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
2164 * the same type, no requeueing from private to shared, etc. 2152 * the same type, no requeueing from private to shared, etc.
2165 * @val: the expected value of uaddr 2153 * @val: the expected value of uaddr
2166 * @abs_time: absolute timeout 2154 * @abs_time: absolute timeout
@@ -2198,16 +2186,16 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2198 * 0 - On success 2186 * 0 - On success
2199 * <0 - On error 2187 * <0 - On error
2200 */ 2188 */
2201static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, 2189static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2202 u32 val, ktime_t *abs_time, u32 bitset, 2190 u32 val, ktime_t *abs_time, u32 bitset,
2203 int clockrt, u32 __user *uaddr2) 2191 u32 __user *uaddr2)
2204{ 2192{
2205 struct hrtimer_sleeper timeout, *to = NULL; 2193 struct hrtimer_sleeper timeout, *to = NULL;
2206 struct rt_mutex_waiter rt_waiter; 2194 struct rt_mutex_waiter rt_waiter;
2207 struct rt_mutex *pi_mutex = NULL; 2195 struct rt_mutex *pi_mutex = NULL;
2208 struct futex_hash_bucket *hb; 2196 struct futex_hash_bucket *hb;
2209 union futex_key key2; 2197 union futex_key key2 = FUTEX_KEY_INIT;
2210 struct futex_q q; 2198 struct futex_q q = futex_q_init;
2211 int res, ret; 2199 int res, ret;
2212 2200
2213 if (!bitset) 2201 if (!bitset)
@@ -2215,8 +2203,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2215 2203
2216 if (abs_time) { 2204 if (abs_time) {
2217 to = &timeout; 2205 to = &timeout;
2218 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : 2206 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2219 CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 2207 CLOCK_REALTIME : CLOCK_MONOTONIC,
2208 HRTIMER_MODE_ABS);
2220 hrtimer_init_sleeper(to, current); 2209 hrtimer_init_sleeper(to, current);
2221 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 2210 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2222 current->timer_slack_ns); 2211 current->timer_slack_ns);
@@ -2229,12 +2218,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2229 debug_rt_mutex_init_waiter(&rt_waiter); 2218 debug_rt_mutex_init_waiter(&rt_waiter);
2230 rt_waiter.task = NULL; 2219 rt_waiter.task = NULL;
2231 2220
2232 key2 = FUTEX_KEY_INIT; 2221 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
2233 ret = get_futex_key(uaddr2, fshared, &key2);
2234 if (unlikely(ret != 0)) 2222 if (unlikely(ret != 0))
2235 goto out; 2223 goto out;
2236 2224
2237 q.pi_state = NULL;
2238 q.bitset = bitset; 2225 q.bitset = bitset;
2239 q.rt_waiter = &rt_waiter; 2226 q.rt_waiter = &rt_waiter;
2240 q.requeue_pi_key = &key2; 2227 q.requeue_pi_key = &key2;
@@ -2243,7 +2230,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2243 * Prepare to wait on uaddr. On success, increments q.key (key1) ref 2230 * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2244 * count. 2231 * count.
2245 */ 2232 */
2246 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 2233 ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2247 if (ret) 2234 if (ret)
2248 goto out_key2; 2235 goto out_key2;
2249 2236
@@ -2273,8 +2260,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2273 */ 2260 */
2274 if (q.pi_state && (q.pi_state->owner != current)) { 2261 if (q.pi_state && (q.pi_state->owner != current)) {
2275 spin_lock(q.lock_ptr); 2262 spin_lock(q.lock_ptr);
2276 ret = fixup_pi_state_owner(uaddr2, &q, current, 2263 ret = fixup_pi_state_owner(uaddr2, &q, current);
2277 fshared);
2278 spin_unlock(q.lock_ptr); 2264 spin_unlock(q.lock_ptr);
2279 } 2265 }
2280 } else { 2266 } else {
@@ -2293,7 +2279,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2293 * Fixup the pi_state owner and possibly acquire the lock if we 2279 * Fixup the pi_state owner and possibly acquire the lock if we
2294 * haven't already. 2280 * haven't already.
2295 */ 2281 */
2296 res = fixup_owner(uaddr2, fshared, &q, !ret); 2282 res = fixup_owner(uaddr2, &q, !ret);
2297 /* 2283 /*
 2298	 * If fixup_owner() returned an error, propagate that. If it		 2284	 * If fixup_owner() returned an error, propagate that. If it
2299 * acquired the lock, clear -ETIMEDOUT or -EINTR. 2285 * acquired the lock, clear -ETIMEDOUT or -EINTR.
@@ -2324,9 +2310,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2324 } 2310 }
2325 2311
2326out_put_keys: 2312out_put_keys:
2327 put_futex_key(fshared, &q.key); 2313 put_futex_key(&q.key);
2328out_key2: 2314out_key2:
2329 put_futex_key(fshared, &key2); 2315 put_futex_key(&key2);
2330 2316
2331out: 2317out:
2332 if (to) { 2318 if (to) {
@@ -2551,58 +2537,57 @@ void exit_robust_list(struct task_struct *curr)
2551long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 2537long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2552 u32 __user *uaddr2, u32 val2, u32 val3) 2538 u32 __user *uaddr2, u32 val2, u32 val3)
2553{ 2539{
2554 int clockrt, ret = -ENOSYS; 2540 int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
2555 int cmd = op & FUTEX_CMD_MASK; 2541 unsigned int flags = 0;
2556 int fshared = 0;
2557 2542
2558 if (!(op & FUTEX_PRIVATE_FLAG)) 2543 if (!(op & FUTEX_PRIVATE_FLAG))
2559 fshared = 1; 2544 flags |= FLAGS_SHARED;
2560 2545
2561 clockrt = op & FUTEX_CLOCK_REALTIME; 2546 if (op & FUTEX_CLOCK_REALTIME) {
2562 if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) 2547 flags |= FLAGS_CLOCKRT;
2563 return -ENOSYS; 2548 if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
2549 return -ENOSYS;
2550 }
2564 2551
2565 switch (cmd) { 2552 switch (cmd) {
2566 case FUTEX_WAIT: 2553 case FUTEX_WAIT:
2567 val3 = FUTEX_BITSET_MATCH_ANY; 2554 val3 = FUTEX_BITSET_MATCH_ANY;
2568 case FUTEX_WAIT_BITSET: 2555 case FUTEX_WAIT_BITSET:
2569 ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt); 2556 ret = futex_wait(uaddr, flags, val, timeout, val3);
2570 break; 2557 break;
2571 case FUTEX_WAKE: 2558 case FUTEX_WAKE:
2572 val3 = FUTEX_BITSET_MATCH_ANY; 2559 val3 = FUTEX_BITSET_MATCH_ANY;
2573 case FUTEX_WAKE_BITSET: 2560 case FUTEX_WAKE_BITSET:
2574 ret = futex_wake(uaddr, fshared, val, val3); 2561 ret = futex_wake(uaddr, flags, val, val3);
2575 break; 2562 break;
2576 case FUTEX_REQUEUE: 2563 case FUTEX_REQUEUE:
2577 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); 2564 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2578 break; 2565 break;
2579 case FUTEX_CMP_REQUEUE: 2566 case FUTEX_CMP_REQUEUE:
2580 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 2567 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2581 0);
2582 break; 2568 break;
2583 case FUTEX_WAKE_OP: 2569 case FUTEX_WAKE_OP:
2584 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); 2570 ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2585 break; 2571 break;
2586 case FUTEX_LOCK_PI: 2572 case FUTEX_LOCK_PI:
2587 if (futex_cmpxchg_enabled) 2573 if (futex_cmpxchg_enabled)
2588 ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); 2574 ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
2589 break; 2575 break;
2590 case FUTEX_UNLOCK_PI: 2576 case FUTEX_UNLOCK_PI:
2591 if (futex_cmpxchg_enabled) 2577 if (futex_cmpxchg_enabled)
2592 ret = futex_unlock_pi(uaddr, fshared); 2578 ret = futex_unlock_pi(uaddr, flags);
2593 break; 2579 break;
2594 case FUTEX_TRYLOCK_PI: 2580 case FUTEX_TRYLOCK_PI:
2595 if (futex_cmpxchg_enabled) 2581 if (futex_cmpxchg_enabled)
2596 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); 2582 ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
2597 break; 2583 break;
2598 case FUTEX_WAIT_REQUEUE_PI: 2584 case FUTEX_WAIT_REQUEUE_PI:
2599 val3 = FUTEX_BITSET_MATCH_ANY; 2585 val3 = FUTEX_BITSET_MATCH_ANY;
2600 ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, 2586 ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2601 clockrt, uaddr2); 2587 uaddr2);
2602 break; 2588 break;
2603 case FUTEX_CMP_REQUEUE_PI: 2589 case FUTEX_CMP_REQUEUE_PI:
2604 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 2590 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2605 1);
2606 break; 2591 break;
2607 default: 2592 default:
2608 ret = -ENOSYS; 2593 ret = -ENOSYS;
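
The do_futex() hunk above is where the old (fshared, clockrt) pair collapses into a single flags word, decoded once at the top and then passed down unchanged through every signature touched by this patch. A standalone sketch of that decode step, assuming illustrative bit values rather than the kernel's real FUTEX_* encoding:

#include <stdio.h>

#define FLAGS_SHARED  0x01   /* futex may be shared across processes */
#define FLAGS_CLOCKRT 0x02   /* timeout measured on CLOCK_REALTIME */

/* Decode the op word into a flags word once; callers only see flags. */
static unsigned int futex_op_to_flags(int op, int private_bit, int clockrt_bit)
{
    unsigned int flags = 0;

    if (!(op & private_bit))
        flags |= FLAGS_SHARED;
    if (op & clockrt_bit)
        flags |= FLAGS_CLOCKRT;
    return flags;
}

int main(void)
{
    /* 0x80 and 0x100 are hypothetical bit positions, for illustration only */
    unsigned int flags = futex_op_to_flags(0x100, 0x80, 0x100);

    printf("shared=%d clockrt=%d\n",
           !!(flags & FLAGS_SHARED), !!(flags & FLAGS_CLOCKRT));
    return 0;
}

Callees then test flags & FLAGS_SHARED instead of threading an extra int argument through every function, which is exactly what the signature changes above buy.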
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 72206cf5c6cf..f2429fc3438c 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -516,10 +516,13 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
516 516
517 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 517 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
518 struct hrtimer *timer; 518 struct hrtimer *timer;
519 struct timerqueue_node *next;
519 520
520 if (!base->first) 521 next = timerqueue_getnext(&base->active);
522 if (!next)
521 continue; 523 continue;
522 timer = rb_entry(base->first, struct hrtimer, node); 524 timer = container_of(next, struct hrtimer, node);
525
523 expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 526 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
524 /* 527 /*
525 * clock_was_set() has changed base->offset so the 528 * clock_was_set() has changed base->offset so the
@@ -840,48 +843,17 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
840static int enqueue_hrtimer(struct hrtimer *timer, 843static int enqueue_hrtimer(struct hrtimer *timer,
841 struct hrtimer_clock_base *base) 844 struct hrtimer_clock_base *base)
842{ 845{
843 struct rb_node **link = &base->active.rb_node;
844 struct rb_node *parent = NULL;
845 struct hrtimer *entry;
846 int leftmost = 1;
847
848 debug_activate(timer); 846 debug_activate(timer);
849 847
850 /* 848 timerqueue_add(&base->active, &timer->node);
851 * Find the right place in the rbtree:
852 */
853 while (*link) {
854 parent = *link;
855 entry = rb_entry(parent, struct hrtimer, node);
856 /*
857 * We dont care about collisions. Nodes with
858 * the same expiry time stay together.
859 */
860 if (hrtimer_get_expires_tv64(timer) <
861 hrtimer_get_expires_tv64(entry)) {
862 link = &(*link)->rb_left;
863 } else {
864 link = &(*link)->rb_right;
865 leftmost = 0;
866 }
867 }
868
869 /*
870 * Insert the timer to the rbtree and check whether it
871 * replaces the first pending timer
872 */
873 if (leftmost)
874 base->first = &timer->node;
875 849
876 rb_link_node(&timer->node, parent, link);
877 rb_insert_color(&timer->node, &base->active);
878 /* 850 /*
879 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the 851 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
880 * state of a possibly running callback. 852 * state of a possibly running callback.
881 */ 853 */
882 timer->state |= HRTIMER_STATE_ENQUEUED; 854 timer->state |= HRTIMER_STATE_ENQUEUED;
883 855
884 return leftmost; 856 return (&timer->node == base->active.next);
885} 857}
886 858
887/* 859/*
@@ -901,12 +873,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
901 if (!(timer->state & HRTIMER_STATE_ENQUEUED)) 873 if (!(timer->state & HRTIMER_STATE_ENQUEUED))
902 goto out; 874 goto out;
903 875
904 /* 876 if (&timer->node == timerqueue_getnext(&base->active)) {
905 * Remove the timer from the rbtree and replace the first
906 * entry pointer if necessary.
907 */
908 if (base->first == &timer->node) {
909 base->first = rb_next(&timer->node);
910#ifdef CONFIG_HIGH_RES_TIMERS 877#ifdef CONFIG_HIGH_RES_TIMERS
 911	 /* Reprogram the clock event device, if enabled */		 878	 /* Reprogram the clock event device, if enabled */
912 if (reprogram && hrtimer_hres_active()) { 879 if (reprogram && hrtimer_hres_active()) {
@@ -919,7 +886,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
919 } 886 }
920#endif 887#endif
921 } 888 }
922 rb_erase(&timer->node, &base->active); 889 timerqueue_del(&base->active, &timer->node);
923out: 890out:
924 timer->state = newstate; 891 timer->state = newstate;
925} 892}
@@ -1128,11 +1095,13 @@ ktime_t hrtimer_get_next_event(void)
1128 if (!hrtimer_hres_active()) { 1095 if (!hrtimer_hres_active()) {
1129 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 1096 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
1130 struct hrtimer *timer; 1097 struct hrtimer *timer;
1098 struct timerqueue_node *next;
1131 1099
1132 if (!base->first) 1100 next = timerqueue_getnext(&base->active);
1101 if (!next)
1133 continue; 1102 continue;
1134 1103
1135 timer = rb_entry(base->first, struct hrtimer, node); 1104 timer = container_of(next, struct hrtimer, node);
1136 delta.tv64 = hrtimer_get_expires_tv64(timer); 1105 delta.tv64 = hrtimer_get_expires_tv64(timer);
1137 delta = ktime_sub(delta, base->get_time()); 1106 delta = ktime_sub(delta, base->get_time());
1138 if (delta.tv64 < mindelta.tv64) 1107 if (delta.tv64 < mindelta.tv64)
@@ -1162,6 +1131,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1162 1131
1163 timer->base = &cpu_base->clock_base[clock_id]; 1132 timer->base = &cpu_base->clock_base[clock_id];
1164 hrtimer_init_timer_hres(timer); 1133 hrtimer_init_timer_hres(timer);
1134 timerqueue_init(&timer->node);
1165 1135
1166#ifdef CONFIG_TIMER_STATS 1136#ifdef CONFIG_TIMER_STATS
1167 timer->start_site = NULL; 1137 timer->start_site = NULL;
@@ -1278,14 +1248,14 @@ retry:
1278 1248
1279 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1249 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1280 ktime_t basenow; 1250 ktime_t basenow;
1281 struct rb_node *node; 1251 struct timerqueue_node *node;
1282 1252
1283 basenow = ktime_add(now, base->offset); 1253 basenow = ktime_add(now, base->offset);
1284 1254
1285 while ((node = base->first)) { 1255 while ((node = timerqueue_getnext(&base->active))) {
1286 struct hrtimer *timer; 1256 struct hrtimer *timer;
1287 1257
1288 timer = rb_entry(node, struct hrtimer, node); 1258 timer = container_of(node, struct hrtimer, node);
1289 1259
1290 /* 1260 /*
1291 * The immediate goal for using the softexpires is 1261 * The immediate goal for using the softexpires is
@@ -1441,7 +1411,7 @@ void hrtimer_run_pending(void)
1441 */ 1411 */
1442void hrtimer_run_queues(void) 1412void hrtimer_run_queues(void)
1443{ 1413{
1444 struct rb_node *node; 1414 struct timerqueue_node *node;
1445 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1415 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1446 struct hrtimer_clock_base *base; 1416 struct hrtimer_clock_base *base;
1447 int index, gettime = 1; 1417 int index, gettime = 1;
@@ -1451,8 +1421,7 @@ void hrtimer_run_queues(void)
1451 1421
1452 for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { 1422 for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
1453 base = &cpu_base->clock_base[index]; 1423 base = &cpu_base->clock_base[index];
1454 1424 if (!timerqueue_getnext(&base->active))
1455 if (!base->first)
1456 continue; 1425 continue;
1457 1426
1458 if (gettime) { 1427 if (gettime) {
@@ -1462,10 +1431,10 @@ void hrtimer_run_queues(void)
1462 1431
1463 raw_spin_lock(&cpu_base->lock); 1432 raw_spin_lock(&cpu_base->lock);
1464 1433
1465 while ((node = base->first)) { 1434 while ((node = timerqueue_getnext(&base->active))) {
1466 struct hrtimer *timer; 1435 struct hrtimer *timer;
1467 1436
1468 timer = rb_entry(node, struct hrtimer, node); 1437 timer = container_of(node, struct hrtimer, node);
1469 if (base->softirq_time.tv64 <= 1438 if (base->softirq_time.tv64 <=
1470 hrtimer_get_expires_tv64(timer)) 1439 hrtimer_get_expires_tv64(timer))
1471 break; 1440 break;
@@ -1630,8 +1599,10 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1630 1599
1631 raw_spin_lock_init(&cpu_base->lock); 1600 raw_spin_lock_init(&cpu_base->lock);
1632 1601
1633 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1602 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1634 cpu_base->clock_base[i].cpu_base = cpu_base; 1603 cpu_base->clock_base[i].cpu_base = cpu_base;
1604 timerqueue_init_head(&cpu_base->clock_base[i].active);
1605 }
1635 1606
1636 hrtimer_init_hres(cpu_base); 1607 hrtimer_init_hres(cpu_base);
1637} 1608}
@@ -1642,10 +1613,10 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1642 struct hrtimer_clock_base *new_base) 1613 struct hrtimer_clock_base *new_base)
1643{ 1614{
1644 struct hrtimer *timer; 1615 struct hrtimer *timer;
1645 struct rb_node *node; 1616 struct timerqueue_node *node;
1646 1617
1647 while ((node = rb_first(&old_base->active))) { 1618 while ((node = timerqueue_getnext(&old_base->active))) {
1648 timer = rb_entry(node, struct hrtimer, node); 1619 timer = container_of(node, struct hrtimer, node);
1649 BUG_ON(hrtimer_callback_running(timer)); 1620 BUG_ON(hrtimer_callback_running(timer));
1650 debug_deactivate(timer); 1621 debug_deactivate(timer);
1651 1622
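
The hrtimer conversion above swaps the open-coded rbtree plus the cached base->first pointer for the generic timerqueue, whose head keeps the next-to-expire node at hand itself. A user-space analog of that interface shape, assuming nothing beyond libc; the real timerqueue sits on an rbtree, but a sorted intrusive list is enough to show the pattern:

#include <stdio.h>
#include <stddef.h>

struct tq_node {
    unsigned long long expires;
    struct tq_node *next;
};

struct tq_head {
    struct tq_node *first;          /* next node to expire, or NULL */
};

static void tq_add(struct tq_head *head, struct tq_node *node)
{
    struct tq_node **link = &head->first;

    while (*link && (*link)->expires <= node->expires)
        link = &(*link)->next;      /* equal expiry times stay together */
    node->next = *link;
    *link = node;
}

static void tq_del(struct tq_head *head, struct tq_node *node)
{
    struct tq_node **link = &head->first;

    while (*link && *link != node)
        link = &(*link)->next;
    if (*link)
        *link = node->next;
}

/* container_of(), as used above to get from the node back to the hrtimer */
#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct my_timer {
    struct tq_node node;            /* embedded, like hrtimer's node */
    const char *name;
};

int main(void)
{
    struct tq_head head = { NULL };
    struct my_timer a = { { 300, NULL }, "a" }, b = { { 100, NULL }, "b" };

    tq_add(&head, &a.node);
    tq_add(&head, &b.node);
    printf("next: %s\n", container_of(head.first, struct my_timer, node)->name);
    tq_del(&head, &b.node);
    printf("next: %s\n", container_of(head.first, struct my_timer, node)->name);
    return 0;
}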
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5f92acc5f952..91a5fa25054e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -577,7 +577,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
577 */ 577 */
578static int irq_thread(void *data) 578static int irq_thread(void *data)
579{ 579{
580 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; 580 static struct sched_param param = {
581 .sched_priority = MAX_USER_RT_PRIO/2,
582 };
581 struct irqaction *action = data; 583 struct irqaction *action = data;
582 struct irq_desc *desc = irq_to_desc(action->irq); 584 struct irq_desc *desc = irq_to_desc(action->irq);
583 int wake, oneshot = desc->status & IRQ_ONESHOT; 585 int wake, oneshot = desc->status & IRQ_ONESHOT;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ca61bbdd44b2..5355cfd44a3f 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
148 wait_for_completion(&create.done); 148 wait_for_completion(&create.done);
149 149
150 if (!IS_ERR(create.result)) { 150 if (!IS_ERR(create.result)) {
151 struct sched_param param = { .sched_priority = 0 }; 151 static struct sched_param param = { .sched_priority = 0 };
152 va_list args; 152 va_list args;
153 153
154 va_start(args, namefmt); 154 va_start(args, namefmt);
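
Both sched_param hunks above make the same space optimization: a priority struct that is only ever read can live once in .data instead of being built on every thread's stack. A minimal user-space sketch of the idea; priority 50 is an arbitrary placeholder, and the call needs CAP_SYS_NICE to actually succeed:

#include <stdio.h>
#include <sched.h>

/* One read-only instance in .data instead of a fresh copy in every
 * stack frame; safe because sched_setscheduler() only reads it. */
static int set_rt_priority(void)
{
    static const struct sched_param param = { .sched_priority = 50 };

    return sched_setscheduler(0 /* calling thread */, SCHED_FIFO, &param);
}

int main(void)
{
    if (set_rt_priority() != 0)
        perror("sched_setscheduler");   /* expected without CAP_SYS_NICE */
    return 0;
}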
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 59b76c8ce9d7..1969d2fc4b36 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -494,7 +494,6 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
494 namelen += 2; 494 namelen += 2;
495 495
496 for (i = 0; i < LOCKSTAT_POINTS; i++) { 496 for (i = 0; i < LOCKSTAT_POINTS; i++) {
497 char sym[KSYM_SYMBOL_LEN];
498 char ip[32]; 497 char ip[32];
499 498
500 if (class->contention_point[i] == 0) 499 if (class->contention_point[i] == 0)
@@ -503,15 +502,13 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
503 if (!i) 502 if (!i)
504 seq_line(m, '-', 40-namelen, namelen); 503 seq_line(m, '-', 40-namelen, namelen);
505 504
506 sprint_symbol(sym, class->contention_point[i]);
507 snprintf(ip, sizeof(ip), "[<%p>]", 505 snprintf(ip, sizeof(ip), "[<%p>]",
508 (void *)class->contention_point[i]); 506 (void *)class->contention_point[i]);
509 seq_printf(m, "%40s %14lu %29s %s\n", name, 507 seq_printf(m, "%40s %14lu %29s %pS\n",
510 stats->contention_point[i], 508 name, stats->contention_point[i],
511 ip, sym); 509 ip, (void *)class->contention_point[i]);
512 } 510 }
513 for (i = 0; i < LOCKSTAT_POINTS; i++) { 511 for (i = 0; i < LOCKSTAT_POINTS; i++) {
514 char sym[KSYM_SYMBOL_LEN];
515 char ip[32]; 512 char ip[32];
516 513
517 if (class->contending_point[i] == 0) 514 if (class->contending_point[i] == 0)
@@ -520,12 +517,11 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
520 if (!i) 517 if (!i)
521 seq_line(m, '-', 40-namelen, namelen); 518 seq_line(m, '-', 40-namelen, namelen);
522 519
523 sprint_symbol(sym, class->contending_point[i]);
524 snprintf(ip, sizeof(ip), "[<%p>]", 520 snprintf(ip, sizeof(ip), "[<%p>]",
525 (void *)class->contending_point[i]); 521 (void *)class->contending_point[i]);
526 seq_printf(m, "%40s %14lu %29s %s\n", name, 522 seq_printf(m, "%40s %14lu %29s %pS\n",
527 stats->contending_point[i], 523 name, stats->contending_point[i],
528 ip, sym); 524 ip, (void *)class->contending_point[i]);
529 } 525 }
530 if (i) { 526 if (i) {
531 seq_puts(m, "\n"); 527 seq_puts(m, "\n");
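
The lockdep_proc change leans on %pS, the kernel's vsnprintf extension that symbolizes a code address at format time, which is what lets the two KSYM_SYMBOL_LEN stack buffers and sprint_symbol() calls disappear. User space has no %pS, but dladdr() gives a rough analog of resolve-at-print-time; it is a glibc extension, so build with -rdynamic (and -ldl on older glibc):

#define _GNU_SOURCE
#include <dlfcn.h>
#include <stdio.h>

void interesting_function(void) { }

/* Resolve a code address to a name at print time, the idea behind the
 * kernel's %pS, rather than via a caller-managed buffer. */
static void print_symbolized(void *addr)
{
    Dl_info info;

    if (dladdr(addr, &info) && info.dli_sname)
        printf("[<%p>] %s\n", addr, info.dli_sname);
    else
        printf("[<%p>] <unknown>\n", addr);
}

int main(void)
{
    print_symbolized((void *)interesting_function);
    return 0;
}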
diff --git a/kernel/module.c b/kernel/module.c
index d190664f25ff..34e00b708fad 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -56,6 +56,7 @@
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/kmemleak.h> 57#include <linux/kmemleak.h>
58#include <linux/jump_label.h> 58#include <linux/jump_label.h>
59#include <linux/pfn.h>
59 60
60#define CREATE_TRACE_POINTS 61#define CREATE_TRACE_POINTS
61#include <trace/events/module.h> 62#include <trace/events/module.h>
@@ -70,6 +71,26 @@
70#define ARCH_SHF_SMALL 0 71#define ARCH_SHF_SMALL 0
71#endif 72#endif
72 73
74/*
75 * Modules' sections will be aligned on page boundaries
76 * to ensure complete separation of code and data, but
77 * only when CONFIG_DEBUG_SET_MODULE_RONX=y
78 */
79#ifdef CONFIG_DEBUG_SET_MODULE_RONX
80# define debug_align(X) ALIGN(X, PAGE_SIZE)
81#else
82# define debug_align(X) (X)
83#endif
84
85/*
 86/*
 87 * Given BASE and SIZE, this macro calculates the number of pages the
 88 * memory region occupies
88 */
89#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \
90 (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \
91 PFN_DOWN((unsigned long)BASE) + 1) \
92 : (0UL))
93
73/* If this is set, the section belongs in the init part of the module */ 94/* If this is set, the section belongs in the init part of the module */
74#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) 95#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
75 96
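
The MOD_NUMBER_OF_PAGES arithmetic just introduced is the kind that is easy to get wrong by one, so here is a small self-contained check of the same formula under an assumed 4 KiB page size:

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT 12                        /* assume 4 KiB pages */
#define PFN_DOWN(x) ((unsigned long)(x) >> PAGE_SHIFT)

/* Same formula as MOD_NUMBER_OF_PAGES above: pages touched by
 * [base, base + size), and 0 for an empty region. */
static unsigned long pages_spanned(unsigned long base, unsigned long size)
{
    if (size == 0)
        return 0;
    return PFN_DOWN(base + size - 1) - PFN_DOWN(base) + 1;
}

int main(void)
{
    assert(pages_spanned(0x1000, 0x1000) == 1); /* exactly one page */
    assert(pages_spanned(0x1800, 0x1000) == 2); /* straddles a boundary */
    assert(pages_spanned(0x1000, 0)      == 0); /* empty region */
    printf("MOD_NUMBER_OF_PAGES formula ok\n");
    return 0;
}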
@@ -1542,6 +1563,115 @@ static int __unlink_module(void *_mod)
1542 return 0; 1563 return 0;
1543} 1564}
1544 1565
1566#ifdef CONFIG_DEBUG_SET_MODULE_RONX
1567/*
1568 * LKM RO/NX protection: protect module's text/ro-data
1569 * from modification and any data from execution.
1570 */
1571void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages))
1572{
1573 unsigned long begin_pfn = PFN_DOWN((unsigned long)start);
1574 unsigned long end_pfn = PFN_DOWN((unsigned long)end);
1575
1576 if (end_pfn > begin_pfn)
1577 set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
1578}
1579
1580static void set_section_ro_nx(void *base,
1581 unsigned long text_size,
1582 unsigned long ro_size,
1583 unsigned long total_size)
1584{
1585 /* begin and end PFNs of the current subsection */
1586 unsigned long begin_pfn;
1587 unsigned long end_pfn;
1588
1589 /*
1590 * Set RO for module text and RO-data:
1591 * - Always protect first page.
1592 * - Do not protect last partial page.
1593 */
1594 if (ro_size > 0)
1595 set_page_attributes(base, base + ro_size, set_memory_ro);
1596
1597 /*
1598 * Set NX permissions for module data:
1599 * - Do not protect first partial page.
1600 * - Always protect last page.
1601 */
1602 if (total_size > text_size) {
1603 begin_pfn = PFN_UP((unsigned long)base + text_size);
1604 end_pfn = PFN_UP((unsigned long)base + total_size);
1605 if (end_pfn > begin_pfn)
1606 set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
1607 }
1608}
1609
1610/* Setting memory back to RW+NX before releasing it */
1611void unset_section_ro_nx(struct module *mod, void *module_region)
1612{
1613 unsigned long total_pages;
1614
1615 if (mod->module_core == module_region) {
1616 /* Set core as NX+RW */
1617 total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size);
1618 set_memory_nx((unsigned long)mod->module_core, total_pages);
1619 set_memory_rw((unsigned long)mod->module_core, total_pages);
1620
1621 } else if (mod->module_init == module_region) {
1622 /* Set init as NX+RW */
1623 total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size);
1624 set_memory_nx((unsigned long)mod->module_init, total_pages);
1625 set_memory_rw((unsigned long)mod->module_init, total_pages);
1626 }
1627}
1628
1629/* Iterate through all modules and set each module's text as RW */
1630void set_all_modules_text_rw(void)
1631{
1632 struct module *mod;
1633
1634 mutex_lock(&module_mutex);
1635 list_for_each_entry_rcu(mod, &modules, list) {
1636 if ((mod->module_core) && (mod->core_text_size)) {
1637 set_page_attributes(mod->module_core,
1638 mod->module_core + mod->core_text_size,
1639 set_memory_rw);
1640 }
1641 if ((mod->module_init) && (mod->init_text_size)) {
1642 set_page_attributes(mod->module_init,
1643 mod->module_init + mod->init_text_size,
1644 set_memory_rw);
1645 }
1646 }
1647 mutex_unlock(&module_mutex);
1648}
1649
1650/* Iterate through all modules and set each module's text as RO */
1651void set_all_modules_text_ro(void)
1652{
1653 struct module *mod;
1654
1655 mutex_lock(&module_mutex);
1656 list_for_each_entry_rcu(mod, &modules, list) {
1657 if ((mod->module_core) && (mod->core_text_size)) {
1658 set_page_attributes(mod->module_core,
1659 mod->module_core + mod->core_text_size,
1660 set_memory_ro);
1661 }
1662 if ((mod->module_init) && (mod->init_text_size)) {
1663 set_page_attributes(mod->module_init,
1664 mod->module_init + mod->init_text_size,
1665 set_memory_ro);
1666 }
1667 }
1668 mutex_unlock(&module_mutex);
1669}
1670#else
1671static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
1672static inline void unset_section_ro_nx(struct module *mod, void *module_region) { }
1673#endif
1674
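
Note how set_section_ro_nx() rounds its two ranges in opposite directions, exactly as its comments promise: the RO range rounds down at both ends (first page always protected, a trailing partial page left writable), while the NX range rounds up at both ends (a leading partial page left executable, last page always protected), so a page shared between text and data is never mis-protected. A sketch of that rounding with an assumed 4 KiB page size and made-up section sizes:

#include <stdio.h>

#define PAGE_SHIFT 12                        /* assume 4 KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PFN_DOWN(x) ((unsigned long)(x) >> PAGE_SHIFT)
#define PFN_UP(x)   (((unsigned long)(x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

int main(void)
{
    /* made-up, page-unaligned section sizes within one module core */
    unsigned long base = 0x10000, text = 0x1800, ro = 0x2800, total = 0x4000;

    /* RO rounds down at both ends: first page always protected,
     * trailing partial page (which may hold rw data) left writable */
    printf("ro pfns: [%lu, %lu)\n", PFN_DOWN(base), PFN_DOWN(base + ro));

    /* NX rounds up at both ends: leading partial page (shared with
     * text) left executable, last page always made non-executable */
    printf("nx pfns: [%lu, %lu)\n", PFN_UP(base + text), PFN_UP(base + total));
    return 0;
}

With debug_align() forcing each section class to a page boundary, the partial-page cases above only arise when CONFIG_DEBUG_SET_MODULE_RONX is off, in which case the protection calls are compiled out anyway.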
1545/* Free a module, remove from lists, etc. */ 1675/* Free a module, remove from lists, etc. */
1546static void free_module(struct module *mod) 1676static void free_module(struct module *mod)
1547{ 1677{
@@ -1566,6 +1696,7 @@ static void free_module(struct module *mod)
1566 destroy_params(mod->kp, mod->num_kp); 1696 destroy_params(mod->kp, mod->num_kp);
1567 1697
1568 /* This may be NULL, but that's OK */ 1698 /* This may be NULL, but that's OK */
1699 unset_section_ro_nx(mod, mod->module_init);
1569 module_free(mod, mod->module_init); 1700 module_free(mod, mod->module_init);
1570 kfree(mod->args); 1701 kfree(mod->args);
1571 percpu_modfree(mod); 1702 percpu_modfree(mod);
@@ -1574,6 +1705,7 @@ static void free_module(struct module *mod)
1574 lockdep_free_key_range(mod->module_core, mod->core_size); 1705 lockdep_free_key_range(mod->module_core, mod->core_size);
1575 1706
1576 /* Finally, free the core (containing the module structure) */ 1707 /* Finally, free the core (containing the module structure) */
1708 unset_section_ro_nx(mod, mod->module_core);
1577 module_free(mod, mod->module_core); 1709 module_free(mod, mod->module_core);
1578 1710
1579#ifdef CONFIG_MPU 1711#ifdef CONFIG_MPU
@@ -1777,8 +1909,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
1777 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 1909 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1778 DEBUGP("\t%s\n", name); 1910 DEBUGP("\t%s\n", name);
1779 } 1911 }
1780 if (m == 0) 1912 switch (m) {
1913 case 0: /* executable */
1914 mod->core_size = debug_align(mod->core_size);
1781 mod->core_text_size = mod->core_size; 1915 mod->core_text_size = mod->core_size;
1916 break;
1917 case 1: /* RO: text and ro-data */
1918 mod->core_size = debug_align(mod->core_size);
1919 mod->core_ro_size = mod->core_size;
1920 break;
1921 case 3: /* whole core */
1922 mod->core_size = debug_align(mod->core_size);
1923 break;
1924 }
1782 } 1925 }
1783 1926
1784 DEBUGP("Init section allocation order:\n"); 1927 DEBUGP("Init section allocation order:\n");
@@ -1796,8 +1939,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
1796 | INIT_OFFSET_MASK); 1939 | INIT_OFFSET_MASK);
1797 DEBUGP("\t%s\n", sname); 1940 DEBUGP("\t%s\n", sname);
1798 } 1941 }
1799 if (m == 0) 1942 switch (m) {
1943 case 0: /* executable */
1944 mod->init_size = debug_align(mod->init_size);
1800 mod->init_text_size = mod->init_size; 1945 mod->init_text_size = mod->init_size;
1946 break;
1947 case 1: /* RO: text and ro-data */
1948 mod->init_size = debug_align(mod->init_size);
1949 mod->init_ro_size = mod->init_size;
1950 break;
1951 case 3: /* whole init */
1952 mod->init_size = debug_align(mod->init_size);
1953 break;
1954 }
1801 } 1955 }
1802} 1956}
1803 1957
@@ -2722,6 +2876,18 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2722 blocking_notifier_call_chain(&module_notify_list, 2876 blocking_notifier_call_chain(&module_notify_list,
2723 MODULE_STATE_COMING, mod); 2877 MODULE_STATE_COMING, mod);
2724 2878
2879 /* Set RO and NX regions for core */
2880 set_section_ro_nx(mod->module_core,
2881 mod->core_text_size,
2882 mod->core_ro_size,
2883 mod->core_size);
2884
2885 /* Set RO and NX regions for init */
2886 set_section_ro_nx(mod->module_init,
2887 mod->init_text_size,
2888 mod->init_ro_size,
2889 mod->init_size);
2890
2725 do_mod_ctors(mod); 2891 do_mod_ctors(mod);
2726 /* Start the module */ 2892 /* Start the module */
2727 if (mod->init != NULL) 2893 if (mod->init != NULL)
@@ -2765,6 +2931,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2765 mod->symtab = mod->core_symtab; 2931 mod->symtab = mod->core_symtab;
2766 mod->strtab = mod->core_strtab; 2932 mod->strtab = mod->core_strtab;
2767#endif 2933#endif
2934 unset_section_ro_nx(mod, mod->module_init);
2768 module_free(mod, mod->module_init); 2935 module_free(mod, mod->module_init);
2769 mod->module_init = NULL; 2936 mod->module_init = NULL;
2770 mod->init_size = 0; 2937 mod->init_size = 0;
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 200407c1502f..a5889fb28ecf 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -199,7 +199,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
199 * memory barriers as we'll eventually observe the right 199 * memory barriers as we'll eventually observe the right
200 * values at the cost of a few extra spins. 200 * values at the cost of a few extra spins.
201 */ 201 */
202 cpu_relax(); 202 arch_mutex_cpu_relax();
203 } 203 }
204#endif 204#endif
205 spin_lock_mutex(&lock->wait_lock, flags); 205 spin_lock_mutex(&lock->wait_lock, flags);
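
The mutex.c one-liner exists so an architecture (s390, per the file list in this merge) can substitute its own spin-wait hint while everyone else keeps plain cpu_relax(). A compilable sketch of the default-unless-overridden pattern, with a do-nothing compiler barrier standing in for the real cpu_relax():

#include <stdio.h>

/* A compiler barrier stands in for the real cpu_relax(); x86 would
 * emit a "pause" instruction here. */
#define cpu_relax() __asm__ __volatile__("" ::: "memory")

/* Generic default, replaceable by an arch header - the shape the
 * mutex.c change switches to. */
#ifndef arch_mutex_cpu_relax
# define arch_mutex_cpu_relax() cpu_relax()
#endif

int main(void)
{
    for (int i = 0; i < 3; i++)
        arch_mutex_cpu_relax();     /* the adaptive-spin wait hint */
    printf("spun\n");
    return 0;
}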
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9ca4973f736d..93bd2eb2bc53 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -145,7 +145,13 @@ static int common_timer_del(struct k_itimer *timer);
145 145
146static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); 146static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
147 147
148static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); 148static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
149
150#define lock_timer(tid, flags) \
151({ struct k_itimer *__timr; \
152 __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \
153 __timr; \
154})
149 155
150static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) 156static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
151{ 157{
@@ -619,7 +625,7 @@ out:
 619	 * the find to the timer lock. To avoid a deadlock, the timer id MUST		 625	 * the find to the timer lock. To avoid a deadlock, the timer id MUST
 620	 * be released without holding the timer lock.		 626	 * be released without holding the timer lock.
621 */ 627 */
622static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) 628static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
623{ 629{
624 struct k_itimer *timr; 630 struct k_itimer *timr;
625 /* 631 /*
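
__cond_lock() is a real sparse annotation: under __CHECKER__ it tells the static checker that the lock is acquired only when the expression succeeds, and in normal builds it expands to just its second argument, leaving lock_timer() a plain GNU statement-expression wrapper around __lock_timer(). A reduced, runnable version of that shape; the timer table and "locking" here are toy stand-ins:

#include <stdio.h>

/* Outside sparse (__CHECKER__), the kernel defines __cond_lock(x, c)
 * as just (c); the lock expression only matters to the checker. */
#define __cond_lock(x, c) (c)

struct k_itimer { int it_lock; };

static struct k_itimer timer_table[4];

static struct k_itimer *__lock_timer(int timer_id)
{
    if (timer_id < 0 || timer_id >= 4)
        return NULL;
    timer_table[timer_id].it_lock = 1;      /* "acquired" on success */
    return &timer_table[timer_id];
}

/* Same GNU statement-expression shape as the lock_timer() macro above */
#define lock_timer(tid)                                             \
({  struct k_itimer *__timr;                                        \
    __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid));      \
    __timr;                                                         \
})

int main(void)
{
    struct k_itimer *t = lock_timer(2);

    printf("locked=%d\n", t ? t->it_lock : -1);
    return 0;
}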
diff --git a/kernel/printk.c b/kernel/printk.c
index a23315dc4498..ab3ffc5b3b64 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1074,17 +1074,17 @@ static DEFINE_PER_CPU(int, printk_pending);
1074 1074
1075void printk_tick(void) 1075void printk_tick(void)
1076{ 1076{
1077 if (__get_cpu_var(printk_pending)) { 1077 if (__this_cpu_read(printk_pending)) {
1078 __get_cpu_var(printk_pending) = 0; 1078 __this_cpu_write(printk_pending, 0);
1079 wake_up_interruptible(&log_wait); 1079 wake_up_interruptible(&log_wait);
1080 } 1080 }
1081} 1081}
1082 1082
1083int printk_needs_cpu(int cpu) 1083int printk_needs_cpu(int cpu)
1084{ 1084{
1085 if (unlikely(cpu_is_offline(cpu))) 1085 if (cpu_is_offline(cpu))
1086 printk_tick(); 1086 printk_tick();
1087 return per_cpu(printk_pending, cpu); 1087 return __this_cpu_read(printk_pending);
1088} 1088}
1089 1089
1090void wake_up_klogd(void) 1090void wake_up_klogd(void)
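
__this_cpu_read()/__this_cpu_write() let x86 fold the find-this-CPU's-copy step into a single %gs-relative instruction, instead of computing the per-cpu address first as __get_cpu_var() does. There is no per-cpu data in user space, but thread-local storage is the nearest analog of the pattern the hunk relies on (build with -pthread):

#include <pthread.h>
#include <stdio.h>

/* __thread gives each thread a private copy the way per-cpu data
 * gives each CPU one; the accessors in the hunk fold the
 * "find my copy" step into a single instruction. */
static __thread int printk_pending;

static void *tick(void *arg)
{
    printk_pending = 1;             /* ~ __this_cpu_write(..., 1) */
    if (printk_pending) {           /* ~ __this_cpu_read(...) */
        printk_pending = 0;
        printf("thread %ld: flushed\n", (long)arg);
    }
    return NULL;
}

int main(void)
{
    pthread_t a, b;

    pthread_create(&a, NULL, tick, (void *)1L);
    pthread_create(&b, NULL, tick, (void *)2L);
    pthread_join(a, NULL);
    pthread_join(b, NULL);
    return 0;
}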
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index d806735342ac..034493724749 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -36,31 +36,16 @@
36#include <linux/time.h> 36#include <linux/time.h>
37#include <linux/cpu.h> 37#include <linux/cpu.h>
38 38
39/* Global control variables for rcupdate callback mechanism. */ 39/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
40struct rcu_ctrlblk { 40static struct task_struct *rcu_kthread_task;
41 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ 41static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
42 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ 42static unsigned long have_rcu_kthread_work;
43 struct rcu_head **curtail; /* ->next pointer of last CB. */ 43static void invoke_rcu_kthread(void);
44};
45
46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_sched_ctrlblk = {
48 .donetail = &rcu_sched_ctrlblk.rcucblist,
49 .curtail = &rcu_sched_ctrlblk.rcucblist,
50};
51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
53 .donetail = &rcu_bh_ctrlblk.rcucblist,
54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55};
56
57#ifdef CONFIG_DEBUG_LOCK_ALLOC
58int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61 44
62/* Forward declarations for rcutiny_plugin.h. */ 45/* Forward declarations for rcutiny_plugin.h. */
63static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); 46struct rcu_ctrlblk;
47static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
48static int rcu_kthread(void *arg);
64static void __call_rcu(struct rcu_head *head, 49static void __call_rcu(struct rcu_head *head,
65 void (*func)(struct rcu_head *rcu), 50 void (*func)(struct rcu_head *rcu),
66 struct rcu_ctrlblk *rcp); 51 struct rcu_ctrlblk *rcp);
@@ -123,7 +108,7 @@ void rcu_sched_qs(int cpu)
123{ 108{
124 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 109 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
125 rcu_qsctr_help(&rcu_bh_ctrlblk)) 110 rcu_qsctr_help(&rcu_bh_ctrlblk))
126 raise_softirq(RCU_SOFTIRQ); 111 invoke_rcu_kthread();
127} 112}
128 113
129/* 114/*
@@ -132,7 +117,7 @@ void rcu_sched_qs(int cpu)
132void rcu_bh_qs(int cpu) 117void rcu_bh_qs(int cpu)
133{ 118{
134 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 119 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
135 raise_softirq(RCU_SOFTIRQ); 120 invoke_rcu_kthread();
136} 121}
137 122
138/* 123/*
@@ -152,13 +137,14 @@ void rcu_check_callbacks(int cpu, int user)
152} 137}
153 138
154/* 139/*
 155	 * Helper function for rcu_process_callbacks() that operates on the		 140	 * Invoke the RCU callbacks on the specified rcu_ctrlblk structure
 156	 * specified rcu_ctrlblk structure.		 141	 * whose grace period has elapsed.
157 */ 142 */
158static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) 143static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
159{ 144{
160 struct rcu_head *next, *list; 145 struct rcu_head *next, *list;
161 unsigned long flags; 146 unsigned long flags;
147 RCU_TRACE(int cb_count = 0);
162 148
163 /* If no RCU callbacks ready to invoke, just return. */ 149 /* If no RCU callbacks ready to invoke, just return. */
164 if (&rcp->rcucblist == rcp->donetail) 150 if (&rcp->rcucblist == rcp->donetail)
@@ -180,19 +166,58 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
180 next = list->next; 166 next = list->next;
181 prefetch(next); 167 prefetch(next);
182 debug_rcu_head_unqueue(list); 168 debug_rcu_head_unqueue(list);
169 local_bh_disable();
183 list->func(list); 170 list->func(list);
171 local_bh_enable();
184 list = next; 172 list = next;
173 RCU_TRACE(cb_count++);
185 } 174 }
175 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
186} 176}
187 177
188/* 178/*
189 * Invoke any callbacks whose grace period has completed. 179 * This kthread invokes RCU callbacks whose grace periods have
180 * elapsed. It is awakened as needed, and takes the place of the
181 * RCU_SOFTIRQ that was used previously for this purpose.
182 * This is a kthread, but it is never stopped, at least not until
183 * the system goes down.
190 */ 184 */
191static void rcu_process_callbacks(struct softirq_action *unused) 185static int rcu_kthread(void *arg)
192{ 186{
193 __rcu_process_callbacks(&rcu_sched_ctrlblk); 187 unsigned long work;
194 __rcu_process_callbacks(&rcu_bh_ctrlblk); 188 unsigned long morework;
195 rcu_preempt_process_callbacks(); 189 unsigned long flags;
190
191 for (;;) {
192 wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0);
193 morework = rcu_boost();
194 local_irq_save(flags);
195 work = have_rcu_kthread_work;
196 have_rcu_kthread_work = morework;
197 local_irq_restore(flags);
198 if (work) {
199 rcu_process_callbacks(&rcu_sched_ctrlblk);
200 rcu_process_callbacks(&rcu_bh_ctrlblk);
201 rcu_preempt_process_callbacks();
202 }
203 schedule_timeout_interruptible(1); /* Leave CPU for others. */
204 }
205
206 return 0; /* Not reached, but needed to shut gcc up. */
207}
208
209/*
210 * Wake up rcu_kthread() to process callbacks now eligible for invocation
211 * or to boost readers.
212 */
213static void invoke_rcu_kthread(void)
214{
215 unsigned long flags;
216
217 local_irq_save(flags);
218 have_rcu_kthread_work = 1;
219 wake_up(&rcu_kthread_wq);
220 local_irq_restore(flags);
196} 221}
197 222
198/* 223/*
@@ -230,6 +255,7 @@ static void __call_rcu(struct rcu_head *head,
230 local_irq_save(flags); 255 local_irq_save(flags);
231 *rcp->curtail = head; 256 *rcp->curtail = head;
232 rcp->curtail = &head->next; 257 rcp->curtail = &head->next;
258 RCU_TRACE(rcp->qlen++);
233 local_irq_restore(flags); 259 local_irq_restore(flags);
234} 260}
235 261
@@ -282,7 +308,16 @@ void rcu_barrier_sched(void)
282} 308}
283EXPORT_SYMBOL_GPL(rcu_barrier_sched); 309EXPORT_SYMBOL_GPL(rcu_barrier_sched);
284 310
285void __init rcu_init(void) 311/*
312 * Spawn the kthread that invokes RCU callbacks.
313 */
314static int __init rcu_spawn_kthreads(void)
286{ 315{
287 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 316 struct sched_param sp;
317
318 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
319 sp.sched_priority = RCU_BOOST_PRIO;
320 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
321 return 0;
288} 322}
323early_initcall(rcu_spawn_kthreads);
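
The new rcu_kthread() is the classic flag-plus-waitqueue handshake: producers set have_rcu_kthread_work and wake the queue, and the kthread sleeps in wait_event() until the flag goes up. A pthread condition-variable analog of the same handshake, simplified to a single wakeup:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wq   = PTHREAD_COND_INITIALIZER;
static int have_work;               /* ~ have_rcu_kthread_work */

/* ~ invoke_rcu_kthread(): flag the work, then wake the worker */
static void invoke_worker(void)
{
    pthread_mutex_lock(&lock);
    have_work = 1;
    pthread_cond_signal(&wq);
    pthread_mutex_unlock(&lock);
}

/* ~ rcu_kthread(): sleep until work is flagged, then process it */
static void *worker(void *unused)
{
    (void)unused;
    pthread_mutex_lock(&lock);
    while (!have_work)              /* ~ wait_event(...) */
        pthread_cond_wait(&wq, &lock);
    have_work = 0;
    pthread_mutex_unlock(&lock);
    printf("callbacks processed\n"); /* ~ rcu_process_callbacks() */
    return NULL;                     /* the kernel version loops forever */
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, worker, NULL);
    invoke_worker();
    pthread_join(t, NULL);
    return 0;
}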
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 6ceca4f745ff..015abaea962a 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -22,6 +22,40 @@
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */ 23 */
24 24
25#include <linux/kthread.h>
26#include <linux/debugfs.h>
27#include <linux/seq_file.h>
28
29#ifdef CONFIG_RCU_TRACE
30#define RCU_TRACE(stmt) stmt
31#else /* #ifdef CONFIG_RCU_TRACE */
32#define RCU_TRACE(stmt)
33#endif /* #else #ifdef CONFIG_RCU_TRACE */
34
35/* Global control variables for rcupdate callback mechanism. */
36struct rcu_ctrlblk {
37 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
38 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
39 struct rcu_head **curtail; /* ->next pointer of last CB. */
40 RCU_TRACE(long qlen); /* Number of pending CBs. */
41};
42
43/* Definition for rcupdate control block. */
44static struct rcu_ctrlblk rcu_sched_ctrlblk = {
45 .donetail = &rcu_sched_ctrlblk.rcucblist,
46 .curtail = &rcu_sched_ctrlblk.rcucblist,
47};
48
49static struct rcu_ctrlblk rcu_bh_ctrlblk = {
50 .donetail = &rcu_bh_ctrlblk.rcucblist,
51 .curtail = &rcu_bh_ctrlblk.rcucblist,
52};
53
54#ifdef CONFIG_DEBUG_LOCK_ALLOC
55int rcu_scheduler_active __read_mostly;
56EXPORT_SYMBOL_GPL(rcu_scheduler_active);
57#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
58
25#ifdef CONFIG_TINY_PREEMPT_RCU 59#ifdef CONFIG_TINY_PREEMPT_RCU
26 60
27#include <linux/delay.h> 61#include <linux/delay.h>
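
RCU_TRACE(), defined in the hunk above, carries the whole tracing story: statements, and even struct fields, wrapped in it exist only when CONFIG_RCU_TRACE is set, so the qlen accounting costs nothing in configured-out builds. A standalone sketch of that conditional-statistics trick; the macro and config names here are illustrative, not the kernel's:

#include <stdio.h>

#define MY_CONFIG_TRACE 1   /* flip to 0 and all accounting vanishes */

#if MY_CONFIG_TRACE
#define TRACE(stmt) stmt
#else
#define TRACE(stmt)
#endif

struct ctrlblk {
    int pending;
    TRACE(long qlen;)       /* statistics field exists only when tracing */
};

int main(void)
{
    struct ctrlblk cb = { 0 };

    cb.pending = 1;
    TRACE(cb.qlen++);       /* compiles to nothing when configured out */
    TRACE(printf("qlen=%ld\n", cb.qlen));
    return 0;
}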
@@ -46,17 +80,45 @@ struct rcu_preempt_ctrlblk {
46 struct list_head *gp_tasks; 80 struct list_head *gp_tasks;
47 /* Pointer to the first task blocking the */ 81 /* Pointer to the first task blocking the */
48 /* current grace period, or NULL if there */ 82 /* current grace period, or NULL if there */
49 /* is not such task. */ 83 /* is no such task. */
50 struct list_head *exp_tasks; 84 struct list_head *exp_tasks;
51 /* Pointer to first task blocking the */ 85 /* Pointer to first task blocking the */
52 /* current expedited grace period, or NULL */ 86 /* current expedited grace period, or NULL */
53 /* if there is no such task. If there */ 87 /* if there is no such task. If there */
54 /* is no current expedited grace period, */ 88 /* is no current expedited grace period, */
55 /* then there cannot be any such task. */ 89 /* then there cannot be any such task. */
90#ifdef CONFIG_RCU_BOOST
91 struct list_head *boost_tasks;
92 /* Pointer to first task that needs to be */
93 /* priority-boosted, or NULL if no priority */
94 /* boosting is needed. If there is no */
95 /* current or expedited grace period, there */
96 /* can be no such task. */
97#endif /* #ifdef CONFIG_RCU_BOOST */
56 u8 gpnum; /* Current grace period. */ 98 u8 gpnum; /* Current grace period. */
57 u8 gpcpu; /* Last grace period blocked by the CPU. */ 99 u8 gpcpu; /* Last grace period blocked by the CPU. */
58 u8 completed; /* Last grace period completed. */ 100 u8 completed; /* Last grace period completed. */
59 /* If all three are equal, RCU is idle. */ 101 /* If all three are equal, RCU is idle. */
102#ifdef CONFIG_RCU_BOOST
103 s8 boosted_this_gp; /* Has boosting already happened? */
104 unsigned long boost_time; /* When to start boosting (jiffies) */
105#endif /* #ifdef CONFIG_RCU_BOOST */
106#ifdef CONFIG_RCU_TRACE
107 unsigned long n_grace_periods;
108#ifdef CONFIG_RCU_BOOST
109 unsigned long n_tasks_boosted;
110 unsigned long n_exp_boosts;
111 unsigned long n_normal_boosts;
112 unsigned long n_normal_balk_blkd_tasks;
113 unsigned long n_normal_balk_gp_tasks;
114 unsigned long n_normal_balk_boost_tasks;
115 unsigned long n_normal_balk_boosted;
116 unsigned long n_normal_balk_notyet;
117 unsigned long n_normal_balk_nos;
118 unsigned long n_exp_balk_blkd_tasks;
119 unsigned long n_exp_balk_nos;
120#endif /* #ifdef CONFIG_RCU_BOOST */
121#endif /* #ifdef CONFIG_RCU_TRACE */
60}; 122};
61 123
62static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { 124static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
@@ -122,6 +184,210 @@ static int rcu_preempt_gp_in_progress(void)
122} 184}
123 185
124/* 186/*
 187 * Advance a ->blkd_tasks-list pointer to the next entry, returning
 188 * NULL instead if at the end of the list.
189 */
190static struct list_head *rcu_next_node_entry(struct task_struct *t)
191{
192 struct list_head *np;
193
194 np = t->rcu_node_entry.next;
195 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
196 np = NULL;
197 return np;
198}
199
200#ifdef CONFIG_RCU_TRACE
201
202#ifdef CONFIG_RCU_BOOST
203static void rcu_initiate_boost_trace(void);
204static void rcu_initiate_exp_boost_trace(void);
205#endif /* #ifdef CONFIG_RCU_BOOST */
206
207/*
 208 * Dump additional statistics for TINY_PREEMPT_RCU.
209 */
210static void show_tiny_preempt_stats(struct seq_file *m)
211{
212 seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
213 rcu_preempt_ctrlblk.rcb.qlen,
214 rcu_preempt_ctrlblk.n_grace_periods,
215 rcu_preempt_ctrlblk.gpnum,
216 rcu_preempt_ctrlblk.gpcpu,
217 rcu_preempt_ctrlblk.completed,
218 "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
219 "N."[!rcu_preempt_ctrlblk.gp_tasks],
220 "E."[!rcu_preempt_ctrlblk.exp_tasks]);
221#ifdef CONFIG_RCU_BOOST
222 seq_printf(m, " ttb=%c btg=",
223 "B."[!rcu_preempt_ctrlblk.boost_tasks]);
224 switch (rcu_preempt_ctrlblk.boosted_this_gp) {
225 case -1:
226 seq_puts(m, "exp");
227 break;
228 case 0:
229 seq_puts(m, "no");
230 break;
231 case 1:
232 seq_puts(m, "begun");
233 break;
234 case 2:
235 seq_puts(m, "done");
236 break;
237 default:
238 seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
239 }
240 seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
241 rcu_preempt_ctrlblk.n_tasks_boosted,
242 rcu_preempt_ctrlblk.n_exp_boosts,
243 rcu_preempt_ctrlblk.n_normal_boosts,
244 (int)(jiffies & 0xffff),
245 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
246 seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n",
247 "normal balk",
248 rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks,
249 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks,
250 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks,
251 rcu_preempt_ctrlblk.n_normal_balk_boosted,
252 rcu_preempt_ctrlblk.n_normal_balk_notyet,
253 rcu_preempt_ctrlblk.n_normal_balk_nos);
254 seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
255 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
256 rcu_preempt_ctrlblk.n_exp_balk_nos);
257#endif /* #ifdef CONFIG_RCU_BOOST */
258}
259
260#endif /* #ifdef CONFIG_RCU_TRACE */
261
262#ifdef CONFIG_RCU_BOOST
263
264#include "rtmutex_common.h"
265
266/*
267 * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
268 * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
269 */
270static int rcu_boost(void)
271{
272 unsigned long flags;
273 struct rt_mutex mtx;
274 struct list_head *np;
275 struct task_struct *t;
276
277 if (rcu_preempt_ctrlblk.boost_tasks == NULL)
278 return 0; /* Nothing to boost. */
279 raw_local_irq_save(flags);
280 rcu_preempt_ctrlblk.boosted_this_gp++;
281 t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct,
282 rcu_node_entry);
283 np = rcu_next_node_entry(t);
284 rt_mutex_init_proxy_locked(&mtx, t);
285 t->rcu_boost_mutex = &mtx;
286 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
287 raw_local_irq_restore(flags);
288 rt_mutex_lock(&mtx);
289 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
290 rcu_preempt_ctrlblk.boosted_this_gp++;
291 rt_mutex_unlock(&mtx);
292 return rcu_preempt_ctrlblk.boost_tasks != NULL;
293}
294
295/*
296 * Check to see if it is now time to start boosting RCU readers blocking
297 * the current grace period, and, if so, tell the rcu_kthread_task to
298 * start boosting them. If there is an expedited boost in progress,
299 * we wait for it to complete.
300 *
301 * If there are no blocked readers blocking the current grace period,
302 * return 0 to let the caller know, otherwise return 1. Note that this
303 * return value is independent of whether or not boosting was done.
304 */
305static int rcu_initiate_boost(void)
306{
307 if (!rcu_preempt_blocked_readers_cgp()) {
308 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++);
309 return 0;
310 }
311 if (rcu_preempt_ctrlblk.gp_tasks != NULL &&
312 rcu_preempt_ctrlblk.boost_tasks == NULL &&
313 rcu_preempt_ctrlblk.boosted_this_gp == 0 &&
314 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) {
315 rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks;
316 invoke_rcu_kthread();
317 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
318 } else
319 RCU_TRACE(rcu_initiate_boost_trace());
320 return 1;
321}
322
323/*
324 * Initiate boosting for an expedited grace period.
325 */
326static void rcu_initiate_expedited_boost(void)
327{
328 unsigned long flags;
329
330 raw_local_irq_save(flags);
331 if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
332 rcu_preempt_ctrlblk.boost_tasks =
333 rcu_preempt_ctrlblk.blkd_tasks.next;
334 rcu_preempt_ctrlblk.boosted_this_gp = -1;
335 invoke_rcu_kthread();
336 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
337 } else
338 RCU_TRACE(rcu_initiate_exp_boost_trace());
339 raw_local_irq_restore(flags);
340}
341
 342#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
343
344/*
345 * Do priority-boost accounting for the start of a new grace period.
346 */
347static void rcu_preempt_boost_start_gp(void)
348{
349 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
350 if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
351 rcu_preempt_ctrlblk.boosted_this_gp = 0;
352}
353
354#else /* #ifdef CONFIG_RCU_BOOST */
355
356/*
357 * If there is no RCU priority boosting, we don't boost.
358 */
359static int rcu_boost(void)
360{
361 return 0;
362}
363
364/*
365 * If there is no RCU priority boosting, we don't initiate boosting,
366 * but we do indicate whether there are blocked readers blocking the
367 * current grace period.
368 */
369static int rcu_initiate_boost(void)
370{
371 return rcu_preempt_blocked_readers_cgp();
372}
373
374/*
375 * If there is no RCU priority boosting, we don't initiate expedited boosting.
376 */
377static void rcu_initiate_expedited_boost(void)
378{
379}
380
381/*
382 * If there is no RCU priority boosting, nothing to do at grace-period start.
383 */
384static void rcu_preempt_boost_start_gp(void)
385{
386}
387
388#endif /* else #ifdef CONFIG_RCU_BOOST */
389
390/*
125 * Record a preemptible-RCU quiescent state for the specified CPU. Note 391 * Record a preemptible-RCU quiescent state for the specified CPU. Note
126 * that this just means that the task currently running on the CPU is 392 * that this just means that the task currently running on the CPU is
127 * in a quiescent state. There might be any number of tasks blocked 393 * in a quiescent state. There might be any number of tasks blocked
@@ -148,11 +414,14 @@ static void rcu_preempt_cpu_qs(void)
148 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; 414 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
149 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 415 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
150 416
417 /* If there is no GP then there is nothing more to do. */
418 if (!rcu_preempt_gp_in_progress())
419 return;
151 /* 420 /*
152 * If there is no GP, or if blocked readers are still blocking GP, 421 * Check up on boosting. If there are no readers blocking the
153 * then there is nothing more to do. 422 * current grace period, leave.
154 */ 423 */
155 if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) 424 if (rcu_initiate_boost())
156 return; 425 return;
157 426
158 /* Advance callbacks. */ 427 /* Advance callbacks. */
@@ -164,9 +433,9 @@ static void rcu_preempt_cpu_qs(void)
164 if (!rcu_preempt_blocked_readers_any()) 433 if (!rcu_preempt_blocked_readers_any())
165 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; 434 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
166 435
167 /* If there are done callbacks, make RCU_SOFTIRQ process them. */ 436 /* If there are done callbacks, cause them to be invoked. */
168 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) 437 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
169 raise_softirq(RCU_SOFTIRQ); 438 invoke_rcu_kthread();
170} 439}
171 440
172/* 441/*
@@ -178,12 +447,16 @@ static void rcu_preempt_start_gp(void)
178 447
179 /* Official start of GP. */ 448 /* Official start of GP. */
180 rcu_preempt_ctrlblk.gpnum++; 449 rcu_preempt_ctrlblk.gpnum++;
450 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
181 451
182 /* Any blocked RCU readers block new GP. */ 452 /* Any blocked RCU readers block new GP. */
183 if (rcu_preempt_blocked_readers_any()) 453 if (rcu_preempt_blocked_readers_any())
184 rcu_preempt_ctrlblk.gp_tasks = 454 rcu_preempt_ctrlblk.gp_tasks =
185 rcu_preempt_ctrlblk.blkd_tasks.next; 455 rcu_preempt_ctrlblk.blkd_tasks.next;
186 456
457 /* Set up for RCU priority boosting. */
458 rcu_preempt_boost_start_gp();
459
187 /* If there is no running reader, CPU is done with GP. */ 460 /* If there is no running reader, CPU is done with GP. */
188 if (!rcu_preempt_running_reader()) 461 if (!rcu_preempt_running_reader())
189 rcu_preempt_cpu_qs(); 462 rcu_preempt_cpu_qs();
@@ -304,14 +577,16 @@ static void rcu_read_unlock_special(struct task_struct *t)
304 */ 577 */
305 empty = !rcu_preempt_blocked_readers_cgp(); 578 empty = !rcu_preempt_blocked_readers_cgp();
306 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; 579 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
307 np = t->rcu_node_entry.next; 580 np = rcu_next_node_entry(t);
308 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
309 np = NULL;
310 list_del(&t->rcu_node_entry); 581 list_del(&t->rcu_node_entry);
311 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) 582 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
312 rcu_preempt_ctrlblk.gp_tasks = np; 583 rcu_preempt_ctrlblk.gp_tasks = np;
313 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) 584 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
314 rcu_preempt_ctrlblk.exp_tasks = np; 585 rcu_preempt_ctrlblk.exp_tasks = np;
586#ifdef CONFIG_RCU_BOOST
587 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
588 rcu_preempt_ctrlblk.boost_tasks = np;
589#endif /* #ifdef CONFIG_RCU_BOOST */
315 INIT_LIST_HEAD(&t->rcu_node_entry); 590 INIT_LIST_HEAD(&t->rcu_node_entry);
316 591
317 /* 592 /*
@@ -331,6 +606,14 @@ static void rcu_read_unlock_special(struct task_struct *t)
331 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) 606 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
332 rcu_report_exp_done(); 607 rcu_report_exp_done();
333 } 608 }
609#ifdef CONFIG_RCU_BOOST
610 /* Unboost self if was boosted. */
611 if (special & RCU_READ_UNLOCK_BOOSTED) {
612 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
613 rt_mutex_unlock(t->rcu_boost_mutex);
614 t->rcu_boost_mutex = NULL;
615 }
616#endif /* #ifdef CONFIG_RCU_BOOST */
334 local_irq_restore(flags); 617 local_irq_restore(flags);
335} 618}
336 619
@@ -374,7 +657,7 @@ static void rcu_preempt_check_callbacks(void)
374 rcu_preempt_cpu_qs(); 657 rcu_preempt_cpu_qs();
375 if (&rcu_preempt_ctrlblk.rcb.rcucblist != 658 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
376 rcu_preempt_ctrlblk.rcb.donetail) 659 rcu_preempt_ctrlblk.rcb.donetail)
377 raise_softirq(RCU_SOFTIRQ); 660 invoke_rcu_kthread();
378 if (rcu_preempt_gp_in_progress() && 661 if (rcu_preempt_gp_in_progress() &&
379 rcu_cpu_blocking_cur_gp() && 662 rcu_cpu_blocking_cur_gp() &&
380 rcu_preempt_running_reader()) 663 rcu_preempt_running_reader())
@@ -383,7 +666,7 @@ static void rcu_preempt_check_callbacks(void)
383 666
384/* 667/*
385 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to 668 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
386 * update, so this is invoked from __rcu_process_callbacks() to 669 * update, so this is invoked from rcu_process_callbacks() to
387 * handle that case. Of course, it is invoked for all flavors of 670 * handle that case. Of course, it is invoked for all flavors of
388 * RCU, but RCU callbacks can appear only on one of the lists, and 671 * RCU, but RCU callbacks can appear only on one of the lists, and
389 * neither ->nexttail nor ->donetail can possibly be NULL, so there 672 * neither ->nexttail nor ->donetail can possibly be NULL, so there
@@ -400,7 +683,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
400 */ 683 */
401static void rcu_preempt_process_callbacks(void) 684static void rcu_preempt_process_callbacks(void)
402{ 685{
403 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); 686 rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
404} 687}
405 688
406/* 689/*
@@ -417,6 +700,7 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
417 local_irq_save(flags); 700 local_irq_save(flags);
418 *rcu_preempt_ctrlblk.nexttail = head; 701 *rcu_preempt_ctrlblk.nexttail = head;
419 rcu_preempt_ctrlblk.nexttail = &head->next; 702 rcu_preempt_ctrlblk.nexttail = &head->next;
703 RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
420 rcu_preempt_start_gp(); /* checks to see if GP needed. */ 704 rcu_preempt_start_gp(); /* checks to see if GP needed. */
421 local_irq_restore(flags); 705 local_irq_restore(flags);
422} 706}
@@ -532,6 +816,7 @@ void synchronize_rcu_expedited(void)
532 816
533 /* Wait for tail of ->blkd_tasks list to drain. */ 817 /* Wait for tail of ->blkd_tasks list to drain. */
534 if (rcu_preempted_readers_exp()) 818 if (rcu_preempted_readers_exp())
819 rcu_initiate_expedited_boost();
535 wait_event(sync_rcu_preempt_exp_wq, 820 wait_event(sync_rcu_preempt_exp_wq,
536 !rcu_preempted_readers_exp()); 821 !rcu_preempted_readers_exp());
537 822
@@ -572,6 +857,27 @@ void exit_rcu(void)
572 857
573#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 858#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
574 859
860#ifdef CONFIG_RCU_TRACE
861
862/*
863 * Because preemptible RCU does not exist, it is not necessary to
864 * dump out its statistics.
865 */
866static void show_tiny_preempt_stats(struct seq_file *m)
867{
868}
869
870#endif /* #ifdef CONFIG_RCU_TRACE */
871
872/*
873 * Because preemptible RCU does not exist, it is never necessary to
874 * boost preempted RCU readers.
875 */
876static int rcu_boost(void)
877{
878 return 0;
879}
880
575/* 881/*
576 * Because preemptible RCU does not exist, it never has any callbacks 882 * Because preemptible RCU does not exist, it never has any callbacks
577 * to check. 883 * to check.
@@ -599,17 +905,116 @@ static void rcu_preempt_process_callbacks(void)
599#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ 905#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
600 906
601#ifdef CONFIG_DEBUG_LOCK_ALLOC 907#ifdef CONFIG_DEBUG_LOCK_ALLOC
602
603#include <linux/kernel_stat.h> 908#include <linux/kernel_stat.h>
604 909
605/* 910/*
606 * During boot, we forgive RCU lockdep issues. After this function is 911 * During boot, we forgive RCU lockdep issues. After this function is
607 * invoked, we start taking RCU lockdep issues seriously. 912 * invoked, we start taking RCU lockdep issues seriously.
608 */ 913 */
609void rcu_scheduler_starting(void) 914void __init rcu_scheduler_starting(void)
610{ 915{
611 WARN_ON(nr_context_switches() > 0); 916 WARN_ON(nr_context_switches() > 0);
612 rcu_scheduler_active = 1; 917 rcu_scheduler_active = 1;
613} 918}
614 919
615#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 920#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
921
922#ifdef CONFIG_RCU_BOOST
923#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
924#else /* #ifdef CONFIG_RCU_BOOST */
925#define RCU_BOOST_PRIO 1
926#endif /* #else #ifdef CONFIG_RCU_BOOST */
927
928#ifdef CONFIG_RCU_TRACE
929
930#ifdef CONFIG_RCU_BOOST
931
932static void rcu_initiate_boost_trace(void)
933{
934 if (rcu_preempt_ctrlblk.gp_tasks == NULL)
935 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++;
936 else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
937 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++;
938 else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
939 rcu_preempt_ctrlblk.n_normal_balk_boosted++;
940 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
941 rcu_preempt_ctrlblk.n_normal_balk_notyet++;
942 else
943 rcu_preempt_ctrlblk.n_normal_balk_nos++;
944}
945
946static void rcu_initiate_exp_boost_trace(void)
947{
948 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
949 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
950 else
951 rcu_preempt_ctrlblk.n_exp_balk_nos++;
952}
953
954#endif /* #ifdef CONFIG_RCU_BOOST */
955
956static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
957{
958 unsigned long flags;
959
960 raw_local_irq_save(flags);
961 rcp->qlen -= n;
962 raw_local_irq_restore(flags);
963}
964
965/*
966 * Dump statistics for TINY_RCU, such as they are.
967 */
968static int show_tiny_stats(struct seq_file *m, void *unused)
969{
970 show_tiny_preempt_stats(m);
971 seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
972 seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
973 return 0;
974}
975
976static int show_tiny_stats_open(struct inode *inode, struct file *file)
977{
978 return single_open(file, show_tiny_stats, NULL);
979}
980
981static const struct file_operations show_tiny_stats_fops = {
982 .owner = THIS_MODULE,
983 .open = show_tiny_stats_open,
984 .read = seq_read,
985 .llseek = seq_lseek,
986 .release = single_release,
987};
988
989static struct dentry *rcudir;
990
991static int __init rcutiny_trace_init(void)
992{
993 struct dentry *retval;
994
995 rcudir = debugfs_create_dir("rcu", NULL);
996 if (!rcudir)
997 goto free_out;
998 retval = debugfs_create_file("rcudata", 0444, rcudir,
999 NULL, &show_tiny_stats_fops);
1000 if (!retval)
1001 goto free_out;
1002 return 0;
1003free_out:
1004 debugfs_remove_recursive(rcudir);
1005 return 1;
1006}
1007
1008static void __exit rcutiny_trace_cleanup(void)
1009{
1010 debugfs_remove_recursive(rcudir);
1011}
1012
1013module_init(rcutiny_trace_init);
1014module_exit(rcutiny_trace_cleanup);
1015
1016MODULE_AUTHOR("Paul E. McKenney");
1017MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
1018MODULE_LICENSE("GPL");
1019
1020#endif /* #ifdef CONFIG_RCU_TRACE */
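With this module loaded on a CONFIG_RCU_TRACE kernel, the counters surface in debugfs; a usage sketch, with output shape following the seq_printf() calls above and illustrative values:

/*
 * # mount -t debugfs none /sys/kernel/debug    (if not already mounted)
 * # cat /sys/kernel/debug/rcu/rcudata
 * rcu_sched: qlen: 0
 * rcu_bh: qlen: 0
 */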
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9d8e8fb2515f..89613f97ff26 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -47,6 +47,7 @@
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <asm/byteorder.h> 49#include <asm/byteorder.h>
50#include <linux/sched.h>
50 51
51MODULE_LICENSE("GPL"); 52MODULE_LICENSE("GPL");
52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
@@ -64,6 +65,9 @@ static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ 65static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */ 66static int fqs_holdoff = 0; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 67static int fqs_stutter = 3; /* Wait time between bursts (s). */
68static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
69static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
70static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
67static char *torture_type = "rcu"; /* What RCU implementation to torture. */ 71static char *torture_type = "rcu"; /* What RCU implementation to torture. */
68 72
69module_param(nreaders, int, 0444); 73module_param(nreaders, int, 0444);
@@ -88,6 +92,12 @@ module_param(fqs_holdoff, int, 0444);
88MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 92MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
89module_param(fqs_stutter, int, 0444); 93module_param(fqs_stutter, int, 0444);
90MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 94MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
95module_param(test_boost, int, 0444);
96MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
97module_param(test_boost_interval, int, 0444);
98MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
99module_param(test_boost_duration, int, 0444);
100MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
91module_param(torture_type, charp, 0444); 101module_param(torture_type, charp, 0444);
92MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 102MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
93 103
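Like the fqs_* knobs, the three new parameters are 0444 module parameters set at load time; for example, with illustrative values:

/*
 * # modprobe rcutorture test_boost=2 test_boost_interval=7 \
 *            test_boost_duration=4
 * test_boost=2 forces boost testing even if the flavor under test
 * does not advertise ->can_boost.
 */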
@@ -109,6 +119,7 @@ static struct task_struct *stats_task;
109static struct task_struct *shuffler_task; 119static struct task_struct *shuffler_task;
110static struct task_struct *stutter_task; 120static struct task_struct *stutter_task;
111static struct task_struct *fqs_task; 121static struct task_struct *fqs_task;
122static struct task_struct *boost_tasks[NR_CPUS];
112 123
113#define RCU_TORTURE_PIPE_LEN 10 124#define RCU_TORTURE_PIPE_LEN 10
114 125
@@ -134,6 +145,12 @@ static atomic_t n_rcu_torture_alloc_fail;
134static atomic_t n_rcu_torture_free; 145static atomic_t n_rcu_torture_free;
135static atomic_t n_rcu_torture_mberror; 146static atomic_t n_rcu_torture_mberror;
136static atomic_t n_rcu_torture_error; 147static atomic_t n_rcu_torture_error;
148static long n_rcu_torture_boost_ktrerror;
149static long n_rcu_torture_boost_rterror;
150static long n_rcu_torture_boost_allocerror;
151static long n_rcu_torture_boost_afferror;
152static long n_rcu_torture_boost_failure;
153static long n_rcu_torture_boosts;
137static long n_rcu_torture_timers; 154static long n_rcu_torture_timers;
138static struct list_head rcu_torture_removed; 155static struct list_head rcu_torture_removed;
139static cpumask_var_t shuffle_tmp_mask; 156static cpumask_var_t shuffle_tmp_mask;
@@ -147,6 +164,16 @@ static int stutter_pause_test;
147#endif 164#endif
148int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 165int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
149 166
167#ifdef CONFIG_RCU_BOOST
168#define rcu_can_boost() 1
169#else /* #ifdef CONFIG_RCU_BOOST */
170#define rcu_can_boost() 0
171#endif /* #else #ifdef CONFIG_RCU_BOOST */
172
173static unsigned long boost_starttime; /* jiffies of next boost test start. */
174DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
175 /* and boost task create/destroy. */
176
150/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ 177/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
151 178
152#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ 179#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
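rcu_can_boost() evaluates to a compile-time constant, and every boost-test path in this patch is gated on the same combined test; in sketch form:

/* Boost testing runs iff the user forced it (test_boost == 2) or
 * requested it and the flavor under test supports boosting: */
if ((test_boost == 1 && cur_ops->can_boost) || test_boost == 2) {
        /* ... register CPU notifier, start booster kthreads ... */
}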
@@ -277,6 +304,7 @@ struct rcu_torture_ops {
277 void (*fqs)(void); 304 void (*fqs)(void);
278 int (*stats)(char *page); 305 int (*stats)(char *page);
279 int irq_capable; 306 int irq_capable;
307 int can_boost;
280 char *name; 308 char *name;
281}; 309};
282 310
@@ -366,6 +394,7 @@ static struct rcu_torture_ops rcu_ops = {
366 .fqs = rcu_force_quiescent_state, 394 .fqs = rcu_force_quiescent_state,
367 .stats = NULL, 395 .stats = NULL,
368 .irq_capable = 1, 396 .irq_capable = 1,
397 .can_boost = rcu_can_boost(),
369 .name = "rcu" 398 .name = "rcu"
370}; 399};
371 400
@@ -408,6 +437,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
408 .fqs = rcu_force_quiescent_state, 437 .fqs = rcu_force_quiescent_state,
409 .stats = NULL, 438 .stats = NULL,
410 .irq_capable = 1, 439 .irq_capable = 1,
440 .can_boost = rcu_can_boost(),
411 .name = "rcu_sync" 441 .name = "rcu_sync"
412}; 442};
413 443
@@ -424,6 +454,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
424 .fqs = rcu_force_quiescent_state, 454 .fqs = rcu_force_quiescent_state,
425 .stats = NULL, 455 .stats = NULL,
426 .irq_capable = 1, 456 .irq_capable = 1,
457 .can_boost = rcu_can_boost(),
427 .name = "rcu_expedited" 458 .name = "rcu_expedited"
428}; 459};
429 460
@@ -684,6 +715,110 @@ static struct rcu_torture_ops sched_expedited_ops = {
684}; 715};
685 716
686/* 717/*
718 * RCU torture priority-boost testing. Runs one real-time thread per
719 * CPU for moderate bursts, repeatedly registering RCU callbacks and
720 * spinning waiting for them to be invoked. If a given callback takes
721 * too long to be invoked, we assume that priority inversion has occurred.
722 */
723
724struct rcu_boost_inflight {
725 struct rcu_head rcu;
726 int inflight;
727};
728
729static void rcu_torture_boost_cb(struct rcu_head *head)
730{
731 struct rcu_boost_inflight *rbip =
732 container_of(head, struct rcu_boost_inflight, rcu);
733
734 smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
735 rbip->inflight = 0;
736}
737
738static int rcu_torture_boost(void *arg)
739{
740 unsigned long call_rcu_time;
741 unsigned long endtime;
742 unsigned long oldstarttime;
743 struct rcu_boost_inflight rbi = { .inflight = 0 };
744 struct sched_param sp;
745
746 VERBOSE_PRINTK_STRING("rcu_torture_boost started");
747
748 /* Set real-time priority. */
749 sp.sched_priority = 1;
750 if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
751 VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!");
752 n_rcu_torture_boost_rterror++;
753 }
754
755 /* Each pass through the following loop does one boost-test cycle. */
756 do {
757 /* Wait for the next test interval. */
758 oldstarttime = boost_starttime;
759 while (jiffies - oldstarttime > ULONG_MAX / 2) {
760 schedule_timeout_uninterruptible(1);
761 rcu_stutter_wait("rcu_torture_boost");
762 if (kthread_should_stop() ||
763 fullstop != FULLSTOP_DONTSTOP)
764 goto checkwait;
765 }
766
767 /* Do one boost-test interval. */
768 endtime = oldstarttime + test_boost_duration * HZ;
769 call_rcu_time = jiffies;
770 while (jiffies - endtime > ULONG_MAX / 2) {
771 /* If we don't have a callback in flight, post one. */
772 if (!rbi.inflight) {
773 smp_mb(); /* RCU core before ->inflight = 1. */
774 rbi.inflight = 1;
775 call_rcu(&rbi.rcu, rcu_torture_boost_cb);
776 if (jiffies - call_rcu_time >
777 test_boost_duration * HZ - HZ / 2) {
778 VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed");
779 n_rcu_torture_boost_failure++;
780 }
781 call_rcu_time = jiffies;
782 }
783 cond_resched();
784 rcu_stutter_wait("rcu_torture_boost");
785 if (kthread_should_stop() ||
786 fullstop != FULLSTOP_DONTSTOP)
787 goto checkwait;
788 }
789
790 /*
791 * Set the start time of the next test interval.
792 * Yes, this is vulnerable to long delays, but such
793 * delays simply cause a false negative for the next
794 * interval. Besides, we are running at RT priority,
795 * so delays should be relatively rare.
796 */
797 while (oldstarttime == boost_starttime) {
798 if (mutex_trylock(&boost_mutex)) {
799 boost_starttime = jiffies +
800 test_boost_interval * HZ;
801 n_rcu_torture_boosts++;
802 mutex_unlock(&boost_mutex);
803 break;
804 }
805 schedule_timeout_uninterruptible(1);
806 }
807
808 /* Go do the stutter. */
809checkwait: rcu_stutter_wait("rcu_torture_boost");
810 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
811
812 /* Clean up and exit. */
813 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
814 rcutorture_shutdown_absorb("rcu_torture_boost");
815 while (!kthread_should_stop() || rbi.inflight)
816 schedule_timeout_uninterruptible(1);
817 smp_mb(); /* order accesses to ->inflight before stack-frame death. */
818 return 0;
819}
820
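The ->inflight flag is a one-deep completion protocol between the booster and its callback; the barrier pairing in isolation:

/*
 * Booster (above):                     Callback (rcu_torture_boost_cb()):
 *      smp_mb();       [A]                     smp_mb();       [B]
 *      rbi.inflight = 1;                       rbip->inflight = 0;
 *      call_rcu(&rbi.rcu, ...);
 *
 * [A] orders the booster's prior accesses before re-arming the callback;
 * [B] orders RCU-core accesses before the clearing store, so a booster
 * observing !rbi.inflight also observes the callback's completed work.
 */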
821/*
687 * RCU torture force-quiescent-state kthread. Repeatedly induces 822 * RCU torture force-quiescent-state kthread. Repeatedly induces
688 * bursts of calls to force_quiescent_state(), increasing the probability 823 * bursts of calls to force_quiescent_state(), increasing the probability
689 * of occurrence of some important types of race conditions. 824 * of occurrence of some important types of race conditions.
@@ -933,7 +1068,8 @@ rcu_torture_printk(char *page)
933 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1068 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
934 cnt += sprintf(&page[cnt], 1069 cnt += sprintf(&page[cnt],
935 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 1070 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
936 "rtmbe: %d nt: %ld", 1071 "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld "
1072 "rtbf: %ld rtb: %ld nt: %ld",
937 rcu_torture_current, 1073 rcu_torture_current,
938 rcu_torture_current_version, 1074 rcu_torture_current_version,
939 list_empty(&rcu_torture_freelist), 1075 list_empty(&rcu_torture_freelist),
@@ -941,8 +1077,19 @@ rcu_torture_printk(char *page)
941 atomic_read(&n_rcu_torture_alloc_fail), 1077 atomic_read(&n_rcu_torture_alloc_fail),
942 atomic_read(&n_rcu_torture_free), 1078 atomic_read(&n_rcu_torture_free),
943 atomic_read(&n_rcu_torture_mberror), 1079 atomic_read(&n_rcu_torture_mberror),
1080 n_rcu_torture_boost_ktrerror,
1081 n_rcu_torture_boost_rterror,
1082 n_rcu_torture_boost_allocerror,
1083 n_rcu_torture_boost_afferror,
1084 n_rcu_torture_boost_failure,
1085 n_rcu_torture_boosts,
944 n_rcu_torture_timers); 1086 n_rcu_torture_timers);
945 if (atomic_read(&n_rcu_torture_mberror) != 0) 1087 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1088 n_rcu_torture_boost_ktrerror != 0 ||
1089 n_rcu_torture_boost_rterror != 0 ||
1090 n_rcu_torture_boost_allocerror != 0 ||
1091 n_rcu_torture_boost_afferror != 0 ||
1092 n_rcu_torture_boost_failure != 0)
946 cnt += sprintf(&page[cnt], " !!!"); 1093 cnt += sprintf(&page[cnt], " !!!");
947 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1094 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
948 if (i > 1) { 1095 if (i > 1) {
@@ -1094,22 +1241,91 @@ rcu_torture_stutter(void *arg)
1094} 1241}
1095 1242
1096static inline void 1243static inline void
1097rcu_torture_print_module_parms(char *tag) 1244rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1098{ 1245{
1099 printk(KERN_ALERT "%s" TORTURE_FLAG 1246 printk(KERN_ALERT "%s" TORTURE_FLAG
1100 "--- %s: nreaders=%d nfakewriters=%d " 1247 "--- %s: nreaders=%d nfakewriters=%d "
1101 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1248 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1102 "shuffle_interval=%d stutter=%d irqreader=%d " 1249 "shuffle_interval=%d stutter=%d irqreader=%d "
1103 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", 1250 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1251 "test_boost=%d/%d test_boost_interval=%d "
1252 "test_boost_duration=%d\n",
1104 torture_type, tag, nrealreaders, nfakewriters, 1253 torture_type, tag, nrealreaders, nfakewriters,
1105 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1254 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1106 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); 1255 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1256 test_boost, cur_ops->can_boost,
1257 test_boost_interval, test_boost_duration);
1107} 1258}
1108 1259
1109static struct notifier_block rcutorture_nb = { 1260static struct notifier_block rcutorture_shutdown_nb = {
1110 .notifier_call = rcutorture_shutdown_notify, 1261 .notifier_call = rcutorture_shutdown_notify,
1111}; 1262};
1112 1263
1264static void rcutorture_booster_cleanup(int cpu)
1265{
1266 struct task_struct *t;
1267
1268 if (boost_tasks[cpu] == NULL)
1269 return;
1270 mutex_lock(&boost_mutex);
1271 VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
1272 t = boost_tasks[cpu];
1273 boost_tasks[cpu] = NULL;
1274 mutex_unlock(&boost_mutex);
1275
1276 /* This must be outside of the mutex, otherwise deadlock! */
1277 kthread_stop(t);
1278}
1279
1280static int rcutorture_booster_init(int cpu)
1281{
1282 int retval;
1283
1284 if (boost_tasks[cpu] != NULL)
1285 return 0; /* Already created, nothing more to do. */
1286
1287 /* Don't allow time recalculation while creating a new task. */
1288 mutex_lock(&boost_mutex);
1289 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
1290 boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
1291 "rcu_torture_boost");
1292 if (IS_ERR(boost_tasks[cpu])) {
1293 retval = PTR_ERR(boost_tasks[cpu]);
1294 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
1295 n_rcu_torture_boost_ktrerror++;
1296 boost_tasks[cpu] = NULL;
1297 mutex_unlock(&boost_mutex);
1298 return retval;
1299 }
1300 kthread_bind(boost_tasks[cpu], cpu);
1301 wake_up_process(boost_tasks[cpu]);
1302 mutex_unlock(&boost_mutex);
1303 return 0;
1304}
1305
1306static int rcutorture_cpu_notify(struct notifier_block *self,
1307 unsigned long action, void *hcpu)
1308{
1309 long cpu = (long)hcpu;
1310
1311 switch (action) {
1312 case CPU_ONLINE:
1313 case CPU_DOWN_FAILED:
1314 (void)rcutorture_booster_init(cpu);
1315 break;
1316 case CPU_DOWN_PREPARE:
1317 rcutorture_booster_cleanup(cpu);
1318 break;
1319 default:
1320 break;
1321 }
1322 return NOTIFY_OK;
1323}
1324
1325static struct notifier_block rcutorture_cpu_nb = {
1326 .notifier_call = rcutorture_cpu_notify,
1327};
1328
1113static void 1329static void
1114rcu_torture_cleanup(void) 1330rcu_torture_cleanup(void)
1115{ 1331{
@@ -1127,7 +1343,7 @@ rcu_torture_cleanup(void)
1127 } 1343 }
1128 fullstop = FULLSTOP_RMMOD; 1344 fullstop = FULLSTOP_RMMOD;
1129 mutex_unlock(&fullstop_mutex); 1345 mutex_unlock(&fullstop_mutex);
1130 unregister_reboot_notifier(&rcutorture_nb); 1346 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1131 if (stutter_task) { 1347 if (stutter_task) {
1132 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 1348 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
1133 kthread_stop(stutter_task); 1349 kthread_stop(stutter_task);
@@ -1184,6 +1400,12 @@ rcu_torture_cleanup(void)
1184 kthread_stop(fqs_task); 1400 kthread_stop(fqs_task);
1185 } 1401 }
1186 fqs_task = NULL; 1402 fqs_task = NULL;
1403 if ((test_boost == 1 && cur_ops->can_boost) ||
1404 test_boost == 2) {
1405 unregister_cpu_notifier(&rcutorture_cpu_nb);
1406 for_each_possible_cpu(i)
1407 rcutorture_booster_cleanup(i);
1408 }
1187 1409
1188 /* Wait for all RCU callbacks to fire. */ 1410 /* Wait for all RCU callbacks to fire. */
1189 1411
@@ -1195,9 +1417,9 @@ rcu_torture_cleanup(void)
1195 if (cur_ops->cleanup) 1417 if (cur_ops->cleanup)
1196 cur_ops->cleanup(); 1418 cur_ops->cleanup();
1197 if (atomic_read(&n_rcu_torture_error)) 1419 if (atomic_read(&n_rcu_torture_error))
1198 rcu_torture_print_module_parms("End of test: FAILURE"); 1420 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1199 else 1421 else
1200 rcu_torture_print_module_parms("End of test: SUCCESS"); 1422 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1201} 1423}
1202 1424
1203static int __init 1425static int __init
@@ -1242,7 +1464,7 @@ rcu_torture_init(void)
1242 nrealreaders = nreaders; 1464 nrealreaders = nreaders;
1243 else 1465 else
1244 nrealreaders = 2 * num_online_cpus(); 1466 nrealreaders = 2 * num_online_cpus();
1245 rcu_torture_print_module_parms("Start of test"); 1467 rcu_torture_print_module_parms(cur_ops, "Start of test");
1246 fullstop = FULLSTOP_DONTSTOP; 1468 fullstop = FULLSTOP_DONTSTOP;
1247 1469
1248 /* Set up the freelist. */ 1470 /* Set up the freelist. */
@@ -1263,6 +1485,12 @@ rcu_torture_init(void)
1263 atomic_set(&n_rcu_torture_free, 0); 1485 atomic_set(&n_rcu_torture_free, 0);
1264 atomic_set(&n_rcu_torture_mberror, 0); 1486 atomic_set(&n_rcu_torture_mberror, 0);
1265 atomic_set(&n_rcu_torture_error, 0); 1487 atomic_set(&n_rcu_torture_error, 0);
1488 n_rcu_torture_boost_ktrerror = 0;
1489 n_rcu_torture_boost_rterror = 0;
1490 n_rcu_torture_boost_allocerror = 0;
1491 n_rcu_torture_boost_afferror = 0;
1492 n_rcu_torture_boost_failure = 0;
1493 n_rcu_torture_boosts = 0;
1266 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1494 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1267 atomic_set(&rcu_torture_wcount[i], 0); 1495 atomic_set(&rcu_torture_wcount[i], 0);
1268 for_each_possible_cpu(cpu) { 1496 for_each_possible_cpu(cpu) {
@@ -1376,7 +1604,27 @@ rcu_torture_init(void)
1376 goto unwind; 1604 goto unwind;
1377 } 1605 }
1378 } 1606 }
1379 register_reboot_notifier(&rcutorture_nb); 1607 if (test_boost_interval < 1)
1608 test_boost_interval = 1;
1609 if (test_boost_duration < 2)
1610 test_boost_duration = 2;
1611 if ((test_boost == 1 && cur_ops->can_boost) ||
1612 test_boost == 2) {
1613 int retval;
1614
1615 boost_starttime = jiffies + test_boost_interval * HZ;
1616 register_cpu_notifier(&rcutorture_cpu_nb);
1617 for_each_possible_cpu(i) {
1618 if (cpu_is_offline(i))
1619 continue; /* Heuristic: CPU can go offline. */
1620 retval = rcutorture_booster_init(i);
1621 if (retval < 0) {
1622 firsterr = retval;
1623 goto unwind;
1624 }
1625 }
1626 }
1627 register_reboot_notifier(&rcutorture_shutdown_nb);
1380 mutex_unlock(&fullstop_mutex); 1628 mutex_unlock(&fullstop_mutex);
1381 return 0; 1629 return 0;
1382 1630
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ccdc04c47981..d0ddfea6579d 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -67,9 +67,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
67 .gpnum = -300, \ 67 .gpnum = -300, \
68 .completed = -300, \ 68 .completed = -300, \
69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ 69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
70 .orphan_cbs_list = NULL, \
71 .orphan_cbs_tail = &structname.orphan_cbs_list, \
72 .orphan_qlen = 0, \
73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ 70 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
74 .n_force_qs = 0, \ 71 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 72 .n_force_qs_ngp = 0, \
@@ -620,9 +617,17 @@ static void __init check_cpu_stall_init(void)
620static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) 617static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
621{ 618{
622 if (rdp->gpnum != rnp->gpnum) { 619 if (rdp->gpnum != rnp->gpnum) {
623 rdp->qs_pending = 1; 620 /*
624 rdp->passed_quiesc = 0; 621 * If the current grace period is waiting for this CPU,
622 * set up to detect a quiescent state, otherwise don't
623 * go looking for one.
624 */
625 rdp->gpnum = rnp->gpnum; 625 rdp->gpnum = rnp->gpnum;
626 if (rnp->qsmask & rdp->grpmask) {
627 rdp->qs_pending = 1;
628 rdp->passed_quiesc = 0;
629 } else
630 rdp->qs_pending = 0;
626 } 631 }
627} 632}
628 633
@@ -681,6 +686,24 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
681 686
682 /* Remember that we saw this grace-period completion. */ 687 /* Remember that we saw this grace-period completion. */
683 rdp->completed = rnp->completed; 688 rdp->completed = rnp->completed;
689
690 /*
691 * If we were in an extended quiescent state, we may have
692 * missed some grace periods that others CPUs handled on
693 * our behalf. Catch up with this state to avoid noting
694 * spurious new grace periods. If another grace period
695 * has started, then rnp->gpnum will have advanced, so
696 * we will detect this later on.
697 */
698 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed))
699 rdp->gpnum = rdp->completed;
700
701 /*
702 * If RCU does not need a quiescent state from this CPU,
703 * then make sure that this CPU doesn't go looking for one.
704 */
705 if ((rnp->qsmask & rdp->grpmask) == 0)
706 rdp->qs_pending = 0;
684 } 707 }
685} 708}
686 709
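ULONG_CMP_LT() above, like ULONG_CMP_GE() and the jiffies tests in rcutorture, compares free-running counters in a wraparound-safe way; the definitions are along these lines (the UINT_CMP_* variants used later are the unsigned-int analogues):

/* Modular comparison: ULONG_CMP_LT(a, b) is "a precedes b". */
#define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)      (ULONG_MAX / 2 < (a) - (b))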
@@ -984,53 +1007,31 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
984#ifdef CONFIG_HOTPLUG_CPU 1007#ifdef CONFIG_HOTPLUG_CPU
985 1008
986/* 1009/*
987 * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the 1010 * Move a dying CPU's RCU callbacks to an online CPU's callback list.
988 * specified flavor of RCU. The callbacks will be adopted by the next 1011 * Synchronization is not required because this function executes
989 * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever 1012 * in stop_machine() context.
990 * comes first. Because this is invoked from the CPU_DYING notifier,
991 * irqs are already disabled.
992 */ 1013 */
993static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 1014static void rcu_send_cbs_to_online(struct rcu_state *rsp)
994{ 1015{
995 int i; 1016 int i;
 1017 /* The dying CPU has already been cleared from cpu_online_mask. */
1018 int receive_cpu = cpumask_any(cpu_online_mask);
996 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1019 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1020 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
997 1021
998 if (rdp->nxtlist == NULL) 1022 if (rdp->nxtlist == NULL)
999 return; /* irqs disabled, so comparison is stable. */ 1023 return; /* irqs disabled, so comparison is stable. */
1000 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1024
1001 *rsp->orphan_cbs_tail = rdp->nxtlist; 1025 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
1002 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; 1026 receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1027 receive_rdp->qlen += rdp->qlen;
1028 receive_rdp->n_cbs_adopted += rdp->qlen;
1029 rdp->n_cbs_orphaned += rdp->qlen;
1030
1003 rdp->nxtlist = NULL; 1031 rdp->nxtlist = NULL;
1004 for (i = 0; i < RCU_NEXT_SIZE; i++) 1032 for (i = 0; i < RCU_NEXT_SIZE; i++)
1005 rdp->nxttail[i] = &rdp->nxtlist; 1033 rdp->nxttail[i] = &rdp->nxtlist;
1006 rsp->orphan_qlen += rdp->qlen;
1007 rdp->n_cbs_orphaned += rdp->qlen;
1008 rdp->qlen = 0; 1034 rdp->qlen = 0;
1009 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
1010}
1011
1012/*
1013 * Adopt previously orphaned RCU callbacks.
1014 */
1015static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1016{
1017 unsigned long flags;
1018 struct rcu_data *rdp;
1019
1020 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1021 rdp = this_cpu_ptr(rsp->rda);
1022 if (rsp->orphan_cbs_list == NULL) {
1023 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1024 return;
1025 }
1026 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
1027 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
1028 rdp->qlen += rsp->orphan_qlen;
1029 rdp->n_cbs_adopted += rsp->orphan_qlen;
1030 rsp->orphan_cbs_list = NULL;
1031 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
1032 rsp->orphan_qlen = 0;
1033 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1034} 1035}
1035 1036
1036/* 1037/*
@@ -1081,8 +1082,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1081 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1082 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1082 if (need_report & RCU_OFL_TASKS_EXP_GP) 1083 if (need_report & RCU_OFL_TASKS_EXP_GP)
1083 rcu_report_exp_rnp(rsp, rnp); 1084 rcu_report_exp_rnp(rsp, rnp);
1084
1085 rcu_adopt_orphan_cbs(rsp);
1086} 1085}
1087 1086
1088/* 1087/*
@@ -1100,11 +1099,7 @@ static void rcu_offline_cpu(int cpu)
1100 1099
1101#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1100#else /* #ifdef CONFIG_HOTPLUG_CPU */
1102 1101
1103static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 1102static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1104{
1105}
1106
1107static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1108{ 1103{
1109} 1104}
1110 1105
@@ -1440,22 +1435,11 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1440 */ 1435 */
1441 local_irq_save(flags); 1436 local_irq_save(flags);
1442 rdp = this_cpu_ptr(rsp->rda); 1437 rdp = this_cpu_ptr(rsp->rda);
1443 rcu_process_gp_end(rsp, rdp);
1444 check_for_new_grace_period(rsp, rdp);
1445 1438
1446 /* Add the callback to our list. */ 1439 /* Add the callback to our list. */
1447 *rdp->nxttail[RCU_NEXT_TAIL] = head; 1440 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1448 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1441 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1449 1442
1450 /* Start a new grace period if one not already started. */
1451 if (!rcu_gp_in_progress(rsp)) {
1452 unsigned long nestflag;
1453 struct rcu_node *rnp_root = rcu_get_root(rsp);
1454
1455 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1456 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1457 }
1458
1459 /* 1443 /*
1460 * Force the grace period if too many callbacks or too long waiting. 1444 * Force the grace period if too many callbacks or too long waiting.
1461 * Enforce hysteresis, and don't invoke force_quiescent_state() 1445 * Enforce hysteresis, and don't invoke force_quiescent_state()
@@ -1464,12 +1448,27 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1464 * is the only one waiting for a grace period to complete. 1448 * is the only one waiting for a grace period to complete.
1465 */ 1449 */
1466 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 1450 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1467 rdp->blimit = LONG_MAX; 1451
1468 if (rsp->n_force_qs == rdp->n_force_qs_snap && 1452 /* Are we ignoring a completed grace period? */
1469 *rdp->nxttail[RCU_DONE_TAIL] != head) 1453 rcu_process_gp_end(rsp, rdp);
1470 force_quiescent_state(rsp, 0); 1454 check_for_new_grace_period(rsp, rdp);
1471 rdp->n_force_qs_snap = rsp->n_force_qs; 1455
1472 rdp->qlen_last_fqs_check = rdp->qlen; 1456 /* Start a new grace period if one not already started. */
1457 if (!rcu_gp_in_progress(rsp)) {
1458 unsigned long nestflag;
1459 struct rcu_node *rnp_root = rcu_get_root(rsp);
1460
1461 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
 1462 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1463 } else {
1464 /* Give the grace period a kick. */
1465 rdp->blimit = LONG_MAX;
1466 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1467 *rdp->nxttail[RCU_DONE_TAIL] != head)
1468 force_quiescent_state(rsp, 0);
1469 rdp->n_force_qs_snap = rsp->n_force_qs;
1470 rdp->qlen_last_fqs_check = rdp->qlen;
1471 }
1473 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) 1472 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1474 force_quiescent_state(rsp, 1); 1473 force_quiescent_state(rsp, 1);
1475 local_irq_restore(flags); 1474 local_irq_restore(flags);
@@ -1699,13 +1698,12 @@ static void _rcu_barrier(struct rcu_state *rsp,
1699 * decrement rcu_barrier_cpu_count -- otherwise the first CPU 1698 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
1700 * might complete its grace period before all of the other CPUs 1699 * might complete its grace period before all of the other CPUs
1701 * did their increment, causing this function to return too 1700 * did their increment, causing this function to return too
1702 * early. 1701 * early. Note that on_each_cpu() disables irqs, which prevents
1702 * any CPUs from coming online or going offline until each online
1703 * CPU has queued its RCU-barrier callback.
1703 */ 1704 */
1704 atomic_set(&rcu_barrier_cpu_count, 1); 1705 atomic_set(&rcu_barrier_cpu_count, 1);
1705 preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
1706 rcu_adopt_orphan_cbs(rsp);
1707 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); 1706 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
1708 preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
1709 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 1707 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
1710 complete(&rcu_barrier_completion); 1708 complete(&rcu_barrier_completion);
1711 wait_for_completion(&rcu_barrier_completion); 1709 wait_for_completion(&rcu_barrier_completion);
@@ -1831,18 +1829,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1831 case CPU_DYING: 1829 case CPU_DYING:
1832 case CPU_DYING_FROZEN: 1830 case CPU_DYING_FROZEN:
1833 /* 1831 /*
1834 * preempt_disable() in _rcu_barrier() prevents stop_machine(), 1832 * The whole machine is "stopped" except this CPU, so we can
1835 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" 1833 * touch any data without introducing corruption. We send the
1836 * returns, all online cpus have queued rcu_barrier_func(). 1834 * dying CPU's callbacks to an arbitrarily chosen online CPU.
1837 * The dying CPU clears its cpu_online_mask bit and
1838 * moves all of its RCU callbacks to ->orphan_cbs_list
1839 * in the context of stop_machine(), so subsequent calls
1840 * to _rcu_barrier() will adopt these callbacks and only
1841 * then queue rcu_barrier_func() on all remaining CPUs.
1842 */ 1835 */
1843 rcu_send_cbs_to_orphanage(&rcu_bh_state); 1836 rcu_send_cbs_to_online(&rcu_bh_state);
1844 rcu_send_cbs_to_orphanage(&rcu_sched_state); 1837 rcu_send_cbs_to_online(&rcu_sched_state);
1845 rcu_preempt_send_cbs_to_orphanage(); 1838 rcu_preempt_send_cbs_to_online();
1846 break; 1839 break;
1847 case CPU_DEAD: 1840 case CPU_DEAD:
1848 case CPU_DEAD_FROZEN: 1841 case CPU_DEAD_FROZEN:
@@ -1880,8 +1873,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1880{ 1873{
1881 int i; 1874 int i;
1882 1875
1883 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) 1876 for (i = NUM_RCU_LVLS - 1; i > 0; i--)
1884 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 1877 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
1878 rsp->levelspread[0] = RCU_FANOUT_LEAF;
1885} 1879}
1886#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 1880#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
1887static void __init rcu_init_levelspread(struct rcu_state *rsp) 1881static void __init rcu_init_levelspread(struct rcu_state *rsp)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 91d4170c5c13..e8f057e44e3e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -31,46 +31,51 @@
31/* 31/*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. 32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
33 * In theory, it should be possible to add more levels straightforwardly. 33 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this has not been tested, so there is probably some 34 * In practice, this did work well going from three levels to four.
35 * bug somewhere. 35 * Of course, your mileage may vary.
36 */ 36 */
37#define MAX_RCU_LVLS 4 37#define MAX_RCU_LVLS 4
38#define RCU_FANOUT (CONFIG_RCU_FANOUT) 38#if CONFIG_RCU_FANOUT > 16
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) 39#define RCU_FANOUT_LEAF 16
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) 40#else /* #if CONFIG_RCU_FANOUT > 16 */
41#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT) 41#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
42 42#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
43#if NR_CPUS <= RCU_FANOUT 43#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
44#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
45#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
46#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
47
48#if NR_CPUS <= RCU_FANOUT_1
44# define NUM_RCU_LVLS 1 49# define NUM_RCU_LVLS 1
45# define NUM_RCU_LVL_0 1 50# define NUM_RCU_LVL_0 1
46# define NUM_RCU_LVL_1 (NR_CPUS) 51# define NUM_RCU_LVL_1 (NR_CPUS)
47# define NUM_RCU_LVL_2 0 52# define NUM_RCU_LVL_2 0
48# define NUM_RCU_LVL_3 0 53# define NUM_RCU_LVL_3 0
49# define NUM_RCU_LVL_4 0 54# define NUM_RCU_LVL_4 0
50#elif NR_CPUS <= RCU_FANOUT_SQ 55#elif NR_CPUS <= RCU_FANOUT_2
51# define NUM_RCU_LVLS 2 56# define NUM_RCU_LVLS 2
52# define NUM_RCU_LVL_0 1 57# define NUM_RCU_LVL_0 1
53# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 58# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
54# define NUM_RCU_LVL_2 (NR_CPUS) 59# define NUM_RCU_LVL_2 (NR_CPUS)
55# define NUM_RCU_LVL_3 0 60# define NUM_RCU_LVL_3 0
56# define NUM_RCU_LVL_4 0 61# define NUM_RCU_LVL_4 0
57#elif NR_CPUS <= RCU_FANOUT_CUBE 62#elif NR_CPUS <= RCU_FANOUT_3
58# define NUM_RCU_LVLS 3 63# define NUM_RCU_LVLS 3
59# define NUM_RCU_LVL_0 1 64# define NUM_RCU_LVL_0 1
60# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 65# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
61# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 66# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
62# define NUM_RCU_LVL_3 NR_CPUS 67# define NUM_RCU_LVL_3 (NR_CPUS)
63# define NUM_RCU_LVL_4 0 68# define NUM_RCU_LVL_4 0
64#elif NR_CPUS <= RCU_FANOUT_FOURTH 69#elif NR_CPUS <= RCU_FANOUT_4
65# define NUM_RCU_LVLS 4 70# define NUM_RCU_LVLS 4
66# define NUM_RCU_LVL_0 1 71# define NUM_RCU_LVL_0 1
67# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE) 72# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
68# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 73# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
69# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 74# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
70# define NUM_RCU_LVL_4 NR_CPUS 75# define NUM_RCU_LVL_4 (NR_CPUS)
71#else 76#else
72# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 77# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
73#endif /* #if (NR_CPUS) <= RCU_FANOUT */ 78#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
74 79
75#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) 80#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
76#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 81#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
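A worked example of the new geometry, assuming CONFIG_RCU_FANOUT=16 and NR_CPUS=64, so that RCU_FANOUT_LEAF=16, RCU_FANOUT_1=16, RCU_FANOUT_2=256, and the two-level branch is taken:

/*
 *      NUM_RCU_LVLS  = 2
 *      NUM_RCU_LVL_0 = 1                          root rcu_node
 *      NUM_RCU_LVL_1 = DIV_ROUND_UP(64, 16) = 4   leaf rcu_nodes
 *      NUM_RCU_LVL_2 = 64                         per-CPU rcu_data slots
 *      RCU_SUM       = 1 + 4 + 64 = 69
 *      NUM_RCU_NODES = 69 - 64 = 5                one root plus four leaves
 */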
@@ -203,8 +208,8 @@ struct rcu_data {
203 long qlen_last_fqs_check; 208 long qlen_last_fqs_check;
204 /* qlen at last check for QS forcing */ 209 /* qlen at last check for QS forcing */
205 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 210 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
206 unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */ 211 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
207 unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */ 212 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
208 unsigned long n_force_qs_snap; 213 unsigned long n_force_qs_snap;
209 /* did other CPU force QS recently? */ 214 /* did other CPU force QS recently? */
210 long blimit; /* Upper limit on a processed batch */ 215 long blimit; /* Upper limit on a processed batch */
@@ -309,15 +314,7 @@ struct rcu_state {
309 /* End of fields guarded by root rcu_node's lock. */ 314 /* End of fields guarded by root rcu_node's lock. */
310 315
311 raw_spinlock_t onofflock; /* exclude on/offline and */ 316 raw_spinlock_t onofflock; /* exclude on/offline and */
312 /* starting new GP. Also */ 317 /* starting new GP. */
313 /* protects the following */
314 /* orphan_cbs fields. */
315 struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */
316 /* orphaned by all CPUs in */
317 /* a given leaf rcu_node */
318 /* going offline. */
319 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
320 long orphan_qlen; /* Number of orphaned cbs. */
321 raw_spinlock_t fqslock; /* Only one task forcing */ 318 raw_spinlock_t fqslock; /* Only one task forcing */
322 /* quiescent states. */ 319 /* quiescent states. */
323 unsigned long jiffies_force_qs; /* Time at which to invoke */ 320 unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -390,7 +387,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
390static int rcu_preempt_pending(int cpu); 387static int rcu_preempt_pending(int cpu);
391static int rcu_preempt_needs_cpu(int cpu); 388static int rcu_preempt_needs_cpu(int cpu);
392static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 389static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
393static void rcu_preempt_send_cbs_to_orphanage(void); 390static void rcu_preempt_send_cbs_to_online(void);
394static void __init __rcu_init_preempt(void); 391static void __init __rcu_init_preempt(void);
395static void rcu_needs_cpu_flush(void); 392static void rcu_needs_cpu_flush(void);
396 393
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 71a4147473f9..a3638710dc67 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@
25 */ 25 */
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/stop_machine.h>
28 29
29/* 30/*
30 * Check the RCU kernel configuration parameters and print informative 31 * Check the RCU kernel configuration parameters and print informative
@@ -773,11 +774,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
773} 774}
774 775
775/* 776/*
776 * Move preemptable RCU's callbacks to ->orphan_cbs_list. 777 * Move preemptable RCU's callbacks from the dying CPU to an online CPU.
777 */ 778 */
778static void rcu_preempt_send_cbs_to_orphanage(void) 779static void rcu_preempt_send_cbs_to_online(void)
779{ 780{
780 rcu_send_cbs_to_orphanage(&rcu_preempt_state); 781 rcu_send_cbs_to_online(&rcu_preempt_state);
781} 782}
782 783
783/* 784/*
@@ -1001,7 +1002,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1001/* 1002/*
1002 * Because there is no preemptable RCU, there are no callbacks to move. 1003 * Because there is no preemptable RCU, there are no callbacks to move.
1003 */ 1004 */
1004static void rcu_preempt_send_cbs_to_orphanage(void) 1005static void rcu_preempt_send_cbs_to_online(void)
1005{ 1006{
1006} 1007}
1007 1008
@@ -1014,6 +1015,132 @@ static void __init __rcu_init_preempt(void)
1014 1015
1015#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1016#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1016 1017
1018#ifndef CONFIG_SMP
1019
1020void synchronize_sched_expedited(void)
1021{
1022 cond_resched();
1023}
1024EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1025
1026#else /* #ifndef CONFIG_SMP */
1027
1028static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
1029static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1030
1031static int synchronize_sched_expedited_cpu_stop(void *data)
1032{
1033 /*
1034 * There must be a full memory barrier on each affected CPU
1035 * between the time that try_stop_cpus() is called and the
1036 * time that it returns.
1037 *
1038 * In the current initial implementation of cpu_stop, the
 1039 * above condition is already met by the time control reaches
1040 * this point and the following smp_mb() is not strictly
1041 * necessary. Do smp_mb() anyway for documentation and
1042 * robustness against future implementation changes.
1043 */
1044 smp_mb(); /* See above comment block. */
1045 return 0;
1046}
1047
1048/*
1049 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
1050 * approach to force grace period to end quickly. This consumes
1051 * significant time on all CPUs, and is thus not recommended for
1052 * any sort of common-case code.
1053 *
1054 * Note that it is illegal to call this function while holding any
1055 * lock that is acquired by a CPU-hotplug notifier. Failing to
1056 * observe this restriction will result in deadlock.
1057 *
1058 * This implementation can be thought of as an application of ticket
1059 * locking to RCU, with sync_sched_expedited_started and
1060 * sync_sched_expedited_done taking on the roles of the halves
1061 * of the ticket-lock word. Each task atomically increments
1062 * sync_sched_expedited_started upon entry, snapshotting the old value,
1063 * then attempts to stop all the CPUs. If this succeeds, then each
1064 * CPU will have executed a context switch, resulting in an RCU-sched
1065 * grace period. We are then done, so we use atomic_cmpxchg() to
1066 * update sync_sched_expedited_done to match our snapshot -- but
1067 * only if someone else has not already advanced past our snapshot.
1068 *
1069 * On the other hand, if try_stop_cpus() fails, we check the value
1070 * of sync_sched_expedited_done. If it has advanced past our
1071 * initial snapshot, then someone else must have forced a grace period
1072 * some time after we took our snapshot. In this case, our work is
1073 * done for us, and we can simply return. Otherwise, we try again,
1074 * but keep our initial snapshot for purposes of checking for someone
1075 * doing our work for us.
1076 *
1077 * If we fail too many times in a row, we fall back to synchronize_sched().
1078 */
1079void synchronize_sched_expedited(void)
1080{
1081 int firstsnap, s, snap, trycount = 0;
1082
1083 /* Note that atomic_inc_return() implies full memory barrier. */
1084 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
1085 get_online_cpus();
1086
1087 /*
1088 * Each pass through the following loop attempts to force a
1089 * context switch on each CPU.
1090 */
1091 while (try_stop_cpus(cpu_online_mask,
1092 synchronize_sched_expedited_cpu_stop,
1093 NULL) == -EAGAIN) {
1094 put_online_cpus();
1095
1096 /* No joy, try again later. Or just synchronize_sched(). */
1097 if (trycount++ < 10)
1098 udelay(trycount * num_online_cpus());
1099 else {
1100 synchronize_sched();
1101 return;
1102 }
1103
1104 /* Check to see if someone else did our work for us. */
1105 s = atomic_read(&sync_sched_expedited_done);
1106 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
1107 smp_mb(); /* ensure test happens before caller kfree */
1108 return;
1109 }
1110
1111 /*
1112 * Refetching sync_sched_expedited_started allows later
1113 * callers to piggyback on our grace period. We subtract
1114 * 1 to get the same token that the last incrementer got.
1115 * We retry after they started, so our grace period works
1116 * for them, and they started after our first try, so their
1117 * grace period works for us.
1118 */
1119 get_online_cpus();
1120 snap = atomic_read(&sync_sched_expedited_started) - 1;
1121 smp_mb(); /* ensure read is before try_stop_cpus(). */
1122 }
1123
1124 /*
1125 * Everyone up to our most recent fetch is covered by our grace
1126 * period. Update the counter, but only if our work is still
1127 * relevant -- which it won't be if someone who started later
1128 * than we did beat us to the punch.
1129 */
1130 do {
1131 s = atomic_read(&sync_sched_expedited_done);
1132 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
1133 smp_mb(); /* ensure test happens before caller kfree */
1134 break;
1135 }
1136 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
1137
1138 put_online_cpus();
1139}
1140EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1141
1142#endif /* #else #ifndef CONFIG_SMP */
1143
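From the caller's side the expedited primitive drops into the usual update pattern; a minimal sketch, where global_ptr, new_item, and struct item are hypothetical:

/* Readers traverse global_ptr under rcu_read_lock_sched() or with
 * preemption otherwise disabled. */
struct item *old = global_ptr;

rcu_assign_pointer(global_ptr, new_item);
synchronize_sched_expedited();  /* all pre-existing readers are done */
kfree(old);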
1017#if !defined(CONFIG_RCU_FAST_NO_HZ) 1144#if !defined(CONFIG_RCU_FAST_NO_HZ)
1018 1145
1019/* 1146/*
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d15430b9d122..c8e97853b970 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -166,13 +166,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
166 166
167 gpnum = rsp->gpnum; 167 gpnum = rsp->gpnum;
168 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 168 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
169 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", 169 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
170 rsp->completed, gpnum, rsp->signaled, 170 rsp->completed, gpnum, rsp->signaled,
171 (long)(rsp->jiffies_force_qs - jiffies), 171 (long)(rsp->jiffies_force_qs - jiffies),
172 (int)(jiffies & 0xffff), 172 (int)(jiffies & 0xffff),
173 rsp->n_force_qs, rsp->n_force_qs_ngp, 173 rsp->n_force_qs, rsp->n_force_qs_ngp,
174 rsp->n_force_qs - rsp->n_force_qs_ngp, 174 rsp->n_force_qs - rsp->n_force_qs_ngp,
175 rsp->n_force_qs_lh, rsp->orphan_qlen); 175 rsp->n_force_qs_lh);
176 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 176 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
177 if (rnp->level != level) { 177 if (rnp->level != level) {
178 seq_puts(m, "\n"); 178 seq_puts(m, "\n");
@@ -300,7 +300,7 @@ static const struct file_operations rcu_pending_fops = {
300 300
301static struct dentry *rcudir; 301static struct dentry *rcudir;
302 302
303static int __init rcuclassic_trace_init(void) 303static int __init rcutree_trace_init(void)
304{ 304{
305 struct dentry *retval; 305 struct dentry *retval;
306 306
@@ -337,14 +337,14 @@ free_out:
337 return 1; 337 return 1;
338} 338}
339 339
340static void __exit rcuclassic_trace_cleanup(void) 340static void __exit rcutree_trace_cleanup(void)
341{ 341{
342 debugfs_remove_recursive(rcudir); 342 debugfs_remove_recursive(rcudir);
343} 343}
344 344
345 345
346module_init(rcuclassic_trace_init); 346module_init(rcutree_trace_init);
347module_exit(rcuclassic_trace_cleanup); 347module_exit(rcutree_trace_cleanup);
348 348
349MODULE_AUTHOR("Paul E. McKenney"); 349MODULE_AUTHOR("Paul E. McKenney");
350MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); 350MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
diff --git a/kernel/sched.c b/kernel/sched.c
index c68cead94dd7..04949089e760 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,9 +75,11 @@
75 75
76#include <asm/tlb.h> 76#include <asm/tlb.h>
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78#include <asm/mutex.h>
78 79
79#include "sched_cpupri.h" 80#include "sched_cpupri.h"
80#include "workqueue_sched.h" 81#include "workqueue_sched.h"
82#include "sched_autogroup.h"
81 83
82#define CREATE_TRACE_POINTS 84#define CREATE_TRACE_POINTS
83#include <trace/events/sched.h> 85#include <trace/events/sched.h>
@@ -253,6 +255,8 @@ struct task_group {
253 /* runqueue "owned" by this group on each cpu */ 255 /* runqueue "owned" by this group on each cpu */
254 struct cfs_rq **cfs_rq; 256 struct cfs_rq **cfs_rq;
255 unsigned long shares; 257 unsigned long shares;
258
259 atomic_t load_weight;
256#endif 260#endif
257 261
258#ifdef CONFIG_RT_GROUP_SCHED 262#ifdef CONFIG_RT_GROUP_SCHED
@@ -268,24 +272,19 @@ struct task_group {
268 struct task_group *parent; 272 struct task_group *parent;
269 struct list_head siblings; 273 struct list_head siblings;
270 struct list_head children; 274 struct list_head children;
275
276#ifdef CONFIG_SCHED_AUTOGROUP
277 struct autogroup *autogroup;
278#endif
271}; 279};
272 280
273#define root_task_group init_task_group 281#define root_task_group init_task_group
274 282
275/* task_group_lock serializes add/remove of task groups and also changes to 283/* task_group_lock serializes the addition/removal of task groups */
276 * a task group's cpu shares.
277 */
278static DEFINE_SPINLOCK(task_group_lock); 284static DEFINE_SPINLOCK(task_group_lock);
279 285
280#ifdef CONFIG_FAIR_GROUP_SCHED 286#ifdef CONFIG_FAIR_GROUP_SCHED
281 287
282#ifdef CONFIG_SMP
283static int root_task_group_empty(void)
284{
285 return list_empty(&root_task_group.children);
286}
287#endif
288
289# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 288# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
290 289
291/* 290/*
@@ -342,6 +341,7 @@ struct cfs_rq {
342 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 341 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
343 * list is used during load balance. 342 * list is used during load balance.
344 */ 343 */
344 int on_list;
345 struct list_head leaf_cfs_rq_list; 345 struct list_head leaf_cfs_rq_list;
346 struct task_group *tg; /* group that "owns" this runqueue */ 346 struct task_group *tg; /* group that "owns" this runqueue */
347 347
@@ -360,14 +360,17 @@ struct cfs_rq {
360 unsigned long h_load; 360 unsigned long h_load;
361 361
362 /* 362 /*
363 * this cpu's part of tg->shares 363 * Maintaining per-cpu shares distribution for group scheduling
364 *
365 * load_stamp is the last time we updated the load average
366 * load_last is the last time we updated the load average and saw load
367 * load_unacc_exec_time is currently unaccounted execution time
364 */ 368 */
365 unsigned long shares; 369 u64 load_avg;
370 u64 load_period;
371 u64 load_stamp, load_last, load_unacc_exec_time;
366 372
367 /* 373 unsigned long load_contribution;
368 * load.weight at the time we set shares
369 */
370 unsigned long rq_weight;
371#endif 374#endif
372#endif 375#endif
373}; 376};
@@ -605,11 +608,14 @@ static inline int cpu_of(struct rq *rq)
605 */ 608 */
606static inline struct task_group *task_group(struct task_struct *p) 609static inline struct task_group *task_group(struct task_struct *p)
607{ 610{
611 struct task_group *tg;
608 struct cgroup_subsys_state *css; 612 struct cgroup_subsys_state *css;
609 613
610 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 614 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
611 lockdep_is_held(&task_rq(p)->lock)); 615 lockdep_is_held(&task_rq(p)->lock));
612 return container_of(css, struct task_group, css); 616 tg = container_of(css, struct task_group, css);
617
618 return autogroup_task_group(p, tg);
613} 619}
614 620
615/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 621/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -793,20 +799,6 @@ late_initcall(sched_init_debug);
793const_debug unsigned int sysctl_sched_nr_migrate = 32; 799const_debug unsigned int sysctl_sched_nr_migrate = 32;
794 800
795/* 801/*
796 * ratelimit for updating the group shares.
797 * default: 0.25ms
798 */
799unsigned int sysctl_sched_shares_ratelimit = 250000;
800unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
801
802/*
803 * Inject some fuzzyness into changing the per-cpu group shares
804 * this avoids remote rq-locks at the expense of fairness.
805 * default: 4
806 */
807unsigned int sysctl_sched_shares_thresh = 4;
808
809/*
810 * period over which we average the RT time consumption, measured 802 * period over which we average the RT time consumption, measured
811 * in ms. 803 * in ms.
812 * 804 *
@@ -1355,6 +1347,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1355 lw->inv_weight = 0; 1347 lw->inv_weight = 0;
1356} 1348}
1357 1349
1350static inline void update_load_set(struct load_weight *lw, unsigned long w)
1351{
1352 lw->weight = w;
1353 lw->inv_weight = 0;
1354}
1355
1358/* 1356/*
1359 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1357 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1360 * of tasks with abnormal "nice" values across CPUs the contribution that 1358 * of tasks with abnormal "nice" values across CPUs the contribution that
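update_load_set() zeroes ->inv_weight because that field caches a fixed-point inverse of ->weight for cheap weighted division; the consumer recomputes it lazily, roughly as follows (simplified from calc_delta_mine() of this era):

if (unlikely(!lw->inv_weight))
        lw->inv_weight = WMULT_CONST / lw->weight;      /* ~ 2^32 / weight */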
@@ -1543,101 +1541,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1543 1541
1544#ifdef CONFIG_FAIR_GROUP_SCHED 1542#ifdef CONFIG_FAIR_GROUP_SCHED
1545 1543
1546static __read_mostly unsigned long __percpu *update_shares_data;
1547
1548static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1549
1550/*
1551 * Calculate and set the cpu's group shares.
1552 */
1553static void update_group_shares_cpu(struct task_group *tg, int cpu,
1554 unsigned long sd_shares,
1555 unsigned long sd_rq_weight,
1556 unsigned long *usd_rq_weight)
1557{
1558 unsigned long shares, rq_weight;
1559 int boost = 0;
1560
1561 rq_weight = usd_rq_weight[cpu];
1562 if (!rq_weight) {
1563 boost = 1;
1564 rq_weight = NICE_0_LOAD;
1565 }
1566
1567 /*
1568 * \Sum_j shares_j * rq_weight_i
1569 * shares_i = -----------------------------
1570 * \Sum_j rq_weight_j
1571 */
1572 shares = (sd_shares * rq_weight) / sd_rq_weight;
1573 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1574
1575 if (abs(shares - tg->se[cpu]->load.weight) >
1576 sysctl_sched_shares_thresh) {
1577 struct rq *rq = cpu_rq(cpu);
1578 unsigned long flags;
1579
1580 raw_spin_lock_irqsave(&rq->lock, flags);
1581 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1582 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1583 __set_se_shares(tg->se[cpu], shares);
1584 raw_spin_unlock_irqrestore(&rq->lock, flags);
1585 }
1586}
1587
1588/*
1589 * Re-compute each task group's per-cpu shares over the given domain.
1590 * This needs to be done in a bottom-up fashion because the rq weight of a
1591 * parent group depends on the shares of its child groups.
1592 */
1593static int tg_shares_up(struct task_group *tg, void *data)
1594{
1595 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1596 unsigned long *usd_rq_weight;
1597 struct sched_domain *sd = data;
1598 unsigned long flags;
1599 int i;
1600
1601 if (!tg->se[0])
1602 return 0;
1603
1604 local_irq_save(flags);
1605 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1606
1607 for_each_cpu(i, sched_domain_span(sd)) {
1608 weight = tg->cfs_rq[i]->load.weight;
1609 usd_rq_weight[i] = weight;
1610
1611 rq_weight += weight;
1612 /*
1613 * If there are currently no tasks on the cpu pretend there
1614 * is one of average load so that when a new task gets to
1615 * run here it will not get delayed by group starvation.
1616 */
1617 if (!weight)
1618 weight = NICE_0_LOAD;
1619
1620 sum_weight += weight;
1621 shares += tg->cfs_rq[i]->shares;
1622 }
1623
1624 if (!rq_weight)
1625 rq_weight = sum_weight;
1626
1627 if ((!shares && rq_weight) || shares > tg->shares)
1628 shares = tg->shares;
1629
1630 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1631 shares = tg->shares;
1632
1633 for_each_cpu(i, sched_domain_span(sd))
1634 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1635
1636 local_irq_restore(flags);
1637
1638 return 0;
1639}
1640
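
The functions deleted above implemented the rule in the removed comment block: shares_i = sd_shares * rq_weight_i / \Sum_j rq_weight_j, clamped to [MIN_SHARES, MAX_SHARES]. A standalone sketch of that arithmetic, with stand-in clamp bounds:

#include <stdio.h>

#define MIN_SHARES	2UL		/* stand-ins for the kernel limits */
#define MAX_SHARES	(1UL << 18)

/* old rule: this cpu's slice of the group weight over the domain */
static unsigned long group_shares_cpu(unsigned long sd_shares,
				      unsigned long rq_weight_i,
				      unsigned long sd_rq_weight)
{
	unsigned long shares = sd_shares * rq_weight_i / sd_rq_weight;

	if (shares < MIN_SHARES)
		shares = MIN_SHARES;
	if (shares > MAX_SHARES)
		shares = MAX_SHARES;

	return shares;
}

int main(void)
{
	/* a group with 1024 shares whose load splits 3:1 across two cpus */
	printf("cpu0=%lu cpu1=%lu\n",
	       group_shares_cpu(1024, 3072, 4096),
	       group_shares_cpu(1024, 1024, 4096));
	return 0;
}
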
1641/* 1544/*
1642 * Compute the cpu's hierarchical load factor for each task group. 1545 * Compute the cpu's hierarchical load factor for each task group.
1643 * This needs to be done in a top-down fashion because the load of a child 1546 * This needs to be done in a top-down fashion because the load of a child
@@ -1652,7 +1555,7 @@ static int tg_load_down(struct task_group *tg, void *data)
1652 load = cpu_rq(cpu)->load.weight; 1555 load = cpu_rq(cpu)->load.weight;
1653 } else { 1556 } else {
1654 load = tg->parent->cfs_rq[cpu]->h_load; 1557 load = tg->parent->cfs_rq[cpu]->h_load;
1655 load *= tg->cfs_rq[cpu]->shares; 1558 load *= tg->se[cpu]->load.weight;
1656 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1559 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1657 } 1560 }
1658 1561
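
tg_load_down() survives, but each step now scales the parent's hierarchical load by the group's entity weight (se->load.weight) rather than the removed per-cpu shares field. One step of that top-down computation, as a toy function:

#include <stdio.h>

/*
 * h_load of a child group = parent's h_load scaled by the child's
 * share of the parent runqueue; the +1 avoids dividing by zero.
 */
static unsigned long h_load(unsigned long parent_h_load,
			    unsigned long se_weight,
			    unsigned long parent_cfs_load)
{
	return parent_h_load * se_weight / (parent_cfs_load + 1);
}

int main(void)
{
	/* root rq load 2048, child entity weighted 512 on it ... */
	unsigned long l1 = h_load(2048, 512, 2048);
	/* ... and a grandchild weighted 256 on the child's 1024 load */
	unsigned long l2 = h_load(l1, 256, 1024);

	printf("l1=%lu l2=%lu\n", l1, l2);
	return 0;
}
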
@@ -1661,34 +1564,11 @@ static int tg_load_down(struct task_group *tg, void *data)
1661 return 0; 1564 return 0;
1662} 1565}
1663 1566
1664static void update_shares(struct sched_domain *sd)
1665{
1666 s64 elapsed;
1667 u64 now;
1668
1669 if (root_task_group_empty())
1670 return;
1671
1672 now = local_clock();
1673 elapsed = now - sd->last_update;
1674
1675 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1676 sd->last_update = now;
1677 walk_tg_tree(tg_nop, tg_shares_up, sd);
1678 }
1679}
1680
1681static void update_h_load(long cpu) 1567static void update_h_load(long cpu)
1682{ 1568{
1683 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1569 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1684} 1570}
1685 1571
1686#else
1687
1688static inline void update_shares(struct sched_domain *sd)
1689{
1690}
1691
1692#endif 1572#endif
1693 1573
1694#ifdef CONFIG_PREEMPT 1574#ifdef CONFIG_PREEMPT
@@ -1810,15 +1690,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1810 1690
1811#endif 1691#endif
1812 1692
1813#ifdef CONFIG_FAIR_GROUP_SCHED
1814static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1815{
1816#ifdef CONFIG_SMP
1817 cfs_rq->shares = shares;
1818#endif
1819}
1820#endif
1821
1822static void calc_load_account_idle(struct rq *this_rq); 1693static void calc_load_account_idle(struct rq *this_rq);
1823static void update_sysctl(void); 1694static void update_sysctl(void);
1824static int get_update_sysctl_factor(void); 1695static int get_update_sysctl_factor(void);
@@ -2063,6 +1934,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
2063#include "sched_idletask.c" 1934#include "sched_idletask.c"
2064#include "sched_fair.c" 1935#include "sched_fair.c"
2065#include "sched_rt.c" 1936#include "sched_rt.c"
1937#include "sched_autogroup.c"
2066#include "sched_stoptask.c" 1938#include "sched_stoptask.c"
2067#ifdef CONFIG_SCHED_DEBUG 1939#ifdef CONFIG_SCHED_DEBUG
2068# include "sched_debug.c" 1940# include "sched_debug.c"
@@ -2255,10 +2127,8 @@ static int migration_cpu_stop(void *data);
2255 * The task's runqueue lock must be held. 2127 * The task's runqueue lock must be held.
2256 * Returns true if you have to wait for migration thread. 2128 * Returns true if you have to wait for migration thread.
2257 */ 2129 */
2258static bool migrate_task(struct task_struct *p, int dest_cpu) 2130static bool migrate_task(struct task_struct *p, struct rq *rq)
2259{ 2131{
2260 struct rq *rq = task_rq(p);
2261
2262 /* 2132 /*
2263 * If the task is not on a runqueue (and not running), then 2133 * If the task is not on a runqueue (and not running), then
2264 * the next wake-up will properly place the task. 2134 * the next wake-up will properly place the task.
@@ -2438,18 +2308,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2438 return dest_cpu; 2308 return dest_cpu;
2439 2309
2440 /* No more Mr. Nice Guy. */ 2310 /* No more Mr. Nice Guy. */
2441 if (unlikely(dest_cpu >= nr_cpu_ids)) { 2311 dest_cpu = cpuset_cpus_allowed_fallback(p);
2442 dest_cpu = cpuset_cpus_allowed_fallback(p); 2312 /*
2443 /* 2313 * Don't tell them about moving exiting tasks or
2444 * Don't tell them about moving exiting tasks or 2314 * kernel threads (both mm NULL), since they never
2445 * kernel threads (both mm NULL), since they never 2315 * leave kernel.
2446 * leave kernel. 2316 */
2447 */ 2317 if (p->mm && printk_ratelimit()) {
2448 if (p->mm && printk_ratelimit()) { 2318 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2449 printk(KERN_INFO "process %d (%s) no " 2319 task_pid_nr(p), p->comm, cpu);
2450 "longer affine to cpu%d\n",
2451 task_pid_nr(p), p->comm, cpu);
2452 }
2453 } 2320 }
2454 2321
2455 return dest_cpu; 2322 return dest_cpu;
@@ -2785,7 +2652,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
2785 /* Want to start with kernel preemption disabled. */ 2652 /* Want to start with kernel preemption disabled. */
2786 task_thread_info(p)->preempt_count = 1; 2653 task_thread_info(p)->preempt_count = 1;
2787#endif 2654#endif
2655#ifdef CONFIG_SMP
2788 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2656 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2657#endif
2789 2658
2790 put_cpu(); 2659 put_cpu();
2791} 2660}
@@ -3549,7 +3418,7 @@ void sched_exec(void)
3549 * select_task_rq() can race against ->cpus_allowed 3418 * select_task_rq() can race against ->cpus_allowed
3550 */ 3419 */
3551 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && 3420 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3552 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { 3421 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3553 struct migration_arg arg = { p, dest_cpu }; 3422 struct migration_arg arg = { p, dest_cpu };
3554 3423
3555 task_rq_unlock(rq, &flags); 3424 task_rq_unlock(rq, &flags);
@@ -4214,7 +4083,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4214 if (task_thread_info(rq->curr) != owner || need_resched()) 4083 if (task_thread_info(rq->curr) != owner || need_resched())
4215 return 0; 4084 return 0;
4216 4085
4217 cpu_relax(); 4086 arch_mutex_cpu_relax();
4218 } 4087 }
4219 4088
4220 return 1; 4089 return 1;
@@ -4526,7 +4395,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
4526 * This waits for either a completion of a specific task to be signaled or for a 4395 * This waits for either a completion of a specific task to be signaled or for a
4527 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4396 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4528 */ 4397 */
4529unsigned long __sched 4398long __sched
4530wait_for_completion_interruptible_timeout(struct completion *x, 4399wait_for_completion_interruptible_timeout(struct completion *x,
4531 unsigned long timeout) 4400 unsigned long timeout)
4532{ 4401{
@@ -4559,7 +4428,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
4559 * signaled or for a specified timeout to expire. It can be 4428 * signaled or for a specified timeout to expire. It can be
4560 * interrupted by a kill signal. The timeout is in jiffies. 4429 * interrupted by a kill signal. The timeout is in jiffies.
4561 */ 4430 */
4562unsigned long __sched 4431long __sched
4563wait_for_completion_killable_timeout(struct completion *x, 4432wait_for_completion_killable_timeout(struct completion *x,
4564 unsigned long timeout) 4433 unsigned long timeout)
4565{ 4434{
@@ -4901,7 +4770,7 @@ static bool check_same_owner(struct task_struct *p)
4901} 4770}
4902 4771
4903static int __sched_setscheduler(struct task_struct *p, int policy, 4772static int __sched_setscheduler(struct task_struct *p, int policy,
4904 struct sched_param *param, bool user) 4773 const struct sched_param *param, bool user)
4905{ 4774{
4906 int retval, oldprio, oldpolicy = -1, on_rq, running; 4775 int retval, oldprio, oldpolicy = -1, on_rq, running;
4907 unsigned long flags; 4776 unsigned long flags;
@@ -5056,7 +4925,7 @@ recheck:
5056 * NOTE that the task may be already dead. 4925 * NOTE that the task may be already dead.
5057 */ 4926 */
5058int sched_setscheduler(struct task_struct *p, int policy, 4927int sched_setscheduler(struct task_struct *p, int policy,
5059 struct sched_param *param) 4928 const struct sched_param *param)
5060{ 4929{
5061 return __sched_setscheduler(p, policy, param, true); 4930 return __sched_setscheduler(p, policy, param, true);
5062} 4931}
@@ -5074,7 +4943,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
5074 * but our caller might not have that capability. 4943 * but our caller might not have that capability.
5075 */ 4944 */
5076int sched_setscheduler_nocheck(struct task_struct *p, int policy, 4945int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5077 struct sched_param *param) 4946 const struct sched_param *param)
5078{ 4947{
5079 return __sched_setscheduler(p, policy, param, false); 4948 return __sched_setscheduler(p, policy, param, false);
5080} 4949}
@@ -5590,7 +5459,7 @@ void sched_show_task(struct task_struct *p)
5590 unsigned state; 5459 unsigned state;
5591 5460
5592 state = p->state ? __ffs(p->state) + 1 : 0; 5461 state = p->state ? __ffs(p->state) + 1 : 0;
5593 printk(KERN_INFO "%-13.13s %c", p->comm, 5462 printk(KERN_INFO "%-15.15s %c", p->comm,
5594 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5463 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5595#if BITS_PER_LONG == 32 5464#if BITS_PER_LONG == 32
5596 if (state == TASK_RUNNING) 5465 if (state == TASK_RUNNING)
@@ -5754,7 +5623,6 @@ static void update_sysctl(void)
5754 SET_SYSCTL(sched_min_granularity); 5623 SET_SYSCTL(sched_min_granularity);
5755 SET_SYSCTL(sched_latency); 5624 SET_SYSCTL(sched_latency);
5756 SET_SYSCTL(sched_wakeup_granularity); 5625 SET_SYSCTL(sched_wakeup_granularity);
5757 SET_SYSCTL(sched_shares_ratelimit);
5758#undef SET_SYSCTL 5626#undef SET_SYSCTL
5759} 5627}
5760 5628
@@ -5830,7 +5698,7 @@ again:
5830 goto out; 5698 goto out;
5831 5699
5832 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5700 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5833 if (migrate_task(p, dest_cpu)) { 5701 if (migrate_task(p, rq)) {
5834 struct migration_arg arg = { p, dest_cpu }; 5702 struct migration_arg arg = { p, dest_cpu };
5835 /* Need help from migration thread: drop lock and wait. */ 5703 /* Need help from migration thread: drop lock and wait. */
5836 task_rq_unlock(rq, &flags); 5704 task_rq_unlock(rq, &flags);
@@ -5912,29 +5780,20 @@ static int migration_cpu_stop(void *data)
5912} 5780}
5913 5781
5914#ifdef CONFIG_HOTPLUG_CPU 5782#ifdef CONFIG_HOTPLUG_CPU
5783
5915/* 5784/*
5916 * Figure out where task on dead CPU should go, use force if necessary. 5785 * Ensures that the idle task is using init_mm right before its cpu goes
5786 * offline.
5917 */ 5787 */
5918void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5788void idle_task_exit(void)
5919{ 5789{
5920 struct rq *rq = cpu_rq(dead_cpu); 5790 struct mm_struct *mm = current->active_mm;
5921 int needs_cpu, uninitialized_var(dest_cpu);
5922 unsigned long flags;
5923 5791
5924 local_irq_save(flags); 5792 BUG_ON(cpu_online(smp_processor_id()));
5925 5793
5926 raw_spin_lock(&rq->lock); 5794 if (mm != &init_mm)
5927 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); 5795 switch_mm(mm, &init_mm, current);
5928 if (needs_cpu) 5796 mmdrop(mm);
5929 dest_cpu = select_fallback_rq(dead_cpu, p);
5930 raw_spin_unlock(&rq->lock);
5931 /*
5932 * It can only fail if we race with set_cpus_allowed(),
5933 * in which case the racer should migrate the task anyway.
5934 */
5935 if (needs_cpu)
5936 __migrate_task(p, dead_cpu, dest_cpu);
5937 local_irq_restore(flags);
5938} 5797}
5939 5798
5940/* 5799/*
@@ -5947,128 +5806,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5947static void migrate_nr_uninterruptible(struct rq *rq_src) 5806static void migrate_nr_uninterruptible(struct rq *rq_src)
5948{ 5807{
5949 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 5808 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5950 unsigned long flags;
5951 5809
5952 local_irq_save(flags);
5953 double_rq_lock(rq_src, rq_dest);
5954 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5810 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5955 rq_src->nr_uninterruptible = 0; 5811 rq_src->nr_uninterruptible = 0;
5956 double_rq_unlock(rq_src, rq_dest);
5957 local_irq_restore(flags);
5958}
5959
5960/* Run through task list and migrate tasks from the dead cpu. */
5961static void migrate_live_tasks(int src_cpu)
5962{
5963 struct task_struct *p, *t;
5964
5965 read_lock(&tasklist_lock);
5966
5967 do_each_thread(t, p) {
5968 if (p == current)
5969 continue;
5970
5971 if (task_cpu(p) == src_cpu)
5972 move_task_off_dead_cpu(src_cpu, p);
5973 } while_each_thread(t, p);
5974
5975 read_unlock(&tasklist_lock);
5976} 5812}
5977 5813
5978/* 5814/*
5979 * Schedules idle task to be the next runnable task on current CPU. 5815 * remove the tasks which were accounted by rq from calc_load_tasks.
5980 * It does so by boosting its priority to highest possible.
5981 * Used by CPU offline code.
5982 */ 5816 */
5983void sched_idle_next(void) 5817static void calc_global_load_remove(struct rq *rq)
5984{ 5818{
5985 int this_cpu = smp_processor_id(); 5819 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5986 struct rq *rq = cpu_rq(this_cpu); 5820 rq->calc_load_active = 0;
5987 struct task_struct *p = rq->idle;
5988 unsigned long flags;
5989
5990 /* cpu has to be offline */
5991 BUG_ON(cpu_online(this_cpu));
5992
5993 /*
5994 * Strictly not necessary since rest of the CPUs are stopped by now
5995 * and interrupts disabled on the current cpu.
5996 */
5997 raw_spin_lock_irqsave(&rq->lock, flags);
5998
5999 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
6000
6001 activate_task(rq, p, 0);
6002
6003 raw_spin_unlock_irqrestore(&rq->lock, flags);
6004} 5821}
6005 5822
6006/* 5823/*
6007 * Ensures that the idle task is using init_mm right before its cpu goes 5824 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6008 * offline. 5825 * try_to_wake_up()->select_task_rq().
5826 *
5827 * Called with rq->lock held even though we're in stop_machine() and
5828 * there's no concurrency possible, we hold the required locks anyway
5829 * because of lock validation efforts.
6009 */ 5830 */
6010void idle_task_exit(void) 5831static void migrate_tasks(unsigned int dead_cpu)
6011{
6012 struct mm_struct *mm = current->active_mm;
6013
6014 BUG_ON(cpu_online(smp_processor_id()));
6015
6016 if (mm != &init_mm)
6017 switch_mm(mm, &init_mm, current);
6018 mmdrop(mm);
6019}
6020
6021/* called under rq->lock with disabled interrupts */
6022static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
6023{ 5832{
6024 struct rq *rq = cpu_rq(dead_cpu); 5833 struct rq *rq = cpu_rq(dead_cpu);
6025 5834 struct task_struct *next, *stop = rq->stop;
6026 /* Must be exiting, otherwise would be on tasklist. */ 5835 int dest_cpu;
6027 BUG_ON(!p->exit_state);
6028
6029 /* Cannot have done final schedule yet: would have vanished. */
6030 BUG_ON(p->state == TASK_DEAD);
6031
6032 get_task_struct(p);
6033 5836
6034 /* 5837 /*
6035 * Drop lock around migration; if someone else moves it, 5838 * Fudge the rq selection such that the below task selection loop
6036 * that's OK. No task can be added to this CPU, so iteration is 5839 * doesn't get stuck on the currently eligible stop task.
6037 * fine. 5840 *
5841 * We're currently inside stop_machine() and the rq is either stuck
5842 * in the stop_machine_cpu_stop() loop, or we're executing this code,
5843 * either way we should never end up calling schedule() until we're
5844 * done here.
6038 */ 5845 */
6039 raw_spin_unlock_irq(&rq->lock); 5846 rq->stop = NULL;
6040 move_task_off_dead_cpu(dead_cpu, p);
6041 raw_spin_lock_irq(&rq->lock);
6042
6043 put_task_struct(p);
6044}
6045
6046/* release_task() removes task from tasklist, so we won't find dead tasks. */
6047static void migrate_dead_tasks(unsigned int dead_cpu)
6048{
6049 struct rq *rq = cpu_rq(dead_cpu);
6050 struct task_struct *next;
6051 5847
6052 for ( ; ; ) { 5848 for ( ; ; ) {
6053 if (!rq->nr_running) 5849 /*
5850 * There's this thread running, bail when that's the only
5851 * remaining thread.
5852 */
5853 if (rq->nr_running == 1)
6054 break; 5854 break;
5855
6055 next = pick_next_task(rq); 5856 next = pick_next_task(rq);
6056 if (!next) 5857 BUG_ON(!next);
6057 break;
6058 next->sched_class->put_prev_task(rq, next); 5858 next->sched_class->put_prev_task(rq, next);
6059 migrate_dead(dead_cpu, next);
6060 5859
5860 /* Find suitable destination for @next, with force if needed. */
5861 dest_cpu = select_fallback_rq(dead_cpu, next);
5862 raw_spin_unlock(&rq->lock);
5863
5864 __migrate_task(next, dead_cpu, dest_cpu);
5865
5866 raw_spin_lock(&rq->lock);
6061 } 5867 }
6062}
6063 5868
6064/* 5869 rq->stop = stop;
6065 * remove the tasks which were accounted by rq from calc_load_tasks.
6066 */
6067static void calc_global_load_remove(struct rq *rq)
6068{
6069 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
6070 rq->calc_load_active = 0;
6071} 5870}
5871
6072#endif /* CONFIG_HOTPLUG_CPU */ 5872#endif /* CONFIG_HOTPLUG_CPU */
6073 5873
6074#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5874#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
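
migrate_tasks() collapses the old migrate_live_tasks()/migrate_dead_tasks() pair into one drain loop; the one subtlety is parking rq->stop so pick_next_task() cannot keep handing back the stop task. A toy userspace model of that control flow (every name below is a stand-in, not kernel API):

#include <stdio.h>

struct toy_rq {
	int nr_running;
	int stop_parked;
};

/* a real picker consults the sched classes; here tasks are just ids */
static int pick_next(struct toy_rq *rq)
{
	return rq->nr_running - 1;
}

static void drain_rq(struct toy_rq *rq, int dead_cpu)
{
	rq->stop_parked = 1;		/* rq->stop = NULL in the kernel */

	while (rq->nr_running > 1) {	/* 1 == this migration thread */
		int next = pick_next(rq);

		printf("cpu%d: migrating task %d away\n", dead_cpu, next);
		rq->nr_running--;	/* __migrate_task() in the kernel */
	}

	rq->stop_parked = 0;		/* rq->stop = stop */
}

int main(void)
{
	struct toy_rq rq = { .nr_running = 4 };

	drain_rq(&rq, 3);
	return 0;
}
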
@@ -6278,15 +6078,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6278 unsigned long flags; 6078 unsigned long flags;
6279 struct rq *rq = cpu_rq(cpu); 6079 struct rq *rq = cpu_rq(cpu);
6280 6080
6281 switch (action) { 6081 switch (action & ~CPU_TASKS_FROZEN) {
6282 6082
6283 case CPU_UP_PREPARE: 6083 case CPU_UP_PREPARE:
6284 case CPU_UP_PREPARE_FROZEN:
6285 rq->calc_load_update = calc_load_update; 6084 rq->calc_load_update = calc_load_update;
6286 break; 6085 break;
6287 6086
6288 case CPU_ONLINE: 6087 case CPU_ONLINE:
6289 case CPU_ONLINE_FROZEN:
6290 /* Update our root-domain */ 6088 /* Update our root-domain */
6291 raw_spin_lock_irqsave(&rq->lock, flags); 6089 raw_spin_lock_irqsave(&rq->lock, flags);
6292 if (rq->rd) { 6090 if (rq->rd) {
@@ -6298,30 +6096,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6298 break; 6096 break;
6299 6097
6300#ifdef CONFIG_HOTPLUG_CPU 6098#ifdef CONFIG_HOTPLUG_CPU
6301 case CPU_DEAD:
6302 case CPU_DEAD_FROZEN:
6303 migrate_live_tasks(cpu);
6304 /* Idle task back to normal (off runqueue, low prio) */
6305 raw_spin_lock_irq(&rq->lock);
6306 deactivate_task(rq, rq->idle, 0);
6307 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6308 rq->idle->sched_class = &idle_sched_class;
6309 migrate_dead_tasks(cpu);
6310 raw_spin_unlock_irq(&rq->lock);
6311 migrate_nr_uninterruptible(rq);
6312 BUG_ON(rq->nr_running != 0);
6313 calc_global_load_remove(rq);
6314 break;
6315
6316 case CPU_DYING: 6099 case CPU_DYING:
6317 case CPU_DYING_FROZEN:
6318 /* Update our root-domain */ 6100 /* Update our root-domain */
6319 raw_spin_lock_irqsave(&rq->lock, flags); 6101 raw_spin_lock_irqsave(&rq->lock, flags);
6320 if (rq->rd) { 6102 if (rq->rd) {
6321 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6103 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6322 set_rq_offline(rq); 6104 set_rq_offline(rq);
6323 } 6105 }
6106 migrate_tasks(cpu);
6107 BUG_ON(rq->nr_running != 1); /* the migration thread */
6324 raw_spin_unlock_irqrestore(&rq->lock, flags); 6108 raw_spin_unlock_irqrestore(&rq->lock, flags);
6109
6110 migrate_nr_uninterruptible(rq);
6111 calc_global_load_remove(rq);
6325 break; 6112 break;
6326#endif 6113#endif
6327 } 6114 }
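
The notifier switch now masks off CPU_TASKS_FROZEN up front, so one case label covers both the normal and the _FROZEN (suspend/resume) variant of each event. A compilable sketch of the masking trick, with illustrative constants (the real values live in linux/cpu.h):

#include <stdio.h>

#define CPU_ONLINE		0x0002	/* illustrative values */
#define CPU_TASKS_FROZEN	0x0010
#define CPU_ONLINE_FROZEN	(CPU_ONLINE | CPU_TASKS_FROZEN)

static const char *describe(unsigned long action)
{
	switch (action & ~CPU_TASKS_FROZEN) {	/* folds *_FROZEN in */
	case CPU_ONLINE:
		return "online";
	default:
		return "other";
	}
}

int main(void)
{
	printf("%s %s\n", describe(CPU_ONLINE), describe(CPU_ONLINE_FROZEN));
	return 0;
}
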
@@ -8052,15 +7839,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8052 7839
8053#ifdef CONFIG_FAIR_GROUP_SCHED 7840#ifdef CONFIG_FAIR_GROUP_SCHED
8054static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 7841static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8055 struct sched_entity *se, int cpu, int add, 7842 struct sched_entity *se, int cpu,
8056 struct sched_entity *parent) 7843 struct sched_entity *parent)
8057{ 7844{
8058 struct rq *rq = cpu_rq(cpu); 7845 struct rq *rq = cpu_rq(cpu);
8059 tg->cfs_rq[cpu] = cfs_rq; 7846 tg->cfs_rq[cpu] = cfs_rq;
8060 init_cfs_rq(cfs_rq, rq); 7847 init_cfs_rq(cfs_rq, rq);
8061 cfs_rq->tg = tg; 7848 cfs_rq->tg = tg;
8062 if (add)
8063 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
8064 7849
8065 tg->se[cpu] = se; 7850 tg->se[cpu] = se;
8066 /* se could be NULL for init_task_group */ 7851 /* se could be NULL for init_task_group */
@@ -8073,15 +7858,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8073 se->cfs_rq = parent->my_q; 7858 se->cfs_rq = parent->my_q;
8074 7859
8075 se->my_q = cfs_rq; 7860 se->my_q = cfs_rq;
8076 se->load.weight = tg->shares; 7861 update_load_set(&se->load, 0);
8077 se->load.inv_weight = 0;
8078 se->parent = parent; 7862 se->parent = parent;
8079} 7863}
8080#endif 7864#endif
8081 7865
8082#ifdef CONFIG_RT_GROUP_SCHED 7866#ifdef CONFIG_RT_GROUP_SCHED
8083static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 7867static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8084 struct sched_rt_entity *rt_se, int cpu, int add, 7868 struct sched_rt_entity *rt_se, int cpu,
8085 struct sched_rt_entity *parent) 7869 struct sched_rt_entity *parent)
8086{ 7870{
8087 struct rq *rq = cpu_rq(cpu); 7871 struct rq *rq = cpu_rq(cpu);
@@ -8090,8 +7874,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8090 init_rt_rq(rt_rq, rq); 7874 init_rt_rq(rt_rq, rq);
8091 rt_rq->tg = tg; 7875 rt_rq->tg = tg;
8092 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7876 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8093 if (add)
8094 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
8095 7877
8096 tg->rt_se[cpu] = rt_se; 7878 tg->rt_se[cpu] = rt_se;
8097 if (!rt_se) 7879 if (!rt_se)
@@ -8164,13 +7946,9 @@ void __init sched_init(void)
8164#ifdef CONFIG_CGROUP_SCHED 7946#ifdef CONFIG_CGROUP_SCHED
8165 list_add(&init_task_group.list, &task_groups); 7947 list_add(&init_task_group.list, &task_groups);
8166 INIT_LIST_HEAD(&init_task_group.children); 7948 INIT_LIST_HEAD(&init_task_group.children);
8167 7949 autogroup_init(&init_task);
8168#endif /* CONFIG_CGROUP_SCHED */ 7950#endif /* CONFIG_CGROUP_SCHED */
8169 7951
8170#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
8171 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
8172 __alignof__(unsigned long));
8173#endif
8174 for_each_possible_cpu(i) { 7952 for_each_possible_cpu(i) {
8175 struct rq *rq; 7953 struct rq *rq;
8176 7954
@@ -8184,7 +7962,6 @@ void __init sched_init(void)
8184#ifdef CONFIG_FAIR_GROUP_SCHED 7962#ifdef CONFIG_FAIR_GROUP_SCHED
8185 init_task_group.shares = init_task_group_load; 7963 init_task_group.shares = init_task_group_load;
8186 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7964 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8187#ifdef CONFIG_CGROUP_SCHED
8188 /* 7965 /*
8189 * How much cpu bandwidth does init_task_group get? 7966 * How much cpu bandwidth does init_task_group get?
8190 * 7967 *
@@ -8204,16 +7981,13 @@ void __init sched_init(void)
8204 * We achieve this by letting init_task_group's tasks sit 7981 * We achieve this by letting init_task_group's tasks sit
8205 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7982 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
8206 */ 7983 */
8207 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7984 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL);
8208#endif
8209#endif /* CONFIG_FAIR_GROUP_SCHED */ 7985#endif /* CONFIG_FAIR_GROUP_SCHED */
8210 7986
8211 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 7987 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8212#ifdef CONFIG_RT_GROUP_SCHED 7988#ifdef CONFIG_RT_GROUP_SCHED
8213 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7989 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
8214#ifdef CONFIG_CGROUP_SCHED 7990 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL);
8215 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
8216#endif
8217#endif 7991#endif
8218 7992
8219 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7993 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -8486,7 +8260,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8486 if (!se) 8260 if (!se)
8487 goto err_free_rq; 8261 goto err_free_rq;
8488 8262
8489 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8263 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8490 } 8264 }
8491 8265
8492 return 1; 8266 return 1;
@@ -8497,15 +8271,21 @@ err:
8497 return 0; 8271 return 0;
8498} 8272}
8499 8273
8500static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8501{
8502 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8503 &cpu_rq(cpu)->leaf_cfs_rq_list);
8504}
8505
8506static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8274static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8507{ 8275{
8508 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8276 struct rq *rq = cpu_rq(cpu);
8277 unsigned long flags;
8278
8279 /*
8280 * Only empty task groups can be destroyed; so we can speculatively
8281 * check on_list without danger of it being re-added.
8282 */
8283 if (!tg->cfs_rq[cpu]->on_list)
8284 return;
8285
8286 raw_spin_lock_irqsave(&rq->lock, flags);
8287 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8288 raw_spin_unlock_irqrestore(&rq->lock, flags);
8509} 8289}
8510#else /* !CONFIG_FAIR_GROUP_SCHED */ 8290#else /* !CONFIG_FAIR_GROUP_SCHED */
8511static inline void free_fair_sched_group(struct task_group *tg) 8291static inline void free_fair_sched_group(struct task_group *tg)
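
The new unregister_fair_sched_group() reads ->on_list without rq->lock and only locks when there is real work; that is safe here because only empty task groups are destroyed, so the flag cannot be set again behind our back. A generic sketch of the check-then-lock pattern under that assumption:

#include <pthread.h>
#include <stdio.h>

struct node {
	int on_list;		/* may only change while the object is live */
	pthread_mutex_t *lock;
};

static void unregister_node(struct node *n)
{
	if (!n->on_list)	/* speculative: a 0 read is final here */
		return;

	pthread_mutex_lock(n->lock);
	n->on_list = 0;		/* list_del_leaf_cfs_rq() in the kernel */
	pthread_mutex_unlock(n->lock);
}

int main(void)
{
	pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
	struct node n = { .on_list = 1, .lock = &m };

	unregister_node(&n);
	printf("on_list=%d\n", n.on_list);
	return 0;
}
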
@@ -8518,10 +8298,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8518 return 1; 8298 return 1;
8519} 8299}
8520 8300
8521static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8522{
8523}
8524
8525static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8301static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8526{ 8302{
8527} 8303}
@@ -8576,7 +8352,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8576 if (!rt_se) 8352 if (!rt_se)
8577 goto err_free_rq; 8353 goto err_free_rq;
8578 8354
8579 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8355 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8580 } 8356 }
8581 8357
8582 return 1; 8358 return 1;
@@ -8586,17 +8362,6 @@ err_free_rq:
8586err: 8362err:
8587 return 0; 8363 return 0;
8588} 8364}
8589
8590static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8591{
8592 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8593 &cpu_rq(cpu)->leaf_rt_rq_list);
8594}
8595
8596static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8597{
8598 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8599}
8600#else /* !CONFIG_RT_GROUP_SCHED */ 8365#else /* !CONFIG_RT_GROUP_SCHED */
8601static inline void free_rt_sched_group(struct task_group *tg) 8366static inline void free_rt_sched_group(struct task_group *tg)
8602{ 8367{
@@ -8607,14 +8372,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8607{ 8372{
8608 return 1; 8373 return 1;
8609} 8374}
8610
8611static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8612{
8613}
8614
8615static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8616{
8617}
8618#endif /* CONFIG_RT_GROUP_SCHED */ 8375#endif /* CONFIG_RT_GROUP_SCHED */
8619 8376
8620#ifdef CONFIG_CGROUP_SCHED 8377#ifdef CONFIG_CGROUP_SCHED
@@ -8630,7 +8387,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8630{ 8387{
8631 struct task_group *tg; 8388 struct task_group *tg;
8632 unsigned long flags; 8389 unsigned long flags;
8633 int i;
8634 8390
8635 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8391 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8636 if (!tg) 8392 if (!tg)
@@ -8643,10 +8399,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8643 goto err; 8399 goto err;
8644 8400
8645 spin_lock_irqsave(&task_group_lock, flags); 8401 spin_lock_irqsave(&task_group_lock, flags);
8646 for_each_possible_cpu(i) {
8647 register_fair_sched_group(tg, i);
8648 register_rt_sched_group(tg, i);
8649 }
8650 list_add_rcu(&tg->list, &task_groups); 8402 list_add_rcu(&tg->list, &task_groups);
8651 8403
8652 WARN_ON(!parent); /* root should already exist */ 8404 WARN_ON(!parent); /* root should already exist */
@@ -8676,11 +8428,11 @@ void sched_destroy_group(struct task_group *tg)
8676 unsigned long flags; 8428 unsigned long flags;
8677 int i; 8429 int i;
8678 8430
8679 spin_lock_irqsave(&task_group_lock, flags); 8431 /* end participation in shares distribution */
8680 for_each_possible_cpu(i) { 8432 for_each_possible_cpu(i)
8681 unregister_fair_sched_group(tg, i); 8433 unregister_fair_sched_group(tg, i);
8682 unregister_rt_sched_group(tg, i); 8434
8683 } 8435 spin_lock_irqsave(&task_group_lock, flags);
8684 list_del_rcu(&tg->list); 8436 list_del_rcu(&tg->list);
8685 list_del_rcu(&tg->siblings); 8437 list_del_rcu(&tg->siblings);
8686 spin_unlock_irqrestore(&task_group_lock, flags); 8438 spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8727,33 +8479,6 @@ void sched_move_task(struct task_struct *tsk)
8727#endif /* CONFIG_CGROUP_SCHED */ 8479#endif /* CONFIG_CGROUP_SCHED */
8728 8480
8729#ifdef CONFIG_FAIR_GROUP_SCHED 8481#ifdef CONFIG_FAIR_GROUP_SCHED
8730static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8731{
8732 struct cfs_rq *cfs_rq = se->cfs_rq;
8733 int on_rq;
8734
8735 on_rq = se->on_rq;
8736 if (on_rq)
8737 dequeue_entity(cfs_rq, se, 0);
8738
8739 se->load.weight = shares;
8740 se->load.inv_weight = 0;
8741
8742 if (on_rq)
8743 enqueue_entity(cfs_rq, se, 0);
8744}
8745
8746static void set_se_shares(struct sched_entity *se, unsigned long shares)
8747{
8748 struct cfs_rq *cfs_rq = se->cfs_rq;
8749 struct rq *rq = cfs_rq->rq;
8750 unsigned long flags;
8751
8752 raw_spin_lock_irqsave(&rq->lock, flags);
8753 __set_se_shares(se, shares);
8754 raw_spin_unlock_irqrestore(&rq->lock, flags);
8755}
8756
8757static DEFINE_MUTEX(shares_mutex); 8482static DEFINE_MUTEX(shares_mutex);
8758 8483
8759int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8484int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8776,37 +8501,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8776 if (tg->shares == shares) 8501 if (tg->shares == shares)
8777 goto done; 8502 goto done;
8778 8503
8779 spin_lock_irqsave(&task_group_lock, flags);
8780 for_each_possible_cpu(i)
8781 unregister_fair_sched_group(tg, i);
8782 list_del_rcu(&tg->siblings);
8783 spin_unlock_irqrestore(&task_group_lock, flags);
8784
8785 /* wait for any ongoing reference to this group to finish */
8786 synchronize_sched();
8787
8788 /*
8789 * Now we are free to modify the group's share on each cpu
8790 * w/o tripping rebalance_share or load_balance_fair.
8791 */
8792 tg->shares = shares; 8504 tg->shares = shares;
8793 for_each_possible_cpu(i) { 8505 for_each_possible_cpu(i) {
8794 /* 8506 struct rq *rq = cpu_rq(i);
8795 * force a rebalance 8507 struct sched_entity *se;
8796 */ 8508
8797 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8509 se = tg->se[i];
8798 set_se_shares(tg->se[i], shares); 8510 /* Propagate contribution to hierarchy */
8511 raw_spin_lock_irqsave(&rq->lock, flags);
8512 for_each_sched_entity(se)
8513 update_cfs_shares(group_cfs_rq(se), 0);
8514 raw_spin_unlock_irqrestore(&rq->lock, flags);
8799 } 8515 }
8800 8516
8801 /*
8802 * Enable load balance activity on this group, by inserting it back on
8803 * each cpu's rq->leaf_cfs_rq_list.
8804 */
8805 spin_lock_irqsave(&task_group_lock, flags);
8806 for_each_possible_cpu(i)
8807 register_fair_sched_group(tg, i);
8808 list_add_rcu(&tg->siblings, &tg->parent->children);
8809 spin_unlock_irqrestore(&task_group_lock, flags);
8810done: 8517done:
8811 mutex_unlock(&shares_mutex); 8518 mutex_unlock(&shares_mutex);
8812 return 0; 8519 return 0;
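
sched_group_set_shares() no longer tears the group out of the leaf lists; it stores the new tg->shares and, for each cpu under rq->lock, walks the entity hierarchy bottom-up so every level recomputes its share. A toy walk, with explicit parent pointers standing in for for_each_sched_entity():

#include <stdio.h>

struct entity {
	const char *name;
	struct entity *parent;
};

static void propagate_shares(struct entity *se)
{
	/* for_each_sched_entity(se) walks se = se->parent in the kernel */
	for (; se; se = se->parent)
		printf("update_cfs_shares(%s)\n", se->name);
}

int main(void)
{
	struct entity root = { "root", NULL };
	struct entity mid  = { "mid",  &root };
	struct entity leaf = { "leaf", &mid };

	propagate_shares(&leaf);
	return 0;
}
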
@@ -9532,72 +9239,3 @@ struct cgroup_subsys cpuacct_subsys = {
9532}; 9239};
9533#endif /* CONFIG_CGROUP_CPUACCT */ 9240#endif /* CONFIG_CGROUP_CPUACCT */
9534 9241
9535#ifndef CONFIG_SMP
9536
9537void synchronize_sched_expedited(void)
9538{
9539 barrier();
9540}
9541EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9542
9543#else /* #ifndef CONFIG_SMP */
9544
9545static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9546
9547static int synchronize_sched_expedited_cpu_stop(void *data)
9548{
9549 /*
9550 * There must be a full memory barrier on each affected CPU
9551 * between the time that try_stop_cpus() is called and the
9552 * time that it returns.
9553 *
9554 * In the current initial implementation of cpu_stop, the
9555 * above condition is already met when the control reaches
9556 * this point and the following smp_mb() is not strictly
9557 * necessary. Do smp_mb() anyway for documentation and
9558 * robustness against future implementation changes.
9559 */
9560 smp_mb(); /* See above comment block. */
9561 return 0;
9562}
9563
9564/*
9565 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
9566 * approach to force grace period to end quickly. This consumes
9567 * significant time on all CPUs, and is thus not recommended for
9568 * any sort of common-case code.
9569 *
9570 * Note that it is illegal to call this function while holding any
9571 * lock that is acquired by a CPU-hotplug notifier. Failing to
9572 * observe this restriction will result in deadlock.
9573 */
9574void synchronize_sched_expedited(void)
9575{
9576 int snap, trycount = 0;
9577
9578 smp_mb(); /* ensure prior mod happens before capturing snap. */
9579 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9580 get_online_cpus();
9581 while (try_stop_cpus(cpu_online_mask,
9582 synchronize_sched_expedited_cpu_stop,
9583 NULL) == -EAGAIN) {
9584 put_online_cpus();
9585 if (trycount++ < 10)
9586 udelay(trycount * num_online_cpus());
9587 else {
9588 synchronize_sched();
9589 return;
9590 }
9591 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9592 smp_mb(); /* ensure test happens before caller kfree */
9593 return;
9594 }
9595 get_online_cpus();
9596 }
9597 atomic_inc(&synchronize_sched_expedited_count);
9598 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9599 put_online_cpus();
9600}
9601EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9602
9603#endif /* #else #ifndef CONFIG_SMP */
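
The synchronize_sched_expedited() code removed above keys its retry loop to a global completion counter: a caller snapshots the count it needs, and if other callers push the counter past that snapshot the retry can return early. A single-threaded sketch of just that snapshot test:

#include <stdio.h>

static int gp_count;	/* models synchronize_sched_expedited_count */

/* true once some caller completed a grace period past our snapshot */
static int gp_done_since(int snap)
{
	return gp_count - snap > 0;
}

int main(void)
{
	int snap = gp_count + 1;	/* the grace period this caller needs */

	printf("early exit? %d\n", gp_done_since(snap));	/* 0 */
	gp_count += 2;			/* two other callers complete */
	printf("early exit? %d\n", gp_done_since(snap));	/* 1 */
	return 0;
}
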
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
new file mode 100644
index 000000000000..c80fedcd476b
--- /dev/null
+++ b/kernel/sched_autogroup.c
@@ -0,0 +1,238 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3#include <linux/proc_fs.h>
4#include <linux/seq_file.h>
5#include <linux/kallsyms.h>
6#include <linux/utsname.h>
7
8unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
9static struct autogroup autogroup_default;
10static atomic_t autogroup_seq_nr;
11
12static void autogroup_init(struct task_struct *init_task)
13{
14 autogroup_default.tg = &init_task_group;
15 init_task_group.autogroup = &autogroup_default;
16 kref_init(&autogroup_default.kref);
17 init_rwsem(&autogroup_default.lock);
18 init_task->signal->autogroup = &autogroup_default;
19}
20
21static inline void autogroup_free(struct task_group *tg)
22{
23 kfree(tg->autogroup);
24}
25
26static inline void autogroup_destroy(struct kref *kref)
27{
28 struct autogroup *ag = container_of(kref, struct autogroup, kref);
29
30 sched_destroy_group(ag->tg);
31}
32
33static inline void autogroup_kref_put(struct autogroup *ag)
34{
35 kref_put(&ag->kref, autogroup_destroy);
36}
37
38static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
39{
40 kref_get(&ag->kref);
41 return ag;
42}
43
44static inline struct autogroup *autogroup_task_get(struct task_struct *p)
45{
46 struct autogroup *ag;
47 unsigned long flags;
48
49 if (!lock_task_sighand(p, &flags))
50 return autogroup_kref_get(&autogroup_default);
51
52 ag = autogroup_kref_get(p->signal->autogroup);
53 unlock_task_sighand(p, &flags);
54
55 return ag;
56}
57
58static inline struct autogroup *autogroup_create(void)
59{
60 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
61 struct task_group *tg;
62
63 if (!ag)
64 goto out_fail;
65
66 tg = sched_create_group(&init_task_group);
67
68 if (IS_ERR(tg))
69 goto out_free;
70
71 kref_init(&ag->kref);
72 init_rwsem(&ag->lock);
73 ag->id = atomic_inc_return(&autogroup_seq_nr);
74 ag->tg = tg;
75 tg->autogroup = ag;
76
77 return ag;
78
79out_free:
80 kfree(ag);
81out_fail:
82 if (printk_ratelimit()) {
83 printk(KERN_WARNING "autogroup_create: %s failure.\n",
84 ag ? "sched_create_group()" : "kzalloc()");
85 }
86
87 return autogroup_kref_get(&autogroup_default);
88}
89
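
autogroup_create() is plain kref usage: the group starts at one reference, each signal_struct that adopts it takes its own, and the final put runs autogroup_destroy(). A single-threaded model of that lifecycle (a real kref uses an atomic counter):

#include <stdio.h>
#include <stdlib.h>

struct toy_ag {
	int refs;
};

static struct toy_ag *ag_create(void)
{
	struct toy_ag *a = malloc(sizeof(*a));

	a->refs = 1;			/* kref_init() */
	return a;
}

static struct toy_ag *ag_get(struct toy_ag *a)
{
	a->refs++;			/* kref_get() */
	return a;
}

static void ag_put(struct toy_ag *a)
{
	if (--a->refs == 0) {		/* kref_put() -> autogroup_destroy() */
		printf("destroying group\n");
		free(a);
	}
}

int main(void)
{
	struct toy_ag *a = ag_create();	/* creator's reference */

	ag_get(a);	/* signal->autogroup takes its own reference */
	ag_put(a);	/* creator drops the extra one, as in create_attach */
	ag_put(a);	/* last signal_struct exits: group destroyed */
	return 0;
}
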
90static inline bool
91task_wants_autogroup(struct task_struct *p, struct task_group *tg)
92{
93 if (tg != &root_task_group)
94 return false;
95
96 if (p->sched_class != &fair_sched_class)
97 return false;
98
99 /*
100 * We can only assume the task group can't go away on us if
101 * autogroup_move_group() can see us on ->thread_group list.
102 */
103 if (p->flags & PF_EXITING)
104 return false;
105
106 return true;
107}
108
109static inline struct task_group *
110autogroup_task_group(struct task_struct *p, struct task_group *tg)
111{
112 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
113
114 if (enabled && task_wants_autogroup(p, tg))
115 return p->signal->autogroup->tg;
116
117 return tg;
118}
119
120static void
121autogroup_move_group(struct task_struct *p, struct autogroup *ag)
122{
123 struct autogroup *prev;
124 struct task_struct *t;
125 unsigned long flags;
126
127 BUG_ON(!lock_task_sighand(p, &flags));
128
129 prev = p->signal->autogroup;
130 if (prev == ag) {
131 unlock_task_sighand(p, &flags);
132 return;
133 }
134
135 p->signal->autogroup = autogroup_kref_get(ag);
136
137 t = p;
138 do {
139 sched_move_task(t);
140 } while_each_thread(p, t);
141
142 unlock_task_sighand(p, &flags);
143 autogroup_kref_put(prev);
144}
145
146/* Allocates GFP_KERNEL, cannot be called under any spinlock */
147void sched_autogroup_create_attach(struct task_struct *p)
148{
149 struct autogroup *ag = autogroup_create();
150
151 autogroup_move_group(p, ag);
152 /* drop extra reference added by autogroup_create() */
153 autogroup_kref_put(ag);
154}
155EXPORT_SYMBOL(sched_autogroup_create_attach);
156
157/* Cannot be called under siglock. Currently has no users */
158void sched_autogroup_detach(struct task_struct *p)
159{
160 autogroup_move_group(p, &autogroup_default);
161}
162EXPORT_SYMBOL(sched_autogroup_detach);
163
164void sched_autogroup_fork(struct signal_struct *sig)
165{
166 sig->autogroup = autogroup_task_get(current);
167}
168
169void sched_autogroup_exit(struct signal_struct *sig)
170{
171 autogroup_kref_put(sig->autogroup);
172}
173
174static int __init setup_autogroup(char *str)
175{
176 sysctl_sched_autogroup_enabled = 0;
177
178 return 1;
179}
180
181__setup("noautogroup", setup_autogroup);
182
183#ifdef CONFIG_PROC_FS
184
185int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
186{
187 static unsigned long next = INITIAL_JIFFIES;
188 struct autogroup *ag;
189 int err;
190
191 if (*nice < -20 || *nice > 19)
192 return -EINVAL;
193
194 err = security_task_setnice(current, *nice);
195 if (err)
196 return err;
197
198 if (*nice < 0 && !can_nice(current, *nice))
199 return -EPERM;
200
201 /* this is a heavy operation taking global locks.. */
202 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
203 return -EAGAIN;
204
205 next = HZ / 10 + jiffies;
206 ag = autogroup_task_get(p);
207
208 down_write(&ag->lock);
209 err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
210 if (!err)
211 ag->nice = *nice;
212 up_write(&ag->lock);
213
214 autogroup_kref_put(ag);
215
216 return err;
217}
218
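
The shares value handed to sched_group_set_shares() is prio_to_weight[*nice + 20], the same nice-to-weight table used for tasks, where each nice step scales the weight by roughly 1.25x. A few entries, quoted from the scheduler's table for illustration:

#include <stdio.h>

static const struct { int nice; int weight; } prio_to_weight_excerpt[] = {
	{ -20, 88761 },		/* most favourable */
	{  -1,  1277 },
	{   0,  1024 },		/* NICE_0_LOAD */
	{   1,   820 },
	{  19,    15 },		/* least favourable */
};

int main(void)
{
	unsigned int i;

	for (i = 0; i < sizeof(prio_to_weight_excerpt) /
			sizeof(prio_to_weight_excerpt[0]); i++)
		printf("nice %3d -> weight %5d\n",
		       prio_to_weight_excerpt[i].nice,
		       prio_to_weight_excerpt[i].weight);
	return 0;
}
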
219void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
220{
221 struct autogroup *ag = autogroup_task_get(p);
222
223 down_read(&ag->lock);
224 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
225 up_read(&ag->lock);
226
227 autogroup_kref_put(ag);
228}
229#endif /* CONFIG_PROC_FS */
230
231#ifdef CONFIG_SCHED_DEBUG
232static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
233{
234 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
235}
236#endif /* CONFIG_SCHED_DEBUG */
237
238#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
new file mode 100644
index 000000000000..5358e241cb20
--- /dev/null
+++ b/kernel/sched_autogroup.h
@@ -0,0 +1,32 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3struct autogroup {
4 struct kref kref;
5 struct task_group *tg;
6 struct rw_semaphore lock;
7 unsigned long id;
8 int nice;
9};
10
11static inline struct task_group *
12autogroup_task_group(struct task_struct *p, struct task_group *tg);
13
14#else /* !CONFIG_SCHED_AUTOGROUP */
15
16static inline void autogroup_init(struct task_struct *init_task) { }
17static inline void autogroup_free(struct task_group *tg) { }
18
19static inline struct task_group *
20autogroup_task_group(struct task_struct *p, struct task_group *tg)
21{
22 return tg;
23}
24
25#ifdef CONFIG_SCHED_DEBUG
26static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
27{
28 return 0;
29}
30#endif
31
32#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 52f1a149bfb1..9d8af0b3fb64 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
79} 79}
80EXPORT_SYMBOL_GPL(sched_clock); 80EXPORT_SYMBOL_GPL(sched_clock);
81 81
82static __read_mostly int sched_clock_running; 82__read_mostly int sched_clock_running;
83 83
84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
85__read_mostly int sched_clock_stable; 85__read_mostly int sched_clock_stable;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 2e1b0d17dd9b..1dfae3d014b5 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -54,8 +54,7 @@ static unsigned long nsec_low(unsigned long long nsec)
54#define SPLIT_NS(x) nsec_high(x), nsec_low(x) 54#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
55 55
56#ifdef CONFIG_FAIR_GROUP_SCHED 56#ifdef CONFIG_FAIR_GROUP_SCHED
57static void print_cfs_group_stats(struct seq_file *m, int cpu, 57static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
58 struct task_group *tg)
59{ 58{
60 struct sched_entity *se = tg->se[cpu]; 59 struct sched_entity *se = tg->se[cpu];
61 if (!se) 60 if (!se)
@@ -110,16 +109,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 109 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
111#endif 110#endif
112 111
113#ifdef CONFIG_CGROUP_SCHED
114 {
115 char path[64];
116
117 rcu_read_lock();
118 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
119 rcu_read_unlock();
120 SEQ_printf(m, " %s", path);
121 }
122#endif
123 SEQ_printf(m, "\n"); 112 SEQ_printf(m, "\n");
124} 113}
125 114
@@ -147,19 +136,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
147 read_unlock_irqrestore(&tasklist_lock, flags); 136 read_unlock_irqrestore(&tasklist_lock, flags);
148} 137}
149 138
150#if defined(CONFIG_CGROUP_SCHED) && \
151 (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
152static void task_group_path(struct task_group *tg, char *buf, int buflen)
153{
154 /* may be NULL if the underlying cgroup isn't fully-created yet */
155 if (!tg->css.cgroup) {
156 buf[0] = '\0';
157 return;
158 }
159 cgroup_path(tg->css.cgroup, buf, buflen);
160}
161#endif
162
163void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 139void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
164{ 140{
165 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, 141 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
@@ -168,16 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
168 struct sched_entity *last; 144 struct sched_entity *last;
169 unsigned long flags; 145 unsigned long flags;
170 146
171#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
172 char path[128];
173 struct task_group *tg = cfs_rq->tg;
174
175 task_group_path(tg, path, sizeof(path));
176
177 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
178#else
179 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 147 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
180#endif
181 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 148 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
182 SPLIT_NS(cfs_rq->exec_clock)); 149 SPLIT_NS(cfs_rq->exec_clock));
183 150
@@ -202,32 +169,29 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
202 spread0 = min_vruntime - rq0_min_vruntime; 169 spread0 = min_vruntime - rq0_min_vruntime;
203 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", 170 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
204 SPLIT_NS(spread0)); 171 SPLIT_NS(spread0));
205 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207
208 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 172 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
209 cfs_rq->nr_spread_over); 173 cfs_rq->nr_spread_over);
174 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
175 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
210#ifdef CONFIG_FAIR_GROUP_SCHED 176#ifdef CONFIG_FAIR_GROUP_SCHED
211#ifdef CONFIG_SMP 177#ifdef CONFIG_SMP
212 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); 178 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
179 SPLIT_NS(cfs_rq->load_avg));
180 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
181 SPLIT_NS(cfs_rq->load_period));
182 SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
183 cfs_rq->load_contribution);
184 SEQ_printf(m, " .%-30s: %d\n", "load_tg",
185 atomic_read(&cfs_rq->tg->load_weight));
213#endif 186#endif
187
214 print_cfs_group_stats(m, cpu, cfs_rq->tg); 188 print_cfs_group_stats(m, cpu, cfs_rq->tg);
215#endif 189#endif
216} 190}
217 191
218void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) 192void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
219{ 193{
220#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
221 char path[128];
222 struct task_group *tg = rt_rq->tg;
223
224 task_group_path(tg, path, sizeof(path));
225
226 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
227#else
228 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); 194 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
229#endif
230
231 195
232#define P(x) \ 196#define P(x) \
233 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) 197 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
@@ -243,6 +207,8 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
243#undef P 207#undef P
244} 208}
245 209
210extern __read_mostly int sched_clock_running;
211
246static void print_cpu(struct seq_file *m, int cpu) 212static void print_cpu(struct seq_file *m, int cpu)
247{ 213{
248 struct rq *rq = cpu_rq(cpu); 214 struct rq *rq = cpu_rq(cpu);
@@ -314,21 +280,42 @@ static const char *sched_tunable_scaling_names[] = {
314 280
315static int sched_debug_show(struct seq_file *m, void *v) 281static int sched_debug_show(struct seq_file *m, void *v)
316{ 282{
317 u64 now = ktime_to_ns(ktime_get()); 283 u64 ktime, sched_clk, cpu_clk;
284 unsigned long flags;
318 int cpu; 285 int cpu;
319 286
320 SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", 287 local_irq_save(flags);
288 ktime = ktime_to_ns(ktime_get());
289 sched_clk = sched_clock();
290 cpu_clk = local_clock();
291 local_irq_restore(flags);
292
293 SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
321 init_utsname()->release, 294 init_utsname()->release,
322 (int)strcspn(init_utsname()->version, " "), 295 (int)strcspn(init_utsname()->version, " "),
323 init_utsname()->version); 296 init_utsname()->version);
324 297
325 SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); 298#define P(x) \
299 SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
300#define PN(x) \
301 SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
302 PN(ktime);
303 PN(sched_clk);
304 PN(cpu_clk);
305 P(jiffies);
306#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
307 P(sched_clock_stable);
308#endif
309#undef PN
310#undef P
311
312 SEQ_printf(m, "\n");
313 SEQ_printf(m, "sysctl_sched\n");
326 314
327#define P(x) \ 315#define P(x) \
328 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) 316 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
329#define PN(x) \ 317#define PN(x) \
330 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) 318 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
331 P(jiffies);
332 PN(sysctl_sched_latency); 319 PN(sysctl_sched_latency);
333 PN(sysctl_sched_min_granularity); 320 PN(sysctl_sched_min_granularity);
334 PN(sysctl_sched_wakeup_granularity); 321 PN(sysctl_sched_wakeup_granularity);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 00ebd7686676..c62ebae65cf0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
89 89
90const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 90const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
91 91
92/*
93 * The exponential sliding window over which load is averaged for shares
94 * distribution.
95 * (default: 10msec)
96 */
97unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
98
92static const struct sched_class fair_sched_class; 99static const struct sched_class fair_sched_class;
93 100
94/************************************************************** 101/**************************************************************
@@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
143 return cfs_rq->tg->cfs_rq[this_cpu]; 150 return cfs_rq->tg->cfs_rq[this_cpu];
144} 151}
145 152
153static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
154{
155 if (!cfs_rq->on_list) {
156 /*
157 * Ensure we either appear before our parent (if already
158 * enqueued) or force our parent to appear after us when it is
159 * enqueued. The fact that we always enqueue bottom-up
160 * reduces this to two cases.
161 */
162 if (cfs_rq->tg->parent &&
163 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
164 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
165 &rq_of(cfs_rq)->leaf_cfs_rq_list);
166 } else {
167 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
168 &rq_of(cfs_rq)->leaf_cfs_rq_list);
169 }
170
171 cfs_rq->on_list = 1;
172 }
173}
174
175static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
176{
177 if (cfs_rq->on_list) {
178 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
179 cfs_rq->on_list = 0;
180 }
181}
182
146/* Iterate thr' all leaf cfs_rq's on a runqueue */ 183/* Iterate thr' all leaf cfs_rq's on a runqueue */
147#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 184#define for_each_leaf_cfs_rq(rq, cfs_rq) \
148 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 185 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
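
list_add_leaf_cfs_rq() keeps rq->leaf_cfs_rq_list ordered child-before-parent: enqueue at the head when the parent is already listed, at the tail otherwise, which works because enqueueing is always bottom-up. A toy array-backed version showing both branches produce that order:

#include <stdio.h>
#include <string.h>

#define MAX 8
static const char *list[MAX];
static int n;

static int listed(const char *name)
{
	int i;

	for (i = 0; i < n; i++)
		if (!strcmp(list[i], name))
			return 1;
	return 0;
}

static void add_leaf(const char *name, const char *parent)
{
	int i;

	if (parent && listed(parent)) {	/* head: appear before parent */
		for (i = n; i > 0; i--)
			list[i] = list[i - 1];
		list[0] = name;
	} else {			/* tail: parent will land after us */
		list[n] = name;
	}
	n++;
}

int main(void)
{
	int i;

	add_leaf("child", "parent");	/* parent not listed yet: tail */
	add_leaf("parent", NULL);	/* lands behind its child */
	add_leaf("sibling", "parent");	/* parent listed: head, before it */

	for (i = 0; i < n; i++)		/* sibling, child, parent */
		printf("%s\n", list[i]);
	return 0;
}
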
@@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
246 return &cpu_rq(this_cpu)->cfs; 283 return &cpu_rq(this_cpu)->cfs;
247} 284}
248 285
286static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
287{
288}
289
290static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
291{
292}
293
249#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 294#define for_each_leaf_cfs_rq(rq, cfs_rq) \
250 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 295 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
251 296
@@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
417 WRT_SYSCTL(sched_min_granularity); 462 WRT_SYSCTL(sched_min_granularity);
418 WRT_SYSCTL(sched_latency); 463 WRT_SYSCTL(sched_latency);
419 WRT_SYSCTL(sched_wakeup_granularity); 464 WRT_SYSCTL(sched_wakeup_granularity);
420 WRT_SYSCTL(sched_shares_ratelimit);
421#undef WRT_SYSCTL 465#undef WRT_SYSCTL
422 466
423 return 0; 467 return 0;
@@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
495 return calc_delta_fair(sched_slice(cfs_rq, se), se); 539 return calc_delta_fair(sched_slice(cfs_rq, se), se);
496} 540}
497 541
542static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
543static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
544
498/* 545/*
499 * Update the current task's runtime statistics. Skip current tasks that 546 * Update the current task's runtime statistics. Skip current tasks that
500 * are not in our scheduling class. 547 * are not in our scheduling class.
@@ -514,6 +561,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
514 561
515 curr->vruntime += delta_exec_weighted; 562 curr->vruntime += delta_exec_weighted;
516 update_min_vruntime(cfs_rq); 563 update_min_vruntime(cfs_rq);
564
565#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
566 cfs_rq->load_unacc_exec_time += delta_exec;
567#endif
517} 568}
518 569
519static void update_curr(struct cfs_rq *cfs_rq) 570static void update_curr(struct cfs_rq *cfs_rq)
@@ -633,7 +684,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
633 list_add(&se->group_node, &cfs_rq->tasks); 684 list_add(&se->group_node, &cfs_rq->tasks);
634 } 685 }
635 cfs_rq->nr_running++; 686 cfs_rq->nr_running++;
636 se->on_rq = 1;
637} 687}
638 688
639static void 689static void
@@ -647,9 +697,140 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
647 list_del_init(&se->group_node); 697 list_del_init(&se->group_node);
648 } 698 }
649 cfs_rq->nr_running--; 699 cfs_rq->nr_running--;
650 se->on_rq = 0;
651} 700}
652 701
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
+					    int global_update)
+{
+	struct task_group *tg = cfs_rq->tg;
+	long load_avg;
+
+	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+	load_avg -= cfs_rq->load_contribution;
+
+	if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
+		atomic_add(load_avg, &tg->load_weight);
+		cfs_rq->load_contribution += load_avg;
+	}
+}
+
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+	u64 period = sysctl_sched_shares_window;
+	u64 now, delta;
+	unsigned long load = cfs_rq->load.weight;
+
+	if (!cfs_rq)
+		return;
+
+	now = rq_of(cfs_rq)->clock;
+	delta = now - cfs_rq->load_stamp;
+
+	/* truncate load history at 4 idle periods */
+	if (cfs_rq->load_stamp > cfs_rq->load_last &&
+	    now - cfs_rq->load_last > 4 * period) {
+		cfs_rq->load_period = 0;
+		cfs_rq->load_avg = 0;
+	}
+
+	cfs_rq->load_stamp = now;
+	cfs_rq->load_unacc_exec_time = 0;
+	cfs_rq->load_period += delta;
+	if (load) {
+		cfs_rq->load_last = now;
+		cfs_rq->load_avg += delta * load;
+	}
+
+	/* consider updating load contribution on each fold or truncate */
+	if (global_update || cfs_rq->load_period > period
+	    || !cfs_rq->load_period)
+		update_cfs_rq_load_contribution(cfs_rq, global_update);
+
+	while (cfs_rq->load_period > period) {
+		/*
+		 * Inline assembly required to prevent the compiler
+		 * optimising this loop into a divmod call.
+		 * See __iter_div_u64_rem() for another example of this.
+		 */
+		asm("" : "+rm" (cfs_rq->load_period));
+		cfs_rq->load_period /= 2;
+		cfs_rq->load_avg /= 2;
+	}
+
+	if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
+		list_del_leaf_cfs_rq(cfs_rq);
+}
+
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+			    unsigned long weight)
+{
+	if (se->on_rq) {
+		/* commit outstanding execution time */
+		if (cfs_rq->curr == se)
+			update_curr(cfs_rq);
+		account_entity_dequeue(cfs_rq, se);
+	}
+
+	update_load_set(&se->load, weight);
+
+	if (se->on_rq)
+		account_entity_enqueue(cfs_rq, se);
+}
+
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+	struct task_group *tg;
+	struct sched_entity *se;
+	long load_weight, load, shares;
+
+	if (!cfs_rq)
+		return;
+
+	tg = cfs_rq->tg;
+	se = tg->se[cpu_of(rq_of(cfs_rq))];
+	if (!se)
+		return;
+
+	load = cfs_rq->load.weight + weight_delta;
+
+	load_weight = atomic_read(&tg->load_weight);
+	load_weight -= cfs_rq->load_contribution;
+	load_weight += load;
+
+	shares = (tg->shares * load);
+	if (load_weight)
+		shares /= load_weight;
+
+	if (shares < MIN_SHARES)
+		shares = MIN_SHARES;
+	if (shares > tg->shares)
+		shares = tg->shares;
+
+	reweight_entity(cfs_rq_of(se), se, shares);
+}
+
+static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
+		update_cfs_load(cfs_rq, 0);
+		update_cfs_shares(cfs_rq, 0);
+	}
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+}
+
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 #ifdef CONFIG_SCHEDSTATS
@@ -771,6 +952,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
+	update_cfs_load(cfs_rq, 0);
+	update_cfs_shares(cfs_rq, se->load.weight);
 	account_entity_enqueue(cfs_rq, se);

 	if (flags & ENQUEUE_WAKEUP) {
@@ -782,6 +965,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	check_spread(cfs_rq, se);
 	if (se != cfs_rq->curr)
 		__enqueue_entity(cfs_rq, se);
+	se->on_rq = 1;
+
+	if (cfs_rq->nr_running == 1)
+		list_add_leaf_cfs_rq(cfs_rq);
 }

 static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +1012,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)

 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
+	se->on_rq = 0;
+	update_cfs_load(cfs_rq, 0);
 	account_entity_dequeue(cfs_rq, se);
 	update_min_vruntime(cfs_rq);
+	update_cfs_shares(cfs_rq, 0);

 	/*
 	 * Normalize the entity after updating the min_vruntime because the
@@ -955,6 +1145,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 */
 	update_curr(cfs_rq);

+	/*
+	 * Update share accounting for long-running entities.
+	 */
+	update_entity_shares_tick(cfs_rq);
+
 #ifdef CONFIG_SCHED_HRTICK
 	/*
 	 * queued ticks are scheduled to match the slice, so don't bother
@@ -1055,6 +1250,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		flags = ENQUEUE_WAKEUP;
 	}

+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		update_cfs_load(cfs_rq, 0);
+		update_cfs_shares(cfs_rq, 0);
+	}
+
 	hrtick_update(rq);
 }

@@ -1071,12 +1273,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
+
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight)
 			break;
 		flags |= DEQUEUE_SLEEP;
 	}

+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		update_cfs_load(cfs_rq, 0);
+		update_cfs_shares(cfs_rq, 0);
+	}
+
 	hrtick_update(rq);
 }

@@ -1143,51 +1353,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
  * Adding load to a group doesn't make a group heavier, but can cause movement
  * of group shares between cpus. Assuming the shares were perfectly aligned one
  * can calculate the shift in shares.
- *
- * The problem is that perfectly aligning the shares is rather expensive, hence
- * we try to avoid doing that too often - see update_shares(), which ratelimits
- * this change.
- *
- * We compensate this by not only taking the current delta into account, but
- * also considering the delta between when the shares were last adjusted and
- * now.
- *
- * We still saw a performance dip, some tracing learned us that between
- * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
- * significantly. Therefore try to bias the error in direction of failing
- * the affine wakeup.
- *
  */
-static long effective_load(struct task_group *tg, int cpu,
-		long wl, long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];

 	if (!tg->parent)
 		return wl;

-	/*
-	 * By not taking the decrease of shares on the other cpu into
-	 * account our error leans towards reducing the affine wakeups.
-	 */
-	if (!wl && sched_feat(ASYM_EFF_LOAD))
-		return wl;
-
 	for_each_sched_entity(se) {
 		long S, rw, s, a, b;
-		long more_w;
-
-		/*
-		 * Instead of using this increment, also add the difference
-		 * between when the shares were last updated and now.
-		 */
-		more_w = se->my_q->load.weight - se->my_q->rq_weight;
-		wl += more_w;
-		wg += more_w;

 		S = se->my_q->tg->shares;
-		s = se->my_q->shares;
-		rw = se->my_q->rq_weight;
+		s = se->load.weight;
+		rw = se->my_q->load.weight;

 		a = S*(rw + wl);
 		b = S*rw + s*wg;
@@ -1508,23 +1687,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 		sd = tmp;
 	}

-#ifdef CONFIG_FAIR_GROUP_SCHED
-	if (sched_feat(LB_SHARES_UPDATE)) {
-		/*
-		 * Pick the largest domain to update shares over
-		 */
-		tmp = sd;
-		if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
-			tmp = affine_sd;
-
-		if (tmp) {
-			raw_spin_unlock(&rq->lock);
-			update_shares(tmp);
-			raw_spin_lock(&rq->lock);
-		}
-	}
-#endif
-
 	if (affine_sd) {
 		if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
 			return select_idle_sibling(p, cpu);
@@ -1909,6 +2071,48 @@ out:
 }

 #ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * update tg->load_weight by folding this cpu's load_avg
+ */
+static int update_shares_cpu(struct task_group *tg, int cpu)
+{
+	struct cfs_rq *cfs_rq;
+	unsigned long flags;
+	struct rq *rq;
+
+	if (!tg->se[cpu])
+		return 0;
+
+	rq = cpu_rq(cpu);
+	cfs_rq = tg->cfs_rq[cpu];
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+
+	update_rq_clock(rq);
+	update_cfs_load(cfs_rq, 1);
+
+	/*
+	 * We need to update shares after updating tg->load_weight in
+	 * order to adjust the weight of groups with long running tasks.
+	 */
+	update_cfs_shares(cfs_rq, 0);
+
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	return 0;
+}
+
+static void update_shares(int cpu)
+{
+	struct cfs_rq *cfs_rq;
+	struct rq *rq = cpu_rq(cpu);
+
+	rcu_read_lock();
+	for_each_leaf_cfs_rq(rq, cfs_rq)
+		update_shares_cpu(cfs_rq->tg, cpu);
+	rcu_read_unlock();
+}
+
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
@@ -1956,6 +2160,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	return max_load_move - rem_load_move;
 }
 #else
+static inline void update_shares(int cpu)
+{
+}
+
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
@@ -3032,7 +3240,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	schedstat_inc(sd, lb_count[idle]);

 redo:
-	update_shares(sd);
 	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
 				   cpus, balance);

@@ -3174,8 +3381,6 @@ out_one_pinned:
 	else
 		ld_moved = 0;
 out:
-	if (ld_moved)
-		update_shares(sd);
 	return ld_moved;
 }

@@ -3199,6 +3404,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 	 */
 	raw_spin_unlock(&this_rq->lock);

+	update_shares(this_cpu);
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
 		int balance = 1;
@@ -3569,6 +3775,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 	int update_next_balance = 0;
 	int need_serialize;

+	update_shares(cpu);
+
 	for_each_domain(cpu, sd) {
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;
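
The update_cfs_load() hunk above replaces the old ratelimited global share updates with a per-cfs_rq load average that decays geometrically: once the accumulated period exceeds sysctl_sched_shares_window, both the period and the average are repeatedly halved, and the empty asm() exists only to stop the compiler from fusing the two halvings into a divmod. A minimal userspace sketch of that decay, with a made-up 10ms window standing in for the sysctl:

    #include <stdint.h>
    #include <stdio.h>

    #define SHARES_WINDOW_NS 10000000ULL    /* assumed 10ms window */

    struct demo_rq {
        uint64_t load_period;   /* time accumulated toward the window */
        uint64_t load_avg;      /* sum of weight * runtime */
        uint64_t load_stamp;    /* last update timestamp */
    };

    /* Fold history: each full window halves both terms, so old load
     * decays geometrically without ever needing a real division. */
    static void demo_update_load(struct demo_rq *rq, uint64_t now,
                                 unsigned long weight)
    {
        uint64_t delta = now - rq->load_stamp;

        rq->load_stamp = now;
        rq->load_period += delta;
        if (weight)
            rq->load_avg += delta * weight;

        while (rq->load_period > SHARES_WINDOW_NS) {
            rq->load_period /= 2;
            rq->load_avg /= 2;
        }
    }

    int main(void)
    {
        struct demo_rq rq = { 0, 0, 0 };
        uint64_t t = 0;
        int i;

        /* a weight-1024 entity running for five full windows */
        for (i = 1; i <= 5; i++) {
            t += SHARES_WINDOW_NS;
            demo_update_load(&rq, t, 1024);
            printf("window %d: avg weight = %llu\n", i,
                   (unsigned long long)(rq.load_avg / (rq.load_period + 1)));
        }
        return 0;
    }

The printed quotient mirrors div64_u64(load_avg, load_period+1) in update_cfs_rq_load_contribution() and settles near the entity's weight; the sketch needs no compiler barrier because userspace cost models differ.
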
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 185f920ec1a2..68e69acc29b9 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0)
 SCHED_FEAT(HRTICK, 0)
 SCHED_FEAT(DOUBLE_TICK, 0)
 SCHED_FEAT(LB_BIAS, 1)
-SCHED_FEAT(LB_SHARES_UPDATE, 1)
-SCHED_FEAT(ASYM_EFF_LOAD, 1)

 /*
  * Spin-wait on mutex acquisition when the mutex owner is running on
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index bea7d79f7e9c..c914ec747ca6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -183,6 +183,17 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 	return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
 }

+static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+	list_add_rcu(&rt_rq->leaf_rt_rq_list,
+			&rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
+}
+
+static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+	list_del_rcu(&rt_rq->leaf_rt_rq_list);
+}
+
 #define for_each_leaf_rt_rq(rt_rq, rq) \
 	list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)

@@ -276,6 +287,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 	return ktime_to_ns(def_rt_bandwidth.rt_period);
 }

+static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+}
+
+static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+}
+
 #define for_each_leaf_rt_rq(rt_rq, rq) \
 	for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)

@@ -825,6 +844,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
 	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
 		return;

+	if (!rt_rq->rt_nr_running)
+		list_add_leaf_rt_rq(rt_rq);
+
 	if (head)
 		list_add(&rt_se->run_list, queue);
 	else
@@ -844,6 +866,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
 	__clear_bit(rt_se_prio(rt_se), array->bitmap);

 	dec_rt_tasks(rt_se, rt_rq);
+	if (!rt_rq->rt_nr_running)
+		list_del_leaf_rt_rq(rt_rq);
 }

 /*
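
The rt_rq hunks mirror the cfs_rq change: a runqueue is linked onto the per-CPU leaf list only on the 0->1 enqueue transition and unlinked when its task count falls back to zero, so iteration never visits empty queues. A toy sketch of that transition-driven bookkeeping (a plain flag stands in for the RCU list linkage):

    #include <stdio.h>

    struct toy_rq {
        int nr_running;
        int on_leaf_list;   /* stands in for the RCU list membership */
    };

    static void toy_enqueue(struct toy_rq *rq)
    {
        if (!rq->nr_running) {      /* 0 -> 1: becomes visible to iteration */
            rq->on_leaf_list = 1;
            printf("added to leaf list\n");
        }
        rq->nr_running++;
    }

    static void toy_dequeue(struct toy_rq *rq)
    {
        rq->nr_running--;
        if (!rq->nr_running) {      /* back to 0: drop from iteration */
            rq->on_leaf_list = 0;
            printf("removed from leaf list\n");
        }
    }

    int main(void)
    {
        struct toy_rq rq = { 0, 0 };

        toy_enqueue(&rq);
        toy_enqueue(&rq);   /* no list change: already on the list */
        toy_dequeue(&rq);
        toy_dequeue(&rq);
        return 0;
    }
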
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 18f4be0d5fe0..d4d918a91881 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -853,7 +853,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
 					 cpumask_any(cpu_online_mask));
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN: {
-		struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+		static struct sched_param param = {
+			.sched_priority = MAX_RT_PRIO-1
+		};

 		p = per_cpu(ksoftirqd, hotcpu);
 		per_cpu(ksoftirqd, hotcpu) = NULL;
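
Making sched_param static here (and in the trace_selftest.c and watchdog.c hunks below) works because sched_setscheduler() only reads the caller's parameter block, so one shared read-only instance can replace a fresh stack copy on every invocation. A small illustration of the pattern, with 99 standing in for MAX_RT_PRIO-1:

    #include <stdio.h>

    struct sched_param_demo { int sched_priority; };

    /* the callee only reads the parameter block, as sched_setscheduler() does */
    static void set_prio(const struct sched_param_demo *p)
    {
        printf("priority %d\n", p->sched_priority);
    }

    static void hotplug_callback(void)
    {
        /* 'static' moves the constant initializer out of the stack frame;
         * one shared instance serves every call */
        static struct sched_param_demo param = { .sched_priority = 99 };

        set_prio(&param);
    }

    int main(void)
    {
        hotplug_callback();
        return 0;
    }
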
diff --git a/kernel/srcu.c b/kernel/srcu.c
index c71e07500536..98d8c1e80edb 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -31,6 +31,7 @@
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/smp.h>
+#include <linux/delay.h>
 #include <linux/srcu.h>

 static int init_srcu_struct_fields(struct srcu_struct *sp)
@@ -203,9 +204,14 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
 	 * all srcu_read_lock() calls using the old counters have completed.
 	 * Their corresponding critical sections might well be still
 	 * executing, but the srcu_read_lock() primitives themselves
-	 * will have finished executing.
+	 * will have finished executing.  We initially give readers
+	 * an arbitrarily chosen 10 microseconds to get out of their
+	 * SRCU read-side critical sections, then loop waiting 1/HZ
+	 * seconds per iteration.
 	 */

+	if (srcu_readers_active_idx(sp, idx))
+		udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY);
 	while (srcu_readers_active_idx(sp, idx))
 		schedule_timeout_interruptible(1);

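
The SRCU change turns a pure sleep loop into a two-phase wait: one short delay covers the common case of readers finishing almost immediately, and only stragglers pay the coarse 1/HZ sleeps. A userspace sketch of the same spin-then-sleep shape (the countdown is a stand-in for srcu_readers_active_idx()):

    #include <stdio.h>
    #include <unistd.h>

    /* stand-in for srcu_readers_active_idx(); each poll "retires" a reader */
    static int readers = 3;

    static int readers_active(void)
    {
        return readers-- > 0;
    }

    /* two-phase wait mirroring the patched __synchronize_srcu(): one short
     * grace delay for the fast path, then coarse periodic sleeps */
    static void wait_for_readers(void)
    {
        if (readers_active())
            usleep(10);         /* ~10us, like the udelay() */
        while (readers_active())
            usleep(10000);      /* then back off (1/HZ in the kernel) */
    }

    int main(void)
    {
        wait_for_readers();
        printf("all readers done\n");
        return 0;
    }
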
diff --git a/kernel/sys.c b/kernel/sys.c
index 7f5a0cd296a9..2745dcdb6c6c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1080,8 +1080,10 @@ SYSCALL_DEFINE0(setsid)
 	err = session;
 out:
 	write_unlock_irq(&tasklist_lock);
-	if (err > 0)
+	if (err > 0) {
 		proc_sid_connector(group_leader);
+		sched_autogroup_create_attach(group_leader);
+	}
 	return err;
 }

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 46404414d8a7..ae5cbb1e3ced 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -259,8 +259,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */
 static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
 static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-static int min_sched_shares_ratelimit = 100000; /* 100 usec */
-static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
 #endif

 #ifdef CONFIG_COMPACTION
@@ -305,15 +303,6 @@ static struct ctl_table kern_table[] = {
 		.extra2		= &max_wakeup_granularity_ns,
 	},
 	{
-		.procname	= "sched_shares_ratelimit",
-		.data		= &sysctl_sched_shares_ratelimit,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= sched_proc_update_handler,
-		.extra1		= &min_sched_shares_ratelimit,
-		.extra2		= &max_sched_shares_ratelimit,
-	},
-	{
 		.procname	= "sched_tunable_scaling",
 		.data		= &sysctl_sched_tunable_scaling,
 		.maxlen		= sizeof(enum sched_tunable_scaling),
@@ -323,14 +312,6 @@ static struct ctl_table kern_table[] = {
 		.extra2		= &max_sched_tunable_scaling,
 	},
 	{
-		.procname	= "sched_shares_thresh",
-		.data		= &sysctl_sched_shares_thresh,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-	},
-	{
 		.procname	= "sched_migration_cost",
 		.data		= &sysctl_sched_migration_cost,
 		.maxlen		= sizeof(unsigned int),
@@ -352,6 +333,13 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 	{
+		.procname	= "sched_shares_window",
+		.data		= &sysctl_sched_shares_window,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
 		.procname	= "timer_migration",
 		.data		= &sysctl_timer_migration,
 		.maxlen		= sizeof(unsigned int),
@@ -382,6 +370,17 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_SCHED_AUTOGROUP
+	{
+		.procname	= "sched_autogroup_enabled",
+		.data		= &sysctl_sched_autogroup_enabled,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",
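
Both tunables added above surface under /proc/sys/kernel. A quick userspace check, assuming a kernel that carries these patches (on anything else the files simply won't exist and the reads return -1):

    #include <stdio.h>

    /* read one integer sysctl value */
    static long read_sysctl(const char *path)
    {
        long val = -1;
        FILE *f = fopen(path, "r");

        if (!f)
            return -1;
        if (fscanf(f, "%ld", &val) != 1)
            val = -1;
        fclose(f);
        return val;
    }

    int main(void)
    {
        printf("sched_shares_window: %ld\n",
               read_sysctl("/proc/sys/kernel/sched_shares_window"));
        printf("sched_autogroup_enabled: %ld\n",
               read_sysctl("/proc/sys/kernel/sched_autogroup_enabled"));
        return 0;
    }
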
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index ac38fbb176cc..a9ae369925ce 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -21,6 +21,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/math64.h>
+#include <linux/kernel.h>

 /*
  * fixed point arithmetic scale factor for skew
@@ -57,11 +58,11 @@ int timecompare_offset(struct timecompare *sync,
 	int index;
 	int num_samples = sync->num_samples;

-	if (num_samples > sizeof(buffer)/sizeof(buffer[0])) {
+	if (num_samples > ARRAY_SIZE(buffer)) {
 		samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
 		if (!samples) {
 			samples = buffer;
-			num_samples = sizeof(buffer)/sizeof(buffer[0]);
+			num_samples = ARRAY_SIZE(buffer);
 		}
 	} else {
 		samples = buffer;
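
ARRAY_SIZE() wraps exactly the sizeof quotient being replaced here; the kernel version also adds a compile-time check that rejects pointers, which the open-coded form silently accepts. Spelled out in its plain form:

    #include <stdio.h>

    /* the computation ARRAY_SIZE() wraps; it is only meaningful on true
     * arrays, never on pointers, hence the kernel macro's type check */
    #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

    int main(void)
    {
        long long buffer[8];

        printf("%zu elements\n", ARRAY_SIZE(buffer));   /* prints 8 */
        return 0;
    }
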
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 49010d822f72..5bb86da82003 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -32,6 +32,8 @@ struct timekeeper {
 	cycle_t cycle_interval;
 	/* Number of clock shifted nano seconds in one NTP interval. */
 	u64	xtime_interval;
+	/* shifted nano seconds left over when rounding cycle_interval */
+	s64	xtime_remainder;
 	/* Raw nano seconds accumulated per NTP interval. */
 	u32	raw_interval;

@@ -62,7 +64,7 @@ struct timekeeper timekeeper;
 static void timekeeper_setup_internals(struct clocksource *clock)
 {
 	cycle_t interval;
-	u64 tmp;
+	u64 tmp, ntpinterval;

 	timekeeper.clock = clock;
 	clock->cycle_last = clock->read(clock);
@@ -70,6 +72,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)
 	/* Do the ns -> cycle conversion first, using original mult */
 	tmp = NTP_INTERVAL_LENGTH;
 	tmp <<= clock->shift;
+	ntpinterval = tmp;
 	tmp += clock->mult/2;
 	do_div(tmp, clock->mult);
 	if (tmp == 0)
@@ -80,6 +83,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)

 	/* Go back from cycles -> shifted ns */
 	timekeeper.xtime_interval = (u64) interval * clock->mult;
+	timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval;
 	timekeeper.raw_interval =
 		((u64) interval * clock->mult) >> clock->shift;

@@ -719,7 +723,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)

 	/* Accumulate error between NTP and clock interval */
 	timekeeper.ntp_error += tick_length << shift;
-	timekeeper.ntp_error -= timekeeper.xtime_interval <<
+	timekeeper.ntp_error -=
+		(timekeeper.xtime_interval + timekeeper.xtime_remainder) <<
 				(timekeeper.ntp_error_shift + shift);

 	return offset;
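
The xtime_remainder field exists because rounding the NTP interval down to whole clock cycles silently discards a few shifted nanoseconds per interval, which the error-correction logic would otherwise chase forever as a phantom offset. A toy computation with made-up numbers showing what is lost and how tracking the remainder recovers it:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* an NTP interval that does not divide evenly into clock cycles */
        uint64_t ntp_interval = 10000007;   /* shifted ns, made-up value */
        uint64_t cycles = 3;
        uint64_t per_cycle = ntp_interval / cycles;     /* rounded down */
        uint64_t xtime_interval = per_cycle * cycles;   /* what accumulation sees */
        uint64_t xtime_remainder = ntp_interval - xtime_interval;

        /* without the remainder, every interval under-counts by this much */
        printf("lost per interval: %llu\n",
               (unsigned long long)xtime_remainder);
        printf("accounted total:   %llu (true %llu)\n",
               (unsigned long long)(xtime_interval + xtime_remainder),
               (unsigned long long)ntp_interval);
        return 0;
    }
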
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index ab8f5e33fa92..32a19f9397fc 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -79,26 +79,26 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
 {
 	struct hrtimer *timer, tmp;
 	unsigned long next = 0, i;
-	struct rb_node *curr;
+	struct timerqueue_node *curr;
 	unsigned long flags;

 next_one:
 	i = 0;
 	raw_spin_lock_irqsave(&base->cpu_base->lock, flags);

-	curr = base->first;
+	curr = timerqueue_getnext(&base->active);
 	/*
 	 * Crude but we have to do this O(N*N) thing, because
 	 * we have to unlock the base when printing:
 	 */
 	while (curr && i < next) {
-		curr = rb_next(curr);
+		curr = timerqueue_iterate_next(curr);
 		i++;
 	}

 	if (curr) {

-		timer = rb_entry(curr, struct hrtimer, node);
+		timer = container_of(curr, struct hrtimer, node);
 		tmp = *timer;
 		raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);

diff --git a/kernel/timer.c b/kernel/timer.c
index 353b9227c2ec..43ca9936f2d0 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -88,18 +88,6 @@ struct tvec_base boot_tvec_bases;
 EXPORT_SYMBOL(boot_tvec_bases);
 static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;

-/*
- * Note that all tvec_bases are 2 byte aligned and lower bit of
- * base in timer_list is guaranteed to be zero. Use the LSB to
- * indicate whether the timer is deferrable.
- *
- * A deferrable timer will work normally when the system is busy, but
- * will not cause a CPU to come out of idle just to service it; instead,
- * the timer will be serviced when the CPU eventually wakes up with a
- * subsequent non-deferrable timer.
- */
-#define TBASE_DEFERRABLE_FLAG		(0x1)
-
 /* Functions below help us manage 'deferrable' flag */
 static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
 {
@@ -113,8 +101,7 @@ static inline struct tvec_base *tbase_get_base(struct tvec_base *base)

 static inline void timer_set_deferrable(struct timer_list *timer)
 {
-	timer->base = ((struct tvec_base *)((unsigned long)(timer->base) |
-				       TBASE_DEFERRABLE_FLAG));
+	timer->base = TBASE_MAKE_DEFERRED(timer->base);
 }

 static inline void
@@ -343,15 +330,6 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
 }
 EXPORT_SYMBOL_GPL(set_timer_slack);

-
-static inline void set_running_timer(struct tvec_base *base,
-					struct timer_list *timer)
-{
-#ifdef CONFIG_SMP
-	base->running_timer = timer;
-#endif
-}
-
 static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 {
 	unsigned long expires = timer->expires;
@@ -936,15 +914,12 @@ int del_timer(struct timer_list *timer)
 }
 EXPORT_SYMBOL(del_timer);

-#ifdef CONFIG_SMP
 /**
  * try_to_del_timer_sync - Try to deactivate a timer
  * @timer: timer to del
  *
  * This function tries to deactivate a timer. Upon successful (ret >= 0)
  * exit the timer is not queued and the handler is not running on any CPU.
- *
- * It must not be called from interrupt contexts.
  */
 int try_to_del_timer_sync(struct timer_list *timer)
 {
@@ -973,6 +948,7 @@ out:
 }
 EXPORT_SYMBOL(try_to_del_timer_sync);

+#ifdef CONFIG_SMP
 /**
  * del_timer_sync - deactivate a timer and wait for the handler to finish.
  * @timer: the timer to be deactivated
@@ -983,7 +959,7 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
  *
  * Synchronization rules: Callers must prevent restarting of the timer,
  * otherwise this function is meaningless. It must not be called from
- * interrupt contexts. The caller must not hold locks which would prevent
+ * hardirq contexts. The caller must not hold locks which would prevent
  * completion of the timer's handler. The timer's handler must not call
  * add_timer_on(). Upon exit the timer is not queued and the handler is
  * not running on any CPU.
@@ -993,14 +969,16 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
 int del_timer_sync(struct timer_list *timer)
 {
 #ifdef CONFIG_LOCKDEP
-	unsigned long flags;
-
-	local_irq_save(flags);
+	local_bh_disable();
 	lock_map_acquire(&timer->lockdep_map);
 	lock_map_release(&timer->lockdep_map);
-	local_irq_restore(flags);
+	local_bh_enable();
 #endif
-
+	/*
+	 * don't use it in hardirq context, because it
+	 * could lead to deadlock.
+	 */
+	WARN_ON(in_irq());
 	for (;;) {
 		int ret = try_to_del_timer_sync(timer);
 		if (ret >= 0)
@@ -1111,7 +1089,7 @@ static inline void __run_timers(struct tvec_base *base)

 		timer_stats_account_timer(timer);

-		set_running_timer(base, timer);
+		base->running_timer = timer;
 		detach_timer(timer, 1);

 		spin_unlock_irq(&base->lock);
@@ -1119,7 +1097,7 @@ static inline void __run_timers(struct tvec_base *base)
 			spin_lock_irq(&base->lock);
 		}
 	}
-	set_running_timer(base, NULL);
+	base->running_timer = NULL;
 	spin_unlock_irq(&base->lock);
 }

@@ -1249,7 +1227,7 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
  */
 unsigned long get_next_timer_interrupt(unsigned long now)
 {
-	struct tvec_base *base = __get_cpu_var(tvec_bases);
+	struct tvec_base *base = __this_cpu_read(tvec_bases);
 	unsigned long expires;

 	/*
@@ -1298,7 +1276,7 @@ void update_process_times(int user_tick)
  */
 static void run_timer_softirq(struct softirq_action *h)
 {
-	struct tvec_base *base = __get_cpu_var(tvec_bases);
+	struct tvec_base *base = __this_cpu_read(tvec_bases);

 	hrtimer_run_pending();

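
The TBASE_DEFERRABLE_FLAG comment deleted above described a pointer-tagging trick this file still relies on: tvec_base is at least 2-byte aligned, so bit 0 of any pointer to one is always zero and is free to carry the deferrable flag, which is what tbase_get_deferrable() and the new TBASE_MAKE_DEFERRED() manipulate. A standalone sketch of the same trick:

    #include <stdio.h>
    #include <stdint.h>

    #define FLAG_DEFERRABLE 0x1UL   /* lives in the pointer's spare low bit */

    struct base { int dummy; };

    static struct base *tag(struct base *p)
    {
        return (struct base *)((uintptr_t)p | FLAG_DEFERRABLE);
    }

    static int get_flag(struct base *p)
    {
        return (uintptr_t)p & FLAG_DEFERRABLE;
    }

    static struct base *untag(struct base *p)
    {
        return (struct base *)((uintptr_t)p & ~FLAG_DEFERRABLE);
    }

    int main(void)
    {
        static struct base b;   /* statics are suitably aligned */
        struct base *p = tag(&b);

        printf("deferrable=%d, base ok=%d\n", get_flag(p), untag(p) == &b);
        return 0;
    }

The price of the trick is that every dereference must strip the flag first, which is why all access to timer->base in this file goes through the tbase_get_*() helpers.
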
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 53f338190b26..761c510a06c5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
-obj-$(CONFIG_EVENT_TRACING) += power-traces.o
+obj-$(CONFIG_TRACEPOINTS) += power-traces.o
 ifeq ($(CONFIG_TRACING),y)
 obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
 endif
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f8cf959bad45..dc53ecb80589 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1313,12 +1313,10 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)

 	__this_cpu_inc(user_stack_count);

-
-
 	event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
 					  sizeof(*entry), flags, pc);
 	if (!event)
-		return;
+		goto out_drop_count;
 	entry = ring_buffer_event_data(event);

 	entry->tgid		= current->tgid;
@@ -1333,8 +1331,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
 	if (!filter_check_discard(call, entry, buffer, event))
 		ring_buffer_unlock_commit(buffer, event);

+ out_drop_count:
 	__this_cpu_dec(user_stack_count);
-
  out:
 	preempt_enable();
 }
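
The out_drop_count fix is the standard goto-unwind pattern: once user_stack_count has been incremented, every exit path must decrement it, so early exits funnel through labels placed in reverse order of acquisition instead of returning directly. A distilled userspace version:

    #include <stdio.h>
    #include <stdlib.h>

    static int count;

    static int do_work(int fail_early)
    {
        char *buf = NULL;
        int ret = -1;

        count++;                    /* acquired: must be undone on every path */

        buf = malloc(64);
        if (!buf)
            goto out_drop_count;    /* a bare 'return' here would leak the count */

        if (fail_early)
            goto out_free;

        ret = 0;                    /* success path */

    out_free:
        free(buf);
    out_drop_count:
        count--;                    /* labels unwind in reverse acquisition order */
        return ret;
    }

    int main(void)
    {
        do_work(1);
        printf("count after failure path: %d\n", count);    /* 0: nothing leaked */
        return 0;
    }
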
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 155a415b3209..562c56e048fd 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
 static int trace_wakeup_test_thread(void *data)
 {
 	/* Make this a RT thread, doesn't need to be too high */
-	struct sched_param param = { .sched_priority = 5 };
+	static struct sched_param param = { .sched_priority = 5 };
 	struct completion *x = data;

 	sched_setscheduler(current, SCHED_FIFO, &param);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index aaa8dae08236..6e7b575ac33c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -309,7 +309,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
  */
 static int watchdog(void *unused)
 {
-	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+	static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
 	struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);

 	sched_setscheduler(current, SCHED_FIFO, &param);
diff --git a/lib/Makefile b/lib/Makefile
index e6a3763b8212..9e2db72d128e 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -8,7 +8,7 @@ KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
 endif

 lib-y := ctype.o string.o vsprintf.o cmdline.o \
-	 rbtree.o radix-tree.o dump_stack.o \
+	 rbtree.o radix-tree.o dump_stack.o timerqueue.o\
 	 idr.o int_sqrt.o extable.o prio_tree.o \
 	 sha1.o irq_regs.o reciprocal_div.o argv_split.o \
 	 proportions.o prio_heap.o ratelimit.o show_mem.o \
diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c
index 3094318bfea7..b335acb43be2 100644
--- a/lib/dynamic_debug.c
+++ b/lib/dynamic_debug.c
@@ -141,11 +141,10 @@ static void ddebug_change(const struct ddebug_query *query,
 	else if (!dp->flags)
 		dt->num_enabled++;
 	dp->flags = newflags;
-	if (newflags) {
-		jump_label_enable(&dp->enabled);
-	} else {
-		jump_label_disable(&dp->enabled);
-	}
+	if (newflags)
+		dp->enabled = 1;
+	else
+		dp->enabled = 0;
 	if (verbose)
 		printk(KERN_INFO
 			"ddebug: changed %s:%d [%s]%s %s\n",
diff --git a/lib/timerqueue.c b/lib/timerqueue.c
new file mode 100644
index 000000000000..e3a1050e6820
--- /dev/null
+++ b/lib/timerqueue.c
@@ -0,0 +1,107 @@
+/*
+ *  Generic Timer-queue
+ *
+ *  Manages a simple queue of timers, ordered by expiration time.
+ *  Uses rbtrees for quick list adds and expiration.
+ *
+ *  NOTE: All of the following functions need to be serialized
+ *  to avoid races. No locking is done by this library code.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/timerqueue.h>
+#include <linux/rbtree.h>
+#include <linux/module.h>
+
+/**
+ * timerqueue_add - Adds timer to timerqueue.
+ *
+ * @head: head of timerqueue
+ * @node: timer node to be added
+ *
+ * Adds the timer node to the timerqueue, sorted by the
+ * node's expires value.
+ */
+void timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
+{
+	struct rb_node **p = &head->head.rb_node;
+	struct rb_node *parent = NULL;
+	struct timerqueue_node *ptr;
+
+	/* Make sure we don't add nodes that are already added */
+	WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node));
+
+	while (*p) {
+		parent = *p;
+		ptr = rb_entry(parent, struct timerqueue_node, node);
+		if (node->expires.tv64 < ptr->expires.tv64)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&node->node, parent, p);
+	rb_insert_color(&node->node, &head->head);
+
+	if (!head->next || node->expires.tv64 < head->next->expires.tv64)
+		head->next = node;
+}
+EXPORT_SYMBOL_GPL(timerqueue_add);
+
+/**
+ * timerqueue_del - Removes a timer from the timerqueue.
+ *
+ * @head: head of timerqueue
+ * @node: timer node to be removed
+ *
+ * Removes the timer node from the timerqueue.
+ */
+void timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
+{
+	WARN_ON_ONCE(RB_EMPTY_NODE(&node->node));
+
+	/* update next pointer */
+	if (head->next == node) {
+		struct rb_node *rbn = rb_next(&node->node);
+
+		head->next = rbn ?
+			rb_entry(rbn, struct timerqueue_node, node) : NULL;
+	}
+	rb_erase(&node->node, &head->head);
+	RB_CLEAR_NODE(&node->node);
+}
+EXPORT_SYMBOL_GPL(timerqueue_del);
+
+/**
+ * timerqueue_iterate_next - Returns the timer after the provided timer
+ *
+ * @node: Pointer to a timer.
+ *
+ * Provides the timer that is after the given node. This is used, when
+ * necessary, to iterate through the list of timers in a timer list
+ * without modifying the list.
+ */
+struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node)
+{
+	struct rb_node *next;
+
+	if (!node)
+		return NULL;
+	next = rb_next(&node->node);
+	if (!next)
+		return NULL;
+	return container_of(next, struct timerqueue_node, node);
+}
+EXPORT_SYMBOL_GPL(timerqueue_iterate_next);
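
The one non-obvious design point in the new timerqueue library is the cached head->next pointer, which keeps the hot "what fires first?" query O(1) while the rbtree handles ordering; timerqueue_add() and timerqueue_del() each pay a small fixup cost to keep that cache coherent. A userspace analogue of the same caching discipline, with a sorted singly-linked list standing in for the rbtree:

    #include <stdio.h>
    #include <stddef.h>

    struct tq_node {
        long long expires;
        struct tq_node *link;
    };

    struct tq_head {
        struct tq_node *list;   /* all nodes, sorted by expires */
        struct tq_node *next;   /* earliest expiring node, cached */
    };

    static void tq_add(struct tq_head *head, struct tq_node *node)
    {
        struct tq_node **p = &head->list;

        while (*p && (*p)->expires <= node->expires)
            p = &(*p)->link;
        node->link = *p;
        *p = node;

        /* keep the cache coherent, as timerqueue_add() does */
        if (!head->next || node->expires < head->next->expires)
            head->next = node;
    }

    static void tq_del(struct tq_head *head, struct tq_node *node)
    {
        struct tq_node **p = &head->list;

        while (*p && *p != node)
            p = &(*p)->link;
        if (*p)
            *p = node->link;
        if (head->next == node)     /* advance the cache past the removal */
            head->next = head->list;
    }

    int main(void)
    {
        struct tq_head head = { NULL, NULL };
        struct tq_node a = { 30, NULL }, b = { 10, NULL }, c = { 20, NULL };

        tq_add(&head, &a);
        tq_add(&head, &b);
        tq_add(&head, &c);
        printf("next expires at %lld\n", head.next->expires);   /* 10 */
        tq_del(&head, &b);
        printf("next expires at %lld\n", head.next->expires);   /* 20 */
        return 0;
    }
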
diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index 39580a5dc5df..9f85012acf0d 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -155,6 +155,8 @@ use strict;
 #	'@parameter' - name of a parameter
 #	'%CONST' - name of a constant.

+## init lots of data
+
 my $errors = 0;
 my $warnings = 0;
 my $anon_struct_union = 0;
@@ -218,21 +220,14 @@ my %highlights_list = ( $type_constant, "\$1",
 			$type_param, "\$1" );
 my $blankline_list = "";

-sub usage {
-    print "Usage: $0 [ -v ] [ -docbook | -html | -text | -man | -list ]\n";
-    print "         [ -no-doc-sections ]\n";
-    print "         [ -function funcname [ -function funcname ...] ]\n";
-    print "         [ -nofunction funcname [ -nofunction funcname ...] ]\n";
-    print "         c source file(s) > outputfile\n";
-    print "         -v : verbose output, more warnings & other info listed\n";
-    exit 1;
-}
-
 # read arguments
 if ($#ARGV == -1) {
     usage();
 }

+my $kernelversion;
+my $dohighlight = "";
+
 my $verbose = 0;
 my $output_mode = "man";
 my $no_doc_sections = 0;
@@ -245,7 +240,7 @@ my $man_date = ('January', 'February', 'March', 'April', 'May', 'June',
 	      'November', 'December')[(localtime)[4]] .
 	  " " . ((localtime)[5]+1900);

-# Essentially these are globals
+# Essentially these are globals.
 # They probably want to be tidied up, made more localised or something.
 # CAVEAT EMPTOR! Some of the others I localised may not want to be, which
 # could cause "use of undefined value" or other bugs.
@@ -353,6 +348,18 @@ while ($ARGV[0] =~ m/^-(.*)/) {
     }
 }

+# continue execution near EOF;
+
+sub usage {
+    print "Usage: $0 [ -v ] [ -docbook | -html | -text | -man | -list ]\n";
+    print "         [ -no-doc-sections ]\n";
+    print "         [ -function funcname [ -function funcname ...] ]\n";
+    print "         [ -nofunction funcname [ -nofunction funcname ...] ]\n";
+    print "         c source file(s) > outputfile\n";
+    print "         -v : verbose output, more warnings & other info listed\n";
+    exit 1;
+}
+
 # get kernel version from env
 sub get_kernel_version() {
     my $version = 'unknown kernel version';
@@ -362,15 +369,6 @@ sub get_kernel_version() {
     }
     return $version;
 }
-my $kernelversion = get_kernel_version();
-
-# generate a sequence of code that will splice in highlighting information
-# using the s// operator.
-my $dohighlight = "";
-foreach my $pattern (keys %highlights) {
-#   print STDERR "scanning pattern:$pattern, highlight:($highlights{$pattern})\n";
-    $dohighlight .=  "\$contents =~ s:$pattern:$highlights{$pattern}:gs;\n";
-}
-

 ##
 # dumps section contents to arrays/hashes intended for that purpose.
@@ -1851,34 +1849,6 @@ sub dump_function($$) {
 	});
 }

-sub process_file($);
-
-# Read the file that maps relative names to absolute names for
-# separate source and object directories and for shadow trees.
-if (open(SOURCE_MAP, "<.tmp_filelist.txt")) {
-	my ($relname, $absname);
-	while(<SOURCE_MAP>) {
-		chop();
-		($relname, $absname) = (split())[0..1];
-		$relname =~ s:^/+::;
-		$source_map{$relname} = $absname;
-	}
-	close(SOURCE_MAP);
-}
-
-foreach (@ARGV) {
-    chomp;
-    process_file($_);
-}
-if ($verbose && $errors) {
-  print STDERR "$errors errors\n";
-}
-if ($verbose && $warnings) {
-  print STDERR "$warnings warnings\n";
-}
-
-exit($errors);
-
 sub reset_state {
     $function = "";
     %constants = ();
@@ -2285,3 +2255,39 @@ sub process_file($) {
 	    }
 	}
 }
+
+
+$kernelversion = get_kernel_version();
+
+# generate a sequence of code that will splice in highlighting information
+# using the s// operator.
+foreach my $pattern (keys %highlights) {
+#   print STDERR "scanning pattern:$pattern, highlight:($highlights{$pattern})\n";
+    $dohighlight .=  "\$contents =~ s:$pattern:$highlights{$pattern}:gs;\n";
+}
+
+# Read the file that maps relative names to absolute names for
+# separate source and object directories and for shadow trees.
+if (open(SOURCE_MAP, "<.tmp_filelist.txt")) {
+	my ($relname, $absname);
+	while(<SOURCE_MAP>) {
+		chop();
+		($relname, $absname) = (split())[0..1];
+		$relname =~ s:^/+::;
+		$source_map{$relname} = $absname;
+	}
+	close(SOURCE_MAP);
+}
+
+foreach (@ARGV) {
+    chomp;
+    process_file($_);
+}
+if ($verbose && $errors) {
+  print STDERR "$errors errors\n";
+}
+if ($verbose && $warnings) {
+  print STDERR "$warnings warnings\n";
+}
+
+exit($errors);