author		Ingo Molnar <mingo@elte.hu>	2011-01-09 04:42:21 -0500
committer	Ingo Molnar <mingo@elte.hu>	2011-01-09 04:42:21 -0500
commit		4385428a477559b26736cc3c80d8b68f31126c71 (patch)
tree		8eb0cbc78e79c368687fa13a1e0674ae537f830f
parent		047a3772feaae8e43d81d790f3d3f80dae8ae676 (diff)
parent		2d75af2f2a7a6103a6d539a492fe81deacabde44 (diff)
Merge branch 'tip/perf/core' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-2.6-trace into perf/urgent
-rw-r--r--	Documentation/RCU/trace.txt | 144
-rw-r--r--	Documentation/dontdiff | 26
-rw-r--r--	Documentation/kernel-docs.txt | 27
-rw-r--r--	Documentation/kernel-parameters.txt | 11
-rw-r--r--	Documentation/x86/boot.txt | 1
-rw-r--r--	MAINTAINERS | 16
-rw-r--r--	arch/Kconfig | 3
-rw-r--r--	arch/s390/Kconfig | 1
-rw-r--r--	arch/s390/include/asm/mutex.h | 2
-rw-r--r--	arch/x86/Kconfig | 41
-rw-r--r--	arch/x86/Kconfig.debug | 11
-rw-r--r--	arch/x86/boot/compressed/head_64.S | 2
-rw-r--r--	arch/x86/include/asm/alternative.h | 1
-rw-r--r--	arch/x86/include/asm/amd_nb.h | 49
-rw-r--r--	arch/x86/include/asm/apic.h | 1
-rw-r--r--	arch/x86/include/asm/apicdef.h | 1
-rw-r--r--	arch/x86/include/asm/bootparam.h | 1
-rw-r--r--	arch/x86/include/asm/fixmap.h | 4
-rw-r--r--	arch/x86/include/asm/i387.h | 24
-rw-r--r--	arch/x86/include/asm/io_apic.h | 8
-rw-r--r--	arch/x86/include/asm/mce.h | 3
-rw-r--r--	arch/x86/include/asm/microcode.h | 6
-rw-r--r--	arch/x86/include/asm/mpspec.h | 31
-rw-r--r--	arch/x86/include/asm/mpspec_def.h | 7
-rw-r--r--	arch/x86/include/asm/mrst-vrtc.h | 9
-rw-r--r--	arch/x86/include/asm/mrst.h | 14
-rw-r--r--	arch/x86/include/asm/msr-index.h | 12
-rw-r--r--	arch/x86/include/asm/paravirt.h | 2
-rw-r--r--	arch/x86/include/asm/pci.h | 1
-rw-r--r--	arch/x86/include/asm/setup.h | 6
-rw-r--r--	arch/x86/include/asm/uv/uv_bau.h | 9
-rw-r--r--	arch/x86/kernel/Makefile | 1
-rw-r--r--	arch/x86/kernel/acpi/boot.c | 11
-rw-r--r--	arch/x86/kernel/alternative.c | 3
-rw-r--r--	arch/x86/kernel/amd_nb.c | 135
-rw-r--r--	arch/x86/kernel/apb_timer.c | 1
-rw-r--r--	arch/x86/kernel/aperture_64.c | 10
-rw-r--r--	arch/x86/kernel/apic/apic.c | 104
-rw-r--r--	arch/x86/kernel/apic/io_apic.c | 37
-rw-r--r--	arch/x86/kernel/apic/x2apic_uv_x.c | 61
-rw-r--r--	arch/x86/kernel/cpu/intel_cacheinfo.c | 147
-rw-r--r--	arch/x86/kernel/cpu/mcheck/mce_amd.c | 135
-rw-r--r--	arch/x86/kernel/cpu/mcheck/therm_throt.c | 40
-rw-r--r--	arch/x86/kernel/early_printk.c | 3
-rw-r--r--	arch/x86/kernel/ftrace.c | 3
-rw-r--r--	arch/x86/kernel/head32.c | 3
-rw-r--r--	arch/x86/kernel/head_32.S | 83
-rw-r--r--	arch/x86/kernel/microcode_amd.c | 34
-rw-r--r--	arch/x86/kernel/pci-gart_64.c | 34
-rw-r--r--	arch/x86/kernel/reboot_fixups_32.c | 16
-rw-r--r--	arch/x86/kernel/setup.c | 13
-rw-r--r--	arch/x86/kernel/smpboot.c | 14
-rw-r--r--	arch/x86/kernel/trampoline_64.S | 2
-rw-r--r--	arch/x86/kernel/tsc.c | 96
-rw-r--r--	arch/x86/kernel/verify_cpu.S (renamed from arch/x86/kernel/verify_cpu_64.S) | 49
-rw-r--r--	arch/x86/kernel/vmlinux.lds.S | 8
-rw-r--r--	arch/x86/lguest/i386_head.S | 105
-rw-r--r--	arch/x86/mm/Makefile | 2
-rw-r--r--	arch/x86/mm/amdtopology_64.c (renamed from arch/x86/mm/k8topology_64.c) | 12
-rw-r--r--	arch/x86/mm/init.c | 3
-rw-r--r--	arch/x86/mm/init_32.c | 20
-rw-r--r--	arch/x86/mm/numa_64.c | 22
-rw-r--r--	arch/x86/mm/pageattr.c | 33
-rw-r--r--	arch/x86/mm/setup_nx.c | 2
-rw-r--r--	arch/x86/mm/srat_32.c | 1
-rw-r--r--	arch/x86/mm/srat_64.c | 10
-rw-r--r--	arch/x86/oprofile/op_model_amd.c | 1
-rw-r--r--	arch/x86/pci/Makefile | 1
-rw-r--r--	arch/x86/pci/ce4100.c | 315
-rw-r--r--	arch/x86/pci/pcbios.c | 23
-rw-r--r--	arch/x86/platform/Makefile | 2
-rw-r--r--	arch/x86/platform/ce4100/Makefile | 1
-rw-r--r--	arch/x86/platform/ce4100/ce4100.c | 132
-rw-r--r--	arch/x86/platform/iris/Makefile | 1
-rw-r--r--	arch/x86/platform/iris/iris.c | 91
-rw-r--r--	arch/x86/platform/mrst/Makefile | 2
-rw-r--r--	arch/x86/platform/mrst/early_printk_mrst.c (renamed from arch/x86/kernel/early_printk_mrst.c) | 0
-rw-r--r--	arch/x86/platform/mrst/mrst.c | 546
-rw-r--r--	arch/x86/platform/mrst/vrtc.c | 165
-rw-r--r--	arch/x86/platform/sfi/sfi.c | 4
-rw-r--r--	arch/x86/platform/uv/tlb_uv.c | 22
-rw-r--r--	arch/x86/platform/visws/visws_quirks.c | 2
-rw-r--r--	drivers/acpi/numa.c | 14
-rw-r--r--	drivers/char/agp/amd64-agp.c | 33
-rw-r--r--	drivers/edac/amd64_edac.c | 4
-rw-r--r--	drivers/platform/x86/intel_scu_ipc.c | 5
-rw-r--r--	drivers/rtc/Kconfig | 12
-rw-r--r--	drivers/rtc/Makefile | 1
-rw-r--r--	drivers/rtc/rtc-mrst.c | 582
-rw-r--r--	fs/gfs2/bmap.c | 11
-rw-r--r--	fs/gfs2/glock.c | 71
-rw-r--r--	fs/gfs2/glock.h | 28
-rw-r--r--	fs/gfs2/glops.c | 1
-rw-r--r--	fs/gfs2/incore.h | 12
-rw-r--r--	fs/gfs2/inode.c | 9
-rw-r--r--	fs/gfs2/lock_dlm.c | 15
-rw-r--r--	fs/gfs2/ops_inode.c | 18
-rw-r--r--	fs/gfs2/quota.c | 13
-rw-r--r--	fs/gfs2/rgrp.c | 57
-rw-r--r--	fs/gfs2/rgrp.h | 1
-rw-r--r--	fs/gfs2/xattr.c | 23
-rw-r--r--	fs/proc/base.c | 79
-rw-r--r--	include/linux/completion.h | 8
-rw-r--r--	include/linux/dynamic_debug.h | 18
-rw-r--r--	include/linux/hrtimer.h | 33
-rw-r--r--	include/linux/init_task.h | 18
-rw-r--r--	include/linux/interrupt.h | 6
-rw-r--r--	include/linux/module.h | 11
-rw-r--r--	include/linux/mutex.h | 4
-rw-r--r--	include/linux/rculist.h | 5
-rw-r--r--	include/linux/rcupdate.h | 4
-rw-r--r--	include/linux/rcutiny.h | 13
-rw-r--r--	include/linux/rcutree.h | 2
-rw-r--r--	include/linux/sched.h | 47
-rw-r--r--	include/linux/sfi.h | 8
-rw-r--r--	include/linux/timer.h | 32
-rw-r--r--	include/linux/timerqueue.h | 50
-rw-r--r--	include/linux/tracepoint.h | 4
-rw-r--r--	include/linux/workqueue.h | 8
-rw-r--r--	include/trace/define_trace.h | 10
-rw-r--r--	include/trace/events/skb.h | 4
-rw-r--r--	init/Kconfig | 68
-rw-r--r--	kernel/Makefile | 1
-rw-r--r--	kernel/cpu.c | 29
-rw-r--r--	kernel/fork.c | 7
-rw-r--r--	kernel/futex.c | 235
-rw-r--r--	kernel/hrtimer.c | 83
-rw-r--r--	kernel/irq/manage.c | 4
-rw-r--r--	kernel/kthread.c | 2
-rw-r--r--	kernel/lockdep_proc.c | 16
-rw-r--r--	kernel/module.c | 171
-rw-r--r--	kernel/mutex.c | 2
-rw-r--r--	kernel/posix-timers.c | 10
-rw-r--r--	kernel/printk.c | 8
-rw-r--r--	kernel/rcutiny.c | 105
-rw-r--r--	kernel/rcutiny_plugin.h | 433
-rw-r--r--	kernel/rcutorture.c | 270
-rw-r--r--	kernel/rcutree.c | 156
-rw-r--r--	kernel/rcutree.h | 61
-rw-r--r--	kernel/rcutree_plugin.h | 135
-rw-r--r--	kernel/rcutree_trace.c | 12
-rw-r--r--	kernel/sched.c | 638
-rw-r--r--	kernel/sched_autogroup.c | 238
-rw-r--r--	kernel/sched_autogroup.h | 32
-rw-r--r--	kernel/sched_clock.c | 2
-rw-r--r--	kernel/sched_debug.c | 91
-rw-r--r--	kernel/sched_fair.c | 322
-rw-r--r--	kernel/sched_features.h | 2
-rw-r--r--	kernel/sched_rt.c | 24
-rw-r--r--	kernel/softirq.c | 4
-rw-r--r--	kernel/srcu.c | 8
-rw-r--r--	kernel/sys.c | 4
-rw-r--r--	kernel/sysctl.c | 37
-rw-r--r--	kernel/time/timecompare.c | 5
-rw-r--r--	kernel/time/timekeeping.c | 9
-rw-r--r--	kernel/time/timer_list.c | 8
-rw-r--r--	kernel/timer.c | 50
-rw-r--r--	kernel/trace/Makefile | 2
-rw-r--r--	kernel/trace/trace.c | 6
-rw-r--r--	kernel/trace/trace_selftest.c | 2
-rw-r--r--	kernel/watchdog.c | 2
-rw-r--r--	lib/Makefile | 2
-rw-r--r--	lib/dynamic_debug.c | 9
-rw-r--r--	lib/timerqueue.c | 107
-rwxr-xr-x	scripts/kernel-doc | 102
165 files changed, 5763 insertions(+), 2060 deletions(-)
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index a851118775d8..6a8c73f55b80 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -1,18 +1,22 @@
 CONFIG_RCU_TRACE debugfs Files and Formats


-The rcutree implementation of RCU provides debugfs trace output that
-summarizes counters and state.  This information is useful for debugging
-RCU itself, and can sometimes also help to debug abuses of RCU.
-The following sections describe the debugfs files and formats.
+The rcutree and rcutiny implementations of RCU provide debugfs trace
+output that summarizes counters and state.  This information is useful for
+debugging RCU itself, and can sometimes also help to debug abuses of RCU.
+The following sections describe the debugfs files and formats, first
+for rcutree and next for rcutiny.


-Hierarchical RCU debugfs Files and Formats
+CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats

-This implementation of RCU provides three debugfs files under the
+These implementations of RCU provide five debugfs files under the
 top-level directory RCU: rcu/rcudata (which displays fields in struct
-rcu_data), rcu/rcugp (which displays grace-period counters), and
-rcu/rcuhier (which displays the struct rcu_node hierarchy).
+rcu_data), rcu/rcudata.csv (which is a .csv spreadsheet version of
+rcu/rcudata), rcu/rcugp (which displays grace-period counters),
+rcu/rcuhier (which displays the struct rcu_node hierarchy), and
+rcu/rcu_pending (which displays counts of the reasons that the
+rcu_pending() function decided that there was core RCU work to do).

 The output of "cat rcu/rcudata" looks as follows:

@@ -130,7 +134,8 @@ o "ci" is the number of RCU callbacks that have been invoked for
	been registered in absence of CPU-hotplug activity.

 o "co" is the number of RCU callbacks that have been orphaned due to
-	this CPU going offline.
+	this CPU going offline.  These orphaned callbacks have been moved
+	to an arbitrarily chosen online CPU.

 o "ca" is the number of RCU callbacks that have been adopted due to
	other CPUs going offline.  Note that ci+co-ca+ql is the number of
@@ -168,12 +173,12 @@ o "gpnum" is the number of grace periods that have started.  It is

 The output of "cat rcu/rcuhier" looks as follows, with very long lines:

-c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 oqlen=0
+c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6
 1/1 .>. 0:127 ^0
 3/3 .>. 0:35 ^0    0/0 .>. 36:71 ^1    0/0 .>. 72:107 ^2    0/0 .>. 108:127 ^3
 3/3f .>. 0:5 ^0    2/3 .>. 6:11 ^1    0/0 .>. 12:17 ^2    0/0 .>. 18:23 ^3    0/0 .>. 24:29 ^4    0/0 .>. 30:35 ^5    0/0 .>. 36:41 ^0    0/0 .>. 42:47 ^1    0/0 .>. 48:53 ^2    0/0 .>. 54:59 ^3    0/0 .>. 60:65 ^4    0/0 .>. 66:71 ^5    0/0 .>. 72:77 ^0    0/0 .>. 78:83 ^1    0/0 .>. 84:89 ^2    0/0 .>. 90:95 ^3    0/0 .>. 96:101 ^4    0/0 .>. 102:107 ^5    0/0 .>. 108:113 ^0    0/0 .>. 114:119 ^1    0/0 .>. 120:125 ^2    0/0 .>. 126:127 ^3
 rcu_bh:
-c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 oqlen=0
+c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0
 0/1 .>. 0:127 ^0
 0/3 .>. 0:35 ^0    0/0 .>. 36:71 ^1    0/0 .>. 72:107 ^2    0/0 .>. 108:127 ^3
 0/3f .>. 0:5 ^0    0/3 .>. 6:11 ^1    0/0 .>. 12:17 ^2    0/0 .>. 18:23 ^3    0/0 .>. 24:29 ^4    0/0 .>. 30:35 ^5    0/0 .>. 36:41 ^0    0/0 .>. 42:47 ^1    0/0 .>. 48:53 ^2    0/0 .>. 54:59 ^3    0/0 .>. 60:65 ^4    0/0 .>. 66:71 ^5    0/0 .>. 72:77 ^0    0/0 .>. 78:83 ^1    0/0 .>. 84:89 ^2    0/0 .>. 90:95 ^3    0/0 .>. 96:101 ^4    0/0 .>. 102:107 ^5    0/0 .>. 108:113 ^0    0/0 .>. 114:119 ^1    0/0 .>. 120:125 ^2    0/0 .>. 126:127 ^3
@@ -212,11 +217,6 @@ o "fqlh" is the number of calls to force_quiescent_state() that
	exited immediately (without even being counted in nfqs above)
	due to contention on ->fqslock.

-o "oqlen" is the number of callbacks on the "orphan" callback
-	list.  RCU callbacks are placed on this list by CPUs going
-	offline, and are "adopted" either by the CPU helping the outgoing
-	CPU or by the next rcu_barrier*() call, whichever comes first.
-
 o Each element of the form "1/1 0:127 ^0" represents one struct
	rcu_node.  Each line represents one level of the hierarchy, from
	root to leaves.  It is best to think of the rcu_data structures
@@ -326,3 +326,115 @@ o "nn" is the number of times that this CPU needed nothing.  Alert
	readers will note that the rcu "nn" number for a given CPU very
	closely matches the rcu_bh "np" number for that same CPU.  This
	is due to short-circuit evaluation in rcu_pending().
+
+
+CONFIG_TINY_RCU and CONFIG_TINY_PREEMPT_RCU debugfs Files and Formats
+
+These implementations of RCU provide a single debugfs file under the
+top-level directory RCU, namely rcu/rcudata, which displays fields in
+rcu_bh_ctrlblk, rcu_sched_ctrlblk and, for CONFIG_TINY_PREEMPT_RCU,
+rcu_preempt_ctrlblk.
+
+The output of "cat rcu/rcudata" is as follows:
+
+rcu_preempt: qlen=24 gp=1097669 g197/p197/c197 tasks=...
+             ttb=. btg=no ntb=184 neb=0 nnb=183 j=01f7 bt=0274
+             normal balk: nt=1097669 gt=0 bt=371 b=0 ny=25073378 nos=0
+             exp balk: bt=0 nos=0
+rcu_sched: qlen: 0
+rcu_bh: qlen: 0
+
+This is split into rcu_preempt, rcu_sched, and rcu_bh sections, with the
+rcu_preempt section appearing only in CONFIG_TINY_PREEMPT_RCU builds.
+The last three lines of the rcu_preempt section appear only in
+CONFIG_RCU_BOOST kernel builds.  The fields are as follows:
+
+o	"qlen" is the number of RCU callbacks currently waiting either
+	for an RCU grace period or waiting to be invoked.  This is the
+	only field present for rcu_sched and rcu_bh, due to the
+	short-circuiting of grace period in those two cases.
+
+o	"gp" is the number of grace periods that have completed.
+
+o	"g197/p197/c197" displays the grace-period state, with the
+	"g" number being the number of grace periods that have started
+	(mod 256), the "p" number being the number of grace periods
+	that the CPU has responded to (also mod 256), and the "c"
+	number being the number of grace periods that have completed
+	(once again mod 256).
+
+	Why have both "gp" and "g"?  Because the data flowing into
+	"gp" is only present in a CONFIG_RCU_TRACE kernel.
+
+o	"tasks" is a set of bits.  The first bit is "T" if there are
+	currently tasks that have recently blocked within an RCU
+	read-side critical section, the second bit is "N" if any of the
+	aforementioned tasks are blocking the current RCU grace period,
+	and the third bit is "E" if any of the aforementioned tasks are
+	blocking the current expedited grace period.  Each bit is "."
+	if the corresponding condition does not hold.
+
+o	"ttb" is a single bit.  It is "B" if any of the blocked tasks
+	need to be priority boosted and "." otherwise.
+
+o	"btg" indicates whether boosting has been carried out during
+	the current grace period, with "exp" indicating that boosting
+	is in progress for an expedited grace period, "no" indicating
+	that boosting has not yet started for a normal grace period,
+	"begun" indicating that boosting has begun for a normal grace
+	period, and "done" indicating that boosting has completed for
+	a normal grace period.
+
+o	"ntb" is the total number of tasks subjected to RCU priority boosting
+	periods since boot.
+
+o	"neb" is the number of expedited grace periods that have had
+	to resort to RCU priority boosting since boot.
+
+o	"nnb" is the number of normal grace periods that have had
+	to resort to RCU priority boosting since boot.
+
+o	"j" is the low-order 12 bits of the jiffies counter in hexadecimal.
+
+o	"bt" is the low-order 12 bits of the value that the jiffies counter
+	will have at the next time that boosting is scheduled to begin.
+
+o	In the line beginning with "normal balk", the fields are as follows:
+
+	o	"nt" is the number of times that the system balked from
+		boosting because there were no blocked tasks to boost.
+		Note that the system will balk from boosting even if the
+		grace period is overdue when the currently running task
+		is looping within an RCU read-side critical section.
+		There is no point in boosting in this case, because
+		boosting a running task won't make it run any faster.
+
+	o	"gt" is the number of times that the system balked
+		from boosting because, although there were blocked tasks,
+		none of them were preventing the current grace period
+		from completing.
+
+	o	"bt" is the number of times that the system balked
+		from boosting because boosting was already in progress.
+
+	o	"b" is the number of times that the system balked from
+		boosting because boosting had already completed for
+		the grace period in question.
+
+	o	"ny" is the number of times that the system balked from
+		boosting because it was not yet time to start boosting
+		the grace period in question.
+
+	o	"nos" is the number of times that the system balked from
+		boosting for inexplicable ("not otherwise specified")
+		reasons.  This can actually happen due to races involving
+		increments of the jiffies counter.
+
+o	In the line beginning with "exp balk", the fields are as follows:
+
+	o	"bt" is the number of times that the system balked from
+		boosting because there were no blocked tasks to boost.
+
+	o	"nos" is the number of times that the system balked from
+		boosting for inexplicable ("not otherwise specified")
+		reasons.
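The rcutiny rcu/rcudata file above is plain text meant for "cat". For completeness, a minimal userspace sketch of the same read in C, assuming debugfs is mounted at the conventional /sys/kernel/debug location:

#include <stdio.h>

/* Minimal sketch: dump the TINY_RCU counters described above. */
int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/rcu/rcudata", "r");
	char buf[4096];
	size_t n;

	if (!f) {
		perror("rcu/rcudata");	/* CONFIG_RCU_TRACE may be off */
		return 1;
	}
	n = fread(buf, 1, sizeof(buf) - 1, f);
	buf[n] = '\0';
	fputs(buf, stdout);		/* e.g. "rcu_sched: qlen: 0" */
	fclose(f);
	return 0;
}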
diff --git a/Documentation/dontdiff b/Documentation/dontdiff
index d9bcffd59433..470d3dba1a69 100644
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@@ -62,6 +62,10 @@ aic7*reg_print.c*
 aic7*seq.h*
 aicasm
 aicdb.h*
+altivec1.c
+altivec2.c
+altivec4.c
+altivec8.c
 asm-offsets.h
 asm_offsets.h
 autoconf.h*
@@ -76,6 +80,7 @@ btfixupprep
 build
 bvmlinux
 bzImage*
+capflags.c
 classlist.h*
 comp*.log
 compile.h*
@@ -94,6 +99,7 @@ devlist.h*
 docproc
 elf2ecoff
 elfconfig.h*
+evergreen_reg_safe.h
 fixdep
 flask.h
 fore200e_mkfirm
@@ -108,9 +114,16 @@ genksyms
 *_gray256.c
 ihex2fw
 ikconfig.h*
+inat-tables.c
 initramfs_data.cpio
 initramfs_data.cpio.gz
 initramfs_list
+int16.c
+int1.c
+int2.c
+int32.c
+int4.c
+int8.c
 kallsyms
 kconfig
 keywords.c
@@ -140,6 +153,7 @@ mkprep
 mktables
 mktree
 modpost
+modules.builtin
 modules.order
 modversions.h*
 ncscope.*
@@ -153,14 +167,23 @@ pca200e.bin
 pca200e_ecd.bin2
 piggy.gz
 piggyback
+piggy.S
 pnmtologo
 ppc_defs.h*
 pss_boot.h
 qconf
+r100_reg_safe.h
+r200_reg_safe.h
+r300_reg_safe.h
+r420_reg_safe.h
+r600_reg_safe.h
 raid6altivec*.c
 raid6int*.c
 raid6tables.c
 relocs
+rn50_reg_safe.h
+rs600_reg_safe.h
+rv515_reg_safe.h
 series
 setup
 setup.bin
@@ -169,6 +192,7 @@ sImage
 sm_tbl*
 split-include
 syscalltab.h
+tables.c
 tags
 tftpboot.img
 timeconst.h
@@ -190,6 +214,7 @@ vmlinux
 vmlinux-*
 vmlinux.aout
 vmlinux.lds
+voffset.h
 vsyscall.lds
 vsyscall_32.lds
 wanxlfw.inc
@@ -200,3 +225,4 @@ wakeup.elf
 wakeup.lds
 zImage*
 zconf.hash.c
+zoffset.h
diff --git a/Documentation/kernel-docs.txt b/Documentation/kernel-docs.txt
index 715eaaf1519d..9a8674629a07 100644
--- a/Documentation/kernel-docs.txt
+++ b/Documentation/kernel-docs.txt
@@ -537,7 +537,7 @@
       Notes: Further information in
       http://www.oreilly.com/catalog/linuxdrive2/

-     * Title: "Linux Device Drivers, 3nd Edition"
+     * Title: "Linux Device Drivers, 3rd Edition"
       Authors: Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman
       Publisher: O'Reilly & Associates.
       Date: 2005.
@@ -592,14 +592,6 @@
       Pages: 600.
       ISBN: 0-13-101908-2

-     * Title: "The Design and Implementation of the 4.4 BSD UNIX
-       Operating System"
-       Author: Marshall Kirk McKusick, Keith Bostic, Michael J. Karels,
-       John S. Quarterman.
-       Publisher: Addison-Wesley.
-       Date: 1996.
-       ISBN: 0-201-54979-4
-
      * Title: "Programming for the real world - POSIX.4"
       Author: Bill O. Gallmeister.
       Publisher: O'Reilly & Associates, Inc..
@@ -610,28 +602,13 @@
       POSIX.  Good reference.

      * Title: "UNIX Systems for Modern Architectures: Symmetric
-       Multiprocesssing and Caching for Kernel Programmers"
+       Multiprocessing and Caching for Kernel Programmers"
       Author: Curt Schimmel.
       Publisher: Addison Wesley.
       Date: June, 1994.
       Pages: 432.
       ISBN: 0-201-63338-8

-     * Title: "The Design and Implementation of the 4.3 BSD UNIX
-       Operating System"
-       Author: Samuel J. Leffler, Marshall Kirk McKusick, Michael J.
-       Karels, John S. Quarterman.
-       Publisher: Addison-Wesley.
-       Date: 1989 (reprinted with corrections on October, 1990).
-       ISBN: 0-201-06196-1
-
-     * Title: "The Design of the UNIX Operating System"
-       Author: Maurice J. Bach.
-       Publisher: Prentice Hall.
-       Date: 1986.
-       Pages: 471.
-       ISBN: 0-13-201757-1
-
      MISCELLANEOUS:

      * Name: linux/Documentation
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 992cda68fa63..f3dc951e949f 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1614,6 +1614,8 @@ and is between 256 and 4096 characters. It is defined in the file
	noapic		[SMP,APIC] Tells the kernel to not make use of any
			IOAPICs that may be present in the system.

+	noautogroup	Disable scheduler automatic task group creation.
+
	nobats		[PPC] Do not use BATs for mapping kernel lowmem
			on "Classic" PPC cores.

@@ -2459,12 +2461,13 @@ and is between 256 and 4096 characters. It is defined in the file
			to facilitate early boot debugging.
			See also Documentation/trace/events.txt

-	tsc=		Disable clocksource-must-verify flag for TSC.
+	tsc=		Disable clocksource stability checks for TSC.
			Format: <string>
			[x86] reliable: mark tsc clocksource as reliable, this
-			disables clocksource verification at runtime.
-			Used to enable high-resolution timer mode on older
-			hardware, and in virtualized environment.
+			disables clocksource verification at runtime, as well
+			as the stability checks done at bootup.  Used to enable
+			high-resolution timer mode on older hardware, and in
+			virtualized environment.
			[x86] noirqtime: Do not use TSC to do irq accounting.
			Used to run time disable IRQ_TIME_ACCOUNTING on any
			platforms where RDTSC is slow and this accounting
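As a concrete example of the reliable flag, a guest whose hypervisor guarantees a stable TSC can force high-resolution timer mode on by appending the option to its kernel command line (the surrounding options here are placeholders):

	root=/dev/sda1 ro quiet tsc=reliable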
diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt
index 30b43e1b2697..bdeb81ccb5f6 100644
--- a/Documentation/x86/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -600,6 +600,7 @@ Protocol:	2.07+
   0x00000001	lguest
   0x00000002	Xen
   0x00000003	Moorestown MID
+  0x00000004	CE4100 TV Platform

 Field name:	hardware_subarch_data
 Type:		write (subarch-dependent)
diff --git a/MAINTAINERS b/MAINTAINERS
index b1dda78a1e75..c5c7292daba0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2812,6 +2812,10 @@ M:	Thomas Gleixner <tglx@linutronix.de>
 S:	Maintained
 F:	Documentation/timers/
 F:	kernel/hrtimer.c
+F:	kernel/time/clockevents.c
+F:	kernel/time/tick*.*
+F:	kernel/time/timer_*.c
+F:	include/linux/clockevents.h
 F:	include/linux/hrtimer.h

 HIGH-SPEED SCC DRIVER FOR AX.25
@@ -5142,6 +5146,18 @@ L:	alsa-devel@alsa-project.org (moderated for non-subscribers)
 S:	Supported
 F:	sound/soc/s3c24xx

+TIMEKEEPING, NTP
+M:	John Stultz <johnstul@us.ibm.com>
+M:	Thomas Gleixner <tglx@linutronix.de>
+S:	Supported
+F:	include/linux/clocksource.h
+F:	include/linux/time.h
+F:	include/linux/timex.h
+F:	include/linux/timekeeping.h
+F:	kernel/time/clocksource.c
+F:	kernel/time/time*.c
+F:	kernel/time/ntp.c
+
 TLG2300 VIDEO4LINUX-2 DRIVER
 M:	Huang Shijie <shijie8@gmail.com>
 M:	Kang Yong <kangyong@telegent.com>
diff --git a/arch/Kconfig b/arch/Kconfig
index 8bf0fa652eb6..f78c2be4242b 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -175,4 +175,7 @@ config HAVE_PERF_EVENTS_NMI
 config HAVE_ARCH_JUMP_LABEL
	bool

+config HAVE_ARCH_MUTEX_CPU_RELAX
+	bool
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index e0b98e71ff47..6c6d7b339aae 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -99,6 +99,7 @@ config S390
	select HAVE_KERNEL_LZMA
	select HAVE_KERNEL_LZO
	select HAVE_GET_USER_PAGES_FAST
+	select HAVE_ARCH_MUTEX_CPU_RELAX
	select ARCH_INLINE_SPIN_TRYLOCK
	select ARCH_INLINE_SPIN_TRYLOCK_BH
	select ARCH_INLINE_SPIN_LOCK
diff --git a/arch/s390/include/asm/mutex.h b/arch/s390/include/asm/mutex.h
index 458c1f7fbc18..688271f5f2e4 100644
--- a/arch/s390/include/asm/mutex.h
+++ b/arch/s390/include/asm/mutex.h
@@ -7,3 +7,5 @@
  */

 #include <asm-generic/mutex-dec.h>
+
+#define arch_mutex_cpu_relax()	barrier()
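This override feeds the adaptive mutex spin: kernel/mutex.c calls arch_mutex_cpu_relax() while busy-waiting on the lock owner, so s390 can substitute a plain barrier() for its hypervisor-yielding cpu_relax(). A condensed sketch of the shape of that loop, with owner tracking and lockdep hooks omitted:

/* Condensed from the optimistic-spin loop in kernel/mutex.c.  Returns
 * true if the spin acquired the lock, false if the caller should fall
 * back to sleeping on the wait list. */
static bool mutex_spin_sketch(struct mutex *lock)
{
	for (;;) {
		if (atomic_cmpxchg(&lock->count, 1, 0) == 1)
			return true;		/* acquired the mutex */
		if (need_resched())
			return false;		/* stop spinning, sleep */

		/*
		 * Plain cpu_relax() unless the architecture selects
		 * HAVE_ARCH_MUTEX_CPU_RELAX; s390's cpu_relax() yields
		 * to the hypervisor, far too expensive once per spin
		 * iteration, hence the barrier() override above.
		 */
		arch_mutex_cpu_relax();
	}
}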
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e330da21b84f..b6fccb07123e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -377,6 +377,18 @@ config X86_ELAN

	  If unsure, choose "PC-compatible" instead.

+config X86_INTEL_CE
+	bool "CE4100 TV platform"
+	depends on PCI
+	depends on PCI_GODIRECT
+	depends on X86_32
+	depends on X86_EXTENDED_PLATFORM
+	select X86_REBOOTFIXUPS
+	---help---
+	  Select for the Intel CE media processor (CE4100) SOC.
+	  This option compiles in support for the CE4100 SOC for set-top
+	  boxes and media devices.
+
 config X86_MRST
	bool "Moorestown MID platform"
	depends on PCI
@@ -385,6 +397,10 @@ config X86_MRST
	depends on X86_EXTENDED_PLATFORM
	depends on X86_IO_APIC
	select APB_TIMER
+	select I2C
+	select SPI
+	select INTEL_SCU_IPC
+	select X86_PLATFORM_DEVICES
	---help---
	  Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin
	  Internet Device(MID) platform. Moorestown consists of two chips:
@@ -466,6 +482,19 @@ config X86_ES7000
	  Support for Unisys ES7000 systems. Say 'Y' here if this kernel is
	  supposed to run on an IA32-based Unisys ES7000 system.

+config X86_32_IRIS
+	tristate "Eurobraille/Iris poweroff module"
+	depends on X86_32
+	---help---
+	  The Iris machines from EuroBraille do not have APM or ACPI support
+	  to shut themselves down properly.  A special I/O sequence is
+	  needed to do so, which is what this module does at
+	  kernel shutdown.
+
+	  This is only for Iris machines from EuroBraille.
+
+	  If unused, say N.
+
 config SCHED_OMIT_FRAME_POINTER
	def_bool y
	prompt "Single-depth WCHAN output"
@@ -1141,16 +1170,16 @@ config NUMA
 comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
	depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI)

-config K8_NUMA
+config AMD_NUMA
	def_bool y
	prompt "Old style AMD Opteron NUMA detection"
	depends on X86_64 && NUMA && PCI
	---help---
-	  Enable K8 NUMA node topology detection.  You should say Y here if
-	  you have a multi processor AMD K8 system. This uses an old
-	  method to read the NUMA configuration directly from the builtin
-	  Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
-	  instead, which also takes priority if both are compiled in.
+	  Enable AMD NUMA node topology detection.  You should say Y here if
+	  you have a multi processor AMD system. This uses an old method to
+	  read the NUMA configuration directly from the builtin Northbridge
+	  of Opteron. It is recommended to use X86_64_ACPI_NUMA instead,
+	  which also takes priority if both are compiled in.

 config X86_64_ACPI_NUMA
	def_bool y
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index b59ee765414e..45143bbcfe5e 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -117,6 +117,17 @@ config DEBUG_RODATA_TEST
	  feature as well as for the change_page_attr() infrastructure.
	  If in doubt, say "N"

+config DEBUG_SET_MODULE_RONX
+	bool "Set loadable kernel module data as NX and text as RO"
+	depends on MODULES
+	---help---
+	  This option helps catch unintended modifications to loadable
+	  kernel module's text and read-only data. It also prevents execution
+	  of module data. Such protection may interfere with run-time code
+	  patching and dynamic kernel tracing - and they might also protect
+	  against certain classes of kernel exploits.
+	  If in doubt, say "N".
+
 config DEBUG_NX_TEST
	tristate "Testcase for the NX non-executable stack feature"
	depends on DEBUG_KERNEL && m
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 52f85a196fa0..35af09d13dc1 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -182,7 +182,7 @@ no_longmode:
	hlt
	jmp     1b

-#include "../../kernel/verify_cpu_64.S"
+#include "../../kernel/verify_cpu.S"

	/*
	 * Be careful here startup_64 needs to be at a predictable
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 4a2adaa9aefc..13009d1af99a 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -66,6 +66,7 @@ extern void alternatives_smp_module_add(struct module *mod, char *name,
 extern void alternatives_smp_module_del(struct module *mod);
 extern void alternatives_smp_switch(int smp);
 extern int alternatives_text_reserved(void *start, void *end);
+extern bool skip_smp_alternatives;
 #else
 static inline void alternatives_smp_module_add(struct module *mod, char *name,
					       void *locks, void *locks_end,
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index c8517f81b21e..6aee50d655d1 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -3,36 +3,53 @@

 #include <linux/pci.h>

-extern struct pci_device_id k8_nb_ids[];
+extern struct pci_device_id amd_nb_misc_ids[];
 struct bootnode;

-extern int early_is_k8_nb(u32 value);
-extern int cache_k8_northbridges(void);
-extern void k8_flush_garts(void);
-extern int k8_get_nodes(struct bootnode *nodes);
-extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
-extern int k8_scan_nodes(void);
+extern int early_is_amd_nb(u32 value);
+extern int amd_cache_northbridges(void);
+extern void amd_flush_garts(void);
+extern int amd_get_nodes(struct bootnode *nodes);
+extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn);
+extern int amd_scan_nodes(void);

-struct k8_northbridge_info {
+struct amd_northbridge {
+	struct pci_dev *misc;
+};
+
+struct amd_northbridge_info {
	u16 num;
-	u8 gart_supported;
-	struct pci_dev **nb_misc;
+	u64 flags;
+	struct amd_northbridge *nb;
 };
-extern struct k8_northbridge_info k8_northbridges;
+extern struct amd_northbridge_info amd_northbridges;
+
+#define AMD_NB_GART			0x1
+#define AMD_NB_L3_INDEX_DISABLE		0x2

 #ifdef CONFIG_AMD_NB

-static inline struct pci_dev *node_to_k8_nb_misc(int node)
+static inline int amd_nb_num(void)
 {
-	return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL;
+	return amd_northbridges.num;
 }

-#else
+static inline int amd_nb_has_feature(int feature)
+{
+	return ((amd_northbridges.flags & feature) == feature);
+}

-static inline struct pci_dev *node_to_k8_nb_misc(int node)
+static inline struct amd_northbridge *node_to_amd_nb(int node)
 {
-	return NULL;
+	return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
 }
+
+#else
+
+#define amd_nb_num(x)		0
+#define amd_nb_has_feature(x)	false
+#define node_to_amd_nb(x)	NULL
+
 #endif


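With the rename, the northbridge API also becomes capability-based: callers test a feature flag instead of peeking at the old gart_supported byte. A hedged sketch of the intended call pattern, modeled on the GART flush path in arch/x86/kernel/amd_nb.c (the function name below is illustrative):

/* Illustrative only: walk every AMD northbridge and program its PCI
 * "misc" function, but only when a GART is actually present. */
static void flush_all_garts_sketch(void)
{
	int i;

	if (!amd_nb_has_feature(AMD_NB_GART))
		return;

	for (i = 0; i < amd_nb_num(); i++) {
		struct pci_dev *misc = node_to_amd_nb(i)->misc;

		/* 0x9c is the flush-control register the real code pokes */
		pci_write_config_dword(misc, 0x9c, flush_words[i] | 1);
	}
}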
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index f6ce0bda3b98..cf12007796db 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -238,6 +238,7 @@ extern void setup_boot_APIC_clock(void);
 extern void setup_secondary_APIC_clock(void);
 extern int APIC_init_uniprocessor(void);
 extern void enable_NMI_through_LVT0(void);
+extern int apic_force_enable(void);

 /*
  * On 32bit this is mach-xxx local
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index a859ca461fb0..47a30ff8e517 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -145,6 +145,7 @@

 #ifdef CONFIG_X86_32
 # define MAX_IO_APICS 64
+# define MAX_LOCAL_APIC 256
 #else
 # define MAX_IO_APICS 128
 # define MAX_LOCAL_APIC 32768
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index 8e6218550e77..c8bfe63a06de 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -124,6 +124,7 @@ enum {
	X86_SUBARCH_LGUEST,
	X86_SUBARCH_XEN,
	X86_SUBARCH_MRST,
+	X86_SUBARCH_CE4100,
	X86_NR_SUBARCHS,
 };

diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 9479a037419f..0141b234406f 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -117,6 +117,10 @@ enum fixed_addresses {
	FIX_TEXT_POKE1,	/* reserve 2 pages for text_poke() */
	FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
	__end_of_permanent_fixed_addresses,
+
+#ifdef CONFIG_X86_MRST
+	FIX_LNW_VRTC,
+#endif
	/*
	 * 256 temporary boot-time mappings, used by early_ioremap(),
	 * before ioremap() is functional.
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 4aa2bb3b242a..ef328901c802 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -93,6 +93,17 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
	int err;

	/* See comment in fxsave() below. */
+#ifdef CONFIG_AS_FXSAVEQ
+	asm volatile("1:  fxrstorq %[fx]\n\t"
+		     "2:\n"
+		     ".section .fixup,\"ax\"\n"
+		     "3:  movl $-1,%[err]\n"
+		     "    jmp  2b\n"
+		     ".previous\n"
+		     _ASM_EXTABLE(1b, 3b)
+		     : [err] "=r" (err)
+		     : [fx] "m" (*fx), "0" (0));
+#else
	asm volatile("1:  rex64/fxrstor (%[fx])\n\t"
		     "2:\n"
		     ".section .fixup,\"ax\"\n"
@@ -102,6 +113,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
		     _ASM_EXTABLE(1b, 3b)
		     : [err] "=r" (err)
		     : [fx] "R" (fx), "m" (*fx), "0" (0));
+#endif
	return err;
 }

@@ -119,6 +131,17 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
		return -EFAULT;

	/* See comment in fxsave() below. */
+#ifdef CONFIG_AS_FXSAVEQ
+	asm volatile("1:  fxsaveq %[fx]\n\t"
+		     "2:\n"
+		     ".section .fixup,\"ax\"\n"
+		     "3:  movl $-1,%[err]\n"
+		     "    jmp  2b\n"
+		     ".previous\n"
+		     _ASM_EXTABLE(1b, 3b)
+		     : [err] "=r" (err), [fx] "=m" (*fx)
+		     : "0" (0));
+#else
	asm volatile("1:  rex64/fxsave (%[fx])\n\t"
		     "2:\n"
		     ".section .fixup,\"ax\"\n"
@@ -128,6 +151,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
		     _ASM_EXTABLE(1b, 3b)
		     : [err] "=r" (err), "=m" (*fx)
		     : [fx] "R" (fx), "0" (0));
+#endif
	if (unlikely(err) &&
	    __clear_user(fx, sizeof(struct i387_fxsave_struct)))
		err = -EFAULT;
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index a6b28d017c2f..0c5ca4e30d7b 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -159,7 +159,7 @@ struct io_apic_irq_attr;
 extern int io_apic_set_pci_routing(struct device *dev, int irq,
		 struct io_apic_irq_attr *irq_attr);
 void setup_IO_APIC_irq_extra(u32 gsi);
-extern void ioapic_init_mappings(void);
+extern void ioapic_and_gsi_init(void);
 extern void ioapic_insert_resources(void);

 extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
@@ -168,10 +168,9 @@ extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
 extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
 extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);

-extern void probe_nr_irqs_gsi(void);
 extern int get_nr_irqs_gsi(void);
-
 extern void setup_ioapic_ids_from_mpc(void);
+extern void setup_ioapic_ids_from_mpc_nocheck(void);

 struct mp_ioapic_gsi{
	u32 gsi_base;
@@ -189,9 +188,8 @@ extern void __init pre_init_apic_IRQ0(void);
 #define io_apic_assign_pci_irqs 0
 #define setup_ioapic_ids_from_mpc x86_init_noop
 static const int timer_through_8259 = 0;
-static inline void ioapic_init_mappings(void)	{ }
+static inline void ioapic_and_gsi_init(void) { }
 static inline void ioapic_insert_resources(void) { }
-static inline void probe_nr_irqs_gsi(void) { }
 #define gsi_top (NR_IRQS_LEGACY)
 static inline int mp_find_ioapic(u32 gsi) { return 0; }

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index c62c13cb9788..eb16e94ae04f 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -223,6 +223,9 @@ void intel_init_thermal(struct cpuinfo_x86 *c);

 void mce_log_therm_throt_event(__u64 status);

+/* Interrupt Handler for core thermal thresholds */
+extern int (*platform_thermal_notify)(__u64 msr_val);
+
 #ifdef CONFIG_X86_THERMAL_VECTOR
 extern void mcheck_intel_therm_init(void);
 #else
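A hedged sketch of a consumer: a platform driver assigns the pointer at init time and fields the notification when the threshold status bits (added to msr-index.h later in this merge) are set. All names below are made up for illustration:

/* Hypothetical module hooking the new thermal-threshold callback. */
static int example_thermal_notify(__u64 msr_val)
{
	if (msr_val & THERM_STATUS_THRESHOLD0)
		pr_info("core crossed thermal threshold 0\n");
	if (msr_val & THERM_STATUS_THRESHOLD1)
		pr_info("core crossed thermal threshold 1\n");
	return 0;
}

static int __init example_thermal_init(void)
{
	platform_thermal_notify = example_thermal_notify;
	return 0;
}

static void __exit example_thermal_exit(void)
{
	platform_thermal_notify = NULL;	/* unhook before unloading */
}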
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index ef51b501e22a..24215072d0e1 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -48,6 +48,12 @@ static inline struct microcode_ops * __init init_intel_microcode(void)

 #ifdef CONFIG_MICROCODE_AMD
 extern struct microcode_ops * __init init_amd_microcode(void);
+
+static inline void get_ucode_data(void *to, const u8 *from, size_t n)
+{
+	memcpy(to, from, n);
+}
+
 #else
 static inline struct microcode_ops * __init init_amd_microcode(void)
 {
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index c82868e9f905..0c90dd9f0505 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -5,8 +5,9 @@

 #include <asm/mpspec_def.h>
 #include <asm/x86_init.h>
+#include <asm/apicdef.h>

-extern int apic_version[MAX_APICS];
+extern int apic_version[];
 extern int pic_mode;

 #ifdef CONFIG_X86_32
@@ -107,7 +108,7 @@ extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
			   int active_high_low);
 #endif /* CONFIG_ACPI */

-#define PHYSID_ARRAY_SIZE	BITS_TO_LONGS(MAX_APICS)
+#define PHYSID_ARRAY_SIZE	BITS_TO_LONGS(MAX_LOCAL_APIC)

 struct physid_mask {
	unsigned long mask[PHYSID_ARRAY_SIZE];
@@ -122,31 +123,31 @@ typedef struct physid_mask physid_mask_t;
	test_and_set_bit(physid, (map).mask)

 #define physids_and(dst, src1, src2) \
-	bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
+	bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)

 #define physids_or(dst, src1, src2) \
-	bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
+	bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)

 #define physids_clear(map) \
-	bitmap_zero((map).mask, MAX_APICS)
+	bitmap_zero((map).mask, MAX_LOCAL_APIC)

 #define physids_complement(dst, src) \
-	bitmap_complement((dst).mask, (src).mask, MAX_APICS)
+	bitmap_complement((dst).mask, (src).mask, MAX_LOCAL_APIC)

 #define physids_empty(map) \
-	bitmap_empty((map).mask, MAX_APICS)
+	bitmap_empty((map).mask, MAX_LOCAL_APIC)

 #define physids_equal(map1, map2) \
-	bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
+	bitmap_equal((map1).mask, (map2).mask, MAX_LOCAL_APIC)

 #define physids_weight(map) \
-	bitmap_weight((map).mask, MAX_APICS)
+	bitmap_weight((map).mask, MAX_LOCAL_APIC)

 #define physids_shift_right(d, s, n) \
-	bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS)
+	bitmap_shift_right((d).mask, (s).mask, n, MAX_LOCAL_APIC)

 #define physids_shift_left(d, s, n) \
-	bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
+	bitmap_shift_left((d).mask, (s).mask, n, MAX_LOCAL_APIC)

 static inline unsigned long physids_coerce(physid_mask_t *map)
 {
@@ -159,14 +160,6 @@ static inline void physids_promote(unsigned long physids, physid_mask_t *map)
	map->mask[0] = physids;
 }

-/* Note: will create very large stack frames if physid_mask_t is big */
-#define physid_mask_of_physid(physid)					\
-	({								\
-		physid_mask_t __physid_mask = PHYSID_MASK_NONE;		\
-		physid_set(physid, __physid_mask);			\
-		__physid_mask;						\
-	})
-
 static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map)
 {
	physids_clear(*map);
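The dropped physid_mask_of_physid() built the whole mask by value, which is why its note warned about stack frames: with MAX_LOCAL_APIC at 32768 the bitmap is 4 KB. A short before/after sketch of the calling convention:

	physid_mask_t map;

	/* Removed, by-value style: copies a potentially 4 KB object. */
	/* map = physid_mask_of_physid(8); */

	/* Surviving style: clear and set in place through a pointer. */
	physid_set_mask_of_physid(8, &map);	/* only APIC id 8 is set */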
diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h
index 4a7f96d7c188..c0a955a9a087 100644
--- a/arch/x86/include/asm/mpspec_def.h
+++ b/arch/x86/include/asm/mpspec_def.h
@@ -15,13 +15,6 @@

 #ifdef CONFIG_X86_32
 # define MAX_MPC_ENTRY 1024
-# define MAX_APICS      256
-#else
-# if NR_CPUS <= 255
-#  define MAX_APICS     255
-# else
-#  define MAX_APICS   32768
-# endif
 #endif

 /* Intel MP Floating Pointer Structure */
diff --git a/arch/x86/include/asm/mrst-vrtc.h b/arch/x86/include/asm/mrst-vrtc.h
new file mode 100644
index 000000000000..73668abdbedf
--- /dev/null
+++ b/arch/x86/include/asm/mrst-vrtc.h
@@ -0,0 +1,9 @@
+#ifndef _MRST_VRTC_H
+#define _MRST_VRTC_H
+
+extern unsigned char vrtc_cmos_read(unsigned char reg);
+extern void vrtc_cmos_write(unsigned char val, unsigned char reg);
+extern unsigned long vrtc_get_time(void);
+extern int vrtc_set_mmss(unsigned long nowtime);
+
+#endif
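These accessors mirror the classic CMOS RTC interface, just routed through the Moorestown virtual RTC page rather than port I/O. A minimal sketch of a caller, reusing the standard register offsets from <linux/mc146818rtc.h>:

#include <linux/mc146818rtc.h>	/* RTC_SECONDS, RTC_MINUTES, ... */
#include <asm/mrst-vrtc.h>

/* Sketch: peek at the current minute:second pair in the vRTC. */
static void vrtc_peek_time(void)
{
	unsigned char sec = vrtc_cmos_read(RTC_SECONDS);
	unsigned char min = vrtc_cmos_read(RTC_MINUTES);

	pr_info("vRTC says %02u:%02u\n", min, sec);
}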
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
index 4a711a684b17..719f00b28ff5 100644
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -14,7 +14,9 @@
 #include <linux/sfi.h>

 extern int pci_mrst_init(void);
-int __init sfi_parse_mrtc(struct sfi_table_header *table);
+extern int __init sfi_parse_mrtc(struct sfi_table_header *table);
+extern int sfi_mrtc_num;
+extern struct sfi_rtc_table_entry sfi_mrtc_array[];

 /*
  * Medfield is the follow-up of Moorestown, it combines two chip solution into
@@ -50,4 +52,14 @@ extern void mrst_early_console_init(void);

 extern struct console early_hsu_console;
 extern void hsu_early_console_init(void);
+
+extern void intel_scu_devices_create(void);
+extern void intel_scu_devices_destroy(void);
+
+/* VRTC timer */
+#define MRST_VRTC_MAP_SZ	(1024)
+/*#define MRST_VRTC_PGOFFSET	(0xc00) */
+
+extern void mrst_rtc_init(void);
+
 #endif /* _ASM_X86_MRST_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 86030f63ba02..4d0dfa0d998e 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -257,6 +257,18 @@
 #define PACKAGE_THERM_INT_LOW_ENABLE		(1 << 1)
 #define PACKAGE_THERM_INT_PLN_ENABLE		(1 << 24)

+/* Thermal Thresholds Support */
+#define THERM_INT_THRESHOLD0_ENABLE	(1 << 15)
+#define THERM_SHIFT_THRESHOLD0		8
+#define THERM_MASK_THRESHOLD0		(0x7f << THERM_SHIFT_THRESHOLD0)
+#define THERM_INT_THRESHOLD1_ENABLE	(1 << 23)
+#define THERM_SHIFT_THRESHOLD1		16
+#define THERM_MASK_THRESHOLD1		(0x7f << THERM_SHIFT_THRESHOLD1)
+#define THERM_STATUS_THRESHOLD0		(1 << 6)
+#define THERM_LOG_THRESHOLD0		(1 << 7)
+#define THERM_STATUS_THRESHOLD1		(1 << 8)
+#define THERM_LOG_THRESHOLD1		(1 << 9)
+
 /* MISC_ENABLE bits: architectural */
 #define MSR_IA32_MISC_ENABLE_FAST_STRING	(1ULL << 0)
 #define MSR_IA32_MISC_ENABLE_TCC		(1ULL << 1)
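Putting the new masks together, arming thermal threshold 0 amounts to a read-modify-write of MSR_IA32_THERM_INTERRUPT. A sketch, assuming the hypothetical 'thresh' argument is already in the 7-bit encoding the mask expects:

/* Sketch: arm thermal threshold 0 and enable its interrupt. */
static void arm_thermal_threshold0(unsigned int thresh)
{
	u64 val;

	rdmsrl(MSR_IA32_THERM_INTERRUPT, val);
	val &= ~THERM_MASK_THRESHOLD0;
	val |= ((u64)thresh << THERM_SHIFT_THRESHOLD0) & THERM_MASK_THRESHOLD0;
	val |= THERM_INT_THRESHOLD0_ENABLE;
	wrmsrl(MSR_IA32_THERM_INTERRUPT, val);
}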
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index ef9975812c77..7709c12431b8 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -112,7 +112,7 @@ static inline void arch_safe_halt(void)

 static inline void halt(void)
 {
-	PVOP_VCALL0(pv_irq_ops.safe_halt);
+	PVOP_VCALL0(pv_irq_ops.halt);
 }

 static inline void wbinvd(void)
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index ca0437c714b2..676129229630 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -65,6 +65,7 @@ extern unsigned long pci_mem_start;

 #define PCIBIOS_MIN_CARDBUS_IO	0x4000

+extern int pcibios_enabled;
 void pcibios_config_init(void);
 struct pci_bus *pcibios_scan_root(int bus);

diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index d6763b139a84..db8aa19a08a2 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -53,6 +53,12 @@ extern void x86_mrst_early_setup(void);
 static inline void x86_mrst_early_setup(void) { }
 #endif

+#ifdef CONFIG_X86_INTEL_CE
+extern void x86_ce4100_early_setup(void);
+#else
+static inline void x86_ce4100_early_setup(void) { }
+#endif
+
 #ifndef _SETUP

 /*
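The stub pattern lets the 32-bit boot path call the hook unconditionally; only a CONFIG_X86_INTEL_CE kernel gets a real body. Dispatch happens off the bootparam subarch field, roughly as in the head32.c change elsewhere in this merge (condensed sketch):

	/* Early boot: pick the platform hook the boot loader asked for. */
	switch (boot_params.hdr.hardware_subarch) {
	case X86_SUBARCH_MRST:
		x86_mrst_early_setup();
		break;
	case X86_SUBARCH_CE4100:
		x86_ce4100_early_setup();	/* stub unless X86_INTEL_CE */
		break;
	default:
		break;
	}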
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 42d412fd8b02..ce1d54c8a433 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -26,20 +26,22 @@
  * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512,
  * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on.
  *
- * We will use 31 sets, one for sending BAU messages from each of the 32
+ * We will use one set for sending BAU messages from each of the
  * cpu's on the uvhub.
  *
  * TLB shootdown will use the first of the 8 descriptors of each set.
  * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set).
  */

+#define MAX_CPUS_PER_UVHUB		64
+#define MAX_CPUS_PER_SOCKET		32
+#define UV_ADP_SIZE			64 /* hardware-provided max. */
+#define UV_CPUS_PER_ACT_STATUS		32 /* hardware-provided max. */
 #define UV_ITEMS_PER_DESCRIPTOR		8
 /* the 'throttle' to prevent the hardware stay-busy bug */
 #define MAX_BAU_CONCURRENT		3
-#define UV_CPUS_PER_ACT_STATUS		32
 #define UV_ACT_STATUS_MASK		0x3
 #define UV_ACT_STATUS_SIZE		2
-#define UV_ADP_SIZE			32
 #define UV_DISTRIBUTION_SIZE		256
 #define UV_SW_ACK_NPENDING		8
 #define UV_NET_ENDPOINT_INTD		0x38
@@ -100,7 +102,6 @@
  * number of destination side software ack resources
  */
 #define DEST_NUM_RESOURCES		8
-#define MAX_CPUS_PER_NODE		32
 /*
  * completion statuses for sending a TLB flush message
  */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 1e994754d323..34244b2cd880 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -85,7 +85,6 @@ obj-$(CONFIG_DOUBLEFAULT) 	+= doublefault_32.o
 obj-$(CONFIG_KGDB)		+= kgdb.o
 obj-$(CONFIG_VM86)		+= vm86_32.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
-obj-$(CONFIG_EARLY_PRINTK_MRST)	+= early_printk_mrst.o

 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_APB_TIMER)		+= apb_timer.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 71232b941b6c..17c8090fabd4 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -198,6 +198,11 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
 {
 	unsigned int ver = 0;
 
+	if (id >= (MAX_LOCAL_APIC-1)) {
+		printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
+		return;
+	}
+
 	if (!enabled) {
 		++disabled_cpus;
 		return;
@@ -910,13 +915,13 @@ static int __init acpi_parse_madt_lapic_entries(void)
 	acpi_register_lapic_address(acpi_lapic_addr);
 
 	count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
-				      acpi_parse_sapic, MAX_APICS);
+				      acpi_parse_sapic, MAX_LOCAL_APIC);
 
 	if (!count) {
 		x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
-					acpi_parse_x2apic, MAX_APICS);
+					acpi_parse_x2apic, MAX_LOCAL_APIC);
 		count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
-					acpi_parse_lapic, MAX_APICS);
+					acpi_parse_lapic, MAX_LOCAL_APIC);
 	}
 	if (!count && !x2count) {
 		printk(KERN_ERR PREFIX "No LAPIC entries present\n");
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 553d0b0d639b..123608531c8f 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -353,6 +353,7 @@ void __init_or_module alternatives_smp_module_del(struct module *mod)
 	mutex_unlock(&smp_alt);
 }
 
+bool skip_smp_alternatives;
 void alternatives_smp_switch(int smp)
 {
 	struct smp_alt_module *mod;
@@ -368,7 +369,7 @@ void alternatives_smp_switch(int smp)
 	printk("lockdep: fixing up alternatives.\n");
 #endif
 
-	if (noreplace_smp || smp_alt_once)
+	if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
 		return;
 	BUG_ON(!smp && (num_online_cpus() > 1));
 
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 8f6463d8ed0d..affacb5e0065 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -12,95 +12,116 @@
 
 static u32 *flush_words;
 
-struct pci_device_id k8_nb_ids[] = {
+struct pci_device_id amd_nb_misc_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) },
 	{}
 };
-EXPORT_SYMBOL(k8_nb_ids);
+EXPORT_SYMBOL(amd_nb_misc_ids);
 
-struct k8_northbridge_info k8_northbridges;
-EXPORT_SYMBOL(k8_northbridges);
+struct amd_northbridge_info amd_northbridges;
+EXPORT_SYMBOL(amd_northbridges);
 
-static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
+static struct pci_dev *next_northbridge(struct pci_dev *dev,
+					struct pci_device_id *ids)
 {
 	do {
 		dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
 		if (!dev)
 			break;
-	} while (!pci_match_id(&k8_nb_ids[0], dev));
+	} while (!pci_match_id(ids, dev));
 	return dev;
 }
 
-int cache_k8_northbridges(void)
+int amd_cache_northbridges(void)
 {
-	int i;
-	struct pci_dev *dev;
+	int i = 0;
+	struct amd_northbridge *nb;
+	struct pci_dev *misc;
 
-	if (k8_northbridges.num)
+	if (amd_nb_num())
 		return 0;
 
-	dev = NULL;
-	while ((dev = next_k8_northbridge(dev)) != NULL)
-		k8_northbridges.num++;
+	misc = NULL;
+	while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL)
+		i++;
 
-	/* some CPU families (e.g. family 0x11) do not support GART */
-	if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
-	    boot_cpu_data.x86 == 0x15)
-		k8_northbridges.gart_supported = 1;
+	if (i == 0)
+		return 0;
 
-	k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) *
-					  sizeof(void *), GFP_KERNEL);
-	if (!k8_northbridges.nb_misc)
+	nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL);
+	if (!nb)
 		return -ENOMEM;
 
-	if (!k8_northbridges.num) {
-		k8_northbridges.nb_misc[0] = NULL;
-		return 0;
-	}
+	amd_northbridges.nb = nb;
+	amd_northbridges.num = i;
 
-	if (k8_northbridges.gart_supported) {
-		flush_words = kmalloc(k8_northbridges.num * sizeof(u32),
-				      GFP_KERNEL);
-		if (!flush_words) {
-			kfree(k8_northbridges.nb_misc);
-			return -ENOMEM;
-		}
-	}
+	misc = NULL;
+	for (i = 0; i != amd_nb_num(); i++) {
+		node_to_amd_nb(i)->misc = misc =
+			next_northbridge(misc, amd_nb_misc_ids);
+	}
+
+	/* some CPU families (e.g. family 0x11) do not support GART */
+	if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
+	    boot_cpu_data.x86 == 0x15)
+		amd_northbridges.flags |= AMD_NB_GART;
+
+	/*
+	 * Some CPU families support L3 Cache Index Disable. There are some
+	 * limitations because of E382 and E388 on family 0x10.
+	 */
+	if (boot_cpu_data.x86 == 0x10 &&
+	    boot_cpu_data.x86_model >= 0x8 &&
+	    (boot_cpu_data.x86_model > 0x9 ||
+	     boot_cpu_data.x86_mask >= 0x1))
+		amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
 
-	dev = NULL;
-	i = 0;
-	while ((dev = next_k8_northbridge(dev)) != NULL) {
-		k8_northbridges.nb_misc[i] = dev;
-		if (k8_northbridges.gart_supported)
-			pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
-	}
-	k8_northbridges.nb_misc[i] = NULL;
 	return 0;
 }
-EXPORT_SYMBOL_GPL(cache_k8_northbridges);
+EXPORT_SYMBOL_GPL(amd_cache_northbridges);
 
 /* Ignores subdevice/subvendor but as far as I can figure out
    they're useless anyways */
-int __init early_is_k8_nb(u32 device)
+int __init early_is_amd_nb(u32 device)
 {
 	struct pci_device_id *id;
 	u32 vendor = device & 0xffff;
 	device >>= 16;
-	for (id = k8_nb_ids; id->vendor; id++)
+	for (id = amd_nb_misc_ids; id->vendor; id++)
 		if (vendor == id->vendor && device == id->device)
 			return 1;
 	return 0;
 }
 
-void k8_flush_garts(void)
+int amd_cache_gart(void)
+{
+	int i;
+
+	if (!amd_nb_has_feature(AMD_NB_GART))
+		return 0;
+
+	flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL);
+	if (!flush_words) {
+		amd_northbridges.flags &= ~AMD_NB_GART;
+		return -ENOMEM;
+	}
+
+	for (i = 0; i != amd_nb_num(); i++)
+		pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c,
+				      &flush_words[i]);
+
+	return 0;
+}
+
+void amd_flush_garts(void)
 {
 	int flushed, i;
 	unsigned long flags;
 	static DEFINE_SPINLOCK(gart_lock);
 
-	if (!k8_northbridges.gart_supported)
+	if (!amd_nb_has_feature(AMD_NB_GART))
 		return;
 
 	/* Avoid races between AGP and IOMMU. In theory it's not needed
@@ -109,16 +130,16 @@ void k8_flush_garts(void)
 	   that it doesn't matter to serialize more. -AK */
 	spin_lock_irqsave(&gart_lock, flags);
 	flushed = 0;
-	for (i = 0; i < k8_northbridges.num; i++) {
-		pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c,
-				       flush_words[i]|1);
+	for (i = 0; i < amd_nb_num(); i++) {
+		pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c,
+				       flush_words[i] | 1);
 		flushed++;
 	}
-	for (i = 0; i < k8_northbridges.num; i++) {
+	for (i = 0; i < amd_nb_num(); i++) {
 		u32 w;
 		/* Make sure the hardware actually executed the flush*/
 		for (;;) {
-			pci_read_config_dword(k8_northbridges.nb_misc[i],
+			pci_read_config_dword(node_to_amd_nb(i)->misc,
 					      0x9c, &w);
 			if (!(w & 1))
 				break;
@@ -129,19 +150,23 @@ void k8_flush_garts(void)
 	if (!flushed)
 		printk("nothing to flush?\n");
 }
-EXPORT_SYMBOL_GPL(k8_flush_garts);
+EXPORT_SYMBOL_GPL(amd_flush_garts);
 
-static __init int init_k8_nbs(void)
+static __init int init_amd_nbs(void)
 {
 	int err = 0;
 
-	err = cache_k8_northbridges();
+	err = amd_cache_northbridges();
 
 	if (err < 0)
-		printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n");
+		printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n");
+
+	if (amd_cache_gart() < 0)
+		printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, "
+		       "GART support disabled.\n");
 
 	return err;
 }
 
 /* This has to go after the PCI subsystem */
-fs_initcall(init_k8_nbs);
+fs_initcall(init_amd_nbs);
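
The accessors the new amd_nb.c code leans on — amd_nb_num(), amd_nb_has_feature() and node_to_amd_nb() — live in arch/x86/include/asm/amd_nb.h, which this merge also touches but whose hunk is not shown here. A minimal sketch of the shape their call sites imply; this is an assumption inferred from the uses above, not the verbatim header:

#include <linux/types.h>
#include <linux/pci.h>

/* Sketch only -- inferred from the call sites in amd_nb.c above. */
struct amd_northbridge {
	struct pci_dev *misc;		/* PCI function 3 of the NB */
};

struct amd_northbridge_info {
	u16 num;			/* number of northbridges found */
	u64 flags;			/* AMD_NB_GART, AMD_NB_L3_INDEX_DISABLE */
	struct amd_northbridge *nb;	/* array indexed by node */
};
extern struct amd_northbridge_info amd_northbridges;

static inline int amd_nb_num(void)
{
	return amd_northbridges.num;
}

static inline int amd_nb_has_feature(int feature)
{
	return (amd_northbridges.flags & feature) == feature;
}

static inline struct amd_northbridge *node_to_amd_nb(int node)
{
	return node < amd_nb_num() ? &amd_northbridges.nb[node] : NULL;
}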
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 92543c73cf8e..7c9ab59653e8 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -315,6 +315,7 @@ static void apbt_setup_irq(struct apbt_dev *adev)
 
 	if (system_state == SYSTEM_BOOTING) {
 		irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
+		irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
 		/* APB timer irqs are set up as mp_irqs, timer is edge type */
 		__set_irq_handler(adev->irq, handle_edge_irq, 0, "edge");
 		if (request_irq(adev->irq, apbt_interrupt_handler,
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index b3a16e8f0703..dcd7c83e1659 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -206,7 +206,7 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
  * Do an PCI bus scan by hand because we're running before the PCI
  * subsystem.
  *
- * All K8 AGP bridges are AGPv3 compliant, so we can do this scan
+ * All AMD AGP bridges are AGPv3 compliant, so we can do this scan
  * generically. It's probably overkill to always scan all slots because
  * the AGP bridges should be always an own bus on the HT hierarchy,
  * but do it here for future safety.
@@ -303,7 +303,7 @@ void __init early_gart_iommu_check(void)
 		dev_limit = bus_dev_ranges[i].dev_limit;
 
 		for (slot = dev_base; slot < dev_limit; slot++) {
-			if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+			if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
 				continue;
 
 			ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
@@ -358,7 +358,7 @@ void __init early_gart_iommu_check(void)
 		dev_limit = bus_dev_ranges[i].dev_limit;
 
 		for (slot = dev_base; slot < dev_limit; slot++) {
-			if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+			if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
 				continue;
 
 			ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
@@ -400,7 +400,7 @@ int __init gart_iommu_hole_init(void)
 		dev_limit = bus_dev_ranges[i].dev_limit;
 
 		for (slot = dev_base; slot < dev_limit; slot++) {
-			if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+			if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
 				continue;
 
 			iommu_detected = 1;
@@ -518,7 +518,7 @@ out:
 		dev_base = bus_dev_ranges[i].dev_base;
 		dev_limit = bus_dev_ranges[i].dev_limit;
 		for (slot = dev_base; slot < dev_limit; slot++) {
-			if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+			if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
 				continue;
 
 			write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index fb7657822aad..879999a5230f 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -431,17 +431,18 @@ int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
 	reserved = reserve_eilvt_offset(offset, new);
 
 	if (reserved != new) {
-		pr_err(FW_BUG "cpu %d, try to setup vector 0x%x, but "
-		       "vector 0x%x was already reserved by another core, "
-		       "APIC%lX=0x%x\n",
-		       smp_processor_id(), new, reserved, reg, old);
+		pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
+		       "vector 0x%x, but the register is already in use for "
+		       "vector 0x%x on another cpu\n",
+		       smp_processor_id(), reg, offset, new, reserved);
 		return -EINVAL;
 	}
 
 	if (!eilvt_entry_is_changeable(old, new)) {
-		pr_err(FW_BUG "cpu %d, try to setup vector 0x%x but "
-		       "register already in use, APIC%lX=0x%x\n",
-		       smp_processor_id(), new, reg, old);
+		pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
+		       "vector 0x%x, but the register is already in use for "
+		       "vector 0x%x on this cpu\n",
+		       smp_processor_id(), reg, offset, new, old);
 		return -EBUSY;
 	}
 
@@ -1532,13 +1533,60 @@ static int __init detect_init_APIC(void)
 	return 0;
 }
 #else
+
+static int apic_verify(void)
+{
+	u32 features, h, l;
+
+	/*
+	 * The APIC feature bit should now be enabled
+	 * in `cpuid'
+	 */
+	features = cpuid_edx(1);
+	if (!(features & (1 << X86_FEATURE_APIC))) {
+		pr_warning("Could not enable APIC!\n");
+		return -1;
+	}
+	set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+	/* The BIOS may have set up the APIC at some other address */
+	rdmsr(MSR_IA32_APICBASE, l, h);
+	if (l & MSR_IA32_APICBASE_ENABLE)
+		mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
+
+	pr_info("Found and enabled local APIC!\n");
+	return 0;
+}
+
+int apic_force_enable(void)
+{
+	u32 h, l;
+
+	if (disable_apic)
+		return -1;
+
+	/*
+	 * Some BIOSes disable the local APIC in the APIC_BASE
+	 * MSR. This can only be done in software for Intel P6 or later
+	 * and AMD K7 (Model > 1) or later.
+	 */
+	rdmsr(MSR_IA32_APICBASE, l, h);
+	if (!(l & MSR_IA32_APICBASE_ENABLE)) {
+		pr_info("Local APIC disabled by BIOS -- reenabling.\n");
+		l &= ~MSR_IA32_APICBASE_BASE;
+		l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
+		wrmsr(MSR_IA32_APICBASE, l, h);
+		enabled_via_apicbase = 1;
+	}
+	return apic_verify();
+}
+
 /*
  * Detect and initialize APIC
  */
 static int __init detect_init_APIC(void)
 {
-	u32 h, l, features;
-
 	/* Disabled by kernel option? */
 	if (disable_apic)
 		return -1;
@@ -1568,38 +1616,12 @@ static int __init detect_init_APIC(void)
 		       "you can enable it with \"lapic\"\n");
 			return -1;
 		}
-	/*
-	 * Some BIOSes disable the local APIC in the APIC_BASE
-	 * MSR. This can only be done in software for Intel P6 or later
-	 * and AMD K7 (Model > 1) or later.
-	 */
-	rdmsr(MSR_IA32_APICBASE, l, h);
-	if (!(l & MSR_IA32_APICBASE_ENABLE)) {
-		pr_info("Local APIC disabled by BIOS -- reenabling.\n");
-		l &= ~MSR_IA32_APICBASE_BASE;
-		l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
-		wrmsr(MSR_IA32_APICBASE, l, h);
-		enabled_via_apicbase = 1;
-	}
-	}
-	/*
-	 * The APIC feature bit should now be enabled
-	 * in `cpuid'
-	 */
-	features = cpuid_edx(1);
-	if (!(features & (1 << X86_FEATURE_APIC))) {
-		pr_warning("Could not enable APIC!\n");
-		return -1;
+		if (apic_force_enable())
+			return -1;
+	} else {
+		if (apic_verify())
+			return -1;
 	}
-	set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
-	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
-	/* The BIOS may have set up the APIC at some other address */
-	rdmsr(MSR_IA32_APICBASE, l, h);
-	if (l & MSR_IA32_APICBASE_ENABLE)
-		mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
-
-	pr_info("Found and enabled local APIC!\n");
 
 	apic_pm_activate();
 
@@ -1687,7 +1709,7 @@ void __init init_apic_mappings(void)
  * This initializes the IO-APIC and APIC hardware if this is
  * a UP kernel.
  */
-int apic_version[MAX_APICS];
+int apic_version[MAX_LOCAL_APIC];
 
 int __init APIC_init_uniprocessor(void)
 {
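
The detect_init_APIC() rework above pulls the MSR re-enable path out into apic_force_enable() and the CPUID/MSR verification into apic_verify(), so either can be reused outside boot-time detection. A hedged restatement of the resulting control flow; the two helper names are real, the wrapper below is illustrative only:

/* Illustrative sketch mirroring the branch structure now in
 * detect_init_APIC(); not kernel code. */
static int __init detect_flow_sketch(int has_apic, int force_enable)
{
	if (!has_apic) {
		if (!force_enable)
			return -1;		/* APIC stays disabled */
		return apic_force_enable();	/* re-enable via APICBASE MSR,
						   then apic_verify() inside */
	}
	return apic_verify();			/* already on: check the CPUID
						   bit, record mp_lapic_addr */
}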
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 16c2db8750a2..f6cd5b410770 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1933,8 +1933,7 @@ void disable_IO_APIC(void)
  *
  * by Matt Domsch <Matt_Domsch@dell.com>  Tue Dec 21 12:25:05 CST 1999
  */
-
-void __init setup_ioapic_ids_from_mpc(void)
+void __init setup_ioapic_ids_from_mpc_nocheck(void)
 {
 	union IO_APIC_reg_00 reg_00;
 	physid_mask_t phys_id_present_map;
@@ -1943,15 +1942,6 @@ void __init setup_ioapic_ids_from_mpc(void)
 	unsigned char old_id;
 	unsigned long flags;
 
-	if (acpi_ioapic)
-		return;
-	/*
-	 * Don't check I/O APIC IDs for xAPIC systems. They have
-	 * no meaning without the serial APIC bus.
-	 */
-	if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
-		|| APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
-		return;
 	/*
 	 * This is broken; anything with a real cpu count has to
 	 * circumvent this idiocy regardless.
@@ -2005,7 +1995,6 @@ void __init setup_ioapic_ids_from_mpc(void)
 		physids_or(phys_id_present_map, phys_id_present_map, tmp);
 	}
 
-
 	/*
 	 * We need to adjust the IRQ routing table
 	 * if the ID changed.
@@ -2041,6 +2030,21 @@ void __init setup_ioapic_ids_from_mpc(void)
 		apic_printk(APIC_VERBOSE, " ok.\n");
 	}
 }
+
+void __init setup_ioapic_ids_from_mpc(void)
+{
+
+	if (acpi_ioapic)
+		return;
+	/*
+	 * Don't check I/O APIC IDs for xAPIC systems. They have
+	 * no meaning without the serial APIC bus.
+	 */
+	if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+		|| APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+		return;
+	setup_ioapic_ids_from_mpc_nocheck();
+}
 #endif
 
 int no_timer_check __initdata;
@@ -3593,7 +3597,7 @@ int __init io_apic_get_redir_entries (int ioapic)
 	return reg_01.bits.entries + 1;
 }
 
-void __init probe_nr_irqs_gsi(void)
+static void __init probe_nr_irqs_gsi(void)
 {
 	int nr;
 
@@ -3910,7 +3914,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
 	return res;
 }
 
-void __init ioapic_init_mappings(void)
+void __init ioapic_and_gsi_init(void)
 {
 	unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
 	struct resource *ioapic_res;
@@ -3948,6 +3952,8 @@ fake_ioapic_page:
 		ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
 		ioapic_res++;
 	}
+
+	probe_nr_irqs_gsi();
 }
 
 void __init ioapic_insert_resources(void)
@@ -4057,7 +4063,8 @@ void __init pre_init_apic_IRQ0(void)
 
 	printk(KERN_INFO "Early APIC setup for system timer0\n");
 #ifndef CONFIG_SMP
-	phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
+	physid_set_mask_of_physid(boot_cpu_physical_apicid,
+				  &phys_cpu_present_map);
 #endif
 	/* Make sure the irq descriptor is set up */
 	cfg = alloc_irq_and_cfg_at(0, 0);
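
The split of setup_ioapic_ids_from_mpc() above leaves the ACPI and xAPIC early-returns in a thin policy wrapper and moves the actual ID reprogramming into setup_ioapic_ids_from_mpc_nocheck(). The point of the _nocheck entry is that platform code which knows those checks do not apply on its board (the CE4100 setup added elsewhere in this merge is the plausible user) can call the worker directly. A hedged sketch; the caller below is hypothetical:

#include <linux/init.h>

extern void setup_ioapic_ids_from_mpc_nocheck(void);

/* Hypothetical platform quirk: force I/O APIC ID reprogramming even
 * though acpi_ioapic is set, by bypassing the policy wrapper. */
static void __init example_platform_ioapic_fixup(void)
{
	setup_ioapic_ids_from_mpc_nocheck();
}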
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 927902d90fe6..936613e77113 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -48,6 +48,16 @@ unsigned int uv_apicid_hibits;
 EXPORT_SYMBOL_GPL(uv_apicid_hibits);
 static DEFINE_SPINLOCK(uv_nmi_lock);
 
+static unsigned long __init uv_early_read_mmr(unsigned long addr)
+{
+	unsigned long val, *mmr;
+
+	mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr));
+	val = *mmr;
+	early_iounmap(mmr, sizeof(*mmr));
+	return val;
+}
+
 static inline bool is_GRU_range(u64 start, u64 end)
 {
 	return start >= gru_start_paddr && end <= gru_end_paddr;
@@ -58,28 +68,24 @@ static bool uv_is_untracked_pat_range(u64 start, u64 end)
 	return is_ISA_range(start, end) || is_GRU_range(start, end);
 }
 
-static int early_get_nodeid(void)
+static int __init early_get_pnodeid(void)
 {
 	union uvh_node_id_u node_id;
-	unsigned long *mmr;
-
-	mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
-	node_id.v = *mmr;
-	early_iounmap(mmr, sizeof(*mmr));
+	union uvh_rh_gam_config_mmr_u m_n_config;
+	int pnode;
 
 	/* Currently, all blades have same revision number */
+	node_id.v = uv_early_read_mmr(UVH_NODE_ID);
+	m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR);
 	uv_min_hub_revision_id = node_id.s.revision;
 
-	return node_id.s.node_id;
+	pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1);
+	return pnode;
 }
 
 static void __init early_get_apic_pnode_shift(void)
 {
-	unsigned long *mmr;
-
-	mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_APICID, sizeof(*mmr));
-	uvh_apicid.v = *mmr;
-	early_iounmap(mmr, sizeof(*mmr));
+	uvh_apicid.v = uv_early_read_mmr(UVH_APICID);
 	if (!uvh_apicid.v)
 		/*
 		 * Old bios, use default value
@@ -95,21 +101,17 @@ static void __init early_get_apic_pnode_shift(void)
 static void __init uv_set_apicid_hibit(void)
 {
 	union uvh_lb_target_physical_apic_id_mask_u apicid_mask;
-	unsigned long *mmr;
 
-	mmr = early_ioremap(UV_LOCAL_MMR_BASE |
-		UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK, sizeof(*mmr));
-	apicid_mask.v = *mmr;
-	early_iounmap(mmr, sizeof(*mmr));
+	apicid_mask.v = uv_early_read_mmr(UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK);
 	uv_apicid_hibits = apicid_mask.s.bit_enables & UV_APICID_HIBIT_MASK;
 }
 
 static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
-	int nodeid;
+	int pnodeid;
 
 	if (!strcmp(oem_id, "SGI")) {
-		nodeid = early_get_nodeid();
+		pnodeid = early_get_pnodeid();
 		early_get_apic_pnode_shift();
 		x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
 		x86_platform.nmi_init = uv_nmi_init;
@@ -119,7 +121,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 		uv_system_type = UV_X2APIC;
 	else if (!strcmp(oem_table_id, "UVH")) {
 		__get_cpu_var(x2apic_extra_bits) =
-			nodeid << (uvh_apicid.s.pnode_shift - 1);
+			pnodeid << uvh_apicid.s.pnode_shift;
 		uv_system_type = UV_NON_UNIQUE_APIC;
 		uv_set_apicid_hibit();
 		return 1;
@@ -682,27 +684,32 @@ void uv_nmi_init(void)
 void __init uv_system_init(void)
 {
 	union uvh_rh_gam_config_mmr_u  m_n_config;
+	union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
 	union uvh_node_id_u node_id;
 	unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
-	int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
+	int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val, n_io;
 	int gnode_extra, max_pnode = 0;
 	unsigned long mmr_base, present, paddr;
-	unsigned short pnode_mask;
+	unsigned short pnode_mask, pnode_io_mask;
 
 	map_low_mmrs();
 
 	m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR );
 	m_val = m_n_config.s.m_skt;
 	n_val = m_n_config.s.n_skt;
+	mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
+	n_io = mmioh.s.n_io;
 	mmr_base =
 	    uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
 	    ~UV_MMR_ENABLE;
 	pnode_mask = (1 << n_val) - 1;
+	pnode_io_mask = (1 << n_io) - 1;
+
 	node_id.v = uv_read_local_mmr(UVH_NODE_ID);
 	gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
 	gnode_upper = ((unsigned long)gnode_extra << m_val);
-	printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n",
-			n_val, m_val, gnode_upper, gnode_extra);
+	printk(KERN_INFO "UV: N %d, M %d, N_IO: %d, gnode_upper 0x%lx, gnode_extra 0x%x, pnode_mask 0x%x, pnode_io_mask 0x%x\n",
+			n_val, m_val, n_io, gnode_upper, gnode_extra, pnode_mask, pnode_io_mask);
 
 	printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
 
@@ -735,7 +742,7 @@ void __init uv_system_init(void)
 		for (j = 0; j < 64; j++) {
 			if (!test_bit(j, &present))
 				continue;
-			pnode = (i * 64 + j);
+			pnode = (i * 64 + j) & pnode_mask;
 			uv_blade_info[blade].pnode = pnode;
 			uv_blade_info[blade].nr_possible_cpus = 0;
 			uv_blade_info[blade].nr_online_cpus = 0;
@@ -756,6 +763,7 @@ void __init uv_system_init(void)
 		/*
 		 * apic_pnode_shift must be set before calling uv_apicid_to_pnode();
 		 */
+		uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
 		uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift;
 		pnode = uv_apicid_to_pnode(apicid);
 		blade = boot_pnode_to_blade(pnode);
@@ -772,7 +780,6 @@ void __init uv_system_init(void)
 		uv_cpu_hub_info(cpu)->numa_blade_id = blade;
 		uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
 		uv_cpu_hub_info(cpu)->pnode = pnode;
-		uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
 		uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1;
 		uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
 		uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
@@ -796,7 +803,7 @@ void __init uv_system_init(void)
 
 	map_gru_high(max_pnode);
 	map_mmr_high(max_pnode);
-	map_mmioh_high(max_pnode);
+	map_mmioh_high(max_pnode & pnode_io_mask);
 
 	uv_cpu_init();
 	uv_scir_register_cpu_notifier();
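
For the pnode math introduced in early_get_pnodeid() above: the node_id field is shifted right once (its low bit is not part of the pnode) and masked to the n_skt-wide field read from the config MMR. A standalone worked example with made-up register values:

#include <stdio.h>

int main(void)
{
	unsigned long node_id = 0x2a;	/* hypothetical UVH_NODE_ID.node_id */
	int n_skt = 4;			/* hypothetical CONFIG_MMR n_skt */
	int pnode = (node_id >> 1) & ((1 << n_skt) - 1);

	/* (0x2a >> 1) = 0x15; 0x15 & 0xf = 5 */
	printf("pnode = %d\n", pnode);
	return 0;
}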
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 17ad03366211..9ecf81f9b90f 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -149,8 +149,7 @@ union _cpuid4_leaf_ecx {
 };
 
 struct amd_l3_cache {
-	struct	 pci_dev *dev;
-	bool	 can_disable;
+	struct	 amd_northbridge *nb;
 	unsigned indices;
 	u8	 subcaches[4];
 };
@@ -311,14 +310,12 @@ struct _cache_attr {
 /*
  * L3 cache descriptors
  */
-static struct amd_l3_cache **__cpuinitdata l3_caches;
-
 static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
 {
 	unsigned int sc0, sc1, sc2, sc3;
 	u32 val = 0;
 
-	pci_read_config_dword(l3->dev, 0x1C4, &val);
+	pci_read_config_dword(l3->nb->misc, 0x1C4, &val);
 
 	/* calculate subcache sizes */
 	l3->subcaches[0] = sc0 = !(val & BIT(0));
@@ -330,47 +327,14 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
 	l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
 }
 
-static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
-{
-	struct amd_l3_cache *l3;
-	struct pci_dev *dev = node_to_k8_nb_misc(node);
-
-	l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
-	if (!l3) {
-		printk(KERN_WARNING "Error allocating L3 struct\n");
-		return NULL;
-	}
-
-	l3->dev = dev;
-
-	amd_calc_l3_indices(l3);
-
-	return l3;
-}
-
-static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
-					   int index)
+static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf,
+					int index)
 {
+	static struct amd_l3_cache *__cpuinitdata l3_caches;
 	int node;
 
-	if (boot_cpu_data.x86 != 0x10)
-		return;
-
-	if (index < 3)
-		return;
-
-	/* see errata #382 and #388 */
-	if (boot_cpu_data.x86_model < 0x8)
-		return;
-
-	if ((boot_cpu_data.x86_model == 0x8 ||
-	     boot_cpu_data.x86_model == 0x9)
-		&&
-	     boot_cpu_data.x86_mask < 0x1)
-		return;
-
-	/* not in virtualized environments */
-	if (k8_northbridges.num == 0)
+	/* only for L3, and not in virtualized environments */
+	if (index < 3 || amd_nb_num() == 0)
 		return;
 
 	/*
@@ -378,7 +342,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
 	 * never freed but this is done only on shutdown so it doesn't matter.
 	 */
 	if (!l3_caches) {
-		int size = k8_northbridges.num * sizeof(struct amd_l3_cache *);
+		int size = amd_nb_num() * sizeof(struct amd_l3_cache);
 
 		l3_caches = kzalloc(size, GFP_ATOMIC);
 		if (!l3_caches)
@@ -387,14 +351,12 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
 
 	node = amd_get_nb_id(smp_processor_id());
 
-	if (!l3_caches[node]) {
-		l3_caches[node] = amd_init_l3_cache(node);
-		l3_caches[node]->can_disable = true;
+	if (!l3_caches[node].nb) {
+		l3_caches[node].nb = node_to_amd_nb(node);
+		amd_calc_l3_indices(&l3_caches[node]);
 	}
 
-	WARN_ON(!l3_caches[node]);
-
-	this_leaf->l3 = l3_caches[node];
+	this_leaf->l3 = &l3_caches[node];
 }
 
 /*
@@ -408,7 +370,7 @@ int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot)
 {
 	unsigned int reg = 0;
 
-	pci_read_config_dword(l3->dev, 0x1BC + slot * 4, &reg);
+	pci_read_config_dword(l3->nb->misc, 0x1BC + slot * 4, &reg);
 
 	/* check whether this slot is activated already */
 	if (reg & (3UL << 30))
@@ -422,7 +384,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
 {
 	int index;
 
-	if (!this_leaf->l3 || !this_leaf->l3->can_disable)
+	if (!this_leaf->l3 ||
+	    !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
 		return -EINVAL;
 
 	index = amd_get_l3_disable_slot(this_leaf->l3, slot);
@@ -457,7 +420,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
 		if (!l3->subcaches[i])
 			continue;
 
-		pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
+		pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg);
 
 		/*
 		 * We need to WBINVD on a core on the node containing the L3
@@ -467,7 +430,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
 		wbinvd_on_cpu(cpu);
 
 		reg |= BIT(31);
-		pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
+		pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg);
 	}
 }
 
@@ -524,7 +487,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (!this_leaf->l3 || !this_leaf->l3->can_disable)
+	if (!this_leaf->l3 ||
+	    !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
 		return -EINVAL;
 
 	cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
@@ -545,7 +509,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
 #define STORE_CACHE_DISABLE(slot)					\
 static ssize_t								\
 store_cache_disable_##slot(struct _cpuid4_info *this_leaf,		\
-			    const char *buf, size_t count)		\
+			   const char *buf, size_t count)		\
 {									\
 	return store_cache_disable(this_leaf, buf, count, slot);	\
 }
@@ -558,10 +522,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
 		show_cache_disable_1, store_cache_disable_1);
 
 #else	/* CONFIG_AMD_NB */
-static void __cpuinit
-amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
-{
-};
+#define amd_init_l3_cache(x, y)
 #endif /* CONFIG_AMD_NB */
 
 static int
@@ -575,7 +536,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
 
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
 		amd_cpuid4(index, &eax, &ebx, &ecx);
-		amd_check_l3_disable(this_leaf, index);
+		amd_init_l3_cache(this_leaf, index);
 	} else {
 		cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
 	}
@@ -983,30 +944,48 @@ define_one_ro(size);
 define_one_ro(shared_cpu_map);
 define_one_ro(shared_cpu_list);
 
-#define DEFAULT_SYSFS_CACHE_ATTRS	\
-	&type.attr,			\
-	&level.attr,			\
-	&coherency_line_size.attr,	\
-	&physical_line_partition.attr,	\
-	&ways_of_associativity.attr,	\
-	&number_of_sets.attr,		\
-	&size.attr,			\
-	&shared_cpu_map.attr,		\
-	&shared_cpu_list.attr
-
 static struct attribute *default_attrs[] = {
-	DEFAULT_SYSFS_CACHE_ATTRS,
+	&type.attr,
+	&level.attr,
+	&coherency_line_size.attr,
+	&physical_line_partition.attr,
+	&ways_of_associativity.attr,
+	&number_of_sets.attr,
+	&size.attr,
+	&shared_cpu_map.attr,
+	&shared_cpu_list.attr,
 	NULL
 };
 
-static struct attribute *default_l3_attrs[] = {
-	DEFAULT_SYSFS_CACHE_ATTRS,
 #ifdef CONFIG_AMD_NB
-	&cache_disable_0.attr,
-	&cache_disable_1.attr,
+static struct attribute ** __cpuinit amd_l3_attrs(void)
+{
+	static struct attribute **attrs;
+	int n;
+
+	if (attrs)
+		return attrs;
+
+	n = sizeof (default_attrs) / sizeof (struct attribute *);
+
+	if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+		n += 2;
+
+	attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL);
+	if (attrs == NULL)
+		return attrs = default_attrs;
+
+	for (n = 0; default_attrs[n]; n++)
+		attrs[n] = default_attrs[n];
+
+	if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
+		attrs[n++] = &cache_disable_0.attr;
+		attrs[n++] = &cache_disable_1.attr;
+	}
+
+	return attrs;
+}
 #endif
-	NULL
-};
 
 static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
 {
@@ -1117,11 +1096,11 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
 
 		this_leaf = CPUID4_INFO_IDX(cpu, i);
 
-		if (this_leaf->l3 && this_leaf->l3->can_disable)
-			ktype_cache.default_attrs = default_l3_attrs;
-		else
-			ktype_cache.default_attrs = default_attrs;
-
+		ktype_cache.default_attrs = default_attrs;
+#ifdef CONFIG_AMD_NB
+		if (this_leaf->l3)
+			ktype_cache.default_attrs = amd_l3_attrs();
+#endif
 		retval = kobject_init_and_add(&(this_object->kobj),
 					      &ktype_cache,
 					      per_cpu(ici_cache_kobject, cpu),
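
The sysfs change above replaces two parallel static attribute arrays with amd_l3_attrs(), which builds one NULL-terminated list at first use and appends the two cache_disable_* entries only when the northbridge advertises AMD_NB_L3_INDEX_DISABLE. A userspace analogue of the same build-once pattern, names hypothetical (the kernel version uses kzalloc and struct attribute):

#include <stdlib.h>

/* Build-once, optionally-extended, NULL-terminated pointer list. */
static const char **build_attr_list(const char **base, size_t nbase,
				    const char **extra, size_t nextra,
				    int want_extra)
{
	static const char **cached;
	size_t n, i;

	if (cached)			/* built once, then reused */
		return cached;

	n = nbase + (want_extra ? nextra : 0);
	cached = calloc(n + 1, sizeof(*cached));	/* +1 for NULL */
	if (!cached)
		return base;		/* fall back to the plain set */

	for (i = 0; i < nbase; i++)
		cached[i] = base[i];
	if (want_extra)
		for (i = 0; i < nextra; i++)
			cached[nbase + i] = extra[i];
	return cached;
}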
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 80c482382d5c..5bf2fac52aca 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -31,8 +31,6 @@
 #include <asm/mce.h>
 #include <asm/msr.h>
 
-#define PFX               "mce_threshold: "
-#define VERSION           "version 1.1.1"
 #define NR_BANKS          6
 #define NR_BLOCKS         9
 #define THRESHOLD_MAX     0xFFF
@@ -59,12 +57,6 @@ struct threshold_block {
 	struct list_head miscj;
 };
 
-/* defaults used early on boot */
-static struct threshold_block threshold_defaults = {
-	.interrupt_enable	= 0,
-	.threshold_limit	= THRESHOLD_MAX,
-};
-
 struct threshold_bank {
 	struct kobject *kobj;
 	struct threshold_block *blocks;
@@ -89,50 +81,101 @@ static void amd_threshold_interrupt(void);
 struct thresh_restart {
 	struct threshold_block	*b;
 	int			reset;
+	int			set_lvt_off;
+	int			lvt_off;
 	u16			old_limit;
 };
 
+static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
+{
+	int msr = (hi & MASK_LVTOFF_HI) >> 20;
+
+	if (apic < 0) {
+		pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
+		       "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
+		       b->bank, b->block, b->address, hi, lo);
+		return 0;
+	}
+
+	if (apic != msr) {
+		pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
+		       "for bank %d, block %d (MSR%08X=0x%x%08x)\n",
+		       b->cpu, apic, b->bank, b->block, b->address, hi, lo);
+		return 0;
+	}
+
+	return 1;
+};
+
 /* must be called with correct cpu affinity */
 /* Called via smp_call_function_single() */
 static void threshold_restart_bank(void *_tr)
 {
 	struct thresh_restart *tr = _tr;
-	u32 mci_misc_hi, mci_misc_lo;
+	u32 hi, lo;
 
-	rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
+	rdmsr(tr->b->address, lo, hi);
 
-	if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
+	if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
 		tr->reset = 1;	/* limit cannot be lower than err count */
 
 	if (tr->reset) {		/* reset err count and overflow bit */
-		mci_misc_hi =
-		    (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
+		hi =
+		    (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
 		    (THRESHOLD_MAX - tr->b->threshold_limit);
 	} else if (tr->old_limit) {	/* change limit w/o reset */
-		int new_count = (mci_misc_hi & THRESHOLD_MAX) +
+		int new_count = (hi & THRESHOLD_MAX) +
 		    (tr->old_limit - tr->b->threshold_limit);
 
-		mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
+		hi = (hi & ~MASK_ERR_COUNT_HI) |
 		    (new_count & THRESHOLD_MAX);
 	}
 
+	if (tr->set_lvt_off) {
+		if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
+			/* set new lvt offset */
+			hi &= ~MASK_LVTOFF_HI;
+			hi |= tr->lvt_off << 20;
+		}
+	}
+
 	tr->b->interrupt_enable ?
-	    (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
-	    (mci_misc_hi &= ~MASK_INT_TYPE_HI);
+	    (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
+	    (hi &= ~MASK_INT_TYPE_HI);
 
-	mci_misc_hi |= MASK_COUNT_EN_HI;
-	wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
+	hi |= MASK_COUNT_EN_HI;
+	wrmsr(tr->b->address, lo, hi);
+}
+
+static void mce_threshold_block_init(struct threshold_block *b, int offset)
+{
+	struct thresh_restart tr = {
+		.b		= b,
+		.set_lvt_off	= 1,
+		.lvt_off	= offset,
+	};
+
+	b->threshold_limit	= THRESHOLD_MAX;
+	threshold_restart_bank(&tr);
+};
+
+static int setup_APIC_mce(int reserved, int new)
+{
+	if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
+					      APIC_EILVT_MSG_FIX, 0))
+		return new;
+
+	return reserved;
 }
 
 /* cpu init entry point, called from mce.c with preempt off */
 void mce_amd_feature_init(struct cpuinfo_x86 *c)
 {
+	struct threshold_block b;
 	unsigned int cpu = smp_processor_id();
 	u32 low = 0, high = 0, address = 0;
 	unsigned int bank, block;
-	struct thresh_restart tr;
-	int lvt_off = -1;
-	u8 offset;
+	int offset = -1;
 
 	for (bank = 0; bank < NR_BANKS; ++bank) {
 		for (block = 0; block < NR_BLOCKS; ++block) {
@@ -163,39 +206,16 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
 			if (shared_bank[bank] && c->cpu_core_id)
 				break;
 #endif
-			offset = (high & MASK_LVTOFF_HI) >> 20;
-			if (lvt_off < 0) {
-				if (setup_APIC_eilvt(offset,
-						     THRESHOLD_APIC_VECTOR,
-						     APIC_EILVT_MSG_FIX, 0)) {
-					pr_err(FW_BUG "cpu %d, failed to "
-					       "setup threshold interrupt "
-					       "for bank %d, block %d "
-					       "(MSR%08X=0x%x%08x)",
-					       smp_processor_id(), bank, block,
-					       address, high, low);
-					continue;
-				}
-				lvt_off = offset;
-			} else if (lvt_off != offset) {
-				pr_err(FW_BUG "cpu %d, invalid threshold "
-				       "interrupt offset %d for bank %d,"
-				       "block %d (MSR%08X=0x%x%08x)",
-				       smp_processor_id(), lvt_off, bank,
-				       block, address, high, low);
-				continue;
-			}
-
-			high &= ~MASK_LVTOFF_HI;
-			high |= lvt_off << 20;
-			wrmsr(address, low, high);
+			offset = setup_APIC_mce(offset,
+						(high & MASK_LVTOFF_HI) >> 20);
 
-			threshold_defaults.address = address;
-			tr.b = &threshold_defaults;
-			tr.reset = 0;
-			tr.old_limit = 0;
-			threshold_restart_bank(&tr);
+			memset(&b, 0, sizeof(b));
+			b.cpu		= cpu;
+			b.bank		= bank;
+			b.block		= block;
+			b.address	= address;
 
+			mce_threshold_block_init(&b, offset);
 			mce_threshold_vector = amd_threshold_interrupt;
 		}
 	}
@@ -298,9 +318,8 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
 
 	b->interrupt_enable = !!new;
 
+	memset(&tr, 0, sizeof(tr));
 	tr.b		= b;
-	tr.reset	= 0;
-	tr.old_limit	= 0;
 
 	smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
 
@@ -321,10 +340,10 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
 	if (new < 1)
 		new = 1;
 
+	memset(&tr, 0, sizeof(tr));
 	tr.old_limit = b->threshold_limit;
 	b->threshold_limit = new;
 	tr.b = b;
-	tr.reset = 0;
 
 	smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
 
@@ -603,9 +622,9 @@ static __cpuinit int threshold_create_device(unsigned int cpu)
 			continue;
 		err = threshold_create_bank(cpu, bank);
 		if (err)
-			goto out;
+			return err;
 	}
-out:
+
 	return err;
 }
 
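
The mce_amd.c rework above collapses the old first-offset/mismatch branching into setup_APIC_mce(): the first LVT offset that programs cleanly is latched, every later bank/block reuses it, and lvt_off_valid() reports firmware bugs when the MSR disagrees. A standalone demo of the latching idiom (userspace sketch; setup_APIC_eilvt() is stubbed out as always succeeding):

#include <stdio.h>

/* Stand-in for setup_APIC_eilvt(): pretend programming always succeeds. */
static int setup_eilvt_ok(int new) { (void)new; return 1; }

/* Mirrors setup_APIC_mce(): keep the first offset that programs cleanly. */
static int setup_mce_offset(int reserved, int new)
{
	if (reserved < 0 && setup_eilvt_ok(new))
		return new;
	return reserved;
}

int main(void)
{
	int offset = -1;
	int from_msr[] = { 1, 1, 1 };	/* offsets read back per block */

	for (int i = 0; i < 3; i++)
		offset = setup_mce_offset(offset, from_msr[i]);

	printf("latched LVT offset: %d\n", offset);	/* prints 1 */
	return 0;
}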
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 4b683267eca5..e12246ff5aa6 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -53,8 +53,13 @@ struct thermal_state {
 	struct _thermal_state core_power_limit;
 	struct _thermal_state package_throttle;
 	struct _thermal_state package_power_limit;
+	struct _thermal_state core_thresh0;
+	struct _thermal_state core_thresh1;
 };
 
+/* Callback to handle core threshold interrupts */
+int (*platform_thermal_notify)(__u64 msr_val);
+
 static DEFINE_PER_CPU(struct thermal_state, thermal_state);
 
 static atomic_t therm_throt_en = ATOMIC_INIT(0);
@@ -200,6 +205,22 @@ static int therm_throt_process(bool new_event, int event, int level)
 	return 0;
 }
 
+static int thresh_event_valid(int event)
+{
+	struct _thermal_state *state;
+	unsigned int this_cpu = smp_processor_id();
+	struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
+	u64 now = get_jiffies_64();
+
+	state = (event == 0) ? &pstate->core_thresh0 : &pstate->core_thresh1;
+
+	if (time_before64(now, state->next_check))
+		return 0;
+
+	state->next_check = now + CHECK_INTERVAL;
+	return 1;
+}
+
 #ifdef CONFIG_SYSFS
 /* Add/Remove thermal_throttle interface for CPU device: */
 static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
@@ -313,6 +334,22 @@ device_initcall(thermal_throttle_init_device);
 #define PACKAGE_THROTTLED	((__u64)2 << 62)
 #define PACKAGE_POWER_LIMIT	((__u64)3 << 62)
 
+static void notify_thresholds(__u64 msr_val)
+{
+	/* check whether the interrupt handler is defined;
+	 * otherwise simply return
+	 */
+	if (!platform_thermal_notify)
+		return;
+
+	/* lower threshold reached */
+	if ((msr_val & THERM_LOG_THRESHOLD0) && thresh_event_valid(0))
+		platform_thermal_notify(msr_val);
+	/* higher threshold reached */
+	if ((msr_val & THERM_LOG_THRESHOLD1) && thresh_event_valid(1))
+		platform_thermal_notify(msr_val);
+}
+
 /* Thermal transition interrupt handler */
 static void intel_thermal_interrupt(void)
 {
@@ -321,6 +358,9 @@ static void intel_thermal_interrupt(void)
 
 	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
 
+	/* Check for violation of core thermal thresholds*/
+	notify_thresholds(msr_val);
+
 	if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
 				THERMAL_THROTTLING_EVENT,
 				CORE_LEVEL) != 0)
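
therm_throt.c now exposes a platform_thermal_notify hook that fires from the thermal interrupt when a core threshold bit is logged, rate-limited by thresh_event_valid(). A minimal sketch of a consumer; the module and handler names are hypothetical, only the hook itself comes from this diff:

#include <linux/module.h>
#include <linux/types.h>

extern int (*platform_thermal_notify)(__u64 msr_val);

/* Hypothetical handler: just log the raw THERM_STATUS value. */
static int example_thermal_notify(__u64 msr_val)
{
	pr_info("core thermal threshold event, msr=0x%llx\n",
		(unsigned long long)msr_val);
	return 0;
}

static int __init example_thermal_init(void)
{
	platform_thermal_notify = example_thermal_notify;
	return 0;
}

static void __exit example_thermal_exit(void)
{
	platform_thermal_notify = NULL;
}

module_init(example_thermal_init);
module_exit(example_thermal_exit);
MODULE_LICENSE("GPL");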
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 4572f25f9325..cd28a350f7f9 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -240,7 +240,7 @@ static int __init setup_early_printk(char *buf)
 	if (!strncmp(buf, "xen", 3))
 		early_console_register(&xenboot_console, keep);
 #endif
-#ifdef CONFIG_X86_MRST_EARLY_PRINTK
+#ifdef CONFIG_EARLY_PRINTK_MRST
 	if (!strncmp(buf, "mrst", 4)) {
 		mrst_early_console_init();
 		early_console_register(&early_mrst_console, keep);
@@ -250,7 +250,6 @@ static int __init setup_early_printk(char *buf)
 			hsu_early_console_init();
 			early_console_register(&early_hsu_console, keep);
 		}
-
 #endif
 		buf++;
 	}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 3afb33f14d2d..298448656b60 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -19,6 +19,7 @@
 #include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/list.h>
+#include <linux/module.h>
 
 #include <trace/syscall.h>
 
@@ -49,6 +50,7 @@ static DEFINE_PER_CPU(int, save_modifying_code);
 int ftrace_arch_code_modify_prepare(void)
 {
 	set_kernel_text_rw();
+	set_all_modules_text_rw();
 	modifying_code = 1;
 	return 0;
 }
@@ -56,6 +58,7 @@ int ftrace_arch_code_modify_prepare(void)
 int ftrace_arch_code_modify_post_process(void)
 {
 	modifying_code = 0;
+	set_all_modules_text_ro();
 	set_kernel_text_ro();
 	return 0;
 }
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 763310165fa0..7f138b3c3c52 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -61,6 +61,9 @@ void __init i386_start_kernel(void)
 	case X86_SUBARCH_MRST:
 		x86_mrst_early_setup();
 		break;
+	case X86_SUBARCH_CE4100:
+		x86_ce4100_early_setup();
+		break;
 	default:
 		i386_default_early_setup();
 		break;
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index c0dbd9ac24f0..9f54b209c378 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -139,39 +139,6 @@ ENTRY(startup_32)
139 movl %eax, pa(olpc_ofw_pgd) 139 movl %eax, pa(olpc_ofw_pgd)
140#endif 140#endif
141 141
142#ifdef CONFIG_PARAVIRT
143	/* This can only trip for a broken bootloader... */
144 cmpw $0x207, pa(boot_params + BP_version)
145 jb default_entry
146
147 /* Paravirt-compatible boot parameters. Look to see what architecture
148 we're booting under. */
149 movl pa(boot_params + BP_hardware_subarch), %eax
150 cmpl $num_subarch_entries, %eax
151 jae bad_subarch
152
153 movl pa(subarch_entries)(,%eax,4), %eax
154 subl $__PAGE_OFFSET, %eax
155 jmp *%eax
156
157bad_subarch:
158WEAK(lguest_entry)
159WEAK(xen_entry)
160 /* Unknown implementation; there's really
161 nothing we can do at this point. */
162 ud2a
163
164 __INITDATA
165
166subarch_entries:
167 .long default_entry /* normal x86/PC */
168 .long lguest_entry /* lguest hypervisor */
169 .long xen_entry /* Xen hypervisor */
170 .long default_entry /* Moorestown MID */
171num_subarch_entries = (. - subarch_entries) / 4
172.previous
173#endif /* CONFIG_PARAVIRT */
174
175/* 142/*
176 * Initialize page tables. This creates a PDE and a set of page 143 * Initialize page tables. This creates a PDE and a set of page
177 * tables, which are located immediately beyond __brk_base. The variable 144 * tables, which are located immediately beyond __brk_base. The variable
@@ -181,7 +148,6 @@ num_subarch_entries = (. - subarch_entries) / 4
181 * 148 *
182 * Note that the stack is not yet set up! 149 * Note that the stack is not yet set up!
183 */ 150 */
184default_entry:
185#ifdef CONFIG_X86_PAE 151#ifdef CONFIG_X86_PAE
186 152
187 /* 153 /*
@@ -261,7 +227,42 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
261 movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax 227 movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
262 movl %eax,pa(initial_page_table+0xffc) 228 movl %eax,pa(initial_page_table+0xffc)
263#endif 229#endif
264 jmp 3f 230
231#ifdef CONFIG_PARAVIRT
232	/* This can only trip for a broken bootloader... */
233 cmpw $0x207, pa(boot_params + BP_version)
234 jb default_entry
235
236 /* Paravirt-compatible boot parameters. Look to see what architecture
237 we're booting under. */
238 movl pa(boot_params + BP_hardware_subarch), %eax
239 cmpl $num_subarch_entries, %eax
240 jae bad_subarch
241
242 movl pa(subarch_entries)(,%eax,4), %eax
243 subl $__PAGE_OFFSET, %eax
244 jmp *%eax
245
246bad_subarch:
247WEAK(lguest_entry)
248WEAK(xen_entry)
249 /* Unknown implementation; there's really
250 nothing we can do at this point. */
251 ud2a
252
253 __INITDATA
254
255subarch_entries:
256 .long default_entry /* normal x86/PC */
257 .long lguest_entry /* lguest hypervisor */
258 .long xen_entry /* Xen hypervisor */
259 .long default_entry /* Moorestown MID */
260num_subarch_entries = (. - subarch_entries) / 4
261.previous
262#else
263 jmp default_entry
264#endif /* CONFIG_PARAVIRT */
265
265/* 266/*
266 * Non-boot CPU entry point; entered from trampoline.S 267 * Non-boot CPU entry point; entered from trampoline.S
267 * We can't lgdt here, because lgdt itself uses a data segment, but 268 * We can't lgdt here, because lgdt itself uses a data segment, but
@@ -282,7 +283,7 @@ ENTRY(startup_32_smp)
282 movl %eax,%fs 283 movl %eax,%fs
283 movl %eax,%gs 284 movl %eax,%gs
284#endif /* CONFIG_SMP */ 285#endif /* CONFIG_SMP */
2853: 286default_entry:
286 287
287/* 288/*
288 * New page tables may be in 4Mbyte page mode and may 289 * New page tables may be in 4Mbyte page mode and may
@@ -316,6 +317,10 @@ ENTRY(startup_32_smp)
316 subl $0x80000001, %eax 317 subl $0x80000001, %eax
317 cmpl $(0x8000ffff-0x80000001), %eax 318 cmpl $(0x8000ffff-0x80000001), %eax
318 ja 6f 319 ja 6f
320
321 /* Clear bogus XD_DISABLE bits */
322 call verify_cpu
323
319 mov $0x80000001, %eax 324 mov $0x80000001, %eax
320 cpuid 325 cpuid
321 /* Execute Disable bit supported? */ 326 /* Execute Disable bit supported? */
@@ -611,6 +616,8 @@ ignore_int:
611#endif 616#endif
612 iret 617 iret
613 618
619#include "verify_cpu.S"
620
614 __REFDATA 621 __REFDATA
615.align 4 622.align 4
616ENTRY(initial_code) 623ENTRY(initial_code)
@@ -622,13 +629,13 @@ ENTRY(initial_code)
622__PAGE_ALIGNED_BSS 629__PAGE_ALIGNED_BSS
623 .align PAGE_SIZE_asm 630 .align PAGE_SIZE_asm
624#ifdef CONFIG_X86_PAE 631#ifdef CONFIG_X86_PAE
625ENTRY(initial_pg_pmd) 632initial_pg_pmd:
626 .fill 1024*KPMDS,4,0 633 .fill 1024*KPMDS,4,0
627#else 634#else
628ENTRY(initial_page_table) 635ENTRY(initial_page_table)
629 .fill 1024,4,0 636 .fill 1024,4,0
630#endif 637#endif
631ENTRY(initial_pg_fixmap) 638initial_pg_fixmap:
632 .fill 1024,4,0 639 .fill 1024,4,0
633ENTRY(empty_zero_page) 640ENTRY(empty_zero_page)
634 .fill 4096,1,0 641 .fill 4096,1,0
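Note: moving the paravirt subarch dispatch below the page-table setup means every subarch entry now runs with the early page tables already built, which is what lets lguest drop its copied page-table code later in this diff. For readability, a C rendering of what the assembly dispatch does; the *_entry_c names are purely illustrative:

	/* illustrative C rendering of the assembly dispatch above */
	static void default_entry_c(void), lguest_entry_c(void), xen_entry_c(void);

	static void (*const subarch_entries_c[])(void) = {
		default_entry_c,	/* 0: normal x86/PC     */
		lguest_entry_c,		/* 1: lguest hypervisor */
		xen_entry_c,		/* 2: Xen hypervisor    */
		default_entry_c,	/* 3: Moorestown MID    */
	};

	static void dispatch_subarch(const struct boot_params *bp)
	{
		u32 subarch = bp->hdr.hardware_subarch;

		/* old bootloader or out-of-range subarch: take the default path */
		if (bp->hdr.version < 0x0207 ||
		    subarch >= ARRAY_SIZE(subarch_entries_c))
			default_entry_c();
		else
			subarch_entries_c[subarch]();
	}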
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index ce0cb4721c9a..0fe6d1a66c38 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -155,12 +155,6 @@ static int apply_microcode_amd(int cpu)
155 return 0; 155 return 0;
156} 156}
157 157
158static int get_ucode_data(void *to, const u8 *from, size_t n)
159{
160 memcpy(to, from, n);
161 return 0;
162}
163
164static void * 158static void *
165get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) 159get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
166{ 160{
@@ -168,8 +162,7 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
168 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; 162 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
169 void *mc; 163 void *mc;
170 164
171 if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR)) 165 get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR);
172 return NULL;
173 166
174 if (section_hdr[0] != UCODE_UCODE_TYPE) { 167 if (section_hdr[0] != UCODE_UCODE_TYPE) {
175 pr_err("error: invalid type field in container file section header\n"); 168 pr_err("error: invalid type field in container file section header\n");
@@ -183,16 +176,13 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
183 return NULL; 176 return NULL;
184 } 177 }
185 178
186 mc = vmalloc(UCODE_MAX_SIZE); 179 mc = vzalloc(UCODE_MAX_SIZE);
187 if (mc) { 180 if (!mc)
188 memset(mc, 0, UCODE_MAX_SIZE); 181 return NULL;
189 if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, 182
190 total_size)) { 183 get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size);
191 vfree(mc); 184 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
192 mc = NULL; 185
193 } else
194 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
195 }
196 return mc; 186 return mc;
197} 187}
198 188
@@ -202,8 +192,7 @@ static int install_equiv_cpu_table(const u8 *buf)
202 unsigned int *buf_pos = (unsigned int *)container_hdr; 192 unsigned int *buf_pos = (unsigned int *)container_hdr;
203 unsigned long size; 193 unsigned long size;
204 194
205 if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE)) 195 get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE);
206 return 0;
207 196
208 size = buf_pos[2]; 197 size = buf_pos[2];
209 198
@@ -219,10 +208,7 @@ static int install_equiv_cpu_table(const u8 *buf)
219 } 208 }
220 209
221 buf += UCODE_CONTAINER_HEADER_SIZE; 210 buf += UCODE_CONTAINER_HEADER_SIZE;
222 if (get_ucode_data(equiv_cpu_table, buf, size)) { 211 get_ucode_data(equiv_cpu_table, buf, size);
223 vfree(equiv_cpu_table);
224 return 0;
225 }
226 212
227 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ 213 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
228} 214}
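Note: get_ucode_data() disappears here while its callers remain, so the helper has presumably moved into a header (asm/microcode.h is the natural home) as a trivial inline, something like:

	/* assumed new form of the helper, now that error returns are gone */
	static inline void get_ucode_data(void *to, const u8 *from, size_t n)
	{
		memcpy(to, from, n);
	}

Since memcpy() cannot fail, dropping the error-return plumbing also lets the vmalloc()+memset() pair above collapse into vzalloc() with a simple early return.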
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index ba0f0ca9f280..c01ffa5b9b87 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -143,7 +143,7 @@ static void flush_gart(void)
143 143
144 spin_lock_irqsave(&iommu_bitmap_lock, flags); 144 spin_lock_irqsave(&iommu_bitmap_lock, flags);
145 if (need_flush) { 145 if (need_flush) {
146 k8_flush_garts(); 146 amd_flush_garts();
147 need_flush = false; 147 need_flush = false;
148 } 148 }
149 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 149 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
@@ -561,17 +561,17 @@ static void enable_gart_translations(void)
561{ 561{
562 int i; 562 int i;
563 563
564 if (!k8_northbridges.gart_supported) 564 if (!amd_nb_has_feature(AMD_NB_GART))
565 return; 565 return;
566 566
567 for (i = 0; i < k8_northbridges.num; i++) { 567 for (i = 0; i < amd_nb_num(); i++) {
568 struct pci_dev *dev = k8_northbridges.nb_misc[i]; 568 struct pci_dev *dev = node_to_amd_nb(i)->misc;
569 569
570 enable_gart_translation(dev, __pa(agp_gatt_table)); 570 enable_gart_translation(dev, __pa(agp_gatt_table));
571 } 571 }
572 572
573 /* Flush the GART-TLB to remove stale entries */ 573 /* Flush the GART-TLB to remove stale entries */
574 k8_flush_garts(); 574 amd_flush_garts();
575} 575}
576 576
577/* 577/*
@@ -596,13 +596,13 @@ static void gart_fixup_northbridges(struct sys_device *dev)
596 if (!fix_up_north_bridges) 596 if (!fix_up_north_bridges)
597 return; 597 return;
598 598
599 if (!k8_northbridges.gart_supported) 599 if (!amd_nb_has_feature(AMD_NB_GART))
600 return; 600 return;
601 601
602 pr_info("PCI-DMA: Restoring GART aperture settings\n"); 602 pr_info("PCI-DMA: Restoring GART aperture settings\n");
603 603
604 for (i = 0; i < k8_northbridges.num; i++) { 604 for (i = 0; i < amd_nb_num(); i++) {
605 struct pci_dev *dev = k8_northbridges.nb_misc[i]; 605 struct pci_dev *dev = node_to_amd_nb(i)->misc;
606 606
607 /* 607 /*
608 * Don't enable translations just yet. That is the next 608 * Don't enable translations just yet. That is the next
@@ -644,7 +644,7 @@ static struct sys_device device_gart = {
644 * Private Northbridge GATT initialization in case we cannot use the 644 * Private Northbridge GATT initialization in case we cannot use the
645 * AGP driver for some reason. 645 * AGP driver for some reason.
646 */ 646 */
647static __init int init_k8_gatt(struct agp_kern_info *info) 647static __init int init_amd_gatt(struct agp_kern_info *info)
648{ 648{
649 unsigned aper_size, gatt_size, new_aper_size; 649 unsigned aper_size, gatt_size, new_aper_size;
650 unsigned aper_base, new_aper_base; 650 unsigned aper_base, new_aper_base;
@@ -656,8 +656,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
656 656
657 aper_size = aper_base = info->aper_size = 0; 657 aper_size = aper_base = info->aper_size = 0;
658 dev = NULL; 658 dev = NULL;
659 for (i = 0; i < k8_northbridges.num; i++) { 659 for (i = 0; i < amd_nb_num(); i++) {
660 dev = k8_northbridges.nb_misc[i]; 660 dev = node_to_amd_nb(i)->misc;
661 new_aper_base = read_aperture(dev, &new_aper_size); 661 new_aper_base = read_aperture(dev, &new_aper_size);
662 if (!new_aper_base) 662 if (!new_aper_base)
663 goto nommu; 663 goto nommu;
@@ -725,13 +725,13 @@ static void gart_iommu_shutdown(void)
725 if (!no_agp) 725 if (!no_agp)
726 return; 726 return;
727 727
728 if (!k8_northbridges.gart_supported) 728 if (!amd_nb_has_feature(AMD_NB_GART))
729 return; 729 return;
730 730
731 for (i = 0; i < k8_northbridges.num; i++) { 731 for (i = 0; i < amd_nb_num(); i++) {
732 u32 ctl; 732 u32 ctl;
733 733
734 dev = k8_northbridges.nb_misc[i]; 734 dev = node_to_amd_nb(i)->misc;
735 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); 735 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
736 736
737 ctl &= ~GARTEN; 737 ctl &= ~GARTEN;
@@ -749,14 +749,14 @@ int __init gart_iommu_init(void)
749 unsigned long scratch; 749 unsigned long scratch;
750 long i; 750 long i;
751 751
752 if (!k8_northbridges.gart_supported) 752 if (!amd_nb_has_feature(AMD_NB_GART))
753 return 0; 753 return 0;
754 754
755#ifndef CONFIG_AGP_AMD64 755#ifndef CONFIG_AGP_AMD64
756 no_agp = 1; 756 no_agp = 1;
757#else 757#else
758 /* Makefile puts PCI initialization via subsys_initcall first. */ 758 /* Makefile puts PCI initialization via subsys_initcall first. */
759 /* Add other K8 AGP bridge drivers here */ 759 /* Add other AMD AGP bridge drivers here */
760 no_agp = no_agp || 760 no_agp = no_agp ||
761 (agp_amd64_init() < 0) || 761 (agp_amd64_init() < 0) ||
762 (agp_copy_info(agp_bridge, &info) < 0); 762 (agp_copy_info(agp_bridge, &info) < 0);
@@ -765,7 +765,7 @@ int __init gart_iommu_init(void)
765 if (no_iommu || 765 if (no_iommu ||
766 (!force_iommu && max_pfn <= MAX_DMA32_PFN) || 766 (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
767 !gart_iommu_aperture || 767 !gart_iommu_aperture ||
768 (no_agp && init_k8_gatt(&info) < 0)) { 768 (no_agp && init_amd_gatt(&info) < 0)) {
769 if (max_pfn > MAX_DMA32_PFN) { 769 if (max_pfn > MAX_DMA32_PFN) {
770 pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); 770 pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
771 pr_warning("falling back to iommu=soft.\n"); 771 pr_warning("falling back to iommu=soft.\n");
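Note: the open-coded k8_northbridges globals give way to accessor functions. The iteration idiom now used throughout this file looks like this; for_each_gart_nb() is a hypothetical wrapper, while the accessors are the real new API shown in the hunks above:

	/* hypothetical wrapper around the new amd_nb accessors */
	static void for_each_gart_nb(void (*fn)(struct pci_dev *misc))
	{
		int i;

		if (!amd_nb_has_feature(AMD_NB_GART))
			return;

		for (i = 0; i < amd_nb_num(); i++)
			fn(node_to_amd_nb(i)->misc);
	}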
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index fda313ebbb03..c8e41e90f59c 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -43,17 +43,33 @@ static void rdc321x_reset(struct pci_dev *dev)
43 outb(1, 0x92); 43 outb(1, 0x92);
44} 44}
45 45
46static void ce4100_reset(struct pci_dev *dev)
47{
48 int i;
49
50 for (i = 0; i < 10; i++) {
51 outb(0x2, 0xcf9);
52 udelay(50);
53 }
54}
55
46struct device_fixup { 56struct device_fixup {
47 unsigned int vendor; 57 unsigned int vendor;
48 unsigned int device; 58 unsigned int device;
49 void (*reboot_fixup)(struct pci_dev *); 59 void (*reboot_fixup)(struct pci_dev *);
50}; 60};
51 61
62/*
63 * PCI ids solely used for fixups_table go here
64 */
65#define PCI_DEVICE_ID_INTEL_CE4100 0x0708
66
52static const struct device_fixup fixups_table[] = { 67static const struct device_fixup fixups_table[] = {
53{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, 68{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
54{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, 69{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
55{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset }, 70{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset },
56{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset }, 71{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset },
72{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CE4100, ce4100_reset },
57}; 73};
58 74
59/* 75/*
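Note: for context, fixups_table is consumed at reboot time by probing for each listed device; roughly (a sketch of the walk this file performs, not part of the patch):

	/* sketch: how fixups_table is walked at reboot time */
	static void run_reboot_fixups(void)
	{
		const struct device_fixup *fix;
		struct pci_dev *dev;
		size_t i;

		for (i = 0; i < ARRAY_SIZE(fixups_table); i++) {
			fix = &fixups_table[i];
			dev = pci_get_device(fix->vendor, fix->device, NULL);
			if (dev) {
				fix->reboot_fixup(dev);	/* e.g. ce4100_reset() */
				pci_dev_put(dev);
			}
		}
	}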
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index a0f52af256a0..d3cfe26c0252 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -705,7 +705,7 @@ static u64 __init get_max_mapped(void)
705void __init setup_arch(char **cmdline_p) 705void __init setup_arch(char **cmdline_p)
706{ 706{
707 int acpi = 0; 707 int acpi = 0;
708 int k8 = 0; 708 int amd = 0;
709 unsigned long flags; 709 unsigned long flags;
710 710
711#ifdef CONFIG_X86_32 711#ifdef CONFIG_X86_32
@@ -991,12 +991,12 @@ void __init setup_arch(char **cmdline_p)
991 acpi = acpi_numa_init(); 991 acpi = acpi_numa_init();
992#endif 992#endif
993 993
994#ifdef CONFIG_K8_NUMA 994#ifdef CONFIG_AMD_NUMA
995 if (!acpi) 995 if (!acpi)
996 k8 = !k8_numa_init(0, max_pfn); 996 amd = !amd_numa_init(0, max_pfn);
997#endif 997#endif
998 998
999 initmem_init(0, max_pfn, acpi, k8); 999 initmem_init(0, max_pfn, acpi, amd);
1000 memblock_find_dma_reserve(); 1000 memblock_find_dma_reserve();
1001 dma32_reserve_bootmem(); 1001 dma32_reserve_bootmem();
1002 1002
@@ -1045,10 +1045,7 @@ void __init setup_arch(char **cmdline_p)
1045#endif 1045#endif
1046 1046
1047 init_apic_mappings(); 1047 init_apic_mappings();
1048 ioapic_init_mappings(); 1048 ioapic_and_gsi_init();
1049
1050 /* need to wait for io_apic is mapped */
1051 probe_nr_irqs_gsi();
1052 1049
1053 kvm_guest_init(); 1050 kvm_guest_init();
1054 1051
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 68f61ac632e1..ee886fe10ef4 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1161,6 +1161,20 @@ out:
1161 preempt_enable(); 1161 preempt_enable();
1162} 1162}
1163 1163
1164void arch_disable_nonboot_cpus_begin(void)
1165{
1166 /*
1167 * Avoid the smp alternatives switch during the disable_nonboot_cpus().
1168	 * In the suspend path, we will be back in SMP mode shortly anyway.
1169 */
1170 skip_smp_alternatives = true;
1171}
1172
1173void arch_disable_nonboot_cpus_end(void)
1174{
1175 skip_smp_alternatives = false;
1176}
1177
1164void arch_enable_nonboot_cpus_begin(void) 1178void arch_enable_nonboot_cpus_begin(void)
1165{ 1179{
1166 set_mtrr_aps_delayed_init(); 1180 set_mtrr_aps_delayed_init();
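Note: these two functions override weak no-op hooks that the generic disable_nonboot_cpus() path calls around CPU teardown. The bracketing, schematically (a sketch assuming the generic weak-default pattern, not the literal kernel/cpu.c code):

	/* weak defaults, overridden by the x86 versions above */
	void __weak arch_disable_nonboot_cpus_begin(void) { }
	void __weak arch_disable_nonboot_cpus_end(void) { }

	static int disable_nonboot_cpus_sketch(void)
	{
		int cpu, error = 0;

		arch_disable_nonboot_cpus_begin();	/* x86: skip SMP alternatives */
		for_each_online_cpu(cpu) {
			if (cpu == 0)
				continue;
			error = cpu_down(cpu);
			if (error)
				break;
		}
		arch_disable_nonboot_cpus_end();	/* x86: allow switching again */

		return error;
	}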
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index 3af2dff58b21..075d130efcf9 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -127,7 +127,7 @@ startup_64:
127no_longmode: 127no_longmode:
128 hlt 128 hlt
129 jmp no_longmode 129 jmp no_longmode
130#include "verify_cpu_64.S" 130#include "verify_cpu.S"
131 131
132 # Careful these need to be in the same 64K segment as the above; 132 # Careful these need to be in the same 64K segment as the above;
133tidt: 133tidt:
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 0c40d8b72416..356a0d455cf9 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -872,6 +872,9 @@ __cpuinit int unsynchronized_tsc(void)
872 872
873 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 873 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
874 return 0; 874 return 0;
875
876 if (tsc_clocksource_reliable)
877 return 0;
875 /* 878 /*
876 * Intel systems are normally all synchronized. 879 * Intel systems are normally all synchronized.
877 * Exceptions must mark TSC as unstable: 880 * Exceptions must mark TSC as unstable:
@@ -879,14 +882,92 @@ __cpuinit int unsynchronized_tsc(void)
879 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { 882 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
880 /* assume multi socket systems are not synchronized: */ 883 /* assume multi socket systems are not synchronized: */
881 if (num_possible_cpus() > 1) 884 if (num_possible_cpus() > 1)
882 tsc_unstable = 1; 885 return 1;
883 } 886 }
884 887
885 return tsc_unstable; 888 return 0;
889}
890
891
892static void tsc_refine_calibration_work(struct work_struct *work);
893static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
894/**
895 * tsc_refine_calibration_work - Further refine tsc freq calibration
896 * @work: ignored.
897 *
898 * This function uses delayed work over a period of a
899 * second to further refine the TSC freq value. Since this is
900 * timer based, instead of loop based, we don't block the boot
901 * process while this longer calibration is done.
902 *
903 * If there are any calibration anomalies (too many SMIs, etc.),
904 * or the refined calibration is off by more than 1% from the fast
905 * early calibration, we throw out the new calibration and use the
906 * early calibration.
907 */
908static void tsc_refine_calibration_work(struct work_struct *work)
909{
910 static u64 tsc_start = -1, ref_start;
911 static int hpet;
912 u64 tsc_stop, ref_stop, delta;
913 unsigned long freq;
914
915 /* Don't bother refining TSC on unstable systems */
916 if (check_tsc_unstable())
917 goto out;
918
919 /*
920 * Since the work is started early in boot, we may be
921 * delayed the first time we expire. So set the workqueue
922 * again once we know timers are working.
923 */
924 if (tsc_start == -1) {
925 /*
926 * Only set hpet once, to avoid mixing hardware
927 * if the hpet becomes enabled later.
928 */
929 hpet = is_hpet_enabled();
930 schedule_delayed_work(&tsc_irqwork, HZ);
931 tsc_start = tsc_read_refs(&ref_start, hpet);
932 return;
933 }
934
935 tsc_stop = tsc_read_refs(&ref_stop, hpet);
936
937	/* hpet or pmtimer available? */
938 if (!hpet && !ref_start && !ref_stop)
939 goto out;
940
941	/* Check whether the sampling was disturbed by an SMI */
942 if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX)
943 goto out;
944
945 delta = tsc_stop - tsc_start;
946 delta *= 1000000LL;
947 if (hpet)
948 freq = calc_hpet_ref(delta, ref_start, ref_stop);
949 else
950 freq = calc_pmtimer_ref(delta, ref_start, ref_stop);
951
952 /* Make sure we're within 1% */
953 if (abs(tsc_khz - freq) > tsc_khz/100)
954 goto out;
955
956 tsc_khz = freq;
957 printk(KERN_INFO "Refined TSC clocksource calibration: "
958 "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000,
959 (unsigned long)tsc_khz % 1000);
960
961out:
962 clocksource_register_khz(&clocksource_tsc, tsc_khz);
886} 963}
887 964
888static void __init init_tsc_clocksource(void) 965
966static int __init init_tsc_clocksource(void)
889{ 967{
968 if (!cpu_has_tsc || tsc_disabled > 0)
969 return 0;
970
890 if (tsc_clocksource_reliable) 971 if (tsc_clocksource_reliable)
891 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; 972 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
892	/* lower the rating if we already know it's unstable: */ 973	/* lower the rating if we already know it's unstable: */
@@ -894,8 +975,14 @@ static void __init init_tsc_clocksource(void)
894 clocksource_tsc.rating = 0; 975 clocksource_tsc.rating = 0;
895 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; 976 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
896 } 977 }
897 clocksource_register_khz(&clocksource_tsc, tsc_khz); 978 schedule_delayed_work(&tsc_irqwork, 0);
979 return 0;
898} 980}
981/*
982 * We use device_initcall here, to ensure we run after the hpet
983 * is fully initialized, which may occur at fs_initcall time.
984 */
985device_initcall(init_tsc_clocksource);
899 986
900void __init tsc_init(void) 987void __init tsc_init(void)
901{ 988{
@@ -949,6 +1036,5 @@ void __init tsc_init(void)
949 mark_tsc_unstable("TSCs unsynchronized"); 1036 mark_tsc_unstable("TSCs unsynchronized");
950 1037
951 check_system_tsc_reliable(); 1038 check_system_tsc_reliable();
952 init_tsc_clocksource();
953} 1039}
954 1040
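Note: the refinement work re-arms itself once: the first expiry only records a start sample, the second computes the refined frequency. Stripped of the x86 specifics, the two-phase pattern is (read_counter() and finish_calibration() are placeholders, not real kernel API):

	extern u64 read_counter(void);			/* placeholder sample source */
	extern void finish_calibration(u64 start, u64 stop);	/* placeholder */

	static void refine_fn(struct work_struct *work);
	static DECLARE_DELAYED_WORK(refine_work, refine_fn);

	static void refine_fn(struct work_struct *work)
	{
		static u64 start = (u64)-1;

		if (start == (u64)-1) {
			/* phase 1: record a sample, re-arm for one more period */
			start = read_counter();
			schedule_delayed_work(&refine_work, HZ);
			return;
		}
		/* phase 2: a full period has elapsed, finish the calibration */
		finish_calibration(start, read_counter());
	}

Registering init_tsc_clocksource() as a device_initcall, rather than calling it from tsc_init(), is what guarantees the HPET, which may only come up at fs_initcall time, is available as a reference.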
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu.S
index 56a8c2a867d9..0edefc19a113 100644
--- a/arch/x86/kernel/verify_cpu_64.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -7,6 +7,7 @@
7 * Copyright (c) 2007 Andi Kleen (ak@suse.de) 7 * Copyright (c) 2007 Andi Kleen (ak@suse.de)
8 * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com) 8 * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com)
9 * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com) 9 * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com)
10 * Copyright (c) 2010 Kees Cook (kees.cook@canonical.com)
10 * 11 *
11 * This source code is licensed under the GNU General Public License, 12 * This source code is licensed under the GNU General Public License,
12 * Version 2. See the file COPYING for more details. 13 * Version 2. See the file COPYING for more details.
@@ -14,18 +15,17 @@
14 * This is common code that verifies whether the CPU supports 15 * This is common code that verifies whether the CPU supports
15 * long mode and SSE. It is not called directly; instead, this 16 * long mode and SSE. It is not called directly; instead, this
16 * file is included at various places and compiled in that context. 17 * file is included at various places and compiled in that context.
17 * Following are the current usage. 18 * This file is expected to run in 32bit code. Currently:
18 * 19 *
19 * This file is included by both 16bit and 32bit code. 20 * arch/x86/boot/compressed/head_64.S: Boot cpu verification
21 * arch/x86/kernel/trampoline_64.S: secondary processor verification
22 * arch/x86/kernel/head_32.S: processor startup
20 * 23 *
21 * arch/x86_64/boot/setup.S : Boot cpu verification (16bit) 24 * verify_cpu returns the status of longmode and SSE in register %eax.
22 * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit)
23 * arch/x86_64/kernel/trampoline.S: secondary processor verfication (16bit)
24 * arch/x86_64/kernel/acpi/wakeup.S:Verfication at resume (16bit)
25 *
26 * verify_cpu, returns the status of cpu check in register %eax.
27 * 0: Success 1: Failure 25 * 0: Success 1: Failure
28 * 26 *
27 * On Intel, the XD_DISABLE flag will be cleared as a side-effect.
28 *
29 * The caller needs to check for the error code and take the action 29 * The caller needs to check for the error code and take the action
30 * appropriately. Either display a message or halt. 30 * appropriately. Either display a message or halt.
31 */ 31 */
@@ -62,8 +62,41 @@ verify_cpu:
62 cmpl $0x444d4163,%ecx 62 cmpl $0x444d4163,%ecx
63 jnz verify_cpu_noamd 63 jnz verify_cpu_noamd
64 mov $1,%di # cpu is from AMD 64 mov $1,%di # cpu is from AMD
65 jmp verify_cpu_check
65 66
66verify_cpu_noamd: 67verify_cpu_noamd:
68 cmpl $0x756e6547,%ebx # GenuineIntel?
69 jnz verify_cpu_check
70 cmpl $0x49656e69,%edx
71 jnz verify_cpu_check
72 cmpl $0x6c65746e,%ecx
73 jnz verify_cpu_check
74
75	# only touch MSR_IA32_MISC_ENABLE when:
76 # family > 6 || (family == 6 && model >= 0xd)
77 movl $0x1, %eax # check CPU family and model
78 cpuid
79 movl %eax, %ecx
80
81 andl $0x0ff00f00, %eax # mask family and extended family
82 shrl $8, %eax
83 cmpl $6, %eax
84 ja verify_cpu_clear_xd # family > 6, ok
85 jb verify_cpu_check # family < 6, skip
86
87 andl $0x000f00f0, %ecx # mask model and extended model
88 shrl $4, %ecx
89 cmpl $0xd, %ecx
90 jb verify_cpu_check # family == 6, model < 0xd, skip
91
92verify_cpu_clear_xd:
93 movl $MSR_IA32_MISC_ENABLE, %ecx
94 rdmsr
95 btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE
96 jnc verify_cpu_check # only write MSR if bit was changed
97 wrmsr
98
99verify_cpu_check:
67 movl $0x1,%eax # Does the cpu have what it takes 100 movl $0x1,%eax # Does the cpu have what it takes
68 cpuid 101 cpuid
69 andl $REQUIRED_MASK0,%edx 102 andl $REQUIRED_MASK0,%edx
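Note: after an rdmsr of MSR_IA32_MISC_ENABLE, %edx holds bits 63:32, so btrl $2, %edx clears bit 34 of the MSR, the XD_DISABLE bit some BIOSes set. A C restatement, illustrative only since the real check must run this early in assembly, and assuming the MSR_IA32_MISC_ENABLE_XD_DISABLE define (bit 34) referenced in the comment above:

	/* C restatement of the XD_DISABLE clearing done in verify_cpu */
	static void clear_xd_disable(void)
	{
		u64 misc;

		rdmsrl(MSR_IA32_MISC_ENABLE, misc);
		if (misc & MSR_IA32_MISC_ENABLE_XD_DISABLE) {
			misc &= ~MSR_IA32_MISC_ENABLE_XD_DISABLE;
			wrmsrl(MSR_IA32_MISC_ENABLE, misc);	/* NX usable again */
		}
	}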
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index e03530aebfd0..bf4700755184 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -69,7 +69,7 @@ jiffies_64 = jiffies;
69 69
70PHDRS { 70PHDRS {
71 text PT_LOAD FLAGS(5); /* R_E */ 71 text PT_LOAD FLAGS(5); /* R_E */
72 data PT_LOAD FLAGS(7); /* RWE */ 72 data PT_LOAD FLAGS(6); /* RW_ */
73#ifdef CONFIG_X86_64 73#ifdef CONFIG_X86_64
74 user PT_LOAD FLAGS(5); /* R_E */ 74 user PT_LOAD FLAGS(5); /* R_E */
75#ifdef CONFIG_SMP 75#ifdef CONFIG_SMP
@@ -116,6 +116,10 @@ SECTIONS
116 116
117 EXCEPTION_TABLE(16) :text = 0x9090 117 EXCEPTION_TABLE(16) :text = 0x9090
118 118
119#if defined(CONFIG_DEBUG_RODATA)
119#if defined(CONFIG_DEBUG_RODATA) 120 /* .text should occupy a whole number of pages */
121 . = ALIGN(PAGE_SIZE);
122#endif
119 X64_ALIGN_DEBUG_RODATA_BEGIN 123 X64_ALIGN_DEBUG_RODATA_BEGIN
120 RO_DATA(PAGE_SIZE) 124 RO_DATA(PAGE_SIZE)
121 X64_ALIGN_DEBUG_RODATA_END 125 X64_ALIGN_DEBUG_RODATA_END
@@ -335,7 +339,7 @@ SECTIONS
335 __bss_start = .; 339 __bss_start = .;
336 *(.bss..page_aligned) 340 *(.bss..page_aligned)
337 *(.bss) 341 *(.bss)
338 . = ALIGN(4); 342 . = ALIGN(PAGE_SIZE);
339 __bss_stop = .; 343 __bss_stop = .;
340 } 344 }
341 345
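Note: the FLAGS() values in PHDRS are raw ELF p_flags bit masks (PF_X=1, PF_W=2, PF_R=4), so changing the data segment from 7 to 6 drops exactly the execute bit. A userspace check of the arithmetic:

	#include <assert.h>
	#include <elf.h>	/* PF_X = 1, PF_W = 2, PF_R = 4 */

	int main(void)
	{
		assert((PF_R | PF_X) == 5);		/* text: R_E       */
		assert((PF_R | PF_W) == 6);		/* data: RW_ (new) */
		assert((PF_R | PF_W | PF_X) == 7);	/* data: RWE (old) */
		return 0;
	}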
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index e7d5382ef263..4f420c2f2d55 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -4,7 +4,6 @@
4#include <asm/asm-offsets.h> 4#include <asm/asm-offsets.h>
5#include <asm/thread_info.h> 5#include <asm/thread_info.h>
6#include <asm/processor-flags.h> 6#include <asm/processor-flags.h>
7#include <asm/pgtable.h>
8 7
9/*G:020 8/*G:020
10 * Our story starts with the kernel booting into startup_32 in 9 * Our story starts with the kernel booting into startup_32 in
@@ -38,113 +37,9 @@ ENTRY(lguest_entry)
38 /* Set up the initial stack so we can run C code. */ 37 /* Set up the initial stack so we can run C code. */
39 movl $(init_thread_union+THREAD_SIZE),%esp 38 movl $(init_thread_union+THREAD_SIZE),%esp
40 39
41 call init_pagetables
42
43 /* Jumps are relative: we're running __PAGE_OFFSET too low. */ 40 /* Jumps are relative: we're running __PAGE_OFFSET too low. */
44 jmp lguest_init+__PAGE_OFFSET 41 jmp lguest_init+__PAGE_OFFSET
45 42
46/*
47 * Initialize page tables. This creates a PDE and a set of page
48 * tables, which are located immediately beyond __brk_base. The variable
49 * _brk_end is set up to point to the first "safe" location.
50 * Mappings are created both at virtual address 0 (identity mapping)
51 * and PAGE_OFFSET for up to _end.
52 *
53 * FIXME: This code is taken verbatim from arch/x86/kernel/head_32.S: they
54 * don't have a stack at this point, so we can't just use call and ret.
55 */
56init_pagetables:
57#if PTRS_PER_PMD > 1
58#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
59#else
60#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
61#endif
62#define pa(X) ((X) - __PAGE_OFFSET)
63
64/* Enough space to fit pagetables for the low memory linear map */
65MAPPING_BEYOND_END = \
66 PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT
67#ifdef CONFIG_X86_PAE
68
69 /*
70 * In PAE mode initial_page_table is statically defined to contain
71 * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3
72 * entries). The identity mapping is handled by pointing two PGD entries
73 * to the first kernel PMD.
74 *
75 * Note the upper half of each PMD or PTE are always zero at this stage.
76 */
77
78#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
79
80 xorl %ebx,%ebx /* %ebx is kept at zero */
81
82 movl $pa(__brk_base), %edi
83 movl $pa(initial_pg_pmd), %edx
84 movl $PTE_IDENT_ATTR, %eax
8510:
86 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
87 movl %ecx,(%edx) /* Store PMD entry */
88 /* Upper half already zero */
89 addl $8,%edx
90 movl $512,%ecx
9111:
92 stosl
93 xchgl %eax,%ebx
94 stosl
95 xchgl %eax,%ebx
96 addl $0x1000,%eax
97 loop 11b
98
99 /*
100 * End condition: we must map up to the end + MAPPING_BEYOND_END.
101 */
102 movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
103 cmpl %ebp,%eax
104 jb 10b
1051:
106 addl $__PAGE_OFFSET, %edi
107 movl %edi, pa(_brk_end)
108 shrl $12, %eax
109 movl %eax, pa(max_pfn_mapped)
110
111 /* Do early initialization of the fixmap area */
112 movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
113 movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
114#else /* Not PAE */
115
116page_pde_offset = (__PAGE_OFFSET >> 20);
117
118 movl $pa(__brk_base), %edi
119 movl $pa(initial_page_table), %edx
120 movl $PTE_IDENT_ATTR, %eax
12110:
122 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
123 movl %ecx,(%edx) /* Store identity PDE entry */
124 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
125 addl $4,%edx
126 movl $1024, %ecx
12711:
128 stosl
129 addl $0x1000,%eax
130 loop 11b
131 /*
132 * End condition: we must map up to the end + MAPPING_BEYOND_END.
133 */
134 movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
135 cmpl %ebp,%eax
136 jb 10b
137 addl $__PAGE_OFFSET, %edi
138 movl %edi, pa(_brk_end)
139 shrl $12, %eax
140 movl %eax, pa(max_pfn_mapped)
141
142 /* Do early initialization of the fixmap area */
143 movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
144 movl %eax,pa(initial_page_table+0xffc)
145#endif
146 ret
147
148/*G:055 43/*G:055
149 * We create a macro which puts the assembler code between lgstart_ and lgend_ 44 * We create a macro which puts the assembler code between lgstart_ and lgend_
150 * markers. These templates are put in the .text section: they can't be 45 * markers. These templates are put in the .text section: they can't be
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 55543397a8a7..09df2f9a3d69 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -23,7 +23,7 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
23obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o 23obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
24 24
25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o 25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
26obj-$(CONFIG_K8_NUMA) += k8topology_64.o 26obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o
27obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o 27obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
28 28
29obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 29obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/amdtopology_64.c
index 804a3b6c6e14..51fae9cfdecb 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * AMD K8 NUMA support. 2 * AMD NUMA support.
3 * Discover the memory map and associated nodes. 3 * Discover the memory map and associated nodes.
4 * 4 *
5 * This version reads it directly from the K8 northbridge. 5 * This version reads it directly from the AMD northbridge.
6 * 6 *
7 * Copyright 2002,2003 Andi Kleen, SuSE Labs. 7 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
8 */ 8 */
@@ -57,7 +57,7 @@ static __init void early_get_boot_cpu_id(void)
57{ 57{
58 /* 58 /*
59 * need to get the APIC ID of the BSP so we can use that to 59 * need to get the APIC ID of the BSP so we can use that to
60 * create apicid_to_node in k8_scan_nodes() 60 * create apicid_to_node in amd_scan_nodes()
61 */ 61 */
62#ifdef CONFIG_X86_MPPARSE 62#ifdef CONFIG_X86_MPPARSE
63 /* 63 /*
@@ -69,7 +69,7 @@ static __init void early_get_boot_cpu_id(void)
69 early_init_lapic_mapping(); 69 early_init_lapic_mapping();
70} 70}
71 71
72int __init k8_get_nodes(struct bootnode *physnodes) 72int __init amd_get_nodes(struct bootnode *physnodes)
73{ 73{
74 int i; 74 int i;
75 int ret = 0; 75 int ret = 0;
@@ -82,7 +82,7 @@ int __init k8_get_nodes(struct bootnode *physnodes)
82 return ret; 82 return ret;
83} 83}
84 84
85int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn) 85int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
86{ 86{
87 unsigned long start = PFN_PHYS(start_pfn); 87 unsigned long start = PFN_PHYS(start_pfn);
88 unsigned long end = PFN_PHYS(end_pfn); 88 unsigned long end = PFN_PHYS(end_pfn);
@@ -194,7 +194,7 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
194 return 0; 194 return 0;
195} 195}
196 196
197int __init k8_scan_nodes(void) 197int __init amd_scan_nodes(void)
198{ 198{
199 unsigned int bits; 199 unsigned int bits;
200 unsigned int cores; 200 unsigned int cores;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index c0e28a13de7d..947f42abe820 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -364,8 +364,9 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
364 /* 364 /*
365	 * We just marked the kernel text read-only above; now that 365	 * We just marked the kernel text read-only above; now that
366	 * we are going to free part of it, we need to make that 366	 * we are going to free part of it, we need to make that
367 * writeable first. 367 * writeable and non-executable first.
368 */ 368 */
369 set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
369 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); 370 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
370 371
371 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); 372 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 0e969f9f401b..f89b5bb4e93f 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -226,7 +226,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
226 226
227static inline int is_kernel_text(unsigned long addr) 227static inline int is_kernel_text(unsigned long addr)
228{ 228{
229 if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) 229 if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
230 return 1; 230 return 1;
231 return 0; 231 return 0;
232} 232}
@@ -912,6 +912,23 @@ void set_kernel_text_ro(void)
912 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 912 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
913} 913}
914 914
915static void mark_nxdata_nx(void)
916{
917 /*
918	 * When this is called, init has already been executed and released,
919	 * so everything past _etext should be NX.
920 */
921 unsigned long start = PFN_ALIGN(_etext);
922 /*
923	 * This range ends at the is_kernel_text() upper limit, rounded up to a huge page since huge pages may map this area:
924 */
925 unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
926
927 if (__supported_pte_mask & _PAGE_NX)
928 printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10);
929 set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT);
930}
931
915void mark_rodata_ro(void) 932void mark_rodata_ro(void)
916{ 933{
917 unsigned long start = PFN_ALIGN(_text); 934 unsigned long start = PFN_ALIGN(_text);
@@ -946,6 +963,7 @@ void mark_rodata_ro(void)
946 printk(KERN_INFO "Testing CPA: write protecting again\n"); 963 printk(KERN_INFO "Testing CPA: write protecting again\n");
947 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 964 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
948#endif 965#endif
966 mark_nxdata_nx();
949} 967}
950#endif 968#endif
951 969
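Note: the range mark_nxdata_nx() protects runs from the first page after text to the end of the (now freed) init area, rounded up to a huge-page boundary because that region may be mapped with huge pages. As a hypothetical helper, the page count handed to set_pages_nx() is:

	/* hypothetical helper restating the range computed above */
	static unsigned long nxdata_pages(void)
	{
		unsigned long start = PFN_ALIGN((unsigned long)_etext);
		unsigned long end = ((unsigned long)__init_end + HPAGE_SIZE)
				    & HPAGE_MASK;	/* round up to a huge page */

		return (end - start) >> PAGE_SHIFT;	/* pages for set_pages_nx() */
	}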
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 7ffc9b727efd..7762a517d69d 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -264,7 +264,7 @@ static struct bootnode physnodes[MAX_NUMNODES] __initdata;
264static char *cmdline __initdata; 264static char *cmdline __initdata;
265 265
266static int __init setup_physnodes(unsigned long start, unsigned long end, 266static int __init setup_physnodes(unsigned long start, unsigned long end,
267 int acpi, int k8) 267 int acpi, int amd)
268{ 268{
269 int nr_nodes = 0; 269 int nr_nodes = 0;
270 int ret = 0; 270 int ret = 0;
@@ -274,13 +274,13 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
274 if (acpi) 274 if (acpi)
275 nr_nodes = acpi_get_nodes(physnodes); 275 nr_nodes = acpi_get_nodes(physnodes);
276#endif 276#endif
277#ifdef CONFIG_K8_NUMA 277#ifdef CONFIG_AMD_NUMA
278 if (k8) 278 if (amd)
279 nr_nodes = k8_get_nodes(physnodes); 279 nr_nodes = amd_get_nodes(physnodes);
280#endif 280#endif
281 /* 281 /*
282 * Basic sanity checking on the physical node map: there may be errors 282 * Basic sanity checking on the physical node map: there may be errors
283 * if the SRAT or K8 incorrectly reported the topology or the mem= 283 * if the SRAT or AMD code incorrectly reported the topology or the mem=
284 * kernel parameter is used. 284 * kernel parameter is used.
285 */ 285 */
286 for (i = 0; i < nr_nodes; i++) { 286 for (i = 0; i < nr_nodes; i++) {
@@ -549,7 +549,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
549 * numa=fake command-line option. 549 * numa=fake command-line option.
550 */ 550 */
551static int __init numa_emulation(unsigned long start_pfn, 551static int __init numa_emulation(unsigned long start_pfn,
552 unsigned long last_pfn, int acpi, int k8) 552 unsigned long last_pfn, int acpi, int amd)
553{ 553{
554 u64 addr = start_pfn << PAGE_SHIFT; 554 u64 addr = start_pfn << PAGE_SHIFT;
555 u64 max_addr = last_pfn << PAGE_SHIFT; 555 u64 max_addr = last_pfn << PAGE_SHIFT;
@@ -557,7 +557,7 @@ static int __init numa_emulation(unsigned long start_pfn,
557 int num_nodes; 557 int num_nodes;
558 int i; 558 int i;
559 559
560 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); 560 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, amd);
561 /* 561 /*
562 * If the numa=fake command-line contains an 'M' or 'G', it represents 562 * If the numa=fake command-line contains an 'M' or 'G', it represents
563 * the fixed node size. Otherwise, if it is just a single number N, 563 * the fixed node size. Otherwise, if it is just a single number N,
@@ -602,7 +602,7 @@ static int __init numa_emulation(unsigned long start_pfn,
602#endif /* CONFIG_NUMA_EMU */ 602#endif /* CONFIG_NUMA_EMU */
603 603
604void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, 604void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
605 int acpi, int k8) 605 int acpi, int amd)
606{ 606{
607 int i; 607 int i;
608 608
@@ -610,7 +610,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
610 nodes_clear(node_online_map); 610 nodes_clear(node_online_map);
611 611
612#ifdef CONFIG_NUMA_EMU 612#ifdef CONFIG_NUMA_EMU
613 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8)) 613 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd))
614 return; 614 return;
615 nodes_clear(node_possible_map); 615 nodes_clear(node_possible_map);
616 nodes_clear(node_online_map); 616 nodes_clear(node_online_map);
@@ -624,8 +624,8 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
624 nodes_clear(node_online_map); 624 nodes_clear(node_online_map);
625#endif 625#endif
626 626
627#ifdef CONFIG_K8_NUMA 627#ifdef CONFIG_AMD_NUMA
628 if (!numa_off && k8 && !k8_scan_nodes()) 628 if (!numa_off && amd && !amd_scan_nodes())
629 return; 629 return;
630 nodes_clear(node_possible_map); 630 nodes_clear(node_possible_map);
631 nodes_clear(node_online_map); 631 nodes_clear(node_online_map);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 532e7933d606..8b830ca14ac4 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -13,6 +13,7 @@
13#include <linux/pfn.h> 13#include <linux/pfn.h>
14#include <linux/percpu.h> 14#include <linux/percpu.h>
15#include <linux/gfp.h> 15#include <linux/gfp.h>
16#include <linux/pci.h>
16 17
17#include <asm/e820.h> 18#include <asm/e820.h>
18#include <asm/processor.h> 19#include <asm/processor.h>
@@ -255,13 +256,16 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
255 unsigned long pfn) 256 unsigned long pfn)
256{ 257{
257 pgprot_t forbidden = __pgprot(0); 258 pgprot_t forbidden = __pgprot(0);
259 pgprot_t required = __pgprot(0);
258 260
259 /* 261 /*
260 * The BIOS area between 640k and 1Mb needs to be executable for 262 * The BIOS area between 640k and 1Mb needs to be executable for
261 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. 263 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
262 */ 264 */
263 if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) 265#ifdef CONFIG_PCI_BIOS
266 if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
264 pgprot_val(forbidden) |= _PAGE_NX; 267 pgprot_val(forbidden) |= _PAGE_NX;
268#endif
265 269
266 /* 270 /*
267 * The kernel text needs to be executable for obvious reasons 271 * The kernel text needs to be executable for obvious reasons
@@ -278,6 +282,12 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
278 if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, 282 if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
279 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) 283 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
280 pgprot_val(forbidden) |= _PAGE_RW; 284 pgprot_val(forbidden) |= _PAGE_RW;
285 /*
286 * .data and .bss should always be writable.
287 */
288 if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
289 within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
290 pgprot_val(required) |= _PAGE_RW;
281 291
282#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) 292#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
283 /* 293 /*
@@ -317,6 +327,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
317#endif 327#endif
318 328
319 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); 329 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
330 prot = __pgprot(pgprot_val(prot) | pgprot_val(required));
320 331
321 return prot; 332 return prot;
322} 333}
@@ -393,7 +404,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
393{ 404{
394 unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; 405 unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
395 pte_t new_pte, old_pte, *tmp; 406 pte_t new_pte, old_pte, *tmp;
396 pgprot_t old_prot, new_prot; 407 pgprot_t old_prot, new_prot, req_prot;
397 int i, do_split = 1; 408 int i, do_split = 1;
398 unsigned int level; 409 unsigned int level;
399 410
@@ -438,10 +449,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
438 * We are safe now. Check whether the new pgprot is the same: 449 * We are safe now. Check whether the new pgprot is the same:
439 */ 450 */
440 old_pte = *kpte; 451 old_pte = *kpte;
441 old_prot = new_prot = pte_pgprot(old_pte); 452 old_prot = new_prot = req_prot = pte_pgprot(old_pte);
442 453
443 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); 454 pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
444 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); 455 pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
445 456
446 /* 457 /*
447 * old_pte points to the large page base address. So we need 458 * old_pte points to the large page base address. So we need
@@ -450,17 +461,17 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
450 pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); 461 pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
451 cpa->pfn = pfn; 462 cpa->pfn = pfn;
452 463
453 new_prot = static_protections(new_prot, address, pfn); 464 new_prot = static_protections(req_prot, address, pfn);
454 465
455 /* 466 /*
456 * We need to check the full range, whether 467 * We need to check the full range, whether
457	 * static_protections() requires a different pgprot for one of 468	 * static_protections() requires a different pgprot for one of
458 * the pages in the range we try to preserve: 469 * the pages in the range we try to preserve:
459 */ 470 */
460 addr = address + PAGE_SIZE; 471 addr = address & pmask;
461 pfn++; 472 pfn = pte_pfn(old_pte);
462 for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) { 473 for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
463 pgprot_t chk_prot = static_protections(new_prot, addr, pfn); 474 pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
464 475
465 if (pgprot_val(chk_prot) != pgprot_val(new_prot)) 476 if (pgprot_val(chk_prot) != pgprot_val(new_prot))
466 goto out_unlock; 477 goto out_unlock;
@@ -483,7 +494,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
483 * that we limited the number of possible pages already to 494 * that we limited the number of possible pages already to
484 * the number of pages in the large page. 495 * the number of pages in the large page.
485 */ 496 */
486 if (address == (nextpage_addr - psize) && cpa->numpages == numpages) { 497 if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
487 /* 498 /*
488 * The address is aligned and the number of pages 499 * The address is aligned and the number of pages
489 * covers the full page. 500 * covers the full page.
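Note: static_protections() now builds two masks and applies them in opposite directions: forbidden bits are stripped, required bits are forced on, so .data/.bss can never lose _PAGE_RW even if a caller asks for that. The combination at the end of the function is plain mask algebra:

	/* hypothetical helper restating the final mask combination */
	static pgprot_t apply_static_masks(pgprot_t prot, pgprot_t forbidden,
					   pgprot_t required)
	{
		prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
		prot = __pgprot(pgprot_val(prot) | pgprot_val(required));
		return prot;
	}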
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index a3250aa34086..410531d3c292 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -41,7 +41,7 @@ void __init x86_report_nx(void)
41{ 41{
42 if (!cpu_has_nx) { 42 if (!cpu_has_nx) {
43 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " 43 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
44 "missing in CPU or disabled in BIOS!\n"); 44 "missing in CPU!\n");
45 } else { 45 } else {
46#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 46#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
47 if (disable_nx) { 47 if (disable_nx) {
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index a17dffd136c1..f16434568a51 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -92,6 +92,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
92 /* mark this node as "seen" in node bitmap */ 92 /* mark this node as "seen" in node bitmap */
93 BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); 93 BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
94 94
95 /* don't need to check apic_id here, because it is always 8 bits */
95 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; 96 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
96 97
97 printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", 98 printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index a35cb9d8b060..171a0aacb99a 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -134,6 +134,10 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
134 } 134 }
135 135
136 apic_id = pa->apic_id; 136 apic_id = pa->apic_id;
137 if (apic_id >= MAX_LOCAL_APIC) {
138 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
139 return;
140 }
137 apicid_to_node[apic_id] = node; 141 apicid_to_node[apic_id] = node;
138 node_set(node, cpu_nodes_parsed); 142 node_set(node, cpu_nodes_parsed);
139 acpi_numa = 1; 143 acpi_numa = 1;
@@ -168,6 +172,12 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
168 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; 172 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
169 else 173 else
170 apic_id = pa->apic_id; 174 apic_id = pa->apic_id;
175
176 if (apic_id >= MAX_LOCAL_APIC) {
177 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
178 return;
179 }
180
171 apicid_to_node[apic_id] = node; 181 apicid_to_node[apic_id] = node;
172 node_set(node, cpu_nodes_parsed); 182 node_set(node, cpu_nodes_parsed);
173 acpi_numa = 1; 183 acpi_numa = 1;
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 51104b33fd51..c3b8e24f2b16 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -610,6 +610,7 @@ static int force_ibs_eilvt_setup(void)
610 ret = setup_ibs_ctl(i); 610 ret = setup_ibs_ctl(i);
611 if (ret) 611 if (ret)
612 return ret; 612 return ret;
613 pr_err(FW_BUG "using offset %d for IBS interrupts\n", i);
613 return 0; 614 return 0;
614 } 615 }
615 616
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index effd96e33f16..6b8759f7634e 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_PCI_OLPC) += olpc.o
7obj-$(CONFIG_PCI_XEN) += xen.o 7obj-$(CONFIG_PCI_XEN) += xen.o
8 8
9obj-y += fixup.o 9obj-y += fixup.o
10obj-$(CONFIG_X86_INTEL_CE) += ce4100.o
10obj-$(CONFIG_ACPI) += acpi.o 11obj-$(CONFIG_ACPI) += acpi.o
11obj-y += legacy.o irq.o 12obj-y += legacy.o irq.o
12 13
diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c
new file mode 100644
index 000000000000..85b68ef5e809
--- /dev/null
+++ b/arch/x86/pci/ce4100.c
@@ -0,0 +1,315 @@
1/*
2 * GPL LICENSE SUMMARY
3 *
4 * Copyright(c) 2010 Intel Corporation. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of version 2 of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
18 * The full GNU General Public License is included in this distribution
19 * in the file called LICENSE.GPL.
20 *
21 * Contact Information:
22 * Intel Corporation
23 * 2200 Mission College Blvd.
24 * Santa Clara, CA 97052
25 *
26 * This provides access methods for PCI registers that misbehave on
27 * the CE4100. Each register can be assigned a private init, read and
28 * write routine. The exception to this is the bridge device. The
29 * bridge device is the only device on bus zero (0) that requires any
30 * fixup, so it is a special case at the moment.
31 */
32
33#include <linux/kernel.h>
34#include <linux/pci.h>
35#include <linux/init.h>
36
37#include <asm/pci_x86.h>
38
39struct sim_reg {
40 u32 value;
41 u32 mask;
42};
43
44struct sim_dev_reg {
45 int dev_func;
46 int reg;
47 void (*init)(struct sim_dev_reg *reg);
48 void (*read)(struct sim_dev_reg *reg, u32 *value);
49 void (*write)(struct sim_dev_reg *reg, u32 value);
50 struct sim_reg sim_reg;
51};
52
53struct sim_reg_op {
54 void (*init)(struct sim_dev_reg *reg);
55 void (*read)(struct sim_dev_reg *reg, u32 value);
56 void (*write)(struct sim_dev_reg *reg, u32 value);
57};
58
59#define MB (1024 * 1024)
60#define KB (1024)
61#define SIZE_TO_MASK(size) (~(size - 1))
62
63#define DEFINE_REG(device, func, offset, size, init_op, read_op, write_op)\
64{ PCI_DEVFN(device, func), offset, init_op, read_op, write_op,\
65 {0, SIZE_TO_MASK(size)} },
66
67static void reg_init(struct sim_dev_reg *reg)
68{
69 pci_direct_conf1.read(0, 1, reg->dev_func, reg->reg, 4,
70 &reg->sim_reg.value);
71}
72
73static void reg_read(struct sim_dev_reg *reg, u32 *value)
74{
75 unsigned long flags;
76
77 raw_spin_lock_irqsave(&pci_config_lock, flags);
78 *value = reg->sim_reg.value;
79 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
80}
81
82static void reg_write(struct sim_dev_reg *reg, u32 value)
83{
84 unsigned long flags;
85
86 raw_spin_lock_irqsave(&pci_config_lock, flags);
87 reg->sim_reg.value = (value & reg->sim_reg.mask) |
88 (reg->sim_reg.value & ~reg->sim_reg.mask);
89 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
90}
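Note: SIZE_TO_MASK(size) is the classic BAR mask (~(size - 1)): reg_write() lets writes through only in the masked bits, so BAR sizing behaves as it does on real hardware. A sketch of the probe sequence, assuming a 32-bit memory BAR with a size-aligned base (illustrative only; real code would save and restore the BAR):

	/* sketch: BAR sizing against a simulated register */
	static u32 probe_bar_size(struct sim_dev_reg *reg)
	{
		u32 probe;

		reg_write(reg, 0xFFFFFFFF);	/* sizing write, as PCI does  */
		reg_read(reg, &probe);		/* read-only bits kept intact */

		return ~(probe & ~0xFu) + 1;	/* decode size from the mask  */
	}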
91
92static void sata_reg_init(struct sim_dev_reg *reg)
93{
94 pci_direct_conf1.read(0, 1, PCI_DEVFN(14, 0), 0x10, 4,
95 &reg->sim_reg.value);
96 reg->sim_reg.value += 0x400;
97}
98
99static void ehci_reg_read(struct sim_dev_reg *reg, u32 *value)
100{
101 reg_read(reg, value);
102 if (*value != reg->sim_reg.mask)
103 *value |= 0x100;
104}
105
106void sata_revid_init(struct sim_dev_reg *reg)
107{
108 reg->sim_reg.value = 0x01060100;
109 reg->sim_reg.mask = 0;
110}
111
112static void sata_revid_read(struct sim_dev_reg *reg, u32 *value)
113{
114 reg_read(reg, value);
115}
116
117static struct sim_dev_reg bus1_fixups[] = {
118 DEFINE_REG(2, 0, 0x10, (16*MB), reg_init, reg_read, reg_write)
119 DEFINE_REG(2, 0, 0x14, (256), reg_init, reg_read, reg_write)
120 DEFINE_REG(2, 1, 0x10, (64*KB), reg_init, reg_read, reg_write)
121 DEFINE_REG(3, 0, 0x10, (64*KB), reg_init, reg_read, reg_write)
122 DEFINE_REG(4, 0, 0x10, (128*KB), reg_init, reg_read, reg_write)
123 DEFINE_REG(4, 1, 0x10, (128*KB), reg_init, reg_read, reg_write)
124 DEFINE_REG(6, 0, 0x10, (512*KB), reg_init, reg_read, reg_write)
125 DEFINE_REG(6, 1, 0x10, (512*KB), reg_init, reg_read, reg_write)
126 DEFINE_REG(6, 2, 0x10, (64*KB), reg_init, reg_read, reg_write)
127 DEFINE_REG(8, 0, 0x10, (1*MB), reg_init, reg_read, reg_write)
128 DEFINE_REG(8, 1, 0x10, (64*KB), reg_init, reg_read, reg_write)
129 DEFINE_REG(8, 2, 0x10, (64*KB), reg_init, reg_read, reg_write)
130 DEFINE_REG(9, 0, 0x10 , (1*MB), reg_init, reg_read, reg_write)
131 DEFINE_REG(9, 0, 0x14, (64*KB), reg_init, reg_read, reg_write)
132 DEFINE_REG(10, 0, 0x10, (256), reg_init, reg_read, reg_write)
133 DEFINE_REG(10, 0, 0x14, (256*MB), reg_init, reg_read, reg_write)
134 DEFINE_REG(11, 0, 0x10, (256), reg_init, reg_read, reg_write)
135 DEFINE_REG(11, 0, 0x14, (256), reg_init, reg_read, reg_write)
136 DEFINE_REG(11, 1, 0x10, (256), reg_init, reg_read, reg_write)
137 DEFINE_REG(11, 2, 0x10, (256), reg_init, reg_read, reg_write)
138 DEFINE_REG(11, 2, 0x14, (256), reg_init, reg_read, reg_write)
139 DEFINE_REG(11, 2, 0x18, (256), reg_init, reg_read, reg_write)
140 DEFINE_REG(11, 3, 0x10, (256), reg_init, reg_read, reg_write)
141 DEFINE_REG(11, 3, 0x14, (256), reg_init, reg_read, reg_write)
142 DEFINE_REG(11, 4, 0x10, (256), reg_init, reg_read, reg_write)
143 DEFINE_REG(11, 5, 0x10, (64*KB), reg_init, reg_read, reg_write)
144 DEFINE_REG(11, 6, 0x10, (256), reg_init, reg_read, reg_write)
145 DEFINE_REG(11, 7, 0x10, (64*KB), reg_init, reg_read, reg_write)
146 DEFINE_REG(12, 0, 0x10, (128*KB), reg_init, reg_read, reg_write)
147 DEFINE_REG(12, 0, 0x14, (256), reg_init, reg_read, reg_write)
148 DEFINE_REG(12, 1, 0x10, (1024), reg_init, reg_read, reg_write)
149 DEFINE_REG(13, 0, 0x10, (32*KB), reg_init, ehci_reg_read, reg_write)
150 DEFINE_REG(13, 1, 0x10, (32*KB), reg_init, ehci_reg_read, reg_write)
151 DEFINE_REG(14, 0, 0x8, 0, sata_revid_init, sata_revid_read, 0)
152 DEFINE_REG(14, 0, 0x10, 0, reg_init, reg_read, reg_write)
153 DEFINE_REG(14, 0, 0x14, 0, reg_init, reg_read, reg_write)
154 DEFINE_REG(14, 0, 0x18, 0, reg_init, reg_read, reg_write)
155 DEFINE_REG(14, 0, 0x1C, 0, reg_init, reg_read, reg_write)
156 DEFINE_REG(14, 0, 0x20, 0, reg_init, reg_read, reg_write)
157 DEFINE_REG(14, 0, 0x24, (0x200), sata_reg_init, reg_read, reg_write)
158 DEFINE_REG(15, 0, 0x10, (64*KB), reg_init, reg_read, reg_write)
159 DEFINE_REG(15, 0, 0x14, (64*KB), reg_init, reg_read, reg_write)
160 DEFINE_REG(16, 0, 0x10, (64*KB), reg_init, reg_read, reg_write)
161 DEFINE_REG(16, 0, 0x14, (64*MB), reg_init, reg_read, reg_write)
162 DEFINE_REG(16, 0, 0x18, (64*MB), reg_init, reg_read, reg_write)
163 DEFINE_REG(17, 0, 0x10, (128*KB), reg_init, reg_read, reg_write)
164 DEFINE_REG(18, 0, 0x10, (1*KB), reg_init, reg_read, reg_write)
165};
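
DEFINE_REG itself is defined earlier in ce4100.c, outside this hunk. A plausible expansion, inferred from how the sim_dev_reg fields (.dev_func, .reg, .init, .read, .write, .sim_reg) are used in this file; the SIZE_TO_MASK derivation of the writable-bit mask from the BAR size is an assumption:

/* Sketch only; the real macro lives earlier in this file. */
#define SIZE_TO_MASK(size) (~((size) - 1))

#define DEFINE_REG(dev, func, offset, size, init_op, read_op, write_op) \
{ \
	.dev_func = PCI_DEVFN(dev, func), \
	.reg      = (offset), \
	.init     = (init_op), \
	.read     = (read_op), \
	.write    = (write_op), \
	.sim_reg  = { .value = 0, .mask = SIZE_TO_MASK(size) }, \
},

Note the trailing comma in the sketch: it is what lets the table above list entries without separating commas.
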
166
167static void __init init_sim_regs(void)
168{
169 int i;
170
171 for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
172 if (bus1_fixups[i].init)
173 bus1_fixups[i].init(&bus1_fixups[i]);
174 }
175}
176
177static inline void extract_bytes(u32 *value, int reg, int len)
178{
179 uint32_t mask;
180
181 *value >>= ((reg & 3) * 8);
182 mask = 0xFFFFFFFF >> ((4 - len) * 8);
183 *value &= mask;
184}
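
extract_bytes() narrows a dword-aligned config read down to the bytes the caller asked for. For instance, a 1-byte read at offset 0x11, where the aligned read at 0x10 returned 0xAABBCCDD (illustrative values only):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t value = 0xAABBCCDD;	/* dword read at (0x11 & ~3) == 0x10 */
	int reg = 0x11, len = 1;

	value >>= (reg & 3) * 8;		/* 0x00AABBCC */
	value &= 0xFFFFFFFF >> ((4 - len) * 8);	/* keep one byte: 0xCC */

	printf("0x%02X\n", (unsigned)value);	/* prints 0xCC */
	return 0;
}
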
185
186int bridge_read(unsigned int devfn, int reg, int len, u32 *value)
187{
188 u32 av_bridge_base, av_bridge_limit;
189 int retval = 0;
190
191 switch (reg) {
192 /* Make BARs appear to not request any memory. */
193 case PCI_BASE_ADDRESS_0:
194 case PCI_BASE_ADDRESS_0 + 1:
195 case PCI_BASE_ADDRESS_0 + 2:
196 case PCI_BASE_ADDRESS_0 + 3:
197 *value = 0;
198 break;
199
200 /* The subordinate bus number register is hardwired to zero
201 * and read-only, so simulate it instead.
202 */
203 case PCI_PRIMARY_BUS:
204 if (len == 4)
205 *value = 0x00010100;
206 break;
207
208 case PCI_SUBORDINATE_BUS:
209 *value = 1;
210 break;
211
212 case PCI_MEMORY_BASE:
213 case PCI_MEMORY_LIMIT:
214 /* Get the A/V bridge base address. */
215 pci_direct_conf1.read(0, 0, devfn,
216 PCI_BASE_ADDRESS_0, 4, &av_bridge_base);
217
218 av_bridge_limit = av_bridge_base + (512*MB - 1);
219 av_bridge_limit >>= 16;
220 av_bridge_limit &= 0xFFF0;
221
222 av_bridge_base >>= 16;
223 av_bridge_base &= 0xFFF0;
224
225 if (reg == PCI_MEMORY_LIMIT)
226 *value = av_bridge_limit;
227 else if (len == 2)
228 *value = av_bridge_base;
229 else
230 *value = (av_bridge_limit << 16) | av_bridge_base;
231 break;
232 /* Make the prefetchable memory limit smaller than the base,
233 * so no prefetchable memory space is claimed.
234 */
235 case PCI_PREF_MEMORY_BASE:
236 *value = 0xFFF0;
237 break;
238 case PCI_PREF_MEMORY_LIMIT:
239 *value = 0x0;
240 break;
241 /* Make the IO limit smaller than the IO base, so no IO space is claimed. */
242 case PCI_IO_BASE:
243 *value = 0xF0;
244 break;
245 case PCI_IO_LIMIT:
246 *value = 0;
247 break;
248 default:
249 retval = 1;
250 }
251 return retval;
252}
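
A worked trace of the PCI_MEMORY_BASE/PCI_MEMORY_LIMIT fabrication above, assuming a hypothetical A/V bridge BAR of 0xA0000000:

#include <stdio.h>

#define MB (1024*1024)

int main(void)
{
	/* Hypothetical A/V bridge BAR, for illustration only. */
	unsigned int base = 0xA0000000, limit;

	limit = base + (512*MB - 1);	/* 0xBFFFFFFF */
	limit = (limit >> 16) & 0xFFF0;	/* 0xBFF0 */
	base  = (base  >> 16) & 0xFFF0;	/* 0xA000 */

	/* A 4-byte read of PCI_MEMORY_BASE returns limit in the high
	 * word and base in the low word: a 512MB window at 0xA0000000. */
	printf("0x%08X\n", (limit << 16) | base);	/* 0xBFF0A000 */
	return 0;
}
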
253
254static int ce4100_conf_read(unsigned int seg, unsigned int bus,
255 unsigned int devfn, int reg, int len, u32 *value)
256{
257 int i, retval = 1;
258
259 if (bus == 1) {
260 for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
261 if (bus1_fixups[i].dev_func == devfn &&
262 bus1_fixups[i].reg == (reg & ~3) &&
263 bus1_fixups[i].read) {
264 bus1_fixups[i].read(&(bus1_fixups[i]),
265 value);
266 extract_bytes(value, reg, len);
267 return 0;
268 }
269 }
270 }
271
272 if (bus == 0 && (PCI_DEVFN(1, 0) == devfn) &&
273 !bridge_read(devfn, reg, len, value))
274 return 0;
275
276 return pci_direct_conf1.read(seg, bus, devfn, reg, len, value);
277}
278
279static int ce4100_conf_write(unsigned int seg, unsigned int bus,
280 unsigned int devfn, int reg, int len, u32 value)
281{
282 int i;
283
284 if (bus == 1) {
285 for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
286 if (bus1_fixups[i].dev_func == devfn &&
287 bus1_fixups[i].reg == (reg & ~3) &&
288 bus1_fixups[i].write) {
289 bus1_fixups[i].write(&(bus1_fixups[i]),
290 value);
291 return 0;
292 }
293 }
294 }
295
296 /* Discard writes to A/V bridge BAR. */
297 if (bus == 0 && PCI_DEVFN(1, 0) == devfn &&
298 ((reg & ~3) == PCI_BASE_ADDRESS_0))
299 return 0;
300
301 return pci_direct_conf1.write(seg, bus, devfn, reg, len, value);
302}
303
304struct pci_raw_ops ce4100_pci_conf = {
305 .read = ce4100_conf_read,
306 .write = ce4100_conf_write,
307};
308
309static int __init ce4100_pci_init(void)
310{
311 init_sim_regs();
312 raw_pci_ops = &ce4100_pci_conf;
313 return 0;
314}
315subsys_initcall(ce4100_pci_init);
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 2492d165096a..a5f7d0d63de0 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -9,6 +9,7 @@
9#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#include <asm/pci_x86.h> 10#include <asm/pci_x86.h>
11#include <asm/pci-functions.h> 11#include <asm/pci-functions.h>
12#include <asm/cacheflush.h>
12 13
13/* BIOS32 signature: "_32_" */ 14/* BIOS32 signature: "_32_" */
14#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24)) 15#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24))
@@ -25,6 +26,27 @@
25#define PCIBIOS_HW_TYPE1_SPEC 0x10 26#define PCIBIOS_HW_TYPE1_SPEC 0x10
26#define PCIBIOS_HW_TYPE2_SPEC 0x20 27#define PCIBIOS_HW_TYPE2_SPEC 0x20
27 28
29int pcibios_enabled;
30
31/* According to the BIOS specification at:
32 * http://members.datafast.net.au/dft0802/specs/bios21.pdf, we could
33 * restrict the x zone to some pages and make it ro. But this may be
34 * broken on some BIOSes, and is complex to handle with
35 * static_protections. We could make the 0xe0000-0x100000 range rox,
36 * but this can break some ISA mappings.
37 *
38 * So we leave an rw and x hole when pcibios is used. This shouldn't
39 * happen on a modern system with mmconfig, and if you don't want it
40 * you can disable pcibios...
41 */
42static inline void set_bios_x(void)
43{
44 pcibios_enabled = 1;
45 set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT);
46 if (__supported_pte_mask & _PAGE_NX)
47 printk(KERN_INFO "PCI: PCI BIOS area is rw and x. Use pci=nobios if you want it NX.\n");
48}
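
For scale: with the conventional BIOS_BEGIN/BIOS_END of 0xa0000/0x100000 (assumed here; the constants come from the asm headers), the set_memory_x() call above marks 96 pages executable:

#include <stdio.h>

#define BIOS_BEGIN 0x000a0000	/* assumed values, from asm headers */
#define BIOS_END   0x00100000
#define PAGE_SHIFT 12

int main(void)
{
	printf("%ld pages\n", (long)((BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT));
	return 0;	/* prints: 96 pages */
}
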
49
28/* 50/*
29 * This is the standard structure used to identify the entry point 51 * This is the standard structure used to identify the entry point
30 * to the BIOS32 Service Directory, as documented in 52 * to the BIOS32 Service Directory, as documented in
@@ -332,6 +354,7 @@ static struct pci_raw_ops * __devinit pci_find_bios(void)
332 DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n", 354 DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n",
333 bios32_entry); 355 bios32_entry);
334 bios32_indirect.address = bios32_entry + PAGE_OFFSET; 356 bios32_indirect.address = bios32_entry + PAGE_OFFSET;
357 set_bios_x();
335 if (check_pcibios()) 358 if (check_pcibios())
336 return &pci_bios_access; 359 return &pci_bios_access;
337 } 360 }
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
index 7bf70b812fa2..021eee91c056 100644
--- a/arch/x86/platform/Makefile
+++ b/arch/x86/platform/Makefile
@@ -1,5 +1,7 @@
1# Platform specific code goes here 1# Platform specific code goes here
2obj-y += ce4100/
2obj-y += efi/ 3obj-y += efi/
4obj-y += iris/
3obj-y += mrst/ 5obj-y += mrst/
4obj-y += olpc/ 6obj-y += olpc/
5obj-y += scx200/ 7obj-y += scx200/
diff --git a/arch/x86/platform/ce4100/Makefile b/arch/x86/platform/ce4100/Makefile
new file mode 100644
index 000000000000..91fc92971d94
--- /dev/null
+++ b/arch/x86/platform/ce4100/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_X86_INTEL_CE) += ce4100.o
diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c
new file mode 100644
index 000000000000..d2c0d51a7178
--- /dev/null
+++ b/arch/x86/platform/ce4100/ce4100.c
@@ -0,0 +1,132 @@
1/*
2 * Intel CE4100 platform specific setup code
3 *
4 * (C) Copyright 2010 Intel Corporation
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; version 2
9 * of the License.
10 */
11#include <linux/init.h>
12#include <linux/kernel.h>
13#include <linux/irq.h>
14#include <linux/module.h>
15#include <linux/serial_reg.h>
16#include <linux/serial_8250.h>
17
18#include <asm/setup.h>
19#include <asm/io.h>
20
21static int ce4100_i8042_detect(void)
22{
23 return 0;
24}
25
26static void __init sdv_find_smp_config(void)
27{
28}
29
30#ifdef CONFIG_SERIAL_8250
31
32
33static unsigned int mem_serial_in(struct uart_port *p, int offset)
34{
35 offset = offset << p->regshift;
36 return readl(p->membase + offset);
37}
38
39/*
40 * Under some conditions the UART Tx interrupt is not raised, and
41 * serial transmission then hangs. This is a silicon issue that has
42 * not been root caused. The workaround checks the UART_LSR_THRE and
43 * UART_LSR_TEMT bits of the LSR register in the interrupt handler to
44 * see whether at least one of them is set; if so, the transmit
45 * request is processed anyway. Without the workaround, transmission
46 * may hang. This workaround is for errata number 9 of the B step.
47 */
48
49static unsigned int ce4100_mem_serial_in(struct uart_port *p, int offset)
50{
51 unsigned int ret, ier, lsr;
52
53 if (offset == UART_IIR) {
54 offset = offset << p->regshift;
55 ret = readl(p->membase + offset);
56 if (ret & UART_IIR_NO_INT) {
57 /* see if the TX interrupt should really have been set */
58 ier = mem_serial_in(p, UART_IER);
59 /* see if the UART's XMIT interrupt is enabled */
60 if (ier & UART_IER_THRI) {
61 lsr = mem_serial_in(p, UART_LSR);
62 /* now check to see if the UART should be
63 * generating an interrupt (but isn't) */
64 if (lsr & (UART_LSR_THRE | UART_LSR_TEMT))
65 ret &= ~UART_IIR_NO_INT;
66 }
67 }
68 } else
69 ret = mem_serial_in(p, offset);
70 return ret;
71}
72
73static void ce4100_mem_serial_out(struct uart_port *p, int offset, int value)
74{
75 offset = offset << p->regshift;
76 writel(value, p->membase + offset);
77}
78
79static void ce4100_serial_fixup(int port, struct uart_port *up,
80 unsigned short *capabilites)
81{
82#ifdef CONFIG_EARLY_PRINTK
83 /*
84 * Override the legacy port configuration that comes from
85 * asm/serial.h. Using the ioport driver and then switching to
86 * the PCI memory-mapped driver hangs the IOAPIC.
87 */
88 if (up->iotype != UPIO_MEM32) {
89 up->uartclk = 14745600;
90 up->mapbase = 0xdffe0200;
91 set_fixmap_nocache(FIX_EARLYCON_MEM_BASE,
92 up->mapbase & PAGE_MASK);
93 up->membase =
94 (void __iomem *)__fix_to_virt(FIX_EARLYCON_MEM_BASE);
95 up->membase += up->mapbase & ~PAGE_MASK;
96 up->iotype = UPIO_MEM32;
97 up->regshift = 2;
98 }
99#endif
100 up->iobase = 0;
101 up->serial_in = ce4100_mem_serial_in;
102 up->serial_out = ce4100_mem_serial_out;
103
104 *capabilites |= (1 << 12);
105}
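
The fixmap arithmetic in the early-printk branch splits the UART's MMIO address into a page-aligned part, mapped through the fixmap slot, and an in-page offset that carries over into the virtual address. A standalone sketch using the mapbase above; the fixmap virtual address is a made-up stand-in:

#include <stdio.h>

#define PAGE_MASK 0xFFFFF000UL	/* 4KB pages, as on x86 */

int main(void)
{
	unsigned long mapbase  = 0xdffe0200;	/* physical UART base     */
	unsigned long fix_virt = 0xffc00000;	/* hypothetical fixmap VA */

	/* Page 0xdffe0000 is mapped at fix_virt; the low bits carry over: */
	unsigned long membase = fix_virt + (mapbase & ~PAGE_MASK);

	printf("0x%lx\n", membase);	/* prints 0xffc00200 */
	return 0;
}
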
106
107static __init void sdv_serial_fixup(void)
108{
109 serial8250_set_isa_configurator(ce4100_serial_fixup);
110}
111
112#else
113static inline void sdv_serial_fixup(void) { }
114#endif
115
116static void __init sdv_arch_setup(void)
117{
118 sdv_serial_fixup();
119}
120
121/*
122 * CE4100 specific x86_init function overrides and early setup
123 * calls.
124 */
125void __init x86_ce4100_early_setup(void)
126{
127 x86_init.oem.arch_setup = sdv_arch_setup;
128 x86_platform.i8042_detect = ce4100_i8042_detect;
129 x86_init.resources.probe_roms = x86_init_noop;
130 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
131 x86_init.mpparse.find_smp_config = sdv_find_smp_config;
132}
diff --git a/arch/x86/platform/iris/Makefile b/arch/x86/platform/iris/Makefile
new file mode 100644
index 000000000000..db921983a102
--- /dev/null
+++ b/arch/x86/platform/iris/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_X86_32_IRIS) += iris.o
diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c
new file mode 100644
index 000000000000..1ba7f5ed8c9b
--- /dev/null
+++ b/arch/x86/platform/iris/iris.c
@@ -0,0 +1,91 @@
1/*
2 * Eurobraille/Iris power off support.
3 *
4 * Eurobraille's Iris machine is a PC with no APM or ACPI support.
5 * It is shutdown by a special I/O sequence which this module provides.
6 *
7 * Copyright (C) Shérab <Sebastien.Hinderer@ens-lyon.org>
8 *
9 * This program is free software ; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation ; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY ; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with the program ; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 */
23
24#include <linux/moduleparam.h>
25#include <linux/module.h>
26#include <linux/kernel.h>
27#include <linux/errno.h>
28#include <linux/delay.h>
29#include <linux/init.h>
30#include <linux/pm.h>
31#include <asm/io.h>
32
33#define IRIS_GIO_BASE 0x340
34#define IRIS_GIO_INPUT IRIS_GIO_BASE
35#define IRIS_GIO_OUTPUT (IRIS_GIO_BASE + 1)
36#define IRIS_GIO_PULSE 0x80 /* First byte to send */
37#define IRIS_GIO_REST 0x00 /* Second byte to send */
38#define IRIS_GIO_NODEV 0xff /* Likely not an Iris */
39
40MODULE_LICENSE("GPL");
41MODULE_AUTHOR("Sébastien Hinderer <Sebastien.Hinderer@ens-lyon.org>");
42MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille");
43MODULE_SUPPORTED_DEVICE("Eurobraille/Iris");
44
45static int force;
46
47module_param(force, bool, 0);
48MODULE_PARM_DESC(force, "Set to one to force poweroff handler installation.");
49
50static void (*old_pm_power_off)(void);
51
52static void iris_power_off(void)
53{
54 outb(IRIS_GIO_PULSE, IRIS_GIO_OUTPUT);
55 msleep(850);
56 outb(IRIS_GIO_REST, IRIS_GIO_OUTPUT);
57}
58
59/*
60 * Before installing the power_off handler, try to make sure the OS is
61 * running on an Iris. Since Iris does not support DMI, this is done
62 * by reading its input port and seeing whether the read value is
63 * meaningful.
64 */
65static int iris_init(void)
66{
67 unsigned char status;
68 if (force != 1) {
69 printk(KERN_ERR "The force parameter has not been set to 1 so the Iris poweroff handler will not be installed.\n");
70 return -ENODEV;
71 }
72 status = inb(IRIS_GIO_INPUT);
73 if (status == IRIS_GIO_NODEV) {
74 printk(KERN_ERR "This machine does not seem to be an Iris. Power_off handler not installed.\n");
75 return -ENODEV;
76 }
77 old_pm_power_off = pm_power_off;
78 pm_power_off = &iris_power_off;
79 printk(KERN_INFO "Iris power_off handler installed.\n");
80
81 return 0;
82}
83
84static void iris_exit(void)
85{
86 pm_power_off = old_pm_power_off;
87 printk(KERN_INFO "Iris power_off handler uninstalled.\n");
88}
89
90module_init(iris_init);
91module_exit(iris_exit);
diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile
index efbbc552fa95..f61ccdd49341 100644
--- a/arch/x86/platform/mrst/Makefile
+++ b/arch/x86/platform/mrst/Makefile
@@ -1 +1,3 @@
1obj-$(CONFIG_X86_MRST) += mrst.o 1obj-$(CONFIG_X86_MRST) += mrst.o
2obj-$(CONFIG_X86_MRST) += vrtc.o
3obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o
diff --git a/arch/x86/kernel/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c
index 65df603622b2..65df603622b2 100644
--- a/arch/x86/kernel/early_printk_mrst.c
+++ b/arch/x86/platform/mrst/early_printk_mrst.c
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index 79ae68154e87..fee0b4914e07 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -9,9 +9,19 @@
9 * as published by the Free Software Foundation; version 2 9 * as published by the Free Software Foundation; version 2
10 * of the License. 10 * of the License.
11 */ 11 */
12
13#define pr_fmt(fmt) "mrst: " fmt
14
12#include <linux/init.h> 15#include <linux/init.h>
13#include <linux/kernel.h> 16#include <linux/kernel.h>
14#include <linux/sfi.h> 17#include <linux/sfi.h>
18#include <linux/intel_pmic_gpio.h>
19#include <linux/spi/spi.h>
20#include <linux/i2c.h>
21#include <linux/i2c/pca953x.h>
22#include <linux/gpio_keys.h>
23#include <linux/input.h>
24#include <linux/platform_device.h>
15#include <linux/irq.h> 25#include <linux/irq.h>
16#include <linux/module.h> 26#include <linux/module.h>
17 27
@@ -23,7 +33,9 @@
23#include <asm/mrst.h> 33#include <asm/mrst.h>
24#include <asm/io.h> 34#include <asm/io.h>
25#include <asm/i8259.h> 35#include <asm/i8259.h>
36#include <asm/intel_scu_ipc.h>
26#include <asm/apb_timer.h> 37#include <asm/apb_timer.h>
38#include <asm/reboot.h>
27 39
28/* 40/*
29 * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, 41 * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock,
@@ -102,10 +114,10 @@ static int __init sfi_parse_mtmr(struct sfi_table_header *table)
102 memcpy(sfi_mtimer_array, pentry, totallen); 114 memcpy(sfi_mtimer_array, pentry, totallen);
103 } 115 }
104 116
105 printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num); 117 pr_debug("SFI MTIMER info (num = %d):\n", sfi_mtimer_num);
106 pentry = sfi_mtimer_array; 118 pentry = sfi_mtimer_array;
107 for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) { 119 for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) {
108 printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz," 120 pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz,"
109 " irq = %d\n", totallen, (u32)pentry->phys_addr, 121 " irq = %d\n", totallen, (u32)pentry->phys_addr,
110 pentry->freq_hz, pentry->irq); 122 pentry->freq_hz, pentry->irq);
111 if (!pentry->irq) 123 if (!pentry->irq)
@@ -176,14 +188,14 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
176 memcpy(sfi_mrtc_array, pentry, totallen); 188 memcpy(sfi_mrtc_array, pentry, totallen);
177 } 189 }
178 190
179 printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num); 191 pr_debug("SFI RTC info (num = %d):\n", sfi_mrtc_num);
180 pentry = sfi_mrtc_array; 192 pentry = sfi_mrtc_array;
181 for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) { 193 for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
182 printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n", 194 pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n",
183 totallen, (u32)pentry->phys_addr, pentry->irq); 195 totallen, (u32)pentry->phys_addr, pentry->irq);
184 mp_irq.type = MP_IOAPIC; 196 mp_irq.type = MP_IOAPIC;
185 mp_irq.irqtype = mp_INT; 197 mp_irq.irqtype = mp_INT;
186 mp_irq.irqflag = 0; 198 mp_irq.irqflag = 0xf; /* level trigger and active low */
187 mp_irq.srcbus = 0; 199 mp_irq.srcbus = 0;
188 mp_irq.srcbusirq = pentry->irq; /* IRQ */ 200 mp_irq.srcbusirq = pentry->irq; /* IRQ */
189 mp_irq.dstapic = MP_APIC_ALL; 201 mp_irq.dstapic = MP_APIC_ALL;
@@ -209,6 +221,7 @@ static unsigned long __init mrst_calibrate_tsc(void)
209 221
210void __init mrst_time_init(void) 222void __init mrst_time_init(void)
211{ 223{
224 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
212 switch (mrst_timer_options) { 225 switch (mrst_timer_options) {
213 case MRST_TIMER_APBT_ONLY: 226 case MRST_TIMER_APBT_ONLY:
214 break; 227 break;
@@ -224,16 +237,10 @@ void __init mrst_time_init(void)
224 return; 237 return;
225 } 238 }
226 /* we need at least one APB timer */ 239 /* we need at least one APB timer */
227 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
228 pre_init_apic_IRQ0(); 240 pre_init_apic_IRQ0();
229 apbt_time_init(); 241 apbt_time_init();
230} 242}
231 243
232void __init mrst_rtc_init(void)
233{
234 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
235}
236
237void __cpuinit mrst_arch_setup(void) 244void __cpuinit mrst_arch_setup(void)
238{ 245{
239 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) 246 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
@@ -256,6 +263,17 @@ static int mrst_i8042_detect(void)
256 return 0; 263 return 0;
257} 264}
258 265
266/* Reboot and power off are handled by the SCU on a MID device */
267static void mrst_power_off(void)
268{
269 intel_scu_ipc_simple_command(0xf1, 1);
270}
271
272static void mrst_reboot(void)
273{
274 intel_scu_ipc_simple_command(0xf1, 0);
275}
276
259/* 277/*
260 * Moorestown specific x86_init function overrides and early setup 278 * Moorestown specific x86_init function overrides and early setup
261 * calls. 279 * calls.
@@ -281,6 +299,10 @@ void __init x86_mrst_early_setup(void)
281 299
282 legacy_pic = &null_legacy_pic; 300 legacy_pic = &null_legacy_pic;
283 301
302 /* Moorestown specific power_off/restart method */
303 pm_power_off = mrst_power_off;
304 machine_ops.emergency_restart = mrst_reboot;
305
284 /* Avoid searching for BIOS MP tables */ 306 /* Avoid searching for BIOS MP tables */
285 x86_init.mpparse.find_smp_config = x86_init_noop; 307 x86_init.mpparse.find_smp_config = x86_init_noop;
286 x86_init.mpparse.get_smp_config = x86_init_uint_noop; 308 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
@@ -309,3 +331,505 @@ static inline int __init setup_x86_mrst_timer(char *arg)
309 return 0; 331 return 0;
310} 332}
311__setup("x86_mrst_timer=", setup_x86_mrst_timer); 333__setup("x86_mrst_timer=", setup_x86_mrst_timer);
334
335/*
336 * Parse the GPIO table first, since the DEVS table needs it to map
337 * pin names to actual pins.
338 */
339static struct sfi_gpio_table_entry *gpio_table;
340static int gpio_num_entry;
341
342static int __init sfi_parse_gpio(struct sfi_table_header *table)
343{
344 struct sfi_table_simple *sb;
345 struct sfi_gpio_table_entry *pentry;
346 int num, i;
347
348 if (gpio_table)
349 return 0;
350 sb = (struct sfi_table_simple *)table;
351 num = SFI_GET_NUM_ENTRIES(sb, struct sfi_gpio_table_entry);
352 pentry = (struct sfi_gpio_table_entry *)sb->pentry;
353
354 gpio_table = (struct sfi_gpio_table_entry *)
355 kmalloc(num * sizeof(*pentry), GFP_KERNEL);
356 if (!gpio_table)
357 return -1;
358 memcpy(gpio_table, pentry, num * sizeof(*pentry));
359 gpio_num_entry = num;
360
361 pr_debug("GPIO pin info:\n");
362 for (i = 0; i < num; i++, pentry++)
363 pr_debug("info[%2d]: controller = %16.16s, pin_name = %16.16s,"
364 " pin = %d\n", i,
365 pentry->controller_name,
366 pentry->pin_name,
367 pentry->pin_no);
368 return 0;
369}
370
371static int get_gpio_by_name(const char *name)
372{
373 struct sfi_gpio_table_entry *pentry = gpio_table;
374 int i;
375
376 if (!pentry)
377 return -1;
378 for (i = 0; i < gpio_num_entry; i++, pentry++) {
379 if (!strncmp(name, pentry->pin_name, SFI_NAME_LEN))
380 return pentry->pin_no;
381 }
382 return -1;
383}
384
385/*
386 * This array defines the platform data of the devices that the IAFW
387 * exports through the SFI "DEVS" table; we use name and type to match
388 * a device to its platform data.
389 */
390struct devs_id {
391 char name[SFI_NAME_LEN + 1];
392 u8 type;
393 u8 delay;
394 void *(*get_platform_data)(void *info);
395};
396
397/* the offset for the mapping of global gpio pin to irq */
398#define MRST_IRQ_OFFSET 0x100
399
400static void __init *pmic_gpio_platform_data(void *info)
401{
402 static struct intel_pmic_gpio_platform_data pmic_gpio_pdata;
403 int gpio_base = get_gpio_by_name("pmic_gpio_base");
404
405 if (gpio_base == -1)
406 gpio_base = 64;
407 pmic_gpio_pdata.gpio_base = gpio_base;
408 pmic_gpio_pdata.irq_base = gpio_base + MRST_IRQ_OFFSET;
409 pmic_gpio_pdata.gpiointr = 0xffffeff8;
410
411 return &pmic_gpio_pdata;
412}
413
414static void __init *max3111_platform_data(void *info)
415{
416 struct spi_board_info *spi_info = info;
417 int intr = get_gpio_by_name("max3111_int");
418
419 if (intr == -1)
420 return NULL;
421 spi_info->irq = intr + MRST_IRQ_OFFSET;
422 return NULL;
423}
424
425/* we have multiple max7315s on the board ... */
426#define MAX7315_NUM 2
427static void __init *max7315_platform_data(void *info)
428{
429 static struct pca953x_platform_data max7315_pdata[MAX7315_NUM];
430 static int nr;
431 struct pca953x_platform_data *max7315 = &max7315_pdata[nr];
432 struct i2c_board_info *i2c_info = info;
433 int gpio_base, intr;
434 char base_pin_name[SFI_NAME_LEN + 1];
435 char intr_pin_name[SFI_NAME_LEN + 1];
436
437 if (nr == MAX7315_NUM) {
438 pr_err("too many max7315s, we only support %d\n",
439 MAX7315_NUM);
440 return NULL;
441 }
442 /* we have several max7315s on the board; we only need to load
443 * several instances of the same pca953x driver to cover them
444 */
445 strcpy(i2c_info->type, "max7315");
446 if (nr++) {
447 sprintf(base_pin_name, "max7315_%d_base", nr);
448 sprintf(intr_pin_name, "max7315_%d_int", nr);
449 } else {
450 strcpy(base_pin_name, "max7315_base");
451 strcpy(intr_pin_name, "max7315_int");
452 }
453
454 gpio_base = get_gpio_by_name(base_pin_name);
455 intr = get_gpio_by_name(intr_pin_name);
456
457 if (gpio_base == -1)
458 return NULL;
459 max7315->gpio_base = gpio_base;
460 if (intr != -1) {
461 i2c_info->irq = intr + MRST_IRQ_OFFSET;
462 max7315->irq_base = gpio_base + MRST_IRQ_OFFSET;
463 } else {
464 i2c_info->irq = -1;
465 max7315->irq_base = -1;
466 }
467 return max7315;
468}
469
470static void __init *emc1403_platform_data(void *info)
471{
472 static short intr2nd_pdata;
473 struct i2c_board_info *i2c_info = info;
474 int intr = get_gpio_by_name("thermal_int");
475 int intr2nd = get_gpio_by_name("thermal_alert");
476
477 if (intr == -1 || intr2nd == -1)
478 return NULL;
479
480 i2c_info->irq = intr + MRST_IRQ_OFFSET;
481 intr2nd_pdata = intr2nd + MRST_IRQ_OFFSET;
482
483 return &intr2nd_pdata;
484}
485
486static void __init *lis331dl_platform_data(void *info)
487{
488 static short intr2nd_pdata;
489 struct i2c_board_info *i2c_info = info;
490 int intr = get_gpio_by_name("accel_int");
491 int intr2nd = get_gpio_by_name("accel_2");
492
493 if (intr == -1 || intr2nd == -1)
494 return NULL;
495
496 i2c_info->irq = intr + MRST_IRQ_OFFSET;
497 intr2nd_pdata = intr2nd + MRST_IRQ_OFFSET;
498
499 return &intr2nd_pdata;
500}
501
502static void __init *no_platform_data(void *info)
503{
504 return NULL;
505}
506
507static const struct devs_id __initconst device_ids[] = {
508 {"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data},
509 {"spi_max3111", SFI_DEV_TYPE_SPI, 0, &max3111_platform_data},
510 {"i2c_max7315", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data},
511 {"i2c_max7315_2", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data},
512 {"emc1403", SFI_DEV_TYPE_I2C, 1, &emc1403_platform_data},
513 {"i2c_accel", SFI_DEV_TYPE_I2C, 0, &lis331dl_platform_data},
514 {"pmic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data},
515 {"msic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data},
516 {},
517};
518
519#define MAX_IPCDEVS 24
520static struct platform_device *ipc_devs[MAX_IPCDEVS];
521static int ipc_next_dev;
522
523#define MAX_SCU_SPI 24
524static struct spi_board_info *spi_devs[MAX_SCU_SPI];
525static int spi_next_dev;
526
527#define MAX_SCU_I2C 24
528static struct i2c_board_info *i2c_devs[MAX_SCU_I2C];
529static int i2c_bus[MAX_SCU_I2C];
530static int i2c_next_dev;
531
532static void __init intel_scu_device_register(struct platform_device *pdev)
533{
534 if (ipc_next_dev == MAX_IPCDEVS)
535 pr_err("too many SCU IPC devices\n");
536 else
537 ipc_devs[ipc_next_dev++] = pdev;
538}
539
540static void __init intel_scu_spi_device_register(struct spi_board_info *sdev)
541{
542 struct spi_board_info *new_dev;
543
544 if (spi_next_dev == MAX_SCU_SPI) {
545 pr_err("too many SCU SPI devices\n");
546 return;
547 }
548
549 new_dev = kzalloc(sizeof(*sdev), GFP_KERNEL);
550 if (!new_dev) {
551 pr_err("failed to alloc mem for delayed spi dev %s\n",
552 sdev->modalias);
553 return;
554 }
555 memcpy(new_dev, sdev, sizeof(*sdev));
556
557 spi_devs[spi_next_dev++] = new_dev;
558}
559
560static void __init intel_scu_i2c_device_register(int bus,
561 struct i2c_board_info *idev)
562{
563 struct i2c_board_info *new_dev;
564
565 if (i2c_next_dev == MAX_SCU_I2C) {
566 pr_err("too many SCU I2C devices\n");
567 return;
568 }
569
570 new_dev = kzalloc(sizeof(*idev), GFP_KERNEL);
571 if (!new_dev) {
572 pr_err("failed to alloc mem for delayed i2c dev %s\n",
573 idev->type);
574 return;
575 }
576 memcpy(new_dev, idev, sizeof(*idev));
577
578 i2c_bus[i2c_next_dev] = bus;
579 i2c_devs[i2c_next_dev++] = new_dev;
580}
581
582/* Called by IPC driver */
583void intel_scu_devices_create(void)
584{
585 int i;
586
587 for (i = 0; i < ipc_next_dev; i++)
588 platform_device_add(ipc_devs[i]);
589
590 for (i = 0; i < spi_next_dev; i++)
591 spi_register_board_info(spi_devs[i], 1);
592
593 for (i = 0; i < i2c_next_dev; i++) {
594 struct i2c_adapter *adapter;
595 struct i2c_client *client;
596
597 adapter = i2c_get_adapter(i2c_bus[i]);
598 if (adapter) {
599 client = i2c_new_device(adapter, i2c_devs[i]);
600 if (!client)
601 pr_err("can't create i2c device %s\n",
602 i2c_devs[i]->type);
603 } else
604 i2c_register_board_info(i2c_bus[i], i2c_devs[i], 1);
605 }
606}
607EXPORT_SYMBOL_GPL(intel_scu_devices_create);
608
609/* Called by IPC driver */
610void intel_scu_devices_destroy(void)
611{
612 int i;
613
614 for (i = 0; i < ipc_next_dev; i++)
615 platform_device_del(ipc_devs[i]);
616}
617EXPORT_SYMBOL_GPL(intel_scu_devices_destroy);
618
619static void __init install_irq_resource(struct platform_device *pdev, int irq)
620{
621 /* Single threaded */
622 static struct resource __initdata res = {
623 .name = "IRQ",
624 .flags = IORESOURCE_IRQ,
625 };
626 res.start = irq;
627 platform_device_add_resources(pdev, &res, 1);
628}
629
630static void __init sfi_handle_ipc_dev(struct platform_device *pdev)
631{
632 const struct devs_id *dev = device_ids;
633 void *pdata = NULL;
634
635 while (dev->name[0]) {
636 if (dev->type == SFI_DEV_TYPE_IPC &&
637 !strncmp(dev->name, pdev->name, SFI_NAME_LEN)) {
638 pdata = dev->get_platform_data(pdev);
639 break;
640 }
641 dev++;
642 }
643 pdev->dev.platform_data = pdata;
644 intel_scu_device_register(pdev);
645}
646
647static void __init sfi_handle_spi_dev(struct spi_board_info *spi_info)
648{
649 const struct devs_id *dev = device_ids;
650 void *pdata = NULL;
651
652 while (dev->name[0]) {
653 if (dev->type == SFI_DEV_TYPE_SPI &&
654 !strncmp(dev->name, spi_info->modalias, SFI_NAME_LEN)) {
655 pdata = dev->get_platform_data(spi_info);
656 break;
657 }
658 dev++;
659 }
660 spi_info->platform_data = pdata;
661 if (dev->delay)
662 intel_scu_spi_device_register(spi_info);
663 else
664 spi_register_board_info(spi_info, 1);
665}
666
667static void __init sfi_handle_i2c_dev(int bus, struct i2c_board_info *i2c_info)
668{
669 const struct devs_id *dev = device_ids;
670 void *pdata = NULL;
671
672 while (dev->name[0]) {
673 if (dev->type == SFI_DEV_TYPE_I2C &&
674 !strncmp(dev->name, i2c_info->type, SFI_NAME_LEN)) {
675 pdata = dev->get_platform_data(i2c_info);
676 break;
677 }
678 dev++;
679 }
680 i2c_info->platform_data = pdata;
681
682 if (dev->delay)
683 intel_scu_i2c_device_register(bus, i2c_info);
684 else
685 i2c_register_board_info(bus, i2c_info, 1);
686}
687
688
689static int __init sfi_parse_devs(struct sfi_table_header *table)
690{
691 struct sfi_table_simple *sb;
692 struct sfi_device_table_entry *pentry;
693 struct spi_board_info spi_info;
694 struct i2c_board_info i2c_info;
695 struct platform_device *pdev;
696 int num, i, bus;
697 int ioapic;
698 struct io_apic_irq_attr irq_attr;
699
700 sb = (struct sfi_table_simple *)table;
701 num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry);
702 pentry = (struct sfi_device_table_entry *)sb->pentry;
703
704 for (i = 0; i < num; i++, pentry++) {
705 if (pentry->irq != (u8)0xff) { /* native RTE case */
706 /* these SPI2 devices are not exposed to the system as PCI
707 * devices, but they have separate RTE entries in the IOAPIC,
708 * so we have to enable them one by one here
709 */
710 ioapic = mp_find_ioapic(pentry->irq);
711 irq_attr.ioapic = ioapic;
712 irq_attr.ioapic_pin = pentry->irq;
713 irq_attr.trigger = 1;
714 irq_attr.polarity = 1;
715 io_apic_set_pci_routing(NULL, pentry->irq, &irq_attr);
716 }
717 switch (pentry->type) {
718 case SFI_DEV_TYPE_IPC:
719 /* ID as IRQ is a hack that will go away */
720 pdev = platform_device_alloc(pentry->name, pentry->irq);
721 if (pdev == NULL) {
722 pr_err("out of memory for SFI platform device '%s'.\n",
723 pentry->name);
724 continue;
725 }
726 install_irq_resource(pdev, pentry->irq);
727 pr_debug("info[%2d]: IPC bus, name = %16.16s, "
728 "irq = 0x%2x\n", i, pentry->name, pentry->irq);
729 sfi_handle_ipc_dev(pdev);
730 break;
731 case SFI_DEV_TYPE_SPI:
732 memset(&spi_info, 0, sizeof(spi_info));
733 strncpy(spi_info.modalias, pentry->name, SFI_NAME_LEN);
734 spi_info.irq = pentry->irq;
735 spi_info.bus_num = pentry->host_num;
736 spi_info.chip_select = pentry->addr;
737 spi_info.max_speed_hz = pentry->max_freq;
738 pr_debug("info[%2d]: SPI bus = %d, name = %16.16s, "
739 "irq = 0x%2x, max_freq = %d, cs = %d\n", i,
740 spi_info.bus_num,
741 spi_info.modalias,
742 spi_info.irq,
743 spi_info.max_speed_hz,
744 spi_info.chip_select);
745 sfi_handle_spi_dev(&spi_info);
746 break;
747 case SFI_DEV_TYPE_I2C:
748 memset(&i2c_info, 0, sizeof(i2c_info));
749 bus = pentry->host_num;
750 strncpy(i2c_info.type, pentry->name, SFI_NAME_LEN);
751 i2c_info.irq = pentry->irq;
752 i2c_info.addr = pentry->addr;
753 pr_debug("info[%2d]: I2C bus = %d, name = %16.16s, "
754 "irq = 0x%2x, addr = 0x%x\n", i, bus,
755 i2c_info.type,
756 i2c_info.irq,
757 i2c_info.addr);
758 sfi_handle_i2c_dev(bus, &i2c_info);
759 break;
760 case SFI_DEV_TYPE_UART:
761 case SFI_DEV_TYPE_HSI:
762 default:
763 ;
764 }
765 }
766 return 0;
767}
768
769static int __init mrst_platform_init(void)
770{
771 sfi_table_parse(SFI_SIG_GPIO, NULL, NULL, sfi_parse_gpio);
772 sfi_table_parse(SFI_SIG_DEVS, NULL, NULL, sfi_parse_devs);
773 return 0;
774}
775arch_initcall(mrst_platform_init);
776
777/*
778 * We will look these buttons up in the SFI GPIO table (by name)
779 * and register them dynamically. Please add all possible buttons
780 * here; the list is shrunk to those whose GPIO is actually found.
781 */
782static struct gpio_keys_button gpio_button[] = {
783 {KEY_POWER, -1, 1, "power_btn", EV_KEY, 0, 3000},
784 {KEY_PROG1, -1, 1, "prog_btn1", EV_KEY, 0, 20},
785 {KEY_PROG2, -1, 1, "prog_btn2", EV_KEY, 0, 20},
786 {SW_LID, -1, 1, "lid_switch", EV_SW, 0, 20},
787 {KEY_VOLUMEUP, -1, 1, "vol_up", EV_KEY, 0, 20},
788 {KEY_VOLUMEDOWN, -1, 1, "vol_down", EV_KEY, 0, 20},
789 {KEY_CAMERA, -1, 1, "camera_full", EV_KEY, 0, 20},
790 {KEY_CAMERA_FOCUS, -1, 1, "camera_half", EV_KEY, 0, 20},
791 {SW_KEYPAD_SLIDE, -1, 1, "MagSw1", EV_SW, 0, 20},
792 {SW_KEYPAD_SLIDE, -1, 1, "MagSw2", EV_SW, 0, 20},
793};
794
795static struct gpio_keys_platform_data mrst_gpio_keys = {
796 .buttons = gpio_button,
797 .rep = 1,
798 .nbuttons = -1, /* will fill it after search */
799};
800
801static struct platform_device pb_device = {
802 .name = "gpio-keys",
803 .id = -1,
804 .dev = {
805 .platform_data = &mrst_gpio_keys,
806 },
807};
808
809/*
810 * Drop the buttons whose GPIO was not found and register the
811 * gpio button device if any remain.
812 */
813static int __init pb_keys_init(void)
814{
815 struct gpio_keys_button *gb = gpio_button;
816 int i, num, good = 0;
817
818 num = ARRAY_SIZE(gpio_button);
819 for (i = 0; i < num; i++) {
820 gb[i].gpio = get_gpio_by_name(gb[i].desc);
821 if (gb[i].gpio == -1)
822 continue;
823
824 if (i != good)
825 gb[good] = gb[i];
826 good++;
827 }
828
829 if (good) {
830 mrst_gpio_keys.nbuttons = good;
831 return platform_device_register(&pb_device);
832 }
833 return 0;
834}
835late_initcall(pb_keys_init);
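
pb_keys_init() compacts gpio_button[] in place, keeping only the entries whose GPIO lookup succeeded. The same pattern on hypothetical data:

#include <stdio.h>

int main(void)
{
	/* -1 marks a button whose GPIO lookup failed (hypothetical data). */
	int gpio[] = { 12, -1, 7, -1, 3 };
	int i, good = 0, num = sizeof(gpio) / sizeof(gpio[0]);

	for (i = 0; i < num; i++) {
		if (gpio[i] == -1)
			continue;
		if (i != good)
			gpio[good] = gpio[i];
		good++;
	}

	for (i = 0; i < good; i++)
		printf("%d ", gpio[i]);	/* prints: 12 7 3 */
	printf("\n");
	return 0;
}
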
diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c
new file mode 100644
index 000000000000..32cd7edd71a0
--- /dev/null
+++ b/arch/x86/platform/mrst/vrtc.c
@@ -0,0 +1,165 @@
1/*
2 * vrtc.c: Driver for virtual RTC device on Intel MID platform
3 *
4 * (C) Copyright 2009 Intel Corporation
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; version 2
9 * of the License.
10 *
11 * Note:
12 * VRTC is emulated by system controller firmware, the real HW
13 * RTC is located in the PMIC device. SCU FW shadows PMIC RTC
14 * in a memory mapped IO space that is visible to the host IA
15 * processor.
16 *
17 * This driver is based on RTC CMOS driver.
18 */
19
20#include <linux/kernel.h>
21#include <linux/init.h>
22#include <linux/sfi.h>
23#include <linux/platform_device.h>
24
25#include <asm/mrst.h>
26#include <asm/mrst-vrtc.h>
27#include <asm/time.h>
28#include <asm/fixmap.h>
29
30static unsigned char __iomem *vrtc_virt_base;
31
32unsigned char vrtc_cmos_read(unsigned char reg)
33{
34 unsigned char retval;
35
36 /* vRTC's registers range from 0x0 to 0xD */
37 if (reg > 0xd || !vrtc_virt_base)
38 return 0xff;
39
40 lock_cmos_prefix(reg);
41 retval = __raw_readb(vrtc_virt_base + (reg << 2));
42 lock_cmos_suffix(reg);
43 return retval;
44}
45EXPORT_SYMBOL_GPL(vrtc_cmos_read);
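
The (reg << 2) in vrtc_cmos_read()/vrtc_cmos_write() reflects how the SCU firmware lays out its shadow page: each 8-bit CMOS register occupies its own 32-bit slot, so register N sits at byte offset 4*N. A quick illustration, using the standard MC146818 register indices:

#include <stdio.h>

/* MC146818 register indices, as in the CMOS RTC convention. */
#define RTC_SECONDS 0
#define RTC_MINUTES 2
#define RTC_HOURS   4

int main(void)
{
	/* One 8-bit register per 32-bit slot: byte offset = reg << 2. */
	printf("SECONDS at +%d, MINUTES at +%d, HOURS at +%d\n",
	       RTC_SECONDS << 2, RTC_MINUTES << 2, RTC_HOURS << 2);
	return 0;	/* SECONDS at +0, MINUTES at +8, HOURS at +16 */
}
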
46
47void vrtc_cmos_write(unsigned char val, unsigned char reg)
48{
49 if (reg > 0xd || !vrtc_virt_base)
50 return;
51
52 lock_cmos_prefix(reg);
53 __raw_writeb(val, vrtc_virt_base + (reg << 2));
54 lock_cmos_suffix(reg);
55}
56EXPORT_SYMBOL_GPL(vrtc_cmos_write);
57
58unsigned long vrtc_get_time(void)
59{
60 u8 sec, min, hour, mday, mon;
61 u32 year;
62
63 while ((vrtc_cmos_read(RTC_FREQ_SELECT) & RTC_UIP))
64 cpu_relax();
65
66 sec = vrtc_cmos_read(RTC_SECONDS);
67 min = vrtc_cmos_read(RTC_MINUTES);
68 hour = vrtc_cmos_read(RTC_HOURS);
69 mday = vrtc_cmos_read(RTC_DAY_OF_MONTH);
70 mon = vrtc_cmos_read(RTC_MONTH);
71 year = vrtc_cmos_read(RTC_YEAR);
72
73 /* vRTC YEAR reg contains the offset to 1960 */
74 year += 1960;
75
76 printk(KERN_INFO "vRTC: sec: %d min: %d hour: %d day: %d "
77 "mon: %d year: %d\n", sec, min, hour, mday, mon, year);
78
79 return mktime(year, mon, mday, hour, min, sec);
80}
81
82/* Only care about the minutes and seconds */
83int vrtc_set_mmss(unsigned long nowtime)
84{
85 int real_sec, real_min;
86 int vrtc_min;
87
88 vrtc_min = vrtc_cmos_read(RTC_MINUTES);
89
90 real_sec = nowtime % 60;
91 real_min = nowtime / 60;
92 if (((abs(real_min - vrtc_min) + 15)/30) & 1)
93 real_min += 30;
94 real_min %= 60;
95
96 vrtc_cmos_write(real_sec, RTC_SECONDS);
97 vrtc_cmos_write(real_min, RTC_MINUTES);
98 return 0;
99}
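
The rounding test in vrtc_set_mmss() fires when the wall-clock minute and the vRTC minute differ by 15..44 (mod 60), i.e. when the RTC appears offset by roughly half an hour (as with half-hour time zones), and then shifts the minute to be written by 30. A standalone trace with hypothetical values:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	long nowtime  = 1293840000L + 37 * 60 + 12;	/* hypothetical epoch time */
	int  vrtc_min = 5;			/* minute the vRTC shows */
	int  real_sec = nowtime % 60;		/* 12 */
	long real_min = nowtime / 60;

	/* Fires when the two minute values differ by 15..44 (mod 60): */
	if (((labs(real_min - vrtc_min) + 15) / 30) & 1)
		real_min += 30;
	real_min %= 60;

	printf("write %02ld:%02d\n", real_min, real_sec);	/* write 07:12 */
	return 0;
}
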
100
101void __init mrst_rtc_init(void)
102{
103 unsigned long rtc_paddr;
104 void __iomem *virt_base;
105
106 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
107 if (!sfi_mrtc_num)
108 return;
109
110 rtc_paddr = sfi_mrtc_array[0].phys_addr;
111
112 /* vRTC's register address may not be page aligned */
113 set_fixmap_nocache(FIX_LNW_VRTC, rtc_paddr);
114
115 virt_base = (void __iomem *)__fix_to_virt(FIX_LNW_VRTC);
116 virt_base += rtc_paddr & ~PAGE_MASK;
117 vrtc_virt_base = virt_base;
118
119 x86_platform.get_wallclock = vrtc_get_time;
120 x86_platform.set_wallclock = vrtc_set_mmss;
121}
122
123/*
124 * The Moorestown platform has a memory mapped virtual RTC device that emulates
125 * the programming interface of the RTC.
126 */
127
128static struct resource vrtc_resources[] = {
129 [0] = {
130 .flags = IORESOURCE_MEM,
131 },
132 [1] = {
133 .flags = IORESOURCE_IRQ,
134 }
135};
136
137static struct platform_device vrtc_device = {
138 .name = "rtc_mrst",
139 .id = -1,
140 .resource = vrtc_resources,
141 .num_resources = ARRAY_SIZE(vrtc_resources),
142};
143
144/* Register the RTC device if appropriate */
145static int __init mrst_device_create(void)
146{
147 /* No Moorestown, no device */
148 if (!mrst_identify_cpu())
149 return -ENODEV;
150 /* No RTC, no device */
151 if (!sfi_mrtc_num)
152 return -ENODEV;
153
154 /* iomem resource */
155 vrtc_resources[0].start = sfi_mrtc_array[0].phys_addr;
156 vrtc_resources[0].end = sfi_mrtc_array[0].phys_addr +
157 MRST_VRTC_MAP_SZ;
158 /* irq resource */
159 vrtc_resources[1].start = sfi_mrtc_array[0].irq;
160 vrtc_resources[1].end = sfi_mrtc_array[0].irq;
161
162 return platform_device_register(&vrtc_device);
163}
164
165module_init(mrst_device_create);
diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c
index dd4c281ffe57..ca54875ac795 100644
--- a/arch/x86/platform/sfi/sfi.c
+++ b/arch/x86/platform/sfi/sfi.c
@@ -48,9 +48,9 @@ static void __init mp_sfi_register_lapic_address(unsigned long address)
48/* All CPUs enumerated by SFI must be present and enabled */ 48/* All CPUs enumerated by SFI must be present and enabled */
49static void __cpuinit mp_sfi_register_lapic(u8 id) 49static void __cpuinit mp_sfi_register_lapic(u8 id)
50{ 50{
51 if (MAX_APICS - id <= 0) { 51 if (MAX_LOCAL_APIC - id <= 0) {
52 pr_warning("Processor #%d invalid (max %d)\n", 52 pr_warning("Processor #%d invalid (max %d)\n",
53 id, MAX_APICS); 53 id, MAX_LOCAL_APIC);
54 return; 54 return;
55 } 55 }
56 56
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index ba9caa808a9c..df58e9cad96a 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1341,7 +1341,7 @@ uv_activation_descriptor_init(int node, int pnode)
1341 1341
1342 /* 1342 /*
1343 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) 1343 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
1344 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub 1344 * per cpu; and one per cpu on the uvhub (UV_ADP_SIZE)
1345 */ 1345 */
1346 bau_desc = kmalloc_node(sizeof(struct bau_desc) * UV_ADP_SIZE 1346 bau_desc = kmalloc_node(sizeof(struct bau_desc) * UV_ADP_SIZE
1347 * UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); 1347 * UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
@@ -1490,7 +1490,7 @@ calculate_destination_timeout(void)
1490/* 1490/*
1491 * initialize the bau_control structure for each cpu 1491 * initialize the bau_control structure for each cpu
1492 */ 1492 */
1493static void __init uv_init_per_cpu(int nuvhubs) 1493static int __init uv_init_per_cpu(int nuvhubs)
1494{ 1494{
1495 int i; 1495 int i;
1496 int cpu; 1496 int cpu;
@@ -1507,7 +1507,7 @@ static void __init uv_init_per_cpu(int nuvhubs)
1507 struct bau_control *smaster = NULL; 1507 struct bau_control *smaster = NULL;
1508 struct socket_desc { 1508 struct socket_desc {
1509 short num_cpus; 1509 short num_cpus;
1510 short cpu_number[16]; 1510 short cpu_number[MAX_CPUS_PER_SOCKET];
1511 }; 1511 };
1512 struct uvhub_desc { 1512 struct uvhub_desc {
1513 unsigned short socket_mask; 1513 unsigned short socket_mask;
@@ -1540,6 +1540,10 @@ static void __init uv_init_per_cpu(int nuvhubs)
1540 sdp = &bdp->socket[socket]; 1540 sdp = &bdp->socket[socket];
1541 sdp->cpu_number[sdp->num_cpus] = cpu; 1541 sdp->cpu_number[sdp->num_cpus] = cpu;
1542 sdp->num_cpus++; 1542 sdp->num_cpus++;
1543 if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) {
1544 printk(KERN_EMERG "%d cpus per socket invalid\n", sdp->num_cpus);
1545 return 1;
1546 }
1543 } 1547 }
1544 for (uvhub = 0; uvhub < nuvhubs; uvhub++) { 1548 for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
1545 if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8)))) 1549 if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
@@ -1570,6 +1574,12 @@ static void __init uv_init_per_cpu(int nuvhubs)
1570 bcp->uvhub_master = hmaster; 1574 bcp->uvhub_master = hmaster;
1571 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)-> 1575 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->
1572 blade_processor_id; 1576 blade_processor_id;
1577 if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
1578 printk(KERN_EMERG
1579 "%d cpus per uvhub invalid\n",
1580 bcp->uvhub_cpu);
1581 return 1;
1582 }
1573 } 1583 }
1574nextsocket: 1584nextsocket:
1575 socket++; 1585 socket++;
@@ -1595,6 +1605,7 @@ nextsocket:
1595 bcp->congested_reps = congested_reps; 1605 bcp->congested_reps = congested_reps;
1596 bcp->congested_period = congested_period; 1606 bcp->congested_period = congested_period;
1597 } 1607 }
1608 return 0;
1598} 1609}
1599 1610
1600/* 1611/*
@@ -1625,7 +1636,10 @@ static int __init uv_bau_init(void)
1625 spin_lock_init(&disable_lock); 1636 spin_lock_init(&disable_lock);
1626 congested_cycles = microsec_2_cycles(congested_response_us); 1637 congested_cycles = microsec_2_cycles(congested_response_us);
1627 1638
1628 uv_init_per_cpu(nuvhubs); 1639 if (uv_init_per_cpu(nuvhubs)) {
1640 nobau = 1;
1641 return 0;
1642 }
1629 1643
1630 uv_partition_base_pnode = 0x7fffffff; 1644 uv_partition_base_pnode = 0x7fffffff;
1631 for (uvhub = 0; uvhub < nuvhubs; uvhub++) 1645 for (uvhub = 0; uvhub < nuvhubs; uvhub++)
diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c
index 3371bd053b89..632037671746 100644
--- a/arch/x86/platform/visws/visws_quirks.c
+++ b/arch/x86/platform/visws/visws_quirks.c
@@ -171,7 +171,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
171 ver = m->apicver; 171 ver = m->apicver;
172 if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) { 172 if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) {
173 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", 173 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
174 m->apicid, MAX_APICS); 174 m->apicid, MAX_LOCAL_APIC);
175 return; 175 return;
176 } 176 }
177 177
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 5718566e00f9..d9926afec110 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -275,13 +275,23 @@ acpi_table_parse_srat(enum acpi_srat_type id,
275int __init acpi_numa_init(void) 275int __init acpi_numa_init(void)
276{ 276{
277 int ret = 0; 277 int ret = 0;
278 int nr_cpu_entries = nr_cpu_ids;
279
280#ifdef CONFIG_X86
281 /*
282 * Should not limit number with cpu num that is from NR_CPUS or nr_cpus=
283 * SRAT cpu entries could have different order with that in MADT.
284 * So go over all cpu entries in SRAT to get apicid to node mapping.
285 */
286 nr_cpu_entries = MAX_LOCAL_APIC;
287#endif
278 288
279 /* SRAT: Static Resource Affinity Table */ 289 /* SRAT: Static Resource Affinity Table */
280 if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { 290 if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
281 acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY, 291 acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY,
282 acpi_parse_x2apic_affinity, nr_cpu_ids); 292 acpi_parse_x2apic_affinity, nr_cpu_entries);
283 acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, 293 acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
284 acpi_parse_processor_affinity, nr_cpu_ids); 294 acpi_parse_processor_affinity, nr_cpu_entries);
285 ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, 295 ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
286 acpi_parse_memory_affinity, 296 acpi_parse_memory_affinity,
287 NR_NODE_MEMBLKS); 297 NR_NODE_MEMBLKS);
diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c
index 42396df55556..9252e85706ef 100644
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -38,7 +38,7 @@ static int agp_bridges_found;
38 38
39static void amd64_tlbflush(struct agp_memory *temp) 39static void amd64_tlbflush(struct agp_memory *temp)
40{ 40{
41 k8_flush_garts(); 41 amd_flush_garts();
42} 42}
43 43
44static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type) 44static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type)
@@ -124,7 +124,7 @@ static int amd64_fetch_size(void)
124 u32 temp; 124 u32 temp;
125 struct aper_size_info_32 *values; 125 struct aper_size_info_32 *values;
126 126
127 dev = k8_northbridges.nb_misc[0]; 127 dev = node_to_amd_nb(0)->misc;
128 if (dev==NULL) 128 if (dev==NULL)
129 return 0; 129 return 0;
130 130
@@ -181,16 +181,15 @@ static int amd_8151_configure(void)
181 unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real); 181 unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real);
182 int i; 182 int i;
183 183
184 if (!k8_northbridges.gart_supported) 184 if (!amd_nb_has_feature(AMD_NB_GART))
185 return 0; 185 return 0;
186 186
187 /* Configure AGP regs in each x86-64 host bridge. */ 187 /* Configure AGP regs in each x86-64 host bridge. */
188 for (i = 0; i < k8_northbridges.num; i++) { 188 for (i = 0; i < amd_nb_num(); i++) {
189 agp_bridge->gart_bus_addr = 189 agp_bridge->gart_bus_addr =
190 amd64_configure(k8_northbridges.nb_misc[i], 190 amd64_configure(node_to_amd_nb(i)->misc, gatt_bus);
191 gatt_bus);
192 } 191 }
193 k8_flush_garts(); 192 amd_flush_garts();
194 return 0; 193 return 0;
195} 194}
196 195
@@ -200,11 +199,11 @@ static void amd64_cleanup(void)
200 u32 tmp; 199 u32 tmp;
201 int i; 200 int i;
202 201
203 if (!k8_northbridges.gart_supported) 202 if (!amd_nb_has_feature(AMD_NB_GART))
204 return; 203 return;
205 204
206 for (i = 0; i < k8_northbridges.num; i++) { 205 for (i = 0; i < amd_nb_num(); i++) {
207 struct pci_dev *dev = k8_northbridges.nb_misc[i]; 206 struct pci_dev *dev = node_to_amd_nb(i)->misc;
208 /* disable gart translation */ 207 /* disable gart translation */
209 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &tmp); 208 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &tmp);
210 tmp &= ~GARTEN; 209 tmp &= ~GARTEN;
@@ -331,15 +330,15 @@ static __devinit int cache_nbs(struct pci_dev *pdev, u32 cap_ptr)
331{ 330{
332 int i; 331 int i;
333 332
334 if (cache_k8_northbridges() < 0) 333 if (amd_cache_northbridges() < 0)
335 return -ENODEV; 334 return -ENODEV;
336 335
337 if (!k8_northbridges.gart_supported) 336 if (!amd_nb_has_feature(AMD_NB_GART))
338 return -ENODEV; 337 return -ENODEV;
339 338
340 i = 0; 339 i = 0;
341 for (i = 0; i < k8_northbridges.num; i++) { 340 for (i = 0; i < amd_nb_num(); i++) {
342 struct pci_dev *dev = k8_northbridges.nb_misc[i]; 341 struct pci_dev *dev = node_to_amd_nb(i)->misc;
343 if (fix_northbridge(dev, pdev, cap_ptr) < 0) { 342 if (fix_northbridge(dev, pdev, cap_ptr) < 0) {
344 dev_err(&dev->dev, "no usable aperture found\n"); 343 dev_err(&dev->dev, "no usable aperture found\n");
345#ifdef __x86_64__ 344#ifdef __x86_64__
@@ -416,7 +415,7 @@ static int __devinit uli_agp_init(struct pci_dev *pdev)
416 } 415 }
417 416
418 /* shadow x86-64 registers into ULi registers */ 417 /* shadow x86-64 registers into ULi registers */
419 pci_read_config_dword (k8_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE, 418 pci_read_config_dword (node_to_amd_nb(0)->misc, AMD64_GARTAPERTUREBASE,
420 &httfea); 419 &httfea);
421 420
422 /* if x86-64 aperture base is beyond 4G, exit here */ 421 /* if x86-64 aperture base is beyond 4G, exit here */
@@ -484,7 +483,7 @@ static int nforce3_agp_init(struct pci_dev *pdev)
484 pci_write_config_dword(dev1, NVIDIA_X86_64_1_APSIZE, tmp); 483 pci_write_config_dword(dev1, NVIDIA_X86_64_1_APSIZE, tmp);
485 484
486 /* shadow x86-64 registers into NVIDIA registers */ 485 /* shadow x86-64 registers into NVIDIA registers */
487 pci_read_config_dword (k8_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE, 486 pci_read_config_dword (node_to_amd_nb(0)->misc, AMD64_GARTAPERTUREBASE,
488 &apbase); 487 &apbase);
489 488
490 /* if x86-64 aperture base is beyond 4G, exit here */ 489 /* if x86-64 aperture base is beyond 4G, exit here */
@@ -778,7 +777,7 @@ int __init agp_amd64_init(void)
778 } 777 }
779 778
780 /* First check that we have at least one AMD64 NB */ 779 /* First check that we have at least one AMD64 NB */
781 if (!pci_dev_present(k8_nb_ids)) 780 if (!pci_dev_present(amd_nb_misc_ids))
782 return -ENODEV; 781 return -ENODEV;
783 782
784 /* Look for any AGP bridge */ 783 /* Look for any AGP bridge */
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index eca9ba193e94..df211181fca4 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2917,7 +2917,7 @@ static int __init amd64_edac_init(void)
2917 2917
2918 opstate_init(); 2918 opstate_init();
2919 2919
2920 if (cache_k8_northbridges() < 0) 2920 if (amd_cache_northbridges() < 0)
2921 goto err_ret; 2921 goto err_ret;
2922 2922
2923 msrs = msrs_alloc(); 2923 msrs = msrs_alloc();
@@ -2934,7 +2934,7 @@ static int __init amd64_edac_init(void)
2934 * to finish initialization of the MC instances. 2934 * to finish initialization of the MC instances.
2935 */ 2935 */
2936 err = -ENODEV; 2936 err = -ENODEV;
2937 for (nb = 0; nb < k8_northbridges.num; nb++) { 2937 for (nb = 0; nb < amd_nb_num(); nb++) {
2938 if (!pvt_lookup[nb]) 2938 if (!pvt_lookup[nb])
2939 continue; 2939 continue;
2940 2940
diff --git a/drivers/platform/x86/intel_scu_ipc.c b/drivers/platform/x86/intel_scu_ipc.c
index 41a9e34899ac..ca35b0ce944a 100644
--- a/drivers/platform/x86/intel_scu_ipc.c
+++ b/drivers/platform/x86/intel_scu_ipc.c
@@ -26,6 +26,7 @@
26#include <linux/sfi.h> 26#include <linux/sfi.h>
27#include <asm/mrst.h> 27#include <asm/mrst.h>
28#include <asm/intel_scu_ipc.h> 28#include <asm/intel_scu_ipc.h>
29#include <asm/mrst.h>
29 30
30/* IPC defines the following message types */ 31/* IPC defines the following message types */
31#define IPCMSG_WATCHDOG_TIMER 0xF8 /* Set Kernel Watchdog Threshold */ 32#define IPCMSG_WATCHDOG_TIMER 0xF8 /* Set Kernel Watchdog Threshold */
@@ -699,6 +700,9 @@ static int ipc_probe(struct pci_dev *dev, const struct pci_device_id *id)
699 iounmap(ipcdev.ipc_base); 700 iounmap(ipcdev.ipc_base);
700 return -ENOMEM; 701 return -ENOMEM;
701 } 702 }
703
704 intel_scu_devices_create();
705
702 return 0; 706 return 0;
703} 707}
704 708
@@ -720,6 +724,7 @@ static void ipc_remove(struct pci_dev *pdev)
720 iounmap(ipcdev.ipc_base); 724 iounmap(ipcdev.ipc_base);
721 iounmap(ipcdev.i2c_base); 725 iounmap(ipcdev.i2c_base);
722 ipcdev.pdev = NULL; 726 ipcdev.pdev = NULL;
727 intel_scu_devices_destroy();
723} 728}
724 729
725static const struct pci_device_id pci_ids[] = { 730static const struct pci_device_id pci_ids[] = {
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 2883428d5ac8..4941cade319f 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -463,6 +463,18 @@ config RTC_DRV_CMOS
463 This driver can also be built as a module. If so, the module 463 This driver can also be built as a module. If so, the module
464 will be called rtc-cmos. 464 will be called rtc-cmos.
465 465
466config RTC_DRV_VRTC
467 tristate "Virtual RTC for Moorestown platforms"
468 depends on X86_MRST
469 default y if X86_MRST
470
471 help
472 Say "yes" here to get direct support for the real time clock
473 found on Moorestown platforms. The VRTC is an emulated RTC that
474 derives its clock source from a real RTC in the PMIC. The MC146818
475 style programming interface is mostly preserved, but any
476 updates are done via IPC calls to the system controller FW.
477
466config RTC_DRV_DS1216 478config RTC_DRV_DS1216
467 tristate "Dallas DS1216" 479 tristate "Dallas DS1216"
468 depends on SNI_RM 480 depends on SNI_RM
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index 4c2832df4697..2afdaf3ff986 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_RTC_DRV_CMOS) += rtc-cmos.o
30obj-$(CONFIG_RTC_DRV_COH901331) += rtc-coh901331.o 30obj-$(CONFIG_RTC_DRV_COH901331) += rtc-coh901331.o
31obj-$(CONFIG_RTC_DRV_DAVINCI) += rtc-davinci.o 31obj-$(CONFIG_RTC_DRV_DAVINCI) += rtc-davinci.o
32obj-$(CONFIG_RTC_DRV_DM355EVM) += rtc-dm355evm.o 32obj-$(CONFIG_RTC_DRV_DM355EVM) += rtc-dm355evm.o
33obj-$(CONFIG_RTC_DRV_VRTC) += rtc-mrst.o
33obj-$(CONFIG_RTC_DRV_DS1216) += rtc-ds1216.o 34obj-$(CONFIG_RTC_DRV_DS1216) += rtc-ds1216.o
34obj-$(CONFIG_RTC_DRV_DS1286) += rtc-ds1286.o 35obj-$(CONFIG_RTC_DRV_DS1286) += rtc-ds1286.o
35obj-$(CONFIG_RTC_DRV_DS1302) += rtc-ds1302.o 36obj-$(CONFIG_RTC_DRV_DS1302) += rtc-ds1302.o
diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c
new file mode 100644
index 000000000000..bcd0cf63eb16
--- /dev/null
+++ b/drivers/rtc/rtc-mrst.c
@@ -0,0 +1,582 @@
1/*
2 * rtc-mrst.c: Driver for Moorestown virtual RTC
3 *
4 * (C) Copyright 2009 Intel Corporation
5 * Author: Jacob Pan (jacob.jun.pan@intel.com)
6 * Feng Tang (feng.tang@intel.com)
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; version 2
11 * of the License.
12 *
13 * Note:
14 * VRTC is emulated by system controller firmware, the real HW
15 * RTC is located in the PMIC device. SCU FW shadows PMIC RTC
16 * in a memory mapped IO space that is visible to the host IA
17 * processor.
18 *
19 * This driver is based upon drivers/rtc/rtc-cmos.c
20 */
21
22/*
23 * Note:
24 * * vRTC only supports binary mode and 24H mode
25 * * vRTC only supports PIE and AIE, no UIE; its PIE fires only
26 * at 23:59:59 every day, with no support for adjustable frequency
27 * * Alarm function is also limited to hr/min/sec.
28 */
29
30#include <linux/mod_devicetable.h>
31#include <linux/platform_device.h>
32#include <linux/interrupt.h>
33#include <linux/spinlock.h>
34#include <linux/kernel.h>
35#include <linux/module.h>
36#include <linux/init.h>
37#include <linux/sfi.h>
38
39#include <asm-generic/rtc.h>
40#include <asm/intel_scu_ipc.h>
41#include <asm/mrst.h>
42#include <asm/mrst-vrtc.h>
43
44struct mrst_rtc {
45 struct rtc_device *rtc;
46 struct device *dev;
47 int irq;
48 struct resource *iomem;
49
50 u8 enabled_wake;
51 u8 suspend_ctrl;
52};
53
54static const char driver_name[] = "rtc_mrst";
55
56#define RTC_IRQMASK (RTC_PF | RTC_AF)
57
58static inline int is_intr(u8 rtc_intr)
59{
60 if (!(rtc_intr & RTC_IRQF))
61 return 0;
62 return rtc_intr & RTC_IRQMASK;
63}
64
65/*
66 * rtc_time's year holds the offset from 1900, but vRTC's YEAR
67 * register can't be programmed to a value larger than 0x64, so this
68 * driver uses 1960 (not 1970, the UNIX epoch) as the base and does
69 * the translation at read/write time.
70 *
71 * Why not just use 1970 as the offset? Because a base of 1960 keeps
72 * the leap-year handling consistent between the vRTC and low-level
73 * physical RTC devices.
74 */
75static int mrst_read_time(struct device *dev, struct rtc_time *time)
76{
77 unsigned long flags;
78
79 if (rtc_is_updating())
80 mdelay(20);
81
82 spin_lock_irqsave(&rtc_lock, flags);
83 time->tm_sec = vrtc_cmos_read(RTC_SECONDS);
84 time->tm_min = vrtc_cmos_read(RTC_MINUTES);
85 time->tm_hour = vrtc_cmos_read(RTC_HOURS);
86 time->tm_mday = vrtc_cmos_read(RTC_DAY_OF_MONTH);
87 time->tm_mon = vrtc_cmos_read(RTC_MONTH);
88 time->tm_year = vrtc_cmos_read(RTC_YEAR);
89 spin_unlock_irqrestore(&rtc_lock, flags);
90
91	/* Adjust for the 1960 base (tm_year counts from 1900) */
92 time->tm_year += 60;
93 time->tm_mon--;
94 return RTC_24H;
95}
96
97static int mrst_set_time(struct device *dev, struct rtc_time *time)
98{
99 int ret;
100 unsigned long flags;
101 unsigned char mon, day, hrs, min, sec;
102 unsigned int yrs;
103
104 yrs = time->tm_year;
105 mon = time->tm_mon + 1; /* tm_mon starts at zero */
106 day = time->tm_mday;
107 hrs = time->tm_hour;
108 min = time->tm_min;
109 sec = time->tm_sec;
110
111 if (yrs < 70 || yrs > 138)
112 return -EINVAL;
113 yrs -= 60;
114
115 spin_lock_irqsave(&rtc_lock, flags);
116
117 vrtc_cmos_write(yrs, RTC_YEAR);
118 vrtc_cmos_write(mon, RTC_MONTH);
119 vrtc_cmos_write(day, RTC_DAY_OF_MONTH);
120 vrtc_cmos_write(hrs, RTC_HOURS);
121 vrtc_cmos_write(min, RTC_MINUTES);
122 vrtc_cmos_write(sec, RTC_SECONDS);
123
124 spin_unlock_irqrestore(&rtc_lock, flags);
125
126 ret = intel_scu_ipc_simple_command(IPCMSG_VRTC, IPC_CMD_VRTC_SETTIME);
127 return ret;
128}
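
A note on the year arithmetic in mrst_read_time()/mrst_set_time(): the driver simply adds or subtracts 60 at the register boundary, and since 1960 is divisible by 4, register values that are multiples of 4 line up with real leap years, presumably the consistency the comment above refers to. A minimal user-space sketch of the translation, illustrative only and not part of the driver:

	#include <stdio.h>

	/*
	 * Sketch of the vRTC year translation: tm_year counts from 1900,
	 * the vRTC YEAR register counts from 1960.
	 */
	static int tm_year_to_vrtc(int tm_year)
	{
		if (tm_year < 70 || tm_year > 138)	/* 1970..2038, as in mrst_set_time() */
			return -1;
		return tm_year - 60;			/* e.g. 111 (2011) -> 51 */
	}

	static int vrtc_to_tm_year(int reg)
	{
		return reg + 60;			/* e.g. 51 -> 111 (2011) */
	}

	int main(void)
	{
		printf("%d %d\n", tm_year_to_vrtc(111), vrtc_to_tm_year(51));
		return 0;
	}
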
129
130static int mrst_read_alarm(struct device *dev, struct rtc_wkalrm *t)
131{
132 struct mrst_rtc *mrst = dev_get_drvdata(dev);
133 unsigned char rtc_control;
134
135 if (mrst->irq <= 0)
136 return -EIO;
137
138	/* Basic alarms only support hour, minute, and second fields.
139 * Some also support day and month, for alarms up to a year in
140 * the future.
141 */
142 t->time.tm_mday = -1;
143 t->time.tm_mon = -1;
144 t->time.tm_year = -1;
145
146 /* vRTC only supports binary mode */
147 spin_lock_irq(&rtc_lock);
148 t->time.tm_sec = vrtc_cmos_read(RTC_SECONDS_ALARM);
149 t->time.tm_min = vrtc_cmos_read(RTC_MINUTES_ALARM);
150 t->time.tm_hour = vrtc_cmos_read(RTC_HOURS_ALARM);
151
152 rtc_control = vrtc_cmos_read(RTC_CONTROL);
153 spin_unlock_irq(&rtc_lock);
154
155 t->enabled = !!(rtc_control & RTC_AIE);
156 t->pending = 0;
157
158 return 0;
159}
160
161static void mrst_checkintr(struct mrst_rtc *mrst, unsigned char rtc_control)
162{
163 unsigned char rtc_intr;
164
165 /*
166 * NOTE after changing RTC_xIE bits we always read INTR_FLAGS;
167 * allegedly some older rtcs need that to handle irqs properly
168 */
169 rtc_intr = vrtc_cmos_read(RTC_INTR_FLAGS);
170 rtc_intr &= (rtc_control & RTC_IRQMASK) | RTC_IRQF;
171 if (is_intr(rtc_intr))
172 rtc_update_irq(mrst->rtc, 1, rtc_intr);
173}
174
175static void mrst_irq_enable(struct mrst_rtc *mrst, unsigned char mask)
176{
177 unsigned char rtc_control;
178
179 /*
180 * Flush any pending IRQ status, notably for update irqs,
181 * before we enable new IRQs
182 */
183 rtc_control = vrtc_cmos_read(RTC_CONTROL);
184 mrst_checkintr(mrst, rtc_control);
185
186 rtc_control |= mask;
187 vrtc_cmos_write(rtc_control, RTC_CONTROL);
188
189 mrst_checkintr(mrst, rtc_control);
190}
191
192static void mrst_irq_disable(struct mrst_rtc *mrst, unsigned char mask)
193{
194 unsigned char rtc_control;
195
196 rtc_control = vrtc_cmos_read(RTC_CONTROL);
197 rtc_control &= ~mask;
198 vrtc_cmos_write(rtc_control, RTC_CONTROL);
199 mrst_checkintr(mrst, rtc_control);
200}
201
202static int mrst_set_alarm(struct device *dev, struct rtc_wkalrm *t)
203{
204 struct mrst_rtc *mrst = dev_get_drvdata(dev);
205 unsigned char hrs, min, sec;
206 int ret = 0;
207
208 if (!mrst->irq)
209 return -EIO;
210
211 hrs = t->time.tm_hour;
212 min = t->time.tm_min;
213 sec = t->time.tm_sec;
214
215 spin_lock_irq(&rtc_lock);
216 /* Next rtc irq must not be from previous alarm setting */
217 mrst_irq_disable(mrst, RTC_AIE);
218
219 /* Update alarm */
220 vrtc_cmos_write(hrs, RTC_HOURS_ALARM);
221 vrtc_cmos_write(min, RTC_MINUTES_ALARM);
222 vrtc_cmos_write(sec, RTC_SECONDS_ALARM);
223
224 spin_unlock_irq(&rtc_lock);
225
226 ret = intel_scu_ipc_simple_command(IPCMSG_VRTC, IPC_CMD_VRTC_SETALARM);
227 if (ret)
228 return ret;
229
230 spin_lock_irq(&rtc_lock);
231 if (t->enabled)
232 mrst_irq_enable(mrst, RTC_AIE);
233
234 spin_unlock_irq(&rtc_lock);
235
236 return 0;
237}
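
From userspace the alarm path above is reached through the generic RTC character device; RTC_WKALM_SET carries both the alarm time and the enabled flag, matching the t->enabled handling in mrst_set_alarm(). A minimal sketch (the "/dev/rtc0" node name is an assumption):

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/rtc.h>

	/*
	 * Arm an hour/minute/second alarm through the generic RTC chardev;
	 * the rtc class routes this to mrst_set_alarm().
	 */
	int main(void)
	{
		struct rtc_wkalrm alm = { 0 };
		int fd = open("/dev/rtc0", O_RDWR);

		if (fd < 0)
			return 1;
		alm.enabled = 1;	/* becomes t->enabled above */
		alm.time.tm_hour = 23;
		alm.time.tm_min = 59;
		alm.time.tm_sec = 0;	/* day/month/year are ignored by the vRTC */
		if (ioctl(fd, RTC_WKALM_SET, &alm) < 0)
			perror("RTC_WKALM_SET");
		close(fd);
		return 0;
	}
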
238
239static int mrst_irq_set_state(struct device *dev, int enabled)
240{
241 struct mrst_rtc *mrst = dev_get_drvdata(dev);
242 unsigned long flags;
243
244 if (!mrst->irq)
245 return -ENXIO;
246
247 spin_lock_irqsave(&rtc_lock, flags);
248
249 if (enabled)
250 mrst_irq_enable(mrst, RTC_PIE);
251 else
252 mrst_irq_disable(mrst, RTC_PIE);
253
254 spin_unlock_irqrestore(&rtc_lock, flags);
255 return 0;
256}
257
258#if defined(CONFIG_RTC_INTF_DEV) || defined(CONFIG_RTC_INTF_DEV_MODULE)
259
260/* Currently, the vRTC doesn't support UIE ON/OFF */
261static int
262mrst_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
263{
264 struct mrst_rtc *mrst = dev_get_drvdata(dev);
265 unsigned long flags;
266
267 switch (cmd) {
268 case RTC_AIE_OFF:
269 case RTC_AIE_ON:
270 if (!mrst->irq)
271 return -EINVAL;
272 break;
273 default:
274 /* PIE ON/OFF is handled by mrst_irq_set_state() */
275 return -ENOIOCTLCMD;
276 }
277
278 spin_lock_irqsave(&rtc_lock, flags);
279 switch (cmd) {
280 case RTC_AIE_OFF: /* alarm off */
281 mrst_irq_disable(mrst, RTC_AIE);
282 break;
283 case RTC_AIE_ON: /* alarm on */
284 mrst_irq_enable(mrst, RTC_AIE);
285 break;
286 }
287 spin_unlock_irqrestore(&rtc_lock, flags);
288 return 0;
289}
290
291#else
292#define mrst_rtc_ioctl NULL
293#endif
294
295#if defined(CONFIG_RTC_INTF_PROC) || defined(CONFIG_RTC_INTF_PROC_MODULE)
296
297static int mrst_procfs(struct device *dev, struct seq_file *seq)
298{
299 unsigned char rtc_control, valid;
300
301 spin_lock_irq(&rtc_lock);
302 rtc_control = vrtc_cmos_read(RTC_CONTROL);
303 valid = vrtc_cmos_read(RTC_VALID);
304 spin_unlock_irq(&rtc_lock);
305
306 return seq_printf(seq,
307 "periodic_IRQ\t: %s\n"
308 "alarm\t\t: %s\n"
309 "BCD\t\t: no\n"
310 "periodic_freq\t: daily (not adjustable)\n",
311 (rtc_control & RTC_PIE) ? "on" : "off",
312 (rtc_control & RTC_AIE) ? "on" : "off");
313}
314
315#else
316#define mrst_procfs NULL
317#endif
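
For reference, the format string above yields /proc output of roughly this shape; the on/off values track RTC_PIE and RTC_AIE in the control register:

	periodic_IRQ	: off
	alarm		: on
	BCD		: no
	periodic_freq	: daily (not adjustable)
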
318
319static const struct rtc_class_ops mrst_rtc_ops = {
320 .ioctl = mrst_rtc_ioctl,
321 .read_time = mrst_read_time,
322 .set_time = mrst_set_time,
323 .read_alarm = mrst_read_alarm,
324 .set_alarm = mrst_set_alarm,
325 .proc = mrst_procfs,
326 .irq_set_state = mrst_irq_set_state,
327};
328
329static struct mrst_rtc mrst_rtc;
330
331/*
332 * When the vRTC IRQ is captured by the SCU FW, the FW clears the AIE
333 * bit in Reg B, so there is no need for this driver to clear it
334 */
335static irqreturn_t mrst_rtc_irq(int irq, void *p)
336{
337 u8 irqstat;
338
339 spin_lock(&rtc_lock);
340 /* This read will clear all IRQ flags inside Reg C */
341 irqstat = vrtc_cmos_read(RTC_INTR_FLAGS);
342 spin_unlock(&rtc_lock);
343
344 irqstat &= RTC_IRQMASK | RTC_IRQF;
345 if (is_intr(irqstat)) {
346 rtc_update_irq(p, 1, irqstat);
347 return IRQ_HANDLED;
348 }
349 return IRQ_NONE;
350}
351
352static int __init
353vrtc_mrst_do_probe(struct device *dev, struct resource *iomem, int rtc_irq)
354{
355 int retval = 0;
356 unsigned char rtc_control;
357
358 /* There can be only one ... */
359 if (mrst_rtc.dev)
360 return -EBUSY;
361
362 if (!iomem)
363 return -ENODEV;
364
365 iomem = request_mem_region(iomem->start,
366 iomem->end + 1 - iomem->start,
367 driver_name);
368 if (!iomem) {
369 dev_dbg(dev, "i/o mem already in use.\n");
370 return -EBUSY;
371 }
372
373 mrst_rtc.irq = rtc_irq;
374 mrst_rtc.iomem = iomem;
375
376 mrst_rtc.rtc = rtc_device_register(driver_name, dev,
377 &mrst_rtc_ops, THIS_MODULE);
378 if (IS_ERR(mrst_rtc.rtc)) {
379 retval = PTR_ERR(mrst_rtc.rtc);
380 goto cleanup0;
381 }
382
383 mrst_rtc.dev = dev;
384 dev_set_drvdata(dev, &mrst_rtc);
385 rename_region(iomem, dev_name(&mrst_rtc.rtc->dev));
386
387 spin_lock_irq(&rtc_lock);
388 mrst_irq_disable(&mrst_rtc, RTC_PIE | RTC_AIE);
389 rtc_control = vrtc_cmos_read(RTC_CONTROL);
390 spin_unlock_irq(&rtc_lock);
391
392 if (!(rtc_control & RTC_24H) || (rtc_control & (RTC_DM_BINARY)))
393 dev_dbg(dev, "TODO: support more than 24-hr BCD mode\n");
394
395 if (rtc_irq) {
396 retval = request_irq(rtc_irq, mrst_rtc_irq,
397 IRQF_DISABLED, dev_name(&mrst_rtc.rtc->dev),
398 mrst_rtc.rtc);
399 if (retval < 0) {
400 dev_dbg(dev, "IRQ %d is already in use, err %d\n",
401 rtc_irq, retval);
402 goto cleanup1;
403 }
404 }
405 dev_dbg(dev, "initialised\n");
406 return 0;
407
408cleanup1:
409 mrst_rtc.dev = NULL;
410 rtc_device_unregister(mrst_rtc.rtc);
411cleanup0:
412 release_region(iomem->start, iomem->end + 1 - iomem->start);
413 dev_err(dev, "rtc-mrst: unable to initialise\n");
414 return retval;
415}
416
417static void rtc_mrst_do_shutdown(void)
418{
419 spin_lock_irq(&rtc_lock);
420 mrst_irq_disable(&mrst_rtc, RTC_IRQMASK);
421 spin_unlock_irq(&rtc_lock);
422}
423
424static void __exit rtc_mrst_do_remove(struct device *dev)
425{
426 struct mrst_rtc *mrst = dev_get_drvdata(dev);
427 struct resource *iomem;
428
429 rtc_mrst_do_shutdown();
430
431 if (mrst->irq)
432 free_irq(mrst->irq, mrst->rtc);
433
434 rtc_device_unregister(mrst->rtc);
435 mrst->rtc = NULL;
436
437 iomem = mrst->iomem;
438 release_region(iomem->start, iomem->end + 1 - iomem->start);
439 mrst->iomem = NULL;
440
441 mrst->dev = NULL;
442 dev_set_drvdata(dev, NULL);
443}
444
445#ifdef CONFIG_PM
446static int mrst_suspend(struct device *dev, pm_message_t mesg)
447{
448 struct mrst_rtc *mrst = dev_get_drvdata(dev);
449 unsigned char tmp;
450
451 /* Only the alarm might be a wakeup event source */
452 spin_lock_irq(&rtc_lock);
453 mrst->suspend_ctrl = tmp = vrtc_cmos_read(RTC_CONTROL);
454 if (tmp & (RTC_PIE | RTC_AIE)) {
455 unsigned char mask;
456
457 if (device_may_wakeup(dev))
458 mask = RTC_IRQMASK & ~RTC_AIE;
459 else
460 mask = RTC_IRQMASK;
461 tmp &= ~mask;
462 vrtc_cmos_write(tmp, RTC_CONTROL);
463
464 mrst_checkintr(mrst, tmp);
465 }
466 spin_unlock_irq(&rtc_lock);
467
468 if (tmp & RTC_AIE) {
469 mrst->enabled_wake = 1;
470 enable_irq_wake(mrst->irq);
471 }
472
473 dev_dbg(&mrst_rtc.rtc->dev, "suspend%s, ctrl %02x\n",
474 (tmp & RTC_AIE) ? ", alarm may wake" : "",
475 tmp);
476
477 return 0;
478}
479
480/*
481 * We want RTC alarms to wake us from the deep power saving state
482 */
483static inline int mrst_poweroff(struct device *dev)
484{
485 return mrst_suspend(dev, PMSG_HIBERNATE);
486}
487
488static int mrst_resume(struct device *dev)
489{
490 struct mrst_rtc *mrst = dev_get_drvdata(dev);
491 unsigned char tmp = mrst->suspend_ctrl;
492
493 /* Re-enable any irqs previously active */
494 if (tmp & RTC_IRQMASK) {
495 unsigned char mask;
496
497 if (mrst->enabled_wake) {
498 disable_irq_wake(mrst->irq);
499 mrst->enabled_wake = 0;
500 }
501
502 spin_lock_irq(&rtc_lock);
503 do {
504 vrtc_cmos_write(tmp, RTC_CONTROL);
505
506 mask = vrtc_cmos_read(RTC_INTR_FLAGS);
507 mask &= (tmp & RTC_IRQMASK) | RTC_IRQF;
508 if (!is_intr(mask))
509 break;
510
511 rtc_update_irq(mrst->rtc, 1, mask);
512 tmp &= ~RTC_AIE;
513 } while (mask & RTC_AIE);
514 spin_unlock_irq(&rtc_lock);
515 }
516
517 dev_dbg(&mrst_rtc.rtc->dev, "resume, ctrl %02x\n", tmp);
518
519 return 0;
520}
521
522#else
523#define mrst_suspend NULL
524#define mrst_resume NULL
525
526static inline int mrst_poweroff(struct device *dev)
527{
528 return -ENOSYS;
529}
530
531#endif
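
Whether the alarm stays armed across suspend hinges on device_may_wakeup() above, which userspace steers through the device's "power/wakeup" sysfs attribute. A sketch; the sysfs path is hypothetical, since the real one depends on where the platform device is registered:

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>

	/* Mark the vRTC as a wakeup source before suspending. */
	int main(void)
	{
		int fd = open("/sys/devices/platform/rtc_mrst/power/wakeup", O_WRONLY);

		if (fd < 0)
			return 1;
		if (write(fd, "enabled", 7) != 7)
			perror("write");
		close(fd);
		return 0;
	}
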
532
533static int __init vrtc_mrst_platform_probe(struct platform_device *pdev)
534{
535 return vrtc_mrst_do_probe(&pdev->dev,
536 platform_get_resource(pdev, IORESOURCE_MEM, 0),
537 platform_get_irq(pdev, 0));
538}
539
540static int __exit vrtc_mrst_platform_remove(struct platform_device *pdev)
541{
542 rtc_mrst_do_remove(&pdev->dev);
543 return 0;
544}
545
546static void vrtc_mrst_platform_shutdown(struct platform_device *pdev)
547{
548 if (system_state == SYSTEM_POWER_OFF && !mrst_poweroff(&pdev->dev))
549 return;
550
551 rtc_mrst_do_shutdown();
552}
553
554MODULE_ALIAS("platform:vrtc_mrst");
555
556static struct platform_driver vrtc_mrst_platform_driver = {
557 .probe = vrtc_mrst_platform_probe,
558 .remove = __exit_p(vrtc_mrst_platform_remove),
559 .shutdown = vrtc_mrst_platform_shutdown,
560 .driver = {
561 .name = (char *) driver_name,
562 .suspend = mrst_suspend,
563 .resume = mrst_resume,
564 }
565};
566
567static int __init vrtc_mrst_init(void)
568{
569 return platform_driver_register(&vrtc_mrst_platform_driver);
570}
571
572static void __exit vrtc_mrst_exit(void)
573{
574 platform_driver_unregister(&vrtc_mrst_platform_driver);
575}
576
577module_init(vrtc_mrst_init);
578module_exit(vrtc_mrst_exit);
579
580MODULE_AUTHOR("Jacob Pan; Feng Tang");
581MODULE_DESCRIPTION("Driver for Moorestown virtual RTC");
582MODULE_LICENSE("GPL");
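
The probe above expects a MEM resource and an IRQ from a matching platform device. Note that platform-bus matching goes by the driver name ("rtc_mrst"), while the MODULE_ALIAS above says "vrtc_mrst"; the alias only matters for module autoloading. A hypothetical sketch of the platform-side registration, with placeholder base address and IRQ (on real hardware these come from the SFI tables):

	#include <linux/kernel.h>
	#include <linux/init.h>
	#include <linux/ioport.h>
	#include <linux/platform_device.h>

	/* Placeholder values only; not the real Moorestown layout. */
	static struct resource vrtc_resources[] = {
		{
			.start	= 0xff000000,		/* assumed vRTC MMIO base */
			.end	= 0xff000000 + 1024 - 1,
			.flags	= IORESOURCE_MEM,
		},
		{
			.start	= 8,			/* assumed vRTC IRQ */
			.end	= 8,
			.flags	= IORESOURCE_IRQ,
		},
	};

	static struct platform_device vrtc_device = {
		.name		= "rtc_mrst",		/* must match driver_name */
		.id		= -1,
		.resource	= vrtc_resources,
		.num_resources	= ARRAY_SIZE(vrtc_resources),
	};

	static int __init vrtc_register(void)
	{
		return platform_device_register(&vrtc_device);
	}
	device_initcall(vrtc_register);
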
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 5476c066d4ee..3c4039d5eef1 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -763,7 +763,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
763 int metadata; 763 int metadata;
764 unsigned int revokes = 0; 764 unsigned int revokes = 0;
765 int x; 765 int x;
766 int error; 766 int error = 0;
767 767
768 if (!*top) 768 if (!*top)
769 sm->sm_first = 0; 769 sm->sm_first = 0;
@@ -780,7 +780,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
780 if (metadata) 780 if (metadata)
781 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; 781 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
782 782
783 error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); 783 if (ip != GFS2_I(sdp->sd_rindex))
784 error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
785 else if (!sdp->sd_rgrps)
786 error = gfs2_ri_update(ip);
787
784 if (error) 788 if (error)
785 return error; 789 return error;
786 790
@@ -879,7 +883,8 @@ out_rg_gunlock:
879out_rlist: 883out_rlist:
880 gfs2_rlist_free(&rlist); 884 gfs2_rlist_free(&rlist);
881out: 885out:
882 gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); 886 if (ip != GFS2_I(sdp->sd_rindex))
887 gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
883 return error; 888 return error;
884} 889}
885 890
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index f92c17704169..08a8beb152e6 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -541,21 +541,6 @@ out_locked:
541 spin_unlock(&gl->gl_spin); 541 spin_unlock(&gl->gl_spin);
542} 542}
543 543
544static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
545 unsigned int req_state,
546 unsigned int flags)
547{
548 int ret = LM_OUT_ERROR;
549
550 if (!sdp->sd_lockstruct.ls_ops->lm_lock)
551 return req_state == LM_ST_UNLOCKED ? 0 : req_state;
552
553 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
554 ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
555 req_state, flags);
556 return ret;
557}
558
559/** 544/**
560 * do_xmote - Calls the DLM to change the state of a lock 545 * do_xmote - Calls the DLM to change the state of a lock
561 * @gl: The lock state 546 * @gl: The lock state
@@ -575,13 +560,14 @@ __acquires(&gl->gl_spin)
575 560
576 lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | 561 lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
577 LM_FLAG_PRIORITY); 562 LM_FLAG_PRIORITY);
578 BUG_ON(gl->gl_state == target); 563 GLOCK_BUG_ON(gl, gl->gl_state == target);
579 BUG_ON(gl->gl_state == gl->gl_target); 564 GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target);
580 if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && 565 if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
581 glops->go_inval) { 566 glops->go_inval) {
582 set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); 567 set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
583 do_error(gl, 0); /* Fail queued try locks */ 568 do_error(gl, 0); /* Fail queued try locks */
584 } 569 }
570 gl->gl_req = target;
585 spin_unlock(&gl->gl_spin); 571 spin_unlock(&gl->gl_spin);
586 if (glops->go_xmote_th) 572 if (glops->go_xmote_th)
587 glops->go_xmote_th(gl); 573 glops->go_xmote_th(gl);
@@ -594,15 +580,17 @@ __acquires(&gl->gl_spin)
594 gl->gl_state == LM_ST_DEFERRED) && 580 gl->gl_state == LM_ST_DEFERRED) &&
595 !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) 581 !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
596 lck_flags |= LM_FLAG_TRY_1CB; 582 lck_flags |= LM_FLAG_TRY_1CB;
597 ret = gfs2_lm_lock(sdp, gl, target, lck_flags);
598 583
599 if (!(ret & LM_OUT_ASYNC)) { 584 if (sdp->sd_lockstruct.ls_ops->lm_lock) {
600 finish_xmote(gl, ret); 585 /* lock_dlm */
586 ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
587 GLOCK_BUG_ON(gl, ret);
588 } else { /* lock_nolock */
589 finish_xmote(gl, target);
601 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 590 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
602 gfs2_glock_put(gl); 591 gfs2_glock_put(gl);
603 } else {
604 GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
605 } 592 }
593
606 spin_lock(&gl->gl_spin); 594 spin_lock(&gl->gl_spin);
607} 595}
608 596
@@ -951,17 +939,22 @@ int gfs2_glock_wait(struct gfs2_holder *gh)
951 939
952void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) 940void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
953{ 941{
942 struct va_format vaf;
954 va_list args; 943 va_list args;
955 944
956 va_start(args, fmt); 945 va_start(args, fmt);
946
957 if (seq) { 947 if (seq) {
958 struct gfs2_glock_iter *gi = seq->private; 948 struct gfs2_glock_iter *gi = seq->private;
959 vsprintf(gi->string, fmt, args); 949 vsprintf(gi->string, fmt, args);
960 seq_printf(seq, gi->string); 950 seq_printf(seq, gi->string);
961 } else { 951 } else {
962 printk(KERN_ERR " "); 952 vaf.fmt = fmt;
963 vprintk(fmt, args); 953 vaf.va = &args;
954
955 printk(KERN_ERR " %pV", &vaf);
964 } 956 }
957
965 va_end(args); 958 va_end(args);
966} 959}
967 960
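
The gfs2_print_dbg() change switches the printk path to the kernel's %pV extension: a struct va_format wraps the format string and the va_list so printk() expands the whole pack in one call, instead of an unbounded vsprintf() into a fixed buffer. The generic pattern, independent of GFS2:

	#include <linux/kernel.h>

	/* Generic %pV pattern: wrap fmt + va_list and let printk() expand it. */
	static void example_vprint(const char *prefix, const char *fmt, ...)
	{
		struct va_format vaf;
		va_list args;

		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;
		printk(KERN_DEBUG "%s: %pV", prefix, &vaf);
		va_end(args);
	}
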
@@ -1361,24 +1354,28 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl)
1361 * @gl: Pointer to the glock 1354 * @gl: Pointer to the glock
1362 * @ret: The return value from the dlm 1355 * @ret: The return value from the dlm
1363 * 1356 *
1357 * The gl_reply field is under the gl_spin lock so that it is ok
1358 * to use a bitfield shared with other glock state fields.
1364 */ 1359 */
1365 1360
1366void gfs2_glock_complete(struct gfs2_glock *gl, int ret) 1361void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
1367{ 1362{
1368 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; 1363 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
1369 1364
1365 spin_lock(&gl->gl_spin);
1370 gl->gl_reply = ret; 1366 gl->gl_reply = ret;
1371 1367
1372 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { 1368 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
1373 spin_lock(&gl->gl_spin);
1374 if (gfs2_should_freeze(gl)) { 1369 if (gfs2_should_freeze(gl)) {
1375 set_bit(GLF_FROZEN, &gl->gl_flags); 1370 set_bit(GLF_FROZEN, &gl->gl_flags);
1376 spin_unlock(&gl->gl_spin); 1371 spin_unlock(&gl->gl_spin);
1377 return; 1372 return;
1378 } 1373 }
1379 spin_unlock(&gl->gl_spin);
1380 } 1374 }
1375
1376 spin_unlock(&gl->gl_spin);
1381 set_bit(GLF_REPLY_PENDING, &gl->gl_flags); 1377 set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
1378 smp_wmb();
1382 gfs2_glock_hold(gl); 1379 gfs2_glock_hold(gl);
1383 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 1380 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1384 gfs2_glock_put(gl); 1381 gfs2_glock_put(gl);
@@ -1626,18 +1623,17 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
1626static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) 1623static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
1627{ 1624{
1628 struct task_struct *gh_owner = NULL; 1625 struct task_struct *gh_owner = NULL;
1629 char buffer[KSYM_SYMBOL_LEN];
1630 char flags_buf[32]; 1626 char flags_buf[32];
1631 1627
1632 sprint_symbol(buffer, gh->gh_ip);
1633 if (gh->gh_owner_pid) 1628 if (gh->gh_owner_pid)
1634 gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); 1629 gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
1635 gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n", 1630 gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
1636 state2str(gh->gh_state), 1631 state2str(gh->gh_state),
1637 hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), 1632 hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
1638 gh->gh_error, 1633 gh->gh_error,
1639 gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, 1634 gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
1640 gh_owner ? gh_owner->comm : "(ended)", buffer); 1635 gh_owner ? gh_owner->comm : "(ended)",
1636 (void *)gh->gh_ip);
1641 return 0; 1637 return 0;
1642} 1638}
1643 1639
@@ -1782,12 +1778,13 @@ int __init gfs2_glock_init(void)
1782 } 1778 }
1783#endif 1779#endif
1784 1780
1785 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER | 1781 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
1786 WQ_HIGHPRI | WQ_FREEZEABLE, 0); 1782 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
1787 if (IS_ERR(glock_workqueue)) 1783 if (IS_ERR(glock_workqueue))
1788 return PTR_ERR(glock_workqueue); 1784 return PTR_ERR(glock_workqueue);
1789 gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER | 1785 gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
1790 WQ_FREEZEABLE, 0); 1786 WQ_MEM_RECLAIM | WQ_FREEZEABLE,
1787 0);
1791 if (IS_ERR(gfs2_delete_workqueue)) { 1788 if (IS_ERR(gfs2_delete_workqueue)) {
1792 destroy_workqueue(glock_workqueue); 1789 destroy_workqueue(glock_workqueue);
1793 return PTR_ERR(gfs2_delete_workqueue); 1790 return PTR_ERR(gfs2_delete_workqueue);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index db1c26d6d220..691851ceb615 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -87,11 +87,10 @@ enum {
87#define GL_ASYNC 0x00000040 87#define GL_ASYNC 0x00000040
88#define GL_EXACT 0x00000080 88#define GL_EXACT 0x00000080
89#define GL_SKIP 0x00000100 89#define GL_SKIP 0x00000100
90#define GL_ATIME 0x00000200
91#define GL_NOCACHE 0x00000400 90#define GL_NOCACHE 0x00000400
92 91
93/* 92/*
94 * lm_lock() and lm_async_cb return flags 93 * lm_async_cb return flags
95 * 94 *
96 * LM_OUT_ST_MASK 95 * LM_OUT_ST_MASK
97 * Masks the lower two bits of lock state in the returned value. 96 * Masks the lower two bits of lock state in the returned value.
@@ -99,15 +98,11 @@ enum {
99 * LM_OUT_CANCELED 98 * LM_OUT_CANCELED
100 * The lock request was canceled. 99 * The lock request was canceled.
101 * 100 *
102 * LM_OUT_ASYNC
103 * The result of the request will be returned in an LM_CB_ASYNC callback.
104 *
105 */ 101 */
106 102
107#define LM_OUT_ST_MASK 0x00000003 103#define LM_OUT_ST_MASK 0x00000003
108#define LM_OUT_CANCELED 0x00000008 104#define LM_OUT_CANCELED 0x00000008
109#define LM_OUT_ASYNC 0x00000080 105#define LM_OUT_ERROR 0x00000004
110#define LM_OUT_ERROR 0x00000100
111 106
112/* 107/*
113 * lm_recovery_done() messages 108 * lm_recovery_done() messages
@@ -124,25 +119,12 @@ struct lm_lockops {
124 void (*lm_unmount) (struct gfs2_sbd *sdp); 119 void (*lm_unmount) (struct gfs2_sbd *sdp);
125 void (*lm_withdraw) (struct gfs2_sbd *sdp); 120 void (*lm_withdraw) (struct gfs2_sbd *sdp);
126 void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl); 121 void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl);
127 unsigned int (*lm_lock) (struct gfs2_glock *gl, 122 int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
128 unsigned int req_state, unsigned int flags); 123 unsigned int flags);
129 void (*lm_cancel) (struct gfs2_glock *gl); 124 void (*lm_cancel) (struct gfs2_glock *gl);
130 const match_table_t *lm_tokens; 125 const match_table_t *lm_tokens;
131}; 126};
132 127
133#define LM_FLAG_TRY 0x00000001
134#define LM_FLAG_TRY_1CB 0x00000002
135#define LM_FLAG_NOEXP 0x00000004
136#define LM_FLAG_ANY 0x00000008
137#define LM_FLAG_PRIORITY 0x00000010
138
139#define GL_ASYNC 0x00000040
140#define GL_EXACT 0x00000080
141#define GL_SKIP 0x00000100
142#define GL_NOCACHE 0x00000400
143
144#define GLR_TRYFAILED 13
145
146extern struct workqueue_struct *gfs2_delete_workqueue; 128extern struct workqueue_struct *gfs2_delete_workqueue;
147static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) 129static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
148{ 130{
@@ -212,6 +194,8 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
212int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); 194int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
213void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); 195void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
214void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); 196void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
197
198__attribute__ ((format(printf, 2, 3)))
215void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); 199void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
216 200
217/** 201/**
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 0d149dcc04e5..263561bf1a50 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -325,7 +325,6 @@ static void trans_go_sync(struct gfs2_glock *gl)
325 325
326 if (gl->gl_state != LM_ST_UNLOCKED && 326 if (gl->gl_state != LM_ST_UNLOCKED &&
327 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { 327 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
328 flush_workqueue(gfs2_delete_workqueue);
329 gfs2_meta_syncfs(sdp); 328 gfs2_meta_syncfs(sdp);
330 gfs2_log_shutdown(sdp); 329 gfs2_log_shutdown(sdp);
331 } 330 }
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 764fbb49efc8..8d3d2b4a0a7d 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -207,12 +207,14 @@ struct gfs2_glock {
207 207
208 spinlock_t gl_spin; 208 spinlock_t gl_spin;
209 209
210 unsigned int gl_state; 210 /* State fields protected by gl_spin */
211 unsigned int gl_target; 211 unsigned int gl_state:2, /* Current state */
212 unsigned int gl_reply; 212 gl_target:2, /* Target state */
213 gl_demote_state:2, /* State requested by remote node */
214 gl_req:2, /* State in last dlm request */
215 gl_reply:8; /* Last reply from the dlm */
216
213 unsigned int gl_hash; 217 unsigned int gl_hash;
214 unsigned int gl_req;
215 unsigned int gl_demote_state; /* state requested by remote node */
216 unsigned long gl_demote_time; /* time of first demote request */ 218 unsigned long gl_demote_time; /* time of first demote request */
217 struct list_head gl_holders; 219 struct list_head gl_holders;
218 220
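
Packing gl_state, gl_target, gl_demote_state, gl_req and gl_reply into one word shrinks the glock, but C gives no atomicity between adjacent bitfields: updating one field is a read-modify-write of the whole word, so it can clobber a concurrent update of a neighbour. That is why every writer must hold gl_spin, per the comment above. A generic illustration of the hazard, with assumed names:

	#include <linux/spinlock.h>

	struct packed_state {
		unsigned int a:2;
		unsigned int b:2;	/* shares a machine word with 'a' */
	};

	static struct packed_state s;
	static DEFINE_SPINLOCK(s_lock);

	/*
	 * Writing either field compiles to a load/mask/store of the whole
	 * word, so two unlocked writers can silently undo each other; the
	 * glock equivalent is why gl_spin must cover every write to the
	 * packed state fields.
	 */
	static void set_a(unsigned int v)
	{
		spin_lock(&s_lock);
		s.a = v;
		spin_unlock(&s_lock);
	}
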
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index e1213f7f9217..14e682dbe8bf 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -916,17 +916,8 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
916 if (error) 916 if (error)
917 return error; 917 return error;
918 918
919 if ((attr->ia_valid & ATTR_SIZE) &&
920 attr->ia_size != i_size_read(inode)) {
921 error = vmtruncate(inode, attr->ia_size);
922 if (error)
923 return error;
924 }
925
926 setattr_copy(inode, attr); 919 setattr_copy(inode, attr);
927 mark_inode_dirty(inode); 920 mark_inode_dirty(inode);
928
929 gfs2_assert_warn(GFS2_SB(inode), !error);
930 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 921 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
931 gfs2_dinode_out(ip, dibh->b_data); 922 gfs2_dinode_out(ip, dibh->b_data);
932 brelse(dibh); 923 brelse(dibh);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 1c09425b45fd..6e493aee28f8 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -146,15 +146,13 @@ static u32 make_flags(const u32 lkid, const unsigned int gfs_flags,
146 return lkf; 146 return lkf;
147} 147}
148 148
149static unsigned int gdlm_lock(struct gfs2_glock *gl, 149static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
150 unsigned int req_state, unsigned int flags) 150 unsigned int flags)
151{ 151{
152 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; 152 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
153 int error;
154 int req; 153 int req;
155 u32 lkf; 154 u32 lkf;
156 155
157 gl->gl_req = req_state;
158 req = make_mode(req_state); 156 req = make_mode(req_state);
159 lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req); 157 lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req);
160 158
@@ -162,13 +160,8 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl,
162 * Submit the actual lock request. 160 * Submit the actual lock request.
163 */ 161 */
164 162
165 error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname, 163 return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
166 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); 164 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
167 if (error == -EAGAIN)
168 return 0;
169 if (error)
170 return LM_OUT_ERROR;
171 return LM_OUT_ASYNC;
172} 165}
173 166
174static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) 167static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 12cbea7502c2..1db6b7343229 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1069,7 +1069,6 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
1069{ 1069{
1070 struct gfs2_inode *ip = GFS2_I(inode); 1070 struct gfs2_inode *ip = GFS2_I(inode);
1071 struct gfs2_sbd *sdp = GFS2_SB(inode); 1071 struct gfs2_sbd *sdp = GFS2_SB(inode);
1072 struct buffer_head *dibh;
1073 u32 ouid, ogid, nuid, ngid; 1072 u32 ouid, ogid, nuid, ngid;
1074 int error; 1073 int error;
1075 1074
@@ -1100,25 +1099,10 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
1100 if (error) 1099 if (error)
1101 goto out_gunlock_q; 1100 goto out_gunlock_q;
1102 1101
1103 error = gfs2_meta_inode_buffer(ip, &dibh); 1102 error = gfs2_setattr_simple(ip, attr);
1104 if (error) 1103 if (error)
1105 goto out_end_trans; 1104 goto out_end_trans;
1106 1105
1107 if ((attr->ia_valid & ATTR_SIZE) &&
1108 attr->ia_size != i_size_read(inode)) {
1109 int error;
1110
1111 error = vmtruncate(inode, attr->ia_size);
1112 gfs2_assert_warn(sdp, !error);
1113 }
1114
1115 setattr_copy(inode, attr);
1116 mark_inode_dirty(inode);
1117
1118 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1119 gfs2_dinode_out(ip, dibh->b_data);
1120 brelse(dibh);
1121
1122 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { 1106 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
1123 u64 blocks = gfs2_get_inode_blocks(&ip->i_inode); 1107 u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
1124 gfs2_quota_change(ip, -blocks, ouid, ogid); 1108 gfs2_quota_change(ip, -blocks, ouid, ogid);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index f606baf9ba72..a689901963de 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -666,6 +666,10 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
666 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); 666 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
667 qd->qd_qb.qb_limit = qp->qu_limit; 667 qd->qd_qb.qb_limit = qp->qu_limit;
668 } 668 }
669 if (fdq->d_fieldmask & FS_DQ_BCOUNT) {
670 qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift);
671 qd->qd_qb.qb_value = qp->qu_value;
672 }
669 } 673 }
670 674
671 /* Write the quota into the quota file on disk */ 675 /* Write the quota into the quota file on disk */
@@ -1509,7 +1513,7 @@ out:
1509} 1513}
1510 1514
1511/* GFS2 only supports a subset of the XFS fields */ 1515/* GFS2 only supports a subset of the XFS fields */
1512#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD) 1516#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT)
1513 1517
1514static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, 1518static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1515 struct fs_disk_quota *fdq) 1519 struct fs_disk_quota *fdq)
@@ -1569,9 +1573,15 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1569 if ((fdq->d_fieldmask & FS_DQ_BSOFT) && 1573 if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
1570 ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) 1574 ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn)))
1571 fdq->d_fieldmask ^= FS_DQ_BSOFT; 1575 fdq->d_fieldmask ^= FS_DQ_BSOFT;
1576
1572 if ((fdq->d_fieldmask & FS_DQ_BHARD) && 1577 if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
1573 ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) 1578 ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit)))
1574 fdq->d_fieldmask ^= FS_DQ_BHARD; 1579 fdq->d_fieldmask ^= FS_DQ_BHARD;
1580
1581 if ((fdq->d_fieldmask & FS_DQ_BCOUNT) &&
1582 ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value)))
1583 fdq->d_fieldmask ^= FS_DQ_BCOUNT;
1584
1575 if (fdq->d_fieldmask == 0) 1585 if (fdq->d_fieldmask == 0)
1576 goto out_i; 1586 goto out_i;
1577 1587
@@ -1620,4 +1630,3 @@ const struct quotactl_ops gfs2_quotactl_ops = {
1620 .get_dqblk = gfs2_get_dqblk, 1630 .get_dqblk = gfs2_get_dqblk,
1621 .set_dqblk = gfs2_set_dqblk, 1631 .set_dqblk = gfs2_set_dqblk,
1622}; 1632};
1623
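
Accepting FS_DQ_BCOUNT lets a management tool write back the accounted block usage itself, not just the limits, e.g. after recomputing usage offline. A hypothetical user-space sketch; the header location and flag spellings vary between quota header generations:

	#include <sys/quota.h>
	#include <xfs/xqm.h>	/* fs_disk_quota etc.; availability varies */

	/*
	 * Write back recomputed block usage for a user. d_bcount is in
	 * 512-byte basic blocks; device and uid are placeholders.
	 */
	static int set_usage(const char *dev, int uid, unsigned long long bb)
	{
		struct fs_disk_quota d = { 0 };

		d.d_version = FS_DQUOT_VERSION;
		d.d_id = uid;
		d.d_flags = FS_USER_QUOTA;	/* XFS_USER_QUOTA in older headers */
		d.d_fieldmask = FS_DQ_BCOUNT;
		d.d_bcount = bb;
		return quotactl(QCMD(Q_XSETQLIM, USRQUOTA), dev, uid, (void *)&d);
	}
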
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 33c8407b876f..7293ea27020c 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
500 for (rgrps = 0;; rgrps++) { 500 for (rgrps = 0;; rgrps++) {
501 loff_t pos = rgrps * sizeof(struct gfs2_rindex); 501 loff_t pos = rgrps * sizeof(struct gfs2_rindex);
502 502
503 if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode)) 503 if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
504 break; 504 break;
505 error = gfs2_internal_read(ip, &ra_state, buf, &pos, 505 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
506 sizeof(struct gfs2_rindex)); 506 sizeof(struct gfs2_rindex));
@@ -583,7 +583,7 @@ static int read_rindex_entry(struct gfs2_inode *ip,
583 * Returns: 0 on successful update, error code otherwise 583 * Returns: 0 on successful update, error code otherwise
584 */ 584 */
585 585
586static int gfs2_ri_update(struct gfs2_inode *ip) 586int gfs2_ri_update(struct gfs2_inode *ip)
587{ 587{
588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
589 struct inode *inode = &ip->i_inode; 589 struct inode *inode = &ip->i_inode;
@@ -614,46 +614,6 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
614} 614}
615 615
616/** 616/**
617 * gfs2_ri_update_special - Pull in a new resource index from the disk
618 *
619 * This is a special version that's safe to call from gfs2_inplace_reserve_i.
620 * In this case we know that we don't have any resource groups in memory yet.
621 *
622 * @ip: pointer to the rindex inode
623 *
624 * Returns: 0 on successful update, error code otherwise
625 */
626static int gfs2_ri_update_special(struct gfs2_inode *ip)
627{
628 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
629 struct inode *inode = &ip->i_inode;
630 struct file_ra_state ra_state;
631 struct gfs2_rgrpd *rgd;
632 unsigned int max_data = 0;
633 int error;
634
635 file_ra_state_init(&ra_state, inode->i_mapping);
636 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
637 /* Ignore partials */
638 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
639 i_size_read(inode))
640 break;
641 error = read_rindex_entry(ip, &ra_state);
642 if (error) {
643 clear_rgrpdi(sdp);
644 return error;
645 }
646 }
647 list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
648 if (rgd->rd_data > max_data)
649 max_data = rgd->rd_data;
650 sdp->sd_max_rg_data = max_data;
651
652 sdp->sd_rindex_uptodate = 1;
653 return 0;
654}
655
656/**
657 * gfs2_rindex_hold - Grab a lock on the rindex 617 * gfs2_rindex_hold - Grab a lock on the rindex
658 * @sdp: The GFS2 superblock 618 * @sdp: The GFS2 superblock
659 * @ri_gh: the glock holder 619 * @ri_gh: the glock holder
@@ -1226,16 +1186,25 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
1226 error = gfs2_rindex_hold(sdp, &al->al_ri_gh); 1186 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
1227 else if (!sdp->sd_rgrps) /* We may not have the rindex read 1187 else if (!sdp->sd_rgrps) /* We may not have the rindex read
1228 in, so: */ 1188 in, so: */
1229 error = gfs2_ri_update_special(ip); 1189 error = gfs2_ri_update(ip);
1230 if (error) 1190 if (error)
1231 return error; 1191 return error;
1232 } 1192 }
1233 1193
1194try_again:
1234 do { 1195 do {
1235 error = get_local_rgrp(ip, &last_unlinked); 1196 error = get_local_rgrp(ip, &last_unlinked);
1236 /* If there is no space, flushing the log may release some */ 1197 /* If there is no space, flushing the log may release some */
1237 if (error) 1198 if (error) {
1199 if (ip == GFS2_I(sdp->sd_rindex) &&
1200 !sdp->sd_rindex_uptodate) {
1201 error = gfs2_ri_update(ip);
1202 if (error)
1203 return error;
1204 goto try_again;
1205 }
1238 gfs2_log_flush(sdp, NULL); 1206 gfs2_log_flush(sdp, NULL);
1207 }
1239 } while (error && tries++ < 3); 1208 } while (error && tries++ < 3);
1240 1209
1241 if (error) { 1210 if (error) {
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 0e35c0466f9a..50c2bb04369c 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -48,6 +48,7 @@ extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
48 48
49extern void gfs2_inplace_release(struct gfs2_inode *ip); 49extern void gfs2_inplace_release(struct gfs2_inode *ip);
50 50
51extern int gfs2_ri_update(struct gfs2_inode *ip);
51extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); 52extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
52extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); 53extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
53 54
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 30b58f07c8a6..439b61c03262 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1296,10 +1296,8 @@ fail:
1296 1296
1297int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) 1297int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
1298{ 1298{
1299 struct inode *inode = &ip->i_inode;
1300 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1299 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1301 struct gfs2_ea_location el; 1300 struct gfs2_ea_location el;
1302 struct buffer_head *dibh;
1303 int error; 1301 int error;
1304 1302
1305 error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el); 1303 error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
@@ -1321,26 +1319,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
1321 if (error) 1319 if (error)
1322 return error; 1320 return error;
1323 1321
1324 error = gfs2_meta_inode_buffer(ip, &dibh); 1322 error = gfs2_setattr_simple(ip, attr);
1325 if (error)
1326 goto out_trans_end;
1327
1328 if ((attr->ia_valid & ATTR_SIZE) &&
1329 attr->ia_size != i_size_read(inode)) {
1330 int error;
1331
1332 error = vmtruncate(inode, attr->ia_size);
1333 gfs2_assert_warn(GFS2_SB(inode), !error);
1334 }
1335
1336 setattr_copy(inode, attr);
1337 mark_inode_dirty(inode);
1338
1339 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1340 gfs2_dinode_out(ip, dibh->b_data);
1341 brelse(dibh);
1342
1343out_trans_end:
1344 gfs2_trans_end(sdp); 1323 gfs2_trans_end(sdp);
1345 return error; 1324 return error;
1346} 1325}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 182845147fe4..08cba2c3b612 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1407,6 +1407,82 @@ static const struct file_operations proc_pid_sched_operations = {
1407 1407
1408#endif 1408#endif
1409 1409
1410#ifdef CONFIG_SCHED_AUTOGROUP
1411/*
1412 * Print out autogroup related information:
1413 */
1414static int sched_autogroup_show(struct seq_file *m, void *v)
1415{
1416 struct inode *inode = m->private;
1417 struct task_struct *p;
1418
1419 p = get_proc_task(inode);
1420 if (!p)
1421 return -ESRCH;
1422 proc_sched_autogroup_show_task(p, m);
1423
1424 put_task_struct(p);
1425
1426 return 0;
1427}
1428
1429static ssize_t
1430sched_autogroup_write(struct file *file, const char __user *buf,
1431 size_t count, loff_t *offset)
1432{
1433 struct inode *inode = file->f_path.dentry->d_inode;
1434 struct task_struct *p;
1435 char buffer[PROC_NUMBUF];
1436 long nice;
1437 int err;
1438
1439 memset(buffer, 0, sizeof(buffer));
1440 if (count > sizeof(buffer) - 1)
1441 count = sizeof(buffer) - 1;
1442 if (copy_from_user(buffer, buf, count))
1443 return -EFAULT;
1444
1445 err = strict_strtol(strstrip(buffer), 0, &nice);
1446 if (err)
1447 return -EINVAL;
1448
1449 p = get_proc_task(inode);
1450 if (!p)
1451 return -ESRCH;
1452
1453 err = nice;
1454 err = proc_sched_autogroup_set_nice(p, &err);
1455 if (err)
1456 count = err;
1457
1458 put_task_struct(p);
1459
1460 return count;
1461}
1462
1463static int sched_autogroup_open(struct inode *inode, struct file *filp)
1464{
1465 int ret;
1466
1467 ret = single_open(filp, sched_autogroup_show, NULL);
1468 if (!ret) {
1469 struct seq_file *m = filp->private_data;
1470
1471 m->private = inode;
1472 }
1473 return ret;
1474}
1475
1476static const struct file_operations proc_pid_sched_autogroup_operations = {
1477 .open = sched_autogroup_open,
1478 .read = seq_read,
1479 .write = sched_autogroup_write,
1480 .llseek = seq_lseek,
1481 .release = single_release,
1482};
1483
1484#endif /* CONFIG_SCHED_AUTOGROUP */
1485
1410static ssize_t comm_write(struct file *file, const char __user *buf, 1486static ssize_t comm_write(struct file *file, const char __user *buf,
1411 size_t count, loff_t *offset) 1487 size_t count, loff_t *offset)
1412{ 1488{
@@ -2733,6 +2809,9 @@ static const struct pid_entry tgid_base_stuff[] = {
2733#ifdef CONFIG_SCHED_DEBUG 2809#ifdef CONFIG_SCHED_DEBUG
2734 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2810 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2735#endif 2811#endif
2812#ifdef CONFIG_SCHED_AUTOGROUP
2813 REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
2814#endif
2736 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 2815 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2737#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2816#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2738 INF("syscall", S_IRUSR, proc_pid_syscall), 2817 INF("syscall", S_IRUSR, proc_pid_syscall),
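
The new entry gives each task a /proc/<pid>/autogroup file: reading shows the task's autogroup via proc_sched_autogroup_show_task(), and writing a nice value (parsed with strict_strtol() above) adjusts the weight of the whole group. A user-space sketch, with a placeholder PID:

	#include <stdio.h>

	/*
	 * Adjust the autogroup nice value of PID 1234; requires
	 * CONFIG_SCHED_AUTOGROUP and suitable permissions.
	 */
	int main(void)
	{
		FILE *f = fopen("/proc/1234/autogroup", "w");

		if (!f)
			return 1;
		fprintf(f, "10\n");	/* deprioritize the whole autogroup */
		return fclose(f) ? 1 : 0;
	}
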
diff --git a/include/linux/completion.h b/include/linux/completion.h
index 36d57f74cd01..51494e6b5548 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -81,10 +81,10 @@ extern int wait_for_completion_interruptible(struct completion *x);
81extern int wait_for_completion_killable(struct completion *x); 81extern int wait_for_completion_killable(struct completion *x);
82extern unsigned long wait_for_completion_timeout(struct completion *x, 82extern unsigned long wait_for_completion_timeout(struct completion *x,
83 unsigned long timeout); 83 unsigned long timeout);
84extern unsigned long wait_for_completion_interruptible_timeout( 84extern long wait_for_completion_interruptible_timeout(
85 struct completion *x, unsigned long timeout); 85 struct completion *x, unsigned long timeout);
86extern unsigned long wait_for_completion_killable_timeout( 86extern long wait_for_completion_killable_timeout(
87 struct completion *x, unsigned long timeout); 87 struct completion *x, unsigned long timeout);
88extern bool try_wait_for_completion(struct completion *x); 88extern bool try_wait_for_completion(struct completion *x);
89extern bool completion_done(struct completion *x); 89extern bool completion_done(struct completion *x);
90 90
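
The return type change matters because these calls return a negative errno when interrupted; with an unsigned long return, a `ret < 0` check could never fire. The typical caller pattern under the corrected signature:

	#include <linux/completion.h>
	#include <linux/errno.h>
	#include <linux/jiffies.h>

	static int wait_for_device(struct completion *done)
	{
		long ret;

		ret = wait_for_completion_interruptible_timeout(done,
						msecs_to_jiffies(100));
		if (ret < 0)		/* -ERESTARTSYS: interrupted by a signal */
			return ret;
		if (ret == 0)		/* timed out */
			return -ETIMEDOUT;
		return 0;		/* completed; ret was the jiffies left */
	}
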
diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h
index a90b3892074a..1c70028f81f9 100644
--- a/include/linux/dynamic_debug.h
+++ b/include/linux/dynamic_debug.h
@@ -44,34 +44,24 @@ int ddebug_add_module(struct _ddebug *tab, unsigned int n,
44extern int ddebug_remove_module(const char *mod_name); 44extern int ddebug_remove_module(const char *mod_name);
45 45
46#define dynamic_pr_debug(fmt, ...) do { \ 46#define dynamic_pr_debug(fmt, ...) do { \
47 __label__ do_printk; \
48 __label__ out; \
49 static struct _ddebug descriptor \ 47 static struct _ddebug descriptor \
50 __used \ 48 __used \
51 __attribute__((section("__verbose"), aligned(8))) = \ 49 __attribute__((section("__verbose"), aligned(8))) = \
52 { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \ 50 { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \
53 _DPRINTK_FLAGS_DEFAULT }; \ 51 _DPRINTK_FLAGS_DEFAULT }; \
54 JUMP_LABEL(&descriptor.enabled, do_printk); \ 52 if (unlikely(descriptor.enabled)) \
55 goto out; \ 53 printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \
56do_printk: \
57 printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \
58out: ; \
59 } while (0) 54 } while (0)
60 55
61 56
62#define dynamic_dev_dbg(dev, fmt, ...) do { \ 57#define dynamic_dev_dbg(dev, fmt, ...) do { \
63 __label__ do_printk; \
64 __label__ out; \
65 static struct _ddebug descriptor \ 58 static struct _ddebug descriptor \
66 __used \ 59 __used \
67 __attribute__((section("__verbose"), aligned(8))) = \ 60 __attribute__((section("__verbose"), aligned(8))) = \
68 { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \ 61 { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \
69 _DPRINTK_FLAGS_DEFAULT }; \ 62 _DPRINTK_FLAGS_DEFAULT }; \
70 JUMP_LABEL(&descriptor.enabled, do_printk); \ 63 if (unlikely(descriptor.enabled)) \
71 goto out; \ 64 dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \
72do_printk: \
73 dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \
74out: ; \
75 } while (0) 65 } while (0)
76 66
77#else 67#else
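
This hunk reverts the jump-label experiment at the call sites to a plain unlikely(descriptor.enabled) test; the per-callsite _ddebug descriptors and the runtime control path are unchanged. A sketch of a module-side consumer, assuming debugfs is mounted:

	#include <linux/module.h>
	#include <linux/kernel.h>

	/*
	 * A pr_debug() built with -DDEBUG or CONFIG_DYNAMIC_DEBUG expands
	 * to dynamic_pr_debug() above; with dynamic debug the site can
	 * then be enabled at runtime, e.g.:
	 *   echo 'module demo +p' > <debugfs>/dynamic_debug/control
	 */
	static int __init demo_init(void)
	{
		pr_debug("demo loaded\n");	/* one static _ddebug descriptor */
		return 0;
	}
	module_init(demo_init);
	MODULE_LICENSE("GPL");
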
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index fd0c1b857d3d..330586ffffbb 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -22,7 +22,7 @@
22#include <linux/wait.h> 22#include <linux/wait.h>
23#include <linux/percpu.h> 23#include <linux/percpu.h>
24#include <linux/timer.h> 24#include <linux/timer.h>
25 25#include <linux/timerqueue.h>
26 26
27struct hrtimer_clock_base; 27struct hrtimer_clock_base;
28struct hrtimer_cpu_base; 28struct hrtimer_cpu_base;
@@ -79,8 +79,8 @@ enum hrtimer_restart {
79 79
80/** 80/**
81 * struct hrtimer - the basic hrtimer structure 81 * struct hrtimer - the basic hrtimer structure
82 * @node: red black tree node for time ordered insertion 82 * @node: timerqueue node, which also manages node.expires,
83 * @_expires: the absolute expiry time in the hrtimers internal 83 * the absolute expiry time in the hrtimers internal
84 * representation. The time is related to the clock on 84 * representation. The time is related to the clock on
85 * which the timer is based. Is setup by adding 85 * which the timer is based. Is setup by adding
86 * slack to the _softexpires value. For non range timers 86 * slack to the _softexpires value. For non range timers
@@ -101,8 +101,7 @@ enum hrtimer_restart {
101 * The hrtimer structure must be initialized by hrtimer_init() 101 * The hrtimer structure must be initialized by hrtimer_init()
102 */ 102 */
103struct hrtimer { 103struct hrtimer {
104 struct rb_node node; 104 struct timerqueue_node node;
105 ktime_t _expires;
106 ktime_t _softexpires; 105 ktime_t _softexpires;
107 enum hrtimer_restart (*function)(struct hrtimer *); 106 enum hrtimer_restart (*function)(struct hrtimer *);
108 struct hrtimer_clock_base *base; 107 struct hrtimer_clock_base *base;
@@ -141,8 +140,7 @@ struct hrtimer_sleeper {
141struct hrtimer_clock_base { 140struct hrtimer_clock_base {
142 struct hrtimer_cpu_base *cpu_base; 141 struct hrtimer_cpu_base *cpu_base;
143 clockid_t index; 142 clockid_t index;
144 struct rb_root active; 143 struct timerqueue_head active;
145 struct rb_node *first;
146 ktime_t resolution; 144 ktime_t resolution;
147 ktime_t (*get_time)(void); 145 ktime_t (*get_time)(void);
148 ktime_t softirq_time; 146 ktime_t softirq_time;
@@ -158,7 +156,6 @@ struct hrtimer_clock_base {
158 * @lock: lock protecting the base and associated clock bases 156 * @lock: lock protecting the base and associated clock bases
159 * and timers 157 * and timers
160 * @clock_base: array of clock bases for this cpu 158 * @clock_base: array of clock bases for this cpu
161 * @curr_timer: the timer which is executing a callback right now
162 * @expires_next: absolute time of the next event which was scheduled 159 * @expires_next: absolute time of the next event which was scheduled
163 * via clock_set_next_event() 160 * via clock_set_next_event()
164 * @hres_active: State of high resolution mode 161 * @hres_active: State of high resolution mode
@@ -184,43 +181,43 @@ struct hrtimer_cpu_base {
184 181
185static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) 182static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
186{ 183{
187 timer->_expires = time; 184 timer->node.expires = time;
188 timer->_softexpires = time; 185 timer->_softexpires = time;
189} 186}
190 187
191static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta) 188static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta)
192{ 189{
193 timer->_softexpires = time; 190 timer->_softexpires = time;
194 timer->_expires = ktime_add_safe(time, delta); 191 timer->node.expires = ktime_add_safe(time, delta);
195} 192}
196 193
197static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta) 194static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta)
198{ 195{
199 timer->_softexpires = time; 196 timer->_softexpires = time;
200 timer->_expires = ktime_add_safe(time, ns_to_ktime(delta)); 197 timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta));
201} 198}
202 199
203static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64) 200static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64)
204{ 201{
205 timer->_expires.tv64 = tv64; 202 timer->node.expires.tv64 = tv64;
206 timer->_softexpires.tv64 = tv64; 203 timer->_softexpires.tv64 = tv64;
207} 204}
208 205
209static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time) 206static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)
210{ 207{
211 timer->_expires = ktime_add_safe(timer->_expires, time); 208 timer->node.expires = ktime_add_safe(timer->node.expires, time);
212 timer->_softexpires = ktime_add_safe(timer->_softexpires, time); 209 timer->_softexpires = ktime_add_safe(timer->_softexpires, time);
213} 210}
214 211
215static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns) 212static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns)
216{ 213{
217 timer->_expires = ktime_add_ns(timer->_expires, ns); 214 timer->node.expires = ktime_add_ns(timer->node.expires, ns);
218 timer->_softexpires = ktime_add_ns(timer->_softexpires, ns); 215 timer->_softexpires = ktime_add_ns(timer->_softexpires, ns);
219} 216}
220 217
221static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer) 218static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
222{ 219{
223 return timer->_expires; 220 return timer->node.expires;
224} 221}
225 222
226static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer) 223static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer)
@@ -230,7 +227,7 @@ static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer)
230 227
231static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer) 228static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer)
232{ 229{
233 return timer->_expires.tv64; 230 return timer->node.expires.tv64;
234} 231}
235static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer) 232static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer)
236{ 233{
@@ -239,12 +236,12 @@ static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer)
239 236
240static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) 237static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer)
241{ 238{
242 return ktime_to_ns(timer->_expires); 239 return ktime_to_ns(timer->node.expires);
243} 240}
244 241
245static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) 242static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer)
246{ 243{
247 return ktime_sub(timer->_expires, timer->base->get_time()); 244 return ktime_sub(timer->node.expires, timer->base->get_time());
248} 245}
249 246
250#ifdef CONFIG_HIGH_RES_TIMERS 247#ifdef CONFIG_HIGH_RES_TIMERS
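
The conversion replaces the open-coded rb_root plus first pointer with the shared timerqueue, which keeps the earliest-expiring node cached, and the timer's expiry moves from _expires into node.expires. A usage sketch of that API; return-value details are elided since they changed over time:

	#include <linux/ktime.h>
	#include <linux/timerqueue.h>

	static struct timerqueue_head queue;	/* set up via timerqueue_init_head() */

	/*
	 * Insert a node expiring 'ns' nanoseconds from 'now' and peek at
	 * the soonest deadline, mirroring what the hrtimer bases now do
	 * with node.expires.
	 */
	static void enqueue_deadline(struct timerqueue_node *node,
				     ktime_t now, u64 ns)
	{
		timerqueue_init(node);
		node->expires = ktime_add_ns(now, ns);
		timerqueue_add(&queue, node);
	}

	static ktime_t next_deadline(void)
	{
		struct timerqueue_node *next = timerqueue_getnext(&queue);

		return next ? next->expires : (ktime_t){ .tv64 = KTIME_MAX };
	}
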
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1f8c06ce0fa6..caa151fbebb7 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -12,6 +12,13 @@
12#include <linux/securebits.h> 12#include <linux/securebits.h>
13#include <net/net_namespace.h> 13#include <net/net_namespace.h>
14 14
15#ifdef CONFIG_SMP
16# define INIT_PUSHABLE_TASKS(tsk) \
17 .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO),
18#else
19# define INIT_PUSHABLE_TASKS(tsk)
20#endif
21
15extern struct files_struct init_files; 22extern struct files_struct init_files;
16extern struct fs_struct init_fs; 23extern struct fs_struct init_fs;
17 24
@@ -83,6 +90,12 @@ extern struct group_info init_groups;
83 */ 90 */
84# define CAP_INIT_BSET CAP_FULL_SET 91# define CAP_INIT_BSET CAP_FULL_SET
85 92
93#ifdef CONFIG_RCU_BOOST
94#define INIT_TASK_RCU_BOOST() \
95 .rcu_boost_mutex = NULL,
96#else
97#define INIT_TASK_RCU_BOOST()
98#endif
86#ifdef CONFIG_TREE_PREEMPT_RCU 99#ifdef CONFIG_TREE_PREEMPT_RCU
87#define INIT_TASK_RCU_TREE_PREEMPT() \ 100#define INIT_TASK_RCU_TREE_PREEMPT() \
88 .rcu_blocked_node = NULL, 101 .rcu_blocked_node = NULL,
@@ -94,7 +107,8 @@ extern struct group_info init_groups;
94 .rcu_read_lock_nesting = 0, \ 107 .rcu_read_lock_nesting = 0, \
95 .rcu_read_unlock_special = 0, \ 108 .rcu_read_unlock_special = 0, \
96 .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \ 109 .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \
97 INIT_TASK_RCU_TREE_PREEMPT() 110 INIT_TASK_RCU_TREE_PREEMPT() \
111 INIT_TASK_RCU_BOOST()
98#else 112#else
99#define INIT_TASK_RCU_PREEMPT(tsk) 113#define INIT_TASK_RCU_PREEMPT(tsk)
100#endif 114#endif
@@ -137,7 +151,7 @@ extern struct cred init_cred;
137 .nr_cpus_allowed = NR_CPUS, \ 151 .nr_cpus_allowed = NR_CPUS, \
138 }, \ 152 }, \
139 .tasks = LIST_HEAD_INIT(tsk.tasks), \ 153 .tasks = LIST_HEAD_INIT(tsk.tasks), \
140 .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \ 154 INIT_PUSHABLE_TASKS(tsk) \
141 .ptraced = LIST_HEAD_INIT(tsk.ptraced), \ 155 .ptraced = LIST_HEAD_INIT(tsk.ptraced), \
142 .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \ 156 .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \
143 .real_parent = &tsk, \ 157 .real_parent = &tsk, \
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 79d0c4f6d071..55e0d4253e49 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -114,15 +114,15 @@ typedef irqreturn_t (*irq_handler_t)(int, void *);
114struct irqaction { 114struct irqaction {
115 irq_handler_t handler; 115 irq_handler_t handler;
116 unsigned long flags; 116 unsigned long flags;
117 const char *name;
118 void *dev_id; 117 void *dev_id;
119 struct irqaction *next; 118 struct irqaction *next;
120 int irq; 119 int irq;
121 struct proc_dir_entry *dir;
122 irq_handler_t thread_fn; 120 irq_handler_t thread_fn;
123 struct task_struct *thread; 121 struct task_struct *thread;
124 unsigned long thread_flags; 122 unsigned long thread_flags;
125}; 123 const char *name;
124 struct proc_dir_entry *dir;
125} ____cacheline_internodealigned_in_smp;
126 126
127extern irqreturn_t no_action(int cpl, void *dev_id); 127extern irqreturn_t no_action(int cpl, void *dev_id);
128 128
diff --git a/include/linux/module.h b/include/linux/module.h
index 7575bbbdf2a2..8b17fd8c790d 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -308,6 +308,9 @@ struct module
308 /* The size of the executable code in each section. */ 308 /* The size of the executable code in each section. */
309 unsigned int init_text_size, core_text_size; 309 unsigned int init_text_size, core_text_size;
310 310
311 /* Size of RO sections of the module (text+rodata) */
312 unsigned int init_ro_size, core_ro_size;
313
311 /* Arch-specific module values */ 314 /* Arch-specific module values */
312 struct mod_arch_specific arch; 315 struct mod_arch_specific arch;
313 316
@@ -672,7 +675,6 @@ static inline int module_get_iter_tracepoints(struct tracepoint_iter *iter)
672{ 675{
673 return 0; 676 return 0;
674} 677}
675
676#endif /* CONFIG_MODULES */ 678#endif /* CONFIG_MODULES */
677 679
678#ifdef CONFIG_SYSFS 680#ifdef CONFIG_SYSFS
@@ -687,6 +689,13 @@ extern int module_sysfs_initialized;
687 689
688#define __MODULE_STRING(x) __stringify(x) 690#define __MODULE_STRING(x) __stringify(x)
689 691
692#ifdef CONFIG_DEBUG_SET_MODULE_RONX
693extern void set_all_modules_text_rw(void);
694extern void set_all_modules_text_ro(void);
695#else
696static inline void set_all_modules_text_rw(void) { }
697static inline void set_all_modules_text_ro(void) { }
698#endif
690 699
691#ifdef CONFIG_GENERIC_BUG 700#ifdef CONFIG_GENERIC_BUG
692void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *, 701void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *,
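The CONFIG_DEBUG_SET_MODULE_RONX helpers declared above let code patching temporarily lift the read-only protection on module text. A minimal sketch of the intended call pattern (the patching step in the middle is a hypothetical placeholder, not a real API):

	set_all_modules_text_rw();
	/* rewrite instructions in module .text here, e.g. ftrace patching */
	set_all_modules_text_ro();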
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index f363bc8fdc74..94b48bd40dd7 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -160,4 +160,8 @@ extern int mutex_trylock(struct mutex *lock);
160extern void mutex_unlock(struct mutex *lock); 160extern void mutex_unlock(struct mutex *lock);
161extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); 161extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
162 162
163#ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX
164#define arch_mutex_cpu_relax() cpu_relax()
165#endif
166
163#endif 167#endif
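Architectures that select CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX supply their own arch_mutex_cpu_relax() from asm/mutex.h; everyone else falls through to plain cpu_relax() via the fallback above. A sketch of such an override, along the lines of the s390 change carried in this merge (illustrative, not a verbatim copy of that file):

	/* asm/mutex.h: cpu_relax() is too expensive in the mutex spin loop */
	#define arch_mutex_cpu_relax()	barrier()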
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index f31ef61f1c65..2dea94fc4402 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -241,11 +241,6 @@ static inline void list_splice_init_rcu(struct list_head *list,
241#define list_first_entry_rcu(ptr, type, member) \ 241#define list_first_entry_rcu(ptr, type, member) \
242 list_entry_rcu((ptr)->next, type, member) 242 list_entry_rcu((ptr)->next, type, member)
243 243
244#define __list_for_each_rcu(pos, head) \
245 for (pos = rcu_dereference_raw(list_next_rcu(head)); \
246 pos != (head); \
247 pos = rcu_dereference_raw(list_next_rcu((pos)))
248
249/** 244/**
250 * list_for_each_entry_rcu - iterate over rcu list of given type 245 * list_for_each_entry_rcu - iterate over rcu list of given type
251 * @pos: the type * to use as a loop cursor. 246 * @pos: the type * to use as a loop cursor.
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 03cda7bed985..af5614856285 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -47,6 +47,8 @@
47extern int rcutorture_runnable; /* for sysctl */ 47extern int rcutorture_runnable; /* for sysctl */
48#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ 48#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
49 49
50#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))
51#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))
50#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) 52#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
51#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) 53#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
52 54
@@ -66,7 +68,6 @@ extern void call_rcu_sched(struct rcu_head *head,
66extern void synchronize_sched(void); 68extern void synchronize_sched(void);
67extern void rcu_barrier_bh(void); 69extern void rcu_barrier_bh(void);
68extern void rcu_barrier_sched(void); 70extern void rcu_barrier_sched(void);
69extern void synchronize_sched_expedited(void);
70extern int sched_expedited_torture_stats(char *page); 71extern int sched_expedited_torture_stats(char *page);
71 72
72static inline void __rcu_read_lock_bh(void) 73static inline void __rcu_read_lock_bh(void)
@@ -118,7 +119,6 @@ static inline int rcu_preempt_depth(void)
118#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ 119#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
119 120
120/* Internal to kernel */ 121/* Internal to kernel */
121extern void rcu_init(void);
122extern void rcu_sched_qs(int cpu); 122extern void rcu_sched_qs(int cpu);
123extern void rcu_bh_qs(int cpu); 123extern void rcu_bh_qs(int cpu);
124extern void rcu_check_callbacks(int cpu, int user); 124extern void rcu_check_callbacks(int cpu, int user);
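The new UINT_CMP_GE()/UINT_CMP_LT() macros mirror the existing ULONG variants: they compare free-running counters modulo 2^32, so the answer stays correct across wraparound. A worked example (both operands are compile-time constants, so the checks below could sit in any function as BUILD_BUG_ON() assertions):

	/* 0x00000001 - 0xffffffff == 2 (mod 2^32), which is <= UINT_MAX/2,
	 * so 1 is treated as "ahead of" 0xffffffff despite being smaller. */
	BUILD_BUG_ON(!UINT_CMP_GE(0x00000001U, 0xffffffffU));
	BUILD_BUG_ON(!UINT_CMP_LT(0xffffffffU, 0x00000001U));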
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 13877cb93a60..30ebd7c8d874 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -27,7 +27,9 @@
27 27
28#include <linux/cache.h> 28#include <linux/cache.h>
29 29
30#define rcu_init_sched() do { } while (0) 30static inline void rcu_init(void)
31{
32}
31 33
32#ifdef CONFIG_TINY_RCU 34#ifdef CONFIG_TINY_RCU
33 35
@@ -58,6 +60,11 @@ static inline void synchronize_rcu_bh_expedited(void)
58 synchronize_sched(); 60 synchronize_sched();
59} 61}
60 62
63static inline void synchronize_sched_expedited(void)
64{
65 synchronize_sched();
66}
67
61#ifdef CONFIG_TINY_RCU 68#ifdef CONFIG_TINY_RCU
62 69
63static inline void rcu_preempt_note_context_switch(void) 70static inline void rcu_preempt_note_context_switch(void)
@@ -125,16 +132,12 @@ static inline void rcu_cpu_stall_reset(void)
125} 132}
126 133
127#ifdef CONFIG_DEBUG_LOCK_ALLOC 134#ifdef CONFIG_DEBUG_LOCK_ALLOC
128
129extern int rcu_scheduler_active __read_mostly; 135extern int rcu_scheduler_active __read_mostly;
130extern void rcu_scheduler_starting(void); 136extern void rcu_scheduler_starting(void);
131
132#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 137#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
133
134static inline void rcu_scheduler_starting(void) 138static inline void rcu_scheduler_starting(void)
135{ 139{
136} 140}
137
138#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 141#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
139 142
140#endif /* __LINUX_RCUTINY_H */ 143#endif /* __LINUX_RCUTINY_H */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 95518e628794..3a933482734a 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -30,6 +30,7 @@
30#ifndef __LINUX_RCUTREE_H 30#ifndef __LINUX_RCUTREE_H
31#define __LINUX_RCUTREE_H 31#define __LINUX_RCUTREE_H
32 32
33extern void rcu_init(void);
33extern void rcu_note_context_switch(int cpu); 34extern void rcu_note_context_switch(int cpu);
34extern int rcu_needs_cpu(int cpu); 35extern int rcu_needs_cpu(int cpu);
35extern void rcu_cpu_stall_reset(void); 36extern void rcu_cpu_stall_reset(void);
@@ -47,6 +48,7 @@ static inline void exit_rcu(void)
47#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 48#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
48 49
49extern void synchronize_rcu_bh(void); 50extern void synchronize_rcu_bh(void);
51extern void synchronize_sched_expedited(void);
50extern void synchronize_rcu_expedited(void); 52extern void synchronize_rcu_expedited(void);
51 53
52static inline void synchronize_rcu_bh_expedited(void) 54static inline void synchronize_rcu_bh_expedited(void)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a99d735db3df..777cd01e240e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -513,6 +513,8 @@ struct thread_group_cputimer {
513 spinlock_t lock; 513 spinlock_t lock;
514}; 514};
515 515
516struct autogroup;
517
516/* 518/*
517 * NOTE! "signal_struct" does not have its own 519 * NOTE! "signal_struct" does not have its own
518 * locking, because a shared signal_struct always 520 * locking, because a shared signal_struct always
@@ -580,6 +582,9 @@ struct signal_struct {
580 582
581 struct tty_struct *tty; /* NULL if no tty */ 583 struct tty_struct *tty; /* NULL if no tty */
582 584
585#ifdef CONFIG_SCHED_AUTOGROUP
586 struct autogroup *autogroup;
587#endif
583 /* 588 /*
584 * Cumulative resource counters for dead threads in the group, 589 * Cumulative resource counters for dead threads in the group,
585 * and for reaped dead child processes forked by this group. 590 * and for reaped dead child processes forked by this group.
@@ -1233,13 +1238,18 @@ struct task_struct {
1233#ifdef CONFIG_TREE_PREEMPT_RCU 1238#ifdef CONFIG_TREE_PREEMPT_RCU
1234 struct rcu_node *rcu_blocked_node; 1239 struct rcu_node *rcu_blocked_node;
1235#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 1240#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1241#ifdef CONFIG_RCU_BOOST
1242 struct rt_mutex *rcu_boost_mutex;
1243#endif /* #ifdef CONFIG_RCU_BOOST */
1236 1244
1237#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1245#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1238 struct sched_info sched_info; 1246 struct sched_info sched_info;
1239#endif 1247#endif
1240 1248
1241 struct list_head tasks; 1249 struct list_head tasks;
1250#ifdef CONFIG_SMP
1242 struct plist_node pushable_tasks; 1251 struct plist_node pushable_tasks;
1252#endif
1243 1253
1244 struct mm_struct *mm, *active_mm; 1254 struct mm_struct *mm, *active_mm;
1245#if defined(SPLIT_RSS_COUNTING) 1255#if defined(SPLIT_RSS_COUNTING)
@@ -1763,7 +1773,8 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
1763#ifdef CONFIG_PREEMPT_RCU 1773#ifdef CONFIG_PREEMPT_RCU
1764 1774
1765#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ 1775#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
1766#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */ 1776#define RCU_READ_UNLOCK_BOOSTED (1 << 1) /* boosted while in RCU read-side. */
1777#define RCU_READ_UNLOCK_NEED_QS (1 << 2) /* RCU core needs CPU response. */
1767 1778
1768static inline void rcu_copy_process(struct task_struct *p) 1779static inline void rcu_copy_process(struct task_struct *p)
1769{ 1780{
@@ -1771,7 +1782,10 @@ static inline void rcu_copy_process(struct task_struct *p)
1771 p->rcu_read_unlock_special = 0; 1782 p->rcu_read_unlock_special = 0;
1772#ifdef CONFIG_TREE_PREEMPT_RCU 1783#ifdef CONFIG_TREE_PREEMPT_RCU
1773 p->rcu_blocked_node = NULL; 1784 p->rcu_blocked_node = NULL;
1774#endif 1785#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1786#ifdef CONFIG_RCU_BOOST
1787 p->rcu_boost_mutex = NULL;
1788#endif /* #ifdef CONFIG_RCU_BOOST */
1775 INIT_LIST_HEAD(&p->rcu_node_entry); 1789 INIT_LIST_HEAD(&p->rcu_node_entry);
1776} 1790}
1777 1791
@@ -1876,14 +1890,11 @@ extern void sched_clock_idle_sleep_event(void);
1876extern void sched_clock_idle_wakeup_event(u64 delta_ns); 1890extern void sched_clock_idle_wakeup_event(u64 delta_ns);
1877 1891
1878#ifdef CONFIG_HOTPLUG_CPU 1892#ifdef CONFIG_HOTPLUG_CPU
1879extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p);
1880extern void idle_task_exit(void); 1893extern void idle_task_exit(void);
1881#else 1894#else
1882static inline void idle_task_exit(void) {} 1895static inline void idle_task_exit(void) {}
1883#endif 1896#endif
1884 1897
1885extern void sched_idle_next(void);
1886
1887#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 1898#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
1888extern void wake_up_idle_cpu(int cpu); 1899extern void wake_up_idle_cpu(int cpu);
1889#else 1900#else
@@ -1893,8 +1904,6 @@ static inline void wake_up_idle_cpu(int cpu) { }
1893extern unsigned int sysctl_sched_latency; 1904extern unsigned int sysctl_sched_latency;
1894extern unsigned int sysctl_sched_min_granularity; 1905extern unsigned int sysctl_sched_min_granularity;
1895extern unsigned int sysctl_sched_wakeup_granularity; 1906extern unsigned int sysctl_sched_wakeup_granularity;
1896extern unsigned int sysctl_sched_shares_ratelimit;
1897extern unsigned int sysctl_sched_shares_thresh;
1898extern unsigned int sysctl_sched_child_runs_first; 1907extern unsigned int sysctl_sched_child_runs_first;
1899 1908
1900enum sched_tunable_scaling { 1909enum sched_tunable_scaling {
@@ -1910,6 +1919,7 @@ extern unsigned int sysctl_sched_migration_cost;
1910extern unsigned int sysctl_sched_nr_migrate; 1919extern unsigned int sysctl_sched_nr_migrate;
1911extern unsigned int sysctl_sched_time_avg; 1920extern unsigned int sysctl_sched_time_avg;
1912extern unsigned int sysctl_timer_migration; 1921extern unsigned int sysctl_timer_migration;
1922extern unsigned int sysctl_sched_shares_window;
1913 1923
1914int sched_proc_update_handler(struct ctl_table *table, int write, 1924int sched_proc_update_handler(struct ctl_table *table, int write,
1915 void __user *buffer, size_t *length, 1925 void __user *buffer, size_t *length,
@@ -1935,6 +1945,24 @@ int sched_rt_handler(struct ctl_table *table, int write,
1935 1945
1936extern unsigned int sysctl_sched_compat_yield; 1946extern unsigned int sysctl_sched_compat_yield;
1937 1947
1948#ifdef CONFIG_SCHED_AUTOGROUP
1949extern unsigned int sysctl_sched_autogroup_enabled;
1950
1951extern void sched_autogroup_create_attach(struct task_struct *p);
1952extern void sched_autogroup_detach(struct task_struct *p);
1953extern void sched_autogroup_fork(struct signal_struct *sig);
1954extern void sched_autogroup_exit(struct signal_struct *sig);
1955#ifdef CONFIG_PROC_FS
1956extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m);
1957extern int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice);
1958#endif
1959#else
1960static inline void sched_autogroup_create_attach(struct task_struct *p) { }
1961static inline void sched_autogroup_detach(struct task_struct *p) { }
1962static inline void sched_autogroup_fork(struct signal_struct *sig) { }
1963static inline void sched_autogroup_exit(struct signal_struct *sig) { }
1964#endif
1965
1938#ifdef CONFIG_RT_MUTEXES 1966#ifdef CONFIG_RT_MUTEXES
1939extern int rt_mutex_getprio(struct task_struct *p); 1967extern int rt_mutex_getprio(struct task_struct *p);
1940extern void rt_mutex_setprio(struct task_struct *p, int prio); 1968extern void rt_mutex_setprio(struct task_struct *p, int prio);
@@ -1953,9 +1981,10 @@ extern int task_nice(const struct task_struct *p);
1953extern int can_nice(const struct task_struct *p, const int nice); 1981extern int can_nice(const struct task_struct *p, const int nice);
1954extern int task_curr(const struct task_struct *p); 1982extern int task_curr(const struct task_struct *p);
1955extern int idle_cpu(int cpu); 1983extern int idle_cpu(int cpu);
1956extern int sched_setscheduler(struct task_struct *, int, struct sched_param *); 1984extern int sched_setscheduler(struct task_struct *, int,
1985 const struct sched_param *);
1957extern int sched_setscheduler_nocheck(struct task_struct *, int, 1986extern int sched_setscheduler_nocheck(struct task_struct *, int,
1958 struct sched_param *); 1987 const struct sched_param *);
1959extern struct task_struct *idle_task(int cpu); 1988extern struct task_struct *idle_task(int cpu);
1960extern struct task_struct *curr_task(int cpu); 1989extern struct task_struct *curr_task(int cpu);
1961extern void set_curr_task(int cpu, struct task_struct *p); 1990extern void set_curr_task(int cpu, struct task_struct *p);
diff --git a/include/linux/sfi.h b/include/linux/sfi.h
index 7f770c638e99..fe817918b30e 100644
--- a/include/linux/sfi.h
+++ b/include/linux/sfi.h
@@ -77,6 +77,8 @@
77#define SFI_OEM_ID_SIZE 6 77#define SFI_OEM_ID_SIZE 6
78#define SFI_OEM_TABLE_ID_SIZE 8 78#define SFI_OEM_TABLE_ID_SIZE 8
79 79
80#define SFI_NAME_LEN 16
81
80#define SFI_SYST_SEARCH_BEGIN 0x000E0000 82#define SFI_SYST_SEARCH_BEGIN 0x000E0000
81#define SFI_SYST_SEARCH_END 0x000FFFFF 83#define SFI_SYST_SEARCH_END 0x000FFFFF
82 84
@@ -156,13 +158,13 @@ struct sfi_device_table_entry {
156 u16 addr; 158 u16 addr;
157 u8 irq; 159 u8 irq;
158 u32 max_freq; 160 u32 max_freq;
159 char name[16]; 161 char name[SFI_NAME_LEN];
160} __packed; 162} __packed;
161 163
162struct sfi_gpio_table_entry { 164struct sfi_gpio_table_entry {
163 char controller_name[16]; 165 char controller_name[SFI_NAME_LEN];
164 u16 pin_no; 166 u16 pin_no;
165 char pin_name[16]; 167 char pin_name[SFI_NAME_LEN];
166} __packed; 168} __packed;
167 169
168typedef int (*sfi_table_handler) (struct sfi_table_header *table); 170typedef int (*sfi_table_handler) (struct sfi_table_header *table);
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 38cf093ef62c..6abd9138beda 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -24,9 +24,9 @@ struct timer_list {
24 int slack; 24 int slack;
25 25
26#ifdef CONFIG_TIMER_STATS 26#ifdef CONFIG_TIMER_STATS
27 int start_pid;
27 void *start_site; 28 void *start_site;
28 char start_comm[16]; 29 char start_comm[16];
29 int start_pid;
30#endif 30#endif
31#ifdef CONFIG_LOCKDEP 31#ifdef CONFIG_LOCKDEP
32 struct lockdep_map lockdep_map; 32 struct lockdep_map lockdep_map;
@@ -48,12 +48,38 @@ extern struct tvec_base boot_tvec_bases;
48#define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn) 48#define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn)
49#endif 49#endif
50 50
51/*
52 * Note that all tvec_bases are 2 byte aligned and lower bit of
53 * base in timer_list is guaranteed to be zero. Use the LSB to
54 * indicate whether the timer is deferrable.
55 *
56 * A deferrable timer will work normally when the system is busy, but
57 * will not cause a CPU to come out of idle just to service it; instead,
58 * the timer will be serviced when the CPU eventually wakes up with a
59 * subsequent non-deferrable timer.
60 */
61#define TBASE_DEFERRABLE_FLAG (0x1)
62
51#define TIMER_INITIALIZER(_function, _expires, _data) { \ 63#define TIMER_INITIALIZER(_function, _expires, _data) { \
52 .entry = { .prev = TIMER_ENTRY_STATIC }, \ 64 .entry = { .prev = TIMER_ENTRY_STATIC }, \
53 .function = (_function), \ 65 .function = (_function), \
54 .expires = (_expires), \ 66 .expires = (_expires), \
55 .data = (_data), \ 67 .data = (_data), \
56 .base = &boot_tvec_bases, \ 68 .base = &boot_tvec_bases, \
69 .slack = -1, \
70 __TIMER_LOCKDEP_MAP_INITIALIZER( \
71 __FILE__ ":" __stringify(__LINE__)) \
72 }
73
74#define TBASE_MAKE_DEFERRED(ptr) ((struct tvec_base *) \
75 ((unsigned char *)(ptr) + TBASE_DEFERRABLE_FLAG))
76
77#define TIMER_DEFERRED_INITIALIZER(_function, _expires, _data) {\
78 .entry = { .prev = TIMER_ENTRY_STATIC }, \
79 .function = (_function), \
80 .expires = (_expires), \
81 .data = (_data), \
82 .base = TBASE_MAKE_DEFERRED(&boot_tvec_bases), \
57 __TIMER_LOCKDEP_MAP_INITIALIZER( \ 83 __TIMER_LOCKDEP_MAP_INITIALIZER( \
58 __FILE__ ":" __stringify(__LINE__)) \ 84 __FILE__ ":" __stringify(__LINE__)) \
59 } 85 }
@@ -248,11 +274,11 @@ static inline void timer_stats_timer_clear_start_info(struct timer_list *timer)
248 274
249extern void add_timer(struct timer_list *timer); 275extern void add_timer(struct timer_list *timer);
250 276
277extern int try_to_del_timer_sync(struct timer_list *timer);
278
251#ifdef CONFIG_SMP 279#ifdef CONFIG_SMP
252 extern int try_to_del_timer_sync(struct timer_list *timer);
253 extern int del_timer_sync(struct timer_list *timer); 280 extern int del_timer_sync(struct timer_list *timer);
254#else 281#else
255# define try_to_del_timer_sync(t) del_timer(t)
256# define del_timer_sync(t) del_timer(t) 282# define del_timer_sync(t) del_timer(t)
257#endif 283#endif
258 284
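Given the LSB encoding described in the comment above, any consumer of timer->base must strip TBASE_DEFERRABLE_FLAG before dereferencing the pointer. A minimal sketch of the decode helpers (the names are illustrative; the actual helpers live in kernel/timer.c):

	static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
	{
		return (unsigned long)base & TBASE_DEFERRABLE_FLAG;
	}

	static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
	{
		return (struct tvec_base *)((unsigned long)base &
					    ~TBASE_DEFERRABLE_FLAG);
	}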
diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h
new file mode 100644
index 000000000000..d24aabaca474
--- /dev/null
+++ b/include/linux/timerqueue.h
@@ -0,0 +1,50 @@
1#ifndef _LINUX_TIMERQUEUE_H
2#define _LINUX_TIMERQUEUE_H
3
4#include <linux/rbtree.h>
5#include <linux/ktime.h>
6
7
8struct timerqueue_node {
9 struct rb_node node;
10 ktime_t expires;
11};
12
13struct timerqueue_head {
14 struct rb_root head;
15 struct timerqueue_node *next;
16};
17
18
19extern void timerqueue_add(struct timerqueue_head *head,
20 struct timerqueue_node *node);
21extern void timerqueue_del(struct timerqueue_head *head,
22 struct timerqueue_node *node);
23extern struct timerqueue_node *timerqueue_iterate_next(
24 struct timerqueue_node *node);
25
26/**
27 * timerqueue_getnext - Returns the timer with the earliest expiration time
28 *
29 * @head: head of timerqueue
30 *
31 * Returns a pointer to the timer node that has the
32 * earliest expiration time.
33 */
34static inline
35struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head)
36{
37 return head->next;
38}
39
40static inline void timerqueue_init(struct timerqueue_node *node)
41{
42 RB_CLEAR_NODE(&node->node);
43}
44
45static inline void timerqueue_init_head(struct timerqueue_head *head)
46{
47 head->head = RB_ROOT;
48 head->next = NULL;
49}
50#endif /* _LINUX_TIMERQUEUE_H */
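A minimal usage sketch of the new interface, assuming a caller-defined structure that embeds a timerqueue_node (struct my_timer, my_queue, and the helpers are hypothetical; my_queue needs a one-time timerqueue_init_head() before use):

	struct my_timer {
		struct timerqueue_node node;
		void (*fn)(struct my_timer *);		/* payload */
	};

	static struct timerqueue_head my_queue;

	static void my_timer_start(struct my_timer *t, ktime_t expires)
	{
		timerqueue_init(&t->node);
		t->node.expires = expires;
		timerqueue_add(&my_queue, &t->node);
	}

	static struct my_timer *my_timer_peek(void)
	{
		/* head->next caches the earliest deadline, so this is O(1) */
		struct timerqueue_node *next = timerqueue_getnext(&my_queue);

		return next ? container_of(next, struct my_timer, node) : NULL;
	}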
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index d3e4f87e95c0..c6814616653b 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -32,7 +32,7 @@ struct tracepoint {
32 int state; /* State. */ 32 int state; /* State. */
33 void (*regfunc)(void); 33 void (*regfunc)(void);
34 void (*unregfunc)(void); 34 void (*unregfunc)(void);
35 struct tracepoint_func *funcs; 35 struct tracepoint_func __rcu *funcs;
36} __attribute__((aligned(32))); /* 36} __attribute__((aligned(32))); /*
37 * Aligned on 32 bytes because it is 37 * Aligned on 32 bytes because it is
38 * globally visible and gcc happily 38 * globally visible and gcc happily
@@ -326,7 +326,7 @@ do_trace: \
326 * memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); 326 * memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
327 * __entry->next_pid = next->pid; 327 * __entry->next_pid = next->pid;
328 * __entry->next_prio = next->prio; 328 * __entry->next_prio = next->prio;
329 * ) 329 * ),
330 * 330 *
331 * * 331 * *
332 * * Formatted output of a trace record via TP_printk(). 332 * * Formatted output of a trace record via TP_printk().
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 0c0771f06bfa..bd257fee6031 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -127,12 +127,20 @@ struct execute_work {
127 .timer = TIMER_INITIALIZER(NULL, 0, 0), \ 127 .timer = TIMER_INITIALIZER(NULL, 0, 0), \
128 } 128 }
129 129
130#define __DEFERRED_WORK_INITIALIZER(n, f) { \
131 .work = __WORK_INITIALIZER((n).work, (f)), \
132 .timer = TIMER_DEFERRED_INITIALIZER(NULL, 0, 0), \
133 }
134
130#define DECLARE_WORK(n, f) \ 135#define DECLARE_WORK(n, f) \
131 struct work_struct n = __WORK_INITIALIZER(n, f) 136 struct work_struct n = __WORK_INITIALIZER(n, f)
132 137
133#define DECLARE_DELAYED_WORK(n, f) \ 138#define DECLARE_DELAYED_WORK(n, f) \
134 struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f) 139 struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f)
135 140
141#define DECLARE_DEFERRED_WORK(n, f) \
142 struct delayed_work n = __DEFERRED_WORK_INITIALIZER(n, f)
143
136/* 144/*
137 * initialize a work item's function pointer 145 * initialize a work item's function pointer
138 */ 146 */
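DECLARE_DEFERRED_WORK() builds on the TIMER_DEFERRED_INITIALIZER() added to timer.h earlier in this merge: the resulting delayed work runs on schedule while the system is busy, but will not pull an idle CPU out of idle by itself. A short usage sketch (poll_fn and the one-second interval are illustrative):

	static void poll_fn(struct work_struct *work);
	static DECLARE_DEFERRED_WORK(poll_work, poll_fn);

	static void start_polling(void)
	{
		/* fires ~HZ jiffies from now, or at the next natural wakeup */
		schedule_delayed_work(&poll_work, HZ);
	}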
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index b0b4eb24d592..da39b22636f7 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -21,6 +21,16 @@
21#undef CREATE_TRACE_POINTS 21#undef CREATE_TRACE_POINTS
22 22
23#include <linux/stringify.h> 23#include <linux/stringify.h>
24/*
25 * module.h includes tracepoints, and because ftrace.h
26 * pulls in module.h:
27 * trace/ftrace.h -> linux/ftrace_event.h -> linux/perf_event.h ->
28 * linux/ftrace.h -> linux/module.h
29 * we must include module.h here before we play with any of
30 * the TRACE_EVENT() macros, otherwise the tracepoints included
31 * by module.h may break the build.
32 */
33#include <linux/module.h>
24 34
25#undef TRACE_EVENT 35#undef TRACE_EVENT
26#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ 36#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h
index 75ce9d500d8e..f10293c41b1e 100644
--- a/include/trace/events/skb.h
+++ b/include/trace/events/skb.h
@@ -25,9 +25,7 @@ TRACE_EVENT(kfree_skb,
25 25
26 TP_fast_assign( 26 TP_fast_assign(
27 __entry->skbaddr = skb; 27 __entry->skbaddr = skb;
28 if (skb) { 28 __entry->protocol = ntohs(skb->protocol);
29 __entry->protocol = ntohs(skb->protocol);
30 }
31 __entry->location = location; 29 __entry->location = location;
32 ), 30 ),
33 31
diff --git a/init/Kconfig b/init/Kconfig
index c9728992a776..8dfd094e6875 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -393,7 +393,6 @@ config PREEMPT_RCU
393 393
394config RCU_TRACE 394config RCU_TRACE
395 bool "Enable tracing for RCU" 395 bool "Enable tracing for RCU"
396 depends on TREE_RCU || TREE_PREEMPT_RCU
397 help 396 help
398 This option provides tracing in RCU which presents stats 397 This option provides tracing in RCU which presents stats
399 in debugfs for debugging RCU implementation. 398 in debugfs for debugging RCU implementation.
@@ -459,6 +458,60 @@ config TREE_RCU_TRACE
459 TREE_PREEMPT_RCU implementations, permitting Makefile to 458 TREE_PREEMPT_RCU implementations, permitting Makefile to
460 trivially select kernel/rcutree_trace.c. 459 trivially select kernel/rcutree_trace.c.
461 460
461config RCU_BOOST
462 bool "Enable RCU priority boosting"
463 depends on RT_MUTEXES && TINY_PREEMPT_RCU
464 default n
465 help
466 This option boosts the priority of preempted RCU readers that
467 block the current preemptible RCU grace period for too long.
468 This option also prevents heavy loads from blocking RCU
469 callback invocation for all flavors of RCU.
470
471 Say Y here if you are working with real-time apps or heavy loads.
472 Say N here if you are unsure.
473
474config RCU_BOOST_PRIO
475 int "Real-time priority to boost RCU readers to"
476 range 1 99
477 depends on RCU_BOOST
478 default 1
479 help
480 This option specifies the real-time priority to which preempted
481 RCU readers are to be boosted. If you are working with CPU-bound
482 real-time applications, you should specify a priority higher than
483 the highest-priority CPU-bound application.
484
485 Specify the real-time priority, or take the default if unsure.
486
487config RCU_BOOST_DELAY
488 int "Milliseconds to delay boosting after RCU grace-period start"
489 range 0 3000
490 depends on RCU_BOOST
491 default 500
492 help
493 This option specifies the time to wait after the beginning of
494 a given grace period before priority-boosting preempted RCU
495 readers blocking that grace period. Note that any RCU reader
496 blocking an expedited RCU grace period is boosted immediately.
497
498 Accept the default if unsure.
499
500config SRCU_SYNCHRONIZE_DELAY
501 int "Microseconds to delay before waiting for readers"
502 range 0 20
503 default 10
504 help
505 This option controls how long SRCU delays before entering its
506 loop waiting on SRCU readers. The purpose of this loop is
507 to avoid the unconditional context-switch penalty that would
508 otherwise be incurred if there were an active SRCU reader,
509 in a manner similar to adaptive locking schemes. This should
510 be set to be a bit longer than the common-case SRCU read-side
511 critical-section overhead.
512
513 Accept the default if unsure.
514
462endmenu # "RCU Subsystem" 515endmenu # "RCU Subsystem"
463 516
464config IKCONFIG 517config IKCONFIG
@@ -741,6 +794,19 @@ config NET_NS
741 794
742endif # NAMESPACES 795endif # NAMESPACES
743 796
797config SCHED_AUTOGROUP
798 bool "Automatic process group scheduling"
799 select EVENTFD
800 select CGROUPS
801 select CGROUP_SCHED
802 select FAIR_GROUP_SCHED
803 help
804 This option optimizes the scheduler for common desktop workloads by
805 automatically creating and populating task groups. This separation
806 of workloads isolates aggressive CPU burners (like build jobs) from
807 desktop applications. Task group autogeneration is currently based
808 upon the task's session.
809
744config MM_OWNER 810config MM_OWNER
745 bool 811 bool
746 812
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b5ff083fa22..e0f2831634b4 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
100obj-$(CONFIG_TRACING) += trace/ 100obj-$(CONFIG_TRACING) += trace/
101obj-$(CONFIG_X86_DS) += trace/ 101obj-$(CONFIG_X86_DS) += trace/
102obj-$(CONFIG_RING_BUFFER) += trace/ 102obj-$(CONFIG_RING_BUFFER) += trace/
103obj-$(CONFIG_TRACEPOINTS) += trace/
103obj-$(CONFIG_SMP) += sched_cpupri.o 104obj-$(CONFIG_SMP) += sched_cpupri.o
104obj-$(CONFIG_IRQ_WORK) += irq_work.o 105obj-$(CONFIG_IRQ_WORK) += irq_work.o
105obj-$(CONFIG_PERF_EVENTS) += perf_event.o 106obj-$(CONFIG_PERF_EVENTS) += perf_event.o
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f6e726f18491..156cc5556140 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu)
189} 189}
190 190
191struct take_cpu_down_param { 191struct take_cpu_down_param {
192 struct task_struct *caller;
193 unsigned long mod; 192 unsigned long mod;
194 void *hcpu; 193 void *hcpu;
195}; 194};
@@ -198,7 +197,6 @@ struct take_cpu_down_param {
198static int __ref take_cpu_down(void *_param) 197static int __ref take_cpu_down(void *_param)
199{ 198{
200 struct take_cpu_down_param *param = _param; 199 struct take_cpu_down_param *param = _param;
201 unsigned int cpu = (unsigned long)param->hcpu;
202 int err; 200 int err;
203 201
204 /* Ensure this CPU doesn't handle any more interrupts. */ 202 /* Ensure this CPU doesn't handle any more interrupts. */
@@ -208,11 +206,6 @@ static int __ref take_cpu_down(void *_param)
208 206
209 cpu_notify(CPU_DYING | param->mod, param->hcpu); 207 cpu_notify(CPU_DYING | param->mod, param->hcpu);
210 208
211 if (task_cpu(param->caller) == cpu)
212 move_task_off_dead_cpu(cpu, param->caller);
213 /* Force idle task to run as soon as we yield: it should
214 immediately notice cpu is offline and die quickly. */
215 sched_idle_next();
216 return 0; 209 return 0;
217} 210}
218 211
@@ -223,7 +216,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
223 void *hcpu = (void *)(long)cpu; 216 void *hcpu = (void *)(long)cpu;
224 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 217 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
225 struct take_cpu_down_param tcd_param = { 218 struct take_cpu_down_param tcd_param = {
226 .caller = current,
227 .mod = mod, 219 .mod = mod,
228 .hcpu = hcpu, 220 .hcpu = hcpu,
229 }; 221 };
@@ -253,9 +245,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
253 } 245 }
254 BUG_ON(cpu_online(cpu)); 246 BUG_ON(cpu_online(cpu));
255 247
256 /* Wait for it to sleep (leaving idle task). */ 248 /*
249 * The migration_call() CPU_DYING callback will have removed all
250 * runnable tasks from the cpu, there's only the idle task left now
251 * that the migration thread is done doing the stop_machine thing.
252 *
253 * Wait for the stop thread to go away.
254 */
257 while (!idle_cpu(cpu)) 255 while (!idle_cpu(cpu))
258 yield(); 256 cpu_relax();
259 257
260 /* This actually kills the CPU. */ 258 /* This actually kills the CPU. */
261 __cpu_die(cpu); 259 __cpu_die(cpu);
@@ -386,6 +384,14 @@ out:
386#ifdef CONFIG_PM_SLEEP_SMP 384#ifdef CONFIG_PM_SLEEP_SMP
387static cpumask_var_t frozen_cpus; 385static cpumask_var_t frozen_cpus;
388 386
387void __weak arch_disable_nonboot_cpus_begin(void)
388{
389}
390
391void __weak arch_disable_nonboot_cpus_end(void)
392{
393}
394
389int disable_nonboot_cpus(void) 395int disable_nonboot_cpus(void)
390{ 396{
391 int cpu, first_cpu, error = 0; 397 int cpu, first_cpu, error = 0;
@@ -397,6 +403,7 @@ int disable_nonboot_cpus(void)
397 * with the userspace trying to use the CPU hotplug at the same time 403 * with the userspace trying to use the CPU hotplug at the same time
398 */ 404 */
399 cpumask_clear(frozen_cpus); 405 cpumask_clear(frozen_cpus);
406 arch_disable_nonboot_cpus_begin();
400 407
401 printk("Disabling non-boot CPUs ...\n"); 408 printk("Disabling non-boot CPUs ...\n");
402 for_each_online_cpu(cpu) { 409 for_each_online_cpu(cpu) {
@@ -412,6 +419,8 @@ int disable_nonboot_cpus(void)
412 } 419 }
413 } 420 }
414 421
422 arch_disable_nonboot_cpus_end();
423
415 if (!error) { 424 if (!error) {
416 BUG_ON(num_online_cpus() > 1); 425 BUG_ON(num_online_cpus() > 1);
417 /* Make sure the CPUs won't be enabled by someone else */ 426 /* Make sure the CPUs won't be enabled by someone else */
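The two __weak stubs added above give an architecture a hook around suspend-time CPU offlining without any Kconfig plumbing: defining the strong symbol is enough. A hypothetical override (the body is a placeholder):

	void arch_disable_nonboot_cpus_begin(void)
	{
		/* e.g. pin the boot CPU's frequency before siblings go down */
	}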
diff --git a/kernel/fork.c b/kernel/fork.c
index 5447dc7defa9..7d164e25b0f0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -174,8 +174,10 @@ static inline void free_signal_struct(struct signal_struct *sig)
174 174
175static inline void put_signal_struct(struct signal_struct *sig) 175static inline void put_signal_struct(struct signal_struct *sig)
176{ 176{
177 if (atomic_dec_and_test(&sig->sigcnt)) 177 if (atomic_dec_and_test(&sig->sigcnt)) {
178 sched_autogroup_exit(sig);
178 free_signal_struct(sig); 179 free_signal_struct(sig);
180 }
179} 181}
180 182
181void __put_task_struct(struct task_struct *tsk) 183void __put_task_struct(struct task_struct *tsk)
@@ -905,6 +907,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
905 posix_cpu_timers_init_group(sig); 907 posix_cpu_timers_init_group(sig);
906 908
907 tty_audit_fork(sig); 909 tty_audit_fork(sig);
910 sched_autogroup_fork(sig);
908 911
909 sig->oom_adj = current->signal->oom_adj; 912 sig->oom_adj = current->signal->oom_adj;
910 sig->oom_score_adj = current->signal->oom_score_adj; 913 sig->oom_score_adj = current->signal->oom_score_adj;
@@ -1315,7 +1318,7 @@ bad_fork_cleanup_mm:
1315 } 1318 }
1316bad_fork_cleanup_signal: 1319bad_fork_cleanup_signal:
1317 if (!(clone_flags & CLONE_THREAD)) 1320 if (!(clone_flags & CLONE_THREAD))
1318 free_signal_struct(p->signal); 1321 put_signal_struct(p->signal);
1319bad_fork_cleanup_sighand: 1322bad_fork_cleanup_sighand:
1320 __cleanup_sighand(p->sighand); 1323 __cleanup_sighand(p->sighand);
1321bad_fork_cleanup_fs: 1324bad_fork_cleanup_fs:
diff --git a/kernel/futex.c b/kernel/futex.c
index 40a8777a27d0..3019b92e6917 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -69,6 +69,14 @@ int __read_mostly futex_cmpxchg_enabled;
69#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 69#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
70 70
71/* 71/*
72 * Futex flags used to encode options to functions and preserve them across
73 * restarts.
74 */
75#define FLAGS_SHARED 0x01
76#define FLAGS_CLOCKRT 0x02
77#define FLAGS_HAS_TIMEOUT 0x04
78
79/*
72 * Priority Inheritance state: 80 * Priority Inheritance state:
73 */ 81 */
74struct futex_pi_state { 82struct futex_pi_state {
@@ -123,6 +131,12 @@ struct futex_q {
123 u32 bitset; 131 u32 bitset;
124}; 132};
125 133
134static const struct futex_q futex_q_init = {
135 /* list gets initialized in queue_me() */
136 .key = FUTEX_KEY_INIT,
137 .bitset = FUTEX_BITSET_MATCH_ANY
138};
139
126/* 140/*
127 * Hash buckets are shared by all the futex_keys that hash to the same 141 * Hash buckets are shared by all the futex_keys that hash to the same
128 * location. Each key may have multiple futex_q structures, one for each task 142 * location. Each key may have multiple futex_q structures, one for each task
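With the FLAGS_* values hoisted to the top of the file, the shared and clock-realtime properties travel down the call chain as a single word instead of separate fshared/clockrt parameters. A sketch of how a dispatcher would build that word from the futex op (illustrative; the exact construction belongs in do_futex()):

	unsigned int flags = 0;

	if (!(op & FUTEX_PRIVATE_FLAG))
		flags |= FLAGS_SHARED;
	if (op & FUTEX_CLOCK_REALTIME)
		flags |= FLAGS_CLOCKRT;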
@@ -283,8 +297,7 @@ again:
283 return 0; 297 return 0;
284} 298}
285 299
286static inline 300static inline void put_futex_key(union futex_key *key)
287void put_futex_key(int fshared, union futex_key *key)
288{ 301{
289 drop_futex_key_refs(key); 302 drop_futex_key_refs(key);
290} 303}
@@ -870,7 +883,8 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
870/* 883/*
871 * Wake up waiters matching bitset queued on this futex (uaddr). 884 * Wake up waiters matching bitset queued on this futex (uaddr).
872 */ 885 */
873static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) 886static int
887futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
874{ 888{
875 struct futex_hash_bucket *hb; 889 struct futex_hash_bucket *hb;
876 struct futex_q *this, *next; 890 struct futex_q *this, *next;
@@ -881,7 +895,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
881 if (!bitset) 895 if (!bitset)
882 return -EINVAL; 896 return -EINVAL;
883 897
884 ret = get_futex_key(uaddr, fshared, &key); 898 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
885 if (unlikely(ret != 0)) 899 if (unlikely(ret != 0))
886 goto out; 900 goto out;
887 901
@@ -907,7 +921,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
907 } 921 }
908 922
909 spin_unlock(&hb->lock); 923 spin_unlock(&hb->lock);
910 put_futex_key(fshared, &key); 924 put_futex_key(&key);
911out: 925out:
912 return ret; 926 return ret;
913} 927}
@@ -917,7 +931,7 @@ out:
917 * to this virtual address: 931 * to this virtual address:
918 */ 932 */
919static int 933static int
920futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, 934futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
921 int nr_wake, int nr_wake2, int op) 935 int nr_wake, int nr_wake2, int op)
922{ 936{
923 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 937 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
@@ -927,10 +941,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
927 int ret, op_ret; 941 int ret, op_ret;
928 942
929retry: 943retry:
930 ret = get_futex_key(uaddr1, fshared, &key1); 944 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
931 if (unlikely(ret != 0)) 945 if (unlikely(ret != 0))
932 goto out; 946 goto out;
933 ret = get_futex_key(uaddr2, fshared, &key2); 947 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
934 if (unlikely(ret != 0)) 948 if (unlikely(ret != 0))
935 goto out_put_key1; 949 goto out_put_key1;
936 950
@@ -962,11 +976,11 @@ retry_private:
962 if (ret) 976 if (ret)
963 goto out_put_keys; 977 goto out_put_keys;
964 978
965 if (!fshared) 979 if (!(flags & FLAGS_SHARED))
966 goto retry_private; 980 goto retry_private;
967 981
968 put_futex_key(fshared, &key2); 982 put_futex_key(&key2);
969 put_futex_key(fshared, &key1); 983 put_futex_key(&key1);
970 goto retry; 984 goto retry;
971 } 985 }
972 986
@@ -996,9 +1010,9 @@ retry_private:
996 1010
997 double_unlock_hb(hb1, hb2); 1011 double_unlock_hb(hb1, hb2);
998out_put_keys: 1012out_put_keys:
999 put_futex_key(fshared, &key2); 1013 put_futex_key(&key2);
1000out_put_key1: 1014out_put_key1:
1001 put_futex_key(fshared, &key1); 1015 put_futex_key(&key1);
1002out: 1016out:
1003 return ret; 1017 return ret;
1004} 1018}
@@ -1133,13 +1147,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1133/** 1147/**
1134 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 1148 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1135 * @uaddr1: source futex user address 1149 * @uaddr1: source futex user address
1136 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 1150 * @flags: futex flags (FLAGS_SHARED, etc.)
1137 * @uaddr2: target futex user address 1151 * @uaddr2: target futex user address
1138 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) 1152 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
1139 * @nr_requeue: number of waiters to requeue (0-INT_MAX) 1153 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1140 * @cmpval: @uaddr1 expected value (or %NULL) 1154 * @cmpval: @uaddr1 expected value (or %NULL)
1141 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a 1155 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1142 * pi futex (pi to pi requeue is not supported) 1156 * pi futex (pi to pi requeue is not supported)
1143 * 1157 *
1144 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire 1158 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1145 * uaddr2 atomically on behalf of the top waiter. 1159 * uaddr2 atomically on behalf of the top waiter.
@@ -1148,9 +1162,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1148 * >=0 - on success, the number of tasks requeued or woken 1162 * >=0 - on success, the number of tasks requeued or woken
1149 * <0 - on error 1163 * <0 - on error
1150 */ 1164 */
1151static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, 1165static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1152 int nr_wake, int nr_requeue, u32 *cmpval, 1166 u32 __user *uaddr2, int nr_wake, int nr_requeue,
1153 int requeue_pi) 1167 u32 *cmpval, int requeue_pi)
1154{ 1168{
1155 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1169 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1156 int drop_count = 0, task_count = 0, ret; 1170 int drop_count = 0, task_count = 0, ret;
@@ -1191,10 +1205,10 @@ retry:
1191 pi_state = NULL; 1205 pi_state = NULL;
1192 } 1206 }
1193 1207
1194 ret = get_futex_key(uaddr1, fshared, &key1); 1208 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
1195 if (unlikely(ret != 0)) 1209 if (unlikely(ret != 0))
1196 goto out; 1210 goto out;
1197 ret = get_futex_key(uaddr2, fshared, &key2); 1211 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
1198 if (unlikely(ret != 0)) 1212 if (unlikely(ret != 0))
1199 goto out_put_key1; 1213 goto out_put_key1;
1200 1214
@@ -1216,11 +1230,11 @@ retry_private:
1216 if (ret) 1230 if (ret)
1217 goto out_put_keys; 1231 goto out_put_keys;
1218 1232
1219 if (!fshared) 1233 if (!(flags & FLAGS_SHARED))
1220 goto retry_private; 1234 goto retry_private;
1221 1235
1222 put_futex_key(fshared, &key2); 1236 put_futex_key(&key2);
1223 put_futex_key(fshared, &key1); 1237 put_futex_key(&key1);
1224 goto retry; 1238 goto retry;
1225 } 1239 }
1226 if (curval != *cmpval) { 1240 if (curval != *cmpval) {
@@ -1260,8 +1274,8 @@ retry_private:
1260 break; 1274 break;
1261 case -EFAULT: 1275 case -EFAULT:
1262 double_unlock_hb(hb1, hb2); 1276 double_unlock_hb(hb1, hb2);
1263 put_futex_key(fshared, &key2); 1277 put_futex_key(&key2);
1264 put_futex_key(fshared, &key1); 1278 put_futex_key(&key1);
1265 ret = fault_in_user_writeable(uaddr2); 1279 ret = fault_in_user_writeable(uaddr2);
1266 if (!ret) 1280 if (!ret)
1267 goto retry; 1281 goto retry;
@@ -1269,8 +1283,8 @@ retry_private:
1269 case -EAGAIN: 1283 case -EAGAIN:
1270 /* The owner was exiting, try again. */ 1284 /* The owner was exiting, try again. */
1271 double_unlock_hb(hb1, hb2); 1285 double_unlock_hb(hb1, hb2);
1272 put_futex_key(fshared, &key2); 1286 put_futex_key(&key2);
1273 put_futex_key(fshared, &key1); 1287 put_futex_key(&key1);
1274 cond_resched(); 1288 cond_resched();
1275 goto retry; 1289 goto retry;
1276 default: 1290 default:
@@ -1352,9 +1366,9 @@ out_unlock:
1352 drop_futex_key_refs(&key1); 1366 drop_futex_key_refs(&key1);
1353 1367
1354out_put_keys: 1368out_put_keys:
1355 put_futex_key(fshared, &key2); 1369 put_futex_key(&key2);
1356out_put_key1: 1370out_put_key1:
1357 put_futex_key(fshared, &key1); 1371 put_futex_key(&key1);
1358out: 1372out:
1359 if (pi_state != NULL) 1373 if (pi_state != NULL)
1360 free_pi_state(pi_state); 1374 free_pi_state(pi_state);
@@ -1494,7 +1508,7 @@ static void unqueue_me_pi(struct futex_q *q)
1494 * private futexes. 1508 * private futexes.
1495 */ 1509 */
1496static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1510static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1497 struct task_struct *newowner, int fshared) 1511 struct task_struct *newowner)
1498{ 1512{
1499 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1513 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1500 struct futex_pi_state *pi_state = q->pi_state; 1514 struct futex_pi_state *pi_state = q->pi_state;
@@ -1587,20 +1601,11 @@ handle_fault:
1587 goto retry; 1601 goto retry;
1588} 1602}
1589 1603
1590/*
1591 * In case we must use restart_block to restart a futex_wait,
1592 * we encode in the 'flags' shared capability
1593 */
1594#define FLAGS_SHARED 0x01
1595#define FLAGS_CLOCKRT 0x02
1596#define FLAGS_HAS_TIMEOUT 0x04
1597
1598static long futex_wait_restart(struct restart_block *restart); 1604static long futex_wait_restart(struct restart_block *restart);
1599 1605
1600/** 1606/**
1601 * fixup_owner() - Post lock pi_state and corner case management 1607 * fixup_owner() - Post lock pi_state and corner case management
1602 * @uaddr: user address of the futex 1608 * @uaddr: user address of the futex
1603 * @fshared: whether the futex is shared (1) or not (0)
1604 * @q: futex_q (contains pi_state and access to the rt_mutex) 1609 * @q: futex_q (contains pi_state and access to the rt_mutex)
1605 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) 1610 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
1606 * 1611 *
@@ -1613,8 +1618,7 @@ static long futex_wait_restart(struct restart_block *restart);
1613 * 0 - success, lock not taken 1618 * 0 - success, lock not taken
1614 * <0 - on error (-EFAULT) 1619 * <0 - on error (-EFAULT)
1615 */ 1620 */
1616static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, 1621static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
1617 int locked)
1618{ 1622{
1619 struct task_struct *owner; 1623 struct task_struct *owner;
1620 int ret = 0; 1624 int ret = 0;
@@ -1625,7 +1629,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
1625 * did a lock-steal - fix up the PI-state in that case: 1629 * did a lock-steal - fix up the PI-state in that case:
1626 */ 1630 */
1627 if (q->pi_state->owner != current) 1631 if (q->pi_state->owner != current)
1628 ret = fixup_pi_state_owner(uaddr, q, current, fshared); 1632 ret = fixup_pi_state_owner(uaddr, q, current);
1629 goto out; 1633 goto out;
1630 } 1634 }
1631 1635
@@ -1652,7 +1656,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
1652 * lock. Fix the state up. 1656 * lock. Fix the state up.
1653 */ 1657 */
1654 owner = rt_mutex_owner(&q->pi_state->pi_mutex); 1658 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1655 ret = fixup_pi_state_owner(uaddr, q, owner, fshared); 1659 ret = fixup_pi_state_owner(uaddr, q, owner);
1656 goto out; 1660 goto out;
1657 } 1661 }
1658 1662
@@ -1715,7 +1719,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1715 * futex_wait_setup() - Prepare to wait on a futex 1719 * futex_wait_setup() - Prepare to wait on a futex
1716 * @uaddr: the futex userspace address 1720 * @uaddr: the futex userspace address
1717 * @val: the expected value 1721 * @val: the expected value
1718 * @fshared: whether the futex is shared (1) or not (0) 1722 * @flags: futex flags (FLAGS_SHARED, etc.)
1719 * @q: the associated futex_q 1723 * @q: the associated futex_q
1720 * @hb: storage for hash_bucket pointer to be returned to caller 1724 * @hb: storage for hash_bucket pointer to be returned to caller
1721 * 1725 *
@@ -1728,7 +1732,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1728 * 0 - uaddr contains val and hb has been locked 1732 * 0 - uaddr contains val and hb has been locked
1729 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked 1733 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1730 */ 1734 */
1731static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, 1735static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1732 struct futex_q *q, struct futex_hash_bucket **hb) 1736 struct futex_q *q, struct futex_hash_bucket **hb)
1733{ 1737{
1734 u32 uval; 1738 u32 uval;
@@ -1752,8 +1756,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
1752 * rare, but normal. 1756 * rare, but normal.
1753 */ 1757 */
1754retry: 1758retry:
1755 q->key = FUTEX_KEY_INIT; 1759 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
1756 ret = get_futex_key(uaddr, fshared, &q->key);
1757 if (unlikely(ret != 0)) 1760 if (unlikely(ret != 0))
1758 return ret; 1761 return ret;
1759 1762
@@ -1769,10 +1772,10 @@ retry_private:
1769 if (ret) 1772 if (ret)
1770 goto out; 1773 goto out;
1771 1774
1772 if (!fshared) 1775 if (!(flags & FLAGS_SHARED))
1773 goto retry_private; 1776 goto retry_private;
1774 1777
1775 put_futex_key(fshared, &q->key); 1778 put_futex_key(&q->key);
1776 goto retry; 1779 goto retry;
1777 } 1780 }
1778 1781
@@ -1783,32 +1786,29 @@ retry_private:
1783 1786
1784out: 1787out:
1785 if (ret) 1788 if (ret)
1786 put_futex_key(fshared, &q->key); 1789 put_futex_key(&q->key);
1787 return ret; 1790 return ret;
1788} 1791}
1789 1792
1790static int futex_wait(u32 __user *uaddr, int fshared, 1793static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
1791 u32 val, ktime_t *abs_time, u32 bitset, int clockrt) 1794 ktime_t *abs_time, u32 bitset)
1792{ 1795{
1793 struct hrtimer_sleeper timeout, *to = NULL; 1796 struct hrtimer_sleeper timeout, *to = NULL;
1794 struct restart_block *restart; 1797 struct restart_block *restart;
1795 struct futex_hash_bucket *hb; 1798 struct futex_hash_bucket *hb;
1796 struct futex_q q; 1799 struct futex_q q = futex_q_init;
1797 int ret; 1800 int ret;
1798 1801
1799 if (!bitset) 1802 if (!bitset)
1800 return -EINVAL; 1803 return -EINVAL;
1801
1802 q.pi_state = NULL;
1803 q.bitset = bitset; 1804 q.bitset = bitset;
1804 q.rt_waiter = NULL;
1805 q.requeue_pi_key = NULL;
1806 1805
1807 if (abs_time) { 1806 if (abs_time) {
1808 to = &timeout; 1807 to = &timeout;
1809 1808
1810 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : 1809 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
1811 CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 1810 CLOCK_REALTIME : CLOCK_MONOTONIC,
1811 HRTIMER_MODE_ABS);
1812 hrtimer_init_sleeper(to, current); 1812 hrtimer_init_sleeper(to, current);
1813 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 1813 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
1814 current->timer_slack_ns); 1814 current->timer_slack_ns);
@@ -1819,7 +1819,7 @@ retry:
1819 * Prepare to wait on uaddr. On success, holds hb lock and increments 1819 * Prepare to wait on uaddr. On success, holds hb lock and increments
1820 * q.key refs. 1820 * q.key refs.
1821 */ 1821 */
1822 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 1822 ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
1823 if (ret) 1823 if (ret)
1824 goto out; 1824 goto out;
1825 1825
@@ -1852,12 +1852,7 @@ retry:
1852 restart->futex.val = val; 1852 restart->futex.val = val;
1853 restart->futex.time = abs_time->tv64; 1853 restart->futex.time = abs_time->tv64;
1854 restart->futex.bitset = bitset; 1854 restart->futex.bitset = bitset;
1855 restart->futex.flags = FLAGS_HAS_TIMEOUT; 1855 restart->futex.flags = flags;
1856
1857 if (fshared)
1858 restart->futex.flags |= FLAGS_SHARED;
1859 if (clockrt)
1860 restart->futex.flags |= FLAGS_CLOCKRT;
1861 1856
1862 ret = -ERESTART_RESTARTBLOCK; 1857 ret = -ERESTART_RESTARTBLOCK;
1863 1858
@@ -1873,7 +1868,6 @@ out:
1873static long futex_wait_restart(struct restart_block *restart) 1868static long futex_wait_restart(struct restart_block *restart)
1874{ 1869{
1875 u32 __user *uaddr = restart->futex.uaddr; 1870 u32 __user *uaddr = restart->futex.uaddr;
1876 int fshared = 0;
1877 ktime_t t, *tp = NULL; 1871 ktime_t t, *tp = NULL;
1878 1872
1879 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { 1873 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
@@ -1881,11 +1875,9 @@ static long futex_wait_restart(struct restart_block *restart)
1881 tp = &t; 1875 tp = &t;
1882 } 1876 }
1883 restart->fn = do_no_restart_syscall; 1877 restart->fn = do_no_restart_syscall;
1884 if (restart->futex.flags & FLAGS_SHARED) 1878
1885 fshared = 1; 1879 return (long)futex_wait(uaddr, restart->futex.flags,
1886 return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, 1880 restart->futex.val, tp, restart->futex.bitset);
1887 restart->futex.bitset,
1888 restart->futex.flags & FLAGS_CLOCKRT);
1889} 1881}
1890 1882
1891 1883
@@ -1895,12 +1887,12 @@ static long futex_wait_restart(struct restart_block *restart)
1895 * if there are waiters then it will block, it does PI, etc. (Due to 1887 * if there are waiters then it will block, it does PI, etc. (Due to
1896 * races the kernel might see a 0 value of the futex too.) 1888 * races the kernel might see a 0 value of the futex too.)
1897 */ 1889 */
1898static int futex_lock_pi(u32 __user *uaddr, int fshared, 1890static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
1899 int detect, ktime_t *time, int trylock) 1891 ktime_t *time, int trylock)
1900{ 1892{
1901 struct hrtimer_sleeper timeout, *to = NULL; 1893 struct hrtimer_sleeper timeout, *to = NULL;
1902 struct futex_hash_bucket *hb; 1894 struct futex_hash_bucket *hb;
1903 struct futex_q q; 1895 struct futex_q q = futex_q_init;
1904 int res, ret; 1896 int res, ret;
1905 1897
1906 if (refill_pi_state_cache()) 1898 if (refill_pi_state_cache())
@@ -1914,12 +1906,8 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1914 hrtimer_set_expires(&to->timer, *time); 1906 hrtimer_set_expires(&to->timer, *time);
1915 } 1907 }
1916 1908
1917 q.pi_state = NULL;
1918 q.rt_waiter = NULL;
1919 q.requeue_pi_key = NULL;
1920retry: 1909retry:
1921 q.key = FUTEX_KEY_INIT; 1910 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key);
1922 ret = get_futex_key(uaddr, fshared, &q.key);
1923 if (unlikely(ret != 0)) 1911 if (unlikely(ret != 0))
1924 goto out; 1912 goto out;
1925 1913
@@ -1941,7 +1929,7 @@ retry_private:
1941 * exit to complete. 1929 * exit to complete.
1942 */ 1930 */
1943 queue_unlock(&q, hb); 1931 queue_unlock(&q, hb);
1944 put_futex_key(fshared, &q.key); 1932 put_futex_key(&q.key);
1945 cond_resched(); 1933 cond_resched();
1946 goto retry; 1934 goto retry;
1947 default: 1935 default:
@@ -1971,7 +1959,7 @@ retry_private:
1971 * Fixup the pi_state owner and possibly acquire the lock if we 1959 * Fixup the pi_state owner and possibly acquire the lock if we
1972 * haven't already. 1960 * haven't already.
1973 */ 1961 */
1974 res = fixup_owner(uaddr, fshared, &q, !ret); 1962 res = fixup_owner(uaddr, &q, !ret);
1975 /* 1963 /*
1976 * If fixup_owner() returned an error, propagate that. If it acquired 1964 * If fixup_owner() returned an error, propagate that. If it acquired
1977 * the lock, clear our -ETIMEDOUT or -EINTR. 1965 * the lock, clear our -ETIMEDOUT or -EINTR.
@@ -1995,7 +1983,7 @@ out_unlock_put_key:
1995 queue_unlock(&q, hb); 1983 queue_unlock(&q, hb);
1996 1984
1997out_put_key: 1985out_put_key:
1998 put_futex_key(fshared, &q.key); 1986 put_futex_key(&q.key);
1999out: 1987out:
2000 if (to) 1988 if (to)
2001 destroy_hrtimer_on_stack(&to->timer); 1989 destroy_hrtimer_on_stack(&to->timer);
@@ -2008,10 +1996,10 @@ uaddr_faulted:
2008 if (ret) 1996 if (ret)
2009 goto out_put_key; 1997 goto out_put_key;
2010 1998
2011 if (!fshared) 1999 if (!(flags & FLAGS_SHARED))
2012 goto retry_private; 2000 goto retry_private;
2013 2001
2014 put_futex_key(fshared, &q.key); 2002 put_futex_key(&q.key);
2015 goto retry; 2003 goto retry;
2016} 2004}
2017 2005
@@ -2020,7 +2008,7 @@ uaddr_faulted:
2020 * This is the in-kernel slowpath: we look up the PI state (if any), 2008 * This is the in-kernel slowpath: we look up the PI state (if any),
2021 * and do the rt-mutex unlock. 2009 * and do the rt-mutex unlock.
2022 */ 2010 */
2023static int futex_unlock_pi(u32 __user *uaddr, int fshared) 2011static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2024{ 2012{
2025 struct futex_hash_bucket *hb; 2013 struct futex_hash_bucket *hb;
2026 struct futex_q *this, *next; 2014 struct futex_q *this, *next;
@@ -2038,7 +2026,7 @@ retry:
2038 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 2026 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
2039 return -EPERM; 2027 return -EPERM;
2040 2028
2041 ret = get_futex_key(uaddr, fshared, &key); 2029 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
2042 if (unlikely(ret != 0)) 2030 if (unlikely(ret != 0))
2043 goto out; 2031 goto out;
2044 2032
@@ -2093,14 +2081,14 @@ retry:
2093 2081
2094out_unlock: 2082out_unlock:
2095 spin_unlock(&hb->lock); 2083 spin_unlock(&hb->lock);
2096 put_futex_key(fshared, &key); 2084 put_futex_key(&key);
2097 2085
2098out: 2086out:
2099 return ret; 2087 return ret;
2100 2088
2101pi_faulted: 2089pi_faulted:
2102 spin_unlock(&hb->lock); 2090 spin_unlock(&hb->lock);
2103 put_futex_key(fshared, &key); 2091 put_futex_key(&key);
2104 2092
2105 ret = fault_in_user_writeable(uaddr); 2093 ret = fault_in_user_writeable(uaddr);
2106 if (!ret) 2094 if (!ret)
@@ -2160,7 +2148,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2160/** 2148/**
2161 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 2149 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2162 * @uaddr: the futex we initially wait on (non-pi) 2150 * @uaddr: the futex we initially wait on (non-pi)
2163 * @fshared: whether the futexes are shared (1) or not (0). They must be 2151 * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
2164 * the same type, no requeueing from private to shared, etc. 2152 * the same type, no requeueing from private to shared, etc.
2165 * @val: the expected value of uaddr 2153 * @val: the expected value of uaddr
2166 * @abs_time: absolute timeout 2154 * @abs_time: absolute timeout
@@ -2198,16 +2186,16 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2198 * 0 - On success 2186 * 0 - On success
2199 * <0 - On error 2187 * <0 - On error
2200 */ 2188 */
2201static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, 2189static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2202 u32 val, ktime_t *abs_time, u32 bitset, 2190 u32 val, ktime_t *abs_time, u32 bitset,
2203 int clockrt, u32 __user *uaddr2) 2191 u32 __user *uaddr2)
2204{ 2192{
2205 struct hrtimer_sleeper timeout, *to = NULL; 2193 struct hrtimer_sleeper timeout, *to = NULL;
2206 struct rt_mutex_waiter rt_waiter; 2194 struct rt_mutex_waiter rt_waiter;
2207 struct rt_mutex *pi_mutex = NULL; 2195 struct rt_mutex *pi_mutex = NULL;
2208 struct futex_hash_bucket *hb; 2196 struct futex_hash_bucket *hb;
2209 union futex_key key2; 2197 union futex_key key2 = FUTEX_KEY_INIT;
2210 struct futex_q q; 2198 struct futex_q q = futex_q_init;
2211 int res, ret; 2199 int res, ret;
2212 2200
2213 if (!bitset) 2201 if (!bitset)
@@ -2215,8 +2203,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2215 2203
2216 if (abs_time) { 2204 if (abs_time) {
2217 to = &timeout; 2205 to = &timeout;
2218 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : 2206 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2219 CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 2207 CLOCK_REALTIME : CLOCK_MONOTONIC,
2208 HRTIMER_MODE_ABS);
2220 hrtimer_init_sleeper(to, current); 2209 hrtimer_init_sleeper(to, current);
2221 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 2210 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2222 current->timer_slack_ns); 2211 current->timer_slack_ns);
@@ -2229,12 +2218,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2229 debug_rt_mutex_init_waiter(&rt_waiter); 2218 debug_rt_mutex_init_waiter(&rt_waiter);
2230 rt_waiter.task = NULL; 2219 rt_waiter.task = NULL;
2231 2220
2232 key2 = FUTEX_KEY_INIT; 2221 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
2233 ret = get_futex_key(uaddr2, fshared, &key2);
2234 if (unlikely(ret != 0)) 2222 if (unlikely(ret != 0))
2235 goto out; 2223 goto out;
2236 2224
2237 q.pi_state = NULL;
2238 q.bitset = bitset; 2225 q.bitset = bitset;
2239 q.rt_waiter = &rt_waiter; 2226 q.rt_waiter = &rt_waiter;
2240 q.requeue_pi_key = &key2; 2227 q.requeue_pi_key = &key2;
@@ -2243,7 +2230,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2243 * Prepare to wait on uaddr. On success, increments q.key (key1) ref 2230 * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2244 * count. 2231 * count.
2245 */ 2232 */
2246 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 2233 ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2247 if (ret) 2234 if (ret)
2248 goto out_key2; 2235 goto out_key2;
2249 2236
@@ -2273,8 +2260,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2273 */ 2260 */
2274 if (q.pi_state && (q.pi_state->owner != current)) { 2261 if (q.pi_state && (q.pi_state->owner != current)) {
2275 spin_lock(q.lock_ptr); 2262 spin_lock(q.lock_ptr);
2276 ret = fixup_pi_state_owner(uaddr2, &q, current, 2263 ret = fixup_pi_state_owner(uaddr2, &q, current);
2277 fshared);
2278 spin_unlock(q.lock_ptr); 2264 spin_unlock(q.lock_ptr);
2279 } 2265 }
2280 } else { 2266 } else {
@@ -2293,7 +2279,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2293 * Fixup the pi_state owner and possibly acquire the lock if we 2279 * Fixup the pi_state owner and possibly acquire the lock if we
2294 * haven't already. 2280 * haven't already.
2295 */ 2281 */
2296 res = fixup_owner(uaddr2, fshared, &q, !ret); 2282 res = fixup_owner(uaddr2, &q, !ret);
2297 /* 2283 /*
 2298	 * If fixup_owner() returned an error, propagate that. If it		 2284	 * If fixup_owner() returned an error, propagate that. If it
2299 * acquired the lock, clear -ETIMEDOUT or -EINTR. 2285 * acquired the lock, clear -ETIMEDOUT or -EINTR.
@@ -2324,9 +2310,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2324 } 2310 }
2325 2311
2326out_put_keys: 2312out_put_keys:
2327 put_futex_key(fshared, &q.key); 2313 put_futex_key(&q.key);
2328out_key2: 2314out_key2:
2329 put_futex_key(fshared, &key2); 2315 put_futex_key(&key2);
2330 2316
2331out: 2317out:
2332 if (to) { 2318 if (to) {
@@ -2551,58 +2537,57 @@ void exit_robust_list(struct task_struct *curr)
2551long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 2537long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2552 u32 __user *uaddr2, u32 val2, u32 val3) 2538 u32 __user *uaddr2, u32 val2, u32 val3)
2553{ 2539{
2554 int clockrt, ret = -ENOSYS; 2540 int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
2555 int cmd = op & FUTEX_CMD_MASK; 2541 unsigned int flags = 0;
2556 int fshared = 0;
2557 2542
2558 if (!(op & FUTEX_PRIVATE_FLAG)) 2543 if (!(op & FUTEX_PRIVATE_FLAG))
2559 fshared = 1; 2544 flags |= FLAGS_SHARED;
2560 2545
2561 clockrt = op & FUTEX_CLOCK_REALTIME; 2546 if (op & FUTEX_CLOCK_REALTIME) {
2562 if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) 2547 flags |= FLAGS_CLOCKRT;
2563 return -ENOSYS; 2548 if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
2549 return -ENOSYS;
2550 }
2564 2551
2565 switch (cmd) { 2552 switch (cmd) {
2566 case FUTEX_WAIT: 2553 case FUTEX_WAIT:
2567 val3 = FUTEX_BITSET_MATCH_ANY; 2554 val3 = FUTEX_BITSET_MATCH_ANY;
2568 case FUTEX_WAIT_BITSET: 2555 case FUTEX_WAIT_BITSET:
2569 ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt); 2556 ret = futex_wait(uaddr, flags, val, timeout, val3);
2570 break; 2557 break;
2571 case FUTEX_WAKE: 2558 case FUTEX_WAKE:
2572 val3 = FUTEX_BITSET_MATCH_ANY; 2559 val3 = FUTEX_BITSET_MATCH_ANY;
2573 case FUTEX_WAKE_BITSET: 2560 case FUTEX_WAKE_BITSET:
2574 ret = futex_wake(uaddr, fshared, val, val3); 2561 ret = futex_wake(uaddr, flags, val, val3);
2575 break; 2562 break;
2576 case FUTEX_REQUEUE: 2563 case FUTEX_REQUEUE:
2577 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); 2564 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2578 break; 2565 break;
2579 case FUTEX_CMP_REQUEUE: 2566 case FUTEX_CMP_REQUEUE:
2580 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 2567 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2581 0);
2582 break; 2568 break;
2583 case FUTEX_WAKE_OP: 2569 case FUTEX_WAKE_OP:
2584 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); 2570 ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2585 break; 2571 break;
2586 case FUTEX_LOCK_PI: 2572 case FUTEX_LOCK_PI:
2587 if (futex_cmpxchg_enabled) 2573 if (futex_cmpxchg_enabled)
2588 ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); 2574 ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
2589 break; 2575 break;
2590 case FUTEX_UNLOCK_PI: 2576 case FUTEX_UNLOCK_PI:
2591 if (futex_cmpxchg_enabled) 2577 if (futex_cmpxchg_enabled)
2592 ret = futex_unlock_pi(uaddr, fshared); 2578 ret = futex_unlock_pi(uaddr, flags);
2593 break; 2579 break;
2594 case FUTEX_TRYLOCK_PI: 2580 case FUTEX_TRYLOCK_PI:
2595 if (futex_cmpxchg_enabled) 2581 if (futex_cmpxchg_enabled)
2596 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); 2582 ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
2597 break; 2583 break;
2598 case FUTEX_WAIT_REQUEUE_PI: 2584 case FUTEX_WAIT_REQUEUE_PI:
2599 val3 = FUTEX_BITSET_MATCH_ANY; 2585 val3 = FUTEX_BITSET_MATCH_ANY;
2600 ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, 2586 ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2601 clockrt, uaddr2); 2587 uaddr2);
2602 break; 2588 break;
2603 case FUTEX_CMP_REQUEUE_PI: 2589 case FUTEX_CMP_REQUEUE_PI:
2604 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 2590 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2605 1);
2606 break; 2591 break;
2607 default: 2592 default:
2608 ret = -ENOSYS; 2593 ret = -ENOSYS;
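
The do_futex() hunk above is where the old (fshared, clockrt) pair collapses into a single flags word, decoded once at the top and then passed down unchanged through every signature touched by this patch. A standalone sketch of that decode step, assuming illustrative bit values rather than the kernel's real FUTEX_* encoding:

#include <stdio.h>

#define FLAGS_SHARED  0x01   /* futex may be shared across processes */
#define FLAGS_CLOCKRT 0x02   /* timeout measured on CLOCK_REALTIME */

/* Decode the op word into a flags word once; callers only see flags. */
static unsigned int futex_op_to_flags(int op, int private_bit, int clockrt_bit)
{
    unsigned int flags = 0;

    if (!(op & private_bit))
        flags |= FLAGS_SHARED;
    if (op & clockrt_bit)
        flags |= FLAGS_CLOCKRT;
    return flags;
}

int main(void)
{
    /* 0x80 and 0x100 are hypothetical bit positions, for illustration only */
    unsigned int flags = futex_op_to_flags(0x100, 0x80, 0x100);

    printf("shared=%d clockrt=%d\n",
           !!(flags & FLAGS_SHARED), !!(flags & FLAGS_CLOCKRT));
    return 0;
}

Callees then test flags & FLAGS_SHARED instead of threading an extra int argument through every function, which is exactly what the signature changes above buy.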
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 72206cf5c6cf..f2429fc3438c 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -516,10 +516,13 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
516 516
517 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 517 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
518 struct hrtimer *timer; 518 struct hrtimer *timer;
519 struct timerqueue_node *next;
519 520
520 if (!base->first) 521 next = timerqueue_getnext(&base->active);
522 if (!next)
521 continue; 523 continue;
522 timer = rb_entry(base->first, struct hrtimer, node); 524 timer = container_of(next, struct hrtimer, node);
525
523 expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 526 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
524 /* 527 /*
525 * clock_was_set() has changed base->offset so the 528 * clock_was_set() has changed base->offset so the
@@ -840,48 +843,17 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
840static int enqueue_hrtimer(struct hrtimer *timer, 843static int enqueue_hrtimer(struct hrtimer *timer,
841 struct hrtimer_clock_base *base) 844 struct hrtimer_clock_base *base)
842{ 845{
843 struct rb_node **link = &base->active.rb_node;
844 struct rb_node *parent = NULL;
845 struct hrtimer *entry;
846 int leftmost = 1;
847
848 debug_activate(timer); 846 debug_activate(timer);
849 847
850 /* 848 timerqueue_add(&base->active, &timer->node);
851 * Find the right place in the rbtree:
852 */
853 while (*link) {
854 parent = *link;
855 entry = rb_entry(parent, struct hrtimer, node);
856 /*
857 * We dont care about collisions. Nodes with
858 * the same expiry time stay together.
859 */
860 if (hrtimer_get_expires_tv64(timer) <
861 hrtimer_get_expires_tv64(entry)) {
862 link = &(*link)->rb_left;
863 } else {
864 link = &(*link)->rb_right;
865 leftmost = 0;
866 }
867 }
868
869 /*
870 * Insert the timer to the rbtree and check whether it
871 * replaces the first pending timer
872 */
873 if (leftmost)
874 base->first = &timer->node;
875 849
876 rb_link_node(&timer->node, parent, link);
877 rb_insert_color(&timer->node, &base->active);
878 /* 850 /*
879 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the 851 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
880 * state of a possibly running callback. 852 * state of a possibly running callback.
881 */ 853 */
882 timer->state |= HRTIMER_STATE_ENQUEUED; 854 timer->state |= HRTIMER_STATE_ENQUEUED;
883 855
884 return leftmost; 856 return (&timer->node == base->active.next);
885} 857}
886 858
887/* 859/*
@@ -901,12 +873,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
901 if (!(timer->state & HRTIMER_STATE_ENQUEUED)) 873 if (!(timer->state & HRTIMER_STATE_ENQUEUED))
902 goto out; 874 goto out;
903 875
904 /* 876 if (&timer->node == timerqueue_getnext(&base->active)) {
905 * Remove the timer from the rbtree and replace the first
906 * entry pointer if necessary.
907 */
908 if (base->first == &timer->node) {
909 base->first = rb_next(&timer->node);
910#ifdef CONFIG_HIGH_RES_TIMERS 877#ifdef CONFIG_HIGH_RES_TIMERS
 911	 /* Reprogram the clock event device, if enabled */		 878	 /* Reprogram the clock event device, if enabled */
912 if (reprogram && hrtimer_hres_active()) { 879 if (reprogram && hrtimer_hres_active()) {
@@ -919,7 +886,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
919 } 886 }
920#endif 887#endif
921 } 888 }
922 rb_erase(&timer->node, &base->active); 889 timerqueue_del(&base->active, &timer->node);
923out: 890out:
924 timer->state = newstate; 891 timer->state = newstate;
925} 892}
@@ -1128,11 +1095,13 @@ ktime_t hrtimer_get_next_event(void)
1128 if (!hrtimer_hres_active()) { 1095 if (!hrtimer_hres_active()) {
1129 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 1096 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
1130 struct hrtimer *timer; 1097 struct hrtimer *timer;
1098 struct timerqueue_node *next;
1131 1099
1132 if (!base->first) 1100 next = timerqueue_getnext(&base->active);
1101 if (!next)
1133 continue; 1102 continue;
1134 1103
1135 timer = rb_entry(base->first, struct hrtimer, node); 1104 timer = container_of(next, struct hrtimer, node);
1136 delta.tv64 = hrtimer_get_expires_tv64(timer); 1105 delta.tv64 = hrtimer_get_expires_tv64(timer);
1137 delta = ktime_sub(delta, base->get_time()); 1106 delta = ktime_sub(delta, base->get_time());
1138 if (delta.tv64 < mindelta.tv64) 1107 if (delta.tv64 < mindelta.tv64)
@@ -1162,6 +1131,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1162 1131
1163 timer->base = &cpu_base->clock_base[clock_id]; 1132 timer->base = &cpu_base->clock_base[clock_id];
1164 hrtimer_init_timer_hres(timer); 1133 hrtimer_init_timer_hres(timer);
1134 timerqueue_init(&timer->node);
1165 1135
1166#ifdef CONFIG_TIMER_STATS 1136#ifdef CONFIG_TIMER_STATS
1167 timer->start_site = NULL; 1137 timer->start_site = NULL;
@@ -1278,14 +1248,14 @@ retry:
1278 1248
1279 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1249 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1280 ktime_t basenow; 1250 ktime_t basenow;
1281 struct rb_node *node; 1251 struct timerqueue_node *node;
1282 1252
1283 basenow = ktime_add(now, base->offset); 1253 basenow = ktime_add(now, base->offset);
1284 1254
1285 while ((node = base->first)) { 1255 while ((node = timerqueue_getnext(&base->active))) {
1286 struct hrtimer *timer; 1256 struct hrtimer *timer;
1287 1257
1288 timer = rb_entry(node, struct hrtimer, node); 1258 timer = container_of(node, struct hrtimer, node);
1289 1259
1290 /* 1260 /*
1291 * The immediate goal for using the softexpires is 1261 * The immediate goal for using the softexpires is
@@ -1441,7 +1411,7 @@ void hrtimer_run_pending(void)
1441 */ 1411 */
1442void hrtimer_run_queues(void) 1412void hrtimer_run_queues(void)
1443{ 1413{
1444 struct rb_node *node; 1414 struct timerqueue_node *node;
1445 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1415 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1446 struct hrtimer_clock_base *base; 1416 struct hrtimer_clock_base *base;
1447 int index, gettime = 1; 1417 int index, gettime = 1;
@@ -1451,8 +1421,7 @@ void hrtimer_run_queues(void)
1451 1421
1452 for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { 1422 for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
1453 base = &cpu_base->clock_base[index]; 1423 base = &cpu_base->clock_base[index];
1454 1424 if (!timerqueue_getnext(&base->active))
1455 if (!base->first)
1456 continue; 1425 continue;
1457 1426
1458 if (gettime) { 1427 if (gettime) {
@@ -1462,10 +1431,10 @@ void hrtimer_run_queues(void)
1462 1431
1463 raw_spin_lock(&cpu_base->lock); 1432 raw_spin_lock(&cpu_base->lock);
1464 1433
1465 while ((node = base->first)) { 1434 while ((node = timerqueue_getnext(&base->active))) {
1466 struct hrtimer *timer; 1435 struct hrtimer *timer;
1467 1436
1468 timer = rb_entry(node, struct hrtimer, node); 1437 timer = container_of(node, struct hrtimer, node);
1469 if (base->softirq_time.tv64 <= 1438 if (base->softirq_time.tv64 <=
1470 hrtimer_get_expires_tv64(timer)) 1439 hrtimer_get_expires_tv64(timer))
1471 break; 1440 break;
@@ -1630,8 +1599,10 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1630 1599
1631 raw_spin_lock_init(&cpu_base->lock); 1600 raw_spin_lock_init(&cpu_base->lock);
1632 1601
1633 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1602 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1634 cpu_base->clock_base[i].cpu_base = cpu_base; 1603 cpu_base->clock_base[i].cpu_base = cpu_base;
1604 timerqueue_init_head(&cpu_base->clock_base[i].active);
1605 }
1635 1606
1636 hrtimer_init_hres(cpu_base); 1607 hrtimer_init_hres(cpu_base);
1637} 1608}
@@ -1642,10 +1613,10 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1642 struct hrtimer_clock_base *new_base) 1613 struct hrtimer_clock_base *new_base)
1643{ 1614{
1644 struct hrtimer *timer; 1615 struct hrtimer *timer;
1645 struct rb_node *node; 1616 struct timerqueue_node *node;
1646 1617
1647 while ((node = rb_first(&old_base->active))) { 1618 while ((node = timerqueue_getnext(&old_base->active))) {
1648 timer = rb_entry(node, struct hrtimer, node); 1619 timer = container_of(node, struct hrtimer, node);
1649 BUG_ON(hrtimer_callback_running(timer)); 1620 BUG_ON(hrtimer_callback_running(timer));
1650 debug_deactivate(timer); 1621 debug_deactivate(timer);
1651 1622
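
The hrtimer conversion above swaps the open-coded rbtree plus the cached base->first pointer for the generic timerqueue, whose head keeps the next-to-expire node at hand itself. A user-space analog of that interface shape, assuming nothing beyond libc; the real timerqueue sits on an rbtree, but a sorted intrusive list is enough to show the pattern:

#include <stdio.h>
#include <stddef.h>

struct tq_node {
    unsigned long long expires;
    struct tq_node *next;
};

struct tq_head {
    struct tq_node *first;          /* next node to expire, or NULL */
};

static void tq_add(struct tq_head *head, struct tq_node *node)
{
    struct tq_node **link = &head->first;

    while (*link && (*link)->expires <= node->expires)
        link = &(*link)->next;      /* equal expiry times stay together */
    node->next = *link;
    *link = node;
}

static void tq_del(struct tq_head *head, struct tq_node *node)
{
    struct tq_node **link = &head->first;

    while (*link && *link != node)
        link = &(*link)->next;
    if (*link)
        *link = node->next;
}

/* container_of(), as used above to get from the node back to the hrtimer */
#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct my_timer {
    struct tq_node node;            /* embedded, like hrtimer's node */
    const char *name;
};

int main(void)
{
    struct tq_head head = { NULL };
    struct my_timer a = { { 300, NULL }, "a" }, b = { { 100, NULL }, "b" };

    tq_add(&head, &a.node);
    tq_add(&head, &b.node);
    printf("next: %s\n", container_of(head.first, struct my_timer, node)->name);
    tq_del(&head, &b.node);
    printf("next: %s\n", container_of(head.first, struct my_timer, node)->name);
    return 0;
}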
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5f92acc5f952..91a5fa25054e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -577,7 +577,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
577 */ 577 */
578static int irq_thread(void *data) 578static int irq_thread(void *data)
579{ 579{
580 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; 580 static struct sched_param param = {
581 .sched_priority = MAX_USER_RT_PRIO/2,
582 };
581 struct irqaction *action = data; 583 struct irqaction *action = data;
582 struct irq_desc *desc = irq_to_desc(action->irq); 584 struct irq_desc *desc = irq_to_desc(action->irq);
583 int wake, oneshot = desc->status & IRQ_ONESHOT; 585 int wake, oneshot = desc->status & IRQ_ONESHOT;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ca61bbdd44b2..5355cfd44a3f 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
148 wait_for_completion(&create.done); 148 wait_for_completion(&create.done);
149 149
150 if (!IS_ERR(create.result)) { 150 if (!IS_ERR(create.result)) {
151 struct sched_param param = { .sched_priority = 0 }; 151 static struct sched_param param = { .sched_priority = 0 };
152 va_list args; 152 va_list args;
153 153
154 va_start(args, namefmt); 154 va_start(args, namefmt);
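
Both sched_param hunks above make the same space optimization: a priority struct that is only ever read can live once in .data instead of being built on every thread's stack. A minimal user-space sketch of the idea; priority 50 is an arbitrary placeholder, and the call needs CAP_SYS_NICE to actually succeed:

#include <stdio.h>
#include <sched.h>

/* One read-only instance in .data instead of a fresh copy in every
 * stack frame; safe because sched_setscheduler() only reads it. */
static int set_rt_priority(void)
{
    static const struct sched_param param = { .sched_priority = 50 };

    return sched_setscheduler(0 /* calling thread */, SCHED_FIFO, &param);
}

int main(void)
{
    if (set_rt_priority() != 0)
        perror("sched_setscheduler");   /* expected without CAP_SYS_NICE */
    return 0;
}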
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 59b76c8ce9d7..1969d2fc4b36 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -494,7 +494,6 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
494 namelen += 2; 494 namelen += 2;
495 495
496 for (i = 0; i < LOCKSTAT_POINTS; i++) { 496 for (i = 0; i < LOCKSTAT_POINTS; i++) {
497 char sym[KSYM_SYMBOL_LEN];
498 char ip[32]; 497 char ip[32];
499 498
500 if (class->contention_point[i] == 0) 499 if (class->contention_point[i] == 0)
@@ -503,15 +502,13 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
503 if (!i) 502 if (!i)
504 seq_line(m, '-', 40-namelen, namelen); 503 seq_line(m, '-', 40-namelen, namelen);
505 504
506 sprint_symbol(sym, class->contention_point[i]);
507 snprintf(ip, sizeof(ip), "[<%p>]", 505 snprintf(ip, sizeof(ip), "[<%p>]",
508 (void *)class->contention_point[i]); 506 (void *)class->contention_point[i]);
509 seq_printf(m, "%40s %14lu %29s %s\n", name, 507 seq_printf(m, "%40s %14lu %29s %pS\n",
510 stats->contention_point[i], 508 name, stats->contention_point[i],
511 ip, sym); 509 ip, (void *)class->contention_point[i]);
512 } 510 }
513 for (i = 0; i < LOCKSTAT_POINTS; i++) { 511 for (i = 0; i < LOCKSTAT_POINTS; i++) {
514 char sym[KSYM_SYMBOL_LEN];
515 char ip[32]; 512 char ip[32];
516 513
517 if (class->contending_point[i] == 0) 514 if (class->contending_point[i] == 0)
@@ -520,12 +517,11 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
520 if (!i) 517 if (!i)
521 seq_line(m, '-', 40-namelen, namelen); 518 seq_line(m, '-', 40-namelen, namelen);
522 519
523 sprint_symbol(sym, class->contending_point[i]);
524 snprintf(ip, sizeof(ip), "[<%p>]", 520 snprintf(ip, sizeof(ip), "[<%p>]",
525 (void *)class->contending_point[i]); 521 (void *)class->contending_point[i]);
526 seq_printf(m, "%40s %14lu %29s %s\n", name, 522 seq_printf(m, "%40s %14lu %29s %pS\n",
527 stats->contending_point[i], 523 name, stats->contending_point[i],
528 ip, sym); 524 ip, (void *)class->contending_point[i]);
529 } 525 }
530 if (i) { 526 if (i) {
531 seq_puts(m, "\n"); 527 seq_puts(m, "\n");
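
The lockdep_proc change leans on %pS, the kernel's vsnprintf extension that symbolizes a code address at format time, which is what lets the two KSYM_SYMBOL_LEN stack buffers and sprint_symbol() calls disappear. User space has no %pS, but dladdr() gives a rough analog of resolve-at-print-time; it is a glibc extension, so build with -rdynamic (and -ldl on older glibc):

#define _GNU_SOURCE
#include <dlfcn.h>
#include <stdio.h>

void interesting_function(void) { }

/* Resolve a code address to a name at print time, the idea behind the
 * kernel's %pS, rather than via a caller-managed buffer. */
static void print_symbolized(void *addr)
{
    Dl_info info;

    if (dladdr(addr, &info) && info.dli_sname)
        printf("[<%p>] %s\n", addr, info.dli_sname);
    else
        printf("[<%p>] <unknown>\n", addr);
}

int main(void)
{
    print_symbolized((void *)interesting_function);
    return 0;
}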
diff --git a/kernel/module.c b/kernel/module.c
index d190664f25ff..34e00b708fad 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -56,6 +56,7 @@
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/kmemleak.h> 57#include <linux/kmemleak.h>
58#include <linux/jump_label.h> 58#include <linux/jump_label.h>
59#include <linux/pfn.h>
59 60
60#define CREATE_TRACE_POINTS 61#define CREATE_TRACE_POINTS
61#include <trace/events/module.h> 62#include <trace/events/module.h>
@@ -70,6 +71,26 @@
70#define ARCH_SHF_SMALL 0 71#define ARCH_SHF_SMALL 0
71#endif 72#endif
72 73
74/*
75 * Modules' sections will be aligned on page boundaries
76 * to ensure complete separation of code and data, but
77 * only when CONFIG_DEBUG_SET_MODULE_RONX=y
78 */
79#ifdef CONFIG_DEBUG_SET_MODULE_RONX
80# define debug_align(X) ALIGN(X, PAGE_SIZE)
81#else
82# define debug_align(X) (X)
83#endif
84
85/*
 86/*
 87 * Given BASE and SIZE, this macro calculates the number of pages the
 88 * memory region occupies
88 */
89#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \
90 (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \
91 PFN_DOWN((unsigned long)BASE) + 1) \
92 : (0UL))
93
73/* If this is set, the section belongs in the init part of the module */ 94/* If this is set, the section belongs in the init part of the module */
74#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) 95#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
75 96
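
The MOD_NUMBER_OF_PAGES arithmetic just introduced is the kind that is easy to get wrong by one, so here is a small self-contained check of the same formula under an assumed 4 KiB page size:

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT 12                        /* assume 4 KiB pages */
#define PFN_DOWN(x) ((unsigned long)(x) >> PAGE_SHIFT)

/* Same formula as MOD_NUMBER_OF_PAGES above: pages touched by
 * [base, base + size), and 0 for an empty region. */
static unsigned long pages_spanned(unsigned long base, unsigned long size)
{
    if (size == 0)
        return 0;
    return PFN_DOWN(base + size - 1) - PFN_DOWN(base) + 1;
}

int main(void)
{
    assert(pages_spanned(0x1000, 0x1000) == 1); /* exactly one page */
    assert(pages_spanned(0x1800, 0x1000) == 2); /* straddles a boundary */
    assert(pages_spanned(0x1000, 0)      == 0); /* empty region */
    printf("MOD_NUMBER_OF_PAGES formula ok\n");
    return 0;
}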
@@ -1542,6 +1563,115 @@ static int __unlink_module(void *_mod)
1542 return 0; 1563 return 0;
1543} 1564}
1544 1565
1566#ifdef CONFIG_DEBUG_SET_MODULE_RONX
1567/*
1568 * LKM RO/NX protection: protect module's text/ro-data
1569 * from modification and any data from execution.
1570 */
1571void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages))
1572{
1573 unsigned long begin_pfn = PFN_DOWN((unsigned long)start);
1574 unsigned long end_pfn = PFN_DOWN((unsigned long)end);
1575
1576 if (end_pfn > begin_pfn)
1577 set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
1578}
1579
1580static void set_section_ro_nx(void *base,
1581 unsigned long text_size,
1582 unsigned long ro_size,
1583 unsigned long total_size)
1584{
1585 /* begin and end PFNs of the current subsection */
1586 unsigned long begin_pfn;
1587 unsigned long end_pfn;
1588
1589 /*
1590 * Set RO for module text and RO-data:
1591 * - Always protect first page.
1592 * - Do not protect last partial page.
1593 */
1594 if (ro_size > 0)
1595 set_page_attributes(base, base + ro_size, set_memory_ro);
1596
1597 /*
1598 * Set NX permissions for module data:
1599 * - Do not protect first partial page.
1600 * - Always protect last page.
1601 */
1602 if (total_size > text_size) {
1603 begin_pfn = PFN_UP((unsigned long)base + text_size);
1604 end_pfn = PFN_UP((unsigned long)base + total_size);
1605 if (end_pfn > begin_pfn)
1606 set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
1607 }
1608}
1609
1610/* Setting memory back to RW+NX before releasing it */
1611void unset_section_ro_nx(struct module *mod, void *module_region)
1612{
1613 unsigned long total_pages;
1614
1615 if (mod->module_core == module_region) {
1616 /* Set core as NX+RW */
1617 total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size);
1618 set_memory_nx((unsigned long)mod->module_core, total_pages);
1619 set_memory_rw((unsigned long)mod->module_core, total_pages);
1620
1621 } else if (mod->module_init == module_region) {
1622 /* Set init as NX+RW */
1623 total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size);
1624 set_memory_nx((unsigned long)mod->module_init, total_pages);
1625 set_memory_rw((unsigned long)mod->module_init, total_pages);
1626 }
1627}
1628
1629/* Iterate through all modules and set each module's text as RW */
1630void set_all_modules_text_rw(void)
1631{
1632 struct module *mod;
1633
1634 mutex_lock(&module_mutex);
1635 list_for_each_entry_rcu(mod, &modules, list) {
1636 if ((mod->module_core) && (mod->core_text_size)) {
1637 set_page_attributes(mod->module_core,
1638 mod->module_core + mod->core_text_size,
1639 set_memory_rw);
1640 }
1641 if ((mod->module_init) && (mod->init_text_size)) {
1642 set_page_attributes(mod->module_init,
1643 mod->module_init + mod->init_text_size,
1644 set_memory_rw);
1645 }
1646 }
1647 mutex_unlock(&module_mutex);
1648}
1649
1650/* Iterate through all modules and set each module's text as RO */
1651void set_all_modules_text_ro(void)
1652{
1653 struct module *mod;
1654
1655 mutex_lock(&module_mutex);
1656 list_for_each_entry_rcu(mod, &modules, list) {
1657 if ((mod->module_core) && (mod->core_text_size)) {
1658 set_page_attributes(mod->module_core,
1659 mod->module_core + mod->core_text_size,
1660 set_memory_ro);
1661 }
1662 if ((mod->module_init) && (mod->init_text_size)) {
1663 set_page_attributes(mod->module_init,
1664 mod->module_init + mod->init_text_size,
1665 set_memory_ro);
1666 }
1667 }
1668 mutex_unlock(&module_mutex);
1669}
1670#else
1671static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
1672static inline void unset_section_ro_nx(struct module *mod, void *module_region) { }
1673#endif
1674
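
Note how set_section_ro_nx() rounds its two ranges in opposite directions, exactly as its comments promise: the RO range rounds down at both ends (first page always protected, a trailing partial page left writable), while the NX range rounds up at both ends (a leading partial page left executable, last page always protected), so a page shared between text and data is never mis-protected. A sketch of that rounding with an assumed 4 KiB page size and made-up section sizes:

#include <stdio.h>

#define PAGE_SHIFT 12                        /* assume 4 KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PFN_DOWN(x) ((unsigned long)(x) >> PAGE_SHIFT)
#define PFN_UP(x)   (((unsigned long)(x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

int main(void)
{
    /* made-up, page-unaligned section sizes within one module core */
    unsigned long base = 0x10000, text = 0x1800, ro = 0x2800, total = 0x4000;

    /* RO rounds down at both ends: first page always protected,
     * trailing partial page (which may hold rw data) left writable */
    printf("ro pfns: [%lu, %lu)\n", PFN_DOWN(base), PFN_DOWN(base + ro));

    /* NX rounds up at both ends: leading partial page (shared with
     * text) left executable, last page always made non-executable */
    printf("nx pfns: [%lu, %lu)\n", PFN_UP(base + text), PFN_UP(base + total));
    return 0;
}

With debug_align() forcing each section class to a page boundary, the partial-page cases above only arise when CONFIG_DEBUG_SET_MODULE_RONX is off, in which case the protection calls are compiled out anyway.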
1545/* Free a module, remove from lists, etc. */ 1675/* Free a module, remove from lists, etc. */
1546static void free_module(struct module *mod) 1676static void free_module(struct module *mod)
1547{ 1677{
@@ -1566,6 +1696,7 @@ static void free_module(struct module *mod)
1566 destroy_params(mod->kp, mod->num_kp); 1696 destroy_params(mod->kp, mod->num_kp);
1567 1697
1568 /* This may be NULL, but that's OK */ 1698 /* This may be NULL, but that's OK */
1699 unset_section_ro_nx(mod, mod->module_init);
1569 module_free(mod, mod->module_init); 1700 module_free(mod, mod->module_init);
1570 kfree(mod->args); 1701 kfree(mod->args);
1571 percpu_modfree(mod); 1702 percpu_modfree(mod);
@@ -1574,6 +1705,7 @@ static void free_module(struct module *mod)
1574 lockdep_free_key_range(mod->module_core, mod->core_size); 1705 lockdep_free_key_range(mod->module_core, mod->core_size);
1575 1706
1576 /* Finally, free the core (containing the module structure) */ 1707 /* Finally, free the core (containing the module structure) */
1708 unset_section_ro_nx(mod, mod->module_core);
1577 module_free(mod, mod->module_core); 1709 module_free(mod, mod->module_core);
1578 1710
1579#ifdef CONFIG_MPU 1711#ifdef CONFIG_MPU
@@ -1777,8 +1909,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
1777 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 1909 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1778 DEBUGP("\t%s\n", name); 1910 DEBUGP("\t%s\n", name);
1779 } 1911 }
1780 if (m == 0) 1912 switch (m) {
1913 case 0: /* executable */
1914 mod->core_size = debug_align(mod->core_size);
1781 mod->core_text_size = mod->core_size; 1915 mod->core_text_size = mod->core_size;
1916 break;
1917 case 1: /* RO: text and ro-data */
1918 mod->core_size = debug_align(mod->core_size);
1919 mod->core_ro_size = mod->core_size;
1920 break;
1921 case 3: /* whole core */
1922 mod->core_size = debug_align(mod->core_size);
1923 break;
1924 }
1782 } 1925 }
1783 1926
1784 DEBUGP("Init section allocation order:\n"); 1927 DEBUGP("Init section allocation order:\n");
@@ -1796,8 +1939,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
1796 | INIT_OFFSET_MASK); 1939 | INIT_OFFSET_MASK);
1797 DEBUGP("\t%s\n", sname); 1940 DEBUGP("\t%s\n", sname);
1798 } 1941 }
1799 if (m == 0) 1942 switch (m) {
1943 case 0: /* executable */
1944 mod->init_size = debug_align(mod->init_size);
1800 mod->init_text_size = mod->init_size; 1945 mod->init_text_size = mod->init_size;
1946 break;
1947 case 1: /* RO: text and ro-data */
1948 mod->init_size = debug_align(mod->init_size);
1949 mod->init_ro_size = mod->init_size;
1950 break;
1951 case 3: /* whole init */
1952 mod->init_size = debug_align(mod->init_size);
1953 break;
1954 }
1801 } 1955 }
1802} 1956}
1803 1957
@@ -2722,6 +2876,18 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2722 blocking_notifier_call_chain(&module_notify_list, 2876 blocking_notifier_call_chain(&module_notify_list,
2723 MODULE_STATE_COMING, mod); 2877 MODULE_STATE_COMING, mod);
2724 2878
2879 /* Set RO and NX regions for core */
2880 set_section_ro_nx(mod->module_core,
2881 mod->core_text_size,
2882 mod->core_ro_size,
2883 mod->core_size);
2884
2885 /* Set RO and NX regions for init */
2886 set_section_ro_nx(mod->module_init,
2887 mod->init_text_size,
2888 mod->init_ro_size,
2889 mod->init_size);
2890
2725 do_mod_ctors(mod); 2891 do_mod_ctors(mod);
2726 /* Start the module */ 2892 /* Start the module */
2727 if (mod->init != NULL) 2893 if (mod->init != NULL)
@@ -2765,6 +2931,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2765 mod->symtab = mod->core_symtab; 2931 mod->symtab = mod->core_symtab;
2766 mod->strtab = mod->core_strtab; 2932 mod->strtab = mod->core_strtab;
2767#endif 2933#endif
2934 unset_section_ro_nx(mod, mod->module_init);
2768 module_free(mod, mod->module_init); 2935 module_free(mod, mod->module_init);
2769 mod->module_init = NULL; 2936 mod->module_init = NULL;
2770 mod->init_size = 0; 2937 mod->init_size = 0;
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 200407c1502f..a5889fb28ecf 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -199,7 +199,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
199 * memory barriers as we'll eventually observe the right 199 * memory barriers as we'll eventually observe the right
200 * values at the cost of a few extra spins. 200 * values at the cost of a few extra spins.
201 */ 201 */
202 cpu_relax(); 202 arch_mutex_cpu_relax();
203 } 203 }
204#endif 204#endif
205 spin_lock_mutex(&lock->wait_lock, flags); 205 spin_lock_mutex(&lock->wait_lock, flags);
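
The mutex.c one-liner exists so an architecture (s390, per the file list in this merge) can substitute its own spin-wait hint while everyone else keeps plain cpu_relax(). A compilable sketch of the default-unless-overridden pattern, with a do-nothing compiler barrier standing in for the real cpu_relax():

#include <stdio.h>

/* A compiler barrier stands in for the real cpu_relax(); x86 would
 * emit a "pause" instruction here. */
#define cpu_relax() __asm__ __volatile__("" ::: "memory")

/* Generic default, replaceable by an arch header - the shape the
 * mutex.c change switches to. */
#ifndef arch_mutex_cpu_relax
# define arch_mutex_cpu_relax() cpu_relax()
#endif

int main(void)
{
    for (int i = 0; i < 3; i++)
        arch_mutex_cpu_relax();     /* the adaptive-spin wait hint */
    printf("spun\n");
    return 0;
}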
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9ca4973f736d..93bd2eb2bc53 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -145,7 +145,13 @@ static int common_timer_del(struct k_itimer *timer);
145 145
146static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); 146static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
147 147
148static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); 148static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
149
150#define lock_timer(tid, flags) \
151({ struct k_itimer *__timr; \
152 __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \
153 __timr; \
154})
149 155
150static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) 156static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
151{ 157{
@@ -619,7 +625,7 @@ out:
 619	 * the find to the timer lock. To avoid a deadlock, the timer id MUST		 625	 * the find to the timer lock. To avoid a deadlock, the timer id MUST
 620	 * be released without holding the timer lock.		 626	 * be released without holding the timer lock.
621 */ 627 */
622static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) 628static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
623{ 629{
624 struct k_itimer *timr; 630 struct k_itimer *timr;
625 /* 631 /*
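
__cond_lock() is a real sparse annotation: under __CHECKER__ it tells the static checker that the lock is acquired only when the expression succeeds, and in normal builds it expands to just its second argument, leaving lock_timer() a plain GNU statement-expression wrapper around __lock_timer(). A reduced, runnable version of that shape; the timer table and "locking" here are toy stand-ins:

#include <stdio.h>

/* Outside sparse (__CHECKER__), the kernel defines __cond_lock(x, c)
 * as just (c); the lock expression only matters to the checker. */
#define __cond_lock(x, c) (c)

struct k_itimer { int it_lock; };

static struct k_itimer timer_table[4];

static struct k_itimer *__lock_timer(int timer_id)
{
    if (timer_id < 0 || timer_id >= 4)
        return NULL;
    timer_table[timer_id].it_lock = 1;      /* "acquired" on success */
    return &timer_table[timer_id];
}

/* Same GNU statement-expression shape as the lock_timer() macro above */
#define lock_timer(tid)                                             \
({  struct k_itimer *__timr;                                        \
    __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid));      \
    __timr;                                                         \
})

int main(void)
{
    struct k_itimer *t = lock_timer(2);

    printf("locked=%d\n", t ? t->it_lock : -1);
    return 0;
}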
diff --git a/kernel/printk.c b/kernel/printk.c
index a23315dc4498..ab3ffc5b3b64 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1074,17 +1074,17 @@ static DEFINE_PER_CPU(int, printk_pending);
1074 1074
1075void printk_tick(void) 1075void printk_tick(void)
1076{ 1076{
1077 if (__get_cpu_var(printk_pending)) { 1077 if (__this_cpu_read(printk_pending)) {
1078 __get_cpu_var(printk_pending) = 0; 1078 __this_cpu_write(printk_pending, 0);
1079 wake_up_interruptible(&log_wait); 1079 wake_up_interruptible(&log_wait);
1080 } 1080 }
1081} 1081}
1082 1082
1083int printk_needs_cpu(int cpu) 1083int printk_needs_cpu(int cpu)
1084{ 1084{
1085 if (unlikely(cpu_is_offline(cpu))) 1085 if (cpu_is_offline(cpu))
1086 printk_tick(); 1086 printk_tick();
1087 return per_cpu(printk_pending, cpu); 1087 return __this_cpu_read(printk_pending);
1088} 1088}
1089 1089
1090void wake_up_klogd(void) 1090void wake_up_klogd(void)
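
__this_cpu_read()/__this_cpu_write() let x86 fold the find-this-CPU's-copy step into a single %gs-relative instruction, instead of computing the per-cpu address first as __get_cpu_var() does. There is no per-cpu data in user space, but thread-local storage is the nearest analog of the pattern the hunk relies on (build with -pthread):

#include <pthread.h>
#include <stdio.h>

/* __thread gives each thread a private copy the way per-cpu data
 * gives each CPU one; the accessors in the hunk fold the
 * "find my copy" step into a single instruction. */
static __thread int printk_pending;

static void *tick(void *arg)
{
    printk_pending = 1;             /* ~ __this_cpu_write(..., 1) */
    if (printk_pending) {           /* ~ __this_cpu_read(...) */
        printk_pending = 0;
        printf("thread %ld: flushed\n", (long)arg);
    }
    return NULL;
}

int main(void)
{
    pthread_t a, b;

    pthread_create(&a, NULL, tick, (void *)1L);
    pthread_create(&b, NULL, tick, (void *)2L);
    pthread_join(a, NULL);
    pthread_join(b, NULL);
    return 0;
}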
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index d806735342ac..034493724749 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -36,31 +36,16 @@
36#include <linux/time.h> 36#include <linux/time.h>
37#include <linux/cpu.h> 37#include <linux/cpu.h>
38 38
39/* Global control variables for rcupdate callback mechanism. */ 39/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
40struct rcu_ctrlblk { 40static struct task_struct *rcu_kthread_task;
41 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ 41static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
42 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ 42static unsigned long have_rcu_kthread_work;
43 struct rcu_head **curtail; /* ->next pointer of last CB. */ 43static void invoke_rcu_kthread(void);
44};
45
46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_sched_ctrlblk = {
48 .donetail = &rcu_sched_ctrlblk.rcucblist,
49 .curtail = &rcu_sched_ctrlblk.rcucblist,
50};
51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
53 .donetail = &rcu_bh_ctrlblk.rcucblist,
54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55};
56
57#ifdef CONFIG_DEBUG_LOCK_ALLOC
58int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61 44
62/* Forward declarations for rcutiny_plugin.h. */ 45/* Forward declarations for rcutiny_plugin.h. */
63static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); 46struct rcu_ctrlblk;
47static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
48static int rcu_kthread(void *arg);
64static void __call_rcu(struct rcu_head *head, 49static void __call_rcu(struct rcu_head *head,
65 void (*func)(struct rcu_head *rcu), 50 void (*func)(struct rcu_head *rcu),
66 struct rcu_ctrlblk *rcp); 51 struct rcu_ctrlblk *rcp);
@@ -123,7 +108,7 @@ void rcu_sched_qs(int cpu)
123{ 108{
124 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 109 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
125 rcu_qsctr_help(&rcu_bh_ctrlblk)) 110 rcu_qsctr_help(&rcu_bh_ctrlblk))
126 raise_softirq(RCU_SOFTIRQ); 111 invoke_rcu_kthread();
127} 112}
128 113
129/* 114/*
@@ -132,7 +117,7 @@ void rcu_sched_qs(int cpu)
132void rcu_bh_qs(int cpu) 117void rcu_bh_qs(int cpu)
133{ 118{
134 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 119 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
135 raise_softirq(RCU_SOFTIRQ); 120 invoke_rcu_kthread();
136} 121}
137 122
138/* 123/*
@@ -152,13 +137,14 @@ void rcu_check_callbacks(int cpu, int user)
152} 137}
153 138
154/* 139/*
 155	 * Helper function for rcu_process_callbacks() that operates on the		 140	 * Invoke the RCU callbacks on the specified rcu_ctrlblk structure
 156	 * specified rcu_ctrlblk structure.		 141	 * whose grace period has elapsed.
157 */ 142 */
158static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) 143static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
159{ 144{
160 struct rcu_head *next, *list; 145 struct rcu_head *next, *list;
161 unsigned long flags; 146 unsigned long flags;
147 RCU_TRACE(int cb_count = 0);
162 148
163 /* If no RCU callbacks ready to invoke, just return. */ 149 /* If no RCU callbacks ready to invoke, just return. */
164 if (&rcp->rcucblist == rcp->donetail) 150 if (&rcp->rcucblist == rcp->donetail)
@@ -180,19 +166,58 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
180 next = list->next; 166 next = list->next;
181 prefetch(next); 167 prefetch(next);
182 debug_rcu_head_unqueue(list); 168 debug_rcu_head_unqueue(list);
169 local_bh_disable();
183 list->func(list); 170 list->func(list);
171 local_bh_enable();
184 list = next; 172 list = next;
173 RCU_TRACE(cb_count++);
185 } 174 }
175 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
186} 176}
187 177
188/* 178/*
189 * Invoke any callbacks whose grace period has completed. 179 * This kthread invokes RCU callbacks whose grace periods have
180 * elapsed. It is awakened as needed, and takes the place of the
181 * RCU_SOFTIRQ that was used previously for this purpose.
182 * This is a kthread, but it is never stopped, at least not until
183 * the system goes down.
190 */ 184 */
191static void rcu_process_callbacks(struct softirq_action *unused) 185static int rcu_kthread(void *arg)
192{ 186{
193 __rcu_process_callbacks(&rcu_sched_ctrlblk); 187 unsigned long work;
194 __rcu_process_callbacks(&rcu_bh_ctrlblk); 188 unsigned long morework;
195 rcu_preempt_process_callbacks(); 189 unsigned long flags;
190
191 for (;;) {
192 wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0);
193 morework = rcu_boost();
194 local_irq_save(flags);
195 work = have_rcu_kthread_work;
196 have_rcu_kthread_work = morework;
197 local_irq_restore(flags);
198 if (work) {
199 rcu_process_callbacks(&rcu_sched_ctrlblk);
200 rcu_process_callbacks(&rcu_bh_ctrlblk);
201 rcu_preempt_process_callbacks();
202 }
203 schedule_timeout_interruptible(1); /* Leave CPU for others. */
204 }
205
206 return 0; /* Not reached, but needed to shut gcc up. */
207}
208
209/*
210 * Wake up rcu_kthread() to process callbacks now eligible for invocation
211 * or to boost readers.
212 */
213static void invoke_rcu_kthread(void)
214{
215 unsigned long flags;
216
217 local_irq_save(flags);
218 have_rcu_kthread_work = 1;
219 wake_up(&rcu_kthread_wq);
220 local_irq_restore(flags);
196} 221}
197 222
198/* 223/*
@@ -230,6 +255,7 @@ static void __call_rcu(struct rcu_head *head,
230 local_irq_save(flags); 255 local_irq_save(flags);
231 *rcp->curtail = head; 256 *rcp->curtail = head;
232 rcp->curtail = &head->next; 257 rcp->curtail = &head->next;
258 RCU_TRACE(rcp->qlen++);
233 local_irq_restore(flags); 259 local_irq_restore(flags);
234} 260}
235 261
@@ -282,7 +308,16 @@ void rcu_barrier_sched(void)
282} 308}
283EXPORT_SYMBOL_GPL(rcu_barrier_sched); 309EXPORT_SYMBOL_GPL(rcu_barrier_sched);
284 310
285void __init rcu_init(void) 311/*
312 * Spawn the kthread that invokes RCU callbacks.
313 */
314static int __init rcu_spawn_kthreads(void)
286{ 315{
287 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 316 struct sched_param sp;
317
318 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
319 sp.sched_priority = RCU_BOOST_PRIO;
320 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
321 return 0;
288} 322}
323early_initcall(rcu_spawn_kthreads);
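
The new rcu_kthread() is the classic flag-plus-waitqueue handshake: producers set have_rcu_kthread_work and wake the queue, and the kthread sleeps in wait_event() until the flag goes up. A pthread condition-variable analog of the same handshake, simplified to a single wakeup:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wq   = PTHREAD_COND_INITIALIZER;
static int have_work;               /* ~ have_rcu_kthread_work */

/* ~ invoke_rcu_kthread(): flag the work, then wake the worker */
static void invoke_worker(void)
{
    pthread_mutex_lock(&lock);
    have_work = 1;
    pthread_cond_signal(&wq);
    pthread_mutex_unlock(&lock);
}

/* ~ rcu_kthread(): sleep until work is flagged, then process it */
static void *worker(void *unused)
{
    (void)unused;
    pthread_mutex_lock(&lock);
    while (!have_work)              /* ~ wait_event(...) */
        pthread_cond_wait(&wq, &lock);
    have_work = 0;
    pthread_mutex_unlock(&lock);
    printf("callbacks processed\n"); /* ~ rcu_process_callbacks() */
    return NULL;                     /* the kernel version loops forever */
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, worker, NULL);
    invoke_worker();
    pthread_join(t, NULL);
    return 0;
}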
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 6ceca4f745ff..015abaea962a 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -22,6 +22,40 @@
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */ 23 */
24 24
25#include <linux/kthread.h>
26#include <linux/debugfs.h>
27#include <linux/seq_file.h>
28
29#ifdef CONFIG_RCU_TRACE
30#define RCU_TRACE(stmt) stmt
31#else /* #ifdef CONFIG_RCU_TRACE */
32#define RCU_TRACE(stmt)
33#endif /* #else #ifdef CONFIG_RCU_TRACE */
34
35/* Global control variables for rcupdate callback mechanism. */
36struct rcu_ctrlblk {
37 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
38 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
39 struct rcu_head **curtail; /* ->next pointer of last CB. */
40 RCU_TRACE(long qlen); /* Number of pending CBs. */
41};
42
43/* Definition for rcupdate control block. */
44static struct rcu_ctrlblk rcu_sched_ctrlblk = {
45 .donetail = &rcu_sched_ctrlblk.rcucblist,
46 .curtail = &rcu_sched_ctrlblk.rcucblist,
47};
48
49static struct rcu_ctrlblk rcu_bh_ctrlblk = {
50 .donetail = &rcu_bh_ctrlblk.rcucblist,
51 .curtail = &rcu_bh_ctrlblk.rcucblist,
52};
53
54#ifdef CONFIG_DEBUG_LOCK_ALLOC
55int rcu_scheduler_active __read_mostly;
56EXPORT_SYMBOL_GPL(rcu_scheduler_active);
57#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
58
25#ifdef CONFIG_TINY_PREEMPT_RCU 59#ifdef CONFIG_TINY_PREEMPT_RCU
26 60
27#include <linux/delay.h> 61#include <linux/delay.h>
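
RCU_TRACE(), defined in the hunk above, carries the whole tracing story: statements, and even struct fields, wrapped in it exist only when CONFIG_RCU_TRACE is set, so the qlen accounting costs nothing in configured-out builds. A standalone sketch of that conditional-statistics trick; the macro and config names here are illustrative, not the kernel's:

#include <stdio.h>

#define MY_CONFIG_TRACE 1   /* flip to 0 and all accounting vanishes */

#if MY_CONFIG_TRACE
#define TRACE(stmt) stmt
#else
#define TRACE(stmt)
#endif

struct ctrlblk {
    int pending;
    TRACE(long qlen;)       /* statistics field exists only when tracing */
};

int main(void)
{
    struct ctrlblk cb = { 0 };

    cb.pending = 1;
    TRACE(cb.qlen++);       /* compiles to nothing when configured out */
    TRACE(printf("qlen=%ld\n", cb.qlen));
    return 0;
}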
@@ -46,17 +80,45 @@ struct rcu_preempt_ctrlblk {
46 struct list_head *gp_tasks; 80 struct list_head *gp_tasks;
47 /* Pointer to the first task blocking the */ 81 /* Pointer to the first task blocking the */
48 /* current grace period, or NULL if there */ 82 /* current grace period, or NULL if there */
49 /* is not such task. */ 83 /* is no such task. */
50 struct list_head *exp_tasks; 84 struct list_head *exp_tasks;
51 /* Pointer to first task blocking the */ 85 /* Pointer to first task blocking the */
52 /* current expedited grace period, or NULL */ 86 /* current expedited grace period, or NULL */
53 /* if there is no such task. If there */ 87 /* if there is no such task. If there */
54 /* is no current expedited grace period, */ 88 /* is no current expedited grace period, */
55 /* then there cannot be any such task. */ 89 /* then there cannot be any such task. */
90#ifdef CONFIG_RCU_BOOST
91 struct list_head *boost_tasks;
92 /* Pointer to first task that needs to be */
93 /* priority-boosted, or NULL if no priority */
94 /* boosting is needed. If there is no */
95 /* current or expedited grace period, there */
96 /* can be no such task. */
97#endif /* #ifdef CONFIG_RCU_BOOST */
56 u8 gpnum; /* Current grace period. */ 98 u8 gpnum; /* Current grace period. */
57 u8 gpcpu; /* Last grace period blocked by the CPU. */ 99 u8 gpcpu; /* Last grace period blocked by the CPU. */
58 u8 completed; /* Last grace period completed. */ 100 u8 completed; /* Last grace period completed. */
59 /* If all three are equal, RCU is idle. */ 101 /* If all three are equal, RCU is idle. */
102#ifdef CONFIG_RCU_BOOST
103 s8 boosted_this_gp; /* Has boosting already happened? */
104 unsigned long boost_time; /* When to start boosting (jiffies) */
105#endif /* #ifdef CONFIG_RCU_BOOST */
106#ifdef CONFIG_RCU_TRACE
107 unsigned long n_grace_periods;
108#ifdef CONFIG_RCU_BOOST
109 unsigned long n_tasks_boosted;
110 unsigned long n_exp_boosts;
111 unsigned long n_normal_boosts;
112 unsigned long n_normal_balk_blkd_tasks;
113 unsigned long n_normal_balk_gp_tasks;
114 unsigned long n_normal_balk_boost_tasks;
115 unsigned long n_normal_balk_boosted;
116 unsigned long n_normal_balk_notyet;
117 unsigned long n_normal_balk_nos;
118 unsigned long n_exp_balk_blkd_tasks;
119 unsigned long n_exp_balk_nos;
120#endif /* #ifdef CONFIG_RCU_BOOST */
121#endif /* #ifdef CONFIG_RCU_TRACE */
60}; 122};
61 123
62static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { 124static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
@@ -122,6 +184,210 @@ static int rcu_preempt_gp_in_progress(void)
122} 184}
123 185
124/* 186/*
 187 * Advance a ->blkd_tasks-list pointer to the next entry, returning
 188 * NULL instead if at the end of the list.
189 */
190static struct list_head *rcu_next_node_entry(struct task_struct *t)
191{
192 struct list_head *np;
193
194 np = t->rcu_node_entry.next;
195 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
196 np = NULL;
197 return np;
198}
199
200#ifdef CONFIG_RCU_TRACE
201
202#ifdef CONFIG_RCU_BOOST
203static void rcu_initiate_boost_trace(void);
204static void rcu_initiate_exp_boost_trace(void);
205#endif /* #ifdef CONFIG_RCU_BOOST */
206
207/*
 208 * Dump additional statistics for TINY_PREEMPT_RCU.
209 */
210static void show_tiny_preempt_stats(struct seq_file *m)
211{
212 seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
213 rcu_preempt_ctrlblk.rcb.qlen,
214 rcu_preempt_ctrlblk.n_grace_periods,
215 rcu_preempt_ctrlblk.gpnum,
216 rcu_preempt_ctrlblk.gpcpu,
217 rcu_preempt_ctrlblk.completed,
218 "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
219 "N."[!rcu_preempt_ctrlblk.gp_tasks],
220 "E."[!rcu_preempt_ctrlblk.exp_tasks]);
221#ifdef CONFIG_RCU_BOOST
222 seq_printf(m, " ttb=%c btg=",
223 "B."[!rcu_preempt_ctrlblk.boost_tasks]);
224 switch (rcu_preempt_ctrlblk.boosted_this_gp) {
225 case -1:
226 seq_puts(m, "exp");
227 break;
228 case 0:
229 seq_puts(m, "no");
230 break;
231 case 1:
232 seq_puts(m, "begun");
233 break;
234 case 2:
235 seq_puts(m, "done");
236 break;
237 default:
238 seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
239 }
240 seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
241 rcu_preempt_ctrlblk.n_tasks_boosted,
242 rcu_preempt_ctrlblk.n_exp_boosts,
243 rcu_preempt_ctrlblk.n_normal_boosts,
244 (int)(jiffies & 0xffff),
245 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
246 seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n",
247 "normal balk",
248 rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks,
249 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks,
250 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks,
251 rcu_preempt_ctrlblk.n_normal_balk_boosted,
252 rcu_preempt_ctrlblk.n_normal_balk_notyet,
253 rcu_preempt_ctrlblk.n_normal_balk_nos);
254 seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
255 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
256 rcu_preempt_ctrlblk.n_exp_balk_nos);
257#endif /* #ifdef CONFIG_RCU_BOOST */
258}
259
260#endif /* #ifdef CONFIG_RCU_TRACE */
261
262#ifdef CONFIG_RCU_BOOST
263
264#include "rtmutex_common.h"
265
266/*
267 * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
268 * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
269 */
270static int rcu_boost(void)
271{
272 unsigned long flags;
273 struct rt_mutex mtx;
274 struct list_head *np;
275 struct task_struct *t;
276
277 if (rcu_preempt_ctrlblk.boost_tasks == NULL)
278 return 0; /* Nothing to boost. */
279 raw_local_irq_save(flags);
280 rcu_preempt_ctrlblk.boosted_this_gp++;
281 t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct,
282 rcu_node_entry);
283 np = rcu_next_node_entry(t);
284 rt_mutex_init_proxy_locked(&mtx, t);
285 t->rcu_boost_mutex = &mtx;
286 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
287 raw_local_irq_restore(flags);
288 rt_mutex_lock(&mtx);
289 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
290 rcu_preempt_ctrlblk.boosted_this_gp++;
291 rt_mutex_unlock(&mtx);
292 return rcu_preempt_ctrlblk.boost_tasks != NULL;
293}
294
295/*
296 * Check to see if it is now time to start boosting RCU readers blocking
297 * the current grace period, and, if so, tell the rcu_kthread_task to
298 * start boosting them. If there is an expedited boost in progress,
299 * we wait for it to complete.
300 *
301 * If there are no blocked readers blocking the current grace period,
302 * return 0 to let the caller know, otherwise return 1. Note that this
303 * return value is independent of whether or not boosting was done.
304 */
305static int rcu_initiate_boost(void)
306{
307 if (!rcu_preempt_blocked_readers_cgp()) {
308 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++);
309 return 0;
310 }
311 if (rcu_preempt_ctrlblk.gp_tasks != NULL &&
312 rcu_preempt_ctrlblk.boost_tasks == NULL &&
313 rcu_preempt_ctrlblk.boosted_this_gp == 0 &&
314 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) {
315 rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks;
316 invoke_rcu_kthread();
317 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
318 } else
319 RCU_TRACE(rcu_initiate_boost_trace());
320 return 1;
321}
322
323/*
324 * Initiate boosting for an expedited grace period.
325 */
326static void rcu_initiate_expedited_boost(void)
327{
328 unsigned long flags;
329
330 raw_local_irq_save(flags);
331 if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
332 rcu_preempt_ctrlblk.boost_tasks =
333 rcu_preempt_ctrlblk.blkd_tasks.next;
334 rcu_preempt_ctrlblk.boosted_this_gp = -1;
335 invoke_rcu_kthread();
336 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
337 } else
338 RCU_TRACE(rcu_initiate_exp_boost_trace());
339 raw_local_irq_restore(flags);
340}
341
 342#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
343
344/*
345 * Do priority-boost accounting for the start of a new grace period.
346 */
347static void rcu_preempt_boost_start_gp(void)
348{
349 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
350 if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
351 rcu_preempt_ctrlblk.boosted_this_gp = 0;
352}
353
354#else /* #ifdef CONFIG_RCU_BOOST */
355
356/*
357 * If there is no RCU priority boosting, we don't boost.
358 */
359static int rcu_boost(void)
360{
361 return 0;
362}
363
364/*
365 * If there is no RCU priority boosting, we don't initiate boosting,
366 * but we do indicate whether there are blocked readers blocking the
367 * current grace period.
368 */
369static int rcu_initiate_boost(void)
370{
371 return rcu_preempt_blocked_readers_cgp();
372}
373
374/*
375 * If there is no RCU priority boosting, we don't initiate expedited boosting.
376 */
377static void rcu_initiate_expedited_boost(void)
378{
379}
380
381/*
382 * If there is no RCU priority boosting, nothing to do at grace-period start.
383 */
384static void rcu_preempt_boost_start_gp(void)
385{
386}
387
388#endif /* else #ifdef CONFIG_RCU_BOOST */
389
390/*
125 * Record a preemptible-RCU quiescent state for the specified CPU. Note 391 * Record a preemptible-RCU quiescent state for the specified CPU. Note
126 * that this just means that the task currently running on the CPU is 392 * that this just means that the task currently running on the CPU is
127 * in a quiescent state. There might be any number of tasks blocked 393 * in a quiescent state. There might be any number of tasks blocked
@@ -148,11 +414,14 @@ static void rcu_preempt_cpu_qs(void)
148 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; 414 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
149 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 415 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
150 416
417 /* If there is no GP then there is nothing more to do. */
418 if (!rcu_preempt_gp_in_progress())
419 return;
151 /* 420 /*
152 * If there is no GP, or if blocked readers are still blocking GP, 421 * Check up on boosting. If there are no readers blocking the
153 * then there is nothing more to do. 422 * current grace period, leave.
154 */ 423 */
155 if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) 424 if (rcu_initiate_boost())
156 return; 425 return;
157 426
158 /* Advance callbacks. */ 427 /* Advance callbacks. */
@@ -164,9 +433,9 @@ static void rcu_preempt_cpu_qs(void)
164 if (!rcu_preempt_blocked_readers_any()) 433 if (!rcu_preempt_blocked_readers_any())
165 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; 434 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
166 435
167 /* If there are done callbacks, make RCU_SOFTIRQ process them. */ 436 /* If there are done callbacks, cause them to be invoked. */
168 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) 437 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
169 raise_softirq(RCU_SOFTIRQ); 438 invoke_rcu_kthread();
170} 439}
171 440
172/* 441/*
@@ -178,12 +447,16 @@ static void rcu_preempt_start_gp(void)
178 447
179 /* Official start of GP. */ 448 /* Official start of GP. */
180 rcu_preempt_ctrlblk.gpnum++; 449 rcu_preempt_ctrlblk.gpnum++;
450 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
181 451
182 /* Any blocked RCU readers block new GP. */ 452 /* Any blocked RCU readers block new GP. */
183 if (rcu_preempt_blocked_readers_any()) 453 if (rcu_preempt_blocked_readers_any())
184 rcu_preempt_ctrlblk.gp_tasks = 454 rcu_preempt_ctrlblk.gp_tasks =
185 rcu_preempt_ctrlblk.blkd_tasks.next; 455 rcu_preempt_ctrlblk.blkd_tasks.next;
186 456
457 /* Set up for RCU priority boosting. */
458 rcu_preempt_boost_start_gp();
459
187 /* If there is no running reader, CPU is done with GP. */ 460 /* If there is no running reader, CPU is done with GP. */
188 if (!rcu_preempt_running_reader()) 461 if (!rcu_preempt_running_reader())
189 rcu_preempt_cpu_qs(); 462 rcu_preempt_cpu_qs();
@@ -304,14 +577,16 @@ static void rcu_read_unlock_special(struct task_struct *t)
304 */ 577 */
305 empty = !rcu_preempt_blocked_readers_cgp(); 578 empty = !rcu_preempt_blocked_readers_cgp();
306 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; 579 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
307 np = t->rcu_node_entry.next; 580 np = rcu_next_node_entry(t);
308 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
309 np = NULL;
310 list_del(&t->rcu_node_entry); 581 list_del(&t->rcu_node_entry);
311 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) 582 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
312 rcu_preempt_ctrlblk.gp_tasks = np; 583 rcu_preempt_ctrlblk.gp_tasks = np;
313 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) 584 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
314 rcu_preempt_ctrlblk.exp_tasks = np; 585 rcu_preempt_ctrlblk.exp_tasks = np;
586#ifdef CONFIG_RCU_BOOST
587 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
588 rcu_preempt_ctrlblk.boost_tasks = np;
589#endif /* #ifdef CONFIG_RCU_BOOST */
315 INIT_LIST_HEAD(&t->rcu_node_entry); 590 INIT_LIST_HEAD(&t->rcu_node_entry);
316 591
317 /* 592 /*
@@ -331,6 +606,14 @@ static void rcu_read_unlock_special(struct task_struct *t)
331 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) 606 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
332 rcu_report_exp_done(); 607 rcu_report_exp_done();
333 } 608 }
609#ifdef CONFIG_RCU_BOOST
610 /* Unboost self if was boosted. */
611 if (special & RCU_READ_UNLOCK_BOOSTED) {
612 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
613 rt_mutex_unlock(t->rcu_boost_mutex);
614 t->rcu_boost_mutex = NULL;
615 }
616#endif /* #ifdef CONFIG_RCU_BOOST */
334 local_irq_restore(flags); 617 local_irq_restore(flags);
335} 618}
336 619
@@ -374,7 +657,7 @@ static void rcu_preempt_check_callbacks(void)
374 rcu_preempt_cpu_qs(); 657 rcu_preempt_cpu_qs();
375 if (&rcu_preempt_ctrlblk.rcb.rcucblist != 658 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
376 rcu_preempt_ctrlblk.rcb.donetail) 659 rcu_preempt_ctrlblk.rcb.donetail)
377 raise_softirq(RCU_SOFTIRQ); 660 invoke_rcu_kthread();
378 if (rcu_preempt_gp_in_progress() && 661 if (rcu_preempt_gp_in_progress() &&
379 rcu_cpu_blocking_cur_gp() && 662 rcu_cpu_blocking_cur_gp() &&
380 rcu_preempt_running_reader()) 663 rcu_preempt_running_reader())
@@ -383,7 +666,7 @@ static void rcu_preempt_check_callbacks(void)
383 666
384/* 667/*
385 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to 668 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
386 * update, so this is invoked from __rcu_process_callbacks() to 669 * update, so this is invoked from rcu_process_callbacks() to
387 * handle that case. Of course, it is invoked for all flavors of 670 * handle that case. Of course, it is invoked for all flavors of
388 * RCU, but RCU callbacks can appear only on one of the lists, and 671 * RCU, but RCU callbacks can appear only on one of the lists, and
389 * neither ->nexttail nor ->donetail can possibly be NULL, so there 672 * neither ->nexttail nor ->donetail can possibly be NULL, so there
@@ -400,7 +683,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
400 */ 683 */
401static void rcu_preempt_process_callbacks(void) 684static void rcu_preempt_process_callbacks(void)
402{ 685{
403 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); 686 rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
404} 687}
405 688
406/* 689/*
@@ -417,6 +700,7 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
417 local_irq_save(flags); 700 local_irq_save(flags);
418 *rcu_preempt_ctrlblk.nexttail = head; 701 *rcu_preempt_ctrlblk.nexttail = head;
419 rcu_preempt_ctrlblk.nexttail = &head->next; 702 rcu_preempt_ctrlblk.nexttail = &head->next;
703 RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
420 rcu_preempt_start_gp(); /* checks to see if GP needed. */ 704 rcu_preempt_start_gp(); /* checks to see if GP needed. */
421 local_irq_restore(flags); 705 local_irq_restore(flags);
422} 706}
@@ -532,6 +816,7 @@ void synchronize_rcu_expedited(void)
532 816
533 /* Wait for tail of ->blkd_tasks list to drain. */ 817 /* Wait for tail of ->blkd_tasks list to drain. */
534 if (rcu_preempted_readers_exp()) 818 if (rcu_preempted_readers_exp())
819 rcu_initiate_expedited_boost();
535 wait_event(sync_rcu_preempt_exp_wq, 820 wait_event(sync_rcu_preempt_exp_wq,
536 !rcu_preempted_readers_exp()); 821 !rcu_preempted_readers_exp());
537 822
@@ -572,6 +857,27 @@ void exit_rcu(void)
572 857
573#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 858#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
574 859
860#ifdef CONFIG_RCU_TRACE
861
862/*
863 * Because preemptible RCU does not exist, it is not necessary to
864 * dump out its statistics.
865 */
866static void show_tiny_preempt_stats(struct seq_file *m)
867{
868}
869
870#endif /* #ifdef CONFIG_RCU_TRACE */
871
872/*
873 * Because preemptible RCU does not exist, it is never necessary to
874 * boost preempted RCU readers.
875 */
876static int rcu_boost(void)
877{
878 return 0;
879}
880
575/* 881/*
576 * Because preemptible RCU does not exist, it never has any callbacks 882 * Because preemptible RCU does not exist, it never has any callbacks
577 * to check. 883 * to check.
@@ -599,17 +905,116 @@ static void rcu_preempt_process_callbacks(void)
599#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ 905#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
600 906
601#ifdef CONFIG_DEBUG_LOCK_ALLOC 907#ifdef CONFIG_DEBUG_LOCK_ALLOC
602
603#include <linux/kernel_stat.h> 908#include <linux/kernel_stat.h>
604 909
605/* 910/*
606 * During boot, we forgive RCU lockdep issues. After this function is 911 * During boot, we forgive RCU lockdep issues. After this function is
607 * invoked, we start taking RCU lockdep issues seriously. 912 * invoked, we start taking RCU lockdep issues seriously.
608 */ 913 */
609void rcu_scheduler_starting(void) 914void __init rcu_scheduler_starting(void)
610{ 915{
611 WARN_ON(nr_context_switches() > 0); 916 WARN_ON(nr_context_switches() > 0);
612 rcu_scheduler_active = 1; 917 rcu_scheduler_active = 1;
613} 918}
614 919
615#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 920#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
921
922#ifdef CONFIG_RCU_BOOST
923#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
924#else /* #ifdef CONFIG_RCU_BOOST */
925#define RCU_BOOST_PRIO 1
926#endif /* #else #ifdef CONFIG_RCU_BOOST */
927
928#ifdef CONFIG_RCU_TRACE
929
930#ifdef CONFIG_RCU_BOOST
931
932static void rcu_initiate_boost_trace(void)
933{
934 if (rcu_preempt_ctrlblk.gp_tasks == NULL)
935 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++;
936 else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
937 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++;
938 else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
939 rcu_preempt_ctrlblk.n_normal_balk_boosted++;
940 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
941 rcu_preempt_ctrlblk.n_normal_balk_notyet++;
942 else
943 rcu_preempt_ctrlblk.n_normal_balk_nos++;
944}
945
946static void rcu_initiate_exp_boost_trace(void)
947{
948 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
949 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
950 else
951 rcu_preempt_ctrlblk.n_exp_balk_nos++;
952}
953
954#endif /* #ifdef CONFIG_RCU_BOOST */
955
956static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
957{
958 unsigned long flags;
959
960 raw_local_irq_save(flags);
961 rcp->qlen -= n;
962 raw_local_irq_restore(flags);
963}
964
965/*
966 * Dump statistics for TINY_RCU, such as they are.
967 */
968static int show_tiny_stats(struct seq_file *m, void *unused)
969{
970 show_tiny_preempt_stats(m);
971 seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
972 seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
973 return 0;
974}
975
976static int show_tiny_stats_open(struct inode *inode, struct file *file)
977{
978 return single_open(file, show_tiny_stats, NULL);
979}
980
981static const struct file_operations show_tiny_stats_fops = {
982 .owner = THIS_MODULE,
983 .open = show_tiny_stats_open,
984 .read = seq_read,
985 .llseek = seq_lseek,
986 .release = single_release,
987};
988
989static struct dentry *rcudir;
990
991static int __init rcutiny_trace_init(void)
992{
993 struct dentry *retval;
994
995 rcudir = debugfs_create_dir("rcu", NULL);
996 if (!rcudir)
997 goto free_out;
998 retval = debugfs_create_file("rcudata", 0444, rcudir,
999 NULL, &show_tiny_stats_fops);
1000 if (!retval)
1001 goto free_out;
1002 return 0;
1003free_out:
1004 debugfs_remove_recursive(rcudir);
1005 return 1;
1006}
1007
1008static void __exit rcutiny_trace_cleanup(void)
1009{
1010 debugfs_remove_recursive(rcudir);
1011}
1012
1013module_init(rcutiny_trace_init);
1014module_exit(rcutiny_trace_cleanup);
1015
1016MODULE_AUTHOR("Paul E. McKenney");
1017MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
1018MODULE_LICENSE("GPL");
1019
1020#endif /* #ifdef CONFIG_RCU_TRACE */
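With this module loaded on a CONFIG_RCU_TRACE kernel, the counters surface in debugfs; a usage sketch, with output shape following the seq_printf() calls above and illustrative values:

/*
 * # mount -t debugfs none /sys/kernel/debug    (if not already mounted)
 * # cat /sys/kernel/debug/rcu/rcudata
 * rcu_sched: qlen: 0
 * rcu_bh: qlen: 0
 */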
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9d8e8fb2515f..89613f97ff26 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -47,6 +47,7 @@
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <asm/byteorder.h> 49#include <asm/byteorder.h>
50#include <linux/sched.h>
50 51
51MODULE_LICENSE("GPL"); 52MODULE_LICENSE("GPL");
52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
@@ -64,6 +65,9 @@ static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ 65static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */ 66static int fqs_holdoff = 0; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 67static int fqs_stutter = 3; /* Wait time between bursts (s). */
68static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
69static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
70static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
67static char *torture_type = "rcu"; /* What RCU implementation to torture. */ 71static char *torture_type = "rcu"; /* What RCU implementation to torture. */
68 72
69module_param(nreaders, int, 0444); 73module_param(nreaders, int, 0444);
@@ -88,6 +92,12 @@ module_param(fqs_holdoff, int, 0444);
88MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 92MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
89module_param(fqs_stutter, int, 0444); 93module_param(fqs_stutter, int, 0444);
90MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 94MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
95module_param(test_boost, int, 0444);
96MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
97module_param(test_boost_interval, int, 0444);
98MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
99module_param(test_boost_duration, int, 0444);
100MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
91module_param(torture_type, charp, 0444); 101module_param(torture_type, charp, 0444);
92MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 102MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
93 103
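Like the fqs_* knobs, the three new parameters are 0444 module parameters set at load time; for example, with illustrative values:

/*
 * # modprobe rcutorture test_boost=2 test_boost_interval=7 \
 *            test_boost_duration=4
 * test_boost=2 forces boost testing even if the flavor under test
 * does not advertise ->can_boost.
 */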
@@ -109,6 +119,7 @@ static struct task_struct *stats_task;
109static struct task_struct *shuffler_task; 119static struct task_struct *shuffler_task;
110static struct task_struct *stutter_task; 120static struct task_struct *stutter_task;
111static struct task_struct *fqs_task; 121static struct task_struct *fqs_task;
122static struct task_struct *boost_tasks[NR_CPUS];
112 123
113#define RCU_TORTURE_PIPE_LEN 10 124#define RCU_TORTURE_PIPE_LEN 10
114 125
@@ -134,6 +145,12 @@ static atomic_t n_rcu_torture_alloc_fail;
134static atomic_t n_rcu_torture_free; 145static atomic_t n_rcu_torture_free;
135static atomic_t n_rcu_torture_mberror; 146static atomic_t n_rcu_torture_mberror;
136static atomic_t n_rcu_torture_error; 147static atomic_t n_rcu_torture_error;
148static long n_rcu_torture_boost_ktrerror;
149static long n_rcu_torture_boost_rterror;
150static long n_rcu_torture_boost_allocerror;
151static long n_rcu_torture_boost_afferror;
152static long n_rcu_torture_boost_failure;
153static long n_rcu_torture_boosts;
137static long n_rcu_torture_timers; 154static long n_rcu_torture_timers;
138static struct list_head rcu_torture_removed; 155static struct list_head rcu_torture_removed;
139static cpumask_var_t shuffle_tmp_mask; 156static cpumask_var_t shuffle_tmp_mask;
@@ -147,6 +164,16 @@ static int stutter_pause_test;
147#endif 164#endif
148int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 165int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
149 166
167#ifdef CONFIG_RCU_BOOST
168#define rcu_can_boost() 1
169#else /* #ifdef CONFIG_RCU_BOOST */
170#define rcu_can_boost() 0
171#endif /* #else #ifdef CONFIG_RCU_BOOST */
172
173static unsigned long boost_starttime; /* jiffies of next boost test start. */
174DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
175 /* and boost task create/destroy. */
176
150/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ 177/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
151 178
152#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ 179#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
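rcu_can_boost() evaluates to a compile-time constant, and every boost-test path in this patch is gated on the same combined test; in sketch form:

/* Boost testing runs iff the user forced it (test_boost == 2) or
 * requested it and the flavor under test supports boosting: */
if ((test_boost == 1 && cur_ops->can_boost) || test_boost == 2) {
        /* ... register CPU notifier, start booster kthreads ... */
}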
@@ -277,6 +304,7 @@ struct rcu_torture_ops {
277 void (*fqs)(void); 304 void (*fqs)(void);
278 int (*stats)(char *page); 305 int (*stats)(char *page);
279 int irq_capable; 306 int irq_capable;
307 int can_boost;
280 char *name; 308 char *name;
281}; 309};
282 310
@@ -366,6 +394,7 @@ static struct rcu_torture_ops rcu_ops = {
366 .fqs = rcu_force_quiescent_state, 394 .fqs = rcu_force_quiescent_state,
367 .stats = NULL, 395 .stats = NULL,
368 .irq_capable = 1, 396 .irq_capable = 1,
397 .can_boost = rcu_can_boost(),
369 .name = "rcu" 398 .name = "rcu"
370}; 399};
371 400
@@ -408,6 +437,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
408 .fqs = rcu_force_quiescent_state, 437 .fqs = rcu_force_quiescent_state,
409 .stats = NULL, 438 .stats = NULL,
410 .irq_capable = 1, 439 .irq_capable = 1,
440 .can_boost = rcu_can_boost(),
411 .name = "rcu_sync" 441 .name = "rcu_sync"
412}; 442};
413 443
@@ -424,6 +454,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
424 .fqs = rcu_force_quiescent_state, 454 .fqs = rcu_force_quiescent_state,
425 .stats = NULL, 455 .stats = NULL,
426 .irq_capable = 1, 456 .irq_capable = 1,
457 .can_boost = rcu_can_boost(),
427 .name = "rcu_expedited" 458 .name = "rcu_expedited"
428}; 459};
429 460
@@ -684,6 +715,110 @@ static struct rcu_torture_ops sched_expedited_ops = {
684}; 715};
685 716
686/* 717/*
718 * RCU torture priority-boost testing. Runs one real-time thread per
719 * CPU for moderate bursts, repeatedly registering RCU callbacks and
720 * spinning waiting for them to be invoked. If a given callback takes
721 * too long to be invoked, we assume that priority inversion has occurred.
722 */
723
724struct rcu_boost_inflight {
725 struct rcu_head rcu;
726 int inflight;
727};
728
729static void rcu_torture_boost_cb(struct rcu_head *head)
730{
731 struct rcu_boost_inflight *rbip =
732 container_of(head, struct rcu_boost_inflight, rcu);
733
734 smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
735 rbip->inflight = 0;
736}
737
738static int rcu_torture_boost(void *arg)
739{
740 unsigned long call_rcu_time;
741 unsigned long endtime;
742 unsigned long oldstarttime;
743 struct rcu_boost_inflight rbi = { .inflight = 0 };
744 struct sched_param sp;
745
746 VERBOSE_PRINTK_STRING("rcu_torture_boost started");
747
748 /* Set real-time priority. */
749 sp.sched_priority = 1;
750 if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
751 VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!");
752 n_rcu_torture_boost_rterror++;
753 }
754
755 /* Each pass through the following loop does one boost-test cycle. */
756 do {
757 /* Wait for the next test interval. */
758 oldstarttime = boost_starttime;
759 while (jiffies - oldstarttime > ULONG_MAX / 2) {
760 schedule_timeout_uninterruptible(1);
761 rcu_stutter_wait("rcu_torture_boost");
762 if (kthread_should_stop() ||
763 fullstop != FULLSTOP_DONTSTOP)
764 goto checkwait;
765 }
766
767 /* Do one boost-test interval. */
768 endtime = oldstarttime + test_boost_duration * HZ;
769 call_rcu_time = jiffies;
770 while (jiffies - endtime > ULONG_MAX / 2) {
771 /* If we don't have a callback in flight, post one. */
772 if (!rbi.inflight) {
773 smp_mb(); /* RCU core before ->inflight = 1. */
774 rbi.inflight = 1;
775 call_rcu(&rbi.rcu, rcu_torture_boost_cb);
776 if (jiffies - call_rcu_time >
777 test_boost_duration * HZ - HZ / 2) {
778 VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed");
779 n_rcu_torture_boost_failure++;
780 }
781 call_rcu_time = jiffies;
782 }
783 cond_resched();
784 rcu_stutter_wait("rcu_torture_boost");
785 if (kthread_should_stop() ||
786 fullstop != FULLSTOP_DONTSTOP)
787 goto checkwait;
788 }
789
790 /*
791 * Set the start time of the next test interval.
792 * Yes, this is vulnerable to long delays, but such
793 * delays simply cause a false negative for the next
794 * interval. Besides, we are running at RT priority,
795 * so delays should be relatively rare.
796 */
797 while (oldstarttime == boost_starttime) {
798 if (mutex_trylock(&boost_mutex)) {
799 boost_starttime = jiffies +
800 test_boost_interval * HZ;
801 n_rcu_torture_boosts++;
802 mutex_unlock(&boost_mutex);
803 break;
804 }
805 schedule_timeout_uninterruptible(1);
806 }
807
808 /* Go do the stutter. */
809checkwait: rcu_stutter_wait("rcu_torture_boost");
810 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
811
812 /* Clean up and exit. */
813 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
814 rcutorture_shutdown_absorb("rcu_torture_boost");
815 while (!kthread_should_stop() || rbi.inflight)
816 schedule_timeout_uninterruptible(1);
817 smp_mb(); /* order accesses to ->inflight before stack-frame death. */
818 return 0;
819}
820
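The ->inflight flag is a one-deep completion protocol between the booster and its callback; the barrier pairing in isolation:

/*
 * Booster (above):                     Callback (rcu_torture_boost_cb()):
 *      smp_mb();       [A]                     smp_mb();       [B]
 *      rbi.inflight = 1;                       rbip->inflight = 0;
 *      call_rcu(&rbi.rcu, ...);
 *
 * [A] orders the booster's prior accesses before re-arming the callback;
 * [B] orders RCU-core accesses before the clearing store, so a booster
 * observing !rbi.inflight also observes the callback's completed work.
 */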
821/*
687 * RCU torture force-quiescent-state kthread. Repeatedly induces 822 * RCU torture force-quiescent-state kthread. Repeatedly induces
688 * bursts of calls to force_quiescent_state(), increasing the probability 823 * bursts of calls to force_quiescent_state(), increasing the probability
689 * of occurrence of some important types of race conditions. 824 * of occurrence of some important types of race conditions.
@@ -933,7 +1068,8 @@ rcu_torture_printk(char *page)
933 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1068 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
934 cnt += sprintf(&page[cnt], 1069 cnt += sprintf(&page[cnt],
935 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 1070 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
936 "rtmbe: %d nt: %ld", 1071 "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld "
1072 "rtbf: %ld rtb: %ld nt: %ld",
937 rcu_torture_current, 1073 rcu_torture_current,
938 rcu_torture_current_version, 1074 rcu_torture_current_version,
939 list_empty(&rcu_torture_freelist), 1075 list_empty(&rcu_torture_freelist),
@@ -941,8 +1077,19 @@ rcu_torture_printk(char *page)
941 atomic_read(&n_rcu_torture_alloc_fail), 1077 atomic_read(&n_rcu_torture_alloc_fail),
942 atomic_read(&n_rcu_torture_free), 1078 atomic_read(&n_rcu_torture_free),
943 atomic_read(&n_rcu_torture_mberror), 1079 atomic_read(&n_rcu_torture_mberror),
1080 n_rcu_torture_boost_ktrerror,
1081 n_rcu_torture_boost_rterror,
1082 n_rcu_torture_boost_allocerror,
1083 n_rcu_torture_boost_afferror,
1084 n_rcu_torture_boost_failure,
1085 n_rcu_torture_boosts,
944 n_rcu_torture_timers); 1086 n_rcu_torture_timers);
945 if (atomic_read(&n_rcu_torture_mberror) != 0) 1087 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1088 n_rcu_torture_boost_ktrerror != 0 ||
1089 n_rcu_torture_boost_rterror != 0 ||
1090 n_rcu_torture_boost_allocerror != 0 ||
1091 n_rcu_torture_boost_afferror != 0 ||
1092 n_rcu_torture_boost_failure != 0)
946 cnt += sprintf(&page[cnt], " !!!"); 1093 cnt += sprintf(&page[cnt], " !!!");
947 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1094 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
948 if (i > 1) { 1095 if (i > 1) {
@@ -1094,22 +1241,91 @@ rcu_torture_stutter(void *arg)
1094} 1241}
1095 1242
1096static inline void 1243static inline void
1097rcu_torture_print_module_parms(char *tag) 1244rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1098{ 1245{
1099 printk(KERN_ALERT "%s" TORTURE_FLAG 1246 printk(KERN_ALERT "%s" TORTURE_FLAG
1100 "--- %s: nreaders=%d nfakewriters=%d " 1247 "--- %s: nreaders=%d nfakewriters=%d "
1101 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1248 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1102 "shuffle_interval=%d stutter=%d irqreader=%d " 1249 "shuffle_interval=%d stutter=%d irqreader=%d "
1103 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", 1250 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1251 "test_boost=%d/%d test_boost_interval=%d "
1252 "test_boost_duration=%d\n",
1104 torture_type, tag, nrealreaders, nfakewriters, 1253 torture_type, tag, nrealreaders, nfakewriters,
1105 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1254 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1106 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); 1255 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1256 test_boost, cur_ops->can_boost,
1257 test_boost_interval, test_boost_duration);
1107} 1258}
1108 1259
1109static struct notifier_block rcutorture_nb = { 1260static struct notifier_block rcutorture_shutdown_nb = {
1110 .notifier_call = rcutorture_shutdown_notify, 1261 .notifier_call = rcutorture_shutdown_notify,
1111}; 1262};
1112 1263
1264static void rcutorture_booster_cleanup(int cpu)
1265{
1266 struct task_struct *t;
1267
1268 if (boost_tasks[cpu] == NULL)
1269 return;
1270 mutex_lock(&boost_mutex);
1271 VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
1272 t = boost_tasks[cpu];
1273 boost_tasks[cpu] = NULL;
1274 mutex_unlock(&boost_mutex);
1275
1276 /* This must be outside of the mutex, otherwise deadlock! */
1277 kthread_stop(t);
1278}
1279
1280static int rcutorture_booster_init(int cpu)
1281{
1282 int retval;
1283
1284 if (boost_tasks[cpu] != NULL)
1285 return 0; /* Already created, nothing more to do. */
1286
1287 /* Don't allow time recalculation while creating a new task. */
1288 mutex_lock(&boost_mutex);
1289 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
1290 boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
1291 "rcu_torture_boost");
1292 if (IS_ERR(boost_tasks[cpu])) {
1293 retval = PTR_ERR(boost_tasks[cpu]);
1294 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
1295 n_rcu_torture_boost_ktrerror++;
1296 boost_tasks[cpu] = NULL;
1297 mutex_unlock(&boost_mutex);
1298 return retval;
1299 }
1300 kthread_bind(boost_tasks[cpu], cpu);
1301 wake_up_process(boost_tasks[cpu]);
1302 mutex_unlock(&boost_mutex);
1303 return 0;
1304}
1305
1306static int rcutorture_cpu_notify(struct notifier_block *self,
1307 unsigned long action, void *hcpu)
1308{
1309 long cpu = (long)hcpu;
1310
1311 switch (action) {
1312 case CPU_ONLINE:
1313 case CPU_DOWN_FAILED:
1314 (void)rcutorture_booster_init(cpu);
1315 break;
1316 case CPU_DOWN_PREPARE:
1317 rcutorture_booster_cleanup(cpu);
1318 break;
1319 default:
1320 break;
1321 }
1322 return NOTIFY_OK;
1323}
1324
1325static struct notifier_block rcutorture_cpu_nb = {
1326 .notifier_call = rcutorture_cpu_notify,
1327};
1328
1113static void 1329static void
1114rcu_torture_cleanup(void) 1330rcu_torture_cleanup(void)
1115{ 1331{
@@ -1127,7 +1343,7 @@ rcu_torture_cleanup(void)
1127 } 1343 }
1128 fullstop = FULLSTOP_RMMOD; 1344 fullstop = FULLSTOP_RMMOD;
1129 mutex_unlock(&fullstop_mutex); 1345 mutex_unlock(&fullstop_mutex);
1130 unregister_reboot_notifier(&rcutorture_nb); 1346 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1131 if (stutter_task) { 1347 if (stutter_task) {
1132 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 1348 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
1133 kthread_stop(stutter_task); 1349 kthread_stop(stutter_task);
@@ -1184,6 +1400,12 @@ rcu_torture_cleanup(void)
1184 kthread_stop(fqs_task); 1400 kthread_stop(fqs_task);
1185 } 1401 }
1186 fqs_task = NULL; 1402 fqs_task = NULL;
1403 if ((test_boost == 1 && cur_ops->can_boost) ||
1404 test_boost == 2) {
1405 unregister_cpu_notifier(&rcutorture_cpu_nb);
1406 for_each_possible_cpu(i)
1407 rcutorture_booster_cleanup(i);
1408 }
1187 1409
1188 /* Wait for all RCU callbacks to fire. */ 1410 /* Wait for all RCU callbacks to fire. */
1189 1411
@@ -1195,9 +1417,9 @@ rcu_torture_cleanup(void)
1195 if (cur_ops->cleanup) 1417 if (cur_ops->cleanup)
1196 cur_ops->cleanup(); 1418 cur_ops->cleanup();
1197 if (atomic_read(&n_rcu_torture_error)) 1419 if (atomic_read(&n_rcu_torture_error))
1198 rcu_torture_print_module_parms("End of test: FAILURE"); 1420 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1199 else 1421 else
1200 rcu_torture_print_module_parms("End of test: SUCCESS"); 1422 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1201} 1423}
1202 1424
1203static int __init 1425static int __init
@@ -1242,7 +1464,7 @@ rcu_torture_init(void)
1242 nrealreaders = nreaders; 1464 nrealreaders = nreaders;
1243 else 1465 else
1244 nrealreaders = 2 * num_online_cpus(); 1466 nrealreaders = 2 * num_online_cpus();
1245 rcu_torture_print_module_parms("Start of test"); 1467 rcu_torture_print_module_parms(cur_ops, "Start of test");
1246 fullstop = FULLSTOP_DONTSTOP; 1468 fullstop = FULLSTOP_DONTSTOP;
1247 1469
1248 /* Set up the freelist. */ 1470 /* Set up the freelist. */
@@ -1263,6 +1485,12 @@ rcu_torture_init(void)
1263 atomic_set(&n_rcu_torture_free, 0); 1485 atomic_set(&n_rcu_torture_free, 0);
1264 atomic_set(&n_rcu_torture_mberror, 0); 1486 atomic_set(&n_rcu_torture_mberror, 0);
1265 atomic_set(&n_rcu_torture_error, 0); 1487 atomic_set(&n_rcu_torture_error, 0);
1488 n_rcu_torture_boost_ktrerror = 0;
1489 n_rcu_torture_boost_rterror = 0;
1490 n_rcu_torture_boost_allocerror = 0;
1491 n_rcu_torture_boost_afferror = 0;
1492 n_rcu_torture_boost_failure = 0;
1493 n_rcu_torture_boosts = 0;
1266 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1494 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1267 atomic_set(&rcu_torture_wcount[i], 0); 1495 atomic_set(&rcu_torture_wcount[i], 0);
1268 for_each_possible_cpu(cpu) { 1496 for_each_possible_cpu(cpu) {
@@ -1376,7 +1604,27 @@ rcu_torture_init(void)
1376 goto unwind; 1604 goto unwind;
1377 } 1605 }
1378 } 1606 }
1379 register_reboot_notifier(&rcutorture_nb); 1607 if (test_boost_interval < 1)
1608 test_boost_interval = 1;
1609 if (test_boost_duration < 2)
1610 test_boost_duration = 2;
1611 if ((test_boost == 1 && cur_ops->can_boost) ||
1612 test_boost == 2) {
1613 int retval;
1614
1615 boost_starttime = jiffies + test_boost_interval * HZ;
1616 register_cpu_notifier(&rcutorture_cpu_nb);
1617 for_each_possible_cpu(i) {
1618 if (cpu_is_offline(i))
1619 continue; /* Heuristic: CPU can go offline. */
1620 retval = rcutorture_booster_init(i);
1621 if (retval < 0) {
1622 firsterr = retval;
1623 goto unwind;
1624 }
1625 }
1626 }
1627 register_reboot_notifier(&rcutorture_shutdown_nb);
1380 mutex_unlock(&fullstop_mutex); 1628 mutex_unlock(&fullstop_mutex);
1381 return 0; 1629 return 0;
1382 1630
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ccdc04c47981..d0ddfea6579d 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -67,9 +67,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
67 .gpnum = -300, \ 67 .gpnum = -300, \
68 .completed = -300, \ 68 .completed = -300, \
69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ 69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
70 .orphan_cbs_list = NULL, \
71 .orphan_cbs_tail = &structname.orphan_cbs_list, \
72 .orphan_qlen = 0, \
73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ 70 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
74 .n_force_qs = 0, \ 71 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 72 .n_force_qs_ngp = 0, \
@@ -620,9 +617,17 @@ static void __init check_cpu_stall_init(void)
620static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) 617static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
621{ 618{
622 if (rdp->gpnum != rnp->gpnum) { 619 if (rdp->gpnum != rnp->gpnum) {
623 rdp->qs_pending = 1; 620 /*
624 rdp->passed_quiesc = 0; 621 * If the current grace period is waiting for this CPU,
622 * set up to detect a quiescent state, otherwise don't
623 * go looking for one.
624 */
625 rdp->gpnum = rnp->gpnum; 625 rdp->gpnum = rnp->gpnum;
626 if (rnp->qsmask & rdp->grpmask) {
627 rdp->qs_pending = 1;
628 rdp->passed_quiesc = 0;
629 } else
630 rdp->qs_pending = 0;
626 } 631 }
627} 632}
628 633
@@ -681,6 +686,24 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
681 686
682 /* Remember that we saw this grace-period completion. */ 687 /* Remember that we saw this grace-period completion. */
683 rdp->completed = rnp->completed; 688 rdp->completed = rnp->completed;
689
690 /*
691 * If we were in an extended quiescent state, we may have
692 * missed some grace periods that others CPUs handled on
693 * our behalf. Catch up with this state to avoid noting
694 * spurious new grace periods. If another grace period
695 * has started, then rnp->gpnum will have advanced, so
696 * we will detect this later on.
697 */
698 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed))
699 rdp->gpnum = rdp->completed;
700
701 /*
702 * If RCU does not need a quiescent state from this CPU,
703 * then make sure that this CPU doesn't go looking for one.
704 */
705 if ((rnp->qsmask & rdp->grpmask) == 0)
706 rdp->qs_pending = 0;
684 } 707 }
685} 708}
686 709
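ULONG_CMP_LT() above, like ULONG_CMP_GE() and the jiffies tests in rcutorture, compares free-running counters in a wraparound-safe way; the definitions are along these lines (the UINT_CMP_* variants used later are the unsigned-int analogues):

/* Modular comparison: ULONG_CMP_LT(a, b) is "a precedes b". */
#define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)      (ULONG_MAX / 2 < (a) - (b))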
@@ -984,53 +1007,31 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
984#ifdef CONFIG_HOTPLUG_CPU 1007#ifdef CONFIG_HOTPLUG_CPU
985 1008
986/* 1009/*
987 * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the 1010 * Move a dying CPU's RCU callbacks to an online CPU's callback list.
988 * specified flavor of RCU. The callbacks will be adopted by the next 1011 * Synchronization is not required because this function executes
989 * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever 1012 * in stop_machine() context.
990 * comes first. Because this is invoked from the CPU_DYING notifier,
991 * irqs are already disabled.
992 */ 1013 */
993static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 1014static void rcu_send_cbs_to_online(struct rcu_state *rsp)
994{ 1015{
995 int i; 1016 int i;
 1017 /* The dying CPU has already been cleared from cpu_online_mask. */
1018 int receive_cpu = cpumask_any(cpu_online_mask);
996 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1019 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1020 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
997 1021
998 if (rdp->nxtlist == NULL) 1022 if (rdp->nxtlist == NULL)
999 return; /* irqs disabled, so comparison is stable. */ 1023 return; /* irqs disabled, so comparison is stable. */
1000 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1024
1001 *rsp->orphan_cbs_tail = rdp->nxtlist; 1025 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
1002 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; 1026 receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1027 receive_rdp->qlen += rdp->qlen;
1028 receive_rdp->n_cbs_adopted += rdp->qlen;
1029 rdp->n_cbs_orphaned += rdp->qlen;
1030
1003 rdp->nxtlist = NULL; 1031 rdp->nxtlist = NULL;
1004 for (i = 0; i < RCU_NEXT_SIZE; i++) 1032 for (i = 0; i < RCU_NEXT_SIZE; i++)
1005 rdp->nxttail[i] = &rdp->nxtlist; 1033 rdp->nxttail[i] = &rdp->nxtlist;
1006 rsp->orphan_qlen += rdp->qlen;
1007 rdp->n_cbs_orphaned += rdp->qlen;
1008 rdp->qlen = 0; 1034 rdp->qlen = 0;
1009 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
1010}
1011
1012/*
1013 * Adopt previously orphaned RCU callbacks.
1014 */
1015static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1016{
1017 unsigned long flags;
1018 struct rcu_data *rdp;
1019
1020 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1021 rdp = this_cpu_ptr(rsp->rda);
1022 if (rsp->orphan_cbs_list == NULL) {
1023 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1024 return;
1025 }
1026 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
1027 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
1028 rdp->qlen += rsp->orphan_qlen;
1029 rdp->n_cbs_adopted += rsp->orphan_qlen;
1030 rsp->orphan_cbs_list = NULL;
1031 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
1032 rsp->orphan_qlen = 0;
1033 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1034} 1035}
1035 1036
1036/* 1037/*
@@ -1081,8 +1082,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1081 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1082 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1082 if (need_report & RCU_OFL_TASKS_EXP_GP) 1083 if (need_report & RCU_OFL_TASKS_EXP_GP)
1083 rcu_report_exp_rnp(rsp, rnp); 1084 rcu_report_exp_rnp(rsp, rnp);
1084
1085 rcu_adopt_orphan_cbs(rsp);
1086} 1085}
1087 1086
1088/* 1087/*
@@ -1100,11 +1099,7 @@ static void rcu_offline_cpu(int cpu)
1100 1099
1101#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1100#else /* #ifdef CONFIG_HOTPLUG_CPU */
1102 1101
1103static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 1102static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1104{
1105}
1106
1107static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1108{ 1103{
1109} 1104}
1110 1105
@@ -1440,22 +1435,11 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1440 */ 1435 */
1441 local_irq_save(flags); 1436 local_irq_save(flags);
1442 rdp = this_cpu_ptr(rsp->rda); 1437 rdp = this_cpu_ptr(rsp->rda);
1443 rcu_process_gp_end(rsp, rdp);
1444 check_for_new_grace_period(rsp, rdp);
1445 1438
1446 /* Add the callback to our list. */ 1439 /* Add the callback to our list. */
1447 *rdp->nxttail[RCU_NEXT_TAIL] = head; 1440 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1448 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1441 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1449 1442
1450 /* Start a new grace period if one not already started. */
1451 if (!rcu_gp_in_progress(rsp)) {
1452 unsigned long nestflag;
1453 struct rcu_node *rnp_root = rcu_get_root(rsp);
1454
1455 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1456 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1457 }
1458
1459 /* 1443 /*
1460 * Force the grace period if too many callbacks or too long waiting. 1444 * Force the grace period if too many callbacks or too long waiting.
1461 * Enforce hysteresis, and don't invoke force_quiescent_state() 1445 * Enforce hysteresis, and don't invoke force_quiescent_state()
@@ -1464,12 +1448,27 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1464 * is the only one waiting for a grace period to complete. 1448 * is the only one waiting for a grace period to complete.
1465 */ 1449 */
1466 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 1450 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1467 rdp->blimit = LONG_MAX; 1451
1468 if (rsp->n_force_qs == rdp->n_force_qs_snap && 1452 /* Are we ignoring a completed grace period? */
1469 *rdp->nxttail[RCU_DONE_TAIL] != head) 1453 rcu_process_gp_end(rsp, rdp);
1470 force_quiescent_state(rsp, 0); 1454 check_for_new_grace_period(rsp, rdp);
1471 rdp->n_force_qs_snap = rsp->n_force_qs; 1455
1472 rdp->qlen_last_fqs_check = rdp->qlen; 1456 /* Start a new grace period if one not already started. */
1457 if (!rcu_gp_in_progress(rsp)) {
1458 unsigned long nestflag;
1459 struct rcu_node *rnp_root = rcu_get_root(rsp);
1460
1461 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
 1462 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1463 } else {
1464 /* Give the grace period a kick. */
1465 rdp->blimit = LONG_MAX;
1466 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1467 *rdp->nxttail[RCU_DONE_TAIL] != head)
1468 force_quiescent_state(rsp, 0);
1469 rdp->n_force_qs_snap = rsp->n_force_qs;
1470 rdp->qlen_last_fqs_check = rdp->qlen;
1471 }
1473 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) 1472 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1474 force_quiescent_state(rsp, 1); 1473 force_quiescent_state(rsp, 1);
1475 local_irq_restore(flags); 1474 local_irq_restore(flags);
@@ -1699,13 +1698,12 @@ static void _rcu_barrier(struct rcu_state *rsp,
1699 * decrement rcu_barrier_cpu_count -- otherwise the first CPU 1698 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
1700 * might complete its grace period before all of the other CPUs 1699 * might complete its grace period before all of the other CPUs
1701 * did their increment, causing this function to return too 1700 * did their increment, causing this function to return too
1702 * early. 1701 * early. Note that on_each_cpu() disables irqs, which prevents
1702 * any CPUs from coming online or going offline until each online
1703 * CPU has queued its RCU-barrier callback.
1703 */ 1704 */
1704 atomic_set(&rcu_barrier_cpu_count, 1); 1705 atomic_set(&rcu_barrier_cpu_count, 1);
1705 preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
1706 rcu_adopt_orphan_cbs(rsp);
1707 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); 1706 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
1708 preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
1709 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 1707 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
1710 complete(&rcu_barrier_completion); 1708 complete(&rcu_barrier_completion);
1711 wait_for_completion(&rcu_barrier_completion); 1709 wait_for_completion(&rcu_barrier_completion);
@@ -1831,18 +1829,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1831 case CPU_DYING: 1829 case CPU_DYING:
1832 case CPU_DYING_FROZEN: 1830 case CPU_DYING_FROZEN:
1833 /* 1831 /*
1834 * preempt_disable() in _rcu_barrier() prevents stop_machine(), 1832 * The whole machine is "stopped" except this CPU, so we can
1835 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" 1833 * touch any data without introducing corruption. We send the
1836 * returns, all online cpus have queued rcu_barrier_func(). 1834 * dying CPU's callbacks to an arbitrarily chosen online CPU.
1837 * The dying CPU clears its cpu_online_mask bit and
1838 * moves all of its RCU callbacks to ->orphan_cbs_list
1839 * in the context of stop_machine(), so subsequent calls
1840 * to _rcu_barrier() will adopt these callbacks and only
1841 * then queue rcu_barrier_func() on all remaining CPUs.
1842 */ 1835 */
1843 rcu_send_cbs_to_orphanage(&rcu_bh_state); 1836 rcu_send_cbs_to_online(&rcu_bh_state);
1844 rcu_send_cbs_to_orphanage(&rcu_sched_state); 1837 rcu_send_cbs_to_online(&rcu_sched_state);
1845 rcu_preempt_send_cbs_to_orphanage(); 1838 rcu_preempt_send_cbs_to_online();
1846 break; 1839 break;
1847 case CPU_DEAD: 1840 case CPU_DEAD:
1848 case CPU_DEAD_FROZEN: 1841 case CPU_DEAD_FROZEN:
@@ -1880,8 +1873,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1880{ 1873{
1881 int i; 1874 int i;
1882 1875
1883 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) 1876 for (i = NUM_RCU_LVLS - 1; i > 0; i--)
1884 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 1877 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
1878 rsp->levelspread[0] = RCU_FANOUT_LEAF;
1885} 1879}
1886#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 1880#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
1887static void __init rcu_init_levelspread(struct rcu_state *rsp) 1881static void __init rcu_init_levelspread(struct rcu_state *rsp)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 91d4170c5c13..e8f057e44e3e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -31,46 +31,51 @@
31/* 31/*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. 32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
33 * In theory, it should be possible to add more levels straightforwardly. 33 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this has not been tested, so there is probably some 34 * In practice, this did work well going from three levels to four.
35 * bug somewhere. 35 * Of course, your mileage may vary.
36 */ 36 */
37#define MAX_RCU_LVLS 4 37#define MAX_RCU_LVLS 4
38#define RCU_FANOUT (CONFIG_RCU_FANOUT) 38#if CONFIG_RCU_FANOUT > 16
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) 39#define RCU_FANOUT_LEAF 16
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) 40#else /* #if CONFIG_RCU_FANOUT > 16 */
41#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT) 41#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
42 42#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
43#if NR_CPUS <= RCU_FANOUT 43#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
44#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
45#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
46#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
47
48#if NR_CPUS <= RCU_FANOUT_1
44# define NUM_RCU_LVLS 1 49# define NUM_RCU_LVLS 1
45# define NUM_RCU_LVL_0 1 50# define NUM_RCU_LVL_0 1
46# define NUM_RCU_LVL_1 (NR_CPUS) 51# define NUM_RCU_LVL_1 (NR_CPUS)
47# define NUM_RCU_LVL_2 0 52# define NUM_RCU_LVL_2 0
48# define NUM_RCU_LVL_3 0 53# define NUM_RCU_LVL_3 0
49# define NUM_RCU_LVL_4 0 54# define NUM_RCU_LVL_4 0
50#elif NR_CPUS <= RCU_FANOUT_SQ 55#elif NR_CPUS <= RCU_FANOUT_2
51# define NUM_RCU_LVLS 2 56# define NUM_RCU_LVLS 2
52# define NUM_RCU_LVL_0 1 57# define NUM_RCU_LVL_0 1
53# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 58# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
54# define NUM_RCU_LVL_2 (NR_CPUS) 59# define NUM_RCU_LVL_2 (NR_CPUS)
55# define NUM_RCU_LVL_3 0 60# define NUM_RCU_LVL_3 0
56# define NUM_RCU_LVL_4 0 61# define NUM_RCU_LVL_4 0
57#elif NR_CPUS <= RCU_FANOUT_CUBE 62#elif NR_CPUS <= RCU_FANOUT_3
58# define NUM_RCU_LVLS 3 63# define NUM_RCU_LVLS 3
59# define NUM_RCU_LVL_0 1 64# define NUM_RCU_LVL_0 1
60# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 65# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
61# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 66# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
62# define NUM_RCU_LVL_3 NR_CPUS 67# define NUM_RCU_LVL_3 (NR_CPUS)
63# define NUM_RCU_LVL_4 0 68# define NUM_RCU_LVL_4 0
64#elif NR_CPUS <= RCU_FANOUT_FOURTH 69#elif NR_CPUS <= RCU_FANOUT_4
65# define NUM_RCU_LVLS 4 70# define NUM_RCU_LVLS 4
66# define NUM_RCU_LVL_0 1 71# define NUM_RCU_LVL_0 1
67# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE) 72# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
68# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 73# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
69# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 74# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
70# define NUM_RCU_LVL_4 NR_CPUS 75# define NUM_RCU_LVL_4 (NR_CPUS)
71#else 76#else
72# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 77# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
73#endif /* #if (NR_CPUS) <= RCU_FANOUT */ 78#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
74 79
75#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) 80#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
76#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 81#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
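A worked example of the new geometry, assuming CONFIG_RCU_FANOUT=16 and NR_CPUS=64, so that RCU_FANOUT_LEAF=16, RCU_FANOUT_1=16, RCU_FANOUT_2=256, and the two-level branch is taken:

/*
 *      NUM_RCU_LVLS  = 2
 *      NUM_RCU_LVL_0 = 1                          root rcu_node
 *      NUM_RCU_LVL_1 = DIV_ROUND_UP(64, 16) = 4   leaf rcu_nodes
 *      NUM_RCU_LVL_2 = 64                         per-CPU rcu_data slots
 *      RCU_SUM       = 1 + 4 + 64 = 69
 *      NUM_RCU_NODES = 69 - 64 = 5                one root plus four leaves
 */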
@@ -203,8 +208,8 @@ struct rcu_data {
203 long qlen_last_fqs_check; 208 long qlen_last_fqs_check;
204 /* qlen at last check for QS forcing */ 209 /* qlen at last check for QS forcing */
205 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 210 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
206 unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */ 211 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
207 unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */ 212 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
208 unsigned long n_force_qs_snap; 213 unsigned long n_force_qs_snap;
209 /* did other CPU force QS recently? */ 214 /* did other CPU force QS recently? */
210 long blimit; /* Upper limit on a processed batch */ 215 long blimit; /* Upper limit on a processed batch */
@@ -309,15 +314,7 @@ struct rcu_state {
309 /* End of fields guarded by root rcu_node's lock. */ 314 /* End of fields guarded by root rcu_node's lock. */
310 315
311 raw_spinlock_t onofflock; /* exclude on/offline and */ 316 raw_spinlock_t onofflock; /* exclude on/offline and */
312 /* starting new GP. Also */ 317 /* starting new GP. */
313 /* protects the following */
314 /* orphan_cbs fields. */
315 struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */
316 /* orphaned by all CPUs in */
317 /* a given leaf rcu_node */
318 /* going offline. */
319 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
320 long orphan_qlen; /* Number of orphaned cbs. */
321 raw_spinlock_t fqslock; /* Only one task forcing */ 318 raw_spinlock_t fqslock; /* Only one task forcing */
322 /* quiescent states. */ 319 /* quiescent states. */
323 unsigned long jiffies_force_qs; /* Time at which to invoke */ 320 unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -390,7 +387,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
390static int rcu_preempt_pending(int cpu); 387static int rcu_preempt_pending(int cpu);
391static int rcu_preempt_needs_cpu(int cpu); 388static int rcu_preempt_needs_cpu(int cpu);
392static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 389static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
393static void rcu_preempt_send_cbs_to_orphanage(void); 390static void rcu_preempt_send_cbs_to_online(void);
394static void __init __rcu_init_preempt(void); 391static void __init __rcu_init_preempt(void);
395static void rcu_needs_cpu_flush(void); 392static void rcu_needs_cpu_flush(void);
396 393
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 71a4147473f9..a3638710dc67 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@
25 */ 25 */
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/stop_machine.h>
28 29
29/* 30/*
30 * Check the RCU kernel configuration parameters and print informative 31 * Check the RCU kernel configuration parameters and print informative
@@ -773,11 +774,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
773} 774}
774 775
775/* 776/*
776 * Move preemptable RCU's callbacks to ->orphan_cbs_list. 777 * Move preemptable RCU's callbacks from the dying CPU to an online CPU.
777 */ 778 */
778static void rcu_preempt_send_cbs_to_orphanage(void) 779static void rcu_preempt_send_cbs_to_online(void)
779{ 780{
780 rcu_send_cbs_to_orphanage(&rcu_preempt_state); 781 rcu_send_cbs_to_online(&rcu_preempt_state);
781} 782}
782 783
783/* 784/*
@@ -1001,7 +1002,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1001/* 1002/*
1002 * Because there is no preemptable RCU, there are no callbacks to move. 1003 * Because there is no preemptable RCU, there are no callbacks to move.
1003 */ 1004 */
1004static void rcu_preempt_send_cbs_to_orphanage(void) 1005static void rcu_preempt_send_cbs_to_online(void)
1005{ 1006{
1006} 1007}
1007 1008
@@ -1014,6 +1015,132 @@ static void __init __rcu_init_preempt(void)
1014 1015
1015#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1016#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1016 1017
1018#ifndef CONFIG_SMP
1019
1020void synchronize_sched_expedited(void)
1021{
1022 cond_resched();
1023}
1024EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1025
1026#else /* #ifndef CONFIG_SMP */
1027
1028static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
1029static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1030
1031static int synchronize_sched_expedited_cpu_stop(void *data)
1032{
1033 /*
1034 * There must be a full memory barrier on each affected CPU
1035 * between the time that try_stop_cpus() is called and the
1036 * time that it returns.
1037 *
1038 * In the current initial implementation of cpu_stop, the
 1039 * above condition is already met by the time control reaches
1040 * this point and the following smp_mb() is not strictly
1041 * necessary. Do smp_mb() anyway for documentation and
1042 * robustness against future implementation changes.
1043 */
1044 smp_mb(); /* See above comment block. */
1045 return 0;
1046}
1047
1048/*
1049 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
1050 * approach to force grace period to end quickly. This consumes
1051 * significant time on all CPUs, and is thus not recommended for
1052 * any sort of common-case code.
1053 *
1054 * Note that it is illegal to call this function while holding any
1055 * lock that is acquired by a CPU-hotplug notifier. Failing to
1056 * observe this restriction will result in deadlock.
1057 *
1058 * This implementation can be thought of as an application of ticket
1059 * locking to RCU, with sync_sched_expedited_started and
1060 * sync_sched_expedited_done taking on the roles of the halves
1061 * of the ticket-lock word. Each task atomically increments
1062 * sync_sched_expedited_started upon entry, snapshotting the old value,
1063 * then attempts to stop all the CPUs. If this succeeds, then each
1064 * CPU will have executed a context switch, resulting in an RCU-sched
1065 * grace period. We are then done, so we use atomic_cmpxchg() to
1066 * update sync_sched_expedited_done to match our snapshot -- but
1067 * only if someone else has not already advanced past our snapshot.
1068 *
1069 * On the other hand, if try_stop_cpus() fails, we check the value
1070 * of sync_sched_expedited_done. If it has advanced past our
1071 * initial snapshot, then someone else must have forced a grace period
1072 * some time after we took our snapshot. In this case, our work is
1073 * done for us, and we can simply return. Otherwise, we try again,
1074 * but keep our initial snapshot for purposes of checking for someone
1075 * doing our work for us.
1076 *
1077 * If we fail too many times in a row, we fall back to synchronize_sched().
1078 */
1079void synchronize_sched_expedited(void)
1080{
1081 int firstsnap, s, snap, trycount = 0;
1082
1083 /* Note that atomic_inc_return() implies full memory barrier. */
1084 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
1085 get_online_cpus();
1086
1087 /*
1088 * Each pass through the following loop attempts to force a
1089 * context switch on each CPU.
1090 */
1091 while (try_stop_cpus(cpu_online_mask,
1092 synchronize_sched_expedited_cpu_stop,
1093 NULL) == -EAGAIN) {
1094 put_online_cpus();
1095
1096 /* No joy, try again later. Or just synchronize_sched(). */
1097 if (trycount++ < 10)
1098 udelay(trycount * num_online_cpus());
1099 else {
1100 synchronize_sched();
1101 return;
1102 }
1103
1104 /* Check to see if someone else did our work for us. */
1105 s = atomic_read(&sync_sched_expedited_done);
1106 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
1107 smp_mb(); /* ensure test happens before caller kfree */
1108 return;
1109 }
1110
1111 /*
1112 * Refetching sync_sched_expedited_started allows later
1113 * callers to piggyback on our grace period. We subtract
1114 * 1 to get the same token that the last incrementer got.
1115 * We retry after they started, so our grace period works
1116 * for them, and they started after our first try, so their
1117 * grace period works for us.
1118 */
1119 get_online_cpus();
1120 snap = atomic_read(&sync_sched_expedited_started) - 1;
1121 smp_mb(); /* ensure read is before try_stop_cpus(). */
1122 }
1123
1124 /*
1125 * Everyone up to our most recent fetch is covered by our grace
1126 * period. Update the counter, but only if our work is still
1127 * relevant -- which it won't be if someone who started later
1128 * than we did beat us to the punch.
1129 */
1130 do {
1131 s = atomic_read(&sync_sched_expedited_done);
1132 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
1133 smp_mb(); /* ensure test happens before caller kfree */
1134 break;
1135 }
1136 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
1137
1138 put_online_cpus();
1139}
1140EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1141
1142#endif /* #else #ifndef CONFIG_SMP */
1143
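From the caller's side the expedited primitive drops into the usual update pattern; a minimal sketch, where global_ptr, new_item, and struct item are hypothetical:

/* Readers traverse global_ptr under rcu_read_lock_sched() or with
 * preemption otherwise disabled. */
struct item *old = global_ptr;

rcu_assign_pointer(global_ptr, new_item);
synchronize_sched_expedited();  /* all pre-existing readers are done */
kfree(old);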
1017#if !defined(CONFIG_RCU_FAST_NO_HZ) 1144#if !defined(CONFIG_RCU_FAST_NO_HZ)
1018 1145
1019/* 1146/*
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d15430b9d122..c8e97853b970 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -166,13 +166,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
166 166
167 gpnum = rsp->gpnum; 167 gpnum = rsp->gpnum;
168 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 168 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
169 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", 169 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
170 rsp->completed, gpnum, rsp->signaled, 170 rsp->completed, gpnum, rsp->signaled,
171 (long)(rsp->jiffies_force_qs - jiffies), 171 (long)(rsp->jiffies_force_qs - jiffies),
172 (int)(jiffies & 0xffff), 172 (int)(jiffies & 0xffff),
173 rsp->n_force_qs, rsp->n_force_qs_ngp, 173 rsp->n_force_qs, rsp->n_force_qs_ngp,
174 rsp->n_force_qs - rsp->n_force_qs_ngp, 174 rsp->n_force_qs - rsp->n_force_qs_ngp,
175 rsp->n_force_qs_lh, rsp->orphan_qlen); 175 rsp->n_force_qs_lh);
176 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 176 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
177 if (rnp->level != level) { 177 if (rnp->level != level) {
178 seq_puts(m, "\n"); 178 seq_puts(m, "\n");
@@ -300,7 +300,7 @@ static const struct file_operations rcu_pending_fops = {
300 300
301static struct dentry *rcudir; 301static struct dentry *rcudir;
302 302
303static int __init rcuclassic_trace_init(void) 303static int __init rcutree_trace_init(void)
304{ 304{
305 struct dentry *retval; 305 struct dentry *retval;
306 306
@@ -337,14 +337,14 @@ free_out:
337 return 1; 337 return 1;
338} 338}
339 339
340static void __exit rcuclassic_trace_cleanup(void) 340static void __exit rcutree_trace_cleanup(void)
341{ 341{
342 debugfs_remove_recursive(rcudir); 342 debugfs_remove_recursive(rcudir);
343} 343}
344 344
345 345
346module_init(rcuclassic_trace_init); 346module_init(rcutree_trace_init);
347module_exit(rcuclassic_trace_cleanup); 347module_exit(rcutree_trace_cleanup);
348 348
349MODULE_AUTHOR("Paul E. McKenney"); 349MODULE_AUTHOR("Paul E. McKenney");
350MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); 350MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
diff --git a/kernel/sched.c b/kernel/sched.c
index c68cead94dd7..04949089e760 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,9 +75,11 @@
75 75
76#include <asm/tlb.h> 76#include <asm/tlb.h>
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78#include <asm/mutex.h>
78 79
79#include "sched_cpupri.h" 80#include "sched_cpupri.h"
80#include "workqueue_sched.h" 81#include "workqueue_sched.h"
82#include "sched_autogroup.h"
81 83
82#define CREATE_TRACE_POINTS 84#define CREATE_TRACE_POINTS
83#include <trace/events/sched.h> 85#include <trace/events/sched.h>
@@ -253,6 +255,8 @@ struct task_group {
253 /* runqueue "owned" by this group on each cpu */ 255 /* runqueue "owned" by this group on each cpu */
254 struct cfs_rq **cfs_rq; 256 struct cfs_rq **cfs_rq;
255 unsigned long shares; 257 unsigned long shares;
258
259 atomic_t load_weight;
256#endif 260#endif
257 261
258#ifdef CONFIG_RT_GROUP_SCHED 262#ifdef CONFIG_RT_GROUP_SCHED
@@ -268,24 +272,19 @@ struct task_group {
268 struct task_group *parent; 272 struct task_group *parent;
269 struct list_head siblings; 273 struct list_head siblings;
270 struct list_head children; 274 struct list_head children;
275
276#ifdef CONFIG_SCHED_AUTOGROUP
277 struct autogroup *autogroup;
278#endif
271}; 279};
272 280
273#define root_task_group init_task_group 281#define root_task_group init_task_group
274 282
275/* task_group_lock serializes add/remove of task groups and also changes to 283/* task_group_lock serializes the addition/removal of task groups */
276 * a task group's cpu shares.
277 */
278static DEFINE_SPINLOCK(task_group_lock); 284static DEFINE_SPINLOCK(task_group_lock);
279 285
280#ifdef CONFIG_FAIR_GROUP_SCHED 286#ifdef CONFIG_FAIR_GROUP_SCHED
281 287
282#ifdef CONFIG_SMP
283static int root_task_group_empty(void)
284{
285 return list_empty(&root_task_group.children);
286}
287#endif
288
289# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 288# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
290 289
291/* 290/*
@@ -342,6 +341,7 @@ struct cfs_rq {
342 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 341 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
343 * list is used during load balance. 342 * list is used during load balance.
344 */ 343 */
344 int on_list;
345 struct list_head leaf_cfs_rq_list; 345 struct list_head leaf_cfs_rq_list;
346 struct task_group *tg; /* group that "owns" this runqueue */ 346 struct task_group *tg; /* group that "owns" this runqueue */
347 347
@@ -360,14 +360,17 @@ struct cfs_rq {
360 unsigned long h_load; 360 unsigned long h_load;
361 361
362 /* 362 /*
363 * this cpu's part of tg->shares 363 * Maintaining per-cpu shares distribution for group scheduling
364 *
365 * load_stamp is the last time we updated the load average
366 * load_last is the last time we updated the load average and saw load
367 * load_unacc_exec_time is currently unaccounted execution time
364 */ 368 */
365 unsigned long shares; 369 u64 load_avg;
370 u64 load_period;
371 u64 load_stamp, load_last, load_unacc_exec_time;
366 372
367 /* 373 unsigned long load_contribution;
368 * load.weight at the time we set shares
369 */
370 unsigned long rq_weight;
371#endif 374#endif
372#endif 375#endif
373}; 376};
@@ -605,11 +608,14 @@ static inline int cpu_of(struct rq *rq)
605 */ 608 */
606static inline struct task_group *task_group(struct task_struct *p) 609static inline struct task_group *task_group(struct task_struct *p)
607{ 610{
611 struct task_group *tg;
608 struct cgroup_subsys_state *css; 612 struct cgroup_subsys_state *css;
609 613
610 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 614 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
611 lockdep_is_held(&task_rq(p)->lock)); 615 lockdep_is_held(&task_rq(p)->lock));
612 return container_of(css, struct task_group, css); 616 tg = container_of(css, struct task_group, css);
617
618 return autogroup_task_group(p, tg);
613} 619}
614 620
615/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 621/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -793,20 +799,6 @@ late_initcall(sched_init_debug);
793const_debug unsigned int sysctl_sched_nr_migrate = 32; 799const_debug unsigned int sysctl_sched_nr_migrate = 32;
794 800
795/* 801/*
796 * ratelimit for updating the group shares.
797 * default: 0.25ms
798 */
799unsigned int sysctl_sched_shares_ratelimit = 250000;
800unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
801
802/*
803 * Inject some fuzzyness into changing the per-cpu group shares
804 * this avoids remote rq-locks at the expense of fairness.
805 * default: 4
806 */
807unsigned int sysctl_sched_shares_thresh = 4;
808
809/*
810 * period over which we average the RT time consumption, measured 802 * period over which we average the RT time consumption, measured
811 * in ms. 803 * in ms.
812 * 804 *
@@ -1355,6 +1347,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1355 lw->inv_weight = 0; 1347 lw->inv_weight = 0;
1356} 1348}
1357 1349
1350static inline void update_load_set(struct load_weight *lw, unsigned long w)
1351{
1352 lw->weight = w;
1353 lw->inv_weight = 0;
1354}
1355
1358/* 1356/*
1359 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1357 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1360 * of tasks with abnormal "nice" values across CPUs the contribution that 1358 * of tasks with abnormal "nice" values across CPUs the contribution that
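update_load_set() zeroes ->inv_weight because that field caches a fixed-point inverse of ->weight for cheap weighted division; the consumer recomputes it lazily, roughly as follows (simplified from calc_delta_mine() of this era):

if (unlikely(!lw->inv_weight))
        lw->inv_weight = WMULT_CONST / lw->weight;      /* ~ 2^32 / weight */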
@@ -1543,101 +1541,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1543 1541
1544#ifdef CONFIG_FAIR_GROUP_SCHED 1542#ifdef CONFIG_FAIR_GROUP_SCHED
1545 1543
1546static __read_mostly unsigned long __percpu *update_shares_data;
1547
1548static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1549
1550/*
1551 * Calculate and set the cpu's group shares.
1552 */
1553static void update_group_shares_cpu(struct task_group *tg, int cpu,
1554 unsigned long sd_shares,
1555 unsigned long sd_rq_weight,
1556 unsigned long *usd_rq_weight)
1557{
1558 unsigned long shares, rq_weight;
1559 int boost = 0;
1560
1561 rq_weight = usd_rq_weight[cpu];
1562 if (!rq_weight) {
1563 boost = 1;
1564 rq_weight = NICE_0_LOAD;
1565 }
1566
1567 /*
1568 * \Sum_j shares_j * rq_weight_i
1569 * shares_i = -----------------------------
1570 * \Sum_j rq_weight_j
1571 */
1572 shares = (sd_shares * rq_weight) / sd_rq_weight;
1573 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1574
1575 if (abs(shares - tg->se[cpu]->load.weight) >
1576 sysctl_sched_shares_thresh) {
1577 struct rq *rq = cpu_rq(cpu);
1578 unsigned long flags;
1579
1580 raw_spin_lock_irqsave(&rq->lock, flags);
1581 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1582 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1583 __set_se_shares(tg->se[cpu], shares);
1584 raw_spin_unlock_irqrestore(&rq->lock, flags);
1585 }
1586}
1587
1588/*
1589 * Re-compute each task group's per-cpu shares over the given domain.
1590 * This needs to be done in a bottom-up fashion because the rq weight of a
1591 * parent group depends on the shares of its child groups.
1592 */
1593static int tg_shares_up(struct task_group *tg, void *data)
1594{
1595 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1596 unsigned long *usd_rq_weight;
1597 struct sched_domain *sd = data;
1598 unsigned long flags;
1599 int i;
1600
1601 if (!tg->se[0])
1602 return 0;
1603
1604 local_irq_save(flags);
1605 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1606
1607 for_each_cpu(i, sched_domain_span(sd)) {
1608 weight = tg->cfs_rq[i]->load.weight;
1609 usd_rq_weight[i] = weight;
1610
1611 rq_weight += weight;
1612 /*
1613 * If there are currently no tasks on the cpu pretend there
1614 * is one of average load so that when a new task gets to
1615 * run here it will not get delayed by group starvation.
1616 */
1617 if (!weight)
1618 weight = NICE_0_LOAD;
1619
1620 sum_weight += weight;
1621 shares += tg->cfs_rq[i]->shares;
1622 }
1623
1624 if (!rq_weight)
1625 rq_weight = sum_weight;
1626
1627 if ((!shares && rq_weight) || shares > tg->shares)
1628 shares = tg->shares;
1629
1630 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1631 shares = tg->shares;
1632
1633 for_each_cpu(i, sched_domain_span(sd))
1634 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1635
1636 local_irq_restore(flags);
1637
1638 return 0;
1639}
1640
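
The functions deleted above implemented the rule in the removed comment block: shares_i = sd_shares * rq_weight_i / \Sum_j rq_weight_j, clamped to [MIN_SHARES, MAX_SHARES]. A standalone sketch of that arithmetic, with stand-in clamp bounds:

#include <stdio.h>

#define MIN_SHARES	2UL		/* stand-ins for the kernel limits */
#define MAX_SHARES	(1UL << 18)

/* old rule: this cpu's slice of the group weight over the domain */
static unsigned long group_shares_cpu(unsigned long sd_shares,
				      unsigned long rq_weight_i,
				      unsigned long sd_rq_weight)
{
	unsigned long shares = sd_shares * rq_weight_i / sd_rq_weight;

	if (shares < MIN_SHARES)
		shares = MIN_SHARES;
	if (shares > MAX_SHARES)
		shares = MAX_SHARES;

	return shares;
}

int main(void)
{
	/* a group with 1024 shares whose load splits 3:1 across two cpus */
	printf("cpu0=%lu cpu1=%lu\n",
	       group_shares_cpu(1024, 3072, 4096),
	       group_shares_cpu(1024, 1024, 4096));
	return 0;
}
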
1641/* 1544/*
1642 * Compute the cpu's hierarchical load factor for each task group. 1545 * Compute the cpu's hierarchical load factor for each task group.
1643 * This needs to be done in a top-down fashion because the load of a child 1546 * This needs to be done in a top-down fashion because the load of a child
@@ -1652,7 +1555,7 @@ static int tg_load_down(struct task_group *tg, void *data)
1652 load = cpu_rq(cpu)->load.weight; 1555 load = cpu_rq(cpu)->load.weight;
1653 } else { 1556 } else {
1654 load = tg->parent->cfs_rq[cpu]->h_load; 1557 load = tg->parent->cfs_rq[cpu]->h_load;
1655 load *= tg->cfs_rq[cpu]->shares; 1558 load *= tg->se[cpu]->load.weight;
1656 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1559 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1657 } 1560 }
1658 1561
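
tg_load_down() survives, but each step now scales the parent's hierarchical load by the group's entity weight (se->load.weight) rather than the removed per-cpu shares field. One step of that top-down computation, as a toy function:

#include <stdio.h>

/*
 * h_load of a child group = parent's h_load scaled by the child's
 * share of the parent runqueue; the +1 avoids dividing by zero.
 */
static unsigned long h_load(unsigned long parent_h_load,
			    unsigned long se_weight,
			    unsigned long parent_cfs_load)
{
	return parent_h_load * se_weight / (parent_cfs_load + 1);
}

int main(void)
{
	/* root rq load 2048, child entity weighted 512 on it ... */
	unsigned long l1 = h_load(2048, 512, 2048);
	/* ... and a grandchild weighted 256 on the child's 1024 load */
	unsigned long l2 = h_load(l1, 256, 1024);

	printf("l1=%lu l2=%lu\n", l1, l2);
	return 0;
}
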
@@ -1661,34 +1564,11 @@ static int tg_load_down(struct task_group *tg, void *data)
1661 return 0; 1564 return 0;
1662} 1565}
1663 1566
1664static void update_shares(struct sched_domain *sd)
1665{
1666 s64 elapsed;
1667 u64 now;
1668
1669 if (root_task_group_empty())
1670 return;
1671
1672 now = local_clock();
1673 elapsed = now - sd->last_update;
1674
1675 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1676 sd->last_update = now;
1677 walk_tg_tree(tg_nop, tg_shares_up, sd);
1678 }
1679}
1680
1681static void update_h_load(long cpu) 1567static void update_h_load(long cpu)
1682{ 1568{
1683 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1569 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1684} 1570}
1685 1571
1686#else
1687
1688static inline void update_shares(struct sched_domain *sd)
1689{
1690}
1691
1692#endif 1572#endif
1693 1573
1694#ifdef CONFIG_PREEMPT 1574#ifdef CONFIG_PREEMPT
@@ -1810,15 +1690,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1810 1690
1811#endif 1691#endif
1812 1692
1813#ifdef CONFIG_FAIR_GROUP_SCHED
1814static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1815{
1816#ifdef CONFIG_SMP
1817 cfs_rq->shares = shares;
1818#endif
1819}
1820#endif
1821
1822static void calc_load_account_idle(struct rq *this_rq); 1693static void calc_load_account_idle(struct rq *this_rq);
1823static void update_sysctl(void); 1694static void update_sysctl(void);
1824static int get_update_sysctl_factor(void); 1695static int get_update_sysctl_factor(void);
@@ -2063,6 +1934,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
2063#include "sched_idletask.c" 1934#include "sched_idletask.c"
2064#include "sched_fair.c" 1935#include "sched_fair.c"
2065#include "sched_rt.c" 1936#include "sched_rt.c"
1937#include "sched_autogroup.c"
2066#include "sched_stoptask.c" 1938#include "sched_stoptask.c"
2067#ifdef CONFIG_SCHED_DEBUG 1939#ifdef CONFIG_SCHED_DEBUG
2068# include "sched_debug.c" 1940# include "sched_debug.c"
@@ -2255,10 +2127,8 @@ static int migration_cpu_stop(void *data);
2255 * The task's runqueue lock must be held. 2127 * The task's runqueue lock must be held.
2256 * Returns true if you have to wait for migration thread. 2128 * Returns true if you have to wait for migration thread.
2257 */ 2129 */
2258static bool migrate_task(struct task_struct *p, int dest_cpu) 2130static bool migrate_task(struct task_struct *p, struct rq *rq)
2259{ 2131{
2260 struct rq *rq = task_rq(p);
2261
2262 /* 2132 /*
2263 * If the task is not on a runqueue (and not running), then 2133 * If the task is not on a runqueue (and not running), then
2264 * the next wake-up will properly place the task. 2134 * the next wake-up will properly place the task.
@@ -2438,18 +2308,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2438 return dest_cpu; 2308 return dest_cpu;
2439 2309
2440 /* No more Mr. Nice Guy. */ 2310 /* No more Mr. Nice Guy. */
2441 if (unlikely(dest_cpu >= nr_cpu_ids)) { 2311 dest_cpu = cpuset_cpus_allowed_fallback(p);
2442 dest_cpu = cpuset_cpus_allowed_fallback(p); 2312 /*
2443 /* 2313 * Don't tell them about moving exiting tasks or
2444 * Don't tell them about moving exiting tasks or 2314 * kernel threads (both mm NULL), since they never
2445 * kernel threads (both mm NULL), since they never 2315 * leave kernel.
2446 * leave kernel. 2316 */
2447 */ 2317 if (p->mm && printk_ratelimit()) {
2448 if (p->mm && printk_ratelimit()) { 2318 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2449 printk(KERN_INFO "process %d (%s) no " 2319 task_pid_nr(p), p->comm, cpu);
2450 "longer affine to cpu%d\n",
2451 task_pid_nr(p), p->comm, cpu);
2452 }
2453 } 2320 }
2454 2321
2455 return dest_cpu; 2322 return dest_cpu;
@@ -2785,7 +2652,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
2785 /* Want to start with kernel preemption disabled. */ 2652 /* Want to start with kernel preemption disabled. */
2786 task_thread_info(p)->preempt_count = 1; 2653 task_thread_info(p)->preempt_count = 1;
2787#endif 2654#endif
2655#ifdef CONFIG_SMP
2788 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2656 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2657#endif
2789 2658
2790 put_cpu(); 2659 put_cpu();
2791} 2660}
@@ -3549,7 +3418,7 @@ void sched_exec(void)
3549 * select_task_rq() can race against ->cpus_allowed 3418 * select_task_rq() can race against ->cpus_allowed
3550 */ 3419 */
3551 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && 3420 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3552 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { 3421 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3553 struct migration_arg arg = { p, dest_cpu }; 3422 struct migration_arg arg = { p, dest_cpu };
3554 3423
3555 task_rq_unlock(rq, &flags); 3424 task_rq_unlock(rq, &flags);
@@ -4214,7 +4083,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4214 if (task_thread_info(rq->curr) != owner || need_resched()) 4083 if (task_thread_info(rq->curr) != owner || need_resched())
4215 return 0; 4084 return 0;
4216 4085
4217 cpu_relax(); 4086 arch_mutex_cpu_relax();
4218 } 4087 }
4219 4088
4220 return 1; 4089 return 1;
@@ -4526,7 +4395,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
4526 * This waits for either a completion of a specific task to be signaled or for a 4395 * This waits for either a completion of a specific task to be signaled or for a
4527 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4396 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4528 */ 4397 */
4529unsigned long __sched 4398long __sched
4530wait_for_completion_interruptible_timeout(struct completion *x, 4399wait_for_completion_interruptible_timeout(struct completion *x,
4531 unsigned long timeout) 4400 unsigned long timeout)
4532{ 4401{
@@ -4559,7 +4428,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
4559 * signaled or for a specified timeout to expire. It can be 4428 * signaled or for a specified timeout to expire. It can be
4560 * interrupted by a kill signal. The timeout is in jiffies. 4429 * interrupted by a kill signal. The timeout is in jiffies.
4561 */ 4430 */
4562unsigned long __sched 4431long __sched
4563wait_for_completion_killable_timeout(struct completion *x, 4432wait_for_completion_killable_timeout(struct completion *x,
4564 unsigned long timeout) 4433 unsigned long timeout)
4565{ 4434{
@@ -4901,7 +4770,7 @@ static bool check_same_owner(struct task_struct *p)
4901} 4770}
4902 4771
4903static int __sched_setscheduler(struct task_struct *p, int policy, 4772static int __sched_setscheduler(struct task_struct *p, int policy,
4904 struct sched_param *param, bool user) 4773 const struct sched_param *param, bool user)
4905{ 4774{
4906 int retval, oldprio, oldpolicy = -1, on_rq, running; 4775 int retval, oldprio, oldpolicy = -1, on_rq, running;
4907 unsigned long flags; 4776 unsigned long flags;
@@ -5056,7 +4925,7 @@ recheck:
5056 * NOTE that the task may be already dead. 4925 * NOTE that the task may be already dead.
5057 */ 4926 */
5058int sched_setscheduler(struct task_struct *p, int policy, 4927int sched_setscheduler(struct task_struct *p, int policy,
5059 struct sched_param *param) 4928 const struct sched_param *param)
5060{ 4929{
5061 return __sched_setscheduler(p, policy, param, true); 4930 return __sched_setscheduler(p, policy, param, true);
5062} 4931}
@@ -5074,7 +4943,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
5074 * but our caller might not have that capability. 4943 * but our caller might not have that capability.
5075 */ 4944 */
5076int sched_setscheduler_nocheck(struct task_struct *p, int policy, 4945int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5077 struct sched_param *param) 4946 const struct sched_param *param)
5078{ 4947{
5079 return __sched_setscheduler(p, policy, param, false); 4948 return __sched_setscheduler(p, policy, param, false);
5080} 4949}
@@ -5590,7 +5459,7 @@ void sched_show_task(struct task_struct *p)
5590 unsigned state; 5459 unsigned state;
5591 5460
5592 state = p->state ? __ffs(p->state) + 1 : 0; 5461 state = p->state ? __ffs(p->state) + 1 : 0;
5593 printk(KERN_INFO "%-13.13s %c", p->comm, 5462 printk(KERN_INFO "%-15.15s %c", p->comm,
5594 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5463 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5595#if BITS_PER_LONG == 32 5464#if BITS_PER_LONG == 32
5596 if (state == TASK_RUNNING) 5465 if (state == TASK_RUNNING)
@@ -5754,7 +5623,6 @@ static void update_sysctl(void)
5754 SET_SYSCTL(sched_min_granularity); 5623 SET_SYSCTL(sched_min_granularity);
5755 SET_SYSCTL(sched_latency); 5624 SET_SYSCTL(sched_latency);
5756 SET_SYSCTL(sched_wakeup_granularity); 5625 SET_SYSCTL(sched_wakeup_granularity);
5757 SET_SYSCTL(sched_shares_ratelimit);
5758#undef SET_SYSCTL 5626#undef SET_SYSCTL
5759} 5627}
5760 5628
@@ -5830,7 +5698,7 @@ again:
5830 goto out; 5698 goto out;
5831 5699
5832 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5700 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5833 if (migrate_task(p, dest_cpu)) { 5701 if (migrate_task(p, rq)) {
5834 struct migration_arg arg = { p, dest_cpu }; 5702 struct migration_arg arg = { p, dest_cpu };
5835 /* Need help from migration thread: drop lock and wait. */ 5703 /* Need help from migration thread: drop lock and wait. */
5836 task_rq_unlock(rq, &flags); 5704 task_rq_unlock(rq, &flags);
@@ -5912,29 +5780,20 @@ static int migration_cpu_stop(void *data)
5912} 5780}
5913 5781
5914#ifdef CONFIG_HOTPLUG_CPU 5782#ifdef CONFIG_HOTPLUG_CPU
5783
5915/* 5784/*
5916 * Figure out where task on dead CPU should go, use force if necessary. 5785 * Ensures that the idle task is using init_mm right before its cpu goes
5786 * offline.
5917 */ 5787 */
5918void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5788void idle_task_exit(void)
5919{ 5789{
5920 struct rq *rq = cpu_rq(dead_cpu); 5790 struct mm_struct *mm = current->active_mm;
5921 int needs_cpu, uninitialized_var(dest_cpu);
5922 unsigned long flags;
5923 5791
5924 local_irq_save(flags); 5792 BUG_ON(cpu_online(smp_processor_id()));
5925 5793
5926 raw_spin_lock(&rq->lock); 5794 if (mm != &init_mm)
5927 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); 5795 switch_mm(mm, &init_mm, current);
5928 if (needs_cpu) 5796 mmdrop(mm);
5929 dest_cpu = select_fallback_rq(dead_cpu, p);
5930 raw_spin_unlock(&rq->lock);
5931 /*
5932 * It can only fail if we race with set_cpus_allowed(),
5933 * in which case the racer should migrate the task anyway.
5934 */
5935 if (needs_cpu)
5936 __migrate_task(p, dead_cpu, dest_cpu);
5937 local_irq_restore(flags);
5938} 5797}
5939 5798
5940/* 5799/*
@@ -5947,128 +5806,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5947static void migrate_nr_uninterruptible(struct rq *rq_src) 5806static void migrate_nr_uninterruptible(struct rq *rq_src)
5948{ 5807{
5949 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 5808 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5950 unsigned long flags;
5951 5809
5952 local_irq_save(flags);
5953 double_rq_lock(rq_src, rq_dest);
5954 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5810 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5955 rq_src->nr_uninterruptible = 0; 5811 rq_src->nr_uninterruptible = 0;
5956 double_rq_unlock(rq_src, rq_dest);
5957 local_irq_restore(flags);
5958}
5959
5960/* Run through task list and migrate tasks from the dead cpu. */
5961static void migrate_live_tasks(int src_cpu)
5962{
5963 struct task_struct *p, *t;
5964
5965 read_lock(&tasklist_lock);
5966
5967 do_each_thread(t, p) {
5968 if (p == current)
5969 continue;
5970
5971 if (task_cpu(p) == src_cpu)
5972 move_task_off_dead_cpu(src_cpu, p);
5973 } while_each_thread(t, p);
5974
5975 read_unlock(&tasklist_lock);
5976} 5812}
5977 5813
5978/* 5814/*
5979 * Schedules idle task to be the next runnable task on current CPU. 5815 * remove the tasks which were accounted by rq from calc_load_tasks.
5980 * It does so by boosting its priority to highest possible.
5981 * Used by CPU offline code.
5982 */ 5816 */
5983void sched_idle_next(void) 5817static void calc_global_load_remove(struct rq *rq)
5984{ 5818{
5985 int this_cpu = smp_processor_id(); 5819 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5986 struct rq *rq = cpu_rq(this_cpu); 5820 rq->calc_load_active = 0;
5987 struct task_struct *p = rq->idle;
5988 unsigned long flags;
5989
5990 /* cpu has to be offline */
5991 BUG_ON(cpu_online(this_cpu));
5992
5993 /*
5994 * Strictly not necessary since rest of the CPUs are stopped by now
5995 * and interrupts disabled on the current cpu.
5996 */
5997 raw_spin_lock_irqsave(&rq->lock, flags);
5998
5999 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
6000
6001 activate_task(rq, p, 0);
6002
6003 raw_spin_unlock_irqrestore(&rq->lock, flags);
6004} 5821}
6005 5822
6006/* 5823/*
6007 * Ensures that the idle task is using init_mm right before its cpu goes 5824 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6008 * offline. 5825 * try_to_wake_up()->select_task_rq().
5826 *
5827 * Called with rq->lock held even though we're in stop_machine() and
5828 * there's no concurrency possible, we hold the required locks anyway
5829 * because of lock validation efforts.
6009 */ 5830 */
6010void idle_task_exit(void) 5831static void migrate_tasks(unsigned int dead_cpu)
6011{
6012 struct mm_struct *mm = current->active_mm;
6013
6014 BUG_ON(cpu_online(smp_processor_id()));
6015
6016 if (mm != &init_mm)
6017 switch_mm(mm, &init_mm, current);
6018 mmdrop(mm);
6019}
6020
6021/* called under rq->lock with disabled interrupts */
6022static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
6023{ 5832{
6024 struct rq *rq = cpu_rq(dead_cpu); 5833 struct rq *rq = cpu_rq(dead_cpu);
6025 5834 struct task_struct *next, *stop = rq->stop;
6026 /* Must be exiting, otherwise would be on tasklist. */ 5835 int dest_cpu;
6027 BUG_ON(!p->exit_state);
6028
6029 /* Cannot have done final schedule yet: would have vanished. */
6030 BUG_ON(p->state == TASK_DEAD);
6031
6032 get_task_struct(p);
6033 5836
6034 /* 5837 /*
6035 * Drop lock around migration; if someone else moves it, 5838 * Fudge the rq selection such that the below task selection loop
6036 * that's OK. No task can be added to this CPU, so iteration is 5839 * doesn't get stuck on the currently eligible stop task.
6037 * fine. 5840 *
5841 * We're currently inside stop_machine() and the rq is either stuck
5842 * in the stop_machine_cpu_stop() loop, or we're executing this code,
5843 * either way we should never end up calling schedule() until we're
5844 * done here.
6038 */ 5845 */
6039 raw_spin_unlock_irq(&rq->lock); 5846 rq->stop = NULL;
6040 move_task_off_dead_cpu(dead_cpu, p);
6041 raw_spin_lock_irq(&rq->lock);
6042
6043 put_task_struct(p);
6044}
6045
6046/* release_task() removes task from tasklist, so we won't find dead tasks. */
6047static void migrate_dead_tasks(unsigned int dead_cpu)
6048{
6049 struct rq *rq = cpu_rq(dead_cpu);
6050 struct task_struct *next;
6051 5847
6052 for ( ; ; ) { 5848 for ( ; ; ) {
6053 if (!rq->nr_running) 5849 /*
5850 * There's this thread running, bail when that's the only
5851 * remaining thread.
5852 */
5853 if (rq->nr_running == 1)
6054 break; 5854 break;
5855
6055 next = pick_next_task(rq); 5856 next = pick_next_task(rq);
6056 if (!next) 5857 BUG_ON(!next);
6057 break;
6058 next->sched_class->put_prev_task(rq, next); 5858 next->sched_class->put_prev_task(rq, next);
6059 migrate_dead(dead_cpu, next);
6060 5859
5860 /* Find suitable destination for @next, with force if needed. */
5861 dest_cpu = select_fallback_rq(dead_cpu, next);
5862 raw_spin_unlock(&rq->lock);
5863
5864 __migrate_task(next, dead_cpu, dest_cpu);
5865
5866 raw_spin_lock(&rq->lock);
6061 } 5867 }
6062}
6063 5868
6064/* 5869 rq->stop = stop;
6065 * remove the tasks which were accounted by rq from calc_load_tasks.
6066 */
6067static void calc_global_load_remove(struct rq *rq)
6068{
6069 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
6070 rq->calc_load_active = 0;
6071} 5870}
5871
6072#endif /* CONFIG_HOTPLUG_CPU */ 5872#endif /* CONFIG_HOTPLUG_CPU */
6073 5873
6074#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5874#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
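
migrate_tasks() collapses the old migrate_live_tasks()/migrate_dead_tasks() pair into one drain loop; the one subtlety is parking rq->stop so pick_next_task() cannot keep handing back the stop task. A toy userspace model of that control flow (every name below is a stand-in, not kernel API):

#include <stdio.h>

struct toy_rq {
	int nr_running;
	int stop_parked;
};

/* a real picker consults the sched classes; here tasks are just ids */
static int pick_next(struct toy_rq *rq)
{
	return rq->nr_running - 1;
}

static void drain_rq(struct toy_rq *rq, int dead_cpu)
{
	rq->stop_parked = 1;		/* rq->stop = NULL in the kernel */

	while (rq->nr_running > 1) {	/* 1 == this migration thread */
		int next = pick_next(rq);

		printf("cpu%d: migrating task %d away\n", dead_cpu, next);
		rq->nr_running--;	/* __migrate_task() in the kernel */
	}

	rq->stop_parked = 0;		/* rq->stop = stop */
}

int main(void)
{
	struct toy_rq rq = { .nr_running = 4 };

	drain_rq(&rq, 3);
	return 0;
}
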
@@ -6278,15 +6078,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6278 unsigned long flags; 6078 unsigned long flags;
6279 struct rq *rq = cpu_rq(cpu); 6079 struct rq *rq = cpu_rq(cpu);
6280 6080
6281 switch (action) { 6081 switch (action & ~CPU_TASKS_FROZEN) {
6282 6082
6283 case CPU_UP_PREPARE: 6083 case CPU_UP_PREPARE:
6284 case CPU_UP_PREPARE_FROZEN:
6285 rq->calc_load_update = calc_load_update; 6084 rq->calc_load_update = calc_load_update;
6286 break; 6085 break;
6287 6086
6288 case CPU_ONLINE: 6087 case CPU_ONLINE:
6289 case CPU_ONLINE_FROZEN:
6290 /* Update our root-domain */ 6088 /* Update our root-domain */
6291 raw_spin_lock_irqsave(&rq->lock, flags); 6089 raw_spin_lock_irqsave(&rq->lock, flags);
6292 if (rq->rd) { 6090 if (rq->rd) {
@@ -6298,30 +6096,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6298 break; 6096 break;
6299 6097
6300#ifdef CONFIG_HOTPLUG_CPU 6098#ifdef CONFIG_HOTPLUG_CPU
6301 case CPU_DEAD:
6302 case CPU_DEAD_FROZEN:
6303 migrate_live_tasks(cpu);
6304 /* Idle task back to normal (off runqueue, low prio) */
6305 raw_spin_lock_irq(&rq->lock);
6306 deactivate_task(rq, rq->idle, 0);
6307 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6308 rq->idle->sched_class = &idle_sched_class;
6309 migrate_dead_tasks(cpu);
6310 raw_spin_unlock_irq(&rq->lock);
6311 migrate_nr_uninterruptible(rq);
6312 BUG_ON(rq->nr_running != 0);
6313 calc_global_load_remove(rq);
6314 break;
6315
6316 case CPU_DYING: 6099 case CPU_DYING:
6317 case CPU_DYING_FROZEN:
6318 /* Update our root-domain */ 6100 /* Update our root-domain */
6319 raw_spin_lock_irqsave(&rq->lock, flags); 6101 raw_spin_lock_irqsave(&rq->lock, flags);
6320 if (rq->rd) { 6102 if (rq->rd) {
6321 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6103 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6322 set_rq_offline(rq); 6104 set_rq_offline(rq);
6323 } 6105 }
6106 migrate_tasks(cpu);
6107 BUG_ON(rq->nr_running != 1); /* the migration thread */
6324 raw_spin_unlock_irqrestore(&rq->lock, flags); 6108 raw_spin_unlock_irqrestore(&rq->lock, flags);
6109
6110 migrate_nr_uninterruptible(rq);
6111 calc_global_load_remove(rq);
6325 break; 6112 break;
6326#endif 6113#endif
6327 } 6114 }
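
The notifier switch now masks off CPU_TASKS_FROZEN up front, so one case label covers both the normal and the _FROZEN (suspend/resume) variant of each event. A compilable sketch of the masking trick, with illustrative constants (the real values live in linux/cpu.h):

#include <stdio.h>

#define CPU_ONLINE		0x0002	/* illustrative values */
#define CPU_TASKS_FROZEN	0x0010
#define CPU_ONLINE_FROZEN	(CPU_ONLINE | CPU_TASKS_FROZEN)

static const char *describe(unsigned long action)
{
	switch (action & ~CPU_TASKS_FROZEN) {	/* folds *_FROZEN in */
	case CPU_ONLINE:
		return "online";
	default:
		return "other";
	}
}

int main(void)
{
	printf("%s %s\n", describe(CPU_ONLINE), describe(CPU_ONLINE_FROZEN));
	return 0;
}
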
@@ -8052,15 +7839,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8052 7839
8053#ifdef CONFIG_FAIR_GROUP_SCHED 7840#ifdef CONFIG_FAIR_GROUP_SCHED
8054static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 7841static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8055 struct sched_entity *se, int cpu, int add, 7842 struct sched_entity *se, int cpu,
8056 struct sched_entity *parent) 7843 struct sched_entity *parent)
8057{ 7844{
8058 struct rq *rq = cpu_rq(cpu); 7845 struct rq *rq = cpu_rq(cpu);
8059 tg->cfs_rq[cpu] = cfs_rq; 7846 tg->cfs_rq[cpu] = cfs_rq;
8060 init_cfs_rq(cfs_rq, rq); 7847 init_cfs_rq(cfs_rq, rq);
8061 cfs_rq->tg = tg; 7848 cfs_rq->tg = tg;
8062 if (add)
8063 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
8064 7849
8065 tg->se[cpu] = se; 7850 tg->se[cpu] = se;
8066 /* se could be NULL for init_task_group */ 7851 /* se could be NULL for init_task_group */
@@ -8073,15 +7858,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8073 se->cfs_rq = parent->my_q; 7858 se->cfs_rq = parent->my_q;
8074 7859
8075 se->my_q = cfs_rq; 7860 se->my_q = cfs_rq;
8076 se->load.weight = tg->shares; 7861 update_load_set(&se->load, 0);
8077 se->load.inv_weight = 0;
8078 se->parent = parent; 7862 se->parent = parent;
8079} 7863}
8080#endif 7864#endif
8081 7865
8082#ifdef CONFIG_RT_GROUP_SCHED 7866#ifdef CONFIG_RT_GROUP_SCHED
8083static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 7867static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8084 struct sched_rt_entity *rt_se, int cpu, int add, 7868 struct sched_rt_entity *rt_se, int cpu,
8085 struct sched_rt_entity *parent) 7869 struct sched_rt_entity *parent)
8086{ 7870{
8087 struct rq *rq = cpu_rq(cpu); 7871 struct rq *rq = cpu_rq(cpu);
@@ -8090,8 +7874,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8090 init_rt_rq(rt_rq, rq); 7874 init_rt_rq(rt_rq, rq);
8091 rt_rq->tg = tg; 7875 rt_rq->tg = tg;
8092 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7876 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8093 if (add)
8094 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
8095 7877
8096 tg->rt_se[cpu] = rt_se; 7878 tg->rt_se[cpu] = rt_se;
8097 if (!rt_se) 7879 if (!rt_se)
@@ -8164,13 +7946,9 @@ void __init sched_init(void)
8164#ifdef CONFIG_CGROUP_SCHED 7946#ifdef CONFIG_CGROUP_SCHED
8165 list_add(&init_task_group.list, &task_groups); 7947 list_add(&init_task_group.list, &task_groups);
8166 INIT_LIST_HEAD(&init_task_group.children); 7948 INIT_LIST_HEAD(&init_task_group.children);
8167 7949 autogroup_init(&init_task);
8168#endif /* CONFIG_CGROUP_SCHED */ 7950#endif /* CONFIG_CGROUP_SCHED */
8169 7951
8170#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
8171 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
8172 __alignof__(unsigned long));
8173#endif
8174 for_each_possible_cpu(i) { 7952 for_each_possible_cpu(i) {
8175 struct rq *rq; 7953 struct rq *rq;
8176 7954
@@ -8184,7 +7962,6 @@ void __init sched_init(void)
8184#ifdef CONFIG_FAIR_GROUP_SCHED 7962#ifdef CONFIG_FAIR_GROUP_SCHED
8185 init_task_group.shares = init_task_group_load; 7963 init_task_group.shares = init_task_group_load;
8186 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7964 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8187#ifdef CONFIG_CGROUP_SCHED
8188 /* 7965 /*
8189 * How much cpu bandwidth does init_task_group get? 7966 * How much cpu bandwidth does init_task_group get?
8190 * 7967 *
@@ -8204,16 +7981,13 @@ void __init sched_init(void)
8204 * We achieve this by letting init_task_group's tasks sit 7981 * We achieve this by letting init_task_group's tasks sit
8205 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7982 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
8206 */ 7983 */
8207 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7984 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL);
8208#endif
8209#endif /* CONFIG_FAIR_GROUP_SCHED */ 7985#endif /* CONFIG_FAIR_GROUP_SCHED */
8210 7986
8211 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 7987 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8212#ifdef CONFIG_RT_GROUP_SCHED 7988#ifdef CONFIG_RT_GROUP_SCHED
8213 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7989 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
8214#ifdef CONFIG_CGROUP_SCHED 7990 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL);
8215 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
8216#endif
8217#endif 7991#endif
8218 7992
8219 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7993 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -8486,7 +8260,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8486 if (!se) 8260 if (!se)
8487 goto err_free_rq; 8261 goto err_free_rq;
8488 8262
8489 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8263 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8490 } 8264 }
8491 8265
8492 return 1; 8266 return 1;
@@ -8497,15 +8271,21 @@ err:
8497 return 0; 8271 return 0;
8498} 8272}
8499 8273
8500static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8501{
8502 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8503 &cpu_rq(cpu)->leaf_cfs_rq_list);
8504}
8505
8506static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8274static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8507{ 8275{
8508 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8276 struct rq *rq = cpu_rq(cpu);
8277 unsigned long flags;
8278
8279 /*
8280 * Only empty task groups can be destroyed; so we can speculatively
8281 * check on_list without danger of it being re-added.
8282 */
8283 if (!tg->cfs_rq[cpu]->on_list)
8284 return;
8285
8286 raw_spin_lock_irqsave(&rq->lock, flags);
8287 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8288 raw_spin_unlock_irqrestore(&rq->lock, flags);
8509} 8289}
8510#else /* !CONFIG_FAIR_GROUP_SCHED */ 8290#else /* !CONFIG_FAIR_GROUP_SCHED */
8511static inline void free_fair_sched_group(struct task_group *tg) 8291static inline void free_fair_sched_group(struct task_group *tg)
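
The new unregister_fair_sched_group() reads ->on_list without rq->lock and only locks when there is real work; that is safe here because only empty task groups are destroyed, so the flag cannot be set again behind our back. A generic sketch of the check-then-lock pattern under that assumption:

#include <pthread.h>
#include <stdio.h>

struct node {
	int on_list;		/* may only change while the object is live */
	pthread_mutex_t *lock;
};

static void unregister_node(struct node *n)
{
	if (!n->on_list)	/* speculative: a 0 read is final here */
		return;

	pthread_mutex_lock(n->lock);
	n->on_list = 0;		/* list_del_leaf_cfs_rq() in the kernel */
	pthread_mutex_unlock(n->lock);
}

int main(void)
{
	pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
	struct node n = { .on_list = 1, .lock = &m };

	unregister_node(&n);
	printf("on_list=%d\n", n.on_list);
	return 0;
}
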
@@ -8518,10 +8298,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8518 return 1; 8298 return 1;
8519} 8299}
8520 8300
8521static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8522{
8523}
8524
8525static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8301static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8526{ 8302{
8527} 8303}
@@ -8576,7 +8352,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8576 if (!rt_se) 8352 if (!rt_se)
8577 goto err_free_rq; 8353 goto err_free_rq;
8578 8354
8579 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8355 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8580 } 8356 }
8581 8357
8582 return 1; 8358 return 1;
@@ -8586,17 +8362,6 @@ err_free_rq:
8586err: 8362err:
8587 return 0; 8363 return 0;
8588} 8364}
8589
8590static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8591{
8592 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8593 &cpu_rq(cpu)->leaf_rt_rq_list);
8594}
8595
8596static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8597{
8598 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8599}
8600#else /* !CONFIG_RT_GROUP_SCHED */ 8365#else /* !CONFIG_RT_GROUP_SCHED */
8601static inline void free_rt_sched_group(struct task_group *tg) 8366static inline void free_rt_sched_group(struct task_group *tg)
8602{ 8367{
@@ -8607,14 +8372,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8607{ 8372{
8608 return 1; 8373 return 1;
8609} 8374}
8610
8611static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8612{
8613}
8614
8615static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8616{
8617}
8618#endif /* CONFIG_RT_GROUP_SCHED */ 8375#endif /* CONFIG_RT_GROUP_SCHED */
8619 8376
8620#ifdef CONFIG_CGROUP_SCHED 8377#ifdef CONFIG_CGROUP_SCHED
@@ -8630,7 +8387,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8630{ 8387{
8631 struct task_group *tg; 8388 struct task_group *tg;
8632 unsigned long flags; 8389 unsigned long flags;
8633 int i;
8634 8390
8635 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8391 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8636 if (!tg) 8392 if (!tg)
@@ -8643,10 +8399,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8643 goto err; 8399 goto err;
8644 8400
8645 spin_lock_irqsave(&task_group_lock, flags); 8401 spin_lock_irqsave(&task_group_lock, flags);
8646 for_each_possible_cpu(i) {
8647 register_fair_sched_group(tg, i);
8648 register_rt_sched_group(tg, i);
8649 }
8650 list_add_rcu(&tg->list, &task_groups); 8402 list_add_rcu(&tg->list, &task_groups);
8651 8403
8652 WARN_ON(!parent); /* root should already exist */ 8404 WARN_ON(!parent); /* root should already exist */
@@ -8676,11 +8428,11 @@ void sched_destroy_group(struct task_group *tg)
8676 unsigned long flags; 8428 unsigned long flags;
8677 int i; 8429 int i;
8678 8430
8679 spin_lock_irqsave(&task_group_lock, flags); 8431 /* end participation in shares distribution */
8680 for_each_possible_cpu(i) { 8432 for_each_possible_cpu(i)
8681 unregister_fair_sched_group(tg, i); 8433 unregister_fair_sched_group(tg, i);
8682 unregister_rt_sched_group(tg, i); 8434
8683 } 8435 spin_lock_irqsave(&task_group_lock, flags);
8684 list_del_rcu(&tg->list); 8436 list_del_rcu(&tg->list);
8685 list_del_rcu(&tg->siblings); 8437 list_del_rcu(&tg->siblings);
8686 spin_unlock_irqrestore(&task_group_lock, flags); 8438 spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8727,33 +8479,6 @@ void sched_move_task(struct task_struct *tsk)
8727#endif /* CONFIG_CGROUP_SCHED */ 8479#endif /* CONFIG_CGROUP_SCHED */
8728 8480
8729#ifdef CONFIG_FAIR_GROUP_SCHED 8481#ifdef CONFIG_FAIR_GROUP_SCHED
8730static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8731{
8732 struct cfs_rq *cfs_rq = se->cfs_rq;
8733 int on_rq;
8734
8735 on_rq = se->on_rq;
8736 if (on_rq)
8737 dequeue_entity(cfs_rq, se, 0);
8738
8739 se->load.weight = shares;
8740 se->load.inv_weight = 0;
8741
8742 if (on_rq)
8743 enqueue_entity(cfs_rq, se, 0);
8744}
8745
8746static void set_se_shares(struct sched_entity *se, unsigned long shares)
8747{
8748 struct cfs_rq *cfs_rq = se->cfs_rq;
8749 struct rq *rq = cfs_rq->rq;
8750 unsigned long flags;
8751
8752 raw_spin_lock_irqsave(&rq->lock, flags);
8753 __set_se_shares(se, shares);
8754 raw_spin_unlock_irqrestore(&rq->lock, flags);
8755}
8756
8757static DEFINE_MUTEX(shares_mutex); 8482static DEFINE_MUTEX(shares_mutex);
8758 8483
8759int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8484int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8776,37 +8501,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8776 if (tg->shares == shares) 8501 if (tg->shares == shares)
8777 goto done; 8502 goto done;
8778 8503
8779 spin_lock_irqsave(&task_group_lock, flags);
8780 for_each_possible_cpu(i)
8781 unregister_fair_sched_group(tg, i);
8782 list_del_rcu(&tg->siblings);
8783 spin_unlock_irqrestore(&task_group_lock, flags);
8784
8785 /* wait for any ongoing reference to this group to finish */
8786 synchronize_sched();
8787
8788 /*
8789 * Now we are free to modify the group's share on each cpu
8790 * w/o tripping rebalance_share or load_balance_fair.
8791 */
8792 tg->shares = shares; 8504 tg->shares = shares;
8793 for_each_possible_cpu(i) { 8505 for_each_possible_cpu(i) {
8794 /* 8506 struct rq *rq = cpu_rq(i);
8795 * force a rebalance 8507 struct sched_entity *se;
8796 */ 8508
8797 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8509 se = tg->se[i];
8798 set_se_shares(tg->se[i], shares); 8510 /* Propagate contribution to hierarchy */
8511 raw_spin_lock_irqsave(&rq->lock, flags);
8512 for_each_sched_entity(se)
8513 update_cfs_shares(group_cfs_rq(se), 0);
8514 raw_spin_unlock_irqrestore(&rq->lock, flags);
8799 } 8515 }
8800 8516
8801 /*
8802 * Enable load balance activity on this group, by inserting it back on
8803 * each cpu's rq->leaf_cfs_rq_list.
8804 */
8805 spin_lock_irqsave(&task_group_lock, flags);
8806 for_each_possible_cpu(i)
8807 register_fair_sched_group(tg, i);
8808 list_add_rcu(&tg->siblings, &tg->parent->children);
8809 spin_unlock_irqrestore(&task_group_lock, flags);
8810done: 8517done:
8811 mutex_unlock(&shares_mutex); 8518 mutex_unlock(&shares_mutex);
8812 return 0; 8519 return 0;
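
sched_group_set_shares() no longer tears the group out of the leaf lists; it stores the new tg->shares and, for each cpu under rq->lock, walks the entity hierarchy bottom-up so every level recomputes its share. A toy walk, with explicit parent pointers standing in for for_each_sched_entity():

#include <stdio.h>

struct entity {
	const char *name;
	struct entity *parent;
};

static void propagate_shares(struct entity *se)
{
	/* for_each_sched_entity(se) walks se = se->parent in the kernel */
	for (; se; se = se->parent)
		printf("update_cfs_shares(%s)\n", se->name);
}

int main(void)
{
	struct entity root = { "root", NULL };
	struct entity mid  = { "mid",  &root };
	struct entity leaf = { "leaf", &mid };

	propagate_shares(&leaf);
	return 0;
}
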
@@ -9532,72 +9239,3 @@ struct cgroup_subsys cpuacct_subsys = {
9532}; 9239};
9533#endif /* CONFIG_CGROUP_CPUACCT */ 9240#endif /* CONFIG_CGROUP_CPUACCT */
9534 9241
9535#ifndef CONFIG_SMP
9536
9537void synchronize_sched_expedited(void)
9538{
9539 barrier();
9540}
9541EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9542
9543#else /* #ifndef CONFIG_SMP */
9544
9545static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9546
9547static int synchronize_sched_expedited_cpu_stop(void *data)
9548{
9549 /*
9550 * There must be a full memory barrier on each affected CPU
9551 * between the time that try_stop_cpus() is called and the
9552 * time that it returns.
9553 *
9554 * In the current initial implementation of cpu_stop, the
9555 * above condition is already met when the control reaches
9556 * this point and the following smp_mb() is not strictly
9557 * necessary. Do smp_mb() anyway for documentation and
9558 * robustness against future implementation changes.
9559 */
9560 smp_mb(); /* See above comment block. */
9561 return 0;
9562}
9563
9564/*
9565 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
9566 * approach to force grace period to end quickly. This consumes
9567 * significant time on all CPUs, and is thus not recommended for
9568 * any sort of common-case code.
9569 *
9570 * Note that it is illegal to call this function while holding any
9571 * lock that is acquired by a CPU-hotplug notifier. Failing to
9572 * observe this restriction will result in deadlock.
9573 */
9574void synchronize_sched_expedited(void)
9575{
9576 int snap, trycount = 0;
9577
9578 smp_mb(); /* ensure prior mod happens before capturing snap. */
9579 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9580 get_online_cpus();
9581 while (try_stop_cpus(cpu_online_mask,
9582 synchronize_sched_expedited_cpu_stop,
9583 NULL) == -EAGAIN) {
9584 put_online_cpus();
9585 if (trycount++ < 10)
9586 udelay(trycount * num_online_cpus());
9587 else {
9588 synchronize_sched();
9589 return;
9590 }
9591 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9592 smp_mb(); /* ensure test happens before caller kfree */
9593 return;
9594 }
9595 get_online_cpus();
9596 }
9597 atomic_inc(&synchronize_sched_expedited_count);
9598 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9599 put_online_cpus();
9600}
9601EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9602
9603#endif /* #else #ifndef CONFIG_SMP */
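
The synchronize_sched_expedited() code removed above keys its retry loop to a global completion counter: a caller snapshots the count it needs, and if other callers push the counter past that snapshot the retry can return early. A single-threaded sketch of just that snapshot test:

#include <stdio.h>

static int gp_count;	/* models synchronize_sched_expedited_count */

/* true once some caller completed a grace period past our snapshot */
static int gp_done_since(int snap)
{
	return gp_count - snap > 0;
}

int main(void)
{
	int snap = gp_count + 1;	/* the grace period this caller needs */

	printf("early exit? %d\n", gp_done_since(snap));	/* 0 */
	gp_count += 2;			/* two other callers complete */
	printf("early exit? %d\n", gp_done_since(snap));	/* 1 */
	return 0;
}
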
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
new file mode 100644
index 000000000000..c80fedcd476b
--- /dev/null
+++ b/kernel/sched_autogroup.c
@@ -0,0 +1,238 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3#include <linux/proc_fs.h>
4#include <linux/seq_file.h>
5#include <linux/kallsyms.h>
6#include <linux/utsname.h>
7
8unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
9static struct autogroup autogroup_default;
10static atomic_t autogroup_seq_nr;
11
12static void autogroup_init(struct task_struct *init_task)
13{
14 autogroup_default.tg = &init_task_group;
15 init_task_group.autogroup = &autogroup_default;
16 kref_init(&autogroup_default.kref);
17 init_rwsem(&autogroup_default.lock);
18 init_task->signal->autogroup = &autogroup_default;
19}
20
21static inline void autogroup_free(struct task_group *tg)
22{
23 kfree(tg->autogroup);
24}
25
26static inline void autogroup_destroy(struct kref *kref)
27{
28 struct autogroup *ag = container_of(kref, struct autogroup, kref);
29
30 sched_destroy_group(ag->tg);
31}
32
33static inline void autogroup_kref_put(struct autogroup *ag)
34{
35 kref_put(&ag->kref, autogroup_destroy);
36}
37
38static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
39{
40 kref_get(&ag->kref);
41 return ag;
42}
43
44static inline struct autogroup *autogroup_task_get(struct task_struct *p)
45{
46 struct autogroup *ag;
47 unsigned long flags;
48
49 if (!lock_task_sighand(p, &flags))
50 return autogroup_kref_get(&autogroup_default);
51
52 ag = autogroup_kref_get(p->signal->autogroup);
53 unlock_task_sighand(p, &flags);
54
55 return ag;
56}
57
58static inline struct autogroup *autogroup_create(void)
59{
60 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
61 struct task_group *tg;
62
63 if (!ag)
64 goto out_fail;
65
66 tg = sched_create_group(&init_task_group);
67
68 if (IS_ERR(tg))
69 goto out_free;
70
71 kref_init(&ag->kref);
72 init_rwsem(&ag->lock);
73 ag->id = atomic_inc_return(&autogroup_seq_nr);
74 ag->tg = tg;
75 tg->autogroup = ag;
76
77 return ag;
78
79out_free:
80 kfree(ag);
81out_fail:
82 if (printk_ratelimit()) {
83 printk(KERN_WARNING "autogroup_create: %s failure.\n",
84 ag ? "sched_create_group()" : "kzalloc()");
85 }
86
87 return autogroup_kref_get(&autogroup_default);
88}
89
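
autogroup_create() is plain kref usage: the group starts at one reference, each signal_struct that adopts it takes its own, and the final put runs autogroup_destroy(). A single-threaded model of that lifecycle (a real kref uses an atomic counter):

#include <stdio.h>
#include <stdlib.h>

struct toy_ag {
	int refs;
};

static struct toy_ag *ag_create(void)
{
	struct toy_ag *a = malloc(sizeof(*a));

	a->refs = 1;			/* kref_init() */
	return a;
}

static struct toy_ag *ag_get(struct toy_ag *a)
{
	a->refs++;			/* kref_get() */
	return a;
}

static void ag_put(struct toy_ag *a)
{
	if (--a->refs == 0) {		/* kref_put() -> autogroup_destroy() */
		printf("destroying group\n");
		free(a);
	}
}

int main(void)
{
	struct toy_ag *a = ag_create();	/* creator's reference */

	ag_get(a);	/* signal->autogroup takes its own reference */
	ag_put(a);	/* creator drops the extra one, as in create_attach */
	ag_put(a);	/* last signal_struct exits: group destroyed */
	return 0;
}
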
90static inline bool
91task_wants_autogroup(struct task_struct *p, struct task_group *tg)
92{
93 if (tg != &root_task_group)
94 return false;
95
96 if (p->sched_class != &fair_sched_class)
97 return false;
98
99 /*
100 * We can only assume the task group can't go away on us if
101 * autogroup_move_group() can see us on ->thread_group list.
102 */
103 if (p->flags & PF_EXITING)
104 return false;
105
106 return true;
107}
108
109static inline struct task_group *
110autogroup_task_group(struct task_struct *p, struct task_group *tg)
111{
112 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
113
114 if (enabled && task_wants_autogroup(p, tg))
115 return p->signal->autogroup->tg;
116
117 return tg;
118}
119
120static void
121autogroup_move_group(struct task_struct *p, struct autogroup *ag)
122{
123 struct autogroup *prev;
124 struct task_struct *t;
125 unsigned long flags;
126
127 BUG_ON(!lock_task_sighand(p, &flags));
128
129 prev = p->signal->autogroup;
130 if (prev == ag) {
131 unlock_task_sighand(p, &flags);
132 return;
133 }
134
135 p->signal->autogroup = autogroup_kref_get(ag);
136
137 t = p;
138 do {
139 sched_move_task(t);
140 } while_each_thread(p, t);
141
142 unlock_task_sighand(p, &flags);
143 autogroup_kref_put(prev);
144}
145
146/* Allocates GFP_KERNEL, cannot be called under any spinlock */
147void sched_autogroup_create_attach(struct task_struct *p)
148{
149 struct autogroup *ag = autogroup_create();
150
151 autogroup_move_group(p, ag);
152 /* drop extra reference added by autogroup_create() */
153 autogroup_kref_put(ag);
154}
155EXPORT_SYMBOL(sched_autogroup_create_attach);
156
157/* Cannot be called under siglock. Currently has no users */
158void sched_autogroup_detach(struct task_struct *p)
159{
160 autogroup_move_group(p, &autogroup_default);
161}
162EXPORT_SYMBOL(sched_autogroup_detach);
163
164void sched_autogroup_fork(struct signal_struct *sig)
165{
166 sig->autogroup = autogroup_task_get(current);
167}
168
169void sched_autogroup_exit(struct signal_struct *sig)
170{
171 autogroup_kref_put(sig->autogroup);
172}
173
174static int __init setup_autogroup(char *str)
175{
176 sysctl_sched_autogroup_enabled = 0;
177
178 return 1;
179}
180
181__setup("noautogroup", setup_autogroup);
182
183#ifdef CONFIG_PROC_FS
184
185int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
186{
187 static unsigned long next = INITIAL_JIFFIES;
188 struct autogroup *ag;
189 int err;
190
191 if (*nice < -20 || *nice > 19)
192 return -EINVAL;
193
194 err = security_task_setnice(current, *nice);
195 if (err)
196 return err;
197
198 if (*nice < 0 && !can_nice(current, *nice))
199 return -EPERM;
200
201 /* this is a heavy operation taking global locks.. */
202 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
203 return -EAGAIN;
204
205 next = HZ / 10 + jiffies;
206 ag = autogroup_task_get(p);
207
208 down_write(&ag->lock);
209 err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
210 if (!err)
211 ag->nice = *nice;
212 up_write(&ag->lock);
213
214 autogroup_kref_put(ag);
215
216 return err;
217}
218
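
The shares value handed to sched_group_set_shares() is prio_to_weight[*nice + 20], the same nice-to-weight table used for tasks, where each nice step scales the weight by roughly 1.25x. A few entries, quoted from the scheduler's table for illustration:

#include <stdio.h>

static const struct { int nice; int weight; } prio_to_weight_excerpt[] = {
	{ -20, 88761 },		/* most favourable */
	{  -1,  1277 },
	{   0,  1024 },		/* NICE_0_LOAD */
	{   1,   820 },
	{  19,    15 },		/* least favourable */
};

int main(void)
{
	unsigned int i;

	for (i = 0; i < sizeof(prio_to_weight_excerpt) /
			sizeof(prio_to_weight_excerpt[0]); i++)
		printf("nice %3d -> weight %5d\n",
		       prio_to_weight_excerpt[i].nice,
		       prio_to_weight_excerpt[i].weight);
	return 0;
}
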
219void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
220{
221 struct autogroup *ag = autogroup_task_get(p);
222
223 down_read(&ag->lock);
224 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
225 up_read(&ag->lock);
226
227 autogroup_kref_put(ag);
228}
229#endif /* CONFIG_PROC_FS */
230
231#ifdef CONFIG_SCHED_DEBUG
232static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
233{
234 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
235}
236#endif /* CONFIG_SCHED_DEBUG */
237
238#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
new file mode 100644
index 000000000000..5358e241cb20
--- /dev/null
+++ b/kernel/sched_autogroup.h
@@ -0,0 +1,32 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3struct autogroup {
4 struct kref kref;
5 struct task_group *tg;
6 struct rw_semaphore lock;
7 unsigned long id;
8 int nice;
9};
10
11static inline struct task_group *
12autogroup_task_group(struct task_struct *p, struct task_group *tg);
13
14#else /* !CONFIG_SCHED_AUTOGROUP */
15
16static inline void autogroup_init(struct task_struct *init_task) { }
17static inline void autogroup_free(struct task_group *tg) { }
18
19static inline struct task_group *
20autogroup_task_group(struct task_struct *p, struct task_group *tg)
21{
22 return tg;
23}
24
25#ifdef CONFIG_SCHED_DEBUG
26static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
27{
28 return 0;
29}
30#endif
31
32#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 52f1a149bfb1..9d8af0b3fb64 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
79} 79}
80EXPORT_SYMBOL_GPL(sched_clock); 80EXPORT_SYMBOL_GPL(sched_clock);
81 81
82static __read_mostly int sched_clock_running; 82__read_mostly int sched_clock_running;
83 83
84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
85__read_mostly int sched_clock_stable; 85__read_mostly int sched_clock_stable;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 2e1b0d17dd9b..1dfae3d014b5 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -54,8 +54,7 @@ static unsigned long nsec_low(unsigned long long nsec)
54#define SPLIT_NS(x) nsec_high(x), nsec_low(x) 54#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
55 55
56#ifdef CONFIG_FAIR_GROUP_SCHED 56#ifdef CONFIG_FAIR_GROUP_SCHED
57static void print_cfs_group_stats(struct seq_file *m, int cpu, 57static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
58 struct task_group *tg)
59{ 58{
60 struct sched_entity *se = tg->se[cpu]; 59 struct sched_entity *se = tg->se[cpu];
61 if (!se) 60 if (!se)
@@ -110,16 +109,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 109 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
111#endif 110#endif
112 111
113#ifdef CONFIG_CGROUP_SCHED
114 {
115 char path[64];
116
117 rcu_read_lock();
118 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
119 rcu_read_unlock();
120 SEQ_printf(m, " %s", path);
121 }
122#endif
123 SEQ_printf(m, "\n"); 112 SEQ_printf(m, "\n");
124} 113}
125 114
@@ -147,19 +136,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
147 read_unlock_irqrestore(&tasklist_lock, flags); 136 read_unlock_irqrestore(&tasklist_lock, flags);
148} 137}
149 138
150#if defined(CONFIG_CGROUP_SCHED) && \
151 (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
152static void task_group_path(struct task_group *tg, char *buf, int buflen)
153{
154 /* may be NULL if the underlying cgroup isn't fully-created yet */
155 if (!tg->css.cgroup) {
156 buf[0] = '\0';
157 return;
158 }
159 cgroup_path(tg->css.cgroup, buf, buflen);
160}
161#endif
162
163void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 139void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
164{ 140{
165 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, 141 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
@@ -168,16 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
168 struct sched_entity *last; 144 struct sched_entity *last;
169 unsigned long flags; 145 unsigned long flags;
170 146
171#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
172 char path[128];
173 struct task_group *tg = cfs_rq->tg;
174
175 task_group_path(tg, path, sizeof(path));
176
177 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
178#else
179 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 147 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
180#endif
181 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 148 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
182 SPLIT_NS(cfs_rq->exec_clock)); 149 SPLIT_NS(cfs_rq->exec_clock));
183 150
@@ -202,32 +169,29 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
202 spread0 = min_vruntime - rq0_min_vruntime; 169 spread0 = min_vruntime - rq0_min_vruntime;
203 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", 170 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
204 SPLIT_NS(spread0)); 171 SPLIT_NS(spread0));
205 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207
208 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 172 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
209 cfs_rq->nr_spread_over); 173 cfs_rq->nr_spread_over);
174 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
175 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
210#ifdef CONFIG_FAIR_GROUP_SCHED 176#ifdef CONFIG_FAIR_GROUP_SCHED
211#ifdef CONFIG_SMP 177#ifdef CONFIG_SMP
212 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); 178 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
179 SPLIT_NS(cfs_rq->load_avg));
180 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
181 SPLIT_NS(cfs_rq->load_period));
182 SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
183 cfs_rq->load_contribution);
184 SEQ_printf(m, " .%-30s: %d\n", "load_tg",
185 atomic_read(&cfs_rq->tg->load_weight));
213#endif 186#endif
187
214 print_cfs_group_stats(m, cpu, cfs_rq->tg); 188 print_cfs_group_stats(m, cpu, cfs_rq->tg);
215#endif 189#endif
216} 190}
217 191
218void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) 192void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
219{ 193{
220#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
221 char path[128];
222 struct task_group *tg = rt_rq->tg;
223
224 task_group_path(tg, path, sizeof(path));
225
226 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
227#else
228 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); 194 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
229#endif
230
231 195
232#define P(x) \ 196#define P(x) \
233 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) 197 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
@@ -243,6 +207,8 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
243#undef P 207#undef P
244} 208}
245 209
210extern __read_mostly int sched_clock_running;
211
246static void print_cpu(struct seq_file *m, int cpu) 212static void print_cpu(struct seq_file *m, int cpu)
247{ 213{
248 struct rq *rq = cpu_rq(cpu); 214 struct rq *rq = cpu_rq(cpu);
@@ -314,21 +280,42 @@ static const char *sched_tunable_scaling_names[] = {
314 280
315static int sched_debug_show(struct seq_file *m, void *v) 281static int sched_debug_show(struct seq_file *m, void *v)
316{ 282{
317 u64 now = ktime_to_ns(ktime_get()); 283 u64 ktime, sched_clk, cpu_clk;
284 unsigned long flags;
318 int cpu; 285 int cpu;
319 286
320 SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", 287 local_irq_save(flags);
288 ktime = ktime_to_ns(ktime_get());
289 sched_clk = sched_clock();
290 cpu_clk = local_clock();
291 local_irq_restore(flags);
292
293 SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
321 init_utsname()->release, 294 init_utsname()->release,
322 (int)strcspn(init_utsname()->version, " "), 295 (int)strcspn(init_utsname()->version, " "),
323 init_utsname()->version); 296 init_utsname()->version);
324 297
325 SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); 298#define P(x) \
299 SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
300#define PN(x) \
301 SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
302 PN(ktime);
303 PN(sched_clk);
304 PN(cpu_clk);
305 P(jiffies);
306#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
307 P(sched_clock_stable);
308#endif
309#undef PN
310#undef P
311
312 SEQ_printf(m, "\n");
313 SEQ_printf(m, "sysctl_sched\n");
326 314
327#define P(x) \ 315#define P(x) \
328 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) 316 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
329#define PN(x) \ 317#define PN(x) \
330 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) 318 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
331 P(jiffies);
332 PN(sysctl_sched_latency); 319 PN(sysctl_sched_latency);
333 PN(sysctl_sched_min_granularity); 320 PN(sysctl_sched_min_granularity);
334 PN(sysctl_sched_wakeup_granularity); 321 PN(sysctl_sched_wakeup_granularity);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 00ebd7686676..c62ebae65cf0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
89 89
90const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 90const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
91 91
92/*
93 * The exponential sliding window over which load is averaged for shares
94 * distribution.
95 * (default: 10msec)
96 */
97unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
98
92static const struct sched_class fair_sched_class; 99static const struct sched_class fair_sched_class;
93 100
94/************************************************************** 101/**************************************************************
@@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
143 return cfs_rq->tg->cfs_rq[this_cpu]; 150 return cfs_rq->tg->cfs_rq[this_cpu];
144} 151}
145 152
153static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
154{
155 if (!cfs_rq->on_list) {
156 /*
157 * Ensure we either appear before our parent (if already
158 * enqueued) or force our parent to appear after us when it is
159 * enqueued. The fact that we always enqueue bottom-up
160 * reduces this to two cases.
161 */
162 if (cfs_rq->tg->parent &&
163 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
164 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
165 &rq_of(cfs_rq)->leaf_cfs_rq_list);
166 } else {
167 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
168 &rq_of(cfs_rq)->leaf_cfs_rq_list);
169 }
170
171 cfs_rq->on_list = 1;
172 }
173}
174
175static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
176{
177 if (cfs_rq->on_list) {
178 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
179 cfs_rq->on_list = 0;
180 }
181}
182
146/* Iterate thr' all leaf cfs_rq's on a runqueue */ 183/* Iterate thr' all leaf cfs_rq's on a runqueue */
147#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 184#define for_each_leaf_cfs_rq(rq, cfs_rq) \
148 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 185 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
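
list_add_leaf_cfs_rq() keeps rq->leaf_cfs_rq_list ordered child-before-parent: enqueue at the head when the parent is already listed, at the tail otherwise, which works because enqueueing is always bottom-up. A toy array-backed version showing both branches produce that order:

#include <stdio.h>
#include <string.h>

#define MAX 8
static const char *list[MAX];
static int n;

static int listed(const char *name)
{
	int i;

	for (i = 0; i < n; i++)
		if (!strcmp(list[i], name))
			return 1;
	return 0;
}

static void add_leaf(const char *name, const char *parent)
{
	int i;

	if (parent && listed(parent)) {	/* head: appear before parent */
		for (i = n; i > 0; i--)
			list[i] = list[i - 1];
		list[0] = name;
	} else {			/* tail: parent will land after us */
		list[n] = name;
	}
	n++;
}

int main(void)
{
	int i;

	add_leaf("child", "parent");	/* parent not listed yet: tail */
	add_leaf("parent", NULL);	/* lands behind its child */
	add_leaf("sibling", "parent");	/* parent listed: head, before it */

	for (i = 0; i < n; i++)		/* sibling, child, parent */
		printf("%s\n", list[i]);
	return 0;
}
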
@@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
246 return &cpu_rq(this_cpu)->cfs; 283 return &cpu_rq(this_cpu)->cfs;
247} 284}
248 285
286static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
287{
288}
289
290static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
291{
292}
293
249#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 294#define for_each_leaf_cfs_rq(rq, cfs_rq) \
250 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 295 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
251 296
@@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
417 WRT_SYSCTL(sched_min_granularity); 462 WRT_SYSCTL(sched_min_granularity);
418 WRT_SYSCTL(sched_latency); 463 WRT_SYSCTL(sched_latency);
419 WRT_SYSCTL(sched_wakeup_granularity); 464 WRT_SYSCTL(sched_wakeup_granularity);
420 WRT_SYSCTL(sched_shares_ratelimit);
421#undef WRT_SYSCTL 465#undef WRT_SYSCTL
422 466
423 return 0; 467 return 0;
@@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
495 return calc_delta_fair(sched_slice(cfs_rq, se), se); 539 return calc_delta_fair(sched_slice(cfs_rq, se), se);
496} 540}
497 541
542static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
543static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
544
498/* 545/*
499 * Update the current task's runtime statistics. Skip current tasks that 546 * Update the current task's runtime statistics. Skip current tasks that
500 * are not in our scheduling class. 547 * are not in our scheduling class.
@@ -514,6 +561,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
514 561
515 curr->vruntime += delta_exec_weighted; 562 curr->vruntime += delta_exec_weighted;
516 update_min_vruntime(cfs_rq); 563 update_min_vruntime(cfs_rq);
564
565#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
566 cfs_rq->load_unacc_exec_time += delta_exec;
567#endif
517} 568}
518 569
519static void update_curr(struct cfs_rq *cfs_rq) 570static void update_curr(struct cfs_rq *cfs_rq)
@@ -633,7 +684,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
633 list_add(&se->group_node, &cfs_rq->tasks); 684 list_add(&se->group_node, &cfs_rq->tasks);
634 } 685 }
635 cfs_rq->nr_running++; 686 cfs_rq->nr_running++;
636 se->on_rq = 1;
637} 687}
638 688
639static void 689static void
@@ -647,9 +697,140 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
647 list_del_init(&se->group_node); 697 list_del_init(&se->group_node);
648 } 698 }
649 cfs_rq->nr_running--; 699 cfs_rq->nr_running--;
650 se->on_rq = 0;
651} 700}
652 701
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
+					    int global_update)
+{
+	struct task_group *tg = cfs_rq->tg;
+	long load_avg;
+
+	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+	load_avg -= cfs_rq->load_contribution;
+
+	if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
+		atomic_add(load_avg, &tg->load_weight);
+		cfs_rq->load_contribution += load_avg;
+	}
+}
+
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+	u64 period = sysctl_sched_shares_window;
+	u64 now, delta;
+	unsigned long load = cfs_rq->load.weight;
+
+	if (!cfs_rq)
+		return;
+
+	now = rq_of(cfs_rq)->clock;
+	delta = now - cfs_rq->load_stamp;
+
+	/* truncate load history at 4 idle periods */
+	if (cfs_rq->load_stamp > cfs_rq->load_last &&
+	    now - cfs_rq->load_last > 4 * period) {
+		cfs_rq->load_period = 0;
+		cfs_rq->load_avg = 0;
+	}
+
+	cfs_rq->load_stamp = now;
+	cfs_rq->load_unacc_exec_time = 0;
+	cfs_rq->load_period += delta;
+	if (load) {
+		cfs_rq->load_last = now;
+		cfs_rq->load_avg += delta * load;
+	}
+
+	/* consider updating load contribution on each fold or truncate */
+	if (global_update || cfs_rq->load_period > period
+	    || !cfs_rq->load_period)
+		update_cfs_rq_load_contribution(cfs_rq, global_update);
+
+	while (cfs_rq->load_period > period) {
+		/*
+		 * Inline assembly required to prevent the compiler
+		 * optimising this loop into a divmod call.
+		 * See __iter_div_u64_rem() for another example of this.
+		 */
+		asm("" : "+rm" (cfs_rq->load_period));
+		cfs_rq->load_period /= 2;
+		cfs_rq->load_avg /= 2;
+	}
+
+	if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
+		list_del_leaf_cfs_rq(cfs_rq);
+}
+
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+			    unsigned long weight)
+{
+	if (se->on_rq) {
+		/* commit outstanding execution time */
+		if (cfs_rq->curr == se)
+			update_curr(cfs_rq);
+		account_entity_dequeue(cfs_rq, se);
+	}
+
+	update_load_set(&se->load, weight);
+
+	if (se->on_rq)
+		account_entity_enqueue(cfs_rq, se);
+}
+
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+	struct task_group *tg;
+	struct sched_entity *se;
+	long load_weight, load, shares;
+
+	if (!cfs_rq)
+		return;
+
+	tg = cfs_rq->tg;
+	se = tg->se[cpu_of(rq_of(cfs_rq))];
+	if (!se)
+		return;
+
+	load = cfs_rq->load.weight + weight_delta;
+
+	load_weight = atomic_read(&tg->load_weight);
+	load_weight -= cfs_rq->load_contribution;
+	load_weight += load;
+
+	shares = (tg->shares * load);
+	if (load_weight)
+		shares /= load_weight;
+
+	if (shares < MIN_SHARES)
+		shares = MIN_SHARES;
+	if (shares > tg->shares)
+		shares = tg->shares;
+
+	reweight_entity(cfs_rq_of(se), se, shares);
+}
+
+static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
+		update_cfs_load(cfs_rq, 0);
+		update_cfs_shares(cfs_rq, 0);
+	}
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+}
+
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 #ifdef CONFIG_SCHEDSTATS
@@ -771,6 +952,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
+	update_cfs_load(cfs_rq, 0);
+	update_cfs_shares(cfs_rq, se->load.weight);
 	account_entity_enqueue(cfs_rq, se);

 	if (flags & ENQUEUE_WAKEUP) {
@@ -782,6 +965,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	check_spread(cfs_rq, se);
 	if (se != cfs_rq->curr)
 		__enqueue_entity(cfs_rq, se);
+	se->on_rq = 1;
+
+	if (cfs_rq->nr_running == 1)
+		list_add_leaf_cfs_rq(cfs_rq);
 }

 static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +1012,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)

 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
+	se->on_rq = 0;
+	update_cfs_load(cfs_rq, 0);
 	account_entity_dequeue(cfs_rq, se);
 	update_min_vruntime(cfs_rq);
+	update_cfs_shares(cfs_rq, 0);

 	/*
 	 * Normalize the entity after updating the min_vruntime because the
@@ -955,6 +1145,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 */
 	update_curr(cfs_rq);

+	/*
+	 * Update share accounting for long-running entities.
+	 */
+	update_entity_shares_tick(cfs_rq);
+
 #ifdef CONFIG_SCHED_HRTICK
 	/*
 	 * queued ticks are scheduled to match the slice, so don't bother
@@ -1055,6 +1250,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		flags = ENQUEUE_WAKEUP;
 	}

+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		update_cfs_load(cfs_rq, 0);
+		update_cfs_shares(cfs_rq, 0);
+	}
+
 	hrtick_update(rq);
 }

@@ -1071,12 +1273,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
+
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight)
 			break;
 		flags |= DEQUEUE_SLEEP;
 	}

+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		update_cfs_load(cfs_rq, 0);
+		update_cfs_shares(cfs_rq, 0);
+	}
+
 	hrtick_update(rq);
 }

@@ -1143,51 +1353,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
  * Adding load to a group doesn't make a group heavier, but can cause movement
  * of group shares between cpus. Assuming the shares were perfectly aligned one
  * can calculate the shift in shares.
- *
- * The problem is that perfectly aligning the shares is rather expensive, hence
- * we try to avoid doing that too often - see update_shares(), which ratelimits
- * this change.
- *
- * We compensate this by not only taking the current delta into account, but
- * also considering the delta between when the shares were last adjusted and
- * now.
- *
- * We still saw a performance dip, some tracing learned us that between
- * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
- * significantly. Therefore try to bias the error in direction of failing
- * the affine wakeup.
- *
  */
-static long effective_load(struct task_group *tg, int cpu,
-		long wl, long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];

 	if (!tg->parent)
 		return wl;

-	/*
-	 * By not taking the decrease of shares on the other cpu into
-	 * account our error leans towards reducing the affine wakeups.
-	 */
-	if (!wl && sched_feat(ASYM_EFF_LOAD))
-		return wl;
-
 	for_each_sched_entity(se) {
 		long S, rw, s, a, b;
-		long more_w;
-
-		/*
-		 * Instead of using this increment, also add the difference
-		 * between when the shares were last updated and now.
-		 */
-		more_w = se->my_q->load.weight - se->my_q->rq_weight;
-		wl += more_w;
-		wg += more_w;

 		S = se->my_q->tg->shares;
-		s = se->my_q->shares;
-		rw = se->my_q->rq_weight;
+		s = se->load.weight;
+		rw = se->my_q->load.weight;

 		a = S*(rw + wl);
 		b = S*rw + s*wg;
@@ -1508,23 +1687,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 		sd = tmp;
 	}

-#ifdef CONFIG_FAIR_GROUP_SCHED
-	if (sched_feat(LB_SHARES_UPDATE)) {
-		/*
-		 * Pick the largest domain to update shares over
-		 */
-		tmp = sd;
-		if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
-			tmp = affine_sd;
-
-		if (tmp) {
-			raw_spin_unlock(&rq->lock);
-			update_shares(tmp);
-			raw_spin_lock(&rq->lock);
-		}
-	}
-#endif
-
 	if (affine_sd) {
 		if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
 			return select_idle_sibling(p, cpu);
@@ -1909,6 +2071,48 @@ out:
 }

 #ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * update tg->load_weight by folding this cpu's load_avg
+ */
+static int update_shares_cpu(struct task_group *tg, int cpu)
+{
+	struct cfs_rq *cfs_rq;
+	unsigned long flags;
+	struct rq *rq;
+
+	if (!tg->se[cpu])
+		return 0;
+
+	rq = cpu_rq(cpu);
+	cfs_rq = tg->cfs_rq[cpu];
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+
+	update_rq_clock(rq);
+	update_cfs_load(cfs_rq, 1);
+
+	/*
+	 * We need to update shares after updating tg->load_weight in
+	 * order to adjust the weight of groups with long running tasks.
+	 */
+	update_cfs_shares(cfs_rq, 0);
+
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	return 0;
+}
+
+static void update_shares(int cpu)
+{
+	struct cfs_rq *cfs_rq;
+	struct rq *rq = cpu_rq(cpu);
+
+	rcu_read_lock();
+	for_each_leaf_cfs_rq(rq, cfs_rq)
+		update_shares_cpu(cfs_rq->tg, cpu);
+	rcu_read_unlock();
+}
+
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
@@ -1956,6 +2160,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	return max_load_move - rem_load_move;
 }
 #else
+static inline void update_shares(int cpu)
+{
+}
+
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
@@ -3032,7 +3240,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	schedstat_inc(sd, lb_count[idle]);

 redo:
-	update_shares(sd);
 	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
 				   cpus, balance);

@@ -3174,8 +3381,6 @@ out_one_pinned:
 	else
 		ld_moved = 0;
 out:
-	if (ld_moved)
-		update_shares(sd);
 	return ld_moved;
 }

@@ -3199,6 +3404,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 	 */
 	raw_spin_unlock(&this_rq->lock);

+	update_shares(this_cpu);
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
 		int balance = 1;
@@ -3569,6 +3775,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 	int update_next_balance = 0;
 	int need_serialize;

+	update_shares(cpu);
+
 	for_each_domain(cpu, sd) {
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;
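
The update_cfs_load() hunk above replaces the old ratelimited global share updates with a per-cfs_rq load average that decays geometrically: once the accumulated period exceeds sysctl_sched_shares_window, both the period and the average are repeatedly halved, and the empty asm() exists only to stop the compiler from fusing the two halvings into a divmod. A minimal userspace sketch of that decay, with a made-up 10ms window standing in for the sysctl:

    #include <stdint.h>
    #include <stdio.h>

    #define SHARES_WINDOW_NS 10000000ULL    /* assumed 10ms window */

    struct demo_rq {
        uint64_t load_period;   /* time accumulated toward the window */
        uint64_t load_avg;      /* sum of weight * runtime */
        uint64_t load_stamp;    /* last update timestamp */
    };

    /* Fold history: each full window halves both terms, so old load
     * decays geometrically without ever needing a real division. */
    static void demo_update_load(struct demo_rq *rq, uint64_t now,
                                 unsigned long weight)
    {
        uint64_t delta = now - rq->load_stamp;

        rq->load_stamp = now;
        rq->load_period += delta;
        if (weight)
            rq->load_avg += delta * weight;

        while (rq->load_period > SHARES_WINDOW_NS) {
            rq->load_period /= 2;
            rq->load_avg /= 2;
        }
    }

    int main(void)
    {
        struct demo_rq rq = { 0, 0, 0 };
        uint64_t t = 0;
        int i;

        /* a weight-1024 entity running for five full windows */
        for (i = 1; i <= 5; i++) {
            t += SHARES_WINDOW_NS;
            demo_update_load(&rq, t, 1024);
            printf("window %d: avg weight = %llu\n", i,
                   (unsigned long long)(rq.load_avg / (rq.load_period + 1)));
        }
        return 0;
    }

The printed quotient mirrors div64_u64(load_avg, load_period+1) in update_cfs_rq_load_contribution() and settles near the entity's weight; the sketch needs no compiler barrier because userspace cost models differ.
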
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 185f920ec1a2..68e69acc29b9 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0)
 SCHED_FEAT(HRTICK, 0)
 SCHED_FEAT(DOUBLE_TICK, 0)
 SCHED_FEAT(LB_BIAS, 1)
-SCHED_FEAT(LB_SHARES_UPDATE, 1)
-SCHED_FEAT(ASYM_EFF_LOAD, 1)

 /*
  * Spin-wait on mutex acquisition when the mutex owner is running on
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index bea7d79f7e9c..c914ec747ca6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -183,6 +183,17 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 	return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
 }

+static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+	list_add_rcu(&rt_rq->leaf_rt_rq_list,
+			&rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
+}
+
+static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+	list_del_rcu(&rt_rq->leaf_rt_rq_list);
+}
+
 #define for_each_leaf_rt_rq(rt_rq, rq) \
 	list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)

@@ -276,6 +287,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 	return ktime_to_ns(def_rt_bandwidth.rt_period);
 }

+static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+}
+
+static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+}
+
 #define for_each_leaf_rt_rq(rt_rq, rq) \
 	for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)

@@ -825,6 +844,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
 	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
 		return;

+	if (!rt_rq->rt_nr_running)
+		list_add_leaf_rt_rq(rt_rq);
+
 	if (head)
 		list_add(&rt_se->run_list, queue);
 	else
@@ -844,6 +866,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
 	__clear_bit(rt_se_prio(rt_se), array->bitmap);

 	dec_rt_tasks(rt_se, rt_rq);
+	if (!rt_rq->rt_nr_running)
+		list_del_leaf_rt_rq(rt_rq);
 }

 /*
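
The rt_rq hunks mirror the cfs_rq change: a runqueue is linked onto the per-CPU leaf list only on the 0->1 enqueue transition and unlinked when its task count falls back to zero, so iteration never visits empty queues. A toy sketch of that transition-driven bookkeeping (a plain flag stands in for the RCU list linkage):

    #include <stdio.h>

    struct toy_rq {
        int nr_running;
        int on_leaf_list;   /* stands in for the RCU list membership */
    };

    static void toy_enqueue(struct toy_rq *rq)
    {
        if (!rq->nr_running) {      /* 0 -> 1: becomes visible to iteration */
            rq->on_leaf_list = 1;
            printf("added to leaf list\n");
        }
        rq->nr_running++;
    }

    static void toy_dequeue(struct toy_rq *rq)
    {
        rq->nr_running--;
        if (!rq->nr_running) {      /* back to 0: drop from iteration */
            rq->on_leaf_list = 0;
            printf("removed from leaf list\n");
        }
    }

    int main(void)
    {
        struct toy_rq rq = { 0, 0 };

        toy_enqueue(&rq);
        toy_enqueue(&rq);   /* no list change: already on the list */
        toy_dequeue(&rq);
        toy_dequeue(&rq);
        return 0;
    }
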
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 18f4be0d5fe0..d4d918a91881 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -853,7 +853,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
 					 cpumask_any(cpu_online_mask));
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN: {
-		struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+		static struct sched_param param = {
+			.sched_priority = MAX_RT_PRIO-1
+		};

 		p = per_cpu(ksoftirqd, hotcpu);
 		per_cpu(ksoftirqd, hotcpu) = NULL;
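
Making sched_param static here (and in the trace_selftest.c and watchdog.c hunks below) works because sched_setscheduler() only reads the caller's parameter block, so one shared read-only instance can replace a fresh stack copy on every invocation. A small illustration of the pattern, with 99 standing in for MAX_RT_PRIO-1:

    #include <stdio.h>

    struct sched_param_demo { int sched_priority; };

    /* the callee only reads the parameter block, as sched_setscheduler() does */
    static void set_prio(const struct sched_param_demo *p)
    {
        printf("priority %d\n", p->sched_priority);
    }

    static void hotplug_callback(void)
    {
        /* 'static' moves the constant initializer out of the stack frame;
         * one shared instance serves every call */
        static struct sched_param_demo param = { .sched_priority = 99 };

        set_prio(&param);
    }

    int main(void)
    {
        hotplug_callback();
        return 0;
    }
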
diff --git a/kernel/srcu.c b/kernel/srcu.c
index c71e07500536..98d8c1e80edb 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -31,6 +31,7 @@
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/smp.h>
+#include <linux/delay.h>
 #include <linux/srcu.h>

 static int init_srcu_struct_fields(struct srcu_struct *sp)
@@ -203,9 +204,14 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
 	 * all srcu_read_lock() calls using the old counters have completed.
 	 * Their corresponding critical sections might well be still
 	 * executing, but the srcu_read_lock() primitives themselves
-	 * will have finished executing.
+	 * will have finished executing.  We initially give readers
+	 * an arbitrarily chosen 10 microseconds to get out of their
+	 * SRCU read-side critical sections, then loop waiting 1/HZ
+	 * seconds per iteration.
 	 */

+	if (srcu_readers_active_idx(sp, idx))
+		udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY);
 	while (srcu_readers_active_idx(sp, idx))
 		schedule_timeout_interruptible(1);

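
The SRCU change turns a pure sleep loop into a two-phase wait: one short delay covers the common case of readers finishing almost immediately, and only stragglers pay the coarse 1/HZ sleeps. A userspace sketch of the same spin-then-sleep shape (the countdown is a stand-in for srcu_readers_active_idx()):

    #include <stdio.h>
    #include <unistd.h>

    /* stand-in for srcu_readers_active_idx(); each poll "retires" a reader */
    static int readers = 3;

    static int readers_active(void)
    {
        return readers-- > 0;
    }

    /* two-phase wait mirroring the patched __synchronize_srcu(): one short
     * grace delay for the fast path, then coarse periodic sleeps */
    static void wait_for_readers(void)
    {
        if (readers_active())
            usleep(10);         /* ~10us, like the udelay() */
        while (readers_active())
            usleep(10000);      /* then back off (1/HZ in the kernel) */
    }

    int main(void)
    {
        wait_for_readers();
        printf("all readers done\n");
        return 0;
    }
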
diff --git a/kernel/sys.c b/kernel/sys.c
index 7f5a0cd296a9..2745dcdb6c6c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1080,8 +1080,10 @@ SYSCALL_DEFINE0(setsid)
 	err = session;
 out:
 	write_unlock_irq(&tasklist_lock);
-	if (err > 0)
+	if (err > 0) {
 		proc_sid_connector(group_leader);
+		sched_autogroup_create_attach(group_leader);
+	}
 	return err;
 }

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 46404414d8a7..ae5cbb1e3ced 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -259,8 +259,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */
 static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
 static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-static int min_sched_shares_ratelimit = 100000; /* 100 usec */
-static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
 #endif

 #ifdef CONFIG_COMPACTION
@@ -305,15 +303,6 @@ static struct ctl_table kern_table[] = {
 		.extra2		= &max_wakeup_granularity_ns,
 	},
 	{
-		.procname	= "sched_shares_ratelimit",
-		.data		= &sysctl_sched_shares_ratelimit,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= sched_proc_update_handler,
-		.extra1		= &min_sched_shares_ratelimit,
-		.extra2		= &max_sched_shares_ratelimit,
-	},
-	{
 		.procname	= "sched_tunable_scaling",
 		.data		= &sysctl_sched_tunable_scaling,
 		.maxlen		= sizeof(enum sched_tunable_scaling),
@@ -323,14 +312,6 @@ static struct ctl_table kern_table[] = {
 		.extra2		= &max_sched_tunable_scaling,
 	},
 	{
-		.procname	= "sched_shares_thresh",
-		.data		= &sysctl_sched_shares_thresh,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-	},
-	{
 		.procname	= "sched_migration_cost",
 		.data		= &sysctl_sched_migration_cost,
 		.maxlen		= sizeof(unsigned int),
@@ -352,6 +333,13 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 	{
+		.procname	= "sched_shares_window",
+		.data		= &sysctl_sched_shares_window,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
 		.procname	= "timer_migration",
 		.data		= &sysctl_timer_migration,
 		.maxlen		= sizeof(unsigned int),
@@ -382,6 +370,17 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_SCHED_AUTOGROUP
+	{
+		.procname	= "sched_autogroup_enabled",
+		.data		= &sysctl_sched_autogroup_enabled,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",
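
Both tunables added above surface under /proc/sys/kernel. A quick userspace check, assuming a kernel that carries these patches (on anything else the files simply won't exist and the reads return -1):

    #include <stdio.h>

    /* read one integer sysctl value */
    static long read_sysctl(const char *path)
    {
        long val = -1;
        FILE *f = fopen(path, "r");

        if (!f)
            return -1;
        if (fscanf(f, "%ld", &val) != 1)
            val = -1;
        fclose(f);
        return val;
    }

    int main(void)
    {
        printf("sched_shares_window: %ld\n",
               read_sysctl("/proc/sys/kernel/sched_shares_window"));
        printf("sched_autogroup_enabled: %ld\n",
               read_sysctl("/proc/sys/kernel/sched_autogroup_enabled"));
        return 0;
    }
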
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index ac38fbb176cc..a9ae369925ce 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -21,6 +21,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/math64.h>
+#include <linux/kernel.h>

 /*
  * fixed point arithmetic scale factor for skew
@@ -57,11 +58,11 @@ int timecompare_offset(struct timecompare *sync,
 	int index;
 	int num_samples = sync->num_samples;

-	if (num_samples > sizeof(buffer)/sizeof(buffer[0])) {
+	if (num_samples > ARRAY_SIZE(buffer)) {
 		samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
 		if (!samples) {
 			samples = buffer;
-			num_samples = sizeof(buffer)/sizeof(buffer[0]);
+			num_samples = ARRAY_SIZE(buffer);
 		}
 	} else {
 		samples = buffer;
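
ARRAY_SIZE() wraps exactly the sizeof quotient being replaced here; the kernel version also adds a compile-time check that rejects pointers, which the open-coded form silently accepts. Spelled out in its plain form:

    #include <stdio.h>

    /* the computation ARRAY_SIZE() wraps; it is only meaningful on true
     * arrays, never on pointers, hence the kernel macro's type check */
    #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

    int main(void)
    {
        long long buffer[8];

        printf("%zu elements\n", ARRAY_SIZE(buffer));   /* prints 8 */
        return 0;
    }
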
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 49010d822f72..5bb86da82003 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -32,6 +32,8 @@ struct timekeeper {
 	cycle_t cycle_interval;
 	/* Number of clock shifted nano seconds in one NTP interval. */
 	u64	xtime_interval;
+	/* shifted nano seconds left over when rounding cycle_interval */
+	s64	xtime_remainder;
 	/* Raw nano seconds accumulated per NTP interval. */
 	u32	raw_interval;

@@ -62,7 +64,7 @@ struct timekeeper timekeeper;
 static void timekeeper_setup_internals(struct clocksource *clock)
 {
 	cycle_t interval;
-	u64 tmp;
+	u64 tmp, ntpinterval;

 	timekeeper.clock = clock;
 	clock->cycle_last = clock->read(clock);
@@ -70,6 +72,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)
 	/* Do the ns -> cycle conversion first, using original mult */
 	tmp = NTP_INTERVAL_LENGTH;
 	tmp <<= clock->shift;
+	ntpinterval = tmp;
 	tmp += clock->mult/2;
 	do_div(tmp, clock->mult);
 	if (tmp == 0)
@@ -80,6 +83,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)

 	/* Go back from cycles -> shifted ns */
 	timekeeper.xtime_interval = (u64) interval * clock->mult;
+	timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval;
 	timekeeper.raw_interval =
 		((u64) interval * clock->mult) >> clock->shift;

@@ -719,7 +723,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)

 	/* Accumulate error between NTP and clock interval */
 	timekeeper.ntp_error += tick_length << shift;
-	timekeeper.ntp_error -= timekeeper.xtime_interval <<
+	timekeeper.ntp_error -=
+		(timekeeper.xtime_interval + timekeeper.xtime_remainder) <<
 				(timekeeper.ntp_error_shift + shift);

 	return offset;
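
The xtime_remainder field exists because rounding the NTP interval down to whole clock cycles silently discards a few shifted nanoseconds per interval, which the error-correction logic would otherwise chase forever as a phantom offset. A toy computation with made-up numbers showing what is lost and how tracking the remainder recovers it:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* an NTP interval that does not divide evenly into clock cycles */
        uint64_t ntp_interval = 10000007;   /* shifted ns, made-up value */
        uint64_t cycles = 3;
        uint64_t per_cycle = ntp_interval / cycles;     /* rounded down */
        uint64_t xtime_interval = per_cycle * cycles;   /* what accumulation sees */
        uint64_t xtime_remainder = ntp_interval - xtime_interval;

        /* without the remainder, every interval under-counts by this much */
        printf("lost per interval: %llu\n",
               (unsigned long long)xtime_remainder);
        printf("accounted total:   %llu (true %llu)\n",
               (unsigned long long)(xtime_interval + xtime_remainder),
               (unsigned long long)ntp_interval);
        return 0;
    }
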
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index ab8f5e33fa92..32a19f9397fc 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -79,26 +79,26 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
 {
 	struct hrtimer *timer, tmp;
 	unsigned long next = 0, i;
-	struct rb_node *curr;
+	struct timerqueue_node *curr;
 	unsigned long flags;

 next_one:
 	i = 0;
 	raw_spin_lock_irqsave(&base->cpu_base->lock, flags);

-	curr = base->first;
+	curr = timerqueue_getnext(&base->active);
 	/*
 	 * Crude but we have to do this O(N*N) thing, because
 	 * we have to unlock the base when printing:
 	 */
 	while (curr && i < next) {
-		curr = rb_next(curr);
+		curr = timerqueue_iterate_next(curr);
 		i++;
 	}

 	if (curr) {

-		timer = rb_entry(curr, struct hrtimer, node);
+		timer = container_of(curr, struct hrtimer, node);
 		tmp = *timer;
 		raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);

diff --git a/kernel/timer.c b/kernel/timer.c
index 353b9227c2ec..43ca9936f2d0 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -88,18 +88,6 @@ struct tvec_base boot_tvec_bases;
 EXPORT_SYMBOL(boot_tvec_bases);
 static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;

-/*
- * Note that all tvec_bases are 2 byte aligned and lower bit of
- * base in timer_list is guaranteed to be zero. Use the LSB to
- * indicate whether the timer is deferrable.
- *
- * A deferrable timer will work normally when the system is busy, but
- * will not cause a CPU to come out of idle just to service it; instead,
- * the timer will be serviced when the CPU eventually wakes up with a
- * subsequent non-deferrable timer.
- */
-#define TBASE_DEFERRABLE_FLAG		(0x1)
-
 /* Functions below help us manage 'deferrable' flag */
 static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
 {
@@ -113,8 +101,7 @@ static inline struct tvec_base *tbase_get_base(struct tvec_base *base)

 static inline void timer_set_deferrable(struct timer_list *timer)
 {
-	timer->base = ((struct tvec_base *)((unsigned long)(timer->base) |
-				       TBASE_DEFERRABLE_FLAG));
+	timer->base = TBASE_MAKE_DEFERRED(timer->base);
 }

 static inline void
@@ -343,15 +330,6 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
 }
 EXPORT_SYMBOL_GPL(set_timer_slack);

-
-static inline void set_running_timer(struct tvec_base *base,
-					struct timer_list *timer)
-{
-#ifdef CONFIG_SMP
-	base->running_timer = timer;
-#endif
-}
-
 static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 {
 	unsigned long expires = timer->expires;
@@ -936,15 +914,12 @@ int del_timer(struct timer_list *timer)
 }
 EXPORT_SYMBOL(del_timer);

-#ifdef CONFIG_SMP
 /**
  * try_to_del_timer_sync - Try to deactivate a timer
  * @timer: timer to del
  *
  * This function tries to deactivate a timer. Upon successful (ret >= 0)
  * exit the timer is not queued and the handler is not running on any CPU.
- *
- * It must not be called from interrupt contexts.
  */
 int try_to_del_timer_sync(struct timer_list *timer)
 {
@@ -973,6 +948,7 @@ out:
 }
 EXPORT_SYMBOL(try_to_del_timer_sync);

+#ifdef CONFIG_SMP
 /**
  * del_timer_sync - deactivate a timer and wait for the handler to finish.
  * @timer: the timer to be deactivated
@@ -983,7 +959,7 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
  *
  * Synchronization rules: Callers must prevent restarting of the timer,
  * otherwise this function is meaningless. It must not be called from
- * interrupt contexts. The caller must not hold locks which would prevent
+ * hardirq contexts. The caller must not hold locks which would prevent
  * completion of the timer's handler. The timer's handler must not call
  * add_timer_on(). Upon exit the timer is not queued and the handler is
  * not running on any CPU.
@@ -993,14 +969,16 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
 int del_timer_sync(struct timer_list *timer)
 {
 #ifdef CONFIG_LOCKDEP
-	unsigned long flags;
-
-	local_irq_save(flags);
+	local_bh_disable();
 	lock_map_acquire(&timer->lockdep_map);
 	lock_map_release(&timer->lockdep_map);
-	local_irq_restore(flags);
+	local_bh_enable();
 #endif
-
+	/*
+	 * don't use it in hardirq context, because it
+	 * could lead to deadlock.
+	 */
+	WARN_ON(in_irq());
 	for (;;) {
 		int ret = try_to_del_timer_sync(timer);
 		if (ret >= 0)
@@ -1111,7 +1089,7 @@ static inline void __run_timers(struct tvec_base *base)

 		timer_stats_account_timer(timer);

-		set_running_timer(base, timer);
+		base->running_timer = timer;
 		detach_timer(timer, 1);

 		spin_unlock_irq(&base->lock);
@@ -1119,7 +1097,7 @@ static inline void __run_timers(struct tvec_base *base)
 			spin_lock_irq(&base->lock);
 		}
 	}
-	set_running_timer(base, NULL);
+	base->running_timer = NULL;
 	spin_unlock_irq(&base->lock);
 }

@@ -1249,7 +1227,7 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
  */
 unsigned long get_next_timer_interrupt(unsigned long now)
 {
-	struct tvec_base *base = __get_cpu_var(tvec_bases);
+	struct tvec_base *base = __this_cpu_read(tvec_bases);
 	unsigned long expires;

 	/*
@@ -1298,7 +1276,7 @@ void update_process_times(int user_tick)
  */
 static void run_timer_softirq(struct softirq_action *h)
 {
-	struct tvec_base *base = __get_cpu_var(tvec_bases);
+	struct tvec_base *base = __this_cpu_read(tvec_bases);

 	hrtimer_run_pending();

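
The TBASE_DEFERRABLE_FLAG comment deleted above described a pointer-tagging trick this file still relies on: tvec_base is at least 2-byte aligned, so bit 0 of any pointer to one is always zero and is free to carry the deferrable flag, which is what tbase_get_deferrable() and the new TBASE_MAKE_DEFERRED() manipulate. A standalone sketch of the same trick:

    #include <stdio.h>
    #include <stdint.h>

    #define FLAG_DEFERRABLE 0x1UL   /* lives in the pointer's spare low bit */

    struct base { int dummy; };

    static struct base *tag(struct base *p)
    {
        return (struct base *)((uintptr_t)p | FLAG_DEFERRABLE);
    }

    static int get_flag(struct base *p)
    {
        return (uintptr_t)p & FLAG_DEFERRABLE;
    }

    static struct base *untag(struct base *p)
    {
        return (struct base *)((uintptr_t)p & ~FLAG_DEFERRABLE);
    }

    int main(void)
    {
        static struct base b;   /* statics are suitably aligned */
        struct base *p = tag(&b);

        printf("deferrable=%d, base ok=%d\n", get_flag(p), untag(p) == &b);
        return 0;
    }

The price of the trick is that every dereference must strip the flag first, which is why all access to timer->base in this file goes through the tbase_get_*() helpers.
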
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 53f338190b26..761c510a06c5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
-obj-$(CONFIG_EVENT_TRACING) += power-traces.o
+obj-$(CONFIG_TRACEPOINTS) += power-traces.o
 ifeq ($(CONFIG_TRACING),y)
 obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
 endif
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f8cf959bad45..dc53ecb80589 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1313,12 +1313,10 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)

 	__this_cpu_inc(user_stack_count);

-
-
 	event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
 					  sizeof(*entry), flags, pc);
 	if (!event)
-		return;
+		goto out_drop_count;
 	entry = ring_buffer_event_data(event);

 	entry->tgid		= current->tgid;
@@ -1333,8 +1331,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
 	if (!filter_check_discard(call, entry, buffer, event))
 		ring_buffer_unlock_commit(buffer, event);

+ out_drop_count:
 	__this_cpu_dec(user_stack_count);
-
  out:
 	preempt_enable();
 }
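
The out_drop_count fix is the standard goto-unwind pattern: once user_stack_count has been incremented, every exit path must decrement it, so early exits funnel through labels placed in reverse order of acquisition instead of returning directly. A distilled userspace version:

    #include <stdio.h>
    #include <stdlib.h>

    static int count;

    static int do_work(int fail_early)
    {
        char *buf = NULL;
        int ret = -1;

        count++;                    /* acquired: must be undone on every path */

        buf = malloc(64);
        if (!buf)
            goto out_drop_count;    /* a bare 'return' here would leak the count */

        if (fail_early)
            goto out_free;

        ret = 0;                    /* success path */

    out_free:
        free(buf);
    out_drop_count:
        count--;                    /* labels unwind in reverse acquisition order */
        return ret;
    }

    int main(void)
    {
        do_work(1);
        printf("count after failure path: %d\n", count);    /* 0: nothing leaked */
        return 0;
    }
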
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 155a415b3209..562c56e048fd 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
 static int trace_wakeup_test_thread(void *data)
 {
 	/* Make this a RT thread, doesn't need to be too high */
-	struct sched_param param = { .sched_priority = 5 };
+	static struct sched_param param = { .sched_priority = 5 };
 	struct completion *x = data;

 	sched_setscheduler(current, SCHED_FIFO, &param);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index aaa8dae08236..6e7b575ac33c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -309,7 +309,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
  */
 static int watchdog(void *unused)
 {
-	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+	static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
 	struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);

 	sched_setscheduler(current, SCHED_FIFO, &param);
diff --git a/lib/Makefile b/lib/Makefile
index e6a3763b8212..9e2db72d128e 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -8,7 +8,7 @@ KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
 endif

 lib-y := ctype.o string.o vsprintf.o cmdline.o \
-	 rbtree.o radix-tree.o dump_stack.o \
+	 rbtree.o radix-tree.o dump_stack.o timerqueue.o\
 	 idr.o int_sqrt.o extable.o prio_tree.o \
 	 sha1.o irq_regs.o reciprocal_div.o argv_split.o \
 	 proportions.o prio_heap.o ratelimit.o show_mem.o \
diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c
index 3094318bfea7..b335acb43be2 100644
--- a/lib/dynamic_debug.c
+++ b/lib/dynamic_debug.c
@@ -141,11 +141,10 @@ static void ddebug_change(const struct ddebug_query *query,
 	else if (!dp->flags)
 		dt->num_enabled++;
 	dp->flags = newflags;
-	if (newflags) {
-		jump_label_enable(&dp->enabled);
-	} else {
-		jump_label_disable(&dp->enabled);
-	}
+	if (newflags)
+		dp->enabled = 1;
+	else
+		dp->enabled = 0;
 	if (verbose)
 		printk(KERN_INFO
 			"ddebug: changed %s:%d [%s]%s %s\n",
diff --git a/lib/timerqueue.c b/lib/timerqueue.c
new file mode 100644
index 000000000000..e3a1050e6820
--- /dev/null
+++ b/lib/timerqueue.c
@@ -0,0 +1,107 @@
+/*
+ *  Generic Timer-queue
+ *
+ *  Manages a simple queue of timers, ordered by expiration time.
+ *  Uses rbtrees for quick list adds and expiration.
+ *
+ *  NOTE: All of the following functions need to be serialized
+ *  to avoid races. No locking is done by this library code.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/timerqueue.h>
+#include <linux/rbtree.h>
+#include <linux/module.h>
+
+/**
+ * timerqueue_add - Adds timer to timerqueue.
+ *
+ * @head: head of timerqueue
+ * @node: timer node to be added
+ *
+ * Adds the timer node to the timerqueue, sorted by the
+ * node's expires value.
+ */
+void timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
+{
+	struct rb_node **p = &head->head.rb_node;
+	struct rb_node *parent = NULL;
+	struct timerqueue_node *ptr;
+
+	/* Make sure we don't add nodes that are already added */
+	WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node));
+
+	while (*p) {
+		parent = *p;
+		ptr = rb_entry(parent, struct timerqueue_node, node);
+		if (node->expires.tv64 < ptr->expires.tv64)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&node->node, parent, p);
+	rb_insert_color(&node->node, &head->head);
+
+	if (!head->next || node->expires.tv64 < head->next->expires.tv64)
+		head->next = node;
+}
+EXPORT_SYMBOL_GPL(timerqueue_add);
+
+/**
+ * timerqueue_del - Removes a timer from the timerqueue.
+ *
+ * @head: head of timerqueue
+ * @node: timer node to be removed
+ *
+ * Removes the timer node from the timerqueue.
+ */
+void timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
+{
+	WARN_ON_ONCE(RB_EMPTY_NODE(&node->node));
+
+	/* update next pointer */
+	if (head->next == node) {
+		struct rb_node *rbn = rb_next(&node->node);
+
+		head->next = rbn ?
+			rb_entry(rbn, struct timerqueue_node, node) : NULL;
+	}
+	rb_erase(&node->node, &head->head);
+	RB_CLEAR_NODE(&node->node);
+}
+EXPORT_SYMBOL_GPL(timerqueue_del);
+
+/**
+ * timerqueue_iterate_next - Returns the timer after the provided timer
+ *
+ * @node: Pointer to a timer.
+ *
+ * Provides the timer that is after the given node. This is used, when
+ * necessary, to iterate through the list of timers in a timer list
+ * without modifying the list.
+ */
+struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node)
+{
+	struct rb_node *next;
+
+	if (!node)
+		return NULL;
+	next = rb_next(&node->node);
+	if (!next)
+		return NULL;
+	return container_of(next, struct timerqueue_node, node);
+}
+EXPORT_SYMBOL_GPL(timerqueue_iterate_next);
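
The one non-obvious design point in the new timerqueue library is the cached head->next pointer, which keeps the hot "what fires first?" query O(1) while the rbtree handles ordering; timerqueue_add() and timerqueue_del() each pay a small fixup cost to keep that cache coherent. A userspace analogue of the same caching discipline, with a sorted singly-linked list standing in for the rbtree:

    #include <stdio.h>
    #include <stddef.h>

    struct tq_node {
        long long expires;
        struct tq_node *link;
    };

    struct tq_head {
        struct tq_node *list;   /* all nodes, sorted by expires */
        struct tq_node *next;   /* earliest expiring node, cached */
    };

    static void tq_add(struct tq_head *head, struct tq_node *node)
    {
        struct tq_node **p = &head->list;

        while (*p && (*p)->expires <= node->expires)
            p = &(*p)->link;
        node->link = *p;
        *p = node;

        /* keep the cache coherent, as timerqueue_add() does */
        if (!head->next || node->expires < head->next->expires)
            head->next = node;
    }

    static void tq_del(struct tq_head *head, struct tq_node *node)
    {
        struct tq_node **p = &head->list;

        while (*p && *p != node)
            p = &(*p)->link;
        if (*p)
            *p = node->link;
        if (head->next == node)     /* advance the cache past the removal */
            head->next = head->list;
    }

    int main(void)
    {
        struct tq_head head = { NULL, NULL };
        struct tq_node a = { 30, NULL }, b = { 10, NULL }, c = { 20, NULL };

        tq_add(&head, &a);
        tq_add(&head, &b);
        tq_add(&head, &c);
        printf("next expires at %lld\n", head.next->expires);   /* 10 */
        tq_del(&head, &b);
        printf("next expires at %lld\n", head.next->expires);   /* 20 */
        return 0;
    }
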
diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index 39580a5dc5df..9f85012acf0d 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -155,6 +155,8 @@ use strict;
 #	'@parameter' - name of a parameter
 #	'%CONST' - name of a constant.

+## init lots of data
+
 my $errors = 0;
 my $warnings = 0;
 my $anon_struct_union = 0;
@@ -218,21 +220,14 @@ my %highlights_list = ( $type_constant, "\$1",
 			$type_param, "\$1" );
 my $blankline_list = "";

-sub usage {
-    print "Usage: $0 [ -v ] [ -docbook | -html | -text | -man | -list ]\n";
-    print "         [ -no-doc-sections ]\n";
-    print "         [ -function funcname [ -function funcname ...] ]\n";
-    print "         [ -nofunction funcname [ -nofunction funcname ...] ]\n";
-    print "         c source file(s) > outputfile\n";
-    print "         -v : verbose output, more warnings & other info listed\n";
-    exit 1;
-}
-
 # read arguments
 if ($#ARGV == -1) {
     usage();
 }

+my $kernelversion;
+my $dohighlight = "";
+
 my $verbose = 0;
 my $output_mode = "man";
 my $no_doc_sections = 0;
@@ -245,7 +240,7 @@ my $man_date = ('January', 'February', 'March', 'April', 'May', 'June',
 	      'November', 'December')[(localtime)[4]] .
 	  " " . ((localtime)[5]+1900);

-# Essentially these are globals
+# Essentially these are globals.
 # They probably want to be tidied up, made more localised or something.
 # CAVEAT EMPTOR! Some of the others I localised may not want to be, which
 # could cause "use of undefined value" or other bugs.
@@ -353,6 +348,18 @@ while ($ARGV[0] =~ m/^-(.*)/) {
     }
 }

+# continue execution near EOF;
+
+sub usage {
+    print "Usage: $0 [ -v ] [ -docbook | -html | -text | -man | -list ]\n";
+    print "         [ -no-doc-sections ]\n";
+    print "         [ -function funcname [ -function funcname ...] ]\n";
+    print "         [ -nofunction funcname [ -nofunction funcname ...] ]\n";
+    print "         c source file(s) > outputfile\n";
+    print "         -v : verbose output, more warnings & other info listed\n";
+    exit 1;
+}
+
 # get kernel version from env
 sub get_kernel_version() {
     my $version = 'unknown kernel version';
@@ -362,15 +369,6 @@ sub get_kernel_version() {
     }
     return $version;
 }
-my $kernelversion = get_kernel_version();
-
-# generate a sequence of code that will splice in highlighting information
-# using the s// operator.
-my $dohighlight = "";
-foreach my $pattern (keys %highlights) {
-#   print STDERR "scanning pattern:$pattern, highlight:($highlights{$pattern})\n";
-    $dohighlight .=  "\$contents =~ s:$pattern:$highlights{$pattern}:gs;\n";
-}
-

 ##
 # dumps section contents to arrays/hashes intended for that purpose.
@@ -1851,34 +1849,6 @@ sub dump_function($$) {
 	});
 }

-sub process_file($);
-
-# Read the file that maps relative names to absolute names for
-# separate source and object directories and for shadow trees.
-if (open(SOURCE_MAP, "<.tmp_filelist.txt")) {
-	my ($relname, $absname);
-	while(<SOURCE_MAP>) {
-		chop();
-		($relname, $absname) = (split())[0..1];
-		$relname =~ s:^/+::;
-		$source_map{$relname} = $absname;
-	}
-	close(SOURCE_MAP);
-}
-
-foreach (@ARGV) {
-    chomp;
-    process_file($_);
-}
-if ($verbose && $errors) {
-  print STDERR "$errors errors\n";
-}
-if ($verbose && $warnings) {
-  print STDERR "$warnings warnings\n";
-}
-
-exit($errors);
-
 sub reset_state {
     $function = "";
     %constants = ();
@@ -2285,3 +2255,39 @@ sub process_file($) {
 	    }
 	}
 }
+
+
+$kernelversion = get_kernel_version();
+
+# generate a sequence of code that will splice in highlighting information
+# using the s// operator.
+foreach my $pattern (keys %highlights) {
+#   print STDERR "scanning pattern:$pattern, highlight:($highlights{$pattern})\n";
+    $dohighlight .=  "\$contents =~ s:$pattern:$highlights{$pattern}:gs;\n";
+}
+
+# Read the file that maps relative names to absolute names for
+# separate source and object directories and for shadow trees.
+if (open(SOURCE_MAP, "<.tmp_filelist.txt")) {
+	my ($relname, $absname);
+	while(<SOURCE_MAP>) {
+		chop();
+		($relname, $absname) = (split())[0..1];
+		$relname =~ s:^/+::;
+		$source_map{$relname} = $absname;
+	}
+	close(SOURCE_MAP);
+}
+
+foreach (@ARGV) {
+    chomp;
+    process_file($_);
+}
+if ($verbose && $errors) {
+  print STDERR "$errors errors\n";
+}
+if ($verbose && $warnings) {
+  print STDERR "$warnings warnings\n";
+}
+
+exit($errors);