aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-devices-cache_disable18
-rw-r--r--Documentation/DMA-API.txt12
-rw-r--r--Documentation/futex-requeue-pi.txt131
-rw-r--r--Documentation/kernel-parameters.txt15
-rw-r--r--Documentation/memory-barriers.txt129
-rw-r--r--Documentation/scheduler/sched-rt-group.txt20
-rw-r--r--Documentation/trace/ftrace.txt15
-rw-r--r--Documentation/x86/boot.txt122
-rw-r--r--Documentation/x86/x86_64/boot-options.txt5
-rw-r--r--Documentation/x86/x86_64/mm.txt9
-rw-r--r--Makefile2
-rw-r--r--arch/alpha/kernel/sys_dp264.c8
-rw-r--r--arch/alpha/kernel/sys_titan.c4
-rw-r--r--arch/arm/common/gic.c4
-rw-r--r--arch/cris/arch-v32/kernel/irq.c4
-rw-r--r--arch/ia64/hp/sim/hpsim_irq.c3
-rw-r--r--arch/ia64/kernel/acpi.c5
-rw-r--r--arch/ia64/kernel/iosapic.c10
-rw-r--r--arch/ia64/kernel/msi_ia64.c16
-rw-r--r--arch/ia64/sn/kernel/irq.c4
-rw-r--r--arch/ia64/sn/kernel/msi_sn.c8
-rw-r--r--arch/mips/cavium-octeon/octeon-irq.c8
-rw-r--r--arch/mips/include/asm/irq.h2
-rw-r--r--arch/mips/kernel/irq-gic.c5
-rw-r--r--arch/mips/mti-malta/malta-smtc.c4
-rw-r--r--arch/mips/sibyte/bcm1480/irq.c8
-rw-r--r--arch/mips/sibyte/sb1250/irq.c8
-rw-r--r--arch/parisc/kernel/irq.c6
-rw-r--r--arch/powerpc/platforms/pseries/xics.c12
-rw-r--r--arch/powerpc/sysdev/mpic.c4
-rw-r--r--arch/powerpc/sysdev/mpic.h2
-rw-r--r--arch/sparc/include/asm/thread_info_64.h4
-rw-r--r--arch/sparc/kernel/irq_64.c12
-rw-r--r--arch/x86/Kbuild16
-rw-r--r--arch/x86/Kconfig53
-rw-r--r--arch/x86/Kconfig.debug11
-rw-r--r--arch/x86/Makefile19
-rw-r--r--arch/x86/boot/.gitignore2
-rw-r--r--arch/x86/boot/Makefile29
-rw-r--r--arch/x86/boot/a20.c9
-rw-r--r--arch/x86/boot/apm.c76
-rw-r--r--arch/x86/boot/bioscall.S82
-rw-r--r--arch/x86/boot/boot.h48
-rw-r--r--arch/x86/boot/compressed/.gitignore3
-rw-r--r--arch/x86/boot/compressed/Makefile54
-rw-r--r--arch/x86/boot/compressed/head_32.S194
-rw-r--r--arch/x86/boot/compressed/head_64.S169
-rw-r--r--arch/x86/boot/compressed/misc.c12
-rw-r--r--arch/x86/boot/compressed/mkpiggy.c97
-rw-r--r--arch/x86/boot/compressed/vmlinux.lds.S (renamed from arch/x86/boot/compressed/vmlinux_64.lds)29
-rw-r--r--arch/x86/boot/compressed/vmlinux.scr10
-rw-r--r--arch/x86/boot/compressed/vmlinux_32.lds43
-rw-r--r--arch/x86/boot/edd.c71
-rw-r--r--arch/x86/boot/header.S30
-rw-r--r--arch/x86/boot/main.c39
-rw-r--r--arch/x86/boot/mca.c27
-rw-r--r--arch/x86/boot/memory.c79
-rw-r--r--arch/x86/boot/regs.c29
-rw-r--r--arch/x86/boot/setup.ld6
-rw-r--r--arch/x86/boot/tty.c52
-rw-r--r--arch/x86/boot/video-bios.c27
-rw-r--r--arch/x86/boot/video-vesa.c137
-rw-r--r--arch/x86/boot/video-vga.c95
-rw-r--r--arch/x86/boot/video.c42
-rw-r--r--arch/x86/boot/video.h14
-rw-r--r--arch/x86/configs/i386_defconfig148
-rw-r--r--arch/x86/configs/x86_64_defconfig151
-rw-r--r--arch/x86/include/asm/alternative.h59
-rw-r--r--arch/x86/include/asm/amd_iommu.h2
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h55
-rw-r--r--arch/x86/include/asm/apic.h33
-rw-r--r--arch/x86/include/asm/apicdef.h8
-rw-r--r--arch/x86/include/asm/boot.h15
-rw-r--r--arch/x86/include/asm/bootparam.h3
-rw-r--r--arch/x86/include/asm/cpu_debug.h101
-rw-r--r--arch/x86/include/asm/cpufeature.h7
-rw-r--r--arch/x86/include/asm/hw_irq.h25
-rw-r--r--arch/x86/include/asm/i387.h43
-rw-r--r--arch/x86/include/asm/i8259.h4
-rw-r--r--arch/x86/include/asm/io_apic.h9
-rw-r--r--arch/x86/include/asm/iomap.h5
-rw-r--r--arch/x86/include/asm/irq_remapping.h2
-rw-r--r--arch/x86/include/asm/irq_vectors.h1
-rw-r--r--arch/x86/include/asm/k8.h13
-rw-r--r--arch/x86/include/asm/microcode.h25
-rw-r--r--arch/x86/include/asm/mpspec.h15
-rw-r--r--arch/x86/include/asm/msr-index.h1
-rw-r--r--arch/x86/include/asm/nmi.h2
-rw-r--r--arch/x86/include/asm/numa_64.h10
-rw-r--r--arch/x86/include/asm/page_32_types.h4
-rw-r--r--arch/x86/include/asm/page_64_types.h22
-rw-r--r--arch/x86/include/asm/page_types.h6
-rw-r--r--arch/x86/include/asm/paravirt.h22
-rw-r--r--arch/x86/include/asm/pgtable.h4
-rw-r--r--arch/x86/include/asm/pgtable_64.h6
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h8
-rw-r--r--arch/x86/include/asm/pgtable_types.h1
-rw-r--r--arch/x86/include/asm/processor.h12
-rw-r--r--arch/x86/include/asm/required-features.h8
-rw-r--r--arch/x86/include/asm/setup.h1
-rw-r--r--arch/x86/include/asm/smp.h2
-rw-r--r--arch/x86/include/asm/sparsemem.h2
-rw-r--r--arch/x86/include/asm/syscalls.h45
-rw-r--r--arch/x86/include/asm/thread_info.h4
-rw-r--r--arch/x86/include/asm/topology.h3
-rw-r--r--arch/x86/include/asm/traps.h5
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h2
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h6
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/acpi/boot.c156
-rw-r--r--arch/x86/kernel/acpi/realmode/Makefile2
-rw-r--r--arch/x86/kernel/acpi/realmode/bioscall.S1
-rw-r--r--arch/x86/kernel/acpi/realmode/regs.c1
-rw-r--r--arch/x86/kernel/amd_iommu.c500
-rw-r--r--arch/x86/kernel/amd_iommu_init.c273
-rw-r--r--arch/x86/kernel/apic/apic.c311
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c4
-rw-r--r--arch/x86/kernel/apic/es7000_32.c2
-rw-r--r--arch/x86/kernel/apic/io_apic.c902
-rw-r--r--arch/x86/kernel/apic/nmi.c2
-rw-r--r--arch/x86/kernel/apic/probe_32.c1
-rw-r--r--arch/x86/kernel/apic/probe_64.c2
-rw-r--r--arch/x86/kernel/apic/summit_32.c7
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c20
-rw-r--r--arch/x86/kernel/asm-offsets_32.c1
-rw-r--r--arch/x86/kernel/asm-offsets_64.c1
-rw-r--r--arch/x86/kernel/cpu/amd.c12
-rw-r--r--arch/x86/kernel/cpu/common.c23
-rw-r--r--arch/x86/kernel/cpu/cpu_debug.c431
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig9
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c8
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c2
-rw-r--r--arch/x86/kernel/cpu/intel.c6
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c153
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel_64.c1
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c24
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h15
-rw-r--r--arch/x86/kernel/cpu/mtrr/state.c6
-rw-r--r--arch/x86/kernel/dumpstack.h1
-rw-r--r--arch/x86/kernel/e820.c46
-rw-r--r--arch/x86/kernel/early-quirks.c2
-rw-r--r--arch/x86/kernel/entry_64.S5
-rw-r--r--arch/x86/kernel/head_32.S7
-rw-r--r--arch/x86/kernel/irq.c20
-rw-r--r--arch/x86/kernel/irqinit.c (renamed from arch/x86/kernel/irqinit_32.c)149
-rw-r--r--arch/x86/kernel/irqinit_64.c177
-rw-r--r--arch/x86/kernel/kgdb.c2
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/microcode_amd.c70
-rw-r--r--arch/x86/kernel/microcode_core.c329
-rw-r--r--arch/x86/kernel/microcode_intel.c90
-rw-r--r--arch/x86/kernel/mpparse.c34
-rw-r--r--arch/x86/kernel/paravirt.c56
-rw-r--r--arch/x86/kernel/pci-calgary_64.c54
-rw-r--r--arch/x86/kernel/pci-gart_64.c55
-rw-r--r--arch/x86/kernel/pci-swiotlb.c2
-rw-r--r--arch/x86/kernel/process.c15
-rw-r--r--arch/x86/kernel/process_32.c17
-rw-r--r--arch/x86/kernel/process_64.c17
-rw-r--r--arch/x86/kernel/quirks.c37
-rw-r--r--arch/x86/kernel/reboot.c9
-rw-r--r--arch/x86/kernel/setup.c40
-rw-r--r--arch/x86/kernel/setup_percpu.c8
-rw-r--r--arch/x86/kernel/smp.c20
-rw-r--r--arch/x86/kernel/smpboot.c22
-rw-r--r--arch/x86/kernel/tlb_uv.c17
-rw-r--r--arch/x86/kernel/traps.c10
-rw-r--r--arch/x86/kernel/tsc.c19
-rw-r--r--arch/x86/kernel/tsc_sync.c14
-rw-r--r--arch/x86/kernel/vm86_32.c13
-rw-r--r--arch/x86/kernel/vmi_32.c20
-rw-r--r--arch/x86/kernel/vmlinux.lds.S430
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S229
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S298
-rw-r--r--arch/x86/kernel/vsyscall_64.c8
-rw-r--r--arch/x86/lguest/boot.c18
-rw-r--r--arch/x86/mm/dump_pagetables.c7
-rw-r--r--arch/x86/mm/fault.c57
-rw-r--r--arch/x86/mm/highmem_32.c2
-rw-r--r--arch/x86/mm/init.c78
-rw-r--r--arch/x86/mm/init_32.c61
-rw-r--r--arch/x86/mm/init_64.c47
-rw-r--r--arch/x86/mm/iomap_32.c1
-rw-r--r--arch/x86/mm/memtest.c14
-rw-r--r--arch/x86/mm/numa_64.c33
-rw-r--r--arch/x86/mm/pageattr.c14
-rw-r--r--arch/x86/mm/srat_64.c98
-rw-r--r--arch/x86/pci/irq.c84
-rw-r--r--arch/x86/vdso/vma.c1
-rw-r--r--arch/x86/xen/enlighten.c65
-rw-r--r--arch/x86/xen/mmu.c23
-rw-r--r--arch/x86/xen/setup.c6
-rw-r--r--arch/x86/xen/xen-ops.h1
-rw-r--r--block/bsg.c3
-rw-r--r--drivers/acpi/pci_irq.c5
-rw-r--r--drivers/acpi/processor_core.c2
-rw-r--r--drivers/char/hpet.c4
-rw-r--r--drivers/char/mem.c5
-rw-r--r--drivers/char/mxser.c2
-rw-r--r--drivers/cpufreq/cpufreq.c2
-rw-r--r--drivers/md/raid5.c28
-rw-r--r--drivers/net/r8169.c11
-rw-r--r--drivers/parisc/iosapic.c6
-rw-r--r--drivers/pci/hotplug/ibmphp_core.c54
-rw-r--r--drivers/pci/htirq.c4
-rw-r--r--drivers/pci/intel-iommu.c9
-rw-r--r--drivers/pci/intr_remapping.c54
-rw-r--r--drivers/pnp/pnpacpi/rsparser.c2
-rw-r--r--drivers/xen/Kconfig20
-rw-r--r--drivers/xen/Makefile4
-rw-r--r--drivers/xen/events.c20
-rw-r--r--drivers/xen/evtchn.c507
-rw-r--r--drivers/xen/manage.c9
-rw-r--r--drivers/xen/sys-hypervisor.c445
-rw-r--r--drivers/xen/xenbus/xenbus_probe.c61
-rw-r--r--drivers/xen/xenbus/xenbus_xs.c2
-rw-r--r--drivers/xen/xenfs/super.c19
-rw-r--r--fs/autofs4/waitq.c22
-rw-r--r--fs/jbd/commit.c6
-rw-r--r--fs/proc/loadavg.c18
-rw-r--r--include/Kbuild1
-rw-r--r--include/asm-generic/pgtable.h21
-rw-r--r--include/linux/acpi.h2
-rw-r--r--include/linux/cpumask.h15
-rw-r--r--include/linux/dma-debug.h7
-rw-r--r--include/linux/dmar.h3
-rw-r--r--include/linux/futex.h6
-rw-r--r--include/linux/interrupt.h2
-rw-r--r--include/linux/irq.h58
-rw-r--r--include/linux/mm.h2
-rw-r--r--include/linux/mutex.h1
-rw-r--r--include/linux/sched.h28
-rw-r--r--include/linux/spinlock_up.h1
-rw-r--r--include/linux/swiotlb.h3
-rw-r--r--include/linux/thread_info.h3
-rw-r--r--include/linux/wait.h2
-rw-r--r--include/xen/Kbuild1
-rw-r--r--include/xen/events.h3
-rw-r--r--include/xen/evtchn.h88
-rw-r--r--include/xen/interface/version.h3
-rw-r--r--include/xen/xenbus.h3
-rw-r--r--ipc/shm.c7
-rw-r--r--kernel/futex.c1188
-rw-r--r--kernel/irq/Makefile2
-rw-r--r--kernel/irq/chip.c12
-rw-r--r--kernel/irq/handle.c58
-rw-r--r--kernel/irq/internals.h5
-rw-r--r--kernel/irq/manage.c17
-rw-r--r--kernel/irq/migration.c14
-rw-r--r--kernel/irq/numa_migrate.c38
-rw-r--r--kernel/mutex.c29
-rw-r--r--kernel/rtmutex.c248
-rw-r--r--kernel/rtmutex_common.h8
-rw-r--r--kernel/sched.c304
-rw-r--r--kernel/sched_cpupri.c2
-rw-r--r--kernel/sched_fair.c13
-rw-r--r--kernel/sched_idletask.c3
-rw-r--r--kernel/sched_rt.c2
-rw-r--r--kernel/smp.c2
-rw-r--r--kernel/softirq.c2
-rw-r--r--kernel/sysctl.c8
-rw-r--r--kernel/time/timekeeping.c2
-rw-r--r--kernel/timer.c86
-rw-r--r--kernel/wait.c2
-rw-r--r--lib/cpumask.c12
-rw-r--r--lib/dma-debug.c432
-rw-r--r--lib/swiotlb.c119
-rw-r--r--lib/vsprintf.c56
-rw-r--r--mm/page_alloc.c69
-rw-r--r--mm/percpu.c141
-rw-r--r--net/sched/cls_cgroup.c3
-rw-r--r--scripts/Makefile.lib28
-rw-r--r--scripts/bin_size10
-rw-r--r--virt/kvm/kvm_main.c2
280 files changed, 8190 insertions, 5280 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-cache_disable b/Documentation/ABI/testing/sysfs-devices-cache_disable
new file mode 100644
index 000000000000..175bb4f70512
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-cache_disable
@@ -0,0 +1,18 @@
1What: /sys/devices/system/cpu/cpu*/cache/index*/cache_disable_X
2Date: August 2008
3KernelVersion: 2.6.27
4Contact: mark.langsdorf@amd.com
5Description: These files exist in every cpu's cache index directories.
6 There are currently 2 cache_disable_# files in each
7 directory. Reading from these files on a supported
8 processor will return that cache disable index value
9 for that processor and node. Writing to one of these
10 files will cause the specified cache index to be disabled.
11
12 Currently, only AMD Family 10h Processors support cache index
13 disable, and only for their L3 caches. See the BIOS and
14 Kernel Developer's Guide at
15 http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/31116-Public-GH-BKDG_3.20_2-4-09.pdf
16 for formatting information and other details on the
17 cache index disable.
18Users: joachim.deguara@amd.com
diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index d9aa43d78bcc..25fb8bcf32a2 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -704,12 +704,24 @@ this directory the following files can currently be found:
704 The current number of free dma_debug_entries 704 The current number of free dma_debug_entries
705 in the allocator. 705 in the allocator.
706 706
707 dma-api/driver-filter
708 You can write a name of a driver into this file
709 to limit the debug output to requests from that
710 particular driver. Write an empty string to
711 that file to disable the filter and see
712 all errors again.
713
707If you have this code compiled into your kernel it will be enabled by default. 714If you have this code compiled into your kernel it will be enabled by default.
708If you want to boot without the bookkeeping anyway you can provide 715If you want to boot without the bookkeeping anyway you can provide
709'dma_debug=off' as a boot parameter. This will disable DMA-API debugging. 716'dma_debug=off' as a boot parameter. This will disable DMA-API debugging.
710Notice that you can not enable it again at runtime. You have to reboot to do 717Notice that you can not enable it again at runtime. You have to reboot to do
711so. 718so.
712 719
720If you want to see debug messages only for a special device driver you can
721specify the dma_debug_driver=<drivername> parameter. This will enable the
722driver filter at boot time. The debug code will only print errors for that
723driver afterwards. This filter can be disabled or changed later using debugfs.
724
713When the code disables itself at runtime this is most likely because it ran 725When the code disables itself at runtime this is most likely because it ran
714out of dma_debug_entries. These entries are preallocated at boot. The number 726out of dma_debug_entries. These entries are preallocated at boot. The number
715of preallocated entries is defined per architecture. If it is too low for you 727of preallocated entries is defined per architecture. If it is too low for you
diff --git a/Documentation/futex-requeue-pi.txt b/Documentation/futex-requeue-pi.txt
new file mode 100644
index 000000000000..9dc1ff4fd536
--- /dev/null
+++ b/Documentation/futex-requeue-pi.txt
@@ -0,0 +1,131 @@
1Futex Requeue PI
2----------------
3
4Requeueing of tasks from a non-PI futex to a PI futex requires
5special handling in order to ensure the underlying rt_mutex is never
6left without an owner if it has waiters; doing so would break the PI
7boosting logic [see rt-mutex-design.txt]. For the purposes of
8brevity, this action will be referred to as "requeue_pi" throughout
9this document. Priority inheritance is abbreviated throughout as
10"PI".
11
12Motivation
13----------
14
15Without requeue_pi, the glibc implementation of
16pthread_cond_broadcast() must resort to waking all the tasks waiting
17on a pthread_condvar and letting them try to sort out which task
18gets to run first in classic thundering-herd formation. An ideal
19implementation would wake the highest-priority waiter, and leave the
20rest to the natural wakeup inherent in unlocking the mutex
21associated with the condvar.
22
23Consider the simplified glibc calls:
24
25/* caller must lock mutex */
26pthread_cond_wait(cond, mutex)
27{
28 lock(cond->__data.__lock);
29 unlock(mutex);
30 do {
31 unlock(cond->__data.__lock);
32 futex_wait(cond->__data.__futex);
33 lock(cond->__data.__lock);
34 } while(...)
35 unlock(cond->__data.__lock);
36 lock(mutex);
37}
38
39pthread_cond_broadcast(cond)
40{
41 lock(cond->__data.__lock);
42 unlock(cond->__data.__lock);
43 futex_requeue(cond->__data.__futex, cond->mutex);
44}
45
46Once pthread_cond_broadcast() requeues the tasks, the cond->mutex
47has waiters. Note that pthread_cond_wait() attempts to lock the
48mutex only after it has returned to user space. This will leave the
49underlying rt_mutex with waiters, and no owner, breaking the
50previously mentioned PI-boosting algorithms.
51
52In order to support PI-aware pthread_condvar's, the kernel needs to
53be able to requeue tasks to PI futexes. This support implies that
54upon a successful futex_wait system call, the caller would return to
55user space already holding the PI futex. The glibc implementation
56would be modified as follows:
57
58
59/* caller must lock mutex */
60pthread_cond_wait_pi(cond, mutex)
61{
62 lock(cond->__data.__lock);
63 unlock(mutex);
64 do {
65 unlock(cond->__data.__lock);
66 futex_wait_requeue_pi(cond->__data.__futex);
67 lock(cond->__data.__lock);
68 } while(...)
69 unlock(cond->__data.__lock);
70 /* the kernel acquired the mutex for us */
71}
72
73pthread_cond_broadcast_pi(cond)
74{
75 lock(cond->__data.__lock);
76 unlock(cond->__data.__lock);
77 futex_requeue_pi(cond->__data.__futex, cond->mutex);
78}
79
80The actual glibc implementation will likely test for PI and make the
81necessary changes inside the existing calls rather than creating new
82calls for the PI cases. Similar changes are needed for
83pthread_cond_timedwait() and pthread_cond_signal().
84
85Implementation
86--------------
87
88In order to ensure the rt_mutex has an owner if it has waiters, it
89is necessary for both the requeue code, as well as the waiting code,
90to be able to acquire the rt_mutex before returning to user space.
91The requeue code cannot simply wake the waiter and leave it to
92acquire the rt_mutex as it would open a race window between the
93requeue call returning to user space and the waiter waking and
94starting to run. This is especially true in the uncontended case.
95
96The solution involves two new rt_mutex helper routines,
97rt_mutex_start_proxy_lock() and rt_mutex_finish_proxy_lock(), which
98allow the requeue code to acquire an uncontended rt_mutex on behalf
99of the waiter and to enqueue the waiter on a contended rt_mutex.
100Two new system calls provide the kernel<->user interface to
101requeue_pi: FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI.
102
103FUTEX_WAIT_REQUEUE_PI is called by the waiter (pthread_cond_wait()
104and pthread_cond_timedwait()) to block on the initial futex and wait
105to be requeued to a PI-aware futex. The implementation is the
106result of a high-speed collision between futex_wait() and
107futex_lock_pi(), with some extra logic to check for the additional
108wake-up scenarios.
109
110FUTEX_CMP_REQUEUE_PI is called by the waker
111(pthread_cond_broadcast() and pthread_cond_signal()) to requeue and
112possibly wake the waiting tasks. Internally, this system call is
113still handled by futex_requeue (by passing requeue_pi=1). Before
114requeueing, futex_requeue() attempts to acquire the requeue target
115PI futex on behalf of the top waiter. If it can, this waiter is
116woken. futex_requeue() then proceeds to requeue the remaining
117nr_wake+nr_requeue tasks to the PI futex, calling
118rt_mutex_start_proxy_lock() prior to each requeue to prepare the
119task as a waiter on the underlying rt_mutex. It is possible that
120the lock can be acquired at this stage as well; if so, the next
121waiter is woken to finish the acquisition of the lock.
122
123FUTEX_CMP_REQUEUE_PI accepts nr_wake and nr_requeue as arguments, but
124their sum is all that really matters. futex_requeue() will wake or
125requeue up to nr_wake + nr_requeue tasks. It will wake only as many
126tasks as it can acquire the lock for, which in the majority of cases
127should be 0 as good programming practice dictates that the caller of
128either pthread_cond_broadcast() or pthread_cond_signal() acquire the
129mutex prior to making the call. FUTEX_CMP_REQUEUE_PI requires that
130nr_wake=1. nr_requeue should be INT_MAX for broadcast and 0 for
131signal.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 8041e51f8860..a5253f6d01af 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -329,11 +329,6 @@ and is between 256 and 4096 characters. It is defined in the file
329 flushed before they will be reused, which 329 flushed before they will be reused, which
330 is a lot of faster 330 is a lot of faster
331 331
332 amd_iommu_size= [HW,X86-64]
333 Define the size of the aperture for the AMD IOMMU
334 driver. Possible values are:
335 '32M', '64M' (default), '128M', '256M', '512M', '1G'
336
337 amijoy.map= [HW,JOY] Amiga joystick support 332 amijoy.map= [HW,JOY] Amiga joystick support
338 Map of devices attached to JOY0DAT and JOY1DAT 333 Map of devices attached to JOY0DAT and JOY1DAT
339 Format: <a>,<b> 334 Format: <a>,<b>
@@ -646,6 +641,13 @@ and is between 256 and 4096 characters. It is defined in the file
646 DMA-API debugging code disables itself because the 641 DMA-API debugging code disables itself because the
647 architectural default is too low. 642 architectural default is too low.
648 643
644 dma_debug_driver=<driver_name>
645 With this option the DMA-API debugging driver
646 filter feature can be enabled at boot time. Just
647 pass the driver to filter for as the parameter.
648 The filter can be disabled or changed to another
649 driver later using debugfs.
650
649 dscc4.setup= [NET] 651 dscc4.setup= [NET]
650 652
651 dtc3181e= [HW,SCSI] 653 dtc3181e= [HW,SCSI]
@@ -1581,6 +1583,9 @@ and is between 256 and 4096 characters. It is defined in the file
1581 noinitrd [RAM] Tells the kernel not to load any configured 1583 noinitrd [RAM] Tells the kernel not to load any configured
1582 initial RAM disk. 1584 initial RAM disk.
1583 1585
1586 nointremap [X86-64, Intel-IOMMU] Do not enable interrupt
1587 remapping.
1588
1584 nointroute [IA-64] 1589 nointroute [IA-64]
1585 1590
1586 nojitter [IA64] Disables jitter checking for ITC timers. 1591 nojitter [IA64] Disables jitter checking for ITC timers.
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index f5b7127f54ac..7f5809eddee6 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -31,6 +31,7 @@ Contents:
31 31
32 - Locking functions. 32 - Locking functions.
33 - Interrupt disabling functions. 33 - Interrupt disabling functions.
34 - Sleep and wake-up functions.
34 - Miscellaneous functions. 35 - Miscellaneous functions.
35 36
36 (*) Inter-CPU locking barrier effects. 37 (*) Inter-CPU locking barrier effects.
@@ -1217,6 +1218,132 @@ barriers are required in such a situation, they must be provided from some
1217other means. 1218other means.
1218 1219
1219 1220
1221SLEEP AND WAKE-UP FUNCTIONS
1222---------------------------
1223
1224Sleeping and waking on an event flagged in global data can be viewed as an
1225interaction between two pieces of data: the task state of the task waiting for
1226the event and the global data used to indicate the event. To make sure that
1227these appear to happen in the right order, the primitives to begin the process
1228of going to sleep, and the primitives to initiate a wake up imply certain
1229barriers.
1230
1231Firstly, the sleeper normally follows something like this sequence of events:
1232
1233 for (;;) {
1234 set_current_state(TASK_UNINTERRUPTIBLE);
1235 if (event_indicated)
1236 break;
1237 schedule();
1238 }
1239
1240A general memory barrier is interpolated automatically by set_current_state()
1241after it has altered the task state:
1242
1243 CPU 1
1244 ===============================
1245 set_current_state();
1246 set_mb();
1247 STORE current->state
1248 <general barrier>
1249 LOAD event_indicated
1250
1251set_current_state() may be wrapped by:
1252
1253 prepare_to_wait();
1254 prepare_to_wait_exclusive();
1255
1256which therefore also imply a general memory barrier after setting the state.
1257The whole sequence above is available in various canned forms, all of which
1258interpolate the memory barrier in the right place:
1259
1260 wait_event();
1261 wait_event_interruptible();
1262 wait_event_interruptible_exclusive();
1263 wait_event_interruptible_timeout();
1264 wait_event_killable();
1265 wait_event_timeout();
1266 wait_on_bit();
1267 wait_on_bit_lock();
1268
1269
1270Secondly, code that performs a wake up normally follows something like this:
1271
1272 event_indicated = 1;
1273 wake_up(&event_wait_queue);
1274
1275or:
1276
1277 event_indicated = 1;
1278 wake_up_process(event_daemon);
1279
1280A write memory barrier is implied by wake_up() and co. if and only if they wake
1281something up. The barrier occurs before the task state is cleared, and so sits
1282between the STORE to indicate the event and the STORE to set TASK_RUNNING:
1283
1284 CPU 1 CPU 2
1285 =============================== ===============================
1286 set_current_state(); STORE event_indicated
1287 set_mb(); wake_up();
1288 STORE current->state <write barrier>
1289 <general barrier> STORE current->state
1290 LOAD event_indicated
1291
1292The available waker functions include:
1293
1294 complete();
1295 wake_up();
1296 wake_up_all();
1297 wake_up_bit();
1298 wake_up_interruptible();
1299 wake_up_interruptible_all();
1300 wake_up_interruptible_nr();
1301 wake_up_interruptible_poll();
1302 wake_up_interruptible_sync();
1303 wake_up_interruptible_sync_poll();
1304 wake_up_locked();
1305 wake_up_locked_poll();
1306 wake_up_nr();
1307 wake_up_poll();
1308 wake_up_process();
1309
1310
1311[!] Note that the memory barriers implied by the sleeper and the waker do _not_
1312order multiple stores before the wake-up with respect to loads of those stored
1313values after the sleeper has called set_current_state(). For instance, if the
1314sleeper does:
1315
1316 set_current_state(TASK_INTERRUPTIBLE);
1317 if (event_indicated)
1318 break;
1319 __set_current_state(TASK_RUNNING);
1320 do_something(my_data);
1321
1322and the waker does:
1323
1324 my_data = value;
1325 event_indicated = 1;
1326 wake_up(&event_wait_queue);
1327
1328there's no guarantee that the change to event_indicated will be perceived by
1329the sleeper as coming after the change to my_data. In such a circumstance, the
1330code on both sides must interpolate its own memory barriers between the
1331separate data accesses. Thus the above sleeper ought to do:
1332
1333 set_current_state(TASK_INTERRUPTIBLE);
1334 if (event_indicated) {
1335 smp_rmb();
1336 do_something(my_data);
1337 }
1338
1339and the waker should do:
1340
1341 my_data = value;
1342 smp_wmb();
1343 event_indicated = 1;
1344 wake_up(&event_wait_queue);
1345
1346
1220MISCELLANEOUS FUNCTIONS 1347MISCELLANEOUS FUNCTIONS
1221----------------------- 1348-----------------------
1222 1349
@@ -1366,7 +1493,7 @@ WHERE ARE MEMORY BARRIERS NEEDED?
1366 1493
1367Under normal operation, memory operation reordering is generally not going to 1494Under normal operation, memory operation reordering is generally not going to
1368be a problem as a single-threaded linear piece of code will still appear to 1495be a problem as a single-threaded linear piece of code will still appear to
1369work correctly, even if it's in an SMP kernel. There are, however, three 1496work correctly, even if it's in an SMP kernel. There are, however, four
1370circumstances in which reordering definitely _could_ be a problem: 1497circumstances in which reordering definitely _could_ be a problem:
1371 1498
1372 (*) Interprocessor interaction. 1499 (*) Interprocessor interaction.
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index 5ba4d3fc625a..1df7f9cdab05 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -4,6 +4,7 @@
4CONTENTS 4CONTENTS
5======== 5========
6 6
70. WARNING
71. Overview 81. Overview
8 1.1 The problem 9 1.1 The problem
9 1.2 The solution 10 1.2 The solution
@@ -14,6 +15,23 @@ CONTENTS
143. Future plans 153. Future plans
15 16
16 17
180. WARNING
19==========
20
21 Fiddling with these settings can result in an unstable system; the knobs are
22 root only and assume root knows what he is doing.
23
24Most notable:
25
26 * very small values in sched_rt_period_us can result in an unstable
27 system when the period is smaller than either the available hrtimer
28 resolution, or the time it takes to handle the budget refresh itself.
29
30 * very small values in sched_rt_runtime_us can result in an unstable
31 system when the runtime is so small the system has difficulty making
32 forward progress (NOTE: the migration thread and kstopmachine both
33 are real-time processes).
34
171. Overview 351. Overview
18=========== 36===========
19 37
@@ -169,7 +187,7 @@ get their allocated time.
169 187
170Implementing SCHED_EDF might take a while to complete. Priority Inheritance is 188Implementing SCHED_EDF might take a while to complete. Priority Inheritance is
171the biggest challenge as the current linux PI infrastructure is geared towards 189the biggest challenge as the current linux PI infrastructure is geared towards
172the limited static priority levels 0-139. With deadline scheduling you need to 190the limited static priority levels 0-99. With deadline scheduling you need to
173do deadline inheritance (since priority is inversely proportional to the 191do deadline inheritance (since priority is inversely proportional to the
174deadline delta (deadline - now). 192deadline delta (deadline - now).
175 193
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index fd9a3e693813..e362f50c496f 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -518,9 +518,18 @@ priority with zero (0) being the highest priority and the nice
518values starting at 100 (nice -20). Below is a quick chart to map 518values starting at 100 (nice -20). Below is a quick chart to map
519the kernel priority to user land priorities. 519the kernel priority to user land priorities.
520 520
521 Kernel priority: 0 to 99 ==> user RT priority 99 to 0 521 Kernel Space User Space
522 Kernel priority: 100 to 139 ==> user nice -20 to 19 522 ===============================================================
523 Kernel priority: 140 ==> idle task priority 523 0(high) to 98(low) user RT priority 99(high) to 1(low)
524 with SCHED_RR or SCHED_FIFO
525 ---------------------------------------------------------------
526 99 sched_priority is not used in scheduling
527 decisions (it must be specified as 0)
528 ---------------------------------------------------------------
529 100(high) to 139(low) user nice -20(high) to 19(low)
530 ---------------------------------------------------------------
531 140 idle task priority
532 ---------------------------------------------------------------
524 533
525The task states are: 534The task states are:
526 535
diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt
index e0203662f9e9..8da3a795083f 100644
--- a/Documentation/x86/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -50,6 +50,10 @@ Protocol 2.08: (Kernel 2.6.26) Added crc32 checksum and ELF format
50Protocol 2.09: (Kernel 2.6.26) Added a field of 64-bit physical 50Protocol 2.09: (Kernel 2.6.26) Added a field of 64-bit physical
51 pointer to single linked list of struct setup_data. 51 pointer to single linked list of struct setup_data.
52 52
53Protocol 2.10: (Kernel 2.6.31) Added a protocol for relaxed alignment
54 beyond the kernel_alignment field, plus new init_size and
55 pref_address fields. Added extended boot loader IDs.
56
53**** MEMORY LAYOUT 57**** MEMORY LAYOUT
54 58
55The traditional memory map for the kernel loader, used for Image or 59The traditional memory map for the kernel loader, used for Image or
@@ -168,12 +172,13 @@ Offset Proto Name Meaning
168021C/4 2.00+ ramdisk_size initrd size (set by boot loader) 172021C/4 2.00+ ramdisk_size initrd size (set by boot loader)
1690220/4 2.00+ bootsect_kludge DO NOT USE - for bootsect.S use only 1730220/4 2.00+ bootsect_kludge DO NOT USE - for bootsect.S use only
1700224/2 2.01+ heap_end_ptr Free memory after setup end 1740224/2 2.01+ heap_end_ptr Free memory after setup end
1710226/2 N/A pad1 Unused 1750226/1 2.02+(3 ext_loader_ver Extended boot loader version
1760227/1 2.02+(3 ext_loader_type Extended boot loader ID
1720228/4 2.02+ cmd_line_ptr 32-bit pointer to the kernel command line 1770228/4 2.02+ cmd_line_ptr 32-bit pointer to the kernel command line
173022C/4 2.03+ ramdisk_max Highest legal initrd address 178022C/4 2.03+ ramdisk_max Highest legal initrd address
1740230/4 2.05+ kernel_alignment Physical addr alignment required for kernel 1790230/4 2.05+ kernel_alignment Physical addr alignment required for kernel
1750234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not 1800234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not
1760235/1 N/A pad2 Unused 1810235/1 2.10+ min_alignment Minimum alignment, as a power of two
1770236/2 N/A pad3 Unused 1820236/2 N/A pad3 Unused
1780238/4 2.06+ cmdline_size Maximum size of the kernel command line 1830238/4 2.06+ cmdline_size Maximum size of the kernel command line
179023C/4 2.07+ hardware_subarch Hardware subarchitecture 184023C/4 2.07+ hardware_subarch Hardware subarchitecture
@@ -182,6 +187,8 @@ Offset Proto Name Meaning
182024C/4 2.08+ payload_length Length of kernel payload 187024C/4 2.08+ payload_length Length of kernel payload
1830250/8 2.09+ setup_data 64-bit physical pointer to linked list 1880250/8 2.09+ setup_data 64-bit physical pointer to linked list
184 of struct setup_data 189 of struct setup_data
1900258/8 2.10+ pref_address Preferred loading address
1910260/4 2.10+ init_size Linear memory required during initialization
185 192
186(1) For backwards compatibility, if the setup_sects field contains 0, the 193(1) For backwards compatibility, if the setup_sects field contains 0, the
187 real value is 4. 194 real value is 4.
@@ -190,6 +197,8 @@ Offset Proto Name Meaning
190 field are unusable, which means the size of a bzImage kernel 197 field are unusable, which means the size of a bzImage kernel
191 cannot be determined. 198 cannot be determined.
192 199
200(3) Ignored, but safe to set, for boot protocols 2.02-2.09.
201
193If the "HdrS" (0x53726448) magic number is not found at offset 0x202, 202If the "HdrS" (0x53726448) magic number is not found at offset 0x202,
194the boot protocol version is "old". Loading an old kernel, the 203the boot protocol version is "old". Loading an old kernel, the
195following parameters should be assumed: 204following parameters should be assumed:
@@ -343,18 +352,32 @@ Protocol: 2.00+
343 0xTV here, where T is an identifier for the boot loader and V is 352 0xTV here, where T is an identifier for the boot loader and V is
344 a version number. Otherwise, enter 0xFF here. 353 a version number. Otherwise, enter 0xFF here.
345 354
355 For boot loader IDs above T = 0xD, write T = 0xE to this field and
356 write the extended ID minus 0x10 to the ext_loader_type field.
357 Similarly, the ext_loader_ver field can be used to provide more than
358 four bits for the bootloader version.
359
360 For example, for T = 0x15, V = 0x234, write:
361
362 type_of_loader <- 0xE4
363 ext_loader_type <- 0x05
364 ext_loader_ver <- 0x23
365
346 Assigned boot loader ids: 366 Assigned boot loader ids:
347 0 LILO (0x00 reserved for pre-2.00 bootloader) 367 0 LILO (0x00 reserved for pre-2.00 bootloader)
348 1 Loadlin 368 1 Loadlin
349 2 bootsect-loader (0x20, all other values reserved) 369 2 bootsect-loader (0x20, all other values reserved)
350 3 SYSLINUX 370 3 Syslinux
351 4 EtherBoot 371 4 Etherboot/gPXE
352 5 ELILO 372 5 ELILO
353 7 GRUB 373 7 GRUB
354 8 U-BOOT 374 8 U-Boot
355 9 Xen 375 9 Xen
356 A Gujin 376 A Gujin
357 B Qemu 377 B Qemu
378 C Arcturus Networks uCbootloader
379 E Extended (see ext_loader_type)
380 F Special (0xFF = undefined)
358 381
359 Please contact <hpa@zytor.com> if you need a bootloader ID 382 Please contact <hpa@zytor.com> if you need a bootloader ID
360 value assigned. 383 value assigned.
@@ -453,6 +476,35 @@ Protocol: 2.01+
453 Set this field to the offset (from the beginning of the real-mode 476 Set this field to the offset (from the beginning of the real-mode
454 code) of the end of the setup stack/heap, minus 0x0200. 477 code) of the end of the setup stack/heap, minus 0x0200.
455 478
479Field name: ext_loader_ver
480Type: write (optional)
481Offset/size: 0x226/1
482Protocol: 2.02+
483
484 This field is used as an extension of the version number in the
485 type_of_loader field. The total version number is considered to be
486 (type_of_loader & 0x0f) + (ext_loader_ver << 4).
487
488 The use of this field is boot loader specific. If not written, it
489 is zero.
490
491 Kernels prior to 2.6.31 did not recognize this field, but it is safe
492 to write for protocol version 2.02 or higher.
493
494Field name: ext_loader_type
495Type: write (obligatory if (type_of_loader & 0xf0) == 0xe0)
496Offset/size: 0x227/1
497Protocol: 2.02+
498
499 This field is used as an extension of the type number in
500 the type_of_loader field. If the type in type_of_loader is 0xE, then
501 the actual type is (ext_loader_type + 0x10).
502
503 This field is ignored if the type in type_of_loader is not 0xE.
504
505 Kernels prior to 2.6.31 did not recognize this field, but it is safe
506 to write for protocol version 2.02 or higher.
507
456Field name: cmd_line_ptr 508Field name: cmd_line_ptr
457Type: write (obligatory) 509Type: write (obligatory)
458Offset/size: 0x228/4 510Offset/size: 0x228/4
@@ -482,11 +534,19 @@ Protocol: 2.03+
482 0x37FFFFFF, you can start your ramdisk at 0x37FE0000.) 534 0x37FFFFFF, you can start your ramdisk at 0x37FE0000.)
483 535
484Field name: kernel_alignment 536Field name: kernel_alignment
485Type: read (reloc) 537Type: read/modify (reloc)
486Offset/size: 0x230/4 538Offset/size: 0x230/4
487Protocol: 2.05+ 539Protocol: 2.05+ (read), 2.10+ (modify)
540
541 Alignment unit required by the kernel (if relocatable_kernel is
542 true.) A relocatable kernel that is loaded at an alignment
543 incompatible with the value in this field will be realigned during
544 kernel initialization.
488 545
489 Alignment unit required by the kernel (if relocatable_kernel is true.) 546 Starting with protocol version 2.10, this reflects the kernel
547 alignment preferred for optimal performance; it is possible for the
548 loader to modify this field to permit a lesser alignment. See the
549 min_alignment and pref_address fields below.
490 550
491Field name: relocatable_kernel 551Field name: relocatable_kernel
492Type: read (reloc) 552Type: read (reloc)
@@ -498,6 +558,22 @@ Protocol: 2.05+
498 After loading, the boot loader must set the code32_start field to 558 After loading, the boot loader must set the code32_start field to
499 point to the loaded code, or to a boot loader hook. 559 point to the loaded code, or to a boot loader hook.
500 560
561Field name: min_alignment
562Type: read (reloc)
563Offset/size: 0x235/1
564Protocol: 2.10+
565
566 This field, if nonzero, indicates as a power of two the minimum
567 alignment required, as opposed to preferred, by the kernel to boot.
568 If a boot loader makes use of this field, it should update the
569 kernel_alignment field with the alignment unit desired; typically:
570
571 kernel_alignment = 1 << min_alignment
572
573 There may be a considerable performance cost with an excessively
574 misaligned kernel. Therefore, a loader should typically try each
575 power-of-two alignment from kernel_alignment down to this alignment.
576
501Field name: cmdline_size 577Field name: cmdline_size
502Type: read 578Type: read
503Offset/size: 0x238/4 579Offset/size: 0x238/4
@@ -582,6 +658,36 @@ Protocol: 2.09+
582 sure to consider the case where the linked list already contains 658 sure to consider the case where the linked list already contains
583 entries. 659 entries.
584 660
661Field name: pref_address
662Type: read (reloc)
663Offset/size: 0x258/8
664Protocol: 2.10+
665
666 This field, if nonzero, represents a preferred load address for the
667 kernel. A relocating bootloader should attempt to load at this
668 address if possible.
669
670 A non-relocatable kernel will unconditionally move itself to run
671 at this address.
672
673Field name: init_size
674Type: read
675 Offset/size: 0x260/4
676
677 This field indicates the amount of linear contiguous memory starting
678 at the kernel runtime start address that the kernel needs before it
679 is capable of examining its memory map. This is not the same thing
680 as the total amount of memory the kernel needs to boot, but it can
681 be used by a relocating boot loader to help select a safe load
682 address for the kernel.
683
684 The kernel runtime start address is determined by the following algorithm:
685
686 if (relocatable_kernel)
687 runtime_start = align_up(load_address, kernel_alignment)
688 else
689 runtime_start = pref_address
690
585 691
586**** THE IMAGE CHECKSUM 692**** THE IMAGE CHECKSUM
587 693
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index 34c13040a718..2db5893d6c97 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -150,11 +150,6 @@ NUMA
150 Otherwise, the remaining system RAM is allocated to an 150 Otherwise, the remaining system RAM is allocated to an
151 additional node. 151 additional node.
152 152
153 numa=hotadd=percent
154 Only allow hotadd memory to preallocate page structures upto
155 percent of already available memory.
156 numa=hotadd=0 will disable hotadd memory.
157
158ACPI 153ACPI
159 154
160 acpi=off Don't enable ACPI 155 acpi=off Don't enable ACPI
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 29b52b14d0b4..d6498e3cd713 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -6,10 +6,11 @@ Virtual memory map with 4 level page tables:
60000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm 60000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
7hole caused by [48:63] sign extension 7hole caused by [48:63] sign extension
8ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole 8ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole
9ffff880000000000 - ffffc0ffffffffff (=57 TB) direct mapping of all phys. memory 9ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
10ffffc10000000000 - ffffc1ffffffffff (=40 bits) hole 10ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
11ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space 11ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
12ffffe20000000000 - ffffe2ffffffffff (=40 bits) virtual memory map (1TB) 12ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
13ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
13... unused hole ... 14... unused hole ...
14ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0 15ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0
15ffffffffa0000000 - fffffffffff00000 (=1536 MB) module mapping space 16ffffffffa0000000 - fffffffffff00000 (=1536 MB) module mapping space
diff --git a/Makefile b/Makefile
index 106515492089..03373bb703ca 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
1VERSION = 2 1VERSION = 2
2PATCHLEVEL = 6 2PATCHLEVEL = 6
3SUBLEVEL = 30 3SUBLEVEL = 30
4EXTRAVERSION = -rc8 4EXTRAVERSION =
5NAME = Man-Eating Seals of Antiquity 5NAME = Man-Eating Seals of Antiquity
6 6
7# *DOCUMENTATION* 7# *DOCUMENTATION*
diff --git a/arch/alpha/kernel/sys_dp264.c b/arch/alpha/kernel/sys_dp264.c
index 9c9d1fd4155f..5bd5259324b7 100644
--- a/arch/alpha/kernel/sys_dp264.c
+++ b/arch/alpha/kernel/sys_dp264.c
@@ -176,22 +176,26 @@ cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity)
176 } 176 }
177} 177}
178 178
179static void 179static int
180dp264_set_affinity(unsigned int irq, const struct cpumask *affinity) 180dp264_set_affinity(unsigned int irq, const struct cpumask *affinity)
181{ 181{
182 spin_lock(&dp264_irq_lock); 182 spin_lock(&dp264_irq_lock);
183 cpu_set_irq_affinity(irq, *affinity); 183 cpu_set_irq_affinity(irq, *affinity);
184 tsunami_update_irq_hw(cached_irq_mask); 184 tsunami_update_irq_hw(cached_irq_mask);
185 spin_unlock(&dp264_irq_lock); 185 spin_unlock(&dp264_irq_lock);
186
187 return 0;
186} 188}
187 189
188static void 190static int
189clipper_set_affinity(unsigned int irq, const struct cpumask *affinity) 191clipper_set_affinity(unsigned int irq, const struct cpumask *affinity)
190{ 192{
191 spin_lock(&dp264_irq_lock); 193 spin_lock(&dp264_irq_lock);
192 cpu_set_irq_affinity(irq - 16, *affinity); 194 cpu_set_irq_affinity(irq - 16, *affinity);
193 tsunami_update_irq_hw(cached_irq_mask); 195 tsunami_update_irq_hw(cached_irq_mask);
194 spin_unlock(&dp264_irq_lock); 196 spin_unlock(&dp264_irq_lock);
197
198 return 0;
195} 199}
196 200
197static struct hw_interrupt_type dp264_irq_type = { 201static struct hw_interrupt_type dp264_irq_type = {
diff --git a/arch/alpha/kernel/sys_titan.c b/arch/alpha/kernel/sys_titan.c
index 27f840a4ad3d..8dd239ebdb9e 100644
--- a/arch/alpha/kernel/sys_titan.c
+++ b/arch/alpha/kernel/sys_titan.c
@@ -157,13 +157,15 @@ titan_cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity)
157 157
158} 158}
159 159
160static void 160static int
161titan_set_irq_affinity(unsigned int irq, const struct cpumask *affinity) 161titan_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
162{ 162{
163 spin_lock(&titan_irq_lock); 163 spin_lock(&titan_irq_lock);
164 titan_cpu_set_irq_affinity(irq - 16, *affinity); 164 titan_cpu_set_irq_affinity(irq - 16, *affinity);
165 titan_update_irq_hw(titan_cached_irq_mask); 165 titan_update_irq_hw(titan_cached_irq_mask);
166 spin_unlock(&titan_irq_lock); 166 spin_unlock(&titan_irq_lock);
167
168 return 0;
167} 169}
168 170
169static void 171static void
diff --git a/arch/arm/common/gic.c b/arch/arm/common/gic.c
index 3e1714c6523f..664c7b8b1ba8 100644
--- a/arch/arm/common/gic.c
+++ b/arch/arm/common/gic.c
@@ -109,7 +109,7 @@ static void gic_unmask_irq(unsigned int irq)
109} 109}
110 110
111#ifdef CONFIG_SMP 111#ifdef CONFIG_SMP
112static void gic_set_cpu(unsigned int irq, const struct cpumask *mask_val) 112static int gic_set_cpu(unsigned int irq, const struct cpumask *mask_val)
113{ 113{
114 void __iomem *reg = gic_dist_base(irq) + GIC_DIST_TARGET + (gic_irq(irq) & ~3); 114 void __iomem *reg = gic_dist_base(irq) + GIC_DIST_TARGET + (gic_irq(irq) & ~3);
115 unsigned int shift = (irq % 4) * 8; 115 unsigned int shift = (irq % 4) * 8;
@@ -122,6 +122,8 @@ static void gic_set_cpu(unsigned int irq, const struct cpumask *mask_val)
122 val |= 1 << (cpu + shift); 122 val |= 1 << (cpu + shift);
123 writel(val, reg); 123 writel(val, reg);
124 spin_unlock(&irq_controller_lock); 124 spin_unlock(&irq_controller_lock);
125
126 return 0;
125} 127}
126#endif 128#endif
127 129
diff --git a/arch/cris/arch-v32/kernel/irq.c b/arch/cris/arch-v32/kernel/irq.c
index df3925cb1c7f..d70b445f4a8f 100644
--- a/arch/cris/arch-v32/kernel/irq.c
+++ b/arch/cris/arch-v32/kernel/irq.c
@@ -325,12 +325,14 @@ static void end_crisv32_irq(unsigned int irq)
325{ 325{
326} 326}
327 327
328void set_affinity_crisv32_irq(unsigned int irq, const struct cpumask *dest) 328int set_affinity_crisv32_irq(unsigned int irq, const struct cpumask *dest)
329{ 329{
330 unsigned long flags; 330 unsigned long flags;
331 spin_lock_irqsave(&irq_lock, flags); 331 spin_lock_irqsave(&irq_lock, flags);
332 irq_allocations[irq - FIRST_IRQ].mask = *dest; 332 irq_allocations[irq - FIRST_IRQ].mask = *dest;
333 spin_unlock_irqrestore(&irq_lock, flags); 333 spin_unlock_irqrestore(&irq_lock, flags);
334
335 return 0;
334} 336}
335 337
336static struct irq_chip crisv32_irq_type = { 338static struct irq_chip crisv32_irq_type = {
diff --git a/arch/ia64/hp/sim/hpsim_irq.c b/arch/ia64/hp/sim/hpsim_irq.c
index cc0a3182db3c..acb5047ab573 100644
--- a/arch/ia64/hp/sim/hpsim_irq.c
+++ b/arch/ia64/hp/sim/hpsim_irq.c
@@ -21,9 +21,10 @@ hpsim_irq_noop (unsigned int irq)
21{ 21{
22} 22}
23 23
24static void 24static int
25hpsim_set_affinity_noop(unsigned int a, const struct cpumask *b) 25hpsim_set_affinity_noop(unsigned int a, const struct cpumask *b)
26{ 26{
27 return 0;
27} 28}
28 29
29static struct hw_interrupt_type irq_type_hp_sim = { 30static struct hw_interrupt_type irq_type_hp_sim = {
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index 5510317db37b..baec6f00f7f3 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -636,7 +636,7 @@ void __init acpi_numa_arch_fixup(void)
636 * success: return IRQ number (>=0) 636 * success: return IRQ number (>=0)
637 * failure: return < 0 637 * failure: return < 0
638 */ 638 */
639int acpi_register_gsi(u32 gsi, int triggering, int polarity) 639int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
640{ 640{
641 if (acpi_irq_model == ACPI_IRQ_MODEL_PLATFORM) 641 if (acpi_irq_model == ACPI_IRQ_MODEL_PLATFORM)
642 return gsi; 642 return gsi;
@@ -678,7 +678,8 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
678 678
679 fadt = (struct acpi_table_fadt *)fadt_header; 679 fadt = (struct acpi_table_fadt *)fadt_header;
680 680
681 acpi_register_gsi(fadt->sci_interrupt, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW); 681 acpi_register_gsi(NULL, fadt->sci_interrupt, ACPI_LEVEL_SENSITIVE,
682 ACPI_ACTIVE_LOW);
682 return 0; 683 return 0;
683} 684}
684 685
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c
index 166e0d839fa0..f92cef47bf86 100644
--- a/arch/ia64/kernel/iosapic.c
+++ b/arch/ia64/kernel/iosapic.c
@@ -329,7 +329,7 @@ unmask_irq (unsigned int irq)
329} 329}
330 330
331 331
332static void 332static int
333iosapic_set_affinity(unsigned int irq, const struct cpumask *mask) 333iosapic_set_affinity(unsigned int irq, const struct cpumask *mask)
334{ 334{
335#ifdef CONFIG_SMP 335#ifdef CONFIG_SMP
@@ -343,15 +343,15 @@ iosapic_set_affinity(unsigned int irq, const struct cpumask *mask)
343 343
344 cpu = cpumask_first_and(cpu_online_mask, mask); 344 cpu = cpumask_first_and(cpu_online_mask, mask);
345 if (cpu >= nr_cpu_ids) 345 if (cpu >= nr_cpu_ids)
346 return; 346 return -1;
347 347
348 if (irq_prepare_move(irq, cpu)) 348 if (irq_prepare_move(irq, cpu))
349 return; 349 return -1;
350 350
351 dest = cpu_physical_id(cpu); 351 dest = cpu_physical_id(cpu);
352 352
353 if (!iosapic_intr_info[irq].count) 353 if (!iosapic_intr_info[irq].count)
354 return; /* not an IOSAPIC interrupt */ 354 return -1; /* not an IOSAPIC interrupt */
355 355
356 set_irq_affinity_info(irq, dest, redir); 356 set_irq_affinity_info(irq, dest, redir);
357 357
@@ -376,7 +376,9 @@ iosapic_set_affinity(unsigned int irq, const struct cpumask *mask)
376 iosapic_write(iosapic, IOSAPIC_RTE_HIGH(rte_index), high32); 376 iosapic_write(iosapic, IOSAPIC_RTE_HIGH(rte_index), high32);
377 iosapic_write(iosapic, IOSAPIC_RTE_LOW(rte_index), low32); 377 iosapic_write(iosapic, IOSAPIC_RTE_LOW(rte_index), low32);
378 } 378 }
379
379#endif 380#endif
381 return 0;
380} 382}
381 383
382/* 384/*
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
index 2b15e233f7fe..0f8ade9331ba 100644
--- a/arch/ia64/kernel/msi_ia64.c
+++ b/arch/ia64/kernel/msi_ia64.c
@@ -12,7 +12,7 @@
12static struct irq_chip ia64_msi_chip; 12static struct irq_chip ia64_msi_chip;
13 13
14#ifdef CONFIG_SMP 14#ifdef CONFIG_SMP
15static void ia64_set_msi_irq_affinity(unsigned int irq, 15static int ia64_set_msi_irq_affinity(unsigned int irq,
16 const cpumask_t *cpu_mask) 16 const cpumask_t *cpu_mask)
17{ 17{
18 struct msi_msg msg; 18 struct msi_msg msg;
@@ -20,10 +20,10 @@ static void ia64_set_msi_irq_affinity(unsigned int irq,
20 int cpu = first_cpu(*cpu_mask); 20 int cpu = first_cpu(*cpu_mask);
21 21
22 if (!cpu_online(cpu)) 22 if (!cpu_online(cpu))
23 return; 23 return -1;
24 24
25 if (irq_prepare_move(irq, cpu)) 25 if (irq_prepare_move(irq, cpu))
26 return; 26 return -1;
27 27
28 read_msi_msg(irq, &msg); 28 read_msi_msg(irq, &msg);
29 29
@@ -39,6 +39,8 @@ static void ia64_set_msi_irq_affinity(unsigned int irq,
39 39
40 write_msi_msg(irq, &msg); 40 write_msi_msg(irq, &msg);
41 cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu)); 41 cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu));
42
43 return 0;
42} 44}
43#endif /* CONFIG_SMP */ 45#endif /* CONFIG_SMP */
44 46
@@ -130,17 +132,17 @@ void arch_teardown_msi_irq(unsigned int irq)
130 132
131#ifdef CONFIG_DMAR 133#ifdef CONFIG_DMAR
132#ifdef CONFIG_SMP 134#ifdef CONFIG_SMP
133static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 135static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
134{ 136{
135 struct irq_cfg *cfg = irq_cfg + irq; 137 struct irq_cfg *cfg = irq_cfg + irq;
136 struct msi_msg msg; 138 struct msi_msg msg;
137 int cpu = cpumask_first(mask); 139 int cpu = cpumask_first(mask);
138 140
139 if (!cpu_online(cpu)) 141 if (!cpu_online(cpu))
140 return; 142 return -1;
141 143
142 if (irq_prepare_move(irq, cpu)) 144 if (irq_prepare_move(irq, cpu))
143 return; 145 return -1;
144 146
145 dmar_msi_read(irq, &msg); 147 dmar_msi_read(irq, &msg);
146 148
@@ -151,6 +153,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
151 153
152 dmar_msi_write(irq, &msg); 154 dmar_msi_write(irq, &msg);
153 cpumask_copy(irq_desc[irq].affinity, mask); 155 cpumask_copy(irq_desc[irq].affinity, mask);
156
157 return 0;
154} 158}
155#endif /* CONFIG_SMP */ 159#endif /* CONFIG_SMP */
156 160
diff --git a/arch/ia64/sn/kernel/irq.c b/arch/ia64/sn/kernel/irq.c
index 66fd705e82c0..764f26abac05 100644
--- a/arch/ia64/sn/kernel/irq.c
+++ b/arch/ia64/sn/kernel/irq.c
@@ -227,7 +227,7 @@ finish_up:
227 return new_irq_info; 227 return new_irq_info;
228} 228}
229 229
230static void sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask) 230static int sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask)
231{ 231{
232 struct sn_irq_info *sn_irq_info, *sn_irq_info_safe; 232 struct sn_irq_info *sn_irq_info, *sn_irq_info_safe;
233 nasid_t nasid; 233 nasid_t nasid;
@@ -239,6 +239,8 @@ static void sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask)
239 list_for_each_entry_safe(sn_irq_info, sn_irq_info_safe, 239 list_for_each_entry_safe(sn_irq_info, sn_irq_info_safe,
240 sn_irq_lh[irq], list) 240 sn_irq_lh[irq], list)
241 (void)sn_retarget_vector(sn_irq_info, nasid, slice); 241 (void)sn_retarget_vector(sn_irq_info, nasid, slice);
242
243 return 0;
242} 244}
243 245
244#ifdef CONFIG_SMP 246#ifdef CONFIG_SMP
diff --git a/arch/ia64/sn/kernel/msi_sn.c b/arch/ia64/sn/kernel/msi_sn.c
index 81e428943d73..fbbfb9701201 100644
--- a/arch/ia64/sn/kernel/msi_sn.c
+++ b/arch/ia64/sn/kernel/msi_sn.c
@@ -151,7 +151,7 @@ int sn_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *entry)
151} 151}
152 152
153#ifdef CONFIG_SMP 153#ifdef CONFIG_SMP
154static void sn_set_msi_irq_affinity(unsigned int irq, 154static int sn_set_msi_irq_affinity(unsigned int irq,
155 const struct cpumask *cpu_mask) 155 const struct cpumask *cpu_mask)
156{ 156{
157 struct msi_msg msg; 157 struct msi_msg msg;
@@ -168,7 +168,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq,
168 cpu = cpumask_first(cpu_mask); 168 cpu = cpumask_first(cpu_mask);
169 sn_irq_info = sn_msi_info[irq].sn_irq_info; 169 sn_irq_info = sn_msi_info[irq].sn_irq_info;
170 if (sn_irq_info == NULL || sn_irq_info->irq_int_bit >= 0) 170 if (sn_irq_info == NULL || sn_irq_info->irq_int_bit >= 0)
171 return; 171 return -1;
172 172
173 /* 173 /*
174 * Release XIO resources for the old MSI PCI address 174 * Release XIO resources for the old MSI PCI address
@@ -189,7 +189,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq,
189 new_irq_info = sn_retarget_vector(sn_irq_info, nasid, slice); 189 new_irq_info = sn_retarget_vector(sn_irq_info, nasid, slice);
190 sn_msi_info[irq].sn_irq_info = new_irq_info; 190 sn_msi_info[irq].sn_irq_info = new_irq_info;
191 if (new_irq_info == NULL) 191 if (new_irq_info == NULL)
192 return; 192 return -1;
193 193
194 /* 194 /*
195 * Map the xio address into bus space 195 * Map the xio address into bus space
@@ -206,6 +206,8 @@ static void sn_set_msi_irq_affinity(unsigned int irq,
206 206
207 write_msi_msg(irq, &msg); 207 write_msi_msg(irq, &msg);
208 cpumask_copy(irq_desc[irq].affinity, cpu_mask); 208 cpumask_copy(irq_desc[irq].affinity, cpu_mask);
209
210 return 0;
209} 211}
210#endif /* CONFIG_SMP */ 212#endif /* CONFIG_SMP */
211 213
diff --git a/arch/mips/cavium-octeon/octeon-irq.c b/arch/mips/cavium-octeon/octeon-irq.c
index 1c19af8daa62..d3a0c8154bec 100644
--- a/arch/mips/cavium-octeon/octeon-irq.c
+++ b/arch/mips/cavium-octeon/octeon-irq.c
@@ -177,7 +177,7 @@ static void octeon_irq_ciu0_disable(unsigned int irq)
177} 177}
178 178
179#ifdef CONFIG_SMP 179#ifdef CONFIG_SMP
180static void octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask *dest) 180static int octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask *dest)
181{ 181{
182 int cpu; 182 int cpu;
183 int bit = irq - OCTEON_IRQ_WORKQ0; /* Bit 0-63 of EN0 */ 183 int bit = irq - OCTEON_IRQ_WORKQ0; /* Bit 0-63 of EN0 */
@@ -199,6 +199,8 @@ static void octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask
199 */ 199 */
200 cvmx_read_csr(CVMX_CIU_INTX_EN0(cvmx_get_core_num() * 2)); 200 cvmx_read_csr(CVMX_CIU_INTX_EN0(cvmx_get_core_num() * 2));
201 write_unlock(&octeon_irq_ciu0_rwlock); 201 write_unlock(&octeon_irq_ciu0_rwlock);
202
203 return 0;
202} 204}
203#endif 205#endif
204 206
@@ -292,7 +294,7 @@ static void octeon_irq_ciu1_disable(unsigned int irq)
292} 294}
293 295
294#ifdef CONFIG_SMP 296#ifdef CONFIG_SMP
295static void octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask *dest) 297static int octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask *dest)
296{ 298{
297 int cpu; 299 int cpu;
298 int bit = irq - OCTEON_IRQ_WDOG0; /* Bit 0-63 of EN1 */ 300 int bit = irq - OCTEON_IRQ_WDOG0; /* Bit 0-63 of EN1 */
@@ -315,6 +317,8 @@ static void octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask
315 */ 317 */
316 cvmx_read_csr(CVMX_CIU_INTX_EN1(cvmx_get_core_num() * 2 + 1)); 318 cvmx_read_csr(CVMX_CIU_INTX_EN1(cvmx_get_core_num() * 2 + 1));
317 write_unlock(&octeon_irq_ciu1_rwlock); 319 write_unlock(&octeon_irq_ciu1_rwlock);
320
321 return 0;
318} 322}
319#endif 323#endif
320 324
diff --git a/arch/mips/include/asm/irq.h b/arch/mips/include/asm/irq.h
index 3214ade02d10..4f1eed107b08 100644
--- a/arch/mips/include/asm/irq.h
+++ b/arch/mips/include/asm/irq.h
@@ -49,7 +49,7 @@ static inline void smtc_im_ack_irq(unsigned int irq)
49#ifdef CONFIG_MIPS_MT_SMTC_IRQAFF 49#ifdef CONFIG_MIPS_MT_SMTC_IRQAFF
50#include <linux/cpumask.h> 50#include <linux/cpumask.h>
51 51
52extern void plat_set_irq_affinity(unsigned int irq, 52extern int plat_set_irq_affinity(unsigned int irq,
53 const struct cpumask *affinity); 53 const struct cpumask *affinity);
54extern void smtc_forward_irq(unsigned int irq); 54extern void smtc_forward_irq(unsigned int irq);
55 55
diff --git a/arch/mips/kernel/irq-gic.c b/arch/mips/kernel/irq-gic.c
index 87deb8f6c458..3f43c2e3aa5a 100644
--- a/arch/mips/kernel/irq-gic.c
+++ b/arch/mips/kernel/irq-gic.c
@@ -155,7 +155,7 @@ static void gic_unmask_irq(unsigned int irq)
155 155
156static DEFINE_SPINLOCK(gic_lock); 156static DEFINE_SPINLOCK(gic_lock);
157 157
158static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask) 158static int gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
159{ 159{
160 cpumask_t tmp = CPU_MASK_NONE; 160 cpumask_t tmp = CPU_MASK_NONE;
161 unsigned long flags; 161 unsigned long flags;
@@ -166,7 +166,7 @@ static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
166 166
167 cpumask_and(&tmp, cpumask, cpu_online_mask); 167 cpumask_and(&tmp, cpumask, cpu_online_mask);
168 if (cpus_empty(tmp)) 168 if (cpus_empty(tmp))
169 return; 169 return -1;
170 170
171 /* Assumption : cpumask refers to a single CPU */ 171 /* Assumption : cpumask refers to a single CPU */
172 spin_lock_irqsave(&gic_lock, flags); 172 spin_lock_irqsave(&gic_lock, flags);
@@ -190,6 +190,7 @@ static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
190 cpumask_copy(irq_desc[irq].affinity, cpumask); 190 cpumask_copy(irq_desc[irq].affinity, cpumask);
191 spin_unlock_irqrestore(&gic_lock, flags); 191 spin_unlock_irqrestore(&gic_lock, flags);
192 192
193 return 0;
193} 194}
194#endif 195#endif
195 196
diff --git a/arch/mips/mti-malta/malta-smtc.c b/arch/mips/mti-malta/malta-smtc.c
index 5ba31888fefb..499ffe5475df 100644
--- a/arch/mips/mti-malta/malta-smtc.c
+++ b/arch/mips/mti-malta/malta-smtc.c
@@ -114,7 +114,7 @@ struct plat_smp_ops msmtc_smp_ops = {
114 */ 114 */
115 115
116 116
117void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity) 117int plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
118{ 118{
119 cpumask_t tmask; 119 cpumask_t tmask;
120 int cpu = 0; 120 int cpu = 0;
@@ -156,5 +156,7 @@ void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
156 156
157 /* Do any generic SMTC IRQ affinity setup */ 157 /* Do any generic SMTC IRQ affinity setup */
158 smtc_set_irq_affinity(irq, tmask); 158 smtc_set_irq_affinity(irq, tmask);
159
160 return 0;
159} 161}
160#endif /* CONFIG_MIPS_MT_SMTC_IRQAFF */ 162#endif /* CONFIG_MIPS_MT_SMTC_IRQAFF */
diff --git a/arch/mips/sibyte/bcm1480/irq.c b/arch/mips/sibyte/bcm1480/irq.c
index c147c4b35d3f..690de06bde90 100644
--- a/arch/mips/sibyte/bcm1480/irq.c
+++ b/arch/mips/sibyte/bcm1480/irq.c
@@ -50,7 +50,7 @@ static void enable_bcm1480_irq(unsigned int irq);
50static void disable_bcm1480_irq(unsigned int irq); 50static void disable_bcm1480_irq(unsigned int irq);
51static void ack_bcm1480_irq(unsigned int irq); 51static void ack_bcm1480_irq(unsigned int irq);
52#ifdef CONFIG_SMP 52#ifdef CONFIG_SMP
53static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask); 53static int bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask);
54#endif 54#endif
55 55
56#ifdef CONFIG_PCI 56#ifdef CONFIG_PCI
@@ -109,7 +109,7 @@ void bcm1480_unmask_irq(int cpu, int irq)
109} 109}
110 110
111#ifdef CONFIG_SMP 111#ifdef CONFIG_SMP
112static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask) 112static int bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
113{ 113{
114 int i = 0, old_cpu, cpu, int_on, k; 114 int i = 0, old_cpu, cpu, int_on, k;
115 u64 cur_ints; 115 u64 cur_ints;
@@ -118,7 +118,7 @@ static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
118 118
119 if (cpumask_weight(mask) != 1) { 119 if (cpumask_weight(mask) != 1) {
120 printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq); 120 printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq);
121 return; 121 return -1;
122 } 122 }
123 i = cpumask_first(mask); 123 i = cpumask_first(mask);
124 124
@@ -152,6 +152,8 @@ static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
152 } 152 }
153 } 153 }
154 spin_unlock_irqrestore(&bcm1480_imr_lock, flags); 154 spin_unlock_irqrestore(&bcm1480_imr_lock, flags);
155
156 return 0;
155} 157}
156#endif 158#endif
157 159
diff --git a/arch/mips/sibyte/sb1250/irq.c b/arch/mips/sibyte/sb1250/irq.c
index 38cb998ade22..409dec798863 100644
--- a/arch/mips/sibyte/sb1250/irq.c
+++ b/arch/mips/sibyte/sb1250/irq.c
@@ -50,7 +50,7 @@ static void enable_sb1250_irq(unsigned int irq);
50static void disable_sb1250_irq(unsigned int irq); 50static void disable_sb1250_irq(unsigned int irq);
51static void ack_sb1250_irq(unsigned int irq); 51static void ack_sb1250_irq(unsigned int irq);
52#ifdef CONFIG_SMP 52#ifdef CONFIG_SMP
53static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask); 53static int sb1250_set_affinity(unsigned int irq, const struct cpumask *mask);
54#endif 54#endif
55 55
56#ifdef CONFIG_SIBYTE_HAS_LDT 56#ifdef CONFIG_SIBYTE_HAS_LDT
@@ -103,7 +103,7 @@ void sb1250_unmask_irq(int cpu, int irq)
103} 103}
104 104
105#ifdef CONFIG_SMP 105#ifdef CONFIG_SMP
106static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask) 106static int sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
107{ 107{
108 int i = 0, old_cpu, cpu, int_on; 108 int i = 0, old_cpu, cpu, int_on;
109 u64 cur_ints; 109 u64 cur_ints;
@@ -113,7 +113,7 @@ static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
113 113
114 if (cpumask_weight(mask) > 1) { 114 if (cpumask_weight(mask) > 1) {
115 printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq); 115 printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq);
116 return; 116 return -1;
117 } 117 }
118 118
119 /* Convert logical CPU to physical CPU */ 119 /* Convert logical CPU to physical CPU */
@@ -143,6 +143,8 @@ static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
143 R_IMR_INTERRUPT_MASK)); 143 R_IMR_INTERRUPT_MASK));
144 } 144 }
145 spin_unlock_irqrestore(&sb1250_imr_lock, flags); 145 spin_unlock_irqrestore(&sb1250_imr_lock, flags);
146
147 return 0;
146} 148}
147#endif 149#endif
148 150
diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index 4ea4229d765c..8007f1e65729 100644
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -130,15 +130,17 @@ int cpu_check_affinity(unsigned int irq, const struct cpumask *dest)
130 return cpu_dest; 130 return cpu_dest;
131} 131}
132 132
133static void cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest) 133static int cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest)
134{ 134{
135 int cpu_dest; 135 int cpu_dest;
136 136
137 cpu_dest = cpu_check_affinity(irq, dest); 137 cpu_dest = cpu_check_affinity(irq, dest);
138 if (cpu_dest < 0) 138 if (cpu_dest < 0)
139 return; 139 return -1;
140 140
141 cpumask_copy(&irq_desc[irq].affinity, dest); 141 cpumask_copy(&irq_desc[irq].affinity, dest);
142
143 return 0;
142} 144}
143#endif 145#endif
144 146
diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c
index 80b513449f4c..be3581a8c294 100644
--- a/arch/powerpc/platforms/pseries/xics.c
+++ b/arch/powerpc/platforms/pseries/xics.c
@@ -333,7 +333,7 @@ static void xics_eoi_lpar(unsigned int virq)
333 lpar_xirr_info_set((0xff << 24) | irq); 333 lpar_xirr_info_set((0xff << 24) | irq);
334} 334}
335 335
336static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) 336static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
337{ 337{
338 unsigned int irq; 338 unsigned int irq;
339 int status; 339 int status;
@@ -342,14 +342,14 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
342 342
343 irq = (unsigned int)irq_map[virq].hwirq; 343 irq = (unsigned int)irq_map[virq].hwirq;
344 if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS) 344 if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS)
345 return; 345 return -1;
346 346
347 status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq); 347 status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq);
348 348
349 if (status) { 349 if (status) {
350 printk(KERN_ERR "%s: ibm,get-xive irq=%u returns %d\n", 350 printk(KERN_ERR "%s: ibm,get-xive irq=%u returns %d\n",
351 __func__, irq, status); 351 __func__, irq, status);
352 return; 352 return -1;
353 } 353 }
354 354
355 /* 355 /*
@@ -363,7 +363,7 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
363 printk(KERN_WARNING 363 printk(KERN_WARNING
364 "%s: No online cpus in the mask %s for irq %d\n", 364 "%s: No online cpus in the mask %s for irq %d\n",
365 __func__, cpulist, virq); 365 __func__, cpulist, virq);
366 return; 366 return -1;
367 } 367 }
368 368
369 status = rtas_call(ibm_set_xive, 3, 1, NULL, 369 status = rtas_call(ibm_set_xive, 3, 1, NULL,
@@ -372,8 +372,10 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
372 if (status) { 372 if (status) {
373 printk(KERN_ERR "%s: ibm,set-xive irq=%u returns %d\n", 373 printk(KERN_ERR "%s: ibm,set-xive irq=%u returns %d\n",
374 __func__, irq, status); 374 __func__, irq, status);
375 return; 375 return -1;
376 } 376 }
377
378 return 0;
377} 379}
378 380
379static struct irq_chip xics_pic_direct = { 381static struct irq_chip xics_pic_direct = {
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index 0efc12d1a3d7..352d8c3ef526 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -807,7 +807,7 @@ static void mpic_end_ipi(unsigned int irq)
807 807
808#endif /* CONFIG_SMP */ 808#endif /* CONFIG_SMP */
809 809
810void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask) 810int mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
811{ 811{
812 struct mpic *mpic = mpic_from_irq(irq); 812 struct mpic *mpic = mpic_from_irq(irq);
813 unsigned int src = mpic_irq_to_hw(irq); 813 unsigned int src = mpic_irq_to_hw(irq);
@@ -824,6 +824,8 @@ void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
824 mpic_irq_write(src, MPIC_INFO(IRQ_DESTINATION), 824 mpic_irq_write(src, MPIC_INFO(IRQ_DESTINATION),
825 mpic_physmask(cpus_addr(tmp)[0])); 825 mpic_physmask(cpus_addr(tmp)[0]));
826 } 826 }
827
828 return 0;
827} 829}
828 830
829static unsigned int mpic_type_to_vecpri(struct mpic *mpic, unsigned int type) 831static unsigned int mpic_type_to_vecpri(struct mpic *mpic, unsigned int type)
diff --git a/arch/powerpc/sysdev/mpic.h b/arch/powerpc/sysdev/mpic.h
index 3cef2af10f42..eff433c322a0 100644
--- a/arch/powerpc/sysdev/mpic.h
+++ b/arch/powerpc/sysdev/mpic.h
@@ -36,6 +36,6 @@ static inline int mpic_pasemi_msi_init(struct mpic *mpic)
36 36
37extern int mpic_set_irq_type(unsigned int virq, unsigned int flow_type); 37extern int mpic_set_irq_type(unsigned int virq, unsigned int flow_type);
38extern void mpic_set_vector(unsigned int virq, unsigned int vector); 38extern void mpic_set_vector(unsigned int virq, unsigned int vector);
39extern void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask); 39extern int mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask);
40 40
41#endif /* _POWERPC_SYSDEV_MPIC_H */ 41#endif /* _POWERPC_SYSDEV_MPIC_H */
diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h
index 639ac805448a..65865726b283 100644
--- a/arch/sparc/include/asm/thread_info_64.h
+++ b/arch/sparc/include/asm/thread_info_64.h
@@ -102,8 +102,8 @@ struct thread_info {
102#define TI_KERN_CNTD1 0x00000488 102#define TI_KERN_CNTD1 0x00000488
103#define TI_PCR 0x00000490 103#define TI_PCR 0x00000490
104#define TI_RESTART_BLOCK 0x00000498 104#define TI_RESTART_BLOCK 0x00000498
105#define TI_KUNA_REGS 0x000004c0 105#define TI_KUNA_REGS 0x000004c8
106#define TI_KUNA_INSN 0x000004c8 106#define TI_KUNA_INSN 0x000004d0
107#define TI_FPREGS 0x00000500 107#define TI_FPREGS 0x00000500
108 108
109/* We embed this in the uppermost byte of thread_info->flags */ 109/* We embed this in the uppermost byte of thread_info->flags */
diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
index 5deabe921a47..e5e78f9cfc95 100644
--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -318,10 +318,12 @@ static void sun4u_irq_enable(unsigned int virt_irq)
318 } 318 }
319} 319}
320 320
321static void sun4u_set_affinity(unsigned int virt_irq, 321static int sun4u_set_affinity(unsigned int virt_irq,
322 const struct cpumask *mask) 322 const struct cpumask *mask)
323{ 323{
324 sun4u_irq_enable(virt_irq); 324 sun4u_irq_enable(virt_irq);
325
326 return 0;
325} 327}
326 328
327/* Don't do anything. The desc->status check for IRQ_DISABLED in 329/* Don't do anything. The desc->status check for IRQ_DISABLED in
@@ -377,7 +379,7 @@ static void sun4v_irq_enable(unsigned int virt_irq)
377 ino, err); 379 ino, err);
378} 380}
379 381
380static void sun4v_set_affinity(unsigned int virt_irq, 382static int sun4v_set_affinity(unsigned int virt_irq,
381 const struct cpumask *mask) 383 const struct cpumask *mask)
382{ 384{
383 unsigned int ino = virt_irq_table[virt_irq].dev_ino; 385 unsigned int ino = virt_irq_table[virt_irq].dev_ino;
@@ -388,6 +390,8 @@ static void sun4v_set_affinity(unsigned int virt_irq,
388 if (err != HV_EOK) 390 if (err != HV_EOK)
389 printk(KERN_ERR "sun4v_intr_settarget(%x,%lu): " 391 printk(KERN_ERR "sun4v_intr_settarget(%x,%lu): "
390 "err(%d)\n", ino, cpuid, err); 392 "err(%d)\n", ino, cpuid, err);
393
394 return 0;
391} 395}
392 396
393static void sun4v_irq_disable(unsigned int virt_irq) 397static void sun4v_irq_disable(unsigned int virt_irq)
@@ -445,7 +449,7 @@ static void sun4v_virq_enable(unsigned int virt_irq)
445 dev_handle, dev_ino, err); 449 dev_handle, dev_ino, err);
446} 450}
447 451
448static void sun4v_virt_set_affinity(unsigned int virt_irq, 452static int sun4v_virt_set_affinity(unsigned int virt_irq,
449 const struct cpumask *mask) 453 const struct cpumask *mask)
450{ 454{
451 unsigned long cpuid, dev_handle, dev_ino; 455 unsigned long cpuid, dev_handle, dev_ino;
@@ -461,6 +465,8 @@ static void sun4v_virt_set_affinity(unsigned int virt_irq,
461 printk(KERN_ERR "sun4v_vintr_set_target(%lx,%lx,%lu): " 465 printk(KERN_ERR "sun4v_vintr_set_target(%lx,%lx,%lu): "
462 "err(%d)\n", 466 "err(%d)\n",
463 dev_handle, dev_ino, cpuid, err); 467 dev_handle, dev_ino, cpuid, err);
468
469 return 0;
464} 470}
465 471
466static void sun4v_virq_disable(unsigned int virt_irq) 472static void sun4v_virq_disable(unsigned int virt_irq)
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
new file mode 100644
index 000000000000..ad8ec356fb36
--- /dev/null
+++ b/arch/x86/Kbuild
@@ -0,0 +1,16 @@
1
2obj-$(CONFIG_KVM) += kvm/
3
4# Xen paravirtualization support
5obj-$(CONFIG_XEN) += xen/
6
7# lguest paravirtualization support
8obj-$(CONFIG_LGUEST_GUEST) += lguest/
9
10obj-y += kernel/
11obj-y += mm/
12
13obj-y += crypto/
14obj-y += vdso/
15obj-$(CONFIG_IA32_EMULATION) += ia32/
16
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a6efe0a2e9ae..aafae3b140de 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -47,6 +47,11 @@ config X86
47 select HAVE_KERNEL_BZIP2 47 select HAVE_KERNEL_BZIP2
48 select HAVE_KERNEL_LZMA 48 select HAVE_KERNEL_LZMA
49 49
50config OUTPUT_FORMAT
51 string
52 default "elf32-i386" if X86_32
53 default "elf64-x86-64" if X86_64
54
50config ARCH_DEFCONFIG 55config ARCH_DEFCONFIG
51 string 56 string
52 default "arch/x86/configs/i386_defconfig" if X86_32 57 default "arch/x86/configs/i386_defconfig" if X86_32
@@ -274,15 +279,9 @@ config SPARSE_IRQ
274 279
275 If you don't know what to do here, say N. 280 If you don't know what to do here, say N.
276 281
277config NUMA_MIGRATE_IRQ_DESC 282config NUMA_IRQ_DESC
278 bool "Move irq desc when changing irq smp_affinity" 283 def_bool y
279 depends on SPARSE_IRQ && NUMA 284 depends on SPARSE_IRQ && NUMA
280 depends on BROKEN
281 default n
282 ---help---
283 This enables moving irq_desc to cpu/node that irq will use handled.
284
285 If you don't know what to do here, say N.
286 285
287config X86_MPPARSE 286config X86_MPPARSE
288 bool "Enable MPS table" if ACPI 287 bool "Enable MPS table" if ACPI
@@ -355,7 +354,7 @@ config X86_UV
355 depends on X86_64 354 depends on X86_64
356 depends on X86_EXTENDED_PLATFORM 355 depends on X86_EXTENDED_PLATFORM
357 depends on NUMA 356 depends on NUMA
358 select X86_X2APIC 357 depends on X86_X2APIC
359 ---help--- 358 ---help---
360 This option is needed in order to support SGI Ultraviolet systems. 359 This option is needed in order to support SGI Ultraviolet systems.
361 If you don't have one of these, you should say N here. 360 If you don't have one of these, you should say N here.
@@ -1466,9 +1465,7 @@ config KEXEC_JUMP
1466 1465
1467config PHYSICAL_START 1466config PHYSICAL_START
1468 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) 1467 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
1469 default "0x1000000" if X86_NUMAQ 1468 default "0x1000000"
1470 default "0x200000" if X86_64
1471 default "0x100000"
1472 ---help--- 1469 ---help---
1473 This gives the physical address where the kernel is loaded. 1470 This gives the physical address where the kernel is loaded.
1474 1471
@@ -1487,15 +1484,15 @@ config PHYSICAL_START
1487 to be specifically compiled to run from a specific memory area 1484 to be specifically compiled to run from a specific memory area
1488 (normally a reserved region) and this option comes handy. 1485 (normally a reserved region) and this option comes handy.
1489 1486
1490 So if you are using bzImage for capturing the crash dump, leave 1487 So if you are using bzImage for capturing the crash dump,
1491 the value here unchanged to 0x100000 and set CONFIG_RELOCATABLE=y. 1488 leave the value here unchanged to 0x1000000 and set
1492 Otherwise if you plan to use vmlinux for capturing the crash dump 1489 CONFIG_RELOCATABLE=y. Otherwise if you plan to use vmlinux
1493 change this value to start of the reserved region (Typically 16MB 1490 for capturing the crash dump change this value to start of
1494 0x1000000). In other words, it can be set based on the "X" value as 1491 the reserved region. In other words, it can be set based on
1495 specified in the "crashkernel=YM@XM" command line boot parameter 1492 the "X" value as specified in the "crashkernel=YM@XM"
1496 passed to the panic-ed kernel. Typically this parameter is set as 1493 command line boot parameter passed to the panic-ed
1497 crashkernel=64M@16M. Please take a look at 1494 kernel. Please take a look at Documentation/kdump/kdump.txt
1498 Documentation/kdump/kdump.txt for more details about crash dumps. 1495 for more details about crash dumps.
1499 1496
1500 Usage of bzImage for capturing the crash dump is recommended as 1497 Usage of bzImage for capturing the crash dump is recommended as
1501 one does not have to build two kernels. Same kernel can be used 1498 one does not have to build two kernels. Same kernel can be used
@@ -1508,8 +1505,8 @@ config PHYSICAL_START
1508 Don't change this unless you know what you are doing. 1505 Don't change this unless you know what you are doing.
1509 1506
1510config RELOCATABLE 1507config RELOCATABLE
1511 bool "Build a relocatable kernel (EXPERIMENTAL)" 1508 bool "Build a relocatable kernel"
1512 depends on EXPERIMENTAL 1509 default y
1513 ---help--- 1510 ---help---
1514 This builds a kernel image that retains relocation information 1511 This builds a kernel image that retains relocation information
1515 so it can be loaded someplace besides the default 1MB. 1512 so it can be loaded someplace besides the default 1MB.
@@ -1524,12 +1521,16 @@ config RELOCATABLE
1524 it has been loaded at and the compile time physical address 1521 it has been loaded at and the compile time physical address
1525 (CONFIG_PHYSICAL_START) is ignored. 1522 (CONFIG_PHYSICAL_START) is ignored.
1526 1523
1524# Relocation on x86-32 needs some additional build support
1525config X86_NEED_RELOCS
1526 def_bool y
1527 depends on X86_32 && RELOCATABLE
1528
1527config PHYSICAL_ALIGN 1529config PHYSICAL_ALIGN
1528 hex 1530 hex
1529 prompt "Alignment value to which kernel should be aligned" if X86_32 1531 prompt "Alignment value to which kernel should be aligned" if X86_32
1530 default "0x100000" if X86_32 1532 default "0x1000000"
1531 default "0x200000" if X86_64 1533 range 0x2000 0x1000000
1532 range 0x2000 0x400000
1533 ---help--- 1534 ---help---
1534 This value puts the alignment restrictions on physical address 1535 This value puts the alignment restrictions on physical address
1535 where kernel is loaded and run from. Kernel is compiled for an 1536 where kernel is loaded and run from. Kernel is compiled for an
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index d8359e73317f..33fac6bbe1c2 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -159,10 +159,17 @@ config IOMMU_DEBUG
159 options. See Documentation/x86_64/boot-options.txt for more 159 options. See Documentation/x86_64/boot-options.txt for more
160 details. 160 details.
161 161
162config IOMMU_STRESS
163 bool "Enable IOMMU stress-test mode"
164 ---help---
165 This option disables various optimizations in IOMMU related
166 code to do real stress testing of the IOMMU code. This option
167 will cause a performance drop and should only be enabled for
168 testing.
169
162config IOMMU_LEAK 170config IOMMU_LEAK
163 bool "IOMMU leak tracing" 171 bool "IOMMU leak tracing"
164 depends on DEBUG_KERNEL 172 depends on IOMMU_DEBUG && DMA_API_DEBUG
165 depends on IOMMU_DEBUG
166 ---help--- 173 ---help---
167 Add a simple leak tracer to the IOMMU code. This is useful when you 174 Add a simple leak tracer to the IOMMU code. This is useful when you
168 are debugging a buggy device driver that leaks IOMMU mappings. 175 are debugging a buggy device driver that leaks IOMMU mappings.
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 8c86b72afdc2..edbd0ca62067 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -7,8 +7,6 @@ else
7 KBUILD_DEFCONFIG := $(ARCH)_defconfig 7 KBUILD_DEFCONFIG := $(ARCH)_defconfig
8endif 8endif
9 9
10core-$(CONFIG_KVM) += arch/x86/kvm/
11
12# BITS is used as extension for files which are available in a 32 bit 10# BITS is used as extension for files which are available in a 32 bit
13# and a 64 bit version to simplify shared Makefiles. 11# and a 64 bit version to simplify shared Makefiles.
14# e.g.: obj-y += foo_$(BITS).o 12# e.g.: obj-y += foo_$(BITS).o
@@ -118,21 +116,8 @@ head-y += arch/x86/kernel/init_task.o
118 116
119libs-y += arch/x86/lib/ 117libs-y += arch/x86/lib/
120 118
121# Sub architecture files that needs linking first 119# See arch/x86/Kbuild for content of core part of the kernel
122core-y += $(fcore-y) 120core-y += arch/x86/
123
124# Xen paravirtualization support
125core-$(CONFIG_XEN) += arch/x86/xen/
126
127# lguest paravirtualization support
128core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
129
130core-y += arch/x86/kernel/
131core-y += arch/x86/mm/
132
133core-y += arch/x86/crypto/
134core-y += arch/x86/vdso/
135core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/
136 121
137# drivers-y are linked after core-y 122# drivers-y are linked after core-y
138drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/ 123drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore
index 172cf8a98bdd..851fe936d242 100644
--- a/arch/x86/boot/.gitignore
+++ b/arch/x86/boot/.gitignore
@@ -3,6 +3,8 @@ bzImage
3cpustr.h 3cpustr.h
4mkcpustr 4mkcpustr
5offsets.h 5offsets.h
6voffset.h
7zoffset.h
6setup 8setup
7setup.bin 9setup.bin
8setup.elf 10setup.elf
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 6633b6e7505a..8d16ada25048 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -26,9 +26,10 @@ targets := vmlinux.bin setup.bin setup.elf bzImage
26targets += fdimage fdimage144 fdimage288 image.iso mtools.conf 26targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
27subdir- := compressed 27subdir- := compressed
28 28
29setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o 29setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o edd.o
30setup-y += header.o main.o mca.o memory.o pm.o pmjump.o 30setup-y += header.o main.o mca.o memory.o pm.o pmjump.o
31setup-y += printf.o string.o tty.o video.o video-mode.o version.o 31setup-y += printf.o regs.o string.o tty.o video.o video-mode.o
32setup-y += version.o
32setup-$(CONFIG_X86_APM_BOOT) += apm.o 33setup-$(CONFIG_X86_APM_BOOT) += apm.o
33 34
34# The link order of the video-*.o modules can matter. In particular, 35# The link order of the video-*.o modules can matter. In particular,
@@ -86,19 +87,27 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
86 87
87SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) 88SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
88 89
89sed-offsets := -e 's/^00*/0/' \ 90sed-voffset := -e 's/^\([0-9a-fA-F]*\) . \(_text\|_end\)$$/\#define VO_\2 0x\1/p'
90 -e 's/^\([0-9a-fA-F]*\) . \(input_data\|input_data_end\)$$/\#define \2 0x\1/p'
91 91
92quiet_cmd_offsets = OFFSETS $@ 92quiet_cmd_voffset = VOFFSET $@
93 cmd_offsets = $(NM) $< | sed -n $(sed-offsets) > $@ 93 cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
94 94
95$(obj)/offsets.h: $(obj)/compressed/vmlinux FORCE 95targets += voffset.h
96 $(call if_changed,offsets) 96$(obj)/voffset.h: vmlinux FORCE
97 $(call if_changed,voffset)
98
99sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(startup_32\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p'
100
101quiet_cmd_zoffset = ZOFFSET $@
102 cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
103
104targets += zoffset.h
105$(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE
106 $(call if_changed,zoffset)
97 107
98targets += offsets.h
99 108
100AFLAGS_header.o += -I$(obj) 109AFLAGS_header.o += -I$(obj)
101$(obj)/header.o: $(obj)/offsets.h 110$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h
102 111
103LDFLAGS_setup.elf := -T 112LDFLAGS_setup.elf := -T
104$(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE 113$(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
index 7c19ce8c2442..64a31a6d751a 100644
--- a/arch/x86/boot/a20.c
+++ b/arch/x86/boot/a20.c
@@ -2,7 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007-2008 rPath, Inc. - All Rights Reserved 4 * Copyright 2007-2008 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation 5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
6 * 6 *
7 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
8 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -90,8 +90,11 @@ static int a20_test_long(void)
90 90
91static void enable_a20_bios(void) 91static void enable_a20_bios(void)
92{ 92{
93 asm volatile("pushfl; int $0x15; popfl" 93 struct biosregs ireg;
94 : : "a" ((u16)0x2401)); 94
95 initregs(&ireg);
96 ireg.ax = 0x2401;
97 intcall(0x15, &ireg, NULL);
95} 98}
96 99
97static void enable_a20_kbc(void) 100static void enable_a20_kbc(void)
diff --git a/arch/x86/boot/apm.c b/arch/x86/boot/apm.c
index 7aa6033001f9..ee274834ea8b 100644
--- a/arch/x86/boot/apm.c
+++ b/arch/x86/boot/apm.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * Original APM BIOS checking by Stephen Rothwell, May 1994 7 * Original APM BIOS checking by Stephen Rothwell, May 1994
7 * (sfr@canb.auug.org.au) 8 * (sfr@canb.auug.org.au)
@@ -19,75 +20,56 @@
19 20
20int query_apm_bios(void) 21int query_apm_bios(void)
21{ 22{
22 u16 ax, bx, cx, dx, di; 23 struct biosregs ireg, oreg;
23 u32 ebx, esi;
24 u8 err;
25 24
26 /* APM BIOS installation check */ 25 /* APM BIOS installation check */
27 ax = 0x5300; 26 initregs(&ireg);
28 bx = cx = 0; 27 ireg.ah = 0x53;
29 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0" 28 intcall(0x15, &ireg, &oreg);
30 : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
31 : : "esi", "edi");
32 29
33 if (err) 30 if (oreg.flags & X86_EFLAGS_CF)
34 return -1; /* No APM BIOS */ 31 return -1; /* No APM BIOS */
35 32
36 if (bx != 0x504d) /* "PM" signature */ 33 if (oreg.bx != 0x504d) /* "PM" signature */
37 return -1; 34 return -1;
38 35
39 if (!(cx & 0x02)) /* 32 bits supported? */ 36 if (!(oreg.cx & 0x02)) /* 32 bits supported? */
40 return -1; 37 return -1;
41 38
42 /* Disconnect first, just in case */ 39 /* Disconnect first, just in case */
43 ax = 0x5304; 40 ireg.al = 0x04;
44 bx = 0; 41 intcall(0x15, &ireg, NULL);
45 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
46 : "+a" (ax), "+b" (bx)
47 : : "ecx", "edx", "esi", "edi");
48
49 /* Paranoia */
50 ebx = esi = 0;
51 cx = dx = di = 0;
52 42
53 /* 32-bit connect */ 43 /* 32-bit connect */
54 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %6" 44 ireg.al = 0x03;
55 : "=a" (ax), "+b" (ebx), "+c" (cx), "+d" (dx), 45 intcall(0x15, &ireg, &oreg);
56 "+S" (esi), "+D" (di), "=m" (err) 46
57 : "a" (0x5303)); 47 boot_params.apm_bios_info.cseg = oreg.ax;
58 48 boot_params.apm_bios_info.offset = oreg.ebx;
59 boot_params.apm_bios_info.cseg = ax; 49 boot_params.apm_bios_info.cseg_16 = oreg.cx;
60 boot_params.apm_bios_info.offset = ebx; 50 boot_params.apm_bios_info.dseg = oreg.dx;
61 boot_params.apm_bios_info.cseg_16 = cx; 51 boot_params.apm_bios_info.cseg_len = oreg.si;
62 boot_params.apm_bios_info.dseg = dx; 52 boot_params.apm_bios_info.cseg_16_len = oreg.hsi;
63 boot_params.apm_bios_info.cseg_len = (u16)esi; 53 boot_params.apm_bios_info.dseg_len = oreg.di;
64 boot_params.apm_bios_info.cseg_16_len = esi >> 16; 54
65 boot_params.apm_bios_info.dseg_len = di; 55 if (oreg.flags & X86_EFLAGS_CF)
66
67 if (err)
68 return -1; 56 return -1;
69 57
70 /* Redo the installation check as the 32-bit connect; 58 /* Redo the installation check as the 32-bit connect;
71 some BIOSes return different flags this way... */ 59 some BIOSes return different flags this way... */
72 60
73 ax = 0x5300; 61 ireg.al = 0x00;
74 bx = cx = 0; 62 intcall(0x15, &ireg, &oreg);
75 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0"
76 : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
77 : : "esi", "edi");
78 63
79 if (err || bx != 0x504d) { 64 if ((oreg.eflags & X86_EFLAGS_CF) || oreg.bx != 0x504d) {
80 /* Failure with 32-bit connect, try to disconect and ignore */ 65 /* Failure with 32-bit connect, try to disconect and ignore */
81 ax = 0x5304; 66 ireg.al = 0x04;
82 bx = 0; 67 intcall(0x15, &ireg, NULL);
83 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
84 : "+a" (ax), "+b" (bx)
85 : : "ecx", "edx", "esi", "edi");
86 return -1; 68 return -1;
87 } 69 }
88 70
89 boot_params.apm_bios_info.version = ax; 71 boot_params.apm_bios_info.version = oreg.ax;
90 boot_params.apm_bios_info.flags = cx; 72 boot_params.apm_bios_info.flags = oreg.cx;
91 return 0; 73 return 0;
92} 74}
93 75
diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S
new file mode 100644
index 000000000000..507793739ea5
--- /dev/null
+++ b/arch/x86/boot/bioscall.S
@@ -0,0 +1,82 @@
1/* -----------------------------------------------------------------------
2 *
3 * Copyright 2009 Intel Corporation; author H. Peter Anvin
4 *
5 * This file is part of the Linux kernel, and is made available under
6 * the terms of the GNU General Public License version 2 or (at your
7 * option) any later version; incorporated herein by reference.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * "Glove box" for BIOS calls. Avoids the constant problems with BIOSes
13 * touching registers they shouldn't be.
14 */
15
16 .code16
17 .text
18 .globl intcall
19 .type intcall, @function
20intcall:
21 /* Self-modify the INT instruction. Ugly, but works. */
22 cmpb %al, 3f
23 je 1f
24 movb %al, 3f
25 jmp 1f /* Synchronize pipeline */
261:
27 /* Save state */
28 pushfl
29 pushw %fs
30 pushw %gs
31 pushal
32
33 /* Copy input state to stack frame */
34 subw $44, %sp
35 movw %dx, %si
36 movw %sp, %di
37 movw $11, %cx
38 rep; movsd
39
40 /* Pop full state from the stack */
41 popal
42 popw %gs
43 popw %fs
44 popw %es
45 popw %ds
46 popfl
47
48 /* Actual INT */
49 .byte 0xcd /* INT opcode */
503: .byte 0
51
52 /* Push full state to the stack */
53 pushfl
54 pushw %ds
55 pushw %es
56 pushw %fs
57 pushw %gs
58 pushal
59
60 /* Re-establish C environment invariants */
61 cld
62 movzwl %sp, %esp
63 movw %cs, %ax
64 movw %ax, %ds
65 movw %ax, %es
66
67 /* Copy output state from stack frame */
68 movw 68(%esp), %di /* Original %cx == 3rd argument */
69 andw %di, %di
70 jz 4f
71 movw %sp, %si
72 movw $11, %cx
73 rep; movsd
744: addw $44, %sp
75
76 /* Restore state and return */
77 popal
78 popw %gs
79 popw %fs
80 popfl
81 retl
82 .size intcall, .-intcall
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 7b2692e897e5..98239d2658f2 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -26,6 +27,7 @@
26#include <asm/setup.h> 27#include <asm/setup.h>
27#include "bitops.h" 28#include "bitops.h"
28#include <asm/cpufeature.h> 29#include <asm/cpufeature.h>
30#include <asm/processor-flags.h>
29 31
30/* Useful macros */ 32/* Useful macros */
31#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) 33#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
@@ -241,6 +243,49 @@ int enable_a20(void);
241/* apm.c */ 243/* apm.c */
242int query_apm_bios(void); 244int query_apm_bios(void);
243 245
246/* bioscall.c */
247struct biosregs {
248 union {
249 struct {
250 u32 edi;
251 u32 esi;
252 u32 ebp;
253 u32 _esp;
254 u32 ebx;
255 u32 edx;
256 u32 ecx;
257 u32 eax;
258 u32 _fsgs;
259 u32 _dses;
260 u32 eflags;
261 };
262 struct {
263 u16 di, hdi;
264 u16 si, hsi;
265 u16 bp, hbp;
266 u16 _sp, _hsp;
267 u16 bx, hbx;
268 u16 dx, hdx;
269 u16 cx, hcx;
270 u16 ax, hax;
271 u16 gs, fs;
272 u16 es, ds;
273 u16 flags, hflags;
274 };
275 struct {
276 u8 dil, dih, edi2, edi3;
277 u8 sil, sih, esi2, esi3;
278 u8 bpl, bph, ebp2, ebp3;
279 u8 _spl, _sph, _esp2, _esp3;
280 u8 bl, bh, ebx2, ebx3;
281 u8 dl, dh, edx2, edx3;
282 u8 cl, ch, ecx2, ecx3;
283 u8 al, ah, eax2, eax3;
284 };
285 };
286};
287void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg);
288
244/* cmdline.c */ 289/* cmdline.c */
245int cmdline_find_option(const char *option, char *buffer, int bufsize); 290int cmdline_find_option(const char *option, char *buffer, int bufsize);
246int cmdline_find_option_bool(const char *option); 291int cmdline_find_option_bool(const char *option);
@@ -279,6 +324,9 @@ int sprintf(char *buf, const char *fmt, ...);
279int vsprintf(char *buf, const char *fmt, va_list args); 324int vsprintf(char *buf, const char *fmt, va_list args);
280int printf(const char *fmt, ...); 325int printf(const char *fmt, ...);
281 326
327/* regs.c */
328void initregs(struct biosregs *regs);
329
282/* string.c */ 330/* string.c */
283int strcmp(const char *str1, const char *str2); 331int strcmp(const char *str1, const char *str2);
284size_t strnlen(const char *s, size_t maxlen); 332size_t strnlen(const char *s, size_t maxlen);
diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore
index 63eff3b04d01..4a46fab7162e 100644
--- a/arch/x86/boot/compressed/.gitignore
+++ b/arch/x86/boot/compressed/.gitignore
@@ -1,3 +1,6 @@
1relocs 1relocs
2vmlinux.bin.all 2vmlinux.bin.all
3vmlinux.relocs 3vmlinux.relocs
4vmlinux.lds
5mkpiggy
6piggy.S
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 65551c9f8571..49c8a4c37d7c 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -19,7 +19,9 @@ KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
19LDFLAGS := -m elf_$(UTS_MACHINE) 19LDFLAGS := -m elf_$(UTS_MACHINE)
20LDFLAGS_vmlinux := -T 20LDFLAGS_vmlinux := -T
21 21
22$(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE 22hostprogs-y := mkpiggy
23
24$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
23 $(call if_changed,ld) 25 $(call if_changed,ld)
24 @: 26 @:
25 27
@@ -29,7 +31,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE
29 31
30 32
31targets += vmlinux.bin.all vmlinux.relocs relocs 33targets += vmlinux.bin.all vmlinux.relocs relocs
32hostprogs-$(CONFIG_X86_32) += relocs 34hostprogs-$(CONFIG_X86_NEED_RELOCS) += relocs
33 35
34quiet_cmd_relocs = RELOCS $@ 36quiet_cmd_relocs = RELOCS $@
35 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< 37 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
@@ -37,46 +39,22 @@ $(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
37 $(call if_changed,relocs) 39 $(call if_changed,relocs)
38 40
39vmlinux.bin.all-y := $(obj)/vmlinux.bin 41vmlinux.bin.all-y := $(obj)/vmlinux.bin
40vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs 42vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs
41quiet_cmd_relocbin = BUILD $@
42 cmd_relocbin = cat $(filter-out FORCE,$^) > $@
43$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
44 $(call if_changed,relocbin)
45
46ifeq ($(CONFIG_X86_32),y)
47 43
48ifdef CONFIG_RELOCATABLE 44$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
49$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
50 $(call if_changed,gzip)
51$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin.all FORCE
52 $(call if_changed,bzip2)
53$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin.all FORCE
54 $(call if_changed,lzma)
55else
56$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
57 $(call if_changed,gzip) 45 $(call if_changed,gzip)
58$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE 46$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
59 $(call if_changed,bzip2) 47 $(call if_changed,bzip2)
60$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE 48$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
61 $(call if_changed,lzma) 49 $(call if_changed,lzma)
62endif
63LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
64 50
65else 51suffix-$(CONFIG_KERNEL_GZIP) := gz
52suffix-$(CONFIG_KERNEL_BZIP2) := bz2
53suffix-$(CONFIG_KERNEL_LZMA) := lzma
66 54
67$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE 55quiet_cmd_mkpiggy = MKPIGGY $@
68 $(call if_changed,gzip) 56 cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
69$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
70 $(call if_changed,bzip2)
71$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
72 $(call if_changed,lzma)
73
74LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
75endif
76 57
77suffix_$(CONFIG_KERNEL_GZIP) = gz 58targets += piggy.S
78suffix_$(CONFIG_KERNEL_BZIP2) = bz2 59$(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE
79suffix_$(CONFIG_KERNEL_LZMA) = lzma 60 $(call if_changed,mkpiggy)
80
81$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE
82 $(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 3a8a866fb2e2..75e4f001e706 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -12,16 +12,16 @@
12 * the page directory. [According to comments etc elsewhere on a compressed 12 * the page directory. [According to comments etc elsewhere on a compressed
13 * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC] 13 * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC]
14 * 14 *
15 * Page 0 is deliberately kept safe, since System Management Mode code in 15 * Page 0 is deliberately kept safe, since System Management Mode code in
16 * laptops may need to access the BIOS data stored there. This is also 16 * laptops may need to access the BIOS data stored there. This is also
17 * useful for future device drivers that access the BIOS via VM86 17 * useful for future device drivers that access the BIOS via VM86
18 * mode. 18 * mode.
19 */ 19 */
20 20
21/* 21/*
22 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 22 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
23 */ 23 */
24.text 24 .text
25 25
26#include <linux/linkage.h> 26#include <linux/linkage.h>
27#include <asm/segment.h> 27#include <asm/segment.h>
@@ -29,161 +29,151 @@
29#include <asm/boot.h> 29#include <asm/boot.h>
30#include <asm/asm-offsets.h> 30#include <asm/asm-offsets.h>
31 31
32.section ".text.head","ax",@progbits 32 .section ".text.head","ax",@progbits
33ENTRY(startup_32) 33ENTRY(startup_32)
34 cld 34 cld
35 /* test KEEP_SEGMENTS flag to see if the bootloader is asking 35 /*
36 * us to not reload segments */ 36 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
37 testb $(1<<6), BP_loadflags(%esi) 37 * us to not reload segments
38 jnz 1f 38 */
39 testb $(1<<6), BP_loadflags(%esi)
40 jnz 1f
39 41
40 cli 42 cli
41 movl $(__BOOT_DS),%eax 43 movl $__BOOT_DS, %eax
42 movl %eax,%ds 44 movl %eax, %ds
43 movl %eax,%es 45 movl %eax, %es
44 movl %eax,%fs 46 movl %eax, %fs
45 movl %eax,%gs 47 movl %eax, %gs
46 movl %eax,%ss 48 movl %eax, %ss
471: 491:
48 50
49/* Calculate the delta between where we were compiled to run 51/*
52 * Calculate the delta between where we were compiled to run
50 * at and where we were actually loaded at. This can only be done 53 * at and where we were actually loaded at. This can only be done
51 * with a short local call on x86. Nothing else will tell us what 54 * with a short local call on x86. Nothing else will tell us what
52 * address we are running at. The reserved chunk of the real-mode 55 * address we are running at. The reserved chunk of the real-mode
53 * data at 0x1e4 (defined as a scratch field) are used as the stack 56 * data at 0x1e4 (defined as a scratch field) are used as the stack
54 * for this calculation. Only 4 bytes are needed. 57 * for this calculation. Only 4 bytes are needed.
55 */ 58 */
56 leal (0x1e4+4)(%esi), %esp 59 leal (BP_scratch+4)(%esi), %esp
57 call 1f 60 call 1f
581: popl %ebp 611: popl %ebp
59 subl $1b, %ebp 62 subl $1b, %ebp
60 63
61/* %ebp contains the address we are loaded at by the boot loader and %ebx 64/*
65 * %ebp contains the address we are loaded at by the boot loader and %ebx
62 * contains the address where we should move the kernel image temporarily 66 * contains the address where we should move the kernel image temporarily
63 * for safe in-place decompression. 67 * for safe in-place decompression.
64 */ 68 */
65 69
66#ifdef CONFIG_RELOCATABLE 70#ifdef CONFIG_RELOCATABLE
67 movl %ebp, %ebx 71 movl %ebp, %ebx
68 addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebx 72 movl BP_kernel_alignment(%esi), %eax
69 andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx 73 decl %eax
74 addl %eax, %ebx
75 notl %eax
76 andl %eax, %ebx
70#else 77#else
71 movl $LOAD_PHYSICAL_ADDR, %ebx 78 movl $LOAD_PHYSICAL_ADDR, %ebx
72#endif 79#endif
73 80
74 /* Replace the compressed data size with the uncompressed size */ 81 /* Target address to relocate to for decompression */
75 subl input_len(%ebp), %ebx 82 addl $z_extract_offset, %ebx
76 movl output_len(%ebp), %eax 83
77 addl %eax, %ebx 84 /* Set up the stack */
78 /* Add 8 bytes for every 32K input block */ 85 leal boot_stack_end(%ebx), %esp
79 shrl $12, %eax 86
80 addl %eax, %ebx 87 /* Zero EFLAGS */
81 /* Add 32K + 18 bytes of extra slack */ 88 pushl $0
82 addl $(32768 + 18), %ebx 89 popfl
83 /* Align on a 4K boundary */ 90
84 addl $4095, %ebx 91/*
85 andl $~4095, %ebx 92 * Copy the compressed kernel to the end of our buffer
86
87/* Copy the compressed kernel to the end of our buffer
88 * where decompression in place becomes safe. 93 * where decompression in place becomes safe.
89 */ 94 */
90 pushl %esi 95 pushl %esi
91 leal _end(%ebp), %esi 96 leal (_bss-4)(%ebp), %esi
92 leal _end(%ebx), %edi 97 leal (_bss-4)(%ebx), %edi
93 movl $(_end - startup_32), %ecx 98 movl $(_bss - startup_32), %ecx
99 shrl $2, %ecx
94 std 100 std
95 rep 101 rep movsl
96 movsb
97 cld 102 cld
98 popl %esi 103 popl %esi
99
100/* Compute the kernel start address.
101 */
102#ifdef CONFIG_RELOCATABLE
103 addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebp
104 andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebp
105#else
106 movl $LOAD_PHYSICAL_ADDR, %ebp
107#endif
108 104
109/* 105/*
110 * Jump to the relocated address. 106 * Jump to the relocated address.
111 */ 107 */
112 leal relocated(%ebx), %eax 108 leal relocated(%ebx), %eax
113 jmp *%eax 109 jmp *%eax
114ENDPROC(startup_32) 110ENDPROC(startup_32)
115 111
116.section ".text" 112 .text
117relocated: 113relocated:
118 114
119/* 115/*
120 * Clear BSS 116 * Clear BSS (stack is currently empty)
121 */
122 xorl %eax,%eax
123 leal _edata(%ebx),%edi
124 leal _end(%ebx), %ecx
125 subl %edi,%ecx
126 cld
127 rep
128 stosb
129
130/*
131 * Setup the stack for the decompressor
132 */ 117 */
133 leal boot_stack_end(%ebx), %esp 118 xorl %eax, %eax
119 leal _bss(%ebx), %edi
120 leal _ebss(%ebx), %ecx
121 subl %edi, %ecx
122 shrl $2, %ecx
123 rep stosl
134 124
135/* 125/*
136 * Do the decompression, and jump to the new kernel.. 126 * Do the decompression, and jump to the new kernel..
137 */ 127 */
138 movl output_len(%ebx), %eax 128 leal z_extract_offset_negative(%ebx), %ebp
139 pushl %eax 129 /* push arguments for decompress_kernel: */
140 # push arguments for decompress_kernel: 130 pushl %ebp /* output address */
141 pushl %ebp # output address 131 pushl $z_input_len /* input_len */
142 movl input_len(%ebx), %eax 132 leal input_data(%ebx), %eax
143 pushl %eax # input_len 133 pushl %eax /* input_data */
144 leal input_data(%ebx), %eax 134 leal boot_heap(%ebx), %eax
145 pushl %eax # input_data 135 pushl %eax /* heap area */
146 leal boot_heap(%ebx), %eax 136 pushl %esi /* real mode pointer */
147 pushl %eax # heap area 137 call decompress_kernel
148 pushl %esi # real mode pointer 138 addl $20, %esp
149 call decompress_kernel
150 addl $20, %esp
151 popl %ecx
152 139
153#if CONFIG_RELOCATABLE 140#if CONFIG_RELOCATABLE
154/* Find the address of the relocations. 141/*
142 * Find the address of the relocations.
155 */ 143 */
156 movl %ebp, %edi 144 leal z_output_len(%ebp), %edi
157 addl %ecx, %edi
158 145
159/* Calculate the delta between where vmlinux was compiled to run 146/*
147 * Calculate the delta between where vmlinux was compiled to run
160 * and where it was actually loaded. 148 * and where it was actually loaded.
161 */ 149 */
162 movl %ebp, %ebx 150 movl %ebp, %ebx
163 subl $LOAD_PHYSICAL_ADDR, %ebx 151 subl $LOAD_PHYSICAL_ADDR, %ebx
164 jz 2f /* Nothing to be done if loaded at compiled addr. */ 152 jz 2f /* Nothing to be done if loaded at compiled addr. */
165/* 153/*
166 * Process relocations. 154 * Process relocations.
167 */ 155 */
168 156
1691: subl $4, %edi 1571: subl $4, %edi
170 movl 0(%edi), %ecx 158 movl (%edi), %ecx
171 testl %ecx, %ecx 159 testl %ecx, %ecx
172 jz 2f 160 jz 2f
173 addl %ebx, -__PAGE_OFFSET(%ebx, %ecx) 161 addl %ebx, -__PAGE_OFFSET(%ebx, %ecx)
174 jmp 1b 162 jmp 1b
1752: 1632:
176#endif 164#endif
177 165
178/* 166/*
179 * Jump to the decompressed kernel. 167 * Jump to the decompressed kernel.
180 */ 168 */
181 xorl %ebx,%ebx 169 xorl %ebx, %ebx
182 jmp *%ebp 170 jmp *%ebp
183 171
184.bss 172/*
185/* Stack and heap for uncompression */ 173 * Stack and heap for uncompression
186.balign 4 174 */
175 .bss
176 .balign 4
187boot_heap: 177boot_heap:
188 .fill BOOT_HEAP_SIZE, 1, 0 178 .fill BOOT_HEAP_SIZE, 1, 0
189boot_stack: 179boot_stack:
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index ed4a82948002..f62c284db9eb 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -21,8 +21,8 @@
21/* 21/*
22 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 22 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
23 */ 23 */
24.code32 24 .code32
25.text 25 .text
26 26
27#include <linux/linkage.h> 27#include <linux/linkage.h>
28#include <asm/segment.h> 28#include <asm/segment.h>
@@ -33,12 +33,14 @@
33#include <asm/processor-flags.h> 33#include <asm/processor-flags.h>
34#include <asm/asm-offsets.h> 34#include <asm/asm-offsets.h>
35 35
36.section ".text.head" 36 .section ".text.head"
37 .code32 37 .code32
38ENTRY(startup_32) 38ENTRY(startup_32)
39 cld 39 cld
40 /* test KEEP_SEGMENTS flag to see if the bootloader is asking 40 /*
41 * us to not reload segments */ 41 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
42 * us to not reload segments
43 */
42 testb $(1<<6), BP_loadflags(%esi) 44 testb $(1<<6), BP_loadflags(%esi)
43 jnz 1f 45 jnz 1f
44 46
@@ -49,14 +51,15 @@ ENTRY(startup_32)
49 movl %eax, %ss 51 movl %eax, %ss
501: 521:
51 53
52/* Calculate the delta between where we were compiled to run 54/*
55 * Calculate the delta between where we were compiled to run
53 * at and where we were actually loaded at. This can only be done 56 * at and where we were actually loaded at. This can only be done
54 * with a short local call on x86. Nothing else will tell us what 57 * with a short local call on x86. Nothing else will tell us what
55 * address we are running at. The reserved chunk of the real-mode 58 * address we are running at. The reserved chunk of the real-mode
56 * data at 0x1e4 (defined as a scratch field) are used as the stack 59 * data at 0x1e4 (defined as a scratch field) are used as the stack
57 * for this calculation. Only 4 bytes are needed. 60 * for this calculation. Only 4 bytes are needed.
58 */ 61 */
59 leal (0x1e4+4)(%esi), %esp 62 leal (BP_scratch+4)(%esi), %esp
60 call 1f 63 call 1f
611: popl %ebp 641: popl %ebp
62 subl $1b, %ebp 65 subl $1b, %ebp
@@ -70,32 +73,28 @@ ENTRY(startup_32)
70 testl %eax, %eax 73 testl %eax, %eax
71 jnz no_longmode 74 jnz no_longmode
72 75
73/* Compute the delta between where we were compiled to run at 76/*
77 * Compute the delta between where we were compiled to run at
74 * and where the code will actually run at. 78 * and where the code will actually run at.
75 */ 79 *
76/* %ebp contains the address we are loaded at by the boot loader and %ebx 80 * %ebp contains the address we are loaded at by the boot loader and %ebx
77 * contains the address where we should move the kernel image temporarily 81 * contains the address where we should move the kernel image temporarily
78 * for safe in-place decompression. 82 * for safe in-place decompression.
79 */ 83 */
80 84
81#ifdef CONFIG_RELOCATABLE 85#ifdef CONFIG_RELOCATABLE
82 movl %ebp, %ebx 86 movl %ebp, %ebx
83 addl $(PMD_PAGE_SIZE -1), %ebx 87 movl BP_kernel_alignment(%esi), %eax
84 andl $PMD_PAGE_MASK, %ebx 88 decl %eax
89 addl %eax, %ebx
90 notl %eax
91 andl %eax, %ebx
85#else 92#else
86 movl $CONFIG_PHYSICAL_START, %ebx 93 movl $LOAD_PHYSICAL_ADDR, %ebx
87#endif 94#endif
88 95
89 /* Replace the compressed data size with the uncompressed size */ 96 /* Target address to relocate to for decompression */
90 subl input_len(%ebp), %ebx 97 addl $z_extract_offset, %ebx
91 movl output_len(%ebp), %eax
92 addl %eax, %ebx
93 /* Add 8 bytes for every 32K input block */
94 shrl $12, %eax
95 addl %eax, %ebx
96 /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
97 addl $(32768 + 18 + 4095), %ebx
98 andl $~4095, %ebx
99 98
100/* 99/*
101 * Prepare for entering 64 bit mode 100 * Prepare for entering 64 bit mode
@@ -114,7 +113,7 @@ ENTRY(startup_32)
114 /* 113 /*
115 * Build early 4G boot pagetable 114 * Build early 4G boot pagetable
116 */ 115 */
117 /* Initialize Page tables to 0*/ 116 /* Initialize Page tables to 0 */
118 leal pgtable(%ebx), %edi 117 leal pgtable(%ebx), %edi
119 xorl %eax, %eax 118 xorl %eax, %eax
120 movl $((4096*6)/4), %ecx 119 movl $((4096*6)/4), %ecx
@@ -155,7 +154,8 @@ ENTRY(startup_32)
155 btsl $_EFER_LME, %eax 154 btsl $_EFER_LME, %eax
156 wrmsr 155 wrmsr
157 156
158 /* Setup for the jump to 64bit mode 157 /*
158 * Setup for the jump to 64bit mode
159 * 159 *
160 * When the jump is performed we will be in long mode but 160 * When the jump is performed we will be in long mode but
161 * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1 161 * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1
@@ -184,7 +184,8 @@ no_longmode:
184 184
185#include "../../kernel/verify_cpu_64.S" 185#include "../../kernel/verify_cpu_64.S"
186 186
187 /* Be careful here startup_64 needs to be at a predictable 187 /*
188 * Be careful here startup_64 needs to be at a predictable
188 * address so I can export it in an ELF header. Bootloaders 189 * address so I can export it in an ELF header. Bootloaders
189 * should look at the ELF header to find this address, as 190 * should look at the ELF header to find this address, as
190 * it may change in the future. 191 * it may change in the future.
@@ -192,7 +193,8 @@ no_longmode:
192 .code64 193 .code64
193 .org 0x200 194 .org 0x200
194ENTRY(startup_64) 195ENTRY(startup_64)
195 /* We come here either from startup_32 or directly from a 196 /*
197 * We come here either from startup_32 or directly from a
196 * 64bit bootloader. If we come here from a bootloader we depend on 198 * 64bit bootloader. If we come here from a bootloader we depend on
197 * an identity mapped page table being provided that maps our 199 * an identity mapped page table being provided that maps our
198 * entire text+data+bss and hopefully all of memory. 200 * entire text+data+bss and hopefully all of memory.
@@ -209,50 +211,54 @@ ENTRY(startup_64)
209 movl $0x20, %eax 211 movl $0x20, %eax
210 ltr %ax 212 ltr %ax
211 213
212 /* Compute the decompressed kernel start address. It is where 214 /*
215 * Compute the decompressed kernel start address. It is where
213 * we were loaded at aligned to a 2M boundary. %rbp contains the 216 * we were loaded at aligned to a 2M boundary. %rbp contains the
214 * decompressed kernel start address. 217 * decompressed kernel start address.
215 * 218 *
216 * If it is a relocatable kernel then decompress and run the kernel 219 * If it is a relocatable kernel then decompress and run the kernel
217 * from load address aligned to 2MB addr, otherwise decompress and 220 * from load address aligned to 2MB addr, otherwise decompress and
218 * run the kernel from CONFIG_PHYSICAL_START 221 * run the kernel from LOAD_PHYSICAL_ADDR
222 *
223 * We cannot rely on the calculation done in 32-bit mode, since we
224 * may have been invoked via the 64-bit entry point.
219 */ 225 */
220 226
221 /* Start with the delta to where the kernel will run at. */ 227 /* Start with the delta to where the kernel will run at. */
222#ifdef CONFIG_RELOCATABLE 228#ifdef CONFIG_RELOCATABLE
223 leaq startup_32(%rip) /* - $startup_32 */, %rbp 229 leaq startup_32(%rip) /* - $startup_32 */, %rbp
224 addq $(PMD_PAGE_SIZE - 1), %rbp 230 movl BP_kernel_alignment(%rsi), %eax
225 andq $PMD_PAGE_MASK, %rbp 231 decl %eax
226 movq %rbp, %rbx 232 addq %rax, %rbp
233 notq %rax
234 andq %rax, %rbp
227#else 235#else
228 movq $CONFIG_PHYSICAL_START, %rbp 236 movq $LOAD_PHYSICAL_ADDR, %rbp
229 movq %rbp, %rbx
230#endif 237#endif
231 238
232 /* Replace the compressed data size with the uncompressed size */ 239 /* Target address to relocate to for decompression */
233 movl input_len(%rip), %eax 240 leaq z_extract_offset(%rbp), %rbx
234 subq %rax, %rbx 241
235 movl output_len(%rip), %eax 242 /* Set up the stack */
236 addq %rax, %rbx 243 leaq boot_stack_end(%rbx), %rsp
237 /* Add 8 bytes for every 32K input block */ 244
238 shrq $12, %rax 245 /* Zero EFLAGS */
239 addq %rax, %rbx 246 pushq $0
240 /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ 247 popfq
241 addq $(32768 + 18 + 4095), %rbx 248
242 andq $~4095, %rbx 249/*
243 250 * Copy the compressed kernel to the end of our buffer
244/* Copy the compressed kernel to the end of our buffer
245 * where decompression in place becomes safe. 251 * where decompression in place becomes safe.
246 */ 252 */
247 leaq _end_before_pgt(%rip), %r8 253 pushq %rsi
248 leaq _end_before_pgt(%rbx), %r9 254 leaq (_bss-8)(%rip), %rsi
249 movq $_end_before_pgt /* - $startup_32 */, %rcx 255 leaq (_bss-8)(%rbx), %rdi
2501: subq $8, %r8 256 movq $_bss /* - $startup_32 */, %rcx
251 subq $8, %r9 257 shrq $3, %rcx
252 movq 0(%r8), %rax 258 std
253 movq %rax, 0(%r9) 259 rep movsq
254 subq $8, %rcx 260 cld
255 jnz 1b 261 popq %rsi
256 262
257/* 263/*
258 * Jump to the relocated address. 264 * Jump to the relocated address.
@@ -260,37 +266,28 @@ ENTRY(startup_64)
260 leaq relocated(%rbx), %rax 266 leaq relocated(%rbx), %rax
261 jmp *%rax 267 jmp *%rax
262 268
263.section ".text" 269 .text
264relocated: 270relocated:
265 271
266/* 272/*
267 * Clear BSS 273 * Clear BSS (stack is currently empty)
268 */ 274 */
269 xorq %rax, %rax 275 xorl %eax, %eax
270 leaq _edata(%rbx), %rdi 276 leaq _bss(%rip), %rdi
271 leaq _end_before_pgt(%rbx), %rcx 277 leaq _ebss(%rip), %rcx
272 subq %rdi, %rcx 278 subq %rdi, %rcx
273 cld 279 shrq $3, %rcx
274 rep 280 rep stosq
275 stosb
276
277 /* Setup the stack */
278 leaq boot_stack_end(%rip), %rsp
279
280 /* zero EFLAGS after setting rsp */
281 pushq $0
282 popfq
283 281
284/* 282/*
285 * Do the decompression, and jump to the new kernel.. 283 * Do the decompression, and jump to the new kernel..
286 */ 284 */
287 pushq %rsi # Save the real mode argument 285 pushq %rsi /* Save the real mode argument */
288 movq %rsi, %rdi # real mode address 286 movq %rsi, %rdi /* real mode address */
289 leaq boot_heap(%rip), %rsi # malloc area for uncompression 287 leaq boot_heap(%rip), %rsi /* malloc area for uncompression */
290 leaq input_data(%rip), %rdx # input_data 288 leaq input_data(%rip), %rdx /* input_data */
291 movl input_len(%rip), %eax 289 movl $z_input_len, %ecx /* input_len */
292 movq %rax, %rcx # input_len 290 movq %rbp, %r8 /* output target address */
293 movq %rbp, %r8 # output
294 call decompress_kernel 291 call decompress_kernel
295 popq %rsi 292 popq %rsi
296 293
@@ -311,11 +308,21 @@ gdt:
311 .quad 0x0000000000000000 /* TS continued */ 308 .quad 0x0000000000000000 /* TS continued */
312gdt_end: 309gdt_end:
313 310
314.bss 311/*
315/* Stack and heap for uncompression */ 312 * Stack and heap for uncompression
316.balign 4 313 */
314 .bss
315 .balign 4
317boot_heap: 316boot_heap:
318 .fill BOOT_HEAP_SIZE, 1, 0 317 .fill BOOT_HEAP_SIZE, 1, 0
319boot_stack: 318boot_stack:
320 .fill BOOT_STACK_SIZE, 1, 0 319 .fill BOOT_STACK_SIZE, 1, 0
321boot_stack_end: 320boot_stack_end:
321
322/*
323 * Space for page tables (not in .bss so not zeroed)
324 */
325 .section ".pgtable","a",@nobits
326 .balign 4096
327pgtable:
328 .fill 6*4096, 1, 0
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index e45be73684ff..842b2a36174a 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -325,21 +325,19 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
325 free_mem_ptr = heap; /* Heap */ 325 free_mem_ptr = heap; /* Heap */
326 free_mem_end_ptr = heap + BOOT_HEAP_SIZE; 326 free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
327 327
328 if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))
329 error("Destination address inappropriately aligned");
328#ifdef CONFIG_X86_64 330#ifdef CONFIG_X86_64
329 if ((unsigned long)output & (__KERNEL_ALIGN - 1)) 331 if (heap > 0x3fffffffffffUL)
330 error("Destination address not 2M aligned");
331 if ((unsigned long)output >= 0xffffffffffUL)
332 error("Destination address too large"); 332 error("Destination address too large");
333#else 333#else
334 if ((u32)output & (CONFIG_PHYSICAL_ALIGN - 1))
335 error("Destination address not CONFIG_PHYSICAL_ALIGN aligned");
336 if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff)) 334 if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff))
337 error("Destination address too large"); 335 error("Destination address too large");
336#endif
338#ifndef CONFIG_RELOCATABLE 337#ifndef CONFIG_RELOCATABLE
339 if ((u32)output != LOAD_PHYSICAL_ADDR) 338 if ((unsigned long)output != LOAD_PHYSICAL_ADDR)
340 error("Wrong destination address"); 339 error("Wrong destination address");
341#endif 340#endif
342#endif
343 341
344 if (!quiet) 342 if (!quiet)
345 putstr("\nDecompressing Linux... "); 343 putstr("\nDecompressing Linux... ");
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
new file mode 100644
index 000000000000..bcbd36c41432
--- /dev/null
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -0,0 +1,97 @@
1/* ----------------------------------------------------------------------- *
2 *
3 * Copyright (C) 2009 Intel Corporation. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License version
7 * 2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301, USA.
18 *
19 * H. Peter Anvin <hpa@linux.intel.com>
20 *
21 * ----------------------------------------------------------------------- */
22
23/*
24 * Compute the desired load offset from a compressed program; outputs
25 * a small assembly wrapper with the appropriate symbols defined.
26 */
27
28#include <stdlib.h>
29#include <stdio.h>
30#include <string.h>
31#include <inttypes.h>
32
/*
 * Decode the four bytes at p as an unsigned 32-bit little-endian value
 * (p[0] is the least significant byte), independent of host byte order.
 */
static uint32_t getle32(const void *p)
{
	const uint8_t *bytes = p;
	uint32_t value = 0;
	int i;

	/* Fold from the most significant byte (index 3) down to index 0. */
	for (i = 3; i >= 0; i--)
		value = (value << 8) | bytes[i];

	return value;
}
40
/*
 * Usage: mkpiggy compressed_file
 *
 * Reads the final four bytes of compressed_file as the little-endian
 * uncompressed length, takes the file's total size as the compressed
 * length, and writes to stdout an assembly fragment that defines
 * z_input_len, z_output_len, z_extract_offset and
 * z_extract_offset_negative and pulls the file in via .incbin between
 * input_data and input_data_end.
 *
 * Returns 0 on success.  Returns 1 (after printing a diagnostic to
 * stderr) on a usage error, if the file cannot be opened or read, or if
 * writing to stdout fails, so that callers checking the exit status
 * never consume a wrapper built from bogus sizes.
 */
int main(int argc, char *argv[])
{
	uint32_t olen;
	long ilen;
	unsigned long offs;
	unsigned char tail[4];
	FILE *f;

	if (argc < 2) {
		fprintf(stderr, "Usage: %s compressed_file\n", argv[0]);
		return 1;
	}

	/* Get the information for the compressed kernel image first */

	/* The payload is binary data, so open it in binary mode. */
	f = fopen(argv[1], "rb");
	if (!f) {
		perror(argv[1]);
		return 1;
	}

	/* Position on the trailing 4-byte length field; fails if too short. */
	if (fseek(f, -4L, SEEK_END)) {
		perror(argv[1]);
		fclose(f);
		return 1;
	}

	if (fread(tail, sizeof tail, 1, f) != 1) {
		if (ferror(f))
			perror(argv[1]);
		else
			fprintf(stderr, "%s: short read\n", argv[1]);
		fclose(f);
		return 1;
	}

	/* Having consumed the last byte, the position is the file size. */
	ilen = ftell(f);
	if (ilen < 0) {
		perror(argv[1]);
		fclose(f);
		return 1;
	}
	fclose(f);

	olen = getle32(tail);

	/*
	 * Now we have the input (compressed) and output (uncompressed)
	 * sizes, compute the necessary decompression offset...
	 */

	/* ilen is known to be non-negative here, so the cast is lossless. */
	offs = (olen > (unsigned long)ilen) ? olen - (unsigned long)ilen : 0;
	offs += olen >> 12;	/* Add 8 bytes for each 32K block */
	offs += 32*1024 + 18;	/* Add 32K + 18 bytes slack */
	offs = (offs+4095) & ~4095;	/* Round to a 4K boundary */

	printf(".section \".rodata.compressed\",\"a\",@progbits\n");
	printf(".globl z_input_len\n");
	printf("z_input_len = %lu\n", (unsigned long)ilen);
	printf(".globl z_output_len\n");
	printf("z_output_len = %lu\n", (unsigned long)olen);
	printf(".globl z_extract_offset\n");
	printf("z_extract_offset = 0x%lx\n", offs);
	/* z_extract_offset_negative allows simplification of head_32.S */
	printf(".globl z_extract_offset_negative\n");
	printf("z_extract_offset_negative = -0x%lx\n", offs);

	printf(".globl input_data, input_data_end\n");
	printf("input_data:\n");
	printf(".incbin \"%s\"\n", argv[1]);
	printf("input_data_end:\n");

	/* Surface write failures (e.g. a full disk) instead of exiting 0. */
	if (fflush(stdout) || ferror(stdout)) {
		perror("stdout");
		return 1;
	}

	return 0;
}
diff --git a/arch/x86/boot/compressed/vmlinux_64.lds b/arch/x86/boot/compressed/vmlinux.lds.S
index bef1ac891bce..cc353e1b3ffd 100644
--- a/arch/x86/boot/compressed/vmlinux_64.lds
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -1,6 +1,17 @@
1OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") 1OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
2
3#undef i386
4
5#include <asm/page_types.h>
6
7#ifdef CONFIG_X86_64
2OUTPUT_ARCH(i386:x86-64) 8OUTPUT_ARCH(i386:x86-64)
3ENTRY(startup_64) 9ENTRY(startup_64)
10#else
11OUTPUT_ARCH(i386)
12ENTRY(startup_32)
13#endif
14
4SECTIONS 15SECTIONS
5{ 16{
6 /* Be careful parts of head_64.S assume startup_32 is at 17 /* Be careful parts of head_64.S assume startup_32 is at
@@ -33,16 +44,22 @@ SECTIONS
33 *(.data.*) 44 *(.data.*)
34 _edata = . ; 45 _edata = . ;
35 } 46 }
47 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
36 .bss : { 48 .bss : {
37 _bss = . ; 49 _bss = . ;
38 *(.bss) 50 *(.bss)
39 *(.bss.*) 51 *(.bss.*)
40 *(COMMON) 52 *(COMMON)
41 . = ALIGN(8); 53 . = ALIGN(8); /* For convenience during zeroing */
42 _end_before_pgt = . ;
43 . = ALIGN(4096);
44 pgtable = . ;
45 . = . + 4096 * 6;
46 _ebss = .; 54 _ebss = .;
47 } 55 }
56#ifdef CONFIG_X86_64
57 . = ALIGN(PAGE_SIZE);
58 .pgtable : {
59 _pgtable = . ;
60 *(.pgtable)
61 _epgtable = . ;
62 }
63#endif
64 _end = .;
48} 65}
diff --git a/arch/x86/boot/compressed/vmlinux.scr b/arch/x86/boot/compressed/vmlinux.scr
deleted file mode 100644
index f02382ae5c48..000000000000
--- a/arch/x86/boot/compressed/vmlinux.scr
+++ /dev/null
@@ -1,10 +0,0 @@
1SECTIONS
2{
3 .rodata.compressed : {
4 input_len = .;
5 LONG(input_data_end - input_data) input_data = .;
6 *(.data)
7 output_len = . - 4;
8 input_data_end = .;
9 }
10}
diff --git a/arch/x86/boot/compressed/vmlinux_32.lds b/arch/x86/boot/compressed/vmlinux_32.lds
deleted file mode 100644
index bb3c48379c40..000000000000
--- a/arch/x86/boot/compressed/vmlinux_32.lds
+++ /dev/null
@@ -1,43 +0,0 @@
1OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
2OUTPUT_ARCH(i386)
3ENTRY(startup_32)
4SECTIONS
5{
6 /* Be careful parts of head_32.S assume startup_32 is at
7 * address 0.
8 */
9 . = 0;
10 .text.head : {
11 _head = . ;
12 *(.text.head)
13 _ehead = . ;
14 }
15 .rodata.compressed : {
16 *(.rodata.compressed)
17 }
18 .text : {
19 _text = .; /* Text */
20 *(.text)
21 *(.text.*)
22 _etext = . ;
23 }
24 .rodata : {
25 _rodata = . ;
26 *(.rodata) /* read-only data */
27 *(.rodata.*)
28 _erodata = . ;
29 }
30 .data : {
31 _data = . ;
32 *(.data)
33 *(.data.*)
34 _edata = . ;
35 }
36 .bss : {
37 _bss = . ;
38 *(.bss)
39 *(.bss.*)
40 *(COMMON)
41 _end = . ;
42 }
43}
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index 1aae8f3e5ca1..c501a5b466f8 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -22,17 +23,17 @@
22 */ 23 */
23static int read_mbr(u8 devno, void *buf) 24static int read_mbr(u8 devno, void *buf)
24{ 25{
25 u16 ax, bx, cx, dx; 26 struct biosregs ireg, oreg;
26 27
27 ax = 0x0201; /* Legacy Read, one sector */ 28 initregs(&ireg);
28 cx = 0x0001; /* Sector 0-0-1 */ 29 ireg.ax = 0x0201; /* Legacy Read, one sector */
29 dx = devno; 30 ireg.cx = 0x0001; /* Sector 0-0-1 */
30 bx = (size_t)buf; 31 ireg.dl = devno;
31 asm volatile("pushfl; stc; int $0x13; setc %%al; popfl" 32 ireg.bx = (size_t)buf;
32 : "+a" (ax), "+c" (cx), "+d" (dx), "+b" (bx)
33 : : "esi", "edi", "memory");
34 33
35 return -(u8)ax; /* 0 or -1 */ 34 intcall(0x13, &ireg, &oreg);
35
36 return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */
36} 37}
37 38
38static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig) 39static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
@@ -72,56 +73,46 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
72 73
73static int get_edd_info(u8 devno, struct edd_info *ei) 74static int get_edd_info(u8 devno, struct edd_info *ei)
74{ 75{
75 u16 ax, bx, cx, dx, di; 76 struct biosregs ireg, oreg;
76 77
77 memset(ei, 0, sizeof *ei); 78 memset(ei, 0, sizeof *ei);
78 79
79 /* Check Extensions Present */ 80 /* Check Extensions Present */
80 81
81 ax = 0x4100; 82 initregs(&ireg);
82 bx = EDDMAGIC1; 83 ireg.ah = 0x41;
83 dx = devno; 84 ireg.bx = EDDMAGIC1;
84 asm("pushfl; stc; int $0x13; setc %%al; popfl" 85 ireg.dl = devno;
85 : "+a" (ax), "+b" (bx), "=c" (cx), "+d" (dx) 86 intcall(0x13, &ireg, &oreg);
86 : : "esi", "edi");
87 87
88 if ((u8)ax) 88 if (oreg.eflags & X86_EFLAGS_CF)
89 return -1; /* No extended information */ 89 return -1; /* No extended information */
90 90
91 if (bx != EDDMAGIC2) 91 if (oreg.bx != EDDMAGIC2)
92 return -1; 92 return -1;
93 93
94 ei->device = devno; 94 ei->device = devno;
95 ei->version = ax >> 8; /* EDD version number */ 95 ei->version = oreg.ah; /* EDD version number */
96 ei->interface_support = cx; /* EDD functionality subsets */ 96 ei->interface_support = oreg.cx; /* EDD functionality subsets */
97 97
98 /* Extended Get Device Parameters */ 98 /* Extended Get Device Parameters */
99 99
100 ei->params.length = sizeof(ei->params); 100 ei->params.length = sizeof(ei->params);
101 ax = 0x4800; 101 ireg.ah = 0x48;
102 dx = devno; 102 ireg.si = (size_t)&ei->params;
103 asm("pushfl; int $0x13; popfl" 103 intcall(0x13, &ireg, &oreg);
104 : "+a" (ax), "+d" (dx), "=m" (ei->params)
105 : "S" (&ei->params)
106 : "ebx", "ecx", "edi");
107 104
108 /* Get legacy CHS parameters */ 105 /* Get legacy CHS parameters */
109 106
110 /* Ralf Brown recommends setting ES:DI to 0:0 */ 107 /* Ralf Brown recommends setting ES:DI to 0:0 */
111 ax = 0x0800; 108 ireg.ah = 0x08;
112 dx = devno; 109 ireg.es = 0;
113 di = 0; 110 intcall(0x13, &ireg, &oreg);
114 asm("pushw %%es; " 111
115 "movw %%di,%%es; " 112 if (!(oreg.eflags & X86_EFLAGS_CF)) {
116 "pushfl; stc; int $0x13; setc %%al; popfl; " 113 ei->legacy_max_cylinder = oreg.ch + ((oreg.cl & 0xc0) << 2);
117 "popw %%es" 114 ei->legacy_max_head = oreg.dh;
118 : "+a" (ax), "=b" (bx), "=c" (cx), "+d" (dx), "+D" (di) 115 ei->legacy_sectors_per_track = oreg.cl & 0x3f;
119 : : "esi");
120
121 if ((u8)ax == 0) {
122 ei->legacy_max_cylinder = (cx >> 8) + ((cx & 0xc0) << 2);
123 ei->legacy_max_head = dx >> 8;
124 ei->legacy_sectors_per_track = cx & 0x3f;
125 } 116 }
126 117
127 return 0; 118 return 0;
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 5d84d1c74e4c..b31cc54b4641 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -22,7 +22,8 @@
22#include <asm/page_types.h> 22#include <asm/page_types.h>
23#include <asm/setup.h> 23#include <asm/setup.h>
24#include "boot.h" 24#include "boot.h"
25#include "offsets.h" 25#include "voffset.h"
26#include "zoffset.h"
26 27
27BOOTSEG = 0x07C0 /* original address of boot-sector */ 28BOOTSEG = 0x07C0 /* original address of boot-sector */
28SYSSEG = 0x1000 /* historical load address >> 4 */ 29SYSSEG = 0x1000 /* historical load address >> 4 */
@@ -115,7 +116,7 @@ _start:
115 # Part 2 of the header, from the old setup.S 116 # Part 2 of the header, from the old setup.S
116 117
117 .ascii "HdrS" # header signature 118 .ascii "HdrS" # header signature
118 .word 0x0209 # header version number (>= 0x0105) 119 .word 0x020a # header version number (>= 0x0105)
119 # or else old loadlin-1.5 will fail) 120 # or else old loadlin-1.5 will fail)
120 .globl realmode_swtch 121 .globl realmode_swtch
121realmode_swtch: .word 0, 0 # default_switch, SETUPSEG 122realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
@@ -168,7 +169,11 @@ heap_end_ptr: .word _end+STACK_SIZE-512
168 # end of setup code can be used by setup 169 # end of setup code can be used by setup
169 # for local heap purposes. 170 # for local heap purposes.
170 171
171pad1: .word 0 172ext_loader_ver:
173 .byte 0 # Extended boot loader version
174ext_loader_type:
175 .byte 0 # Extended boot loader type
176
172cmd_line_ptr: .long 0 # (Header version 0x0202 or later) 177cmd_line_ptr: .long 0 # (Header version 0x0202 or later)
173 # If nonzero, a 32-bit pointer 178 # If nonzero, a 32-bit pointer
174 # to the kernel command line. 179 # to the kernel command line.
@@ -200,7 +205,7 @@ relocatable_kernel: .byte 1
200#else 205#else
201relocatable_kernel: .byte 0 206relocatable_kernel: .byte 0
202#endif 207#endif
203pad2: .byte 0 208min_alignment: .byte MIN_KERNEL_ALIGN_LG2 # minimum alignment
204pad3: .word 0 209pad3: .word 0
205 210
206cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, 211cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line,
@@ -212,16 +217,27 @@ hardware_subarch: .long 0 # subarchitecture, added with 2.07
212 217
213hardware_subarch_data: .quad 0 218hardware_subarch_data: .quad 0
214 219
215payload_offset: .long input_data 220payload_offset: .long ZO_input_data
216payload_length: .long input_data_end-input_data 221payload_length: .long ZO_z_input_len
217 222
218setup_data: .quad 0 # 64-bit physical pointer to 223setup_data: .quad 0 # 64-bit physical pointer to
219 # single linked list of 224 # single linked list of
220 # struct setup_data 225 # struct setup_data
221 226
227pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr
228
229#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset)
230#define VO_INIT_SIZE (VO__end - VO__text)
231#if ZO_INIT_SIZE > VO_INIT_SIZE
232#define INIT_SIZE ZO_INIT_SIZE
233#else
234#define INIT_SIZE VO_INIT_SIZE
235#endif
236init_size: .long INIT_SIZE # kernel initialization size
237
222# End of setup header ##################################################### 238# End of setup header #####################################################
223 239
224 .section ".inittext", "ax" 240 .section ".entrytext", "ax"
225start_of_setup: 241start_of_setup:
226#ifdef SAFE_RESET_DISK_CONTROLLER 242#ifdef SAFE_RESET_DISK_CONTROLLER
227# Reset the disk controller. 243# Reset the disk controller.
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 58f0415d3ae0..140172b895bd 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -61,11 +62,10 @@ static void copy_boot_params(void)
61 */ 62 */
62static void keyboard_set_repeat(void) 63static void keyboard_set_repeat(void)
63{ 64{
64 u16 ax = 0x0305; 65 struct biosregs ireg;
65 u16 bx = 0; 66 initregs(&ireg);
66 asm volatile("int $0x16" 67 ireg.ax = 0x0305;
67 : "+a" (ax), "+b" (bx) 68 intcall(0x16, &ireg, NULL);
68 : : "ecx", "edx", "esi", "edi");
69} 69}
70 70
71/* 71/*
@@ -73,18 +73,22 @@ static void keyboard_set_repeat(void)
73 */ 73 */
74static void query_ist(void) 74static void query_ist(void)
75{ 75{
76 struct biosregs ireg, oreg;
77
76 /* Some older BIOSes apparently crash on this call, so filter 78 /* Some older BIOSes apparently crash on this call, so filter
77 it from machines too old to have SpeedStep at all. */ 79 it from machines too old to have SpeedStep at all. */
78 if (cpu.level < 6) 80 if (cpu.level < 6)
79 return; 81 return;
80 82
81 asm("int $0x15" 83 initregs(&ireg);
82 : "=a" (boot_params.ist_info.signature), 84 ireg.ax = 0xe980; /* IST Support */
83 "=b" (boot_params.ist_info.command), 85 ireg.edx = 0x47534943; /* Request value */
84 "=c" (boot_params.ist_info.event), 86 intcall(0x15, &ireg, &oreg);
85 "=d" (boot_params.ist_info.perf_level) 87
86 : "a" (0x0000e980), /* IST Support */ 88 boot_params.ist_info.signature = oreg.eax;
87 "d" (0x47534943)); /* Request value */ 89 boot_params.ist_info.command = oreg.ebx;
90 boot_params.ist_info.event = oreg.ecx;
91 boot_params.ist_info.perf_level = oreg.edx;
88} 92}
89 93
90/* 94/*
@@ -93,13 +97,12 @@ static void query_ist(void)
93static void set_bios_mode(void) 97static void set_bios_mode(void)
94{ 98{
95#ifdef CONFIG_X86_64 99#ifdef CONFIG_X86_64
96 u32 eax, ebx; 100 struct biosregs ireg;
97 101
98 eax = 0xec00; 102 initregs(&ireg);
99 ebx = 2; 103 ireg.ax = 0xec00;
100 asm volatile("int $0x15" 104 ireg.bx = 2;
101 : "+a" (eax), "+b" (ebx) 105 intcall(0x15, &ireg, NULL);
102 : : "ecx", "edx", "esi", "edi");
103#endif 106#endif
104} 107}
105 108
diff --git a/arch/x86/boot/mca.c b/arch/x86/boot/mca.c
index 911eaae5d696..a95a531148ef 100644
--- a/arch/x86/boot/mca.c
+++ b/arch/x86/boot/mca.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -16,26 +17,22 @@
16 17
17int query_mca(void) 18int query_mca(void)
18{ 19{
19 u8 err; 20 struct biosregs ireg, oreg;
20 u16 es, bx, len; 21 u16 len;
21 22
22 asm("pushw %%es ; " 23 initregs(&ireg);
23 "int $0x15 ; " 24 ireg.ah = 0xc0;
24 "setc %0 ; " 25 intcall(0x15, &ireg, &oreg);
25 "movw %%es, %1 ; " 26
26 "popw %%es" 27 if (oreg.eflags & X86_EFLAGS_CF)
27 : "=acd" (err), "=acdSD" (es), "=b" (bx)
28 : "a" (0xc000));
29
30 if (err)
31 return -1; /* No MCA present */ 28 return -1; /* No MCA present */
32 29
33 set_fs(es); 30 set_fs(oreg.es);
34 len = rdfs16(bx); 31 len = rdfs16(oreg.bx);
35 32
36 if (len > sizeof(boot_params.sys_desc_table)) 33 if (len > sizeof(boot_params.sys_desc_table))
37 len = sizeof(boot_params.sys_desc_table); 34 len = sizeof(boot_params.sys_desc_table);
38 35
39 copy_from_fs(&boot_params.sys_desc_table, bx, len); 36 copy_from_fs(&boot_params.sys_desc_table, oreg.bx, len);
40 return 0; 37 return 0;
41} 38}
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index 74b3d2ba84e9..cae3feb1035e 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -20,12 +20,16 @@
20static int detect_memory_e820(void) 20static int detect_memory_e820(void)
21{ 21{
22 int count = 0; 22 int count = 0;
23 u32 next = 0; 23 struct biosregs ireg, oreg;
24 u32 size, id, edi;
25 u8 err;
26 struct e820entry *desc = boot_params.e820_map; 24 struct e820entry *desc = boot_params.e820_map;
27 static struct e820entry buf; /* static so it is zeroed */ 25 static struct e820entry buf; /* static so it is zeroed */
28 26
27 initregs(&ireg);
28 ireg.ax = 0xe820;
29 ireg.cx = sizeof buf;
30 ireg.edx = SMAP;
31 ireg.di = (size_t)&buf;
32
29 /* 33 /*
30 * Note: at least one BIOS is known which assumes that the 34 * Note: at least one BIOS is known which assumes that the
31 * buffer pointed to by one e820 call is the same one as 35 * buffer pointed to by one e820 call is the same one as
@@ -41,22 +45,13 @@ static int detect_memory_e820(void)
41 */ 45 */
42 46
43 do { 47 do {
44 size = sizeof buf; 48 intcall(0x15, &ireg, &oreg);
45 49 ireg.ebx = oreg.ebx; /* for next iteration... */
46 /* Important: %edx and %esi are clobbered by some BIOSes,
47 so they must be either used for the error output
48 or explicitly marked clobbered. Given that, assume there
49 is something out there clobbering %ebp and %edi, too. */
50 asm("pushl %%ebp; int $0x15; popl %%ebp; setc %0"
51 : "=d" (err), "+b" (next), "=a" (id), "+c" (size),
52 "=D" (edi), "+m" (buf)
53 : "D" (&buf), "d" (SMAP), "a" (0xe820)
54 : "esi");
55 50
56 /* BIOSes which terminate the chain with CF = 1 as opposed 51 /* BIOSes which terminate the chain with CF = 1 as opposed
57 to %ebx = 0 don't always report the SMAP signature on 52 to %ebx = 0 don't always report the SMAP signature on
58 the final, failing, probe. */ 53 the final, failing, probe. */
59 if (err) 54 if (oreg.eflags & X86_EFLAGS_CF)
60 break; 55 break;
61 56
62 /* Some BIOSes stop returning SMAP in the middle of 57 /* Some BIOSes stop returning SMAP in the middle of
@@ -64,60 +59,64 @@ static int detect_memory_e820(void)
64 screwed up the map at that point, we might have a 59 screwed up the map at that point, we might have a
65 partial map, the full map, or complete garbage, so 60 partial map, the full map, or complete garbage, so
66 just return failure. */ 61 just return failure. */
67 if (id != SMAP) { 62 if (oreg.eax != SMAP) {
68 count = 0; 63 count = 0;
69 break; 64 break;
70 } 65 }
71 66
72 *desc++ = buf; 67 *desc++ = buf;
73 count++; 68 count++;
74 } while (next && count < ARRAY_SIZE(boot_params.e820_map)); 69 } while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_map));
75 70
76 return boot_params.e820_entries = count; 71 return boot_params.e820_entries = count;
77} 72}
78 73
79static int detect_memory_e801(void) 74static int detect_memory_e801(void)
80{ 75{
81 u16 ax, bx, cx, dx; 76 struct biosregs ireg, oreg;
82 u8 err;
83 77
84 bx = cx = dx = 0; 78 initregs(&ireg);
85 ax = 0xe801; 79 ireg.ax = 0xe801;
86 asm("stc; int $0x15; setc %0" 80 intcall(0x15, &ireg, &oreg);
87 : "=m" (err), "+a" (ax), "+b" (bx), "+c" (cx), "+d" (dx));
88 81
89 if (err) 82 if (oreg.eflags & X86_EFLAGS_CF)
90 return -1; 83 return -1;
91 84
92 /* Do we really need to do this? */ 85 /* Do we really need to do this? */
93 if (cx || dx) { 86 if (oreg.cx || oreg.dx) {
94 ax = cx; 87 oreg.ax = oreg.cx;
95 bx = dx; 88 oreg.bx = oreg.dx;
96 } 89 }
97 90
98 if (ax > 15*1024) 91 if (oreg.ax > 15*1024) {
99 return -1; /* Bogus! */ 92 return -1; /* Bogus! */
100 93 } else if (oreg.ax == 15*1024) {
101 /* This ignores memory above 16MB if we have a memory hole 94 boot_params.alt_mem_k = (oreg.dx << 6) + oreg.ax;
102 there. If someone actually finds a machine with a memory 95 } else {
103 hole at 16MB and no support for 0E820h they should probably 96 /*
104 generate a fake e820 map. */ 97 * This ignores memory above 16MB if we have a memory
105 boot_params.alt_mem_k = (ax == 15*1024) ? (dx << 6)+ax : ax; 98 * hole there. If someone actually finds a machine
99 * with a memory hole at 16MB and no support for
100 * 0E820h they should probably generate a fake e820
101 * map.
102 */
103 boot_params.alt_mem_k = oreg.ax;
104 }
106 105
107 return 0; 106 return 0;
108} 107}
109 108
110static int detect_memory_88(void) 109static int detect_memory_88(void)
111{ 110{
112 u16 ax; 111 struct biosregs ireg, oreg;
113 u8 err;
114 112
115 ax = 0x8800; 113 initregs(&ireg);
116 asm("stc; int $0x15; setc %0" : "=bcdm" (err), "+a" (ax)); 114 ireg.ah = 0x88;
115 intcall(0x15, &ireg, &oreg);
117 116
118 boot_params.screen_info.ext_mem_k = ax; 117 boot_params.screen_info.ext_mem_k = oreg.ax;
119 118
120 return -err; 119 return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */
121} 120}
122 121
123int detect_memory(void) 122int detect_memory(void)
diff --git a/arch/x86/boot/regs.c b/arch/x86/boot/regs.c
new file mode 100644
index 000000000000..958019b1cfa5
--- /dev/null
+++ b/arch/x86/boot/regs.c
@@ -0,0 +1,29 @@
1/* -----------------------------------------------------------------------
2 *
3 * Copyright 2009 Intel Corporation; author H. Peter Anvin
4 *
5 * This file is part of the Linux kernel, and is made available under
6 * the terms of the GNU General Public License version 2 or (at your
7 * option) any later version; incorporated herein by reference.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * Simple helper function for initializing a register set.
13 *
14 * Note that this sets EFLAGS_CF in the input register set; this
15 * makes it easier to catch functions which do nothing but don't
16 * explicitly set CF.
17 */
18
19#include "boot.h"
20
21void initregs(struct biosregs *reg)
22{
23 memset(reg, 0, sizeof *reg);
24 reg->eflags |= X86_EFLAGS_CF;
25 reg->ds = ds();
26 reg->es = ds();
27 reg->fs = fs();
28 reg->gs = gs();
29}
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index bb8dc2de7969..0f6ec455a2b1 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -15,8 +15,11 @@ SECTIONS
15 15
16 . = 497; 16 . = 497;
17 .header : { *(.header) } 17 .header : { *(.header) }
18 .entrytext : { *(.entrytext) }
18 .inittext : { *(.inittext) } 19 .inittext : { *(.inittext) }
19 .initdata : { *(.initdata) } 20 .initdata : { *(.initdata) }
21 __end_init = .;
22
20 .text : { *(.text) } 23 .text : { *(.text) }
21 .text32 : { *(.text32) } 24 .text32 : { *(.text32) }
22 25
@@ -52,4 +55,7 @@ SECTIONS
52 55
53 . = ASSERT(_end <= 0x8000, "Setup too big!"); 56 . = ASSERT(_end <= 0x8000, "Setup too big!");
54 . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!"); 57 . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!");
58 /* Necessary for the very-old-loader check to work... */
59 . = ASSERT(__end_init <= 5*512, "init sections too big!");
60
55} 61}
diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c
index 7e8e8b25f5f6..01ec69c901c7 100644
--- a/arch/x86/boot/tty.c
+++ b/arch/x86/boot/tty.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -22,24 +23,23 @@
22 23
23void __attribute__((section(".inittext"))) putchar(int ch) 24void __attribute__((section(".inittext"))) putchar(int ch)
24{ 25{
25 unsigned char c = ch; 26 struct biosregs ireg;
26 27
27 if (c == '\n') 28 if (ch == '\n')
28 putchar('\r'); /* \n -> \r\n */ 29 putchar('\r'); /* \n -> \r\n */
29 30
30 /* int $0x10 is known to have bugs involving touching registers 31 initregs(&ireg);
31 it shouldn't. Be extra conservative... */ 32 ireg.bx = 0x0007;
32 asm volatile("pushal; pushw %%ds; int $0x10; popw %%ds; popal" 33 ireg.cx = 0x0001;
33 : : "b" (0x0007), "c" (0x0001), "a" (0x0e00|ch)); 34 ireg.ah = 0x0e;
35 ireg.al = ch;
36 intcall(0x10, &ireg, NULL);
34} 37}
35 38
36void __attribute__((section(".inittext"))) puts(const char *str) 39void __attribute__((section(".inittext"))) puts(const char *str)
37{ 40{
38 int n = 0; 41 while (*str)
39 while (*str) {
40 putchar(*str++); 42 putchar(*str++);
41 n++;
42 }
43} 43}
44 44
45/* 45/*
@@ -49,14 +49,13 @@ void __attribute__((section(".inittext"))) puts(const char *str)
49 49
50static u8 gettime(void) 50static u8 gettime(void)
51{ 51{
52 u16 ax = 0x0200; 52 struct biosregs ireg, oreg;
53 u16 cx, dx;
54 53
55 asm volatile("int $0x1a" 54 initregs(&ireg);
56 : "+a" (ax), "=c" (cx), "=d" (dx) 55 ireg.ah = 0x02;
57 : : "ebx", "esi", "edi"); 56 intcall(0x1a, &ireg, &oreg);
58 57
59 return dx >> 8; 58 return oreg.dh;
60} 59}
61 60
62/* 61/*
@@ -64,19 +63,24 @@ static u8 gettime(void)
64 */ 63 */
65int getchar(void) 64int getchar(void)
66{ 65{
67 u16 ax = 0; 66 struct biosregs ireg, oreg;
68 asm volatile("int $0x16" : "+a" (ax)); 67
68 initregs(&ireg);
69 /* ireg.ah = 0x00; */
70 intcall(0x16, &ireg, &oreg);
69 71
70 return ax & 0xff; 72 return oreg.al;
71} 73}
72 74
73static int kbd_pending(void) 75static int kbd_pending(void)
74{ 76{
75 u8 pending; 77 struct biosregs ireg, oreg;
76 asm volatile("int $0x16; setnz %0" 78
77 : "=qm" (pending) 79 initregs(&ireg);
78 : "a" (0x0100)); 80 ireg.ah = 0x01;
79 return pending; 81 intcall(0x16, &ireg, &oreg);
82
83 return !(oreg.eflags & X86_EFLAGS_ZF);
80} 84}
81 85
82void kbd_flush(void) 86void kbd_flush(void)
diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c
index 3fa979c9c363..d660be492363 100644
--- a/arch/x86/boot/video-bios.c
+++ b/arch/x86/boot/video-bios.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -29,21 +30,21 @@ static int bios_set_mode(struct mode_info *mi)
29 30
30static int set_bios_mode(u8 mode) 31static int set_bios_mode(u8 mode)
31{ 32{
32 u16 ax; 33 struct biosregs ireg, oreg;
33 u8 new_mode; 34 u8 new_mode;
34 35
35 ax = mode; /* AH=0x00 Set Video Mode */ 36 initregs(&ireg);
36 asm volatile(INT10 37 ireg.al = mode; /* AH=0x00 Set Video Mode */
37 : "+a" (ax) 38 intcall(0x10, &ireg, NULL);
38 : : "ebx", "ecx", "edx", "esi", "edi");
39 39
40 ax = 0x0f00; /* Get Current Video Mode */ 40
41 asm volatile(INT10 41 ireg.ah = 0x0f; /* Get Current Video Mode */
42 : "+a" (ax) 42 intcall(0x10, &ireg, &oreg);
43 : : "ebx", "ecx", "edx", "esi", "edi");
44 43
45 do_restore = 1; /* Assume video contents were lost */ 44 do_restore = 1; /* Assume video contents were lost */
46 new_mode = ax & 0x7f; /* Not all BIOSes are clean with the top bit */ 45
46 /* Not all BIOSes are clean with the top bit */
47 new_mode = ireg.al & 0x7f;
47 48
48 if (new_mode == mode) 49 if (new_mode == mode)
49 return 0; /* Mode change OK */ 50 return 0; /* Mode change OK */
@@ -53,10 +54,8 @@ static int set_bios_mode(u8 mode)
53 /* Mode setting failed, but we didn't end up where we 54 /* Mode setting failed, but we didn't end up where we
54 started. That's bad. Try to revert to the original 55 started. That's bad. Try to revert to the original
55 video mode. */ 56 video mode. */
56 ax = boot_params.screen_info.orig_video_mode; 57 ireg.ax = boot_params.screen_info.orig_video_mode;
57 asm volatile(INT10 58 intcall(0x10, &ireg, NULL);
58 : "+a" (ax)
59 : : "ebx", "ecx", "edx", "esi", "edi");
60 } 59 }
61#endif 60#endif
62 return -1; 61 return -1;
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 4a58c8ce3f69..c700147d6ffb 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -31,7 +32,7 @@ static inline void vesa_store_mode_params_graphics(void) {}
31static int vesa_probe(void) 32static int vesa_probe(void)
32{ 33{
33#if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID) 34#if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID)
34 u16 ax, cx, di; 35 struct biosregs ireg, oreg;
35 u16 mode; 36 u16 mode;
36 addr_t mode_ptr; 37 addr_t mode_ptr;
37 struct mode_info *mi; 38 struct mode_info *mi;
@@ -39,13 +40,12 @@ static int vesa_probe(void)
39 40
40 video_vesa.modes = GET_HEAP(struct mode_info, 0); 41 video_vesa.modes = GET_HEAP(struct mode_info, 0);
41 42
42 ax = 0x4f00; 43 initregs(&ireg);
43 di = (size_t)&vginfo; 44 ireg.ax = 0x4f00;
44 asm(INT10 45 ireg.di = (size_t)&vginfo;
45 : "+a" (ax), "+D" (di), "=m" (vginfo) 46 intcall(0x10, &ireg, &oreg);
46 : : "ebx", "ecx", "edx", "esi");
47 47
48 if (ax != 0x004f || 48 if (ireg.ax != 0x004f ||
49 vginfo.signature != VESA_MAGIC || 49 vginfo.signature != VESA_MAGIC ||
50 vginfo.version < 0x0102) 50 vginfo.version < 0x0102)
51 return 0; /* Not present */ 51 return 0; /* Not present */
@@ -65,14 +65,12 @@ static int vesa_probe(void)
65 65
66 memset(&vminfo, 0, sizeof vminfo); /* Just in case... */ 66 memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
67 67
68 ax = 0x4f01; 68 ireg.ax = 0x4f01;
69 cx = mode; 69 ireg.cx = mode;
70 di = (size_t)&vminfo; 70 ireg.di = (size_t)&vminfo;
71 asm(INT10 71 intcall(0x10, &ireg, &oreg);
72 : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo)
73 : : "ebx", "edx", "esi");
74 72
75 if (ax != 0x004f) 73 if (ireg.ax != 0x004f)
76 continue; 74 continue;
77 75
78 if ((vminfo.mode_attr & 0x15) == 0x05) { 76 if ((vminfo.mode_attr & 0x15) == 0x05) {
@@ -111,20 +109,19 @@ static int vesa_probe(void)
111 109
112static int vesa_set_mode(struct mode_info *mode) 110static int vesa_set_mode(struct mode_info *mode)
113{ 111{
114 u16 ax, bx, cx, di; 112 struct biosregs ireg, oreg;
115 int is_graphic; 113 int is_graphic;
116 u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA; 114 u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA;
117 115
118 memset(&vminfo, 0, sizeof vminfo); /* Just in case... */ 116 memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
119 117
120 ax = 0x4f01; 118 initregs(&ireg);
121 cx = vesa_mode; 119 ireg.ax = 0x4f01;
122 di = (size_t)&vminfo; 120 ireg.cx = vesa_mode;
123 asm(INT10 121 ireg.di = (size_t)&vminfo;
124 : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo) 122 intcall(0x10, &ireg, &oreg);
125 : : "ebx", "edx", "esi");
126 123
127 if (ax != 0x004f) 124 if (oreg.ax != 0x004f)
128 return -1; 125 return -1;
129 126
130 if ((vminfo.mode_attr & 0x15) == 0x05) { 127 if ((vminfo.mode_attr & 0x15) == 0x05) {
@@ -141,14 +138,12 @@ static int vesa_set_mode(struct mode_info *mode)
141 } 138 }
142 139
143 140
144 ax = 0x4f02; 141 initregs(&ireg);
145 bx = vesa_mode; 142 ireg.ax = 0x4f02;
146 di = 0; 143 ireg.bx = vesa_mode;
147 asm volatile(INT10 144 intcall(0x10, &ireg, &oreg);
148 : "+a" (ax), "+b" (bx), "+D" (di)
149 : : "ecx", "edx", "esi");
150 145
151 if (ax != 0x004f) 146 if (oreg.ax != 0x004f)
152 return -1; 147 return -1;
153 148
154 graphic_mode = is_graphic; 149 graphic_mode = is_graphic;
@@ -171,50 +166,45 @@ static int vesa_set_mode(struct mode_info *mode)
171/* Switch DAC to 8-bit mode */ 166/* Switch DAC to 8-bit mode */
172static void vesa_dac_set_8bits(void) 167static void vesa_dac_set_8bits(void)
173{ 168{
169 struct biosregs ireg, oreg;
174 u8 dac_size = 6; 170 u8 dac_size = 6;
175 171
176 /* If possible, switch the DAC to 8-bit mode */ 172 /* If possible, switch the DAC to 8-bit mode */
177 if (vginfo.capabilities & 1) { 173 if (vginfo.capabilities & 1) {
178 u16 ax, bx; 174 initregs(&ireg);
179 175 ireg.ax = 0x4f08;
180 ax = 0x4f08; 176 ireg.bh = 0x08;
181 bx = 0x0800; 177 intcall(0x10, &ireg, &oreg);
182 asm volatile(INT10 178 if (oreg.ax == 0x004f)
183 : "+a" (ax), "+b" (bx) 179 dac_size = oreg.bh;
184 : : "ecx", "edx", "esi", "edi");
185
186 if (ax == 0x004f)
187 dac_size = bx >> 8;
188 } 180 }
189 181
190 /* Set the color sizes to the DAC size, and offsets to 0 */ 182 /* Set the color sizes to the DAC size, and offsets to 0 */
191 boot_params.screen_info.red_size = dac_size; 183 boot_params.screen_info.red_size = dac_size;
192 boot_params.screen_info.green_size = dac_size; 184 boot_params.screen_info.green_size = dac_size;
193 boot_params.screen_info.blue_size = dac_size; 185 boot_params.screen_info.blue_size = dac_size;
194 boot_params.screen_info.rsvd_size = dac_size; 186 boot_params.screen_info.rsvd_size = dac_size;
195 187
196 boot_params.screen_info.red_pos = 0; 188 boot_params.screen_info.red_pos = 0;
197 boot_params.screen_info.green_pos = 0; 189 boot_params.screen_info.green_pos = 0;
198 boot_params.screen_info.blue_pos = 0; 190 boot_params.screen_info.blue_pos = 0;
199 boot_params.screen_info.rsvd_pos = 0; 191 boot_params.screen_info.rsvd_pos = 0;
200} 192}
201 193
202/* Save the VESA protected mode info */ 194/* Save the VESA protected mode info */
203static void vesa_store_pm_info(void) 195static void vesa_store_pm_info(void)
204{ 196{
205 u16 ax, bx, di, es; 197 struct biosregs ireg, oreg;
206 198
207 ax = 0x4f0a; 199 initregs(&ireg);
208 bx = di = 0; 200 ireg.ax = 0x4f0a;
209 asm("pushw %%es; "INT10"; movw %%es,%0; popw %%es" 201 intcall(0x10, &ireg, &oreg);
210 : "=d" (es), "+a" (ax), "+b" (bx), "+D" (di)
211 : : "ecx", "esi");
212 202
213 if (ax != 0x004f) 203 if (oreg.ax != 0x004f)
214 return; 204 return;
215 205
216 boot_params.screen_info.vesapm_seg = es; 206 boot_params.screen_info.vesapm_seg = oreg.es;
217 boot_params.screen_info.vesapm_off = di; 207 boot_params.screen_info.vesapm_off = oreg.di;
218} 208}
219 209
220/* 210/*
@@ -252,7 +242,7 @@ static void vesa_store_mode_params_graphics(void)
252void vesa_store_edid(void) 242void vesa_store_edid(void)
253{ 243{
254#ifdef CONFIG_FIRMWARE_EDID 244#ifdef CONFIG_FIRMWARE_EDID
255 u16 ax, bx, cx, dx, di; 245 struct biosregs ireg, oreg;
256 246
257 /* Apparently used as a nonsense token... */ 247 /* Apparently used as a nonsense token... */
258 memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info); 248 memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info);
@@ -260,33 +250,26 @@ void vesa_store_edid(void)
260 if (vginfo.version < 0x0200) 250 if (vginfo.version < 0x0200)
261 return; /* EDID requires VBE 2.0+ */ 251 return; /* EDID requires VBE 2.0+ */
262 252
263 ax = 0x4f15; /* VBE DDC */ 253 initregs(&ireg);
264 bx = 0x0000; /* Report DDC capabilities */ 254 ireg.ax = 0x4f15; /* VBE DDC */
265 cx = 0; /* Controller 0 */ 255 /* ireg.bx = 0x0000; */ /* Report DDC capabilities */
266 di = 0; /* ES:DI must be 0 by spec */ 256 /* ireg.cx = 0; */ /* Controller 0 */
267 257 ireg.es = 0; /* ES:DI must be 0 by spec */
268 /* Note: The VBE DDC spec is different from the main VESA spec; 258 intcall(0x10, &ireg, &oreg);
269 we genuinely have to assume all registers are destroyed here. */
270
271 asm("pushw %%es; movw %2,%%es; "INT10"; popw %%es"
272 : "+a" (ax), "+b" (bx), "+c" (cx), "+D" (di)
273 : : "esi", "edx");
274 259
275 if (ax != 0x004f) 260 if (oreg.ax != 0x004f)
276 return; /* No EDID */ 261 return; /* No EDID */
277 262
278 /* BH = time in seconds to transfer EDD information */ 263 /* BH = time in seconds to transfer EDD information */
279 /* BL = DDC level supported */ 264 /* BL = DDC level supported */
280 265
281 ax = 0x4f15; /* VBE DDC */ 266 ireg.ax = 0x4f15; /* VBE DDC */
282 bx = 0x0001; /* Read EDID */ 267 ireg.bx = 0x0001; /* Read EDID */
283 cx = 0; /* Controller 0 */ 268 /* ireg.cx = 0; */ /* Controller 0 */
284 dx = 0; /* EDID block number */ 269 /* ireg.dx = 0; */ /* EDID block number */
285 di =(size_t) &boot_params.edid_info; /* (ES:)Pointer to block */ 270 ireg.es = ds();
286 asm(INT10 271 ireg.di =(size_t)&boot_params.edid_info; /* (ES:)Pointer to block */
287 : "+a" (ax), "+b" (bx), "+d" (dx), "=m" (boot_params.edid_info), 272 intcall(0x10, &ireg, &oreg);
288 "+c" (cx), "+D" (di)
289 : : "esi");
290#endif /* CONFIG_FIRMWARE_EDID */ 273#endif /* CONFIG_FIRMWARE_EDID */
291} 274}
292 275
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index 9e0587a37768..8f8d827e254d 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -39,30 +40,30 @@ static __videocard video_vga;
39/* Set basic 80x25 mode */ 40/* Set basic 80x25 mode */
40static u8 vga_set_basic_mode(void) 41static u8 vga_set_basic_mode(void)
41{ 42{
43 struct biosregs ireg, oreg;
42 u16 ax; 44 u16 ax;
43 u8 rows; 45 u8 rows;
44 u8 mode; 46 u8 mode;
45 47
48 initregs(&ireg);
49
46#ifdef CONFIG_VIDEO_400_HACK 50#ifdef CONFIG_VIDEO_400_HACK
47 if (adapter >= ADAPTER_VGA) { 51 if (adapter >= ADAPTER_VGA) {
48 asm volatile(INT10 52 ireg.ax = 0x1202;
49 : : "a" (0x1202), "b" (0x0030) 53 ireg.bx = 0x0030;
50 : "ecx", "edx", "esi", "edi"); 54 intcall(0x10, &ireg, NULL);
51 } 55 }
52#endif 56#endif
53 57
54 ax = 0x0f00; 58 ax = 0x0f00;
55 asm volatile(INT10 59 intcall(0x10, &ireg, &oreg);
56 : "+a" (ax) 60 mode = oreg.al;
57 : : "ebx", "ecx", "edx", "esi", "edi");
58
59 mode = (u8)ax;
60 61
61 set_fs(0); 62 set_fs(0);
62 rows = rdfs8(0x484); /* rows minus one */ 63 rows = rdfs8(0x484); /* rows minus one */
63 64
64#ifndef CONFIG_VIDEO_400_HACK 65#ifndef CONFIG_VIDEO_400_HACK
65 if ((ax == 0x5003 || ax == 0x5007) && 66 if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) &&
66 (rows == 0 || rows == 24)) 67 (rows == 0 || rows == 24))
67 return mode; 68 return mode;
68#endif 69#endif
@@ -71,10 +72,8 @@ static u8 vga_set_basic_mode(void)
71 mode = 3; 72 mode = 3;
72 73
73 /* Set the mode */ 74 /* Set the mode */
74 ax = mode; 75 ireg.ax = mode; /* AH=0: set mode */
75 asm volatile(INT10 76 intcall(0x10, &ireg, NULL);
76 : "+a" (ax)
77 : : "ebx", "ecx", "edx", "esi", "edi");
78 do_restore = 1; 77 do_restore = 1;
79 return mode; 78 return mode;
80} 79}
@@ -82,43 +81,69 @@ static u8 vga_set_basic_mode(void)
82static void vga_set_8font(void) 81static void vga_set_8font(void)
83{ 82{
84 /* Set 8x8 font - 80x43 on EGA, 80x50 on VGA */ 83 /* Set 8x8 font - 80x43 on EGA, 80x50 on VGA */
84 struct biosregs ireg;
85
86 initregs(&ireg);
85 87
86 /* Set 8x8 font */ 88 /* Set 8x8 font */
87 asm volatile(INT10 : : "a" (0x1112), "b" (0)); 89 ireg.ax = 0x1112;
90 /* ireg.bl = 0; */
91 intcall(0x10, &ireg, NULL);
88 92
89 /* Use alternate print screen */ 93 /* Use alternate print screen */
90 asm volatile(INT10 : : "a" (0x1200), "b" (0x20)); 94 ireg.ax = 0x1200;
95 ireg.bl = 0x20;
96 intcall(0x10, &ireg, NULL);
91 97
92 /* Turn off cursor emulation */ 98 /* Turn off cursor emulation */
93 asm volatile(INT10 : : "a" (0x1201), "b" (0x34)); 99 ireg.ax = 0x1201;
100 ireg.bl = 0x34;
101 intcall(0x10, &ireg, NULL);
94 102
95 /* Cursor is scan lines 6-7 */ 103 /* Cursor is scan lines 6-7 */
96 asm volatile(INT10 : : "a" (0x0100), "c" (0x0607)); 104 ireg.ax = 0x0100;
105 ireg.cx = 0x0607;
106 intcall(0x10, &ireg, NULL);
97} 107}
98 108
99static void vga_set_14font(void) 109static void vga_set_14font(void)
100{ 110{
101 /* Set 9x14 font - 80x28 on VGA */ 111 /* Set 9x14 font - 80x28 on VGA */
112 struct biosregs ireg;
113
114 initregs(&ireg);
102 115
103 /* Set 9x14 font */ 116 /* Set 9x14 font */
104 asm volatile(INT10 : : "a" (0x1111), "b" (0)); 117 ireg.ax = 0x1111;
118 /* ireg.bl = 0; */
119 intcall(0x10, &ireg, NULL);
105 120
106 /* Turn off cursor emulation */ 121 /* Turn off cursor emulation */
107 asm volatile(INT10 : : "a" (0x1201), "b" (0x34)); 122 ireg.ax = 0x1201;
123 ireg.bl = 0x34;
124 intcall(0x10, &ireg, NULL);
108 125
109 /* Cursor is scan lines 11-12 */ 126 /* Cursor is scan lines 11-12 */
110 asm volatile(INT10 : : "a" (0x0100), "c" (0x0b0c)); 127 ireg.ax = 0x0100;
128 ireg.cx = 0x0b0c;
129 intcall(0x10, &ireg, NULL);
111} 130}
112 131
113static void vga_set_80x43(void) 132static void vga_set_80x43(void)
114{ 133{
115 /* Set 80x43 mode on VGA (not EGA) */ 134 /* Set 80x43 mode on VGA (not EGA) */
135 struct biosregs ireg;
136
137 initregs(&ireg);
116 138
117 /* Set 350 scans */ 139 /* Set 350 scans */
118 asm volatile(INT10 : : "a" (0x1201), "b" (0x30)); 140 ireg.ax = 0x1201;
141 ireg.bl = 0x30;
142 intcall(0x10, &ireg, NULL);
119 143
120 /* Reset video mode */ 144 /* Reset video mode */
121 asm volatile(INT10 : : "a" (0x0003)); 145 ireg.ax = 0x0003;
146 intcall(0x10, &ireg, NULL);
122 147
123 vga_set_8font(); 148 vga_set_8font();
124} 149}
@@ -225,8 +250,6 @@ static int vga_set_mode(struct mode_info *mode)
225 */ 250 */
226static int vga_probe(void) 251static int vga_probe(void)
227{ 252{
228 u16 ega_bx;
229
230 static const char *card_name[] = { 253 static const char *card_name[] = {
231 "CGA/MDA/HGC", "EGA", "VGA" 254 "CGA/MDA/HGC", "EGA", "VGA"
232 }; 255 };
@@ -240,26 +263,26 @@ static int vga_probe(void)
240 sizeof(ega_modes)/sizeof(struct mode_info), 263 sizeof(ega_modes)/sizeof(struct mode_info),
241 sizeof(vga_modes)/sizeof(struct mode_info), 264 sizeof(vga_modes)/sizeof(struct mode_info),
242 }; 265 };
243 u8 vga_flag;
244 266
245 asm(INT10 267 struct biosregs ireg, oreg;
246 : "=b" (ega_bx) 268
247 : "a" (0x1200), "b" (0x10) /* Check EGA/VGA */ 269 initregs(&ireg);
248 : "ecx", "edx", "esi", "edi"); 270
271 ireg.ax = 0x1200;
272 ireg.bl = 0x10; /* Check EGA/VGA */
273 intcall(0x10, &ireg, &oreg);
249 274
250#ifndef _WAKEUP 275#ifndef _WAKEUP
251 boot_params.screen_info.orig_video_ega_bx = ega_bx; 276 boot_params.screen_info.orig_video_ega_bx = oreg.bx;
252#endif 277#endif
253 278
254 /* If we have MDA/CGA/HGC then BL will be unchanged at 0x10 */ 279 /* If we have MDA/CGA/HGC then BL will be unchanged at 0x10 */
255 if ((u8)ega_bx != 0x10) { 280 if (oreg.bl != 0x10) {
256 /* EGA/VGA */ 281 /* EGA/VGA */
257 asm(INT10 282 ireg.ax = 0x1a00;
258 : "=a" (vga_flag) 283 intcall(0x10, &ireg, &oreg);
259 : "a" (0x1a00)
260 : "ebx", "ecx", "edx", "esi", "edi");
261 284
262 if (vga_flag == 0x1a) { 285 if (oreg.al == 0x1a) {
263 adapter = ADAPTER_VGA; 286 adapter = ADAPTER_VGA;
264#ifndef _WAKEUP 287#ifndef _WAKEUP
265 boot_params.screen_info.orig_video_isVGA = 1; 288 boot_params.screen_info.orig_video_isVGA = 1;
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index 3bef2c1febe9..bad728b76fc2 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -18,33 +19,29 @@
18 19
19static void store_cursor_position(void) 20static void store_cursor_position(void)
20{ 21{
21 u16 curpos; 22 struct biosregs ireg, oreg;
22 u16 ax, bx;
23 23
24 ax = 0x0300; 24 initregs(&ireg);
25 bx = 0; 25 ireg.ah = 0x03;
26 asm(INT10 26 intcall(0x10, &ireg, &oreg);
27 : "=d" (curpos), "+a" (ax), "+b" (bx)
28 : : "ecx", "esi", "edi");
29 27
30 boot_params.screen_info.orig_x = curpos; 28 boot_params.screen_info.orig_x = oreg.dl;
31 boot_params.screen_info.orig_y = curpos >> 8; 29 boot_params.screen_info.orig_y = oreg.dh;
32} 30}
33 31
34static void store_video_mode(void) 32static void store_video_mode(void)
35{ 33{
36 u16 ax, page; 34 struct biosregs ireg, oreg;
37 35
38 /* N.B.: the saving of the video page here is a bit silly, 36 /* N.B.: the saving of the video page here is a bit silly,
39 since we pretty much assume page 0 everywhere. */ 37 since we pretty much assume page 0 everywhere. */
40 ax = 0x0f00; 38 initregs(&ireg);
41 asm(INT10 39 ireg.ah = 0x0f;
42 : "+a" (ax), "=b" (page) 40 intcall(0x10, &ireg, &oreg);
43 : : "ecx", "edx", "esi", "edi");
44 41
45 /* Not all BIOSes are clean with respect to the top bit */ 42 /* Not all BIOSes are clean with respect to the top bit */
46 boot_params.screen_info.orig_video_mode = ax & 0x7f; 43 boot_params.screen_info.orig_video_mode = oreg.al & 0x7f;
47 boot_params.screen_info.orig_video_page = page >> 8; 44 boot_params.screen_info.orig_video_page = oreg.bh;
48} 45}
49 46
50/* 47/*
@@ -257,7 +254,7 @@ static void restore_screen(void)
257 int y; 254 int y;
258 addr_t dst = 0; 255 addr_t dst = 0;
259 u16 *src = saved.data; 256 u16 *src = saved.data;
260 u16 ax, bx, dx; 257 struct biosregs ireg;
261 258
262 if (graphic_mode) 259 if (graphic_mode)
263 return; /* Can't restore onto a graphic mode */ 260 return; /* Can't restore onto a graphic mode */
@@ -296,12 +293,11 @@ static void restore_screen(void)
296 } 293 }
297 294
298 /* Restore cursor position */ 295 /* Restore cursor position */
299 ax = 0x0200; /* Set cursor position */ 296 initregs(&ireg);
300 bx = 0; /* Page number (<< 8) */ 297 ireg.ah = 0x02; /* Set cursor position */
301 dx = (saved.cury << 8)+saved.curx; 298 ireg.dh = saved.cury;
302 asm volatile(INT10 299 ireg.dl = saved.curx;
303 : "+a" (ax), "+b" (bx), "+d" (dx) 300 intcall(0x10, &ireg, NULL);
304 : : "ecx", "esi", "edi");
305} 301}
306#else 302#else
307#define save_screen() ((void)0) 303#define save_screen() ((void)0)
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h
index ee63f5d14461..5bb174a997fc 100644
--- a/arch/x86/boot/video.h
+++ b/arch/x86/boot/video.h
@@ -112,20 +112,6 @@ extern int force_x, force_y; /* Don't query the BIOS for cols/rows */
112extern int do_restore; /* Restore screen contents */ 112extern int do_restore; /* Restore screen contents */
113extern int graphic_mode; /* Graphics mode with linear frame buffer */ 113extern int graphic_mode; /* Graphics mode with linear frame buffer */
114 114
115/*
116 * int $0x10 is notorious for touching registers it shouldn't.
117 * gcc doesn't like %ebp being clobbered, so define it as a push/pop
118 * sequence here.
119 *
120 * A number of systems, including the original PC can clobber %bp in
121 * certain circumstances, like when scrolling. There exists at least
122 * one Trident video card which could clobber DS under a set of
123 * circumstances that we are unlikely to encounter (scrolling when
124 * using an extended graphics mode of more than 800x600 pixels), but
125 * it's cheap insurance to deal with that here.
126 */
127#define INT10 "pushl %%ebp; pushw %%ds; int $0x10; popw %%ds; popl %%ebp"
128
129/* Accessing VGA indexed registers */ 115/* Accessing VGA indexed registers */
130static inline u8 in_idx(u16 port, u8 index) 116static inline u8 in_idx(u16 port, u8 index)
131{ 117{
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 235b81d0f6f2..edb992ebef92 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,12 +1,13 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.29-rc4 3# Linux kernel version: 2.6.30-rc2
4# Tue Feb 24 15:50:58 2009 4# Mon May 11 16:21:55 2009
5# 5#
6# CONFIG_64BIT is not set 6# CONFIG_64BIT is not set
7CONFIG_X86_32=y 7CONFIG_X86_32=y
8# CONFIG_X86_64 is not set 8# CONFIG_X86_64 is not set
9CONFIG_X86=y 9CONFIG_X86=y
10CONFIG_OUTPUT_FORMAT="elf32-i386"
10CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig" 11CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig"
11CONFIG_GENERIC_TIME=y 12CONFIG_GENERIC_TIME=y
12CONFIG_GENERIC_CMOS_UPDATE=y 13CONFIG_GENERIC_CMOS_UPDATE=y
@@ -33,6 +34,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y
33CONFIG_ARCH_HAS_DEFAULT_IDLE=y 34CONFIG_ARCH_HAS_DEFAULT_IDLE=y
34CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y 35CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
35CONFIG_HAVE_SETUP_PER_CPU_AREA=y 36CONFIG_HAVE_SETUP_PER_CPU_AREA=y
37CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y
36# CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set 38# CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set
37CONFIG_ARCH_HIBERNATION_POSSIBLE=y 39CONFIG_ARCH_HIBERNATION_POSSIBLE=y
38CONFIG_ARCH_SUSPEND_POSSIBLE=y 40CONFIG_ARCH_SUSPEND_POSSIBLE=y
@@ -40,15 +42,16 @@ CONFIG_ARCH_SUSPEND_POSSIBLE=y
40CONFIG_ARCH_POPULATES_NODE_MAP=y 42CONFIG_ARCH_POPULATES_NODE_MAP=y
41# CONFIG_AUDIT_ARCH is not set 43# CONFIG_AUDIT_ARCH is not set
42CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y 44CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
45CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
43CONFIG_GENERIC_HARDIRQS=y 46CONFIG_GENERIC_HARDIRQS=y
47CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
44CONFIG_GENERIC_IRQ_PROBE=y 48CONFIG_GENERIC_IRQ_PROBE=y
45CONFIG_GENERIC_PENDING_IRQ=y 49CONFIG_GENERIC_PENDING_IRQ=y
46CONFIG_X86_SMP=y
47CONFIG_USE_GENERIC_SMP_HELPERS=y 50CONFIG_USE_GENERIC_SMP_HELPERS=y
48CONFIG_X86_32_SMP=y 51CONFIG_X86_32_SMP=y
49CONFIG_X86_HT=y 52CONFIG_X86_HT=y
50CONFIG_X86_BIOS_REBOOT=y
51CONFIG_X86_TRAMPOLINE=y 53CONFIG_X86_TRAMPOLINE=y
54CONFIG_X86_32_LAZY_GS=y
52CONFIG_KTIME_SCALAR=y 55CONFIG_KTIME_SCALAR=y
53CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" 56CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
54 57
@@ -60,10 +63,17 @@ CONFIG_LOCK_KERNEL=y
60CONFIG_INIT_ENV_ARG_LIMIT=32 63CONFIG_INIT_ENV_ARG_LIMIT=32
61CONFIG_LOCALVERSION="" 64CONFIG_LOCALVERSION=""
62# CONFIG_LOCALVERSION_AUTO is not set 65# CONFIG_LOCALVERSION_AUTO is not set
66CONFIG_HAVE_KERNEL_GZIP=y
67CONFIG_HAVE_KERNEL_BZIP2=y
68CONFIG_HAVE_KERNEL_LZMA=y
69CONFIG_KERNEL_GZIP=y
70# CONFIG_KERNEL_BZIP2 is not set
71# CONFIG_KERNEL_LZMA is not set
63CONFIG_SWAP=y 72CONFIG_SWAP=y
64CONFIG_SYSVIPC=y 73CONFIG_SYSVIPC=y
65CONFIG_SYSVIPC_SYSCTL=y 74CONFIG_SYSVIPC_SYSCTL=y
66CONFIG_POSIX_MQUEUE=y 75CONFIG_POSIX_MQUEUE=y
76CONFIG_POSIX_MQUEUE_SYSCTL=y
67CONFIG_BSD_PROCESS_ACCT=y 77CONFIG_BSD_PROCESS_ACCT=y
68# CONFIG_BSD_PROCESS_ACCT_V3 is not set 78# CONFIG_BSD_PROCESS_ACCT_V3 is not set
69CONFIG_TASKSTATS=y 79CONFIG_TASKSTATS=y
@@ -113,23 +123,26 @@ CONFIG_PID_NS=y
113CONFIG_NET_NS=y 123CONFIG_NET_NS=y
114CONFIG_BLK_DEV_INITRD=y 124CONFIG_BLK_DEV_INITRD=y
115CONFIG_INITRAMFS_SOURCE="" 125CONFIG_INITRAMFS_SOURCE=""
126CONFIG_RD_GZIP=y
127CONFIG_RD_BZIP2=y
128CONFIG_RD_LZMA=y
116CONFIG_CC_OPTIMIZE_FOR_SIZE=y 129CONFIG_CC_OPTIMIZE_FOR_SIZE=y
117CONFIG_SYSCTL=y 130CONFIG_SYSCTL=y
131CONFIG_ANON_INODES=y
118# CONFIG_EMBEDDED is not set 132# CONFIG_EMBEDDED is not set
119CONFIG_UID16=y 133CONFIG_UID16=y
120CONFIG_SYSCTL_SYSCALL=y 134CONFIG_SYSCTL_SYSCALL=y
121CONFIG_KALLSYMS=y 135CONFIG_KALLSYMS=y
122CONFIG_KALLSYMS_ALL=y 136CONFIG_KALLSYMS_ALL=y
123CONFIG_KALLSYMS_EXTRA_PASS=y 137CONFIG_KALLSYMS_EXTRA_PASS=y
138# CONFIG_STRIP_ASM_SYMS is not set
124CONFIG_HOTPLUG=y 139CONFIG_HOTPLUG=y
125CONFIG_PRINTK=y 140CONFIG_PRINTK=y
126CONFIG_BUG=y 141CONFIG_BUG=y
127CONFIG_ELF_CORE=y 142CONFIG_ELF_CORE=y
128CONFIG_PCSPKR_PLATFORM=y 143CONFIG_PCSPKR_PLATFORM=y
129# CONFIG_COMPAT_BRK is not set
130CONFIG_BASE_FULL=y 144CONFIG_BASE_FULL=y
131CONFIG_FUTEX=y 145CONFIG_FUTEX=y
132CONFIG_ANON_INODES=y
133CONFIG_EPOLL=y 146CONFIG_EPOLL=y
134CONFIG_SIGNALFD=y 147CONFIG_SIGNALFD=y
135CONFIG_TIMERFD=y 148CONFIG_TIMERFD=y
@@ -139,6 +152,7 @@ CONFIG_AIO=y
139CONFIG_VM_EVENT_COUNTERS=y 152CONFIG_VM_EVENT_COUNTERS=y
140CONFIG_PCI_QUIRKS=y 153CONFIG_PCI_QUIRKS=y
141CONFIG_SLUB_DEBUG=y 154CONFIG_SLUB_DEBUG=y
155# CONFIG_COMPAT_BRK is not set
142# CONFIG_SLAB is not set 156# CONFIG_SLAB is not set
143CONFIG_SLUB=y 157CONFIG_SLUB=y
144# CONFIG_SLOB is not set 158# CONFIG_SLOB is not set
@@ -154,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y
154CONFIG_HAVE_KPROBES=y 168CONFIG_HAVE_KPROBES=y
155CONFIG_HAVE_KRETPROBES=y 169CONFIG_HAVE_KRETPROBES=y
156CONFIG_HAVE_ARCH_TRACEHOOK=y 170CONFIG_HAVE_ARCH_TRACEHOOK=y
171CONFIG_HAVE_DMA_API_DEBUG=y
172# CONFIG_SLOW_WORK is not set
157CONFIG_HAVE_GENERIC_DMA_COHERENT=y 173CONFIG_HAVE_GENERIC_DMA_COHERENT=y
158CONFIG_SLABINFO=y 174CONFIG_SLABINFO=y
159CONFIG_RT_MUTEXES=y 175CONFIG_RT_MUTEXES=y
@@ -167,7 +183,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
167CONFIG_STOP_MACHINE=y 183CONFIG_STOP_MACHINE=y
168CONFIG_BLOCK=y 184CONFIG_BLOCK=y
169# CONFIG_LBD is not set 185# CONFIG_LBD is not set
170CONFIG_BLK_DEV_IO_TRACE=y
171CONFIG_BLK_DEV_BSG=y 186CONFIG_BLK_DEV_BSG=y
172# CONFIG_BLK_DEV_INTEGRITY is not set 187# CONFIG_BLK_DEV_INTEGRITY is not set
173 188
@@ -194,12 +209,12 @@ CONFIG_HIGH_RES_TIMERS=y
194CONFIG_GENERIC_CLOCKEVENTS_BUILD=y 209CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
195CONFIG_SMP=y 210CONFIG_SMP=y
196CONFIG_SPARSE_IRQ=y 211CONFIG_SPARSE_IRQ=y
197CONFIG_X86_FIND_SMP_CONFIG=y
198CONFIG_X86_MPPARSE=y 212CONFIG_X86_MPPARSE=y
213# CONFIG_X86_BIGSMP is not set
214CONFIG_X86_EXTENDED_PLATFORM=y
199# CONFIG_X86_ELAN is not set 215# CONFIG_X86_ELAN is not set
200# CONFIG_X86_GENERICARCH is not set
201# CONFIG_X86_VSMP is not set
202# CONFIG_X86_RDC321X is not set 216# CONFIG_X86_RDC321X is not set
217# CONFIG_X86_32_NON_STANDARD is not set
203CONFIG_SCHED_OMIT_FRAME_POINTER=y 218CONFIG_SCHED_OMIT_FRAME_POINTER=y
204# CONFIG_PARAVIRT_GUEST is not set 219# CONFIG_PARAVIRT_GUEST is not set
205# CONFIG_MEMTEST is not set 220# CONFIG_MEMTEST is not set
@@ -230,8 +245,10 @@ CONFIG_M686=y
230# CONFIG_GENERIC_CPU is not set 245# CONFIG_GENERIC_CPU is not set
231CONFIG_X86_GENERIC=y 246CONFIG_X86_GENERIC=y
232CONFIG_X86_CPU=y 247CONFIG_X86_CPU=y
248CONFIG_X86_L1_CACHE_BYTES=64
249CONFIG_X86_INTERNODE_CACHE_BYTES=64
233CONFIG_X86_CMPXCHG=y 250CONFIG_X86_CMPXCHG=y
234CONFIG_X86_L1_CACHE_SHIFT=7 251CONFIG_X86_L1_CACHE_SHIFT=5
235CONFIG_X86_XADD=y 252CONFIG_X86_XADD=y
236# CONFIG_X86_PPRO_FENCE is not set 253# CONFIG_X86_PPRO_FENCE is not set
237CONFIG_X86_WP_WORKS_OK=y 254CONFIG_X86_WP_WORKS_OK=y
@@ -247,7 +264,7 @@ CONFIG_X86_DEBUGCTLMSR=y
247CONFIG_CPU_SUP_INTEL=y 264CONFIG_CPU_SUP_INTEL=y
248CONFIG_CPU_SUP_CYRIX_32=y 265CONFIG_CPU_SUP_CYRIX_32=y
249CONFIG_CPU_SUP_AMD=y 266CONFIG_CPU_SUP_AMD=y
250CONFIG_CPU_SUP_CENTAUR_32=y 267CONFIG_CPU_SUP_CENTAUR=y
251CONFIG_CPU_SUP_TRANSMETA_32=y 268CONFIG_CPU_SUP_TRANSMETA_32=y
252CONFIG_CPU_SUP_UMC_32=y 269CONFIG_CPU_SUP_UMC_32=y
253CONFIG_X86_DS=y 270CONFIG_X86_DS=y
@@ -279,6 +296,7 @@ CONFIG_MICROCODE_AMD=y
279CONFIG_MICROCODE_OLD_INTERFACE=y 296CONFIG_MICROCODE_OLD_INTERFACE=y
280CONFIG_X86_MSR=y 297CONFIG_X86_MSR=y
281CONFIG_X86_CPUID=y 298CONFIG_X86_CPUID=y
299# CONFIG_X86_CPU_DEBUG is not set
282# CONFIG_NOHIGHMEM is not set 300# CONFIG_NOHIGHMEM is not set
283CONFIG_HIGHMEM4G=y 301CONFIG_HIGHMEM4G=y
284# CONFIG_HIGHMEM64G is not set 302# CONFIG_HIGHMEM64G is not set
@@ -302,6 +320,8 @@ CONFIG_ZONE_DMA_FLAG=1
302CONFIG_BOUNCE=y 320CONFIG_BOUNCE=y
303CONFIG_VIRT_TO_BUS=y 321CONFIG_VIRT_TO_BUS=y
304CONFIG_UNEVICTABLE_LRU=y 322CONFIG_UNEVICTABLE_LRU=y
323CONFIG_HAVE_MLOCK=y
324CONFIG_HAVE_MLOCKED_PAGE_BIT=y
305CONFIG_HIGHPTE=y 325CONFIG_HIGHPTE=y
306CONFIG_X86_CHECK_BIOS_CORRUPTION=y 326CONFIG_X86_CHECK_BIOS_CORRUPTION=y
307CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y 327CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
@@ -312,6 +332,7 @@ CONFIG_MTRR=y
312CONFIG_X86_PAT=y 332CONFIG_X86_PAT=y
313CONFIG_EFI=y 333CONFIG_EFI=y
314CONFIG_SECCOMP=y 334CONFIG_SECCOMP=y
335# CONFIG_CC_STACKPROTECTOR is not set
315# CONFIG_HZ_100 is not set 336# CONFIG_HZ_100 is not set
316# CONFIG_HZ_250 is not set 337# CONFIG_HZ_250 is not set
317# CONFIG_HZ_300 is not set 338# CONFIG_HZ_300 is not set
@@ -322,8 +343,9 @@ CONFIG_KEXEC=y
322CONFIG_CRASH_DUMP=y 343CONFIG_CRASH_DUMP=y
323# CONFIG_KEXEC_JUMP is not set 344# CONFIG_KEXEC_JUMP is not set
324CONFIG_PHYSICAL_START=0x1000000 345CONFIG_PHYSICAL_START=0x1000000
325# CONFIG_RELOCATABLE is not set 346CONFIG_RELOCATABLE=y
326CONFIG_PHYSICAL_ALIGN=0x200000 347CONFIG_X86_NEED_RELOCS=y
348CONFIG_PHYSICAL_ALIGN=0x1000000
327CONFIG_HOTPLUG_CPU=y 349CONFIG_HOTPLUG_CPU=y
328# CONFIG_COMPAT_VDSO is not set 350# CONFIG_COMPAT_VDSO is not set
329# CONFIG_CMDLINE_BOOL is not set 351# CONFIG_CMDLINE_BOOL is not set
@@ -363,7 +385,6 @@ CONFIG_ACPI_THERMAL=y
363CONFIG_ACPI_BLACKLIST_YEAR=0 385CONFIG_ACPI_BLACKLIST_YEAR=0
364# CONFIG_ACPI_DEBUG is not set 386# CONFIG_ACPI_DEBUG is not set
365# CONFIG_ACPI_PCI_SLOT is not set 387# CONFIG_ACPI_PCI_SLOT is not set
366CONFIG_ACPI_SYSTEM=y
367CONFIG_X86_PM_TIMER=y 388CONFIG_X86_PM_TIMER=y
368CONFIG_ACPI_CONTAINER=y 389CONFIG_ACPI_CONTAINER=y
369# CONFIG_ACPI_SBS is not set 390# CONFIG_ACPI_SBS is not set
@@ -425,6 +446,7 @@ CONFIG_PCI_BIOS=y
425CONFIG_PCI_DIRECT=y 446CONFIG_PCI_DIRECT=y
426CONFIG_PCI_MMCONFIG=y 447CONFIG_PCI_MMCONFIG=y
427CONFIG_PCI_DOMAINS=y 448CONFIG_PCI_DOMAINS=y
449# CONFIG_DMAR is not set
428CONFIG_PCIEPORTBUS=y 450CONFIG_PCIEPORTBUS=y
429# CONFIG_HOTPLUG_PCI_PCIE is not set 451# CONFIG_HOTPLUG_PCI_PCIE is not set
430CONFIG_PCIEAER=y 452CONFIG_PCIEAER=y
@@ -435,6 +457,7 @@ CONFIG_PCI_MSI=y
435# CONFIG_PCI_DEBUG is not set 457# CONFIG_PCI_DEBUG is not set
436# CONFIG_PCI_STUB is not set 458# CONFIG_PCI_STUB is not set
437CONFIG_HT_IRQ=y 459CONFIG_HT_IRQ=y
460# CONFIG_PCI_IOV is not set
438CONFIG_ISA_DMA_API=y 461CONFIG_ISA_DMA_API=y
439# CONFIG_ISA is not set 462# CONFIG_ISA is not set
440# CONFIG_MCA is not set 463# CONFIG_MCA is not set
@@ -481,7 +504,6 @@ CONFIG_NET=y
481# 504#
482# Networking options 505# Networking options
483# 506#
484CONFIG_COMPAT_NET_DEV_OPS=y
485CONFIG_PACKET=y 507CONFIG_PACKET=y
486CONFIG_PACKET_MMAP=y 508CONFIG_PACKET_MMAP=y
487CONFIG_UNIX=y 509CONFIG_UNIX=y
@@ -639,6 +661,7 @@ CONFIG_LLC=y
639# CONFIG_LAPB is not set 661# CONFIG_LAPB is not set
640# CONFIG_ECONET is not set 662# CONFIG_ECONET is not set
641# CONFIG_WAN_ROUTER is not set 663# CONFIG_WAN_ROUTER is not set
664# CONFIG_PHONET is not set
642CONFIG_NET_SCHED=y 665CONFIG_NET_SCHED=y
643 666
644# 667#
@@ -696,6 +719,7 @@ CONFIG_NET_SCH_FIFO=y
696# 719#
697# CONFIG_NET_PKTGEN is not set 720# CONFIG_NET_PKTGEN is not set
698# CONFIG_NET_TCPPROBE is not set 721# CONFIG_NET_TCPPROBE is not set
722# CONFIG_NET_DROP_MONITOR is not set
699CONFIG_HAMRADIO=y 723CONFIG_HAMRADIO=y
700 724
701# 725#
@@ -706,12 +730,10 @@ CONFIG_HAMRADIO=y
706# CONFIG_IRDA is not set 730# CONFIG_IRDA is not set
707# CONFIG_BT is not set 731# CONFIG_BT is not set
708# CONFIG_AF_RXRPC is not set 732# CONFIG_AF_RXRPC is not set
709# CONFIG_PHONET is not set
710CONFIG_FIB_RULES=y 733CONFIG_FIB_RULES=y
711CONFIG_WIRELESS=y 734CONFIG_WIRELESS=y
712CONFIG_CFG80211=y 735CONFIG_CFG80211=y
713# CONFIG_CFG80211_REG_DEBUG is not set 736# CONFIG_CFG80211_REG_DEBUG is not set
714CONFIG_NL80211=y
715CONFIG_WIRELESS_OLD_REGULATORY=y 737CONFIG_WIRELESS_OLD_REGULATORY=y
716CONFIG_WIRELESS_EXT=y 738CONFIG_WIRELESS_EXT=y
717CONFIG_WIRELESS_EXT_SYSFS=y 739CONFIG_WIRELESS_EXT_SYSFS=y
@@ -789,6 +811,7 @@ CONFIG_MISC_DEVICES=y
789# CONFIG_ICS932S401 is not set 811# CONFIG_ICS932S401 is not set
790# CONFIG_ENCLOSURE_SERVICES is not set 812# CONFIG_ENCLOSURE_SERVICES is not set
791# CONFIG_HP_ILO is not set 813# CONFIG_HP_ILO is not set
814# CONFIG_ISL29003 is not set
792# CONFIG_C2PORT is not set 815# CONFIG_C2PORT is not set
793 816
794# 817#
@@ -842,6 +865,7 @@ CONFIG_SCSI_SPI_ATTRS=y
842# CONFIG_SCSI_LOWLEVEL is not set 865# CONFIG_SCSI_LOWLEVEL is not set
843# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set 866# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
844# CONFIG_SCSI_DH is not set 867# CONFIG_SCSI_DH is not set
868# CONFIG_SCSI_OSD_INITIATOR is not set
845CONFIG_ATA=y 869CONFIG_ATA=y
846# CONFIG_ATA_NONSTANDARD is not set 870# CONFIG_ATA_NONSTANDARD is not set
847CONFIG_ATA_ACPI=y 871CONFIG_ATA_ACPI=y
@@ -940,6 +964,7 @@ CONFIG_DM_ZERO=y
940CONFIG_MACINTOSH_DRIVERS=y 964CONFIG_MACINTOSH_DRIVERS=y
941CONFIG_MAC_EMUMOUSEBTN=y 965CONFIG_MAC_EMUMOUSEBTN=y
942CONFIG_NETDEVICES=y 966CONFIG_NETDEVICES=y
967CONFIG_COMPAT_NET_DEV_OPS=y
943# CONFIG_IFB is not set 968# CONFIG_IFB is not set
944# CONFIG_DUMMY is not set 969# CONFIG_DUMMY is not set
945# CONFIG_BONDING is not set 970# CONFIG_BONDING is not set
@@ -977,6 +1002,8 @@ CONFIG_MII=y
977CONFIG_NET_VENDOR_3COM=y 1002CONFIG_NET_VENDOR_3COM=y
978# CONFIG_VORTEX is not set 1003# CONFIG_VORTEX is not set
979# CONFIG_TYPHOON is not set 1004# CONFIG_TYPHOON is not set
1005# CONFIG_ETHOC is not set
1006# CONFIG_DNET is not set
980CONFIG_NET_TULIP=y 1007CONFIG_NET_TULIP=y
981# CONFIG_DE2104X is not set 1008# CONFIG_DE2104X is not set
982# CONFIG_TULIP is not set 1009# CONFIG_TULIP is not set
@@ -1026,6 +1053,7 @@ CONFIG_E1000=y
1026CONFIG_E1000E=y 1053CONFIG_E1000E=y
1027# CONFIG_IP1000 is not set 1054# CONFIG_IP1000 is not set
1028# CONFIG_IGB is not set 1055# CONFIG_IGB is not set
1056# CONFIG_IGBVF is not set
1029# CONFIG_NS83820 is not set 1057# CONFIG_NS83820 is not set
1030# CONFIG_HAMACHI is not set 1058# CONFIG_HAMACHI is not set
1031# CONFIG_YELLOWFIN is not set 1059# CONFIG_YELLOWFIN is not set
@@ -1040,6 +1068,7 @@ CONFIG_BNX2=y
1040# CONFIG_QLA3XXX is not set 1068# CONFIG_QLA3XXX is not set
1041# CONFIG_ATL1 is not set 1069# CONFIG_ATL1 is not set
1042# CONFIG_ATL1E is not set 1070# CONFIG_ATL1E is not set
1071# CONFIG_ATL1C is not set
1043# CONFIG_JME is not set 1072# CONFIG_JME is not set
1044CONFIG_NETDEV_10000=y 1073CONFIG_NETDEV_10000=y
1045# CONFIG_CHELSIO_T1 is not set 1074# CONFIG_CHELSIO_T1 is not set
@@ -1049,6 +1078,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
1049# CONFIG_IXGBE is not set 1078# CONFIG_IXGBE is not set
1050# CONFIG_IXGB is not set 1079# CONFIG_IXGB is not set
1051# CONFIG_S2IO is not set 1080# CONFIG_S2IO is not set
1081# CONFIG_VXGE is not set
1052# CONFIG_MYRI10GE is not set 1082# CONFIG_MYRI10GE is not set
1053# CONFIG_NETXEN_NIC is not set 1083# CONFIG_NETXEN_NIC is not set
1054# CONFIG_NIU is not set 1084# CONFIG_NIU is not set
@@ -1058,6 +1088,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
1058# CONFIG_BNX2X is not set 1088# CONFIG_BNX2X is not set
1059# CONFIG_QLGE is not set 1089# CONFIG_QLGE is not set
1060# CONFIG_SFC is not set 1090# CONFIG_SFC is not set
1091# CONFIG_BE2NET is not set
1061CONFIG_TR=y 1092CONFIG_TR=y
1062# CONFIG_IBMOL is not set 1093# CONFIG_IBMOL is not set
1063# CONFIG_IBMLS is not set 1094# CONFIG_IBMLS is not set
@@ -1073,8 +1104,8 @@ CONFIG_WLAN_80211=y
1073# CONFIG_LIBERTAS is not set 1104# CONFIG_LIBERTAS is not set
1074# CONFIG_LIBERTAS_THINFIRM is not set 1105# CONFIG_LIBERTAS_THINFIRM is not set
1075# CONFIG_AIRO is not set 1106# CONFIG_AIRO is not set
1076# CONFIG_HERMES is not set
1077# CONFIG_ATMEL is not set 1107# CONFIG_ATMEL is not set
1108# CONFIG_AT76C50X_USB is not set
1078# CONFIG_AIRO_CS is not set 1109# CONFIG_AIRO_CS is not set
1079# CONFIG_PCMCIA_WL3501 is not set 1110# CONFIG_PCMCIA_WL3501 is not set
1080# CONFIG_PRISM54 is not set 1111# CONFIG_PRISM54 is not set
@@ -1084,21 +1115,21 @@ CONFIG_WLAN_80211=y
1084# CONFIG_RTL8187 is not set 1115# CONFIG_RTL8187 is not set
1085# CONFIG_ADM8211 is not set 1116# CONFIG_ADM8211 is not set
1086# CONFIG_MAC80211_HWSIM is not set 1117# CONFIG_MAC80211_HWSIM is not set
1118# CONFIG_MWL8K is not set
1087# CONFIG_P54_COMMON is not set 1119# CONFIG_P54_COMMON is not set
1088CONFIG_ATH5K=y 1120CONFIG_ATH5K=y
1089# CONFIG_ATH5K_DEBUG is not set 1121# CONFIG_ATH5K_DEBUG is not set
1090# CONFIG_ATH9K is not set 1122# CONFIG_ATH9K is not set
1123# CONFIG_AR9170_USB is not set
1091# CONFIG_IPW2100 is not set 1124# CONFIG_IPW2100 is not set
1092# CONFIG_IPW2200 is not set 1125# CONFIG_IPW2200 is not set
1093# CONFIG_IWLCORE is not set 1126# CONFIG_IWLWIFI is not set
1094# CONFIG_IWLWIFI_LEDS is not set
1095# CONFIG_IWLAGN is not set
1096# CONFIG_IWL3945 is not set
1097# CONFIG_HOSTAP is not set 1127# CONFIG_HOSTAP is not set
1098# CONFIG_B43 is not set 1128# CONFIG_B43 is not set
1099# CONFIG_B43LEGACY is not set 1129# CONFIG_B43LEGACY is not set
1100# CONFIG_ZD1211RW is not set 1130# CONFIG_ZD1211RW is not set
1101# CONFIG_RT2X00 is not set 1131# CONFIG_RT2X00 is not set
1132# CONFIG_HERMES is not set
1102 1133
1103# 1134#
1104# Enable WiMAX (Networking options) to see the WiMAX drivers 1135# Enable WiMAX (Networking options) to see the WiMAX drivers
@@ -1209,6 +1240,8 @@ CONFIG_INPUT_TABLET=y
1209# CONFIG_TABLET_USB_KBTAB is not set 1240# CONFIG_TABLET_USB_KBTAB is not set
1210# CONFIG_TABLET_USB_WACOM is not set 1241# CONFIG_TABLET_USB_WACOM is not set
1211CONFIG_INPUT_TOUCHSCREEN=y 1242CONFIG_INPUT_TOUCHSCREEN=y
1243# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
1244# CONFIG_TOUCHSCREEN_AD7879 is not set
1212# CONFIG_TOUCHSCREEN_FUJITSU is not set 1245# CONFIG_TOUCHSCREEN_FUJITSU is not set
1213# CONFIG_TOUCHSCREEN_GUNZE is not set 1246# CONFIG_TOUCHSCREEN_GUNZE is not set
1214# CONFIG_TOUCHSCREEN_ELO is not set 1247# CONFIG_TOUCHSCREEN_ELO is not set
@@ -1303,6 +1336,7 @@ CONFIG_UNIX98_PTYS=y
1303# CONFIG_LEGACY_PTYS is not set 1336# CONFIG_LEGACY_PTYS is not set
1304# CONFIG_IPMI_HANDLER is not set 1337# CONFIG_IPMI_HANDLER is not set
1305CONFIG_HW_RANDOM=y 1338CONFIG_HW_RANDOM=y
1339# CONFIG_HW_RANDOM_TIMERIOMEM is not set
1306CONFIG_HW_RANDOM_INTEL=y 1340CONFIG_HW_RANDOM_INTEL=y
1307CONFIG_HW_RANDOM_AMD=y 1341CONFIG_HW_RANDOM_AMD=y
1308CONFIG_HW_RANDOM_GEODE=y 1342CONFIG_HW_RANDOM_GEODE=y
@@ -1390,7 +1424,6 @@ CONFIG_I2C_I801=y
1390# CONFIG_SENSORS_PCF8574 is not set 1424# CONFIG_SENSORS_PCF8574 is not set
1391# CONFIG_PCF8575 is not set 1425# CONFIG_PCF8575 is not set
1392# CONFIG_SENSORS_PCA9539 is not set 1426# CONFIG_SENSORS_PCA9539 is not set
1393# CONFIG_SENSORS_PCF8591 is not set
1394# CONFIG_SENSORS_MAX6875 is not set 1427# CONFIG_SENSORS_MAX6875 is not set
1395# CONFIG_SENSORS_TSL2550 is not set 1428# CONFIG_SENSORS_TSL2550 is not set
1396# CONFIG_I2C_DEBUG_CORE is not set 1429# CONFIG_I2C_DEBUG_CORE is not set
@@ -1424,6 +1457,7 @@ CONFIG_HWMON=y
1424# CONFIG_SENSORS_ADT7475 is not set 1457# CONFIG_SENSORS_ADT7475 is not set
1425# CONFIG_SENSORS_K8TEMP is not set 1458# CONFIG_SENSORS_K8TEMP is not set
1426# CONFIG_SENSORS_ASB100 is not set 1459# CONFIG_SENSORS_ASB100 is not set
1460# CONFIG_SENSORS_ATK0110 is not set
1427# CONFIG_SENSORS_ATXP1 is not set 1461# CONFIG_SENSORS_ATXP1 is not set
1428# CONFIG_SENSORS_DS1621 is not set 1462# CONFIG_SENSORS_DS1621 is not set
1429# CONFIG_SENSORS_I5K_AMB is not set 1463# CONFIG_SENSORS_I5K_AMB is not set
@@ -1433,6 +1467,7 @@ CONFIG_HWMON=y
1433# CONFIG_SENSORS_FSCHER is not set 1467# CONFIG_SENSORS_FSCHER is not set
1434# CONFIG_SENSORS_FSCPOS is not set 1468# CONFIG_SENSORS_FSCPOS is not set
1435# CONFIG_SENSORS_FSCHMD is not set 1469# CONFIG_SENSORS_FSCHMD is not set
1470# CONFIG_SENSORS_G760A is not set
1436# CONFIG_SENSORS_GL518SM is not set 1471# CONFIG_SENSORS_GL518SM is not set
1437# CONFIG_SENSORS_GL520SM is not set 1472# CONFIG_SENSORS_GL520SM is not set
1438# CONFIG_SENSORS_CORETEMP is not set 1473# CONFIG_SENSORS_CORETEMP is not set
@@ -1448,11 +1483,14 @@ CONFIG_HWMON=y
1448# CONFIG_SENSORS_LM90 is not set 1483# CONFIG_SENSORS_LM90 is not set
1449# CONFIG_SENSORS_LM92 is not set 1484# CONFIG_SENSORS_LM92 is not set
1450# CONFIG_SENSORS_LM93 is not set 1485# CONFIG_SENSORS_LM93 is not set
1486# CONFIG_SENSORS_LTC4215 is not set
1451# CONFIG_SENSORS_LTC4245 is not set 1487# CONFIG_SENSORS_LTC4245 is not set
1488# CONFIG_SENSORS_LM95241 is not set
1452# CONFIG_SENSORS_MAX1619 is not set 1489# CONFIG_SENSORS_MAX1619 is not set
1453# CONFIG_SENSORS_MAX6650 is not set 1490# CONFIG_SENSORS_MAX6650 is not set
1454# CONFIG_SENSORS_PC87360 is not set 1491# CONFIG_SENSORS_PC87360 is not set
1455# CONFIG_SENSORS_PC87427 is not set 1492# CONFIG_SENSORS_PC87427 is not set
1493# CONFIG_SENSORS_PCF8591 is not set
1456# CONFIG_SENSORS_SIS5595 is not set 1494# CONFIG_SENSORS_SIS5595 is not set
1457# CONFIG_SENSORS_DME1737 is not set 1495# CONFIG_SENSORS_DME1737 is not set
1458# CONFIG_SENSORS_SMSC47M1 is not set 1496# CONFIG_SENSORS_SMSC47M1 is not set
@@ -1643,7 +1681,6 @@ CONFIG_FB_EFI=y
1643# CONFIG_FB_3DFX is not set 1681# CONFIG_FB_3DFX is not set
1644# CONFIG_FB_VOODOO1 is not set 1682# CONFIG_FB_VOODOO1 is not set
1645# CONFIG_FB_VT8623 is not set 1683# CONFIG_FB_VT8623 is not set
1646# CONFIG_FB_CYBLA is not set
1647# CONFIG_FB_TRIDENT is not set 1684# CONFIG_FB_TRIDENT is not set
1648# CONFIG_FB_ARK is not set 1685# CONFIG_FB_ARK is not set
1649# CONFIG_FB_PM3 is not set 1686# CONFIG_FB_PM3 is not set
@@ -1652,6 +1689,7 @@ CONFIG_FB_EFI=y
1652# CONFIG_FB_VIRTUAL is not set 1689# CONFIG_FB_VIRTUAL is not set
1653# CONFIG_FB_METRONOME is not set 1690# CONFIG_FB_METRONOME is not set
1654# CONFIG_FB_MB862XX is not set 1691# CONFIG_FB_MB862XX is not set
1692# CONFIG_FB_BROADSHEET is not set
1655CONFIG_BACKLIGHT_LCD_SUPPORT=y 1693CONFIG_BACKLIGHT_LCD_SUPPORT=y
1656# CONFIG_LCD_CLASS_DEVICE is not set 1694# CONFIG_LCD_CLASS_DEVICE is not set
1657CONFIG_BACKLIGHT_CLASS_DEVICE=y 1695CONFIG_BACKLIGHT_CLASS_DEVICE=y
@@ -1738,6 +1776,8 @@ CONFIG_SND_PCI=y
1738# CONFIG_SND_INDIGO is not set 1776# CONFIG_SND_INDIGO is not set
1739# CONFIG_SND_INDIGOIO is not set 1777# CONFIG_SND_INDIGOIO is not set
1740# CONFIG_SND_INDIGODJ is not set 1778# CONFIG_SND_INDIGODJ is not set
1779# CONFIG_SND_INDIGOIOX is not set
1780# CONFIG_SND_INDIGODJX is not set
1741# CONFIG_SND_EMU10K1 is not set 1781# CONFIG_SND_EMU10K1 is not set
1742# CONFIG_SND_EMU10K1X is not set 1782# CONFIG_SND_EMU10K1X is not set
1743# CONFIG_SND_ENS1370 is not set 1783# CONFIG_SND_ENS1370 is not set
@@ -1811,15 +1851,17 @@ CONFIG_USB_HIDDEV=y
1811# 1851#
1812# Special HID drivers 1852# Special HID drivers
1813# 1853#
1814CONFIG_HID_COMPAT=y
1815CONFIG_HID_A4TECH=y 1854CONFIG_HID_A4TECH=y
1816CONFIG_HID_APPLE=y 1855CONFIG_HID_APPLE=y
1817CONFIG_HID_BELKIN=y 1856CONFIG_HID_BELKIN=y
1818CONFIG_HID_CHERRY=y 1857CONFIG_HID_CHERRY=y
1819CONFIG_HID_CHICONY=y 1858CONFIG_HID_CHICONY=y
1820CONFIG_HID_CYPRESS=y 1859CONFIG_HID_CYPRESS=y
1860# CONFIG_DRAGONRISE_FF is not set
1821CONFIG_HID_EZKEY=y 1861CONFIG_HID_EZKEY=y
1862CONFIG_HID_KYE=y
1822CONFIG_HID_GYRATION=y 1863CONFIG_HID_GYRATION=y
1864CONFIG_HID_KENSINGTON=y
1823CONFIG_HID_LOGITECH=y 1865CONFIG_HID_LOGITECH=y
1824CONFIG_LOGITECH_FF=y 1866CONFIG_LOGITECH_FF=y
1825# CONFIG_LOGIRUMBLEPAD2_FF is not set 1867# CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1885,11 +1927,11 @@ CONFIG_USB_PRINTER=y
1885# CONFIG_USB_TMC is not set 1927# CONFIG_USB_TMC is not set
1886 1928
1887# 1929#
1888# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed; 1930# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
1889# 1931#
1890 1932
1891# 1933#
1892# see USB_STORAGE Help for more information 1934# also be needed; see USB_STORAGE Help for more info
1893# 1935#
1894CONFIG_USB_STORAGE=y 1936CONFIG_USB_STORAGE=y
1895# CONFIG_USB_STORAGE_DEBUG is not set 1937# CONFIG_USB_STORAGE_DEBUG is not set
@@ -1931,7 +1973,6 @@ CONFIG_USB_LIBUSUAL=y
1931# CONFIG_USB_LED is not set 1973# CONFIG_USB_LED is not set
1932# CONFIG_USB_CYPRESS_CY7C63 is not set 1974# CONFIG_USB_CYPRESS_CY7C63 is not set
1933# CONFIG_USB_CYTHERM is not set 1975# CONFIG_USB_CYTHERM is not set
1934# CONFIG_USB_PHIDGET is not set
1935# CONFIG_USB_IDMOUSE is not set 1976# CONFIG_USB_IDMOUSE is not set
1936# CONFIG_USB_FTDI_ELAN is not set 1977# CONFIG_USB_FTDI_ELAN is not set
1937# CONFIG_USB_APPLEDISPLAY is not set 1978# CONFIG_USB_APPLEDISPLAY is not set
@@ -1947,6 +1988,7 @@ CONFIG_USB_LIBUSUAL=y
1947# 1988#
1948# OTG and related infrastructure 1989# OTG and related infrastructure
1949# 1990#
1991# CONFIG_NOP_USB_XCEIV is not set
1950# CONFIG_UWB is not set 1992# CONFIG_UWB is not set
1951# CONFIG_MMC is not set 1993# CONFIG_MMC is not set
1952# CONFIG_MEMSTICK is not set 1994# CONFIG_MEMSTICK is not set
@@ -1958,8 +2000,10 @@ CONFIG_LEDS_CLASS=y
1958# 2000#
1959# CONFIG_LEDS_ALIX2 is not set 2001# CONFIG_LEDS_ALIX2 is not set
1960# CONFIG_LEDS_PCA9532 is not set 2002# CONFIG_LEDS_PCA9532 is not set
2003# CONFIG_LEDS_LP5521 is not set
1961# CONFIG_LEDS_CLEVO_MAIL is not set 2004# CONFIG_LEDS_CLEVO_MAIL is not set
1962# CONFIG_LEDS_PCA955X is not set 2005# CONFIG_LEDS_PCA955X is not set
2006# CONFIG_LEDS_BD2802 is not set
1963 2007
1964# 2008#
1965# LED Triggers 2009# LED Triggers
@@ -1969,6 +2013,10 @@ CONFIG_LEDS_TRIGGERS=y
1969# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set 2013# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
1970# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set 2014# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
1971# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set 2015# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
2016
2017#
2018# iptables trigger is under Netfilter config (LED target)
2019#
1972# CONFIG_ACCESSIBILITY is not set 2020# CONFIG_ACCESSIBILITY is not set
1973# CONFIG_INFINIBAND is not set 2021# CONFIG_INFINIBAND is not set
1974CONFIG_EDAC=y 2022CONFIG_EDAC=y
@@ -2037,6 +2085,7 @@ CONFIG_DMADEVICES=y
2037# DMA Devices 2085# DMA Devices
2038# 2086#
2039# CONFIG_INTEL_IOATDMA is not set 2087# CONFIG_INTEL_IOATDMA is not set
2088# CONFIG_AUXDISPLAY is not set
2040# CONFIG_UIO is not set 2089# CONFIG_UIO is not set
2041# CONFIG_STAGING is not set 2090# CONFIG_STAGING is not set
2042CONFIG_X86_PLATFORM_DEVICES=y 2091CONFIG_X86_PLATFORM_DEVICES=y
@@ -2071,6 +2120,7 @@ CONFIG_DMIID=y
2071# 2120#
2072# CONFIG_EXT2_FS is not set 2121# CONFIG_EXT2_FS is not set
2073CONFIG_EXT3_FS=y 2122CONFIG_EXT3_FS=y
2123# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
2074CONFIG_EXT3_FS_XATTR=y 2124CONFIG_EXT3_FS_XATTR=y
2075CONFIG_EXT3_FS_POSIX_ACL=y 2125CONFIG_EXT3_FS_POSIX_ACL=y
2076CONFIG_EXT3_FS_SECURITY=y 2126CONFIG_EXT3_FS_SECURITY=y
@@ -2101,6 +2151,11 @@ CONFIG_AUTOFS4_FS=y
2101CONFIG_GENERIC_ACL=y 2151CONFIG_GENERIC_ACL=y
2102 2152
2103# 2153#
2154# Caches
2155#
2156# CONFIG_FSCACHE is not set
2157
2158#
2104# CD-ROM/DVD Filesystems 2159# CD-ROM/DVD Filesystems
2105# 2160#
2106CONFIG_ISO9660_FS=y 2161CONFIG_ISO9660_FS=y
@@ -2151,6 +2206,7 @@ CONFIG_MISC_FILESYSTEMS=y
2151# CONFIG_ROMFS_FS is not set 2206# CONFIG_ROMFS_FS is not set
2152# CONFIG_SYSV_FS is not set 2207# CONFIG_SYSV_FS is not set
2153# CONFIG_UFS_FS is not set 2208# CONFIG_UFS_FS is not set
2209# CONFIG_NILFS2_FS is not set
2154CONFIG_NETWORK_FILESYSTEMS=y 2210CONFIG_NETWORK_FILESYSTEMS=y
2155CONFIG_NFS_FS=y 2211CONFIG_NFS_FS=y
2156CONFIG_NFS_V3=y 2212CONFIG_NFS_V3=y
@@ -2164,7 +2220,6 @@ CONFIG_NFS_ACL_SUPPORT=y
2164CONFIG_NFS_COMMON=y 2220CONFIG_NFS_COMMON=y
2165CONFIG_SUNRPC=y 2221CONFIG_SUNRPC=y
2166CONFIG_SUNRPC_GSS=y 2222CONFIG_SUNRPC_GSS=y
2167# CONFIG_SUNRPC_REGISTER_V4 is not set
2168CONFIG_RPCSEC_GSS_KRB5=y 2223CONFIG_RPCSEC_GSS_KRB5=y
2169# CONFIG_RPCSEC_GSS_SPKM3 is not set 2224# CONFIG_RPCSEC_GSS_SPKM3 is not set
2170# CONFIG_SMB_FS is not set 2225# CONFIG_SMB_FS is not set
@@ -2251,6 +2306,7 @@ CONFIG_DEBUG_FS=y
2251CONFIG_DEBUG_KERNEL=y 2306CONFIG_DEBUG_KERNEL=y
2252# CONFIG_DEBUG_SHIRQ is not set 2307# CONFIG_DEBUG_SHIRQ is not set
2253# CONFIG_DETECT_SOFTLOCKUP is not set 2308# CONFIG_DETECT_SOFTLOCKUP is not set
2309# CONFIG_DETECT_HUNG_TASK is not set
2254# CONFIG_SCHED_DEBUG is not set 2310# CONFIG_SCHED_DEBUG is not set
2255CONFIG_SCHEDSTATS=y 2311CONFIG_SCHEDSTATS=y
2256CONFIG_TIMER_STATS=y 2312CONFIG_TIMER_STATS=y
@@ -2266,6 +2322,7 @@ CONFIG_TIMER_STATS=y
2266# CONFIG_LOCK_STAT is not set 2322# CONFIG_LOCK_STAT is not set
2267# CONFIG_DEBUG_SPINLOCK_SLEEP is not set 2323# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
2268# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set 2324# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
2325CONFIG_STACKTRACE=y
2269# CONFIG_DEBUG_KOBJECT is not set 2326# CONFIG_DEBUG_KOBJECT is not set
2270# CONFIG_DEBUG_HIGHMEM is not set 2327# CONFIG_DEBUG_HIGHMEM is not set
2271CONFIG_DEBUG_BUGVERBOSE=y 2328CONFIG_DEBUG_BUGVERBOSE=y
@@ -2289,13 +2346,19 @@ CONFIG_FRAME_POINTER=y
2289# CONFIG_FAULT_INJECTION is not set 2346# CONFIG_FAULT_INJECTION is not set
2290# CONFIG_LATENCYTOP is not set 2347# CONFIG_LATENCYTOP is not set
2291CONFIG_SYSCTL_SYSCALL_CHECK=y 2348CONFIG_SYSCTL_SYSCALL_CHECK=y
2349# CONFIG_DEBUG_PAGEALLOC is not set
2292CONFIG_USER_STACKTRACE_SUPPORT=y 2350CONFIG_USER_STACKTRACE_SUPPORT=y
2351CONFIG_NOP_TRACER=y
2293CONFIG_HAVE_FUNCTION_TRACER=y 2352CONFIG_HAVE_FUNCTION_TRACER=y
2294CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y 2353CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
2295CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y 2354CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
2296CONFIG_HAVE_DYNAMIC_FTRACE=y 2355CONFIG_HAVE_DYNAMIC_FTRACE=y
2297CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y 2356CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
2298CONFIG_HAVE_HW_BRANCH_TRACER=y 2357CONFIG_HAVE_HW_BRANCH_TRACER=y
2358CONFIG_HAVE_FTRACE_SYSCALLS=y
2359CONFIG_RING_BUFFER=y
2360CONFIG_TRACING=y
2361CONFIG_TRACING_SUPPORT=y
2299 2362
2300# 2363#
2301# Tracers 2364# Tracers
@@ -2305,13 +2368,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y
2305# CONFIG_SYSPROF_TRACER is not set 2368# CONFIG_SYSPROF_TRACER is not set
2306# CONFIG_SCHED_TRACER is not set 2369# CONFIG_SCHED_TRACER is not set
2307# CONFIG_CONTEXT_SWITCH_TRACER is not set 2370# CONFIG_CONTEXT_SWITCH_TRACER is not set
2371# CONFIG_EVENT_TRACER is not set
2372# CONFIG_FTRACE_SYSCALLS is not set
2308# CONFIG_BOOT_TRACER is not set 2373# CONFIG_BOOT_TRACER is not set
2309# CONFIG_TRACE_BRANCH_PROFILING is not set 2374# CONFIG_TRACE_BRANCH_PROFILING is not set
2310# CONFIG_POWER_TRACER is not set 2375# CONFIG_POWER_TRACER is not set
2311# CONFIG_STACK_TRACER is not set 2376# CONFIG_STACK_TRACER is not set
2312# CONFIG_HW_BRANCH_TRACER is not set 2377# CONFIG_HW_BRANCH_TRACER is not set
2378# CONFIG_KMEMTRACE is not set
2379# CONFIG_WORKQUEUE_TRACER is not set
2380CONFIG_BLK_DEV_IO_TRACE=y
2381# CONFIG_FTRACE_STARTUP_TEST is not set
2382# CONFIG_MMIOTRACE is not set
2313CONFIG_PROVIDE_OHCI1394_DMA_INIT=y 2383CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2314# CONFIG_DYNAMIC_PRINTK_DEBUG is not set 2384# CONFIG_DYNAMIC_DEBUG is not set
2385# CONFIG_DMA_API_DEBUG is not set
2315# CONFIG_SAMPLES is not set 2386# CONFIG_SAMPLES is not set
2316CONFIG_HAVE_ARCH_KGDB=y 2387CONFIG_HAVE_ARCH_KGDB=y
2317# CONFIG_KGDB is not set 2388# CONFIG_KGDB is not set
@@ -2321,7 +2392,6 @@ CONFIG_EARLY_PRINTK=y
2321CONFIG_EARLY_PRINTK_DBGP=y 2392CONFIG_EARLY_PRINTK_DBGP=y
2322CONFIG_DEBUG_STACKOVERFLOW=y 2393CONFIG_DEBUG_STACKOVERFLOW=y
2323CONFIG_DEBUG_STACK_USAGE=y 2394CONFIG_DEBUG_STACK_USAGE=y
2324# CONFIG_DEBUG_PAGEALLOC is not set
2325# CONFIG_DEBUG_PER_CPU_MAPS is not set 2395# CONFIG_DEBUG_PER_CPU_MAPS is not set
2326# CONFIG_X86_PTDUMP is not set 2396# CONFIG_X86_PTDUMP is not set
2327CONFIG_DEBUG_RODATA=y 2397CONFIG_DEBUG_RODATA=y
@@ -2329,7 +2399,7 @@ CONFIG_DEBUG_RODATA=y
2329CONFIG_DEBUG_NX_TEST=m 2399CONFIG_DEBUG_NX_TEST=m
2330# CONFIG_4KSTACKS is not set 2400# CONFIG_4KSTACKS is not set
2331CONFIG_DOUBLEFAULT=y 2401CONFIG_DOUBLEFAULT=y
2332# CONFIG_MMIOTRACE is not set 2402CONFIG_HAVE_MMIOTRACE_SUPPORT=y
2333CONFIG_IO_DELAY_TYPE_0X80=0 2403CONFIG_IO_DELAY_TYPE_0X80=0
2334CONFIG_IO_DELAY_TYPE_0XED=1 2404CONFIG_IO_DELAY_TYPE_0XED=1
2335CONFIG_IO_DELAY_TYPE_UDELAY=2 2405CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2365,6 +2435,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y
2365CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 2435CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
2366# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set 2436# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
2367# CONFIG_SECURITY_SMACK is not set 2437# CONFIG_SECURITY_SMACK is not set
2438# CONFIG_SECURITY_TOMOYO is not set
2439# CONFIG_IMA is not set
2368CONFIG_CRYPTO=y 2440CONFIG_CRYPTO=y
2369 2441
2370# 2442#
@@ -2380,10 +2452,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y
2380CONFIG_CRYPTO_HASH=y 2452CONFIG_CRYPTO_HASH=y
2381CONFIG_CRYPTO_HASH2=y 2453CONFIG_CRYPTO_HASH2=y
2382CONFIG_CRYPTO_RNG2=y 2454CONFIG_CRYPTO_RNG2=y
2455CONFIG_CRYPTO_PCOMP=y
2383CONFIG_CRYPTO_MANAGER=y 2456CONFIG_CRYPTO_MANAGER=y
2384CONFIG_CRYPTO_MANAGER2=y 2457CONFIG_CRYPTO_MANAGER2=y
2385# CONFIG_CRYPTO_GF128MUL is not set 2458# CONFIG_CRYPTO_GF128MUL is not set
2386# CONFIG_CRYPTO_NULL is not set 2459# CONFIG_CRYPTO_NULL is not set
2460CONFIG_CRYPTO_WORKQUEUE=y
2387# CONFIG_CRYPTO_CRYPTD is not set 2461# CONFIG_CRYPTO_CRYPTD is not set
2388CONFIG_CRYPTO_AUTHENC=y 2462CONFIG_CRYPTO_AUTHENC=y
2389# CONFIG_CRYPTO_TEST is not set 2463# CONFIG_CRYPTO_TEST is not set
@@ -2456,6 +2530,7 @@ CONFIG_CRYPTO_DES=y
2456# Compression 2530# Compression
2457# 2531#
2458# CONFIG_CRYPTO_DEFLATE is not set 2532# CONFIG_CRYPTO_DEFLATE is not set
2533# CONFIG_CRYPTO_ZLIB is not set
2459# CONFIG_CRYPTO_LZO is not set 2534# CONFIG_CRYPTO_LZO is not set
2460 2535
2461# 2536#
@@ -2467,11 +2542,13 @@ CONFIG_CRYPTO_HW=y
2467# CONFIG_CRYPTO_DEV_GEODE is not set 2542# CONFIG_CRYPTO_DEV_GEODE is not set
2468# CONFIG_CRYPTO_DEV_HIFN_795X is not set 2543# CONFIG_CRYPTO_DEV_HIFN_795X is not set
2469CONFIG_HAVE_KVM=y 2544CONFIG_HAVE_KVM=y
2545CONFIG_HAVE_KVM_IRQCHIP=y
2470CONFIG_VIRTUALIZATION=y 2546CONFIG_VIRTUALIZATION=y
2471# CONFIG_KVM is not set 2547# CONFIG_KVM is not set
2472# CONFIG_LGUEST is not set 2548# CONFIG_LGUEST is not set
2473# CONFIG_VIRTIO_PCI is not set 2549# CONFIG_VIRTIO_PCI is not set
2474# CONFIG_VIRTIO_BALLOON is not set 2550# CONFIG_VIRTIO_BALLOON is not set
2551CONFIG_BINARY_PRINTF=y
2475 2552
2476# 2553#
2477# Library routines 2554# Library routines
@@ -2489,7 +2566,10 @@ CONFIG_CRC32=y
2489# CONFIG_LIBCRC32C is not set 2566# CONFIG_LIBCRC32C is not set
2490CONFIG_AUDIT_GENERIC=y 2567CONFIG_AUDIT_GENERIC=y
2491CONFIG_ZLIB_INFLATE=y 2568CONFIG_ZLIB_INFLATE=y
2492CONFIG_PLIST=y 2569CONFIG_DECOMPRESS_GZIP=y
2570CONFIG_DECOMPRESS_BZIP2=y
2571CONFIG_DECOMPRESS_LZMA=y
2493CONFIG_HAS_IOMEM=y 2572CONFIG_HAS_IOMEM=y
2494CONFIG_HAS_IOPORT=y 2573CONFIG_HAS_IOPORT=y
2495CONFIG_HAS_DMA=y 2574CONFIG_HAS_DMA=y
2575CONFIG_NLATTR=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 9fe5d212ab4c..cee1dd2e69b2 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,12 +1,13 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.29-rc4 3# Linux kernel version: 2.6.30-rc2
4# Tue Feb 24 15:44:16 2009 4# Mon May 11 16:22:00 2009
5# 5#
6CONFIG_64BIT=y 6CONFIG_64BIT=y
7# CONFIG_X86_32 is not set 7# CONFIG_X86_32 is not set
8CONFIG_X86_64=y 8CONFIG_X86_64=y
9CONFIG_X86=y 9CONFIG_X86=y
10CONFIG_OUTPUT_FORMAT="elf64-x86-64"
10CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig" 11CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig"
11CONFIG_GENERIC_TIME=y 12CONFIG_GENERIC_TIME=y
12CONFIG_GENERIC_CMOS_UPDATE=y 13CONFIG_GENERIC_CMOS_UPDATE=y
@@ -34,6 +35,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y
34CONFIG_ARCH_HAS_DEFAULT_IDLE=y 35CONFIG_ARCH_HAS_DEFAULT_IDLE=y
35CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y 36CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
36CONFIG_HAVE_SETUP_PER_CPU_AREA=y 37CONFIG_HAVE_SETUP_PER_CPU_AREA=y
38CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y
37CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y 39CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y
38CONFIG_ARCH_HIBERNATION_POSSIBLE=y 40CONFIG_ARCH_HIBERNATION_POSSIBLE=y
39CONFIG_ARCH_SUSPEND_POSSIBLE=y 41CONFIG_ARCH_SUSPEND_POSSIBLE=y
@@ -41,14 +43,14 @@ CONFIG_ZONE_DMA32=y
41CONFIG_ARCH_POPULATES_NODE_MAP=y 43CONFIG_ARCH_POPULATES_NODE_MAP=y
42CONFIG_AUDIT_ARCH=y 44CONFIG_AUDIT_ARCH=y
43CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y 45CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
46CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
44CONFIG_GENERIC_HARDIRQS=y 47CONFIG_GENERIC_HARDIRQS=y
48CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
45CONFIG_GENERIC_IRQ_PROBE=y 49CONFIG_GENERIC_IRQ_PROBE=y
46CONFIG_GENERIC_PENDING_IRQ=y 50CONFIG_GENERIC_PENDING_IRQ=y
47CONFIG_X86_SMP=y
48CONFIG_USE_GENERIC_SMP_HELPERS=y 51CONFIG_USE_GENERIC_SMP_HELPERS=y
49CONFIG_X86_64_SMP=y 52CONFIG_X86_64_SMP=y
50CONFIG_X86_HT=y 53CONFIG_X86_HT=y
51CONFIG_X86_BIOS_REBOOT=y
52CONFIG_X86_TRAMPOLINE=y 54CONFIG_X86_TRAMPOLINE=y
53# CONFIG_KTIME_SCALAR is not set 55# CONFIG_KTIME_SCALAR is not set
54CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" 56CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
@@ -61,10 +63,17 @@ CONFIG_LOCK_KERNEL=y
61CONFIG_INIT_ENV_ARG_LIMIT=32 63CONFIG_INIT_ENV_ARG_LIMIT=32
62CONFIG_LOCALVERSION="" 64CONFIG_LOCALVERSION=""
63# CONFIG_LOCALVERSION_AUTO is not set 65# CONFIG_LOCALVERSION_AUTO is not set
66CONFIG_HAVE_KERNEL_GZIP=y
67CONFIG_HAVE_KERNEL_BZIP2=y
68CONFIG_HAVE_KERNEL_LZMA=y
69CONFIG_KERNEL_GZIP=y
70# CONFIG_KERNEL_BZIP2 is not set
71# CONFIG_KERNEL_LZMA is not set
64CONFIG_SWAP=y 72CONFIG_SWAP=y
65CONFIG_SYSVIPC=y 73CONFIG_SYSVIPC=y
66CONFIG_SYSVIPC_SYSCTL=y 74CONFIG_SYSVIPC_SYSCTL=y
67CONFIG_POSIX_MQUEUE=y 75CONFIG_POSIX_MQUEUE=y
76CONFIG_POSIX_MQUEUE_SYSCTL=y
68CONFIG_BSD_PROCESS_ACCT=y 77CONFIG_BSD_PROCESS_ACCT=y
69# CONFIG_BSD_PROCESS_ACCT_V3 is not set 78# CONFIG_BSD_PROCESS_ACCT_V3 is not set
70CONFIG_TASKSTATS=y 79CONFIG_TASKSTATS=y
@@ -114,23 +123,26 @@ CONFIG_PID_NS=y
114CONFIG_NET_NS=y 123CONFIG_NET_NS=y
115CONFIG_BLK_DEV_INITRD=y 124CONFIG_BLK_DEV_INITRD=y
116CONFIG_INITRAMFS_SOURCE="" 125CONFIG_INITRAMFS_SOURCE=""
126CONFIG_RD_GZIP=y
127CONFIG_RD_BZIP2=y
128CONFIG_RD_LZMA=y
117CONFIG_CC_OPTIMIZE_FOR_SIZE=y 129CONFIG_CC_OPTIMIZE_FOR_SIZE=y
118CONFIG_SYSCTL=y 130CONFIG_SYSCTL=y
131CONFIG_ANON_INODES=y
119# CONFIG_EMBEDDED is not set 132# CONFIG_EMBEDDED is not set
120CONFIG_UID16=y 133CONFIG_UID16=y
121CONFIG_SYSCTL_SYSCALL=y 134CONFIG_SYSCTL_SYSCALL=y
122CONFIG_KALLSYMS=y 135CONFIG_KALLSYMS=y
123CONFIG_KALLSYMS_ALL=y 136CONFIG_KALLSYMS_ALL=y
124CONFIG_KALLSYMS_EXTRA_PASS=y 137CONFIG_KALLSYMS_EXTRA_PASS=y
138# CONFIG_STRIP_ASM_SYMS is not set
125CONFIG_HOTPLUG=y 139CONFIG_HOTPLUG=y
126CONFIG_PRINTK=y 140CONFIG_PRINTK=y
127CONFIG_BUG=y 141CONFIG_BUG=y
128CONFIG_ELF_CORE=y 142CONFIG_ELF_CORE=y
129CONFIG_PCSPKR_PLATFORM=y 143CONFIG_PCSPKR_PLATFORM=y
130# CONFIG_COMPAT_BRK is not set
131CONFIG_BASE_FULL=y 144CONFIG_BASE_FULL=y
132CONFIG_FUTEX=y 145CONFIG_FUTEX=y
133CONFIG_ANON_INODES=y
134CONFIG_EPOLL=y 146CONFIG_EPOLL=y
135CONFIG_SIGNALFD=y 147CONFIG_SIGNALFD=y
136CONFIG_TIMERFD=y 148CONFIG_TIMERFD=y
@@ -140,6 +152,7 @@ CONFIG_AIO=y
140CONFIG_VM_EVENT_COUNTERS=y 152CONFIG_VM_EVENT_COUNTERS=y
141CONFIG_PCI_QUIRKS=y 153CONFIG_PCI_QUIRKS=y
142CONFIG_SLUB_DEBUG=y 154CONFIG_SLUB_DEBUG=y
155# CONFIG_COMPAT_BRK is not set
143# CONFIG_SLAB is not set 156# CONFIG_SLAB is not set
144CONFIG_SLUB=y 157CONFIG_SLUB=y
145# CONFIG_SLOB is not set 158# CONFIG_SLOB is not set
@@ -155,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y
155CONFIG_HAVE_KPROBES=y 168CONFIG_HAVE_KPROBES=y
156CONFIG_HAVE_KRETPROBES=y 169CONFIG_HAVE_KRETPROBES=y
157CONFIG_HAVE_ARCH_TRACEHOOK=y 170CONFIG_HAVE_ARCH_TRACEHOOK=y
171CONFIG_HAVE_DMA_API_DEBUG=y
172# CONFIG_SLOW_WORK is not set
158# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set 173# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
159CONFIG_SLABINFO=y 174CONFIG_SLABINFO=y
160CONFIG_RT_MUTEXES=y 175CONFIG_RT_MUTEXES=y
@@ -167,7 +182,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
167# CONFIG_MODULE_SRCVERSION_ALL is not set 182# CONFIG_MODULE_SRCVERSION_ALL is not set
168CONFIG_STOP_MACHINE=y 183CONFIG_STOP_MACHINE=y
169CONFIG_BLOCK=y 184CONFIG_BLOCK=y
170CONFIG_BLK_DEV_IO_TRACE=y
171CONFIG_BLK_DEV_BSG=y 185CONFIG_BLK_DEV_BSG=y
172# CONFIG_BLK_DEV_INTEGRITY is not set 186# CONFIG_BLK_DEV_INTEGRITY is not set
173CONFIG_BLOCK_COMPAT=y 187CONFIG_BLOCK_COMPAT=y
@@ -195,12 +209,10 @@ CONFIG_HIGH_RES_TIMERS=y
195CONFIG_GENERIC_CLOCKEVENTS_BUILD=y 209CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
196CONFIG_SMP=y 210CONFIG_SMP=y
197CONFIG_SPARSE_IRQ=y 211CONFIG_SPARSE_IRQ=y
198# CONFIG_NUMA_MIGRATE_IRQ_DESC is not set
199CONFIG_X86_FIND_SMP_CONFIG=y
200CONFIG_X86_MPPARSE=y 212CONFIG_X86_MPPARSE=y
201# CONFIG_X86_ELAN is not set 213CONFIG_X86_EXTENDED_PLATFORM=y
202# CONFIG_X86_GENERICARCH is not set
203# CONFIG_X86_VSMP is not set 214# CONFIG_X86_VSMP is not set
215# CONFIG_X86_UV is not set
204CONFIG_SCHED_OMIT_FRAME_POINTER=y 216CONFIG_SCHED_OMIT_FRAME_POINTER=y
205# CONFIG_PARAVIRT_GUEST is not set 217# CONFIG_PARAVIRT_GUEST is not set
206# CONFIG_MEMTEST is not set 218# CONFIG_MEMTEST is not set
@@ -230,10 +242,10 @@ CONFIG_SCHED_OMIT_FRAME_POINTER=y
230# CONFIG_MCORE2 is not set 242# CONFIG_MCORE2 is not set
231CONFIG_GENERIC_CPU=y 243CONFIG_GENERIC_CPU=y
232CONFIG_X86_CPU=y 244CONFIG_X86_CPU=y
233CONFIG_X86_L1_CACHE_BYTES=128 245CONFIG_X86_L1_CACHE_BYTES=64
234CONFIG_X86_INTERNODE_CACHE_BYTES=128 246CONFIG_X86_INTERNODE_CACHE_BYTES=64
235CONFIG_X86_CMPXCHG=y 247CONFIG_X86_CMPXCHG=y
236CONFIG_X86_L1_CACHE_SHIFT=7 248CONFIG_X86_L1_CACHE_SHIFT=6
237CONFIG_X86_WP_WORKS_OK=y 249CONFIG_X86_WP_WORKS_OK=y
238CONFIG_X86_TSC=y 250CONFIG_X86_TSC=y
239CONFIG_X86_CMPXCHG64=y 251CONFIG_X86_CMPXCHG64=y
@@ -242,7 +254,7 @@ CONFIG_X86_MINIMUM_CPU_FAMILY=64
242CONFIG_X86_DEBUGCTLMSR=y 254CONFIG_X86_DEBUGCTLMSR=y
243CONFIG_CPU_SUP_INTEL=y 255CONFIG_CPU_SUP_INTEL=y
244CONFIG_CPU_SUP_AMD=y 256CONFIG_CPU_SUP_AMD=y
245CONFIG_CPU_SUP_CENTAUR_64=y 257CONFIG_CPU_SUP_CENTAUR=y
246CONFIG_X86_DS=y 258CONFIG_X86_DS=y
247CONFIG_X86_PTRACE_BTS=y 259CONFIG_X86_PTRACE_BTS=y
248CONFIG_HPET_TIMER=y 260CONFIG_HPET_TIMER=y
@@ -269,6 +281,7 @@ CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
269CONFIG_X86_MCE=y 281CONFIG_X86_MCE=y
270CONFIG_X86_MCE_INTEL=y 282CONFIG_X86_MCE_INTEL=y
271CONFIG_X86_MCE_AMD=y 283CONFIG_X86_MCE_AMD=y
284CONFIG_X86_MCE_THRESHOLD=y
272# CONFIG_I8K is not set 285# CONFIG_I8K is not set
273CONFIG_MICROCODE=y 286CONFIG_MICROCODE=y
274CONFIG_MICROCODE_INTEL=y 287CONFIG_MICROCODE_INTEL=y
@@ -276,6 +289,7 @@ CONFIG_MICROCODE_AMD=y
276CONFIG_MICROCODE_OLD_INTERFACE=y 289CONFIG_MICROCODE_OLD_INTERFACE=y
277CONFIG_X86_MSR=y 290CONFIG_X86_MSR=y
278CONFIG_X86_CPUID=y 291CONFIG_X86_CPUID=y
292# CONFIG_X86_CPU_DEBUG is not set
279CONFIG_ARCH_PHYS_ADDR_T_64BIT=y 293CONFIG_ARCH_PHYS_ADDR_T_64BIT=y
280CONFIG_DIRECT_GBPAGES=y 294CONFIG_DIRECT_GBPAGES=y
281CONFIG_NUMA=y 295CONFIG_NUMA=y
@@ -309,6 +323,8 @@ CONFIG_ZONE_DMA_FLAG=1
309CONFIG_BOUNCE=y 323CONFIG_BOUNCE=y
310CONFIG_VIRT_TO_BUS=y 324CONFIG_VIRT_TO_BUS=y
311CONFIG_UNEVICTABLE_LRU=y 325CONFIG_UNEVICTABLE_LRU=y
326CONFIG_HAVE_MLOCK=y
327CONFIG_HAVE_MLOCKED_PAGE_BIT=y
312CONFIG_X86_CHECK_BIOS_CORRUPTION=y 328CONFIG_X86_CHECK_BIOS_CORRUPTION=y
313CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y 329CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
314CONFIG_X86_RESERVE_LOW_64K=y 330CONFIG_X86_RESERVE_LOW_64K=y
@@ -317,6 +333,7 @@ CONFIG_MTRR=y
317CONFIG_X86_PAT=y 333CONFIG_X86_PAT=y
318CONFIG_EFI=y 334CONFIG_EFI=y
319CONFIG_SECCOMP=y 335CONFIG_SECCOMP=y
336# CONFIG_CC_STACKPROTECTOR is not set
320# CONFIG_HZ_100 is not set 337# CONFIG_HZ_100 is not set
321# CONFIG_HZ_250 is not set 338# CONFIG_HZ_250 is not set
322# CONFIG_HZ_300 is not set 339# CONFIG_HZ_300 is not set
@@ -325,9 +342,10 @@ CONFIG_HZ=1000
325CONFIG_SCHED_HRTICK=y 342CONFIG_SCHED_HRTICK=y
326CONFIG_KEXEC=y 343CONFIG_KEXEC=y
327CONFIG_CRASH_DUMP=y 344CONFIG_CRASH_DUMP=y
345# CONFIG_KEXEC_JUMP is not set
328CONFIG_PHYSICAL_START=0x1000000 346CONFIG_PHYSICAL_START=0x1000000
329# CONFIG_RELOCATABLE is not set 347CONFIG_RELOCATABLE=y
330CONFIG_PHYSICAL_ALIGN=0x200000 348CONFIG_PHYSICAL_ALIGN=0x1000000
331CONFIG_HOTPLUG_CPU=y 349CONFIG_HOTPLUG_CPU=y
332# CONFIG_COMPAT_VDSO is not set 350# CONFIG_COMPAT_VDSO is not set
333# CONFIG_CMDLINE_BOOL is not set 351# CONFIG_CMDLINE_BOOL is not set
@@ -370,7 +388,6 @@ CONFIG_ACPI_NUMA=y
370CONFIG_ACPI_BLACKLIST_YEAR=0 388CONFIG_ACPI_BLACKLIST_YEAR=0
371# CONFIG_ACPI_DEBUG is not set 389# CONFIG_ACPI_DEBUG is not set
372# CONFIG_ACPI_PCI_SLOT is not set 390# CONFIG_ACPI_PCI_SLOT is not set
373CONFIG_ACPI_SYSTEM=y
374CONFIG_X86_PM_TIMER=y 391CONFIG_X86_PM_TIMER=y
375CONFIG_ACPI_CONTAINER=y 392CONFIG_ACPI_CONTAINER=y
376# CONFIG_ACPI_SBS is not set 393# CONFIG_ACPI_SBS is not set
@@ -436,6 +453,7 @@ CONFIG_PCI_MSI=y
436# CONFIG_PCI_DEBUG is not set 453# CONFIG_PCI_DEBUG is not set
437# CONFIG_PCI_STUB is not set 454# CONFIG_PCI_STUB is not set
438CONFIG_HT_IRQ=y 455CONFIG_HT_IRQ=y
456# CONFIG_PCI_IOV is not set
439CONFIG_ISA_DMA_API=y 457CONFIG_ISA_DMA_API=y
440CONFIG_K8_NB=y 458CONFIG_K8_NB=y
441CONFIG_PCCARD=y 459CONFIG_PCCARD=y
@@ -481,7 +499,6 @@ CONFIG_NET=y
481# 499#
482# Networking options 500# Networking options
483# 501#
484CONFIG_COMPAT_NET_DEV_OPS=y
485CONFIG_PACKET=y 502CONFIG_PACKET=y
486CONFIG_PACKET_MMAP=y 503CONFIG_PACKET_MMAP=y
487CONFIG_UNIX=y 504CONFIG_UNIX=y
@@ -639,6 +656,7 @@ CONFIG_LLC=y
639# CONFIG_LAPB is not set 656# CONFIG_LAPB is not set
640# CONFIG_ECONET is not set 657# CONFIG_ECONET is not set
641# CONFIG_WAN_ROUTER is not set 658# CONFIG_WAN_ROUTER is not set
659# CONFIG_PHONET is not set
642CONFIG_NET_SCHED=y 660CONFIG_NET_SCHED=y
643 661
644# 662#
@@ -696,6 +714,7 @@ CONFIG_NET_SCH_FIFO=y
696# 714#
697# CONFIG_NET_PKTGEN is not set 715# CONFIG_NET_PKTGEN is not set
698# CONFIG_NET_TCPPROBE is not set 716# CONFIG_NET_TCPPROBE is not set
717# CONFIG_NET_DROP_MONITOR is not set
699CONFIG_HAMRADIO=y 718CONFIG_HAMRADIO=y
700 719
701# 720#
@@ -706,12 +725,10 @@ CONFIG_HAMRADIO=y
706# CONFIG_IRDA is not set 725# CONFIG_IRDA is not set
707# CONFIG_BT is not set 726# CONFIG_BT is not set
708# CONFIG_AF_RXRPC is not set 727# CONFIG_AF_RXRPC is not set
709# CONFIG_PHONET is not set
710CONFIG_FIB_RULES=y 728CONFIG_FIB_RULES=y
711CONFIG_WIRELESS=y 729CONFIG_WIRELESS=y
712CONFIG_CFG80211=y 730CONFIG_CFG80211=y
713# CONFIG_CFG80211_REG_DEBUG is not set 731# CONFIG_CFG80211_REG_DEBUG is not set
714CONFIG_NL80211=y
715CONFIG_WIRELESS_OLD_REGULATORY=y 732CONFIG_WIRELESS_OLD_REGULATORY=y
716CONFIG_WIRELESS_EXT=y 733CONFIG_WIRELESS_EXT=y
717CONFIG_WIRELESS_EXT_SYSFS=y 734CONFIG_WIRELESS_EXT_SYSFS=y
@@ -788,9 +805,8 @@ CONFIG_MISC_DEVICES=y
788# CONFIG_TIFM_CORE is not set 805# CONFIG_TIFM_CORE is not set
789# CONFIG_ICS932S401 is not set 806# CONFIG_ICS932S401 is not set
790# CONFIG_ENCLOSURE_SERVICES is not set 807# CONFIG_ENCLOSURE_SERVICES is not set
791# CONFIG_SGI_XP is not set
792# CONFIG_HP_ILO is not set 808# CONFIG_HP_ILO is not set
793# CONFIG_SGI_GRU is not set 809# CONFIG_ISL29003 is not set
794# CONFIG_C2PORT is not set 810# CONFIG_C2PORT is not set
795 811
796# 812#
@@ -844,6 +860,7 @@ CONFIG_SCSI_SPI_ATTRS=y
844# CONFIG_SCSI_LOWLEVEL is not set 860# CONFIG_SCSI_LOWLEVEL is not set
845# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set 861# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
846# CONFIG_SCSI_DH is not set 862# CONFIG_SCSI_DH is not set
863# CONFIG_SCSI_OSD_INITIATOR is not set
847CONFIG_ATA=y 864CONFIG_ATA=y
848# CONFIG_ATA_NONSTANDARD is not set 865# CONFIG_ATA_NONSTANDARD is not set
849CONFIG_ATA_ACPI=y 866CONFIG_ATA_ACPI=y
@@ -940,6 +957,7 @@ CONFIG_DM_ZERO=y
940CONFIG_MACINTOSH_DRIVERS=y 957CONFIG_MACINTOSH_DRIVERS=y
941CONFIG_MAC_EMUMOUSEBTN=y 958CONFIG_MAC_EMUMOUSEBTN=y
942CONFIG_NETDEVICES=y 959CONFIG_NETDEVICES=y
960CONFIG_COMPAT_NET_DEV_OPS=y
943# CONFIG_IFB is not set 961# CONFIG_IFB is not set
944# CONFIG_DUMMY is not set 962# CONFIG_DUMMY is not set
945# CONFIG_BONDING is not set 963# CONFIG_BONDING is not set
@@ -977,6 +995,8 @@ CONFIG_MII=y
977CONFIG_NET_VENDOR_3COM=y 995CONFIG_NET_VENDOR_3COM=y
978# CONFIG_VORTEX is not set 996# CONFIG_VORTEX is not set
979# CONFIG_TYPHOON is not set 997# CONFIG_TYPHOON is not set
998# CONFIG_ETHOC is not set
999# CONFIG_DNET is not set
980CONFIG_NET_TULIP=y 1000CONFIG_NET_TULIP=y
981# CONFIG_DE2104X is not set 1001# CONFIG_DE2104X is not set
982# CONFIG_TULIP is not set 1002# CONFIG_TULIP is not set
@@ -1026,6 +1046,7 @@ CONFIG_E1000=y
1026# CONFIG_E1000E is not set 1046# CONFIG_E1000E is not set
1027# CONFIG_IP1000 is not set 1047# CONFIG_IP1000 is not set
1028# CONFIG_IGB is not set 1048# CONFIG_IGB is not set
1049# CONFIG_IGBVF is not set
1029# CONFIG_NS83820 is not set 1050# CONFIG_NS83820 is not set
1030# CONFIG_HAMACHI is not set 1051# CONFIG_HAMACHI is not set
1031# CONFIG_YELLOWFIN is not set 1052# CONFIG_YELLOWFIN is not set
@@ -1040,6 +1061,7 @@ CONFIG_TIGON3=y
1040# CONFIG_QLA3XXX is not set 1061# CONFIG_QLA3XXX is not set
1041# CONFIG_ATL1 is not set 1062# CONFIG_ATL1 is not set
1042# CONFIG_ATL1E is not set 1063# CONFIG_ATL1E is not set
1064# CONFIG_ATL1C is not set
1043# CONFIG_JME is not set 1065# CONFIG_JME is not set
1044CONFIG_NETDEV_10000=y 1066CONFIG_NETDEV_10000=y
1045# CONFIG_CHELSIO_T1 is not set 1067# CONFIG_CHELSIO_T1 is not set
@@ -1049,6 +1071,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
1049# CONFIG_IXGBE is not set 1071# CONFIG_IXGBE is not set
1050# CONFIG_IXGB is not set 1072# CONFIG_IXGB is not set
1051# CONFIG_S2IO is not set 1073# CONFIG_S2IO is not set
1074# CONFIG_VXGE is not set
1052# CONFIG_MYRI10GE is not set 1075# CONFIG_MYRI10GE is not set
1053# CONFIG_NETXEN_NIC is not set 1076# CONFIG_NETXEN_NIC is not set
1054# CONFIG_NIU is not set 1077# CONFIG_NIU is not set
@@ -1058,6 +1081,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
1058# CONFIG_BNX2X is not set 1081# CONFIG_BNX2X is not set
1059# CONFIG_QLGE is not set 1082# CONFIG_QLGE is not set
1060# CONFIG_SFC is not set 1083# CONFIG_SFC is not set
1084# CONFIG_BE2NET is not set
1061CONFIG_TR=y 1085CONFIG_TR=y
1062# CONFIG_IBMOL is not set 1086# CONFIG_IBMOL is not set
1063# CONFIG_3C359 is not set 1087# CONFIG_3C359 is not set
@@ -1072,8 +1096,8 @@ CONFIG_WLAN_80211=y
1072# CONFIG_LIBERTAS is not set 1096# CONFIG_LIBERTAS is not set
1073# CONFIG_LIBERTAS_THINFIRM is not set 1097# CONFIG_LIBERTAS_THINFIRM is not set
1074# CONFIG_AIRO is not set 1098# CONFIG_AIRO is not set
1075# CONFIG_HERMES is not set
1076# CONFIG_ATMEL is not set 1099# CONFIG_ATMEL is not set
1100# CONFIG_AT76C50X_USB is not set
1077# CONFIG_AIRO_CS is not set 1101# CONFIG_AIRO_CS is not set
1078# CONFIG_PCMCIA_WL3501 is not set 1102# CONFIG_PCMCIA_WL3501 is not set
1079# CONFIG_PRISM54 is not set 1103# CONFIG_PRISM54 is not set
@@ -1083,21 +1107,21 @@ CONFIG_WLAN_80211=y
1083# CONFIG_RTL8187 is not set 1107# CONFIG_RTL8187 is not set
1084# CONFIG_ADM8211 is not set 1108# CONFIG_ADM8211 is not set
1085# CONFIG_MAC80211_HWSIM is not set 1109# CONFIG_MAC80211_HWSIM is not set
1110# CONFIG_MWL8K is not set
1086# CONFIG_P54_COMMON is not set 1111# CONFIG_P54_COMMON is not set
1087CONFIG_ATH5K=y 1112CONFIG_ATH5K=y
1088# CONFIG_ATH5K_DEBUG is not set 1113# CONFIG_ATH5K_DEBUG is not set
1089# CONFIG_ATH9K is not set 1114# CONFIG_ATH9K is not set
1115# CONFIG_AR9170_USB is not set
1090# CONFIG_IPW2100 is not set 1116# CONFIG_IPW2100 is not set
1091# CONFIG_IPW2200 is not set 1117# CONFIG_IPW2200 is not set
1092# CONFIG_IWLCORE is not set 1118# CONFIG_IWLWIFI is not set
1093# CONFIG_IWLWIFI_LEDS is not set
1094# CONFIG_IWLAGN is not set
1095# CONFIG_IWL3945 is not set
1096# CONFIG_HOSTAP is not set 1119# CONFIG_HOSTAP is not set
1097# CONFIG_B43 is not set 1120# CONFIG_B43 is not set
1098# CONFIG_B43LEGACY is not set 1121# CONFIG_B43LEGACY is not set
1099# CONFIG_ZD1211RW is not set 1122# CONFIG_ZD1211RW is not set
1100# CONFIG_RT2X00 is not set 1123# CONFIG_RT2X00 is not set
1124# CONFIG_HERMES is not set
1101 1125
1102# 1126#
1103# Enable WiMAX (Networking options) to see the WiMAX drivers 1127# Enable WiMAX (Networking options) to see the WiMAX drivers
@@ -1208,6 +1232,8 @@ CONFIG_INPUT_TABLET=y
1208# CONFIG_TABLET_USB_KBTAB is not set 1232# CONFIG_TABLET_USB_KBTAB is not set
1209# CONFIG_TABLET_USB_WACOM is not set 1233# CONFIG_TABLET_USB_WACOM is not set
1210CONFIG_INPUT_TOUCHSCREEN=y 1234CONFIG_INPUT_TOUCHSCREEN=y
1235# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
1236# CONFIG_TOUCHSCREEN_AD7879 is not set
1211# CONFIG_TOUCHSCREEN_FUJITSU is not set 1237# CONFIG_TOUCHSCREEN_FUJITSU is not set
1212# CONFIG_TOUCHSCREEN_GUNZE is not set 1238# CONFIG_TOUCHSCREEN_GUNZE is not set
1213# CONFIG_TOUCHSCREEN_ELO is not set 1239# CONFIG_TOUCHSCREEN_ELO is not set
@@ -1301,6 +1327,7 @@ CONFIG_UNIX98_PTYS=y
1301# CONFIG_LEGACY_PTYS is not set 1327# CONFIG_LEGACY_PTYS is not set
1302# CONFIG_IPMI_HANDLER is not set 1328# CONFIG_IPMI_HANDLER is not set
1303CONFIG_HW_RANDOM=y 1329CONFIG_HW_RANDOM=y
1330# CONFIG_HW_RANDOM_TIMERIOMEM is not set
1304# CONFIG_HW_RANDOM_INTEL is not set 1331# CONFIG_HW_RANDOM_INTEL is not set
1305# CONFIG_HW_RANDOM_AMD is not set 1332# CONFIG_HW_RANDOM_AMD is not set
1306CONFIG_NVRAM=y 1333CONFIG_NVRAM=y
@@ -1382,7 +1409,6 @@ CONFIG_I2C_I801=y
1382# CONFIG_SENSORS_PCF8574 is not set 1409# CONFIG_SENSORS_PCF8574 is not set
1383# CONFIG_PCF8575 is not set 1410# CONFIG_PCF8575 is not set
1384# CONFIG_SENSORS_PCA9539 is not set 1411# CONFIG_SENSORS_PCA9539 is not set
1385# CONFIG_SENSORS_PCF8591 is not set
1386# CONFIG_SENSORS_MAX6875 is not set 1412# CONFIG_SENSORS_MAX6875 is not set
1387# CONFIG_SENSORS_TSL2550 is not set 1413# CONFIG_SENSORS_TSL2550 is not set
1388# CONFIG_I2C_DEBUG_CORE is not set 1414# CONFIG_I2C_DEBUG_CORE is not set
@@ -1416,6 +1442,7 @@ CONFIG_HWMON=y
1416# CONFIG_SENSORS_ADT7475 is not set 1442# CONFIG_SENSORS_ADT7475 is not set
1417# CONFIG_SENSORS_K8TEMP is not set 1443# CONFIG_SENSORS_K8TEMP is not set
1418# CONFIG_SENSORS_ASB100 is not set 1444# CONFIG_SENSORS_ASB100 is not set
1445# CONFIG_SENSORS_ATK0110 is not set
1419# CONFIG_SENSORS_ATXP1 is not set 1446# CONFIG_SENSORS_ATXP1 is not set
1420# CONFIG_SENSORS_DS1621 is not set 1447# CONFIG_SENSORS_DS1621 is not set
1421# CONFIG_SENSORS_I5K_AMB is not set 1448# CONFIG_SENSORS_I5K_AMB is not set
@@ -1425,6 +1452,7 @@ CONFIG_HWMON=y
1425# CONFIG_SENSORS_FSCHER is not set 1452# CONFIG_SENSORS_FSCHER is not set
1426# CONFIG_SENSORS_FSCPOS is not set 1453# CONFIG_SENSORS_FSCPOS is not set
1427# CONFIG_SENSORS_FSCHMD is not set 1454# CONFIG_SENSORS_FSCHMD is not set
1455# CONFIG_SENSORS_G760A is not set
1428# CONFIG_SENSORS_GL518SM is not set 1456# CONFIG_SENSORS_GL518SM is not set
1429# CONFIG_SENSORS_GL520SM is not set 1457# CONFIG_SENSORS_GL520SM is not set
1430# CONFIG_SENSORS_CORETEMP is not set 1458# CONFIG_SENSORS_CORETEMP is not set
@@ -1440,11 +1468,14 @@ CONFIG_HWMON=y
1440# CONFIG_SENSORS_LM90 is not set 1468# CONFIG_SENSORS_LM90 is not set
1441# CONFIG_SENSORS_LM92 is not set 1469# CONFIG_SENSORS_LM92 is not set
1442# CONFIG_SENSORS_LM93 is not set 1470# CONFIG_SENSORS_LM93 is not set
1471# CONFIG_SENSORS_LTC4215 is not set
1443# CONFIG_SENSORS_LTC4245 is not set 1472# CONFIG_SENSORS_LTC4245 is not set
1473# CONFIG_SENSORS_LM95241 is not set
1444# CONFIG_SENSORS_MAX1619 is not set 1474# CONFIG_SENSORS_MAX1619 is not set
1445# CONFIG_SENSORS_MAX6650 is not set 1475# CONFIG_SENSORS_MAX6650 is not set
1446# CONFIG_SENSORS_PC87360 is not set 1476# CONFIG_SENSORS_PC87360 is not set
1447# CONFIG_SENSORS_PC87427 is not set 1477# CONFIG_SENSORS_PC87427 is not set
1478# CONFIG_SENSORS_PCF8591 is not set
1448# CONFIG_SENSORS_SIS5595 is not set 1479# CONFIG_SENSORS_SIS5595 is not set
1449# CONFIG_SENSORS_DME1737 is not set 1480# CONFIG_SENSORS_DME1737 is not set
1450# CONFIG_SENSORS_SMSC47M1 is not set 1481# CONFIG_SENSORS_SMSC47M1 is not set
@@ -1635,6 +1666,7 @@ CONFIG_FB_EFI=y
1635# CONFIG_FB_VIRTUAL is not set 1666# CONFIG_FB_VIRTUAL is not set
1636# CONFIG_FB_METRONOME is not set 1667# CONFIG_FB_METRONOME is not set
1637# CONFIG_FB_MB862XX is not set 1668# CONFIG_FB_MB862XX is not set
1669# CONFIG_FB_BROADSHEET is not set
1638CONFIG_BACKLIGHT_LCD_SUPPORT=y 1670CONFIG_BACKLIGHT_LCD_SUPPORT=y
1639# CONFIG_LCD_CLASS_DEVICE is not set 1671# CONFIG_LCD_CLASS_DEVICE is not set
1640CONFIG_BACKLIGHT_CLASS_DEVICE=y 1672CONFIG_BACKLIGHT_CLASS_DEVICE=y
@@ -1720,6 +1752,8 @@ CONFIG_SND_PCI=y
1720# CONFIG_SND_INDIGO is not set 1752# CONFIG_SND_INDIGO is not set
1721# CONFIG_SND_INDIGOIO is not set 1753# CONFIG_SND_INDIGOIO is not set
1722# CONFIG_SND_INDIGODJ is not set 1754# CONFIG_SND_INDIGODJ is not set
1755# CONFIG_SND_INDIGOIOX is not set
1756# CONFIG_SND_INDIGODJX is not set
1723# CONFIG_SND_EMU10K1 is not set 1757# CONFIG_SND_EMU10K1 is not set
1724# CONFIG_SND_EMU10K1X is not set 1758# CONFIG_SND_EMU10K1X is not set
1725# CONFIG_SND_ENS1370 is not set 1759# CONFIG_SND_ENS1370 is not set
@@ -1792,15 +1826,17 @@ CONFIG_USB_HIDDEV=y
1792# 1826#
1793# Special HID drivers 1827# Special HID drivers
1794# 1828#
1795CONFIG_HID_COMPAT=y
1796CONFIG_HID_A4TECH=y 1829CONFIG_HID_A4TECH=y
1797CONFIG_HID_APPLE=y 1830CONFIG_HID_APPLE=y
1798CONFIG_HID_BELKIN=y 1831CONFIG_HID_BELKIN=y
1799CONFIG_HID_CHERRY=y 1832CONFIG_HID_CHERRY=y
1800CONFIG_HID_CHICONY=y 1833CONFIG_HID_CHICONY=y
1801CONFIG_HID_CYPRESS=y 1834CONFIG_HID_CYPRESS=y
1835# CONFIG_DRAGONRISE_FF is not set
1802CONFIG_HID_EZKEY=y 1836CONFIG_HID_EZKEY=y
1837CONFIG_HID_KYE=y
1803CONFIG_HID_GYRATION=y 1838CONFIG_HID_GYRATION=y
1839CONFIG_HID_KENSINGTON=y
1804CONFIG_HID_LOGITECH=y 1840CONFIG_HID_LOGITECH=y
1805CONFIG_LOGITECH_FF=y 1841CONFIG_LOGITECH_FF=y
1806# CONFIG_LOGIRUMBLEPAD2_FF is not set 1842# CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1866,11 +1902,11 @@ CONFIG_USB_PRINTER=y
1866# CONFIG_USB_TMC is not set 1902# CONFIG_USB_TMC is not set
1867 1903
1868# 1904#
1869# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed; 1905# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
1870# 1906#
1871 1907
1872# 1908#
1873# see USB_STORAGE Help for more information 1909# also be needed; see USB_STORAGE Help for more info
1874# 1910#
1875CONFIG_USB_STORAGE=y 1911CONFIG_USB_STORAGE=y
1876# CONFIG_USB_STORAGE_DEBUG is not set 1912# CONFIG_USB_STORAGE_DEBUG is not set
@@ -1912,7 +1948,6 @@ CONFIG_USB_LIBUSUAL=y
1912# CONFIG_USB_LED is not set 1948# CONFIG_USB_LED is not set
1913# CONFIG_USB_CYPRESS_CY7C63 is not set 1949# CONFIG_USB_CYPRESS_CY7C63 is not set
1914# CONFIG_USB_CYTHERM is not set 1950# CONFIG_USB_CYTHERM is not set
1915# CONFIG_USB_PHIDGET is not set
1916# CONFIG_USB_IDMOUSE is not set 1951# CONFIG_USB_IDMOUSE is not set
1917# CONFIG_USB_FTDI_ELAN is not set 1952# CONFIG_USB_FTDI_ELAN is not set
1918# CONFIG_USB_APPLEDISPLAY is not set 1953# CONFIG_USB_APPLEDISPLAY is not set
@@ -1928,6 +1963,7 @@ CONFIG_USB_LIBUSUAL=y
1928# 1963#
1929# OTG and related infrastructure 1964# OTG and related infrastructure
1930# 1965#
1966# CONFIG_NOP_USB_XCEIV is not set
1931# CONFIG_UWB is not set 1967# CONFIG_UWB is not set
1932# CONFIG_MMC is not set 1968# CONFIG_MMC is not set
1933# CONFIG_MEMSTICK is not set 1969# CONFIG_MEMSTICK is not set
@@ -1939,8 +1975,10 @@ CONFIG_LEDS_CLASS=y
1939# 1975#
1940# CONFIG_LEDS_ALIX2 is not set 1976# CONFIG_LEDS_ALIX2 is not set
1941# CONFIG_LEDS_PCA9532 is not set 1977# CONFIG_LEDS_PCA9532 is not set
1978# CONFIG_LEDS_LP5521 is not set
1942# CONFIG_LEDS_CLEVO_MAIL is not set 1979# CONFIG_LEDS_CLEVO_MAIL is not set
1943# CONFIG_LEDS_PCA955X is not set 1980# CONFIG_LEDS_PCA955X is not set
1981# CONFIG_LEDS_BD2802 is not set
1944 1982
1945# 1983#
1946# LED Triggers 1984# LED Triggers
@@ -1950,6 +1988,10 @@ CONFIG_LEDS_TRIGGERS=y
1950# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set 1988# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
1951# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set 1989# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
1952# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set 1990# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
1991
1992#
1993# iptables trigger is under Netfilter config (LED target)
1994#
1953# CONFIG_ACCESSIBILITY is not set 1995# CONFIG_ACCESSIBILITY is not set
1954# CONFIG_INFINIBAND is not set 1996# CONFIG_INFINIBAND is not set
1955CONFIG_EDAC=y 1997CONFIG_EDAC=y
@@ -2018,6 +2060,7 @@ CONFIG_DMADEVICES=y
2018# DMA Devices 2060# DMA Devices
2019# 2061#
2020# CONFIG_INTEL_IOATDMA is not set 2062# CONFIG_INTEL_IOATDMA is not set
2063# CONFIG_AUXDISPLAY is not set
2021# CONFIG_UIO is not set 2064# CONFIG_UIO is not set
2022# CONFIG_STAGING is not set 2065# CONFIG_STAGING is not set
2023CONFIG_X86_PLATFORM_DEVICES=y 2066CONFIG_X86_PLATFORM_DEVICES=y
@@ -2051,6 +2094,7 @@ CONFIG_DMIID=y
2051# 2094#
2052# CONFIG_EXT2_FS is not set 2095# CONFIG_EXT2_FS is not set
2053CONFIG_EXT3_FS=y 2096CONFIG_EXT3_FS=y
2097# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
2054CONFIG_EXT3_FS_XATTR=y 2098CONFIG_EXT3_FS_XATTR=y
2055CONFIG_EXT3_FS_POSIX_ACL=y 2099CONFIG_EXT3_FS_POSIX_ACL=y
2056CONFIG_EXT3_FS_SECURITY=y 2100CONFIG_EXT3_FS_SECURITY=y
@@ -2082,6 +2126,11 @@ CONFIG_AUTOFS4_FS=y
2082CONFIG_GENERIC_ACL=y 2126CONFIG_GENERIC_ACL=y
2083 2127
2084# 2128#
2129# Caches
2130#
2131# CONFIG_FSCACHE is not set
2132
2133#
2085# CD-ROM/DVD Filesystems 2134# CD-ROM/DVD Filesystems
2086# 2135#
2087CONFIG_ISO9660_FS=y 2136CONFIG_ISO9660_FS=y
@@ -2132,6 +2181,7 @@ CONFIG_MISC_FILESYSTEMS=y
2132# CONFIG_ROMFS_FS is not set 2181# CONFIG_ROMFS_FS is not set
2133# CONFIG_SYSV_FS is not set 2182# CONFIG_SYSV_FS is not set
2134# CONFIG_UFS_FS is not set 2183# CONFIG_UFS_FS is not set
2184# CONFIG_NILFS2_FS is not set
2135CONFIG_NETWORK_FILESYSTEMS=y 2185CONFIG_NETWORK_FILESYSTEMS=y
2136CONFIG_NFS_FS=y 2186CONFIG_NFS_FS=y
2137CONFIG_NFS_V3=y 2187CONFIG_NFS_V3=y
@@ -2145,7 +2195,6 @@ CONFIG_NFS_ACL_SUPPORT=y
2145CONFIG_NFS_COMMON=y 2195CONFIG_NFS_COMMON=y
2146CONFIG_SUNRPC=y 2196CONFIG_SUNRPC=y
2147CONFIG_SUNRPC_GSS=y 2197CONFIG_SUNRPC_GSS=y
2148# CONFIG_SUNRPC_REGISTER_V4 is not set
2149CONFIG_RPCSEC_GSS_KRB5=y 2198CONFIG_RPCSEC_GSS_KRB5=y
2150# CONFIG_RPCSEC_GSS_SPKM3 is not set 2199# CONFIG_RPCSEC_GSS_SPKM3 is not set
2151# CONFIG_SMB_FS is not set 2200# CONFIG_SMB_FS is not set
@@ -2232,6 +2281,7 @@ CONFIG_DEBUG_FS=y
2232CONFIG_DEBUG_KERNEL=y 2281CONFIG_DEBUG_KERNEL=y
2233# CONFIG_DEBUG_SHIRQ is not set 2282# CONFIG_DEBUG_SHIRQ is not set
2234# CONFIG_DETECT_SOFTLOCKUP is not set 2283# CONFIG_DETECT_SOFTLOCKUP is not set
2284# CONFIG_DETECT_HUNG_TASK is not set
2235# CONFIG_SCHED_DEBUG is not set 2285# CONFIG_SCHED_DEBUG is not set
2236CONFIG_SCHEDSTATS=y 2286CONFIG_SCHEDSTATS=y
2237CONFIG_TIMER_STATS=y 2287CONFIG_TIMER_STATS=y
@@ -2247,6 +2297,7 @@ CONFIG_TIMER_STATS=y
2247# CONFIG_LOCK_STAT is not set 2297# CONFIG_LOCK_STAT is not set
2248# CONFIG_DEBUG_SPINLOCK_SLEEP is not set 2298# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
2249# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set 2299# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
2300CONFIG_STACKTRACE=y
2250# CONFIG_DEBUG_KOBJECT is not set 2301# CONFIG_DEBUG_KOBJECT is not set
2251CONFIG_DEBUG_BUGVERBOSE=y 2302CONFIG_DEBUG_BUGVERBOSE=y
2252# CONFIG_DEBUG_INFO is not set 2303# CONFIG_DEBUG_INFO is not set
@@ -2269,13 +2320,19 @@ CONFIG_FRAME_POINTER=y
2269# CONFIG_FAULT_INJECTION is not set 2320# CONFIG_FAULT_INJECTION is not set
2270# CONFIG_LATENCYTOP is not set 2321# CONFIG_LATENCYTOP is not set
2271CONFIG_SYSCTL_SYSCALL_CHECK=y 2322CONFIG_SYSCTL_SYSCALL_CHECK=y
2323# CONFIG_DEBUG_PAGEALLOC is not set
2272CONFIG_USER_STACKTRACE_SUPPORT=y 2324CONFIG_USER_STACKTRACE_SUPPORT=y
2325CONFIG_NOP_TRACER=y
2273CONFIG_HAVE_FUNCTION_TRACER=y 2326CONFIG_HAVE_FUNCTION_TRACER=y
2274CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y 2327CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
2275CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y 2328CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
2276CONFIG_HAVE_DYNAMIC_FTRACE=y 2329CONFIG_HAVE_DYNAMIC_FTRACE=y
2277CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y 2330CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
2278CONFIG_HAVE_HW_BRANCH_TRACER=y 2331CONFIG_HAVE_HW_BRANCH_TRACER=y
2332CONFIG_HAVE_FTRACE_SYSCALLS=y
2333CONFIG_RING_BUFFER=y
2334CONFIG_TRACING=y
2335CONFIG_TRACING_SUPPORT=y
2279 2336
2280# 2337#
2281# Tracers 2338# Tracers
@@ -2285,13 +2342,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y
2285# CONFIG_SYSPROF_TRACER is not set 2342# CONFIG_SYSPROF_TRACER is not set
2286# CONFIG_SCHED_TRACER is not set 2343# CONFIG_SCHED_TRACER is not set
2287# CONFIG_CONTEXT_SWITCH_TRACER is not set 2344# CONFIG_CONTEXT_SWITCH_TRACER is not set
2345# CONFIG_EVENT_TRACER is not set
2346# CONFIG_FTRACE_SYSCALLS is not set
2288# CONFIG_BOOT_TRACER is not set 2347# CONFIG_BOOT_TRACER is not set
2289# CONFIG_TRACE_BRANCH_PROFILING is not set 2348# CONFIG_TRACE_BRANCH_PROFILING is not set
2290# CONFIG_POWER_TRACER is not set 2349# CONFIG_POWER_TRACER is not set
2291# CONFIG_STACK_TRACER is not set 2350# CONFIG_STACK_TRACER is not set
2292# CONFIG_HW_BRANCH_TRACER is not set 2351# CONFIG_HW_BRANCH_TRACER is not set
2352# CONFIG_KMEMTRACE is not set
2353# CONFIG_WORKQUEUE_TRACER is not set
2354CONFIG_BLK_DEV_IO_TRACE=y
2355# CONFIG_FTRACE_STARTUP_TEST is not set
2356# CONFIG_MMIOTRACE is not set
2293CONFIG_PROVIDE_OHCI1394_DMA_INIT=y 2357CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2294# CONFIG_DYNAMIC_PRINTK_DEBUG is not set 2358# CONFIG_DYNAMIC_DEBUG is not set
2359# CONFIG_DMA_API_DEBUG is not set
2295# CONFIG_SAMPLES is not set 2360# CONFIG_SAMPLES is not set
2296CONFIG_HAVE_ARCH_KGDB=y 2361CONFIG_HAVE_ARCH_KGDB=y
2297# CONFIG_KGDB is not set 2362# CONFIG_KGDB is not set
@@ -2301,14 +2366,13 @@ CONFIG_EARLY_PRINTK=y
2301CONFIG_EARLY_PRINTK_DBGP=y 2366CONFIG_EARLY_PRINTK_DBGP=y
2302CONFIG_DEBUG_STACKOVERFLOW=y 2367CONFIG_DEBUG_STACKOVERFLOW=y
2303CONFIG_DEBUG_STACK_USAGE=y 2368CONFIG_DEBUG_STACK_USAGE=y
2304# CONFIG_DEBUG_PAGEALLOC is not set
2305# CONFIG_DEBUG_PER_CPU_MAPS is not set 2369# CONFIG_DEBUG_PER_CPU_MAPS is not set
2306# CONFIG_X86_PTDUMP is not set 2370# CONFIG_X86_PTDUMP is not set
2307CONFIG_DEBUG_RODATA=y 2371CONFIG_DEBUG_RODATA=y
2308# CONFIG_DEBUG_RODATA_TEST is not set 2372# CONFIG_DEBUG_RODATA_TEST is not set
2309CONFIG_DEBUG_NX_TEST=m 2373CONFIG_DEBUG_NX_TEST=m
2310# CONFIG_IOMMU_DEBUG is not set 2374# CONFIG_IOMMU_DEBUG is not set
2311# CONFIG_MMIOTRACE is not set 2375CONFIG_HAVE_MMIOTRACE_SUPPORT=y
2312CONFIG_IO_DELAY_TYPE_0X80=0 2376CONFIG_IO_DELAY_TYPE_0X80=0
2313CONFIG_IO_DELAY_TYPE_0XED=1 2377CONFIG_IO_DELAY_TYPE_0XED=1
2314CONFIG_IO_DELAY_TYPE_UDELAY=2 2378CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2344,6 +2408,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y
2344CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 2408CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
2345# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set 2409# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
2346# CONFIG_SECURITY_SMACK is not set 2410# CONFIG_SECURITY_SMACK is not set
2411# CONFIG_SECURITY_TOMOYO is not set
2412# CONFIG_IMA is not set
2347CONFIG_CRYPTO=y 2413CONFIG_CRYPTO=y
2348 2414
2349# 2415#
@@ -2359,10 +2425,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y
2359CONFIG_CRYPTO_HASH=y 2425CONFIG_CRYPTO_HASH=y
2360CONFIG_CRYPTO_HASH2=y 2426CONFIG_CRYPTO_HASH2=y
2361CONFIG_CRYPTO_RNG2=y 2427CONFIG_CRYPTO_RNG2=y
2428CONFIG_CRYPTO_PCOMP=y
2362CONFIG_CRYPTO_MANAGER=y 2429CONFIG_CRYPTO_MANAGER=y
2363CONFIG_CRYPTO_MANAGER2=y 2430CONFIG_CRYPTO_MANAGER2=y
2364# CONFIG_CRYPTO_GF128MUL is not set 2431# CONFIG_CRYPTO_GF128MUL is not set
2365# CONFIG_CRYPTO_NULL is not set 2432# CONFIG_CRYPTO_NULL is not set
2433CONFIG_CRYPTO_WORKQUEUE=y
2366# CONFIG_CRYPTO_CRYPTD is not set 2434# CONFIG_CRYPTO_CRYPTD is not set
2367CONFIG_CRYPTO_AUTHENC=y 2435CONFIG_CRYPTO_AUTHENC=y
2368# CONFIG_CRYPTO_TEST is not set 2436# CONFIG_CRYPTO_TEST is not set
@@ -2414,6 +2482,7 @@ CONFIG_CRYPTO_SHA1=y
2414# 2482#
2415CONFIG_CRYPTO_AES=y 2483CONFIG_CRYPTO_AES=y
2416# CONFIG_CRYPTO_AES_X86_64 is not set 2484# CONFIG_CRYPTO_AES_X86_64 is not set
2485# CONFIG_CRYPTO_AES_NI_INTEL is not set
2417# CONFIG_CRYPTO_ANUBIS is not set 2486# CONFIG_CRYPTO_ANUBIS is not set
2418CONFIG_CRYPTO_ARC4=y 2487CONFIG_CRYPTO_ARC4=y
2419# CONFIG_CRYPTO_BLOWFISH is not set 2488# CONFIG_CRYPTO_BLOWFISH is not set
@@ -2435,6 +2504,7 @@ CONFIG_CRYPTO_DES=y
2435# Compression 2504# Compression
2436# 2505#
2437# CONFIG_CRYPTO_DEFLATE is not set 2506# CONFIG_CRYPTO_DEFLATE is not set
2507# CONFIG_CRYPTO_ZLIB is not set
2438# CONFIG_CRYPTO_LZO is not set 2508# CONFIG_CRYPTO_LZO is not set
2439 2509
2440# 2510#
@@ -2444,10 +2514,12 @@ CONFIG_CRYPTO_DES=y
2444CONFIG_CRYPTO_HW=y 2514CONFIG_CRYPTO_HW=y
2445# CONFIG_CRYPTO_DEV_HIFN_795X is not set 2515# CONFIG_CRYPTO_DEV_HIFN_795X is not set
2446CONFIG_HAVE_KVM=y 2516CONFIG_HAVE_KVM=y
2517CONFIG_HAVE_KVM_IRQCHIP=y
2447CONFIG_VIRTUALIZATION=y 2518CONFIG_VIRTUALIZATION=y
2448# CONFIG_KVM is not set 2519# CONFIG_KVM is not set
2449# CONFIG_VIRTIO_PCI is not set 2520# CONFIG_VIRTIO_PCI is not set
2450# CONFIG_VIRTIO_BALLOON is not set 2521# CONFIG_VIRTIO_BALLOON is not set
2522CONFIG_BINARY_PRINTF=y
2451 2523
2452# 2524#
2453# Library routines 2525# Library routines
@@ -2464,7 +2536,10 @@ CONFIG_CRC32=y
2464# CONFIG_CRC7 is not set 2536# CONFIG_CRC7 is not set
2465# CONFIG_LIBCRC32C is not set 2537# CONFIG_LIBCRC32C is not set
2466CONFIG_ZLIB_INFLATE=y 2538CONFIG_ZLIB_INFLATE=y
2467CONFIG_PLIST=y 2539CONFIG_DECOMPRESS_GZIP=y
2540CONFIG_DECOMPRESS_BZIP2=y
2541CONFIG_DECOMPRESS_LZMA=y
2468CONFIG_HAS_IOMEM=y 2542CONFIG_HAS_IOMEM=y
2469CONFIG_HAS_IOPORT=y 2543CONFIG_HAS_IOPORT=y
2470CONFIG_HAS_DMA=y 2544CONFIG_HAS_DMA=y
2545CONFIG_NLATTR=y
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index f6aa18eadf71..1a37bcdc8606 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -3,6 +3,7 @@
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#include <linux/stddef.h> 5#include <linux/stddef.h>
6#include <linux/stringify.h>
6#include <asm/asm.h> 7#include <asm/asm.h>
7 8
8/* 9/*
@@ -74,6 +75,22 @@ static inline void alternatives_smp_switch(int smp) {}
74 75
75const unsigned char *const *find_nop_table(void); 76const unsigned char *const *find_nop_table(void);
76 77
78/* alternative assembly primitive: */
79#define ALTERNATIVE(oldinstr, newinstr, feature) \
80 \
81 "661:\n\t" oldinstr "\n662:\n" \
82 ".section .altinstructions,\"a\"\n" \
83 _ASM_ALIGN "\n" \
84 _ASM_PTR "661b\n" /* label */ \
85 _ASM_PTR "663f\n" /* new instruction */ \
86 " .byte " __stringify(feature) "\n" /* feature bit */ \
87 " .byte 662b-661b\n" /* sourcelen */ \
88 " .byte 664f-663f\n" /* replacementlen */ \
89 ".previous\n" \
90 ".section .altinstr_replacement, \"ax\"\n" \
91 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
92 ".previous"
93
77/* 94/*
78 * Alternative instructions for different CPU types or capabilities. 95 * Alternative instructions for different CPU types or capabilities.
79 * 96 *
@@ -87,18 +104,7 @@ const unsigned char *const *find_nop_table(void);
87 * without volatile and memory clobber. 104 * without volatile and memory clobber.
88 */ 105 */
89#define alternative(oldinstr, newinstr, feature) \ 106#define alternative(oldinstr, newinstr, feature) \
90 asm volatile ("661:\n\t" oldinstr "\n662:\n" \ 107 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
91 ".section .altinstructions,\"a\"\n" \
92 _ASM_ALIGN "\n" \
93 _ASM_PTR "661b\n" /* label */ \
94 _ASM_PTR "663f\n" /* new instruction */ \
95 " .byte %c0\n" /* feature bit */ \
96 " .byte 662b-661b\n" /* sourcelen */ \
97 " .byte 664f-663f\n" /* replacementlen */ \
98 ".previous\n" \
99 ".section .altinstr_replacement,\"ax\"\n" \
100 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
101 ".previous" :: "i" (feature) : "memory")
102 108
103/* 109/*
104 * Alternative inline assembly with input. 110 * Alternative inline assembly with input.
@@ -109,35 +115,16 @@ const unsigned char *const *find_nop_table(void);
109 * Best is to use constraints that are fixed size (like (%1) ... "r") 115 * Best is to use constraints that are fixed size (like (%1) ... "r")
110 * If you use variable sized constraints like "m" or "g" in the 116 * If you use variable sized constraints like "m" or "g" in the
111 * replacement make sure to pad to the worst case length. 117 * replacement make sure to pad to the worst case length.
118 * Leaving an unused argument 0 to keep API compatibility.
112 */ 119 */
113#define alternative_input(oldinstr, newinstr, feature, input...) \ 120#define alternative_input(oldinstr, newinstr, feature, input...) \
114 asm volatile ("661:\n\t" oldinstr "\n662:\n" \ 121 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
115 ".section .altinstructions,\"a\"\n" \ 122 : : "i" (0), ## input)
116 _ASM_ALIGN "\n" \
117 _ASM_PTR "661b\n" /* label */ \
118 _ASM_PTR "663f\n" /* new instruction */ \
119 " .byte %c0\n" /* feature bit */ \
120 " .byte 662b-661b\n" /* sourcelen */ \
121 " .byte 664f-663f\n" /* replacementlen */ \
122 ".previous\n" \
123 ".section .altinstr_replacement,\"ax\"\n" \
124 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
125 ".previous" :: "i" (feature), ##input)
126 123
127/* Like alternative_input, but with a single output argument */ 124/* Like alternative_input, but with a single output argument */
128#define alternative_io(oldinstr, newinstr, feature, output, input...) \ 125#define alternative_io(oldinstr, newinstr, feature, output, input...) \
129 asm volatile ("661:\n\t" oldinstr "\n662:\n" \ 126 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
130 ".section .altinstructions,\"a\"\n" \ 127 : output : "i" (0), ## input)
131 _ASM_ALIGN "\n" \
132 _ASM_PTR "661b\n" /* label */ \
133 _ASM_PTR "663f\n" /* new instruction */ \
134 " .byte %c[feat]\n" /* feature bit */ \
135 " .byte 662b-661b\n" /* sourcelen */ \
136 " .byte 664f-663f\n" /* replacementlen */ \
137 ".previous\n" \
138 ".section .altinstr_replacement,\"ax\"\n" \
139 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
140 ".previous" : output : [feat] "i" (feature), ##input)
141 128
142/* 129/*
143 * use this macro(s) if you need more than one output parameter 130 * use this macro(s) if you need more than one output parameter
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index f712344329bc..262e02820049 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -27,6 +27,8 @@ extern int amd_iommu_init(void);
27extern int amd_iommu_init_dma_ops(void); 27extern int amd_iommu_init_dma_ops(void);
28extern void amd_iommu_detect(void); 28extern void amd_iommu_detect(void);
29extern irqreturn_t amd_iommu_int_handler(int irq, void *data); 29extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
30extern void amd_iommu_flush_all_domains(void);
31extern void amd_iommu_flush_all_devices(void);
30#else 32#else
31static inline int amd_iommu_init(void) { return -ENODEV; } 33static inline int amd_iommu_init(void) { return -ENODEV; }
32static inline void amd_iommu_detect(void) { } 34static inline void amd_iommu_detect(void) { }
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 95c8cd9d22b5..0c878caaa0a2 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -194,6 +194,27 @@
194#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */ 194#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */
195#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops 195#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops
196 domain for an IOMMU */ 196 domain for an IOMMU */
197extern bool amd_iommu_dump;
198#define DUMP_printk(format, arg...) \
199 do { \
200 if (amd_iommu_dump) \
201 printk(KERN_INFO "AMD IOMMU: " format, ## arg); \
202 } while(0);
203
204/*
205 * Make iterating over all IOMMUs easier
206 */
207#define for_each_iommu(iommu) \
208 list_for_each_entry((iommu), &amd_iommu_list, list)
209#define for_each_iommu_safe(iommu, next) \
210 list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list)
211
212#define APERTURE_RANGE_SHIFT 27 /* 128 MB */
213#define APERTURE_RANGE_SIZE (1ULL << APERTURE_RANGE_SHIFT)
214#define APERTURE_RANGE_PAGES (APERTURE_RANGE_SIZE >> PAGE_SHIFT)
215#define APERTURE_MAX_RANGES 32 /* allows 4GB of DMA address space */
216#define APERTURE_RANGE_INDEX(a) ((a) >> APERTURE_RANGE_SHIFT)
217#define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL)
197 218
198/* 219/*
199 * This structure contains generic data for IOMMU protection domains 220 * This structure contains generic data for IOMMU protection domains
@@ -210,6 +231,26 @@ struct protection_domain {
210}; 231};
211 232
212/* 233/*
234 * For dynamic growth the aperture size is split into ranges of 128MB of
235 * DMA address space each. This struct represents one such range.
236 */
237struct aperture_range {
238
239 /* address allocation bitmap */
240 unsigned long *bitmap;
241
242 /*
243 * Array of PTE pages for the aperture. In this array we save all the
244 * leaf pages of the domain page table used for the aperture. This way
245 * we don't need to walk the page table to find a specific PTE. We can
246 * just calculate its address in constant time.
247 */
248 u64 *pte_pages[64];
249
250 unsigned long offset;
251};
252
253/*
213 * Data container for a dma_ops specific protection domain 254 * Data container for a dma_ops specific protection domain
214 */ 255 */
215struct dma_ops_domain { 256struct dma_ops_domain {
@@ -222,18 +263,10 @@ struct dma_ops_domain {
222 unsigned long aperture_size; 263 unsigned long aperture_size;
223 264
224 /* address we start to search for free addresses */ 265 /* address we start to search for free addresses */
225 unsigned long next_bit; 266 unsigned long next_address;
226
227 /* address allocation bitmap */
228 unsigned long *bitmap;
229 267
230 /* 268 /* address space relevant data */
231 * Array of PTE pages for the aperture. In this array we save all the 269 struct aperture_range *aperture[APERTURE_MAX_RANGES];
232 * leaf pages of the domain page table used for the aperture. This way
233 * we don't need to walk the page table to find a specific PTE. We can
234 * just calculate its address in constant time.
235 */
236 u64 **pte_pages;
237 270
238 /* This will be set to true when TLB needs to be flushed */ 271 /* This will be set to true when TLB needs to be flushed */
239 bool need_flush; 272 bool need_flush;
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 42f2f8377422..bb7d47925847 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -107,8 +107,7 @@ extern u32 native_safe_apic_wait_icr_idle(void);
107extern void native_apic_icr_write(u32 low, u32 id); 107extern void native_apic_icr_write(u32 low, u32 id);
108extern u64 native_apic_icr_read(void); 108extern u64 native_apic_icr_read(void);
109 109
110#define EIM_8BIT_APIC_ID 0 110extern int x2apic_mode;
111#define EIM_32BIT_APIC_ID 1
112 111
113#ifdef CONFIG_X86_X2APIC 112#ifdef CONFIG_X86_X2APIC
114/* 113/*
@@ -166,10 +165,9 @@ static inline u64 native_x2apic_icr_read(void)
166 return val; 165 return val;
167} 166}
168 167
169extern int x2apic, x2apic_phys; 168extern int x2apic_phys;
170extern void check_x2apic(void); 169extern void check_x2apic(void);
171extern void enable_x2apic(void); 170extern void enable_x2apic(void);
172extern void enable_IR_x2apic(void);
173extern void x2apic_icr_write(u32 low, u32 id); 171extern void x2apic_icr_write(u32 low, u32 id);
174static inline int x2apic_enabled(void) 172static inline int x2apic_enabled(void)
175{ 173{
@@ -183,6 +181,8 @@ static inline int x2apic_enabled(void)
183 return 1; 181 return 1;
184 return 0; 182 return 0;
185} 183}
184
185#define x2apic_supported() (cpu_has_x2apic)
186#else 186#else
187static inline void check_x2apic(void) 187static inline void check_x2apic(void)
188{ 188{
@@ -190,28 +190,20 @@ static inline void check_x2apic(void)
190static inline void enable_x2apic(void) 190static inline void enable_x2apic(void)
191{ 191{
192} 192}
193static inline void enable_IR_x2apic(void)
194{
195}
196static inline int x2apic_enabled(void) 193static inline int x2apic_enabled(void)
197{ 194{
198 return 0; 195 return 0;
199} 196}
200 197
201#define x2apic 0 198#define x2apic_preenabled 0
202 199#define x2apic_supported() 0
203#endif 200#endif
204 201
205extern int get_physical_broadcast(void); 202extern void enable_IR_x2apic(void);
206 203
207#ifdef CONFIG_X86_X2APIC 204extern int get_physical_broadcast(void);
208static inline void ack_x2APIC_irq(void)
209{
210 /* Docs say use 0 for future compatibility */
211 native_apic_msr_write(APIC_EOI, 0);
212}
213#endif
214 205
206extern void apic_disable(void);
215extern int lapic_get_maxlvt(void); 207extern int lapic_get_maxlvt(void);
216extern void clear_local_APIC(void); 208extern void clear_local_APIC(void);
217extern void connect_bsp_APIC(void); 209extern void connect_bsp_APIC(void);
@@ -252,7 +244,7 @@ static inline void lapic_shutdown(void) { }
252#define local_apic_timer_c2_ok 1 244#define local_apic_timer_c2_ok 1
253static inline void init_apic_mappings(void) { } 245static inline void init_apic_mappings(void) { }
254static inline void disable_local_APIC(void) { } 246static inline void disable_local_APIC(void) { }
255 247static inline void apic_disable(void) { }
256#endif /* !CONFIG_X86_LOCAL_APIC */ 248#endif /* !CONFIG_X86_LOCAL_APIC */
257 249
258#ifdef CONFIG_X86_64 250#ifdef CONFIG_X86_64
@@ -410,7 +402,7 @@ static inline unsigned default_get_apic_id(unsigned long x)
410{ 402{
411 unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); 403 unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
412 404
413 if (APIC_XAPIC(ver)) 405 if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID))
414 return (x >> 24) & 0xFF; 406 return (x >> 24) & 0xFF;
415 else 407 else
416 return (x >> 24) & 0x0F; 408 return (x >> 24) & 0x0F;
@@ -478,6 +470,9 @@ static inline unsigned int read_apic_id(void)
478extern void default_setup_apic_routing(void); 470extern void default_setup_apic_routing(void);
479 471
480#ifdef CONFIG_X86_32 472#ifdef CONFIG_X86_32
473
474extern struct apic apic_default;
475
481/* 476/*
482 * Set up the logical destination ID. 477 * Set up the logical destination ID.
483 * 478 *
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index bc9514fb3b13..7ddb36ab933b 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -22,6 +22,7 @@
22# define APIC_INTEGRATED(x) (1) 22# define APIC_INTEGRATED(x) (1)
23#endif 23#endif
24#define APIC_XAPIC(x) ((x) >= 0x14) 24#define APIC_XAPIC(x) ((x) >= 0x14)
25#define APIC_EXT_SPACE(x) ((x) & 0x80000000)
25#define APIC_TASKPRI 0x80 26#define APIC_TASKPRI 0x80
26#define APIC_TPRI_MASK 0xFFu 27#define APIC_TPRI_MASK 0xFFu
27#define APIC_ARBPRI 0x90 28#define APIC_ARBPRI 0x90
@@ -116,7 +117,9 @@
116#define APIC_TDR_DIV_32 0x8 117#define APIC_TDR_DIV_32 0x8
117#define APIC_TDR_DIV_64 0x9 118#define APIC_TDR_DIV_64 0x9
118#define APIC_TDR_DIV_128 0xA 119#define APIC_TDR_DIV_128 0xA
119#define APIC_EILVT0 0x500 120#define APIC_EFEAT 0x400
121#define APIC_ECTRL 0x410
122#define APIC_EILVTn(n) (0x500 + 0x10 * n)
120#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */ 123#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */
121#define APIC_EILVT_NR_AMD_10H 4 124#define APIC_EILVT_NR_AMD_10H 4
122#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF) 125#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF)
@@ -125,9 +128,6 @@
125#define APIC_EILVT_MSG_NMI 0x4 128#define APIC_EILVT_MSG_NMI 0x4
126#define APIC_EILVT_MSG_EXT 0x7 129#define APIC_EILVT_MSG_EXT 0x7
127#define APIC_EILVT_MASKED (1 << 16) 130#define APIC_EILVT_MASKED (1 << 16)
128#define APIC_EILVT1 0x510
129#define APIC_EILVT2 0x520
130#define APIC_EILVT3 0x530
131 131
132#define APIC_BASE (fix_to_virt(FIX_APIC_BASE)) 132#define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
133#define APIC_BASE_MSR 0x800 133#define APIC_BASE_MSR 0x800
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 6ba23dd9fc92..418e632d4a80 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -8,11 +8,26 @@
8 8
9#ifdef __KERNEL__ 9#ifdef __KERNEL__
10 10
11#include <asm/page_types.h>
12
11/* Physical address where kernel should be loaded. */ 13/* Physical address where kernel should be loaded. */
12#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ 14#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
13 + (CONFIG_PHYSICAL_ALIGN - 1)) \ 15 + (CONFIG_PHYSICAL_ALIGN - 1)) \
14 & ~(CONFIG_PHYSICAL_ALIGN - 1)) 16 & ~(CONFIG_PHYSICAL_ALIGN - 1))
15 17
18/* Minimum kernel alignment, as a power of two */
19#ifdef CONFIG_x86_64
20#define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT
21#else
22#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT+1)
23#endif
24#define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)
25
26#if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \
27 (CONFIG_PHYSICAL_ALIGN < (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2))
28#error "Invalid value for CONFIG_PHYSICAL_ALIGN"
29#endif
30
16#ifdef CONFIG_KERNEL_BZIP2 31#ifdef CONFIG_KERNEL_BZIP2
17#define BOOT_HEAP_SIZE 0x400000 32#define BOOT_HEAP_SIZE 0x400000
18#else /* !CONFIG_KERNEL_BZIP2 */ 33#else /* !CONFIG_KERNEL_BZIP2 */
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index 433adaebf9b6..1724e8de317c 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -50,7 +50,8 @@ struct setup_header {
50 __u32 ramdisk_size; 50 __u32 ramdisk_size;
51 __u32 bootsect_kludge; 51 __u32 bootsect_kludge;
52 __u16 heap_end_ptr; 52 __u16 heap_end_ptr;
53 __u16 _pad1; 53 __u8 ext_loader_ver;
54 __u8 ext_loader_type;
54 __u32 cmd_line_ptr; 55 __u32 cmd_line_ptr;
55 __u32 initrd_addr_max; 56 __u32 initrd_addr_max;
56 __u32 kernel_alignment; 57 __u32 kernel_alignment;
diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h
index 222802029fa6..d96c1ee3a95c 100644
--- a/arch/x86/include/asm/cpu_debug.h
+++ b/arch/x86/include/asm/cpu_debug.h
@@ -86,105 +86,7 @@ enum cpu_file_bit {
86 CPU_VALUE_BIT, /* value */ 86 CPU_VALUE_BIT, /* value */
87}; 87};
88 88
89#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT) 89#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT)
90
91/*
92 * DisplayFamily_DisplayModel Processor Families/Processor Number Series
93 * -------------------------- ------------------------------------------
94 * 05_01, 05_02, 05_04 Pentium, Pentium with MMX
95 *
96 * 06_01 Pentium Pro
97 * 06_03, 06_05 Pentium II Xeon, Pentium II
98 * 06_07, 06_08, 06_0A, 06_0B Pentium III Xeon, Pentum III
99 *
100 * 06_09, 060D Pentium M
101 *
102 * 06_0E Core Duo, Core Solo
103 *
104 * 06_0F Xeon 3000, 3200, 5100, 5300, 7300 series,
105 * Core 2 Quad, Core 2 Extreme, Core 2 Duo,
106 * Pentium dual-core
107 * 06_17 Xeon 5200, 5400 series, Core 2 Quad Q9650
108 *
109 * 06_1C Atom
110 *
111 * 0F_00, 0F_01, 0F_02 Xeon, Xeon MP, Pentium 4
112 * 0F_03, 0F_04 Xeon, Xeon MP, Pentium 4, Pentium D
113 *
114 * 0F_06 Xeon 7100, 5000 Series, Xeon MP,
115 * Pentium 4, Pentium D
116 */
117
118/* Register processors bits */
119enum cpu_processor_bit {
120 CPU_NONE,
121/* Intel */
122 CPU_INTEL_PENTIUM_BIT,
123 CPU_INTEL_P6_BIT,
124 CPU_INTEL_PENTIUM_M_BIT,
125 CPU_INTEL_CORE_BIT,
126 CPU_INTEL_CORE2_BIT,
127 CPU_INTEL_ATOM_BIT,
128 CPU_INTEL_XEON_P4_BIT,
129 CPU_INTEL_XEON_MP_BIT,
130/* AMD */
131 CPU_AMD_K6_BIT,
132 CPU_AMD_K7_BIT,
133 CPU_AMD_K8_BIT,
134 CPU_AMD_0F_BIT,
135 CPU_AMD_10_BIT,
136 CPU_AMD_11_BIT,
137};
138
139#define CPU_INTEL_PENTIUM (1 << CPU_INTEL_PENTIUM_BIT)
140#define CPU_INTEL_P6 (1 << CPU_INTEL_P6_BIT)
141#define CPU_INTEL_PENTIUM_M (1 << CPU_INTEL_PENTIUM_M_BIT)
142#define CPU_INTEL_CORE (1 << CPU_INTEL_CORE_BIT)
143#define CPU_INTEL_CORE2 (1 << CPU_INTEL_CORE2_BIT)
144#define CPU_INTEL_ATOM (1 << CPU_INTEL_ATOM_BIT)
145#define CPU_INTEL_XEON_P4 (1 << CPU_INTEL_XEON_P4_BIT)
146#define CPU_INTEL_XEON_MP (1 << CPU_INTEL_XEON_MP_BIT)
147
148#define CPU_INTEL_PX (CPU_INTEL_P6 | CPU_INTEL_PENTIUM_M)
149#define CPU_INTEL_COREX (CPU_INTEL_CORE | CPU_INTEL_CORE2)
150#define CPU_INTEL_XEON (CPU_INTEL_XEON_P4 | CPU_INTEL_XEON_MP)
151#define CPU_CO_AT (CPU_INTEL_CORE | CPU_INTEL_ATOM)
152#define CPU_C2_AT (CPU_INTEL_CORE2 | CPU_INTEL_ATOM)
153#define CPU_CX_AT (CPU_INTEL_COREX | CPU_INTEL_ATOM)
154#define CPU_CX_XE (CPU_INTEL_COREX | CPU_INTEL_XEON)
155#define CPU_P6_XE (CPU_INTEL_P6 | CPU_INTEL_XEON)
156#define CPU_PM_CO_AT (CPU_INTEL_PENTIUM_M | CPU_CO_AT)
157#define CPU_C2_AT_XE (CPU_C2_AT | CPU_INTEL_XEON)
158#define CPU_CX_AT_XE (CPU_CX_AT | CPU_INTEL_XEON)
159#define CPU_P6_CX_AT (CPU_INTEL_P6 | CPU_CX_AT)
160#define CPU_P6_CX_XE (CPU_P6_XE | CPU_INTEL_COREX)
161#define CPU_P6_CX_AT_XE (CPU_INTEL_P6 | CPU_CX_AT_XE)
162#define CPU_PM_CX_AT_XE (CPU_INTEL_PENTIUM_M | CPU_CX_AT_XE)
163#define CPU_PM_CX_AT (CPU_INTEL_PENTIUM_M | CPU_CX_AT)
164#define CPU_PM_CX_XE (CPU_INTEL_PENTIUM_M | CPU_CX_XE)
165#define CPU_PX_CX_AT (CPU_INTEL_PX | CPU_CX_AT)
166#define CPU_PX_CX_AT_XE (CPU_INTEL_PX | CPU_CX_AT_XE)
167
168/* Select all supported Intel CPUs */
169#define CPU_INTEL_ALL (CPU_INTEL_PENTIUM | CPU_PX_CX_AT_XE)
170
171#define CPU_AMD_K6 (1 << CPU_AMD_K6_BIT)
172#define CPU_AMD_K7 (1 << CPU_AMD_K7_BIT)
173#define CPU_AMD_K8 (1 << CPU_AMD_K8_BIT)
174#define CPU_AMD_0F (1 << CPU_AMD_0F_BIT)
175#define CPU_AMD_10 (1 << CPU_AMD_10_BIT)
176#define CPU_AMD_11 (1 << CPU_AMD_11_BIT)
177
178#define CPU_K10_PLUS (CPU_AMD_10 | CPU_AMD_11)
179#define CPU_K0F_PLUS (CPU_AMD_0F | CPU_K10_PLUS)
180#define CPU_K8_PLUS (CPU_AMD_K8 | CPU_K0F_PLUS)
181#define CPU_K7_PLUS (CPU_AMD_K7 | CPU_K8_PLUS)
182
183/* Select all supported AMD CPUs */
184#define CPU_AMD_ALL (CPU_AMD_K6 | CPU_K7_PLUS)
185
186/* Select all supported CPUs */
187#define CPU_ALL (CPU_INTEL_ALL | CPU_AMD_ALL)
188 90
189#define MAX_CPU_FILES 512 91#define MAX_CPU_FILES 512
190 92
@@ -220,7 +122,6 @@ struct cpu_debug_range {
220 unsigned min; /* Register range min */ 122 unsigned min; /* Register range min */
221 unsigned max; /* Register range max */ 123 unsigned max; /* Register range max */
222 unsigned flag; /* Supported flags */ 124 unsigned flag; /* Supported flags */
223 unsigned model; /* Supported models */
224}; 125};
225 126
226#endif /* _ASM_X86_CPU_DEBUG_H */ 127#endif /* _ASM_X86_CPU_DEBUG_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index bb83b1c397aa..19af42138f78 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -22,7 +22,7 @@
22#define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */ 22#define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */
23#define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers */ 23#define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers */
24#define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */ 24#define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */
25#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Architecture */ 25#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Exception */
26#define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */ 26#define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */
27#define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */ 27#define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */
28#define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */ 28#define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */
@@ -94,6 +94,7 @@
94#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ 94#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
95#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ 95#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */
96#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */ 96#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
97#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */
97 98
98/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ 99/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
99#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ 100#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
@@ -192,11 +193,11 @@ extern const char * const x86_power_flags[32];
192#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability)) 193#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability))
193#define setup_clear_cpu_cap(bit) do { \ 194#define setup_clear_cpu_cap(bit) do { \
194 clear_cpu_cap(&boot_cpu_data, bit); \ 195 clear_cpu_cap(&boot_cpu_data, bit); \
195 set_bit(bit, (unsigned long *)cleared_cpu_caps); \ 196 set_bit(bit, (unsigned long *)cpu_caps_cleared); \
196} while (0) 197} while (0)
197#define setup_force_cpu_cap(bit) do { \ 198#define setup_force_cpu_cap(bit) do { \
198 set_cpu_cap(&boot_cpu_data, bit); \ 199 set_cpu_cap(&boot_cpu_data, bit); \
199 clear_bit(bit, (unsigned long *)cleared_cpu_caps); \ 200 set_bit(bit, (unsigned long *)cpu_caps_set); \
200} while (0) 201} while (0)
201 202
202#define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU) 203#define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU)
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index b762ea49bd70..3bd1777a4c8b 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -63,7 +63,26 @@ extern unsigned long io_apic_irqs;
63extern void init_VISWS_APIC_irqs(void); 63extern void init_VISWS_APIC_irqs(void);
64extern void setup_IO_APIC(void); 64extern void setup_IO_APIC(void);
65extern void disable_IO_APIC(void); 65extern void disable_IO_APIC(void);
66extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn); 66
67struct io_apic_irq_attr {
68 int ioapic;
69 int ioapic_pin;
70 int trigger;
71 int polarity;
72};
73
74static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
75 int ioapic, int ioapic_pin,
76 int trigger, int polarity)
77{
78 irq_attr->ioapic = ioapic;
79 irq_attr->ioapic_pin = ioapic_pin;
80 irq_attr->trigger = trigger;
81 irq_attr->polarity = polarity;
82}
83
84extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin,
85 struct io_apic_irq_attr *irq_attr);
67extern void setup_ioapic_dest(void); 86extern void setup_ioapic_dest(void);
68 87
69extern void enable_IO_APIC(void); 88extern void enable_IO_APIC(void);
@@ -78,7 +97,11 @@ extern void eisa_set_level_irq(unsigned int irq);
78/* SMP */ 97/* SMP */
79extern void smp_apic_timer_interrupt(struct pt_regs *); 98extern void smp_apic_timer_interrupt(struct pt_regs *);
80extern void smp_spurious_interrupt(struct pt_regs *); 99extern void smp_spurious_interrupt(struct pt_regs *);
100extern void smp_generic_interrupt(struct pt_regs *);
81extern void smp_error_interrupt(struct pt_regs *); 101extern void smp_error_interrupt(struct pt_regs *);
102#ifdef CONFIG_X86_IO_APIC
103extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
104#endif
82#ifdef CONFIG_SMP 105#ifdef CONFIG_SMP
83extern void smp_reschedule_interrupt(struct pt_regs *); 106extern void smp_reschedule_interrupt(struct pt_regs *);
84extern void smp_call_function_interrupt(struct pt_regs *); 107extern void smp_call_function_interrupt(struct pt_regs *);
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 71c9e5183982..175adf58dd4f 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -67,7 +67,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
67 ".previous\n" 67 ".previous\n"
68 _ASM_EXTABLE(1b, 3b) 68 _ASM_EXTABLE(1b, 3b)
69 : [err] "=r" (err) 69 : [err] "=r" (err)
70#if 0 /* See comment in __save_init_fpu() below. */ 70#if 0 /* See comment in fxsave() below. */
71 : [fx] "r" (fx), "m" (*fx), "0" (0)); 71 : [fx] "r" (fx), "m" (*fx), "0" (0));
72#else 72#else
73 : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0)); 73 : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
@@ -75,14 +75,6 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
75 return err; 75 return err;
76} 76}
77 77
78static inline int restore_fpu_checking(struct task_struct *tsk)
79{
80 if (task_thread_info(tsk)->status & TS_XSAVE)
81 return xrstor_checking(&tsk->thread.xstate->xsave);
82 else
83 return fxrstor_checking(&tsk->thread.xstate->fxsave);
84}
85
86/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception 78/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
87 is pending. Clear the x87 state here by setting it to fixed 79 is pending. Clear the x87 state here by setting it to fixed
88 values. The kernel data segment can be sometimes 0 and sometimes 80 values. The kernel data segment can be sometimes 0 and sometimes
@@ -120,7 +112,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
120 ".previous\n" 112 ".previous\n"
121 _ASM_EXTABLE(1b, 3b) 113 _ASM_EXTABLE(1b, 3b)
122 : [err] "=r" (err), "=m" (*fx) 114 : [err] "=r" (err), "=m" (*fx)
123#if 0 /* See comment in __fxsave_clear() below. */ 115#if 0 /* See comment in fxsave() below. */
124 : [fx] "r" (fx), "0" (0)); 116 : [fx] "r" (fx), "0" (0));
125#else 117#else
126 : [fx] "cdaSDb" (fx), "0" (0)); 118 : [fx] "cdaSDb" (fx), "0" (0));
@@ -185,12 +177,9 @@ static inline void tolerant_fwait(void)
185 asm volatile("fnclex ; fwait"); 177 asm volatile("fnclex ; fwait");
186} 178}
187 179
188static inline void restore_fpu(struct task_struct *tsk) 180/* perform fxrstor iff the processor has extended states, otherwise frstor */
181static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
189{ 182{
190 if (task_thread_info(tsk)->status & TS_XSAVE) {
191 xrstor_checking(&tsk->thread.xstate->xsave);
192 return;
193 }
194 /* 183 /*
195 * The "nop" is needed to make the instructions the same 184 * The "nop" is needed to make the instructions the same
196 * length. 185 * length.
@@ -199,7 +188,9 @@ static inline void restore_fpu(struct task_struct *tsk)
199 "nop ; frstor %1", 188 "nop ; frstor %1",
200 "fxrstor %1", 189 "fxrstor %1",
201 X86_FEATURE_FXSR, 190 X86_FEATURE_FXSR,
202 "m" (tsk->thread.xstate->fxsave)); 191 "m" (*fx));
192
193 return 0;
203} 194}
204 195
205/* We need a safe address that is cheap to find and that is already 196/* We need a safe address that is cheap to find and that is already
@@ -262,6 +253,14 @@ end:
262 253
263#endif /* CONFIG_X86_64 */ 254#endif /* CONFIG_X86_64 */
264 255
256static inline int restore_fpu_checking(struct task_struct *tsk)
257{
258 if (task_thread_info(tsk)->status & TS_XSAVE)
259 return xrstor_checking(&tsk->thread.xstate->xsave);
260 else
261 return fxrstor_checking(&tsk->thread.xstate->fxsave);
262}
263
265/* 264/*
266 * Signal frame handlers... 265 * Signal frame handlers...
267 */ 266 */
@@ -305,18 +304,18 @@ static inline void kernel_fpu_end(void)
305/* 304/*
306 * Some instructions like VIA's padlock instructions generate a spurious 305 * Some instructions like VIA's padlock instructions generate a spurious
307 * DNA fault but don't modify SSE registers. And these instructions 306 * DNA fault but don't modify SSE registers. And these instructions
308 * get used from interrupt context aswell. To prevent these kernel instructions 307 * get used from interrupt context as well. To prevent these kernel instructions
309 * in interrupt context interact wrongly with other user/kernel fpu usage, we 308 * in interrupt context interacting wrongly with other user/kernel fpu usage, we
310 * should use them only in the context of irq_ts_save/restore() 309 * should use them only in the context of irq_ts_save/restore()
311 */ 310 */
312static inline int irq_ts_save(void) 311static inline int irq_ts_save(void)
313{ 312{
314 /* 313 /*
315 * If we are in process context, we are ok to take a spurious DNA fault. 314 * If in process context and not atomic, we can take a spurious DNA fault.
316 * Otherwise, doing clts() in process context require pre-emption to 315 * Otherwise, doing clts() in process context requires disabling preemption
317 * be disabled or some heavy lifting like kernel_fpu_begin() 316 * or some heavy lifting like kernel_fpu_begin()
318 */ 317 */
319 if (!in_interrupt()) 318 if (!in_atomic())
320 return 0; 319 return 0;
321 320
322 if (read_cr0() & X86_CR0_TS) { 321 if (read_cr0() & X86_CR0_TS) {
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index 1a99e6c092af..58d7091eeb1f 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -60,8 +60,4 @@ extern struct irq_chip i8259A_chip;
60extern void mask_8259A(void); 60extern void mask_8259A(void);
61extern void unmask_8259A(void); 61extern void unmask_8259A(void);
62 62
63#ifdef CONFIG_X86_32
64extern void init_ISA_irqs(void);
65#endif
66
67#endif /* _ASM_X86_I8259_H */ 63#endif /* _ASM_X86_I8259_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 9d826e436010..daf866ed0612 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -154,22 +154,19 @@ extern int timer_through_8259;
154extern int io_apic_get_unique_id(int ioapic, int apic_id); 154extern int io_apic_get_unique_id(int ioapic, int apic_id);
155extern int io_apic_get_version(int ioapic); 155extern int io_apic_get_version(int ioapic);
156extern int io_apic_get_redir_entries(int ioapic); 156extern int io_apic_get_redir_entries(int ioapic);
157extern int io_apic_set_pci_routing(int ioapic, int pin, int irq,
158 int edge_level, int active_high_low);
159#endif /* CONFIG_ACPI */ 157#endif /* CONFIG_ACPI */
160 158
159struct io_apic_irq_attr;
160extern int io_apic_set_pci_routing(struct device *dev, int irq,
161 struct io_apic_irq_attr *irq_attr);
161extern int (*ioapic_renumber_irq)(int ioapic, int irq); 162extern int (*ioapic_renumber_irq)(int ioapic, int irq);
162extern void ioapic_init_mappings(void); 163extern void ioapic_init_mappings(void);
163 164
164#ifdef CONFIG_X86_64
165extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); 165extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
166extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); 166extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
167extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); 167extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
168extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); 168extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
169extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); 169extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
170extern void reinit_intr_remapped_IO_APIC(int intr_remapping,
171 struct IO_APIC_route_entry **ioapic_entries);
172#endif
173 170
174extern void probe_nr_irqs_gsi(void); 171extern void probe_nr_irqs_gsi(void);
175 172
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
index 86af26091d6c..0e9fe1d9d971 100644
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -1,3 +1,6 @@
1#ifndef _ASM_X86_IOMAP_H
2#define _ASM_X86_IOMAP_H
3
1/* 4/*
2 * Copyright © 2008 Ingo Molnar 5 * Copyright © 2008 Ingo Molnar
3 * 6 *
@@ -31,3 +34,5 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
31 34
32void 35void
33iounmap_atomic(void *kvaddr, enum km_type type); 36iounmap_atomic(void *kvaddr, enum km_type type);
37
38#endif /* _ASM_X86_IOMAP_H */
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 0396760fccb8..f275e2244505 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -1,6 +1,6 @@
1#ifndef _ASM_X86_IRQ_REMAPPING_H 1#ifndef _ASM_X86_IRQ_REMAPPING_H
2#define _ASM_X86_IRQ_REMAPPING_H 2#define _ASM_X86_IRQ_REMAPPING_H
3 3
4#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8) 4#define IRTE_DEST(dest) ((x2apic_mode) ? (dest) : (dest) << 8)
5 5
6#endif /* _ASM_X86_IRQ_REMAPPING_H */ 6#endif /* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 3cbd79bbb47c..910b5a3d6751 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -34,6 +34,7 @@
34 34
35#ifdef CONFIG_X86_32 35#ifdef CONFIG_X86_32
36# define SYSCALL_VECTOR 0x80 36# define SYSCALL_VECTOR 0x80
37# define IA32_SYSCALL_VECTOR 0x80
37#else 38#else
38# define IA32_SYSCALL_VECTOR 0x80 39# define IA32_SYSCALL_VECTOR 0x80
39#endif 40#endif
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index 54c8cc53b24d..c2d1f3b58e5f 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -12,4 +12,17 @@ extern int cache_k8_northbridges(void);
12extern void k8_flush_garts(void); 12extern void k8_flush_garts(void);
13extern int k8_scan_nodes(unsigned long start, unsigned long end); 13extern int k8_scan_nodes(unsigned long start, unsigned long end);
14 14
15#ifdef CONFIG_K8_NB
16static inline struct pci_dev *node_to_k8_nb_misc(int node)
17{
18 return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
19}
20#else
21static inline struct pci_dev *node_to_k8_nb_misc(int node)
22{
23 return NULL;
24}
25#endif
26
27
15#endif /* _ASM_X86_K8_H */ 28#endif /* _ASM_X86_K8_H */
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index c882664716c1..ef51b501e22a 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -9,20 +9,31 @@ struct cpu_signature {
9 9
10struct device; 10struct device;
11 11
12enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND };
13
12struct microcode_ops { 14struct microcode_ops {
13 int (*request_microcode_user) (int cpu, const void __user *buf, size_t size); 15 enum ucode_state (*request_microcode_user) (int cpu,
14 int (*request_microcode_fw) (int cpu, struct device *device); 16 const void __user *buf, size_t size);
15 17
16 void (*apply_microcode) (int cpu); 18 enum ucode_state (*request_microcode_fw) (int cpu,
19 struct device *device);
17 20
18 int (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
19 void (*microcode_fini_cpu) (int cpu); 21 void (*microcode_fini_cpu) (int cpu);
22
23 /*
24 * The generic 'microcode_core' part guarantees that
25 * the callbacks below run on a target cpu when they
26 * are being called.
27 * See also the "Synchronization" section in microcode_core.c.
28 */
29 int (*apply_microcode) (int cpu);
30 int (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
20}; 31};
21 32
22struct ucode_cpu_info { 33struct ucode_cpu_info {
23 struct cpu_signature cpu_sig; 34 struct cpu_signature cpu_sig;
24 int valid; 35 int valid;
25 void *mc; 36 void *mc;
26}; 37};
27extern struct ucode_cpu_info ucode_cpu_info[]; 38extern struct ucode_cpu_info ucode_cpu_info[];
28 39
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 642fc7fc8cdc..e2a1bb6d71ea 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -61,9 +61,11 @@ extern void get_smp_config(void);
61#ifdef CONFIG_X86_MPPARSE 61#ifdef CONFIG_X86_MPPARSE
62extern void find_smp_config(void); 62extern void find_smp_config(void);
63extern void early_reserve_e820_mpc_new(void); 63extern void early_reserve_e820_mpc_new(void);
64extern int enable_update_mptable;
64#else 65#else
65static inline void find_smp_config(void) { } 66static inline void find_smp_config(void) { }
66static inline void early_reserve_e820_mpc_new(void) { } 67static inline void early_reserve_e820_mpc_new(void) { }
68#define enable_update_mptable 0
67#endif 69#endif
68 70
69void __cpuinit generic_processor_info(int apicid, int version); 71void __cpuinit generic_processor_info(int apicid, int version);
@@ -72,20 +74,13 @@ extern void mp_register_ioapic(int id, u32 address, u32 gsi_base);
72extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, 74extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
73 u32 gsi); 75 u32 gsi);
74extern void mp_config_acpi_legacy_irqs(void); 76extern void mp_config_acpi_legacy_irqs(void);
75extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low); 77struct device;
78extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
79 int active_high_low);
76extern int acpi_probe_gsi(void); 80extern int acpi_probe_gsi(void);
77#ifdef CONFIG_X86_IO_APIC 81#ifdef CONFIG_X86_IO_APIC
78extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
79 u32 gsi, int triggering, int polarity);
80extern int mp_find_ioapic(int gsi); 82extern int mp_find_ioapic(int gsi);
81extern int mp_find_ioapic_pin(int ioapic, int gsi); 83extern int mp_find_ioapic_pin(int ioapic, int gsi);
82#else
83static inline int
84mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
85 u32 gsi, int triggering, int polarity)
86{
87 return 0;
88}
89#endif 84#endif
90#else /* !CONFIG_ACPI: */ 85#else /* !CONFIG_ACPI: */
91static inline int acpi_probe_gsi(void) 86static inline int acpi_probe_gsi(void)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index ec41fc16c167..4d58d04fca83 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -121,7 +121,6 @@
121#define MSR_K8_TOP_MEM1 0xc001001a 121#define MSR_K8_TOP_MEM1 0xc001001a
122#define MSR_K8_TOP_MEM2 0xc001001d 122#define MSR_K8_TOP_MEM2 0xc001001d
123#define MSR_K8_SYSCFG 0xc0010010 123#define MSR_K8_SYSCFG 0xc0010010
124#define MSR_K8_HWCR 0xc0010015
125#define MSR_K8_INT_PENDING_MSG 0xc0010055 124#define MSR_K8_INT_PENDING_MSG 0xc0010055
126/* C1E active bits in int pending message */ 125/* C1E active bits in int pending message */
127#define K8_INTP_C1E_ACTIVE_MASK 0x18000000 126#define K8_INTP_C1E_ACTIVE_MASK 0x18000000
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c45a0a568dff..c97264409934 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -64,7 +64,7 @@ static inline int nmi_watchdog_active(void)
64 * but since they are power of two we could use a 64 * but since they are power of two we could use a
65 * cheaper way --cvg 65 * cheaper way --cvg
66 */ 66 */
67 return nmi_watchdog & 0x3; 67 return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC);
68} 68}
69#endif 69#endif
70 70
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 064ed6df4cbe..c4ae822e415f 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -17,9 +17,6 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks,
17extern void numa_init_array(void); 17extern void numa_init_array(void);
18extern int numa_off; 18extern int numa_off;
19 19
20extern void srat_reserve_add_area(int nodeid);
21extern int hotadd_percent;
22
23extern s16 apicid_to_node[MAX_LOCAL_APIC]; 20extern s16 apicid_to_node[MAX_LOCAL_APIC];
24 21
25extern unsigned long numa_free_all_bootmem(void); 22extern unsigned long numa_free_all_bootmem(void);
@@ -27,6 +24,13 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
27 unsigned long end); 24 unsigned long end);
28 25
29#ifdef CONFIG_NUMA 26#ifdef CONFIG_NUMA
27/*
28 * Too small node sizes may confuse the VM badly. Usually they
29 * result from BIOS bugs. So don't recognize nodes as standalone
30 * NUMA entities that have less than this amount of RAM listed:
31 */
32#define NODE_MIN_SIZE (4*1024*1024)
33
30extern void __init init_cpu_to_node(void); 34extern void __init init_cpu_to_node(void);
31extern void __cpuinit numa_set_node(int cpu, int node); 35extern void __cpuinit numa_set_node(int cpu, int node);
32extern void __cpuinit numa_clear_node(int cpu); 36extern void __cpuinit numa_clear_node(int cpu);
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index 0f915ae649a7..6f1b7331313f 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -54,10 +54,6 @@ extern unsigned int __VMALLOC_RESERVE;
54extern int sysctl_legacy_va_layout; 54extern int sysctl_legacy_va_layout;
55 55
56extern void find_low_pfn_range(void); 56extern void find_low_pfn_range(void);
57extern unsigned long init_memory_mapping(unsigned long start,
58 unsigned long end);
59extern void initmem_init(unsigned long, unsigned long);
60extern void free_initmem(void);
61extern void setup_bootmem_allocator(void); 57extern void setup_bootmem_allocator(void);
62 58
63#endif /* !__ASSEMBLY__ */ 59#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index d38c91b70248..8d382d3abf38 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -32,22 +32,14 @@
32 */ 32 */
33#define __PAGE_OFFSET _AC(0xffff880000000000, UL) 33#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
34 34
35#define __PHYSICAL_START CONFIG_PHYSICAL_START 35#define __PHYSICAL_START ((CONFIG_PHYSICAL_START + \
36#define __KERNEL_ALIGN 0x200000 36 (CONFIG_PHYSICAL_ALIGN - 1)) & \
37 37 ~(CONFIG_PHYSICAL_ALIGN - 1))
38/*
39 * Make sure kernel is aligned to 2MB address. Catching it at compile
40 * time is better. Change your config file and compile the kernel
41 * for a 2MB aligned address (CONFIG_PHYSICAL_START)
42 */
43#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0
44#error "CONFIG_PHYSICAL_START must be a multiple of 2MB"
45#endif
46 38
47#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) 39#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
48#define __START_KERNEL_map _AC(0xffffffff80000000, UL) 40#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
49 41
50/* See Documentation/x86_64/mm.txt for a description of the memory map. */ 42/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
51#define __PHYSICAL_MASK_SHIFT 46 43#define __PHYSICAL_MASK_SHIFT 46
52#define __VIRTUAL_MASK_SHIFT 48 44#define __VIRTUAL_MASK_SHIFT 48
53 45
@@ -71,12 +63,6 @@ extern unsigned long __phys_addr(unsigned long);
71 63
72#define vmemmap ((struct page *)VMEMMAP_START) 64#define vmemmap ((struct page *)VMEMMAP_START)
73 65
74extern unsigned long init_memory_mapping(unsigned long start,
75 unsigned long end);
76
77extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
78extern void free_initmem(void);
79
80extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); 66extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
81extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); 67extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
82 68
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 826ad37006ab..6473f5ccff85 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -46,6 +46,12 @@ extern int devmem_is_allowed(unsigned long pagenr);
46extern unsigned long max_low_pfn_mapped; 46extern unsigned long max_low_pfn_mapped;
47extern unsigned long max_pfn_mapped; 47extern unsigned long max_pfn_mapped;
48 48
49extern unsigned long init_memory_mapping(unsigned long start,
50 unsigned long end);
51
52extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
53extern void free_initmem(void);
54
49#endif /* !__ASSEMBLY__ */ 55#endif /* !__ASSEMBLY__ */
50 56
51#endif /* _ASM_X86_PAGE_DEFS_H */ 57#endif /* _ASM_X86_PAGE_DEFS_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index a53da004e08e..4fb37c8a0832 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -56,6 +56,7 @@ struct desc_ptr;
56struct tss_struct; 56struct tss_struct;
57struct mm_struct; 57struct mm_struct;
58struct desc_struct; 58struct desc_struct;
59struct task_struct;
59 60
60/* 61/*
61 * Wrapper type for pointers to code which uses the non-standard 62 * Wrapper type for pointers to code which uses the non-standard
@@ -203,7 +204,8 @@ struct pv_cpu_ops {
203 204
204 void (*swapgs)(void); 205 void (*swapgs)(void);
205 206
206 struct pv_lazy_ops lazy_mode; 207 void (*start_context_switch)(struct task_struct *prev);
208 void (*end_context_switch)(struct task_struct *next);
207}; 209};
208 210
209struct pv_irq_ops { 211struct pv_irq_ops {
@@ -1399,25 +1401,23 @@ enum paravirt_lazy_mode {
1399}; 1401};
1400 1402
1401enum paravirt_lazy_mode paravirt_get_lazy_mode(void); 1403enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
1402void paravirt_enter_lazy_cpu(void); 1404void paravirt_start_context_switch(struct task_struct *prev);
1403void paravirt_leave_lazy_cpu(void); 1405void paravirt_end_context_switch(struct task_struct *next);
1406
1404void paravirt_enter_lazy_mmu(void); 1407void paravirt_enter_lazy_mmu(void);
1405void paravirt_leave_lazy_mmu(void); 1408void paravirt_leave_lazy_mmu(void);
1406void paravirt_leave_lazy(enum paravirt_lazy_mode mode);
1407 1409
1408#define __HAVE_ARCH_ENTER_LAZY_CPU_MODE 1410#define __HAVE_ARCH_START_CONTEXT_SWITCH
1409static inline void arch_enter_lazy_cpu_mode(void) 1411static inline void arch_start_context_switch(struct task_struct *prev)
1410{ 1412{
1411 PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter); 1413 PVOP_VCALL1(pv_cpu_ops.start_context_switch, prev);
1412} 1414}
1413 1415
1414static inline void arch_leave_lazy_cpu_mode(void) 1416static inline void arch_end_context_switch(struct task_struct *next)
1415{ 1417{
1416 PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave); 1418 PVOP_VCALL1(pv_cpu_ops.end_context_switch, next);
1417} 1419}
1418 1420
1419void arch_flush_lazy_cpu_mode(void);
1420
1421#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE 1421#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
1422static inline void arch_enter_lazy_mmu_mode(void) 1422static inline void arch_enter_lazy_mmu_mode(void)
1423{ 1423{
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 29d96d168bc0..18ef7ebf2631 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -81,6 +81,8 @@ static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
81#define pte_val(x) native_pte_val(x) 81#define pte_val(x) native_pte_val(x)
82#define __pte(x) native_make_pte(x) 82#define __pte(x) native_make_pte(x)
83 83
84#define arch_end_context_switch(prev) do {} while(0)
85
84#endif /* CONFIG_PARAVIRT */ 86#endif /* CONFIG_PARAVIRT */
85 87
86/* 88/*
@@ -503,6 +505,8 @@ static inline int pgd_none(pgd_t pgd)
503 505
504#ifndef __ASSEMBLY__ 506#ifndef __ASSEMBLY__
505 507
508extern int direct_gbpages;
509
506/* local pte updates need not use xchg for locking */ 510/* local pte updates need not use xchg for locking */
507static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) 511static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
508{ 512{
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 6b87bc6d5018..abde308fdb0f 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -25,10 +25,6 @@ extern pgd_t init_level4_pgt[];
25 25
26extern void paging_init(void); 26extern void paging_init(void);
27 27
28#endif /* !__ASSEMBLY__ */
29
30#ifndef __ASSEMBLY__
31
32#define pte_ERROR(e) \ 28#define pte_ERROR(e) \
33 printk("%s:%d: bad pte %p(%016lx).\n", \ 29 printk("%s:%d: bad pte %p(%016lx).\n", \
34 __FILE__, __LINE__, &(e), pte_val(e)) 30 __FILE__, __LINE__, &(e), pte_val(e))
@@ -135,8 +131,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
135 131
136#define update_mmu_cache(vma, address, pte) do { } while (0) 132#define update_mmu_cache(vma, address, pte) do { } while (0)
137 133
138extern int direct_gbpages;
139
140/* Encode and de-code a swap entry */ 134/* Encode and de-code a swap entry */
141#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE 135#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
142#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) 136#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index fbf42b8e0383..766ea16fbbbd 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -51,11 +51,11 @@ typedef struct { pteval_t pte; } pte_t;
51#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) 51#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
52#define PGDIR_MASK (~(PGDIR_SIZE - 1)) 52#define PGDIR_MASK (~(PGDIR_SIZE - 1))
53 53
54 54/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
55#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) 55#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
56#define VMALLOC_START _AC(0xffffc20000000000, UL) 56#define VMALLOC_START _AC(0xffffc90000000000, UL)
57#define VMALLOC_END _AC(0xffffe1ffffffffff, UL) 57#define VMALLOC_END _AC(0xffffe8ffffffffff, UL)
58#define VMEMMAP_START _AC(0xffffe20000000000, UL) 58#define VMEMMAP_START _AC(0xffffea0000000000, UL)
59#define MODULES_VADDR _AC(0xffffffffa0000000, UL) 59#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
60#define MODULES_END _AC(0xffffffffff000000, UL) 60#define MODULES_END _AC(0xffffffffff000000, UL)
61#define MODULES_LEN (MODULES_END - MODULES_VADDR) 61#define MODULES_LEN (MODULES_END - MODULES_VADDR)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index b8238dc8786d..4d258ad76a0f 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -273,7 +273,6 @@ typedef struct page *pgtable_t;
273 273
274extern pteval_t __supported_pte_mask; 274extern pteval_t __supported_pte_mask;
275extern int nx_enabled; 275extern int nx_enabled;
276extern void set_nx(void);
277 276
278#define pgprot_writecombine pgprot_writecombine 277#define pgprot_writecombine pgprot_writecombine
279extern pgprot_t pgprot_writecombine(pgprot_t prot); 278extern pgprot_t pgprot_writecombine(pgprot_t prot);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c2cceae709c8..87ede2f31bc7 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -135,7 +135,8 @@ extern struct cpuinfo_x86 boot_cpu_data;
135extern struct cpuinfo_x86 new_cpu_data; 135extern struct cpuinfo_x86 new_cpu_data;
136 136
137extern struct tss_struct doublefault_tss; 137extern struct tss_struct doublefault_tss;
138extern __u32 cleared_cpu_caps[NCAPINTS]; 138extern __u32 cpu_caps_cleared[NCAPINTS];
139extern __u32 cpu_caps_set[NCAPINTS];
139 140
140#ifdef CONFIG_SMP 141#ifdef CONFIG_SMP
141DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 142DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
@@ -409,9 +410,6 @@ DECLARE_PER_CPU(unsigned long, stack_canary);
409extern unsigned int xstate_size; 410extern unsigned int xstate_size;
410extern void free_thread_xstate(struct task_struct *); 411extern void free_thread_xstate(struct task_struct *);
411extern struct kmem_cache *task_xstate_cachep; 412extern struct kmem_cache *task_xstate_cachep;
412extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
413extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
414extern unsigned short num_cache_leaves;
415 413
416struct thread_struct { 414struct thread_struct {
417 /* Cached TLS descriptors: */ 415 /* Cached TLS descriptors: */
@@ -427,8 +425,12 @@ struct thread_struct {
427 unsigned short fsindex; 425 unsigned short fsindex;
428 unsigned short gsindex; 426 unsigned short gsindex;
429#endif 427#endif
428#ifdef CONFIG_X86_32
430 unsigned long ip; 429 unsigned long ip;
430#endif
431#ifdef CONFIG_X86_64
431 unsigned long fs; 432 unsigned long fs;
433#endif
432 unsigned long gs; 434 unsigned long gs;
433 /* Hardware debugging registers: */ 435 /* Hardware debugging registers: */
434 unsigned long debugreg0; 436 unsigned long debugreg0;
@@ -814,6 +816,7 @@ extern unsigned int BIOS_revision;
814 816
815/* Boot loader type from the setup header: */ 817/* Boot loader type from the setup header: */
816extern int bootloader_type; 818extern int bootloader_type;
819extern int bootloader_version;
817 820
818extern char ignore_fpu_irq; 821extern char ignore_fpu_irq;
819 822
@@ -874,7 +877,6 @@ static inline void spin_lock_prefetch(const void *x)
874 .vm86_info = NULL, \ 877 .vm86_info = NULL, \
875 .sysenter_cs = __KERNEL_CS, \ 878 .sysenter_cs = __KERNEL_CS, \
876 .io_bitmap_ptr = NULL, \ 879 .io_bitmap_ptr = NULL, \
877 .fs = __KERNEL_PERCPU, \
878} 880}
879 881
880/* 882/*
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index a4737dddfd58..64cf2d24fad1 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -48,9 +48,15 @@
48#endif 48#endif
49 49
50#ifdef CONFIG_X86_64 50#ifdef CONFIG_X86_64
51#ifdef CONFIG_PARAVIRT
52/* Paravirtualized systems may not have PSE or PGE available */
51#define NEED_PSE 0 53#define NEED_PSE 0
52#define NEED_MSR (1<<(X86_FEATURE_MSR & 31))
53#define NEED_PGE 0 54#define NEED_PGE 0
55#else
56#define NEED_PSE (1<<(X86_FEATURE_PSE) & 31)
57#define NEED_PGE (1<<(X86_FEATURE_PGE) & 31)
58#endif
59#define NEED_MSR (1<<(X86_FEATURE_MSR & 31))
54#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31)) 60#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31))
55#define NEED_XMM (1<<(X86_FEATURE_XMM & 31)) 61#define NEED_XMM (1<<(X86_FEATURE_XMM & 31))
56#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31)) 62#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31))
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index bdc2ada05ae0..4093d1ed6db2 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -33,7 +33,6 @@ struct x86_quirks {
33 int (*setup_ioapic_ids)(void); 33 int (*setup_ioapic_ids)(void);
34}; 34};
35 35
36extern void x86_quirk_pre_intr_init(void);
37extern void x86_quirk_intr_init(void); 36extern void x86_quirk_intr_init(void);
38 37
39extern void x86_quirk_trap_init(void); 38extern void x86_quirk_trap_init(void);
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 19e0d88b966d..6a84ed166aec 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -180,7 +180,7 @@ extern int safe_smp_processor_id(void);
180static inline int logical_smp_processor_id(void) 180static inline int logical_smp_processor_id(void)
181{ 181{
182 /* we don't want to mark this access volatile - bad code generation */ 182 /* we don't want to mark this access volatile - bad code generation */
183 return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR)); 183 return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
184} 184}
185 185
186#endif 186#endif
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index e3cc3c063ec5..4517d6b93188 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -27,7 +27,7 @@
27#else /* CONFIG_X86_32 */ 27#else /* CONFIG_X86_32 */
28# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ 28# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
29# define MAX_PHYSADDR_BITS 44 29# define MAX_PHYSADDR_BITS 44
30# define MAX_PHYSMEM_BITS 44 /* Can be max 45 bits */ 30# define MAX_PHYSMEM_BITS 46
31#endif 31#endif
32 32
33#endif /* CONFIG_SPARSEMEM */ 33#endif /* CONFIG_SPARSEMEM */
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 7043408f6904..372b76edd63f 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * syscalls.h - Linux syscall interfaces (arch-specific) 2 * syscalls.h - Linux syscall interfaces (arch-specific)
3 * 3 *
4 * Copyright (c) 2008 Jaswinder Singh 4 * Copyright (c) 2008 Jaswinder Singh Rajput
5 * 5 *
6 * This file is released under the GPLv2. 6 * This file is released under the GPLv2.
7 * See the file COPYING for more details. 7 * See the file COPYING for more details.
@@ -12,50 +12,55 @@
12 12
13#include <linux/compiler.h> 13#include <linux/compiler.h>
14#include <linux/linkage.h> 14#include <linux/linkage.h>
15#include <linux/types.h>
16#include <linux/signal.h> 15#include <linux/signal.h>
16#include <linux/types.h>
17 17
18/* Common in X86_32 and X86_64 */ 18/* Common in X86_32 and X86_64 */
19/* kernel/ioport.c */ 19/* kernel/ioport.c */
20asmlinkage long sys_ioperm(unsigned long, unsigned long, int); 20asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
21 21
22/* kernel/process.c */
23int sys_fork(struct pt_regs *);
24int sys_vfork(struct pt_regs *);
25
22/* kernel/ldt.c */ 26/* kernel/ldt.c */
23asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); 27asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
24 28
29/* kernel/signal.c */
30long sys_rt_sigreturn(struct pt_regs *);
31
25/* kernel/tls.c */ 32/* kernel/tls.c */
26asmlinkage int sys_set_thread_area(struct user_desc __user *); 33asmlinkage int sys_set_thread_area(struct user_desc __user *);
27asmlinkage int sys_get_thread_area(struct user_desc __user *); 34asmlinkage int sys_get_thread_area(struct user_desc __user *);
28 35
29/* X86_32 only */ 36/* X86_32 only */
30#ifdef CONFIG_X86_32 37#ifdef CONFIG_X86_32
38/* kernel/ioport.c */
39long sys_iopl(struct pt_regs *);
40
31/* kernel/process_32.c */ 41/* kernel/process_32.c */
32int sys_fork(struct pt_regs *);
33int sys_clone(struct pt_regs *); 42int sys_clone(struct pt_regs *);
34int sys_vfork(struct pt_regs *);
35int sys_execve(struct pt_regs *); 43int sys_execve(struct pt_regs *);
36 44
37/* kernel/signal_32.c */ 45/* kernel/signal.c */
38asmlinkage int sys_sigsuspend(int, int, old_sigset_t); 46asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
39asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, 47asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
40 struct old_sigaction __user *); 48 struct old_sigaction __user *);
41int sys_sigaltstack(struct pt_regs *); 49int sys_sigaltstack(struct pt_regs *);
42unsigned long sys_sigreturn(struct pt_regs *); 50unsigned long sys_sigreturn(struct pt_regs *);
43long sys_rt_sigreturn(struct pt_regs *);
44
45/* kernel/ioport.c */
46long sys_iopl(struct pt_regs *);
47 51
48/* kernel/sys_i386_32.c */ 52/* kernel/sys_i386_32.c */
53struct mmap_arg_struct;
54struct sel_arg_struct;
55struct oldold_utsname;
56struct old_utsname;
57
49asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, 58asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
50 unsigned long, unsigned long, unsigned long); 59 unsigned long, unsigned long, unsigned long);
51struct mmap_arg_struct;
52asmlinkage int old_mmap(struct mmap_arg_struct __user *); 60asmlinkage int old_mmap(struct mmap_arg_struct __user *);
53struct sel_arg_struct;
54asmlinkage int old_select(struct sel_arg_struct __user *); 61asmlinkage int old_select(struct sel_arg_struct __user *);
55asmlinkage int sys_ipc(uint, int, int, int, void __user *, long); 62asmlinkage int sys_ipc(uint, int, int, int, void __user *, long);
56struct old_utsname;
57asmlinkage int sys_uname(struct old_utsname __user *); 63asmlinkage int sys_uname(struct old_utsname __user *);
58struct oldold_utsname;
59asmlinkage int sys_olduname(struct oldold_utsname __user *); 64asmlinkage int sys_olduname(struct oldold_utsname __user *);
60 65
61/* kernel/vm86_32.c */ 66/* kernel/vm86_32.c */
@@ -65,29 +70,27 @@ int sys_vm86(struct pt_regs *);
65#else /* CONFIG_X86_32 */ 70#else /* CONFIG_X86_32 */
66 71
67/* X86_64 only */ 72/* X86_64 only */
73/* kernel/ioport.c */
74asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
75
68/* kernel/process_64.c */ 76/* kernel/process_64.c */
69asmlinkage long sys_fork(struct pt_regs *);
70asmlinkage long sys_clone(unsigned long, unsigned long, 77asmlinkage long sys_clone(unsigned long, unsigned long,
71 void __user *, void __user *, 78 void __user *, void __user *,
72 struct pt_regs *); 79 struct pt_regs *);
73asmlinkage long sys_vfork(struct pt_regs *);
74asmlinkage long sys_execve(char __user *, char __user * __user *, 80asmlinkage long sys_execve(char __user *, char __user * __user *,
75 char __user * __user *, 81 char __user * __user *,
76 struct pt_regs *); 82 struct pt_regs *);
77long sys_arch_prctl(int, unsigned long); 83long sys_arch_prctl(int, unsigned long);
78 84
79/* kernel/ioport.c */ 85/* kernel/signal.c */
80asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
81
82/* kernel/signal_64.c */
83asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *, 86asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *,
84 struct pt_regs *); 87 struct pt_regs *);
85long sys_rt_sigreturn(struct pt_regs *);
86 88
87/* kernel/sys_x86_64.c */ 89/* kernel/sys_x86_64.c */
90struct new_utsname;
91
88asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, 92asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long,
89 unsigned long, unsigned long, unsigned long); 93 unsigned long, unsigned long, unsigned long);
90struct new_utsname;
91asmlinkage long sys_uname(struct new_utsname __user *); 94asmlinkage long sys_uname(struct new_utsname __user *);
92 95
93#endif /* CONFIG_X86_32 */ 96#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8820a73ae090..602c769fc98c 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -94,7 +94,8 @@ struct thread_info {
94#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ 94#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
95#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ 95#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
96#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ 96#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
97#define TIF_SYSCALL_FTRACE 27 /* for ftrace syscall instrumentation */ 97#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
98#define TIF_SYSCALL_FTRACE 28 /* for ftrace syscall instrumentation */
98 99
99#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) 100#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
100#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) 101#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -116,6 +117,7 @@ struct thread_info {
116#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) 117#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
117#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) 118#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
118#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) 119#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
120#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
119#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE) 121#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE)
120 122
121/* work to do in syscall_trace_enter() */ 123/* work to do in syscall_trace_enter() */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index f44b49abca49..066ef590d7e0 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -203,7 +203,8 @@ struct pci_bus;
203void x86_pci_root_bus_res_quirks(struct pci_bus *b); 203void x86_pci_root_bus_res_quirks(struct pci_bus *b);
204 204
205#ifdef CONFIG_SMP 205#ifdef CONFIG_SMP
206#define mc_capable() (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids) 206#define mc_capable() ((boot_cpu_data.x86_max_cores > 1) && \
207 (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids))
207#define smt_capable() (smp_num_siblings > 1) 208#define smt_capable() (smp_num_siblings > 1)
208#endif 209#endif
209 210
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 0d5342515b86..bfd74c032fca 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -2,6 +2,7 @@
2#define _ASM_X86_TRAPS_H 2#define _ASM_X86_TRAPS_H
3 3
4#include <asm/debugreg.h> 4#include <asm/debugreg.h>
5#include <asm/siginfo.h> /* TRAP_TRACE, ... */
5 6
6#ifdef CONFIG_X86_32 7#ifdef CONFIG_X86_32
7#define dotraplinkage 8#define dotraplinkage
@@ -13,6 +14,9 @@ asmlinkage void divide_error(void);
13asmlinkage void debug(void); 14asmlinkage void debug(void);
14asmlinkage void nmi(void); 15asmlinkage void nmi(void);
15asmlinkage void int3(void); 16asmlinkage void int3(void);
17asmlinkage void xen_debug(void);
18asmlinkage void xen_int3(void);
19asmlinkage void xen_stack_segment(void);
16asmlinkage void overflow(void); 20asmlinkage void overflow(void);
17asmlinkage void bounds(void); 21asmlinkage void bounds(void);
18asmlinkage void invalid_op(void); 22asmlinkage void invalid_op(void);
@@ -74,7 +78,6 @@ static inline int get_si_code(unsigned long condition)
74} 78}
75 79
76extern int panic_on_unrecovered_nmi; 80extern int panic_on_unrecovered_nmi;
77extern int kstack_depth_to_print;
78 81
79void math_error(void __user *); 82void math_error(void __user *);
80void math_emulate(struct math_emu_info *); 83void math_emulate(struct math_emu_info *);
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 9b0e61bf7a88..bddd44f2f0ab 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -37,7 +37,7 @@
37#define UV_CPUS_PER_ACT_STATUS 32 37#define UV_CPUS_PER_ACT_STATUS 32
38#define UV_ACT_STATUS_MASK 0x3 38#define UV_ACT_STATUS_MASK 0x3
39#define UV_ACT_STATUS_SIZE 2 39#define UV_ACT_STATUS_SIZE 2
40#define UV_ACTIVATION_DESCRIPTOR_SIZE 32 40#define UV_ADP_SIZE 32
41#define UV_DISTRIBUTION_SIZE 256 41#define UV_DISTRIBUTION_SIZE 256
42#define UV_SW_ACK_NPENDING 8 42#define UV_SW_ACK_NPENDING 8
43#define UV_NET_ENDPOINT_INTD 0x38 43#define UV_NET_ENDPOINT_INTD 0x38
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index d3a98ea1062e..341070f7ad5c 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -133,6 +133,7 @@ struct uv_scir_s {
133struct uv_hub_info_s { 133struct uv_hub_info_s {
134 unsigned long global_mmr_base; 134 unsigned long global_mmr_base;
135 unsigned long gpa_mask; 135 unsigned long gpa_mask;
136 unsigned int gnode_extra;
136 unsigned long gnode_upper; 137 unsigned long gnode_upper;
137 unsigned long lowmem_remap_top; 138 unsigned long lowmem_remap_top;
138 unsigned long lowmem_remap_base; 139 unsigned long lowmem_remap_base;
@@ -159,7 +160,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
159 * p - PNODE (local part of nsids, right shifted 1) 160 * p - PNODE (local part of nsids, right shifted 1)
160 */ 161 */
161#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask) 162#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask)
162#define UV_PNODE_TO_NASID(p) (((p) << 1) | uv_hub_info->gnode_upper) 163#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra)
164#define UV_PNODE_TO_NASID(p) (UV_PNODE_TO_GNODE(p) << 1)
163 165
164#define UV_LOCAL_MMR_BASE 0xf4000000UL 166#define UV_LOCAL_MMR_BASE 0xf4000000UL
165#define UV_GLOBAL_MMR32_BASE 0xf8000000UL 167#define UV_GLOBAL_MMR32_BASE 0xf8000000UL
@@ -173,7 +175,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
173#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT)) 175#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT))
174 176
175#define UV_GLOBAL_MMR64_PNODE_BITS(p) \ 177#define UV_GLOBAL_MMR64_PNODE_BITS(p) \
176 ((unsigned long)(p) << UV_GLOBAL_MMR64_PNODE_SHIFT) 178 ((unsigned long)(UV_PNODE_TO_GNODE(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT)
177 179
178#define UV_APIC_PNODE_SHIFT 6 180#define UV_APIC_PNODE_SHIFT 6
179 181
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 88d1bfc847d3..235f5927bb97 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -28,7 +28,7 @@ CFLAGS_paravirt.o := $(nostackp)
28obj-y := process_$(BITS).o signal.o entry_$(BITS).o 28obj-y := process_$(BITS).o signal.o entry_$(BITS).o
29obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 29obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
30obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o 30obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
31obj-y += setup.o i8259.o irqinit_$(BITS).o 31obj-y += setup.o i8259.o irqinit.o
32obj-$(CONFIG_X86_VISWS) += visws_quirks.o 32obj-$(CONFIG_X86_VISWS) += visws_quirks.o
33obj-$(CONFIG_X86_32) += probe_roms_32.o 33obj-$(CONFIG_X86_32) += probe_roms_32.o
34obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 34obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 723989d7f802..631086159c53 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -33,6 +33,7 @@
33#include <linux/irq.h> 33#include <linux/irq.h>
34#include <linux/bootmem.h> 34#include <linux/bootmem.h>
35#include <linux/ioport.h> 35#include <linux/ioport.h>
36#include <linux/pci.h>
36 37
37#include <asm/pgtable.h> 38#include <asm/pgtable.h>
38#include <asm/io_apic.h> 39#include <asm/io_apic.h>
@@ -522,7 +523,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
522 * success: return IRQ number (>=0) 523 * success: return IRQ number (>=0)
523 * failure: return < 0 524 * failure: return < 0
524 */ 525 */
525int acpi_register_gsi(u32 gsi, int triggering, int polarity) 526int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
526{ 527{
527 unsigned int irq; 528 unsigned int irq;
528 unsigned int plat_gsi = gsi; 529 unsigned int plat_gsi = gsi;
@@ -532,14 +533,14 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity)
532 * Make sure all (legacy) PCI IRQs are set as level-triggered. 533 * Make sure all (legacy) PCI IRQs are set as level-triggered.
533 */ 534 */
534 if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { 535 if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
535 if (triggering == ACPI_LEVEL_SENSITIVE) 536 if (trigger == ACPI_LEVEL_SENSITIVE)
536 eisa_set_level_irq(gsi); 537 eisa_set_level_irq(gsi);
537 } 538 }
538#endif 539#endif
539 540
540#ifdef CONFIG_X86_IO_APIC 541#ifdef CONFIG_X86_IO_APIC
541 if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { 542 if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
542 plat_gsi = mp_register_gsi(gsi, triggering, polarity); 543 plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
543 } 544 }
544#endif 545#endif
545 acpi_gsi_to_irq(plat_gsi, &irq); 546 acpi_gsi_to_irq(plat_gsi, &irq);
@@ -903,10 +904,8 @@ extern int es7000_plat;
903#endif 904#endif
904 905
905static struct { 906static struct {
906 int apic_id;
907 int gsi_base; 907 int gsi_base;
908 int gsi_end; 908 int gsi_end;
909 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
910} mp_ioapic_routing[MAX_IO_APICS]; 909} mp_ioapic_routing[MAX_IO_APICS];
911 910
912int mp_find_ioapic(int gsi) 911int mp_find_ioapic(int gsi)
@@ -986,16 +985,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
986 985
987 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); 986 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
988 mp_ioapics[idx].apicid = uniq_ioapic_id(id); 987 mp_ioapics[idx].apicid = uniq_ioapic_id(id);
989#ifdef CONFIG_X86_32
990 mp_ioapics[idx].apicver = io_apic_get_version(idx); 988 mp_ioapics[idx].apicver = io_apic_get_version(idx);
991#else 989
992 mp_ioapics[idx].apicver = 0;
993#endif
994 /* 990 /*
995 * Build basic GSI lookup table to facilitate gsi->io_apic lookups 991 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
996 * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 992 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
997 */ 993 */
998 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid;
999 mp_ioapic_routing[idx].gsi_base = gsi_base; 994 mp_ioapic_routing[idx].gsi_base = gsi_base;
1000 mp_ioapic_routing[idx].gsi_end = gsi_base + 995 mp_ioapic_routing[idx].gsi_end = gsi_base +
1001 io_apic_get_redir_entries(idx); 996 io_apic_get_redir_entries(idx);
@@ -1158,26 +1153,52 @@ void __init mp_config_acpi_legacy_irqs(void)
1158 } 1153 }
1159} 1154}
1160 1155
1161int mp_register_gsi(u32 gsi, int triggering, int polarity) 1156static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
1157 int polarity)
1162{ 1158{
1159#ifdef CONFIG_X86_MPPARSE
1160 struct mpc_intsrc mp_irq;
1161 struct pci_dev *pdev;
1162 unsigned char number;
1163 unsigned int devfn;
1163 int ioapic; 1164 int ioapic;
1164 int ioapic_pin; 1165 u8 pin;
1165#ifdef CONFIG_X86_32
1166#define MAX_GSI_NUM 4096
1167#define IRQ_COMPRESSION_START 64
1168 1166
1169 static int pci_irq = IRQ_COMPRESSION_START; 1167 if (!acpi_ioapic)
1170 /* 1168 return 0;
1171 * Mapping between Global System Interrupts, which 1169 if (!dev)
1172 * represent all possible interrupts, and IRQs 1170 return 0;
1173 * assigned to actual devices. 1171 if (dev->bus != &pci_bus_type)
1174 */ 1172 return 0;
1175 static int gsi_to_irq[MAX_GSI_NUM]; 1173
1176#else 1174 pdev = to_pci_dev(dev);
1175 number = pdev->bus->number;
1176 devfn = pdev->devfn;
1177 pin = pdev->pin;
1178 /* print the entry should happen on mptable identically */
1179 mp_irq.type = MP_INTSRC;
1180 mp_irq.irqtype = mp_INT;
1181 mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
1182 (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
1183 mp_irq.srcbus = number;
1184 mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
1185 ioapic = mp_find_ioapic(gsi);
1186 mp_irq.dstapic = mp_ioapics[ioapic].apicid;
1187 mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
1188
1189 save_mp_irq(&mp_irq);
1190#endif
1191 return 0;
1192}
1193
1194int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
1195{
1196 int ioapic;
1197 int ioapic_pin;
1198 struct io_apic_irq_attr irq_attr;
1177 1199
1178 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) 1200 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
1179 return gsi; 1201 return gsi;
1180#endif
1181 1202
1182 /* Don't set up the ACPI SCI because it's already set up */ 1203 /* Don't set up the ACPI SCI because it's already set up */
1183 if (acpi_gbl_FADT.sci_interrupt == gsi) 1204 if (acpi_gbl_FADT.sci_interrupt == gsi)
@@ -1196,93 +1217,22 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
1196 gsi = ioapic_renumber_irq(ioapic, gsi); 1217 gsi = ioapic_renumber_irq(ioapic, gsi);
1197#endif 1218#endif
1198 1219
1199 /*
1200 * Avoid pin reprogramming. PRTs typically include entries
1201 * with redundant pin->gsi mappings (but unique PCI devices);
1202 * we only program the IOAPIC on the first.
1203 */
1204 if (ioapic_pin > MP_MAX_IOAPIC_PIN) { 1220 if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
1205 printk(KERN_ERR "Invalid reference to IOAPIC pin " 1221 printk(KERN_ERR "Invalid reference to IOAPIC pin "
1206 "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, 1222 "%d-%d\n", mp_ioapics[ioapic].apicid,
1207 ioapic_pin); 1223 ioapic_pin);
1208 return gsi; 1224 return gsi;
1209 } 1225 }
1210 if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
1211 pr_debug("Pin %d-%d already programmed\n",
1212 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
1213#ifdef CONFIG_X86_32
1214 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
1215#else
1216 return gsi;
1217#endif
1218 }
1219
1220 set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
1221#ifdef CONFIG_X86_32
1222 /*
1223 * For GSI >= 64, use IRQ compression
1224 */
1225 if ((gsi >= IRQ_COMPRESSION_START)
1226 && (triggering == ACPI_LEVEL_SENSITIVE)) {
1227 /*
1228 * For PCI devices assign IRQs in order, avoiding gaps
1229 * due to unused I/O APIC pins.
1230 */
1231 int irq = gsi;
1232 if (gsi < MAX_GSI_NUM) {
1233 /*
1234 * Retain the VIA chipset work-around (gsi > 15), but
1235 * avoid a problem where the 8254 timer (IRQ0) is setup
1236 * via an override (so it's not on pin 0 of the ioapic),
1237 * and at the same time, the pin 0 interrupt is a PCI
1238 * type. The gsi > 15 test could cause these two pins
1239 * to be shared as IRQ0, and they are not shareable.
1240 * So test for this condition, and if necessary, avoid
1241 * the pin collision.
1242 */
1243 gsi = pci_irq++;
1244 /*
1245 * Don't assign IRQ used by ACPI SCI
1246 */
1247 if (gsi == acpi_gbl_FADT.sci_interrupt)
1248 gsi = pci_irq++;
1249 gsi_to_irq[irq] = gsi;
1250 } else {
1251 printk(KERN_ERR "GSI %u is too high\n", gsi);
1252 return gsi;
1253 }
1254 }
1255#endif
1256 io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
1257 triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
1258 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
1259 return gsi;
1260}
1261 1226
1262int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, 1227 if (enable_update_mptable)
1263 u32 gsi, int triggering, int polarity) 1228 mp_config_acpi_gsi(dev, gsi, trigger, polarity);
1264{
1265#ifdef CONFIG_X86_MPPARSE
1266 struct mpc_intsrc mp_irq;
1267 int ioapic;
1268 1229
1269 if (!acpi_ioapic) 1230 set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
1270 return 0; 1231 trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
1232 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
1233 io_apic_set_pci_routing(dev, gsi, &irq_attr);
1271 1234
1272 /* print the entry should happen on mptable identically */ 1235 return gsi;
1273 mp_irq.type = MP_INTSRC;
1274 mp_irq.irqtype = mp_INT;
1275 mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
1276 (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
1277 mp_irq.srcbus = number;
1278 mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
1279 ioapic = mp_find_ioapic(gsi);
1280 mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id;
1281 mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
1282
1283 save_mp_irq(&mp_irq);
1284#endif
1285 return 0;
1286} 1236}
1287 1237
1288/* 1238/*
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
index 1c31cc0e9def..167bc16ce0e5 100644
--- a/arch/x86/kernel/acpi/realmode/Makefile
+++ b/arch/x86/kernel/acpi/realmode/Makefile
@@ -9,7 +9,7 @@
9always := wakeup.bin 9always := wakeup.bin
10targets := wakeup.elf wakeup.lds 10targets := wakeup.elf wakeup.lds
11 11
12wakeup-y += wakeup.o wakemain.o video-mode.o copy.o 12wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o
13 13
14# The link order of the video-*.o modules can matter. In particular, 14# The link order of the video-*.o modules can matter. In particular,
15# video-vga.o *must* be listed first, followed by video-vesa.o. 15# video-vga.o *must* be listed first, followed by video-vesa.o.
diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S
new file mode 100644
index 000000000000..f51eb0bb56ce
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/bioscall.S
@@ -0,0 +1 @@
#include "../../../boot/bioscall.S"
diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c
new file mode 100644
index 000000000000..6206033ba202
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/regs.c
@@ -0,0 +1 @@
#include "../../../boot/regs.c"
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index a97db99dad52..1c60554537c3 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -55,7 +55,16 @@ struct iommu_cmd {
55static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, 55static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
56 struct unity_map_entry *e); 56 struct unity_map_entry *e);
57static struct dma_ops_domain *find_protection_domain(u16 devid); 57static struct dma_ops_domain *find_protection_domain(u16 devid);
58static u64* alloc_pte(struct protection_domain *dom,
59 unsigned long address, u64
60 **pte_page, gfp_t gfp);
61static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
62 unsigned long start_page,
63 unsigned int pages);
58 64
65#ifndef BUS_NOTIFY_UNBOUND_DRIVER
66#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
67#endif
59 68
60#ifdef CONFIG_AMD_IOMMU_STATS 69#ifdef CONFIG_AMD_IOMMU_STATS
61 70
@@ -213,7 +222,7 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data)
213{ 222{
214 struct amd_iommu *iommu; 223 struct amd_iommu *iommu;
215 224
216 list_for_each_entry(iommu, &amd_iommu_list, list) 225 for_each_iommu(iommu)
217 iommu_poll_events(iommu); 226 iommu_poll_events(iommu);
218 227
219 return IRQ_HANDLED; 228 return IRQ_HANDLED;
@@ -440,7 +449,7 @@ static void iommu_flush_domain(u16 domid)
440 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 449 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
441 domid, 1, 1); 450 domid, 1, 1);
442 451
443 list_for_each_entry(iommu, &amd_iommu_list, list) { 452 for_each_iommu(iommu) {
444 spin_lock_irqsave(&iommu->lock, flags); 453 spin_lock_irqsave(&iommu->lock, flags);
445 __iommu_queue_command(iommu, &cmd); 454 __iommu_queue_command(iommu, &cmd);
446 __iommu_completion_wait(iommu); 455 __iommu_completion_wait(iommu);
@@ -449,6 +458,35 @@ static void iommu_flush_domain(u16 domid)
449 } 458 }
450} 459}
451 460
461void amd_iommu_flush_all_domains(void)
462{
463 int i;
464
465 for (i = 1; i < MAX_DOMAIN_ID; ++i) {
466 if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
467 continue;
468 iommu_flush_domain(i);
469 }
470}
471
472void amd_iommu_flush_all_devices(void)
473{
474 struct amd_iommu *iommu;
475 int i;
476
477 for (i = 0; i <= amd_iommu_last_bdf; ++i) {
478 if (amd_iommu_pd_table[i] == NULL)
479 continue;
480
481 iommu = amd_iommu_rlookup_table[i];
482 if (!iommu)
483 continue;
484
485 iommu_queue_inv_dev_entry(iommu, i);
486 iommu_completion_wait(iommu);
487 }
488}
489
452/**************************************************************************** 490/****************************************************************************
453 * 491 *
454 * The functions below are used to create the page table mappings for 492 * The functions below are used to create the page table mappings for
@@ -468,7 +506,7 @@ static int iommu_map_page(struct protection_domain *dom,
468 unsigned long phys_addr, 506 unsigned long phys_addr,
469 int prot) 507 int prot)
470{ 508{
471 u64 __pte, *pte, *page; 509 u64 __pte, *pte;
472 510
473 bus_addr = PAGE_ALIGN(bus_addr); 511 bus_addr = PAGE_ALIGN(bus_addr);
474 phys_addr = PAGE_ALIGN(phys_addr); 512 phys_addr = PAGE_ALIGN(phys_addr);
@@ -477,27 +515,7 @@ static int iommu_map_page(struct protection_domain *dom,
477 if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) 515 if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
478 return -EINVAL; 516 return -EINVAL;
479 517
480 pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; 518 pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL);
481
482 if (!IOMMU_PTE_PRESENT(*pte)) {
483 page = (u64 *)get_zeroed_page(GFP_KERNEL);
484 if (!page)
485 return -ENOMEM;
486 *pte = IOMMU_L2_PDE(virt_to_phys(page));
487 }
488
489 pte = IOMMU_PTE_PAGE(*pte);
490 pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
491
492 if (!IOMMU_PTE_PRESENT(*pte)) {
493 page = (u64 *)get_zeroed_page(GFP_KERNEL);
494 if (!page)
495 return -ENOMEM;
496 *pte = IOMMU_L1_PDE(virt_to_phys(page));
497 }
498
499 pte = IOMMU_PTE_PAGE(*pte);
500 pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)];
501 519
502 if (IOMMU_PTE_PRESENT(*pte)) 520 if (IOMMU_PTE_PRESENT(*pte))
503 return -EBUSY; 521 return -EBUSY;
@@ -595,7 +613,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
595 * as allocated in the aperture 613 * as allocated in the aperture
596 */ 614 */
597 if (addr < dma_dom->aperture_size) 615 if (addr < dma_dom->aperture_size)
598 __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap); 616 __set_bit(addr >> PAGE_SHIFT,
617 dma_dom->aperture[0]->bitmap);
599 } 618 }
600 619
601 return 0; 620 return 0;
@@ -632,42 +651,191 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
632 ****************************************************************************/ 651 ****************************************************************************/
633 652
634/* 653/*
635 * The address allocator core function. 654 * The address allocator core functions.
636 * 655 *
637 * called with domain->lock held 656 * called with domain->lock held
638 */ 657 */
658
659/*
660 * This function checks if there is a PTE for a given dma address. If
661 * there is one, it returns the pointer to it.
662 */
663static u64* fetch_pte(struct protection_domain *domain,
664 unsigned long address)
665{
666 u64 *pte;
667
668 pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)];
669
670 if (!IOMMU_PTE_PRESENT(*pte))
671 return NULL;
672
673 pte = IOMMU_PTE_PAGE(*pte);
674 pte = &pte[IOMMU_PTE_L1_INDEX(address)];
675
676 if (!IOMMU_PTE_PRESENT(*pte))
677 return NULL;
678
679 pte = IOMMU_PTE_PAGE(*pte);
680 pte = &pte[IOMMU_PTE_L0_INDEX(address)];
681
682 return pte;
683}
684
685/*
686 * This function is used to add a new aperture range to an existing
687 * aperture in case of dma_ops domain allocation or address allocation
688 * failure.
689 */
690static int alloc_new_range(struct amd_iommu *iommu,
691 struct dma_ops_domain *dma_dom,
692 bool populate, gfp_t gfp)
693{
694 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
695 int i;
696
697#ifdef CONFIG_IOMMU_STRESS
698 populate = false;
699#endif
700
701 if (index >= APERTURE_MAX_RANGES)
702 return -ENOMEM;
703
704 dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
705 if (!dma_dom->aperture[index])
706 return -ENOMEM;
707
708 dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
709 if (!dma_dom->aperture[index]->bitmap)
710 goto out_free;
711
712 dma_dom->aperture[index]->offset = dma_dom->aperture_size;
713
714 if (populate) {
715 unsigned long address = dma_dom->aperture_size;
716 int i, num_ptes = APERTURE_RANGE_PAGES / 512;
717 u64 *pte, *pte_page;
718
719 for (i = 0; i < num_ptes; ++i) {
720 pte = alloc_pte(&dma_dom->domain, address,
721 &pte_page, gfp);
722 if (!pte)
723 goto out_free;
724
725 dma_dom->aperture[index]->pte_pages[i] = pte_page;
726
727 address += APERTURE_RANGE_SIZE / 64;
728 }
729 }
730
731 dma_dom->aperture_size += APERTURE_RANGE_SIZE;
732
733 /* Initialize the exclusion range if necessary */
734 if (iommu->exclusion_start &&
735 iommu->exclusion_start >= dma_dom->aperture[index]->offset &&
736 iommu->exclusion_start < dma_dom->aperture_size) {
737 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
738 int pages = iommu_num_pages(iommu->exclusion_start,
739 iommu->exclusion_length,
740 PAGE_SIZE);
741 dma_ops_reserve_addresses(dma_dom, startpage, pages);
742 }
743
744 /*
745 * Check for areas already mapped as present in the new aperture
746 * range and mark those pages as reserved in the allocator. Such
747 * mappings may already exist as a result of requested unity
748 * mappings for devices.
749 */
750 for (i = dma_dom->aperture[index]->offset;
751 i < dma_dom->aperture_size;
752 i += PAGE_SIZE) {
753 u64 *pte = fetch_pte(&dma_dom->domain, i);
754 if (!pte || !IOMMU_PTE_PRESENT(*pte))
755 continue;
756
757 dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
758 }
759
760 return 0;
761
762out_free:
763 free_page((unsigned long)dma_dom->aperture[index]->bitmap);
764
765 kfree(dma_dom->aperture[index]);
766 dma_dom->aperture[index] = NULL;
767
768 return -ENOMEM;
769}
770
771static unsigned long dma_ops_area_alloc(struct device *dev,
772 struct dma_ops_domain *dom,
773 unsigned int pages,
774 unsigned long align_mask,
775 u64 dma_mask,
776 unsigned long start)
777{
778 unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
779 int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
780 int i = start >> APERTURE_RANGE_SHIFT;
781 unsigned long boundary_size;
782 unsigned long address = -1;
783 unsigned long limit;
784
785 next_bit >>= PAGE_SHIFT;
786
787 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
788 PAGE_SIZE) >> PAGE_SHIFT;
789
790 for (;i < max_index; ++i) {
791 unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
792
793 if (dom->aperture[i]->offset >= dma_mask)
794 break;
795
796 limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
797 dma_mask >> PAGE_SHIFT);
798
799 address = iommu_area_alloc(dom->aperture[i]->bitmap,
800 limit, next_bit, pages, 0,
801 boundary_size, align_mask);
802 if (address != -1) {
803 address = dom->aperture[i]->offset +
804 (address << PAGE_SHIFT);
805 dom->next_address = address + (pages << PAGE_SHIFT);
806 break;
807 }
808
809 next_bit = 0;
810 }
811
812 return address;
813}
814
639static unsigned long dma_ops_alloc_addresses(struct device *dev, 815static unsigned long dma_ops_alloc_addresses(struct device *dev,
640 struct dma_ops_domain *dom, 816 struct dma_ops_domain *dom,
641 unsigned int pages, 817 unsigned int pages,
642 unsigned long align_mask, 818 unsigned long align_mask,
643 u64 dma_mask) 819 u64 dma_mask)
644{ 820{
645 unsigned long limit;
646 unsigned long address; 821 unsigned long address;
647 unsigned long boundary_size;
648 822
649 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 823#ifdef CONFIG_IOMMU_STRESS
650 PAGE_SIZE) >> PAGE_SHIFT; 824 dom->next_address = 0;
651 limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0, 825 dom->need_flush = true;
652 dma_mask >> PAGE_SHIFT); 826#endif
653 827
654 if (dom->next_bit >= limit) { 828 address = dma_ops_area_alloc(dev, dom, pages, align_mask,
655 dom->next_bit = 0; 829 dma_mask, dom->next_address);
656 dom->need_flush = true;
657 }
658 830
659 address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
660 0 , boundary_size, align_mask);
661 if (address == -1) { 831 if (address == -1) {
662 address = iommu_area_alloc(dom->bitmap, limit, 0, pages, 832 dom->next_address = 0;
663 0, boundary_size, align_mask); 833 address = dma_ops_area_alloc(dev, dom, pages, align_mask,
834 dma_mask, 0);
664 dom->need_flush = true; 835 dom->need_flush = true;
665 } 836 }
666 837
667 if (likely(address != -1)) { 838 if (unlikely(address == -1))
668 dom->next_bit = address + pages;
669 address <<= PAGE_SHIFT;
670 } else
671 address = bad_dma_address; 839 address = bad_dma_address;
672 840
673 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); 841 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
@@ -684,11 +852,23 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
684 unsigned long address, 852 unsigned long address,
685 unsigned int pages) 853 unsigned int pages)
686{ 854{
687 address >>= PAGE_SHIFT; 855 unsigned i = address >> APERTURE_RANGE_SHIFT;
688 iommu_area_free(dom->bitmap, address, pages); 856 struct aperture_range *range = dom->aperture[i];
689 857
690 if (address >= dom->next_bit) 858 BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
859
860#ifdef CONFIG_IOMMU_STRESS
861 if (i < 4)
862 return;
863#endif
864
865 if (address >= dom->next_address)
691 dom->need_flush = true; 866 dom->need_flush = true;
867
868 address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
869
870 iommu_area_free(range->bitmap, address, pages);
871
692} 872}
693 873
694/**************************************************************************** 874/****************************************************************************
@@ -736,12 +916,16 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
736 unsigned long start_page, 916 unsigned long start_page,
737 unsigned int pages) 917 unsigned int pages)
738{ 918{
739 unsigned int last_page = dom->aperture_size >> PAGE_SHIFT; 919 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
740 920
741 if (start_page + pages > last_page) 921 if (start_page + pages > last_page)
742 pages = last_page - start_page; 922 pages = last_page - start_page;
743 923
744 iommu_area_reserve(dom->bitmap, start_page, pages); 924 for (i = start_page; i < start_page + pages; ++i) {
925 int index = i / APERTURE_RANGE_PAGES;
926 int page = i % APERTURE_RANGE_PAGES;
927 __set_bit(page, dom->aperture[index]->bitmap);
928 }
745} 929}
746 930
747static void free_pagetable(struct protection_domain *domain) 931static void free_pagetable(struct protection_domain *domain)
@@ -780,14 +964,19 @@ static void free_pagetable(struct protection_domain *domain)
780 */ 964 */
781static void dma_ops_domain_free(struct dma_ops_domain *dom) 965static void dma_ops_domain_free(struct dma_ops_domain *dom)
782{ 966{
967 int i;
968
783 if (!dom) 969 if (!dom)
784 return; 970 return;
785 971
786 free_pagetable(&dom->domain); 972 free_pagetable(&dom->domain);
787 973
788 kfree(dom->pte_pages); 974 for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
789 975 if (!dom->aperture[i])
790 kfree(dom->bitmap); 976 continue;
977 free_page((unsigned long)dom->aperture[i]->bitmap);
978 kfree(dom->aperture[i]);
979 }
791 980
792 kfree(dom); 981 kfree(dom);
793} 982}
@@ -797,19 +986,9 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
797 * It also initializes the page table and the address allocator data 986 * It also initializes the page table and the address allocator data
798 * structures required for the dma_ops interface 987 * structures required for the dma_ops interface
799 */ 988 */
800static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, 989static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
801 unsigned order)
802{ 990{
803 struct dma_ops_domain *dma_dom; 991 struct dma_ops_domain *dma_dom;
804 unsigned i, num_pte_pages;
805 u64 *l2_pde;
806 u64 address;
807
808 /*
809 * Currently the DMA aperture must be between 32 MB and 1GB in size
810 */
811 if ((order < 25) || (order > 30))
812 return NULL;
813 992
814 dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL); 993 dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
815 if (!dma_dom) 994 if (!dma_dom)
@@ -826,55 +1005,20 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
826 dma_dom->domain.priv = dma_dom; 1005 dma_dom->domain.priv = dma_dom;
827 if (!dma_dom->domain.pt_root) 1006 if (!dma_dom->domain.pt_root)
828 goto free_dma_dom; 1007 goto free_dma_dom;
829 dma_dom->aperture_size = (1ULL << order);
830 dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8),
831 GFP_KERNEL);
832 if (!dma_dom->bitmap)
833 goto free_dma_dom;
834 /*
835 * mark the first page as allocated so we never return 0 as
836 * a valid dma-address. So we can use 0 as error value
837 */
838 dma_dom->bitmap[0] = 1;
839 dma_dom->next_bit = 0;
840 1008
841 dma_dom->need_flush = false; 1009 dma_dom->need_flush = false;
842 dma_dom->target_dev = 0xffff; 1010 dma_dom->target_dev = 0xffff;
843 1011
844 /* Intialize the exclusion range if necessary */ 1012 if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL))
845 if (iommu->exclusion_start && 1013 goto free_dma_dom;
846 iommu->exclusion_start < dma_dom->aperture_size) {
847 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
848 int pages = iommu_num_pages(iommu->exclusion_start,
849 iommu->exclusion_length,
850 PAGE_SIZE);
851 dma_ops_reserve_addresses(dma_dom, startpage, pages);
852 }
853 1014
854 /* 1015 /*
855 * At the last step, build the page tables so we don't need to 1016 * mark the first page as allocated so we never return 0 as
856 * allocate page table pages in the dma_ops mapping/unmapping 1017 * a valid dma-address. So we can use 0 as error value
857 * path.
858 */ 1018 */
859 num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); 1019 dma_dom->aperture[0]->bitmap[0] = 1;
860 dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), 1020 dma_dom->next_address = 0;
861 GFP_KERNEL);
862 if (!dma_dom->pte_pages)
863 goto free_dma_dom;
864
865 l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL);
866 if (l2_pde == NULL)
867 goto free_dma_dom;
868 1021
869 dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde));
870
871 for (i = 0; i < num_pte_pages; ++i) {
872 dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL);
873 if (!dma_dom->pte_pages[i])
874 goto free_dma_dom;
875 address = virt_to_phys(dma_dom->pte_pages[i]);
876 l2_pde[i] = IOMMU_L1_PDE(address);
877 }
878 1022
879 return dma_dom; 1023 return dma_dom;
880 1024
@@ -983,7 +1127,6 @@ static int device_change_notifier(struct notifier_block *nb,
983 struct protection_domain *domain; 1127 struct protection_domain *domain;
984 struct dma_ops_domain *dma_domain; 1128 struct dma_ops_domain *dma_domain;
985 struct amd_iommu *iommu; 1129 struct amd_iommu *iommu;
986 int order = amd_iommu_aperture_order;
987 unsigned long flags; 1130 unsigned long flags;
988 1131
989 if (devid > amd_iommu_last_bdf) 1132 if (devid > amd_iommu_last_bdf)
@@ -1002,17 +1145,7 @@ static int device_change_notifier(struct notifier_block *nb,
1002 "to a non-dma-ops domain\n", dev_name(dev)); 1145 "to a non-dma-ops domain\n", dev_name(dev));
1003 1146
1004 switch (action) { 1147 switch (action) {
1005 case BUS_NOTIFY_BOUND_DRIVER: 1148 case BUS_NOTIFY_UNBOUND_DRIVER:
1006 if (domain)
1007 goto out;
1008 dma_domain = find_protection_domain(devid);
1009 if (!dma_domain)
1010 dma_domain = iommu->default_dom;
1011 attach_device(iommu, &dma_domain->domain, devid);
1012 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
1013 "device %s\n", dma_domain->domain.id, dev_name(dev));
1014 break;
1015 case BUS_NOTIFY_UNBIND_DRIVER:
1016 if (!domain) 1149 if (!domain)
1017 goto out; 1150 goto out;
1018 detach_device(domain, devid); 1151 detach_device(domain, devid);
@@ -1022,7 +1155,7 @@ static int device_change_notifier(struct notifier_block *nb,
1022 dma_domain = find_protection_domain(devid); 1155 dma_domain = find_protection_domain(devid);
1023 if (dma_domain) 1156 if (dma_domain)
1024 goto out; 1157 goto out;
1025 dma_domain = dma_ops_domain_alloc(iommu, order); 1158 dma_domain = dma_ops_domain_alloc(iommu);
1026 if (!dma_domain) 1159 if (!dma_domain)
1027 goto out; 1160 goto out;
1028 dma_domain->target_dev = devid; 1161 dma_domain->target_dev = devid;
@@ -1133,8 +1266,8 @@ static int get_device_resources(struct device *dev,
1133 dma_dom = (*iommu)->default_dom; 1266 dma_dom = (*iommu)->default_dom;
1134 *domain = &dma_dom->domain; 1267 *domain = &dma_dom->domain;
1135 attach_device(*iommu, *domain, *bdf); 1268 attach_device(*iommu, *domain, *bdf);
1136 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " 1269 DUMP_printk("Using protection domain %d for device %s\n",
1137 "device %s\n", (*domain)->id, dev_name(dev)); 1270 (*domain)->id, dev_name(dev));
1138 } 1271 }
1139 1272
1140 if (domain_for_device(_bdf) == NULL) 1273 if (domain_for_device(_bdf) == NULL)
@@ -1144,6 +1277,66 @@ static int get_device_resources(struct device *dev,
1144} 1277}
1145 1278
1146/* 1279/*
1280 * If the pte_page is not yet allocated this function is called
1281 */
1282static u64* alloc_pte(struct protection_domain *dom,
1283 unsigned long address, u64 **pte_page, gfp_t gfp)
1284{
1285 u64 *pte, *page;
1286
1287 pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)];
1288
1289 if (!IOMMU_PTE_PRESENT(*pte)) {
1290 page = (u64 *)get_zeroed_page(gfp);
1291 if (!page)
1292 return NULL;
1293 *pte = IOMMU_L2_PDE(virt_to_phys(page));
1294 }
1295
1296 pte = IOMMU_PTE_PAGE(*pte);
1297 pte = &pte[IOMMU_PTE_L1_INDEX(address)];
1298
1299 if (!IOMMU_PTE_PRESENT(*pte)) {
1300 page = (u64 *)get_zeroed_page(gfp);
1301 if (!page)
1302 return NULL;
1303 *pte = IOMMU_L1_PDE(virt_to_phys(page));
1304 }
1305
1306 pte = IOMMU_PTE_PAGE(*pte);
1307
1308 if (pte_page)
1309 *pte_page = pte;
1310
1311 pte = &pte[IOMMU_PTE_L0_INDEX(address)];
1312
1313 return pte;
1314}
1315
1316/*
1317 * This function fetches the PTE for a given address in the aperture
1318 */
1319static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
1320 unsigned long address)
1321{
1322 struct aperture_range *aperture;
1323 u64 *pte, *pte_page;
1324
1325 aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
1326 if (!aperture)
1327 return NULL;
1328
1329 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
1330 if (!pte) {
1331 pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC);
1332 aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
1333 } else
1334 pte += IOMMU_PTE_L0_INDEX(address);
1335
1336 return pte;
1337}
1338
1339/*
1147 * This is the generic map function. It maps one 4kb page at paddr to 1340 * This is the generic map function. It maps one 4kb page at paddr to
1148 * the given address in the DMA address space for the domain. 1341 * the given address in the DMA address space for the domain.
1149 */ 1342 */
@@ -1159,8 +1352,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
1159 1352
1160 paddr &= PAGE_MASK; 1353 paddr &= PAGE_MASK;
1161 1354
1162 pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; 1355 pte = dma_ops_get_pte(dom, address);
1163 pte += IOMMU_PTE_L0_INDEX(address); 1356 if (!pte)
1357 return bad_dma_address;
1164 1358
1165 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; 1359 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
1166 1360
@@ -1185,14 +1379,20 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
1185 struct dma_ops_domain *dom, 1379 struct dma_ops_domain *dom,
1186 unsigned long address) 1380 unsigned long address)
1187{ 1381{
1382 struct aperture_range *aperture;
1188 u64 *pte; 1383 u64 *pte;
1189 1384
1190 if (address >= dom->aperture_size) 1385 if (address >= dom->aperture_size)
1191 return; 1386 return;
1192 1387
1193 WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size); 1388 aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
1389 if (!aperture)
1390 return;
1391
1392 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
1393 if (!pte)
1394 return;
1194 1395
1195 pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
1196 pte += IOMMU_PTE_L0_INDEX(address); 1396 pte += IOMMU_PTE_L0_INDEX(address);
1197 1397
1198 WARN_ON(!*pte); 1398 WARN_ON(!*pte);
@@ -1216,7 +1416,7 @@ static dma_addr_t __map_single(struct device *dev,
1216 u64 dma_mask) 1416 u64 dma_mask)
1217{ 1417{
1218 dma_addr_t offset = paddr & ~PAGE_MASK; 1418 dma_addr_t offset = paddr & ~PAGE_MASK;
1219 dma_addr_t address, start; 1419 dma_addr_t address, start, ret;
1220 unsigned int pages; 1420 unsigned int pages;
1221 unsigned long align_mask = 0; 1421 unsigned long align_mask = 0;
1222 int i; 1422 int i;
@@ -1232,14 +1432,33 @@ static dma_addr_t __map_single(struct device *dev,
1232 if (align) 1432 if (align)
1233 align_mask = (1UL << get_order(size)) - 1; 1433 align_mask = (1UL << get_order(size)) - 1;
1234 1434
1435retry:
1235 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, 1436 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
1236 dma_mask); 1437 dma_mask);
1237 if (unlikely(address == bad_dma_address)) 1438 if (unlikely(address == bad_dma_address)) {
1238 goto out; 1439 /*
1440 * setting next_address here will let the address
1441 * allocator only scan the newly allocated range in the
1442 * first run. This is a small optimization.
1443 */
1444 dma_dom->next_address = dma_dom->aperture_size;
1445
1446 if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC))
1447 goto out;
1448
1449 /*
1450 * aperture was successfully enlarged by 128 MB, try
1451 * allocation again
1452 */
1453 goto retry;
1454 }
1239 1455
1240 start = address; 1456 start = address;
1241 for (i = 0; i < pages; ++i) { 1457 for (i = 0; i < pages; ++i) {
1242 dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); 1458 ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
1459 if (ret == bad_dma_address)
1460 goto out_unmap;
1461
1243 paddr += PAGE_SIZE; 1462 paddr += PAGE_SIZE;
1244 start += PAGE_SIZE; 1463 start += PAGE_SIZE;
1245 } 1464 }
@@ -1255,6 +1474,17 @@ static dma_addr_t __map_single(struct device *dev,
1255 1474
1256out: 1475out:
1257 return address; 1476 return address;
1477
1478out_unmap:
1479
1480 for (--i; i >= 0; --i) {
1481 start -= PAGE_SIZE;
1482 dma_ops_domain_unmap(iommu, dma_dom, start);
1483 }
1484
1485 dma_ops_free_addresses(dma_dom, address, pages);
1486
1487 return bad_dma_address;
1258} 1488}
1259 1489
1260/* 1490/*
@@ -1537,8 +1767,10 @@ static void *alloc_coherent(struct device *dev, size_t size,
1537 *dma_addr = __map_single(dev, iommu, domain->priv, paddr, 1767 *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
1538 size, DMA_BIDIRECTIONAL, true, dma_mask); 1768 size, DMA_BIDIRECTIONAL, true, dma_mask);
1539 1769
1540 if (*dma_addr == bad_dma_address) 1770 if (*dma_addr == bad_dma_address) {
1771 spin_unlock_irqrestore(&domain->lock, flags);
1541 goto out_free; 1772 goto out_free;
1773 }
1542 1774
1543 iommu_completion_wait(iommu); 1775 iommu_completion_wait(iommu);
1544 1776
@@ -1625,7 +1857,6 @@ static void prealloc_protection_domains(void)
1625 struct pci_dev *dev = NULL; 1857 struct pci_dev *dev = NULL;
1626 struct dma_ops_domain *dma_dom; 1858 struct dma_ops_domain *dma_dom;
1627 struct amd_iommu *iommu; 1859 struct amd_iommu *iommu;
1628 int order = amd_iommu_aperture_order;
1629 u16 devid; 1860 u16 devid;
1630 1861
1631 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1862 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
@@ -1638,7 +1869,7 @@ static void prealloc_protection_domains(void)
1638 iommu = amd_iommu_rlookup_table[devid]; 1869 iommu = amd_iommu_rlookup_table[devid];
1639 if (!iommu) 1870 if (!iommu)
1640 continue; 1871 continue;
1641 dma_dom = dma_ops_domain_alloc(iommu, order); 1872 dma_dom = dma_ops_domain_alloc(iommu);
1642 if (!dma_dom) 1873 if (!dma_dom)
1643 continue; 1874 continue;
1644 init_unity_mappings_for_device(dma_dom, devid); 1875 init_unity_mappings_for_device(dma_dom, devid);
@@ -1664,7 +1895,6 @@ static struct dma_map_ops amd_iommu_dma_ops = {
1664int __init amd_iommu_init_dma_ops(void) 1895int __init amd_iommu_init_dma_ops(void)
1665{ 1896{
1666 struct amd_iommu *iommu; 1897 struct amd_iommu *iommu;
1667 int order = amd_iommu_aperture_order;
1668 int ret; 1898 int ret;
1669 1899
1670 /* 1900 /*
@@ -1672,8 +1902,8 @@ int __init amd_iommu_init_dma_ops(void)
1672 * found in the system. Devices not assigned to any other 1902 * found in the system. Devices not assigned to any other
1673 * protection domain will be assigned to the default one. 1903 * protection domain will be assigned to the default one.
1674 */ 1904 */
1675 list_for_each_entry(iommu, &amd_iommu_list, list) { 1905 for_each_iommu(iommu) {
1676 iommu->default_dom = dma_ops_domain_alloc(iommu, order); 1906 iommu->default_dom = dma_ops_domain_alloc(iommu);
1677 if (iommu->default_dom == NULL) 1907 if (iommu->default_dom == NULL)
1678 return -ENOMEM; 1908 return -ENOMEM;
1679 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; 1909 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
@@ -1710,7 +1940,7 @@ int __init amd_iommu_init_dma_ops(void)
1710 1940
1711free_domains: 1941free_domains:
1712 1942
1713 list_for_each_entry(iommu, &amd_iommu_list, list) { 1943 for_each_iommu(iommu) {
1714 if (iommu->default_dom) 1944 if (iommu->default_dom)
1715 dma_ops_domain_free(iommu->default_dom); 1945 dma_ops_domain_free(iommu->default_dom);
1716 } 1946 }
@@ -1842,7 +2072,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
1842 2072
1843 old_domain = domain_for_device(devid); 2073 old_domain = domain_for_device(devid);
1844 if (old_domain) 2074 if (old_domain)
1845 return -EBUSY; 2075 detach_device(old_domain, devid);
1846 2076
1847 attach_device(iommu, domain, devid); 2077 attach_device(iommu, domain, devid);
1848 2078
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 8c0be0902dac..238989ec077d 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -115,15 +115,21 @@ struct ivmd_header {
115 u64 range_length; 115 u64 range_length;
116} __attribute__((packed)); 116} __attribute__((packed));
117 117
118bool amd_iommu_dump;
119
118static int __initdata amd_iommu_detected; 120static int __initdata amd_iommu_detected;
119 121
120u16 amd_iommu_last_bdf; /* largest PCI device id we have 122u16 amd_iommu_last_bdf; /* largest PCI device id we have
121 to handle */ 123 to handle */
122LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings 124LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
123 we find in ACPI */ 125 we find in ACPI */
124unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ 126#ifdef CONFIG_IOMMU_STRESS
127bool amd_iommu_isolate = false;
128#else
125bool amd_iommu_isolate = true; /* if true, device isolation is 129bool amd_iommu_isolate = true; /* if true, device isolation is
126 enabled */ 130 enabled */
131#endif
132
127bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ 133bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
128 134
129LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the 135LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
@@ -175,7 +181,7 @@ static inline void update_last_devid(u16 devid)
175static inline unsigned long tbl_size(int entry_size) 181static inline unsigned long tbl_size(int entry_size)
176{ 182{
177 unsigned shift = PAGE_SHIFT + 183 unsigned shift = PAGE_SHIFT +
178 get_order(amd_iommu_last_bdf * entry_size); 184 get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
179 185
180 return 1UL << shift; 186 return 1UL << shift;
181} 187}
@@ -193,7 +199,7 @@ static inline unsigned long tbl_size(int entry_size)
193 * This function set the exclusion range in the IOMMU. DMA accesses to the 199 * This function set the exclusion range in the IOMMU. DMA accesses to the
194 * exclusion range are passed through untranslated 200 * exclusion range are passed through untranslated
195 */ 201 */
196static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) 202static void iommu_set_exclusion_range(struct amd_iommu *iommu)
197{ 203{
198 u64 start = iommu->exclusion_start & PAGE_MASK; 204 u64 start = iommu->exclusion_start & PAGE_MASK;
199 u64 limit = (start + iommu->exclusion_length) & PAGE_MASK; 205 u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
@@ -225,7 +231,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
225} 231}
226 232
227/* Generic functions to enable/disable certain features of the IOMMU. */ 233/* Generic functions to enable/disable certain features of the IOMMU. */
228static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) 234static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
229{ 235{
230 u32 ctrl; 236 u32 ctrl;
231 237
@@ -244,7 +250,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
244} 250}
245 251
246/* Function to enable the hardware */ 252/* Function to enable the hardware */
247static void __init iommu_enable(struct amd_iommu *iommu) 253static void iommu_enable(struct amd_iommu *iommu)
248{ 254{
249 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", 255 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",
250 dev_name(&iommu->dev->dev), iommu->cap_ptr); 256 dev_name(&iommu->dev->dev), iommu->cap_ptr);
@@ -252,11 +258,9 @@ static void __init iommu_enable(struct amd_iommu *iommu)
252 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 258 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
253} 259}
254 260
255/* Function to enable IOMMU event logging and event interrupts */ 261static void iommu_disable(struct amd_iommu *iommu)
256static void __init iommu_enable_event_logging(struct amd_iommu *iommu)
257{ 262{
258 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); 263 iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
259 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
260} 264}
261 265
262/* 266/*
@@ -413,25 +417,36 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
413{ 417{
414 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 418 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
415 get_order(CMD_BUFFER_SIZE)); 419 get_order(CMD_BUFFER_SIZE));
416 u64 entry;
417 420
418 if (cmd_buf == NULL) 421 if (cmd_buf == NULL)
419 return NULL; 422 return NULL;
420 423
421 iommu->cmd_buf_size = CMD_BUFFER_SIZE; 424 iommu->cmd_buf_size = CMD_BUFFER_SIZE;
422 425
423 entry = (u64)virt_to_phys(cmd_buf); 426 return cmd_buf;
427}
428
429/*
430 * This function writes the command buffer address to the hardware and
431 * enables it.
432 */
433static void iommu_enable_command_buffer(struct amd_iommu *iommu)
434{
435 u64 entry;
436
437 BUG_ON(iommu->cmd_buf == NULL);
438
439 entry = (u64)virt_to_phys(iommu->cmd_buf);
424 entry |= MMIO_CMD_SIZE_512; 440 entry |= MMIO_CMD_SIZE_512;
441
425 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, 442 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
426 &entry, sizeof(entry)); 443 &entry, sizeof(entry));
427 444
428 /* set head and tail to zero manually */ 445 /* set head and tail to zero manually */
429 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); 446 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
430 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); 447 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
431 448
432 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); 449 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
433
434 return cmd_buf;
435} 450}
436 451
437static void __init free_command_buffer(struct amd_iommu *iommu) 452static void __init free_command_buffer(struct amd_iommu *iommu)
@@ -443,20 +458,27 @@ static void __init free_command_buffer(struct amd_iommu *iommu)
443/* allocates the memory where the IOMMU will log its events to */ 458/* allocates the memory where the IOMMU will log its events to */
444static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) 459static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
445{ 460{
446 u64 entry;
447 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 461 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
448 get_order(EVT_BUFFER_SIZE)); 462 get_order(EVT_BUFFER_SIZE));
449 463
450 if (iommu->evt_buf == NULL) 464 if (iommu->evt_buf == NULL)
451 return NULL; 465 return NULL;
452 466
467 return iommu->evt_buf;
468}
469
470static void iommu_enable_event_buffer(struct amd_iommu *iommu)
471{
472 u64 entry;
473
474 BUG_ON(iommu->evt_buf == NULL);
475
453 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; 476 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
477
454 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, 478 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
455 &entry, sizeof(entry)); 479 &entry, sizeof(entry));
456 480
457 iommu->evt_buf_size = EVT_BUFFER_SIZE; 481 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
458
459 return iommu->evt_buf;
460} 482}
461 483
462static void __init free_event_buffer(struct amd_iommu *iommu) 484static void __init free_event_buffer(struct amd_iommu *iommu)
@@ -596,32 +618,83 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
596 p += sizeof(struct ivhd_header); 618 p += sizeof(struct ivhd_header);
597 end += h->length; 619 end += h->length;
598 620
621
599 while (p < end) { 622 while (p < end) {
600 e = (struct ivhd_entry *)p; 623 e = (struct ivhd_entry *)p;
601 switch (e->type) { 624 switch (e->type) {
602 case IVHD_DEV_ALL: 625 case IVHD_DEV_ALL:
626
627 DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x"
628 " last device %02x:%02x.%x flags: %02x\n",
629 PCI_BUS(iommu->first_device),
630 PCI_SLOT(iommu->first_device),
631 PCI_FUNC(iommu->first_device),
632 PCI_BUS(iommu->last_device),
633 PCI_SLOT(iommu->last_device),
634 PCI_FUNC(iommu->last_device),
635 e->flags);
636
603 for (dev_i = iommu->first_device; 637 for (dev_i = iommu->first_device;
604 dev_i <= iommu->last_device; ++dev_i) 638 dev_i <= iommu->last_device; ++dev_i)
605 set_dev_entry_from_acpi(iommu, dev_i, 639 set_dev_entry_from_acpi(iommu, dev_i,
606 e->flags, 0); 640 e->flags, 0);
607 break; 641 break;
608 case IVHD_DEV_SELECT: 642 case IVHD_DEV_SELECT:
643
644 DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x "
645 "flags: %02x\n",
646 PCI_BUS(e->devid),
647 PCI_SLOT(e->devid),
648 PCI_FUNC(e->devid),
649 e->flags);
650
609 devid = e->devid; 651 devid = e->devid;
610 set_dev_entry_from_acpi(iommu, devid, e->flags, 0); 652 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
611 break; 653 break;
612 case IVHD_DEV_SELECT_RANGE_START: 654 case IVHD_DEV_SELECT_RANGE_START:
655
656 DUMP_printk(" DEV_SELECT_RANGE_START\t "
657 "devid: %02x:%02x.%x flags: %02x\n",
658 PCI_BUS(e->devid),
659 PCI_SLOT(e->devid),
660 PCI_FUNC(e->devid),
661 e->flags);
662
613 devid_start = e->devid; 663 devid_start = e->devid;
614 flags = e->flags; 664 flags = e->flags;
615 ext_flags = 0; 665 ext_flags = 0;
616 alias = false; 666 alias = false;
617 break; 667 break;
618 case IVHD_DEV_ALIAS: 668 case IVHD_DEV_ALIAS:
669
670 DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
671 "flags: %02x devid_to: %02x:%02x.%x\n",
672 PCI_BUS(e->devid),
673 PCI_SLOT(e->devid),
674 PCI_FUNC(e->devid),
675 e->flags,
676 PCI_BUS(e->ext >> 8),
677 PCI_SLOT(e->ext >> 8),
678 PCI_FUNC(e->ext >> 8));
679
619 devid = e->devid; 680 devid = e->devid;
620 devid_to = e->ext >> 8; 681 devid_to = e->ext >> 8;
621 set_dev_entry_from_acpi(iommu, devid, e->flags, 0); 682 set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
622 amd_iommu_alias_table[devid] = devid_to; 683 amd_iommu_alias_table[devid] = devid_to;
623 break; 684 break;
624 case IVHD_DEV_ALIAS_RANGE: 685 case IVHD_DEV_ALIAS_RANGE:
686
687 DUMP_printk(" DEV_ALIAS_RANGE\t\t "
688 "devid: %02x:%02x.%x flags: %02x "
689 "devid_to: %02x:%02x.%x\n",
690 PCI_BUS(e->devid),
691 PCI_SLOT(e->devid),
692 PCI_FUNC(e->devid),
693 e->flags,
694 PCI_BUS(e->ext >> 8),
695 PCI_SLOT(e->ext >> 8),
696 PCI_FUNC(e->ext >> 8));
697
625 devid_start = e->devid; 698 devid_start = e->devid;
626 flags = e->flags; 699 flags = e->flags;
627 devid_to = e->ext >> 8; 700 devid_to = e->ext >> 8;
@@ -629,17 +702,39 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
629 alias = true; 702 alias = true;
630 break; 703 break;
631 case IVHD_DEV_EXT_SELECT: 704 case IVHD_DEV_EXT_SELECT:
705
706 DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
707 "flags: %02x ext: %08x\n",
708 PCI_BUS(e->devid),
709 PCI_SLOT(e->devid),
710 PCI_FUNC(e->devid),
711 e->flags, e->ext);
712
632 devid = e->devid; 713 devid = e->devid;
633 set_dev_entry_from_acpi(iommu, devid, e->flags, 714 set_dev_entry_from_acpi(iommu, devid, e->flags,
634 e->ext); 715 e->ext);
635 break; 716 break;
636 case IVHD_DEV_EXT_SELECT_RANGE: 717 case IVHD_DEV_EXT_SELECT_RANGE:
718
719 DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: "
720 "%02x:%02x.%x flags: %02x ext: %08x\n",
721 PCI_BUS(e->devid),
722 PCI_SLOT(e->devid),
723 PCI_FUNC(e->devid),
724 e->flags, e->ext);
725
637 devid_start = e->devid; 726 devid_start = e->devid;
638 flags = e->flags; 727 flags = e->flags;
639 ext_flags = e->ext; 728 ext_flags = e->ext;
640 alias = false; 729 alias = false;
641 break; 730 break;
642 case IVHD_DEV_RANGE_END: 731 case IVHD_DEV_RANGE_END:
732
733 DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
734 PCI_BUS(e->devid),
735 PCI_SLOT(e->devid),
736 PCI_FUNC(e->devid));
737
643 devid = e->devid; 738 devid = e->devid;
644 for (dev_i = devid_start; dev_i <= devid; ++dev_i) { 739 for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
645 if (alias) 740 if (alias)
@@ -679,7 +774,7 @@ static void __init free_iommu_all(void)
679{ 774{
680 struct amd_iommu *iommu, *next; 775 struct amd_iommu *iommu, *next;
681 776
682 list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) { 777 for_each_iommu_safe(iommu, next) {
683 list_del(&iommu->list); 778 list_del(&iommu->list);
684 free_iommu_one(iommu); 779 free_iommu_one(iommu);
685 kfree(iommu); 780 kfree(iommu);
@@ -710,7 +805,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
710 if (!iommu->mmio_base) 805 if (!iommu->mmio_base)
711 return -ENOMEM; 806 return -ENOMEM;
712 807
713 iommu_set_device_table(iommu);
714 iommu->cmd_buf = alloc_command_buffer(iommu); 808 iommu->cmd_buf = alloc_command_buffer(iommu);
715 if (!iommu->cmd_buf) 809 if (!iommu->cmd_buf)
716 return -ENOMEM; 810 return -ENOMEM;
@@ -746,6 +840,15 @@ static int __init init_iommu_all(struct acpi_table_header *table)
746 h = (struct ivhd_header *)p; 840 h = (struct ivhd_header *)p;
747 switch (*p) { 841 switch (*p) {
748 case ACPI_IVHD_TYPE: 842 case ACPI_IVHD_TYPE:
843
844 DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x "
845 "seg: %d flags: %01x info %04x\n",
846 PCI_BUS(h->devid), PCI_SLOT(h->devid),
847 PCI_FUNC(h->devid), h->cap_ptr,
848 h->pci_seg, h->flags, h->info);
849 DUMP_printk(" mmio-addr: %016llx\n",
850 h->mmio_phys);
851
749 iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); 852 iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
750 if (iommu == NULL) 853 if (iommu == NULL)
751 return -ENOMEM; 854 return -ENOMEM;
@@ -773,56 +876,9 @@ static int __init init_iommu_all(struct acpi_table_header *table)
773 * 876 *
774 ****************************************************************************/ 877 ****************************************************************************/
775 878
776static int __init iommu_setup_msix(struct amd_iommu *iommu)
777{
778 struct amd_iommu *curr;
779 struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
780 int nvec = 0, i;
781
782 list_for_each_entry(curr, &amd_iommu_list, list) {
783 if (curr->dev == iommu->dev) {
784 entries[nvec].entry = curr->evt_msi_num;
785 entries[nvec].vector = 0;
786 curr->int_enabled = true;
787 nvec++;
788 }
789 }
790
791 if (pci_enable_msix(iommu->dev, entries, nvec)) {
792 pci_disable_msix(iommu->dev);
793 return 1;
794 }
795
796 for (i = 0; i < nvec; ++i) {
797 int r = request_irq(entries->vector, amd_iommu_int_handler,
798 IRQF_SAMPLE_RANDOM,
799 "AMD IOMMU",
800 NULL);
801 if (r)
802 goto out_free;
803 }
804
805 return 0;
806
807out_free:
808 for (i -= 1; i >= 0; --i)
809 free_irq(entries->vector, NULL);
810
811 pci_disable_msix(iommu->dev);
812
813 return 1;
814}
815
816static int __init iommu_setup_msi(struct amd_iommu *iommu) 879static int __init iommu_setup_msi(struct amd_iommu *iommu)
817{ 880{
818 int r; 881 int r;
819 struct amd_iommu *curr;
820
821 list_for_each_entry(curr, &amd_iommu_list, list) {
822 if (curr->dev == iommu->dev)
823 curr->int_enabled = true;
824 }
825
826 882
827 if (pci_enable_msi(iommu->dev)) 883 if (pci_enable_msi(iommu->dev))
828 return 1; 884 return 1;
@@ -837,17 +893,18 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
837 return 1; 893 return 1;
838 } 894 }
839 895
896 iommu->int_enabled = true;
897 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
898
840 return 0; 899 return 0;
841} 900}
842 901
843static int __init iommu_init_msi(struct amd_iommu *iommu) 902static int iommu_init_msi(struct amd_iommu *iommu)
844{ 903{
845 if (iommu->int_enabled) 904 if (iommu->int_enabled)
846 return 0; 905 return 0;
847 906
848 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX)) 907 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
849 return iommu_setup_msix(iommu);
850 else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
851 return iommu_setup_msi(iommu); 908 return iommu_setup_msi(iommu);
852 909
853 return 1; 910 return 1;
@@ -899,6 +956,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
899static int __init init_unity_map_range(struct ivmd_header *m) 956static int __init init_unity_map_range(struct ivmd_header *m)
900{ 957{
901 struct unity_map_entry *e = 0; 958 struct unity_map_entry *e = 0;
959 char *s;
902 960
903 e = kzalloc(sizeof(*e), GFP_KERNEL); 961 e = kzalloc(sizeof(*e), GFP_KERNEL);
904 if (e == NULL) 962 if (e == NULL)
@@ -906,14 +964,19 @@ static int __init init_unity_map_range(struct ivmd_header *m)
906 964
907 switch (m->type) { 965 switch (m->type) {
908 default: 966 default:
967 kfree(e);
968 return 0;
909 case ACPI_IVMD_TYPE: 969 case ACPI_IVMD_TYPE:
970 s = "IVMD_TYPEi\t\t\t";
910 e->devid_start = e->devid_end = m->devid; 971 e->devid_start = e->devid_end = m->devid;
911 break; 972 break;
912 case ACPI_IVMD_TYPE_ALL: 973 case ACPI_IVMD_TYPE_ALL:
974 s = "IVMD_TYPE_ALL\t\t";
913 e->devid_start = 0; 975 e->devid_start = 0;
914 e->devid_end = amd_iommu_last_bdf; 976 e->devid_end = amd_iommu_last_bdf;
915 break; 977 break;
916 case ACPI_IVMD_TYPE_RANGE: 978 case ACPI_IVMD_TYPE_RANGE:
979 s = "IVMD_TYPE_RANGE\t\t";
917 e->devid_start = m->devid; 980 e->devid_start = m->devid;
918 e->devid_end = m->aux; 981 e->devid_end = m->aux;
919 break; 982 break;
@@ -922,6 +985,13 @@ static int __init init_unity_map_range(struct ivmd_header *m)
922 e->address_end = e->address_start + PAGE_ALIGN(m->range_length); 985 e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
923 e->prot = m->flags >> 1; 986 e->prot = m->flags >> 1;
924 987
988 DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
989 " range_start: %016llx range_end: %016llx flags: %x\n", s,
990 PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
991 PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
992 PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
993 e->address_start, e->address_end, m->flags);
994
925 list_add_tail(&e->list, &amd_iommu_unity_map); 995 list_add_tail(&e->list, &amd_iommu_unity_map);
926 996
927 return 0; 997 return 0;
@@ -967,18 +1037,28 @@ static void init_device_table(void)
967 * This function finally enables all IOMMUs found in the system after 1037 * This function finally enables all IOMMUs found in the system after
968 * they have been initialized 1038 * they have been initialized
969 */ 1039 */
970static void __init enable_iommus(void) 1040static void enable_iommus(void)
971{ 1041{
972 struct amd_iommu *iommu; 1042 struct amd_iommu *iommu;
973 1043
974 list_for_each_entry(iommu, &amd_iommu_list, list) { 1044 for_each_iommu(iommu) {
1045 iommu_set_device_table(iommu);
1046 iommu_enable_command_buffer(iommu);
1047 iommu_enable_event_buffer(iommu);
975 iommu_set_exclusion_range(iommu); 1048 iommu_set_exclusion_range(iommu);
976 iommu_init_msi(iommu); 1049 iommu_init_msi(iommu);
977 iommu_enable_event_logging(iommu);
978 iommu_enable(iommu); 1050 iommu_enable(iommu);
979 } 1051 }
980} 1052}
981 1053
1054static void disable_iommus(void)
1055{
1056 struct amd_iommu *iommu;
1057
1058 for_each_iommu(iommu)
1059 iommu_disable(iommu);
1060}
1061
982/* 1062/*
983 * Suspend/Resume support 1063 * Suspend/Resume support
984 * disable suspend until real resume implemented 1064 * disable suspend until real resume implemented
@@ -986,12 +1066,31 @@ static void __init enable_iommus(void)
986 1066
987static int amd_iommu_resume(struct sys_device *dev) 1067static int amd_iommu_resume(struct sys_device *dev)
988{ 1068{
1069 /*
1070 * Disable IOMMUs before reprogramming the hardware registers.
1071 * IOMMU is still enabled from the resume kernel.
1072 */
1073 disable_iommus();
1074
1075 /* re-load the hardware */
1076 enable_iommus();
1077
1078 /*
1079 * we have to flush after the IOMMUs are enabled because a
1080 * disabled IOMMU will never execute the commands we send
1081 */
1082 amd_iommu_flush_all_domains();
1083 amd_iommu_flush_all_devices();
1084
989 return 0; 1085 return 0;
990} 1086}
991 1087
992static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) 1088static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
993{ 1089{
994 return -EINVAL; 1090 /* disable IOMMUs to go out of the way for BIOS */
1091 disable_iommus();
1092
1093 return 0;
995} 1094}
996 1095
997static struct sysdev_class amd_iommu_sysdev_class = { 1096static struct sysdev_class amd_iommu_sysdev_class = {
@@ -1137,9 +1236,6 @@ int __init amd_iommu_init(void)
1137 1236
1138 enable_iommus(); 1237 enable_iommus();
1139 1238
1140 printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n",
1141 (1 << (amd_iommu_aperture_order-20)));
1142
1143 printk(KERN_INFO "AMD IOMMU: device isolation "); 1239 printk(KERN_INFO "AMD IOMMU: device isolation ");
1144 if (amd_iommu_isolate) 1240 if (amd_iommu_isolate)
1145 printk("enabled\n"); 1241 printk("enabled\n");
@@ -1211,6 +1307,13 @@ void __init amd_iommu_detect(void)
1211 * 1307 *
1212 ****************************************************************************/ 1308 ****************************************************************************/
1213 1309
1310static int __init parse_amd_iommu_dump(char *str)
1311{
1312 amd_iommu_dump = true;
1313
1314 return 1;
1315}
1316
1214static int __init parse_amd_iommu_options(char *str) 1317static int __init parse_amd_iommu_options(char *str)
1215{ 1318{
1216 for (; *str; ++str) { 1319 for (; *str; ++str) {
@@ -1225,15 +1328,5 @@ static int __init parse_amd_iommu_options(char *str)
1225 return 1; 1328 return 1;
1226} 1329}
1227 1330
1228static int __init parse_amd_iommu_size_options(char *str) 1331__setup("amd_iommu_dump", parse_amd_iommu_dump);
1229{
1230 unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
1231
1232 if ((order > 24) && (order < 31))
1233 amd_iommu_aperture_order = order;
1234
1235 return 1;
1236}
1237
1238__setup("amd_iommu=", parse_amd_iommu_options); 1332__setup("amd_iommu=", parse_amd_iommu_options);
1239__setup("amd_iommu_size=", parse_amd_iommu_size_options);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f2870920f246..a4c9cf0bf70b 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -98,6 +98,29 @@ early_param("lapic", parse_lapic);
98/* Local APIC was disabled by the BIOS and enabled by the kernel */ 98/* Local APIC was disabled by the BIOS and enabled by the kernel */
99static int enabled_via_apicbase; 99static int enabled_via_apicbase;
100 100
101/*
102 * Handle interrupt mode configuration register (IMCR).
103 * This register controls whether the interrupt signals
104 * that reach the BSP come from the master PIC or from the
105 * local APIC. Before entering Symmetric I/O Mode, either
106 * the BIOS or the operating system must switch out of
107 * PIC Mode by changing the IMCR.
108 */
109static inline void imcr_pic_to_apic(void)
110{
111 /* select IMCR register */
112 outb(0x70, 0x22);
113 /* NMI and 8259 INTR go through APIC */
114 outb(0x01, 0x23);
115}
116
117static inline void imcr_apic_to_pic(void)
118{
119 /* select IMCR register */
120 outb(0x70, 0x22);
121 /* NMI and 8259 INTR go directly to BSP */
122 outb(0x00, 0x23);
123}
101#endif 124#endif
102 125
103#ifdef CONFIG_X86_64 126#ifdef CONFIG_X86_64
@@ -111,13 +134,19 @@ static __init int setup_apicpmtimer(char *s)
111__setup("apicpmtimer", setup_apicpmtimer); 134__setup("apicpmtimer", setup_apicpmtimer);
112#endif 135#endif
113 136
137int x2apic_mode;
114#ifdef CONFIG_X86_X2APIC 138#ifdef CONFIG_X86_X2APIC
115int x2apic;
116/* x2apic enabled before OS handover */ 139/* x2apic enabled before OS handover */
117static int x2apic_preenabled; 140static int x2apic_preenabled;
118static int disable_x2apic; 141static int disable_x2apic;
119static __init int setup_nox2apic(char *str) 142static __init int setup_nox2apic(char *str)
120{ 143{
144 if (x2apic_enabled()) {
145 pr_warning("Bios already enabled x2apic, "
146 "can't enforce nox2apic");
147 return 0;
148 }
149
121 disable_x2apic = 1; 150 disable_x2apic = 1;
122 setup_clear_cpu_cap(X86_FEATURE_X2APIC); 151 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
123 return 0; 152 return 0;
@@ -209,6 +238,31 @@ static int modern_apic(void)
209 return lapic_get_version() >= 0x14; 238 return lapic_get_version() >= 0x14;
210} 239}
211 240
241/*
242 * bare function to substitute write operation
243 * and it's _that_ fast :)
244 */
245static void native_apic_write_dummy(u32 reg, u32 v)
246{
247 WARN_ON_ONCE((cpu_has_apic || !disable_apic));
248}
249
250static u32 native_apic_read_dummy(u32 reg)
251{
252 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
253 return 0;
254}
255
256/*
257 * right after this call apic->write/read doesn't do anything
258 * note that there is no restore operation it works one way
259 */
260void apic_disable(void)
261{
262 apic->read = native_apic_read_dummy;
263 apic->write = native_apic_write_dummy;
264}
265
212void native_apic_wait_icr_idle(void) 266void native_apic_wait_icr_idle(void)
213{ 267{
214 while (apic_read(APIC_ICR) & APIC_ICR_BUSY) 268 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
@@ -348,7 +402,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
348 402
349static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) 403static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
350{ 404{
351 unsigned long reg = (lvt_off << 4) + APIC_EILVT0; 405 unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0);
352 unsigned int v = (mask << 16) | (msg_type << 8) | vector; 406 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
353 407
354 apic_write(reg, v); 408 apic_write(reg, v);
@@ -815,7 +869,7 @@ void clear_local_APIC(void)
815 u32 v; 869 u32 v;
816 870
817 /* APIC hasn't been mapped yet */ 871 /* APIC hasn't been mapped yet */
818 if (!x2apic && !apic_phys) 872 if (!x2apic_mode && !apic_phys)
819 return; 873 return;
820 874
821 maxlvt = lapic_get_maxlvt(); 875 maxlvt = lapic_get_maxlvt();
@@ -1287,7 +1341,7 @@ void check_x2apic(void)
1287{ 1341{
1288 if (x2apic_enabled()) { 1342 if (x2apic_enabled()) {
1289 pr_info("x2apic enabled by BIOS, switching to x2apic ops\n"); 1343 pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
1290 x2apic_preenabled = x2apic = 1; 1344 x2apic_preenabled = x2apic_mode = 1;
1291 } 1345 }
1292} 1346}
1293 1347
@@ -1295,7 +1349,7 @@ void enable_x2apic(void)
1295{ 1349{
1296 int msr, msr2; 1350 int msr, msr2;
1297 1351
1298 if (!x2apic) 1352 if (!x2apic_mode)
1299 return; 1353 return;
1300 1354
1301 rdmsr(MSR_IA32_APICBASE, msr, msr2); 1355 rdmsr(MSR_IA32_APICBASE, msr, msr2);
@@ -1304,6 +1358,7 @@ void enable_x2apic(void)
1304 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); 1358 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
1305 } 1359 }
1306} 1360}
1361#endif /* CONFIG_X86_X2APIC */
1307 1362
1308void __init enable_IR_x2apic(void) 1363void __init enable_IR_x2apic(void)
1309{ 1364{
@@ -1312,32 +1367,21 @@ void __init enable_IR_x2apic(void)
1312 unsigned long flags; 1367 unsigned long flags;
1313 struct IO_APIC_route_entry **ioapic_entries = NULL; 1368 struct IO_APIC_route_entry **ioapic_entries = NULL;
1314 1369
1315 if (!cpu_has_x2apic) 1370 ret = dmar_table_init();
1316 return; 1371 if (ret) {
1317 1372 pr_debug("dmar_table_init() failed with %d:\n", ret);
1318 if (!x2apic_preenabled && disable_x2apic) { 1373 goto ir_failed;
1319 pr_info("Skipped enabling x2apic and Interrupt-remapping "
1320 "because of nox2apic\n");
1321 return;
1322 } 1374 }
1323 1375
1324 if (x2apic_preenabled && disable_x2apic) 1376 if (!intr_remapping_supported()) {
1325 panic("Bios already enabled x2apic, can't enforce nox2apic"); 1377 pr_debug("intr-remapping not supported\n");
1326 1378 goto ir_failed;
1327 if (!x2apic_preenabled && skip_ioapic_setup) {
1328 pr_info("Skipped enabling x2apic and Interrupt-remapping "
1329 "because of skipping io-apic setup\n");
1330 return;
1331 } 1379 }
1332 1380
1333 ret = dmar_table_init();
1334 if (ret) {
1335 pr_info("dmar_table_init() failed with %d:\n", ret);
1336 1381
1337 if (x2apic_preenabled) 1382 if (!x2apic_preenabled && skip_ioapic_setup) {
1338 panic("x2apic enabled by bios. But IR enabling failed"); 1383 pr_info("Skipped enabling intr-remap because of skipping "
1339 else 1384 "io-apic setup\n");
1340 pr_info("Not enabling x2apic,Intr-remapping\n");
1341 return; 1385 return;
1342 } 1386 }
1343 1387
@@ -1357,19 +1401,16 @@ void __init enable_IR_x2apic(void)
1357 mask_IO_APIC_setup(ioapic_entries); 1401 mask_IO_APIC_setup(ioapic_entries);
1358 mask_8259A(); 1402 mask_8259A();
1359 1403
1360 ret = enable_intr_remapping(EIM_32BIT_APIC_ID); 1404 ret = enable_intr_remapping(x2apic_supported());
1361
1362 if (ret && x2apic_preenabled) {
1363 local_irq_restore(flags);
1364 panic("x2apic enabled by bios. But IR enabling failed");
1365 }
1366
1367 if (ret) 1405 if (ret)
1368 goto end_restore; 1406 goto end_restore;
1369 1407
1370 if (!x2apic) { 1408 pr_info("Enabled Interrupt-remapping\n");
1371 x2apic = 1; 1409
1410 if (x2apic_supported() && !x2apic_mode) {
1411 x2apic_mode = 1;
1372 enable_x2apic(); 1412 enable_x2apic();
1413 pr_info("Enabled x2apic\n");
1373 } 1414 }
1374 1415
1375end_restore: 1416end_restore:
@@ -1378,37 +1419,34 @@ end_restore:
1378 * IR enabling failed 1419 * IR enabling failed
1379 */ 1420 */
1380 restore_IO_APIC_setup(ioapic_entries); 1421 restore_IO_APIC_setup(ioapic_entries);
1381 else
1382 reinit_intr_remapped_IO_APIC(x2apic_preenabled, ioapic_entries);
1383 1422
1384 unmask_8259A(); 1423 unmask_8259A();
1385 local_irq_restore(flags); 1424 local_irq_restore(flags);
1386 1425
1387end: 1426end:
1388 if (!ret) {
1389 if (!x2apic_preenabled)
1390 pr_info("Enabled x2apic and interrupt-remapping\n");
1391 else
1392 pr_info("Enabled Interrupt-remapping\n");
1393 } else
1394 pr_err("Failed to enable Interrupt-remapping and x2apic\n");
1395 if (ioapic_entries) 1427 if (ioapic_entries)
1396 free_ioapic_entries(ioapic_entries); 1428 free_ioapic_entries(ioapic_entries);
1429
1430 if (!ret)
1431 return;
1432
1433ir_failed:
1434 if (x2apic_preenabled)
1435 panic("x2apic enabled by bios. But IR enabling failed");
1436 else if (cpu_has_x2apic)
1437 pr_info("Not enabling x2apic,Intr-remapping\n");
1397#else 1438#else
1398 if (!cpu_has_x2apic) 1439 if (!cpu_has_x2apic)
1399 return; 1440 return;
1400 1441
1401 if (x2apic_preenabled) 1442 if (x2apic_preenabled)
1402 panic("x2apic enabled prior OS handover," 1443 panic("x2apic enabled prior OS handover,"
1403 " enable CONFIG_INTR_REMAP"); 1444 " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP");
1404
1405 pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping "
1406 " and x2apic\n");
1407#endif 1445#endif
1408 1446
1409 return; 1447 return;
1410} 1448}
1411#endif /* CONFIG_X86_X2APIC */ 1449
1412 1450
1413#ifdef CONFIG_X86_64 1451#ifdef CONFIG_X86_64
1414/* 1452/*
@@ -1425,7 +1463,6 @@ static int __init detect_init_APIC(void)
1425 } 1463 }
1426 1464
1427 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; 1465 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
1428 boot_cpu_physical_apicid = 0;
1429 return 0; 1466 return 0;
1430} 1467}
1431#else 1468#else
@@ -1539,32 +1576,49 @@ void __init early_init_lapic_mapping(void)
1539 */ 1576 */
1540void __init init_apic_mappings(void) 1577void __init init_apic_mappings(void)
1541{ 1578{
1542 if (x2apic) { 1579 unsigned int new_apicid;
1580
1581 if (x2apic_mode) {
1543 boot_cpu_physical_apicid = read_apic_id(); 1582 boot_cpu_physical_apicid = read_apic_id();
1544 return; 1583 return;
1545 } 1584 }
1546 1585
1547 /* 1586 /* If no local APIC can be found return early */
1548 * If no local APIC can be found then set up a fake all
1549 * zeroes page to simulate the local APIC and another
1550 * one for the IO-APIC.
1551 */
1552 if (!smp_found_config && detect_init_APIC()) { 1587 if (!smp_found_config && detect_init_APIC()) {
1553 apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); 1588 /* lets NOP'ify apic operations */
1554 apic_phys = __pa(apic_phys); 1589 pr_info("APIC: disable apic facility\n");
1555 } else 1590 apic_disable();
1591 } else {
1556 apic_phys = mp_lapic_addr; 1592 apic_phys = mp_lapic_addr;
1557 1593
1558 set_fixmap_nocache(FIX_APIC_BASE, apic_phys); 1594 /*
1559 apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", 1595 * acpi lapic path already maps that address in
1560 APIC_BASE, apic_phys); 1596 * acpi_register_lapic_address()
1597 */
1598 if (!acpi_lapic)
1599 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
1600
1601 apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
1602 APIC_BASE, apic_phys);
1603 }
1561 1604
1562 /* 1605 /*
1563 * Fetch the APIC ID of the BSP in case we have a 1606 * Fetch the APIC ID of the BSP in case we have a
1564 * default configuration (or the MP table is broken). 1607 * default configuration (or the MP table is broken).
1565 */ 1608 */
1566 if (boot_cpu_physical_apicid == -1U) 1609 new_apicid = read_apic_id();
1567 boot_cpu_physical_apicid = read_apic_id(); 1610 if (boot_cpu_physical_apicid != new_apicid) {
1611 boot_cpu_physical_apicid = new_apicid;
1612 /*
1613 * yeah -- we lie about apic_version
1614 * in case if apic was disabled via boot option
1615 * but it's not a problem for SMP compiled kernel
1616 * since smp_sanity_check is prepared for such a case
1617 * and disable smp mode
1618 */
1619 apic_version[new_apicid] =
1620 GET_APIC_VERSION(apic_read(APIC_LVR));
1621 }
1568} 1622}
1569 1623
1570/* 1624/*
@@ -1733,8 +1787,7 @@ void __init connect_bsp_APIC(void)
1733 */ 1787 */
1734 apic_printk(APIC_VERBOSE, "leaving PIC mode, " 1788 apic_printk(APIC_VERBOSE, "leaving PIC mode, "
1735 "enabling APIC mode.\n"); 1789 "enabling APIC mode.\n");
1736 outb(0x70, 0x22); 1790 imcr_pic_to_apic();
1737 outb(0x01, 0x23);
1738 } 1791 }
1739#endif 1792#endif
1740 if (apic->enable_apic_mode) 1793 if (apic->enable_apic_mode)
@@ -1762,8 +1815,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1762 */ 1815 */
1763 apic_printk(APIC_VERBOSE, "disabling APIC mode, " 1816 apic_printk(APIC_VERBOSE, "disabling APIC mode, "
1764 "entering PIC mode.\n"); 1817 "entering PIC mode.\n");
1765 outb(0x70, 0x22); 1818 imcr_apic_to_pic();
1766 outb(0x00, 0x23);
1767 return; 1819 return;
1768 } 1820 }
1769#endif 1821#endif
@@ -1969,10 +2021,10 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1969 2021
1970 local_irq_save(flags); 2022 local_irq_save(flags);
1971 disable_local_APIC(); 2023 disable_local_APIC();
1972#ifdef CONFIG_INTR_REMAP 2024
1973 if (intr_remapping_enabled) 2025 if (intr_remapping_enabled)
1974 disable_intr_remapping(); 2026 disable_intr_remapping();
1975#endif 2027
1976 local_irq_restore(flags); 2028 local_irq_restore(flags);
1977 return 0; 2029 return 0;
1978} 2030}
@@ -1982,42 +2034,34 @@ static int lapic_resume(struct sys_device *dev)
1982 unsigned int l, h; 2034 unsigned int l, h;
1983 unsigned long flags; 2035 unsigned long flags;
1984 int maxlvt; 2036 int maxlvt;
1985 2037 int ret = 0;
1986#ifdef CONFIG_INTR_REMAP
1987 int ret;
1988 struct IO_APIC_route_entry **ioapic_entries = NULL; 2038 struct IO_APIC_route_entry **ioapic_entries = NULL;
1989 2039
1990 if (!apic_pm_state.active) 2040 if (!apic_pm_state.active)
1991 return 0; 2041 return 0;
1992 2042
1993 local_irq_save(flags); 2043 local_irq_save(flags);
1994 if (x2apic) { 2044 if (intr_remapping_enabled) {
1995 ioapic_entries = alloc_ioapic_entries(); 2045 ioapic_entries = alloc_ioapic_entries();
1996 if (!ioapic_entries) { 2046 if (!ioapic_entries) {
1997 WARN(1, "Alloc ioapic_entries in lapic resume failed."); 2047 WARN(1, "Alloc ioapic_entries in lapic resume failed.");
1998 return -ENOMEM; 2048 ret = -ENOMEM;
2049 goto restore;
1999 } 2050 }
2000 2051
2001 ret = save_IO_APIC_setup(ioapic_entries); 2052 ret = save_IO_APIC_setup(ioapic_entries);
2002 if (ret) { 2053 if (ret) {
2003 WARN(1, "Saving IO-APIC state failed: %d\n", ret); 2054 WARN(1, "Saving IO-APIC state failed: %d\n", ret);
2004 free_ioapic_entries(ioapic_entries); 2055 free_ioapic_entries(ioapic_entries);
2005 return ret; 2056 goto restore;
2006 } 2057 }
2007 2058
2008 mask_IO_APIC_setup(ioapic_entries); 2059 mask_IO_APIC_setup(ioapic_entries);
2009 mask_8259A(); 2060 mask_8259A();
2010 enable_x2apic();
2011 } 2061 }
2012#else
2013 if (!apic_pm_state.active)
2014 return 0;
2015 2062
2016 local_irq_save(flags); 2063 if (x2apic_mode)
2017 if (x2apic)
2018 enable_x2apic(); 2064 enable_x2apic();
2019#endif
2020
2021 else { 2065 else {
2022 /* 2066 /*
2023 * Make sure the APICBASE points to the right address 2067 * Make sure the APICBASE points to the right address
@@ -2055,21 +2099,16 @@ static int lapic_resume(struct sys_device *dev)
2055 apic_write(APIC_ESR, 0); 2099 apic_write(APIC_ESR, 0);
2056 apic_read(APIC_ESR); 2100 apic_read(APIC_ESR);
2057 2101
2058#ifdef CONFIG_INTR_REMAP 2102 if (intr_remapping_enabled) {
2059 if (intr_remapping_enabled) 2103 reenable_intr_remapping(x2apic_mode);
2060 reenable_intr_remapping(EIM_32BIT_APIC_ID);
2061
2062 if (x2apic) {
2063 unmask_8259A(); 2104 unmask_8259A();
2064 restore_IO_APIC_setup(ioapic_entries); 2105 restore_IO_APIC_setup(ioapic_entries);
2065 free_ioapic_entries(ioapic_entries); 2106 free_ioapic_entries(ioapic_entries);
2066 } 2107 }
2067#endif 2108restore:
2068
2069 local_irq_restore(flags); 2109 local_irq_restore(flags);
2070 2110
2071 2111 return ret;
2072 return 0;
2073} 2112}
2074 2113
2075/* 2114/*
@@ -2117,31 +2156,14 @@ static void apic_pm_activate(void) { }
2117#endif /* CONFIG_PM */ 2156#endif /* CONFIG_PM */
2118 2157
2119#ifdef CONFIG_X86_64 2158#ifdef CONFIG_X86_64
2120/* 2159
2121 * apic_is_clustered_box() -- Check if we can expect good TSC 2160static int __cpuinit apic_cluster_num(void)
2122 *
2123 * Thus far, the major user of this is IBM's Summit2 series:
2124 *
2125 * Clustered boxes may have unsynced TSC problems if they are
2126 * multi-chassis. Use available data to take a good guess.
2127 * If in doubt, go HPET.
2128 */
2129__cpuinit int apic_is_clustered_box(void)
2130{ 2161{
2131 int i, clusters, zeros; 2162 int i, clusters, zeros;
2132 unsigned id; 2163 unsigned id;
2133 u16 *bios_cpu_apicid; 2164 u16 *bios_cpu_apicid;
2134 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); 2165 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
2135 2166
2136 /*
2137 * there is not this kind of box with AMD CPU yet.
2138 * Some AMD box with quadcore cpu and 8 sockets apicid
2139 * will be [4, 0x23] or [8, 0x27] could be thought to
2140 * vsmp box still need checking...
2141 */
2142 if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
2143 return 0;
2144
2145 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); 2167 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
2146 bitmap_zero(clustermap, NUM_APIC_CLUSTERS); 2168 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
2147 2169
@@ -2177,18 +2199,67 @@ __cpuinit int apic_is_clustered_box(void)
2177 ++zeros; 2199 ++zeros;
2178 } 2200 }
2179 2201
2180 /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are 2202 return clusters;
2181 * not guaranteed to be synced between boards 2203}
2182 */ 2204
2183 if (is_vsmp_box() && clusters > 1) 2205static int __cpuinitdata multi_checked;
2206static int __cpuinitdata multi;
2207
2208static int __cpuinit set_multi(const struct dmi_system_id *d)
2209{
2210 if (multi)
2211 return 0;
2212 pr_info("APIC: %s detected, Multi Chassis\n", d->ident);
2213 multi = 1;
2214 return 0;
2215}
2216
2217static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = {
2218 {
2219 .callback = set_multi,
2220 .ident = "IBM System Summit2",
2221 .matches = {
2222 DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
2223 DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"),
2224 },
2225 },
2226 {}
2227};
2228
2229static void __cpuinit dmi_check_multi(void)
2230{
2231 if (multi_checked)
2232 return;
2233
2234 dmi_check_system(multi_dmi_table);
2235 multi_checked = 1;
2236}
2237
2238/*
2239 * apic_is_clustered_box() -- Check if we can expect good TSC
2240 *
2241 * Thus far, the major user of this is IBM's Summit2 series:
2242 * Clustered boxes may have unsynced TSC problems if they are
2243 * multi-chassis.
2244 * Use DMI to check them
2245 */
2246__cpuinit int apic_is_clustered_box(void)
2247{
2248 dmi_check_multi();
2249 if (multi)
2184 return 1; 2250 return 1;
2185 2251
2252 if (!is_vsmp_box())
2253 return 0;
2254
2186 /* 2255 /*
2187 * If clusters > 2, then should be multi-chassis. 2256 * ScaleMP vSMPowered boxes have one cluster per board and TSCs are
2188 * May have to revisit this when multi-core + hyperthreaded CPUs come 2257 * not guaranteed to be synced between boards
2189 * out, but AFAIK this will work even for them.
2190 */ 2258 */
2191 return (clusters > 2); 2259 if (apic_cluster_num() > 1)
2260 return 1;
2261
2262 return 0;
2192} 2263}
2193#endif 2264#endif
2194 2265
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 306e5e88fb6f..d0c99abc26c3 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -161,7 +161,7 @@ static int flat_apic_id_registered(void)
161 161
162static int flat_phys_pkg_id(int initial_apic_id, int index_msb) 162static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
163{ 163{
164 return hard_smp_processor_id() >> index_msb; 164 return initial_apic_id >> index_msb;
165} 165}
166 166
167struct apic apic_flat = { 167struct apic apic_flat = {
@@ -235,7 +235,7 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
235 * regardless of how many processors are present (x86_64 ES7000 235 * regardless of how many processors are present (x86_64 ES7000
236 * is an example). 236 * is an example).
237 */ 237 */
238 if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && 238 if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
239 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { 239 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
240 printk(KERN_DEBUG "system APIC only can use physical flat"); 240 printk(KERN_DEBUG "system APIC only can use physical flat");
241 return 1; 241 return 1;
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 302947775575..69328ac8de9c 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -145,7 +145,7 @@ es7000_rename_gsi(int ioapic, int gsi)
145 return gsi; 145 return gsi;
146} 146}
147 147
148static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) 148static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
149{ 149{
150 unsigned long vect = 0, psaival = 0; 150 unsigned long vect = 0, psaival = 0;
151 151
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 30da617d18e4..1946fac42ab3 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -59,6 +59,7 @@
59#include <asm/setup.h> 59#include <asm/setup.h>
60#include <asm/irq_remapping.h> 60#include <asm/irq_remapping.h>
61#include <asm/hpet.h> 61#include <asm/hpet.h>
62#include <asm/hw_irq.h>
62#include <asm/uv/uv_hub.h> 63#include <asm/uv/uv_hub.h>
63#include <asm/uv/uv_irq.h> 64#include <asm/uv/uv_irq.h>
64 65
@@ -129,12 +130,9 @@ struct irq_pin_list {
129 struct irq_pin_list *next; 130 struct irq_pin_list *next;
130}; 131};
131 132
132static struct irq_pin_list *get_one_free_irq_2_pin(int cpu) 133static struct irq_pin_list *get_one_free_irq_2_pin(int node)
133{ 134{
134 struct irq_pin_list *pin; 135 struct irq_pin_list *pin;
135 int node;
136
137 node = cpu_to_node(cpu);
138 136
139 pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); 137 pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
140 138
@@ -148,9 +146,6 @@ struct irq_cfg {
148 unsigned move_cleanup_count; 146 unsigned move_cleanup_count;
149 u8 vector; 147 u8 vector;
150 u8 move_in_progress : 1; 148 u8 move_in_progress : 1;
151#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
152 u8 move_desc_pending : 1;
153#endif
154}; 149};
155 150
156/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 151/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
@@ -212,12 +207,9 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
212 return cfg; 207 return cfg;
213} 208}
214 209
215static struct irq_cfg *get_one_free_irq_cfg(int cpu) 210static struct irq_cfg *get_one_free_irq_cfg(int node)
216{ 211{
217 struct irq_cfg *cfg; 212 struct irq_cfg *cfg;
218 int node;
219
220 node = cpu_to_node(cpu);
221 213
222 cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); 214 cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
223 if (cfg) { 215 if (cfg) {
@@ -238,13 +230,13 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu)
238 return cfg; 230 return cfg;
239} 231}
240 232
241int arch_init_chip_data(struct irq_desc *desc, int cpu) 233int arch_init_chip_data(struct irq_desc *desc, int node)
242{ 234{
243 struct irq_cfg *cfg; 235 struct irq_cfg *cfg;
244 236
245 cfg = desc->chip_data; 237 cfg = desc->chip_data;
246 if (!cfg) { 238 if (!cfg) {
247 desc->chip_data = get_one_free_irq_cfg(cpu); 239 desc->chip_data = get_one_free_irq_cfg(node);
248 if (!desc->chip_data) { 240 if (!desc->chip_data) {
249 printk(KERN_ERR "can not alloc irq_cfg\n"); 241 printk(KERN_ERR "can not alloc irq_cfg\n");
250 BUG_ON(1); 242 BUG_ON(1);
@@ -254,10 +246,9 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu)
254 return 0; 246 return 0;
255} 247}
256 248
257#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC 249/* for move_irq_desc */
258
259static void 250static void
260init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) 251init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node)
261{ 252{
262 struct irq_pin_list *old_entry, *head, *tail, *entry; 253 struct irq_pin_list *old_entry, *head, *tail, *entry;
263 254
@@ -266,7 +257,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
266 if (!old_entry) 257 if (!old_entry)
267 return; 258 return;
268 259
269 entry = get_one_free_irq_2_pin(cpu); 260 entry = get_one_free_irq_2_pin(node);
270 if (!entry) 261 if (!entry)
271 return; 262 return;
272 263
@@ -276,7 +267,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
276 tail = entry; 267 tail = entry;
277 old_entry = old_entry->next; 268 old_entry = old_entry->next;
278 while (old_entry) { 269 while (old_entry) {
279 entry = get_one_free_irq_2_pin(cpu); 270 entry = get_one_free_irq_2_pin(node);
280 if (!entry) { 271 if (!entry) {
281 entry = head; 272 entry = head;
282 while (entry) { 273 while (entry) {
@@ -316,12 +307,12 @@ static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
316} 307}
317 308
318void arch_init_copy_chip_data(struct irq_desc *old_desc, 309void arch_init_copy_chip_data(struct irq_desc *old_desc,
319 struct irq_desc *desc, int cpu) 310 struct irq_desc *desc, int node)
320{ 311{
321 struct irq_cfg *cfg; 312 struct irq_cfg *cfg;
322 struct irq_cfg *old_cfg; 313 struct irq_cfg *old_cfg;
323 314
324 cfg = get_one_free_irq_cfg(cpu); 315 cfg = get_one_free_irq_cfg(node);
325 316
326 if (!cfg) 317 if (!cfg)
327 return; 318 return;
@@ -332,7 +323,7 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc,
332 323
333 memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); 324 memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
334 325
335 init_copy_irq_2_pin(old_cfg, cfg, cpu); 326 init_copy_irq_2_pin(old_cfg, cfg, node);
336} 327}
337 328
338static void free_irq_cfg(struct irq_cfg *old_cfg) 329static void free_irq_cfg(struct irq_cfg *old_cfg)
@@ -356,19 +347,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
356 old_desc->chip_data = NULL; 347 old_desc->chip_data = NULL;
357 } 348 }
358} 349}
359 350/* end for move_irq_desc */
360static void
361set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
362{
363 struct irq_cfg *cfg = desc->chip_data;
364
365 if (!cfg->move_in_progress) {
366 /* it means that domain is not changed */
367 if (!cpumask_intersects(desc->affinity, mask))
368 cfg->move_desc_pending = 1;
369 }
370}
371#endif
372 351
373#else 352#else
374static struct irq_cfg *irq_cfg(unsigned int irq) 353static struct irq_cfg *irq_cfg(unsigned int irq)
@@ -378,13 +357,6 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
378 357
379#endif 358#endif
380 359
381#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
382static inline void
383set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
384{
385}
386#endif
387
388struct io_apic { 360struct io_apic {
389 unsigned int index; 361 unsigned int index;
390 unsigned int unused[3]; 362 unsigned int unused[3];
@@ -518,132 +490,18 @@ static void ioapic_mask_entry(int apic, int pin)
518 spin_unlock_irqrestore(&ioapic_lock, flags); 490 spin_unlock_irqrestore(&ioapic_lock, flags);
519} 491}
520 492
521#ifdef CONFIG_SMP
522static void send_cleanup_vector(struct irq_cfg *cfg)
523{
524 cpumask_var_t cleanup_mask;
525
526 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
527 unsigned int i;
528 cfg->move_cleanup_count = 0;
529 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
530 cfg->move_cleanup_count++;
531 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
532 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
533 } else {
534 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
535 cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
536 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
537 free_cpumask_var(cleanup_mask);
538 }
539 cfg->move_in_progress = 0;
540}
541
542static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
543{
544 int apic, pin;
545 struct irq_pin_list *entry;
546 u8 vector = cfg->vector;
547
548 entry = cfg->irq_2_pin;
549 for (;;) {
550 unsigned int reg;
551
552 if (!entry)
553 break;
554
555 apic = entry->apic;
556 pin = entry->pin;
557 /*
558 * With interrupt-remapping, destination information comes
559 * from interrupt-remapping table entry.
560 */
561 if (!irq_remapped(irq))
562 io_apic_write(apic, 0x11 + pin*2, dest);
563 reg = io_apic_read(apic, 0x10 + pin*2);
564 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
565 reg |= vector;
566 io_apic_modify(apic, 0x10 + pin*2, reg);
567 if (!entry->next)
568 break;
569 entry = entry->next;
570 }
571}
572
573static int
574assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
575
576/*
577 * Either sets desc->affinity to a valid value, and returns
578 * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
579 * leaves desc->affinity untouched.
580 */
581static unsigned int
582set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
583{
584 struct irq_cfg *cfg;
585 unsigned int irq;
586
587 if (!cpumask_intersects(mask, cpu_online_mask))
588 return BAD_APICID;
589
590 irq = desc->irq;
591 cfg = desc->chip_data;
592 if (assign_irq_vector(irq, cfg, mask))
593 return BAD_APICID;
594
595 /* check that before desc->addinity get updated */
596 set_extra_move_desc(desc, mask);
597
598 cpumask_copy(desc->affinity, mask);
599
600 return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
601}
602
603static void
604set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
605{
606 struct irq_cfg *cfg;
607 unsigned long flags;
608 unsigned int dest;
609 unsigned int irq;
610
611 irq = desc->irq;
612 cfg = desc->chip_data;
613
614 spin_lock_irqsave(&ioapic_lock, flags);
615 dest = set_desc_affinity(desc, mask);
616 if (dest != BAD_APICID) {
617 /* Only the high 8 bits are valid. */
618 dest = SET_APIC_LOGICAL_ID(dest);
619 __target_IO_APIC_irq(irq, dest, cfg);
620 }
621 spin_unlock_irqrestore(&ioapic_lock, flags);
622}
623
624static void
625set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
626{
627 struct irq_desc *desc;
628
629 desc = irq_to_desc(irq);
630
631 set_ioapic_affinity_irq_desc(desc, mask);
632}
633#endif /* CONFIG_SMP */
634
635/* 493/*
636 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are 494 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
637 * shared ISA-space IRQs, so we have to support them. We are super 495 * shared ISA-space IRQs, so we have to support them. We are super
638 * fast in the common case, and fast for shared ISA-space IRQs. 496 * fast in the common case, and fast for shared ISA-space IRQs.
639 */ 497 */
640static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) 498static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
641{ 499{
642 struct irq_pin_list *entry; 500 struct irq_pin_list *entry;
643 501
644 entry = cfg->irq_2_pin; 502 entry = cfg->irq_2_pin;
645 if (!entry) { 503 if (!entry) {
646 entry = get_one_free_irq_2_pin(cpu); 504 entry = get_one_free_irq_2_pin(node);
647 if (!entry) { 505 if (!entry) {
648 printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", 506 printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
649 apic, pin); 507 apic, pin);
@@ -663,7 +521,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
663 entry = entry->next; 521 entry = entry->next;
664 } 522 }
665 523
666 entry->next = get_one_free_irq_2_pin(cpu); 524 entry->next = get_one_free_irq_2_pin(node);
667 entry = entry->next; 525 entry = entry->next;
668 entry->apic = apic; 526 entry->apic = apic;
669 entry->pin = pin; 527 entry->pin = pin;
@@ -672,7 +530,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
672/* 530/*
673 * Reroute an IRQ to a different pin. 531 * Reroute an IRQ to a different pin.
674 */ 532 */
675static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, 533static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
676 int oldapic, int oldpin, 534 int oldapic, int oldpin,
677 int newapic, int newpin) 535 int newapic, int newpin)
678{ 536{
@@ -692,7 +550,7 @@ static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
692 550
693 /* why? call replace before add? */ 551 /* why? call replace before add? */
694 if (!replaced) 552 if (!replaced)
695 add_pin_to_irq_cpu(cfg, cpu, newapic, newpin); 553 add_pin_to_irq_node(cfg, node, newapic, newpin);
696} 554}
697 555
698static inline void io_apic_modify_irq(struct irq_cfg *cfg, 556static inline void io_apic_modify_irq(struct irq_cfg *cfg,
@@ -850,7 +708,6 @@ static int __init ioapic_pirq_setup(char *str)
850__setup("pirq=", ioapic_pirq_setup); 708__setup("pirq=", ioapic_pirq_setup);
851#endif /* CONFIG_X86_32 */ 709#endif /* CONFIG_X86_32 */
852 710
853#ifdef CONFIG_INTR_REMAP
854struct IO_APIC_route_entry **alloc_ioapic_entries(void) 711struct IO_APIC_route_entry **alloc_ioapic_entries(void)
855{ 712{
856 int apic; 713 int apic;
@@ -948,20 +805,6 @@ int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
948 return 0; 805 return 0;
949} 806}
950 807
951void reinit_intr_remapped_IO_APIC(int intr_remapping,
952 struct IO_APIC_route_entry **ioapic_entries)
953
954{
955 /*
956 * for now plain restore of previous settings.
957 * TBD: In the case of OS enabling interrupt-remapping,
958 * IO-APIC RTE's need to be setup to point to interrupt-remapping
959 * table entries. for now, do a plain restore, and wait for
960 * the setup_IO_APIC_irqs() to do proper initialization.
961 */
962 restore_IO_APIC_setup(ioapic_entries);
963}
964
965void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) 808void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
966{ 809{
967 int apic; 810 int apic;
@@ -971,7 +814,6 @@ void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
971 814
972 kfree(ioapic_entries); 815 kfree(ioapic_entries);
973} 816}
974#endif
975 817
976/* 818/*
977 * Find the IRQ entry number of a certain pin. 819 * Find the IRQ entry number of a certain pin.
@@ -1032,54 +874,6 @@ static int __init find_isa_irq_apic(int irq, int type)
1032 return -1; 874 return -1;
1033} 875}
1034 876
1035/*
1036 * Find a specific PCI IRQ entry.
1037 * Not an __init, possibly needed by modules
1038 */
1039static int pin_2_irq(int idx, int apic, int pin);
1040
1041int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
1042{
1043 int apic, i, best_guess = -1;
1044
1045 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
1046 bus, slot, pin);
1047 if (test_bit(bus, mp_bus_not_pci)) {
1048 apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
1049 return -1;
1050 }
1051 for (i = 0; i < mp_irq_entries; i++) {
1052 int lbus = mp_irqs[i].srcbus;
1053
1054 for (apic = 0; apic < nr_ioapics; apic++)
1055 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
1056 mp_irqs[i].dstapic == MP_APIC_ALL)
1057 break;
1058
1059 if (!test_bit(lbus, mp_bus_not_pci) &&
1060 !mp_irqs[i].irqtype &&
1061 (bus == lbus) &&
1062 (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
1063 int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
1064
1065 if (!(apic || IO_APIC_IRQ(irq)))
1066 continue;
1067
1068 if (pin == (mp_irqs[i].srcbusirq & 3))
1069 return irq;
1070 /*
1071 * Use the first all-but-pin matching entry as a
1072 * best-guess fuzzy result for broken mptables.
1073 */
1074 if (best_guess < 0)
1075 best_guess = irq;
1076 }
1077 }
1078 return best_guess;
1079}
1080
1081EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
1082
1083#if defined(CONFIG_EISA) || defined(CONFIG_MCA) 877#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
1084/* 878/*
1085 * EISA Edge/Level control register, ELCR 879 * EISA Edge/Level control register, ELCR
@@ -1298,6 +1092,64 @@ static int pin_2_irq(int idx, int apic, int pin)
1298 return irq; 1092 return irq;
1299} 1093}
1300 1094
1095/*
1096 * Find a specific PCI IRQ entry.
1097 * Not an __init, possibly needed by modules
1098 */
1099int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
1100 struct io_apic_irq_attr *irq_attr)
1101{
1102 int apic, i, best_guess = -1;
1103
1104 apic_printk(APIC_DEBUG,
1105 "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
1106 bus, slot, pin);
1107 if (test_bit(bus, mp_bus_not_pci)) {
1108 apic_printk(APIC_VERBOSE,
1109 "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
1110 return -1;
1111 }
1112 for (i = 0; i < mp_irq_entries; i++) {
1113 int lbus = mp_irqs[i].srcbus;
1114
1115 for (apic = 0; apic < nr_ioapics; apic++)
1116 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
1117 mp_irqs[i].dstapic == MP_APIC_ALL)
1118 break;
1119
1120 if (!test_bit(lbus, mp_bus_not_pci) &&
1121 !mp_irqs[i].irqtype &&
1122 (bus == lbus) &&
1123 (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
1124 int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
1125
1126 if (!(apic || IO_APIC_IRQ(irq)))
1127 continue;
1128
1129 if (pin == (mp_irqs[i].srcbusirq & 3)) {
1130 set_io_apic_irq_attr(irq_attr, apic,
1131 mp_irqs[i].dstirq,
1132 irq_trigger(i),
1133 irq_polarity(i));
1134 return irq;
1135 }
1136 /*
1137 * Use the first all-but-pin matching entry as a
1138 * best-guess fuzzy result for broken mptables.
1139 */
1140 if (best_guess < 0) {
1141 set_io_apic_irq_attr(irq_attr, apic,
1142 mp_irqs[i].dstirq,
1143 irq_trigger(i),
1144 irq_polarity(i));
1145 best_guess = irq;
1146 }
1147 }
1148 }
1149 return best_guess;
1150}
1151EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
1152
1301void lock_vector_lock(void) 1153void lock_vector_lock(void)
1302{ 1154{
1303 /* Used to the online set of cpus does not change 1155 /* Used to the online set of cpus does not change
@@ -1628,58 +1480,70 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1628 ioapic_write_entry(apic_id, pin, entry); 1480 ioapic_write_entry(apic_id, pin, entry);
1629} 1481}
1630 1482
1483static struct {
1484 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
1485} mp_ioapic_routing[MAX_IO_APICS];
1486
1631static void __init setup_IO_APIC_irqs(void) 1487static void __init setup_IO_APIC_irqs(void)
1632{ 1488{
1633 int apic_id, pin, idx, irq; 1489 int apic_id = 0, pin, idx, irq;
1634 int notcon = 0; 1490 int notcon = 0;
1635 struct irq_desc *desc; 1491 struct irq_desc *desc;
1636 struct irq_cfg *cfg; 1492 struct irq_cfg *cfg;
1637 int cpu = boot_cpu_id; 1493 int node = cpu_to_node(boot_cpu_id);
1638 1494
1639 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1495 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1640 1496
1641 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { 1497#ifdef CONFIG_ACPI
1642 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { 1498 if (!acpi_disabled && acpi_ioapic) {
1643 1499 apic_id = mp_find_ioapic(0);
1644 idx = find_irq_entry(apic_id, pin, mp_INT); 1500 if (apic_id < 0)
1645 if (idx == -1) { 1501 apic_id = 0;
1646 if (!notcon) { 1502 }
1647 notcon = 1; 1503#endif
1648 apic_printk(APIC_VERBOSE,
1649 KERN_DEBUG " %d-%d",
1650 mp_ioapics[apic_id].apicid, pin);
1651 } else
1652 apic_printk(APIC_VERBOSE, " %d-%d",
1653 mp_ioapics[apic_id].apicid, pin);
1654 continue;
1655 }
1656 if (notcon) {
1657 apic_printk(APIC_VERBOSE,
1658 " (apicid-pin) not connected\n");
1659 notcon = 0;
1660 }
1661 1504
1662 irq = pin_2_irq(idx, apic_id, pin); 1505 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
1506 idx = find_irq_entry(apic_id, pin, mp_INT);
1507 if (idx == -1) {
1508 if (!notcon) {
1509 notcon = 1;
1510 apic_printk(APIC_VERBOSE,
1511 KERN_DEBUG " %d-%d",
1512 mp_ioapics[apic_id].apicid, pin);
1513 } else
1514 apic_printk(APIC_VERBOSE, " %d-%d",
1515 mp_ioapics[apic_id].apicid, pin);
1516 continue;
1517 }
1518 if (notcon) {
1519 apic_printk(APIC_VERBOSE,
1520 " (apicid-pin) not connected\n");
1521 notcon = 0;
1522 }
1663 1523
1664 /* 1524 irq = pin_2_irq(idx, apic_id, pin);
1665 * Skip the timer IRQ if there's a quirk handler
1666 * installed and if it returns 1:
1667 */
1668 if (apic->multi_timer_check &&
1669 apic->multi_timer_check(apic_id, irq))
1670 continue;
1671 1525
1672 desc = irq_to_desc_alloc_cpu(irq, cpu); 1526 /*
1673 if (!desc) { 1527 * Skip the timer IRQ if there's a quirk handler
1674 printk(KERN_INFO "can not get irq_desc for %d\n", irq); 1528 * installed and if it returns 1:
1675 continue; 1529 */
1676 } 1530 if (apic->multi_timer_check &&
1677 cfg = desc->chip_data; 1531 apic->multi_timer_check(apic_id, irq))
1678 add_pin_to_irq_cpu(cfg, cpu, apic_id, pin); 1532 continue;
1679 1533
1680 setup_IO_APIC_irq(apic_id, pin, irq, desc, 1534 desc = irq_to_desc_alloc_node(irq, node);
1681 irq_trigger(idx), irq_polarity(idx)); 1535 if (!desc) {
1536 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
1537 continue;
1682 } 1538 }
1539 cfg = desc->chip_data;
1540 add_pin_to_irq_node(cfg, node, apic_id, pin);
1541 /*
1542 * don't mark it in pin_programmed, so later acpi could
1543 * set it correctly when irq < 16
1544 */
1545 setup_IO_APIC_irq(apic_id, pin, irq, desc,
1546 irq_trigger(idx), irq_polarity(idx));
1683 } 1547 }
1684 1548
1685 if (notcon) 1549 if (notcon)
@@ -1869,7 +1733,7 @@ __apicdebuginit(void) print_APIC_bitfield(int base)
1869 1733
1870__apicdebuginit(void) print_local_APIC(void *dummy) 1734__apicdebuginit(void) print_local_APIC(void *dummy)
1871{ 1735{
1872 unsigned int v, ver, maxlvt; 1736 unsigned int i, v, ver, maxlvt;
1873 u64 icr; 1737 u64 icr;
1874 1738
1875 if (apic_verbosity == APIC_QUIET) 1739 if (apic_verbosity == APIC_QUIET)
@@ -1957,6 +1821,18 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1957 printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); 1821 printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
1958 v = apic_read(APIC_TDCR); 1822 v = apic_read(APIC_TDCR);
1959 printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); 1823 printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
1824
1825 if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
1826 v = apic_read(APIC_EFEAT);
1827 maxlvt = (v >> 16) & 0xff;
1828 printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v);
1829 v = apic_read(APIC_ECTRL);
1830 printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v);
1831 for (i = 0; i < maxlvt; i++) {
1832 v = apic_read(APIC_EILVTn(i));
1833 printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);
1834 }
1835 }
1960 printk("\n"); 1836 printk("\n");
1961} 1837}
1962 1838
@@ -2005,6 +1881,11 @@ __apicdebuginit(void) print_PIC(void)
2005__apicdebuginit(int) print_all_ICs(void) 1881__apicdebuginit(int) print_all_ICs(void)
2006{ 1882{
2007 print_PIC(); 1883 print_PIC();
1884
1885 /* don't print out if apic is not there */
1886 if (!cpu_has_apic || disable_apic)
1887 return 0;
1888
2008 print_all_local_APICs(); 1889 print_all_local_APICs();
2009 print_IO_APIC(); 1890 print_IO_APIC();
2010 1891
@@ -2360,6 +2241,118 @@ static int ioapic_retrigger_irq(unsigned int irq)
2360 */ 2241 */
2361 2242
2362#ifdef CONFIG_SMP 2243#ifdef CONFIG_SMP
2244static void send_cleanup_vector(struct irq_cfg *cfg)
2245{
2246 cpumask_var_t cleanup_mask;
2247
2248 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
2249 unsigned int i;
2250 cfg->move_cleanup_count = 0;
2251 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
2252 cfg->move_cleanup_count++;
2253 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
2254 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
2255 } else {
2256 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
2257 cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
2258 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
2259 free_cpumask_var(cleanup_mask);
2260 }
2261 cfg->move_in_progress = 0;
2262}
2263
2264static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
2265{
2266 int apic, pin;
2267 struct irq_pin_list *entry;
2268 u8 vector = cfg->vector;
2269
2270 entry = cfg->irq_2_pin;
2271 for (;;) {
2272 unsigned int reg;
2273
2274 if (!entry)
2275 break;
2276
2277 apic = entry->apic;
2278 pin = entry->pin;
2279 /*
2280 * With interrupt-remapping, destination information comes
2281 * from interrupt-remapping table entry.
2282 */
2283 if (!irq_remapped(irq))
2284 io_apic_write(apic, 0x11 + pin*2, dest);
2285 reg = io_apic_read(apic, 0x10 + pin*2);
2286 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
2287 reg |= vector;
2288 io_apic_modify(apic, 0x10 + pin*2, reg);
2289 if (!entry->next)
2290 break;
2291 entry = entry->next;
2292 }
2293}
2294
2295static int
2296assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
2297
2298/*
2299 * Either sets desc->affinity to a valid value, and returns
2300 * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
2301 * leaves desc->affinity untouched.
2302 */
2303static unsigned int
2304set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
2305{
2306 struct irq_cfg *cfg;
2307 unsigned int irq;
2308
2309 if (!cpumask_intersects(mask, cpu_online_mask))
2310 return BAD_APICID;
2311
2312 irq = desc->irq;
2313 cfg = desc->chip_data;
2314 if (assign_irq_vector(irq, cfg, mask))
2315 return BAD_APICID;
2316
2317 cpumask_copy(desc->affinity, mask);
2318
2319 return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
2320}
2321
2322static int
2323set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2324{
2325 struct irq_cfg *cfg;
2326 unsigned long flags;
2327 unsigned int dest;
2328 unsigned int irq;
2329 int ret = -1;
2330
2331 irq = desc->irq;
2332 cfg = desc->chip_data;
2333
2334 spin_lock_irqsave(&ioapic_lock, flags);
2335 dest = set_desc_affinity(desc, mask);
2336 if (dest != BAD_APICID) {
2337 /* Only the high 8 bits are valid. */
2338 dest = SET_APIC_LOGICAL_ID(dest);
2339 __target_IO_APIC_irq(irq, dest, cfg);
2340 ret = 0;
2341 }
2342 spin_unlock_irqrestore(&ioapic_lock, flags);
2343
2344 return ret;
2345}
2346
2347static int
2348set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
2349{
2350 struct irq_desc *desc;
2351
2352 desc = irq_to_desc(irq);
2353
2354 return set_ioapic_affinity_irq_desc(desc, mask);
2355}
2363 2356
2364#ifdef CONFIG_INTR_REMAP 2357#ifdef CONFIG_INTR_REMAP
2365 2358
@@ -2374,26 +2367,25 @@ static int ioapic_retrigger_irq(unsigned int irq)
2374 * Real vector that is used for interrupting cpu will be coming from 2367 * Real vector that is used for interrupting cpu will be coming from
2375 * the interrupt-remapping table entry. 2368 * the interrupt-remapping table entry.
2376 */ 2369 */
2377static void 2370static int
2378migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) 2371migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2379{ 2372{
2380 struct irq_cfg *cfg; 2373 struct irq_cfg *cfg;
2381 struct irte irte; 2374 struct irte irte;
2382 unsigned int dest; 2375 unsigned int dest;
2383 unsigned int irq; 2376 unsigned int irq;
2377 int ret = -1;
2384 2378
2385 if (!cpumask_intersects(mask, cpu_online_mask)) 2379 if (!cpumask_intersects(mask, cpu_online_mask))
2386 return; 2380 return ret;
2387 2381
2388 irq = desc->irq; 2382 irq = desc->irq;
2389 if (get_irte(irq, &irte)) 2383 if (get_irte(irq, &irte))
2390 return; 2384 return ret;
2391 2385
2392 cfg = desc->chip_data; 2386 cfg = desc->chip_data;
2393 if (assign_irq_vector(irq, cfg, mask)) 2387 if (assign_irq_vector(irq, cfg, mask))
2394 return; 2388 return ret;
2395
2396 set_extra_move_desc(desc, mask);
2397 2389
2398 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); 2390 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
2399 2391
@@ -2409,27 +2401,30 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2409 send_cleanup_vector(cfg); 2401 send_cleanup_vector(cfg);
2410 2402
2411 cpumask_copy(desc->affinity, mask); 2403 cpumask_copy(desc->affinity, mask);
2404
2405 return 0;
2412} 2406}
2413 2407
2414/* 2408/*
2415 * Migrates the IRQ destination in the process context. 2409 * Migrates the IRQ destination in the process context.
2416 */ 2410 */
2417static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, 2411static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2418 const struct cpumask *mask) 2412 const struct cpumask *mask)
2419{ 2413{
2420 migrate_ioapic_irq_desc(desc, mask); 2414 return migrate_ioapic_irq_desc(desc, mask);
2421} 2415}
2422static void set_ir_ioapic_affinity_irq(unsigned int irq, 2416static int set_ir_ioapic_affinity_irq(unsigned int irq,
2423 const struct cpumask *mask) 2417 const struct cpumask *mask)
2424{ 2418{
2425 struct irq_desc *desc = irq_to_desc(irq); 2419 struct irq_desc *desc = irq_to_desc(irq);
2426 2420
2427 set_ir_ioapic_affinity_irq_desc(desc, mask); 2421 return set_ir_ioapic_affinity_irq_desc(desc, mask);
2428} 2422}
2429#else 2423#else
2430static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, 2424static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2431 const struct cpumask *mask) 2425 const struct cpumask *mask)
2432{ 2426{
2427 return 0;
2433} 2428}
2434#endif 2429#endif
2435 2430
@@ -2491,86 +2486,19 @@ static void irq_complete_move(struct irq_desc **descp)
2491 struct irq_cfg *cfg = desc->chip_data; 2486 struct irq_cfg *cfg = desc->chip_data;
2492 unsigned vector, me; 2487 unsigned vector, me;
2493 2488
2494 if (likely(!cfg->move_in_progress)) { 2489 if (likely(!cfg->move_in_progress))
2495#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
2496 if (likely(!cfg->move_desc_pending))
2497 return;
2498
2499 /* domain has not changed, but affinity did */
2500 me = smp_processor_id();
2501 if (cpumask_test_cpu(me, desc->affinity)) {
2502 *descp = desc = move_irq_desc(desc, me);
2503 /* get the new one */
2504 cfg = desc->chip_data;
2505 cfg->move_desc_pending = 0;
2506 }
2507#endif
2508 return; 2490 return;
2509 }
2510 2491
2511 vector = ~get_irq_regs()->orig_ax; 2492 vector = ~get_irq_regs()->orig_ax;
2512 me = smp_processor_id(); 2493 me = smp_processor_id();
2513 2494
2514 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) { 2495 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2515#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
2516 *descp = desc = move_irq_desc(desc, me);
2517 /* get the new one */
2518 cfg = desc->chip_data;
2519#endif
2520 send_cleanup_vector(cfg); 2496 send_cleanup_vector(cfg);
2521 }
2522} 2497}
2523#else 2498#else
2524static inline void irq_complete_move(struct irq_desc **descp) {} 2499static inline void irq_complete_move(struct irq_desc **descp) {}
2525#endif 2500#endif
2526 2501
2527static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2528{
2529 int apic, pin;
2530 struct irq_pin_list *entry;
2531
2532 entry = cfg->irq_2_pin;
2533 for (;;) {
2534
2535 if (!entry)
2536 break;
2537
2538 apic = entry->apic;
2539 pin = entry->pin;
2540 io_apic_eoi(apic, pin);
2541 entry = entry->next;
2542 }
2543}
2544
2545static void
2546eoi_ioapic_irq(struct irq_desc *desc)
2547{
2548 struct irq_cfg *cfg;
2549 unsigned long flags;
2550 unsigned int irq;
2551
2552 irq = desc->irq;
2553 cfg = desc->chip_data;
2554
2555 spin_lock_irqsave(&ioapic_lock, flags);
2556 __eoi_ioapic_irq(irq, cfg);
2557 spin_unlock_irqrestore(&ioapic_lock, flags);
2558}
2559
2560#ifdef CONFIG_X86_X2APIC
2561static void ack_x2apic_level(unsigned int irq)
2562{
2563 struct irq_desc *desc = irq_to_desc(irq);
2564 ack_x2APIC_irq();
2565 eoi_ioapic_irq(desc);
2566}
2567
2568static void ack_x2apic_edge(unsigned int irq)
2569{
2570 ack_x2APIC_irq();
2571}
2572#endif
2573
2574static void ack_apic_edge(unsigned int irq) 2502static void ack_apic_edge(unsigned int irq)
2575{ 2503{
2576 struct irq_desc *desc = irq_to_desc(irq); 2504 struct irq_desc *desc = irq_to_desc(irq);
@@ -2634,9 +2562,6 @@ static void ack_apic_level(unsigned int irq)
2634 */ 2562 */
2635 ack_APIC_irq(); 2563 ack_APIC_irq();
2636 2564
2637 if (irq_remapped(irq))
2638 eoi_ioapic_irq(desc);
2639
2640 /* Now we can move and renable the irq */ 2565 /* Now we can move and renable the irq */
2641 if (unlikely(do_unmask_irq)) { 2566 if (unlikely(do_unmask_irq)) {
2642 /* Only migrate the irq if the ack has been received. 2567 /* Only migrate the irq if the ack has been received.
@@ -2683,22 +2608,50 @@ static void ack_apic_level(unsigned int irq)
2683} 2608}
2684 2609
2685#ifdef CONFIG_INTR_REMAP 2610#ifdef CONFIG_INTR_REMAP
2611static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2612{
2613 int apic, pin;
2614 struct irq_pin_list *entry;
2615
2616 entry = cfg->irq_2_pin;
2617 for (;;) {
2618
2619 if (!entry)
2620 break;
2621
2622 apic = entry->apic;
2623 pin = entry->pin;
2624 io_apic_eoi(apic, pin);
2625 entry = entry->next;
2626 }
2627}
2628
2629static void
2630eoi_ioapic_irq(struct irq_desc *desc)
2631{
2632 struct irq_cfg *cfg;
2633 unsigned long flags;
2634 unsigned int irq;
2635
2636 irq = desc->irq;
2637 cfg = desc->chip_data;
2638
2639 spin_lock_irqsave(&ioapic_lock, flags);
2640 __eoi_ioapic_irq(irq, cfg);
2641 spin_unlock_irqrestore(&ioapic_lock, flags);
2642}
2643
2686static void ir_ack_apic_edge(unsigned int irq) 2644static void ir_ack_apic_edge(unsigned int irq)
2687{ 2645{
2688#ifdef CONFIG_X86_X2APIC 2646 ack_APIC_irq();
2689 if (x2apic_enabled())
2690 return ack_x2apic_edge(irq);
2691#endif
2692 return ack_apic_edge(irq);
2693} 2647}
2694 2648
2695static void ir_ack_apic_level(unsigned int irq) 2649static void ir_ack_apic_level(unsigned int irq)
2696{ 2650{
2697#ifdef CONFIG_X86_X2APIC 2651 struct irq_desc *desc = irq_to_desc(irq);
2698 if (x2apic_enabled()) 2652
2699 return ack_x2apic_level(irq); 2653 ack_APIC_irq();
2700#endif 2654 eoi_ioapic_irq(desc);
2701 return ack_apic_level(irq);
2702} 2655}
2703#endif /* CONFIG_INTR_REMAP */ 2656#endif /* CONFIG_INTR_REMAP */
2704 2657
@@ -2903,7 +2856,7 @@ static inline void __init check_timer(void)
2903{ 2856{
2904 struct irq_desc *desc = irq_to_desc(0); 2857 struct irq_desc *desc = irq_to_desc(0);
2905 struct irq_cfg *cfg = desc->chip_data; 2858 struct irq_cfg *cfg = desc->chip_data;
2906 int cpu = boot_cpu_id; 2859 int node = cpu_to_node(boot_cpu_id);
2907 int apic1, pin1, apic2, pin2; 2860 int apic1, pin1, apic2, pin2;
2908 unsigned long flags; 2861 unsigned long flags;
2909 int no_pin1 = 0; 2862 int no_pin1 = 0;
@@ -2969,7 +2922,7 @@ static inline void __init check_timer(void)
2969 * Ok, does IRQ0 through the IOAPIC work? 2922 * Ok, does IRQ0 through the IOAPIC work?
2970 */ 2923 */
2971 if (no_pin1) { 2924 if (no_pin1) {
2972 add_pin_to_irq_cpu(cfg, cpu, apic1, pin1); 2925 add_pin_to_irq_node(cfg, node, apic1, pin1);
2973 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); 2926 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
2974 } else { 2927 } else {
2975 /* for edge trigger, setup_IO_APIC_irq already 2928 /* for edge trigger, setup_IO_APIC_irq already
@@ -3006,7 +2959,7 @@ static inline void __init check_timer(void)
3006 /* 2959 /*
3007 * legacy devices should be connected to IO APIC #0 2960 * legacy devices should be connected to IO APIC #0
3008 */ 2961 */
3009 replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2); 2962 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
3010 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); 2963 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
3011 enable_8259A_irq(0); 2964 enable_8259A_irq(0);
3012 if (timer_irq_works()) { 2965 if (timer_irq_works()) {
@@ -3218,14 +3171,13 @@ static int nr_irqs_gsi = NR_IRQS_LEGACY;
3218/* 3171/*
3219 * Dynamic irq allocate and deallocation 3172 * Dynamic irq allocate and deallocation
3220 */ 3173 */
3221unsigned int create_irq_nr(unsigned int irq_want) 3174unsigned int create_irq_nr(unsigned int irq_want, int node)
3222{ 3175{
3223 /* Allocate an unused irq */ 3176 /* Allocate an unused irq */
3224 unsigned int irq; 3177 unsigned int irq;
3225 unsigned int new; 3178 unsigned int new;
3226 unsigned long flags; 3179 unsigned long flags;
3227 struct irq_cfg *cfg_new = NULL; 3180 struct irq_cfg *cfg_new = NULL;
3228 int cpu = boot_cpu_id;
3229 struct irq_desc *desc_new = NULL; 3181 struct irq_desc *desc_new = NULL;
3230 3182
3231 irq = 0; 3183 irq = 0;
@@ -3234,7 +3186,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
3234 3186
3235 spin_lock_irqsave(&vector_lock, flags); 3187 spin_lock_irqsave(&vector_lock, flags);
3236 for (new = irq_want; new < nr_irqs; new++) { 3188 for (new = irq_want; new < nr_irqs; new++) {
3237 desc_new = irq_to_desc_alloc_cpu(new, cpu); 3189 desc_new = irq_to_desc_alloc_node(new, node);
3238 if (!desc_new) { 3190 if (!desc_new) {
3239 printk(KERN_INFO "can not get irq_desc for %d\n", new); 3191 printk(KERN_INFO "can not get irq_desc for %d\n", new);
3240 continue; 3192 continue;
@@ -3243,6 +3195,9 @@ unsigned int create_irq_nr(unsigned int irq_want)
3243 3195
3244 if (cfg_new->vector != 0) 3196 if (cfg_new->vector != 0)
3245 continue; 3197 continue;
3198
3199 desc_new = move_irq_desc(desc_new, node);
3200
3246 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) 3201 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
3247 irq = new; 3202 irq = new;
3248 break; 3203 break;
@@ -3260,11 +3215,12 @@ unsigned int create_irq_nr(unsigned int irq_want)
3260 3215
3261int create_irq(void) 3216int create_irq(void)
3262{ 3217{
3218 int node = cpu_to_node(boot_cpu_id);
3263 unsigned int irq_want; 3219 unsigned int irq_want;
3264 int irq; 3220 int irq;
3265 3221
3266 irq_want = nr_irqs_gsi; 3222 irq_want = nr_irqs_gsi;
3267 irq = create_irq_nr(irq_want); 3223 irq = create_irq_nr(irq_want, node);
3268 3224
3269 if (irq == 0) 3225 if (irq == 0)
3270 irq = -1; 3226 irq = -1;
@@ -3366,7 +3322,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3366} 3322}
3367 3323
3368#ifdef CONFIG_SMP 3324#ifdef CONFIG_SMP
3369static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) 3325static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3370{ 3326{
3371 struct irq_desc *desc = irq_to_desc(irq); 3327 struct irq_desc *desc = irq_to_desc(irq);
3372 struct irq_cfg *cfg; 3328 struct irq_cfg *cfg;
@@ -3375,7 +3331,7 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3375 3331
3376 dest = set_desc_affinity(desc, mask); 3332 dest = set_desc_affinity(desc, mask);
3377 if (dest == BAD_APICID) 3333 if (dest == BAD_APICID)
3378 return; 3334 return -1;
3379 3335
3380 cfg = desc->chip_data; 3336 cfg = desc->chip_data;
3381 3337
@@ -3387,13 +3343,15 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3387 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3343 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3388 3344
3389 write_msi_msg_desc(desc, &msg); 3345 write_msi_msg_desc(desc, &msg);
3346
3347 return 0;
3390} 3348}
3391#ifdef CONFIG_INTR_REMAP 3349#ifdef CONFIG_INTR_REMAP
3392/* 3350/*
3393 * Migrate the MSI irq to another cpumask. This migration is 3351 * Migrate the MSI irq to another cpumask. This migration is
3394 * done in the process context using interrupt-remapping hardware. 3352 * done in the process context using interrupt-remapping hardware.
3395 */ 3353 */
3396static void 3354static int
3397ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) 3355ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3398{ 3356{
3399 struct irq_desc *desc = irq_to_desc(irq); 3357 struct irq_desc *desc = irq_to_desc(irq);
@@ -3402,11 +3360,11 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3402 struct irte irte; 3360 struct irte irte;
3403 3361
3404 if (get_irte(irq, &irte)) 3362 if (get_irte(irq, &irte))
3405 return; 3363 return -1;
3406 3364
3407 dest = set_desc_affinity(desc, mask); 3365 dest = set_desc_affinity(desc, mask);
3408 if (dest == BAD_APICID) 3366 if (dest == BAD_APICID)
3409 return; 3367 return -1;
3410 3368
3411 irte.vector = cfg->vector; 3369 irte.vector = cfg->vector;
3412 irte.dest_id = IRTE_DEST(dest); 3370 irte.dest_id = IRTE_DEST(dest);
@@ -3423,6 +3381,8 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3423 */ 3381 */
3424 if (cfg->move_in_progress) 3382 if (cfg->move_in_progress)
3425 send_cleanup_vector(cfg); 3383 send_cleanup_vector(cfg);
3384
3385 return 0;
3426} 3386}
3427 3387
3428#endif 3388#endif
@@ -3518,15 +3478,17 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3518 unsigned int irq_want; 3478 unsigned int irq_want;
3519 struct intel_iommu *iommu = NULL; 3479 struct intel_iommu *iommu = NULL;
3520 int index = 0; 3480 int index = 0;
3481 int node;
3521 3482
3522 /* x86 doesn't support multiple MSI yet */ 3483 /* x86 doesn't support multiple MSI yet */
3523 if (type == PCI_CAP_ID_MSI && nvec > 1) 3484 if (type == PCI_CAP_ID_MSI && nvec > 1)
3524 return 1; 3485 return 1;
3525 3486
3487 node = dev_to_node(&dev->dev);
3526 irq_want = nr_irqs_gsi; 3488 irq_want = nr_irqs_gsi;
3527 sub_handle = 0; 3489 sub_handle = 0;
3528 list_for_each_entry(msidesc, &dev->msi_list, list) { 3490 list_for_each_entry(msidesc, &dev->msi_list, list) {
3529 irq = create_irq_nr(irq_want); 3491 irq = create_irq_nr(irq_want, node);
3530 if (irq == 0) 3492 if (irq == 0)
3531 return -1; 3493 return -1;
3532 irq_want = irq + 1; 3494 irq_want = irq + 1;
@@ -3576,7 +3538,7 @@ void arch_teardown_msi_irq(unsigned int irq)
3576 3538
3577#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) 3539#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
3578#ifdef CONFIG_SMP 3540#ifdef CONFIG_SMP
3579static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3541static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3580{ 3542{
3581 struct irq_desc *desc = irq_to_desc(irq); 3543 struct irq_desc *desc = irq_to_desc(irq);
3582 struct irq_cfg *cfg; 3544 struct irq_cfg *cfg;
@@ -3585,7 +3547,7 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3585 3547
3586 dest = set_desc_affinity(desc, mask); 3548 dest = set_desc_affinity(desc, mask);
3587 if (dest == BAD_APICID) 3549 if (dest == BAD_APICID)
3588 return; 3550 return -1;
3589 3551
3590 cfg = desc->chip_data; 3552 cfg = desc->chip_data;
3591 3553
@@ -3597,6 +3559,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3597 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3559 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3598 3560
3599 dmar_msi_write(irq, &msg); 3561 dmar_msi_write(irq, &msg);
3562
3563 return 0;
3600} 3564}
3601 3565
3602#endif /* CONFIG_SMP */ 3566#endif /* CONFIG_SMP */
@@ -3630,7 +3594,7 @@ int arch_setup_dmar_msi(unsigned int irq)
3630#ifdef CONFIG_HPET_TIMER 3594#ifdef CONFIG_HPET_TIMER
3631 3595
3632#ifdef CONFIG_SMP 3596#ifdef CONFIG_SMP
3633static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3597static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3634{ 3598{
3635 struct irq_desc *desc = irq_to_desc(irq); 3599 struct irq_desc *desc = irq_to_desc(irq);
3636 struct irq_cfg *cfg; 3600 struct irq_cfg *cfg;
@@ -3639,7 +3603,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3639 3603
3640 dest = set_desc_affinity(desc, mask); 3604 dest = set_desc_affinity(desc, mask);
3641 if (dest == BAD_APICID) 3605 if (dest == BAD_APICID)
3642 return; 3606 return -1;
3643 3607
3644 cfg = desc->chip_data; 3608 cfg = desc->chip_data;
3645 3609
@@ -3651,6 +3615,8 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3651 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3615 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3652 3616
3653 hpet_msi_write(irq, &msg); 3617 hpet_msi_write(irq, &msg);
3618
3619 return 0;
3654} 3620}
3655 3621
3656#endif /* CONFIG_SMP */ 3622#endif /* CONFIG_SMP */
@@ -3707,7 +3673,7 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
3707 write_ht_irq_msg(irq, &msg); 3673 write_ht_irq_msg(irq, &msg);
3708} 3674}
3709 3675
3710static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) 3676static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
3711{ 3677{
3712 struct irq_desc *desc = irq_to_desc(irq); 3678 struct irq_desc *desc = irq_to_desc(irq);
3713 struct irq_cfg *cfg; 3679 struct irq_cfg *cfg;
@@ -3715,11 +3681,13 @@ static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
3715 3681
3716 dest = set_desc_affinity(desc, mask); 3682 dest = set_desc_affinity(desc, mask);
3717 if (dest == BAD_APICID) 3683 if (dest == BAD_APICID)
3718 return; 3684 return -1;
3719 3685
3720 cfg = desc->chip_data; 3686 cfg = desc->chip_data;
3721 3687
3722 target_ht_irq(irq, dest, cfg->vector); 3688 target_ht_irq(irq, dest, cfg->vector);
3689
3690 return 0;
3723} 3691}
3724 3692
3725#endif 3693#endif
@@ -3794,6 +3762,8 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3794 unsigned long flags; 3762 unsigned long flags;
3795 int err; 3763 int err;
3796 3764
3765 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3766
3797 cfg = irq_cfg(irq); 3767 cfg = irq_cfg(irq);
3798 3768
3799 err = assign_irq_vector(irq, cfg, eligible_cpu); 3769 err = assign_irq_vector(irq, cfg, eligible_cpu);
@@ -3807,15 +3777,13 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3807 3777
3808 mmr_value = 0; 3778 mmr_value = 0;
3809 entry = (struct uv_IO_APIC_route_entry *)&mmr_value; 3779 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3810 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); 3780 entry->vector = cfg->vector;
3811 3781 entry->delivery_mode = apic->irq_delivery_mode;
3812 entry->vector = cfg->vector; 3782 entry->dest_mode = apic->irq_dest_mode;
3813 entry->delivery_mode = apic->irq_delivery_mode; 3783 entry->polarity = 0;
3814 entry->dest_mode = apic->irq_dest_mode; 3784 entry->trigger = 0;
3815 entry->polarity = 0; 3785 entry->mask = 0;
3816 entry->trigger = 0; 3786 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
3817 entry->mask = 0;
3818 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
3819 3787
3820 mmr_pnode = uv_blade_to_pnode(mmr_blade); 3788 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3821 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); 3789 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -3833,10 +3801,10 @@ void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
3833 struct uv_IO_APIC_route_entry *entry; 3801 struct uv_IO_APIC_route_entry *entry;
3834 int mmr_pnode; 3802 int mmr_pnode;
3835 3803
3804 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3805
3836 mmr_value = 0; 3806 mmr_value = 0;
3837 entry = (struct uv_IO_APIC_route_entry *)&mmr_value; 3807 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3838 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3839
3840 entry->mask = 1; 3808 entry->mask = 1;
3841 3809
3842 mmr_pnode = uv_blade_to_pnode(mmr_blade); 3810 mmr_pnode = uv_blade_to_pnode(mmr_blade);
@@ -3900,6 +3868,71 @@ int __init arch_probe_nr_irqs(void)
3900} 3868}
3901#endif 3869#endif
3902 3870
3871static int __io_apic_set_pci_routing(struct device *dev, int irq,
3872 struct io_apic_irq_attr *irq_attr)
3873{
3874 struct irq_desc *desc;
3875 struct irq_cfg *cfg;
3876 int node;
3877 int ioapic, pin;
3878 int trigger, polarity;
3879
3880 ioapic = irq_attr->ioapic;
3881 if (!IO_APIC_IRQ(irq)) {
3882 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
3883 ioapic);
3884 return -EINVAL;
3885 }
3886
3887 if (dev)
3888 node = dev_to_node(dev);
3889 else
3890 node = cpu_to_node(boot_cpu_id);
3891
3892 desc = irq_to_desc_alloc_node(irq, node);
3893 if (!desc) {
3894 printk(KERN_INFO "can not get irq_desc %d\n", irq);
3895 return 0;
3896 }
3897
3898 pin = irq_attr->ioapic_pin;
3899 trigger = irq_attr->trigger;
3900 polarity = irq_attr->polarity;
3901
3902 /*
3903 * IRQs < 16 are already in the irq_2_pin[] map
3904 */
3905 if (irq >= NR_IRQS_LEGACY) {
3906 cfg = desc->chip_data;
3907 add_pin_to_irq_node(cfg, node, ioapic, pin);
3908 }
3909
3910 setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity);
3911
3912 return 0;
3913}
3914
3915int io_apic_set_pci_routing(struct device *dev, int irq,
3916 struct io_apic_irq_attr *irq_attr)
3917{
3918 int ioapic, pin;
3919 /*
3920 * Avoid pin reprogramming. PRTs typically include entries
3921 * with redundant pin->gsi mappings (but unique PCI devices);
3922 * we only program the IOAPIC on the first.
3923 */
3924 ioapic = irq_attr->ioapic;
3925 pin = irq_attr->ioapic_pin;
3926 if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
3927 pr_debug("Pin %d-%d already programmed\n",
3928 mp_ioapics[ioapic].apicid, pin);
3929 return 0;
3930 }
3931 set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
3932
3933 return __io_apic_set_pci_routing(dev, irq, irq_attr);
3934}
3935
3903/* -------------------------------------------------------------------------- 3936/* --------------------------------------------------------------------------
3904 ACPI-based IOAPIC Configuration 3937 ACPI-based IOAPIC Configuration
3905 -------------------------------------------------------------------------- */ 3938 -------------------------------------------------------------------------- */
@@ -3980,6 +4013,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3980 4013
3981 return apic_id; 4014 return apic_id;
3982} 4015}
4016#endif
3983 4017
3984int __init io_apic_get_version(int ioapic) 4018int __init io_apic_get_version(int ioapic)
3985{ 4019{
@@ -3992,39 +4026,6 @@ int __init io_apic_get_version(int ioapic)
3992 4026
3993 return reg_01.bits.version; 4027 return reg_01.bits.version;
3994} 4028}
3995#endif
3996
3997int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
3998{
3999 struct irq_desc *desc;
4000 struct irq_cfg *cfg;
4001 int cpu = boot_cpu_id;
4002
4003 if (!IO_APIC_IRQ(irq)) {
4004 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
4005 ioapic);
4006 return -EINVAL;
4007 }
4008
4009 desc = irq_to_desc_alloc_cpu(irq, cpu);
4010 if (!desc) {
4011 printk(KERN_INFO "can not get irq_desc %d\n", irq);
4012 return 0;
4013 }
4014
4015 /*
4016 * IRQs < 16 are already in the irq_2_pin[] map
4017 */
4018 if (irq >= NR_IRQS_LEGACY) {
4019 cfg = desc->chip_data;
4020 add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
4021 }
4022
4023 setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
4024
4025 return 0;
4026}
4027
4028 4029
4029int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) 4030int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
4030{ 4031{
@@ -4055,51 +4056,44 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
4055#ifdef CONFIG_SMP 4056#ifdef CONFIG_SMP
4056void __init setup_ioapic_dest(void) 4057void __init setup_ioapic_dest(void)
4057{ 4058{
4058 int pin, ioapic, irq, irq_entry; 4059 int pin, ioapic = 0, irq, irq_entry;
4059 struct irq_desc *desc; 4060 struct irq_desc *desc;
4060 struct irq_cfg *cfg;
4061 const struct cpumask *mask; 4061 const struct cpumask *mask;
4062 4062
4063 if (skip_ioapic_setup == 1) 4063 if (skip_ioapic_setup == 1)
4064 return; 4064 return;
4065 4065
4066 for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { 4066#ifdef CONFIG_ACPI
4067 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { 4067 if (!acpi_disabled && acpi_ioapic) {
4068 irq_entry = find_irq_entry(ioapic, pin, mp_INT); 4068 ioapic = mp_find_ioapic(0);
4069 if (irq_entry == -1) 4069 if (ioapic < 0)
4070 continue; 4070 ioapic = 0;
4071 irq = pin_2_irq(irq_entry, ioapic, pin); 4071 }
4072 4072#endif
4073 /* setup_IO_APIC_irqs could fail to get vector for some device
4074 * when you have too many devices, because at that time only boot
4075 * cpu is online.
4076 */
4077 desc = irq_to_desc(irq);
4078 cfg = desc->chip_data;
4079 if (!cfg->vector) {
4080 setup_IO_APIC_irq(ioapic, pin, irq, desc,
4081 irq_trigger(irq_entry),
4082 irq_polarity(irq_entry));
4083 continue;
4084 4073
4085 } 4074 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
4075 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
4076 if (irq_entry == -1)
4077 continue;
4078 irq = pin_2_irq(irq_entry, ioapic, pin);
4086 4079
4087 /* 4080 desc = irq_to_desc(irq);
4088 * Honour affinities which have been set in early boot
4089 */
4090 if (desc->status &
4091 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
4092 mask = desc->affinity;
4093 else
4094 mask = apic->target_cpus();
4095 4081
4096 if (intr_remapping_enabled) 4082 /*
4097 set_ir_ioapic_affinity_irq_desc(desc, mask); 4083 * Honour affinities which have been set in early boot
4098 else 4084 */
4099 set_ioapic_affinity_irq_desc(desc, mask); 4085 if (desc->status &
4100 } 4086 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
4087 mask = desc->affinity;
4088 else
4089 mask = apic->target_cpus();
4101 4090
4091 if (intr_remapping_enabled)
4092 set_ir_ioapic_affinity_irq_desc(desc, mask);
4093 else
4094 set_ioapic_affinity_irq_desc(desc, mask);
4102 } 4095 }
4096
4103} 4097}
4104#endif 4098#endif
4105 4099
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index ce4fbfa315a1..a691302dc3ff 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -104,7 +104,7 @@ static __init void nmi_cpu_busy(void *data)
104} 104}
105#endif 105#endif
106 106
107static void report_broken_nmi(int cpu, int *prev_nmi_count) 107static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
108{ 108{
109 printk(KERN_CONT "\n"); 109 printk(KERN_CONT "\n");
110 110
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 01eda2ac65e4..440a8bccd91a 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -160,7 +160,6 @@ extern struct apic apic_summit;
160extern struct apic apic_bigsmp; 160extern struct apic apic_bigsmp;
161extern struct apic apic_es7000; 161extern struct apic apic_es7000;
162extern struct apic apic_es7000_cluster; 162extern struct apic apic_es7000_cluster;
163extern struct apic apic_default;
164 163
165struct apic *apic = &apic_default; 164struct apic *apic = &apic_default;
166EXPORT_SYMBOL_GPL(apic); 165EXPORT_SYMBOL_GPL(apic);
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 1783652bb0e5..bc3e880f9b82 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -50,7 +50,7 @@ static struct apic *apic_probe[] __initdata = {
50void __init default_setup_apic_routing(void) 50void __init default_setup_apic_routing(void)
51{ 51{
52#ifdef CONFIG_X86_X2APIC 52#ifdef CONFIG_X86_X2APIC
53 if (x2apic && (apic != &apic_x2apic_phys && 53 if (x2apic_mode && (apic != &apic_x2apic_phys &&
54#ifdef CONFIG_X86_UV 54#ifdef CONFIG_X86_UV
55 apic != &apic_x2apic_uv_x && 55 apic != &apic_x2apic_uv_x &&
56#endif 56#endif
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 9cfe1f415d81..344eee4ac0a4 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -173,13 +173,6 @@ static inline int is_WPEG(struct rio_detail *rio){
173 rio->type == LookOutAWPEG || rio->type == LookOutBWPEG); 173 rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
174} 174}
175 175
176
177/* In clustered mode, the high nibble of APIC ID is a cluster number.
178 * The low nibble is a 4-bit bitmap. */
179#define XAPIC_DEST_CPUS_SHIFT 4
180#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
181#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
182
183#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER) 176#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
184 177
185static const struct cpumask *summit_target_cpus(void) 178static const struct cpumask *summit_target_cpus(void)
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 4a903e2f0d17..8e4cbb255c38 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -10,7 +10,7 @@
10#include <asm/apic.h> 10#include <asm/apic.h>
11#include <asm/ipi.h> 11#include <asm/ipi.h>
12 12
13DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); 13static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
14 14
15static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 15static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
16{ 16{
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 2bda69352976..ef0ae207a7c8 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -105,7 +105,7 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
105 cpumask_set_cpu(cpu, retmask); 105 cpumask_set_cpu(cpu, retmask);
106} 106}
107 107
108static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) 108static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
109{ 109{
110#ifdef CONFIG_SMP 110#ifdef CONFIG_SMP
111 unsigned long val; 111 unsigned long val;
@@ -562,7 +562,7 @@ void __init uv_system_init(void)
562 union uvh_node_id_u node_id; 562 union uvh_node_id_u node_id;
563 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; 563 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
564 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; 564 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
565 int max_pnode = 0; 565 int gnode_extra, max_pnode = 0;
566 unsigned long mmr_base, present, paddr; 566 unsigned long mmr_base, present, paddr;
567 unsigned short pnode_mask; 567 unsigned short pnode_mask;
568 568
@@ -574,6 +574,13 @@ void __init uv_system_init(void)
574 mmr_base = 574 mmr_base =
575 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & 575 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
576 ~UV_MMR_ENABLE; 576 ~UV_MMR_ENABLE;
577 pnode_mask = (1 << n_val) - 1;
578 node_id.v = uv_read_local_mmr(UVH_NODE_ID);
579 gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
580 gnode_upper = ((unsigned long)gnode_extra << m_val);
581 printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n",
582 n_val, m_val, gnode_upper, gnode_extra);
583
577 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); 584 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
578 585
579 for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) 586 for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
@@ -583,15 +590,18 @@ void __init uv_system_init(void)
583 590
584 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); 591 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
585 uv_blade_info = kmalloc(bytes, GFP_KERNEL); 592 uv_blade_info = kmalloc(bytes, GFP_KERNEL);
593 BUG_ON(!uv_blade_info);
586 594
587 get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); 595 get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
588 596
589 bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes(); 597 bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes();
590 uv_node_to_blade = kmalloc(bytes, GFP_KERNEL); 598 uv_node_to_blade = kmalloc(bytes, GFP_KERNEL);
599 BUG_ON(!uv_node_to_blade);
591 memset(uv_node_to_blade, 255, bytes); 600 memset(uv_node_to_blade, 255, bytes);
592 601
593 bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus(); 602 bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus();
594 uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL); 603 uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL);
604 BUG_ON(!uv_cpu_to_blade);
595 memset(uv_cpu_to_blade, 255, bytes); 605 memset(uv_cpu_to_blade, 255, bytes);
596 606
597 blade = 0; 607 blade = 0;
@@ -607,11 +617,6 @@ void __init uv_system_init(void)
607 } 617 }
608 } 618 }
609 619
610 pnode_mask = (1 << n_val) - 1;
611 node_id.v = uv_read_local_mmr(UVH_NODE_ID);
612 gnode_upper = (((unsigned long)node_id.s.node_id) &
613 ~((1 << n_val) - 1)) << m_val;
614
615 uv_bios_init(); 620 uv_bios_init();
616 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, 621 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
617 &sn_coherency_id, &sn_region_size); 622 &sn_coherency_id, &sn_region_size);
@@ -634,6 +639,7 @@ void __init uv_system_init(void)
634 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; 639 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
635 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; 640 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
636 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; 641 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
642 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
637 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; 643 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
638 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; 644 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
639 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; 645 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 5a6aa1c1162f..1a830cbd7015 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -146,4 +146,5 @@ void foo(void)
146 OFFSET(BP_loadflags, boot_params, hdr.loadflags); 146 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
147 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 147 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
148 OFFSET(BP_version, boot_params, hdr.version); 148 OFFSET(BP_version, boot_params, hdr.version);
149 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
149} 150}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index e72f062fb4b5..898ecc47e129 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -125,6 +125,7 @@ int main(void)
125 OFFSET(BP_loadflags, boot_params, hdr.loadflags); 125 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
126 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 126 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
127 OFFSET(BP_version, boot_params, hdr.version); 127 OFFSET(BP_version, boot_params, hdr.version);
128 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
128 129
129 BLANK(); 130 BLANK();
130 DEFINE(PAGE_SIZE_asm, PAGE_SIZE); 131 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 7e4a459daa64..e5b27d8f1b47 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -6,6 +6,7 @@
6#include <asm/processor.h> 6#include <asm/processor.h>
7#include <asm/apic.h> 7#include <asm/apic.h>
8#include <asm/cpu.h> 8#include <asm/cpu.h>
9#include <asm/pci-direct.h>
9 10
10#ifdef CONFIG_X86_64 11#ifdef CONFIG_X86_64
11# include <asm/numa_64.h> 12# include <asm/numa_64.h>
@@ -272,7 +273,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
272#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 273#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
273 int cpu = smp_processor_id(); 274 int cpu = smp_processor_id();
274 int node; 275 int node;
275 unsigned apicid = hard_smp_processor_id(); 276 unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
276 277
277 node = c->phys_proc_id; 278 node = c->phys_proc_id;
278 if (apicid_to_node[apicid] != NUMA_NO_NODE) 279 if (apicid_to_node[apicid] != NUMA_NO_NODE)
@@ -351,6 +352,15 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
351 (c->x86_model == 8 && c->x86_mask >= 8)) 352 (c->x86_model == 8 && c->x86_mask >= 8))
352 set_cpu_cap(c, X86_FEATURE_K6_MTRR); 353 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
353#endif 354#endif
355#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
356 /* check CPU config space for extended APIC ID */
357 if (c->x86 >= 0xf) {
358 unsigned int val;
359 val = read_pci_config(0, 24, 0, 0x68);
360 if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
361 set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
362 }
363#endif
354} 364}
355 365
356static void __cpuinit init_amd(struct cpuinfo_x86 *c) 366static void __cpuinit init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 77848d9fca68..b0517aa2bd3b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -299,7 +299,8 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
299 return NULL; /* Not found */ 299 return NULL; /* Not found */
300} 300}
301 301
302__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; 302__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata;
303__u32 cpu_caps_set[NCAPINTS] __cpuinitdata;
303 304
304void load_percpu_segment(int cpu) 305void load_percpu_segment(int cpu)
305{ 306{
@@ -768,6 +769,12 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
768 if (this_cpu->c_identify) 769 if (this_cpu->c_identify)
769 this_cpu->c_identify(c); 770 this_cpu->c_identify(c);
770 771
772 /* Clear/Set all flags overridden by options, after probe */
773 for (i = 0; i < NCAPINTS; i++) {
774 c->x86_capability[i] &= ~cpu_caps_cleared[i];
775 c->x86_capability[i] |= cpu_caps_set[i];
776 }
777
771#ifdef CONFIG_X86_64 778#ifdef CONFIG_X86_64
772 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); 779 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
773#endif 780#endif
@@ -813,6 +820,16 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
813#endif 820#endif
814 821
815 init_hypervisor(c); 822 init_hypervisor(c);
823
824 /*
825 * Clear/Set all flags overridden by options; this needs to be done
826 * before the following AND of capabilities across all SMP CPUs.
827 */
828 for (i = 0; i < NCAPINTS; i++) {
829 c->x86_capability[i] &= ~cpu_caps_cleared[i];
830 c->x86_capability[i] |= cpu_caps_set[i];
831 }
832
816 /* 833 /*
817 * On SMP, boot_cpu_data holds the common feature set between 834 * On SMP, boot_cpu_data holds the common feature set between
818 * all CPUs; so make sure that we indicate which features are 835 * all CPUs; so make sure that we indicate which features are
@@ -825,10 +842,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
825 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 842 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
826 } 843 }
827 844
828 /* Clear all flags overridden by options */
829 for (i = 0; i < NCAPINTS; i++)
830 c->x86_capability[i] &= ~cleared_cpu_caps[i];
831
832#ifdef CONFIG_X86_MCE 845#ifdef CONFIG_X86_MCE
833 /* Init Machine Check Exception if available. */ 846 /* Init Machine Check Exception if available. */
834 mcheck_init(c); 847 mcheck_init(c);
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 46e29ab96c6a..6b2a52dd0403 100644
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -32,9 +32,7 @@
32 32
33static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); 33static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
34static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); 34static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
35static DEFINE_PER_CPU(unsigned, cpu_modelflag);
36static DEFINE_PER_CPU(int, cpu_priv_count); 35static DEFINE_PER_CPU(int, cpu_priv_count);
37static DEFINE_PER_CPU(unsigned, cpu_model);
38 36
39static DEFINE_MUTEX(cpu_debug_lock); 37static DEFINE_MUTEX(cpu_debug_lock);
40 38
@@ -80,302 +78,102 @@ static struct cpu_file_base cpu_file[] = {
80 { "value", CPU_REG_ALL, 1 }, 78 { "value", CPU_REG_ALL, 1 },
81}; 79};
82 80
83/* Intel Registers Range */ 81/* CPU Registers Range */
84static struct cpu_debug_range cpu_intel_range[] = { 82static struct cpu_debug_range cpu_reg_range[] = {
85 { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL }, 83 { 0x00000000, 0x00000001, CPU_MC, },
86 { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE }, 84 { 0x00000006, 0x00000007, CPU_MONITOR, },
87 { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL }, 85 { 0x00000010, 0x00000010, CPU_TIME, },
88 { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM }, 86 { 0x00000011, 0x00000013, CPU_PMC, },
89 { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE }, 87 { 0x00000017, 0x00000017, CPU_PLATFORM, },
90 { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE }, 88 { 0x0000001B, 0x0000001B, CPU_APIC, },
91 89 { 0x0000002A, 0x0000002B, CPU_POWERON, },
92 { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE }, 90 { 0x0000002C, 0x0000002C, CPU_FREQ, },
93 { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON }, 91 { 0x0000003A, 0x0000003A, CPU_CONTROL, },
94 { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON }, 92 { 0x00000040, 0x00000047, CPU_LBRANCH, },
95 { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE }, 93 { 0x00000060, 0x00000067, CPU_LBRANCH, },
96 94 { 0x00000079, 0x00000079, CPU_BIOS, },
97 { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE }, 95 { 0x00000088, 0x0000008A, CPU_CACHE, },
98 { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT }, 96 { 0x0000008B, 0x0000008B, CPU_BIOS, },
99 { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT }, 97 { 0x0000009B, 0x0000009B, CPU_MONITOR, },
100 { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM }, 98 { 0x000000C1, 0x000000C4, CPU_PMC, },
101 99 { 0x000000CD, 0x000000CD, CPU_FREQ, },
102 { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE }, 100 { 0x000000E7, 0x000000E8, CPU_PERF, },
103 { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 }, 101 { 0x000000FE, 0x000000FE, CPU_MTRR, },
104 { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE }, 102
105 { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON }, 103 { 0x00000116, 0x0000011E, CPU_CACHE, },
106 104 { 0x00000174, 0x00000176, CPU_SYSENTER, },
107 { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT }, 105 { 0x00000179, 0x0000017B, CPU_MC, },
108 { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT }, 106 { 0x00000186, 0x00000189, CPU_PMC, },
109 { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT }, 107 { 0x00000198, 0x00000199, CPU_PERF, },
110 { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE }, 108 { 0x0000019A, 0x0000019A, CPU_TIME, },
111 109 { 0x0000019B, 0x0000019D, CPU_THERM, },
112 { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 }, 110 { 0x000001A0, 0x000001A0, CPU_MISC, },
113 { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 }, 111 { 0x000001C9, 0x000001C9, CPU_LBRANCH, },
114 { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX }, 112 { 0x000001D7, 0x000001D8, CPU_LBRANCH, },
115 { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 }, 113 { 0x000001D9, 0x000001D9, CPU_DEBUG, },
116 { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT }, 114 { 0x000001DA, 0x000001E0, CPU_LBRANCH, },
117 115
118 { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE }, 116 { 0x00000200, 0x0000020F, CPU_MTRR, },
119 { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE }, 117 { 0x00000250, 0x00000250, CPU_MTRR, },
120 { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE }, 118 { 0x00000258, 0x00000259, CPU_MTRR, },
121 { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT }, 119 { 0x00000268, 0x0000026F, CPU_MTRR, },
122 { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE }, 120 { 0x00000277, 0x00000277, CPU_PAT, },
123 { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE }, 121 { 0x000002FF, 0x000002FF, CPU_MTRR, },
124 { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE }, 122
125 { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE }, 123 { 0x00000300, 0x00000311, CPU_PMC, },
126 124 { 0x00000345, 0x00000345, CPU_PMC, },
127 { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT }, 125 { 0x00000360, 0x00000371, CPU_PMC, },
128 { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON }, 126 { 0x0000038D, 0x00000390, CPU_PMC, },
129 { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE }, 127 { 0x000003A0, 0x000003BE, CPU_PMC, },
130 { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON }, 128 { 0x000003C0, 0x000003CD, CPU_PMC, },
131 { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE }, 129 { 0x000003E0, 0x000003E1, CPU_PMC, },
132 { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 }, 130 { 0x000003F0, 0x000003F2, CPU_PMC, },
133 { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE }, 131
134 { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 }, 132 { 0x00000400, 0x00000417, CPU_MC, },
135 133 { 0x00000480, 0x0000048B, CPU_VMX, },
136 { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE }, 134
137 { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE }, 135 { 0x00000600, 0x00000600, CPU_DEBUG, },
138 { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE }, 136 { 0x00000680, 0x0000068F, CPU_LBRANCH, },
139 { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE }, 137 { 0x000006C0, 0x000006CF, CPU_LBRANCH, },
140 { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE }, 138
141 { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE }, 139 { 0x000107CC, 0x000107D3, CPU_PMC, },
142 140
143 { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON }, 141 { 0xC0000080, 0xC0000080, CPU_FEATURES, },
144 { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE }, 142 { 0xC0000081, 0xC0000084, CPU_CALL, },
145 { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON }, 143 { 0xC0000100, 0xC0000102, CPU_BASE, },
146 { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT }, 144 { 0xC0000103, 0xC0000103, CPU_TIME, },
147 { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON }, 145
148 { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT }, 146 { 0xC0010000, 0xC0010007, CPU_PMC, },
149 { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON }, 147 { 0xC0010010, 0xC0010010, CPU_CONF, },
150 { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON }, 148 { 0xC0010015, 0xC0010015, CPU_CONF, },
151 { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON }, 149 { 0xC0010016, 0xC001001A, CPU_MTRR, },
152 { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON }, 150 { 0xC001001D, 0xC001001D, CPU_MTRR, },
153 { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE }, 151 { 0xC001001F, 0xC001001F, CPU_CONF, },
154 { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON }, 152 { 0xC0010030, 0xC0010035, CPU_BIOS, },
155 153 { 0xC0010044, 0xC0010048, CPU_MC, },
156 { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE }, 154 { 0xC0010050, 0xC0010056, CPU_SMM, },
157 { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON }, 155 { 0xC0010058, 0xC0010058, CPU_CONF, },
158 { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE }, 156 { 0xC0010060, 0xC0010060, CPU_CACHE, },
159 { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON }, 157 { 0xC0010061, 0xC0010068, CPU_SMM, },
160 { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE }, 158 { 0xC0010069, 0xC001006B, CPU_SMM, },
161 { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON }, 159 { 0xC0010070, 0xC0010071, CPU_SMM, },
162 { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE }, 160 { 0xC0010111, 0xC0010113, CPU_SMM, },
163 { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON }, 161 { 0xC0010114, 0xC0010118, CPU_SVM, },
164 { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE }, 162 { 0xC0010140, 0xC0010141, CPU_OSVM, },
165 { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE }, 163 { 0xC0011022, 0xC0011023, CPU_CONF, },
166 { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE },
167
168 { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE },
169 { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON },
170 { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON },
171
172 { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP },
173
174 { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON },
175 { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON },
176 { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON },
177 { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON },
178}; 164};
179 165
180/* AMD Registers Range */
181static struct cpu_debug_range cpu_amd_range[] = {
182 { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, },
183 { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, },
184 { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, },
185 { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS },
186 { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS },
187 { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, },
188
189 { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, },
190 { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, },
191 { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, },
192 { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, },
193
194 { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, },
195 { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, },
196 { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, },
197 { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, },
198 { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, },
199 { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, },
200
201 { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, },
202
203 { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, },
204 { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, },
205 { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, },
206 { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, },
207
208 { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, },
209 { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, },
210 { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, },
211 { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, },
212 { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, },
213 { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, },
214 { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, },
215 { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, },
216 { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, },
217 { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, },
218 { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, },
219 { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, },
220 { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, },
221 { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, },
222 { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, },
223 { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, },
224 { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, },
225 { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, },
226};
227
228
229/* Intel */
230static int get_intel_modelflag(unsigned model)
231{
232 int flag;
233
234 switch (model) {
235 case 0x0501:
236 case 0x0502:
237 case 0x0504:
238 flag = CPU_INTEL_PENTIUM;
239 break;
240 case 0x0601:
241 case 0x0603:
242 case 0x0605:
243 case 0x0607:
244 case 0x0608:
245 case 0x060A:
246 case 0x060B:
247 flag = CPU_INTEL_P6;
248 break;
249 case 0x0609:
250 case 0x060D:
251 flag = CPU_INTEL_PENTIUM_M;
252 break;
253 case 0x060E:
254 flag = CPU_INTEL_CORE;
255 break;
256 case 0x060F:
257 case 0x0617:
258 flag = CPU_INTEL_CORE2;
259 break;
260 case 0x061C:
261 flag = CPU_INTEL_ATOM;
262 break;
263 case 0x0F00:
264 case 0x0F01:
265 case 0x0F02:
266 case 0x0F03:
267 case 0x0F04:
268 flag = CPU_INTEL_XEON_P4;
269 break;
270 case 0x0F06:
271 flag = CPU_INTEL_XEON_MP;
272 break;
273 default:
274 flag = CPU_NONE;
275 break;
276 }
277
278 return flag;
279}
280
281/* AMD */
282static int get_amd_modelflag(unsigned model)
283{
284 int flag;
285
286 switch (model >> 8) {
287 case 0x6:
288 flag = CPU_AMD_K6;
289 break;
290 case 0x7:
291 flag = CPU_AMD_K7;
292 break;
293 case 0x8:
294 flag = CPU_AMD_K8;
295 break;
296 case 0xf:
297 flag = CPU_AMD_0F;
298 break;
299 case 0x10:
300 flag = CPU_AMD_10;
301 break;
302 case 0x11:
303 flag = CPU_AMD_11;
304 break;
305 default:
306 flag = CPU_NONE;
307 break;
308 }
309
310 return flag;
311}
312
313static int get_cpu_modelflag(unsigned cpu)
314{
315 int flag;
316
317 flag = per_cpu(cpu_model, cpu);
318
319 switch (flag >> 16) {
320 case X86_VENDOR_INTEL:
321 flag = get_intel_modelflag(flag);
322 break;
323 case X86_VENDOR_AMD:
324 flag = get_amd_modelflag(flag & 0xffff);
325 break;
326 default:
327 flag = CPU_NONE;
328 break;
329 }
330
331 return flag;
332}
333
334static int get_cpu_range_count(unsigned cpu)
335{
336 int index;
337
338 switch (per_cpu(cpu_model, cpu) >> 16) {
339 case X86_VENDOR_INTEL:
340 index = ARRAY_SIZE(cpu_intel_range);
341 break;
342 case X86_VENDOR_AMD:
343 index = ARRAY_SIZE(cpu_amd_range);
344 break;
345 default:
346 index = 0;
347 break;
348 }
349
350 return index;
351}
352
353static int is_typeflag_valid(unsigned cpu, unsigned flag) 166static int is_typeflag_valid(unsigned cpu, unsigned flag)
354{ 167{
355 unsigned vendor, modelflag; 168 int i;
356 int i, index;
357 169
358 /* Standard Registers should be always valid */ 170 /* Standard Registers should be always valid */
359 if (flag >= CPU_TSS) 171 if (flag >= CPU_TSS)
360 return 1; 172 return 1;
361 173
362 modelflag = per_cpu(cpu_modelflag, cpu); 174 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
363 vendor = per_cpu(cpu_model, cpu) >> 16; 175 if (cpu_reg_range[i].flag == flag)
364 index = get_cpu_range_count(cpu); 176 return 1;
365
366 for (i = 0; i < index; i++) {
367 switch (vendor) {
368 case X86_VENDOR_INTEL:
369 if ((cpu_intel_range[i].model & modelflag) &&
370 (cpu_intel_range[i].flag & flag))
371 return 1;
372 break;
373 case X86_VENDOR_AMD:
374 if ((cpu_amd_range[i].model & modelflag) &&
375 (cpu_amd_range[i].flag & flag))
376 return 1;
377 break;
378 }
379 } 177 }
380 178
381 /* Invalid */ 179 /* Invalid */
@@ -385,26 +183,11 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag)
385static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, 183static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
386 int index, unsigned flag) 184 int index, unsigned flag)
387{ 185{
388 unsigned modelflag; 186 if (cpu_reg_range[index].flag == flag) {
389 187 *min = cpu_reg_range[index].min;
390 modelflag = per_cpu(cpu_modelflag, cpu); 188 *max = cpu_reg_range[index].max;
391 *max = 0; 189 } else
392 switch (per_cpu(cpu_model, cpu) >> 16) { 190 *max = 0;
393 case X86_VENDOR_INTEL:
394 if ((cpu_intel_range[index].model & modelflag) &&
395 (cpu_intel_range[index].flag & flag)) {
396 *min = cpu_intel_range[index].min;
397 *max = cpu_intel_range[index].max;
398 }
399 break;
400 case X86_VENDOR_AMD:
401 if ((cpu_amd_range[index].model & modelflag) &&
402 (cpu_amd_range[index].flag & flag)) {
403 *min = cpu_amd_range[index].min;
404 *max = cpu_amd_range[index].max;
405 }
406 break;
407 }
408 191
409 return *max; 192 return *max;
410} 193}
@@ -434,7 +217,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
434 unsigned msr, msr_min, msr_max; 217 unsigned msr, msr_min, msr_max;
435 struct cpu_private *priv; 218 struct cpu_private *priv;
436 u32 low, high; 219 u32 low, high;
437 int i, range; 220 int i;
438 221
439 if (seq) { 222 if (seq) {
440 priv = seq->private; 223 priv = seq->private;
@@ -446,9 +229,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
446 } 229 }
447 } 230 }
448 231
449 range = get_cpu_range_count(cpu); 232 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
450
451 for (i = 0; i < range; i++) {
452 if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag)) 233 if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
453 continue; 234 continue;
454 235
@@ -588,8 +369,20 @@ static void print_apic(void *arg)
588 seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT)); 369 seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT));
589 seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT)); 370 seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT));
590 seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR)); 371 seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR));
591#endif /* CONFIG_X86_LOCAL_APIC */ 372 if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
373 unsigned int i, v, maxeilvt;
374
375 v = apic_read(APIC_EFEAT);
376 maxeilvt = (v >> 16) & 0xff;
377 seq_printf(seq, " EFEAT\t\t: %08x\n", v);
378 seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL));
592 379
380 for (i = 0; i < maxeilvt; i++) {
381 v = apic_read(APIC_EILVTn(i));
382 seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v);
383 }
384 }
385#endif /* CONFIG_X86_LOCAL_APIC */
593 seq_printf(seq, "\n MSR\t:\n"); 386 seq_printf(seq, "\n MSR\t:\n");
594} 387}
595 388
@@ -788,13 +581,11 @@ static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
788{ 581{
789 struct dentry *cpu_dentry = NULL; 582 struct dentry *cpu_dentry = NULL;
790 unsigned reg, reg_min, reg_max; 583 unsigned reg, reg_min, reg_max;
791 int i, range, err = 0; 584 int i, err = 0;
792 char reg_dir[12]; 585 char reg_dir[12];
793 u32 low, high; 586 u32 low, high;
794 587
795 range = get_cpu_range_count(cpu); 588 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
796
797 for (i = 0; i < range; i++) {
798 if (!get_cpu_range(cpu, &reg_min, &reg_max, i, 589 if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
799 cpu_base[type].flag)) 590 cpu_base[type].flag))
800 continue; 591 continue;
@@ -850,10 +641,6 @@ static int cpu_init_cpu(void)
850 cpui = &cpu_data(cpu); 641 cpui = &cpu_data(cpu);
851 if (!cpu_has(cpui, X86_FEATURE_MSR)) 642 if (!cpu_has(cpui, X86_FEATURE_MSR))
852 continue; 643 continue;
853 per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) |
854 (cpui->x86 << 8) |
855 (cpui->x86_model));
856 per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);
857 644
858 sprintf(cpu_dir, "cpu%d", cpu); 645 sprintf(cpu_dir, "cpu%d", cpu);
859 cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); 646 cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index 52c839875478..f138c6c389b9 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -220,11 +220,14 @@ config X86_LONGHAUL
220 If in doubt, say N. 220 If in doubt, say N.
221 221
222config X86_E_POWERSAVER 222config X86_E_POWERSAVER
223 tristate "VIA C7 Enhanced PowerSaver" 223 tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"
224 select CPU_FREQ_TABLE 224 select CPU_FREQ_TABLE
225 depends on X86_32 225 depends on X86_32 && EXPERIMENTAL
226 help 226 help
227 This adds the CPUFreq driver for VIA C7 processors. 227 This adds the CPUFreq driver for VIA C7 processors. However, this driver
228 does not have any safeguards to prevent operating the CPU out of spec
229 and is thus considered dangerous. Please use the regular ACPI cpufreq
230 driver, enabled by CONFIG_X86_ACPI_CPUFREQ.
228 231
229 If in doubt, say N. 232 If in doubt, say N.
230 233
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 54b6de2cd947..ae9b503220ca 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -90,11 +90,7 @@ static int check_est_cpu(unsigned int cpuid)
90{ 90{
91 struct cpuinfo_x86 *cpu = &cpu_data(cpuid); 91 struct cpuinfo_x86 *cpu = &cpu_data(cpuid);
92 92
93 if (cpu->x86_vendor != X86_VENDOR_INTEL || 93 return cpu_has(cpu, X86_FEATURE_EST);
94 !cpu_has(cpu, X86_FEATURE_EST))
95 return 0;
96
97 return 1;
98} 94}
99 95
100static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data) 96static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
@@ -550,7 +546,7 @@ static int __init acpi_cpufreq_early_init(void)
550 return -ENOMEM; 546 return -ENOMEM;
551 } 547 }
552 for_each_possible_cpu(i) { 548 for_each_possible_cpu(i) {
553 if (!alloc_cpumask_var_node( 549 if (!zalloc_cpumask_var_node(
554 &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map, 550 &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,
555 GFP_KERNEL, cpu_to_node(i))) { 551 GFP_KERNEL, cpu_to_node(i))) {
556 552
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index a8363e5be4ef..d47c775eb0ab 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -322,7 +322,7 @@ static int powernow_acpi_init(void)
322 goto err0; 322 goto err0;
323 } 323 }
324 324
325 if (!alloc_cpumask_var(&acpi_processor_perf->shared_cpu_map, 325 if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,
326 GFP_KERNEL)) { 326 GFP_KERNEL)) {
327 retval = -ENOMEM; 327 retval = -ENOMEM;
328 goto err05; 328 goto err05;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 35dc8fbe92bd..cf52215d9eb1 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -887,7 +887,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
887 /* notify BIOS that we exist */ 887 /* notify BIOS that we exist */
888 acpi_processor_notify_smm(THIS_MODULE); 888 acpi_processor_notify_smm(THIS_MODULE);
889 889
890 if (!alloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) { 890 if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {
891 printk(KERN_ERR PFX 891 printk(KERN_ERR PFX
892 "unable to alloc powernow_k8_data cpumask\n"); 892 "unable to alloc powernow_k8_data cpumask\n");
893 ret_val = -ENOMEM; 893 ret_val = -ENOMEM;
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index c9f1fdc02830..55c831ed71ce 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -471,7 +471,7 @@ static int centrino_target (struct cpufreq_policy *policy,
471 471
472 if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL))) 472 if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL)))
473 return -ENOMEM; 473 return -ENOMEM;
474 if (unlikely(!alloc_cpumask_var(&covered_cpus, GFP_KERNEL))) { 474 if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) {
475 free_cpumask_var(saved_mask); 475 free_cpumask_var(saved_mask);
476 return -ENOMEM; 476 return -ENOMEM;
477 } 477 }
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 7437fa133c02..daed39ba2614 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -229,12 +229,12 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
229} 229}
230#endif 230#endif
231 231
232static void __cpuinit srat_detect_node(void) 232static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
233{ 233{
234#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 234#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
235 unsigned node; 235 unsigned node;
236 int cpu = smp_processor_id(); 236 int cpu = smp_processor_id();
237 int apicid = hard_smp_processor_id(); 237 int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
238 238
239 /* Don't do the funky fallback heuristics the AMD version employs 239 /* Don't do the funky fallback heuristics the AMD version employs
240 for now. */ 240 for now. */
@@ -400,7 +400,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
400 } 400 }
401 401
402 /* Work around errata */ 402 /* Work around errata */
403 srat_detect_node(); 403 srat_detect_node(c);
404 404
405 if (cpu_has(c, X86_FEATURE_VMX)) 405 if (cpu_has(c, X86_FEATURE_VMX))
406 detect_vmx_virtcap(c); 406 detect_vmx_virtcap(c);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 483eda96e102..789efe217e1a 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -17,6 +17,7 @@
17 17
18#include <asm/processor.h> 18#include <asm/processor.h>
19#include <asm/smp.h> 19#include <asm/smp.h>
20#include <asm/k8.h>
20 21
21#define LVL_1_INST 1 22#define LVL_1_INST 1
22#define LVL_1_DATA 2 23#define LVL_1_DATA 2
@@ -159,14 +160,6 @@ struct _cpuid4_info_regs {
159 unsigned long can_disable; 160 unsigned long can_disable;
160}; 161};
161 162
162#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS)
163static struct pci_device_id k8_nb_id[] = {
164 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
165 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
166 {}
167};
168#endif
169
170unsigned short num_cache_leaves; 163unsigned short num_cache_leaves;
171 164
172/* AMD doesn't have CPUID4. Emulate it here to report the same 165/* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -207,10 +200,17 @@ union l3_cache {
207}; 200};
208 201
209static const unsigned short __cpuinitconst assocs[] = { 202static const unsigned short __cpuinitconst assocs[] = {
210 [1] = 1, [2] = 2, [4] = 4, [6] = 8, 203 [1] = 1,
211 [8] = 16, [0xa] = 32, [0xb] = 48, 204 [2] = 2,
205 [4] = 4,
206 [6] = 8,
207 [8] = 16,
208 [0xa] = 32,
209 [0xb] = 48,
212 [0xc] = 64, 210 [0xc] = 64,
213 [0xf] = 0xffff // ?? 211 [0xd] = 96,
212 [0xe] = 128,
213 [0xf] = 0xffff /* fully associative - no way to show this currently */
214}; 214};
215 215
216static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 }; 216static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
@@ -271,7 +271,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
271 eax->split.type = types[leaf]; 271 eax->split.type = types[leaf];
272 eax->split.level = levels[leaf]; 272 eax->split.level = levels[leaf];
273 if (leaf == 3) 273 if (leaf == 3)
274 eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1; 274 eax->split.num_threads_sharing =
275 current_cpu_data.x86_max_cores - 1;
275 else 276 else
276 eax->split.num_threads_sharing = 0; 277 eax->split.num_threads_sharing = 0;
277 eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; 278 eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
@@ -291,6 +292,14 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
291{ 292{
292 if (index < 3) 293 if (index < 3)
293 return; 294 return;
295
296 if (boot_cpu_data.x86 == 0x11)
297 return;
298
299 /* see erratum #382 */
300 if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8))
301 return;
302
294 this_leaf->can_disable = 1; 303 this_leaf->can_disable = 1;
295} 304}
296 305
@@ -696,97 +705,75 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
696#define to_object(k) container_of(k, struct _index_kobject, kobj) 705#define to_object(k) container_of(k, struct _index_kobject, kobj)
697#define to_attr(a) container_of(a, struct _cache_attr, attr) 706#define to_attr(a) container_of(a, struct _cache_attr, attr)
698 707
699#ifdef CONFIG_PCI 708static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
700static struct pci_dev *get_k8_northbridge(int node) 709 unsigned int index)
701{
702 struct pci_dev *dev = NULL;
703 int i;
704
705 for (i = 0; i <= node; i++) {
706 do {
707 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
708 if (!dev)
709 break;
710 } while (!pci_match_id(&k8_nb_id[0], dev));
711 if (!dev)
712 break;
713 }
714 return dev;
715}
716#else
717static struct pci_dev *get_k8_northbridge(int node)
718{
719 return NULL;
720}
721#endif
722
723static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
724{ 710{
725 const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); 711 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
726 int node = cpu_to_node(cpumask_first(mask)); 712 int node = cpu_to_node(cpu);
727 struct pci_dev *dev = NULL; 713 struct pci_dev *dev = node_to_k8_nb_misc(node);
728 ssize_t ret = 0; 714 unsigned int reg = 0;
729 int i;
730 715
731 if (!this_leaf->can_disable) 716 if (!this_leaf->can_disable)
732 return sprintf(buf, "Feature not enabled\n");
733
734 dev = get_k8_northbridge(node);
735 if (!dev) {
736 printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
737 return -EINVAL; 717 return -EINVAL;
738 }
739 718
740 for (i = 0; i < 2; i++) { 719 if (!dev)
741 unsigned int reg; 720 return -EINVAL;
742 721
743 pci_read_config_dword(dev, 0x1BC + i * 4, &reg); 722 pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
723 return sprintf(buf, "%x\n", reg);
724}
744 725
745 ret += sprintf(buf, "%sEntry: %d\n", buf, i); 726#define SHOW_CACHE_DISABLE(index) \
746 ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n", 727static ssize_t \
747 buf, 728show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
748 reg & 0x80000000 ? "Disabled" : "Allowed", 729{ \
749 reg & 0x40000000 ? "Disabled" : "Allowed"); 730 return show_cache_disable(this_leaf, buf, index); \
750 ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n",
751 buf, (reg & 0x30000) >> 16, reg & 0xfff);
752 }
753 return ret;
754} 731}
732SHOW_CACHE_DISABLE(0)
733SHOW_CACHE_DISABLE(1)
755 734
756static ssize_t 735static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
757store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, 736 const char *buf, size_t count, unsigned int index)
758 size_t count)
759{ 737{
760 const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); 738 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
761 int node = cpu_to_node(cpumask_first(mask)); 739 int node = cpu_to_node(cpu);
762 struct pci_dev *dev = NULL; 740 struct pci_dev *dev = node_to_k8_nb_misc(node);
763 unsigned int ret, index, val; 741 unsigned long val = 0;
742 unsigned int scrubber = 0;
764 743
765 if (!this_leaf->can_disable) 744 if (!this_leaf->can_disable)
766 return 0;
767
768 if (strlen(buf) > 15)
769 return -EINVAL; 745 return -EINVAL;
770 746
771 ret = sscanf(buf, "%x %x", &index, &val); 747 if (!capable(CAP_SYS_ADMIN))
772 if (ret != 2) 748 return -EPERM;
749
750 if (!dev)
773 return -EINVAL; 751 return -EINVAL;
774 if (index > 1) 752
753 if (strict_strtoul(buf, 10, &val) < 0)
775 return -EINVAL; 754 return -EINVAL;
776 755
777 val |= 0xc0000000; 756 val |= 0xc0000000;
778 dev = get_k8_northbridge(node); 757
779 if (!dev) { 758 pci_read_config_dword(dev, 0x58, &scrubber);
780 printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); 759 scrubber &= ~0x1f000000;
781 return -EINVAL; 760 pci_write_config_dword(dev, 0x58, scrubber);
782 }
783 761
784 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); 762 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
785 wbinvd(); 763 wbinvd();
786 pci_write_config_dword(dev, 0x1BC + index * 4, val); 764 pci_write_config_dword(dev, 0x1BC + index * 4, val);
765 return count;
766}
787 767
788 return 1; 768#define STORE_CACHE_DISABLE(index) \
769static ssize_t \
770store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
771 const char *buf, size_t count) \
772{ \
773 return store_cache_disable(this_leaf, buf, count, index); \
789} 774}
775STORE_CACHE_DISABLE(0)
776STORE_CACHE_DISABLE(1)
790 777
791struct _cache_attr { 778struct _cache_attr {
792 struct attribute attr; 779 struct attribute attr;
@@ -808,7 +795,10 @@ define_one_ro(size);
808define_one_ro(shared_cpu_map); 795define_one_ro(shared_cpu_map);
809define_one_ro(shared_cpu_list); 796define_one_ro(shared_cpu_list);
810 797
811static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable); 798static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
799 show_cache_disable_0, store_cache_disable_0);
800static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
801 show_cache_disable_1, store_cache_disable_1);
812 802
813static struct attribute * default_attrs[] = { 803static struct attribute * default_attrs[] = {
814 &type.attr, 804 &type.attr,
@@ -820,7 +810,8 @@ static struct attribute * default_attrs[] = {
820 &size.attr, 810 &size.attr,
821 &shared_cpu_map.attr, 811 &shared_cpu_map.attr,
822 &shared_cpu_list.attr, 812 &shared_cpu_list.attr,
823 &cache_disable.attr, 813 &cache_disable_0.attr,
814 &cache_disable_1.attr,
824 NULL 815 NULL
825}; 816};
826 817
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 6fb0b359d2a5..09dd1d414fc3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -1163,7 +1163,7 @@ static __init int mce_init_device(void)
1163 if (!mce_available(&boot_cpu_data)) 1163 if (!mce_available(&boot_cpu_data))
1164 return -EIO; 1164 return -EIO;
1165 1165
1166 alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); 1166 zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
1167 1167
1168 err = mce_init_banks(); 1168 err = mce_init_banks();
1169 if (err) 1169 if (err)
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index cef3ee30744b..65a0fceedcd7 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -15,7 +15,6 @@
15#include <asm/hw_irq.h> 15#include <asm/hw_irq.h>
16#include <asm/idle.h> 16#include <asm/idle.h>
17#include <asm/therm_throt.h> 17#include <asm/therm_throt.h>
18#include <asm/apic.h>
19 18
20asmlinkage void smp_thermal_interrupt(void) 19asmlinkage void smp_thermal_interrupt(void)
21{ 20{
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index ce0fe4b5c04f..1d584a18a50d 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -808,7 +808,7 @@ int __init mtrr_cleanup(unsigned address_bits)
808 808
809 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) 809 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
810 return 0; 810 return 0;
811 rdmsr(MTRRdefType_MSR, def, dummy); 811 rdmsr(MSR_MTRRdefType, def, dummy);
812 def &= 0xff; 812 def &= 0xff;
813 if (def != MTRR_TYPE_UNCACHABLE) 813 if (def != MTRR_TYPE_UNCACHABLE)
814 return 0; 814 return 0;
@@ -1003,7 +1003,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1003 */ 1003 */
1004 if (!is_cpu(INTEL) || disable_mtrr_trim) 1004 if (!is_cpu(INTEL) || disable_mtrr_trim)
1005 return 0; 1005 return 0;
1006 rdmsr(MTRRdefType_MSR, def, dummy); 1006 rdmsr(MSR_MTRRdefType, def, dummy);
1007 def &= 0xff; 1007 def &= 0xff;
1008 if (def != MTRR_TYPE_UNCACHABLE) 1008 if (def != MTRR_TYPE_UNCACHABLE)
1009 return 0; 1009 return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index d21d4fb161f7..0543f69f0b27 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -20,9 +20,9 @@ struct fixed_range_block {
20}; 20};
21 21
22static struct fixed_range_block fixed_range_blocks[] = { 22static struct fixed_range_block fixed_range_blocks[] = {
23 { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */ 23 { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */
24 { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */ 24 { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */
25 { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ 25 { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */
26 {} 26 {}
27}; 27};
28 28
@@ -194,12 +194,12 @@ get_fixed_ranges(mtrr_type * frs)
194 194
195 k8_check_syscfg_dram_mod_en(); 195 k8_check_syscfg_dram_mod_en();
196 196
197 rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); 197 rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]);
198 198
199 for (i = 0; i < 2; i++) 199 for (i = 0; i < 2; i++)
200 rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]); 200 rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]);
201 for (i = 0; i < 8; i++) 201 for (i = 0; i < 8; i++)
202 rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); 202 rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]);
203} 203}
204 204
205void mtrr_save_fixed_ranges(void *info) 205void mtrr_save_fixed_ranges(void *info)
@@ -310,7 +310,7 @@ void __init get_mtrr_state(void)
310 310
311 vrs = mtrr_state.var_ranges; 311 vrs = mtrr_state.var_ranges;
312 312
313 rdmsr(MTRRcap_MSR, lo, dummy); 313 rdmsr(MSR_MTRRcap, lo, dummy);
314 mtrr_state.have_fixed = (lo >> 8) & 1; 314 mtrr_state.have_fixed = (lo >> 8) & 1;
315 315
316 for (i = 0; i < num_var_ranges; i++) 316 for (i = 0; i < num_var_ranges; i++)
@@ -318,7 +318,7 @@ void __init get_mtrr_state(void)
318 if (mtrr_state.have_fixed) 318 if (mtrr_state.have_fixed)
319 get_fixed_ranges(mtrr_state.fixed_ranges); 319 get_fixed_ranges(mtrr_state.fixed_ranges);
320 320
321 rdmsr(MTRRdefType_MSR, lo, dummy); 321 rdmsr(MSR_MTRRdefType, lo, dummy);
322 mtrr_state.def_type = (lo & 0xff); 322 mtrr_state.def_type = (lo & 0xff);
323 mtrr_state.enabled = (lo & 0xc00) >> 10; 323 mtrr_state.enabled = (lo & 0xc00) >> 10;
324 324
@@ -583,10 +583,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
583 __flush_tlb(); 583 __flush_tlb();
584 584
585 /* Save MTRR state */ 585 /* Save MTRR state */
586 rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); 586 rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
587 587
588 /* Disable MTRRs, and set the default type to uncached */ 588 /* Disable MTRRs, and set the default type to uncached */
589 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi); 589 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
590} 590}
591 591
592static void post_set(void) __releases(set_atomicity_lock) 592static void post_set(void) __releases(set_atomicity_lock)
@@ -595,7 +595,7 @@ static void post_set(void) __releases(set_atomicity_lock)
595 __flush_tlb(); 595 __flush_tlb();
596 596
597 /* Intel (P6) standard MTRRs */ 597 /* Intel (P6) standard MTRRs */
598 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); 598 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
599 599
600 /* Enable caches */ 600 /* Enable caches */
601 write_cr0(read_cr0() & 0xbfffffff); 601 write_cr0(read_cr0() & 0xbfffffff);
@@ -707,7 +707,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
707static int generic_have_wrcomb(void) 707static int generic_have_wrcomb(void)
708{ 708{
709 unsigned long config, dummy; 709 unsigned long config, dummy;
710 rdmsr(MTRRcap_MSR, config, dummy); 710 rdmsr(MSR_MTRRcap, config, dummy);
711 return (config & (1 << 10)); 711 return (config & (1 << 10));
712} 712}
713 713
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 03cda01f57c7..8fc248b5aeaf 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -104,7 +104,7 @@ static void __init set_num_var_ranges(void)
104 unsigned long config = 0, dummy; 104 unsigned long config = 0, dummy;
105 105
106 if (use_intel()) { 106 if (use_intel()) {
107 rdmsr(MTRRcap_MSR, config, dummy); 107 rdmsr(MSR_MTRRcap, config, dummy);
108 } else if (is_cpu(AMD)) 108 } else if (is_cpu(AMD))
109 config = 2; 109 config = 2;
110 else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) 110 else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 77f67f7b347a..7538b767f206 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -5,21 +5,6 @@
5#include <linux/types.h> 5#include <linux/types.h>
6#include <linux/stddef.h> 6#include <linux/stddef.h>
7 7
8#define MTRRcap_MSR 0x0fe
9#define MTRRdefType_MSR 0x2ff
10
11#define MTRRfix64K_00000_MSR 0x250
12#define MTRRfix16K_80000_MSR 0x258
13#define MTRRfix16K_A0000_MSR 0x259
14#define MTRRfix4K_C0000_MSR 0x268
15#define MTRRfix4K_C8000_MSR 0x269
16#define MTRRfix4K_D0000_MSR 0x26a
17#define MTRRfix4K_D8000_MSR 0x26b
18#define MTRRfix4K_E0000_MSR 0x26c
19#define MTRRfix4K_E8000_MSR 0x26d
20#define MTRRfix4K_F0000_MSR 0x26e
21#define MTRRfix4K_F8000_MSR 0x26f
22
23#define MTRR_CHANGE_MASK_FIXED 0x01 8#define MTRR_CHANGE_MASK_FIXED 0x01
24#define MTRR_CHANGE_MASK_VARIABLE 0x02 9#define MTRR_CHANGE_MASK_VARIABLE 0x02
25#define MTRR_CHANGE_MASK_DEFTYPE 0x04 10#define MTRR_CHANGE_MASK_DEFTYPE 0x04
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
index 7f7e2753685b..1f5fb1588d1f 100644
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ b/arch/x86/kernel/cpu/mtrr/state.c
@@ -35,7 +35,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
35 35
36 if (use_intel()) 36 if (use_intel())
37 /* Save MTRR state */ 37 /* Save MTRR state */
38 rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); 38 rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
39 else 39 else
40 /* Cyrix ARRs - everything else were excluded at the top */ 40 /* Cyrix ARRs - everything else were excluded at the top */
41 ctxt->ccr3 = getCx86(CX86_CCR3); 41 ctxt->ccr3 = getCx86(CX86_CCR3);
@@ -46,7 +46,7 @@ void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
46{ 46{
47 if (use_intel()) 47 if (use_intel())
48 /* Disable MTRRs, and set the default type to uncached */ 48 /* Disable MTRRs, and set the default type to uncached */
49 mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL, 49 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
50 ctxt->deftype_hi); 50 ctxt->deftype_hi);
51 else if (is_cpu(CYRIX)) 51 else if (is_cpu(CYRIX))
52 /* Cyrix ARRs - everything else were excluded at the top */ 52 /* Cyrix ARRs - everything else were excluded at the top */
@@ -64,7 +64,7 @@ void set_mtrr_done(struct set_mtrr_context *ctxt)
64 /* Restore MTRRdefType */ 64 /* Restore MTRRdefType */
65 if (use_intel()) 65 if (use_intel())
66 /* Intel (P6) standard MTRRs */ 66 /* Intel (P6) standard MTRRs */
67 mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); 67 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
68 else 68 else
69 /* Cyrix ARRs - everything else was excluded at the top */ 69 /* Cyrix ARRs - everything else was excluded at the top */
70 setCx86(CX86_CCR3, ctxt->ccr3); 70 setCx86(CX86_CCR3, ctxt->ccr3);
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index da87590b8698..81086c227ab7 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -29,7 +29,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
29 unsigned long *sp, unsigned long bp, char *log_lvl); 29 unsigned long *sp, unsigned long bp, char *log_lvl);
30 30
31extern unsigned int code_bytes; 31extern unsigned int code_bytes;
32extern int kstack_depth_to_print;
33 32
34/* The form of the top of the frame on the stack */ 33/* The form of the top of the frame on the stack */
35struct stack_frame { 34struct stack_frame {
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 006281302925..7271fa33d791 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -617,7 +617,7 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
617 */ 617 */
618__init void e820_setup_gap(void) 618__init void e820_setup_gap(void)
619{ 619{
620 unsigned long gapstart, gapsize, round; 620 unsigned long gapstart, gapsize;
621 int found; 621 int found;
622 622
623 gapstart = 0x10000000; 623 gapstart = 0x10000000;
@@ -635,14 +635,9 @@ __init void e820_setup_gap(void)
635#endif 635#endif
636 636
637 /* 637 /*
638 * See how much we want to round up: start off with 638 * e820_reserve_resources_late protect stolen RAM already
639 * rounding to the next 1MB area.
640 */ 639 */
641 round = 0x100000; 640 pci_mem_start = gapstart;
642 while ((gapsize >> 4) > round)
643 round += round;
644 /* Fun with two's complement */
645 pci_mem_start = (gapstart + round) & -round;
646 641
647 printk(KERN_INFO 642 printk(KERN_INFO
648 "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", 643 "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
@@ -1371,6 +1366,23 @@ void __init e820_reserve_resources(void)
1371 } 1366 }
1372} 1367}
1373 1368
1369/* How much should we pad RAM ending depending on where it is? */
1370static unsigned long ram_alignment(resource_size_t pos)
1371{
1372 unsigned long mb = pos >> 20;
1373
1374 /* To 64kB in the first megabyte */
1375 if (!mb)
1376 return 64*1024;
1377
1378 /* To 1MB in the first 16MB */
1379 if (mb < 16)
1380 return 1024*1024;
1381
1382 /* To 32MB for anything above that */
1383 return 32*1024*1024;
1384}
1385
1374void __init e820_reserve_resources_late(void) 1386void __init e820_reserve_resources_late(void)
1375{ 1387{
1376 int i; 1388 int i;
@@ -1382,6 +1394,24 @@ void __init e820_reserve_resources_late(void)
1382 insert_resource_expand_to_fit(&iomem_resource, res); 1394 insert_resource_expand_to_fit(&iomem_resource, res);
1383 res++; 1395 res++;
1384 } 1396 }
1397
1398 /*
1399 * Try to bump up RAM regions to reasonable boundaries to
1400 * avoid stolen RAM:
1401 */
1402 for (i = 0; i < e820.nr_map; i++) {
1403 struct e820entry *entry = &e820_saved.map[i];
1404 resource_size_t start, end;
1405
1406 if (entry->type != E820_RAM)
1407 continue;
1408 start = entry->addr + entry->size;
1409 end = round_up(start, ram_alignment(start));
1410 if (start == end)
1411 continue;
1412 reserve_region_with_split(&iomem_resource, start,
1413 end - 1, "RAM buffer");
1414 }
1385} 1415}
1386 1416
1387char *__init default_machine_specific_memory_setup(void) 1417char *__init default_machine_specific_memory_setup(void)
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 76b8cd953dee..ebdb85cf2686 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -97,6 +97,7 @@ static void __init nvidia_bugs(int num, int slot, int func)
97} 97}
98 98
99#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) 99#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
100#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
100static u32 __init ati_ixp4x0_rev(int num, int slot, int func) 101static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
101{ 102{
102 u32 d; 103 u32 d;
@@ -114,6 +115,7 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
114 d &= 0xff; 115 d &= 0xff;
115 return d; 116 return d;
116} 117}
118#endif
117 119
118static void __init ati_bugs(int num, int slot, int func) 120static void __init ati_bugs(int num, int slot, int func)
119{ 121{
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 38946c6e8433..bb01ce080b80 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1379,6 +1379,11 @@ END(xen_failsafe_callback)
1379paranoidzeroentry_ist debug do_debug DEBUG_STACK 1379paranoidzeroentry_ist debug do_debug DEBUG_STACK
1380paranoidzeroentry_ist int3 do_int3 DEBUG_STACK 1380paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
1381paranoiderrorentry stack_segment do_stack_segment 1381paranoiderrorentry stack_segment do_stack_segment
1382#ifdef CONFIG_XEN
1383zeroentry xen_debug do_debug
1384zeroentry xen_int3 do_int3
1385errorentry xen_stack_segment do_stack_segment
1386#endif
1382errorentry general_protection do_general_protection 1387errorentry general_protection do_general_protection
1383errorentry page_fault do_page_fault 1388errorentry page_fault do_page_fault
1384#ifdef CONFIG_X86_MCE 1389#ifdef CONFIG_X86_MCE
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 30683883e0cd..dc5ed4bdd88d 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -608,13 +608,6 @@ ignore_int:
608ENTRY(initial_code) 608ENTRY(initial_code)
609 .long i386_start_kernel 609 .long i386_start_kernel
610 610
611.section .text
612/*
613 * Real beginning of normal "text" segment
614 */
615ENTRY(stext)
616ENTRY(_stext)
617
618/* 611/*
619 * BSS section 612 * BSS section
620 */ 613 */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index c3fe010d74c8..9a391bbb8ba8 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -12,6 +12,7 @@
12#include <asm/io_apic.h> 12#include <asm/io_apic.h>
13#include <asm/irq.h> 13#include <asm/irq.h>
14#include <asm/idle.h> 14#include <asm/idle.h>
15#include <asm/hw_irq.h>
15 16
16atomic_t irq_err_count; 17atomic_t irq_err_count;
17 18
@@ -24,9 +25,9 @@ void (*generic_interrupt_extension)(void) = NULL;
24 */ 25 */
25void ack_bad_irq(unsigned int irq) 26void ack_bad_irq(unsigned int irq)
26{ 27{
27 printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); 28 if (printk_ratelimit())
29 pr_err("unexpected IRQ trap at vector %02x\n", irq);
28 30
29#ifdef CONFIG_X86_LOCAL_APIC
30 /* 31 /*
31 * Currently unexpected vectors happen only on SMP and APIC. 32 * Currently unexpected vectors happen only on SMP and APIC.
32 * We _must_ ack these because every local APIC has only N 33 * We _must_ ack these because every local APIC has only N
@@ -36,9 +37,7 @@ void ack_bad_irq(unsigned int irq)
36 * completely. 37 * completely.
37 * But only ack when the APIC is enabled -AK 38 * But only ack when the APIC is enabled -AK
38 */ 39 */
39 if (cpu_has_apic) 40 ack_APIC_irq();
40 ack_APIC_irq();
41#endif
42} 41}
43 42
44#define irq_stats(x) (&per_cpu(irq_stat, x)) 43#define irq_stats(x) (&per_cpu(irq_stat, x))
@@ -178,7 +177,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
178 sum += irq_stats(cpu)->irq_thermal_count; 177 sum += irq_stats(cpu)->irq_thermal_count;
179# ifdef CONFIG_X86_64 178# ifdef CONFIG_X86_64
180 sum += irq_stats(cpu)->irq_threshold_count; 179 sum += irq_stats(cpu)->irq_threshold_count;
181#endif 180# endif
182#endif 181#endif
183 return sum; 182 return sum;
184} 183}
@@ -213,14 +212,11 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
213 irq = __get_cpu_var(vector_irq)[vector]; 212 irq = __get_cpu_var(vector_irq)[vector];
214 213
215 if (!handle_irq(irq, regs)) { 214 if (!handle_irq(irq, regs)) {
216#ifdef CONFIG_X86_64 215 ack_APIC_irq();
217 if (!disable_apic)
218 ack_APIC_irq();
219#endif
220 216
221 if (printk_ratelimit()) 217 if (printk_ratelimit())
222 printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n", 218 pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n",
223 __func__, smp_processor_id(), vector, irq); 219 __func__, smp_processor_id(), vector, irq);
224 } 220 }
225 221
226 irq_exit(); 222 irq_exit();
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit.c
index 368b0a8836f9..2e08b10ad51a 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit.c
@@ -1,20 +1,25 @@
1#include <linux/linkage.h>
1#include <linux/errno.h> 2#include <linux/errno.h>
2#include <linux/signal.h> 3#include <linux/signal.h>
3#include <linux/sched.h> 4#include <linux/sched.h>
4#include <linux/ioport.h> 5#include <linux/ioport.h>
5#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/timex.h>
6#include <linux/slab.h> 8#include <linux/slab.h>
7#include <linux/random.h> 9#include <linux/random.h>
10#include <linux/kprobes.h>
8#include <linux/init.h> 11#include <linux/init.h>
9#include <linux/kernel_stat.h> 12#include <linux/kernel_stat.h>
10#include <linux/sysdev.h> 13#include <linux/sysdev.h>
11#include <linux/bitops.h> 14#include <linux/bitops.h>
15#include <linux/acpi.h>
12#include <linux/io.h> 16#include <linux/io.h>
13#include <linux/delay.h> 17#include <linux/delay.h>
14 18
15#include <asm/atomic.h> 19#include <asm/atomic.h>
16#include <asm/system.h> 20#include <asm/system.h>
17#include <asm/timer.h> 21#include <asm/timer.h>
22#include <asm/hw_irq.h>
18#include <asm/pgtable.h> 23#include <asm/pgtable.h>
19#include <asm/desc.h> 24#include <asm/desc.h>
20#include <asm/apic.h> 25#include <asm/apic.h>
@@ -22,7 +27,23 @@
22#include <asm/i8259.h> 27#include <asm/i8259.h>
23#include <asm/traps.h> 28#include <asm/traps.h>
24 29
30/*
31 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
32 * (these are usually mapped to vectors 0x30-0x3f)
33 */
34
35/*
36 * The IO-APIC gives us many more interrupt sources. Most of these
37 * are unused but an SMP system is supposed to have enough memory ...
38 * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
39 * across the spectrum, so we really want to be prepared to get all
40 * of these. Plus, more powerful systems might have more than 64
41 * IO-APIC registers.
42 *
43 * (these are usually mapped into the 0x30-0xff vector range)
44 */
25 45
46#ifdef CONFIG_X86_32
26/* 47/*
27 * Note that on a 486, we don't want to do a SIGFPE on an irq13 48 * Note that on a 486, we don't want to do a SIGFPE on an irq13
28 * as the irq is unreliable, and exception 16 works correctly 49 * as the irq is unreliable, and exception 16 works correctly
@@ -52,30 +73,7 @@ static struct irqaction fpu_irq = {
52 .handler = math_error_irq, 73 .handler = math_error_irq,
53 .name = "fpu", 74 .name = "fpu",
54}; 75};
55
56void __init init_ISA_irqs(void)
57{
58 int i;
59
60#ifdef CONFIG_X86_LOCAL_APIC
61 init_bsp_APIC();
62#endif 76#endif
63 init_8259A(0);
64
65 /*
66 * 16 old-style INTA-cycle interrupts:
67 */
68 for (i = 0; i < NR_IRQS_LEGACY; i++) {
69 struct irq_desc *desc = irq_to_desc(i);
70
71 desc->status = IRQ_DISABLED;
72 desc->action = NULL;
73 desc->depth = 1;
74
75 set_irq_chip_and_handler_name(i, &i8259A_chip,
76 handle_level_irq, "XT");
77 }
78}
79 77
80/* 78/*
81 * IRQ2 is cascade interrupt to second interrupt controller 79 * IRQ2 is cascade interrupt to second interrupt controller
@@ -118,29 +116,37 @@ int vector_used_by_percpu_irq(unsigned int vector)
118 return 0; 116 return 0;
119} 117}
120 118
121/* Overridden in paravirt.c */ 119static void __init init_ISA_irqs(void)
122void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
123
124void __init native_init_IRQ(void)
125{ 120{
126 int i; 121 int i;
127 122
128 /* Execute any quirks before the call gates are initialised: */ 123#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
129 x86_quirk_pre_intr_init(); 124 init_bsp_APIC();
125#endif
126 init_8259A(0);
130 127
131 /* 128 /*
132 * Cover the whole vector space, no vector can escape 129 * 16 old-style INTA-cycle interrupts:
133 * us. (some of these will be overridden and become
134 * 'special' SMP interrupts)
135 */ 130 */
136 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { 131 for (i = 0; i < NR_IRQS_LEGACY; i++) {
137 /* SYSCALL_VECTOR was reserved in trap_init. */ 132 struct irq_desc *desc = irq_to_desc(i);
138 if (i != SYSCALL_VECTOR) 133
139 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); 134 desc->status = IRQ_DISABLED;
135 desc->action = NULL;
136 desc->depth = 1;
137
138 set_irq_chip_and_handler_name(i, &i8259A_chip,
139 handle_level_irq, "XT");
140 } 140 }
141}
141 142
143/* Overridden in paravirt.c */
144void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
142 145
143#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) 146static void __init smp_intr_init(void)
147{
148#ifdef CONFIG_SMP
149#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
144 /* 150 /*
145 * The reschedule interrupt is a CPU-to-CPU reschedule-helper 151 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
146 * IPI, driven by wakeup. 152 * IPI, driven by wakeup.
@@ -160,16 +166,27 @@ void __init native_init_IRQ(void)
160 /* IPI for generic function call */ 166 /* IPI for generic function call */
161 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 167 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
162 168
163 /* IPI for single call function */ 169 /* IPI for generic single function call */
164 alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, 170 alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
165 call_function_single_interrupt); 171 call_function_single_interrupt);
166 172
167 /* Low priority IPI to cleanup after moving an irq */ 173 /* Low priority IPI to cleanup after moving an irq */
168 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); 174 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
169 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); 175 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
170#endif 176#endif
177#endif /* CONFIG_SMP */
178}
179
180static void __init apic_intr_init(void)
181{
182 smp_intr_init();
183
184#ifdef CONFIG_X86_64
185 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
186 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
187#endif
171 188
172#ifdef CONFIG_X86_LOCAL_APIC 189#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
173 /* self generated IPI for local APIC timer */ 190 /* self generated IPI for local APIC timer */
174 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 191 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
175 192
@@ -179,16 +196,67 @@ void __init native_init_IRQ(void)
179 /* IPI vectors for APIC spurious and error interrupts */ 196 /* IPI vectors for APIC spurious and error interrupts */
180 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 197 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
181 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 198 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
199
200 /* Performance monitoring interrupts: */
201# ifdef CONFIG_PERF_COUNTERS
202 alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
203 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
204# endif
205
182#endif 206#endif
183 207
208#ifdef CONFIG_X86_32
184#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) 209#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
185 /* thermal monitor LVT interrupt */ 210 /* thermal monitor LVT interrupt */
186 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 211 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
187#endif 212#endif
213#endif
214}
215
216/**
217 * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
218 *
219 * Description:
220 * Perform any necessary interrupt initialisation prior to setting up
221 * the "ordinary" interrupt call gates. For legacy reasons, the ISA
222 * interrupts should be initialised here if the machine emulates a PC
223 * in any way.
224 **/
225static void __init x86_quirk_pre_intr_init(void)
226{
227#ifdef CONFIG_X86_32
228 if (x86_quirks->arch_pre_intr_init) {
229 if (x86_quirks->arch_pre_intr_init())
230 return;
231 }
232#endif
233 init_ISA_irqs();
234}
235
236void __init native_init_IRQ(void)
237{
238 int i;
239
240 /* Execute any quirks before the call gates are initialised: */
241 x86_quirk_pre_intr_init();
242
243 apic_intr_init();
244
245 /*
246 * Cover the whole vector space, no vector can escape
247 * us. (some of these will be overridden and become
248 * 'special' SMP interrupts)
249 */
250 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
251 /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
252 if (!test_bit(i, used_vectors))
253 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
254 }
188 255
189 if (!acpi_ioapic) 256 if (!acpi_ioapic)
190 setup_irq(2, &irq2); 257 setup_irq(2, &irq2);
191 258
259#ifdef CONFIG_X86_32
192 /* 260 /*
193 * Call quirks after call gates are initialised (usually add in 261 * Call quirks after call gates are initialised (usually add in
194 * the architecture specific gates): 262 * the architecture specific gates):
@@ -203,4 +271,5 @@ void __init native_init_IRQ(void)
203 setup_irq(FPU_IRQ, &fpu_irq); 271 setup_irq(FPU_IRQ, &fpu_irq);
204 272
205 irq_ctx_init(smp_processor_id()); 273 irq_ctx_init(smp_processor_id());
274#endif
206} 275}
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
deleted file mode 100644
index 8cd10537fd46..000000000000
--- a/arch/x86/kernel/irqinit_64.c
+++ /dev/null
@@ -1,177 +0,0 @@
1#include <linux/linkage.h>
2#include <linux/errno.h>
3#include <linux/signal.h>
4#include <linux/sched.h>
5#include <linux/ioport.h>
6#include <linux/interrupt.h>
7#include <linux/timex.h>
8#include <linux/slab.h>
9#include <linux/random.h>
10#include <linux/init.h>
11#include <linux/kernel_stat.h>
12#include <linux/sysdev.h>
13#include <linux/bitops.h>
14#include <linux/acpi.h>
15#include <linux/io.h>
16#include <linux/delay.h>
17
18#include <asm/atomic.h>
19#include <asm/system.h>
20#include <asm/hw_irq.h>
21#include <asm/pgtable.h>
22#include <asm/desc.h>
23#include <asm/apic.h>
24#include <asm/i8259.h>
25
26/*
27 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
28 * (these are usually mapped to vectors 0x30-0x3f)
29 */
30
31/*
32 * The IO-APIC gives us many more interrupt sources. Most of these
33 * are unused but an SMP system is supposed to have enough memory ...
34 * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
35 * across the spectrum, so we really want to be prepared to get all
36 * of these. Plus, more powerful systems might have more than 64
37 * IO-APIC registers.
38 *
39 * (these are usually mapped into the 0x30-0xff vector range)
40 */
41
42/*
43 * IRQ2 is cascade interrupt to second interrupt controller
44 */
45
46static struct irqaction irq2 = {
47 .handler = no_action,
48 .name = "cascade",
49};
50DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
51 [0 ... IRQ0_VECTOR - 1] = -1,
52 [IRQ0_VECTOR] = 0,
53 [IRQ1_VECTOR] = 1,
54 [IRQ2_VECTOR] = 2,
55 [IRQ3_VECTOR] = 3,
56 [IRQ4_VECTOR] = 4,
57 [IRQ5_VECTOR] = 5,
58 [IRQ6_VECTOR] = 6,
59 [IRQ7_VECTOR] = 7,
60 [IRQ8_VECTOR] = 8,
61 [IRQ9_VECTOR] = 9,
62 [IRQ10_VECTOR] = 10,
63 [IRQ11_VECTOR] = 11,
64 [IRQ12_VECTOR] = 12,
65 [IRQ13_VECTOR] = 13,
66 [IRQ14_VECTOR] = 14,
67 [IRQ15_VECTOR] = 15,
68 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
69};
70
71int vector_used_by_percpu_irq(unsigned int vector)
72{
73 int cpu;
74
75 for_each_online_cpu(cpu) {
76 if (per_cpu(vector_irq, cpu)[vector] != -1)
77 return 1;
78 }
79
80 return 0;
81}
82
83static void __init init_ISA_irqs(void)
84{
85 int i;
86
87 init_bsp_APIC();
88 init_8259A(0);
89
90 for (i = 0; i < NR_IRQS_LEGACY; i++) {
91 struct irq_desc *desc = irq_to_desc(i);
92
93 desc->status = IRQ_DISABLED;
94 desc->action = NULL;
95 desc->depth = 1;
96
97 /*
98 * 16 old-style INTA-cycle interrupts:
99 */
100 set_irq_chip_and_handler_name(i, &i8259A_chip,
101 handle_level_irq, "XT");
102 }
103}
104
105void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
106
107static void __init smp_intr_init(void)
108{
109#ifdef CONFIG_SMP
110 /*
111 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
112 * IPI, driven by wakeup.
113 */
114 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
115
116 /* IPIs for invalidation */
117 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
118 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
119 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
120 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
121 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
122 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
123 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
124 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
125
126 /* IPI for generic function call */
127 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
128
129 /* IPI for generic single function call */
130 alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
131 call_function_single_interrupt);
132
133 /* Low priority IPI to cleanup after moving an irq */
134 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
135 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
136#endif
137}
138
139static void __init apic_intr_init(void)
140{
141 smp_intr_init();
142
143 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
144 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
145
146 /* self generated IPI for local APIC timer */
147 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
148
149 /* generic IPI for platform specific use */
150 alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
151
152 /* IPI vectors for APIC spurious and error interrupts */
153 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
154 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
155}
156
157void __init native_init_IRQ(void)
158{
159 int i;
160
161 init_ISA_irqs();
162 /*
163 * Cover the whole vector space, no vector can escape
164 * us. (some of these will be overridden and become
165 * 'special' SMP interrupts)
166 */
167 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
168 int vector = FIRST_EXTERNAL_VECTOR + i;
169 if (vector != IA32_SYSCALL_VECTOR)
170 set_intr_gate(vector, interrupt[i]);
171 }
172
173 apic_intr_init();
174
175 if (!acpi_ioapic)
176 setup_irq(2, &irq2);
177}
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index b1f4dffb919e..8d82a77a3f3b 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -142,7 +142,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
142 gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); 142 gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
143 gdb_regs32[GDB_CS] = __KERNEL_CS; 143 gdb_regs32[GDB_CS] = __KERNEL_CS;
144 gdb_regs32[GDB_SS] = __KERNEL_DS; 144 gdb_regs32[GDB_SS] = __KERNEL_DS;
145 gdb_regs[GDB_PC] = p->thread.ip; 145 gdb_regs[GDB_PC] = 0;
146 gdb_regs[GDB_R8] = 0; 146 gdb_regs[GDB_R8] = 0;
147 gdb_regs[GDB_R9] = 0; 147 gdb_regs[GDB_R9] = 0;
148 gdb_regs[GDB_R10] = 0; 148 gdb_regs[GDB_R10] = 0;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 33019ddb56b4..6551dedee20c 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -195,7 +195,7 @@ static void kvm_leave_lazy_mmu(void)
195 struct kvm_para_state *state = kvm_para_state(); 195 struct kvm_para_state *state = kvm_para_state();
196 196
197 mmu_queue_flush(state); 197 mmu_queue_flush(state);
198 paravirt_leave_lazy(paravirt_get_lazy_mode()); 198 paravirt_leave_lazy_mmu();
199 state->mode = paravirt_get_lazy_mode(); 199 state->mode = paravirt_get_lazy_mode();
200} 200}
201 201
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 453b5795a5c6..366baa179913 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -13,25 +13,13 @@
13 * Licensed under the terms of the GNU General Public 13 * Licensed under the terms of the GNU General Public
14 * License version 2. See file COPYING for details. 14 * License version 2. See file COPYING for details.
15 */ 15 */
16#include <linux/platform_device.h>
17#include <linux/capability.h>
18#include <linux/miscdevice.h>
19#include <linux/firmware.h> 16#include <linux/firmware.h>
20#include <linux/spinlock.h>
21#include <linux/cpumask.h>
22#include <linux/pci_ids.h> 17#include <linux/pci_ids.h>
23#include <linux/uaccess.h> 18#include <linux/uaccess.h>
24#include <linux/vmalloc.h> 19#include <linux/vmalloc.h>
25#include <linux/kernel.h> 20#include <linux/kernel.h>
26#include <linux/module.h> 21#include <linux/module.h>
27#include <linux/mutex.h>
28#include <linux/sched.h>
29#include <linux/init.h>
30#include <linux/slab.h>
31#include <linux/cpu.h>
32#include <linux/pci.h> 22#include <linux/pci.h>
33#include <linux/fs.h>
34#include <linux/mm.h>
35 23
36#include <asm/microcode.h> 24#include <asm/microcode.h>
37#include <asm/processor.h> 25#include <asm/processor.h>
@@ -79,9 +67,6 @@ struct microcode_amd {
79#define UCODE_CONTAINER_SECTION_HDR 8 67#define UCODE_CONTAINER_SECTION_HDR 8
80#define UCODE_CONTAINER_HEADER_SIZE 12 68#define UCODE_CONTAINER_HEADER_SIZE 12
81 69
82/* serialize access to the physical write */
83static DEFINE_SPINLOCK(microcode_update_lock);
84
85static struct equiv_cpu_entry *equiv_cpu_table; 70static struct equiv_cpu_entry *equiv_cpu_table;
86 71
87static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) 72static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
@@ -144,9 +129,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
144 return 1; 129 return 1;
145} 130}
146 131
147static void apply_microcode_amd(int cpu) 132static int apply_microcode_amd(int cpu)
148{ 133{
149 unsigned long flags;
150 u32 rev, dummy; 134 u32 rev, dummy;
151 int cpu_num = raw_smp_processor_id(); 135 int cpu_num = raw_smp_processor_id();
152 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; 136 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
@@ -156,25 +140,25 @@ static void apply_microcode_amd(int cpu)
156 BUG_ON(cpu_num != cpu); 140 BUG_ON(cpu_num != cpu);
157 141
158 if (mc_amd == NULL) 142 if (mc_amd == NULL)
159 return; 143 return 0;
160 144
161 spin_lock_irqsave(&microcode_update_lock, flags);
162 wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); 145 wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
163 /* get patch id after patching */ 146 /* get patch id after patching */
164 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); 147 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
165 spin_unlock_irqrestore(&microcode_update_lock, flags);
166 148
167 /* check current patch id and patch's id for match */ 149 /* check current patch id and patch's id for match */
168 if (rev != mc_amd->hdr.patch_id) { 150 if (rev != mc_amd->hdr.patch_id) {
169 printk(KERN_ERR "microcode: CPU%d: update failed " 151 printk(KERN_ERR "microcode: CPU%d: update failed "
170 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); 152 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id);
171 return; 153 return -1;
172 } 154 }
173 155
174 printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", 156 printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n",
175 cpu, rev); 157 cpu, rev);
176 158
177 uci->cpu_sig.rev = rev; 159 uci->cpu_sig.rev = rev;
160
161 return 0;
178} 162}
179 163
180static int get_ucode_data(void *to, const u8 *from, size_t n) 164static int get_ucode_data(void *to, const u8 *from, size_t n)
@@ -257,13 +241,12 @@ static int install_equiv_cpu_table(const u8 *buf)
257 241
258static void free_equiv_cpu_table(void) 242static void free_equiv_cpu_table(void)
259{ 243{
260 if (equiv_cpu_table) { 244 vfree(equiv_cpu_table);
261 vfree(equiv_cpu_table); 245 equiv_cpu_table = NULL;
262 equiv_cpu_table = NULL;
263 }
264} 246}
265 247
266static int generic_load_microcode(int cpu, const u8 *data, size_t size) 248static enum ucode_state
249generic_load_microcode(int cpu, const u8 *data, size_t size)
267{ 250{
268 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 251 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
269 const u8 *ucode_ptr = data; 252 const u8 *ucode_ptr = data;
@@ -272,12 +255,13 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
272 int new_rev = uci->cpu_sig.rev; 255 int new_rev = uci->cpu_sig.rev;
273 unsigned int leftover; 256 unsigned int leftover;
274 unsigned long offset; 257 unsigned long offset;
258 enum ucode_state state = UCODE_OK;
275 259
276 offset = install_equiv_cpu_table(ucode_ptr); 260 offset = install_equiv_cpu_table(ucode_ptr);
277 if (!offset) { 261 if (!offset) {
278 printk(KERN_ERR "microcode: failed to create " 262 printk(KERN_ERR "microcode: failed to create "
279 "equivalent cpu table\n"); 263 "equivalent cpu table\n");
280 return -EINVAL; 264 return UCODE_ERROR;
281 } 265 }
282 266
283 ucode_ptr += offset; 267 ucode_ptr += offset;
@@ -293,8 +277,7 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
293 277
294 mc_header = (struct microcode_header_amd *)mc; 278 mc_header = (struct microcode_header_amd *)mc;
295 if (get_matching_microcode(cpu, mc, new_rev)) { 279 if (get_matching_microcode(cpu, mc, new_rev)) {
296 if (new_mc) 280 vfree(new_mc);
297 vfree(new_mc);
298 new_rev = mc_header->patch_id; 281 new_rev = mc_header->patch_id;
299 new_mc = mc; 282 new_mc = mc;
300 } else 283 } else
@@ -306,34 +289,32 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
306 289
307 if (new_mc) { 290 if (new_mc) {
308 if (!leftover) { 291 if (!leftover) {
309 if (uci->mc) 292 vfree(uci->mc);
310 vfree(uci->mc);
311 uci->mc = new_mc; 293 uci->mc = new_mc;
312 pr_debug("microcode: CPU%d found a matching microcode " 294 pr_debug("microcode: CPU%d found a matching microcode "
313 "update with version 0x%x (current=0x%x)\n", 295 "update with version 0x%x (current=0x%x)\n",
314 cpu, new_rev, uci->cpu_sig.rev); 296 cpu, new_rev, uci->cpu_sig.rev);
315 } else 297 } else {
316 vfree(new_mc); 298 vfree(new_mc);
317 } 299 state = UCODE_ERROR;
300 }
301 } else
302 state = UCODE_NFOUND;
318 303
319 free_equiv_cpu_table(); 304 free_equiv_cpu_table();
320 305
321 return (int)leftover; 306 return state;
322} 307}
323 308
324static int request_microcode_fw(int cpu, struct device *device) 309static enum ucode_state request_microcode_fw(int cpu, struct device *device)
325{ 310{
326 const char *fw_name = "amd-ucode/microcode_amd.bin"; 311 const char *fw_name = "amd-ucode/microcode_amd.bin";
327 const struct firmware *firmware; 312 const struct firmware *firmware;
328 int ret; 313 enum ucode_state ret;
329
330 /* We should bind the task to the CPU */
331 BUG_ON(cpu != raw_smp_processor_id());
332 314
333 ret = request_firmware(&firmware, fw_name, device); 315 if (request_firmware(&firmware, fw_name, device)) {
334 if (ret) {
335 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); 316 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
336 return ret; 317 return UCODE_NFOUND;
337 } 318 }
338 319
339 ret = generic_load_microcode(cpu, firmware->data, firmware->size); 320 ret = generic_load_microcode(cpu, firmware->data, firmware->size);
@@ -343,11 +324,12 @@ static int request_microcode_fw(int cpu, struct device *device)
343 return ret; 324 return ret;
344} 325}
345 326
346static int request_microcode_user(int cpu, const void __user *buf, size_t size) 327static enum ucode_state
328request_microcode_user(int cpu, const void __user *buf, size_t size)
347{ 329{
348 printk(KERN_INFO "microcode: AMD microcode update via " 330 printk(KERN_INFO "microcode: AMD microcode update via "
349 "/dev/cpu/microcode not supported\n"); 331 "/dev/cpu/microcode not supported\n");
350 return -1; 332 return UCODE_ERROR;
351} 333}
352 334
353static void microcode_fini_cpu_amd(int cpu) 335static void microcode_fini_cpu_amd(int cpu)
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 98c470c069d1..9c4461501fcb 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -71,27 +71,18 @@
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73#include <linux/platform_device.h> 73#include <linux/platform_device.h>
74#include <linux/capability.h>
75#include <linux/miscdevice.h> 74#include <linux/miscdevice.h>
76#include <linux/firmware.h> 75#include <linux/capability.h>
77#include <linux/smp_lock.h> 76#include <linux/smp_lock.h>
78#include <linux/spinlock.h>
79#include <linux/cpumask.h>
80#include <linux/uaccess.h>
81#include <linux/vmalloc.h>
82#include <linux/kernel.h> 77#include <linux/kernel.h>
83#include <linux/module.h> 78#include <linux/module.h>
84#include <linux/mutex.h> 79#include <linux/mutex.h>
85#include <linux/sched.h>
86#include <linux/init.h>
87#include <linux/slab.h>
88#include <linux/cpu.h> 80#include <linux/cpu.h>
89#include <linux/fs.h> 81#include <linux/fs.h>
90#include <linux/mm.h> 82#include <linux/mm.h>
91 83
92#include <asm/microcode.h> 84#include <asm/microcode.h>
93#include <asm/processor.h> 85#include <asm/processor.h>
94#include <asm/msr.h>
95 86
96MODULE_DESCRIPTION("Microcode Update Driver"); 87MODULE_DESCRIPTION("Microcode Update Driver");
97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); 88MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -101,36 +92,110 @@ MODULE_LICENSE("GPL");
101 92
102static struct microcode_ops *microcode_ops; 93static struct microcode_ops *microcode_ops;
103 94
104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ 95/*
96 * Synchronization.
97 *
98 * All non cpu-hotplug-callback call sites use:
99 *
100 * - microcode_mutex to synchronize with each other;
101 * - get/put_online_cpus() to synchronize with
102 * the cpu-hotplug-callback call sites.
103 *
104 * We guarantee that only a single cpu is being
105 * updated at any particular moment of time.
106 */
105static DEFINE_MUTEX(microcode_mutex); 107static DEFINE_MUTEX(microcode_mutex);
106 108
107struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; 109struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
108EXPORT_SYMBOL_GPL(ucode_cpu_info); 110EXPORT_SYMBOL_GPL(ucode_cpu_info);
109 111
112/*
113 * Operations that are run on a target cpu:
114 */
115
116struct cpu_info_ctx {
117 struct cpu_signature *cpu_sig;
118 int err;
119};
120
121static void collect_cpu_info_local(void *arg)
122{
123 struct cpu_info_ctx *ctx = arg;
124
125 ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(),
126 ctx->cpu_sig);
127}
128
129static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig)
130{
131 struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 };
132 int ret;
133
134 ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1);
135 if (!ret)
136 ret = ctx.err;
137
138 return ret;
139}
140
141static int collect_cpu_info(int cpu)
142{
143 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
144 int ret;
145
146 memset(uci, 0, sizeof(*uci));
147
148 ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig);
149 if (!ret)
150 uci->valid = 1;
151
152 return ret;
153}
154
155struct apply_microcode_ctx {
156 int err;
157};
158
159static void apply_microcode_local(void *arg)
160{
161 struct apply_microcode_ctx *ctx = arg;
162
163 ctx->err = microcode_ops->apply_microcode(smp_processor_id());
164}
165
166static int apply_microcode_on_target(int cpu)
167{
168 struct apply_microcode_ctx ctx = { .err = 0 };
169 int ret;
170
171 ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1);
172 if (!ret)
173 ret = ctx.err;
174
175 return ret;
176}
177
110#ifdef CONFIG_MICROCODE_OLD_INTERFACE 178#ifdef CONFIG_MICROCODE_OLD_INTERFACE
111static int do_microcode_update(const void __user *buf, size_t size) 179static int do_microcode_update(const void __user *buf, size_t size)
112{ 180{
113 cpumask_t old;
114 int error = 0; 181 int error = 0;
115 int cpu; 182 int cpu;
116 183
117 old = current->cpus_allowed;
118
119 for_each_online_cpu(cpu) { 184 for_each_online_cpu(cpu) {
120 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 185 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
186 enum ucode_state ustate;
121 187
122 if (!uci->valid) 188 if (!uci->valid)
123 continue; 189 continue;
124 190
125 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 191 ustate = microcode_ops->request_microcode_user(cpu, buf, size);
126 error = microcode_ops->request_microcode_user(cpu, buf, size); 192 if (ustate == UCODE_ERROR) {
127 if (error < 0) 193 error = -1;
128 goto out; 194 break;
129 if (!error) 195 } else if (ustate == UCODE_OK)
130 microcode_ops->apply_microcode(cpu); 196 apply_microcode_on_target(cpu);
131 } 197 }
132out: 198
133 set_cpus_allowed_ptr(current, &old);
134 return error; 199 return error;
135} 200}
136 201
@@ -143,19 +208,17 @@ static int microcode_open(struct inode *unused1, struct file *unused2)
143static ssize_t microcode_write(struct file *file, const char __user *buf, 208static ssize_t microcode_write(struct file *file, const char __user *buf,
144 size_t len, loff_t *ppos) 209 size_t len, loff_t *ppos)
145{ 210{
146 ssize_t ret; 211 ssize_t ret = -EINVAL;
147 212
148 if ((len >> PAGE_SHIFT) > num_physpages) { 213 if ((len >> PAGE_SHIFT) > num_physpages) {
149 printk(KERN_ERR "microcode: too much data (max %ld pages)\n", 214 pr_err("microcode: too much data (max %ld pages)\n", num_physpages);
150 num_physpages); 215 return ret;
151 return -EINVAL;
152 } 216 }
153 217
154 get_online_cpus(); 218 get_online_cpus();
155 mutex_lock(&microcode_mutex); 219 mutex_lock(&microcode_mutex);
156 220
157 ret = do_microcode_update(buf, len); 221 if (do_microcode_update(buf, len) == 0)
158 if (!ret)
159 ret = (ssize_t)len; 222 ret = (ssize_t)len;
160 223
161 mutex_unlock(&microcode_mutex); 224 mutex_unlock(&microcode_mutex);
@@ -165,15 +228,15 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
165} 228}
166 229
167static const struct file_operations microcode_fops = { 230static const struct file_operations microcode_fops = {
168 .owner = THIS_MODULE, 231 .owner = THIS_MODULE,
169 .write = microcode_write, 232 .write = microcode_write,
170 .open = microcode_open, 233 .open = microcode_open,
171}; 234};
172 235
173static struct miscdevice microcode_dev = { 236static struct miscdevice microcode_dev = {
174 .minor = MICROCODE_MINOR, 237 .minor = MICROCODE_MINOR,
175 .name = "microcode", 238 .name = "microcode",
176 .fops = &microcode_fops, 239 .fops = &microcode_fops,
177}; 240};
178 241
179static int __init microcode_dev_init(void) 242static int __init microcode_dev_init(void)
@@ -182,9 +245,7 @@ static int __init microcode_dev_init(void)
182 245
183 error = misc_register(&microcode_dev); 246 error = misc_register(&microcode_dev);
184 if (error) { 247 if (error) {
185 printk(KERN_ERR 248 pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR);
186 "microcode: can't misc_register on minor=%d\n",
187 MICROCODE_MINOR);
188 return error; 249 return error;
189 } 250 }
190 251
@@ -205,42 +266,51 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
205/* fake device for request_firmware */ 266/* fake device for request_firmware */
206static struct platform_device *microcode_pdev; 267static struct platform_device *microcode_pdev;
207 268
208static long reload_for_cpu(void *unused) 269static int reload_for_cpu(int cpu)
209{ 270{
210 struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); 271 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
211 int err = 0; 272 int err = 0;
212 273
213 mutex_lock(&microcode_mutex); 274 mutex_lock(&microcode_mutex);
214 if (uci->valid) { 275 if (uci->valid) {
215 err = microcode_ops->request_microcode_fw(smp_processor_id(), 276 enum ucode_state ustate;
216 &microcode_pdev->dev); 277
217 if (!err) 278 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
218 microcode_ops->apply_microcode(smp_processor_id()); 279 if (ustate == UCODE_OK)
280 apply_microcode_on_target(cpu);
281 else
282 if (ustate == UCODE_ERROR)
283 err = -EINVAL;
219 } 284 }
220 mutex_unlock(&microcode_mutex); 285 mutex_unlock(&microcode_mutex);
286
221 return err; 287 return err;
222} 288}
223 289
224static ssize_t reload_store(struct sys_device *dev, 290static ssize_t reload_store(struct sys_device *dev,
225 struct sysdev_attribute *attr, 291 struct sysdev_attribute *attr,
226 const char *buf, size_t sz) 292 const char *buf, size_t size)
227{ 293{
228 char *end; 294 unsigned long val;
229 unsigned long val = simple_strtoul(buf, &end, 0);
230 int err = 0;
231 int cpu = dev->id; 295 int cpu = dev->id;
296 int ret = 0;
297 char *end;
232 298
299 val = simple_strtoul(buf, &end, 0);
233 if (end == buf) 300 if (end == buf)
234 return -EINVAL; 301 return -EINVAL;
302
235 if (val == 1) { 303 if (val == 1) {
236 get_online_cpus(); 304 get_online_cpus();
237 if (cpu_online(cpu)) 305 if (cpu_online(cpu))
238 err = work_on_cpu(cpu, reload_for_cpu, NULL); 306 ret = reload_for_cpu(cpu);
239 put_online_cpus(); 307 put_online_cpus();
240 } 308 }
241 if (err) 309
242 return err; 310 if (!ret)
243 return sz; 311 ret = size;
312
313 return ret;
244} 314}
245 315
246static ssize_t version_show(struct sys_device *dev, 316static ssize_t version_show(struct sys_device *dev,
@@ -271,11 +341,11 @@ static struct attribute *mc_default_attrs[] = {
271}; 341};
272 342
273static struct attribute_group mc_attr_group = { 343static struct attribute_group mc_attr_group = {
274 .attrs = mc_default_attrs, 344 .attrs = mc_default_attrs,
275 .name = "microcode", 345 .name = "microcode",
276}; 346};
277 347
278static void __microcode_fini_cpu(int cpu) 348static void microcode_fini_cpu(int cpu)
279{ 349{
280 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 350 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
281 351
@@ -283,103 +353,68 @@ static void __microcode_fini_cpu(int cpu)
283 uci->valid = 0; 353 uci->valid = 0;
284} 354}
285 355
286static void microcode_fini_cpu(int cpu) 356static enum ucode_state microcode_resume_cpu(int cpu)
287{
288 mutex_lock(&microcode_mutex);
289 __microcode_fini_cpu(cpu);
290 mutex_unlock(&microcode_mutex);
291}
292
293static void collect_cpu_info(int cpu)
294{ 357{
295 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 358 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
296 359
297 memset(uci, 0, sizeof(*uci)); 360 if (!uci->mc)
298 if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig)) 361 return UCODE_NFOUND;
299 uci->valid = 1; 362
363 pr_debug("microcode: CPU%d updated upon resume\n", cpu);
364 apply_microcode_on_target(cpu);
365
366 return UCODE_OK;
300} 367}
301 368
302static int microcode_resume_cpu(int cpu) 369static enum ucode_state microcode_init_cpu(int cpu)
303{ 370{
304 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 371 enum ucode_state ustate;
305 struct cpu_signature nsig;
306 372
307 pr_debug("microcode: CPU%d resumed\n", cpu); 373 if (collect_cpu_info(cpu))
374 return UCODE_ERROR;
308 375
309 if (!uci->mc) 376 /* --dimm. Trigger a delayed update? */
310 return 1; 377 if (system_state != SYSTEM_RUNNING)
378 return UCODE_NFOUND;
311 379
312 /* 380 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
313 * Let's verify that the 'cached' ucode does belong
314 * to this cpu (a bit of paranoia):
315 */
316 if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
317 __microcode_fini_cpu(cpu);
318 printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n",
319 cpu);
320 return -1;
321 }
322 381
323 if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) { 382 if (ustate == UCODE_OK) {
324 __microcode_fini_cpu(cpu); 383 pr_debug("microcode: CPU%d updated upon init\n", cpu);
325 printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n", 384 apply_microcode_on_target(cpu);
326 cpu);
327 /* Should we look for a new ucode here? */
328 return 1;
329 } 385 }
330 386
331 return 0; 387 return ustate;
332} 388}
333 389
334static long microcode_update_cpu(void *unused) 390static enum ucode_state microcode_update_cpu(int cpu)
335{ 391{
336 struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); 392 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
337 int err = 0; 393 enum ucode_state ustate;
338 394
339 /* 395 if (uci->valid)
340 * Check if the system resume is in progress (uci->valid != NULL), 396 ustate = microcode_resume_cpu(cpu);
341 * otherwise just request a firmware: 397 else
342 */ 398 ustate = microcode_init_cpu(cpu);
343 if (uci->valid) {
344 err = microcode_resume_cpu(smp_processor_id());
345 } else {
346 collect_cpu_info(smp_processor_id());
347 if (uci->valid && system_state == SYSTEM_RUNNING)
348 err = microcode_ops->request_microcode_fw(
349 smp_processor_id(),
350 &microcode_pdev->dev);
351 }
352 if (!err)
353 microcode_ops->apply_microcode(smp_processor_id());
354 return err;
355}
356 399
357static int microcode_init_cpu(int cpu) 400 return ustate;
358{
359 int err;
360 mutex_lock(&microcode_mutex);
361 err = work_on_cpu(cpu, microcode_update_cpu, NULL);
362 mutex_unlock(&microcode_mutex);
363
364 return err;
365} 401}
366 402
367static int mc_sysdev_add(struct sys_device *sys_dev) 403static int mc_sysdev_add(struct sys_device *sys_dev)
368{ 404{
369 int err, cpu = sys_dev->id; 405 int err, cpu = sys_dev->id;
370 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
371 406
372 if (!cpu_online(cpu)) 407 if (!cpu_online(cpu))
373 return 0; 408 return 0;
374 409
375 pr_debug("microcode: CPU%d added\n", cpu); 410 pr_debug("microcode: CPU%d added\n", cpu);
376 memset(uci, 0, sizeof(*uci));
377 411
378 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); 412 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
379 if (err) 413 if (err)
380 return err; 414 return err;
381 415
382 err = microcode_init_cpu(cpu); 416 if (microcode_init_cpu(cpu) == UCODE_ERROR)
417 err = -EINVAL;
383 418
384 return err; 419 return err;
385} 420}
@@ -400,19 +435,30 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
400static int mc_sysdev_resume(struct sys_device *dev) 435static int mc_sysdev_resume(struct sys_device *dev)
401{ 436{
402 int cpu = dev->id; 437 int cpu = dev->id;
438 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
403 439
404 if (!cpu_online(cpu)) 440 if (!cpu_online(cpu))
405 return 0; 441 return 0;
406 442
407 /* only CPU 0 will apply ucode here */ 443 /*
408 microcode_update_cpu(NULL); 444 * All non-bootup cpus are still disabled,
445 * so only CPU 0 will apply ucode here.
446 *
447 * Moreover, there can be no concurrent
448 * updates from any other places at this point.
449 */
450 WARN_ON(cpu != 0);
451
452 if (uci->valid && uci->mc)
453 microcode_ops->apply_microcode(cpu);
454
409 return 0; 455 return 0;
410} 456}
411 457
412static struct sysdev_driver mc_sysdev_driver = { 458static struct sysdev_driver mc_sysdev_driver = {
413 .add = mc_sysdev_add, 459 .add = mc_sysdev_add,
414 .remove = mc_sysdev_remove, 460 .remove = mc_sysdev_remove,
415 .resume = mc_sysdev_resume, 461 .resume = mc_sysdev_resume,
416}; 462};
417 463
418static __cpuinit int 464static __cpuinit int
@@ -425,15 +471,12 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
425 switch (action) { 471 switch (action) {
426 case CPU_ONLINE: 472 case CPU_ONLINE:
427 case CPU_ONLINE_FROZEN: 473 case CPU_ONLINE_FROZEN:
428 if (microcode_init_cpu(cpu)) 474 microcode_update_cpu(cpu);
429 printk(KERN_ERR "microcode: failed to init CPU%d\n",
430 cpu);
431 case CPU_DOWN_FAILED: 475 case CPU_DOWN_FAILED:
432 case CPU_DOWN_FAILED_FROZEN: 476 case CPU_DOWN_FAILED_FROZEN:
433 pr_debug("microcode: CPU%d added\n", cpu); 477 pr_debug("microcode: CPU%d added\n", cpu);
434 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) 478 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
435 printk(KERN_ERR "microcode: Failed to create the sysfs " 479 pr_err("microcode: Failed to create group for CPU%d\n", cpu);
436 "group for CPU%d\n", cpu);
437 break; 480 break;
438 case CPU_DOWN_PREPARE: 481 case CPU_DOWN_PREPARE:
439 case CPU_DOWN_PREPARE_FROZEN: 482 case CPU_DOWN_PREPARE_FROZEN:
@@ -465,13 +508,10 @@ static int __init microcode_init(void)
465 microcode_ops = init_amd_microcode(); 508 microcode_ops = init_amd_microcode();
466 509
467 if (!microcode_ops) { 510 if (!microcode_ops) {
468 printk(KERN_ERR "microcode: no support for this CPU vendor\n"); 511 pr_err("microcode: no support for this CPU vendor\n");
469 return -ENODEV; 512 return -ENODEV;
470 } 513 }
471 514
472 error = microcode_dev_init();
473 if (error)
474 return error;
475 microcode_pdev = platform_device_register_simple("microcode", -1, 515 microcode_pdev = platform_device_register_simple("microcode", -1,
476 NULL, 0); 516 NULL, 0);
477 if (IS_ERR(microcode_pdev)) { 517 if (IS_ERR(microcode_pdev)) {
@@ -480,23 +520,31 @@ static int __init microcode_init(void)
480 } 520 }
481 521
482 get_online_cpus(); 522 get_online_cpus();
523 mutex_lock(&microcode_mutex);
524
483 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); 525 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
526
527 mutex_unlock(&microcode_mutex);
484 put_online_cpus(); 528 put_online_cpus();
529
485 if (error) { 530 if (error) {
486 microcode_dev_exit();
487 platform_device_unregister(microcode_pdev); 531 platform_device_unregister(microcode_pdev);
488 return error; 532 return error;
489 } 533 }
490 534
535 error = microcode_dev_init();
536 if (error)
537 return error;
538
491 register_hotcpu_notifier(&mc_cpu_notifier); 539 register_hotcpu_notifier(&mc_cpu_notifier);
492 540
493 printk(KERN_INFO 541 pr_info("Microcode Update Driver: v" MICROCODE_VERSION
494 "Microcode Update Driver: v" MICROCODE_VERSION
495 " <tigran@aivazian.fsnet.co.uk>," 542 " <tigran@aivazian.fsnet.co.uk>,"
496 " Peter Oruba\n"); 543 " Peter Oruba\n");
497 544
498 return 0; 545 return 0;
499} 546}
547module_init(microcode_init);
500 548
501static void __exit microcode_exit(void) 549static void __exit microcode_exit(void)
502{ 550{
@@ -505,16 +553,17 @@ static void __exit microcode_exit(void)
505 unregister_hotcpu_notifier(&mc_cpu_notifier); 553 unregister_hotcpu_notifier(&mc_cpu_notifier);
506 554
507 get_online_cpus(); 555 get_online_cpus();
556 mutex_lock(&microcode_mutex);
557
508 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); 558 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
559
560 mutex_unlock(&microcode_mutex);
509 put_online_cpus(); 561 put_online_cpus();
510 562
511 platform_device_unregister(microcode_pdev); 563 platform_device_unregister(microcode_pdev);
512 564
513 microcode_ops = NULL; 565 microcode_ops = NULL;
514 566
515 printk(KERN_INFO 567 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
516 "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
517} 568}
518
519module_init(microcode_init);
520module_exit(microcode_exit); 569module_exit(microcode_exit);
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 149b9ec7c1ab..0d334ddd0a96 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -70,24 +70,11 @@
70 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73#include <linux/platform_device.h>
74#include <linux/capability.h>
75#include <linux/miscdevice.h>
76#include <linux/firmware.h> 73#include <linux/firmware.h>
77#include <linux/smp_lock.h>
78#include <linux/spinlock.h>
79#include <linux/cpumask.h>
80#include <linux/uaccess.h> 74#include <linux/uaccess.h>
81#include <linux/vmalloc.h>
82#include <linux/kernel.h> 75#include <linux/kernel.h>
83#include <linux/module.h> 76#include <linux/module.h>
84#include <linux/mutex.h> 77#include <linux/vmalloc.h>
85#include <linux/sched.h>
86#include <linux/init.h>
87#include <linux/slab.h>
88#include <linux/cpu.h>
89#include <linux/fs.h>
90#include <linux/mm.h>
91 78
92#include <asm/microcode.h> 79#include <asm/microcode.h>
93#include <asm/processor.h> 80#include <asm/processor.h>
@@ -150,13 +137,9 @@ struct extended_sigtable {
150 137
151#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) 138#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
152 139
153/* serialize access to the physical write to MSR 0x79 */
154static DEFINE_SPINLOCK(microcode_update_lock);
155
156static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) 140static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
157{ 141{
158 struct cpuinfo_x86 *c = &cpu_data(cpu_num); 142 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
159 unsigned long flags;
160 unsigned int val[2]; 143 unsigned int val[2];
161 144
162 memset(csig, 0, sizeof(*csig)); 145 memset(csig, 0, sizeof(*csig));
@@ -176,18 +159,14 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
176 csig->pf = 1 << ((val[1] >> 18) & 7); 159 csig->pf = 1 << ((val[1] >> 18) & 7);
177 } 160 }
178 161
179 /* serialize access to the physical write to MSR 0x79 */
180 spin_lock_irqsave(&microcode_update_lock, flags);
181
182 wrmsr(MSR_IA32_UCODE_REV, 0, 0); 162 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
183 /* see notes above for revision 1.07. Apparent chip bug */ 163 /* see notes above for revision 1.07. Apparent chip bug */
184 sync_core(); 164 sync_core();
185 /* get the current revision from MSR 0x8B */ 165 /* get the current revision from MSR 0x8B */
186 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); 166 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
187 spin_unlock_irqrestore(&microcode_update_lock, flags);
188 167
189 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", 168 printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
190 csig->sig, csig->pf, csig->rev); 169 cpu_num, csig->sig, csig->pf, csig->rev);
191 170
192 return 0; 171 return 0;
193} 172}
@@ -318,11 +297,10 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
318 return 0; 297 return 0;
319} 298}
320 299
321static void apply_microcode(int cpu) 300static int apply_microcode(int cpu)
322{ 301{
323 struct microcode_intel *mc_intel; 302 struct microcode_intel *mc_intel;
324 struct ucode_cpu_info *uci; 303 struct ucode_cpu_info *uci;
325 unsigned long flags;
326 unsigned int val[2]; 304 unsigned int val[2];
327 int cpu_num; 305 int cpu_num;
328 306
@@ -334,10 +312,7 @@ static void apply_microcode(int cpu)
334 BUG_ON(cpu_num != cpu); 312 BUG_ON(cpu_num != cpu);
335 313
336 if (mc_intel == NULL) 314 if (mc_intel == NULL)
337 return; 315 return 0;
338
339 /* serialize access to the physical write to MSR 0x79 */
340 spin_lock_irqsave(&microcode_update_lock, flags);
341 316
342 /* write microcode via MSR 0x79 */ 317 /* write microcode via MSR 0x79 */
343 wrmsr(MSR_IA32_UCODE_WRITE, 318 wrmsr(MSR_IA32_UCODE_WRITE,
@@ -351,30 +326,32 @@ static void apply_microcode(int cpu)
351 /* get the current revision from MSR 0x8B */ 326 /* get the current revision from MSR 0x8B */
352 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); 327 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
353 328
354 spin_unlock_irqrestore(&microcode_update_lock, flags);
355 if (val[1] != mc_intel->hdr.rev) { 329 if (val[1] != mc_intel->hdr.rev) {
356 printk(KERN_ERR "microcode: CPU%d update from revision " 330 printk(KERN_ERR "microcode: CPU%d update "
357 "0x%x to 0x%x failed\n", 331 "to revision 0x%x failed\n",
358 cpu_num, uci->cpu_sig.rev, val[1]); 332 cpu_num, mc_intel->hdr.rev);
359 return; 333 return -1;
360 } 334 }
361 printk(KERN_INFO "microcode: CPU%d updated from revision " 335 printk(KERN_INFO "microcode: CPU%d updated to revision "
362 "0x%x to 0x%x, date = %04x-%02x-%02x \n", 336 "0x%x, date = %04x-%02x-%02x \n",
363 cpu_num, uci->cpu_sig.rev, val[1], 337 cpu_num, val[1],
364 mc_intel->hdr.date & 0xffff, 338 mc_intel->hdr.date & 0xffff,
365 mc_intel->hdr.date >> 24, 339 mc_intel->hdr.date >> 24,
366 (mc_intel->hdr.date >> 16) & 0xff); 340 (mc_intel->hdr.date >> 16) & 0xff);
367 341
368 uci->cpu_sig.rev = val[1]; 342 uci->cpu_sig.rev = val[1];
343
344 return 0;
369} 345}
370 346
371static int generic_load_microcode(int cpu, void *data, size_t size, 347static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
372 int (*get_ucode_data)(void *, const void *, size_t)) 348 int (*get_ucode_data)(void *, const void *, size_t))
373{ 349{
374 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 350 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
375 u8 *ucode_ptr = data, *new_mc = NULL, *mc; 351 u8 *ucode_ptr = data, *new_mc = NULL, *mc;
376 int new_rev = uci->cpu_sig.rev; 352 int new_rev = uci->cpu_sig.rev;
377 unsigned int leftover = size; 353 unsigned int leftover = size;
354 enum ucode_state state = UCODE_OK;
378 355
379 while (leftover) { 356 while (leftover) {
380 struct microcode_header_intel mc_header; 357 struct microcode_header_intel mc_header;
@@ -412,11 +389,15 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
412 leftover -= mc_size; 389 leftover -= mc_size;
413 } 390 }
414 391
415 if (!new_mc) 392 if (leftover) {
393 if (new_mc)
394 vfree(new_mc);
395 state = UCODE_ERROR;
416 goto out; 396 goto out;
397 }
417 398
418 if (leftover) { 399 if (!new_mc) {
419 vfree(new_mc); 400 state = UCODE_NFOUND;
420 goto out; 401 goto out;
421 } 402 }
422 403
@@ -427,9 +408,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
427 pr_debug("microcode: CPU%d found a matching microcode update with" 408 pr_debug("microcode: CPU%d found a matching microcode update with"
428 " version 0x%x (current=0x%x)\n", 409 " version 0x%x (current=0x%x)\n",
429 cpu, new_rev, uci->cpu_sig.rev); 410 cpu, new_rev, uci->cpu_sig.rev);
430 411out:
431 out: 412 return state;
432 return (int)leftover;
433} 413}
434 414
435static int get_ucode_fw(void *to, const void *from, size_t n) 415static int get_ucode_fw(void *to, const void *from, size_t n)
@@ -438,21 +418,19 @@ static int get_ucode_fw(void *to, const void *from, size_t n)
438 return 0; 418 return 0;
439} 419}
440 420
441static int request_microcode_fw(int cpu, struct device *device) 421static enum ucode_state request_microcode_fw(int cpu, struct device *device)
442{ 422{
443 char name[30]; 423 char name[30];
444 struct cpuinfo_x86 *c = &cpu_data(cpu); 424 struct cpuinfo_x86 *c = &cpu_data(cpu);
445 const struct firmware *firmware; 425 const struct firmware *firmware;
446 int ret; 426 enum ucode_state ret;
447 427
448 /* We should bind the task to the CPU */
449 BUG_ON(cpu != raw_smp_processor_id());
450 sprintf(name, "intel-ucode/%02x-%02x-%02x", 428 sprintf(name, "intel-ucode/%02x-%02x-%02x",
451 c->x86, c->x86_model, c->x86_mask); 429 c->x86, c->x86_model, c->x86_mask);
452 ret = request_firmware(&firmware, name, device); 430
453 if (ret) { 431 if (request_firmware(&firmware, name, device)) {
454 pr_debug("microcode: data file %s load failed\n", name); 432 pr_debug("microcode: data file %s load failed\n", name);
455 return ret; 433 return UCODE_NFOUND;
456 } 434 }
457 435
458 ret = generic_load_microcode(cpu, (void *)firmware->data, 436 ret = generic_load_microcode(cpu, (void *)firmware->data,
@@ -468,11 +446,9 @@ static int get_ucode_user(void *to, const void *from, size_t n)
468 return copy_from_user(to, from, n); 446 return copy_from_user(to, from, n);
469} 447}
470 448
471static int request_microcode_user(int cpu, const void __user *buf, size_t size) 449static enum ucode_state
450request_microcode_user(int cpu, const void __user *buf, size_t size)
472{ 451{
473 /* We should bind the task to the CPU */
474 BUG_ON(cpu != raw_smp_processor_id());
475
476 return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); 452 return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
477} 453}
478 454
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 70fd7e414c15..651c93b28862 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -17,6 +17,7 @@
17#include <linux/acpi.h> 17#include <linux/acpi.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/pci.h>
20 21
21#include <asm/mtrr.h> 22#include <asm/mtrr.h>
22#include <asm/mpspec.h> 23#include <asm/mpspec.h>
@@ -870,24 +871,17 @@ static
870inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} 871inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
871#endif /* CONFIG_X86_IO_APIC */ 872#endif /* CONFIG_X86_IO_APIC */
872 873
873static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, 874static int
874 int count) 875check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
875{ 876{
876 if (!mpc_new_phys) { 877 int ret = 0;
877 pr_info("No spare slots, try to append...take your risk, " 878
878 "new mpc_length %x\n", count); 879 if (!mpc_new_phys || count <= mpc_new_length) {
879 } else { 880 WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
880 if (count <= mpc_new_length) 881 return -1;
881 pr_info("No spare slots, try to append..., "
882 "new mpc_length %x\n", count);
883 else {
884 pr_err("mpc_new_length %lx is too small\n",
885 mpc_new_length);
886 return -1;
887 }
888 } 882 }
889 883
890 return 0; 884 return ret;
891} 885}
892 886
893static int __init replace_intsrc_all(struct mpc_table *mpc, 887static int __init replace_intsrc_all(struct mpc_table *mpc,
@@ -946,7 +940,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
946 } else { 940 } else {
947 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; 941 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
948 count += sizeof(struct mpc_intsrc); 942 count += sizeof(struct mpc_intsrc);
949 if (!check_slot(mpc_new_phys, mpc_new_length, count)) 943 if (check_slot(mpc_new_phys, mpc_new_length, count) < 0)
950 goto out; 944 goto out;
951 assign_to_mpc_intsrc(&mp_irqs[i], m); 945 assign_to_mpc_intsrc(&mp_irqs[i], m);
952 mpc->length = count; 946 mpc->length = count;
@@ -963,11 +957,14 @@ out:
963 return 0; 957 return 0;
964} 958}
965 959
966static int __initdata enable_update_mptable; 960int enable_update_mptable;
967 961
968static int __init update_mptable_setup(char *str) 962static int __init update_mptable_setup(char *str)
969{ 963{
970 enable_update_mptable = 1; 964 enable_update_mptable = 1;
965#ifdef CONFIG_PCI
966 pci_routeirq = 1;
967#endif
971 return 0; 968 return 0;
972} 969}
973early_param("update_mptable", update_mptable_setup); 970early_param("update_mptable", update_mptable_setup);
@@ -980,6 +977,9 @@ static int __initdata alloc_mptable;
980static int __init parse_alloc_mptable_opt(char *p) 977static int __init parse_alloc_mptable_opt(char *p)
981{ 978{
982 enable_update_mptable = 1; 979 enable_update_mptable = 1;
980#ifdef CONFIG_PCI
981 pci_routeirq = 1;
982#endif
983 alloc_mptable = 1; 983 alloc_mptable = 1;
984 if (!p) 984 if (!p)
985 return 0; 985 return 0;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 9faf43bea336..70ec9b951d76 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -248,18 +248,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
248 248
249static inline void enter_lazy(enum paravirt_lazy_mode mode) 249static inline void enter_lazy(enum paravirt_lazy_mode mode)
250{ 250{
251 BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); 251 BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
252 BUG_ON(preemptible());
253 252
254 __get_cpu_var(paravirt_lazy_mode) = mode; 253 percpu_write(paravirt_lazy_mode, mode);
255} 254}
256 255
257void paravirt_leave_lazy(enum paravirt_lazy_mode mode) 256static void leave_lazy(enum paravirt_lazy_mode mode)
258{ 257{
259 BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); 258 BUG_ON(percpu_read(paravirt_lazy_mode) != mode);
260 BUG_ON(preemptible());
261 259
262 __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; 260 percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
263} 261}
264 262
265void paravirt_enter_lazy_mmu(void) 263void paravirt_enter_lazy_mmu(void)
@@ -269,22 +267,36 @@ void paravirt_enter_lazy_mmu(void)
269 267
270void paravirt_leave_lazy_mmu(void) 268void paravirt_leave_lazy_mmu(void)
271{ 269{
272 paravirt_leave_lazy(PARAVIRT_LAZY_MMU); 270 leave_lazy(PARAVIRT_LAZY_MMU);
273} 271}
274 272
275void paravirt_enter_lazy_cpu(void) 273void paravirt_start_context_switch(struct task_struct *prev)
276{ 274{
275 BUG_ON(preemptible());
276
277 if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
278 arch_leave_lazy_mmu_mode();
279 set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
280 }
277 enter_lazy(PARAVIRT_LAZY_CPU); 281 enter_lazy(PARAVIRT_LAZY_CPU);
278} 282}
279 283
280void paravirt_leave_lazy_cpu(void) 284void paravirt_end_context_switch(struct task_struct *next)
281{ 285{
282 paravirt_leave_lazy(PARAVIRT_LAZY_CPU); 286 BUG_ON(preemptible());
287
288 leave_lazy(PARAVIRT_LAZY_CPU);
289
290 if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
291 arch_enter_lazy_mmu_mode();
283} 292}
284 293
285enum paravirt_lazy_mode paravirt_get_lazy_mode(void) 294enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
286{ 295{
287 return __get_cpu_var(paravirt_lazy_mode); 296 if (in_interrupt())
297 return PARAVIRT_LAZY_NONE;
298
299 return percpu_read(paravirt_lazy_mode);
288} 300}
289 301
290void arch_flush_lazy_mmu_mode(void) 302void arch_flush_lazy_mmu_mode(void)
@@ -292,7 +304,6 @@ void arch_flush_lazy_mmu_mode(void)
292 preempt_disable(); 304 preempt_disable();
293 305
294 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { 306 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
295 WARN_ON(preempt_count() == 1);
296 arch_leave_lazy_mmu_mode(); 307 arch_leave_lazy_mmu_mode();
297 arch_enter_lazy_mmu_mode(); 308 arch_enter_lazy_mmu_mode();
298 } 309 }
@@ -300,19 +311,6 @@ void arch_flush_lazy_mmu_mode(void)
300 preempt_enable(); 311 preempt_enable();
301} 312}
302 313
303void arch_flush_lazy_cpu_mode(void)
304{
305 preempt_disable();
306
307 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
308 WARN_ON(preempt_count() == 1);
309 arch_leave_lazy_cpu_mode();
310 arch_enter_lazy_cpu_mode();
311 }
312
313 preempt_enable();
314}
315
316struct pv_info pv_info = { 314struct pv_info pv_info = {
317 .name = "bare hardware", 315 .name = "bare hardware",
318 .paravirt_enabled = 0, 316 .paravirt_enabled = 0,
@@ -404,10 +402,8 @@ struct pv_cpu_ops pv_cpu_ops = {
404 .set_iopl_mask = native_set_iopl_mask, 402 .set_iopl_mask = native_set_iopl_mask,
405 .io_delay = native_io_delay, 403 .io_delay = native_io_delay,
406 404
407 .lazy_mode = { 405 .start_context_switch = paravirt_nop,
408 .enter = paravirt_nop, 406 .end_context_switch = paravirt_nop,
409 .leave = paravirt_nop,
410 },
411}; 407};
412 408
413struct pv_apic_ops pv_apic_ops = { 409struct pv_apic_ops pv_apic_ops = {
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 755c21e906f3..971a3bec47a8 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -186,37 +186,6 @@ static struct cal_chipset_ops calioc2_chip_ops = {
186 186
187static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; 187static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
188 188
189/* enable this to stress test the chip's TCE cache */
190#ifdef CONFIG_IOMMU_DEBUG
191static int debugging = 1;
192
193static inline unsigned long verify_bit_range(unsigned long* bitmap,
194 int expected, unsigned long start, unsigned long end)
195{
196 unsigned long idx = start;
197
198 BUG_ON(start >= end);
199
200 while (idx < end) {
201 if (!!test_bit(idx, bitmap) != expected)
202 return idx;
203 ++idx;
204 }
205
206 /* all bits have the expected value */
207 return ~0UL;
208}
209#else /* debugging is disabled */
210static int debugging;
211
212static inline unsigned long verify_bit_range(unsigned long* bitmap,
213 int expected, unsigned long start, unsigned long end)
214{
215 return ~0UL;
216}
217
218#endif /* CONFIG_IOMMU_DEBUG */
219
220static inline int translation_enabled(struct iommu_table *tbl) 189static inline int translation_enabled(struct iommu_table *tbl)
221{ 190{
222 /* only PHBs with translation enabled have an IOMMU table */ 191 /* only PHBs with translation enabled have an IOMMU table */
@@ -228,7 +197,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
228{ 197{
229 unsigned long index; 198 unsigned long index;
230 unsigned long end; 199 unsigned long end;
231 unsigned long badbit;
232 unsigned long flags; 200 unsigned long flags;
233 201
234 index = start_addr >> PAGE_SHIFT; 202 index = start_addr >> PAGE_SHIFT;
@@ -243,14 +211,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
243 211
244 spin_lock_irqsave(&tbl->it_lock, flags); 212 spin_lock_irqsave(&tbl->it_lock, flags);
245 213
246 badbit = verify_bit_range(tbl->it_map, 0, index, end);
247 if (badbit != ~0UL) {
248 if (printk_ratelimit())
249 printk(KERN_ERR "Calgary: entry already allocated at "
250 "0x%lx tbl %p dma 0x%lx npages %u\n",
251 badbit, tbl, start_addr, npages);
252 }
253
254 iommu_area_reserve(tbl->it_map, index, npages); 214 iommu_area_reserve(tbl->it_map, index, npages);
255 215
256 spin_unlock_irqrestore(&tbl->it_lock, flags); 216 spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -326,7 +286,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
326 unsigned int npages) 286 unsigned int npages)
327{ 287{
328 unsigned long entry; 288 unsigned long entry;
329 unsigned long badbit;
330 unsigned long badend; 289 unsigned long badend;
331 unsigned long flags; 290 unsigned long flags;
332 291
@@ -346,14 +305,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
346 305
347 spin_lock_irqsave(&tbl->it_lock, flags); 306 spin_lock_irqsave(&tbl->it_lock, flags);
348 307
349 badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages);
350 if (badbit != ~0UL) {
351 if (printk_ratelimit())
352 printk(KERN_ERR "Calgary: bit is off at 0x%lx "
353 "tbl %p dma 0x%Lx entry 0x%lx npages %u\n",
354 badbit, tbl, dma_addr, entry, npages);
355 }
356
357 iommu_area_free(tbl->it_map, entry, npages); 308 iommu_area_free(tbl->it_map, entry, npages);
358 309
359 spin_unlock_irqrestore(&tbl->it_lock, flags); 310 spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -1488,9 +1439,8 @@ void __init detect_calgary(void)
1488 iommu_detected = 1; 1439 iommu_detected = 1;
1489 calgary_detected = 1; 1440 calgary_detected = 1;
1490 printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); 1441 printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
1491 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " 1442 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
1492 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, 1443 specified_table_size);
1493 debugging ? "enabled" : "disabled");
1494 1444
1495 /* swiotlb for devices that aren't behind the Calgary. */ 1445 /* swiotlb for devices that aren't behind the Calgary. */
1496 if (max_pfn > MAX_DMA32_PFN) 1446 if (max_pfn > MAX_DMA32_PFN)
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index b284b58c035c..cfd9f9063896 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -144,48 +144,21 @@ static void flush_gart(void)
144} 144}
145 145
146#ifdef CONFIG_IOMMU_LEAK 146#ifdef CONFIG_IOMMU_LEAK
147
148#define SET_LEAK(x) \
149 do { \
150 if (iommu_leak_tab) \
151 iommu_leak_tab[x] = __builtin_return_address(0);\
152 } while (0)
153
154#define CLEAR_LEAK(x) \
155 do { \
156 if (iommu_leak_tab) \
157 iommu_leak_tab[x] = NULL; \
158 } while (0)
159
160/* Debugging aid for drivers that don't free their IOMMU tables */ 147/* Debugging aid for drivers that don't free their IOMMU tables */
161static void **iommu_leak_tab;
162static int leak_trace; 148static int leak_trace;
163static int iommu_leak_pages = 20; 149static int iommu_leak_pages = 20;
164 150
165static void dump_leak(void) 151static void dump_leak(void)
166{ 152{
167 int i;
168 static int dump; 153 static int dump;
169 154
170 if (dump || !iommu_leak_tab) 155 if (dump)
171 return; 156 return;
172 dump = 1; 157 dump = 1;
173 show_stack(NULL, NULL);
174 158
175 /* Very crude. dump some from the end of the table too */ 159 show_stack(NULL, NULL);
176 printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n", 160 debug_dma_dump_mappings(NULL);
177 iommu_leak_pages);
178 for (i = 0; i < iommu_leak_pages; i += 2) {
179 printk(KERN_DEBUG "%lu: ", iommu_pages-i);
180 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
181 0);
182 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
183 }
184 printk(KERN_DEBUG "\n");
185} 161}
186#else
187# define SET_LEAK(x)
188# define CLEAR_LEAK(x)
189#endif 162#endif
190 163
191static void iommu_full(struct device *dev, size_t size, int dir) 164static void iommu_full(struct device *dev, size_t size, int dir)
@@ -248,7 +221,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
248 221
249 for (i = 0; i < npages; i++) { 222 for (i = 0; i < npages; i++) {
250 iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); 223 iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
251 SET_LEAK(iommu_page + i);
252 phys_mem += PAGE_SIZE; 224 phys_mem += PAGE_SIZE;
253 } 225 }
254 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); 226 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
@@ -294,7 +266,6 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
294 npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); 266 npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
295 for (i = 0; i < npages; i++) { 267 for (i = 0; i < npages; i++) {
296 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; 268 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
297 CLEAR_LEAK(iommu_page + i);
298 } 269 }
299 free_iommu(iommu_page, npages); 270 free_iommu(iommu_page, npages);
300} 271}
@@ -377,7 +348,6 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
377 pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); 348 pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE);
378 while (pages--) { 349 while (pages--) {
379 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); 350 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
380 SET_LEAK(iommu_page);
381 addr += PAGE_SIZE; 351 addr += PAGE_SIZE;
382 iommu_page++; 352 iommu_page++;
383 } 353 }
@@ -688,8 +658,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
688 658
689 agp_gatt_table = gatt; 659 agp_gatt_table = gatt;
690 660
691 enable_gart_translations();
692
693 error = sysdev_class_register(&gart_sysdev_class); 661 error = sysdev_class_register(&gart_sysdev_class);
694 if (!error) 662 if (!error)
695 error = sysdev_register(&device_gart); 663 error = sysdev_register(&device_gart);
@@ -801,11 +769,12 @@ void __init gart_iommu_init(void)
801 769
802#ifdef CONFIG_IOMMU_LEAK 770#ifdef CONFIG_IOMMU_LEAK
803 if (leak_trace) { 771 if (leak_trace) {
804 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, 772 int ret;
805 get_order(iommu_pages*sizeof(void *))); 773
806 if (!iommu_leak_tab) 774 ret = dma_debug_resize_entries(iommu_pages);
775 if (ret)
807 printk(KERN_DEBUG 776 printk(KERN_DEBUG
808 "PCI-DMA: Cannot allocate leak trace area\n"); 777 "PCI-DMA: Cannot trace all the entries\n");
809 } 778 }
810#endif 779#endif
811 780
@@ -845,6 +814,14 @@ void __init gart_iommu_init(void)
845 * the pages as Not-Present: 814 * the pages as Not-Present:
846 */ 815 */
847 wbinvd(); 816 wbinvd();
817
818 /*
819 * Now all caches are flushed and we can safely enable
820 * GART hardware. Doing it early leaves the possibility
821 * of stale cache entries that can lead to GART PTE
822 * errors.
823 */
824 enable_gart_translations();
848 825
849 /* 826 /*
850 * Try to workaround a bug (thanks to BenH): 827 * Try to workaround a bug (thanks to BenH):
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 221a3853e268..a1712f2b50f1 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -28,7 +28,7 @@ dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
28 return paddr; 28 return paddr;
29} 29}
30 30
31phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) 31phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
32{ 32{
33 return baddr; 33 return baddr;
34} 34}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ca989158e847..e22d63bdc8ff 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -8,9 +8,11 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/pm.h> 9#include <linux/pm.h>
10#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/random.h>
11#include <trace/power.h> 12#include <trace/power.h>
12#include <asm/system.h> 13#include <asm/system.h>
13#include <asm/apic.h> 14#include <asm/apic.h>
15#include <asm/syscalls.h>
14#include <asm/idle.h> 16#include <asm/idle.h>
15#include <asm/uaccess.h> 17#include <asm/uaccess.h>
16#include <asm/i387.h> 18#include <asm/i387.h>
@@ -613,3 +615,16 @@ static int __init idle_setup(char *str)
613} 615}
614early_param("idle", idle_setup); 616early_param("idle", idle_setup);
615 617
618unsigned long arch_align_stack(unsigned long sp)
619{
620 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
621 sp -= get_random_int() % 8192;
622 return sp & ~0xf;
623}
624
625unsigned long arch_randomize_brk(struct mm_struct *mm)
626{
627 unsigned long range_end = mm->brk + 0x02000000;
628 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
629}
630
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 76f8f84043a2..c60924b5d123 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -9,8 +9,6 @@
9 * This file handles the architecture-dependent parts of process handling.. 9 * This file handles the architecture-dependent parts of process handling..
10 */ 10 */
11 11
12#include <stdarg.h>
13
14#include <linux/stackprotector.h> 12#include <linux/stackprotector.h>
15#include <linux/cpu.h> 13#include <linux/cpu.h>
16#include <linux/errno.h> 14#include <linux/errno.h>
@@ -33,7 +31,6 @@
33#include <linux/module.h> 31#include <linux/module.h>
34#include <linux/kallsyms.h> 32#include <linux/kallsyms.h>
35#include <linux/ptrace.h> 33#include <linux/ptrace.h>
36#include <linux/random.h>
37#include <linux/personality.h> 34#include <linux/personality.h>
38#include <linux/tick.h> 35#include <linux/tick.h>
39#include <linux/percpu.h> 36#include <linux/percpu.h>
@@ -407,7 +404,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
407 * done before math_state_restore, so the TS bit is up 404 * done before math_state_restore, so the TS bit is up
408 * to date. 405 * to date.
409 */ 406 */
410 arch_leave_lazy_cpu_mode(); 407 arch_end_context_switch(next_p);
411 408
412 /* If the task has used fpu the last 5 timeslices, just do a full 409 /* If the task has used fpu the last 5 timeslices, just do a full
413 * restore of the math state immediately to avoid the trap; the 410 * restore of the math state immediately to avoid the trap; the
@@ -497,15 +494,3 @@ unsigned long get_wchan(struct task_struct *p)
497 return 0; 494 return 0;
498} 495}
499 496
500unsigned long arch_align_stack(unsigned long sp)
501{
502 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
503 sp -= get_random_int() % 8192;
504 return sp & ~0xf;
505}
506
507unsigned long arch_randomize_brk(struct mm_struct *mm)
508{
509 unsigned long range_end = mm->brk + 0x02000000;
510 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
511}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b751a41392b1..45f010fb2e20 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -14,8 +14,6 @@
14 * This file handles the architecture-dependent parts of process handling.. 14 * This file handles the architecture-dependent parts of process handling..
15 */ 15 */
16 16
17#include <stdarg.h>
18
19#include <linux/stackprotector.h> 17#include <linux/stackprotector.h>
20#include <linux/cpu.h> 18#include <linux/cpu.h>
21#include <linux/errno.h> 19#include <linux/errno.h>
@@ -32,7 +30,6 @@
32#include <linux/delay.h> 30#include <linux/delay.h>
33#include <linux/module.h> 31#include <linux/module.h>
34#include <linux/ptrace.h> 32#include <linux/ptrace.h>
35#include <linux/random.h>
36#include <linux/notifier.h> 33#include <linux/notifier.h>
37#include <linux/kprobes.h> 34#include <linux/kprobes.h>
38#include <linux/kdebug.h> 35#include <linux/kdebug.h>
@@ -428,7 +425,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
428 * done before math_state_restore, so the TS bit is up 425 * done before math_state_restore, so the TS bit is up
429 * to date. 426 * to date.
430 */ 427 */
431 arch_leave_lazy_cpu_mode(); 428 arch_end_context_switch(next_p);
432 429
433 /* 430 /*
434 * Switch FS and GS. 431 * Switch FS and GS.
@@ -660,15 +657,3 @@ long sys_arch_prctl(int code, unsigned long addr)
660 return do_arch_prctl(current, code, addr); 657 return do_arch_prctl(current, code, addr);
661} 658}
662 659
663unsigned long arch_align_stack(unsigned long sp)
664{
665 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
666 sp -= get_random_int() % 8192;
667 return sp & ~0xf;
668}
669
670unsigned long arch_randomize_brk(struct mm_struct *mm)
671{
672 unsigned long range_end = mm->brk + 0x02000000;
673 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
674}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 7563b31b4f03..af71d06624bf 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -491,5 +491,42 @@ void force_hpet_resume(void)
491 break; 491 break;
492 } 492 }
493} 493}
494#endif
495
496#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
497/* Set correct numa_node information for AMD NB functions */
498static void __init quirk_amd_nb_node(struct pci_dev *dev)
499{
500 struct pci_dev *nb_ht;
501 unsigned int devfn;
502 u32 val;
503
504 devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
505 nb_ht = pci_get_slot(dev->bus, devfn);
506 if (!nb_ht)
507 return;
508
509 pci_read_config_dword(nb_ht, 0x60, &val);
510 set_dev_node(&dev->dev, val & 7);
511 pci_dev_put(dev);
512}
494 513
514DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
515 quirk_amd_nb_node);
516DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP,
517 quirk_amd_nb_node);
518DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL,
519 quirk_amd_nb_node);
520DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC,
521 quirk_amd_nb_node);
522DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT,
523 quirk_amd_nb_node);
524DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP,
525 quirk_amd_nb_node);
526DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM,
527 quirk_amd_nb_node);
528DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC,
529 quirk_amd_nb_node);
530DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK,
531 quirk_amd_nb_node);
495#endif 532#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 667188e0b5a0..d2d1ce8170f0 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -192,6 +192,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
192 DMI_MATCH(DMI_BOARD_NAME, "0KP561"), 192 DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
193 }, 193 },
194 }, 194 },
195 { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
196 .callback = set_bios_reboot,
197 .ident = "Dell OptiPlex 360",
198 .matches = {
199 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
200 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
201 DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
202 },
203 },
195 { /* Handle problems with rebooting on Dell 2400's */ 204 { /* Handle problems with rebooting on Dell 2400's */
196 .callback = set_bios_reboot, 205 .callback = set_bios_reboot,
197 .ident = "Dell PowerEdge 2400", 206 .ident = "Dell PowerEdge 2400",
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b4158439bf63..d1c636bf31a7 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,6 +112,14 @@
112#define ARCH_SETUP 112#define ARCH_SETUP
113#endif 113#endif
114 114
115/*
116 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
117 * The direct mapping extends to max_pfn_mapped, so that we can directly access
118 * apertures, ACPI and other tables without having to play with fixmaps.
119 */
120unsigned long max_low_pfn_mapped;
121unsigned long max_pfn_mapped;
122
115RESERVE_BRK(dmi_alloc, 65536); 123RESERVE_BRK(dmi_alloc, 65536);
116 124
117unsigned int boot_cpu_id __read_mostly; 125unsigned int boot_cpu_id __read_mostly;
@@ -214,8 +222,8 @@ unsigned long mmu_cr4_features;
214unsigned long mmu_cr4_features = X86_CR4_PAE; 222unsigned long mmu_cr4_features = X86_CR4_PAE;
215#endif 223#endif
216 224
217/* Boot loader ID as an integer, for the benefit of proc_dointvec */ 225/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
218int bootloader_type; 226int bootloader_type, bootloader_version;
219 227
220/* 228/*
221 * Setup options 229 * Setup options
@@ -706,6 +714,12 @@ void __init setup_arch(char **cmdline_p)
706#endif 714#endif
707 saved_video_mode = boot_params.hdr.vid_mode; 715 saved_video_mode = boot_params.hdr.vid_mode;
708 bootloader_type = boot_params.hdr.type_of_loader; 716 bootloader_type = boot_params.hdr.type_of_loader;
717 if ((bootloader_type >> 4) == 0xe) {
718 bootloader_type &= 0xf;
719 bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
720 }
721 bootloader_version = bootloader_type & 0xf;
722 bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
709 723
710#ifdef CONFIG_BLK_DEV_RAM 724#ifdef CONFIG_BLK_DEV_RAM
711 rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; 725 rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
@@ -854,12 +868,16 @@ void __init setup_arch(char **cmdline_p)
854 max_low_pfn = max_pfn; 868 max_low_pfn = max_pfn;
855 869
856 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; 870 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
871 max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
857#endif 872#endif
858 873
859#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION 874#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
860 setup_bios_corruption_check(); 875 setup_bios_corruption_check();
861#endif 876#endif
862 877
878 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
879 max_pfn_mapped<<PAGE_SHIFT);
880
863 reserve_brk(); 881 reserve_brk();
864 882
865 /* max_pfn_mapped is updated here */ 883 /* max_pfn_mapped is updated here */
@@ -997,24 +1015,6 @@ void __init setup_arch(char **cmdline_p)
997#ifdef CONFIG_X86_32 1015#ifdef CONFIG_X86_32
998 1016
999/** 1017/**
1000 * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
1001 *
1002 * Description:
1003 * Perform any necessary interrupt initialisation prior to setting up
1004 * the "ordinary" interrupt call gates. For legacy reasons, the ISA
1005 * interrupts should be initialised here if the machine emulates a PC
1006 * in any way.
1007 **/
1008void __init x86_quirk_pre_intr_init(void)
1009{
1010 if (x86_quirks->arch_pre_intr_init) {
1011 if (x86_quirks->arch_pre_intr_init())
1012 return;
1013 }
1014 init_ISA_irqs();
1015}
1016
1017/**
1018 * x86_quirk_intr_init - post gate setup interrupt initialisation 1018 * x86_quirk_intr_init - post gate setup interrupt initialisation
1019 * 1019 *
1020 * Description: 1020 * Description:
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 8f0e13be36b3..9c3f0823e6aa 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -425,6 +425,14 @@ void __init setup_per_cpu_areas(void)
425 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; 425 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
426#endif 426#endif
427 427
428#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
429 /*
430 * make sure boot cpu node_number is right, when boot cpu is on the
431 * node that doesn't have mem installed
432 */
433 per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
434#endif
435
428 /* Setup node to cpumask map */ 436 /* Setup node to cpumask map */
429 setup_node_to_cpumask_map(); 437 setup_node_to_cpumask_map();
430 438
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 13f33ea8ccaa..f6db48c405b8 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -193,19 +193,19 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
193} 193}
194 194
195struct smp_ops smp_ops = { 195struct smp_ops smp_ops = {
196 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, 196 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
197 .smp_prepare_cpus = native_smp_prepare_cpus, 197 .smp_prepare_cpus = native_smp_prepare_cpus,
198 .smp_cpus_done = native_smp_cpus_done, 198 .smp_cpus_done = native_smp_cpus_done,
199 199
200 .smp_send_stop = native_smp_send_stop, 200 .smp_send_stop = native_smp_send_stop,
201 .smp_send_reschedule = native_smp_send_reschedule, 201 .smp_send_reschedule = native_smp_send_reschedule,
202 202
203 .cpu_up = native_cpu_up, 203 .cpu_up = native_cpu_up,
204 .cpu_die = native_cpu_die, 204 .cpu_die = native_cpu_die,
205 .cpu_disable = native_cpu_disable, 205 .cpu_disable = native_cpu_disable,
206 .play_dead = native_play_dead, 206 .play_dead = native_play_dead,
207 207
208 .send_call_func_ipi = native_send_call_func_ipi, 208 .send_call_func_ipi = native_send_call_func_ipi,
209 .send_call_func_single_ipi = native_send_call_func_single_ipi, 209 .send_call_func_single_ipi = native_send_call_func_single_ipi,
210}; 210};
211EXPORT_SYMBOL_GPL(smp_ops); 211EXPORT_SYMBOL_GPL(smp_ops);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 58d24ef917d8..7c80007ea5f7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -504,7 +504,7 @@ void __inquire_remote_apic(int apicid)
504 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this 504 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
505 * won't ... remember to clear down the APIC, etc later. 505 * won't ... remember to clear down the APIC, etc later.
506 */ 506 */
507int __devinit 507int __cpuinit
508wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) 508wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
509{ 509{
510 unsigned long send_status, accept_status = 0; 510 unsigned long send_status, accept_status = 0;
@@ -538,7 +538,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
538 return (send_status | accept_status); 538 return (send_status | accept_status);
539} 539}
540 540
541int __devinit 541static int __cpuinit
542wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) 542wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
543{ 543{
544 unsigned long send_status, accept_status = 0; 544 unsigned long send_status, accept_status = 0;
@@ -822,10 +822,12 @@ do_rest:
822 /* mark "stuck" area as not stuck */ 822 /* mark "stuck" area as not stuck */
823 *((volatile unsigned long *)trampoline_base) = 0; 823 *((volatile unsigned long *)trampoline_base) = 0;
824 824
825 /* 825 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
826 * Cleanup possible dangling ends... 826 /*
827 */ 827 * Cleanup possible dangling ends...
828 smpboot_restore_warm_reset_vector(); 828 */
829 smpboot_restore_warm_reset_vector();
830 }
829 831
830 return boot_error; 832 return boot_error;
831} 833}
@@ -990,10 +992,12 @@ static int __init smp_sanity_check(unsigned max_cpus)
990 */ 992 */
991 if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && 993 if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
992 !cpu_has_apic) { 994 !cpu_has_apic) {
993 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", 995 if (!disable_apic) {
994 boot_cpu_physical_apicid); 996 pr_err("BIOS bug, local APIC #%d not detected!...\n",
995 printk(KERN_ERR "... forcing use of dummy APIC emulation." 997 boot_cpu_physical_apicid);
998 pr_err("... forcing use of dummy APIC emulation."
996 "(tell your hw vendor)\n"); 999 "(tell your hw vendor)\n");
1000 }
997 smpboot_clear_io_apic(); 1001 smpboot_clear_io_apic();
998 arch_disable_smp_support(); 1002 arch_disable_smp_support();
999 return -1; 1003 return -1;
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index ed0c33761e6d..124d40c575df 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -715,7 +715,12 @@ uv_activation_descriptor_init(int node, int pnode)
715 struct bau_desc *adp; 715 struct bau_desc *adp;
716 struct bau_desc *ad2; 716 struct bau_desc *ad2;
717 717
718 adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node); 718 /*
719 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
720 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade
721 */
722 adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
723 UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
719 BUG_ON(!adp); 724 BUG_ON(!adp);
720 725
721 pa = uv_gpa(adp); /* need the real nasid*/ 726 pa = uv_gpa(adp); /* need the real nasid*/
@@ -729,7 +734,13 @@ uv_activation_descriptor_init(int node, int pnode)
729 (n << UV_DESC_BASE_PNODE_SHIFT | m)); 734 (n << UV_DESC_BASE_PNODE_SHIFT | m));
730 } 735 }
731 736
732 for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) { 737 /*
738 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
739 * cpu even though we only use the first one; one descriptor can
740 * describe a broadcast to 256 nodes.
741 */
742 for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
743 i++, ad2++) {
733 memset(ad2, 0, sizeof(struct bau_desc)); 744 memset(ad2, 0, sizeof(struct bau_desc));
734 ad2->header.sw_ack_flag = 1; 745 ad2->header.sw_ack_flag = 1;
735 /* 746 /*
@@ -832,7 +843,7 @@ static int __init uv_bau_init(void)
832 return 0; 843 return 0;
833 844
834 for_each_possible_cpu(cur_cpu) 845 for_each_possible_cpu(cur_cpu)
835 alloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), 846 zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
836 GFP_KERNEL, cpu_to_node(cur_cpu)); 847 GFP_KERNEL, cpu_to_node(cur_cpu));
837 848
838 uv_bau_retry_limit = 1; 849 uv_bau_retry_limit = 1;
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a1d288327ff0..ede024531f8f 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -839,9 +839,6 @@ asmlinkage void math_state_restore(void)
839 } 839 }
840 840
841 clts(); /* Allow maths ops (or we recurse) */ 841 clts(); /* Allow maths ops (or we recurse) */
842#ifdef CONFIG_X86_32
843 restore_fpu(tsk);
844#else
845 /* 842 /*
846 * Paranoid restore. send a SIGSEGV if we fail to restore the state. 843 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
847 */ 844 */
@@ -850,7 +847,7 @@ asmlinkage void math_state_restore(void)
850 force_sig(SIGSEGV, tsk); 847 force_sig(SIGSEGV, tsk);
851 return; 848 return;
852 } 849 }
853#endif 850
854 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ 851 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
855 tsk->fpu_counter++; 852 tsk->fpu_counter++;
856} 853}
@@ -969,11 +966,8 @@ void __init trap_init(void)
969 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) 966 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
970 set_bit(i, used_vectors); 967 set_bit(i, used_vectors);
971 968
972#ifdef CONFIG_X86_64
973 set_bit(IA32_SYSCALL_VECTOR, used_vectors); 969 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
974#else 970
975 set_bit(SYSCALL_VECTOR, used_vectors);
976#endif
977 /* 971 /*
978 * Should be a barrier for any external CPU state: 972 * Should be a barrier for any external CPU state:
979 */ 973 */
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index d57de05dc430..3e1c057e98fe 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -384,13 +384,13 @@ unsigned long native_calibrate_tsc(void)
384{ 384{
385 u64 tsc1, tsc2, delta, ref1, ref2; 385 u64 tsc1, tsc2, delta, ref1, ref2;
386 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; 386 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
387 unsigned long flags, latch, ms, fast_calibrate, tsc_khz; 387 unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz;
388 int hpet = is_hpet_enabled(), i, loopmin; 388 int hpet = is_hpet_enabled(), i, loopmin;
389 389
390 tsc_khz = get_hypervisor_tsc_freq(); 390 hv_tsc_khz = get_hypervisor_tsc_freq();
391 if (tsc_khz) { 391 if (hv_tsc_khz) {
392 printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); 392 printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
393 return tsc_khz; 393 return hv_tsc_khz;
394 } 394 }
395 395
396 local_irq_save(flags); 396 local_irq_save(flags);
@@ -710,7 +710,16 @@ static cycle_t read_tsc(struct clocksource *cs)
710#ifdef CONFIG_X86_64 710#ifdef CONFIG_X86_64
711static cycle_t __vsyscall_fn vread_tsc(void) 711static cycle_t __vsyscall_fn vread_tsc(void)
712{ 712{
713 cycle_t ret = (cycle_t)vget_cycles(); 713 cycle_t ret;
714
715 /*
716 * Surround the RDTSC by barriers, to make sure it's not
717 * speculated to outside the seqlock critical section and
718 * does not cause time warps:
719 */
720 rdtsc_barrier();
721 ret = (cycle_t)vget_cycles();
722 rdtsc_barrier();
714 723
715 return ret >= __vsyscall_gtod_data.clock.cycle_last ? 724 return ret >= __vsyscall_gtod_data.clock.cycle_last ?
716 ret : __vsyscall_gtod_data.clock.cycle_last; 725 ret : __vsyscall_gtod_data.clock.cycle_last;
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index bf36328f6ef9..027b5b498993 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -34,6 +34,7 @@ static __cpuinitdata atomic_t stop_count;
34 * of a critical section, to be able to prove TSC time-warps: 34 * of a critical section, to be able to prove TSC time-warps:
35 */ 35 */
36static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; 36static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
37
37static __cpuinitdata cycles_t last_tsc; 38static __cpuinitdata cycles_t last_tsc;
38static __cpuinitdata cycles_t max_warp; 39static __cpuinitdata cycles_t max_warp;
39static __cpuinitdata int nr_warps; 40static __cpuinitdata int nr_warps;
@@ -113,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
113 return; 114 return;
114 115
115 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { 116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
116 printk(KERN_INFO 117 pr_info("Skipping synchronization checks as TSC is reliable.\n");
117 "Skipping synchronization checks as TSC is reliable.\n");
118 return; 118 return;
119 } 119 }
120 120
121 printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", 121 pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:",
122 smp_processor_id(), cpu); 122 smp_processor_id(), cpu);
123 123
124 /* 124 /*
125 * Reset it - in case this is a second bootup: 125 * Reset it - in case this is a second bootup:
@@ -143,8 +143,8 @@ void __cpuinit check_tsc_sync_source(int cpu)
143 143
144 if (nr_warps) { 144 if (nr_warps) {
145 printk("\n"); 145 printk("\n");
146 printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," 146 pr_warning("Measured %Ld cycles TSC warp between CPUs, "
147 " turning off TSC clock.\n", max_warp); 147 "turning off TSC clock.\n", max_warp);
148 mark_tsc_unstable("check_tsc_sync_source failed"); 148 mark_tsc_unstable("check_tsc_sync_source failed");
149 } else { 149 } else {
150 printk(" passed.\n"); 150 printk(" passed.\n");
@@ -195,5 +195,3 @@ void __cpuinit check_tsc_sync_target(void)
195 while (atomic_read(&stop_count) != cpus) 195 while (atomic_read(&stop_count) != cpus)
196 cpu_relax(); 196 cpu_relax();
197} 197}
198#undef NR_LOOPS
199
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index d7ac84e7fc1c..9c4e62539058 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -287,10 +287,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
287 info->regs.pt.ds = 0; 287 info->regs.pt.ds = 0;
288 info->regs.pt.es = 0; 288 info->regs.pt.es = 0;
289 info->regs.pt.fs = 0; 289 info->regs.pt.fs = 0;
290 290#ifndef CONFIG_X86_32_LAZY_GS
291/* we are clearing gs later just before "jmp resume_userspace", 291 info->regs.pt.gs = 0;
292 * because it is not saved/restored. 292#endif
293 */
294 293
295/* 294/*
296 * The flags register is also special: we cannot trust that the user 295 * The flags register is also special: we cannot trust that the user
@@ -318,9 +317,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
318 } 317 }
319 318
320/* 319/*
321 * Save old state, set default return value (%ax) to 0 320 * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
322 */ 321 */
323 info->regs32->ax = 0; 322 info->regs32->ax = VM86_SIGNAL;
324 tsk->thread.saved_sp0 = tsk->thread.sp0; 323 tsk->thread.saved_sp0 = tsk->thread.sp0;
325 tsk->thread.saved_fs = info->regs32->fs; 324 tsk->thread.saved_fs = info->regs32->fs;
326 tsk->thread.saved_gs = get_user_gs(info->regs32); 325 tsk->thread.saved_gs = get_user_gs(info->regs32);
@@ -343,7 +342,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
343 __asm__ __volatile__( 342 __asm__ __volatile__(
344 "movl %0,%%esp\n\t" 343 "movl %0,%%esp\n\t"
345 "movl %1,%%ebp\n\t" 344 "movl %1,%%ebp\n\t"
345#ifdef CONFIG_X86_32_LAZY_GS
346 "mov %2, %%gs\n\t" 346 "mov %2, %%gs\n\t"
347#endif
347 "jmp resume_userspace" 348 "jmp resume_userspace"
348 : /* no outputs */ 349 : /* no outputs */
349 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); 350 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 95deb9f2211e..b263423fbe2a 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -462,22 +462,28 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
462} 462}
463#endif 463#endif
464 464
465static void vmi_enter_lazy_cpu(void) 465static void vmi_start_context_switch(struct task_struct *prev)
466{ 466{
467 paravirt_enter_lazy_cpu(); 467 paravirt_start_context_switch(prev);
468 vmi_ops.set_lazy_mode(2); 468 vmi_ops.set_lazy_mode(2);
469} 469}
470 470
471static void vmi_end_context_switch(struct task_struct *next)
472{
473 vmi_ops.set_lazy_mode(0);
474 paravirt_end_context_switch(next);
475}
476
471static void vmi_enter_lazy_mmu(void) 477static void vmi_enter_lazy_mmu(void)
472{ 478{
473 paravirt_enter_lazy_mmu(); 479 paravirt_enter_lazy_mmu();
474 vmi_ops.set_lazy_mode(1); 480 vmi_ops.set_lazy_mode(1);
475} 481}
476 482
477static void vmi_leave_lazy(void) 483static void vmi_leave_lazy_mmu(void)
478{ 484{
479 paravirt_leave_lazy(paravirt_get_lazy_mode());
480 vmi_ops.set_lazy_mode(0); 485 vmi_ops.set_lazy_mode(0);
486 paravirt_leave_lazy_mmu();
481} 487}
482 488
483static inline int __init check_vmi_rom(struct vrom_header *rom) 489static inline int __init check_vmi_rom(struct vrom_header *rom)
@@ -711,14 +717,14 @@ static inline int __init activate_vmi(void)
711 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); 717 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
712 para_fill(pv_cpu_ops.io_delay, IODelay); 718 para_fill(pv_cpu_ops.io_delay, IODelay);
713 719
714 para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu, 720 para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
715 set_lazy_mode, SetLazyMode); 721 set_lazy_mode, SetLazyMode);
716 para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy, 722 para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
717 set_lazy_mode, SetLazyMode); 723 set_lazy_mode, SetLazyMode);
718 724
719 para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, 725 para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
720 set_lazy_mode, SetLazyMode); 726 set_lazy_mode, SetLazyMode);
721 para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy, 727 para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
722 set_lazy_mode, SetLazyMode); 728 set_lazy_mode, SetLazyMode);
723 729
724 /* user and kernel flush are just handled with different flags to FlushTLB */ 730 /* user and kernel flush are just handled with different flags to FlushTLB */
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 849ee611f013..4c85b2e2bb65 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -1,5 +1,431 @@
1/*
2 * ld script for the x86 kernel
3 *
4 * Historic 32-bit version written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
5 *
6 * Modernisation, unification and other changes and fixes:
7 * Copyright (C) 2007-2009 Sam Ravnborg <sam@ravnborg.org>
8 *
9 *
10 * Don't define absolute symbols until and unless you know that symbol
11 * value is should remain constant even if kernel image is relocated
12 * at run time. Absolute symbols are not relocated. If symbol value should
13 * change if kernel is relocated, make the symbol section relative and
14 * put it inside the section definition.
15 */
16
1#ifdef CONFIG_X86_32 17#ifdef CONFIG_X86_32
2# include "vmlinux_32.lds.S" 18#define LOAD_OFFSET __PAGE_OFFSET
3#else 19#else
4# include "vmlinux_64.lds.S" 20#define LOAD_OFFSET __START_KERNEL_map
5#endif 21#endif
22
23#include <asm-generic/vmlinux.lds.h>
24#include <asm/asm-offsets.h>
25#include <asm/thread_info.h>
26#include <asm/page_types.h>
27#include <asm/cache.h>
28#include <asm/boot.h>
29
30#undef i386 /* in case the preprocessor is a 32bit one */
31
32OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
33
34#ifdef CONFIG_X86_32
35OUTPUT_ARCH(i386)
36ENTRY(phys_startup_32)
37jiffies = jiffies_64;
38#else
39OUTPUT_ARCH(i386:x86-64)
40ENTRY(phys_startup_64)
41jiffies_64 = jiffies;
42#endif
43
44PHDRS {
45 text PT_LOAD FLAGS(5); /* R_E */
46 data PT_LOAD FLAGS(7); /* RWE */
47#ifdef CONFIG_X86_64
48 user PT_LOAD FLAGS(7); /* RWE */
49 data.init PT_LOAD FLAGS(7); /* RWE */
50#ifdef CONFIG_SMP
51 percpu PT_LOAD FLAGS(7); /* RWE */
52#endif
53 data.init2 PT_LOAD FLAGS(7); /* RWE */
54#endif
55 note PT_NOTE FLAGS(0); /* ___ */
56}
57
58SECTIONS
59{
60#ifdef CONFIG_X86_32
61 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
62 phys_startup_32 = startup_32 - LOAD_OFFSET;
63#else
64 . = __START_KERNEL;
65 phys_startup_64 = startup_64 - LOAD_OFFSET;
66#endif
67
68 /* Text and read-only data */
69
70 /* bootstrapping code */
71 .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
72 _text = .;
73 *(.text.head)
74 } :text = 0x9090
75
76 /* The rest of the text */
77 .text : AT(ADDR(.text) - LOAD_OFFSET) {
78#ifdef CONFIG_X86_32
79 /* not really needed, already page aligned */
80 . = ALIGN(PAGE_SIZE);
81 *(.text.page_aligned)
82#endif
83 . = ALIGN(8);
84 _stext = .;
85 TEXT_TEXT
86 SCHED_TEXT
87 LOCK_TEXT
88 KPROBES_TEXT
89 IRQENTRY_TEXT
90 *(.fixup)
91 *(.gnu.warning)
92 /* End of text section */
93 _etext = .;
94 } :text = 0x9090
95
96 NOTES :text :note
97
98 /* Exception table */
99 . = ALIGN(16);
100 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
101 __start___ex_table = .;
102 *(__ex_table)
103 __stop___ex_table = .;
104 } :text = 0x9090
105
106 RODATA
107
108 /* Data */
109 . = ALIGN(PAGE_SIZE);
110 .data : AT(ADDR(.data) - LOAD_OFFSET) {
111 DATA_DATA
112 CONSTRUCTORS
113
114#ifdef CONFIG_X86_64
115 /* End of data section */
116 _edata = .;
117#endif
118 } :data
119
120#ifdef CONFIG_X86_32
121 /* 32 bit has nosave before _edata */
122 . = ALIGN(PAGE_SIZE);
123 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
124 __nosave_begin = .;
125 *(.data.nosave)
126 . = ALIGN(PAGE_SIZE);
127 __nosave_end = .;
128 }
129#endif
130
131 . = ALIGN(PAGE_SIZE);
132 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
133 *(.data.page_aligned)
134 *(.data.idt)
135 }
136
137#ifdef CONFIG_X86_32
138 . = ALIGN(32);
139#else
140 . = ALIGN(PAGE_SIZE);
141 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
142#endif
143 .data.cacheline_aligned :
144 AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
145 *(.data.cacheline_aligned)
146 }
147
148 /* rarely changed data like cpu maps */
149#ifdef CONFIG_X86_32
150 . = ALIGN(32);
151#else
152 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
153#endif
154 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
155 *(.data.read_mostly)
156
157#ifdef CONFIG_X86_32
158 /* End of data section */
159 _edata = .;
160#endif
161 }
162
163#ifdef CONFIG_X86_64
164
165#define VSYSCALL_ADDR (-10*1024*1024)
166#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \
167 SIZEOF(.data.read_mostly) + 4095) & ~(4095))
168#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \
169 SIZEOF(.data.read_mostly) + 4095) & ~(4095))
170
171#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
172#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
173
174#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
175#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
176
177 . = VSYSCALL_ADDR;
178 .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) {
179 *(.vsyscall_0)
180 } :user
181
182 __vsyscall_0 = VSYSCALL_VIRT_ADDR;
183
184 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
185 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
186 *(.vsyscall_fn)
187 }
188
189 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
190 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
191 *(.vsyscall_gtod_data)
192 }
193
194 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
195 .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
196 *(.vsyscall_clock)
197 }
198 vsyscall_clock = VVIRT(.vsyscall_clock);
199
200
201 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
202 *(.vsyscall_1)
203 }
204 .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
205 *(.vsyscall_2)
206 }
207
208 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
209 *(.vgetcpu_mode)
210 }
211 vgetcpu_mode = VVIRT(.vgetcpu_mode);
212
213 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
214 .jiffies : AT(VLOAD(.jiffies)) {
215 *(.jiffies)
216 }
217 jiffies = VVIRT(.jiffies);
218
219 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
220 *(.vsyscall_3)
221 }
222
223 . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
224
225#undef VSYSCALL_ADDR
226#undef VSYSCALL_PHYS_ADDR
227#undef VSYSCALL_VIRT_ADDR
228#undef VLOAD_OFFSET
229#undef VLOAD
230#undef VVIRT_OFFSET
231#undef VVIRT
232
233#endif /* CONFIG_X86_64 */
234
235 /* init_task */
236 . = ALIGN(THREAD_SIZE);
237 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
238 *(.data.init_task)
239 }
240#ifdef CONFIG_X86_64
241 :data.init
242#endif
243
244 /*
245 * smp_locks might be freed after init
246 * start/end must be page aligned
247 */
248 . = ALIGN(PAGE_SIZE);
249 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
250 __smp_locks = .;
251 *(.smp_locks)
252 __smp_locks_end = .;
253 . = ALIGN(PAGE_SIZE);
254 }
255
256 /* Init code and data - will be freed after init */
257 . = ALIGN(PAGE_SIZE);
258 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
259 __init_begin = .; /* paired with __init_end */
260 _sinittext = .;
261 INIT_TEXT
262 _einittext = .;
263 }
264
265 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
266 INIT_DATA
267 }
268
269 . = ALIGN(16);
270 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
271 __setup_start = .;
272 *(.init.setup)
273 __setup_end = .;
274 }
275 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
276 __initcall_start = .;
277 INITCALLS
278 __initcall_end = .;
279 }
280
281 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
282 __con_initcall_start = .;
283 *(.con_initcall.init)
284 __con_initcall_end = .;
285 }
286
287 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
288 __x86_cpu_dev_start = .;
289 *(.x86_cpu_dev.init)
290 __x86_cpu_dev_end = .;
291 }
292
293 SECURITY_INIT
294
295 . = ALIGN(8);
296 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
297 __parainstructions = .;
298 *(.parainstructions)
299 __parainstructions_end = .;
300 }
301
302 . = ALIGN(8);
303 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
304 __alt_instructions = .;
305 *(.altinstructions)
306 __alt_instructions_end = .;
307 }
308
309 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
310 *(.altinstr_replacement)
311 }
312
313 /*
314 * .exit.text is discarded at runtime, not link time, to deal with
315 * references from .altinstructions and .eh_frame
316 */
317 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
318 EXIT_TEXT
319 }
320
321 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
322 EXIT_DATA
323 }
324
325#ifdef CONFIG_BLK_DEV_INITRD
326 . = ALIGN(PAGE_SIZE);
327 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
328 __initramfs_start = .;
329 *(.init.ramfs)
330 __initramfs_end = .;
331 }
332#endif
333
334#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
335 /*
336 * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
337 * output PHDR, so the next output section - .data_nosave - should
338 * start another segment data.init2. Also, pda should be at the head of
339 * percpu area. Preallocate it and define the percpu offset symbol
340 * so that it can be accessed as a percpu variable.
341 */
342 . = ALIGN(PAGE_SIZE);
343 PERCPU_VADDR(0, :percpu)
344#else
345 PERCPU(PAGE_SIZE)
346#endif
347
348 . = ALIGN(PAGE_SIZE);
349
350 /* freed after init ends here */
351 .init.end : AT(ADDR(.init.end) - LOAD_OFFSET) {
352 __init_end = .;
353 }
354
355#ifdef CONFIG_X86_64
356 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
357 . = ALIGN(PAGE_SIZE);
358 __nosave_begin = .;
359 *(.data.nosave)
360 . = ALIGN(PAGE_SIZE);
361 __nosave_end = .;
362 } :data.init2
363 /* use another segment data.init2, see PERCPU_VADDR() above */
364#endif
365
366 /* BSS */
367 . = ALIGN(PAGE_SIZE);
368 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
369 __bss_start = .;
370 *(.bss.page_aligned)
371 *(.bss)
372 . = ALIGN(4);
373 __bss_stop = .;
374 }
375
376 . = ALIGN(PAGE_SIZE);
377 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
378 __brk_base = .;
379 . += 64 * 1024; /* 64k alignment slop space */
380 *(.brk_reservation) /* areas brk users have reserved */
381 __brk_limit = .;
382 }
383
384 .end : AT(ADDR(.end) - LOAD_OFFSET) {
385 _end = .;
386 }
387
388 /* Sections to be discarded */
389 /DISCARD/ : {
390 *(.exitcall.exit)
391 *(.eh_frame)
392 *(.discard)
393 }
394
395 STABS_DEBUG
396 DWARF_DEBUG
397}
398
399
400#ifdef CONFIG_X86_32
401ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
402 "kernel image bigger than KERNEL_IMAGE_SIZE")
403#else
404/*
405 * Per-cpu symbols which need to be offset from __per_cpu_load
406 * for the boot processor.
407 */
408#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
409INIT_PER_CPU(gdt_page);
410INIT_PER_CPU(irq_stack_union);
411
412/*
413 * Build-time check on the image size:
414 */
415ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
416 "kernel image bigger than KERNEL_IMAGE_SIZE")
417
418#ifdef CONFIG_SMP
419ASSERT((per_cpu__irq_stack_union == 0),
420 "irq_stack_union is not at start of per-cpu area");
421#endif
422
423#endif /* CONFIG_X86_32 */
424
425#ifdef CONFIG_KEXEC
426#include <asm/kexec.h>
427
428ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
429 "kexec control code size is too big")
430#endif
431
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
deleted file mode 100644
index 62ad500d55f3..000000000000
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ /dev/null
@@ -1,229 +0,0 @@
1/* ld script to make i386 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 *
4 * Don't define absolute symbols until and unless you know that symbol
5 * value should remain constant even if kernel image is relocated
6 * at run time. Absolute symbols are not relocated. If symbol value should
7 * change if kernel is relocated, make the symbol section relative and
8 * put it inside the section definition.
9 */
10
11#define LOAD_OFFSET __PAGE_OFFSET
12
13#include <asm-generic/vmlinux.lds.h>
14#include <asm/thread_info.h>
15#include <asm/page_types.h>
16#include <asm/cache.h>
17#include <asm/boot.h>
18
19OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
20OUTPUT_ARCH(i386)
21ENTRY(phys_startup_32)
22jiffies = jiffies_64;
23
24PHDRS {
25 text PT_LOAD FLAGS(5); /* R_E */
26 data PT_LOAD FLAGS(7); /* RWE */
27 note PT_NOTE FLAGS(0); /* ___ */
28}
29SECTIONS
30{
31 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
32 phys_startup_32 = startup_32 - LOAD_OFFSET;
33
34 .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
35 _text = .; /* Text and read-only data */
36 *(.text.head)
37 } :text = 0x9090
38
39 /* read-only */
40 .text : AT(ADDR(.text) - LOAD_OFFSET) {
41 . = ALIGN(PAGE_SIZE); /* not really needed, already page aligned */
42 *(.text.page_aligned)
43 TEXT_TEXT
44 SCHED_TEXT
45 LOCK_TEXT
46 KPROBES_TEXT
47 IRQENTRY_TEXT
48 *(.fixup)
49 *(.gnu.warning)
50 _etext = .; /* End of text section */
51 } :text = 0x9090
52
53 NOTES :text :note
54
55 . = ALIGN(16); /* Exception table */
56 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
57 __start___ex_table = .;
58 *(__ex_table)
59 __stop___ex_table = .;
60 } :text = 0x9090
61
62 RODATA
63
64 /* writeable */
65 . = ALIGN(PAGE_SIZE);
66 .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */
67 DATA_DATA
68 CONSTRUCTORS
69 } :data
70
71 . = ALIGN(PAGE_SIZE);
72 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
73 __nosave_begin = .;
74 *(.data.nosave)
75 . = ALIGN(PAGE_SIZE);
76 __nosave_end = .;
77 }
78
79 . = ALIGN(PAGE_SIZE);
80 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
81 *(.data.page_aligned)
82 *(.data.idt)
83 }
84
85 . = ALIGN(32);
86 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
87 *(.data.cacheline_aligned)
88 }
89
90 /* rarely changed data like cpu maps */
91 . = ALIGN(32);
92 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
93 *(.data.read_mostly)
94 _edata = .; /* End of data section */
95 }
96
97 . = ALIGN(THREAD_SIZE); /* init_task */
98 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
99 *(.data.init_task)
100 }
101
102 /* might get freed after init */
103 . = ALIGN(PAGE_SIZE);
104 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
105 __smp_locks = .;
106 *(.smp_locks)
107 __smp_locks_end = .;
108 }
109 /* will be freed after init
110 * Following ALIGN() is required to make sure no other data falls on the
111 * same page where __smp_alt_end is pointing as that page might be freed
112 * after boot. Always make sure that ALIGN() directive is present after
113 * the section which contains __smp_alt_end.
114 */
115 . = ALIGN(PAGE_SIZE);
116
117 /* will be freed after init */
118 . = ALIGN(PAGE_SIZE); /* Init code and data */
119 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
120 __init_begin = .;
121 _sinittext = .;
122 INIT_TEXT
123 _einittext = .;
124 }
125 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
126 INIT_DATA
127 }
128 . = ALIGN(16);
129 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
130 __setup_start = .;
131 *(.init.setup)
132 __setup_end = .;
133 }
134 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
135 __initcall_start = .;
136 INITCALLS
137 __initcall_end = .;
138 }
139 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
140 __con_initcall_start = .;
141 *(.con_initcall.init)
142 __con_initcall_end = .;
143 }
144 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
145 __x86_cpu_dev_start = .;
146 *(.x86_cpu_dev.init)
147 __x86_cpu_dev_end = .;
148 }
149 SECURITY_INIT
150 . = ALIGN(4);
151 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
152 __alt_instructions = .;
153 *(.altinstructions)
154 __alt_instructions_end = .;
155 }
156 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
157 *(.altinstr_replacement)
158 }
159 . = ALIGN(4);
160 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
161 __parainstructions = .;
162 *(.parainstructions)
163 __parainstructions_end = .;
164 }
165 /* .exit.text is discarded at runtime, not link time, to deal with references
166 from .altinstructions and .eh_frame */
167 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
168 EXIT_TEXT
169 }
170 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
171 EXIT_DATA
172 }
173#if defined(CONFIG_BLK_DEV_INITRD)
174 . = ALIGN(PAGE_SIZE);
175 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
176 __initramfs_start = .;
177 *(.init.ramfs)
178 __initramfs_end = .;
179 }
180#endif
181 PERCPU(PAGE_SIZE)
182 . = ALIGN(PAGE_SIZE);
183 /* freed after init ends here */
184
185 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
186 __init_end = .;
187 __bss_start = .; /* BSS */
188 *(.bss.page_aligned)
189 *(.bss)
190 . = ALIGN(4);
191 __bss_stop = .;
192 }
193
194 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
195 . = ALIGN(PAGE_SIZE);
196 __brk_base = . ;
197 . += 64 * 1024 ; /* 64k alignment slop space */
198 *(.brk_reservation) /* areas brk users have reserved */
199 __brk_limit = . ;
200 }
201
202 .end : AT(ADDR(.end) - LOAD_OFFSET) {
203 _end = . ;
204 }
205
206 /* Sections to be discarded */
207 /DISCARD/ : {
208 *(.exitcall.exit)
209 *(.discard)
210 }
211
212 STABS_DEBUG
213
214 DWARF_DEBUG
215}
216
217/*
218 * Build-time check on the image size:
219 */
220ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
221 "kernel image bigger than KERNEL_IMAGE_SIZE")
222
223#ifdef CONFIG_KEXEC
224/* Link time checks */
225#include <asm/kexec.h>
226
227ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
228 "kexec control code size is too big")
229#endif
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
deleted file mode 100644
index c8742507b030..000000000000
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ /dev/null
@@ -1,298 +0,0 @@
1/* ld script to make x86-64 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 */
4
5#define LOAD_OFFSET __START_KERNEL_map
6
7#include <asm-generic/vmlinux.lds.h>
8#include <asm/asm-offsets.h>
9#include <asm/page_types.h>
10
11#undef i386 /* in case the preprocessor is a 32bit one */
12
13OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
14OUTPUT_ARCH(i386:x86-64)
15ENTRY(phys_startup_64)
16jiffies_64 = jiffies;
17PHDRS {
18 text PT_LOAD FLAGS(5); /* R_E */
19 data PT_LOAD FLAGS(7); /* RWE */
20 user PT_LOAD FLAGS(7); /* RWE */
21 data.init PT_LOAD FLAGS(7); /* RWE */
22#ifdef CONFIG_SMP
23 percpu PT_LOAD FLAGS(7); /* RWE */
24#endif
25 data.init2 PT_LOAD FLAGS(7); /* RWE */
26 note PT_NOTE FLAGS(0); /* ___ */
27}
28SECTIONS
29{
30 . = __START_KERNEL;
31 phys_startup_64 = startup_64 - LOAD_OFFSET;
32 .text : AT(ADDR(.text) - LOAD_OFFSET) {
33 _text = .; /* Text and read-only data */
34 /* First the code that has to be first for bootstrapping */
35 *(.text.head)
36 _stext = .;
37 /* Then the rest */
38 TEXT_TEXT
39 SCHED_TEXT
40 LOCK_TEXT
41 KPROBES_TEXT
42 IRQENTRY_TEXT
43 *(.fixup)
44 *(.gnu.warning)
45 _etext = .; /* End of text section */
46 } :text = 0x9090
47
48 NOTES :text :note
49
50 . = ALIGN(16); /* Exception table */
51 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
52 __start___ex_table = .;
53 *(__ex_table)
54 __stop___ex_table = .;
55 } :text = 0x9090
56
57 RODATA
58
59 . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */
60 /* Data */
61 .data : AT(ADDR(.data) - LOAD_OFFSET) {
62 DATA_DATA
63 CONSTRUCTORS
64 _edata = .; /* End of data section */
65 } :data
66
67
68 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
69 . = ALIGN(PAGE_SIZE);
70 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
71 *(.data.cacheline_aligned)
72 }
73 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
74 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
75 *(.data.read_mostly)
76 }
77
78#define VSYSCALL_ADDR (-10*1024*1024)
79#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
80#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
81
82#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
83#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
84
85#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
86#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
87
88 . = VSYSCALL_ADDR;
89 .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
90 __vsyscall_0 = VSYSCALL_VIRT_ADDR;
91
92 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
93 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) }
94 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
95 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
96 { *(.vsyscall_gtod_data) }
97 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
98 .vsyscall_clock : AT(VLOAD(.vsyscall_clock))
99 { *(.vsyscall_clock) }
100 vsyscall_clock = VVIRT(.vsyscall_clock);
101
102
103 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
104 { *(.vsyscall_1) }
105 .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2))
106 { *(.vsyscall_2) }
107
108 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
109 vgetcpu_mode = VVIRT(.vgetcpu_mode);
110
111 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
112 .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
113 jiffies = VVIRT(.jiffies);
114
115 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3))
116 { *(.vsyscall_3) }
117
118 . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
119
120#undef VSYSCALL_ADDR
121#undef VSYSCALL_PHYS_ADDR
122#undef VSYSCALL_VIRT_ADDR
123#undef VLOAD_OFFSET
124#undef VLOAD
125#undef VVIRT_OFFSET
126#undef VVIRT
127
128 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
129 . = ALIGN(THREAD_SIZE); /* init_task */
130 *(.data.init_task)
131 }:data.init
132
133 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
134 . = ALIGN(PAGE_SIZE);
135 *(.data.page_aligned)
136 }
137
138 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
139 /* might get freed after init */
140 . = ALIGN(PAGE_SIZE);
141 __smp_alt_begin = .;
142 __smp_locks = .;
143 *(.smp_locks)
144 __smp_locks_end = .;
145 . = ALIGN(PAGE_SIZE);
146 __smp_alt_end = .;
147 }
148
149 . = ALIGN(PAGE_SIZE); /* Init code and data */
150 __init_begin = .; /* paired with __init_end */
151 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
152 _sinittext = .;
153 INIT_TEXT
154 _einittext = .;
155 }
156 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
157 __initdata_begin = .;
158 INIT_DATA
159 __initdata_end = .;
160 }
161
162 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
163 . = ALIGN(16);
164 __setup_start = .;
165 *(.init.setup)
166 __setup_end = .;
167 }
168 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
169 __initcall_start = .;
170 INITCALLS
171 __initcall_end = .;
172 }
173 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
174 __con_initcall_start = .;
175 *(.con_initcall.init)
176 __con_initcall_end = .;
177 }
178 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
179 __x86_cpu_dev_start = .;
180 *(.x86_cpu_dev.init)
181 __x86_cpu_dev_end = .;
182 }
183 SECURITY_INIT
184
185 . = ALIGN(8);
186 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
187 __parainstructions = .;
188 *(.parainstructions)
189 __parainstructions_end = .;
190 }
191
192 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
193 . = ALIGN(8);
194 __alt_instructions = .;
195 *(.altinstructions)
196 __alt_instructions_end = .;
197 }
198 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
199 *(.altinstr_replacement)
200 }
201 /* .exit.text is discarded at runtime, not link time, to deal with references
202 from .altinstructions and .eh_frame */
203 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
204 EXIT_TEXT
205 }
206 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
207 EXIT_DATA
208 }
209
210#ifdef CONFIG_BLK_DEV_INITRD
211 . = ALIGN(PAGE_SIZE);
212 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
213 __initramfs_start = .;
214 *(.init.ramfs)
215 __initramfs_end = .;
216 }
217#endif
218
219#ifdef CONFIG_SMP
220 /*
221 * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
222 * output PHDR, so the next output section - __data_nosave - should
223 * start another section data.init2. Also, pda should be at the head of
224 * percpu area. Preallocate it and define the percpu offset symbol
225 * so that it can be accessed as a percpu variable.
226 */
227 . = ALIGN(PAGE_SIZE);
228 PERCPU_VADDR(0, :percpu)
229#else
230 PERCPU(PAGE_SIZE)
231#endif
232
233 . = ALIGN(PAGE_SIZE);
234 __init_end = .;
235
236 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
237 . = ALIGN(PAGE_SIZE);
238 __nosave_begin = .;
239 *(.data.nosave)
240 . = ALIGN(PAGE_SIZE);
241 __nosave_end = .;
242 } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
243
244 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
245 . = ALIGN(PAGE_SIZE);
246 __bss_start = .; /* BSS */
247 *(.bss.page_aligned)
248 *(.bss)
249 __bss_stop = .;
250 }
251
252 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
253 . = ALIGN(PAGE_SIZE);
254 __brk_base = . ;
255 . += 64 * 1024 ; /* 64k alignment slop space */
256 *(.brk_reservation) /* areas brk users have reserved */
257 __brk_limit = . ;
258 }
259
260 _end = . ;
261
262 /* Sections to be discarded */
263 /DISCARD/ : {
264 *(.exitcall.exit)
265 *(.eh_frame)
266 *(.discard)
267 }
268
269 STABS_DEBUG
270
271 DWARF_DEBUG
272}
273
274 /*
275 * Per-cpu symbols which need to be offset from __per_cpu_load
276 * for the boot processor.
277 */
278#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
279INIT_PER_CPU(gdt_page);
280INIT_PER_CPU(irq_stack_union);
281
282/*
283 * Build-time check on the image size:
284 */
285ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
286 "kernel image bigger than KERNEL_IMAGE_SIZE")
287
288#ifdef CONFIG_SMP
289ASSERT((per_cpu__irq_stack_union == 0),
290 "irq_stack_union is not at start of per-cpu area");
291#endif
292
293#ifdef CONFIG_KEXEC
294#include <asm/kexec.h>
295
296ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
297 "kexec control code size is too big")
298#endif
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 44153afc9067..25ee06a80aad 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -132,15 +132,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
132 return; 132 return;
133 } 133 }
134 134
135 /*
136 * Surround the RDTSC by barriers, to make sure it's not
137 * speculated to outside the seqlock critical section and
138 * does not cause time warps:
139 */
140 rdtsc_barrier();
141 now = vread(); 135 now = vread();
142 rdtsc_barrier();
143
144 base = __vsyscall_gtod_data.clock.cycle_last; 136 base = __vsyscall_gtod_data.clock.cycle_last;
145 mask = __vsyscall_gtod_data.clock.mask; 137 mask = __vsyscall_gtod_data.clock.mask;
146 mult = __vsyscall_gtod_data.clock.mult; 138 mult = __vsyscall_gtod_data.clock.mult;
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 33a93b417396..4e0c26559395 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -167,10 +167,16 @@ static void lazy_hcall3(unsigned long call,
167 167
168/* When lazy mode is turned off reset the per-cpu lazy mode variable and then 168/* When lazy mode is turned off reset the per-cpu lazy mode variable and then
169 * issue the do-nothing hypercall to flush any stored calls. */ 169 * issue the do-nothing hypercall to flush any stored calls. */
170static void lguest_leave_lazy_mode(void) 170static void lguest_leave_lazy_mmu_mode(void)
171{ 171{
172 paravirt_leave_lazy(paravirt_get_lazy_mode());
173 kvm_hypercall0(LHCALL_FLUSH_ASYNC); 172 kvm_hypercall0(LHCALL_FLUSH_ASYNC);
173 paravirt_leave_lazy_mmu();
174}
175
176static void lguest_end_context_switch(struct task_struct *next)
177{
178 kvm_hypercall0(LHCALL_FLUSH_ASYNC);
179 paravirt_end_context_switch(next);
174} 180}
175 181
176/*G:033 182/*G:033
@@ -637,7 +643,7 @@ static void __init lguest_init_IRQ(void)
637 643
638void lguest_setup_irq(unsigned int irq) 644void lguest_setup_irq(unsigned int irq)
639{ 645{
640 irq_to_desc_alloc_cpu(irq, 0); 646 irq_to_desc_alloc_node(irq, 0);
641 set_irq_chip_and_handler_name(irq, &lguest_irq_controller, 647 set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
642 handle_level_irq, "level"); 648 handle_level_irq, "level");
643} 649}
@@ -1054,8 +1060,8 @@ __init void lguest_init(void)
1054 pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry; 1060 pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
1055 pv_cpu_ops.write_idt_entry = lguest_write_idt_entry; 1061 pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
1056 pv_cpu_ops.wbinvd = lguest_wbinvd; 1062 pv_cpu_ops.wbinvd = lguest_wbinvd;
1057 pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu; 1063 pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
1058 pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode; 1064 pv_cpu_ops.end_context_switch = lguest_end_context_switch;
1059 1065
1060 /* pagetable management */ 1066 /* pagetable management */
1061 pv_mmu_ops.write_cr3 = lguest_write_cr3; 1067 pv_mmu_ops.write_cr3 = lguest_write_cr3;
@@ -1068,7 +1074,7 @@ __init void lguest_init(void)
1068 pv_mmu_ops.read_cr2 = lguest_read_cr2; 1074 pv_mmu_ops.read_cr2 = lguest_read_cr2;
1069 pv_mmu_ops.read_cr3 = lguest_read_cr3; 1075 pv_mmu_ops.read_cr3 = lguest_read_cr3;
1070 pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; 1076 pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
1071 pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode; 1077 pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
1072 pv_mmu_ops.pte_update = lguest_pte_update; 1078 pv_mmu_ops.pte_update = lguest_pte_update;
1073 pv_mmu_ops.pte_update_defer = lguest_pte_update; 1079 pv_mmu_ops.pte_update_defer = lguest_pte_update;
1074 1080
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index e7277cbcfb40..a725b7f760ae 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -161,13 +161,14 @@ static void note_page(struct seq_file *m, struct pg_state *st,
161 st->current_address >= st->marker[1].start_address) { 161 st->current_address >= st->marker[1].start_address) {
162 const char *unit = units; 162 const char *unit = units;
163 unsigned long delta; 163 unsigned long delta;
164 int width = sizeof(unsigned long) * 2;
164 165
165 /* 166 /*
166 * Now print the actual finished series 167 * Now print the actual finished series
167 */ 168 */
168 seq_printf(m, "0x%p-0x%p ", 169 seq_printf(m, "0x%0*lx-0x%0*lx ",
169 (void *)st->start_address, 170 width, st->start_address,
170 (void *)st->current_address); 171 width, st->current_address);
171 172
172 delta = (st->current_address - st->start_address) >> 10; 173 delta = (st->current_address - st->start_address) >> 10;
173 while (!(delta & 1023) && unit[1]) { 174 while (!(delta & 1023) && unit[1]) {
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a03b7279efa0..5ec7ae366615 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -3,40 +3,16 @@
3 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. 3 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
4 * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar 4 * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
5 */ 5 */
6#include <linux/interrupt.h> 6#include <linux/magic.h> /* STACK_END_MAGIC */
7#include <linux/mmiotrace.h> 7#include <linux/sched.h> /* test_thread_flag(), ... */
8#include <linux/bootmem.h> 8#include <linux/kdebug.h> /* oops_begin/end, ... */
9#include <linux/compiler.h> 9#include <linux/module.h> /* search_exception_table */
10#include <linux/highmem.h> 10#include <linux/bootmem.h> /* max_low_pfn */
11#include <linux/kprobes.h> 11#include <linux/kprobes.h> /* __kprobes, ... */
12#include <linux/uaccess.h> 12#include <linux/mmiotrace.h> /* kmmio_handler, ... */
13#include <linux/vmalloc.h> 13
14#include <linux/vt_kern.h> 14#include <asm/traps.h> /* dotraplinkage, ... */
15#include <linux/signal.h> 15#include <asm/pgalloc.h> /* pgd_*(), ... */
16#include <linux/kernel.h>
17#include <linux/ptrace.h>
18#include <linux/string.h>
19#include <linux/module.h>
20#include <linux/kdebug.h>
21#include <linux/errno.h>
22#include <linux/magic.h>
23#include <linux/sched.h>
24#include <linux/types.h>
25#include <linux/init.h>
26#include <linux/mman.h>
27#include <linux/tty.h>
28#include <linux/smp.h>
29#include <linux/mm.h>
30
31#include <asm-generic/sections.h>
32
33#include <asm/tlbflush.h>
34#include <asm/pgalloc.h>
35#include <asm/segment.h>
36#include <asm/system.h>
37#include <asm/proto.h>
38#include <asm/traps.h>
39#include <asm/desc.h>
40 16
41/* 17/*
42 * Page fault error code bits: 18 * Page fault error code bits:
@@ -225,12 +201,10 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
225 if (!pmd_present(*pmd_k)) 201 if (!pmd_present(*pmd_k))
226 return NULL; 202 return NULL;
227 203
228 if (!pmd_present(*pmd)) { 204 if (!pmd_present(*pmd))
229 set_pmd(pmd, *pmd_k); 205 set_pmd(pmd, *pmd_k);
230 arch_flush_lazy_mmu_mode(); 206 else
231 } else {
232 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); 207 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
233 }
234 208
235 return pmd_k; 209 return pmd_k;
236} 210}
@@ -538,8 +512,6 @@ bad:
538static int is_errata93(struct pt_regs *regs, unsigned long address) 512static int is_errata93(struct pt_regs *regs, unsigned long address)
539{ 513{
540#ifdef CONFIG_X86_64 514#ifdef CONFIG_X86_64
541 static int once;
542
543 if (address != regs->ip) 515 if (address != regs->ip)
544 return 0; 516 return 0;
545 517
@@ -549,10 +521,7 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
549 address |= 0xffffffffUL << 32; 521 address |= 0xffffffffUL << 32;
550 if ((address >= (u64)_stext && address <= (u64)_etext) || 522 if ((address >= (u64)_stext && address <= (u64)_etext) ||
551 (address >= MODULES_VADDR && address <= MODULES_END)) { 523 (address >= MODULES_VADDR && address <= MODULES_END)) {
552 if (!once) { 524 printk_once(errata93_warning);
553 printk(errata93_warning);
554 once = 1;
555 }
556 regs->ip = address; 525 regs->ip = address;
557 return 1; 526 return 1;
558 } 527 }
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 8126e8d1a2a4..58f621e81919 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -44,7 +44,6 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
44 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); 44 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
45 BUG_ON(!pte_none(*(kmap_pte-idx))); 45 BUG_ON(!pte_none(*(kmap_pte-idx)));
46 set_pte(kmap_pte-idx, mk_pte(page, prot)); 46 set_pte(kmap_pte-idx, mk_pte(page, prot));
47 arch_flush_lazy_mmu_mode();
48 47
49 return (void *)vaddr; 48 return (void *)vaddr;
50} 49}
@@ -74,7 +73,6 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
74#endif 73#endif
75 } 74 }
76 75
77 arch_flush_lazy_mmu_mode();
78 pagefault_enable(); 76 pagefault_enable();
79} 77}
80 78
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index ae4f7b5d7104..34c1bfb64f1c 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1,3 +1,4 @@
1#include <linux/initrd.h>
1#include <linux/ioport.h> 2#include <linux/ioport.h>
2#include <linux/swap.h> 3#include <linux/swap.h>
3 4
@@ -10,6 +11,9 @@
10#include <asm/setup.h> 11#include <asm/setup.h>
11#include <asm/system.h> 12#include <asm/system.h>
12#include <asm/tlbflush.h> 13#include <asm/tlbflush.h>
14#include <asm/tlb.h>
15
16DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
13 17
14unsigned long __initdata e820_table_start; 18unsigned long __initdata e820_table_start;
15unsigned long __meminitdata e820_table_end; 19unsigned long __meminitdata e820_table_end;
@@ -23,6 +27,69 @@ int direct_gbpages
23#endif 27#endif
24; 28;
25 29
30int nx_enabled;
31
32#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
33static int disable_nx __cpuinitdata;
34
35/*
36 * noexec = on|off
37 *
38 * Control non-executable mappings for processes.
39 *
40 * on Enable
41 * off Disable
42 */
43static int __init noexec_setup(char *str)
44{
45 if (!str)
46 return -EINVAL;
47 if (!strncmp(str, "on", 2)) {
48 __supported_pte_mask |= _PAGE_NX;
49 disable_nx = 0;
50 } else if (!strncmp(str, "off", 3)) {
51 disable_nx = 1;
52 __supported_pte_mask &= ~_PAGE_NX;
53 }
54 return 0;
55}
56early_param("noexec", noexec_setup);
57#endif
58
59#ifdef CONFIG_X86_PAE
60static void __init set_nx(void)
61{
62 unsigned int v[4], l, h;
63
64 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
65 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
66
67 if ((v[3] & (1 << 20)) && !disable_nx) {
68 rdmsr(MSR_EFER, l, h);
69 l |= EFER_NX;
70 wrmsr(MSR_EFER, l, h);
71 nx_enabled = 1;
72 __supported_pte_mask |= _PAGE_NX;
73 }
74 }
75}
76#else
77static inline void set_nx(void)
78{
79}
80#endif
81
82#ifdef CONFIG_X86_64
83void __cpuinit check_efer(void)
84{
85 unsigned long efer;
86
87 rdmsrl(MSR_EFER, efer);
88 if (!(efer & EFER_NX) || disable_nx)
89 __supported_pte_mask &= ~_PAGE_NX;
90}
91#endif
92
26static void __init find_early_table_space(unsigned long end, int use_pse, 93static void __init find_early_table_space(unsigned long end, int use_pse,
27 int use_gbpages) 94 int use_gbpages)
28{ 95{
@@ -66,12 +133,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
66 */ 133 */
67#ifdef CONFIG_X86_32 134#ifdef CONFIG_X86_32
68 start = 0x7000; 135 start = 0x7000;
69 e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT, 136#else
70 tables, PAGE_SIZE);
71#else /* CONFIG_X86_64 */
72 start = 0x8000; 137 start = 0x8000;
73 e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE);
74#endif 138#endif
139 e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
140 tables, PAGE_SIZE);
75 if (e820_table_start == -1UL) 141 if (e820_table_start == -1UL)
76 panic("Cannot find space for the kernel page tables"); 142 panic("Cannot find space for the kernel page tables");
77 143
@@ -159,12 +225,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
159 use_gbpages = direct_gbpages; 225 use_gbpages = direct_gbpages;
160#endif 226#endif
161 227
162#ifdef CONFIG_X86_32
163#ifdef CONFIG_X86_PAE
164 set_nx(); 228 set_nx();
165 if (nx_enabled) 229 if (nx_enabled)
166 printk(KERN_INFO "NX (Execute Disable) protection: active\n"); 230 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
167#endif
168 231
169 /* Enable PSE if available */ 232 /* Enable PSE if available */
170 if (cpu_has_pse) 233 if (cpu_has_pse)
@@ -175,7 +238,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
175 set_in_cr4(X86_CR4_PGE); 238 set_in_cr4(X86_CR4_PGE);
176 __supported_pte_mask |= _PAGE_GLOBAL; 239 __supported_pte_mask |= _PAGE_GLOBAL;
177 } 240 }
178#endif
179 241
180 if (use_gbpages) 242 if (use_gbpages)
181 page_size_mask |= 1 << PG_LEVEL_1G; 243 page_size_mask |= 1 << PG_LEVEL_1G;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 749559ed80f5..949708d7a481 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,12 +49,9 @@
49#include <asm/paravirt.h> 49#include <asm/paravirt.h>
50#include <asm/setup.h> 50#include <asm/setup.h>
51#include <asm/cacheflush.h> 51#include <asm/cacheflush.h>
52#include <asm/page_types.h>
52#include <asm/init.h> 53#include <asm/init.h>
53 54
54unsigned long max_low_pfn_mapped;
55unsigned long max_pfn_mapped;
56
57DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
58unsigned long highstart_pfn, highend_pfn; 55unsigned long highstart_pfn, highend_pfn;
59 56
60static noinline int do_test_wp_bit(void); 57static noinline int do_test_wp_bit(void);
@@ -587,61 +584,9 @@ void zap_low_mappings(void)
587 flush_tlb_all(); 584 flush_tlb_all();
588} 585}
589 586
590int nx_enabled;
591
592pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); 587pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
593EXPORT_SYMBOL_GPL(__supported_pte_mask); 588EXPORT_SYMBOL_GPL(__supported_pte_mask);
594 589
595#ifdef CONFIG_X86_PAE
596
597static int disable_nx __initdata;
598
599/*
600 * noexec = on|off
601 *
602 * Control non executable mappings.
603 *
604 * on Enable
605 * off Disable
606 */
607static int __init noexec_setup(char *str)
608{
609 if (!str || !strcmp(str, "on")) {
610 if (cpu_has_nx) {
611 __supported_pte_mask |= _PAGE_NX;
612 disable_nx = 0;
613 }
614 } else {
615 if (!strcmp(str, "off")) {
616 disable_nx = 1;
617 __supported_pte_mask &= ~_PAGE_NX;
618 } else {
619 return -EINVAL;
620 }
621 }
622
623 return 0;
624}
625early_param("noexec", noexec_setup);
626
627void __init set_nx(void)
628{
629 unsigned int v[4], l, h;
630
631 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
632 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
633
634 if ((v[3] & (1 << 20)) && !disable_nx) {
635 rdmsr(MSR_EFER, l, h);
636 l |= EFER_NX;
637 wrmsr(MSR_EFER, l, h);
638 nx_enabled = 1;
639 __supported_pte_mask |= _PAGE_NX;
640 }
641 }
642}
643#endif
644
645/* user-defined highmem size */ 590/* user-defined highmem size */
646static unsigned int highmem_pages = -1; 591static unsigned int highmem_pages = -1;
647 592
@@ -761,15 +706,15 @@ void __init initmem_init(unsigned long start_pfn,
761 highstart_pfn = highend_pfn = max_pfn; 706 highstart_pfn = highend_pfn = max_pfn;
762 if (max_pfn > max_low_pfn) 707 if (max_pfn > max_low_pfn)
763 highstart_pfn = max_low_pfn; 708 highstart_pfn = max_low_pfn;
764 memory_present(0, 0, highend_pfn);
765 e820_register_active_regions(0, 0, highend_pfn); 709 e820_register_active_regions(0, 0, highend_pfn);
710 sparse_memory_present_with_active_regions(0);
766 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", 711 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
767 pages_to_mb(highend_pfn - highstart_pfn)); 712 pages_to_mb(highend_pfn - highstart_pfn));
768 num_physpages = highend_pfn; 713 num_physpages = highend_pfn;
769 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; 714 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
770#else 715#else
771 memory_present(0, 0, max_low_pfn);
772 e820_register_active_regions(0, 0, max_low_pfn); 716 e820_register_active_regions(0, 0, max_low_pfn);
717 sparse_memory_present_with_active_regions(0);
773 num_physpages = max_low_pfn; 718 num_physpages = max_low_pfn;
774 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; 719 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
775#endif 720#endif
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 1753e8020df6..52bb9519bb86 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -50,18 +50,8 @@
50#include <asm/cacheflush.h> 50#include <asm/cacheflush.h>
51#include <asm/init.h> 51#include <asm/init.h>
52 52
53/*
54 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
55 * The direct mapping extends to max_pfn_mapped, so that we can directly access
56 * apertures, ACPI and other tables without having to play with fixmaps.
57 */
58unsigned long max_low_pfn_mapped;
59unsigned long max_pfn_mapped;
60
61static unsigned long dma_reserve __initdata; 53static unsigned long dma_reserve __initdata;
62 54
63DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
64
65static int __init parse_direct_gbpages_off(char *arg) 55static int __init parse_direct_gbpages_off(char *arg)
66{ 56{
67 direct_gbpages = 0; 57 direct_gbpages = 0;
@@ -85,39 +75,6 @@ early_param("gbpages", parse_direct_gbpages_on);
85pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; 75pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
86EXPORT_SYMBOL_GPL(__supported_pte_mask); 76EXPORT_SYMBOL_GPL(__supported_pte_mask);
87 77
88static int disable_nx __cpuinitdata;
89
90/*
91 * noexec=on|off
92 * Control non-executable mappings for 64-bit processes.
93 *
94 * on Enable (default)
95 * off Disable
96 */
97static int __init nonx_setup(char *str)
98{
99 if (!str)
100 return -EINVAL;
101 if (!strncmp(str, "on", 2)) {
102 __supported_pte_mask |= _PAGE_NX;
103 disable_nx = 0;
104 } else if (!strncmp(str, "off", 3)) {
105 disable_nx = 1;
106 __supported_pte_mask &= ~_PAGE_NX;
107 }
108 return 0;
109}
110early_param("noexec", nonx_setup);
111
112void __cpuinit check_efer(void)
113{
114 unsigned long efer;
115
116 rdmsrl(MSR_EFER, efer);
117 if (!(efer & EFER_NX) || disable_nx)
118 __supported_pte_mask &= ~_PAGE_NX;
119}
120
121int force_personality32; 78int force_personality32;
122 79
123/* 80/*
@@ -628,6 +585,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
628 early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT); 585 early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
629 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT); 586 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
630} 587}
588#endif
631 589
632void __init paging_init(void) 590void __init paging_init(void)
633{ 591{
@@ -638,11 +596,10 @@ void __init paging_init(void)
638 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 596 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
639 max_zone_pfns[ZONE_NORMAL] = max_pfn; 597 max_zone_pfns[ZONE_NORMAL] = max_pfn;
640 598
641 memory_present(0, 0, max_pfn); 599 sparse_memory_present_with_active_regions(MAX_NUMNODES);
642 sparse_init(); 600 sparse_init();
643 free_area_init_nodes(max_zone_pfns); 601 free_area_init_nodes(max_zone_pfns);
644} 602}
645#endif
646 603
647/* 604/*
648 * Memory hotplug specific functions 605 * Memory hotplug specific functions
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 8056545e2d39..fe6f84ca121e 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -82,7 +82,6 @@ iounmap_atomic(void *kvaddr, enum km_type type)
82 if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) 82 if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
83 kpte_clear_flush(kmap_pte-idx, vaddr); 83 kpte_clear_flush(kmap_pte-idx, vaddr);
84 84
85 arch_flush_lazy_mmu_mode();
86 pagefault_enable(); 85 pagefault_enable();
87} 86}
88EXPORT_SYMBOL_GPL(iounmap_atomic); 87EXPORT_SYMBOL_GPL(iounmap_atomic);
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 605c8be06217..c0bedcd10f97 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -40,23 +40,23 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
40 40
41static void __init memtest(u64 pattern, u64 start_phys, u64 size) 41static void __init memtest(u64 pattern, u64 start_phys, u64 size)
42{ 42{
43 u64 i, count; 43 u64 *p;
44 u64 *start; 44 void *start, *end;
45 u64 start_bad, last_bad; 45 u64 start_bad, last_bad;
46 u64 start_phys_aligned; 46 u64 start_phys_aligned;
47 size_t incr; 47 size_t incr;
48 48
49 incr = sizeof(pattern); 49 incr = sizeof(pattern);
50 start_phys_aligned = ALIGN(start_phys, incr); 50 start_phys_aligned = ALIGN(start_phys, incr);
51 count = (size - (start_phys_aligned - start_phys))/incr;
52 start = __va(start_phys_aligned); 51 start = __va(start_phys_aligned);
52 end = start + size - (start_phys_aligned - start_phys);
53 start_bad = 0; 53 start_bad = 0;
54 last_bad = 0; 54 last_bad = 0;
55 55
56 for (i = 0; i < count; i++) 56 for (p = start; p < end; p++)
57 start[i] = pattern; 57 *p = pattern;
58 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { 58 for (p = start; p < end; p++, start_phys_aligned += incr) {
59 if (*start == pattern) 59 if (*p == pattern)
60 continue; 60 continue;
61 if (start_phys_aligned == last_bad + incr) { 61 if (start_phys_aligned == last_bad + incr) {
62 last_bad += incr; 62 last_bad += incr;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 2d05a12029dc..459913beac71 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -179,18 +179,25 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
179} 179}
180 180
181/* Initialize bootmem allocator for a node */ 181/* Initialize bootmem allocator for a node */
182void __init setup_node_bootmem(int nodeid, unsigned long start, 182void __init
183 unsigned long end) 183setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
184{ 184{
185 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size; 185 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
186 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
186 unsigned long bootmap_start, nodedata_phys; 187 unsigned long bootmap_start, nodedata_phys;
187 void *bootmap; 188 void *bootmap;
188 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
189 int nid; 189 int nid;
190 190
191 if (!end) 191 if (!end)
192 return; 192 return;
193 193
194 /*
195 * Don't confuse VM with a node that doesn't have the
196 * minimum amount of memory:
197 */
198 if (end && (end - start) < NODE_MIN_SIZE)
199 return;
200
194 start = roundup(start, ZONE_ALIGN); 201 start = roundup(start, ZONE_ALIGN);
195 202
196 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, 203 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
@@ -272,9 +279,6 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
272 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, 279 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
273 bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT); 280 bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
274 281
275#ifdef CONFIG_ACPI_NUMA
276 srat_reserve_add_area(nodeid);
277#endif
278 node_set_online(nodeid); 282 node_set_online(nodeid);
279} 283}
280 284
@@ -578,21 +582,6 @@ unsigned long __init numa_free_all_bootmem(void)
578 return pages; 582 return pages;
579} 583}
580 584
581void __init paging_init(void)
582{
583 unsigned long max_zone_pfns[MAX_NR_ZONES];
584
585 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
586 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
587 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
588 max_zone_pfns[ZONE_NORMAL] = max_pfn;
589
590 sparse_memory_present_with_active_regions(MAX_NUMNODES);
591 sparse_init();
592
593 free_area_init_nodes(max_zone_pfns);
594}
595
596static __init int numa_setup(char *opt) 585static __init int numa_setup(char *opt)
597{ 586{
598 if (!opt) 587 if (!opt)
@@ -606,8 +595,6 @@ static __init int numa_setup(char *opt)
606#ifdef CONFIG_ACPI_NUMA 595#ifdef CONFIG_ACPI_NUMA
607 if (!strncmp(opt, "noacpi", 6)) 596 if (!strncmp(opt, "noacpi", 6))
608 acpi_numa = -1; 597 acpi_numa = -1;
609 if (!strncmp(opt, "hotadd=", 7))
610 hotadd_percent = simple_strtoul(opt+7, NULL, 10);
611#endif 598#endif
612 return 0; 599 return 0;
613} 600}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index e17efed088c5..6ce9518fe2ac 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -839,13 +839,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
839 839
840 vm_unmap_aliases(); 840 vm_unmap_aliases();
841 841
842 /*
843 * If we're called with lazy mmu updates enabled, the
844 * in-memory pte state may be stale. Flush pending updates to
845 * bring them up to date.
846 */
847 arch_flush_lazy_mmu_mode();
848
849 cpa.vaddr = addr; 842 cpa.vaddr = addr;
850 cpa.pages = pages; 843 cpa.pages = pages;
851 cpa.numpages = numpages; 844 cpa.numpages = numpages;
@@ -890,13 +883,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
890 } else 883 } else
891 cpa_flush_all(cache); 884 cpa_flush_all(cache);
892 885
893 /*
894 * If we've been called with lazy mmu updates enabled, then
895 * make sure that everything gets flushed out before we
896 * return.
897 */
898 arch_flush_lazy_mmu_mode();
899
900out: 886out:
901 return ret; 887 return ret;
902} 888}
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 01765955baaf..2dfcbf9df2ae 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -31,17 +31,11 @@ static nodemask_t nodes_parsed __initdata;
31static nodemask_t cpu_nodes_parsed __initdata; 31static nodemask_t cpu_nodes_parsed __initdata;
32static struct bootnode nodes[MAX_NUMNODES] __initdata; 32static struct bootnode nodes[MAX_NUMNODES] __initdata;
33static struct bootnode nodes_add[MAX_NUMNODES]; 33static struct bootnode nodes_add[MAX_NUMNODES];
34static int found_add_area __initdata;
35int hotadd_percent __initdata = 0;
36 34
37static int num_node_memblks __initdata; 35static int num_node_memblks __initdata;
38static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata; 36static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
39static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata; 37static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
40 38
41/* Too small nodes confuse the VM badly. Usually they result
42 from BIOS bugs. */
43#define NODE_MIN_SIZE (4*1024*1024)
44
45static __init int setup_node(int pxm) 39static __init int setup_node(int pxm)
46{ 40{
47 return acpi_map_pxm_to_node(pxm); 41 return acpi_map_pxm_to_node(pxm);
@@ -66,9 +60,6 @@ static __init void cutoff_node(int i, unsigned long start, unsigned long end)
66{ 60{
67 struct bootnode *nd = &nodes[i]; 61 struct bootnode *nd = &nodes[i];
68 62
69 if (found_add_area)
70 return;
71
72 if (nd->start < start) { 63 if (nd->start < start) {
73 nd->start = start; 64 nd->start = start;
74 if (nd->end < nd->start) 65 if (nd->end < nd->start)
@@ -86,7 +77,6 @@ static __init void bad_srat(void)
86 int i; 77 int i;
87 printk(KERN_ERR "SRAT: SRAT not used.\n"); 78 printk(KERN_ERR "SRAT: SRAT not used.\n");
88 acpi_numa = -1; 79 acpi_numa = -1;
89 found_add_area = 0;
90 for (i = 0; i < MAX_LOCAL_APIC; i++) 80 for (i = 0; i < MAX_LOCAL_APIC; i++)
91 apicid_to_node[i] = NUMA_NO_NODE; 81 apicid_to_node[i] = NUMA_NO_NODE;
92 for (i = 0; i < MAX_NUMNODES; i++) 82 for (i = 0; i < MAX_NUMNODES; i++)
@@ -182,24 +172,21 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
182 pxm, apic_id, node); 172 pxm, apic_id, node);
183} 173}
184 174
185static int update_end_of_memory(unsigned long end) {return -1;}
186static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
187#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 175#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
188static inline int save_add_info(void) {return 1;} 176static inline int save_add_info(void) {return 1;}
189#else 177#else
190static inline int save_add_info(void) {return 0;} 178static inline int save_add_info(void) {return 0;}
191#endif 179#endif
192/* 180/*
193 * Update nodes_add and decide if to include add are in the zone. 181 * Update nodes_add[]
194 * Both SPARSE and RESERVE need nodes_add information. 182 * This code supports one contiguous hot add area per node
195 * This code supports one contiguous hot add area per node.
196 */ 183 */
197static int __init 184static void __init
198reserve_hotadd(int node, unsigned long start, unsigned long end) 185update_nodes_add(int node, unsigned long start, unsigned long end)
199{ 186{
200 unsigned long s_pfn = start >> PAGE_SHIFT; 187 unsigned long s_pfn = start >> PAGE_SHIFT;
201 unsigned long e_pfn = end >> PAGE_SHIFT; 188 unsigned long e_pfn = end >> PAGE_SHIFT;
202 int ret = 0, changed = 0; 189 int changed = 0;
203 struct bootnode *nd = &nodes_add[node]; 190 struct bootnode *nd = &nodes_add[node];
204 191
205 /* I had some trouble with strange memory hotadd regions breaking 192 /* I had some trouble with strange memory hotadd regions breaking
@@ -210,7 +197,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
210 mistakes */ 197 mistakes */
211 if ((signed long)(end - start) < NODE_MIN_SIZE) { 198 if ((signed long)(end - start) < NODE_MIN_SIZE) {
212 printk(KERN_ERR "SRAT: Hotplug area too small\n"); 199 printk(KERN_ERR "SRAT: Hotplug area too small\n");
213 return -1; 200 return;
214 } 201 }
215 202
216 /* This check might be a bit too strict, but I'm keeping it for now. */ 203 /* This check might be a bit too strict, but I'm keeping it for now. */
@@ -218,12 +205,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
218 printk(KERN_ERR 205 printk(KERN_ERR
219 "SRAT: Hotplug area %lu -> %lu has existing memory\n", 206 "SRAT: Hotplug area %lu -> %lu has existing memory\n",
220 s_pfn, e_pfn); 207 s_pfn, e_pfn);
221 return -1; 208 return;
222 }
223
224 if (!hotadd_enough_memory(&nodes_add[node])) {
225 printk(KERN_ERR "SRAT: Hotplug area too large\n");
226 return -1;
227 } 209 }
228 210
229 /* Looks good */ 211 /* Looks good */
@@ -245,11 +227,9 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
245 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); 227 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
246 } 228 }
247 229
248 ret = update_end_of_memory(nd->end);
249
250 if (changed) 230 if (changed)
251 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); 231 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
252 return ret; 232 nd->start, nd->end);
253} 233}
254 234
255/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ 235/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
@@ -310,13 +290,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
310 start, end); 290 start, end);
311 e820_register_active_regions(node, start >> PAGE_SHIFT, 291 e820_register_active_regions(node, start >> PAGE_SHIFT,
312 end >> PAGE_SHIFT); 292 end >> PAGE_SHIFT);
313 push_node_boundaries(node, nd->start >> PAGE_SHIFT,
314 nd->end >> PAGE_SHIFT);
315 293
316 if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && 294 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
317 (reserve_hotadd(node, start, end) < 0)) { 295 update_nodes_add(node, start, end);
318 /* Ignore hotadd region. Undo damage */ 296 /* restore nodes[node] */
319 printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
320 *nd = oldnode; 297 *nd = oldnode;
321 if ((nd->start | nd->end) == 0) 298 if ((nd->start | nd->end) == 0)
322 node_clear(node, nodes_parsed); 299 node_clear(node, nodes_parsed);
@@ -345,9 +322,9 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
345 pxmram = 0; 322 pxmram = 0;
346 } 323 }
347 324
348 e820ram = max_pfn - absent_pages_in_range(0, max_pfn); 325 e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
349 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ 326 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
350 if ((long)(e820ram - pxmram) >= 1*1024*1024) { 327 if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
351 printk(KERN_ERR 328 printk(KERN_ERR
352 "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n", 329 "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
353 (pxmram << PAGE_SHIFT) >> 20, 330 (pxmram << PAGE_SHIFT) >> 20,
@@ -357,17 +334,6 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
357 return 1; 334 return 1;
358} 335}
359 336
360static void __init unparse_node(int node)
361{
362 int i;
363 node_clear(node, nodes_parsed);
364 node_clear(node, cpu_nodes_parsed);
365 for (i = 0; i < MAX_LOCAL_APIC; i++) {
366 if (apicid_to_node[i] == node)
367 apicid_to_node[i] = NUMA_NO_NODE;
368 }
369}
370
371void __init acpi_numa_arch_fixup(void) {} 337void __init acpi_numa_arch_fixup(void) {}
372 338
373/* Use the information discovered above to actually set up the nodes. */ 339/* Use the information discovered above to actually set up the nodes. */
@@ -379,18 +345,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
379 return -1; 345 return -1;
380 346
381 /* First clean up the node list */ 347 /* First clean up the node list */
382 for (i = 0; i < MAX_NUMNODES; i++) { 348 for (i = 0; i < MAX_NUMNODES; i++)
383 cutoff_node(i, start, end); 349 cutoff_node(i, start, end);
384 /*
385 * don't confuse VM with a node that doesn't have the
386 * minimum memory.
387 */
388 if (nodes[i].end &&
389 (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
390 unparse_node(i);
391 node_set_offline(i);
392 }
393 }
394 350
395 if (!nodes_cover_memory(nodes)) { 351 if (!nodes_cover_memory(nodes)) {
396 bad_srat(); 352 bad_srat();
@@ -423,7 +379,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
423 379
424 if (node == NUMA_NO_NODE) 380 if (node == NUMA_NO_NODE)
425 continue; 381 continue;
426 if (!node_isset(node, node_possible_map)) 382 if (!node_online(node))
427 numa_clear_node(i); 383 numa_clear_node(i);
428 } 384 }
429 numa_init_array(); 385 numa_init_array();
@@ -510,26 +466,6 @@ static int null_slit_node_compare(int a, int b)
510} 466}
511#endif /* CONFIG_NUMA_EMU */ 467#endif /* CONFIG_NUMA_EMU */
512 468
513void __init srat_reserve_add_area(int nodeid)
514{
515 if (found_add_area && nodes_add[nodeid].end) {
516 u64 total_mb;
517
518 printk(KERN_INFO "SRAT: Reserving hot-add memory space "
519 "for node %d at %Lx-%Lx\n",
520 nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
521 total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
522 >> PAGE_SHIFT;
523 total_mb *= sizeof(struct page);
524 total_mb >>= 20;
525 printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
526 "pre-allocated memory.\n", (unsigned long long)total_mb);
527 reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
528 nodes_add[nodeid].end - nodes_add[nodeid].start,
529 BOOTMEM_DEFAULT);
530 }
531}
532
533int __node_distance(int a, int b) 469int __node_distance(int a, int b)
534{ 470{
535 int index; 471 int index;
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index fecbce6e7d7c..0696d506c4ad 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -889,6 +889,9 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
889 return 0; 889 return 0;
890 } 890 }
891 891
892 if (io_apic_assign_pci_irqs)
893 return 0;
894
892 /* Find IRQ routing entry */ 895 /* Find IRQ routing entry */
893 896
894 if (!pirq_table) 897 if (!pirq_table)
@@ -1039,56 +1042,15 @@ static void __init pcibios_fixup_irqs(void)
1039 pirq_penalty[dev->irq]++; 1042 pirq_penalty[dev->irq]++;
1040 } 1043 }
1041 1044
1045 if (io_apic_assign_pci_irqs)
1046 return;
1047
1042 dev = NULL; 1048 dev = NULL;
1043 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1049 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
1044 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); 1050 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
1045 if (!pin) 1051 if (!pin)
1046 continue; 1052 continue;
1047 1053
1048#ifdef CONFIG_X86_IO_APIC
1049 /*
1050 * Recalculate IRQ numbers if we use the I/O APIC.
1051 */
1052 if (io_apic_assign_pci_irqs) {
1053 int irq;
1054
1055 /*
1056 * interrupt pins are numbered starting from 1
1057 */
1058 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
1059 PCI_SLOT(dev->devfn), pin - 1);
1060 /*
1061 * Busses behind bridges are typically not listed in the
1062 * MP-table. In this case we have to look up the IRQ
1063 * based on the parent bus, parent slot, and pin number.
1064 * The SMP code detects such bridged busses itself so we
1065 * should get into this branch reliably.
1066 */
1067 if (irq < 0 && dev->bus->parent) {
1068 /* go back to the bridge */
1069 struct pci_dev *bridge = dev->bus->self;
1070 int bus;
1071
1072 pin = pci_swizzle_interrupt_pin(dev, pin);
1073 bus = bridge->bus->number;
1074 irq = IO_APIC_get_PCI_irq_vector(bus,
1075 PCI_SLOT(bridge->devfn), pin - 1);
1076 if (irq >= 0)
1077 dev_warn(&dev->dev,
1078 "using bridge %s INT %c to "
1079 "get IRQ %d\n",
1080 pci_name(bridge),
1081 'A' + pin - 1, irq);
1082 }
1083 if (irq >= 0) {
1084 dev_info(&dev->dev,
1085 "PCI->APIC IRQ transform: INT %c "
1086 "-> IRQ %d\n",
1087 'A' + pin - 1, irq);
1088 dev->irq = irq;
1089 }
1090 }
1091#endif
1092 /* 1054 /*
1093 * Still no IRQ? Try to lookup one... 1055 * Still no IRQ? Try to lookup one...
1094 */ 1056 */
@@ -1183,6 +1145,19 @@ int __init pcibios_irq_init(void)
1183 pcibios_enable_irq = pirq_enable_irq; 1145 pcibios_enable_irq = pirq_enable_irq;
1184 1146
1185 pcibios_fixup_irqs(); 1147 pcibios_fixup_irqs();
1148
1149 if (io_apic_assign_pci_irqs && pci_routeirq) {
1150 struct pci_dev *dev = NULL;
1151 /*
1152 * PCI IRQ routing is set up by pci_enable_device(), but we
1153 * also do it here in case there are still broken drivers that
1154 * don't use pci_enable_device().
1155 */
1156 printk(KERN_INFO "PCI: Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n");
1157 for_each_pci_dev(dev)
1158 pirq_enable_irq(dev);
1159 }
1160
1186 return 0; 1161 return 0;
1187} 1162}
1188 1163
@@ -1213,16 +1188,23 @@ void pcibios_penalize_isa_irq(int irq, int active)
1213static int pirq_enable_irq(struct pci_dev *dev) 1188static int pirq_enable_irq(struct pci_dev *dev)
1214{ 1189{
1215 u8 pin; 1190 u8 pin;
1216 struct pci_dev *temp_dev;
1217 1191
1218 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); 1192 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
1219 if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) { 1193 if (pin && !pcibios_lookup_irq(dev, 1)) {
1220 char *msg = ""; 1194 char *msg = "";
1221 1195
1196 if (!io_apic_assign_pci_irqs && dev->irq)
1197 return 0;
1198
1222 if (io_apic_assign_pci_irqs) { 1199 if (io_apic_assign_pci_irqs) {
1200#ifdef CONFIG_X86_IO_APIC
1201 struct pci_dev *temp_dev;
1223 int irq; 1202 int irq;
1203 struct io_apic_irq_attr irq_attr;
1224 1204
1225 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin - 1); 1205 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
1206 PCI_SLOT(dev->devfn),
1207 pin - 1, &irq_attr);
1226 /* 1208 /*
1227 * Busses behind bridges are typically not listed in the MP-table. 1209 * Busses behind bridges are typically not listed in the MP-table.
1228 * In this case we have to look up the IRQ based on the parent bus, 1210 * In this case we have to look up the IRQ based on the parent bus,
@@ -1235,7 +1217,8 @@ static int pirq_enable_irq(struct pci_dev *dev)
1235 1217
1236 pin = pci_swizzle_interrupt_pin(dev, pin); 1218 pin = pci_swizzle_interrupt_pin(dev, pin);
1237 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 1219 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
1238 PCI_SLOT(bridge->devfn), pin - 1); 1220 PCI_SLOT(bridge->devfn),
1221 pin - 1, &irq_attr);
1239 if (irq >= 0) 1222 if (irq >= 0)
1240 dev_warn(&dev->dev, "using bridge %s " 1223 dev_warn(&dev->dev, "using bridge %s "
1241 "INT %c to get IRQ %d\n", 1224 "INT %c to get IRQ %d\n",
@@ -1245,12 +1228,15 @@ static int pirq_enable_irq(struct pci_dev *dev)
1245 } 1228 }
1246 dev = temp_dev; 1229 dev = temp_dev;
1247 if (irq >= 0) { 1230 if (irq >= 0) {
1231 io_apic_set_pci_routing(&dev->dev, irq,
1232 &irq_attr);
1233 dev->irq = irq;
1248 dev_info(&dev->dev, "PCI->APIC IRQ transform: " 1234 dev_info(&dev->dev, "PCI->APIC IRQ transform: "
1249 "INT %c -> IRQ %d\n", 'A' + pin - 1, irq); 1235 "INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
1250 dev->irq = irq;
1251 return 0; 1236 return 0;
1252 } else 1237 } else
1253 msg = "; probably buggy MP table"; 1238 msg = "; probably buggy MP table";
1239#endif
1254 } else if (pci_probe & PCI_BIOS_IRQ_SCAN) 1240 } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
1255 msg = ""; 1241 msg = "";
1256 else 1242 else
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 7133cdf9098b..cac083386e03 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -8,6 +8,7 @@
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/random.h> 10#include <linux/random.h>
11#include <linux/elf.h>
11#include <asm/vsyscall.h> 12#include <asm/vsyscall.h>
12#include <asm/vgtod.h> 13#include <asm/vgtod.h>
13#include <asm/proto.h> 14#include <asm/proto.h>
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index f09e8c36ee80..0a1700a2be9c 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -20,6 +20,7 @@
20#include <linux/delay.h> 20#include <linux/delay.h>
21#include <linux/start_kernel.h> 21#include <linux/start_kernel.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/kprobes.h>
23#include <linux/bootmem.h> 24#include <linux/bootmem.h>
24#include <linux/module.h> 25#include <linux/module.h>
25#include <linux/mm.h> 26#include <linux/mm.h>
@@ -44,6 +45,7 @@
44#include <asm/processor.h> 45#include <asm/processor.h>
45#include <asm/proto.h> 46#include <asm/proto.h>
46#include <asm/msr-index.h> 47#include <asm/msr-index.h>
48#include <asm/traps.h>
47#include <asm/setup.h> 49#include <asm/setup.h>
48#include <asm/desc.h> 50#include <asm/desc.h>
49#include <asm/pgtable.h> 51#include <asm/pgtable.h>
@@ -240,10 +242,10 @@ static unsigned long xen_get_debugreg(int reg)
240 return HYPERVISOR_get_debugreg(reg); 242 return HYPERVISOR_get_debugreg(reg);
241} 243}
242 244
243void xen_leave_lazy(void) 245static void xen_end_context_switch(struct task_struct *next)
244{ 246{
245 paravirt_leave_lazy(paravirt_get_lazy_mode());
246 xen_mc_flush(); 247 xen_mc_flush();
248 paravirt_end_context_switch(next);
247} 249}
248 250
249static unsigned long xen_store_tr(void) 251static unsigned long xen_store_tr(void)
@@ -428,11 +430,44 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
428static int cvt_gate_to_trap(int vector, const gate_desc *val, 430static int cvt_gate_to_trap(int vector, const gate_desc *val,
429 struct trap_info *info) 431 struct trap_info *info)
430{ 432{
433 unsigned long addr;
434
431 if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT) 435 if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
432 return 0; 436 return 0;
433 437
434 info->vector = vector; 438 info->vector = vector;
435 info->address = gate_offset(*val); 439
440 addr = gate_offset(*val);
441#ifdef CONFIG_X86_64
442 /*
443 * Look for known traps using IST, and substitute them
444 * appropriately. The debugger ones are the only ones we care
445 * about. Xen will handle faults like double_fault and
446 * machine_check, so we should never see them. Warn if
447 * there's an unexpected IST-using fault handler.
448 */
449 if (addr == (unsigned long)debug)
450 addr = (unsigned long)xen_debug;
451 else if (addr == (unsigned long)int3)
452 addr = (unsigned long)xen_int3;
453 else if (addr == (unsigned long)stack_segment)
454 addr = (unsigned long)xen_stack_segment;
455 else if (addr == (unsigned long)double_fault ||
456 addr == (unsigned long)nmi) {
457 /* Don't need to handle these */
458 return 0;
459#ifdef CONFIG_X86_MCE
460 } else if (addr == (unsigned long)machine_check) {
461 return 0;
462#endif
463 } else {
464 /* Some other trap using IST? */
465 if (WARN_ON(val->ist != 0))
466 return 0;
467 }
468#endif /* CONFIG_X86_64 */
469 info->address = addr;
470
436 info->cs = gate_segment(*val); 471 info->cs = gate_segment(*val);
437 info->flags = val->dpl; 472 info->flags = val->dpl;
438 /* interrupt gates clear IF */ 473 /* interrupt gates clear IF */
@@ -623,10 +658,26 @@ static void xen_clts(void)
623 xen_mc_issue(PARAVIRT_LAZY_CPU); 658 xen_mc_issue(PARAVIRT_LAZY_CPU);
624} 659}
625 660
661static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
662
663static unsigned long xen_read_cr0(void)
664{
665 unsigned long cr0 = percpu_read(xen_cr0_value);
666
667 if (unlikely(cr0 == 0)) {
668 cr0 = native_read_cr0();
669 percpu_write(xen_cr0_value, cr0);
670 }
671
672 return cr0;
673}
674
626static void xen_write_cr0(unsigned long cr0) 675static void xen_write_cr0(unsigned long cr0)
627{ 676{
628 struct multicall_space mcs; 677 struct multicall_space mcs;
629 678
679 percpu_write(xen_cr0_value, cr0);
680
630 /* Only pay attention to cr0.TS; everything else is 681 /* Only pay attention to cr0.TS; everything else is
631 ignored. */ 682 ignored. */
632 mcs = xen_mc_entry(0); 683 mcs = xen_mc_entry(0);
@@ -812,7 +863,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
812 863
813 .clts = xen_clts, 864 .clts = xen_clts,
814 865
815 .read_cr0 = native_read_cr0, 866 .read_cr0 = xen_read_cr0,
816 .write_cr0 = xen_write_cr0, 867 .write_cr0 = xen_write_cr0,
817 868
818 .read_cr4 = native_read_cr4, 869 .read_cr4 = native_read_cr4,
@@ -860,10 +911,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
860 /* Xen takes care of %gs when switching to usermode for us */ 911 /* Xen takes care of %gs when switching to usermode for us */
861 .swapgs = paravirt_nop, 912 .swapgs = paravirt_nop,
862 913
863 .lazy_mode = { 914 .start_context_switch = paravirt_start_context_switch,
864 .enter = paravirt_enter_lazy_cpu, 915 .end_context_switch = xen_end_context_switch,
865 .leave = xen_leave_lazy,
866 },
867}; 916};
868 917
869static const struct pv_apic_ops xen_apic_ops __initdata = { 918static const struct pv_apic_ops xen_apic_ops __initdata = {
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index fba55b1a4021..4ceb28581652 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -452,10 +452,6 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
452void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 452void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
453 pte_t *ptep, pte_t pteval) 453 pte_t *ptep, pte_t pteval)
454{ 454{
455 /* updates to init_mm may be done without lock */
456 if (mm == &init_mm)
457 preempt_disable();
458
459 ADD_STATS(set_pte_at, 1); 455 ADD_STATS(set_pte_at, 1);
460// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); 456// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
461 ADD_STATS(set_pte_at_current, mm == current->mm); 457 ADD_STATS(set_pte_at_current, mm == current->mm);
@@ -476,9 +472,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
476 } 472 }
477 xen_set_pte(ptep, pteval); 473 xen_set_pte(ptep, pteval);
478 474
479out: 475out: return;
480 if (mm == &init_mm)
481 preempt_enable();
482} 476}
483 477
484pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, 478pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
@@ -1152,10 +1146,8 @@ static void drop_other_mm_ref(void *info)
1152 1146
1153 /* If this cpu still has a stale cr3 reference, then make sure 1147 /* If this cpu still has a stale cr3 reference, then make sure
1154 it has been flushed. */ 1148 it has been flushed. */
1155 if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) { 1149 if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
1156 load_cr3(swapper_pg_dir); 1150 load_cr3(swapper_pg_dir);
1157 arch_flush_lazy_cpu_mode();
1158 }
1159} 1151}
1160 1152
1161static void xen_drop_mm_ref(struct mm_struct *mm) 1153static void xen_drop_mm_ref(struct mm_struct *mm)
@@ -1168,7 +1160,6 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
1168 load_cr3(swapper_pg_dir); 1160 load_cr3(swapper_pg_dir);
1169 else 1161 else
1170 leave_mm(smp_processor_id()); 1162 leave_mm(smp_processor_id());
1171 arch_flush_lazy_cpu_mode();
1172 } 1163 }
1173 1164
1174 /* Get the "official" set of cpus referring to our pagetable. */ 1165 /* Get the "official" set of cpus referring to our pagetable. */
@@ -1876,6 +1867,14 @@ __init void xen_post_allocator_init(void)
1876 xen_mark_init_mm_pinned(); 1867 xen_mark_init_mm_pinned();
1877} 1868}
1878 1869
1870static void xen_leave_lazy_mmu(void)
1871{
1872 preempt_disable();
1873 xen_mc_flush();
1874 paravirt_leave_lazy_mmu();
1875 preempt_enable();
1876}
1877
1879const struct pv_mmu_ops xen_mmu_ops __initdata = { 1878const struct pv_mmu_ops xen_mmu_ops __initdata = {
1880 .pagetable_setup_start = xen_pagetable_setup_start, 1879 .pagetable_setup_start = xen_pagetable_setup_start,
1881 .pagetable_setup_done = xen_pagetable_setup_done, 1880 .pagetable_setup_done = xen_pagetable_setup_done,
@@ -1949,7 +1948,7 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
1949 1948
1950 .lazy_mode = { 1949 .lazy_mode = {
1951 .enter = paravirt_enter_lazy_mmu, 1950 .enter = paravirt_enter_lazy_mmu,
1952 .leave = xen_leave_lazy, 1951 .leave = xen_leave_lazy_mmu,
1953 }, 1952 },
1954 1953
1955 .set_fixmap = xen_set_fixmap, 1954 .set_fixmap = xen_set_fixmap,
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 15c6c68db6a2..ad0047f47cd4 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -61,9 +61,9 @@ char * __init xen_memory_setup(void)
61 * - xen_start_info 61 * - xen_start_info
62 * See comment above "struct start_info" in <xen/interface/xen.h> 62 * See comment above "struct start_info" in <xen/interface/xen.h>
63 */ 63 */
64 e820_add_region(__pa(xen_start_info->mfn_list), 64 reserve_early(__pa(xen_start_info->mfn_list),
65 xen_start_info->pt_base - xen_start_info->mfn_list, 65 __pa(xen_start_info->pt_base),
66 E820_RESERVED); 66 "XEN START INFO");
67 67
68 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 68 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
69 69
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index ca6596b05d53..22494fd4c9b5 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -30,7 +30,6 @@ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
30void xen_ident_map_ISA(void); 30void xen_ident_map_ISA(void);
31void xen_reserve_top(void); 31void xen_reserve_top(void);
32 32
33void xen_leave_lazy(void);
34void xen_post_allocator_init(void); 33void xen_post_allocator_init(void);
35 34
36char * __init xen_memory_setup(void); 35char * __init xen_memory_setup(void);
diff --git a/block/bsg.c b/block/bsg.c
index 206060e795da..dd81be455e00 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -315,6 +315,7 @@ out:
315 blk_put_request(rq); 315 blk_put_request(rq);
316 if (next_rq) { 316 if (next_rq) {
317 blk_rq_unmap_user(next_rq->bio); 317 blk_rq_unmap_user(next_rq->bio);
318 next_rq->bio = NULL;
318 blk_put_request(next_rq); 319 blk_put_request(next_rq);
319 } 320 }
320 return ERR_PTR(ret); 321 return ERR_PTR(ret);
@@ -448,6 +449,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
448 hdr->dout_resid = rq->data_len; 449 hdr->dout_resid = rq->data_len;
449 hdr->din_resid = rq->next_rq->data_len; 450 hdr->din_resid = rq->next_rq->data_len;
450 blk_rq_unmap_user(bidi_bio); 451 blk_rq_unmap_user(bidi_bio);
452 rq->next_rq->bio = NULL;
451 blk_put_request(rq->next_rq); 453 blk_put_request(rq->next_rq);
452 } else if (rq_data_dir(rq) == READ) 454 } else if (rq_data_dir(rq) == READ)
453 hdr->din_resid = rq->data_len; 455 hdr->din_resid = rq->data_len;
@@ -466,6 +468,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
466 blk_rq_unmap_user(bio); 468 blk_rq_unmap_user(bio);
467 if (rq->cmd != rq->__cmd) 469 if (rq->cmd != rq->__cmd)
468 kfree(rq->cmd); 470 kfree(rq->cmd);
471 rq->bio = NULL;
469 blk_put_request(rq); 472 blk_put_request(rq);
470 473
471 return ret; 474 return ret;
diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c
index 51b9f8280f88..2faa9e2ac893 100644
--- a/drivers/acpi/pci_irq.c
+++ b/drivers/acpi/pci_irq.c
@@ -401,7 +401,8 @@ int acpi_pci_irq_enable(struct pci_dev *dev)
401 /* Interrupt Line values above 0xF are forbidden */ 401 /* Interrupt Line values above 0xF are forbidden */
402 if (dev->irq > 0 && (dev->irq <= 0xF)) { 402 if (dev->irq > 0 && (dev->irq <= 0xF)) {
403 printk(" - using IRQ %d\n", dev->irq); 403 printk(" - using IRQ %d\n", dev->irq);
404 acpi_register_gsi(dev->irq, ACPI_LEVEL_SENSITIVE, 404 acpi_register_gsi(&dev->dev, dev->irq,
405 ACPI_LEVEL_SENSITIVE,
405 ACPI_ACTIVE_LOW); 406 ACPI_ACTIVE_LOW);
406 return 0; 407 return 0;
407 } else { 408 } else {
@@ -410,7 +411,7 @@ int acpi_pci_irq_enable(struct pci_dev *dev)
410 } 411 }
411 } 412 }
412 413
413 rc = acpi_register_gsi(gsi, triggering, polarity); 414 rc = acpi_register_gsi(&dev->dev, gsi, triggering, polarity);
414 if (rc < 0) { 415 if (rc < 0) {
415 dev_warn(&dev->dev, "PCI INT %c: failed to register GSI\n", 416 dev_warn(&dev->dev, "PCI INT %c: failed to register GSI\n",
416 pin_name(pin)); 417 pin_name(pin));
diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c
index 45ad3288c5ff..23f0fb84f1c1 100644
--- a/drivers/acpi/processor_core.c
+++ b/drivers/acpi/processor_core.c
@@ -844,7 +844,7 @@ static int acpi_processor_add(struct acpi_device *device)
844 if (!pr) 844 if (!pr)
845 return -ENOMEM; 845 return -ENOMEM;
846 846
847 if (!alloc_cpumask_var(&pr->throttling.shared_cpu_map, GFP_KERNEL)) { 847 if (!zalloc_cpumask_var(&pr->throttling.shared_cpu_map, GFP_KERNEL)) {
848 kfree(pr); 848 kfree(pr);
849 return -ENOMEM; 849 return -ENOMEM;
850 } 850 }
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index 340ba4f9dc54..4a9f3492b921 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -224,7 +224,7 @@ static void hpet_timer_set_irq(struct hpet_dev *devp)
224 break; 224 break;
225 } 225 }
226 226
227 gsi = acpi_register_gsi(irq, ACPI_LEVEL_SENSITIVE, 227 gsi = acpi_register_gsi(NULL, irq, ACPI_LEVEL_SENSITIVE,
228 ACPI_ACTIVE_LOW); 228 ACPI_ACTIVE_LOW);
229 if (gsi > 0) 229 if (gsi > 0)
230 break; 230 break;
@@ -939,7 +939,7 @@ static acpi_status hpet_resources(struct acpi_resource *res, void *data)
939 irqp = &res->data.extended_irq; 939 irqp = &res->data.extended_irq;
940 940
941 for (i = 0; i < irqp->interrupt_count; i++) { 941 for (i = 0; i < irqp->interrupt_count; i++) {
942 irq = acpi_register_gsi(irqp->interrupts[i], 942 irq = acpi_register_gsi(NULL, irqp->interrupts[i],
943 irqp->triggering, irqp->polarity); 943 irqp->triggering, irqp->polarity);
944 if (irq < 0) 944 if (irq < 0)
945 return AE_ERROR; 945 return AE_ERROR;
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 65e12bca657c..f96d0bef855e 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -694,9 +694,8 @@ static ssize_t read_zero(struct file * file, char __user * buf,
694 written += chunk - unwritten; 694 written += chunk - unwritten;
695 if (unwritten) 695 if (unwritten)
696 break; 696 break;
697 /* Consider changing this to just 'signal_pending()' with lots of testing */ 697 if (signal_pending(current))
698 if (fatal_signal_pending(current)) 698 return written ? written : -ERESTARTSYS;
699 return written ? written : -EINTR;
700 buf += chunk; 699 buf += chunk;
701 count -= chunk; 700 count -= chunk;
702 cond_resched(); 701 cond_resched();
diff --git a/drivers/char/mxser.c b/drivers/char/mxser.c
index a420e8d437dd..13f8871e5b21 100644
--- a/drivers/char/mxser.c
+++ b/drivers/char/mxser.c
@@ -2711,7 +2711,7 @@ static int __init mxser_module_init(void)
2711 continue; 2711 continue;
2712 2712
2713 brd = &mxser_boards[m]; 2713 brd = &mxser_boards[m];
2714 retval = mxser_get_ISA_conf(!ioaddr[b], brd); 2714 retval = mxser_get_ISA_conf(ioaddr[b], brd);
2715 if (retval <= 0) { 2715 if (retval <= 0) {
2716 brd->info = NULL; 2716 brd->info = NULL;
2717 continue; 2717 continue;
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 47d2ad0ae079..6e2ec0b18948 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -808,7 +808,7 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
808 ret = -ENOMEM; 808 ret = -ENOMEM;
809 goto nomem_out; 809 goto nomem_out;
810 } 810 }
811 if (!alloc_cpumask_var(&policy->related_cpus, GFP_KERNEL)) { 811 if (!zalloc_cpumask_var(&policy->related_cpus, GFP_KERNEL)) {
812 free_cpumask_var(policy->cpus); 812 free_cpumask_var(policy->cpus);
813 kfree(policy); 813 kfree(policy);
814 ret = -ENOMEM; 814 ret = -ENOMEM;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5d400aef8d9b..bb37fb1b2d82 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -362,7 +362,7 @@ static void raid5_unplug_device(struct request_queue *q);
362 362
363static struct stripe_head * 363static struct stripe_head *
364get_active_stripe(raid5_conf_t *conf, sector_t sector, 364get_active_stripe(raid5_conf_t *conf, sector_t sector,
365 int previous, int noblock) 365 int previous, int noblock, int noquiesce)
366{ 366{
367 struct stripe_head *sh; 367 struct stripe_head *sh;
368 368
@@ -372,7 +372,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector,
372 372
373 do { 373 do {
374 wait_event_lock_irq(conf->wait_for_stripe, 374 wait_event_lock_irq(conf->wait_for_stripe,
375 conf->quiesce == 0, 375 conf->quiesce == 0 || noquiesce,
376 conf->device_lock, /* nothing */); 376 conf->device_lock, /* nothing */);
377 sh = __find_stripe(conf, sector, conf->generation - previous); 377 sh = __find_stripe(conf, sector, conf->generation - previous);
378 if (!sh) { 378 if (!sh) {
@@ -2671,7 +2671,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2671 sector_t bn = compute_blocknr(sh, i, 1); 2671 sector_t bn = compute_blocknr(sh, i, 1);
2672 sector_t s = raid5_compute_sector(conf, bn, 0, 2672 sector_t s = raid5_compute_sector(conf, bn, 0,
2673 &dd_idx, NULL); 2673 &dd_idx, NULL);
2674 sh2 = get_active_stripe(conf, s, 0, 1); 2674 sh2 = get_active_stripe(conf, s, 0, 1, 1);
2675 if (sh2 == NULL) 2675 if (sh2 == NULL)
2676 /* so far only the early blocks of this stripe 2676 /* so far only the early blocks of this stripe
2677 * have been requested. When later blocks 2677 * have been requested. When later blocks
@@ -2944,7 +2944,7 @@ static bool handle_stripe5(struct stripe_head *sh)
2944 /* Finish reconstruct operations initiated by the expansion process */ 2944 /* Finish reconstruct operations initiated by the expansion process */
2945 if (sh->reconstruct_state == reconstruct_state_result) { 2945 if (sh->reconstruct_state == reconstruct_state_result) {
2946 struct stripe_head *sh2 2946 struct stripe_head *sh2
2947 = get_active_stripe(conf, sh->sector, 1, 1); 2947 = get_active_stripe(conf, sh->sector, 1, 1, 1);
2948 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { 2948 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
2949 /* sh cannot be written until sh2 has been read. 2949 /* sh cannot be written until sh2 has been read.
2950 * so arrange for sh to be delayed a little 2950 * so arrange for sh to be delayed a little
@@ -3189,7 +3189,7 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3189 3189
3190 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { 3190 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
3191 struct stripe_head *sh2 3191 struct stripe_head *sh2
3192 = get_active_stripe(conf, sh->sector, 1, 1); 3192 = get_active_stripe(conf, sh->sector, 1, 1, 1);
3193 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { 3193 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
3194 /* sh cannot be written until sh2 has been read. 3194 /* sh cannot be written until sh2 has been read.
3195 * so arrange for sh to be delayed a little 3195 * so arrange for sh to be delayed a little
@@ -3288,7 +3288,7 @@ static void unplug_slaves(mddev_t *mddev)
3288 int i; 3288 int i;
3289 3289
3290 rcu_read_lock(); 3290 rcu_read_lock();
3291 for (i=0; i<mddev->raid_disks; i++) { 3291 for (i = 0; i < conf->raid_disks; i++) {
3292 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); 3292 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
3293 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { 3293 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
3294 struct request_queue *r_queue = bdev_get_queue(rdev->bdev); 3294 struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
@@ -3675,7 +3675,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
3675 (unsigned long long)logical_sector); 3675 (unsigned long long)logical_sector);
3676 3676
3677 sh = get_active_stripe(conf, new_sector, previous, 3677 sh = get_active_stripe(conf, new_sector, previous,
3678 (bi->bi_rw&RWA_MASK)); 3678 (bi->bi_rw&RWA_MASK), 0);
3679 if (sh) { 3679 if (sh) {
3680 if (unlikely(previous)) { 3680 if (unlikely(previous)) {
3681 /* expansion might have moved on while waiting for a 3681 /* expansion might have moved on while waiting for a
@@ -3873,7 +3873,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3873 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 3873 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
3874 int j; 3874 int j;
3875 int skipped = 0; 3875 int skipped = 0;
3876 sh = get_active_stripe(conf, stripe_addr+i, 0, 0); 3876 sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
3877 set_bit(STRIPE_EXPANDING, &sh->state); 3877 set_bit(STRIPE_EXPANDING, &sh->state);
3878 atomic_inc(&conf->reshape_stripes); 3878 atomic_inc(&conf->reshape_stripes);
3879 /* If any of this stripe is beyond the end of the old 3879 /* If any of this stripe is beyond the end of the old
@@ -3916,13 +3916,13 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3916 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 3916 raid5_compute_sector(conf, stripe_addr*(new_data_disks),
3917 1, &dd_idx, NULL); 3917 1, &dd_idx, NULL);
3918 last_sector = 3918 last_sector =
3919 raid5_compute_sector(conf, ((stripe_addr+conf->chunk_size/512) 3919 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
3920 *(new_data_disks) - 1), 3920 *(new_data_disks) - 1),
3921 1, &dd_idx, NULL); 3921 1, &dd_idx, NULL);
3922 if (last_sector >= mddev->dev_sectors) 3922 if (last_sector >= mddev->dev_sectors)
3923 last_sector = mddev->dev_sectors - 1; 3923 last_sector = mddev->dev_sectors - 1;
3924 while (first_sector <= last_sector) { 3924 while (first_sector <= last_sector) {
3925 sh = get_active_stripe(conf, first_sector, 1, 0); 3925 sh = get_active_stripe(conf, first_sector, 1, 0, 1);
3926 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3926 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3927 set_bit(STRIPE_HANDLE, &sh->state); 3927 set_bit(STRIPE_HANDLE, &sh->state);
3928 release_stripe(sh); 3928 release_stripe(sh);
@@ -4022,9 +4022,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
4022 4022
4023 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 4023 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
4024 4024
4025 sh = get_active_stripe(conf, sector_nr, 0, 1); 4025 sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
4026 if (sh == NULL) { 4026 if (sh == NULL) {
4027 sh = get_active_stripe(conf, sector_nr, 0, 0); 4027 sh = get_active_stripe(conf, sector_nr, 0, 0, 0);
4028 /* make sure we don't swamp the stripe cache if someone else 4028 /* make sure we don't swamp the stripe cache if someone else
4029 * is trying to get access 4029 * is trying to get access
4030 */ 4030 */
@@ -4034,7 +4034,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
4034 * We don't need to check the 'failed' flag as when that gets set, 4034 * We don't need to check the 'failed' flag as when that gets set,
4035 * recovery aborts. 4035 * recovery aborts.
4036 */ 4036 */
4037 for (i=0; i<mddev->raid_disks; i++) 4037 for (i = 0; i < conf->raid_disks; i++)
4038 if (conf->disks[i].rdev == NULL) 4038 if (conf->disks[i].rdev == NULL)
4039 still_degraded = 1; 4039 still_degraded = 1;
4040 4040
@@ -4086,7 +4086,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4086 /* already done this stripe */ 4086 /* already done this stripe */
4087 continue; 4087 continue;
4088 4088
4089 sh = get_active_stripe(conf, sector, 0, 1); 4089 sh = get_active_stripe(conf, sector, 0, 1, 0);
4090 4090
4091 if (!sh) { 4091 if (!sh) {
4092 /* failed to get a stripe - must wait */ 4092 /* failed to get a stripe - must wait */
diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 8247a945a1d9..3b19e0ce290f 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -66,7 +66,6 @@ static const int multicast_filter_limit = 32;
66#define RX_DMA_BURST 6 /* Maximum PCI burst, '6' is 1024 */ 66#define RX_DMA_BURST 6 /* Maximum PCI burst, '6' is 1024 */
67#define TX_DMA_BURST 6 /* Maximum PCI burst, '6' is 1024 */ 67#define TX_DMA_BURST 6 /* Maximum PCI burst, '6' is 1024 */
68#define EarlyTxThld 0x3F /* 0x3F means NO early transmit */ 68#define EarlyTxThld 0x3F /* 0x3F means NO early transmit */
69#define RxPacketMaxSize 0x3FE8 /* 16K - 1 - ETH_HLEN - VLAN - CRC... */
70#define SafeMtu 0x1c20 /* ... actually life sucks beyond ~7k */ 69#define SafeMtu 0x1c20 /* ... actually life sucks beyond ~7k */
71#define InterFrameGap 0x03 /* 3 means InterFrameGap = the shortest one */ 70#define InterFrameGap 0x03 /* 3 means InterFrameGap = the shortest one */
72 71
@@ -2357,10 +2356,10 @@ static u16 rtl_rw_cpluscmd(void __iomem *ioaddr)
2357 return cmd; 2356 return cmd;
2358} 2357}
2359 2358
2360static void rtl_set_rx_max_size(void __iomem *ioaddr) 2359static void rtl_set_rx_max_size(void __iomem *ioaddr, unsigned int rx_buf_sz)
2361{ 2360{
2362 /* Low hurts. Let's disable the filtering. */ 2361 /* Low hurts. Let's disable the filtering. */
2363 RTL_W16(RxMaxSize, 16383); 2362 RTL_W16(RxMaxSize, rx_buf_sz);
2364} 2363}
2365 2364
2366static void rtl8169_set_magic_reg(void __iomem *ioaddr, unsigned mac_version) 2365static void rtl8169_set_magic_reg(void __iomem *ioaddr, unsigned mac_version)
@@ -2407,7 +2406,7 @@ static void rtl_hw_start_8169(struct net_device *dev)
2407 2406
2408 RTL_W8(EarlyTxThres, EarlyTxThld); 2407 RTL_W8(EarlyTxThres, EarlyTxThld);
2409 2408
2410 rtl_set_rx_max_size(ioaddr); 2409 rtl_set_rx_max_size(ioaddr, tp->rx_buf_sz);
2411 2410
2412 if ((tp->mac_version == RTL_GIGA_MAC_VER_01) || 2411 if ((tp->mac_version == RTL_GIGA_MAC_VER_01) ||
2413 (tp->mac_version == RTL_GIGA_MAC_VER_02) || 2412 (tp->mac_version == RTL_GIGA_MAC_VER_02) ||
@@ -2668,7 +2667,7 @@ static void rtl_hw_start_8168(struct net_device *dev)
2668 2667
2669 RTL_W8(EarlyTxThres, EarlyTxThld); 2668 RTL_W8(EarlyTxThres, EarlyTxThld);
2670 2669
2671 rtl_set_rx_max_size(ioaddr); 2670 rtl_set_rx_max_size(ioaddr, tp->rx_buf_sz);
2672 2671
2673 tp->cp_cmd |= RTL_R16(CPlusCmd) | PktCntrDisable | INTT_1; 2672 tp->cp_cmd |= RTL_R16(CPlusCmd) | PktCntrDisable | INTT_1;
2674 2673
@@ -2846,7 +2845,7 @@ static void rtl_hw_start_8101(struct net_device *dev)
2846 2845
2847 RTL_W8(EarlyTxThres, EarlyTxThld); 2846 RTL_W8(EarlyTxThres, EarlyTxThld);
2848 2847
2849 rtl_set_rx_max_size(ioaddr); 2848 rtl_set_rx_max_size(ioaddr, tp->rx_buf_sz);
2850 2849
2851 tp->cp_cmd |= rtl_rw_cpluscmd(ioaddr) | PCIMulRW; 2850 tp->cp_cmd |= rtl_rw_cpluscmd(ioaddr) | PCIMulRW;
2852 2851
diff --git a/drivers/parisc/iosapic.c b/drivers/parisc/iosapic.c
index 73348c4047e9..4a9cc92d4d18 100644
--- a/drivers/parisc/iosapic.c
+++ b/drivers/parisc/iosapic.c
@@ -702,7 +702,7 @@ static unsigned int iosapic_startup_irq(unsigned int irq)
702} 702}
703 703
704#ifdef CONFIG_SMP 704#ifdef CONFIG_SMP
705static void iosapic_set_affinity_irq(unsigned int irq, 705static int iosapic_set_affinity_irq(unsigned int irq,
706 const struct cpumask *dest) 706 const struct cpumask *dest)
707{ 707{
708 struct vector_info *vi = iosapic_get_vector(irq); 708 struct vector_info *vi = iosapic_get_vector(irq);
@@ -712,7 +712,7 @@ static void iosapic_set_affinity_irq(unsigned int irq,
712 712
713 dest_cpu = cpu_check_affinity(irq, dest); 713 dest_cpu = cpu_check_affinity(irq, dest);
714 if (dest_cpu < 0) 714 if (dest_cpu < 0)
715 return; 715 return -1;
716 716
717 cpumask_copy(irq_desc[irq].affinity, cpumask_of(dest_cpu)); 717 cpumask_copy(irq_desc[irq].affinity, cpumask_of(dest_cpu));
718 vi->txn_addr = txn_affinity_addr(irq, dest_cpu); 718 vi->txn_addr = txn_affinity_addr(irq, dest_cpu);
@@ -724,6 +724,8 @@ static void iosapic_set_affinity_irq(unsigned int irq,
724 iosapic_set_irt_data(vi, &dummy_d0, &d1); 724 iosapic_set_irt_data(vi, &dummy_d0, &d1);
725 iosapic_wr_irt_entry(vi, d0, d1); 725 iosapic_wr_irt_entry(vi, d0, d1);
726 spin_unlock_irqrestore(&iosapic_lock, flags); 726 spin_unlock_irqrestore(&iosapic_lock, flags);
727
728 return 0;
727} 729}
728#endif 730#endif
729 731
diff --git a/drivers/pci/hotplug/ibmphp_core.c b/drivers/pci/hotplug/ibmphp_core.c
index dd18f857dfb0..42e4260c3b12 100644
--- a/drivers/pci/hotplug/ibmphp_core.c
+++ b/drivers/pci/hotplug/ibmphp_core.c
@@ -153,45 +153,47 @@ int ibmphp_init_devno(struct slot **cur_slot)
153 return -1; 153 return -1;
154 } 154 }
155 for (loop = 0; loop < len; loop++) { 155 for (loop = 0; loop < len; loop++) {
156 if ((*cur_slot)->number == rtable->slots[loop].slot) { 156 if ((*cur_slot)->number == rtable->slots[loop].slot &&
157 if ((*cur_slot)->bus == rtable->slots[loop].bus) { 157 (*cur_slot)->bus == rtable->slots[loop].bus) {
158 struct io_apic_irq_attr irq_attr;
159
158 (*cur_slot)->device = PCI_SLOT(rtable->slots[loop].devfn); 160 (*cur_slot)->device = PCI_SLOT(rtable->slots[loop].devfn);
159 for (i = 0; i < 4; i++) 161 for (i = 0; i < 4; i++)
160 (*cur_slot)->irq[i] = IO_APIC_get_PCI_irq_vector((int) (*cur_slot)->bus, 162 (*cur_slot)->irq[i] = IO_APIC_get_PCI_irq_vector((int) (*cur_slot)->bus,
161 (int) (*cur_slot)->device, i); 163 (int) (*cur_slot)->device, i,
162 164 &irq_attr);
163 debug("(*cur_slot)->irq[0] = %x\n", 165
164 (*cur_slot)->irq[0]); 166 debug("(*cur_slot)->irq[0] = %x\n",
165 debug("(*cur_slot)->irq[1] = %x\n", 167 (*cur_slot)->irq[0]);
166 (*cur_slot)->irq[1]); 168 debug("(*cur_slot)->irq[1] = %x\n",
167 debug("(*cur_slot)->irq[2] = %x\n", 169 (*cur_slot)->irq[1]);
168 (*cur_slot)->irq[2]); 170 debug("(*cur_slot)->irq[2] = %x\n",
169 debug("(*cur_slot)->irq[3] = %x\n", 171 (*cur_slot)->irq[2]);
170 (*cur_slot)->irq[3]); 172 debug("(*cur_slot)->irq[3] = %x\n",
171 173 (*cur_slot)->irq[3]);
172 debug("rtable->exlusive_irqs = %x\n", 174
175 debug("rtable->exlusive_irqs = %x\n",
173 rtable->exclusive_irqs); 176 rtable->exclusive_irqs);
174 debug("rtable->slots[loop].irq[0].bitmap = %x\n", 177 debug("rtable->slots[loop].irq[0].bitmap = %x\n",
175 rtable->slots[loop].irq[0].bitmap); 178 rtable->slots[loop].irq[0].bitmap);
176 debug("rtable->slots[loop].irq[1].bitmap = %x\n", 179 debug("rtable->slots[loop].irq[1].bitmap = %x\n",
177 rtable->slots[loop].irq[1].bitmap); 180 rtable->slots[loop].irq[1].bitmap);
178 debug("rtable->slots[loop].irq[2].bitmap = %x\n", 181 debug("rtable->slots[loop].irq[2].bitmap = %x\n",
179 rtable->slots[loop].irq[2].bitmap); 182 rtable->slots[loop].irq[2].bitmap);
180 debug("rtable->slots[loop].irq[3].bitmap = %x\n", 183 debug("rtable->slots[loop].irq[3].bitmap = %x\n",
181 rtable->slots[loop].irq[3].bitmap); 184 rtable->slots[loop].irq[3].bitmap);
182 185
183 debug("rtable->slots[loop].irq[0].link = %x\n", 186 debug("rtable->slots[loop].irq[0].link = %x\n",
184 rtable->slots[loop].irq[0].link); 187 rtable->slots[loop].irq[0].link);
185 debug("rtable->slots[loop].irq[1].link = %x\n", 188 debug("rtable->slots[loop].irq[1].link = %x\n",
186 rtable->slots[loop].irq[1].link); 189 rtable->slots[loop].irq[1].link);
187 debug("rtable->slots[loop].irq[2].link = %x\n", 190 debug("rtable->slots[loop].irq[2].link = %x\n",
188 rtable->slots[loop].irq[2].link); 191 rtable->slots[loop].irq[2].link);
189 debug("rtable->slots[loop].irq[3].link = %x\n", 192 debug("rtable->slots[loop].irq[3].link = %x\n",
190 rtable->slots[loop].irq[3].link); 193 rtable->slots[loop].irq[3].link);
191 debug("end of init_devno\n"); 194 debug("end of init_devno\n");
192 kfree(rtable); 195 kfree(rtable);
193 return 0; 196 return 0;
194 }
195 } 197 }
196 } 198 }
197 199
diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c
index 6808d8333ecc..737a1c44b07a 100644
--- a/drivers/pci/htirq.c
+++ b/drivers/pci/htirq.c
@@ -98,6 +98,7 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update)
98 int max_irq; 98 int max_irq;
99 int pos; 99 int pos;
100 int irq; 100 int irq;
101 int node;
101 102
102 pos = pci_find_ht_capability(dev, HT_CAPTYPE_IRQ); 103 pos = pci_find_ht_capability(dev, HT_CAPTYPE_IRQ);
103 if (!pos) 104 if (!pos)
@@ -125,7 +126,8 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update)
125 cfg->msg.address_lo = 0xffffffff; 126 cfg->msg.address_lo = 0xffffffff;
126 cfg->msg.address_hi = 0xffffffff; 127 cfg->msg.address_hi = 0xffffffff;
127 128
128 irq = create_irq(); 129 node = dev_to_node(&dev->dev);
130 irq = create_irq_nr(0, node);
129 131
130 if (irq <= 0) { 132 if (irq <= 0) {
131 kfree(cfg); 133 kfree(cfg);
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index a563fbe559d0..cd389162735f 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -1972,15 +1972,6 @@ static int __init init_dmars(void)
1972 } 1972 }
1973 } 1973 }
1974 1974
1975#ifdef CONFIG_INTR_REMAP
1976 if (!intr_remapping_enabled) {
1977 ret = enable_intr_remapping(0);
1978 if (ret)
1979 printk(KERN_ERR
1980 "IOMMU: enable interrupt remapping failed\n");
1981 }
1982#endif
1983
1984 /* 1975 /*
1985 * For each rmrr 1976 * For each rmrr
1986 * for each dev attached to rmrr 1977 * for each dev attached to rmrr
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index f5e0ea724a6f..3a0cb0bb0593 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -15,6 +15,14 @@ static struct ioapic_scope ir_ioapic[MAX_IO_APICS];
15static int ir_ioapic_num; 15static int ir_ioapic_num;
16int intr_remapping_enabled; 16int intr_remapping_enabled;
17 17
18static int disable_intremap;
19static __init int setup_nointremap(char *str)
20{
21 disable_intremap = 1;
22 return 0;
23}
24early_param("nointremap", setup_nointremap);
25
18struct irq_2_iommu { 26struct irq_2_iommu {
19 struct intel_iommu *iommu; 27 struct intel_iommu *iommu;
20 u16 irte_index; 28 u16 irte_index;
@@ -23,15 +31,12 @@ struct irq_2_iommu {
23}; 31};
24 32
25#ifdef CONFIG_GENERIC_HARDIRQS 33#ifdef CONFIG_GENERIC_HARDIRQS
26static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu) 34static struct irq_2_iommu *get_one_free_irq_2_iommu(int node)
27{ 35{
28 struct irq_2_iommu *iommu; 36 struct irq_2_iommu *iommu;
29 int node;
30
31 node = cpu_to_node(cpu);
32 37
33 iommu = kzalloc_node(sizeof(*iommu), GFP_ATOMIC, node); 38 iommu = kzalloc_node(sizeof(*iommu), GFP_ATOMIC, node);
34 printk(KERN_DEBUG "alloc irq_2_iommu on cpu %d node %d\n", cpu, node); 39 printk(KERN_DEBUG "alloc irq_2_iommu on node %d\n", node);
35 40
36 return iommu; 41 return iommu;
37} 42}
@@ -48,7 +53,7 @@ static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
48 return desc->irq_2_iommu; 53 return desc->irq_2_iommu;
49} 54}
50 55
51static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu) 56static struct irq_2_iommu *irq_2_iommu_alloc_node(unsigned int irq, int node)
52{ 57{
53 struct irq_desc *desc; 58 struct irq_desc *desc;
54 struct irq_2_iommu *irq_iommu; 59 struct irq_2_iommu *irq_iommu;
@@ -56,7 +61,7 @@ static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
56 /* 61 /*
57 * alloc irq desc if not allocated already. 62 * alloc irq desc if not allocated already.
58 */ 63 */
59 desc = irq_to_desc_alloc_cpu(irq, cpu); 64 desc = irq_to_desc_alloc_node(irq, node);
60 if (!desc) { 65 if (!desc) {
61 printk(KERN_INFO "can not get irq_desc for %d\n", irq); 66 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
62 return NULL; 67 return NULL;
@@ -65,14 +70,14 @@ static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
65 irq_iommu = desc->irq_2_iommu; 70 irq_iommu = desc->irq_2_iommu;
66 71
67 if (!irq_iommu) 72 if (!irq_iommu)
68 desc->irq_2_iommu = get_one_free_irq_2_iommu(cpu); 73 desc->irq_2_iommu = get_one_free_irq_2_iommu(node);
69 74
70 return desc->irq_2_iommu; 75 return desc->irq_2_iommu;
71} 76}
72 77
73static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq) 78static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
74{ 79{
75 return irq_2_iommu_alloc_cpu(irq, boot_cpu_id); 80 return irq_2_iommu_alloc_node(irq, cpu_to_node(boot_cpu_id));
76} 81}
77 82
78#else /* !CONFIG_SPARSE_IRQ */ 83#else /* !CONFIG_SPARSE_IRQ */
@@ -423,20 +428,6 @@ static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode)
423 readl, (sts & DMA_GSTS_IRTPS), sts); 428 readl, (sts & DMA_GSTS_IRTPS), sts);
424 spin_unlock_irqrestore(&iommu->register_lock, flags); 429 spin_unlock_irqrestore(&iommu->register_lock, flags);
425 430
426 if (mode == 0) {
427 spin_lock_irqsave(&iommu->register_lock, flags);
428
429 /* enable comaptiblity format interrupt pass through */
430 cmd = iommu->gcmd | DMA_GCMD_CFI;
431 iommu->gcmd |= DMA_GCMD_CFI;
432 writel(cmd, iommu->reg + DMAR_GCMD_REG);
433
434 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
435 readl, (sts & DMA_GSTS_CFIS), sts);
436
437 spin_unlock_irqrestore(&iommu->register_lock, flags);
438 }
439
440 /* 431 /*
441 * global invalidation of interrupt entry cache before enabling 432 * global invalidation of interrupt entry cache before enabling
442 * interrupt-remapping. 433 * interrupt-remapping.
@@ -516,6 +507,23 @@ end:
516 spin_unlock_irqrestore(&iommu->register_lock, flags); 507 spin_unlock_irqrestore(&iommu->register_lock, flags);
517} 508}
518 509
510int __init intr_remapping_supported(void)
511{
512 struct dmar_drhd_unit *drhd;
513
514 if (disable_intremap)
515 return 0;
516
517 for_each_drhd_unit(drhd) {
518 struct intel_iommu *iommu = drhd->iommu;
519
520 if (!ecap_ir_support(iommu->ecap))
521 return 0;
522 }
523
524 return 1;
525}
526
519int __init enable_intr_remapping(int eim) 527int __init enable_intr_remapping(int eim)
520{ 528{
521 struct dmar_drhd_unit *drhd; 529 struct dmar_drhd_unit *drhd;
diff --git a/drivers/pnp/pnpacpi/rsparser.c b/drivers/pnp/pnpacpi/rsparser.c
index adf17856bacc..7f207f335bec 100644
--- a/drivers/pnp/pnpacpi/rsparser.c
+++ b/drivers/pnp/pnpacpi/rsparser.c
@@ -123,7 +123,7 @@ static void pnpacpi_parse_allocated_irqresource(struct pnp_dev *dev,
123 } 123 }
124 124
125 flags = irq_flags(triggering, polarity, shareable); 125 flags = irq_flags(triggering, polarity, shareable);
126 irq = acpi_register_gsi(gsi, triggering, polarity); 126 irq = acpi_register_gsi(&dev->dev, gsi, triggering, polarity);
127 if (irq >= 0) 127 if (irq >= 0)
128 pcibios_penalize_isa_irq(irq, 1); 128 pcibios_penalize_isa_irq(irq, 1);
129 else 129 else
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 8ac9cddac575..cab100acf983 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -18,6 +18,16 @@ config XEN_SCRUB_PAGES
18 secure, but slightly less efficient. 18 secure, but slightly less efficient.
19 If in doubt, say yes. 19 If in doubt, say yes.
20 20
21config XEN_DEV_EVTCHN
22 tristate "Xen /dev/xen/evtchn device"
23 depends on XEN
24 default y
25 help
26 The evtchn driver allows a userspace process to trigger event
27 channels and to receive notification of an event channel
28 firing.
29 If in doubt, say yes.
30
21config XENFS 31config XENFS
22 tristate "Xen filesystem" 32 tristate "Xen filesystem"
23 depends on XEN 33 depends on XEN
@@ -41,3 +51,13 @@ config XEN_COMPAT_XENFS
41 a xen platform. 51 a xen platform.
42 If in doubt, say yes. 52 If in doubt, say yes.
43 53
54config XEN_SYS_HYPERVISOR
55 bool "Create xen entries under /sys/hypervisor"
56 depends on XEN && SYSFS
57 select SYS_HYPERVISOR
58 default y
59 help
60 Create entries under /sys/hypervisor describing the Xen
61 hypervisor environment. When running native or in another
62 virtual environment, /sys/hypervisor will still be present,
63 but will have no xen contents. \ No newline at end of file
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index ff8accc9e103..ec2a39b1e26f 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -4,4 +4,6 @@ obj-y += xenbus/
4obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o 4obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
5obj-$(CONFIG_XEN_XENCOMM) += xencomm.o 5obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
6obj-$(CONFIG_XEN_BALLOON) += balloon.o 6obj-$(CONFIG_XEN_BALLOON) += balloon.o
7obj-$(CONFIG_XENFS) += xenfs/ \ No newline at end of file 7obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o
8obj-$(CONFIG_XENFS) += xenfs/
9obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o \ No newline at end of file
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 30963af5dba0..891d2e90753a 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -151,6 +151,12 @@ static unsigned int evtchn_from_irq(unsigned irq)
151 return info_for_irq(irq)->evtchn; 151 return info_for_irq(irq)->evtchn;
152} 152}
153 153
154unsigned irq_from_evtchn(unsigned int evtchn)
155{
156 return evtchn_to_irq[evtchn];
157}
158EXPORT_SYMBOL_GPL(irq_from_evtchn);
159
154static enum ipi_vector ipi_from_irq(unsigned irq) 160static enum ipi_vector ipi_from_irq(unsigned irq)
155{ 161{
156 struct irq_info *info = info_for_irq(irq); 162 struct irq_info *info = info_for_irq(irq);
@@ -335,7 +341,7 @@ static int find_unbound_irq(void)
335 if (irq == nr_irqs) 341 if (irq == nr_irqs)
336 panic("No available IRQ to bind to: increase nr_irqs!\n"); 342 panic("No available IRQ to bind to: increase nr_irqs!\n");
337 343
338 desc = irq_to_desc_alloc_cpu(irq, 0); 344 desc = irq_to_desc_alloc_node(irq, 0);
339 if (WARN_ON(desc == NULL)) 345 if (WARN_ON(desc == NULL))
340 return -1; 346 return -1;
341 347
@@ -688,13 +694,13 @@ void rebind_evtchn_irq(int evtchn, int irq)
688} 694}
689 695
690/* Rebind an evtchn so that it gets delivered to a specific cpu */ 696/* Rebind an evtchn so that it gets delivered to a specific cpu */
691static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) 697static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
692{ 698{
693 struct evtchn_bind_vcpu bind_vcpu; 699 struct evtchn_bind_vcpu bind_vcpu;
694 int evtchn = evtchn_from_irq(irq); 700 int evtchn = evtchn_from_irq(irq);
695 701
696 if (!VALID_EVTCHN(evtchn)) 702 if (!VALID_EVTCHN(evtchn))
697 return; 703 return -1;
698 704
699 /* Send future instances of this interrupt to other vcpu. */ 705 /* Send future instances of this interrupt to other vcpu. */
700 bind_vcpu.port = evtchn; 706 bind_vcpu.port = evtchn;
@@ -707,13 +713,15 @@ static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
707 */ 713 */
708 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) 714 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
709 bind_evtchn_to_cpu(evtchn, tcpu); 715 bind_evtchn_to_cpu(evtchn, tcpu);
710}
711 716
717 return 0;
718}
712 719
713static void set_affinity_irq(unsigned irq, const struct cpumask *dest) 720static int set_affinity_irq(unsigned irq, const struct cpumask *dest)
714{ 721{
715 unsigned tcpu = cpumask_first(dest); 722 unsigned tcpu = cpumask_first(dest);
716 rebind_irq_to_cpu(irq, tcpu); 723
724 return rebind_irq_to_cpu(irq, tcpu);
717} 725}
718 726
719int resend_irq_on_evtchn(unsigned int irq) 727int resend_irq_on_evtchn(unsigned int irq)
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
new file mode 100644
index 000000000000..af031950f9b1
--- /dev/null
+++ b/drivers/xen/evtchn.c
@@ -0,0 +1,507 @@
1/******************************************************************************
2 * evtchn.c
3 *
4 * Driver for receiving and demuxing event-channel signals.
5 *
6 * Copyright (c) 2004-2005, K A Fraser
7 * Multi-process extensions Copyright (c) 2004, Steven Smith
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License version 2
11 * as published by the Free Software Foundation; or, when distributed
12 * separately from the Linux kernel or incorporated into other
13 * software packages, subject to the following license:
14 *
15 * Permission is hereby granted, free of charge, to any person obtaining a copy
16 * of this source file (the "Software"), to deal in the Software without
17 * restriction, including without limitation the rights to use, copy, modify,
18 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
19 * and to permit persons to whom the Software is furnished to do so, subject to
20 * the following conditions:
21 *
22 * The above copyright notice and this permission notice shall be included in
23 * all copies or substantial portions of the Software.
24 *
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
30 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
31 * IN THE SOFTWARE.
32 */
33
34#include <linux/module.h>
35#include <linux/kernel.h>
36#include <linux/sched.h>
37#include <linux/slab.h>
38#include <linux/string.h>
39#include <linux/errno.h>
40#include <linux/fs.h>
41#include <linux/errno.h>
42#include <linux/miscdevice.h>
43#include <linux/major.h>
44#include <linux/proc_fs.h>
45#include <linux/stat.h>
46#include <linux/poll.h>
47#include <linux/irq.h>
48#include <linux/init.h>
49#include <linux/gfp.h>
50#include <linux/mutex.h>
51#include <linux/cpu.h>
52#include <xen/events.h>
53#include <xen/evtchn.h>
54#include <asm/xen/hypervisor.h>
55
56struct per_user_data {
57 struct mutex bind_mutex; /* serialize bind/unbind operations */
58
59 /* Notification ring, accessed via /dev/xen/evtchn. */
60#define EVTCHN_RING_SIZE (PAGE_SIZE / sizeof(evtchn_port_t))
61#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1))
62 evtchn_port_t *ring;
63 unsigned int ring_cons, ring_prod, ring_overflow;
64 struct mutex ring_cons_mutex; /* protect against concurrent readers */
65
66 /* Processes wait on this queue when ring is empty. */
67 wait_queue_head_t evtchn_wait;
68 struct fasync_struct *evtchn_async_queue;
69 const char *name;
70};
71
72/* Who's bound to each port? */
73static struct per_user_data *port_user[NR_EVENT_CHANNELS];
74static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */
75
76irqreturn_t evtchn_interrupt(int irq, void *data)
77{
78 unsigned int port = (unsigned long)data;
79 struct per_user_data *u;
80
81 spin_lock(&port_user_lock);
82
83 u = port_user[port];
84
85 disable_irq_nosync(irq);
86
87 if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) {
88 u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port;
89 wmb(); /* Ensure ring contents visible */
90 if (u->ring_cons == u->ring_prod++) {
91 wake_up_interruptible(&u->evtchn_wait);
92 kill_fasync(&u->evtchn_async_queue,
93 SIGIO, POLL_IN);
94 }
95 } else {
96 u->ring_overflow = 1;
97 }
98
99 spin_unlock(&port_user_lock);
100
101 return IRQ_HANDLED;
102}
103
104static ssize_t evtchn_read(struct file *file, char __user *buf,
105 size_t count, loff_t *ppos)
106{
107 int rc;
108 unsigned int c, p, bytes1 = 0, bytes2 = 0;
109 struct per_user_data *u = file->private_data;
110
111 /* Whole number of ports. */
112 count &= ~(sizeof(evtchn_port_t)-1);
113
114 if (count == 0)
115 return 0;
116
117 if (count > PAGE_SIZE)
118 count = PAGE_SIZE;
119
120 for (;;) {
121 mutex_lock(&u->ring_cons_mutex);
122
123 rc = -EFBIG;
124 if (u->ring_overflow)
125 goto unlock_out;
126
127 c = u->ring_cons;
128 p = u->ring_prod;
129 if (c != p)
130 break;
131
132 mutex_unlock(&u->ring_cons_mutex);
133
134 if (file->f_flags & O_NONBLOCK)
135 return -EAGAIN;
136
137 rc = wait_event_interruptible(u->evtchn_wait,
138 u->ring_cons != u->ring_prod);
139 if (rc)
140 return rc;
141 }
142
143 /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */
144 if (((c ^ p) & EVTCHN_RING_SIZE) != 0) {
145 bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) *
146 sizeof(evtchn_port_t);
147 bytes2 = EVTCHN_RING_MASK(p) * sizeof(evtchn_port_t);
148 } else {
149 bytes1 = (p - c) * sizeof(evtchn_port_t);
150 bytes2 = 0;
151 }
152
153 /* Truncate chunks according to caller's maximum byte count. */
154 if (bytes1 > count) {
155 bytes1 = count;
156 bytes2 = 0;
157 } else if ((bytes1 + bytes2) > count) {
158 bytes2 = count - bytes1;
159 }
160
161 rc = -EFAULT;
162 rmb(); /* Ensure that we see the port before we copy it. */
163 if (copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) ||
164 ((bytes2 != 0) &&
165 copy_to_user(&buf[bytes1], &u->ring[0], bytes2)))
166 goto unlock_out;
167
168 u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t);
169 rc = bytes1 + bytes2;
170
171 unlock_out:
172 mutex_unlock(&u->ring_cons_mutex);
173 return rc;
174}
175
176static ssize_t evtchn_write(struct file *file, const char __user *buf,
177 size_t count, loff_t *ppos)
178{
179 int rc, i;
180 evtchn_port_t *kbuf = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
181 struct per_user_data *u = file->private_data;
182
183 if (kbuf == NULL)
184 return -ENOMEM;
185
186 /* Whole number of ports. */
187 count &= ~(sizeof(evtchn_port_t)-1);
188
189 rc = 0;
190 if (count == 0)
191 goto out;
192
193 if (count > PAGE_SIZE)
194 count = PAGE_SIZE;
195
196 rc = -EFAULT;
197 if (copy_from_user(kbuf, buf, count) != 0)
198 goto out;
199
200 spin_lock_irq(&port_user_lock);
201 for (i = 0; i < (count/sizeof(evtchn_port_t)); i++)
202 if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u))
203 enable_irq(irq_from_evtchn(kbuf[i]));
204 spin_unlock_irq(&port_user_lock);
205
206 rc = count;
207
208 out:
209 free_page((unsigned long)kbuf);
210 return rc;
211}
212
213static int evtchn_bind_to_user(struct per_user_data *u, int port)
214{
215 int rc = 0;
216
217 /*
218 * Ports are never reused, so every caller should pass in a
219 * unique port.
220 *
221 * (Locking not necessary because we haven't registered the
222 * interrupt handler yet, and our caller has already
223 * serialized bind operations.)
224 */
225 BUG_ON(port_user[port] != NULL);
226 port_user[port] = u;
227
228 rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED,
229 u->name, (void *)(unsigned long)port);
230 if (rc >= 0)
231 rc = 0;
232
233 return rc;
234}
235
236static void evtchn_unbind_from_user(struct per_user_data *u, int port)
237{
238 int irq = irq_from_evtchn(port);
239
240 unbind_from_irqhandler(irq, (void *)(unsigned long)port);
241
242 /* make sure we unbind the irq handler before clearing the port */
243 barrier();
244
245 port_user[port] = NULL;
246}
247
248static long evtchn_ioctl(struct file *file,
249 unsigned int cmd, unsigned long arg)
250{
251 int rc;
252 struct per_user_data *u = file->private_data;
253 void __user *uarg = (void __user *) arg;
254
255 /* Prevent bind from racing with unbind */
256 mutex_lock(&u->bind_mutex);
257
258 switch (cmd) {
259 case IOCTL_EVTCHN_BIND_VIRQ: {
260 struct ioctl_evtchn_bind_virq bind;
261 struct evtchn_bind_virq bind_virq;
262
263 rc = -EFAULT;
264 if (copy_from_user(&bind, uarg, sizeof(bind)))
265 break;
266
267 bind_virq.virq = bind.virq;
268 bind_virq.vcpu = 0;
269 rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
270 &bind_virq);
271 if (rc != 0)
272 break;
273
274 rc = evtchn_bind_to_user(u, bind_virq.port);
275 if (rc == 0)
276 rc = bind_virq.port;
277 break;
278 }
279
280 case IOCTL_EVTCHN_BIND_INTERDOMAIN: {
281 struct ioctl_evtchn_bind_interdomain bind;
282 struct evtchn_bind_interdomain bind_interdomain;
283
284 rc = -EFAULT;
285 if (copy_from_user(&bind, uarg, sizeof(bind)))
286 break;
287
288 bind_interdomain.remote_dom = bind.remote_domain;
289 bind_interdomain.remote_port = bind.remote_port;
290 rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
291 &bind_interdomain);
292 if (rc != 0)
293 break;
294
295 rc = evtchn_bind_to_user(u, bind_interdomain.local_port);
296 if (rc == 0)
297 rc = bind_interdomain.local_port;
298 break;
299 }
300
301 case IOCTL_EVTCHN_BIND_UNBOUND_PORT: {
302 struct ioctl_evtchn_bind_unbound_port bind;
303 struct evtchn_alloc_unbound alloc_unbound;
304
305 rc = -EFAULT;
306 if (copy_from_user(&bind, uarg, sizeof(bind)))
307 break;
308
309 alloc_unbound.dom = DOMID_SELF;
310 alloc_unbound.remote_dom = bind.remote_domain;
311 rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
312 &alloc_unbound);
313 if (rc != 0)
314 break;
315
316 rc = evtchn_bind_to_user(u, alloc_unbound.port);
317 if (rc == 0)
318 rc = alloc_unbound.port;
319 break;
320 }
321
322 case IOCTL_EVTCHN_UNBIND: {
323 struct ioctl_evtchn_unbind unbind;
324
325 rc = -EFAULT;
326 if (copy_from_user(&unbind, uarg, sizeof(unbind)))
327 break;
328
329 rc = -EINVAL;
330 if (unbind.port >= NR_EVENT_CHANNELS)
331 break;
332
333 spin_lock_irq(&port_user_lock);
334
335 rc = -ENOTCONN;
336 if (port_user[unbind.port] != u) {
337 spin_unlock_irq(&port_user_lock);
338 break;
339 }
340
341 evtchn_unbind_from_user(u, unbind.port);
342
343 spin_unlock_irq(&port_user_lock);
344
345 rc = 0;
346 break;
347 }
348
349 case IOCTL_EVTCHN_NOTIFY: {
350 struct ioctl_evtchn_notify notify;
351
352 rc = -EFAULT;
353 if (copy_from_user(&notify, uarg, sizeof(notify)))
354 break;
355
356 if (notify.port >= NR_EVENT_CHANNELS) {
357 rc = -EINVAL;
358 } else if (port_user[notify.port] != u) {
359 rc = -ENOTCONN;
360 } else {
361 notify_remote_via_evtchn(notify.port);
362 rc = 0;
363 }
364 break;
365 }
366
367 case IOCTL_EVTCHN_RESET: {
368 /* Initialise the ring to empty. Clear errors. */
369 mutex_lock(&u->ring_cons_mutex);
370 spin_lock_irq(&port_user_lock);
371 u->ring_cons = u->ring_prod = u->ring_overflow = 0;
372 spin_unlock_irq(&port_user_lock);
373 mutex_unlock(&u->ring_cons_mutex);
374 rc = 0;
375 break;
376 }
377
378 default:
379 rc = -ENOSYS;
380 break;
381 }
382 mutex_unlock(&u->bind_mutex);
383
384 return rc;
385}
386
387static unsigned int evtchn_poll(struct file *file, poll_table *wait)
388{
389 unsigned int mask = POLLOUT | POLLWRNORM;
390 struct per_user_data *u = file->private_data;
391
392 poll_wait(file, &u->evtchn_wait, wait);
393 if (u->ring_cons != u->ring_prod)
394 mask |= POLLIN | POLLRDNORM;
395 if (u->ring_overflow)
396 mask = POLLERR;
397 return mask;
398}
399
400static int evtchn_fasync(int fd, struct file *filp, int on)
401{
402 struct per_user_data *u = filp->private_data;
403 return fasync_helper(fd, filp, on, &u->evtchn_async_queue);
404}
405
406static int evtchn_open(struct inode *inode, struct file *filp)
407{
408 struct per_user_data *u;
409
410 u = kzalloc(sizeof(*u), GFP_KERNEL);
411 if (u == NULL)
412 return -ENOMEM;
413
414 u->name = kasprintf(GFP_KERNEL, "evtchn:%s", current->comm);
415 if (u->name == NULL) {
416 kfree(u);
417 return -ENOMEM;
418 }
419
420 init_waitqueue_head(&u->evtchn_wait);
421
422 u->ring = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
423 if (u->ring == NULL) {
424 kfree(u->name);
425 kfree(u);
426 return -ENOMEM;
427 }
428
429 mutex_init(&u->bind_mutex);
430 mutex_init(&u->ring_cons_mutex);
431
432 filp->private_data = u;
433
434 return 0;
435}
436
437static int evtchn_release(struct inode *inode, struct file *filp)
438{
439 int i;
440 struct per_user_data *u = filp->private_data;
441
442 spin_lock_irq(&port_user_lock);
443
444 free_page((unsigned long)u->ring);
445
446 for (i = 0; i < NR_EVENT_CHANNELS; i++) {
447 if (port_user[i] != u)
448 continue;
449
450 evtchn_unbind_from_user(port_user[i], i);
451 }
452
453 spin_unlock_irq(&port_user_lock);
454
455 kfree(u->name);
456 kfree(u);
457
458 return 0;
459}
460
461static const struct file_operations evtchn_fops = {
462 .owner = THIS_MODULE,
463 .read = evtchn_read,
464 .write = evtchn_write,
465 .unlocked_ioctl = evtchn_ioctl,
466 .poll = evtchn_poll,
467 .fasync = evtchn_fasync,
468 .open = evtchn_open,
469 .release = evtchn_release,
470};
471
472static struct miscdevice evtchn_miscdev = {
473 .minor = MISC_DYNAMIC_MINOR,
474 .name = "evtchn",
475 .fops = &evtchn_fops,
476};
477static int __init evtchn_init(void)
478{
479 int err;
480
481 if (!xen_domain())
482 return -ENODEV;
483
484 spin_lock_init(&port_user_lock);
485 memset(port_user, 0, sizeof(port_user));
486
487 /* Create '/dev/misc/evtchn'. */
488 err = misc_register(&evtchn_miscdev);
489 if (err != 0) {
490 printk(KERN_ALERT "Could not register /dev/misc/evtchn\n");
491 return err;
492 }
493
494 printk(KERN_INFO "Event-channel device installed.\n");
495
496 return 0;
497}
498
499static void __exit evtchn_cleanup(void)
500{
501 misc_deregister(&evtchn_miscdev);
502}
503
504module_init(evtchn_init);
505module_exit(evtchn_cleanup);
506
507MODULE_LICENSE("GPL");
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 4b5b84837ee1..fddc2025dece 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -98,9 +98,8 @@ static void do_suspend(void)
98 goto out; 98 goto out;
99 } 99 }
100 100
101 printk("suspending xenbus...\n"); 101 printk(KERN_DEBUG "suspending xenstore...\n");
102 /* XXX use normal device tree? */ 102 xs_suspend();
103 xenbus_suspend();
104 103
105 err = device_power_down(PMSG_SUSPEND); 104 err = device_power_down(PMSG_SUSPEND);
106 if (err) { 105 if (err) {
@@ -116,9 +115,9 @@ static void do_suspend(void)
116 115
117 if (!cancelled) { 116 if (!cancelled) {
118 xen_arch_resume(); 117 xen_arch_resume();
119 xenbus_resume(); 118 xs_resume();
120 } else 119 } else
121 xenbus_suspend_cancel(); 120 xs_suspend_cancel();
122 121
123 device_power_up(PMSG_RESUME); 122 device_power_up(PMSG_RESUME);
124 123
diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
new file mode 100644
index 000000000000..88a60e03ccf0
--- /dev/null
+++ b/drivers/xen/sys-hypervisor.c
@@ -0,0 +1,445 @@
1/*
2 * copyright (c) 2006 IBM Corporation
3 * Authored by: Mike D. Day <ncmike@us.ibm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/kernel.h>
11#include <linux/module.h>
12#include <linux/kobject.h>
13
14#include <asm/xen/hypervisor.h>
15#include <asm/xen/hypercall.h>
16
17#include <xen/xenbus.h>
18#include <xen/interface/xen.h>
19#include <xen/interface/version.h>
20
21#define HYPERVISOR_ATTR_RO(_name) \
22static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name)
23
24#define HYPERVISOR_ATTR_RW(_name) \
25static struct hyp_sysfs_attr _name##_attr = \
26 __ATTR(_name, 0644, _name##_show, _name##_store)
27
28struct hyp_sysfs_attr {
29 struct attribute attr;
30 ssize_t (*show)(struct hyp_sysfs_attr *, char *);
31 ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t);
32 void *hyp_attr_data;
33};
34
35static ssize_t type_show(struct hyp_sysfs_attr *attr, char *buffer)
36{
37 return sprintf(buffer, "xen\n");
38}
39
40HYPERVISOR_ATTR_RO(type);
41
42static int __init xen_sysfs_type_init(void)
43{
44 return sysfs_create_file(hypervisor_kobj, &type_attr.attr);
45}
46
47static void xen_sysfs_type_destroy(void)
48{
49 sysfs_remove_file(hypervisor_kobj, &type_attr.attr);
50}
51
52/* xen version attributes */
53static ssize_t major_show(struct hyp_sysfs_attr *attr, char *buffer)
54{
55 int version = HYPERVISOR_xen_version(XENVER_version, NULL);
56 if (version)
57 return sprintf(buffer, "%d\n", version >> 16);
58 return -ENODEV;
59}
60
61HYPERVISOR_ATTR_RO(major);
62
63static ssize_t minor_show(struct hyp_sysfs_attr *attr, char *buffer)
64{
65 int version = HYPERVISOR_xen_version(XENVER_version, NULL);
66 if (version)
67 return sprintf(buffer, "%d\n", version & 0xff);
68 return -ENODEV;
69}
70
71HYPERVISOR_ATTR_RO(minor);
72
73static ssize_t extra_show(struct hyp_sysfs_attr *attr, char *buffer)
74{
75 int ret = -ENOMEM;
76 char *extra;
77
78 extra = kmalloc(XEN_EXTRAVERSION_LEN, GFP_KERNEL);
79 if (extra) {
80 ret = HYPERVISOR_xen_version(XENVER_extraversion, extra);
81 if (!ret)
82 ret = sprintf(buffer, "%s\n", extra);
83 kfree(extra);
84 }
85
86 return ret;
87}
88
89HYPERVISOR_ATTR_RO(extra);
90
91static struct attribute *version_attrs[] = {
92 &major_attr.attr,
93 &minor_attr.attr,
94 &extra_attr.attr,
95 NULL
96};
97
98static struct attribute_group version_group = {
99 .name = "version",
100 .attrs = version_attrs,
101};
102
103static int __init xen_sysfs_version_init(void)
104{
105 return sysfs_create_group(hypervisor_kobj, &version_group);
106}
107
108static void xen_sysfs_version_destroy(void)
109{
110 sysfs_remove_group(hypervisor_kobj, &version_group);
111}
112
113/* UUID */
114
115static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer)
116{
117 char *vm, *val;
118 int ret;
119 extern int xenstored_ready;
120
121 if (!xenstored_ready)
122 return -EBUSY;
123
124 vm = xenbus_read(XBT_NIL, "vm", "", NULL);
125 if (IS_ERR(vm))
126 return PTR_ERR(vm);
127 val = xenbus_read(XBT_NIL, vm, "uuid", NULL);
128 kfree(vm);
129 if (IS_ERR(val))
130 return PTR_ERR(val);
131 ret = sprintf(buffer, "%s\n", val);
132 kfree(val);
133 return ret;
134}
135
136HYPERVISOR_ATTR_RO(uuid);
137
138static int __init xen_sysfs_uuid_init(void)
139{
140 return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr);
141}
142
143static void xen_sysfs_uuid_destroy(void)
144{
145 sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr);
146}
147
148/* xen compilation attributes */
149
150static ssize_t compiler_show(struct hyp_sysfs_attr *attr, char *buffer)
151{
152 int ret = -ENOMEM;
153 struct xen_compile_info *info;
154
155 info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
156 if (info) {
157 ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
158 if (!ret)
159 ret = sprintf(buffer, "%s\n", info->compiler);
160 kfree(info);
161 }
162
163 return ret;
164}
165
166HYPERVISOR_ATTR_RO(compiler);
167
168static ssize_t compiled_by_show(struct hyp_sysfs_attr *attr, char *buffer)
169{
170 int ret = -ENOMEM;
171 struct xen_compile_info *info;
172
173 info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
174 if (info) {
175 ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
176 if (!ret)
177 ret = sprintf(buffer, "%s\n", info->compile_by);
178 kfree(info);
179 }
180
181 return ret;
182}
183
184HYPERVISOR_ATTR_RO(compiled_by);
185
186static ssize_t compile_date_show(struct hyp_sysfs_attr *attr, char *buffer)
187{
188 int ret = -ENOMEM;
189 struct xen_compile_info *info;
190
191 info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
192 if (info) {
193 ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
194 if (!ret)
195 ret = sprintf(buffer, "%s\n", info->compile_date);
196 kfree(info);
197 }
198
199 return ret;
200}
201
202HYPERVISOR_ATTR_RO(compile_date);
203
204static struct attribute *xen_compile_attrs[] = {
205 &compiler_attr.attr,
206 &compiled_by_attr.attr,
207 &compile_date_attr.attr,
208 NULL
209};
210
211static struct attribute_group xen_compilation_group = {
212 .name = "compilation",
213 .attrs = xen_compile_attrs,
214};
215
216int __init static xen_compilation_init(void)
217{
218 return sysfs_create_group(hypervisor_kobj, &xen_compilation_group);
219}
220
221static void xen_compilation_destroy(void)
222{
223 sysfs_remove_group(hypervisor_kobj, &xen_compilation_group);
224}
225
226/* xen properties info */
227
228static ssize_t capabilities_show(struct hyp_sysfs_attr *attr, char *buffer)
229{
230 int ret = -ENOMEM;
231 char *caps;
232
233 caps = kmalloc(XEN_CAPABILITIES_INFO_LEN, GFP_KERNEL);
234 if (caps) {
235 ret = HYPERVISOR_xen_version(XENVER_capabilities, caps);
236 if (!ret)
237 ret = sprintf(buffer, "%s\n", caps);
238 kfree(caps);
239 }
240
241 return ret;
242}
243
244HYPERVISOR_ATTR_RO(capabilities);
245
246static ssize_t changeset_show(struct hyp_sysfs_attr *attr, char *buffer)
247{
248 int ret = -ENOMEM;
249 char *cset;
250
251 cset = kmalloc(XEN_CHANGESET_INFO_LEN, GFP_KERNEL);
252 if (cset) {
253 ret = HYPERVISOR_xen_version(XENVER_changeset, cset);
254 if (!ret)
255 ret = sprintf(buffer, "%s\n", cset);
256 kfree(cset);
257 }
258
259 return ret;
260}
261
262HYPERVISOR_ATTR_RO(changeset);
263
264static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer)
265{
266 int ret = -ENOMEM;
267 struct xen_platform_parameters *parms;
268
269 parms = kmalloc(sizeof(struct xen_platform_parameters), GFP_KERNEL);
270 if (parms) {
271 ret = HYPERVISOR_xen_version(XENVER_platform_parameters,
272 parms);
273 if (!ret)
274 ret = sprintf(buffer, "%lx\n", parms->virt_start);
275 kfree(parms);
276 }
277
278 return ret;
279}
280
281HYPERVISOR_ATTR_RO(virtual_start);
282
283static ssize_t pagesize_show(struct hyp_sysfs_attr *attr, char *buffer)
284{
285 int ret;
286
287 ret = HYPERVISOR_xen_version(XENVER_pagesize, NULL);
288 if (ret > 0)
289 ret = sprintf(buffer, "%x\n", ret);
290
291 return ret;
292}
293
294HYPERVISOR_ATTR_RO(pagesize);
295
296static ssize_t xen_feature_show(int index, char *buffer)
297{
298 ssize_t ret;
299 struct xen_feature_info info;
300
301 info.submap_idx = index;
302 ret = HYPERVISOR_xen_version(XENVER_get_features, &info);
303 if (!ret)
304 ret = sprintf(buffer, "%08x", info.submap);
305
306 return ret;
307}
308
309static ssize_t features_show(struct hyp_sysfs_attr *attr, char *buffer)
310{
311 ssize_t len;
312 int i;
313
314 len = 0;
315 for (i = XENFEAT_NR_SUBMAPS-1; i >= 0; i--) {
316 int ret = xen_feature_show(i, buffer + len);
317 if (ret < 0) {
318 if (len == 0)
319 len = ret;
320 break;
321 }
322 len += ret;
323 }
324 if (len > 0)
325 buffer[len++] = '\n';
326
327 return len;
328}
329
330HYPERVISOR_ATTR_RO(features);
331
332static struct attribute *xen_properties_attrs[] = {
333 &capabilities_attr.attr,
334 &changeset_attr.attr,
335 &virtual_start_attr.attr,
336 &pagesize_attr.attr,
337 &features_attr.attr,
338 NULL
339};
340
341static struct attribute_group xen_properties_group = {
342 .name = "properties",
343 .attrs = xen_properties_attrs,
344};
345
346static int __init xen_properties_init(void)
347{
348 return sysfs_create_group(hypervisor_kobj, &xen_properties_group);
349}
350
351static void xen_properties_destroy(void)
352{
353 sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
354}
355
356static int __init hyper_sysfs_init(void)
357{
358 int ret;
359
360 if (!xen_domain())
361 return -ENODEV;
362
363 ret = xen_sysfs_type_init();
364 if (ret)
365 goto out;
366 ret = xen_sysfs_version_init();
367 if (ret)
368 goto version_out;
369 ret = xen_compilation_init();
370 if (ret)
371 goto comp_out;
372 ret = xen_sysfs_uuid_init();
373 if (ret)
374 goto uuid_out;
375 ret = xen_properties_init();
376 if (ret)
377 goto prop_out;
378
379 goto out;
380
381prop_out:
382 xen_sysfs_uuid_destroy();
383uuid_out:
384 xen_compilation_destroy();
385comp_out:
386 xen_sysfs_version_destroy();
387version_out:
388 xen_sysfs_type_destroy();
389out:
390 return ret;
391}
392
393static void __exit hyper_sysfs_exit(void)
394{
395 xen_properties_destroy();
396 xen_compilation_destroy();
397 xen_sysfs_uuid_destroy();
398 xen_sysfs_version_destroy();
399 xen_sysfs_type_destroy();
400
401}
402module_init(hyper_sysfs_init);
403module_exit(hyper_sysfs_exit);
404
405static ssize_t hyp_sysfs_show(struct kobject *kobj,
406 struct attribute *attr,
407 char *buffer)
408{
409 struct hyp_sysfs_attr *hyp_attr;
410 hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
411 if (hyp_attr->show)
412 return hyp_attr->show(hyp_attr, buffer);
413 return 0;
414}
415
416static ssize_t hyp_sysfs_store(struct kobject *kobj,
417 struct attribute *attr,
418 const char *buffer,
419 size_t len)
420{
421 struct hyp_sysfs_attr *hyp_attr;
422 hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
423 if (hyp_attr->store)
424 return hyp_attr->store(hyp_attr, buffer, len);
425 return 0;
426}
427
428static struct sysfs_ops hyp_sysfs_ops = {
429 .show = hyp_sysfs_show,
430 .store = hyp_sysfs_store,
431};
432
433static struct kobj_type hyp_sysfs_kobj_type = {
434 .sysfs_ops = &hyp_sysfs_ops,
435};
436
437static int __init hypervisor_subsys_init(void)
438{
439 if (!xen_domain())
440 return -ENODEV;
441
442 hypervisor_kobj->ktype = &hyp_sysfs_kobj_type;
443 return 0;
444}
445device_initcall(hypervisor_subsys_init);
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index 773d1cf23283..d42e25d5968d 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -71,6 +71,9 @@ static int xenbus_probe_frontend(const char *type, const char *name);
71 71
72static void xenbus_dev_shutdown(struct device *_dev); 72static void xenbus_dev_shutdown(struct device *_dev);
73 73
74static int xenbus_dev_suspend(struct device *dev, pm_message_t state);
75static int xenbus_dev_resume(struct device *dev);
76
74/* If something in array of ids matches this device, return it. */ 77/* If something in array of ids matches this device, return it. */
75static const struct xenbus_device_id * 78static const struct xenbus_device_id *
76match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev) 79match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
@@ -188,6 +191,9 @@ static struct xen_bus_type xenbus_frontend = {
188 .remove = xenbus_dev_remove, 191 .remove = xenbus_dev_remove,
189 .shutdown = xenbus_dev_shutdown, 192 .shutdown = xenbus_dev_shutdown,
190 .dev_attrs = xenbus_dev_attrs, 193 .dev_attrs = xenbus_dev_attrs,
194
195 .suspend = xenbus_dev_suspend,
196 .resume = xenbus_dev_resume,
191 }, 197 },
192}; 198};
193 199
@@ -654,6 +660,7 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus)
654 660
655 kfree(root); 661 kfree(root);
656} 662}
663EXPORT_SYMBOL_GPL(xenbus_dev_changed);
657 664
658static void frontend_changed(struct xenbus_watch *watch, 665static void frontend_changed(struct xenbus_watch *watch,
659 const char **vec, unsigned int len) 666 const char **vec, unsigned int len)
@@ -669,7 +676,7 @@ static struct xenbus_watch fe_watch = {
669 .callback = frontend_changed, 676 .callback = frontend_changed,
670}; 677};
671 678
672static int suspend_dev(struct device *dev, void *data) 679static int xenbus_dev_suspend(struct device *dev, pm_message_t state)
673{ 680{
674 int err = 0; 681 int err = 0;
675 struct xenbus_driver *drv; 682 struct xenbus_driver *drv;
@@ -682,35 +689,14 @@ static int suspend_dev(struct device *dev, void *data)
682 drv = to_xenbus_driver(dev->driver); 689 drv = to_xenbus_driver(dev->driver);
683 xdev = container_of(dev, struct xenbus_device, dev); 690 xdev = container_of(dev, struct xenbus_device, dev);
684 if (drv->suspend) 691 if (drv->suspend)
685 err = drv->suspend(xdev); 692 err = drv->suspend(xdev, state);
686 if (err) 693 if (err)
687 printk(KERN_WARNING 694 printk(KERN_WARNING
688 "xenbus: suspend %s failed: %i\n", dev_name(dev), err); 695 "xenbus: suspend %s failed: %i\n", dev_name(dev), err);
689 return 0; 696 return 0;
690} 697}
691 698
692static int suspend_cancel_dev(struct device *dev, void *data) 699static int xenbus_dev_resume(struct device *dev)
693{
694 int err = 0;
695 struct xenbus_driver *drv;
696 struct xenbus_device *xdev;
697
698 DPRINTK("");
699
700 if (dev->driver == NULL)
701 return 0;
702 drv = to_xenbus_driver(dev->driver);
703 xdev = container_of(dev, struct xenbus_device, dev);
704 if (drv->suspend_cancel)
705 err = drv->suspend_cancel(xdev);
706 if (err)
707 printk(KERN_WARNING
708 "xenbus: suspend_cancel %s failed: %i\n",
709 dev_name(dev), err);
710 return 0;
711}
712
713static int resume_dev(struct device *dev, void *data)
714{ 700{
715 int err; 701 int err;
716 struct xenbus_driver *drv; 702 struct xenbus_driver *drv;
@@ -755,33 +741,6 @@ static int resume_dev(struct device *dev, void *data)
755 return 0; 741 return 0;
756} 742}
757 743
758void xenbus_suspend(void)
759{
760 DPRINTK("");
761
762 bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev);
763 xenbus_backend_suspend(suspend_dev);
764 xs_suspend();
765}
766EXPORT_SYMBOL_GPL(xenbus_suspend);
767
768void xenbus_resume(void)
769{
770 xb_init_comms();
771 xs_resume();
772 bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev);
773 xenbus_backend_resume(resume_dev);
774}
775EXPORT_SYMBOL_GPL(xenbus_resume);
776
777void xenbus_suspend_cancel(void)
778{
779 xs_suspend_cancel();
780 bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_cancel_dev);
781 xenbus_backend_resume(suspend_cancel_dev);
782}
783EXPORT_SYMBOL_GPL(xenbus_suspend_cancel);
784
785/* A flag to determine if xenstored is 'ready' (i.e. has started) */ 744/* A flag to determine if xenstored is 'ready' (i.e. has started) */
786int xenstored_ready = 0; 745int xenstored_ready = 0;
787 746
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
index e325eab4724d..eab33f1dbdf7 100644
--- a/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -673,6 +673,8 @@ void xs_resume(void)
673 struct xenbus_watch *watch; 673 struct xenbus_watch *watch;
674 char token[sizeof(watch) * 2 + 1]; 674 char token[sizeof(watch) * 2 + 1];
675 675
676 xb_init_comms();
677
676 mutex_unlock(&xs_state.response_mutex); 678 mutex_unlock(&xs_state.response_mutex);
677 mutex_unlock(&xs_state.request_mutex); 679 mutex_unlock(&xs_state.request_mutex);
678 up_write(&xs_state.transaction_mutex); 680 up_write(&xs_state.transaction_mutex);
diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c
index 515741a8e6b8..6559e0c752ce 100644
--- a/drivers/xen/xenfs/super.c
+++ b/drivers/xen/xenfs/super.c
@@ -20,10 +20,27 @@
20MODULE_DESCRIPTION("Xen filesystem"); 20MODULE_DESCRIPTION("Xen filesystem");
21MODULE_LICENSE("GPL"); 21MODULE_LICENSE("GPL");
22 22
23static ssize_t capabilities_read(struct file *file, char __user *buf,
24 size_t size, loff_t *off)
25{
26 char *tmp = "";
27
28 if (xen_initial_domain())
29 tmp = "control_d\n";
30
31 return simple_read_from_buffer(buf, size, off, tmp, strlen(tmp));
32}
33
34static const struct file_operations capabilities_file_ops = {
35 .read = capabilities_read,
36};
37
23static int xenfs_fill_super(struct super_block *sb, void *data, int silent) 38static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
24{ 39{
25 static struct tree_descr xenfs_files[] = { 40 static struct tree_descr xenfs_files[] = {
26 [2] = {"xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR}, 41 [1] = {},
42 { "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR },
43 { "capabilities", &capabilities_file_ops, S_IRUGO },
27 {""}, 44 {""},
28 }; 45 };
29 46
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index eeb246845909..2341375386f8 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -297,20 +297,14 @@ static int validate_request(struct autofs_wait_queue **wait,
297 */ 297 */
298 if (notify == NFY_MOUNT) { 298 if (notify == NFY_MOUNT) {
299 /* 299 /*
300 * If the dentry isn't hashed just go ahead and try the 300 * If the dentry was successfully mounted while we slept
301 * mount again with a new wait (not much else we can do). 301 * on the wait queue mutex we can return success. If it
302 */ 302 * isn't mounted (doesn't have submounts for the case of
303 if (!d_unhashed(dentry)) { 303 * a multi-mount with no mount at its base) we can
304 /* 304 * continue on and create a new request.
305 * But if the dentry is hashed, that means that we 305 */
306 * got here through the revalidate path. Thus, we 306 if (have_submounts(dentry))
307 * need to check if the dentry has been mounted 307 return 0;
308 * while we waited on the wq_mutex. If it has,
309 * simply return success.
310 */
311 if (d_mountpoint(dentry))
312 return 0;
313 }
314 } 308 }
315 309
316 return 1; 310 return 1;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 06560c520f49..618e21c0b7a3 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -241,7 +241,7 @@ write_out_data:
241 spin_lock(&journal->j_list_lock); 241 spin_lock(&journal->j_list_lock);
242 } 242 }
243 /* Someone already cleaned up the buffer? */ 243 /* Someone already cleaned up the buffer? */
244 if (!buffer_jbd(bh) 244 if (!buffer_jbd(bh) || bh2jh(bh) != jh
245 || jh->b_transaction != commit_transaction 245 || jh->b_transaction != commit_transaction
246 || jh->b_jlist != BJ_SyncData) { 246 || jh->b_jlist != BJ_SyncData) {
247 jbd_unlock_bh_state(bh); 247 jbd_unlock_bh_state(bh);
@@ -478,7 +478,9 @@ void journal_commit_transaction(journal_t *journal)
478 spin_lock(&journal->j_list_lock); 478 spin_lock(&journal->j_list_lock);
479 continue; 479 continue;
480 } 480 }
481 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { 481 if (buffer_jbd(bh) && bh2jh(bh) == jh &&
482 jh->b_transaction == commit_transaction &&
483 jh->b_jlist == BJ_Locked) {
482 __journal_unfile_buffer(jh); 484 __journal_unfile_buffer(jh);
483 jbd_unlock_bh_state(bh); 485 jbd_unlock_bh_state(bh);
484 journal_remove_journal_head(bh); 486 journal_remove_journal_head(bh);
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 9bca39cf99ee..1afa4dd4cae2 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -12,20 +12,14 @@
12 12
13static int loadavg_proc_show(struct seq_file *m, void *v) 13static int loadavg_proc_show(struct seq_file *m, void *v)
14{ 14{
15 int a, b, c; 15 unsigned long avnrun[3];
16 unsigned long seq;
17 16
18 do { 17 get_avenrun(avnrun, FIXED_1/200, 0);
19 seq = read_seqbegin(&xtime_lock);
20 a = avenrun[0] + (FIXED_1/200);
21 b = avenrun[1] + (FIXED_1/200);
22 c = avenrun[2] + (FIXED_1/200);
23 } while (read_seqretry(&xtime_lock, seq));
24 18
25 seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n", 19 seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
26 LOAD_INT(a), LOAD_FRAC(a), 20 LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
27 LOAD_INT(b), LOAD_FRAC(b), 21 LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
28 LOAD_INT(c), LOAD_FRAC(c), 22 LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
29 nr_running(), nr_threads, 23 nr_running(), nr_threads,
30 task_active_pid_ns(current)->last_pid); 24 task_active_pid_ns(current)->last_pid);
31 return 0; 25 return 0;
diff --git a/include/Kbuild b/include/Kbuild
index d8c3e3cbf416..fe36accd4328 100644
--- a/include/Kbuild
+++ b/include/Kbuild
@@ -8,3 +8,4 @@ header-y += mtd/
8header-y += rdma/ 8header-y += rdma/
9header-y += video/ 9header-y += video/
10header-y += drm/ 10header-y += drm/
11header-y += xen/
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 8e6d0ca70aba..e410f602cab1 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -280,17 +280,18 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm,
280#endif 280#endif
281 281
282/* 282/*
283 * A facility to provide batching of the reload of page tables with the 283 * A facility to provide batching of the reload of page tables and
284 * actual context switch code for paravirtualized guests. By convention, 284 * other process state with the actual context switch code for
285 * only one of the lazy modes (CPU, MMU) should be active at any given 285 * paravirtualized guests. By convention, only one of the batched
286 * time, entry should never be nested, and entry and exits should always 286 * update (lazy) modes (CPU, MMU) should be active at any given time,
287 * be paired. This is for sanity of maintaining and reasoning about the 287 * entry should never be nested, and entry and exits should always be
288 * kernel code. 288 * paired. This is for sanity of maintaining and reasoning about the
289 * kernel code. In this case, the exit (end of the context switch) is
290 * in architecture-specific code, and so doesn't need a generic
291 * definition.
289 */ 292 */
290#ifndef __HAVE_ARCH_ENTER_LAZY_CPU_MODE 293#ifndef __HAVE_ARCH_START_CONTEXT_SWITCH
291#define arch_enter_lazy_cpu_mode() do {} while (0) 294#define arch_start_context_switch(prev) do {} while (0)
292#define arch_leave_lazy_cpu_mode() do {} while (0)
293#define arch_flush_lazy_cpu_mode() do {} while (0)
294#endif 295#endif
295 296
296#ifndef __HAVE_PFNMAP_TRACKING 297#ifndef __HAVE_PFNMAP_TRACKING
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 88be890ee3c7..51b4b0a5ce8c 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -119,7 +119,7 @@ extern int pci_mmcfg_config_num;
119extern int sbf_port; 119extern int sbf_port;
120extern unsigned long acpi_realmode_flags; 120extern unsigned long acpi_realmode_flags;
121 121
122int acpi_register_gsi (u32 gsi, int triggering, int polarity); 122int acpi_register_gsi (struct device *dev, u32 gsi, int triggering, int polarity);
123int acpi_gsi_to_irq (u32 gsi, unsigned int *irq); 123int acpi_gsi_to_irq (u32 gsi, unsigned int *irq);
124 124
125#ifdef CONFIG_X86_IO_APIC 125#ifdef CONFIG_X86_IO_APIC
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 9f315382610b..c5ac87ca7bc6 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -1022,6 +1022,8 @@ typedef struct cpumask *cpumask_var_t;
1022 1022
1023bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); 1023bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);
1024bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags); 1024bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags);
1025bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);
1026bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags);
1025void alloc_bootmem_cpumask_var(cpumask_var_t *mask); 1027void alloc_bootmem_cpumask_var(cpumask_var_t *mask);
1026void free_cpumask_var(cpumask_var_t mask); 1028void free_cpumask_var(cpumask_var_t mask);
1027void free_bootmem_cpumask_var(cpumask_var_t mask); 1029void free_bootmem_cpumask_var(cpumask_var_t mask);
@@ -1040,6 +1042,19 @@ static inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags,
1040 return true; 1042 return true;
1041} 1043}
1042 1044
1045static inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
1046{
1047 cpumask_clear(*mask);
1048 return true;
1049}
1050
1051static inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags,
1052 int node)
1053{
1054 cpumask_clear(*mask);
1055 return true;
1056}
1057
1043static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) 1058static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask)
1044{ 1059{
1045} 1060}
diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h
index 28d53cb7b5a2..171ad8aedc83 100644
--- a/include/linux/dma-debug.h
+++ b/include/linux/dma-debug.h
@@ -32,6 +32,8 @@ extern void dma_debug_add_bus(struct bus_type *bus);
32 32
33extern void dma_debug_init(u32 num_entries); 33extern void dma_debug_init(u32 num_entries);
34 34
35extern int dma_debug_resize_entries(u32 num_entries);
36
35extern void debug_dma_map_page(struct device *dev, struct page *page, 37extern void debug_dma_map_page(struct device *dev, struct page *page,
36 size_t offset, size_t size, 38 size_t offset, size_t size,
37 int direction, dma_addr_t dma_addr, 39 int direction, dma_addr_t dma_addr,
@@ -91,6 +93,11 @@ static inline void dma_debug_init(u32 num_entries)
91{ 93{
92} 94}
93 95
96static inline int dma_debug_resize_entries(u32 num_entries)
97{
98 return 0;
99}
100
94static inline void debug_dma_map_page(struct device *dev, struct page *page, 101static inline void debug_dma_map_page(struct device *dev, struct page *page,
95 size_t offset, size_t size, 102 size_t offset, size_t size,
96 int direction, dma_addr_t dma_addr, 103 int direction, dma_addr_t dma_addr,
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index e397dc342cda..10ff5c498824 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -108,6 +108,7 @@ struct irte {
108}; 108};
109#ifdef CONFIG_INTR_REMAP 109#ifdef CONFIG_INTR_REMAP
110extern int intr_remapping_enabled; 110extern int intr_remapping_enabled;
111extern int intr_remapping_supported(void);
111extern int enable_intr_remapping(int); 112extern int enable_intr_remapping(int);
112extern void disable_intr_remapping(void); 113extern void disable_intr_remapping(void);
113extern int reenable_intr_remapping(int); 114extern int reenable_intr_remapping(int);
@@ -157,6 +158,8 @@ static inline struct intel_iommu *map_ioapic_to_ir(int apic)
157} 158}
158#define irq_remapped(irq) (0) 159#define irq_remapped(irq) (0)
159#define enable_intr_remapping(mode) (-1) 160#define enable_intr_remapping(mode) (-1)
161#define disable_intr_remapping() (0)
162#define reenable_intr_remapping(mode) (0)
160#define intr_remapping_enabled (0) 163#define intr_remapping_enabled (0)
161#endif 164#endif
162 165
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 3bf5bb5a34f9..34956c8fdebf 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -23,6 +23,8 @@ union ktime;
23#define FUTEX_TRYLOCK_PI 8 23#define FUTEX_TRYLOCK_PI 8
24#define FUTEX_WAIT_BITSET 9 24#define FUTEX_WAIT_BITSET 9
25#define FUTEX_WAKE_BITSET 10 25#define FUTEX_WAKE_BITSET 10
26#define FUTEX_WAIT_REQUEUE_PI 11
27#define FUTEX_CMP_REQUEUE_PI 12
26 28
27#define FUTEX_PRIVATE_FLAG 128 29#define FUTEX_PRIVATE_FLAG 128
28#define FUTEX_CLOCK_REALTIME 256 30#define FUTEX_CLOCK_REALTIME 256
@@ -38,6 +40,10 @@ union ktime;
38#define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG) 40#define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG)
/*
 * Process-private variants of the bitset wait/wake operations: the base
 * opcode OR'ed with FUTEX_PRIVATE_FLAG.  The base opcodes are
 * FUTEX_WAIT_BITSET and FUTEX_WAKE_BITSET; no FUTEX_WAIT_BITS or
 * FUTEX_WAKE_BITS is defined.
 */
#define FUTEX_WAIT_BITSET_PRIVATE	(FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG)
#define FUTEX_WAKE_BITSET_PRIVATE	(FUTEX_WAKE_BITSET | FUTEX_PRIVATE_FLAG)
43#define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \
44 FUTEX_PRIVATE_FLAG)
45#define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \
46 FUTEX_PRIVATE_FLAG)
41 47
42/* 48/*
43 * Support for robust futexes: the kernel cleans up held futexes at 49 * Support for robust futexes: the kernel cleans up held futexes at
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 91bb76f44f14..ff374ceface0 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -566,6 +566,6 @@ struct irq_desc;
566extern int early_irq_init(void); 566extern int early_irq_init(void);
567extern int arch_probe_nr_irqs(void); 567extern int arch_probe_nr_irqs(void);
568extern int arch_early_irq_init(void); 568extern int arch_early_irq_init(void);
569extern int arch_init_chip_data(struct irq_desc *desc, int cpu); 569extern int arch_init_chip_data(struct irq_desc *desc, int node);
570 570
571#endif 571#endif
diff --git a/include/linux/irq.h b/include/linux/irq.h
index b7cbeed972e4..eedbb8e5e0cc 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -117,7 +117,7 @@ struct irq_chip {
117 void (*eoi)(unsigned int irq); 117 void (*eoi)(unsigned int irq);
118 118
119 void (*end)(unsigned int irq); 119 void (*end)(unsigned int irq);
120 void (*set_affinity)(unsigned int irq, 120 int (*set_affinity)(unsigned int irq,
121 const struct cpumask *dest); 121 const struct cpumask *dest);
122 int (*retrigger)(unsigned int irq); 122 int (*retrigger)(unsigned int irq);
123 int (*set_type)(unsigned int irq, unsigned int flow_type); 123 int (*set_type)(unsigned int irq, unsigned int flow_type);
@@ -187,7 +187,7 @@ struct irq_desc {
187 spinlock_t lock; 187 spinlock_t lock;
188#ifdef CONFIG_SMP 188#ifdef CONFIG_SMP
189 cpumask_var_t affinity; 189 cpumask_var_t affinity;
190 unsigned int cpu; 190 unsigned int node;
191#ifdef CONFIG_GENERIC_PENDING_IRQ 191#ifdef CONFIG_GENERIC_PENDING_IRQ
192 cpumask_var_t pending_mask; 192 cpumask_var_t pending_mask;
193#endif 193#endif
@@ -201,26 +201,23 @@ struct irq_desc {
201} ____cacheline_internodealigned_in_smp; 201} ____cacheline_internodealigned_in_smp;
202 202
203extern void arch_init_copy_chip_data(struct irq_desc *old_desc, 203extern void arch_init_copy_chip_data(struct irq_desc *old_desc,
204 struct irq_desc *desc, int cpu); 204 struct irq_desc *desc, int node);
205extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc); 205extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc);
206 206
207#ifndef CONFIG_SPARSE_IRQ 207#ifndef CONFIG_SPARSE_IRQ
208extern struct irq_desc irq_desc[NR_IRQS]; 208extern struct irq_desc irq_desc[NR_IRQS];
209#else /* CONFIG_SPARSE_IRQ */ 209#endif
210extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
211#endif /* CONFIG_SPARSE_IRQ */
212
213extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
214 210
215static inline struct irq_desc * 211#ifdef CONFIG_NUMA_IRQ_DESC
216irq_remap_to_desc(unsigned int irq, struct irq_desc *desc) 212extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int node);
217{
218#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
219 return irq_to_desc(irq);
220#else 213#else
214static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
215{
221 return desc; 216 return desc;
222#endif
223} 217}
218#endif
219
220extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node);
224 221
225/* 222/*
226 * Migration helpers for obsolete names, they will go away: 223 * Migration helpers for obsolete names, they will go away:
@@ -386,7 +383,7 @@ extern void set_irq_noprobe(unsigned int irq);
386extern void set_irq_probe(unsigned int irq); 383extern void set_irq_probe(unsigned int irq);
387 384
388/* Handle dynamic irq creation and destruction */ 385/* Handle dynamic irq creation and destruction */
389extern unsigned int create_irq_nr(unsigned int irq_want); 386extern unsigned int create_irq_nr(unsigned int irq_want, int node);
390extern int create_irq(void); 387extern int create_irq(void);
391extern void destroy_irq(unsigned int irq); 388extern void destroy_irq(unsigned int irq);
392 389
@@ -424,47 +421,48 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
424 421
425#ifdef CONFIG_SMP 422#ifdef CONFIG_SMP
426/** 423/**
427 * init_alloc_desc_masks - allocate cpumasks for irq_desc 424 * alloc_desc_masks - allocate cpumasks for irq_desc
428 * @desc: pointer to irq_desc struct 425 * @desc: pointer to irq_desc struct
429 * @cpu: cpu which will be handling the cpumasks 426 * @cpu: cpu which will be handling the cpumasks
430 * @boot: true if need bootmem 427 * @boot: true if need bootmem
431 * 428 *
432 * Allocates affinity and pending_mask cpumask if required. 429 * Allocates affinity and pending_mask cpumask if required.
433 * Returns true if successful (or not required). 430 * Returns true if successful (or not required).
434 * Side effect: affinity has all bits set, pending_mask has all bits clear.
435 */ 431 */
436static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu, 432static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
437 bool boot) 433 bool boot)
438{ 434{
439 int node; 435#ifdef CONFIG_CPUMASK_OFFSTACK
440
441 if (boot) { 436 if (boot) {
442 alloc_bootmem_cpumask_var(&desc->affinity); 437 alloc_bootmem_cpumask_var(&desc->affinity);
443 cpumask_setall(desc->affinity);
444 438
445#ifdef CONFIG_GENERIC_PENDING_IRQ 439#ifdef CONFIG_GENERIC_PENDING_IRQ
446 alloc_bootmem_cpumask_var(&desc->pending_mask); 440 alloc_bootmem_cpumask_var(&desc->pending_mask);
447 cpumask_clear(desc->pending_mask);
448#endif 441#endif
449 return true; 442 return true;
450 } 443 }
451 444
452 node = cpu_to_node(cpu);
453
454 if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node)) 445 if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
455 return false; 446 return false;
456 cpumask_setall(desc->affinity);
457 447
458#ifdef CONFIG_GENERIC_PENDING_IRQ 448#ifdef CONFIG_GENERIC_PENDING_IRQ
459 if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) { 449 if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) {
460 free_cpumask_var(desc->affinity); 450 free_cpumask_var(desc->affinity);
461 return false; 451 return false;
462 } 452 }
463 cpumask_clear(desc->pending_mask); 453#endif
464#endif 454#endif
465 return true; 455 return true;
466} 456}
467 457
458static inline void init_desc_masks(struct irq_desc *desc)
459{
460 cpumask_setall(desc->affinity);
461#ifdef CONFIG_GENERIC_PENDING_IRQ
462 cpumask_clear(desc->pending_mask);
463#endif
464}
465
468/** 466/**
469 * init_copy_desc_masks - copy cpumasks for irq_desc 467 * init_copy_desc_masks - copy cpumasks for irq_desc
470 * @old_desc: pointer to old irq_desc struct 468 * @old_desc: pointer to old irq_desc struct
@@ -478,7 +476,7 @@ static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
478static inline void init_copy_desc_masks(struct irq_desc *old_desc, 476static inline void init_copy_desc_masks(struct irq_desc *old_desc,
479 struct irq_desc *new_desc) 477 struct irq_desc *new_desc)
480{ 478{
481#ifdef CONFIG_CPUMASKS_OFFSTACK 479#ifdef CONFIG_CPUMASK_OFFSTACK
482 cpumask_copy(new_desc->affinity, old_desc->affinity); 480 cpumask_copy(new_desc->affinity, old_desc->affinity);
483 481
484#ifdef CONFIG_GENERIC_PENDING_IRQ 482#ifdef CONFIG_GENERIC_PENDING_IRQ
@@ -499,12 +497,16 @@ static inline void free_desc_masks(struct irq_desc *old_desc,
499 497
500#else /* !CONFIG_SMP */ 498#else /* !CONFIG_SMP */
501 499
502static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu, 500static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
503 bool boot) 501 bool boot)
504{ 502{
505 return true; 503 return true;
506} 504}
507 505
506static inline void init_desc_masks(struct irq_desc *desc)
507{
508}
509
508static inline void init_copy_desc_masks(struct irq_desc *old_desc, 510static inline void init_copy_desc_masks(struct irq_desc *old_desc,
509 struct irq_desc *new_desc) 511 struct irq_desc *new_desc)
510{ 512{
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0c21af6abffb..5528ff32512e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1029,8 +1029,6 @@ extern void add_active_range(unsigned int nid, unsigned long start_pfn,
1029 unsigned long end_pfn); 1029 unsigned long end_pfn);
1030extern void remove_active_range(unsigned int nid, unsigned long start_pfn, 1030extern void remove_active_range(unsigned int nid, unsigned long start_pfn,
1031 unsigned long end_pfn); 1031 unsigned long end_pfn);
1032extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn,
1033 unsigned long end_pfn);
1034extern void remove_all_active_ranges(void); 1032extern void remove_all_active_ranges(void);
1035extern unsigned long absent_pages_in_range(unsigned long start_pfn, 1033extern unsigned long absent_pages_in_range(unsigned long start_pfn,
1036 unsigned long end_pfn); 1034 unsigned long end_pfn);
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 3069ec7e0ab8..878cab4f5fcc 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -150,5 +150,6 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
150 */ 150 */
151extern int mutex_trylock(struct mutex *lock); 151extern int mutex_trylock(struct mutex *lock);
152extern void mutex_unlock(struct mutex *lock); 152extern void mutex_unlock(struct mutex *lock);
153extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
153 154
154#endif 155#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5932ace22400..ff687281f233 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -116,6 +116,7 @@ struct fs_struct;
116 * 11 bit fractions. 116 * 11 bit fractions.
117 */ 117 */
118extern unsigned long avenrun[]; /* Load averages */ 118extern unsigned long avenrun[]; /* Load averages */
119extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
119 120
120#define FSHIFT 11 /* nr of bits of precision */ 121#define FSHIFT 11 /* nr of bits of precision */
121#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ 122#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
@@ -135,8 +136,8 @@ DECLARE_PER_CPU(unsigned long, process_counts);
135extern int nr_processes(void); 136extern int nr_processes(void);
136extern unsigned long nr_running(void); 137extern unsigned long nr_running(void);
137extern unsigned long nr_uninterruptible(void); 138extern unsigned long nr_uninterruptible(void);
138extern unsigned long nr_active(void);
139extern unsigned long nr_iowait(void); 139extern unsigned long nr_iowait(void);
140extern void calc_global_load(void);
140 141
141extern unsigned long get_parent_ip(unsigned long addr); 142extern unsigned long get_parent_ip(unsigned long addr);
142 143
@@ -838,7 +839,17 @@ struct sched_group {
838 */ 839 */
839 u32 reciprocal_cpu_power; 840 u32 reciprocal_cpu_power;
840 841
841 unsigned long cpumask[]; 842 /*
843 * The CPUs this group covers.
844 *
845 * NOTE: this field is variable length. (Allocated dynamically
846 * by attaching extra space to the end of the structure,
847 * depending on how many CPUs the kernel has booted up with)
848 *
849 * It is also embedded into static data structures at build
850 * time. (See 'struct static_sched_group' in kernel/sched.c)
851 */
852 unsigned long cpumask[0];
842}; 853};
843 854
844static inline struct cpumask *sched_group_cpus(struct sched_group *sg) 855static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
@@ -924,8 +935,17 @@ struct sched_domain {
924 char *name; 935 char *name;
925#endif 936#endif
926 937
927 /* span of all CPUs in this domain */ 938 /*
928 unsigned long span[]; 939 * Span of all CPUs in this domain.
940 *
941 * NOTE: this field is variable length. (Allocated dynamically
942 * by attaching extra space to the end of the structure,
943 * depending on how many CPUs the kernel has booted up with)
944 *
945 * It is also embedded into static data structures at build
946 * time. (See 'struct static_sched_domain' in kernel/sched.c)
947 */
948 unsigned long span[0];
929}; 949};
930 950
931static inline struct cpumask *sched_domain_span(struct sched_domain *sd) 951static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h
index 938234c4a996..d4841ed8215b 100644
--- a/include/linux/spinlock_up.h
+++ b/include/linux/spinlock_up.h
@@ -60,6 +60,7 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock)
60#define __raw_spin_is_locked(lock) ((void)(lock), 0) 60#define __raw_spin_is_locked(lock) ((void)(lock), 0)
61/* for sched.c and kernel_lock.c: */ 61/* for sched.c and kernel_lock.c: */
62# define __raw_spin_lock(lock) do { (void)(lock); } while (0) 62# define __raw_spin_lock(lock) do { (void)(lock); } while (0)
63# define __raw_spin_lock_flags(lock, flags) do { (void)(lock); } while (0)
63# define __raw_spin_unlock(lock) do { (void)(lock); } while (0) 64# define __raw_spin_unlock(lock) do { (void)(lock); } while (0)
64# define __raw_spin_trylock(lock) ({ (void)(lock); 1; }) 65# define __raw_spin_trylock(lock) ({ (void)(lock); 1; })
65#endif /* DEBUG_SPINLOCK */ 66#endif /* DEBUG_SPINLOCK */
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index ac9ff54f7cb3..cb1a6631b8f4 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -29,7 +29,8 @@ extern void *swiotlb_alloc(unsigned order, unsigned long nslabs);
29 29
30extern dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, 30extern dma_addr_t swiotlb_phys_to_bus(struct device *hwdev,
31 phys_addr_t address); 31 phys_addr_t address);
32extern phys_addr_t swiotlb_bus_to_phys(dma_addr_t address); 32extern phys_addr_t swiotlb_bus_to_phys(struct device *hwdev,
33 dma_addr_t address);
33 34
34extern int swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size); 35extern int swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size);
35 36
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index e6b820f8b56b..a8cc4e13434c 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -21,13 +21,14 @@ struct restart_block {
21 struct { 21 struct {
22 unsigned long arg0, arg1, arg2, arg3; 22 unsigned long arg0, arg1, arg2, arg3;
23 }; 23 };
24 /* For futex_wait */ 24 /* For futex_wait and futex_wait_requeue_pi */
25 struct { 25 struct {
26 u32 *uaddr; 26 u32 *uaddr;
27 u32 val; 27 u32 val;
28 u32 flags; 28 u32 flags;
29 u32 bitset; 29 u32 bitset;
30 u64 time; 30 u64 time;
31 u32 *uaddr2;
31 } futex; 32 } futex;
32 /* For nanosleep */ 33 /* For nanosleep */
33 struct { 34 struct {
diff --git a/include/linux/wait.h b/include/linux/wait.h
index bc024632f365..6788e1a4d4ca 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -132,8 +132,6 @@ static inline void __remove_wait_queue(wait_queue_head_t *head,
132 list_del(&old->task_list); 132 list_del(&old->task_list);
133} 133}
134 134
135void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
136 int nr_exclusive, int sync, void *key);
137void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); 135void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
138void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); 136void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
139void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, 137void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr,
diff --git a/include/xen/Kbuild b/include/xen/Kbuild
new file mode 100644
index 000000000000..4e65c16a445b
--- /dev/null
+++ b/include/xen/Kbuild
@@ -0,0 +1 @@
header-y += evtchn.h
diff --git a/include/xen/events.h b/include/xen/events.h
index 0d5f1adc0363..e68d59a90ca8 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -53,4 +53,7 @@ bool xen_test_irq_pending(int irq);
53 irq will be disabled so it won't deliver an interrupt. */ 53 irq will be disabled so it won't deliver an interrupt. */
54void xen_poll_irq(int irq); 54void xen_poll_irq(int irq);
55 55
56/* Determine the IRQ which is bound to an event channel */
57unsigned irq_from_evtchn(unsigned int evtchn);
58
56#endif /* _XEN_EVENTS_H */ 59#endif /* _XEN_EVENTS_H */
diff --git a/include/xen/evtchn.h b/include/xen/evtchn.h
new file mode 100644
index 000000000000..14e833ee4e0b
--- /dev/null
+++ b/include/xen/evtchn.h
@@ -0,0 +1,88 @@
1/******************************************************************************
2 * evtchn.h
3 *
4 * Interface to /dev/xen/evtchn.
5 *
6 * Copyright (c) 2003-2005, K A Fraser
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License version 2
10 * as published by the Free Software Foundation; or, when distributed
11 * separately from the Linux kernel or incorporated into other
12 * software packages, subject to the following license:
13 *
14 * Permission is hereby granted, free of charge, to any person obtaining a copy
15 * of this source file (the "Software"), to deal in the Software without
16 * restriction, including without limitation the rights to use, copy, modify,
17 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
18 * and to permit persons to whom the Software is furnished to do so, subject to
19 * the following conditions:
20 *
21 * The above copyright notice and this permission notice shall be included in
22 * all copies or substantial portions of the Software.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
30 * IN THE SOFTWARE.
31 */
32
33#ifndef __LINUX_PUBLIC_EVTCHN_H__
34#define __LINUX_PUBLIC_EVTCHN_H__
35
36/*
37 * Bind a fresh port to VIRQ @virq.
38 * Return allocated port.
39 */
40#define IOCTL_EVTCHN_BIND_VIRQ \
41 _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq))
42struct ioctl_evtchn_bind_virq {
43 unsigned int virq;
44};
45
46/*
47 * Bind a fresh port to remote <@remote_domain, @remote_port>.
48 * Return allocated port.
49 */
50#define IOCTL_EVTCHN_BIND_INTERDOMAIN \
51 _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain))
52struct ioctl_evtchn_bind_interdomain {
53 unsigned int remote_domain, remote_port;
54};
55
56/*
57 * Allocate a fresh port for binding to @remote_domain.
58 * Return allocated port.
59 */
60#define IOCTL_EVTCHN_BIND_UNBOUND_PORT \
61 _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port))
62struct ioctl_evtchn_bind_unbound_port {
63 unsigned int remote_domain;
64};
65
66/*
67 * Unbind previously allocated @port.
68 */
69#define IOCTL_EVTCHN_UNBIND \
70 _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind))
71struct ioctl_evtchn_unbind {
72 unsigned int port;
73};
74
75/*
76 * Send event to previously allocated @port.
77 */
78#define IOCTL_EVTCHN_NOTIFY \
79 _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify))
80struct ioctl_evtchn_notify {
81 unsigned int port;
82};
83
84/* Clear and reinitialise the event buffer. Clear error condition. */
85#define IOCTL_EVTCHN_RESET \
86 _IOC(_IOC_NONE, 'E', 5, 0)
87
88#endif /* __LINUX_PUBLIC_EVTCHN_H__ */
diff --git a/include/xen/interface/version.h b/include/xen/interface/version.h
index 453235e923f0..e8b6519d47e9 100644
--- a/include/xen/interface/version.h
+++ b/include/xen/interface/version.h
@@ -57,4 +57,7 @@ struct xen_feature_info {
57/* Declares the features reported by XENVER_get_features. */ 57/* Declares the features reported by XENVER_get_features. */
58#include "features.h" 58#include "features.h"
59 59
60/* arg == NULL; returns host memory page size. */
61#define XENVER_pagesize 7
62
60#endif /* __XEN_PUBLIC_VERSION_H__ */ 63#endif /* __XEN_PUBLIC_VERSION_H__ */
diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
index f87f9614844d..b9763badbd77 100644
--- a/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@ -91,8 +91,7 @@ struct xenbus_driver {
91 void (*otherend_changed)(struct xenbus_device *dev, 91 void (*otherend_changed)(struct xenbus_device *dev,
92 enum xenbus_state backend_state); 92 enum xenbus_state backend_state);
93 int (*remove)(struct xenbus_device *dev); 93 int (*remove)(struct xenbus_device *dev);
94 int (*suspend)(struct xenbus_device *dev); 94 int (*suspend)(struct xenbus_device *dev, pm_message_t state);
95 int (*suspend_cancel)(struct xenbus_device *dev);
96 int (*resume)(struct xenbus_device *dev); 95 int (*resume)(struct xenbus_device *dev);
97 int (*uevent)(struct xenbus_device *, char **, int, char *, int); 96 int (*uevent)(struct xenbus_device *, char **, int, char *, int);
98 struct device_driver driver; 97 struct device_driver driver;
diff --git a/ipc/shm.c b/ipc/shm.c
index 560818353599..15dd238e5338 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -968,10 +968,13 @@ SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg)
968SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) 968SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
969{ 969{
970 struct mm_struct *mm = current->mm; 970 struct mm_struct *mm = current->mm;
971 struct vm_area_struct *vma, *next; 971 struct vm_area_struct *vma;
972 unsigned long addr = (unsigned long)shmaddr; 972 unsigned long addr = (unsigned long)shmaddr;
973 loff_t size = 0;
974 int retval = -EINVAL; 973 int retval = -EINVAL;
974#ifdef CONFIG_MMU
975 loff_t size = 0;
976 struct vm_area_struct *next;
977#endif
975 978
976 if (addr & ~PAGE_MASK) 979 if (addr & ~PAGE_MASK)
977 return retval; 980 return retval;
diff --git a/kernel/futex.c b/kernel/futex.c
index d546b2d53a62..80b5ce716596 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -19,6 +19,10 @@
19 * PRIVATE futexes by Eric Dumazet 19 * PRIVATE futexes by Eric Dumazet
20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> 20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
21 * 21 *
22 * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
23 * Copyright (C) IBM Corporation, 2009
24 * Thanks to Thomas Gleixner for conceptual design and careful reviews.
25 *
22 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 26 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
23 * enough at me, Linus for the original (flawed) idea, Matthew 27 * enough at me, Linus for the original (flawed) idea, Matthew
24 * Kirkwood for proof-of-concept implementation. 28 * Kirkwood for proof-of-concept implementation.
@@ -96,8 +100,8 @@ struct futex_pi_state {
96 */ 100 */
97struct futex_q { 101struct futex_q {
98 struct plist_node list; 102 struct plist_node list;
99 /* There can only be a single waiter */ 103 /* Waiter reference */
100 wait_queue_head_t waiter; 104 struct task_struct *task;
101 105
102 /* Which hash list lock to use: */ 106 /* Which hash list lock to use: */
103 spinlock_t *lock_ptr; 107 spinlock_t *lock_ptr;
@@ -107,7 +111,9 @@ struct futex_q {
107 111
108 /* Optional priority inheritance state: */ 112 /* Optional priority inheritance state: */
109 struct futex_pi_state *pi_state; 113 struct futex_pi_state *pi_state;
110 struct task_struct *task; 114
115 /* rt_waiter storage for requeue_pi: */
116 struct rt_mutex_waiter *rt_waiter;
111 117
112 /* Bitset for the optional bitmasked wakeup */ 118 /* Bitset for the optional bitmasked wakeup */
113 u32 bitset; 119 u32 bitset;
@@ -278,6 +284,25 @@ void put_futex_key(int fshared, union futex_key *key)
278 drop_futex_key_refs(key); 284 drop_futex_key_refs(key);
279} 285}
280 286
287/**
288 * futex_top_waiter() - Return the highest priority waiter on a futex
289 * @hb: the hash bucket the futex_q's reside in
290 * @key: the futex key (to distinguish it from other futex futex_q's)
291 *
292 * Must be called with the hb lock held.
293 */
294static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
295 union futex_key *key)
296{
297 struct futex_q *this;
298
299 plist_for_each_entry(this, &hb->chain, list) {
300 if (match_futex(&this->key, key))
301 return this;
302 }
303 return NULL;
304}
305
281static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 306static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
282{ 307{
283 u32 curval; 308 u32 curval;
@@ -539,28 +564,160 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
539 return 0; 564 return 0;
540} 565}
541 566
567/**
568 * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex
569 * @uaddr: the pi futex user address
570 * @hb: the pi futex hash bucket
571 * @key: the futex key associated with uaddr and hb
572 * @ps: the pi_state pointer where we store the result of the
573 * lookup
574 * @task: the task to perform the atomic lock work for. This will
575 * be "current" except in the case of requeue pi.
576 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
577 *
578 * Returns:
579 * 0 - ready to wait
580 * 1 - acquired the lock
581 * <0 - error
582 *
583 * The hb->lock and futex_key refs shall be held by the caller.
584 */
585static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
586 union futex_key *key,
587 struct futex_pi_state **ps,
588 struct task_struct *task, int set_waiters)
589{
590 int lock_taken, ret, ownerdied = 0;
591 u32 uval, newval, curval;
592
593retry:
594 ret = lock_taken = 0;
595
596 /*
597 * To avoid races, we attempt to take the lock here again
598 * (by doing a 0 -> TID atomic cmpxchg), while holding all
599 * the locks. It will most likely not succeed.
600 */
601 newval = task_pid_vnr(task);
602 if (set_waiters)
603 newval |= FUTEX_WAITERS;
604
605 curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
606
607 if (unlikely(curval == -EFAULT))
608 return -EFAULT;
609
610 /*
611 * Detect deadlocks.
612 */
613 if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task))))
614 return -EDEADLK;
615
616 /*
617 * Surprise - we got the lock. Just return to userspace:
618 */
619 if (unlikely(!curval))
620 return 1;
621
622 uval = curval;
623
624 /*
625 * Set the FUTEX_WAITERS flag, so the owner will know it has someone
626 * to wake at the next unlock.
627 */
628 newval = curval | FUTEX_WAITERS;
629
630 /*
631 * There are two cases, where a futex might have no owner (the
632 * owner TID is 0): OWNER_DIED. We take over the futex in this
633 * case. We also do an unconditional take over, when the owner
634 * of the futex died.
635 *
636 * This is safe as we are protected by the hash bucket lock !
637 */
638 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
639 /* Keep the OWNER_DIED bit */
640 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task);
641 ownerdied = 0;
642 lock_taken = 1;
643 }
644
645 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
646
647 if (unlikely(curval == -EFAULT))
648 return -EFAULT;
649 if (unlikely(curval != uval))
650 goto retry;
651
652 /*
653 * We took the lock due to owner died take over.
654 */
655 if (unlikely(lock_taken))
656 return 1;
657
658 /*
659 * We don't have the lock. Look up the PI state (or create it if
660 * we are the first waiter):
661 */
662 ret = lookup_pi_state(uval, hb, key, ps);
663
664 if (unlikely(ret)) {
665 switch (ret) {
666 case -ESRCH:
667 /*
668 * No owner found for this futex. Check if the
669 * OWNER_DIED bit is set to figure out whether
670 * this is a robust futex or not.
671 */
672 if (get_futex_value_locked(&curval, uaddr))
673 return -EFAULT;
674
675 /*
676 * We simply start over in case of a robust
677 * futex. The code above will take the futex
678 * and return happy.
679 */
680 if (curval & FUTEX_OWNER_DIED) {
681 ownerdied = 1;
682 goto retry;
683 }
684 default:
685 break;
686 }
687 }
688
689 return ret;
690}
691
542/* 692/*
543 * The hash bucket lock must be held when this is called. 693 * The hash bucket lock must be held when this is called.
544 * Afterwards, the futex_q must not be accessed. 694 * Afterwards, the futex_q must not be accessed.
545 */ 695 */
546static void wake_futex(struct futex_q *q) 696static void wake_futex(struct futex_q *q)
547{ 697{
548 plist_del(&q->list, &q->list.plist); 698 struct task_struct *p = q->task;
699
549 /* 700 /*
550 * The lock in wake_up_all() is a crucial memory barrier after the 701 * We set q->lock_ptr = NULL _before_ we wake up the task. If
551 * plist_del() and also before assigning to q->lock_ptr. 702 * a non futex wake up happens on another CPU then the task
703 * might exit and p would dereference a non existing task
704 * struct. Prevent this by holding a reference on p across the
705 * wake up.
552 */ 706 */
553 wake_up(&q->waiter); 707 get_task_struct(p);
708
709 plist_del(&q->list, &q->list.plist);
554 /* 710 /*
555 * The waiting task can free the futex_q as soon as this is written, 711 * The waiting task can free the futex_q as soon as
556 * without taking any locks. This must come last. 712 * q->lock_ptr = NULL is written, without taking any locks. A
557 * 713 * memory barrier is required here to prevent the following
558 * A memory barrier is required here to prevent the following store to 714 * store to lock_ptr from getting ahead of the plist_del.
559 * lock_ptr from getting ahead of the wakeup. Clearing the lock at the
560 * end of wake_up() does not prevent this store from moving.
561 */ 715 */
562 smp_wmb(); 716 smp_wmb();
563 q->lock_ptr = NULL; 717 q->lock_ptr = NULL;
718
719 wake_up_state(p, TASK_NORMAL);
720 put_task_struct(p);
564} 721}
565 722
566static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) 723static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@ -689,7 +846,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
689 846
690 plist_for_each_entry_safe(this, next, head, list) { 847 plist_for_each_entry_safe(this, next, head, list) {
691 if (match_futex (&this->key, &key)) { 848 if (match_futex (&this->key, &key)) {
692 if (this->pi_state) { 849 if (this->pi_state || this->rt_waiter) {
693 ret = -EINVAL; 850 ret = -EINVAL;
694 break; 851 break;
695 } 852 }
@@ -802,24 +959,185 @@ out:
802 return ret; 959 return ret;
803} 960}
804 961
805/* 962/**
806 * Requeue all waiters hashed on one physical page to another 963 * requeue_futex() - Requeue a futex_q from one hb to another
807 * physical page. 964 * @q: the futex_q to requeue
965 * @hb1: the source hash_bucket
966 * @hb2: the target hash_bucket
967 * @key2: the new key for the requeued futex_q
968 */
969static inline
970void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
971 struct futex_hash_bucket *hb2, union futex_key *key2)
972{
973
974 /*
975 * If key1 and key2 hash to the same bucket, no need to
976 * requeue.
977 */
978 if (likely(&hb1->chain != &hb2->chain)) {
979 plist_del(&q->list, &hb1->chain);
980 plist_add(&q->list, &hb2->chain);
981 q->lock_ptr = &hb2->lock;
982#ifdef CONFIG_DEBUG_PI_LIST
983 q->list.plist.lock = &hb2->lock;
984#endif
985 }
986 get_futex_key_refs(key2);
987 q->key = *key2;
988}
989
990/**
991 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
992 * @q: the futex_q
993 * @key: the key of the requeue target futex
994 *
995 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
996 * target futex if it is uncontended or via a lock steal. Set the futex_q key
997 * to the requeue target futex so the waiter can detect the wakeup on the right
998 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
999 * atomic lock acquisition. Must be called with the q->lock_ptr held.
1000 */
1001static inline
1002void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
1003{
1004 drop_futex_key_refs(&q->key);
1005 get_futex_key_refs(key);
1006 q->key = *key;
1007
1008 WARN_ON(plist_node_empty(&q->list));
1009 plist_del(&q->list, &q->list.plist);
1010
1011 WARN_ON(!q->rt_waiter);
1012 q->rt_waiter = NULL;
1013
1014 wake_up_state(q->task, TASK_NORMAL);
1015}
1016
1017/**
1018 * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
1019 * @pifutex: the user address of the to futex
1020 * @hb1: the from futex hash bucket, must be locked by the caller
1021 * @hb2: the to futex hash bucket, must be locked by the caller
1022 * @key1: the from futex key
1023 * @key2: the to futex key
1024 * @ps: address to store the pi_state pointer
1025 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
1026 *
1027 * Try and get the lock on behalf of the top waiter if we can do it atomically.
1028 * Wake the top waiter if we succeed. If the caller specified set_waiters,
1029 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1030 * hb1 and hb2 must be held by the caller.
1031 *
1032 * Returns:
1033 * 0 - failed to acquire the lock atomically
1034 * 1 - acquired the lock
1035 * <0 - error
1036 */
1037static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1038 struct futex_hash_bucket *hb1,
1039 struct futex_hash_bucket *hb2,
1040 union futex_key *key1, union futex_key *key2,
1041 struct futex_pi_state **ps, int set_waiters)
1042{
1043 struct futex_q *top_waiter = NULL;
1044 u32 curval;
1045 int ret;
1046
1047 if (get_futex_value_locked(&curval, pifutex))
1048 return -EFAULT;
1049
1050 /*
1051 * Find the top_waiter and determine if there are additional waiters.
1052 * If the caller intends to requeue more than 1 waiter to pifutex,
1053 * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
1054 * as we have means to handle the possible fault. If not, don't set
1055 * the bit unnecessarily as it will force the subsequent unlock to enter
1056 * the kernel.
1057 */
1058 top_waiter = futex_top_waiter(hb1, key1);
1059
1060 /* There are no waiters, nothing for us to do. */
1061 if (!top_waiter)
1062 return 0;
1063
1064 /*
1065 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
1066 * the contended case or if set_waiters is 1. The pi_state is returned
1067 * in ps in contended cases.
1068 */
1069 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1070 set_waiters);
1071 if (ret == 1)
1072 requeue_pi_wake_futex(top_waiter, key2);
1073
1074 return ret;
1075}
1076
1077/**
1078 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1079 * @uaddr1: source futex user address
1080 * @uaddr2: target futex user address
1081 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
1082 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1083 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1084 * pi futex (pi to pi requeue is not supported)
1085 *
1086 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1087 * uaddr2 atomically on behalf of the top waiter.
1088 *
1089 * Returns:
1090 * >=0 - on success, the number of tasks requeued or woken
1091 * <0 - on error
808 */ 1092 */
809static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, 1093static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
810 int nr_wake, int nr_requeue, u32 *cmpval) 1094 int nr_wake, int nr_requeue, u32 *cmpval,
1095 int requeue_pi)
811{ 1096{
812 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1097 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1098 int drop_count = 0, task_count = 0, ret;
1099 struct futex_pi_state *pi_state = NULL;
813 struct futex_hash_bucket *hb1, *hb2; 1100 struct futex_hash_bucket *hb1, *hb2;
814 struct plist_head *head1; 1101 struct plist_head *head1;
815 struct futex_q *this, *next; 1102 struct futex_q *this, *next;
816 int ret, drop_count = 0; 1103 u32 curval2;
1104
1105 if (requeue_pi) {
1106 /*
1107 * requeue_pi requires a pi_state, try to allocate it now
1108 * without any locks in case it fails.
1109 */
1110 if (refill_pi_state_cache())
1111 return -ENOMEM;
1112 /*
1113 * requeue_pi must wake as many tasks as it can, up to nr_wake
1114 * + nr_requeue, since it acquires the rt_mutex prior to
1115 * returning to userspace, so as to not leave the rt_mutex with
1116 * waiters and no owner. However, second and third wake-ups
1117 * cannot be predicted as they involve race conditions with the
1118 * first wake and a fault while looking up the pi_state. Both
1119 * pthread_cond_signal() and pthread_cond_broadcast() should
1120 * use nr_wake=1.
1121 */
1122 if (nr_wake != 1)
1123 return -EINVAL;
1124 }
817 1125
818retry: 1126retry:
1127 if (pi_state != NULL) {
1128 /*
1129 * We will have to lookup the pi_state again, so free this one
1130 * to keep the accounting correct.
1131 */
1132 free_pi_state(pi_state);
1133 pi_state = NULL;
1134 }
1135
819 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); 1136 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
820 if (unlikely(ret != 0)) 1137 if (unlikely(ret != 0))
821 goto out; 1138 goto out;
822 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_READ); 1139 ret = get_futex_key(uaddr2, fshared, &key2,
1140 requeue_pi ? VERIFY_WRITE : VERIFY_READ);
823 if (unlikely(ret != 0)) 1141 if (unlikely(ret != 0))
824 goto out_put_key1; 1142 goto out_put_key1;
825 1143
@@ -854,32 +1172,99 @@ retry_private:
854 } 1172 }
855 } 1173 }
856 1174
1175 if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
1176 /*
1177 * Attempt to acquire uaddr2 and wake the top waiter. If we
1178 * intend to requeue waiters, force setting the FUTEX_WAITERS
1179 * bit. We force this here where we are able to easily handle
1180 * faults rather than in the requeue loop below.
1181 */
1182 ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
1183 &key2, &pi_state, nr_requeue);
1184
1185 /*
1186 * At this point the top_waiter has either taken uaddr2 or is
1187 * waiting on it. If the former, then the pi_state will not
1188 * exist yet, look it up one more time to ensure we have a
1189 * reference to it.
1190 */
1191 if (ret == 1) {
1192 WARN_ON(pi_state);
1193 task_count++;
1194 ret = get_futex_value_locked(&curval2, uaddr2);
1195 if (!ret)
1196 ret = lookup_pi_state(curval2, hb2, &key2,
1197 &pi_state);
1198 }
1199
1200 switch (ret) {
1201 case 0:
1202 break;
1203 case -EFAULT:
1204 double_unlock_hb(hb1, hb2);
1205 put_futex_key(fshared, &key2);
1206 put_futex_key(fshared, &key1);
1207 ret = get_user(curval2, uaddr2);
1208 if (!ret)
1209 goto retry;
1210 goto out;
1211 case -EAGAIN:
1212 /* The owner was exiting, try again. */
1213 double_unlock_hb(hb1, hb2);
1214 put_futex_key(fshared, &key2);
1215 put_futex_key(fshared, &key1);
1216 cond_resched();
1217 goto retry;
1218 default:
1219 goto out_unlock;
1220 }
1221 }
1222
857 head1 = &hb1->chain; 1223 head1 = &hb1->chain;
858 plist_for_each_entry_safe(this, next, head1, list) { 1224 plist_for_each_entry_safe(this, next, head1, list) {
859 if (!match_futex (&this->key, &key1)) 1225 if (task_count - nr_wake >= nr_requeue)
1226 break;
1227
1228 if (!match_futex(&this->key, &key1))
860 continue; 1229 continue;
861 if (++ret <= nr_wake) { 1230
1231 WARN_ON(!requeue_pi && this->rt_waiter);
1232 WARN_ON(requeue_pi && !this->rt_waiter);
1233
1234 /*
1235 * Wake nr_wake waiters. For requeue_pi, if we acquired the
1236 * lock, we already woke the top_waiter. If not, it will be
1237 * woken by futex_unlock_pi().
1238 */
1239 if (++task_count <= nr_wake && !requeue_pi) {
862 wake_futex(this); 1240 wake_futex(this);
863 } else { 1241 continue;
864 /* 1242 }
865 * If key1 and key2 hash to the same bucket, no need to
866 * requeue.
867 */
868 if (likely(head1 != &hb2->chain)) {
869 plist_del(&this->list, &hb1->chain);
870 plist_add(&this->list, &hb2->chain);
871 this->lock_ptr = &hb2->lock;
872#ifdef CONFIG_DEBUG_PI_LIST
873 this->list.plist.lock = &hb2->lock;
874#endif
875 }
876 this->key = key2;
877 get_futex_key_refs(&key2);
878 drop_count++;
879 1243
880 if (ret - nr_wake >= nr_requeue) 1244 /*
881 break; 1245 * Requeue nr_requeue waiters and possibly one more in the case
1246 * of requeue_pi if we couldn't acquire the lock atomically.
1247 */
1248 if (requeue_pi) {
1249 /* Prepare the waiter to take the rt_mutex. */
1250 atomic_inc(&pi_state->refcount);
1251 this->pi_state = pi_state;
1252 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
1253 this->rt_waiter,
1254 this->task, 1);
1255 if (ret == 1) {
1256 /* We got the lock. */
1257 requeue_pi_wake_futex(this, &key2);
1258 continue;
1259 } else if (ret) {
1260 /* -EDEADLK */
1261 this->pi_state = NULL;
1262 free_pi_state(pi_state);
1263 goto out_unlock;
1264 }
882 } 1265 }
1266 requeue_futex(this, hb1, hb2, &key2);
1267 drop_count++;
883 } 1268 }
884 1269
885out_unlock: 1270out_unlock:
@@ -899,7 +1284,9 @@ out_put_keys:
899out_put_key1: 1284out_put_key1:
900 put_futex_key(fshared, &key1); 1285 put_futex_key(fshared, &key1);
901out: 1286out:
902 return ret; 1287 if (pi_state != NULL)
1288 free_pi_state(pi_state);
1289 return ret ? ret : task_count;
903} 1290}
904 1291
905/* The key must be already stored in q->key. */ 1292/* The key must be already stored in q->key. */
@@ -907,8 +1294,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
907{ 1294{
908 struct futex_hash_bucket *hb; 1295 struct futex_hash_bucket *hb;
909 1296
910 init_waitqueue_head(&q->waiter);
911
912 get_futex_key_refs(&q->key); 1297 get_futex_key_refs(&q->key);
913 hb = hash_futex(&q->key); 1298 hb = hash_futex(&q->key);
914 q->lock_ptr = &hb->lock; 1299 q->lock_ptr = &hb->lock;
@@ -1119,35 +1504,149 @@ handle_fault:
1119 */ 1504 */
1120#define FLAGS_SHARED 0x01 1505#define FLAGS_SHARED 0x01
1121#define FLAGS_CLOCKRT 0x02 1506#define FLAGS_CLOCKRT 0x02
1507#define FLAGS_HAS_TIMEOUT 0x04
1122 1508
1123static long futex_wait_restart(struct restart_block *restart); 1509static long futex_wait_restart(struct restart_block *restart);
1124 1510
1125static int futex_wait(u32 __user *uaddr, int fshared, 1511/**
1126 u32 val, ktime_t *abs_time, u32 bitset, int clockrt) 1512 * fixup_owner() - Post lock pi_state and corner case management
1513 * @uaddr: user address of the futex
1514 * @fshared: whether the futex is shared (1) or not (0)
1515 * @q: futex_q (contains pi_state and access to the rt_mutex)
1516 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
1517 *
1518 * After attempting to lock an rt_mutex, this function is called to cleanup
1519 * the pi_state owner as well as handle race conditions that may allow us to
1520 * acquire the lock. Must be called with the hb lock held.
1521 *
1522 * Returns:
1523 * 1 - success, lock taken
1524 * 0 - success, lock not taken
1525 * <0 - on error (-EFAULT)
1526 */
1527static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
1528 int locked)
1127{ 1529{
1128 struct task_struct *curr = current; 1530 struct task_struct *owner;
1129 struct restart_block *restart; 1531 int ret = 0;
1130 DECLARE_WAITQUEUE(wait, curr);
1131 struct futex_hash_bucket *hb;
1132 struct futex_q q;
1133 u32 uval;
1134 int ret;
1135 struct hrtimer_sleeper t;
1136 int rem = 0;
1137 1532
1138 if (!bitset) 1533 if (locked) {
1139 return -EINVAL; 1534 /*
1535 * Got the lock. We might not be the anticipated owner if we
1536 * did a lock-steal - fix up the PI-state in that case:
1537 */
1538 if (q->pi_state->owner != current)
1539 ret = fixup_pi_state_owner(uaddr, q, current, fshared);
1540 goto out;
1541 }
1140 1542
1141 q.pi_state = NULL; 1543 /*
1142 q.bitset = bitset; 1544 * Catch the rare case, where the lock was released when we were on the
1143retry: 1545 * way back before we locked the hash bucket.
1144 q.key = FUTEX_KEY_INIT; 1546 */
1145 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_READ); 1547 if (q->pi_state->owner == current) {
1146 if (unlikely(ret != 0)) 1548 /*
1549 * Try to get the rt_mutex now. This might fail as some other
1550 * task acquired the rt_mutex after we removed ourself from the
1551 * rt_mutex waiters list.
1552 */
1553 if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
1554 locked = 1;
1555 goto out;
1556 }
1557
1558 /*
1559 * pi_state is incorrect, some other task did a lock steal and
1560 * we returned due to timeout or signal without taking the
1561 * rt_mutex. Too late. We can access the rt_mutex_owner without
1562 * locking, as the other task is now blocked on the hash bucket
1563 * lock. Fix the state up.
1564 */
1565 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1566 ret = fixup_pi_state_owner(uaddr, q, owner, fshared);
1147 goto out; 1567 goto out;
1568 }
1148 1569
1149retry_private: 1570 /*
1150 hb = queue_lock(&q); 1571 * Paranoia check. If we did not take the lock, then we should not be
1572 * the owner, nor the pending owner, of the rt_mutex.
1573 */
1574 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
1575 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
1576 "pi-state %p\n", ret,
1577 q->pi_state->pi_mutex.owner,
1578 q->pi_state->owner);
1579
1580out:
1581 return ret ? ret : locked;
1582}
1583
1584/**
1585 * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
1586 * @hb: the futex hash bucket, must be locked by the caller
1587 * @q: the futex_q to queue up on
1588 * @timeout: the prepared hrtimer_sleeper, or null for no timeout
1589 */
1590static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1591 struct hrtimer_sleeper *timeout)
1592{
1593 queue_me(q, hb);
1594
1595 /*
1596 * There might have been scheduling since the queue_me(), as we
1597 * cannot hold a spinlock across the get_user() in case it
1598 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
1599 * queueing ourselves into the futex hash. This code thus has to
1600 * rely on the futex_wake() code removing us from hash when it
1601 * wakes us up.
1602 */
1603 set_current_state(TASK_INTERRUPTIBLE);
1604
1605 /* Arm the timer */
1606 if (timeout) {
1607 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
1608 if (!hrtimer_active(&timeout->timer))
1609 timeout->task = NULL;
1610 }
1611
1612 /*
1613 * !plist_node_empty() is safe here without any lock.
1614 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
1615 */
1616 if (likely(!plist_node_empty(&q->list))) {
1617 /*
1618 * If the timer has already expired, current will already be
1619 * flagged for rescheduling. Only call schedule if there
1620 * is no timeout, or if it has yet to expire.
1621 */
1622 if (!timeout || timeout->task)
1623 schedule();
1624 }
1625 __set_current_state(TASK_RUNNING);
1626}
1627
1628/**
1629 * futex_wait_setup() - Prepare to wait on a futex
1630 * @uaddr: the futex userspace address
1631 * @val: the expected value
1632 * @fshared: whether the futex is shared (1) or not (0)
1633 * @q: the associated futex_q
1634 * @hb: storage for hash_bucket pointer to be returned to caller
1635 *
1636 * Setup the futex_q and locate the hash_bucket. Get the futex value and
1637 * compare it with the expected value. Handle atomic faults internally.
1638 * Return with the hb lock held and a q.key reference on success, and unlocked
1639 * with no q.key reference on failure.
1640 *
1641 * Returns:
1642 * 0 - uaddr contains val and hb has been locked
1643 * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1644 */
1645static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
1646 struct futex_q *q, struct futex_hash_bucket **hb)
1647{
1648 u32 uval;
1649 int ret;
1151 1650
1152 /* 1651 /*
1153 * Access the page AFTER the hash-bucket is locked. 1652 * Access the page AFTER the hash-bucket is locked.
@@ -1165,95 +1664,83 @@ retry_private:
1165 * A consequence is that futex_wait() can return zero and absorb 1664 * A consequence is that futex_wait() can return zero and absorb
1166 * a wakeup when *uaddr != val on entry to the syscall. This is 1665 * a wakeup when *uaddr != val on entry to the syscall. This is
1167 * rare, but normal. 1666 * rare, but normal.
1168 *
1169 * For shared futexes, we hold the mmap semaphore, so the mapping
1170 * cannot have changed since we looked it up in get_futex_key.
1171 */ 1667 */
1668retry:
1669 q->key = FUTEX_KEY_INIT;
1670 ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
1671 if (unlikely(ret != 0))
1672 return ret;
1673
1674retry_private:
1675 *hb = queue_lock(q);
1676
1172 ret = get_futex_value_locked(&uval, uaddr); 1677 ret = get_futex_value_locked(&uval, uaddr);
1173 1678
1174 if (unlikely(ret)) { 1679 if (ret) {
1175 queue_unlock(&q, hb); 1680 queue_unlock(q, *hb);
1176 1681
1177 ret = get_user(uval, uaddr); 1682 ret = get_user(uval, uaddr);
1178 if (ret) 1683 if (ret)
1179 goto out_put_key; 1684 goto out;
1180 1685
1181 if (!fshared) 1686 if (!fshared)
1182 goto retry_private; 1687 goto retry_private;
1183 1688
1184 put_futex_key(fshared, &q.key); 1689 put_futex_key(fshared, &q->key);
1185 goto retry; 1690 goto retry;
1186 } 1691 }
1187 ret = -EWOULDBLOCK;
1188 if (unlikely(uval != val)) {
1189 queue_unlock(&q, hb);
1190 goto out_put_key;
1191 }
1192 1692
1193 /* Only actually queue if *uaddr contained val. */ 1693 if (uval != val) {
1194 queue_me(&q, hb); 1694 queue_unlock(q, *hb);
1695 ret = -EWOULDBLOCK;
1696 }
1195 1697
1196 /* 1698out:
1197 * There might have been scheduling since the queue_me(), as we 1699 if (ret)
1198 * cannot hold a spinlock across the get_user() in case it 1700 put_futex_key(fshared, &q->key);
1199 * faults, and we cannot just set TASK_INTERRUPTIBLE state when 1701 return ret;
1200 * queueing ourselves into the futex hash. This code thus has to 1702}
1201 * rely on the futex_wake() code removing us from hash when it
1202 * wakes us up.
1203 */
1204 1703
1205 /* add_wait_queue is the barrier after __set_current_state. */ 1704static int futex_wait(u32 __user *uaddr, int fshared,
1206 __set_current_state(TASK_INTERRUPTIBLE); 1705 u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
1207 add_wait_queue(&q.waiter, &wait); 1706{
1208 /* 1707 struct hrtimer_sleeper timeout, *to = NULL;
1209 * !plist_node_empty() is safe here without any lock. 1708 struct restart_block *restart;
1210 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1709 struct futex_hash_bucket *hb;
1211 */ 1710 struct futex_q q;
1212 if (likely(!plist_node_empty(&q.list))) { 1711 int ret;
1213 if (!abs_time)
1214 schedule();
1215 else {
1216 hrtimer_init_on_stack(&t.timer,
1217 clockrt ? CLOCK_REALTIME :
1218 CLOCK_MONOTONIC,
1219 HRTIMER_MODE_ABS);
1220 hrtimer_init_sleeper(&t, current);
1221 hrtimer_set_expires_range_ns(&t.timer, *abs_time,
1222 current->timer_slack_ns);
1223
1224 hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
1225 if (!hrtimer_active(&t.timer))
1226 t.task = NULL;
1227 1712
1228 /* 1713 if (!bitset)
1229 * the timer could have already expired, in which 1714 return -EINVAL;
1230 * case current would be flagged for rescheduling.
1231 * Don't bother calling schedule.
1232 */
1233 if (likely(t.task))
1234 schedule();
1235 1715
1236 hrtimer_cancel(&t.timer); 1716 q.pi_state = NULL;
1717 q.bitset = bitset;
1718 q.rt_waiter = NULL;
1237 1719
1238 /* Flag if a timeout occured */ 1720 if (abs_time) {
1239 rem = (t.task == NULL); 1721 to = &timeout;
1240 1722
1241 destroy_hrtimer_on_stack(&t.timer); 1723 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
1242 } 1724 CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1725 hrtimer_init_sleeper(to, current);
1726 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
1727 current->timer_slack_ns);
1243 } 1728 }
1244 __set_current_state(TASK_RUNNING);
1245 1729
1246 /* 1730 /* Prepare to wait on uaddr. */
1247 * NOTE: we don't remove ourselves from the waitqueue because 1731 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
1248 * we are the only user of it. 1732 if (ret)
1249 */ 1733 goto out;
1734
1735 /* queue_me and wait for wakeup, timeout, or a signal. */
1736 futex_wait_queue_me(hb, &q, to);
1250 1737
1251 /* If we were woken (and unqueued), we succeeded, whatever. */ 1738 /* If we were woken (and unqueued), we succeeded, whatever. */
1252 ret = 0; 1739 ret = 0;
1253 if (!unqueue_me(&q)) 1740 if (!unqueue_me(&q))
1254 goto out_put_key; 1741 goto out_put_key;
1255 ret = -ETIMEDOUT; 1742 ret = -ETIMEDOUT;
1256 if (rem) 1743 if (to && !to->task)
1257 goto out_put_key; 1744 goto out_put_key;
1258 1745
1259 /* 1746 /*
@@ -1270,7 +1757,7 @@ retry_private:
1270 restart->futex.val = val; 1757 restart->futex.val = val;
1271 restart->futex.time = abs_time->tv64; 1758 restart->futex.time = abs_time->tv64;
1272 restart->futex.bitset = bitset; 1759 restart->futex.bitset = bitset;
1273 restart->futex.flags = 0; 1760 restart->futex.flags = FLAGS_HAS_TIMEOUT;
1274 1761
1275 if (fshared) 1762 if (fshared)
1276 restart->futex.flags |= FLAGS_SHARED; 1763 restart->futex.flags |= FLAGS_SHARED;
@@ -1282,6 +1769,10 @@ retry_private:
1282out_put_key: 1769out_put_key:
1283 put_futex_key(fshared, &q.key); 1770 put_futex_key(fshared, &q.key);
1284out: 1771out:
1772 if (to) {
1773 hrtimer_cancel(&to->timer);
1774 destroy_hrtimer_on_stack(&to->timer);
1775 }
1285 return ret; 1776 return ret;
1286} 1777}
1287 1778
@@ -1290,13 +1781,16 @@ static long futex_wait_restart(struct restart_block *restart)
1290{ 1781{
1291 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; 1782 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
1292 int fshared = 0; 1783 int fshared = 0;
1293 ktime_t t; 1784 ktime_t t, *tp = NULL;
1294 1785
1295 t.tv64 = restart->futex.time; 1786 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
1787 t.tv64 = restart->futex.time;
1788 tp = &t;
1789 }
1296 restart->fn = do_no_restart_syscall; 1790 restart->fn = do_no_restart_syscall;
1297 if (restart->futex.flags & FLAGS_SHARED) 1791 if (restart->futex.flags & FLAGS_SHARED)
1298 fshared = 1; 1792 fshared = 1;
1299 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, 1793 return (long)futex_wait(uaddr, fshared, restart->futex.val, tp,
1300 restart->futex.bitset, 1794 restart->futex.bitset,
1301 restart->futex.flags & FLAGS_CLOCKRT); 1795 restart->futex.flags & FLAGS_CLOCKRT);
1302} 1796}
@@ -1312,11 +1806,10 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1312 int detect, ktime_t *time, int trylock) 1806 int detect, ktime_t *time, int trylock)
1313{ 1807{
1314 struct hrtimer_sleeper timeout, *to = NULL; 1808 struct hrtimer_sleeper timeout, *to = NULL;
1315 struct task_struct *curr = current;
1316 struct futex_hash_bucket *hb; 1809 struct futex_hash_bucket *hb;
1317 u32 uval, newval, curval; 1810 u32 uval;
1318 struct futex_q q; 1811 struct futex_q q;
1319 int ret, lock_taken, ownerdied = 0; 1812 int res, ret;
1320 1813
1321 if (refill_pi_state_cache()) 1814 if (refill_pi_state_cache())
1322 return -ENOMEM; 1815 return -ENOMEM;
@@ -1330,6 +1823,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1330 } 1823 }
1331 1824
1332 q.pi_state = NULL; 1825 q.pi_state = NULL;
1826 q.rt_waiter = NULL;
1333retry: 1827retry:
1334 q.key = FUTEX_KEY_INIT; 1828 q.key = FUTEX_KEY_INIT;
1335 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); 1829 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
@@ -1339,81 +1833,15 @@ retry:
1339retry_private: 1833retry_private:
1340 hb = queue_lock(&q); 1834 hb = queue_lock(&q);
1341 1835
1342retry_locked: 1836 ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
1343 ret = lock_taken = 0;
1344
1345 /*
1346 * To avoid races, we attempt to take the lock here again
1347 * (by doing a 0 -> TID atomic cmpxchg), while holding all
1348 * the locks. It will most likely not succeed.
1349 */
1350 newval = task_pid_vnr(current);
1351
1352 curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
1353
1354 if (unlikely(curval == -EFAULT))
1355 goto uaddr_faulted;
1356
1357 /*
1358 * Detect deadlocks. In case of REQUEUE_PI this is a valid
1359 * situation and we return success to user space.
1360 */
1361 if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
1362 ret = -EDEADLK;
1363 goto out_unlock_put_key;
1364 }
1365
1366 /*
1367 * Surprise - we got the lock. Just return to userspace:
1368 */
1369 if (unlikely(!curval))
1370 goto out_unlock_put_key;
1371
1372 uval = curval;
1373
1374 /*
1375 * Set the WAITERS flag, so the owner will know it has someone
1376 * to wake at next unlock
1377 */
1378 newval = curval | FUTEX_WAITERS;
1379
1380 /*
1381 * There are two cases, where a futex might have no owner (the
1382 * owner TID is 0): OWNER_DIED. We take over the futex in this
1383 * case. We also do an unconditional take over, when the owner
1384 * of the futex died.
1385 *
1386 * This is safe as we are protected by the hash bucket lock !
1387 */
1388 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
1389 /* Keep the OWNER_DIED bit */
1390 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current);
1391 ownerdied = 0;
1392 lock_taken = 1;
1393 }
1394
1395 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
1396
1397 if (unlikely(curval == -EFAULT))
1398 goto uaddr_faulted;
1399 if (unlikely(curval != uval))
1400 goto retry_locked;
1401
1402 /*
1403 * We took the lock due to owner died take over.
1404 */
1405 if (unlikely(lock_taken))
1406 goto out_unlock_put_key;
1407
1408 /*
1409 * We dont have the lock. Look up the PI state (or create it if
1410 * we are the first waiter):
1411 */
1412 ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
1413
1414 if (unlikely(ret)) { 1837 if (unlikely(ret)) {
1415 switch (ret) { 1838 switch (ret) {
1416 1839 case 1:
1840 /* We got the lock. */
1841 ret = 0;
1842 goto out_unlock_put_key;
1843 case -EFAULT:
1844 goto uaddr_faulted;
1417 case -EAGAIN: 1845 case -EAGAIN:
1418 /* 1846 /*
1419 * Task is exiting and we just wait for the 1847 * Task is exiting and we just wait for the
@@ -1423,25 +1851,6 @@ retry_locked:
1423 put_futex_key(fshared, &q.key); 1851 put_futex_key(fshared, &q.key);
1424 cond_resched(); 1852 cond_resched();
1425 goto retry; 1853 goto retry;
1426
1427 case -ESRCH:
1428 /*
1429 * No owner found for this futex. Check if the
1430 * OWNER_DIED bit is set to figure out whether
1431 * this is a robust futex or not.
1432 */
1433 if (get_futex_value_locked(&curval, uaddr))
1434 goto uaddr_faulted;
1435
1436 /*
1437 * We simply start over in case of a robust
1438 * futex. The code above will take the futex
1439 * and return happy.
1440 */
1441 if (curval & FUTEX_OWNER_DIED) {
1442 ownerdied = 1;
1443 goto retry_locked;
1444 }
1445 default: 1854 default:
1446 goto out_unlock_put_key; 1855 goto out_unlock_put_key;
1447 } 1856 }
@@ -1465,71 +1874,21 @@ retry_locked:
1465 } 1874 }
1466 1875
1467 spin_lock(q.lock_ptr); 1876 spin_lock(q.lock_ptr);
1468 1877 /*
1469 if (!ret) { 1878 * Fixup the pi_state owner and possibly acquire the lock if we
1470 /* 1879 * haven't already.
1471 * Got the lock. We might not be the anticipated owner 1880 */
1472 * if we did a lock-steal - fix up the PI-state in 1881 res = fixup_owner(uaddr, fshared, &q, !ret);
1473 * that case: 1882 /*
1474 */ 1883 * If fixup_owner() returned an error, propagate that. If it acquired
1475 if (q.pi_state->owner != curr) 1884 * the lock, clear our -ETIMEDOUT or -EINTR.
1476 ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); 1885 */
1477 } else { 1886 if (res)
1478 /* 1887 ret = (res < 0) ? res : 0;
1479 * Catch the rare case, where the lock was released
1480 * when we were on the way back before we locked the
1481 * hash bucket.
1482 */
1483 if (q.pi_state->owner == curr) {
1484 /*
1485 * Try to get the rt_mutex now. This might
1486 * fail as some other task acquired the
1487 * rt_mutex after we removed ourself from the
1488 * rt_mutex waiters list.
1489 */
1490 if (rt_mutex_trylock(&q.pi_state->pi_mutex))
1491 ret = 0;
1492 else {
1493 /*
1494 * pi_state is incorrect, some other
1495 * task did a lock steal and we
1496 * returned due to timeout or signal
1497 * without taking the rt_mutex. Too
1498 * late. We can access the
1499 * rt_mutex_owner without locking, as
1500 * the other task is now blocked on
1501 * the hash bucket lock. Fix the state
1502 * up.
1503 */
1504 struct task_struct *owner;
1505 int res;
1506
1507 owner = rt_mutex_owner(&q.pi_state->pi_mutex);
1508 res = fixup_pi_state_owner(uaddr, &q, owner,
1509 fshared);
1510
1511 /* propagate -EFAULT, if the fixup failed */
1512 if (res)
1513 ret = res;
1514 }
1515 } else {
1516 /*
1517 * Paranoia check. If we did not take the lock
1518 * in the trylock above, then we should not be
1519 * the owner of the rtmutex, neither the real
1520 * nor the pending one:
1521 */
1522 if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
1523 printk(KERN_ERR "futex_lock_pi: ret = %d "
1524 "pi-mutex: %p pi-state %p\n", ret,
1525 q.pi_state->pi_mutex.owner,
1526 q.pi_state->owner);
1527 }
1528 }
1529 1888
1530 /* 1889 /*
1531 * If fixup_pi_state_owner() faulted and was unable to handle the 1890 * If fixup_owner() faulted and was unable to handle the fault, unlock
1532 * fault, unlock it and return the fault to userspace. 1891 * it and return the fault to userspace.
1533 */ 1892 */
1534 if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) 1893 if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
1535 rt_mutex_unlock(&q.pi_state->pi_mutex); 1894 rt_mutex_unlock(&q.pi_state->pi_mutex);
@@ -1537,9 +1896,7 @@ retry_locked:
1537 /* Unqueue and drop the lock */ 1896 /* Unqueue and drop the lock */
1538 unqueue_me_pi(&q); 1897 unqueue_me_pi(&q);
1539 1898
1540 if (to) 1899 goto out;
1541 destroy_hrtimer_on_stack(&to->timer);
1542 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1543 1900
1544out_unlock_put_key: 1901out_unlock_put_key:
1545 queue_unlock(&q, hb); 1902 queue_unlock(&q, hb);
@@ -1549,7 +1906,7 @@ out_put_key:
1549out: 1906out:
1550 if (to) 1907 if (to)
1551 destroy_hrtimer_on_stack(&to->timer); 1908 destroy_hrtimer_on_stack(&to->timer);
1552 return ret; 1909 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1553 1910
1554uaddr_faulted: 1911uaddr_faulted:
1555 /* 1912 /*
@@ -1572,7 +1929,6 @@ uaddr_faulted:
1572 goto retry; 1929 goto retry;
1573} 1930}
1574 1931
1575
1576/* 1932/*
1577 * Userspace attempted a TID -> 0 atomic transition, and failed. 1933 * Userspace attempted a TID -> 0 atomic transition, and failed.
1578 * This is the in-kernel slowpath: we look up the PI state (if any), 1934 * This is the in-kernel slowpath: we look up the PI state (if any),
@@ -1674,6 +2030,229 @@ pi_faulted:
1674 return ret; 2030 return ret;
1675} 2031}
1676 2032
2033/**
2034 * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
2035 * @hb: the hash_bucket futex_q was originally enqueued on
2036 * @q: the futex_q woken while waiting to be requeued
2037 * @key2: the futex_key of the requeue target futex
2038 * @timeout: the timeout associated with the wait (NULL if none)
2039 *
2040 * Detect if the task was woken on the initial futex as opposed to the requeue
2041 * target futex. If so, determine if it was a timeout or a signal that caused
2042 * the wakeup and return the appropriate error code to the caller. Must be
2043 * called with the hb lock held.
2044 *
2045 * Returns
2046 * 0 - no early wakeup detected
2047 * <0 - -ETIMEDOUT or -ERESTARTNOINTR
2048 */
2049static inline
2050int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2051 struct futex_q *q, union futex_key *key2,
2052 struct hrtimer_sleeper *timeout)
2053{
2054 int ret = 0;
2055
2056 /*
2057 * With the hb lock held, we avoid races while we process the wakeup.
2058 * We only need to hold hb (and not hb2) to ensure atomicity as the
2059 * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
2060 * It can't be requeued from uaddr2 to something else since we don't
2061 * support a PI aware source futex for requeue.
2062 */
2063 if (!match_futex(&q->key, key2)) {
2064 WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
2065 /*
2066 * We were woken prior to requeue by a timeout or a signal.
2067 * Unqueue the futex_q and determine which it was.
2068 */
2069 plist_del(&q->list, &q->list.plist);
2070 drop_futex_key_refs(&q->key);
2071
2072 if (timeout && !timeout->task)
2073 ret = -ETIMEDOUT;
2074 else
2075 ret = -ERESTARTNOINTR;
2076 }
2077 return ret;
2078}
2079
2080/**
2081 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2082 * @uaddr: the futex we initially wait on (non-pi)
2083 * @fshared: whether the futexes are shared (1) or not (0). They must be
2084 * the same type, no requeueing from private to shared, etc.
2085 * @val: the expected value of uaddr
2086 * @abs_time: absolute timeout
2087 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all.
2088 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
2089 * @uaddr2: the pi futex we will take prior to returning to user-space
2090 *
2091 * The caller will wait on uaddr and will be requeued by futex_requeue() to
2092 * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and
2093 * complete the acquisition of the rt_mutex prior to returning to userspace.
2094 * This ensures the rt_mutex maintains an owner when it has waiters; without
2095 * one, the pi logic wouldn't know which task to boost/deboost, if there was a
2096 * need to.
2097 *
2098 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2099 * via the following:
2100 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2101 * 2) wakeup on uaddr2 after a requeue and subsequent unlock
2102 * 3) signal (before or after requeue)
2103 * 4) timeout (before or after requeue)
2104 *
2105 * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function.
2106 *
2107 * If 2, we may then block on trying to take the rt_mutex and return via:
2108 * 5) successful lock
2109 * 6) signal
2110 * 7) timeout
2111 * 8) other lock acquisition failure
2112 *
2113 * If 6, we setup a restart_block with futex_lock_pi() as the function.
2114 *
2115 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2116 *
2117 * Returns:
2118 * 0 - On success
2119 * <0 - On error
2120 */
2121static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2122 u32 val, ktime_t *abs_time, u32 bitset,
2123 int clockrt, u32 __user *uaddr2)
2124{
2125 struct hrtimer_sleeper timeout, *to = NULL;
2126 struct rt_mutex_waiter rt_waiter;
2127 struct rt_mutex *pi_mutex = NULL;
2128 struct futex_hash_bucket *hb;
2129 union futex_key key2;
2130 struct futex_q q;
2131 int res, ret;
2132
2133 if (!bitset)
2134 return -EINVAL;
2135
2136 if (abs_time) {
2137 to = &timeout;
2138 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
2139 CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2140 hrtimer_init_sleeper(to, current);
2141 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2142 current->timer_slack_ns);
2143 }
2144
2145 /*
2146 * The waiter is allocated on our stack, manipulated by the requeue
2147 * code while we sleep on uaddr.
2148 */
2149 debug_rt_mutex_init_waiter(&rt_waiter);
2150 rt_waiter.task = NULL;
2151
2152 q.pi_state = NULL;
2153 q.bitset = bitset;
2154 q.rt_waiter = &rt_waiter;
2155
2156 key2 = FUTEX_KEY_INIT;
2157 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
2158 if (unlikely(ret != 0))
2159 goto out;
2160
2161 /* Prepare to wait on uaddr. */
2162 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
2163 if (ret)
2164 goto out_key2;
2165
2166 /* Queue the futex_q, drop the hb lock, wait for wakeup. */
2167 futex_wait_queue_me(hb, &q, to);
2168
2169 spin_lock(&hb->lock);
2170 ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
2171 spin_unlock(&hb->lock);
2172 if (ret)
2173 goto out_put_keys;
2174
2175 /*
2176 * In order for us to be here, we know our q.key == key2, and since
2177 * we took the hb->lock above, we also know that futex_requeue() has
2178 * completed and we no longer have to concern ourselves with a wakeup
2179 * race with the atomic proxy lock acquisition by the requeue code.
2180 */
2181
2182 /* Check if the requeue code acquired the second futex for us. */
2183 if (!q.rt_waiter) {
2184 /*
2185 * Got the lock. We might not be the anticipated owner if we
2186 * did a lock-steal - fix up the PI-state in that case.
2187 */
2188 if (q.pi_state && (q.pi_state->owner != current)) {
2189 spin_lock(q.lock_ptr);
2190 ret = fixup_pi_state_owner(uaddr2, &q, current,
2191 fshared);
2192 spin_unlock(q.lock_ptr);
2193 }
2194 } else {
2195 /*
2196 * We have been woken up by futex_unlock_pi(), a timeout, or a
2197 * signal. futex_unlock_pi() will not destroy the lock_ptr nor
2198 * the pi_state.
2199 */
2200 WARN_ON(!&q.pi_state);
2201 pi_mutex = &q.pi_state->pi_mutex;
2202 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
2203 debug_rt_mutex_free_waiter(&rt_waiter);
2204
2205 spin_lock(q.lock_ptr);
2206 /*
2207 * Fixup the pi_state owner and possibly acquire the lock if we
2208 * haven't already.
2209 */
2210 res = fixup_owner(uaddr2, fshared, &q, !ret);
2211 /*
2212 * If fixup_owner() returned an error, propagate that. If it
2213 * acquired the lock, clear our -ETIMEDOUT or -EINTR.
2214 */
2215 if (res)
2216 ret = (res < 0) ? res : 0;
2217
2218 /* Unqueue and drop the lock. */
2219 unqueue_me_pi(&q);
2220 }
2221
2222 /*
2223 * If fixup_pi_state_owner() faulted and was unable to handle the
2224 * fault, unlock the rt_mutex and return the fault to userspace.
2225 */
2226 if (ret == -EFAULT) {
2227 if (rt_mutex_owner(pi_mutex) == current)
2228 rt_mutex_unlock(pi_mutex);
2229 } else if (ret == -EINTR) {
2230 /*
2231 * We've already been requeued, but we have no way to
2232 * restart by calling futex_lock_pi() directly. We
2233 * could restart the syscall, but that will look at
2234 * the user space value and return right away. So we
2235 * drop back with EWOULDBLOCK to tell user space that
2236 * "val" has been changed. That's the same what the
2237 * restart of the syscall would do in
2238 * futex_wait_setup().
2239 */
2240 ret = -EWOULDBLOCK;
2241 }
2242
2243out_put_keys:
2244 put_futex_key(fshared, &q.key);
2245out_key2:
2246 put_futex_key(fshared, &key2);
2247
2248out:
2249 if (to) {
2250 hrtimer_cancel(&to->timer);
2251 destroy_hrtimer_on_stack(&to->timer);
2252 }
2253 return ret;
2254}
2255
1677/* 2256/*
1678 * Support for robust futexes: the kernel cleans up held futexes at 2257 * Support for robust futexes: the kernel cleans up held futexes at
1679 * thread exit time. 2258 * thread exit time.
@@ -1896,7 +2475,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1896 fshared = 1; 2475 fshared = 1;
1897 2476
1898 clockrt = op & FUTEX_CLOCK_REALTIME; 2477 clockrt = op & FUTEX_CLOCK_REALTIME;
1899 if (clockrt && cmd != FUTEX_WAIT_BITSET) 2478 if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
1900 return -ENOSYS; 2479 return -ENOSYS;
1901 2480
1902 switch (cmd) { 2481 switch (cmd) {
@@ -1911,10 +2490,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1911 ret = futex_wake(uaddr, fshared, val, val3); 2490 ret = futex_wake(uaddr, fshared, val, val3);
1912 break; 2491 break;
1913 case FUTEX_REQUEUE: 2492 case FUTEX_REQUEUE:
1914 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); 2493 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
1915 break; 2494 break;
1916 case FUTEX_CMP_REQUEUE: 2495 case FUTEX_CMP_REQUEUE:
1917 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); 2496 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
2497 0);
1918 break; 2498 break;
1919 case FUTEX_WAKE_OP: 2499 case FUTEX_WAKE_OP:
1920 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); 2500 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
@@ -1931,6 +2511,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1931 if (futex_cmpxchg_enabled) 2511 if (futex_cmpxchg_enabled)
1932 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); 2512 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
1933 break; 2513 break;
2514 case FUTEX_WAIT_REQUEUE_PI:
2515 val3 = FUTEX_BITSET_MATCH_ANY;
2516 ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
2517 clockrt, uaddr2);
2518 break;
2519 case FUTEX_CMP_REQUEUE_PI:
2520 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
2521 1);
2522 break;
1934 default: 2523 default:
1935 ret = -ENOSYS; 2524 ret = -ENOSYS;
1936 } 2525 }
@@ -1948,7 +2537,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
1948 int cmd = op & FUTEX_CMD_MASK; 2537 int cmd = op & FUTEX_CMD_MASK;
1949 2538
1950 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || 2539 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
1951 cmd == FUTEX_WAIT_BITSET)) { 2540 cmd == FUTEX_WAIT_BITSET ||
2541 cmd == FUTEX_WAIT_REQUEUE_PI)) {
1952 if (copy_from_user(&ts, utime, sizeof(ts)) != 0) 2542 if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
1953 return -EFAULT; 2543 return -EFAULT;
1954 if (!timespec_valid(&ts)) 2544 if (!timespec_valid(&ts))
@@ -1960,11 +2550,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
1960 tp = &t; 2550 tp = &t;
1961 } 2551 }
1962 /* 2552 /*
1963 * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. 2553 * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
1964 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. 2554 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
1965 */ 2555 */
1966 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || 2556 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
1967 cmd == FUTEX_WAKE_OP) 2557 cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
1968 val2 = (u32) (unsigned long) utime; 2558 val2 = (u32) (unsigned long) utime;
1969 2559
1970 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 2560 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 3394f8f52964..7d047808419d 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
6obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o 6obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o
7obj-$(CONFIG_PM_SLEEP) += pm.o 7obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c687ba4363f2..13c68e71b726 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -359,7 +359,6 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
359 359
360 spin_lock(&desc->lock); 360 spin_lock(&desc->lock);
361 mask_ack_irq(desc, irq); 361 mask_ack_irq(desc, irq);
362 desc = irq_remap_to_desc(irq, desc);
363 362
364 if (unlikely(desc->status & IRQ_INPROGRESS)) 363 if (unlikely(desc->status & IRQ_INPROGRESS))
365 goto out_unlock; 364 goto out_unlock;
@@ -438,7 +437,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
438 desc->status &= ~IRQ_INPROGRESS; 437 desc->status &= ~IRQ_INPROGRESS;
439out: 438out:
440 desc->chip->eoi(irq); 439 desc->chip->eoi(irq);
441 desc = irq_remap_to_desc(irq, desc);
442 440
443 spin_unlock(&desc->lock); 441 spin_unlock(&desc->lock);
444} 442}
@@ -475,7 +473,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
475 !desc->action)) { 473 !desc->action)) {
476 desc->status |= (IRQ_PENDING | IRQ_MASKED); 474 desc->status |= (IRQ_PENDING | IRQ_MASKED);
477 mask_ack_irq(desc, irq); 475 mask_ack_irq(desc, irq);
478 desc = irq_remap_to_desc(irq, desc);
479 goto out_unlock; 476 goto out_unlock;
480 } 477 }
481 kstat_incr_irqs_this_cpu(irq, desc); 478 kstat_incr_irqs_this_cpu(irq, desc);
@@ -483,7 +480,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
483 /* Start handling the irq */ 480 /* Start handling the irq */
484 if (desc->chip->ack) 481 if (desc->chip->ack)
485 desc->chip->ack(irq); 482 desc->chip->ack(irq);
486 desc = irq_remap_to_desc(irq, desc);
487 483
488 /* Mark the IRQ currently in progress.*/ 484 /* Mark the IRQ currently in progress.*/
489 desc->status |= IRQ_INPROGRESS; 485 desc->status |= IRQ_INPROGRESS;
@@ -544,10 +540,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
544 if (!noirqdebug) 540 if (!noirqdebug)
545 note_interrupt(irq, desc, action_ret); 541 note_interrupt(irq, desc, action_ret);
546 542
547 if (desc->chip->eoi) { 543 if (desc->chip->eoi)
548 desc->chip->eoi(irq); 544 desc->chip->eoi(irq);
549 desc = irq_remap_to_desc(irq, desc);
550 }
551} 545}
552 546
553void 547void
@@ -582,10 +576,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
582 576
583 /* Uninstall? */ 577 /* Uninstall? */
584 if (handle == handle_bad_irq) { 578 if (handle == handle_bad_irq) {
585 if (desc->chip != &no_irq_chip) { 579 if (desc->chip != &no_irq_chip)
586 mask_ack_irq(desc, irq); 580 mask_ack_irq(desc, irq);
587 desc = irq_remap_to_desc(irq, desc);
588 }
589 desc->status |= IRQ_DISABLED; 581 desc->status |= IRQ_DISABLED;
590 desc->depth = 1; 582 desc->depth = 1;
591 } 583 }
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 26e08754744f..18041a254d32 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,6 +11,7 @@
11 */ 11 */
12 12
13#include <linux/irq.h> 13#include <linux/irq.h>
14#include <linux/slab.h>
14#include <linux/module.h> 15#include <linux/module.h>
15#include <linux/random.h> 16#include <linux/random.h>
16#include <linux/interrupt.h> 17#include <linux/interrupt.h>
@@ -81,45 +82,48 @@ static struct irq_desc irq_desc_init = {
81 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 82 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
82}; 83};
83 84
84void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) 85void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
85{ 86{
86 int node;
87 void *ptr; 87 void *ptr;
88 88
89 node = cpu_to_node(cpu); 89 if (slab_is_available())
90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node); 90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
91 GFP_ATOMIC, node);
92 else
93 ptr = alloc_bootmem_node(NODE_DATA(node),
94 nr * sizeof(*desc->kstat_irqs));
91 95
92 /* 96 /*
93 * don't overwite if can not get new one 97 * don't overwite if can not get new one
94 * init_copy_kstat_irqs() could still use old one 98 * init_copy_kstat_irqs() could still use old one
95 */ 99 */
96 if (ptr) { 100 if (ptr) {
97 printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", 101 printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node);
98 cpu, node);
99 desc->kstat_irqs = ptr; 102 desc->kstat_irqs = ptr;
100 } 103 }
101} 104}
102 105
103static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) 106static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
104{ 107{
105 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); 108 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
106 109
107 spin_lock_init(&desc->lock); 110 spin_lock_init(&desc->lock);
108 desc->irq = irq; 111 desc->irq = irq;
109#ifdef CONFIG_SMP 112#ifdef CONFIG_SMP
110 desc->cpu = cpu; 113 desc->node = node;
111#endif 114#endif
112 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 115 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
113 init_kstat_irqs(desc, cpu, nr_cpu_ids); 116 init_kstat_irqs(desc, node, nr_cpu_ids);
114 if (!desc->kstat_irqs) { 117 if (!desc->kstat_irqs) {
115 printk(KERN_ERR "can not alloc kstat_irqs\n"); 118 printk(KERN_ERR "can not alloc kstat_irqs\n");
116 BUG_ON(1); 119 BUG_ON(1);
117 } 120 }
118 if (!init_alloc_desc_masks(desc, cpu, false)) { 121 if (!alloc_desc_masks(desc, node, false)) {
119 printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); 122 printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
120 BUG_ON(1); 123 BUG_ON(1);
121 } 124 }
122 arch_init_chip_data(desc, cpu); 125 init_desc_masks(desc);
126 arch_init_chip_data(desc, node);
123} 127}
124 128
125/* 129/*
@@ -169,7 +173,8 @@ int __init early_irq_init(void)
169 desc[i].irq = i; 173 desc[i].irq = i;
170 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; 174 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
171 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 175 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
172 init_alloc_desc_masks(&desc[i], 0, true); 176 alloc_desc_masks(&desc[i], 0, true);
177 init_desc_masks(&desc[i]);
173 irq_desc_ptrs[i] = desc + i; 178 irq_desc_ptrs[i] = desc + i;
174 } 179 }
175 180
@@ -187,11 +192,10 @@ struct irq_desc *irq_to_desc(unsigned int irq)
187 return NULL; 192 return NULL;
188} 193}
189 194
190struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) 195struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
191{ 196{
192 struct irq_desc *desc; 197 struct irq_desc *desc;
193 unsigned long flags; 198 unsigned long flags;
194 int node;
195 199
196 if (irq >= nr_irqs) { 200 if (irq >= nr_irqs) {
197 WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", 201 WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
@@ -210,15 +214,17 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
210 if (desc) 214 if (desc)
211 goto out_unlock; 215 goto out_unlock;
212 216
213 node = cpu_to_node(cpu); 217 if (slab_is_available())
214 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); 218 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
215 printk(KERN_DEBUG " alloc irq_desc for %d on cpu %d node %d\n", 219 else
216 irq, cpu, node); 220 desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
221
222 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
217 if (!desc) { 223 if (!desc) {
218 printk(KERN_ERR "can not alloc irq_desc\n"); 224 printk(KERN_ERR "can not alloc irq_desc\n");
219 BUG_ON(1); 225 BUG_ON(1);
220 } 226 }
221 init_one_irq_desc(irq, desc, cpu); 227 init_one_irq_desc(irq, desc, node);
222 228
223 irq_desc_ptrs[irq] = desc; 229 irq_desc_ptrs[irq] = desc;
224 230
@@ -256,7 +262,8 @@ int __init early_irq_init(void)
256 262
257 for (i = 0; i < count; i++) { 263 for (i = 0; i < count; i++) {
258 desc[i].irq = i; 264 desc[i].irq = i;
259 init_alloc_desc_masks(&desc[i], 0, true); 265 alloc_desc_masks(&desc[i], 0, true);
266 init_desc_masks(&desc[i]);
260 desc[i].kstat_irqs = kstat_irqs_all[i]; 267 desc[i].kstat_irqs = kstat_irqs_all[i];
261 } 268 }
262 return arch_early_irq_init(); 269 return arch_early_irq_init();
@@ -267,7 +274,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
267 return (irq < NR_IRQS) ? irq_desc + irq : NULL; 274 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
268} 275}
269 276
270struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) 277struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
271{ 278{
272 return irq_to_desc(irq); 279 return irq_to_desc(irq);
273} 280}
@@ -453,11 +460,8 @@ unsigned int __do_IRQ(unsigned int irq)
453 /* 460 /*
454 * No locking required for CPU-local interrupts: 461 * No locking required for CPU-local interrupts:
455 */ 462 */
456 if (desc->chip->ack) { 463 if (desc->chip->ack)
457 desc->chip->ack(irq); 464 desc->chip->ack(irq);
458 /* get new one */
459 desc = irq_remap_to_desc(irq, desc);
460 }
461 if (likely(!(desc->status & IRQ_DISABLED))) { 465 if (likely(!(desc->status & IRQ_DISABLED))) {
462 action_ret = handle_IRQ_event(irq, desc->action); 466 action_ret = handle_IRQ_event(irq, desc->action);
463 if (!noirqdebug) 467 if (!noirqdebug)
@@ -468,10 +472,8 @@ unsigned int __do_IRQ(unsigned int irq)
468 } 472 }
469 473
470 spin_lock(&desc->lock); 474 spin_lock(&desc->lock);
471 if (desc->chip->ack) { 475 if (desc->chip->ack)
472 desc->chip->ack(irq); 476 desc->chip->ack(irq);
473 desc = irq_remap_to_desc(irq, desc);
474 }
475 /* 477 /*
476 * REPLAY is when Linux resends an IRQ that was dropped earlier 478 * REPLAY is when Linux resends an IRQ that was dropped earlier
477 * WAITING is used by probe to mark irqs that are being tested 479 * WAITING is used by probe to mark irqs that are being tested
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 01ce20eab38f..73468253143b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -16,7 +16,7 @@ extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
16extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 16extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
17 17
18extern struct lock_class_key irq_desc_lock_class; 18extern struct lock_class_key irq_desc_lock_class;
19extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); 19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
20extern void clear_kstat_irqs(struct irq_desc *desc); 20extern void clear_kstat_irqs(struct irq_desc *desc);
21extern spinlock_t sparse_irq_lock; 21extern spinlock_t sparse_irq_lock;
22 22
@@ -42,6 +42,9 @@ static inline void unregister_handler_proc(unsigned int irq,
42 42
43extern int irq_select_affinity_usr(unsigned int irq); 43extern int irq_select_affinity_usr(unsigned int irq);
44 44
45extern void
46irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask);
47
45/* 48/*
46 * Debugging printout: 49 * Debugging printout:
47 */ 50 */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 2734eca59243..aaf5c9d05770 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -80,7 +80,7 @@ int irq_can_set_affinity(unsigned int irq)
80 return 1; 80 return 1;
81} 81}
82 82
83static void 83void
84irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) 84irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask)
85{ 85{
86 struct irqaction *action = desc->action; 86 struct irqaction *action = desc->action;
@@ -109,17 +109,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
109 spin_lock_irqsave(&desc->lock, flags); 109 spin_lock_irqsave(&desc->lock, flags);
110 110
111#ifdef CONFIG_GENERIC_PENDING_IRQ 111#ifdef CONFIG_GENERIC_PENDING_IRQ
112 if (desc->status & IRQ_MOVE_PCNTXT) 112 if (desc->status & IRQ_MOVE_PCNTXT) {
113 desc->chip->set_affinity(irq, cpumask); 113 if (!desc->chip->set_affinity(irq, cpumask)) {
114 cpumask_copy(desc->affinity, cpumask);
115 irq_set_thread_affinity(desc, cpumask);
116 }
117 }
114 else { 118 else {
115 desc->status |= IRQ_MOVE_PENDING; 119 desc->status |= IRQ_MOVE_PENDING;
116 cpumask_copy(desc->pending_mask, cpumask); 120 cpumask_copy(desc->pending_mask, cpumask);
117 } 121 }
118#else 122#else
119 cpumask_copy(desc->affinity, cpumask); 123 if (!desc->chip->set_affinity(irq, cpumask)) {
120 desc->chip->set_affinity(irq, cpumask); 124 cpumask_copy(desc->affinity, cpumask);
125 irq_set_thread_affinity(desc, cpumask);
126 }
121#endif 127#endif
122 irq_set_thread_affinity(desc, cpumask);
123 desc->status |= IRQ_AFFINITY_SET; 128 desc->status |= IRQ_AFFINITY_SET;
124 spin_unlock_irqrestore(&desc->lock, flags); 129 spin_unlock_irqrestore(&desc->lock, flags);
125 return 0; 130 return 0;
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index e05ad9be43b7..cfe767ca1545 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -1,5 +1,8 @@
1 1
2#include <linux/irq.h> 2#include <linux/irq.h>
3#include <linux/interrupt.h>
4
5#include "internals.h"
3 6
4void move_masked_irq(int irq) 7void move_masked_irq(int irq)
5{ 8{
@@ -39,11 +42,12 @@ void move_masked_irq(int irq)
39 * masking the irqs. 42 * masking the irqs.
40 */ 43 */
41 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) 44 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
42 < nr_cpu_ids)) { 45 < nr_cpu_ids))
43 cpumask_and(desc->affinity, 46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) {
44 desc->pending_mask, cpu_online_mask); 47 cpumask_copy(desc->affinity, desc->pending_mask);
45 desc->chip->set_affinity(irq, desc->affinity); 48 irq_set_thread_affinity(desc, desc->pending_mask);
46 } 49 }
50
47 cpumask_clear(desc->pending_mask); 51 cpumask_clear(desc->pending_mask);
48} 52}
49 53
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 44bbdcbaf8d2..2f69bee57bf2 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -15,9 +15,9 @@
15 15
16static void init_copy_kstat_irqs(struct irq_desc *old_desc, 16static void init_copy_kstat_irqs(struct irq_desc *old_desc,
17 struct irq_desc *desc, 17 struct irq_desc *desc,
18 int cpu, int nr) 18 int node, int nr)
19{ 19{
20 init_kstat_irqs(desc, cpu, nr); 20 init_kstat_irqs(desc, node, nr);
21 21
22 if (desc->kstat_irqs != old_desc->kstat_irqs) 22 if (desc->kstat_irqs != old_desc->kstat_irqs)
23 memcpy(desc->kstat_irqs, old_desc->kstat_irqs, 23 memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
@@ -34,20 +34,20 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
34} 34}
35 35
36static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, 36static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
37 struct irq_desc *desc, int cpu) 37 struct irq_desc *desc, int node)
38{ 38{
39 memcpy(desc, old_desc, sizeof(struct irq_desc)); 39 memcpy(desc, old_desc, sizeof(struct irq_desc));
40 if (!init_alloc_desc_masks(desc, cpu, false)) { 40 if (!alloc_desc_masks(desc, node, false)) {
41 printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " 41 printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
42 "for migration.\n", irq); 42 "for migration.\n", irq);
43 return false; 43 return false;
44 } 44 }
45 spin_lock_init(&desc->lock); 45 spin_lock_init(&desc->lock);
46 desc->cpu = cpu; 46 desc->node = node;
47 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 47 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
48 init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); 48 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
49 init_copy_desc_masks(old_desc, desc); 49 init_copy_desc_masks(old_desc, desc);
50 arch_init_copy_chip_data(old_desc, desc, cpu); 50 arch_init_copy_chip_data(old_desc, desc, node);
51 return true; 51 return true;
52} 52}
53 53
@@ -59,12 +59,11 @@ static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
59} 59}
60 60
61static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, 61static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
62 int cpu) 62 int node)
63{ 63{
64 struct irq_desc *desc; 64 struct irq_desc *desc;
65 unsigned int irq; 65 unsigned int irq;
66 unsigned long flags; 66 unsigned long flags;
67 int node;
68 67
69 irq = old_desc->irq; 68 irq = old_desc->irq;
70 69
@@ -76,7 +75,6 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
76 if (desc && old_desc != desc) 75 if (desc && old_desc != desc)
77 goto out_unlock; 76 goto out_unlock;
78 77
79 node = cpu_to_node(cpu);
80 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); 78 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
81 if (!desc) { 79 if (!desc) {
82 printk(KERN_ERR "irq %d: can not get new irq_desc " 80 printk(KERN_ERR "irq %d: can not get new irq_desc "
@@ -85,7 +83,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
85 desc = old_desc; 83 desc = old_desc;
86 goto out_unlock; 84 goto out_unlock;
87 } 85 }
88 if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) { 86 if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {
89 /* still use old one */ 87 /* still use old one */
90 kfree(desc); 88 kfree(desc);
91 desc = old_desc; 89 desc = old_desc;
@@ -97,9 +95,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
97 95
98 /* free the old one */ 96 /* free the old one */
99 free_one_irq_desc(old_desc, desc); 97 free_one_irq_desc(old_desc, desc);
100 spin_unlock(&old_desc->lock);
101 kfree(old_desc); 98 kfree(old_desc);
102 spin_lock(&desc->lock);
103 99
104 return desc; 100 return desc;
105 101
@@ -109,24 +105,14 @@ out_unlock:
109 return desc; 105 return desc;
110} 106}
111 107
112struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu) 108struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
113{ 109{
114 int old_cpu;
115 int node, old_node;
116
117 /* those are all static, do not move them */ 110 /* those are all static, do not move them */
118 if (desc->irq < NR_IRQS_LEGACY) 111 if (desc->irq < NR_IRQS_LEGACY)
119 return desc; 112 return desc;
120 113
121 old_cpu = desc->cpu; 114 if (desc->node != node)
122 if (old_cpu != cpu) { 115 desc = __real_move_irq_desc(desc, node);
123 node = cpu_to_node(cpu);
124 old_node = cpu_to_node(old_cpu);
125 if (old_node != node)
126 desc = __real_move_irq_desc(desc, cpu);
127 else
128 desc->cpu = cpu;
129 }
130 116
131 return desc; 117 return desc;
132} 118}
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 507cf2b5e9f1..e5cc0cd28d54 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -249,7 +249,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
249 249
250 /* didn't get the lock, go to sleep: */ 250 /* didn't get the lock, go to sleep: */
251 spin_unlock_mutex(&lock->wait_lock, flags); 251 spin_unlock_mutex(&lock->wait_lock, flags);
252 __schedule(); 252 preempt_enable_no_resched();
253 schedule();
254 preempt_disable();
253 spin_lock_mutex(&lock->wait_lock, flags); 255 spin_lock_mutex(&lock->wait_lock, flags);
254 } 256 }
255 257
@@ -471,5 +473,28 @@ int __sched mutex_trylock(struct mutex *lock)
471 473
472 return ret; 474 return ret;
473} 475}
474
475EXPORT_SYMBOL(mutex_trylock); 476EXPORT_SYMBOL(mutex_trylock);
477
478/**
479 * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
480 * @cnt: the atomic which we are to dec
481 * @lock: the mutex to return holding if we dec to 0
482 *
483 * return true and hold lock if we dec to 0, return false otherwise
484 */
485int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
486{
487 /* dec if we can't possibly hit 0 */
488 if (atomic_add_unless(cnt, -1, 1))
489 return 0;
490 /* we might hit 0, so take the lock */
491 mutex_lock(lock);
492 if (!atomic_dec_and_test(cnt)) {
493 /* when we actually did the dec, we didn't hit 0 */
494 mutex_unlock(lock);
495 return 0;
496 }
497 /* we hit 0, and we hold the lock */
498 return 1;
499}
500EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 69d9cb921ffa..820c5af44f3e 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -300,7 +300,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
300 * assigned pending owner [which might not have taken the 300 * assigned pending owner [which might not have taken the
301 * lock yet]: 301 * lock yet]:
302 */ 302 */
303static inline int try_to_steal_lock(struct rt_mutex *lock) 303static inline int try_to_steal_lock(struct rt_mutex *lock,
304 struct task_struct *task)
304{ 305{
305 struct task_struct *pendowner = rt_mutex_owner(lock); 306 struct task_struct *pendowner = rt_mutex_owner(lock);
306 struct rt_mutex_waiter *next; 307 struct rt_mutex_waiter *next;
@@ -309,11 +310,11 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
309 if (!rt_mutex_owner_pending(lock)) 310 if (!rt_mutex_owner_pending(lock))
310 return 0; 311 return 0;
311 312
312 if (pendowner == current) 313 if (pendowner == task)
313 return 1; 314 return 1;
314 315
315 spin_lock_irqsave(&pendowner->pi_lock, flags); 316 spin_lock_irqsave(&pendowner->pi_lock, flags);
316 if (current->prio >= pendowner->prio) { 317 if (task->prio >= pendowner->prio) {
317 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 318 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
318 return 0; 319 return 0;
319 } 320 }
@@ -338,21 +339,21 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
338 * We are going to steal the lock and a waiter was 339 * We are going to steal the lock and a waiter was
339 * enqueued on the pending owners pi_waiters queue. So 340 * enqueued on the pending owners pi_waiters queue. So
340 * we have to enqueue this waiter into 341 * we have to enqueue this waiter into
341 * current->pi_waiters list. This covers the case, 342 * task->pi_waiters list. This covers the case,
342 * where current is boosted because it holds another 343 * where task is boosted because it holds another
343 * lock and gets unboosted because the booster is 344 * lock and gets unboosted because the booster is
344 * interrupted, so we would delay a waiter with higher 345 * interrupted, so we would delay a waiter with higher
345 * priority than current->normal_prio. 346 * priority than task->normal_prio.
346 * 347 *
347 * Note: in the rare case of a SCHED_OTHER task changing 348 * Note: in the rare case of a SCHED_OTHER task changing
348 * its priority and thus stealing the lock, next->task 349 * its priority and thus stealing the lock, next->task
349 * might be current: 350 * might be task:
350 */ 351 */
351 if (likely(next->task != current)) { 352 if (likely(next->task != task)) {
352 spin_lock_irqsave(&current->pi_lock, flags); 353 spin_lock_irqsave(&task->pi_lock, flags);
353 plist_add(&next->pi_list_entry, &current->pi_waiters); 354 plist_add(&next->pi_list_entry, &task->pi_waiters);
354 __rt_mutex_adjust_prio(current); 355 __rt_mutex_adjust_prio(task);
355 spin_unlock_irqrestore(&current->pi_lock, flags); 356 spin_unlock_irqrestore(&task->pi_lock, flags);
356 } 357 }
357 return 1; 358 return 1;
358} 359}
@@ -389,7 +390,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
389 */ 390 */
390 mark_rt_mutex_waiters(lock); 391 mark_rt_mutex_waiters(lock);
391 392
392 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) 393 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current))
393 return 0; 394 return 0;
394 395
395 /* We got the lock. */ 396 /* We got the lock. */
@@ -411,6 +412,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
411 */ 412 */
412static int task_blocks_on_rt_mutex(struct rt_mutex *lock, 413static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
413 struct rt_mutex_waiter *waiter, 414 struct rt_mutex_waiter *waiter,
415 struct task_struct *task,
414 int detect_deadlock) 416 int detect_deadlock)
415{ 417{
416 struct task_struct *owner = rt_mutex_owner(lock); 418 struct task_struct *owner = rt_mutex_owner(lock);
@@ -418,21 +420,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
418 unsigned long flags; 420 unsigned long flags;
419 int chain_walk = 0, res; 421 int chain_walk = 0, res;
420 422
421 spin_lock_irqsave(&current->pi_lock, flags); 423 spin_lock_irqsave(&task->pi_lock, flags);
422 __rt_mutex_adjust_prio(current); 424 __rt_mutex_adjust_prio(task);
423 waiter->task = current; 425 waiter->task = task;
424 waiter->lock = lock; 426 waiter->lock = lock;
425 plist_node_init(&waiter->list_entry, current->prio); 427 plist_node_init(&waiter->list_entry, task->prio);
426 plist_node_init(&waiter->pi_list_entry, current->prio); 428 plist_node_init(&waiter->pi_list_entry, task->prio);
427 429
428 /* Get the top priority waiter on the lock */ 430 /* Get the top priority waiter on the lock */
429 if (rt_mutex_has_waiters(lock)) 431 if (rt_mutex_has_waiters(lock))
430 top_waiter = rt_mutex_top_waiter(lock); 432 top_waiter = rt_mutex_top_waiter(lock);
431 plist_add(&waiter->list_entry, &lock->wait_list); 433 plist_add(&waiter->list_entry, &lock->wait_list);
432 434
433 current->pi_blocked_on = waiter; 435 task->pi_blocked_on = waiter;
434 436
435 spin_unlock_irqrestore(&current->pi_lock, flags); 437 spin_unlock_irqrestore(&task->pi_lock, flags);
436 438
437 if (waiter == rt_mutex_top_waiter(lock)) { 439 if (waiter == rt_mutex_top_waiter(lock)) {
438 spin_lock_irqsave(&owner->pi_lock, flags); 440 spin_lock_irqsave(&owner->pi_lock, flags);
@@ -460,7 +462,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
460 spin_unlock(&lock->wait_lock); 462 spin_unlock(&lock->wait_lock);
461 463
462 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, 464 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
463 current); 465 task);
464 466
465 spin_lock(&lock->wait_lock); 467 spin_lock(&lock->wait_lock);
466 468
@@ -605,37 +607,25 @@ void rt_mutex_adjust_pi(struct task_struct *task)
605 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); 607 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
606} 608}
607 609
608/* 610/**
609 * Slow path lock function: 611 * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
612 * @lock: the rt_mutex to take
613 * @state: the state the task should block in (TASK_INTERRUPTIBLE
614 * or TASK_UNINTERRUPTIBLE)
615 * @timeout: the pre-initialized and started timer, or NULL for none
616 * @waiter: the pre-initialized rt_mutex_waiter
617 * @detect_deadlock: passed to task_blocks_on_rt_mutex
618 *
619 * lock->wait_lock must be held by the caller.
610 */ 620 */
611static int __sched 621static int __sched
612rt_mutex_slowlock(struct rt_mutex *lock, int state, 622__rt_mutex_slowlock(struct rt_mutex *lock, int state,
613 struct hrtimer_sleeper *timeout, 623 struct hrtimer_sleeper *timeout,
614 int detect_deadlock) 624 struct rt_mutex_waiter *waiter,
625 int detect_deadlock)
615{ 626{
616 struct rt_mutex_waiter waiter;
617 int ret = 0; 627 int ret = 0;
618 628
619 debug_rt_mutex_init_waiter(&waiter);
620 waiter.task = NULL;
621
622 spin_lock(&lock->wait_lock);
623
624 /* Try to acquire the lock again: */
625 if (try_to_take_rt_mutex(lock)) {
626 spin_unlock(&lock->wait_lock);
627 return 0;
628 }
629
630 set_current_state(state);
631
632 /* Setup the timer, when timeout != NULL */
633 if (unlikely(timeout)) {
634 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
635 if (!hrtimer_active(&timeout->timer))
636 timeout->task = NULL;
637 }
638
639 for (;;) { 629 for (;;) {
640 /* Try to acquire the lock: */ 630 /* Try to acquire the lock: */
641 if (try_to_take_rt_mutex(lock)) 631 if (try_to_take_rt_mutex(lock))
@@ -656,19 +646,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
656 } 646 }
657 647
658 /* 648 /*
659 * waiter.task is NULL the first time we come here and 649 * waiter->task is NULL the first time we come here and
660 * when we have been woken up by the previous owner 650 * when we have been woken up by the previous owner
661 * but the lock got stolen by a higher prio task. 651 * but the lock got stolen by a higher prio task.
662 */ 652 */
663 if (!waiter.task) { 653 if (!waiter->task) {
664 ret = task_blocks_on_rt_mutex(lock, &waiter, 654 ret = task_blocks_on_rt_mutex(lock, waiter, current,
665 detect_deadlock); 655 detect_deadlock);
666 /* 656 /*
667 * If we got woken up by the owner then start loop 657 * If we got woken up by the owner then start loop
668 * all over without going into schedule to try 658 * all over without going into schedule to try
669 * to get the lock now: 659 * to get the lock now:
670 */ 660 */
671 if (unlikely(!waiter.task)) { 661 if (unlikely(!waiter->task)) {
672 /* 662 /*
673 * Reset the return value. We might 663 * Reset the return value. We might
674 * have returned with -EDEADLK and the 664 * have returned with -EDEADLK and the
@@ -684,15 +674,52 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
684 674
685 spin_unlock(&lock->wait_lock); 675 spin_unlock(&lock->wait_lock);
686 676
687 debug_rt_mutex_print_deadlock(&waiter); 677 debug_rt_mutex_print_deadlock(waiter);
688 678
689 if (waiter.task) 679 if (waiter->task)
690 schedule_rt_mutex(lock); 680 schedule_rt_mutex(lock);
691 681
692 spin_lock(&lock->wait_lock); 682 spin_lock(&lock->wait_lock);
693 set_current_state(state); 683 set_current_state(state);
694 } 684 }
695 685
686 return ret;
687}
688
689/*
690 * Slow path lock function:
691 */
692static int __sched
693rt_mutex_slowlock(struct rt_mutex *lock, int state,
694 struct hrtimer_sleeper *timeout,
695 int detect_deadlock)
696{
697 struct rt_mutex_waiter waiter;
698 int ret = 0;
699
700 debug_rt_mutex_init_waiter(&waiter);
701 waiter.task = NULL;
702
703 spin_lock(&lock->wait_lock);
704
705 /* Try to acquire the lock again: */
706 if (try_to_take_rt_mutex(lock)) {
707 spin_unlock(&lock->wait_lock);
708 return 0;
709 }
710
711 set_current_state(state);
712
713 /* Setup the timer, when timeout != NULL */
714 if (unlikely(timeout)) {
715 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
716 if (!hrtimer_active(&timeout->timer))
717 timeout->task = NULL;
718 }
719
720 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
721 detect_deadlock);
722
696 set_current_state(TASK_RUNNING); 723 set_current_state(TASK_RUNNING);
697 724
698 if (unlikely(waiter.task)) 725 if (unlikely(waiter.task))
@@ -864,9 +891,9 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
864EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); 891EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
865 892
866/** 893/**
867 * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible 894 * rt_mutex_timed_lock - lock a rt_mutex interruptible
868 * the timeout structure is provided 895 * the timeout structure is provided
869 * by the caller 896 * by the caller
870 * 897 *
871 * @lock: the rt_mutex to be locked 898 * @lock: the rt_mutex to be locked
872 * @timeout: timeout structure or NULL (no timeout) 899 * @timeout: timeout structure or NULL (no timeout)
@@ -913,7 +940,7 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
913} 940}
914EXPORT_SYMBOL_GPL(rt_mutex_unlock); 941EXPORT_SYMBOL_GPL(rt_mutex_unlock);
915 942
916/*** 943/**
917 * rt_mutex_destroy - mark a mutex unusable 944 * rt_mutex_destroy - mark a mutex unusable
918 * @lock: the mutex to be destroyed 945 * @lock: the mutex to be destroyed
919 * 946 *
@@ -986,6 +1013,59 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
986} 1013}
987 1014
988/** 1015/**
1016 * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
1017 * @lock: the rt_mutex to take
1018 * @waiter: the pre-initialized rt_mutex_waiter
1019 * @task: the task to prepare
1020 * @detect_deadlock: perform deadlock detection (1) or not (0)
1021 *
1022 * Returns:
1023 * 0 - task blocked on lock
1024 * 1 - acquired the lock for task, caller should wake it up
1025 * <0 - error
1026 *
1027 * Special API call for FUTEX_REQUEUE_PI support.
1028 */
1029int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1030 struct rt_mutex_waiter *waiter,
1031 struct task_struct *task, int detect_deadlock)
1032{
1033 int ret;
1034
1035 spin_lock(&lock->wait_lock);
1036
1037 mark_rt_mutex_waiters(lock);
1038
1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock);
1042
1043 rt_mutex_set_owner(lock, task, 0);
1044
1045 rt_mutex_deadlock_account_lock(lock, task);
1046 return 1;
1047 }
1048
1049 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
1050
1051
1052 if (ret && !waiter->task) {
1053 /*
1054 * Reset the return value. We might have
1055 * returned with -EDEADLK and the owner
1056 * released the lock while we were walking the
1057 * pi chain. Let the waiter sort it out.
1058 */
1059 ret = 0;
1060 }
1061 spin_unlock(&lock->wait_lock);
1062
1063 debug_rt_mutex_print_deadlock(waiter);
1064
1065 return ret;
1066}
1067
1068/**
989 * rt_mutex_next_owner - return the next owner of the lock 1069 * rt_mutex_next_owner - return the next owner of the lock
990 * 1070 *
991 * @lock: the rt lock query 1071 * @lock: the rt lock query
@@ -1004,3 +1084,57 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
1004 1084
1005 return rt_mutex_top_waiter(lock)->task; 1085 return rt_mutex_top_waiter(lock)->task;
1006} 1086}
1087
1088/**
1089 * rt_mutex_finish_proxy_lock() - Complete lock acquisition
1090 * @lock: the rt_mutex we were woken on
1091 * @to: the timeout, null if none. hrtimer should already have
1092 * been started.
1093 * @waiter: the pre-initialized rt_mutex_waiter
1094 * @detect_deadlock: perform deadlock detection (1) or not (0)
1095 *
1096 * Complete the lock acquisition started on our behalf by another thread.
1097 *
1098 * Returns:
1099 * 0 - success
1100 * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK
1101 *
1102 * Special API call for PI-futex requeue support
1103 */
1104int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1105 struct hrtimer_sleeper *to,
1106 struct rt_mutex_waiter *waiter,
1107 int detect_deadlock)
1108{
1109 int ret;
1110
1111 spin_lock(&lock->wait_lock);
1112
1113 set_current_state(TASK_INTERRUPTIBLE);
1114
1115 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter,
1116 detect_deadlock);
1117
1118 set_current_state(TASK_RUNNING);
1119
1120 if (unlikely(waiter->task))
1121 remove_waiter(lock, waiter);
1122
1123 /*
1124 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
1125 * have to fix that up.
1126 */
1127 fixup_rt_mutex_waiters(lock);
1128
1129 spin_unlock(&lock->wait_lock);
1130
1131 /*
1132 * Readjust priority, when we did not get the lock. We might have been
1133 * the pending owner and boosted. Since we did not take the lock, the
1134 * PI boost has to go.
1135 */
1136 if (unlikely(ret))
1137 rt_mutex_adjust_prio(current);
1138
1139 return ret;
1140}
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index e124bf5800ea..97a2f81866af 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -120,6 +120,14 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
120 struct task_struct *proxy_owner); 120 struct task_struct *proxy_owner);
121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, 121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
122 struct task_struct *proxy_owner); 122 struct task_struct *proxy_owner);
123extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
124 struct rt_mutex_waiter *waiter,
125 struct task_struct *task,
126 int detect_deadlock);
127extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
128 struct hrtimer_sleeper *to,
129 struct rt_mutex_waiter *waiter,
130 int detect_deadlock);
123 131
124#ifdef CONFIG_DEBUG_RT_MUTEXES 132#ifdef CONFIG_DEBUG_RT_MUTEXES
125# include "rtmutex-debug.h" 133# include "rtmutex-debug.h"
diff --git a/kernel/sched.c b/kernel/sched.c
index 26efa475bdc1..076e403b9c88 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -630,6 +630,10 @@ struct rq {
630 struct list_head migration_queue; 630 struct list_head migration_queue;
631#endif 631#endif
632 632
633 /* calc_load related fields */
634 unsigned long calc_load_update;
635 long calc_load_active;
636
633#ifdef CONFIG_SCHED_HRTICK 637#ifdef CONFIG_SCHED_HRTICK
634#ifdef CONFIG_SMP 638#ifdef CONFIG_SMP
635 int hrtick_csd_pending; 639 int hrtick_csd_pending;
@@ -1728,6 +1732,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1728} 1732}
1729#endif 1733#endif
1730 1734
1735static void calc_load_account_active(struct rq *this_rq);
1736
1731#include "sched_stats.h" 1737#include "sched_stats.h"
1732#include "sched_idletask.c" 1738#include "sched_idletask.c"
1733#include "sched_fair.c" 1739#include "sched_fair.c"
@@ -2458,6 +2464,17 @@ out:
2458 return success; 2464 return success;
2459} 2465}
2460 2466
2467/**
2468 * wake_up_process - Wake up a specific process
2469 * @p: The process to be woken up.
2470 *
2471 * Attempt to wake up the nominated process and move it to the set of runnable
2472 * processes. Returns 1 if the process was woken up, 0 if it was already
2473 * running.
2474 *
2475 * It may be assumed that this function implies a write memory barrier before
2476 * changing the task state if and only if any tasks are woken up.
2477 */
2461int wake_up_process(struct task_struct *p) 2478int wake_up_process(struct task_struct *p)
2462{ 2479{
2463 return try_to_wake_up(p, TASK_ALL, 0); 2480 return try_to_wake_up(p, TASK_ALL, 0);
@@ -2766,7 +2783,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2766 * combine the page table reload and the switch backend into 2783 * combine the page table reload and the switch backend into
2767 * one hypercall. 2784 * one hypercall.
2768 */ 2785 */
2769 arch_enter_lazy_cpu_mode(); 2786 arch_start_context_switch(prev);
2770 2787
2771 if (unlikely(!mm)) { 2788 if (unlikely(!mm)) {
2772 next->active_mm = oldmm; 2789 next->active_mm = oldmm;
@@ -2856,19 +2873,72 @@ unsigned long nr_iowait(void)
2856 return sum; 2873 return sum;
2857} 2874}
2858 2875
2859unsigned long nr_active(void) 2876/* Variables and functions for calc_load */
2877static atomic_long_t calc_load_tasks;
2878static unsigned long calc_load_update;
2879unsigned long avenrun[3];
2880EXPORT_SYMBOL(avenrun);
2881
2882/**
2883 * get_avenrun - get the load average array
2884 * @loads: pointer to dest load array
2885 * @offset: offset to add
2886 * @shift: shift count to shift the result left
2887 *
2888 * These values are estimates at best, so no need for locking.
2889 */
2890void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2860{ 2891{
2861 unsigned long i, running = 0, uninterruptible = 0; 2892 loads[0] = (avenrun[0] + offset) << shift;
2893 loads[1] = (avenrun[1] + offset) << shift;
2894 loads[2] = (avenrun[2] + offset) << shift;
2895}
2862 2896
2863 for_each_online_cpu(i) { 2897static unsigned long
2864 running += cpu_rq(i)->nr_running; 2898calc_load(unsigned long load, unsigned long exp, unsigned long active)
2865 uninterruptible += cpu_rq(i)->nr_uninterruptible; 2899{
2866 } 2900 load *= exp;
2901 load += active * (FIXED_1 - exp);
2902 return load >> FSHIFT;
2903}
2867 2904
2868 if (unlikely((long)uninterruptible < 0)) 2905/*
2869 uninterruptible = 0; 2906 * calc_global_load - update the avenrun load estimates 10 ticks after the
2907 * CPUs have updated calc_load_tasks.
2908 */
2909void calc_global_load(void)
2910{
2911 unsigned long upd = calc_load_update + 10;
2912 long active;
2913
2914 if (time_before(jiffies, upd))
2915 return;
2916
2917 active = atomic_long_read(&calc_load_tasks);
2918 active = active > 0 ? active * FIXED_1 : 0;
2870 2919
2871 return running + uninterruptible; 2920 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2921 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2922 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2923
2924 calc_load_update += LOAD_FREQ;
2925}
2926
2927/*
2928 * Either called from update_cpu_load() or from a cpu going idle
2929 */
2930static void calc_load_account_active(struct rq *this_rq)
2931{
2932 long nr_active, delta;
2933
2934 nr_active = this_rq->nr_running;
2935 nr_active += (long) this_rq->nr_uninterruptible;
2936
2937 if (nr_active != this_rq->calc_load_active) {
2938 delta = nr_active - this_rq->calc_load_active;
2939 this_rq->calc_load_active = nr_active;
2940 atomic_long_add(delta, &calc_load_tasks);
2941 }
2872} 2942}
2873 2943
2874/* 2944/*
@@ -2899,6 +2969,11 @@ static void update_cpu_load(struct rq *this_rq)
2899 new_load += scale-1; 2969 new_load += scale-1;
2900 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 2970 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2901 } 2971 }
2972
2973 if (time_after_eq(jiffies, this_rq->calc_load_update)) {
2974 this_rq->calc_load_update += LOAD_FREQ;
2975 calc_load_account_active(this_rq);
2976 }
2902} 2977}
2903 2978
2904#ifdef CONFIG_SMP 2979#ifdef CONFIG_SMP
@@ -4240,10 +4315,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4240static struct { 4315static struct {
4241 atomic_t load_balancer; 4316 atomic_t load_balancer;
4242 cpumask_var_t cpu_mask; 4317 cpumask_var_t cpu_mask;
4318 cpumask_var_t ilb_grp_nohz_mask;
4243} nohz ____cacheline_aligned = { 4319} nohz ____cacheline_aligned = {
4244 .load_balancer = ATOMIC_INIT(-1), 4320 .load_balancer = ATOMIC_INIT(-1),
4245}; 4321};
4246 4322
4323#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4324/**
4325 * lowest_flag_domain - Return lowest sched_domain containing flag.
4326 * @cpu: The cpu whose lowest level of sched domain is to
4327 * be returned.
4328 * @flag: The flag to check for the lowest sched_domain
4329 * for the given cpu.
4330 *
4331 * Returns the lowest sched_domain of a cpu which contains the given flag.
4332 */
4333static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4334{
4335 struct sched_domain *sd;
4336
4337 for_each_domain(cpu, sd)
4338 if (sd && (sd->flags & flag))
4339 break;
4340
4341 return sd;
4342}
4343
4344/**
4345 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4346 * @cpu: The cpu whose domains we're iterating over.
4347 * @sd: variable holding the value of the power_savings_sd
4348 * for cpu.
4349 * @flag: The flag to filter the sched_domains to be iterated.
4350 *
4351 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4352 * set, starting from the lowest sched_domain to the highest.
4353 */
4354#define for_each_flag_domain(cpu, sd, flag) \
4355 for (sd = lowest_flag_domain(cpu, flag); \
4356 (sd && (sd->flags & flag)); sd = sd->parent)
4357
4358/**
4359 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4360 * @ilb_group: group to be checked for semi-idleness
4361 *
4362 * Returns: 1 if the group is semi-idle. 0 otherwise.
4363 *
4364 * We define a sched_group to be semi-idle if it has at least one idle CPU
4365 * and at least one non-idle CPU. This helper function checks if the given
4366 * sched_group is semi-idle or not.
4367 */
4368static inline int is_semi_idle_group(struct sched_group *ilb_group)
4369{
4370 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4371 sched_group_cpus(ilb_group));
4372
4373 /*
4374 * A sched_group is semi-idle when it has at least one busy cpu
4375 * and at least one idle cpu.
4376 */
4377 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4378 return 0;
4379
4380 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4381 return 0;
4382
4383 return 1;
4384}
4385/**
4386 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4387 * @cpu: The cpu which is nominating a new idle_load_balancer.
4388 *
4389 * Returns: the id of the idle load balancer if it exists,
4390 * else a value >= nr_cpu_ids.
4391 *
4392 * This algorithm picks the idle load balancer such that it belongs to a
4393 * semi-idle powersavings sched_domain. The idea is to try and avoid
4394 * completely idle packages/cores just for the purpose of idle load balancing
4395 * when there are other idle cpu's which are better suited for that job.
4396 */
4397static int find_new_ilb(int cpu)
4398{
4399 struct sched_domain *sd;
4400 struct sched_group *ilb_group;
4401
4402 /*
4403 * Have idle load balancer selection from semi-idle packages only
4404 * when power-aware load balancing is enabled
4405 */
4406 if (!(sched_smt_power_savings || sched_mc_power_savings))
4407 goto out_done;
4408
4409 /*
4410 * Optimize for the case when we have no idle CPUs or only one
4411 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4412 */
4413 if (cpumask_weight(nohz.cpu_mask) < 2)
4414 goto out_done;
4415
4416 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4417 ilb_group = sd->groups;
4418
4419 do {
4420 if (is_semi_idle_group(ilb_group))
4421 return cpumask_first(nohz.ilb_grp_nohz_mask);
4422
4423 ilb_group = ilb_group->next;
4424
4425 } while (ilb_group != sd->groups);
4426 }
4427
4428out_done:
4429 return cpumask_first(nohz.cpu_mask);
4430}
4431#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4432static inline int find_new_ilb(int call_cpu)
4433{
4434 return cpumask_first(nohz.cpu_mask);
4435}
4436#endif
4437
4247/* 4438/*
4248 * This routine will try to nominate the ilb (idle load balancing) 4439 * This routine will try to nominate the ilb (idle load balancing)
4249 * owner among the cpus whose ticks are stopped. ilb owner will do the idle 4440 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
@@ -4298,8 +4489,24 @@ int select_nohz_load_balancer(int stop_tick)
4298 /* make me the ilb owner */ 4489 /* make me the ilb owner */
4299 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) 4490 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4300 return 1; 4491 return 1;
4301 } else if (atomic_read(&nohz.load_balancer) == cpu) 4492 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4493 int new_ilb;
4494
4495 if (!(sched_smt_power_savings ||
4496 sched_mc_power_savings))
4497 return 1;
4498 /*
4499 * Check to see if there is a more power-efficient
4500 * ilb.
4501 */
4502 new_ilb = find_new_ilb(cpu);
4503 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4504 atomic_set(&nohz.load_balancer, -1);
4505 resched_cpu(new_ilb);
4506 return 0;
4507 }
4302 return 1; 4508 return 1;
4509 }
4303 } else { 4510 } else {
4304 if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) 4511 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4305 return 0; 4512 return 0;
@@ -4468,15 +4675,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4468 } 4675 }
4469 4676
4470 if (atomic_read(&nohz.load_balancer) == -1) { 4677 if (atomic_read(&nohz.load_balancer) == -1) {
4471 /* 4678 int ilb = find_new_ilb(cpu);
4472 * simple selection for now: Nominate the
4473 * first cpu in the nohz list to be the next
4474 * ilb owner.
4475 *
4476 * TBD: Traverse the sched domains and nominate
4477 * the nearest cpu in the nohz.cpu_mask.
4478 */
4479 int ilb = cpumask_first(nohz.cpu_mask);
4480 4679
4481 if (ilb < nr_cpu_ids) 4680 if (ilb < nr_cpu_ids)
4482 resched_cpu(ilb); 4681 resched_cpu(ilb);
@@ -5007,13 +5206,15 @@ pick_next_task(struct rq *rq)
5007/* 5206/*
5008 * schedule() is the main scheduler function. 5207 * schedule() is the main scheduler function.
5009 */ 5208 */
5010asmlinkage void __sched __schedule(void) 5209asmlinkage void __sched schedule(void)
5011{ 5210{
5012 struct task_struct *prev, *next; 5211 struct task_struct *prev, *next;
5013 unsigned long *switch_count; 5212 unsigned long *switch_count;
5014 struct rq *rq; 5213 struct rq *rq;
5015 int cpu; 5214 int cpu;
5016 5215
5216need_resched:
5217 preempt_disable();
5017 cpu = smp_processor_id(); 5218 cpu = smp_processor_id();
5018 rq = cpu_rq(cpu); 5219 rq = cpu_rq(cpu);
5019 rcu_qsctr_inc(cpu); 5220 rcu_qsctr_inc(cpu);
@@ -5070,15 +5271,9 @@ need_resched_nonpreemptible:
5070 5271
5071 if (unlikely(reacquire_kernel_lock(current) < 0)) 5272 if (unlikely(reacquire_kernel_lock(current) < 0))
5072 goto need_resched_nonpreemptible; 5273 goto need_resched_nonpreemptible;
5073}
5074 5274
5075asmlinkage void __sched schedule(void)
5076{
5077need_resched:
5078 preempt_disable();
5079 __schedule();
5080 preempt_enable_no_resched(); 5275 preempt_enable_no_resched();
5081 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 5276 if (need_resched())
5082 goto need_resched; 5277 goto need_resched;
5083} 5278}
5084EXPORT_SYMBOL(schedule); 5279EXPORT_SYMBOL(schedule);
@@ -5221,7 +5416,7 @@ EXPORT_SYMBOL(default_wake_function);
5221 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 5416 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
5222 * zero in this (rare) case, and we handle it by continuing to scan the queue. 5417 * zero in this (rare) case, and we handle it by continuing to scan the queue.
5223 */ 5418 */
5224void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 5419static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5225 int nr_exclusive, int sync, void *key) 5420 int nr_exclusive, int sync, void *key)
5226{ 5421{
5227 wait_queue_t *curr, *next; 5422 wait_queue_t *curr, *next;
@@ -5241,6 +5436,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5241 * @mode: which threads 5436 * @mode: which threads
5242 * @nr_exclusive: how many wake-one or wake-many threads to wake up 5437 * @nr_exclusive: how many wake-one or wake-many threads to wake up
5243 * @key: is directly passed to the wakeup function 5438 * @key: is directly passed to the wakeup function
5439 *
5440 * It may be assumed that this function implies a write memory barrier before
5441 * changing the task state if and only if any tasks are woken up.
5244 */ 5442 */
5245void __wake_up(wait_queue_head_t *q, unsigned int mode, 5443void __wake_up(wait_queue_head_t *q, unsigned int mode,
5246 int nr_exclusive, void *key) 5444 int nr_exclusive, void *key)
@@ -5279,6 +5477,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
5279 * with each other. This can prevent needless bouncing between CPUs. 5477 * with each other. This can prevent needless bouncing between CPUs.
5280 * 5478 *
5281 * On UP it can prevent extra preemption. 5479 * On UP it can prevent extra preemption.
5480 *
5481 * It may be assumed that this function implies a write memory barrier before
5482 * changing the task state if and only if any tasks are woken up.
5282 */ 5483 */
5283void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, 5484void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5284 int nr_exclusive, void *key) 5485 int nr_exclusive, void *key)
@@ -5315,6 +5516,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
5315 * awakened in the same order in which they were queued. 5516 * awakened in the same order in which they were queued.
5316 * 5517 *
5317 * See also complete_all(), wait_for_completion() and related routines. 5518 * See also complete_all(), wait_for_completion() and related routines.
5519 *
5520 * It may be assumed that this function implies a write memory barrier before
5521 * changing the task state if and only if any tasks are woken up.
5318 */ 5522 */
5319void complete(struct completion *x) 5523void complete(struct completion *x)
5320{ 5524{
@@ -5332,6 +5536,9 @@ EXPORT_SYMBOL(complete);
5332 * @x: holds the state of this particular completion 5536 * @x: holds the state of this particular completion
5333 * 5537 *
5334 * This will wake up all threads waiting on this particular completion event. 5538 * This will wake up all threads waiting on this particular completion event.
5539 *
5540 * It may be assumed that this function implies a write memory barrier before
5541 * changing the task state if and only if any tasks are woken up.
5335 */ 5542 */
5336void complete_all(struct completion *x) 5543void complete_all(struct completion *x)
5337{ 5544{
@@ -6490,8 +6697,9 @@ void sched_show_task(struct task_struct *p)
6490#ifdef CONFIG_DEBUG_STACK_USAGE 6697#ifdef CONFIG_DEBUG_STACK_USAGE
6491 free = stack_not_used(p); 6698 free = stack_not_used(p);
6492#endif 6699#endif
6493 printk(KERN_CONT "%5lu %5d %6d\n", free, 6700 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
6494 task_pid_nr(p), task_pid_nr(p->real_parent)); 6701 task_pid_nr(p), task_pid_nr(p->real_parent),
6702 (unsigned long)task_thread_info(p)->flags);
6495 6703
6496 show_stack(p, NULL); 6704 show_stack(p, NULL);
6497} 6705}
@@ -6970,6 +7178,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
6970 7178
6971 } 7179 }
6972} 7180}
7181
7182/*
7183 * remove the tasks which were accounted by rq from calc_load_tasks.
7184 */
7185static void calc_global_load_remove(struct rq *rq)
7186{
7187 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
7188}
6973#endif /* CONFIG_HOTPLUG_CPU */ 7189#endif /* CONFIG_HOTPLUG_CPU */
6974 7190
6975#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 7191#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7204,6 +7420,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7204 /* Update our root-domain */ 7420 /* Update our root-domain */
7205 rq = cpu_rq(cpu); 7421 rq = cpu_rq(cpu);
7206 spin_lock_irqsave(&rq->lock, flags); 7422 spin_lock_irqsave(&rq->lock, flags);
7423 rq->calc_load_update = calc_load_update;
7424 rq->calc_load_active = 0;
7207 if (rq->rd) { 7425 if (rq->rd) {
7208 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7426 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7209 7427
@@ -7243,7 +7461,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7243 cpuset_unlock(); 7461 cpuset_unlock();
7244 migrate_nr_uninterruptible(rq); 7462 migrate_nr_uninterruptible(rq);
7245 BUG_ON(rq->nr_running != 0); 7463 BUG_ON(rq->nr_running != 0);
7246 7464 calc_global_load_remove(rq);
7247 /* 7465 /*
7248 * No need to migrate the tasks: it was best-effort if 7466 * No need to migrate the tasks: it was best-effort if
7249 * they didn't take sched_hotcpu_mutex. Just wake up 7467 * they didn't take sched_hotcpu_mutex. Just wake up
@@ -7753,8 +7971,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
7753 7971
7754/* 7972/*
7755 * The cpus mask in sched_group and sched_domain hangs off the end. 7973 * The cpus mask in sched_group and sched_domain hangs off the end.
7756 * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space 7974 *
7757 * for nr_cpu_ids < CONFIG_NR_CPUS. 7975 * ( See the comments in include/linux/sched.h:struct sched_group
7976 * and struct sched_domain. )
7758 */ 7977 */
7759struct static_sched_group { 7978struct static_sched_group {
7760 struct sched_group sg; 7979 struct sched_group sg;
@@ -7875,7 +8094,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7875 struct sched_domain *sd; 8094 struct sched_domain *sd;
7876 8095
7877 sd = &per_cpu(phys_domains, j).sd; 8096 sd = &per_cpu(phys_domains, j).sd;
7878 if (j != cpumask_first(sched_group_cpus(sd->groups))) { 8097 if (j != group_first_cpu(sd->groups)) {
7879 /* 8098 /*
7880 * Only add "power" once for each 8099 * Only add "power" once for each
7881 * physical package. 8100 * physical package.
@@ -7953,7 +8172,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7953 8172
7954 WARN_ON(!sd || !sd->groups); 8173 WARN_ON(!sd || !sd->groups);
7955 8174
7956 if (cpu != cpumask_first(sched_group_cpus(sd->groups))) 8175 if (cpu != group_first_cpu(sd->groups))
7957 return; 8176 return;
7958 8177
7959 child = sd->child; 8178 child = sd->child;
@@ -8938,6 +9157,8 @@ void __init sched_init(void)
8938 rq = cpu_rq(i); 9157 rq = cpu_rq(i);
8939 spin_lock_init(&rq->lock); 9158 spin_lock_init(&rq->lock);
8940 rq->nr_running = 0; 9159 rq->nr_running = 0;
9160 rq->calc_load_active = 0;
9161 rq->calc_load_update = jiffies + LOAD_FREQ;
8941 init_cfs_rq(&rq->cfs, rq); 9162 init_cfs_rq(&rq->cfs, rq);
8942 init_rt_rq(&rq->rt, rq); 9163 init_rt_rq(&rq->rt, rq);
8943#ifdef CONFIG_FAIR_GROUP_SCHED 9164#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -9045,6 +9266,9 @@ void __init sched_init(void)
9045 * when this runqueue becomes "idle". 9266 * when this runqueue becomes "idle".
9046 */ 9267 */
9047 init_idle(current, smp_processor_id()); 9268 init_idle(current, smp_processor_id());
9269
9270 calc_load_update = jiffies + LOAD_FREQ;
9271
9048 /* 9272 /*
9049 * During early bootup we pretend to be a normal task: 9273 * During early bootup we pretend to be a normal task:
9050 */ 9274 */
@@ -9055,6 +9279,7 @@ void __init sched_init(void)
9055#ifdef CONFIG_SMP 9279#ifdef CONFIG_SMP
9056#ifdef CONFIG_NO_HZ 9280#ifdef CONFIG_NO_HZ
9057 alloc_bootmem_cpumask_var(&nohz.cpu_mask); 9281 alloc_bootmem_cpumask_var(&nohz.cpu_mask);
9282 alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
9058#endif 9283#endif
9059 alloc_bootmem_cpumask_var(&cpu_isolated_map); 9284 alloc_bootmem_cpumask_var(&cpu_isolated_map);
9060#endif /* SMP */ 9285#endif /* SMP */
@@ -9800,6 +10025,13 @@ static int sched_rt_global_constraints(void)
9800 if (sysctl_sched_rt_period <= 0) 10025 if (sysctl_sched_rt_period <= 0)
9801 return -EINVAL; 10026 return -EINVAL;
9802 10027
10028 /*
10029 * There are always some RT tasks in the root group
10030 * -- migration, kstopmachine etc..
10031 */
10032 if (sysctl_sched_rt_runtime == 0)
10033 return -EBUSY;
10034
9803 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 10035 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
9804 for_each_possible_cpu(i) { 10036 for_each_possible_cpu(i) {
9805 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 10037 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index cdd3c89574cd..344712a5e3ed 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -165,7 +165,7 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
165 vec->count = 0; 165 vec->count = 0;
166 if (bootmem) 166 if (bootmem)
167 alloc_bootmem_cpumask_var(&vec->mask); 167 alloc_bootmem_cpumask_var(&vec->mask);
168 else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL)) 168 else if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
169 goto cleanup; 169 goto cleanup;
170 } 170 }
171 171
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3816f217f119..5f9650e8fe75 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1487,17 +1487,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1487 1487
1488 find_matching_se(&se, &pse); 1488 find_matching_se(&se, &pse);
1489 1489
1490 while (se) { 1490 BUG_ON(!pse);
1491 BUG_ON(!pse);
1492 1491
1493 if (wakeup_preempt_entity(se, pse) == 1) { 1492 if (wakeup_preempt_entity(se, pse) == 1)
1494 resched_task(curr); 1493 resched_task(curr);
1495 break;
1496 }
1497
1498 se = parent_entity(se);
1499 pse = parent_entity(pse);
1500 }
1501} 1494}
1502 1495
1503static struct task_struct *pick_next_task_fair(struct rq *rq) 1496static struct task_struct *pick_next_task_fair(struct rq *rq)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 8a21a2e28c13..499672c10cbd 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy
22static struct task_struct *pick_next_task_idle(struct rq *rq) 22static struct task_struct *pick_next_task_idle(struct rq *rq)
23{ 23{
24 schedstat_inc(rq, sched_goidle); 24 schedstat_inc(rq, sched_goidle);
25 25 /* adjust the active tasks as we might go into a long sleep */
26 calc_load_account_active(rq);
26 return rq->idle; 27 return rq->idle;
27} 28}
28 29
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f2c66f8f9712..9bf0d2a73045 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1591,7 +1591,7 @@ static inline void init_sched_rt_class(void)
1591 unsigned int i; 1591 unsigned int i;
1592 1592
1593 for_each_possible_cpu(i) 1593 for_each_possible_cpu(i)
1594 alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), 1594 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1595 GFP_KERNEL, cpu_to_node(i)); 1595 GFP_KERNEL, cpu_to_node(i));
1596} 1596}
1597#endif /* CONFIG_SMP */ 1597#endif /* CONFIG_SMP */
diff --git a/kernel/smp.c b/kernel/smp.c
index 858baac568ee..ad63d8501207 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -52,7 +52,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
52 switch (action) { 52 switch (action) {
53 case CPU_UP_PREPARE: 53 case CPU_UP_PREPARE:
54 case CPU_UP_PREPARE_FROZEN: 54 case CPU_UP_PREPARE_FROZEN:
55 if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, 55 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
56 cpu_to_node(cpu))) 56 cpu_to_node(cpu)))
57 return NOTIFY_BAD; 57 return NOTIFY_BAD;
58 break; 58 break;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b525dd348511..f674f332a024 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -828,7 +828,7 @@ int __init __weak arch_early_irq_init(void)
828 return 0; 828 return 0;
829} 829}
830 830
831int __weak arch_init_chip_data(struct irq_desc *desc, int cpu) 831int __weak arch_init_chip_data(struct irq_desc *desc, int node)
832{ 832{
833 return 0; 833 return 0;
834} 834}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 45bd711a242e..944ba03cae19 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -743,6 +743,14 @@ static struct ctl_table kern_table[] = {
743 }, 743 },
744 { 744 {
745 .ctl_name = CTL_UNNUMBERED, 745 .ctl_name = CTL_UNNUMBERED,
746 .procname = "bootloader_version",
747 .data = &bootloader_version,
748 .maxlen = sizeof (int),
749 .mode = 0444,
750 .proc_handler = &proc_dointvec,
751 },
752 {
753 .ctl_name = CTL_UNNUMBERED,
746 .procname = "kstack_depth_to_print", 754 .procname = "kstack_depth_to_print",
747 .data = &kstack_depth_to_print, 755 .data = &kstack_depth_to_print,
748 .maxlen = sizeof(int), 756 .maxlen = sizeof(int),
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 687dff49f6e7..52a8bf8931f3 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -22,7 +22,7 @@
22 22
23/* 23/*
24 * This read-write spinlock protects us from races in SMP while 24 * This read-write spinlock protects us from races in SMP while
25 * playing with xtime and avenrun. 25 * playing with xtime.
26 */ 26 */
27__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); 27__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
28 28
diff --git a/kernel/timer.c b/kernel/timer.c
index cffffad01c31..a26ed294f938 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1123,47 +1123,6 @@ void update_process_times(int user_tick)
1123} 1123}
1124 1124
1125/* 1125/*
1126 * Nr of active tasks - counted in fixed-point numbers
1127 */
1128static unsigned long count_active_tasks(void)
1129{
1130 return nr_active() * FIXED_1;
1131}
1132
1133/*
1134 * Hmm.. Changed this, as the GNU make sources (load.c) seems to
1135 * imply that avenrun[] is the standard name for this kind of thing.
1136 * Nothing else seems to be standardized: the fractional size etc
1137 * all seem to differ on different machines.
1138 *
1139 * Requires xtime_lock to access.
1140 */
1141unsigned long avenrun[3];
1142
1143EXPORT_SYMBOL(avenrun);
1144
1145/*
1146 * calc_load - given tick count, update the avenrun load estimates.
1147 * This is called while holding a write_lock on xtime_lock.
1148 */
1149static inline void calc_load(unsigned long ticks)
1150{
1151 unsigned long active_tasks; /* fixed-point */
1152 static int count = LOAD_FREQ;
1153
1154 count -= ticks;
1155 if (unlikely(count < 0)) {
1156 active_tasks = count_active_tasks();
1157 do {
1158 CALC_LOAD(avenrun[0], EXP_1, active_tasks);
1159 CALC_LOAD(avenrun[1], EXP_5, active_tasks);
1160 CALC_LOAD(avenrun[2], EXP_15, active_tasks);
1161 count += LOAD_FREQ;
1162 } while (count < 0);
1163 }
1164}
1165
1166/*
1167 * This function runs timers and the timer-tq in bottom half context. 1126 * This function runs timers and the timer-tq in bottom half context.
1168 */ 1127 */
1169static void run_timer_softirq(struct softirq_action *h) 1128static void run_timer_softirq(struct softirq_action *h)
@@ -1187,16 +1146,6 @@ void run_local_timers(void)
1187} 1146}
1188 1147
1189/* 1148/*
1190 * Called by the timer interrupt. xtime_lock must already be taken
1191 * by the timer IRQ!
1192 */
1193static inline void update_times(unsigned long ticks)
1194{
1195 update_wall_time();
1196 calc_load(ticks);
1197}
1198
1199/*
1200 * The 64-bit jiffies value is not atomic - you MUST NOT read it 1149 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1201 * without sampling the sequence number in xtime_lock. 1150 * without sampling the sequence number in xtime_lock.
1202 * jiffies is defined in the linker script... 1151 * jiffies is defined in the linker script...
@@ -1205,7 +1154,8 @@ static inline void update_times(unsigned long ticks)
1205void do_timer(unsigned long ticks) 1154void do_timer(unsigned long ticks)
1206{ 1155{
1207 jiffies_64 += ticks; 1156 jiffies_64 += ticks;
1208 update_times(ticks); 1157 update_wall_time();
1158 calc_global_load();
1209} 1159}
1210 1160
1211#ifdef __ARCH_WANT_SYS_ALARM 1161#ifdef __ARCH_WANT_SYS_ALARM
@@ -1406,37 +1356,17 @@ int do_sysinfo(struct sysinfo *info)
1406{ 1356{
1407 unsigned long mem_total, sav_total; 1357 unsigned long mem_total, sav_total;
1408 unsigned int mem_unit, bitcount; 1358 unsigned int mem_unit, bitcount;
1409 unsigned long seq; 1359 struct timespec tp;
1410 1360
1411 memset(info, 0, sizeof(struct sysinfo)); 1361 memset(info, 0, sizeof(struct sysinfo));
1412 1362
1413 do { 1363 ktime_get_ts(&tp);
1414 struct timespec tp; 1364 monotonic_to_bootbased(&tp);
1415 seq = read_seqbegin(&xtime_lock); 1365 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
1416
1417 /*
1418 * This is annoying. The below is the same thing
1419 * posix_get_clock_monotonic() does, but it wants to
1420 * take the lock which we want to cover the loads stuff
1421 * too.
1422 */
1423
1424 getnstimeofday(&tp);
1425 tp.tv_sec += wall_to_monotonic.tv_sec;
1426 tp.tv_nsec += wall_to_monotonic.tv_nsec;
1427 monotonic_to_bootbased(&tp);
1428 if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
1429 tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
1430 tp.tv_sec++;
1431 }
1432 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
1433 1366
1434 info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); 1367 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
1435 info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
1436 info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
1437 1368
1438 info->procs = nr_threads; 1369 info->procs = nr_threads;
1439 } while (read_seqretry(&xtime_lock, seq));
1440 1370
1441 si_meminfo(info); 1371 si_meminfo(info);
1442 si_swapinfo(info); 1372 si_swapinfo(info);
diff --git a/kernel/wait.c b/kernel/wait.c
index 42a2dbc181c8..ea7c3b4275cf 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -154,7 +154,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
154 if (!list_empty(&wait->task_list)) 154 if (!list_empty(&wait->task_list))
155 list_del_init(&wait->task_list); 155 list_del_init(&wait->task_list);
156 else if (waitqueue_active(q)) 156 else if (waitqueue_active(q))
157 __wake_up_common(q, mode, 1, 0, key); 157 __wake_up_locked_key(q, mode, key);
158 spin_unlock_irqrestore(&q->lock, flags); 158 spin_unlock_irqrestore(&q->lock, flags);
159} 159}
160EXPORT_SYMBOL(abort_exclusive_wait); 160EXPORT_SYMBOL(abort_exclusive_wait);
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 1f71b97de0f9..eb23aaa0c7b8 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -119,6 +119,12 @@ bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
119} 119}
120EXPORT_SYMBOL(alloc_cpumask_var_node); 120EXPORT_SYMBOL(alloc_cpumask_var_node);
121 121
122bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
123{
124 return alloc_cpumask_var_node(mask, flags | __GFP_ZERO, node);
125}
126EXPORT_SYMBOL(zalloc_cpumask_var_node);
127
122/** 128/**
123 * alloc_cpumask_var - allocate a struct cpumask 129 * alloc_cpumask_var - allocate a struct cpumask
124 * @mask: pointer to cpumask_var_t where the cpumask is returned 130 * @mask: pointer to cpumask_var_t where the cpumask is returned
@@ -135,6 +141,12 @@ bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
135} 141}
136EXPORT_SYMBOL(alloc_cpumask_var); 142EXPORT_SYMBOL(alloc_cpumask_var);
137 143
144bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
145{
146 return alloc_cpumask_var(mask, flags | __GFP_ZERO);
147}
148EXPORT_SYMBOL(zalloc_cpumask_var);
149
138/** 150/**
139 * alloc_bootmem_cpumask_var - allocate a struct cpumask from the bootmem arena. 151 * alloc_bootmem_cpumask_var - allocate a struct cpumask from the bootmem arena.
140 * @mask: pointer to cpumask_var_t where the cpumask is returned 152 * @mask: pointer to cpumask_var_t where the cpumask is returned
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 69da09a085a1..ad65fc0317d9 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -23,9 +23,11 @@
23#include <linux/dma-debug.h> 23#include <linux/dma-debug.h>
24#include <linux/spinlock.h> 24#include <linux/spinlock.h>
25#include <linux/debugfs.h> 25#include <linux/debugfs.h>
26#include <linux/uaccess.h>
26#include <linux/device.h> 27#include <linux/device.h>
27#include <linux/types.h> 28#include <linux/types.h>
28#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/ctype.h>
29#include <linux/list.h> 31#include <linux/list.h>
30#include <linux/slab.h> 32#include <linux/slab.h>
31 33
@@ -85,6 +87,7 @@ static u32 show_num_errors = 1;
85 87
86static u32 num_free_entries; 88static u32 num_free_entries;
87static u32 min_free_entries; 89static u32 min_free_entries;
90static u32 nr_total_entries;
88 91
89/* number of preallocated entries requested by kernel cmdline */ 92/* number of preallocated entries requested by kernel cmdline */
90static u32 req_entries; 93static u32 req_entries;
@@ -97,6 +100,16 @@ static struct dentry *show_all_errors_dent __read_mostly;
97static struct dentry *show_num_errors_dent __read_mostly; 100static struct dentry *show_num_errors_dent __read_mostly;
98static struct dentry *num_free_entries_dent __read_mostly; 101static struct dentry *num_free_entries_dent __read_mostly;
99static struct dentry *min_free_entries_dent __read_mostly; 102static struct dentry *min_free_entries_dent __read_mostly;
103static struct dentry *filter_dent __read_mostly;
104
105/* per-driver filter related state */
106
107#define NAME_MAX_LEN 64
108
109static char current_driver_name[NAME_MAX_LEN] __read_mostly;
110static struct device_driver *current_driver __read_mostly;
111
112static DEFINE_RWLOCK(driver_name_lock);
100 113
101static const char *type2name[4] = { "single", "page", 114static const char *type2name[4] = { "single", "page",
102 "scather-gather", "coherent" }; 115 "scather-gather", "coherent" };
@@ -104,6 +117,11 @@ static const char *type2name[4] = { "single", "page",
104static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE", 117static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE",
105 "DMA_FROM_DEVICE", "DMA_NONE" }; 118 "DMA_FROM_DEVICE", "DMA_NONE" };
106 119
120/* little merge helper - remove it after the merge window */
121#ifndef BUS_NOTIFY_UNBOUND_DRIVER
122#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
123#endif
124
107/* 125/*
108 * The access to some variables in this macro is racy. We can't use atomic_t 126 * The access to some variables in this macro is racy. We can't use atomic_t
109 * here because all these variables are exported to debugfs. Some of them even 127 * here because all these variables are exported to debugfs. Some of them even
@@ -121,15 +139,54 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry)
121{ 139{
122#ifdef CONFIG_STACKTRACE 140#ifdef CONFIG_STACKTRACE
123 if (entry) { 141 if (entry) {
124 printk(KERN_WARNING "Mapped at:\n"); 142 pr_warning("Mapped at:\n");
125 print_stack_trace(&entry->stacktrace, 0); 143 print_stack_trace(&entry->stacktrace, 0);
126 } 144 }
127#endif 145#endif
128} 146}
129 147
148static bool driver_filter(struct device *dev)
149{
150 struct device_driver *drv;
151 unsigned long flags;
152 bool ret;
153
154 /* driver filter off */
155 if (likely(!current_driver_name[0]))
156 return true;
157
158 /* driver filter on and initialized */
159 if (current_driver && dev->driver == current_driver)
160 return true;
161
162 if (current_driver || !current_driver_name[0])
163 return false;
164
165 /* driver filter on but not yet initialized */
166 drv = get_driver(dev->driver);
167 if (!drv)
168 return false;
169
170 /* lock to protect against change of current_driver_name */
171 read_lock_irqsave(&driver_name_lock, flags);
172
173 ret = false;
174 if (drv->name &&
175 strncmp(current_driver_name, drv->name, NAME_MAX_LEN - 1) == 0) {
176 current_driver = drv;
177 ret = true;
178 }
179
180 read_unlock_irqrestore(&driver_name_lock, flags);
181 put_driver(drv);
182
183 return ret;
184}
185
130#define err_printk(dev, entry, format, arg...) do { \ 186#define err_printk(dev, entry, format, arg...) do { \
131 error_count += 1; \ 187 error_count += 1; \
132 if (show_all_errors || show_num_errors > 0) { \ 188 if (driver_filter(dev) && \
189 (show_all_errors || show_num_errors > 0)) { \
133 WARN(1, "%s %s: " format, \ 190 WARN(1, "%s %s: " format, \
134 dev_driver_string(dev), \ 191 dev_driver_string(dev), \
135 dev_name(dev) , ## arg); \ 192 dev_name(dev) , ## arg); \
@@ -185,15 +242,50 @@ static void put_hash_bucket(struct hash_bucket *bucket,
185static struct dma_debug_entry *hash_bucket_find(struct hash_bucket *bucket, 242static struct dma_debug_entry *hash_bucket_find(struct hash_bucket *bucket,
186 struct dma_debug_entry *ref) 243 struct dma_debug_entry *ref)
187{ 244{
188 struct dma_debug_entry *entry; 245 struct dma_debug_entry *entry, *ret = NULL;
246 int matches = 0, match_lvl, last_lvl = 0;
189 247
190 list_for_each_entry(entry, &bucket->list, list) { 248 list_for_each_entry(entry, &bucket->list, list) {
191 if ((entry->dev_addr == ref->dev_addr) && 249 if ((entry->dev_addr != ref->dev_addr) ||
192 (entry->dev == ref->dev)) 250 (entry->dev != ref->dev))
251 continue;
252
253 /*
254 * Some drivers map the same physical address multiple
255 * times. Without a hardware IOMMU this results in the
256 * same device addresses being put into the dma-debug
257 * hash multiple times too. This can result in false
258 * positives being reported. Therefore we implement a
259 * best-fit algorithm here which returns the entry from
260 * the hash which fits best to the reference value
261 * instead of the first-fit.
262 */
263 matches += 1;
264 match_lvl = 0;
265 entry->size == ref->size ? ++match_lvl : match_lvl;
266 entry->type == ref->type ? ++match_lvl : match_lvl;
267 entry->direction == ref->direction ? ++match_lvl : match_lvl;
268
269 if (match_lvl == 3) {
270 /* perfect-fit - return the result */
193 return entry; 271 return entry;
272 } else if (match_lvl > last_lvl) {
273 /*
274 * We found an entry that fits better than the
275 * previous one
276 */
277 last_lvl = match_lvl;
278 ret = entry;
279 }
194 } 280 }
195 281
196 return NULL; 282 /*
283 * If we have multiple matches but no perfect-fit, just return
284 * NULL.
285 */
286 ret = (matches == 1) ? ret : NULL;
287
288 return ret;
197} 289}
198 290
199/* 291/*
@@ -257,6 +349,21 @@ static void add_dma_entry(struct dma_debug_entry *entry)
257 put_hash_bucket(bucket, &flags); 349 put_hash_bucket(bucket, &flags);
258} 350}
259 351
352static struct dma_debug_entry *__dma_entry_alloc(void)
353{
354 struct dma_debug_entry *entry;
355
356 entry = list_entry(free_entries.next, struct dma_debug_entry, list);
357 list_del(&entry->list);
358 memset(entry, 0, sizeof(*entry));
359
360 num_free_entries -= 1;
361 if (num_free_entries < min_free_entries)
362 min_free_entries = num_free_entries;
363
364 return entry;
365}
366
260/* struct dma_entry allocator 367/* struct dma_entry allocator
261 * 368 *
262 * The next two functions implement the allocator for 369 * The next two functions implement the allocator for
@@ -270,15 +377,12 @@ static struct dma_debug_entry *dma_entry_alloc(void)
270 spin_lock_irqsave(&free_entries_lock, flags); 377 spin_lock_irqsave(&free_entries_lock, flags);
271 378
272 if (list_empty(&free_entries)) { 379 if (list_empty(&free_entries)) {
273 printk(KERN_ERR "DMA-API: debugging out of memory " 380 pr_err("DMA-API: debugging out of memory - disabling\n");
274 "- disabling\n");
275 global_disable = true; 381 global_disable = true;
276 goto out; 382 goto out;
277 } 383 }
278 384
279 entry = list_entry(free_entries.next, struct dma_debug_entry, list); 385 entry = __dma_entry_alloc();
280 list_del(&entry->list);
281 memset(entry, 0, sizeof(*entry));
282 386
283#ifdef CONFIG_STACKTRACE 387#ifdef CONFIG_STACKTRACE
284 entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES; 388 entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES;
@@ -286,9 +390,6 @@ static struct dma_debug_entry *dma_entry_alloc(void)
286 entry->stacktrace.skip = 2; 390 entry->stacktrace.skip = 2;
287 save_stack_trace(&entry->stacktrace); 391 save_stack_trace(&entry->stacktrace);
288#endif 392#endif
289 num_free_entries -= 1;
290 if (num_free_entries < min_free_entries)
291 min_free_entries = num_free_entries;
292 393
293out: 394out:
294 spin_unlock_irqrestore(&free_entries_lock, flags); 395 spin_unlock_irqrestore(&free_entries_lock, flags);
@@ -310,6 +411,53 @@ static void dma_entry_free(struct dma_debug_entry *entry)
310 spin_unlock_irqrestore(&free_entries_lock, flags); 411 spin_unlock_irqrestore(&free_entries_lock, flags);
311} 412}
312 413
414int dma_debug_resize_entries(u32 num_entries)
415{
416 int i, delta, ret = 0;
417 unsigned long flags;
418 struct dma_debug_entry *entry;
419 LIST_HEAD(tmp);
420
421 spin_lock_irqsave(&free_entries_lock, flags);
422
423 if (nr_total_entries < num_entries) {
424 delta = num_entries - nr_total_entries;
425
426 spin_unlock_irqrestore(&free_entries_lock, flags);
427
428 for (i = 0; i < delta; i++) {
429 entry = kzalloc(sizeof(*entry), GFP_KERNEL);
430 if (!entry)
431 break;
432
433 list_add_tail(&entry->list, &tmp);
434 }
435
436 spin_lock_irqsave(&free_entries_lock, flags);
437
438 list_splice(&tmp, &free_entries);
439 nr_total_entries += i;
440 num_free_entries += i;
441 } else {
442 delta = nr_total_entries - num_entries;
443
444 for (i = 0; i < delta && !list_empty(&free_entries); i++) {
445 entry = __dma_entry_alloc();
446 kfree(entry);
447 }
448
449 nr_total_entries -= i;
450 }
451
452 if (nr_total_entries != num_entries)
453 ret = 1;
454
455 spin_unlock_irqrestore(&free_entries_lock, flags);
456
457 return ret;
458}
459EXPORT_SYMBOL(dma_debug_resize_entries);
460
313/* 461/*
314 * DMA-API debugging init code 462 * DMA-API debugging init code
315 * 463 *
@@ -334,8 +482,7 @@ static int prealloc_memory(u32 num_entries)
334 num_free_entries = num_entries; 482 num_free_entries = num_entries;
335 min_free_entries = num_entries; 483 min_free_entries = num_entries;
336 484
337 printk(KERN_INFO "DMA-API: preallocated %d debug entries\n", 485 pr_info("DMA-API: preallocated %d debug entries\n", num_entries);
338 num_entries);
339 486
340 return 0; 487 return 0;
341 488
@@ -349,11 +496,102 @@ out_err:
349 return -ENOMEM; 496 return -ENOMEM;
350} 497}
351 498
499static ssize_t filter_read(struct file *file, char __user *user_buf,
500 size_t count, loff_t *ppos)
501{
502 char buf[NAME_MAX_LEN + 1];
503 unsigned long flags;
504 int len;
505
506 if (!current_driver_name[0])
507 return 0;
508
509 /*
510 * We can't copy to userspace directly because current_driver_name can
511 * only be read under the driver_name_lock with irqs disabled. So
512 * create a temporary copy first.
513 */
514 read_lock_irqsave(&driver_name_lock, flags);
515 len = scnprintf(buf, NAME_MAX_LEN + 1, "%s\n", current_driver_name);
516 read_unlock_irqrestore(&driver_name_lock, flags);
517
518 return simple_read_from_buffer(user_buf, count, ppos, buf, len);
519}
520
521static ssize_t filter_write(struct file *file, const char __user *userbuf,
522 size_t count, loff_t *ppos)
523{
524 char buf[NAME_MAX_LEN];
525 unsigned long flags;
526 size_t len;
527 int i;
528
529 /*
530 * We can't copy from userspace directly. Access to
531 * current_driver_name is protected with a write_lock with irqs
532 * disabled. Since copy_from_user can fault and may sleep we
533 * need to copy to temporary buffer first
534 */
535 len = min(count, (size_t)(NAME_MAX_LEN - 1));
536 if (copy_from_user(buf, userbuf, len))
537 return -EFAULT;
538
539 buf[len] = 0;
540
541 write_lock_irqsave(&driver_name_lock, flags);
542
543 /*
544 * Now handle the string we got from userspace very carefully.
545 * The rules are:
546 * - only use the first token we got
547 * - token delimiter is everything looking like a space
548 * character (' ', '\n', '\t' ...)
549 *
550 */
551 if (!isalnum(buf[0])) {
552 /*
553 * If the first character userspace gave us is not
554 * alphanumerical then assume the filter should be
555 * switched off.
556 */
557 if (current_driver_name[0])
558 pr_info("DMA-API: switching off dma-debug driver filter\n");
559 current_driver_name[0] = 0;
560 current_driver = NULL;
561 goto out_unlock;
562 }
563
564 /*
565 * Now parse out the first token and use it as the name for the
566 * driver to filter for.
567 */
568 for (i = 0; i < NAME_MAX_LEN; ++i) {
569 current_driver_name[i] = buf[i];
570 if (isspace(buf[i]) || buf[i] == ' ' || buf[i] == 0)
571 break;
572 }
573 current_driver_name[i] = 0;
574 current_driver = NULL;
575
576 pr_info("DMA-API: enable driver filter for driver [%s]\n",
577 current_driver_name);
578
579out_unlock:
580 write_unlock_irqrestore(&driver_name_lock, flags);
581
582 return count;
583}
584
585const struct file_operations filter_fops = {
586 .read = filter_read,
587 .write = filter_write,
588};
589
352static int dma_debug_fs_init(void) 590static int dma_debug_fs_init(void)
353{ 591{
354 dma_debug_dent = debugfs_create_dir("dma-api", NULL); 592 dma_debug_dent = debugfs_create_dir("dma-api", NULL);
355 if (!dma_debug_dent) { 593 if (!dma_debug_dent) {
356 printk(KERN_ERR "DMA-API: can not create debugfs directory\n"); 594 pr_err("DMA-API: can not create debugfs directory\n");
357 return -ENOMEM; 595 return -ENOMEM;
358 } 596 }
359 597
@@ -392,6 +630,11 @@ static int dma_debug_fs_init(void)
392 if (!min_free_entries_dent) 630 if (!min_free_entries_dent)
393 goto out_err; 631 goto out_err;
394 632
633 filter_dent = debugfs_create_file("driver_filter", 0644,
634 dma_debug_dent, NULL, &filter_fops);
635 if (!filter_dent)
636 goto out_err;
637
395 return 0; 638 return 0;
396 639
397out_err: 640out_err:
@@ -400,9 +643,64 @@ out_err:
400 return -ENOMEM; 643 return -ENOMEM;
401} 644}
402 645
646static int device_dma_allocations(struct device *dev)
647{
648 struct dma_debug_entry *entry;
649 unsigned long flags;
650 int count = 0, i;
651
652 local_irq_save(flags);
653
654 for (i = 0; i < HASH_SIZE; ++i) {
655 spin_lock(&dma_entry_hash[i].lock);
656 list_for_each_entry(entry, &dma_entry_hash[i].list, list) {
657 if (entry->dev == dev)
658 count += 1;
659 }
660 spin_unlock(&dma_entry_hash[i].lock);
661 }
662
663 local_irq_restore(flags);
664
665 return count;
666}
667
668static int dma_debug_device_change(struct notifier_block *nb,
669 unsigned long action, void *data)
670{
671 struct device *dev = data;
672 int count;
673
674
675 switch (action) {
676 case BUS_NOTIFY_UNBOUND_DRIVER:
677 count = device_dma_allocations(dev);
678 if (count == 0)
679 break;
680 err_printk(dev, NULL, "DMA-API: device driver has pending "
681 "DMA allocations while released from device "
682 "[count=%d]\n", count);
683 break;
684 default:
685 break;
686 }
687
688 return 0;
689}
690
403void dma_debug_add_bus(struct bus_type *bus) 691void dma_debug_add_bus(struct bus_type *bus)
404{ 692{
405 /* FIXME: register notifier */ 693 struct notifier_block *nb;
694
695 nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL);
696 if (nb == NULL) {
697 pr_err("dma_debug_add_bus: out of memory\n");
698 return;
699 }
700
701 nb->notifier_call = dma_debug_device_change;
702
703 bus_register_notifier(bus, nb);
406} 704}
407 705
408/* 706/*
@@ -421,8 +719,7 @@ void dma_debug_init(u32 num_entries)
421 } 719 }
422 720
423 if (dma_debug_fs_init() != 0) { 721 if (dma_debug_fs_init() != 0) {
424 printk(KERN_ERR "DMA-API: error creating debugfs entries " 722 pr_err("DMA-API: error creating debugfs entries - disabling\n");
425 "- disabling\n");
426 global_disable = true; 723 global_disable = true;
427 724
428 return; 725 return;
@@ -432,14 +729,15 @@ void dma_debug_init(u32 num_entries)
432 num_entries = req_entries; 729 num_entries = req_entries;
433 730
434 if (prealloc_memory(num_entries) != 0) { 731 if (prealloc_memory(num_entries) != 0) {
435 printk(KERN_ERR "DMA-API: debugging out of memory error " 732 pr_err("DMA-API: debugging out of memory error - disabled\n");
436 "- disabled\n");
437 global_disable = true; 733 global_disable = true;
438 734
439 return; 735 return;
440 } 736 }
441 737
442 printk(KERN_INFO "DMA-API: debugging enabled by kernel config\n"); 738 nr_total_entries = num_free_entries;
739
740 pr_info("DMA-API: debugging enabled by kernel config\n");
443} 741}
444 742
445static __init int dma_debug_cmdline(char *str) 743static __init int dma_debug_cmdline(char *str)
@@ -448,8 +746,7 @@ static __init int dma_debug_cmdline(char *str)
448 return -EINVAL; 746 return -EINVAL;
449 747
450 if (strncmp(str, "off", 3) == 0) { 748 if (strncmp(str, "off", 3) == 0) {
451 printk(KERN_INFO "DMA-API: debugging disabled on kernel " 749 pr_info("DMA-API: debugging disabled on kernel command line\n");
452 "command line\n");
453 global_disable = true; 750 global_disable = true;
454 } 751 }
455 752
@@ -723,15 +1020,15 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
723 entry->type = dma_debug_sg; 1020 entry->type = dma_debug_sg;
724 entry->dev = dev; 1021 entry->dev = dev;
725 entry->paddr = sg_phys(s); 1022 entry->paddr = sg_phys(s);
726 entry->size = s->length; 1023 entry->size = sg_dma_len(s);
727 entry->dev_addr = s->dma_address; 1024 entry->dev_addr = sg_dma_address(s);
728 entry->direction = direction; 1025 entry->direction = direction;
729 entry->sg_call_ents = nents; 1026 entry->sg_call_ents = nents;
730 entry->sg_mapped_ents = mapped_ents; 1027 entry->sg_mapped_ents = mapped_ents;
731 1028
732 if (!PageHighMem(sg_page(s))) { 1029 if (!PageHighMem(sg_page(s))) {
733 check_for_stack(dev, sg_virt(s)); 1030 check_for_stack(dev, sg_virt(s));
734 check_for_illegal_area(dev, sg_virt(s), s->length); 1031 check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s));
735 } 1032 }
736 1033
737 add_dma_entry(entry); 1034 add_dma_entry(entry);
@@ -739,13 +1036,33 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
739} 1036}
740EXPORT_SYMBOL(debug_dma_map_sg); 1037EXPORT_SYMBOL(debug_dma_map_sg);
741 1038
1039static int get_nr_mapped_entries(struct device *dev, struct scatterlist *s)
1040{
1041 struct dma_debug_entry *entry, ref;
1042 struct hash_bucket *bucket;
1043 unsigned long flags;
1044 int mapped_ents;
1045
1046 ref.dev = dev;
1047 ref.dev_addr = sg_dma_address(s);
1048 ref.size = sg_dma_len(s),
1049
1050 bucket = get_hash_bucket(&ref, &flags);
1051 entry = hash_bucket_find(bucket, &ref);
1052 mapped_ents = 0;
1053
1054 if (entry)
1055 mapped_ents = entry->sg_mapped_ents;
1056 put_hash_bucket(bucket, &flags);
1057
1058 return mapped_ents;
1059}
1060
742void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, 1061void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
743 int nelems, int dir) 1062 int nelems, int dir)
744{ 1063{
745 struct dma_debug_entry *entry;
746 struct scatterlist *s; 1064 struct scatterlist *s;
747 int mapped_ents = 0, i; 1065 int mapped_ents = 0, i;
748 unsigned long flags;
749 1066
750 if (unlikely(global_disable)) 1067 if (unlikely(global_disable))
751 return; 1068 return;
@@ -756,8 +1073,8 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
756 .type = dma_debug_sg, 1073 .type = dma_debug_sg,
757 .dev = dev, 1074 .dev = dev,
758 .paddr = sg_phys(s), 1075 .paddr = sg_phys(s),
759 .dev_addr = s->dma_address, 1076 .dev_addr = sg_dma_address(s),
760 .size = s->length, 1077 .size = sg_dma_len(s),
761 .direction = dir, 1078 .direction = dir,
762 .sg_call_ents = 0, 1079 .sg_call_ents = 0,
763 }; 1080 };
@@ -765,14 +1082,9 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
765 if (mapped_ents && i >= mapped_ents) 1082 if (mapped_ents && i >= mapped_ents)
766 break; 1083 break;
767 1084
768 if (mapped_ents == 0) { 1085 if (!i) {
769 struct hash_bucket *bucket;
770 ref.sg_call_ents = nelems; 1086 ref.sg_call_ents = nelems;
771 bucket = get_hash_bucket(&ref, &flags); 1087 mapped_ents = get_nr_mapped_entries(dev, s);
772 entry = hash_bucket_find(bucket, &ref);
773 if (entry)
774 mapped_ents = entry->sg_mapped_ents;
775 put_hash_bucket(bucket, &flags);
776 } 1088 }
777 1089
778 check_unmap(&ref); 1090 check_unmap(&ref);
@@ -874,14 +1186,20 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
874 int nelems, int direction) 1186 int nelems, int direction)
875{ 1187{
876 struct scatterlist *s; 1188 struct scatterlist *s;
877 int i; 1189 int mapped_ents = 0, i;
878 1190
879 if (unlikely(global_disable)) 1191 if (unlikely(global_disable))
880 return; 1192 return;
881 1193
882 for_each_sg(sg, s, nelems, i) { 1194 for_each_sg(sg, s, nelems, i) {
883 check_sync(dev, s->dma_address, s->dma_length, 0, 1195 if (!i)
884 direction, true); 1196 mapped_ents = get_nr_mapped_entries(dev, s);
1197
1198 if (i >= mapped_ents)
1199 break;
1200
1201 check_sync(dev, sg_dma_address(s), sg_dma_len(s), 0,
1202 direction, true);
885 } 1203 }
886} 1204}
887EXPORT_SYMBOL(debug_dma_sync_sg_for_cpu); 1205EXPORT_SYMBOL(debug_dma_sync_sg_for_cpu);
@@ -890,15 +1208,39 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
890 int nelems, int direction) 1208 int nelems, int direction)
891{ 1209{
892 struct scatterlist *s; 1210 struct scatterlist *s;
893 int i; 1211 int mapped_ents = 0, i;
894 1212
895 if (unlikely(global_disable)) 1213 if (unlikely(global_disable))
896 return; 1214 return;
897 1215
898 for_each_sg(sg, s, nelems, i) { 1216 for_each_sg(sg, s, nelems, i) {
899 check_sync(dev, s->dma_address, s->dma_length, 0, 1217 if (!i)
900 direction, false); 1218 mapped_ents = get_nr_mapped_entries(dev, s);
1219
1220 if (i >= mapped_ents)
1221 break;
1222
1223 check_sync(dev, sg_dma_address(s), sg_dma_len(s), 0,
1224 direction, false);
901 } 1225 }
902} 1226}
903EXPORT_SYMBOL(debug_dma_sync_sg_for_device); 1227EXPORT_SYMBOL(debug_dma_sync_sg_for_device);
904 1228
1229static int __init dma_debug_driver_setup(char *str)
1230{
1231 int i;
1232
1233 for (i = 0; i < NAME_MAX_LEN - 1; ++i, ++str) {
1234 current_driver_name[i] = *str;
1235 if (*str == 0)
1236 break;
1237 }
1238
1239 if (current_driver_name[0])
1240 pr_info("DMA-API: enable driver filter for driver [%s]\n",
1241 current_driver_name);
1242
1243
1244 return 1;
1245}
1246__setup("dma_debug_driver=", dma_debug_driver_setup);
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 2b0b5a7d2ced..bffe6d7ef9d9 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -60,8 +60,8 @@ enum dma_sync_target {
60int swiotlb_force; 60int swiotlb_force;
61 61
62/* 62/*
63 * Used to do a quick range check in swiotlb_unmap_single and 63 * Used to do a quick range check in unmap_single and
64 * swiotlb_sync_single_*, to see if the memory was in fact allocated by this 64 * sync_single_*, to see if the memory was in fact allocated by this
65 * API. 65 * API.
66 */ 66 */
67static char *io_tlb_start, *io_tlb_end; 67static char *io_tlb_start, *io_tlb_end;
@@ -129,7 +129,7 @@ dma_addr_t __weak swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
129 return paddr; 129 return paddr;
130} 130}
131 131
132phys_addr_t __weak swiotlb_bus_to_phys(dma_addr_t baddr) 132phys_addr_t __weak swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
133{ 133{
134 return baddr; 134 return baddr;
135} 135}
@@ -140,9 +140,15 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
140 return swiotlb_phys_to_bus(hwdev, virt_to_phys(address)); 140 return swiotlb_phys_to_bus(hwdev, virt_to_phys(address));
141} 141}
142 142
143static void *swiotlb_bus_to_virt(dma_addr_t address) 143void * __weak swiotlb_bus_to_virt(struct device *hwdev, dma_addr_t address)
144{ 144{
145 return phys_to_virt(swiotlb_bus_to_phys(address)); 145 return phys_to_virt(swiotlb_bus_to_phys(hwdev, address));
146}
147
148int __weak swiotlb_arch_address_needs_mapping(struct device *hwdev,
149 dma_addr_t addr, size_t size)
150{
151 return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size);
146} 152}
147 153
148int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size) 154int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size)
@@ -309,10 +315,10 @@ cleanup1:
309 return -ENOMEM; 315 return -ENOMEM;
310} 316}
311 317
312static int 318static inline int
313address_needs_mapping(struct device *hwdev, dma_addr_t addr, size_t size) 319address_needs_mapping(struct device *hwdev, dma_addr_t addr, size_t size)
314{ 320{
315 return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size); 321 return swiotlb_arch_address_needs_mapping(hwdev, addr, size);
316} 322}
317 323
318static inline int range_needs_mapping(phys_addr_t paddr, size_t size) 324static inline int range_needs_mapping(phys_addr_t paddr, size_t size)
@@ -341,7 +347,7 @@ static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
341 unsigned long flags; 347 unsigned long flags;
342 348
343 while (size) { 349 while (size) {
344 sz = min(PAGE_SIZE - offset, size); 350 sz = min_t(size_t, PAGE_SIZE - offset, size);
345 351
346 local_irq_save(flags); 352 local_irq_save(flags);
347 buffer = kmap_atomic(pfn_to_page(pfn), 353 buffer = kmap_atomic(pfn_to_page(pfn),
@@ -476,7 +482,7 @@ found:
476 * dma_addr is the kernel virtual address of the bounce buffer to unmap. 482 * dma_addr is the kernel virtual address of the bounce buffer to unmap.
477 */ 483 */
478static void 484static void
479unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir) 485do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
480{ 486{
481 unsigned long flags; 487 unsigned long flags;
482 int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; 488 int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
@@ -560,7 +566,6 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
560 size)) { 566 size)) {
561 /* 567 /*
562 * The allocated memory isn't reachable by the device. 568 * The allocated memory isn't reachable by the device.
563 * Fall back on swiotlb_map_single().
564 */ 569 */
565 free_pages((unsigned long) ret, order); 570 free_pages((unsigned long) ret, order);
566 ret = NULL; 571 ret = NULL;
@@ -568,9 +573,8 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
568 if (!ret) { 573 if (!ret) {
569 /* 574 /*
570 * We are either out of memory or the device can't DMA 575 * We are either out of memory or the device can't DMA
571 * to GFP_DMA memory; fall back on 576 * to GFP_DMA memory; fall back on map_single(), which
572 * swiotlb_map_single(), which will grab memory from 577 * will grab memory from the lowest available address range.
573 * the lowest available address range.
574 */ 578 */
575 ret = map_single(hwdev, 0, size, DMA_FROM_DEVICE); 579 ret = map_single(hwdev, 0, size, DMA_FROM_DEVICE);
576 if (!ret) 580 if (!ret)
@@ -587,7 +591,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
587 (unsigned long long)dev_addr); 591 (unsigned long long)dev_addr);
588 592
589 /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ 593 /* DMA_TO_DEVICE to avoid memcpy in unmap_single */
590 unmap_single(hwdev, ret, size, DMA_TO_DEVICE); 594 do_unmap_single(hwdev, ret, size, DMA_TO_DEVICE);
591 return NULL; 595 return NULL;
592 } 596 }
593 *dma_handle = dev_addr; 597 *dma_handle = dev_addr;
@@ -604,7 +608,7 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
604 free_pages((unsigned long) vaddr, get_order(size)); 608 free_pages((unsigned long) vaddr, get_order(size));
605 else 609 else
606 /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ 610 /* DMA_TO_DEVICE to avoid memcpy in unmap_single */
607 unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE); 611 do_unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE);
608} 612}
609EXPORT_SYMBOL(swiotlb_free_coherent); 613EXPORT_SYMBOL(swiotlb_free_coherent);
610 614
@@ -634,7 +638,7 @@ swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
634 * physical address to use is returned. 638 * physical address to use is returned.
635 * 639 *
636 * Once the device is given the dma address, the device owns this memory until 640 * Once the device is given the dma address, the device owns this memory until
637 * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed. 641 * either swiotlb_unmap_page or swiotlb_dma_sync_single is performed.
638 */ 642 */
639dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, 643dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
640 unsigned long offset, size_t size, 644 unsigned long offset, size_t size,
@@ -642,18 +646,17 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
642 struct dma_attrs *attrs) 646 struct dma_attrs *attrs)
643{ 647{
644 phys_addr_t phys = page_to_phys(page) + offset; 648 phys_addr_t phys = page_to_phys(page) + offset;
645 void *ptr = page_address(page) + offset;
646 dma_addr_t dev_addr = swiotlb_phys_to_bus(dev, phys); 649 dma_addr_t dev_addr = swiotlb_phys_to_bus(dev, phys);
647 void *map; 650 void *map;
648 651
649 BUG_ON(dir == DMA_NONE); 652 BUG_ON(dir == DMA_NONE);
650 /* 653 /*
651 * If the pointer passed in happens to be in the device's DMA window, 654 * If the address happens to be in the device's DMA window,
652 * we can safely return the device addr and not worry about bounce 655 * we can safely return the device addr and not worry about bounce
653 * buffering it. 656 * buffering it.
654 */ 657 */
655 if (!address_needs_mapping(dev, dev_addr, size) && 658 if (!address_needs_mapping(dev, dev_addr, size) &&
656 !range_needs_mapping(virt_to_phys(ptr), size)) 659 !range_needs_mapping(phys, size))
657 return dev_addr; 660 return dev_addr;
658 661
659 /* 662 /*
@@ -679,23 +682,35 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page);
679 682
680/* 683/*
681 * Unmap a single streaming mode DMA translation. The dma_addr and size must 684 * Unmap a single streaming mode DMA translation. The dma_addr and size must
682 * match what was provided for in a previous swiotlb_map_single call. All 685 * match what was provided for in a previous swiotlb_map_page call. All
683 * other usages are undefined. 686 * other usages are undefined.
684 * 687 *
685 * After this call, reads by the cpu to the buffer are guaranteed to see 688 * After this call, reads by the cpu to the buffer are guaranteed to see
686 * whatever the device wrote there. 689 * whatever the device wrote there.
687 */ 690 */
691static void unmap_single(struct device *hwdev, dma_addr_t dev_addr,
692 size_t size, int dir)
693{
694 char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr);
695
696 BUG_ON(dir == DMA_NONE);
697
698 if (is_swiotlb_buffer(dma_addr)) {
699 do_unmap_single(hwdev, dma_addr, size, dir);
700 return;
701 }
702
703 if (dir != DMA_FROM_DEVICE)
704 return;
705
706 dma_mark_clean(dma_addr, size);
707}
708
688void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, 709void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
689 size_t size, enum dma_data_direction dir, 710 size_t size, enum dma_data_direction dir,
690 struct dma_attrs *attrs) 711 struct dma_attrs *attrs)
691{ 712{
692 char *dma_addr = swiotlb_bus_to_virt(dev_addr); 713 unmap_single(hwdev, dev_addr, size, dir);
693
694 BUG_ON(dir == DMA_NONE);
695 if (is_swiotlb_buffer(dma_addr))
696 unmap_single(hwdev, dma_addr, size, dir);
697 else if (dir == DMA_FROM_DEVICE)
698 dma_mark_clean(dma_addr, size);
699} 714}
700EXPORT_SYMBOL_GPL(swiotlb_unmap_page); 715EXPORT_SYMBOL_GPL(swiotlb_unmap_page);
701 716
@@ -703,7 +718,7 @@ EXPORT_SYMBOL_GPL(swiotlb_unmap_page);
703 * Make physical memory consistent for a single streaming mode DMA translation 718 * Make physical memory consistent for a single streaming mode DMA translation
704 * after a transfer. 719 * after a transfer.
705 * 720 *
706 * If you perform a swiotlb_map_single() but wish to interrogate the buffer 721 * If you perform a swiotlb_map_page() but wish to interrogate the buffer
707 * using the cpu, yet do not wish to teardown the dma mapping, you must 722 * using the cpu, yet do not wish to teardown the dma mapping, you must
708 * call this function before doing so. At the next point you give the dma 723 * call this function before doing so. At the next point you give the dma
709 * address back to the card, you must first perform a 724 * address back to the card, you must first perform a
@@ -713,13 +728,19 @@ static void
713swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, 728swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
714 size_t size, int dir, int target) 729 size_t size, int dir, int target)
715{ 730{
716 char *dma_addr = swiotlb_bus_to_virt(dev_addr); 731 char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr);
717 732
718 BUG_ON(dir == DMA_NONE); 733 BUG_ON(dir == DMA_NONE);
719 if (is_swiotlb_buffer(dma_addr)) 734
735 if (is_swiotlb_buffer(dma_addr)) {
720 sync_single(hwdev, dma_addr, size, dir, target); 736 sync_single(hwdev, dma_addr, size, dir, target);
721 else if (dir == DMA_FROM_DEVICE) 737 return;
722 dma_mark_clean(dma_addr, size); 738 }
739
740 if (dir != DMA_FROM_DEVICE)
741 return;
742
743 dma_mark_clean(dma_addr, size);
723} 744}
724 745
725void 746void
@@ -746,13 +767,7 @@ swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr,
746 unsigned long offset, size_t size, 767 unsigned long offset, size_t size,
747 int dir, int target) 768 int dir, int target)
748{ 769{
749 char *dma_addr = swiotlb_bus_to_virt(dev_addr) + offset; 770 swiotlb_sync_single(hwdev, dev_addr + offset, size, dir, target);
750
751 BUG_ON(dir == DMA_NONE);
752 if (is_swiotlb_buffer(dma_addr))
753 sync_single(hwdev, dma_addr, size, dir, target);
754 else if (dir == DMA_FROM_DEVICE)
755 dma_mark_clean(dma_addr, size);
756} 771}
757 772
758void 773void
@@ -777,7 +792,7 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device);
777 792
778/* 793/*
779 * Map a set of buffers described by scatterlist in streaming mode for DMA. 794 * Map a set of buffers described by scatterlist in streaming mode for DMA.
780 * This is the scatter-gather version of the above swiotlb_map_single 795 * This is the scatter-gather version of the above swiotlb_map_page
781 * interface. Here the scatter gather list elements are each tagged with the 796 * interface. Here the scatter gather list elements are each tagged with the
782 * appropriate dma address and length. They are obtained via 797 * appropriate dma address and length. They are obtained via
783 * sg_dma_{address,length}(SG). 798 * sg_dma_{address,length}(SG).
@@ -788,7 +803,7 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device);
788 * The routine returns the number of addr/length pairs actually 803 * The routine returns the number of addr/length pairs actually
789 * used, at most nents. 804 * used, at most nents.
790 * 805 *
791 * Device ownership issues as mentioned above for swiotlb_map_single are the 806 * Device ownership issues as mentioned above for swiotlb_map_page are the
792 * same here. 807 * same here.
793 */ 808 */
794int 809int
@@ -836,7 +851,7 @@ EXPORT_SYMBOL(swiotlb_map_sg);
836 851
837/* 852/*
838 * Unmap a set of streaming mode DMA translations. Again, cpu read rules 853 * Unmap a set of streaming mode DMA translations. Again, cpu read rules
839 * concerning calls here are the same as for swiotlb_unmap_single() above. 854 * concerning calls here are the same as for swiotlb_unmap_page() above.
840 */ 855 */
841void 856void
842swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, 857swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
@@ -847,13 +862,9 @@ swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
847 862
848 BUG_ON(dir == DMA_NONE); 863 BUG_ON(dir == DMA_NONE);
849 864
850 for_each_sg(sgl, sg, nelems, i) { 865 for_each_sg(sgl, sg, nelems, i)
851 if (sg->dma_address != swiotlb_phys_to_bus(hwdev, sg_phys(sg))) 866 unmap_single(hwdev, sg->dma_address, sg->dma_length, dir);
852 unmap_single(hwdev, swiotlb_bus_to_virt(sg->dma_address), 867
853 sg->dma_length, dir);
854 else if (dir == DMA_FROM_DEVICE)
855 dma_mark_clean(swiotlb_bus_to_virt(sg->dma_address), sg->dma_length);
856 }
857} 868}
858EXPORT_SYMBOL(swiotlb_unmap_sg_attrs); 869EXPORT_SYMBOL(swiotlb_unmap_sg_attrs);
859 870
@@ -879,15 +890,9 @@ swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
879 struct scatterlist *sg; 890 struct scatterlist *sg;
880 int i; 891 int i;
881 892
882 BUG_ON(dir == DMA_NONE); 893 for_each_sg(sgl, sg, nelems, i)
883 894 swiotlb_sync_single(hwdev, sg->dma_address,
884 for_each_sg(sgl, sg, nelems, i) {
885 if (sg->dma_address != swiotlb_phys_to_bus(hwdev, sg_phys(sg)))
886 sync_single(hwdev, swiotlb_bus_to_virt(sg->dma_address),
887 sg->dma_length, dir, target); 895 sg->dma_length, dir, target);
888 else if (dir == DMA_FROM_DEVICE)
889 dma_mark_clean(swiotlb_bus_to_virt(sg->dma_address), sg->dma_length);
890 }
891} 896}
892 897
893void 898void
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 7536acea135b..756ccafa9cec 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -408,6 +408,8 @@ enum format_type {
408 FORMAT_TYPE_LONG_LONG, 408 FORMAT_TYPE_LONG_LONG,
409 FORMAT_TYPE_ULONG, 409 FORMAT_TYPE_ULONG,
410 FORMAT_TYPE_LONG, 410 FORMAT_TYPE_LONG,
411 FORMAT_TYPE_UBYTE,
412 FORMAT_TYPE_BYTE,
411 FORMAT_TYPE_USHORT, 413 FORMAT_TYPE_USHORT,
412 FORMAT_TYPE_SHORT, 414 FORMAT_TYPE_SHORT,
413 FORMAT_TYPE_UINT, 415 FORMAT_TYPE_UINT,
@@ -573,12 +575,15 @@ static char *string(char *buf, char *end, char *s, struct printf_spec spec)
573} 575}
574 576
575static char *symbol_string(char *buf, char *end, void *ptr, 577static char *symbol_string(char *buf, char *end, void *ptr,
576 struct printf_spec spec) 578 struct printf_spec spec, char ext)
577{ 579{
578 unsigned long value = (unsigned long) ptr; 580 unsigned long value = (unsigned long) ptr;
579#ifdef CONFIG_KALLSYMS 581#ifdef CONFIG_KALLSYMS
580 char sym[KSYM_SYMBOL_LEN]; 582 char sym[KSYM_SYMBOL_LEN];
581 sprint_symbol(sym, value); 583 if (ext != 'f')
584 sprint_symbol(sym, value);
585 else
586 kallsyms_lookup(value, NULL, NULL, NULL, sym);
582 return string(buf, end, sym, spec); 587 return string(buf, end, sym, spec);
583#else 588#else
584 spec.field_width = 2*sizeof(void *); 589 spec.field_width = 2*sizeof(void *);
@@ -690,7 +695,8 @@ static char *ip4_addr_string(char *buf, char *end, u8 *addr,
690 * 695 *
691 * Right now we handle: 696 * Right now we handle:
692 * 697 *
693 * - 'F' For symbolic function descriptor pointers 698 * - 'F' For symbolic function descriptor pointers with offset
699 * - 'f' For simple symbolic function names without offset
694 * - 'S' For symbolic direct pointers 700 * - 'S' For symbolic direct pointers
695 * - 'R' For a struct resource pointer, it prints the range of 701 * - 'R' For a struct resource pointer, it prints the range of
696 * addresses (not the name nor the flags) 702 * addresses (not the name nor the flags)
@@ -713,10 +719,11 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr,
713 719
714 switch (*fmt) { 720 switch (*fmt) {
715 case 'F': 721 case 'F':
722 case 'f':
716 ptr = dereference_function_descriptor(ptr); 723 ptr = dereference_function_descriptor(ptr);
717 /* Fallthrough */ 724 /* Fallthrough */
718 case 'S': 725 case 'S':
719 return symbol_string(buf, end, ptr, spec); 726 return symbol_string(buf, end, ptr, spec, *fmt);
720 case 'R': 727 case 'R':
721 return resource_string(buf, end, ptr, spec); 728 return resource_string(buf, end, ptr, spec);
722 case 'm': 729 case 'm':
@@ -853,11 +860,15 @@ qualifier:
853 spec->qualifier = -1; 860 spec->qualifier = -1;
854 if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' || 861 if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' ||
855 *fmt == 'Z' || *fmt == 'z' || *fmt == 't') { 862 *fmt == 'Z' || *fmt == 'z' || *fmt == 't') {
856 spec->qualifier = *fmt; 863 spec->qualifier = *fmt++;
857 ++fmt; 864 if (unlikely(spec->qualifier == *fmt)) {
858 if (spec->qualifier == 'l' && *fmt == 'l') { 865 if (spec->qualifier == 'l') {
859 spec->qualifier = 'L'; 866 spec->qualifier = 'L';
860 ++fmt; 867 ++fmt;
868 } else if (spec->qualifier == 'h') {
869 spec->qualifier = 'H';
870 ++fmt;
871 }
861 } 872 }
862 } 873 }
863 874
@@ -919,6 +930,11 @@ qualifier:
919 spec->type = FORMAT_TYPE_SIZE_T; 930 spec->type = FORMAT_TYPE_SIZE_T;
920 } else if (spec->qualifier == 't') { 931 } else if (spec->qualifier == 't') {
921 spec->type = FORMAT_TYPE_PTRDIFF; 932 spec->type = FORMAT_TYPE_PTRDIFF;
933 } else if (spec->qualifier == 'H') {
934 if (spec->flags & SIGN)
935 spec->type = FORMAT_TYPE_BYTE;
936 else
937 spec->type = FORMAT_TYPE_UBYTE;
922 } else if (spec->qualifier == 'h') { 938 } else if (spec->qualifier == 'h') {
923 if (spec->flags & SIGN) 939 if (spec->flags & SIGN)
924 spec->type = FORMAT_TYPE_SHORT; 940 spec->type = FORMAT_TYPE_SHORT;
@@ -943,7 +959,8 @@ qualifier:
943 * 959 *
944 * This function follows C99 vsnprintf, but has some extensions: 960 * This function follows C99 vsnprintf, but has some extensions:
945 * %pS output the name of a text symbol 961 * %pS output the name of a text symbol
946 * %pF output the name of a function pointer 962 * %pF output the name of a function pointer with its offset
963 * %pf output the name of a function pointer without its offset
947 * %pR output the address range in a struct resource 964 * %pR output the address range in a struct resource
948 * 965 *
949 * The return value is the number of characters which would 966 * The return value is the number of characters which would
@@ -1087,6 +1104,12 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
1087 case FORMAT_TYPE_PTRDIFF: 1104 case FORMAT_TYPE_PTRDIFF:
1088 num = va_arg(args, ptrdiff_t); 1105 num = va_arg(args, ptrdiff_t);
1089 break; 1106 break;
1107 case FORMAT_TYPE_UBYTE:
1108 num = (unsigned char) va_arg(args, int);
1109 break;
1110 case FORMAT_TYPE_BYTE:
1111 num = (signed char) va_arg(args, int);
1112 break;
1090 case FORMAT_TYPE_USHORT: 1113 case FORMAT_TYPE_USHORT:
1091 num = (unsigned short) va_arg(args, int); 1114 num = (unsigned short) va_arg(args, int);
1092 break; 1115 break;
@@ -1363,6 +1386,10 @@ do { \
1363 case FORMAT_TYPE_PTRDIFF: 1386 case FORMAT_TYPE_PTRDIFF:
1364 save_arg(ptrdiff_t); 1387 save_arg(ptrdiff_t);
1365 break; 1388 break;
1389 case FORMAT_TYPE_UBYTE:
1390 case FORMAT_TYPE_BYTE:
1391 save_arg(char);
1392 break;
1366 case FORMAT_TYPE_USHORT: 1393 case FORMAT_TYPE_USHORT:
1367 case FORMAT_TYPE_SHORT: 1394 case FORMAT_TYPE_SHORT:
1368 save_arg(short); 1395 save_arg(short);
@@ -1391,7 +1418,8 @@ EXPORT_SYMBOL_GPL(vbin_printf);
1391 * 1418 *
1392 * The format follows C99 vsnprintf, but has some extensions: 1419 * The format follows C99 vsnprintf, but has some extensions:
1393 * %pS output the name of a text symbol 1420 * %pS output the name of a text symbol
1394 * %pF output the name of a function pointer 1421 * %pF output the name of a function pointer with its offset
1422 * %pf output the name of a function pointer without its offset
1395 * %pR output the address range in a struct resource 1423 * %pR output the address range in a struct resource
1396 * %n is ignored 1424 * %n is ignored
1397 * 1425 *
@@ -1538,6 +1566,12 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf)
1538 case FORMAT_TYPE_PTRDIFF: 1566 case FORMAT_TYPE_PTRDIFF:
1539 num = get_arg(ptrdiff_t); 1567 num = get_arg(ptrdiff_t);
1540 break; 1568 break;
1569 case FORMAT_TYPE_UBYTE:
1570 num = get_arg(unsigned char);
1571 break;
1572 case FORMAT_TYPE_BYTE:
1573 num = get_arg(signed char);
1574 break;
1541 case FORMAT_TYPE_USHORT: 1575 case FORMAT_TYPE_USHORT:
1542 num = get_arg(unsigned short); 1576 num = get_arg(unsigned short);
1543 break; 1577 break;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fe753ecf2aa5..474c7e9dd51a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -149,10 +149,6 @@ static unsigned long __meminitdata dma_reserve;
149 static int __meminitdata nr_nodemap_entries; 149 static int __meminitdata nr_nodemap_entries;
150 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 150 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
151 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 151 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
152#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
153 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
154 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
155#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
156 static unsigned long __initdata required_kernelcore; 152 static unsigned long __initdata required_kernelcore;
157 static unsigned long __initdata required_movablecore; 153 static unsigned long __initdata required_movablecore;
158 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 154 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
@@ -3103,64 +3099,6 @@ void __init sparse_memory_present_with_active_regions(int nid)
3103} 3099}
3104 3100
3105/** 3101/**
3106 * push_node_boundaries - Push node boundaries to at least the requested boundary
3107 * @nid: The nid of the node to push the boundary for
3108 * @start_pfn: The start pfn of the node
3109 * @end_pfn: The end pfn of the node
3110 *
3111 * In reserve-based hot-add, mem_map is allocated that is unused until hotadd
3112 * time. Specifically, on x86_64, SRAT will report ranges that can potentially
3113 * be hotplugged even though no physical memory exists. This function allows
3114 * an arch to push out the node boundaries so mem_map is allocated that can
3115 * be used later.
3116 */
3117#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
3118void __init push_node_boundaries(unsigned int nid,
3119 unsigned long start_pfn, unsigned long end_pfn)
3120{
3121 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3122 "Entering push_node_boundaries(%u, %lu, %lu)\n",
3123 nid, start_pfn, end_pfn);
3124
3125 /* Initialise the boundary for this node if necessary */
3126 if (node_boundary_end_pfn[nid] == 0)
3127 node_boundary_start_pfn[nid] = -1UL;
3128
3129 /* Update the boundaries */
3130 if (node_boundary_start_pfn[nid] > start_pfn)
3131 node_boundary_start_pfn[nid] = start_pfn;
3132 if (node_boundary_end_pfn[nid] < end_pfn)
3133 node_boundary_end_pfn[nid] = end_pfn;
3134}
3135
3136/* If necessary, push the node boundary out for reserve hotadd */
3137static void __meminit account_node_boundary(unsigned int nid,
3138 unsigned long *start_pfn, unsigned long *end_pfn)
3139{
3140 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3141 "Entering account_node_boundary(%u, %lu, %lu)\n",
3142 nid, *start_pfn, *end_pfn);
3143
3144 /* Return if boundary information has not been provided */
3145 if (node_boundary_end_pfn[nid] == 0)
3146 return;
3147
3148 /* Check the boundaries and update if necessary */
3149 if (node_boundary_start_pfn[nid] < *start_pfn)
3150 *start_pfn = node_boundary_start_pfn[nid];
3151 if (node_boundary_end_pfn[nid] > *end_pfn)
3152 *end_pfn = node_boundary_end_pfn[nid];
3153}
3154#else
3155void __init push_node_boundaries(unsigned int nid,
3156 unsigned long start_pfn, unsigned long end_pfn) {}
3157
3158static void __meminit account_node_boundary(unsigned int nid,
3159 unsigned long *start_pfn, unsigned long *end_pfn) {}
3160#endif
3161
3162
3163/**
3164 * get_pfn_range_for_nid - Return the start and end page frames for a node 3102 * get_pfn_range_for_nid - Return the start and end page frames for a node
3165 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 3103 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
3166 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 3104 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
@@ -3185,9 +3123,6 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
3185 3123
3186 if (*start_pfn == -1UL) 3124 if (*start_pfn == -1UL)
3187 *start_pfn = 0; 3125 *start_pfn = 0;
3188
3189 /* Push the node boundaries out if requested */
3190 account_node_boundary(nid, start_pfn, end_pfn);
3191} 3126}
3192 3127
3193/* 3128/*
@@ -3793,10 +3728,6 @@ void __init remove_all_active_ranges(void)
3793{ 3728{
3794 memset(early_node_map, 0, sizeof(early_node_map)); 3729 memset(early_node_map, 0, sizeof(early_node_map));
3795 nr_nodemap_entries = 0; 3730 nr_nodemap_entries = 0;
3796#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
3797 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
3798 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
3799#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
3800} 3731}
3801 3732
3802/* Compare two active node_active_regions */ 3733/* Compare two active node_active_regions */
diff --git a/mm/percpu.c b/mm/percpu.c
index 1aa5d8fbca12..c0b2c1a76e81 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -23,7 +23,7 @@
23 * Allocation is done in offset-size areas of single unit space. Ie, 23 * Allocation is done in offset-size areas of single unit space. Ie,
24 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, 24 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
25 * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring 25 * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring
26 * percpu base registers UNIT_SIZE apart. 26 * percpu base registers pcpu_unit_size apart.
27 * 27 *
28 * There are usually many small percpu allocations many of them as 28 * There are usually many small percpu allocations many of them as
29 * small as 4 bytes. The allocator organizes chunks into lists 29 * small as 4 bytes. The allocator organizes chunks into lists
@@ -38,8 +38,8 @@
38 * region and negative allocated. Allocation inside a chunk is done 38 * region and negative allocated. Allocation inside a chunk is done
39 * by scanning this map sequentially and serving the first matching 39 * by scanning this map sequentially and serving the first matching
40 * entry. This is mostly copied from the percpu_modalloc() allocator. 40 * entry. This is mostly copied from the percpu_modalloc() allocator.
41 * Chunks are also linked into a rb tree to ease address to chunk 41 * Chunks can be determined from the address using the index field
42 * mapping during free. 42 * in the page struct. The index field contains a pointer to the chunk.
43 * 43 *
44 * To use this allocator, arch code should do the followings. 44 * To use this allocator, arch code should do the followings.
45 * 45 *
@@ -61,7 +61,6 @@
61#include <linux/mutex.h> 61#include <linux/mutex.h>
62#include <linux/percpu.h> 62#include <linux/percpu.h>
63#include <linux/pfn.h> 63#include <linux/pfn.h>
64#include <linux/rbtree.h>
65#include <linux/slab.h> 64#include <linux/slab.h>
66#include <linux/spinlock.h> 65#include <linux/spinlock.h>
67#include <linux/vmalloc.h> 66#include <linux/vmalloc.h>
@@ -88,7 +87,6 @@
88 87
89struct pcpu_chunk { 88struct pcpu_chunk {
90 struct list_head list; /* linked to pcpu_slot lists */ 89 struct list_head list; /* linked to pcpu_slot lists */
91 struct rb_node rb_node; /* key is chunk->vm->addr */
92 int free_size; /* free bytes in the chunk */ 90 int free_size; /* free bytes in the chunk */
93 int contig_hint; /* max contiguous size hint */ 91 int contig_hint; /* max contiguous size hint */
94 struct vm_struct *vm; /* mapped vmalloc region */ 92 struct vm_struct *vm; /* mapped vmalloc region */
@@ -110,9 +108,21 @@ static size_t pcpu_chunk_struct_size __read_mostly;
110void *pcpu_base_addr __read_mostly; 108void *pcpu_base_addr __read_mostly;
111EXPORT_SYMBOL_GPL(pcpu_base_addr); 109EXPORT_SYMBOL_GPL(pcpu_base_addr);
112 110
113/* optional reserved chunk, only accessible for reserved allocations */ 111/*
112 * The first chunk which always exists. Note that unlike other
113 * chunks, this one can be allocated and mapped in several different
114 * ways and thus often doesn't live in the vmalloc area.
115 */
116static struct pcpu_chunk *pcpu_first_chunk;
117
118/*
119 * Optional reserved chunk. This chunk reserves part of the first
120 * chunk and serves it for reserved allocations. The amount of
121 * reserved offset is in pcpu_reserved_chunk_limit. When reserved
122 * area doesn't exist, the following variables contain NULL and 0
123 * respectively.
124 */
114static struct pcpu_chunk *pcpu_reserved_chunk; 125static struct pcpu_chunk *pcpu_reserved_chunk;
115/* offset limit of the reserved chunk */
116static int pcpu_reserved_chunk_limit; 126static int pcpu_reserved_chunk_limit;
117 127
118/* 128/*
@@ -121,7 +131,7 @@ static int pcpu_reserved_chunk_limit;
121 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former 131 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
122 * protects allocation/reclaim paths, chunks and chunk->page arrays. 132 * protects allocation/reclaim paths, chunks and chunk->page arrays.
123 * The latter is a spinlock and protects the index data structures - 133 * The latter is a spinlock and protects the index data structures -
124 * chunk slots, rbtree, chunks and area maps in chunks. 134 * chunk slots, chunks and area maps in chunks.
125 * 135 *
126 * During allocation, pcpu_alloc_mutex is kept locked all the time and 136 * During allocation, pcpu_alloc_mutex is kept locked all the time and
127 * pcpu_lock is grabbed and released as necessary. All actual memory 137 * pcpu_lock is grabbed and released as necessary. All actual memory
@@ -140,7 +150,6 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */
140static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ 150static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */
141 151
142static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ 152static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
143static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */
144 153
145/* reclaim work to release fully free chunks, scheduled from free path */ 154/* reclaim work to release fully free chunks, scheduled from free path */
146static void pcpu_reclaim(struct work_struct *work); 155static void pcpu_reclaim(struct work_struct *work);
@@ -191,6 +200,18 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
191 return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; 200 return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
192} 201}
193 202
203/* set the pointer to a chunk in a page struct */
204static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
205{
206 page->index = (unsigned long)pcpu;
207}
208
209/* obtain pointer to a chunk from a page struct */
210static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
211{
212 return (struct pcpu_chunk *)page->index;
213}
214
194/** 215/**
195 * pcpu_mem_alloc - allocate memory 216 * pcpu_mem_alloc - allocate memory
196 * @size: bytes to allocate 217 * @size: bytes to allocate
@@ -257,93 +278,26 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
257 } 278 }
258} 279}
259 280
260static struct rb_node **pcpu_chunk_rb_search(void *addr,
261 struct rb_node **parentp)
262{
263 struct rb_node **p = &pcpu_addr_root.rb_node;
264 struct rb_node *parent = NULL;
265 struct pcpu_chunk *chunk;
266
267 while (*p) {
268 parent = *p;
269 chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
270
271 if (addr < chunk->vm->addr)
272 p = &(*p)->rb_left;
273 else if (addr > chunk->vm->addr)
274 p = &(*p)->rb_right;
275 else
276 break;
277 }
278
279 if (parentp)
280 *parentp = parent;
281 return p;
282}
283
284/** 281/**
285 * pcpu_chunk_addr_search - search for chunk containing specified address 282 * pcpu_chunk_addr_search - determine chunk containing specified address
286 * @addr: address to search for 283 * @addr: address for which the chunk needs to be determined.
287 *
288 * Look for chunk which might contain @addr. More specifically, it
289 * searchs for the chunk with the highest start address which isn't
290 * beyond @addr.
291 *
292 * CONTEXT:
293 * pcpu_lock.
294 * 284 *
295 * RETURNS: 285 * RETURNS:
296 * The address of the found chunk. 286 * The address of the found chunk.
297 */ 287 */
298static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) 288static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
299{ 289{
300 struct rb_node *n, *parent; 290 void *first_start = pcpu_first_chunk->vm->addr;
301 struct pcpu_chunk *chunk;
302 291
303 /* is it in the reserved chunk? */ 292 /* is it in the first chunk? */
304 if (pcpu_reserved_chunk) { 293 if (addr >= first_start && addr < first_start + pcpu_chunk_size) {
305 void *start = pcpu_reserved_chunk->vm->addr; 294 /* is it in the reserved area? */
306 295 if (addr < first_start + pcpu_reserved_chunk_limit)
307 if (addr >= start && addr < start + pcpu_reserved_chunk_limit)
308 return pcpu_reserved_chunk; 296 return pcpu_reserved_chunk;
297 return pcpu_first_chunk;
309 } 298 }
310 299
311 /* nah... search the regular ones */ 300 return pcpu_get_page_chunk(vmalloc_to_page(addr));
312 n = *pcpu_chunk_rb_search(addr, &parent);
313 if (!n) {
314 /* no exactly matching chunk, the parent is the closest */
315 n = parent;
316 BUG_ON(!n);
317 }
318 chunk = rb_entry(n, struct pcpu_chunk, rb_node);
319
320 if (addr < chunk->vm->addr) {
321 /* the parent was the next one, look for the previous one */
322 n = rb_prev(n);
323 BUG_ON(!n);
324 chunk = rb_entry(n, struct pcpu_chunk, rb_node);
325 }
326
327 return chunk;
328}
329
330/**
331 * pcpu_chunk_addr_insert - insert chunk into address rb tree
332 * @new: chunk to insert
333 *
334 * Insert @new into address rb tree.
335 *
336 * CONTEXT:
337 * pcpu_lock.
338 */
339static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
340{
341 struct rb_node **p, *parent;
342
343 p = pcpu_chunk_rb_search(new->vm->addr, &parent);
344 BUG_ON(*p);
345 rb_link_node(&new->rb_node, parent, p);
346 rb_insert_color(&new->rb_node, &pcpu_addr_root);
347} 301}
348 302
349/** 303/**
@@ -755,6 +709,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
755 alloc_mask, 0); 709 alloc_mask, 0);
756 if (!*pagep) 710 if (!*pagep)
757 goto err; 711 goto err;
712 pcpu_set_page_chunk(*pagep, chunk);
758 } 713 }
759 } 714 }
760 715
@@ -879,7 +834,6 @@ restart:
879 834
880 spin_lock_irq(&pcpu_lock); 835 spin_lock_irq(&pcpu_lock);
881 pcpu_chunk_relocate(chunk, -1); 836 pcpu_chunk_relocate(chunk, -1);
882 pcpu_chunk_addr_insert(chunk);
883 goto restart; 837 goto restart;
884 838
885area_found: 839area_found:
@@ -968,7 +922,6 @@ static void pcpu_reclaim(struct work_struct *work)
968 if (chunk == list_first_entry(head, struct pcpu_chunk, list)) 922 if (chunk == list_first_entry(head, struct pcpu_chunk, list))
969 continue; 923 continue;
970 924
971 rb_erase(&chunk->rb_node, &pcpu_addr_root);
972 list_move(&chunk->list, &todo); 925 list_move(&chunk->list, &todo);
973 } 926 }
974 927
@@ -1147,7 +1100,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1147 1100
1148 if (reserved_size) { 1101 if (reserved_size) {
1149 schunk->free_size = reserved_size; 1102 schunk->free_size = reserved_size;
1150 pcpu_reserved_chunk = schunk; /* not for dynamic alloc */ 1103 pcpu_reserved_chunk = schunk;
1104 pcpu_reserved_chunk_limit = static_size + reserved_size;
1151 } else { 1105 } else {
1152 schunk->free_size = dyn_size; 1106 schunk->free_size = dyn_size;
1153 dyn_size = 0; /* dynamic area covered */ 1107 dyn_size = 0; /* dynamic area covered */
@@ -1158,8 +1112,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1158 if (schunk->free_size) 1112 if (schunk->free_size)
1159 schunk->map[schunk->map_used++] = schunk->free_size; 1113 schunk->map[schunk->map_used++] = schunk->free_size;
1160 1114
1161 pcpu_reserved_chunk_limit = static_size + schunk->free_size;
1162
1163 /* init dynamic chunk if necessary */ 1115 /* init dynamic chunk if necessary */
1164 if (dyn_size) { 1116 if (dyn_size) {
1165 dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); 1117 dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
@@ -1226,13 +1178,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1226 } 1178 }
1227 1179
1228 /* link the first chunk in */ 1180 /* link the first chunk in */
1229 if (!dchunk) { 1181 pcpu_first_chunk = dchunk ?: schunk;
1230 pcpu_chunk_relocate(schunk, -1); 1182 pcpu_chunk_relocate(pcpu_first_chunk, -1);
1231 pcpu_chunk_addr_insert(schunk);
1232 } else {
1233 pcpu_chunk_relocate(dchunk, -1);
1234 pcpu_chunk_addr_insert(dchunk);
1235 }
1236 1183
1237 /* we're done */ 1184 /* we're done */
1238 pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); 1185 pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index cc29b44b1500..e5becb92b3e7 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -167,6 +167,9 @@ static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base,
167 struct tcf_exts e; 167 struct tcf_exts e;
168 int err; 168 int err;
169 169
170 if (!tca[TCA_OPTIONS])
171 return -EINVAL;
172
170 if (head == NULL) { 173 if (head == NULL) {
171 if (!handle) 174 if (!handle)
172 return -EINVAL; 175 return -EINVAL;
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index cba61ca403ca..2b706617c89a 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -188,20 +188,34 @@ cmd_objcopy = $(OBJCOPY) $(OBJCOPYFLAGS) $(OBJCOPYFLAGS_$(@F)) $< $@
188# --------------------------------------------------------------------------- 188# ---------------------------------------------------------------------------
189 189
190quiet_cmd_gzip = GZIP $@ 190quiet_cmd_gzip = GZIP $@
191cmd_gzip = gzip -f -9 < $< > $@ 191cmd_gzip = (cat $(filter-out FORCE,$^) | gzip -f -9 > $@) || \
192 (rm -f $@ ; false)
192 193
193 194
194# Bzip2 195# Bzip2
195# --------------------------------------------------------------------------- 196# ---------------------------------------------------------------------------
196 197
197# Bzip2 does not include size in file... so we have to fake that 198# Bzip2 and LZMA do not include size in file... so we have to fake that;
198size_append=$(CONFIG_SHELL) $(srctree)/scripts/bin_size 199# append the size as a 32-bit littleendian number as gzip does.
199 200size_append = echo -ne $(shell \
200quiet_cmd_bzip2 = BZIP2 $@ 201dec_size=0; \
201cmd_bzip2 = (bzip2 -9 < $< && $(size_append) $<) > $@ || (rm -f $@ ; false) 202for F in $1; do \
203 fsize=$$(stat -c "%s" $$F); \
204 dec_size=$$(expr $$dec_size + $$fsize); \
205done; \
206printf "%08x" $$dec_size | \
207 sed 's/\(..\)\(..\)\(..\)\(..\)/\\\\x\4\\\\x\3\\\\x\2\\\\x\1/g' \
208)
209
210quiet_cmd_bzip2 = BZIP2 $@
211cmd_bzip2 = (cat $(filter-out FORCE,$^) | \
212 bzip2 -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \
213 (rm -f $@ ; false)
202 214
203# Lzma 215# Lzma
204# --------------------------------------------------------------------------- 216# ---------------------------------------------------------------------------
205 217
206quiet_cmd_lzma = LZMA $@ 218quiet_cmd_lzma = LZMA $@
207cmd_lzma = (lzma -9 -c $< && $(size_append) $<) >$@ || (rm -f $@ ; false) 219cmd_lzma = (cat $(filter-out FORCE,$^) | \
220 lzma -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \
221 (rm -f $@ ; false)
diff --git a/scripts/bin_size b/scripts/bin_size
deleted file mode 100644
index 43e1b360cee6..000000000000
--- a/scripts/bin_size
+++ /dev/null
@@ -1,10 +0,0 @@
1#!/bin/sh
2
3if [ $# = 0 ] ; then
4 echo Usage: $0 file
5fi
6
7size_dec=`stat -c "%s" $1`
8size_hex_echo_string=`printf "%08x" $size_dec |
9 sed 's/\(..\)\(..\)\(..\)\(..\)/\\\\x\4\\\\x\3\\\\x\2\\\\x\1/g'`
10/bin/echo -ne $size_hex_echo_string
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4293528200b3..4d0dd390aa50 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2301,7 +2301,7 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
2301 2301
2302 bad_pfn = page_to_pfn(bad_page); 2302 bad_pfn = page_to_pfn(bad_page);
2303 2303
2304 if (!alloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { 2304 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
2305 r = -ENOMEM; 2305 r = -ENOMEM;
2306 goto out_free_0; 2306 goto out_free_0;
2307 } 2307 }