-rw-r--r--CREDITS4
-rw-r--r--Documentation/vm/00-INDEX2
-rw-r--r--Documentation/vm/idle_page_tracking.txt98
-rw-r--r--Documentation/vm/pagemap.txt13
-rw-r--r--Documentation/vm/zswap.txt36
-rw-r--r--MAINTAINERS3
-rw-r--r--arch/Kconfig3
-rw-r--r--arch/alpha/include/asm/dma-mapping.h36
-rw-r--r--arch/alpha/kernel/pci-noop.c10
-rw-r--r--arch/alpha/kernel/pci_iommu.c11
-rw-r--r--arch/arm/Kconfig1
-rw-r--r--arch/arm/boot/compressed/decompress.c2
-rw-r--r--arch/arm/include/asm/dma-mapping.h68
-rw-r--r--arch/arm/mm/dma-mapping.c12
-rw-r--r--arch/arm64/include/asm/dma-mapping.h69
-rw-r--r--arch/h8300/boot/compressed/misc.c2
-rw-r--r--arch/h8300/include/asm/dma-mapping.h44
-rw-r--r--arch/hexagon/include/asm/dma-mapping.h49
-rw-r--r--arch/hexagon/kernel/dma.c11
-rw-r--r--arch/ia64/Kconfig1
-rw-r--r--arch/ia64/include/asm/dma-mapping.h50
-rw-r--r--arch/m32r/boot/compressed/misc.c3
-rw-r--r--arch/m68k/Kconfig1
-rw-r--r--arch/microblaze/include/asm/dma-mapping.h70
-rw-r--r--arch/mips/Kconfig1
-rw-r--r--arch/mips/boot/compressed/decompress.c4
-rw-r--r--arch/mips/cavium-octeon/dma-octeon.c8
-rw-r--r--arch/mips/include/asm/dma-mapping.h67
-rw-r--r--arch/mips/loongson64/common/dma-swiotlb.c11
-rw-r--r--arch/mips/mm/dma-default.c21
-rw-r--r--arch/mips/netlogic/common/nlm-dma.c10
-rw-r--r--arch/openrisc/include/asm/dma-mapping.h67
-rw-r--r--arch/powerpc/Kconfig1
-rw-r--r--arch/powerpc/include/asm/dma-mapping.h68
-rw-r--r--arch/s390/Kconfig1
-rw-r--r--arch/s390/boot/compressed/misc.c2
-rw-r--r--arch/s390/include/asm/dma-mapping.h55
-rw-r--r--arch/s390/pci/pci_dma.c10
-rw-r--r--arch/sh/Kconfig1
-rw-r--r--arch/sh/boot/compressed/misc.c2
-rw-r--r--arch/sh/include/asm/dma-mapping.h77
-rw-r--r--arch/sparc/include/asm/dma-mapping.h40
-rw-r--r--arch/tile/Kconfig1
-rw-r--r--arch/tile/include/asm/dma-mapping.h45
-rw-r--r--arch/unicore32/boot/compressed/misc.c4
-rw-r--r--arch/unicore32/include/asm/dma-mapping.h57
-rw-r--r--arch/x86/Kconfig3
-rw-r--r--arch/x86/boot/compressed/misc.c3
-rw-r--r--arch/x86/boot/header.S2
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c2
-rw-r--r--arch/x86/include/asm/dma-mapping.h34
-rw-r--r--arch/x86/include/asm/kdebug.h2
-rw-r--r--arch/x86/kernel/Makefile4
-rw-r--r--arch/x86/kernel/kvmclock.c4
-rw-r--r--arch/x86/kernel/pci-dma.c60
-rw-r--r--arch/x86/kernel/reboot.c4
-rw-r--r--arch/x86/kernel/setup.c2
-rw-r--r--arch/x86/kernel/vmlinux.lds.S2
-rw-r--r--arch/x86/kvm/vmx.c8
-rw-r--r--arch/x86/mm/mpx.c51
-rw-r--r--arch/x86/platform/efi/efi.c4
-rw-r--r--arch/x86/platform/uv/uv_nmi.c6
-rw-r--r--arch/xtensa/include/asm/dma-mapping.h60
-rw-r--r--drivers/android/binder.c2
-rw-r--r--drivers/crypto/qat/qat_common/adf_transport_debug.c16
-rw-r--r--drivers/firmware/efi/Kconfig2
-rw-r--r--drivers/gpu/drm/vgem/vgem_drv.c2
-rw-r--r--drivers/hsi/clients/cmt_speech.c2
-rw-r--r--drivers/infiniband/hw/qib/qib_file_ops.c2
-rw-r--r--drivers/infiniband/hw/qib/qib_mmap.c2
-rw-r--r--drivers/media/platform/omap/omap_vout.c2
-rw-r--r--drivers/misc/genwqe/card_dev.c2
-rw-r--r--drivers/net/wireless/ath/wil6210/debugfs.c35
-rw-r--r--drivers/parisc/ccio-dma.c13
-rw-r--r--drivers/parisc/sba_iommu.c9
-rw-r--r--drivers/pci/pci-driver.c2
-rw-r--r--drivers/s390/crypto/zcrypt_api.c10
-rw-r--r--drivers/staging/android/ion/ion.c2
-rw-r--r--drivers/staging/comedi/comedi_fops.c2
-rw-r--r--drivers/video/fbdev/omap2/omapfb/omapfb-main.c2
-rw-r--r--drivers/xen/gntalloc.c2
-rw-r--r--drivers/xen/gntdev.c2
-rw-r--r--drivers/xen/privcmd.c4
-rw-r--r--drivers/xen/swiotlb-xen.c6
-rw-r--r--fs/affs/super.c8
-rw-r--r--fs/ceph/addr.c2
-rw-r--r--fs/cifs/file.c2
-rw-r--r--fs/coda/upcall.c6
-rw-r--r--fs/coredump.c46
-rw-r--r--fs/hfs/bnode.c9
-rw-r--r--fs/hfs/brec.c20
-rw-r--r--fs/hfsplus/bnode.c3
-rw-r--r--fs/namei.c2
-rw-r--r--fs/proc/base.c113
-rw-r--r--fs/proc/generic.c44
-rw-r--r--fs/proc/page.c65
-rw-r--r--fs/proc/task_mmu.c5
-rw-r--r--fs/seq_file.c42
-rw-r--r--include/asm-generic/dma-mapping-common.h118
-rw-r--r--include/linux/kexec.h17
-rw-r--r--include/linux/kmod.h2
-rw-r--r--include/linux/memcontrol.h10
-rw-r--r--include/linux/mm.h12
-rw-r--r--include/linux/mmu_notifier.h46
-rw-r--r--include/linux/page-flags.h11
-rw-r--r--include/linux/page_ext.h4
-rw-r--r--include/linux/page_idle.h110
-rw-r--r--include/linux/poison.h11
-rw-r--r--include/linux/printk.h14
-rw-r--r--include/linux/seq_file.h4
-rw-r--r--include/linux/string_helpers.h14
-rw-r--r--include/linux/zpool.h2
-rw-r--r--include/uapi/linux/kernel-page-flags.h1
-rw-r--r--init/initramfs.c4
-rw-r--r--init/main.c1
-rw-r--r--ipc/msgutil.c2
-rw-r--r--ipc/shm.c4
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/cred.c13
-rw-r--r--kernel/events/core.c2
-rw-r--r--kernel/extable.c1
-rw-r--r--kernel/kexec.c2531
-rw-r--r--kernel/kexec_core.c1534
-rw-r--r--kernel/kexec_file.c1045
-rw-r--r--kernel/kexec_internal.h22
-rw-r--r--kernel/kmod.c100
-rw-r--r--kernel/ksysfs.c6
-rw-r--r--kernel/printk/printk.c2
-rw-r--r--kernel/reboot.c2
-rw-r--r--kernel/sysctl.c12
-rw-r--r--lib/bitmap.c43
-rw-r--r--lib/decompress_bunzip2.c6
-rw-r--r--lib/decompress_inflate.c31
-rw-r--r--lib/decompress_unlz4.c6
-rw-r--r--lib/decompress_unlzma.c9
-rw-r--r--lib/decompress_unlzo.c13
-rw-r--r--lib/decompress_unxz.c12
-rw-r--r--lib/kstrtox.c2
-rw-r--r--lib/string_helpers.c20
-rw-r--r--lib/test-kstrtox.c6
-rw-r--r--lib/test_kasan.c6
-rw-r--r--lib/zlib_deflate/deftree.c6
-rw-r--r--lib/zlib_deflate/defutil.h16
-rw-r--r--mm/Kconfig12
-rw-r--r--mm/Makefile1
-rw-r--r--mm/debug.c4
-rw-r--r--mm/huge_memory.c12
-rw-r--r--mm/hwpoison-inject.c5
-rw-r--r--mm/kmemleak.c21
-rw-r--r--mm/memcontrol.c76
-rw-r--r--mm/memory-failure.c16
-rw-r--r--mm/memory.c4
-rw-r--r--mm/migrate.c6
-rw-r--r--mm/mmap.c18
-rw-r--r--mm/mmu_notifier.c17
-rw-r--r--mm/nommu.c19
-rw-r--r--mm/page_ext.c4
-rw-r--r--mm/page_idle.c232
-rw-r--r--mm/rmap.c6
-rw-r--r--mm/swap.c3
-rw-r--r--mm/zpool.c33
-rw-r--r--mm/zswap.c688
-rwxr-xr-xscripts/checkpatch.pl185
-rw-r--r--security/selinux/selinuxfs.c2
-rw-r--r--virt/kvm/kvm_main.c31
165 files changed, 4768 insertions, 4353 deletions
diff --git a/CREDITS b/CREDITS
index bcb8efaa9459..8207cc62ee9d 100644
--- a/CREDITS
+++ b/CREDITS
@@ -2992,6 +2992,10 @@ S: 2200 Mission College Blvd
2992S: Santa Clara, CA 95052 2992S: Santa Clara, CA 95052
2993S: USA 2993S: USA
2994 2994
2995N: Anil Ravindranath
2996E: anil_ravindranath@pmc-sierra.com
2997D: PMC-Sierra MaxRAID driver
2998
2995N: Eric S. Raymond 2999N: Eric S. Raymond
2996E: esr@thyrsus.com 3000E: esr@thyrsus.com
2997W: http://www.tuxedo.org/~esr/ 3001W: http://www.tuxedo.org/~esr/
diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
index 081c49777abb..6a5e2a102a45 100644
--- a/Documentation/vm/00-INDEX
+++ b/Documentation/vm/00-INDEX
@@ -14,6 +14,8 @@ hugetlbpage.txt
14 - a brief summary of hugetlbpage support in the Linux kernel. 14 - a brief summary of hugetlbpage support in the Linux kernel.
15hwpoison.txt 15hwpoison.txt
16 - explains what hwpoison is 16 - explains what hwpoison is
17idle_page_tracking.txt
18 - description of the idle page tracking feature.
17ksm.txt 19ksm.txt
18 - how to use the Kernel Samepage Merging feature. 20 - how to use the Kernel Samepage Merging feature.
19numa 21numa
diff --git a/Documentation/vm/idle_page_tracking.txt b/Documentation/vm/idle_page_tracking.txt
new file mode 100644
index 000000000000..85dcc3bb85dc
--- /dev/null
+++ b/Documentation/vm/idle_page_tracking.txt
@@ -0,0 +1,98 @@
1MOTIVATION
2
3The idle page tracking feature allows one to track which memory pages are being
4accessed by a workload and which are idle. This information can be useful for
5estimating the workload's working set size, which, in turn, can be taken into
6account when configuring the workload parameters, setting memory cgroup limits,
7or deciding where to place the workload within a compute cluster.
8
9It is enabled by CONFIG_IDLE_PAGE_TRACKING=y.
10
11USER API
12
13The idle page tracking API is located at /sys/kernel/mm/page_idle. Currently,
14it consists of the only read-write file, /sys/kernel/mm/page_idle/bitmap.
15
16The file implements a bitmap where each bit corresponds to a memory page. The
17bitmap is represented by an array of 8-byte integers, and the page at PFN #i is
18mapped to bit #i%64 of array element #i/64; byte order is native. When a bit is
19set, the corresponding page is idle.
20
21A page is considered idle if it has not been accessed since it was marked idle
22(for more details on what "accessed" actually means, see the IMPLEMENTATION
23DETAILS section). To mark a page idle, one has to set the bit corresponding to
24the page by writing to the file. A value written to the file is OR-ed with the
25current bitmap value.
26
27Only accesses to user memory pages are tracked. These are pages mapped into a
28process address space, page cache and buffer pages, and swap cache pages. For
29other page types (e.g. SLAB pages), an attempt to mark a page idle is silently
30ignored, and hence such pages are never reported idle.
31
32For huge pages the idle flag is set only on the head page, so one has to read
33/proc/kpageflags in order to correctly count idle huge pages.
34
35Reading from or writing to /sys/kernel/mm/page_idle/bitmap will return
36-EINVAL if you are not starting the read/write on an 8-byte boundary, or
37if the size of the read/write is not a multiple of 8 bytes. Writing to
38this file beyond max PFN will return -ENXIO.
39
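As a rough illustration of the format described above (a minimal sketch only,
assuming root privileges, CONFIG_IDLE_PAGE_TRACKING=y, and a PFN already
obtained from e.g. /proc/pid/pagemap), marking a single page idle boils down to
OR-ing one bit into the right 8-byte word:

    /* Sketch: mark one PFN idle via /sys/kernel/mm/page_idle/bitmap. */
    #include <fcntl.h>
    #include <stdint.h>
    #include <unistd.h>

    static int mark_page_idle(uint64_t pfn)
    {
        int fd = open("/sys/kernel/mm/page_idle/bitmap", O_WRONLY);
        uint64_t word = 1ULL << (pfn % 64);   /* bit #i%64 ...               */
        off_t offset = (pfn / 64) * 8;        /* ... of 8-byte element #i/64 */
        int ret = 0;

        if (fd < 0)
            return -1;
        /*
         * Writes must start on an 8-byte boundary and be a multiple of
         * 8 bytes in size; the written value is OR-ed into the current
         * bitmap, so zero bits leave other pages untouched.
         */
        if (pwrite(fd, &word, sizeof(word), offset) != sizeof(word))
            ret = -1;
        close(fd);
        return ret;
    }
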
40With that in mind, to estimate the number of pages that are not used by a
41workload, one should:
42
43 1. Mark all the workload's pages as idle by setting corresponding bits in
44 /sys/kernel/mm/page_idle/bitmap. The pages can be found by reading
45 /proc/pid/pagemap if the workload is represented by a process, or by
46 filtering out alien pages using /proc/kpagecgroup in case the workload is
47 placed in a memory cgroup.
48
49 2. Wait until the workload accesses its working set.
50
51 3. Read /sys/kernel/mm/page_idle/bitmap and count the number of bits set (see
52    the sketch after this list). To ignore certain types of pages, e.g. mlocked
53    pages, which are not reclaimable, filter them out using /proc/kpageflags.
54
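A matching sketch for step 3 (again only an illustration; start_pfn and end_pfn
are assumed to come from /proc/pid/pagemap or from /proc/kpagecgroup filtering,
and end_pfn is assumed to be greater than start_pfn) reads the bitmap back and
counts the set bits:

    /* Sketch: count idle pages in the PFN range [start_pfn, end_pfn). */
    #include <fcntl.h>
    #include <stdint.h>
    #include <unistd.h>

    static long count_idle_pages(uint64_t start_pfn, uint64_t end_pfn)
    {
        int fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDONLY);
        long idle = 0;
        uint64_t w;

        if (fd < 0)
            return -1;
        /* Reads, like writes, must be 8-byte aligned multiples of 8 bytes. */
        for (w = start_pfn / 64; w <= (end_pfn - 1) / 64; w++) {
            uint64_t word;
            unsigned int bit;

            if (pread(fd, &word, sizeof(word), w * 8) != sizeof(word))
                break;                 /* stop on a short read past the last PFN */
            for (bit = 0; bit < 64; bit++) {
                uint64_t pfn = w * 64 + bit;

                if (pfn >= start_pfn && pfn < end_pfn && (word & (1ULL << bit)))
                    idle++;
            }
        }
        close(fd);
        return idle;
    }
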
55See Documentation/vm/pagemap.txt for more information about /proc/pid/pagemap,
56/proc/kpageflags, and /proc/kpagecgroup.
57
58IMPLEMENTATION DETAILS
59
60The kernel internally keeps track of accesses to user memory pages in order to
61reclaim unreferenced pages first on memory shortage conditions. A page is
62considered referenced if it has been recently accessed via a process address
63space, in which case one or more PTEs it is mapped to will have the Accessed bit
64set, or if it has been marked accessed explicitly by the kernel (see
65mark_page_accessed()). The latter happens when:
66
67 - a userspace process reads or writes a page using a system call (e.g. read(2)
68 or write(2))
69
70 - a page that is used for storing filesystem buffers is read or written,
71   because a process needs filesystem metadata stored in it (e.g. when
72   listing a directory tree)
73
74 - a page is accessed by a device driver using get_user_pages()
75
76When a dirty page is written to swap or disk as a result of memory reclaim or
77exceeding the dirty memory limit, it is not marked referenced.
78
79The idle memory tracking feature adds a new page flag, the Idle flag. This flag
80is set manually, by writing to /sys/kernel/mm/page_idle/bitmap (see the USER API
81section), and cleared automatically whenever a page is referenced as defined
82above.
83
84When a page is marked idle, the Accessed bit must be cleared in all PTEs it is
85mapped to; otherwise we will not be able to detect accesses to the page coming
86from a process address space. To avoid interference with the reclaimer, which,
87as noted above, uses the Accessed bit to promote actively referenced pages, one
88more page flag is introduced, the Young flag. When the PTE Accessed bit is
89cleared as a result of setting or updating a page's Idle flag, the Young flag
90is set on the page. The reclaimer treats the Young flag as an extra PTE
91Accessed bit and therefore will consider such a page as referenced.
92
93Since the idle memory tracking feature is based on the memory reclaimer logic,
94it only works with pages that are on an LRU list; other pages are silently
95ignored. That means it will ignore a user memory page if it is isolated, but
96since there are usually not many of them, it should not affect the overall
97result noticeably. In order not to stall scanning of the idle page bitmap,
98locked pages may be skipped too.
diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt
index 3cd38438242a..0e1e55588b59 100644
--- a/Documentation/vm/pagemap.txt
+++ b/Documentation/vm/pagemap.txt
@@ -5,7 +5,7 @@ pagemap is a new (as of 2.6.25) set of interfaces in the kernel that allow
5userspace programs to examine the page tables and related information by 5userspace programs to examine the page tables and related information by
6reading files in /proc. 6reading files in /proc.
7 7
8There are three components to pagemap: 8There are four components to pagemap:
9 9
10 * /proc/pid/pagemap. This file lets a userspace process find out which 10 * /proc/pid/pagemap. This file lets a userspace process find out which
11 physical frame each virtual page is mapped to. It contains one 64-bit 11 physical frame each virtual page is mapped to. It contains one 64-bit
@@ -70,6 +70,11 @@ There are three components to pagemap:
70 22. THP 70 22. THP
71 23. BALLOON 71 23. BALLOON
72 24. ZERO_PAGE 72 24. ZERO_PAGE
73 25. IDLE
74
75 * /proc/kpagecgroup. This file contains a 64-bit inode number of the
76 memory cgroup each page is charged to, indexed by PFN. Only available when
77 CONFIG_MEMCG is set.
73 78
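As a rough illustration of how this file might be used for such filtering (a
sketch only; the cgroup path in the comment is hypothetical, and the memory
cgroup's inode number is assumed to be obtainable by stat()-ing its directory):

    /* Sketch: does the page at 'pfn' belong to the cgroup with inode 'ino'? */
    #include <fcntl.h>
    #include <stdint.h>
    #include <sys/stat.h>
    #include <unistd.h>

    static int page_in_cgroup(uint64_t pfn, uint64_t ino)
    {
        int fd = open("/proc/kpagecgroup", O_RDONLY);
        uint64_t page_ino;
        int ret = -1;

        if (fd < 0)
            return -1;
        /* One 64-bit memory cgroup inode number per PFN. */
        if (pread(fd, &page_ino, sizeof(page_ino), pfn * 8) == sizeof(page_ino))
            ret = (page_ino == ino);
        close(fd);
        return ret;
    }

    /*
     * 'ino' could come from e.g.:
     *   struct stat st;
     *   stat("/sys/fs/cgroup/memory/<workload>", &st);   (hypothetical path)
     *   uint64_t ino = st.st_ino;
     */
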
74Short descriptions to the page flags: 79Short descriptions to the page flags:
75 80
@@ -116,6 +121,12 @@ Short descriptions to the page flags:
11624. ZERO_PAGE 12124. ZERO_PAGE
117 zero page for pfn_zero or huge_zero page 122 zero page for pfn_zero or huge_zero page
118 123
12425. IDLE
125 page has not been accessed since it was marked idle (see
126 Documentation/vm/idle_page_tracking.txt). Note that this flag may be
127    stale if the page was accessed via a PTE. To make sure the flag
128    is up-to-date, one has to read /sys/kernel/mm/page_idle/bitmap first.
129
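As a rough illustration of that caveat (a sketch only, assuming the flag
numbering listed above and that /proc/kpageflags holds one 64-bit flags word
per PFN), checking the IDLE flag of a single PFN might look like:

    /* Sketch: test the IDLE flag (bit 25) of one PFN via /proc/kpageflags. */
    #include <fcntl.h>
    #include <stdint.h>
    #include <unistd.h>

    static int page_is_idle(uint64_t pfn)
    {
        uint64_t word, flags;
        int idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDONLY);
        int kpf_fd = open("/proc/kpageflags", O_RDONLY);
        int ret = -1;

        if (idle_fd < 0 || kpf_fd < 0)
            goto out;
        /* Reading the page_idle bitmap first brings the flag up to date. */
        if (pread(idle_fd, &word, sizeof(word), (pfn / 64) * 8) != sizeof(word))
            goto out;
        /* One 64-bit flags word per PFN; IDLE is flag number 25. */
        if (pread(kpf_fd, &flags, sizeof(flags), pfn * 8) != sizeof(flags))
            goto out;
        ret = (int)((flags >> 25) & 1);
    out:
        if (idle_fd >= 0)
            close(idle_fd);
        if (kpf_fd >= 0)
            close(kpf_fd);
        return ret;
    }
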
119 [IO related page flags] 130 [IO related page flags]
120 1. ERROR IO error occurred 131 1. ERROR IO error occurred
121 3. UPTODATE page has up-to-date data 132 3. UPTODATE page has up-to-date data
diff --git a/Documentation/vm/zswap.txt b/Documentation/vm/zswap.txt
index 8458c0861e4e..89fff7d611cc 100644
--- a/Documentation/vm/zswap.txt
+++ b/Documentation/vm/zswap.txt
@@ -32,7 +32,7 @@ can also be enabled and disabled at runtime using the sysfs interface.
32An example command to enable zswap at runtime, assuming sysfs is mounted 32An example command to enable zswap at runtime, assuming sysfs is mounted
33at /sys, is: 33at /sys, is:
34 34
35echo 1 > /sys/modules/zswap/parameters/enabled 35echo 1 > /sys/module/zswap/parameters/enabled
36 36
37When zswap is disabled at runtime it will stop storing pages that are 37When zswap is disabled at runtime it will stop storing pages that are
38being swapped out. However, it will _not_ immediately write out or fault 38being swapped out. However, it will _not_ immediately write out or fault
@@ -49,14 +49,26 @@ Zswap receives pages for compression through the Frontswap API and is able to
49evict pages from its own compressed pool on an LRU basis and write them back to 49evict pages from its own compressed pool on an LRU basis and write them back to
50the backing swap device in the case that the compressed pool is full. 50the backing swap device in the case that the compressed pool is full.
51 51
52Zswap makes use of zbud for managing the compressed memory pool. Each 52Zswap makes use of zpool for managing the compressed memory pool. Each
53allocation in zbud is not directly accessible by address. Rather, a handle is 53allocation in zpool is not directly accessible by address. Rather, a handle is
54returned by the allocation routine and that handle must be mapped before being 54returned by the allocation routine and that handle must be mapped before being
55accessed. The compressed memory pool grows on demand and shrinks as compressed 55accessed. The compressed memory pool grows on demand and shrinks as compressed
56pages are freed. The pool is not preallocated. 56pages are freed. The pool is not preallocated. By default, a zpool of type
57zbud is created, but it can be selected at boot time by setting the "zpool"
58attribute, e.g. zswap.zpool=zbud. It can also be changed at runtime using the
59sysfs "zpool" attribute, e.g.
60
61echo zbud > /sys/module/zswap/parameters/zpool
62
63The zbud type zpool allocates exactly 1 page to store 2 compressed pages, which
64means the compression ratio will always be 2:1 or worse (because of half-full
65zbud pages). The zsmalloc type zpool has a more complex compressed page
66storage method, and it can achieve greater storage densities. However,
67zsmalloc does not implement compressed page eviction, so once zswap fills, it
68cannot evict the oldest page; it can only reject new pages.
57 69
58When a swap page is passed from frontswap to zswap, zswap maintains a mapping 70When a swap page is passed from frontswap to zswap, zswap maintains a mapping
59of the swap entry, a combination of the swap type and swap offset, to the zbud 71of the swap entry, a combination of the swap type and swap offset, to the zpool
60handle that references that compressed swap page. This mapping is achieved 72handle that references that compressed swap page. This mapping is achieved
61with a red-black tree per swap type. The swap offset is the search key for the 73with a red-black tree per swap type. The swap offset is the search key for the
62tree nodes. 74tree nodes.
@@ -74,9 +86,17 @@ controlled policy:
74* max_pool_percent - The maximum percentage of memory that the compressed 86* max_pool_percent - The maximum percentage of memory that the compressed
75 pool can occupy. 87 pool can occupy.
76 88
77Zswap allows the compressor to be selected at kernel boot time by setting the 89The default compressor is lzo, but it can be selected at boot time by setting
78“compressor” attribute. The default compressor is lzo. e.g. 90the “compressor” attribute, e.g. zswap.compressor=lzo. It can also be changed
79zswap.compressor=deflate 91at runtime using the sysfs "compressor" attribute, e.g.
92
93echo lzo > /sys/module/zswap/parameters/compressor
94
95When the zpool and/or compressor parameter is changed at runtime, any existing
96compressed pages are not modified; they are left in their own zpool. When a
97request is made for a page in an old zpool, it is uncompressed using its
98original compressor. Once all pages are removed from an old zpool, the zpool
99and its compressor are freed.
80 100
81A debugfs interface is provided for various statistics about pool size, number 101A debugfs interface is provided for various statistics about pool size, number
82of pages stored, and various counters for the reasons pages are rejected. 102of pages stored, and various counters for the reasons pages are rejected.
diff --git a/MAINTAINERS b/MAINTAINERS
index 67a4443daed9..310da4295c70 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8199,10 +8199,9 @@ F: drivers/hwmon/pmbus/
8199F: include/linux/i2c/pmbus.h 8199F: include/linux/i2c/pmbus.h
8200 8200
8201PMC SIERRA MaxRAID DRIVER 8201PMC SIERRA MaxRAID DRIVER
8202M: Anil Ravindranath <anil_ravindranath@pmc-sierra.com>
8203L: linux-scsi@vger.kernel.org 8202L: linux-scsi@vger.kernel.org
8204W: http://www.pmc-sierra.com/ 8203W: http://www.pmc-sierra.com/
8205S: Supported 8204S: Orphan
8206F: drivers/scsi/pmcraid.* 8205F: drivers/scsi/pmcraid.*
8207 8206
8208PMC SIERRA PM8001 DRIVER 8207PMC SIERRA PM8001 DRIVER
diff --git a/arch/Kconfig b/arch/Kconfig
index 8f3564930580..4e949e58b192 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -2,6 +2,9 @@
2# General architecture dependent options 2# General architecture dependent options
3# 3#
4 4
5config KEXEC_CORE
6 bool
7
5config OPROFILE 8config OPROFILE
6 tristate "OProfile system profiling" 9 tristate "OProfile system profiling"
7 depends on PROFILING 10 depends on PROFILING
diff --git a/arch/alpha/include/asm/dma-mapping.h b/arch/alpha/include/asm/dma-mapping.h
index dfa32f061320..72a8ca7796d9 100644
--- a/arch/alpha/include/asm/dma-mapping.h
+++ b/arch/alpha/include/asm/dma-mapping.h
@@ -12,42 +12,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
12 12
13#include <asm-generic/dma-mapping-common.h> 13#include <asm-generic/dma-mapping-common.h>
14 14
15#define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL)
16
17static inline void *dma_alloc_attrs(struct device *dev, size_t size,
18 dma_addr_t *dma_handle, gfp_t gfp,
19 struct dma_attrs *attrs)
20{
21 return get_dma_ops(dev)->alloc(dev, size, dma_handle, gfp, attrs);
22}
23
24#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
25
26static inline void dma_free_attrs(struct device *dev, size_t size,
27 void *vaddr, dma_addr_t dma_handle,
28 struct dma_attrs *attrs)
29{
30 get_dma_ops(dev)->free(dev, size, vaddr, dma_handle, attrs);
31}
32
33static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
34{
35 return get_dma_ops(dev)->mapping_error(dev, dma_addr);
36}
37
38static inline int dma_supported(struct device *dev, u64 mask)
39{
40 return get_dma_ops(dev)->dma_supported(dev, mask);
41}
42
43static inline int dma_set_mask(struct device *dev, u64 mask)
44{
45 return get_dma_ops(dev)->set_dma_mask(dev, mask);
46}
47
48#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
49#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
50
51#define dma_cache_sync(dev, va, size, dir) ((void)0) 15#define dma_cache_sync(dev, va, size, dir) ((void)0)
52 16
53#endif /* _ALPHA_DMA_MAPPING_H */ 17#endif /* _ALPHA_DMA_MAPPING_H */
diff --git a/arch/alpha/kernel/pci-noop.c b/arch/alpha/kernel/pci-noop.c
index df24b76f9246..2b1f4a1e9272 100644
--- a/arch/alpha/kernel/pci-noop.c
+++ b/arch/alpha/kernel/pci-noop.c
@@ -166,15 +166,6 @@ static int alpha_noop_supported(struct device *dev, u64 mask)
166 return mask < 0x00ffffffUL ? 0 : 1; 166 return mask < 0x00ffffffUL ? 0 : 1;
167} 167}
168 168
169static int alpha_noop_set_mask(struct device *dev, u64 mask)
170{
171 if (!dev->dma_mask || !dma_supported(dev, mask))
172 return -EIO;
173
174 *dev->dma_mask = mask;
175 return 0;
176}
177
178struct dma_map_ops alpha_noop_ops = { 169struct dma_map_ops alpha_noop_ops = {
179 .alloc = alpha_noop_alloc_coherent, 170 .alloc = alpha_noop_alloc_coherent,
180 .free = alpha_noop_free_coherent, 171 .free = alpha_noop_free_coherent,
@@ -182,7 +173,6 @@ struct dma_map_ops alpha_noop_ops = {
182 .map_sg = alpha_noop_map_sg, 173 .map_sg = alpha_noop_map_sg,
183 .mapping_error = alpha_noop_mapping_error, 174 .mapping_error = alpha_noop_mapping_error,
184 .dma_supported = alpha_noop_supported, 175 .dma_supported = alpha_noop_supported,
185 .set_dma_mask = alpha_noop_set_mask,
186}; 176};
187 177
188struct dma_map_ops *dma_ops = &alpha_noop_ops; 178struct dma_map_ops *dma_ops = &alpha_noop_ops;
diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c
index eddee7720343..8969bf2dfe3a 100644
--- a/arch/alpha/kernel/pci_iommu.c
+++ b/arch/alpha/kernel/pci_iommu.c
@@ -939,16 +939,6 @@ static int alpha_pci_mapping_error(struct device *dev, dma_addr_t dma_addr)
939 return dma_addr == 0; 939 return dma_addr == 0;
940} 940}
941 941
942static int alpha_pci_set_mask(struct device *dev, u64 mask)
943{
944 if (!dev->dma_mask ||
945 !pci_dma_supported(alpha_gendev_to_pci(dev), mask))
946 return -EIO;
947
948 *dev->dma_mask = mask;
949 return 0;
950}
951
952struct dma_map_ops alpha_pci_ops = { 942struct dma_map_ops alpha_pci_ops = {
953 .alloc = alpha_pci_alloc_coherent, 943 .alloc = alpha_pci_alloc_coherent,
954 .free = alpha_pci_free_coherent, 944 .free = alpha_pci_free_coherent,
@@ -958,7 +948,6 @@ struct dma_map_ops alpha_pci_ops = {
958 .unmap_sg = alpha_pci_unmap_sg, 948 .unmap_sg = alpha_pci_unmap_sg,
959 .mapping_error = alpha_pci_mapping_error, 949 .mapping_error = alpha_pci_mapping_error,
960 .dma_supported = alpha_pci_supported, 950 .dma_supported = alpha_pci_supported,
961 .set_dma_mask = alpha_pci_set_mask,
962}; 951};
963 952
964struct dma_map_ops *dma_ops = &alpha_pci_ops; 953struct dma_map_ops *dma_ops = &alpha_pci_ops;
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 0d1b717e1eca..72ad724c67ae 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -2020,6 +2020,7 @@ config KEXEC
2020 bool "Kexec system call (EXPERIMENTAL)" 2020 bool "Kexec system call (EXPERIMENTAL)"
2021 depends on (!SMP || PM_SLEEP_SMP) 2021 depends on (!SMP || PM_SLEEP_SMP)
2022 depends on !CPU_V7M 2022 depends on !CPU_V7M
2023 select KEXEC_CORE
2023 help 2024 help
2024 kexec is a system call that implements the ability to shutdown your 2025 kexec is a system call that implements the ability to shutdown your
2025 current kernel, and to start another kernel. It is like a reboot 2026 current kernel, and to start another kernel. It is like a reboot
diff --git a/arch/arm/boot/compressed/decompress.c b/arch/arm/boot/compressed/decompress.c
index bd245d34952d..a0765e7ed6c7 100644
--- a/arch/arm/boot/compressed/decompress.c
+++ b/arch/arm/boot/compressed/decompress.c
@@ -57,5 +57,5 @@ extern char * strstr(const char * s1, const char *s2);
57 57
58int do_decompress(u8 *input, int len, u8 *output, void (*error)(char *x)) 58int do_decompress(u8 *input, int len, u8 *output, void (*error)(char *x))
59{ 59{
60 return decompress(input, len, NULL, NULL, output, NULL, error); 60 return __decompress(input, len, NULL, NULL, output, 0, NULL, error);
61} 61}
diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index a68b9d8a71fe..ccb3aa64640d 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -8,7 +8,6 @@
8#include <linux/dma-attrs.h> 8#include <linux/dma-attrs.h>
9#include <linux/dma-debug.h> 9#include <linux/dma-debug.h>
10 10
11#include <asm-generic/dma-coherent.h>
12#include <asm/memory.h> 11#include <asm/memory.h>
13 12
14#include <xen/xen.h> 13#include <xen/xen.h>
@@ -39,12 +38,15 @@ static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops)
39 dev->archdata.dma_ops = ops; 38 dev->archdata.dma_ops = ops;
40} 39}
41 40
42#include <asm-generic/dma-mapping-common.h> 41#define HAVE_ARCH_DMA_SUPPORTED 1
42extern int dma_supported(struct device *dev, u64 mask);
43 43
44static inline int dma_set_mask(struct device *dev, u64 mask) 44/*
45{ 45 * Note that while the generic code provides dummy dma_{alloc,free}_noncoherent
46 return get_dma_ops(dev)->set_dma_mask(dev, mask); 46 * implementations, we don't provide a dma_cache_sync function so drivers using
47} 47 * this API are highlighted with build warnings.
48 */
49#include <asm-generic/dma-mapping-common.h>
48 50
49#ifdef __arch_page_to_dma 51#ifdef __arch_page_to_dma
50#error Please update to __arch_pfn_to_dma 52#error Please update to __arch_pfn_to_dma
@@ -167,32 +169,6 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
167 169
168static inline void dma_mark_clean(void *addr, size_t size) { } 170static inline void dma_mark_clean(void *addr, size_t size) { }
169 171
170/*
171 * DMA errors are defined by all-bits-set in the DMA address.
172 */
173static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
174{
175 debug_dma_mapping_error(dev, dma_addr);
176 return dma_addr == DMA_ERROR_CODE;
177}
178
179/*
180 * Dummy noncoherent implementation. We don't provide a dma_cache_sync
181 * function so drivers using this API are highlighted with build warnings.
182 */
183static inline void *dma_alloc_noncoherent(struct device *dev, size_t size,
184 dma_addr_t *handle, gfp_t gfp)
185{
186 return NULL;
187}
188
189static inline void dma_free_noncoherent(struct device *dev, size_t size,
190 void *cpu_addr, dma_addr_t handle)
191{
192}
193
194extern int dma_supported(struct device *dev, u64 mask);
195
196extern int arm_dma_set_mask(struct device *dev, u64 dma_mask); 172extern int arm_dma_set_mask(struct device *dev, u64 dma_mask);
197 173
198/** 174/**
@@ -209,21 +185,6 @@ extern int arm_dma_set_mask(struct device *dev, u64 dma_mask);
209extern void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, 185extern void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
210 gfp_t gfp, struct dma_attrs *attrs); 186 gfp_t gfp, struct dma_attrs *attrs);
211 187
212#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
213
214static inline void *dma_alloc_attrs(struct device *dev, size_t size,
215 dma_addr_t *dma_handle, gfp_t flag,
216 struct dma_attrs *attrs)
217{
218 struct dma_map_ops *ops = get_dma_ops(dev);
219 void *cpu_addr;
220 BUG_ON(!ops);
221
222 cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
223 debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
224 return cpu_addr;
225}
226
227/** 188/**
228 * arm_dma_free - free memory allocated by arm_dma_alloc 189 * arm_dma_free - free memory allocated by arm_dma_alloc
229 * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices 190 * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
@@ -241,19 +202,6 @@ static inline void *dma_alloc_attrs(struct device *dev, size_t size,
241extern void arm_dma_free(struct device *dev, size_t size, void *cpu_addr, 202extern void arm_dma_free(struct device *dev, size_t size, void *cpu_addr,
242 dma_addr_t handle, struct dma_attrs *attrs); 203 dma_addr_t handle, struct dma_attrs *attrs);
243 204
244#define dma_free_coherent(d, s, c, h) dma_free_attrs(d, s, c, h, NULL)
245
246static inline void dma_free_attrs(struct device *dev, size_t size,
247 void *cpu_addr, dma_addr_t dma_handle,
248 struct dma_attrs *attrs)
249{
250 struct dma_map_ops *ops = get_dma_ops(dev);
251 BUG_ON(!ops);
252
253 debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
254 ops->free(dev, size, cpu_addr, dma_handle, attrs);
255}
256
257/** 205/**
258 * arm_dma_mmap - map a coherent DMA allocation into user space 206 * arm_dma_mmap - map a coherent DMA allocation into user space
259 * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices 207 * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index bf35abcc7d59..e62604384945 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -676,10 +676,6 @@ void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
676 gfp_t gfp, struct dma_attrs *attrs) 676 gfp_t gfp, struct dma_attrs *attrs)
677{ 677{
678 pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL); 678 pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
679 void *memory;
680
681 if (dma_alloc_from_coherent(dev, size, handle, &memory))
682 return memory;
683 679
684 return __dma_alloc(dev, size, handle, gfp, prot, false, 680 return __dma_alloc(dev, size, handle, gfp, prot, false,
685 attrs, __builtin_return_address(0)); 681 attrs, __builtin_return_address(0));
@@ -688,11 +684,6 @@ void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
688static void *arm_coherent_dma_alloc(struct device *dev, size_t size, 684static void *arm_coherent_dma_alloc(struct device *dev, size_t size,
689 dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs) 685 dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs)
690{ 686{
691 void *memory;
692
693 if (dma_alloc_from_coherent(dev, size, handle, &memory))
694 return memory;
695
696 return __dma_alloc(dev, size, handle, gfp, PAGE_KERNEL, true, 687 return __dma_alloc(dev, size, handle, gfp, PAGE_KERNEL, true,
697 attrs, __builtin_return_address(0)); 688 attrs, __builtin_return_address(0));
698} 689}
@@ -752,9 +743,6 @@ static void __arm_dma_free(struct device *dev, size_t size, void *cpu_addr,
752 struct page *page = pfn_to_page(dma_to_pfn(dev, handle)); 743 struct page *page = pfn_to_page(dma_to_pfn(dev, handle));
753 bool want_vaddr = !dma_get_attr(DMA_ATTR_NO_KERNEL_MAPPING, attrs); 744 bool want_vaddr = !dma_get_attr(DMA_ATTR_NO_KERNEL_MAPPING, attrs);
754 745
755 if (dma_release_from_coherent(dev, get_order(size), cpu_addr))
756 return;
757
758 size = PAGE_ALIGN(size); 746 size = PAGE_ALIGN(size);
759 747
760 if (nommu()) { 748 if (nommu()) {
diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h
index f0d6d0bfe55c..cfdb34bedbcd 100644
--- a/arch/arm64/include/asm/dma-mapping.h
+++ b/arch/arm64/include/asm/dma-mapping.h
@@ -22,8 +22,6 @@
22#include <linux/types.h> 22#include <linux/types.h>
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24 24
25#include <asm-generic/dma-coherent.h>
26
27#include <xen/xen.h> 25#include <xen/xen.h>
28#include <asm/xen/hypervisor.h> 26#include <asm/xen/hypervisor.h>
29 27
@@ -86,28 +84,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dev_addr)
86 return (phys_addr_t)dev_addr; 84 return (phys_addr_t)dev_addr;
87} 85}
88 86
89static inline int dma_mapping_error(struct device *dev, dma_addr_t dev_addr)
90{
91 struct dma_map_ops *ops = get_dma_ops(dev);
92 debug_dma_mapping_error(dev, dev_addr);
93 return ops->mapping_error(dev, dev_addr);
94}
95
96static inline int dma_supported(struct device *dev, u64 mask)
97{
98 struct dma_map_ops *ops = get_dma_ops(dev);
99 return ops->dma_supported(dev, mask);
100}
101
102static inline int dma_set_mask(struct device *dev, u64 mask)
103{
104 if (!dev->dma_mask || !dma_supported(dev, mask))
105 return -EIO;
106 *dev->dma_mask = mask;
107
108 return 0;
109}
110
111static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) 87static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
112{ 88{
113 if (!dev->dma_mask) 89 if (!dev->dma_mask)
@@ -120,50 +96,5 @@ static inline void dma_mark_clean(void *addr, size_t size)
120{ 96{
121} 97}
122 98
123#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
124#define dma_free_coherent(d, s, h, f) dma_free_attrs(d, s, h, f, NULL)
125
126static inline void *dma_alloc_attrs(struct device *dev, size_t size,
127 dma_addr_t *dma_handle, gfp_t flags,
128 struct dma_attrs *attrs)
129{
130 struct dma_map_ops *ops = get_dma_ops(dev);
131 void *vaddr;
132
133 if (dma_alloc_from_coherent(dev, size, dma_handle, &vaddr))
134 return vaddr;
135
136 vaddr = ops->alloc(dev, size, dma_handle, flags, attrs);
137 debug_dma_alloc_coherent(dev, size, *dma_handle, vaddr);
138 return vaddr;
139}
140
141static inline void dma_free_attrs(struct device *dev, size_t size,
142 void *vaddr, dma_addr_t dev_addr,
143 struct dma_attrs *attrs)
144{
145 struct dma_map_ops *ops = get_dma_ops(dev);
146
147 if (dma_release_from_coherent(dev, get_order(size), vaddr))
148 return;
149
150 debug_dma_free_coherent(dev, size, vaddr, dev_addr);
151 ops->free(dev, size, vaddr, dev_addr, attrs);
152}
153
154/*
155 * There is no dma_cache_sync() implementation, so just return NULL here.
156 */
157static inline void *dma_alloc_noncoherent(struct device *dev, size_t size,
158 dma_addr_t *handle, gfp_t flags)
159{
160 return NULL;
161}
162
163static inline void dma_free_noncoherent(struct device *dev, size_t size,
164 void *cpu_addr, dma_addr_t handle)
165{
166}
167
168#endif /* __KERNEL__ */ 99#endif /* __KERNEL__ */
169#endif /* __ASM_DMA_MAPPING_H */ 100#endif /* __ASM_DMA_MAPPING_H */
diff --git a/arch/h8300/boot/compressed/misc.c b/arch/h8300/boot/compressed/misc.c
index 704274127c07..c4f2cfcb117b 100644
--- a/arch/h8300/boot/compressed/misc.c
+++ b/arch/h8300/boot/compressed/misc.c
@@ -70,5 +70,5 @@ void decompress_kernel(void)
70 free_mem_ptr = (unsigned long)&_end; 70 free_mem_ptr = (unsigned long)&_end;
71 free_mem_end_ptr = free_mem_ptr + HEAP_SIZE; 71 free_mem_end_ptr = free_mem_ptr + HEAP_SIZE;
72 72
73 decompress(input_data, input_len, NULL, NULL, output, NULL, error); 73 __decompress(input_data, input_len, NULL, NULL, output, 0, NULL, error);
74} 74}
diff --git a/arch/h8300/include/asm/dma-mapping.h b/arch/h8300/include/asm/dma-mapping.h
index 6e67a90902f2..d9b5b806afe6 100644
--- a/arch/h8300/include/asm/dma-mapping.h
+++ b/arch/h8300/include/asm/dma-mapping.h
@@ -1,8 +1,6 @@
1#ifndef _H8300_DMA_MAPPING_H 1#ifndef _H8300_DMA_MAPPING_H
2#define _H8300_DMA_MAPPING_H 2#define _H8300_DMA_MAPPING_H
3 3
4#include <asm-generic/dma-coherent.h>
5
6extern struct dma_map_ops h8300_dma_map_ops; 4extern struct dma_map_ops h8300_dma_map_ops;
7 5
8static inline struct dma_map_ops *get_dma_ops(struct device *dev) 6static inline struct dma_map_ops *get_dma_ops(struct device *dev)
@@ -12,46 +10,4 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
12 10
13#include <asm-generic/dma-mapping-common.h> 11#include <asm-generic/dma-mapping-common.h>
14 12
15static inline int dma_supported(struct device *dev, u64 mask)
16{
17 return 0;
18}
19
20static inline int dma_set_mask(struct device *dev, u64 mask)
21{
22 return 0;
23}
24
25#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
26#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
27
28#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
29
30static inline void *dma_alloc_attrs(struct device *dev, size_t size,
31 dma_addr_t *dma_handle, gfp_t flag,
32 struct dma_attrs *attrs)
33{
34 struct dma_map_ops *ops = get_dma_ops(dev);
35 void *memory;
36
37 memory = ops->alloc(dev, size, dma_handle, flag, attrs);
38 return memory;
39}
40
41#define dma_free_coherent(d, s, c, h) dma_free_attrs(d, s, c, h, NULL)
42
43static inline void dma_free_attrs(struct device *dev, size_t size,
44 void *cpu_addr, dma_addr_t dma_handle,
45 struct dma_attrs *attrs)
46{
47 struct dma_map_ops *ops = get_dma_ops(dev);
48
49 ops->free(dev, size, cpu_addr, dma_handle, attrs);
50}
51
52static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
53{
54 return 0;
55}
56
57#endif 13#endif
diff --git a/arch/hexagon/include/asm/dma-mapping.h b/arch/hexagon/include/asm/dma-mapping.h
index 16965427f6b4..268fde8a4575 100644
--- a/arch/hexagon/include/asm/dma-mapping.h
+++ b/arch/hexagon/include/asm/dma-mapping.h
@@ -31,12 +31,10 @@
31 31
32struct device; 32struct device;
33extern int bad_dma_address; 33extern int bad_dma_address;
34#define DMA_ERROR_CODE bad_dma_address
34 35
35extern struct dma_map_ops *dma_ops; 36extern struct dma_map_ops *dma_ops;
36 37
37#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
38#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
39
40static inline struct dma_map_ops *get_dma_ops(struct device *dev) 38static inline struct dma_map_ops *get_dma_ops(struct device *dev)
41{ 39{
42 if (unlikely(dev == NULL)) 40 if (unlikely(dev == NULL))
@@ -45,8 +43,8 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
45 return dma_ops; 43 return dma_ops;
46} 44}
47 45
46#define HAVE_ARCH_DMA_SUPPORTED 1
48extern int dma_supported(struct device *dev, u64 mask); 47extern int dma_supported(struct device *dev, u64 mask);
49extern int dma_set_mask(struct device *dev, u64 mask);
50extern int dma_is_consistent(struct device *dev, dma_addr_t dma_handle); 48extern int dma_is_consistent(struct device *dev, dma_addr_t dma_handle);
51extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size, 49extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
52 enum dma_data_direction direction); 50 enum dma_data_direction direction);
@@ -60,47 +58,4 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
60 return addr + size - 1 <= *dev->dma_mask; 58 return addr + size - 1 <= *dev->dma_mask;
61} 59}
62 60
63static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
64{
65 struct dma_map_ops *dma_ops = get_dma_ops(dev);
66
67 if (dma_ops->mapping_error)
68 return dma_ops->mapping_error(dev, dma_addr);
69
70 return (dma_addr == bad_dma_address);
71}
72
73#define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL)
74
75static inline void *dma_alloc_attrs(struct device *dev, size_t size,
76 dma_addr_t *dma_handle, gfp_t flag,
77 struct dma_attrs *attrs)
78{
79 void *ret;
80 struct dma_map_ops *ops = get_dma_ops(dev);
81
82 BUG_ON(!dma_ops);
83
84 ret = ops->alloc(dev, size, dma_handle, flag, attrs);
85
86 debug_dma_alloc_coherent(dev, size, *dma_handle, ret);
87
88 return ret;
89}
90
91#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
92
93static inline void dma_free_attrs(struct device *dev, size_t size,
94 void *cpu_addr, dma_addr_t dma_handle,
95 struct dma_attrs *attrs)
96{
97 struct dma_map_ops *dma_ops = get_dma_ops(dev);
98
99 BUG_ON(!dma_ops);
100
101 dma_ops->free(dev, size, cpu_addr, dma_handle, attrs);
102
103 debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
104}
105
106#endif 61#endif
diff --git a/arch/hexagon/kernel/dma.c b/arch/hexagon/kernel/dma.c
index b74f9bae31a3..9e3ddf792bd3 100644
--- a/arch/hexagon/kernel/dma.c
+++ b/arch/hexagon/kernel/dma.c
@@ -44,17 +44,6 @@ int dma_supported(struct device *dev, u64 mask)
44} 44}
45EXPORT_SYMBOL(dma_supported); 45EXPORT_SYMBOL(dma_supported);
46 46
47int dma_set_mask(struct device *dev, u64 mask)
48{
49 if (!dev->dma_mask || !dma_supported(dev, mask))
50 return -EIO;
51
52 *dev->dma_mask = mask;
53
54 return 0;
55}
56EXPORT_SYMBOL(dma_set_mask);
57
58static struct gen_pool *coherent_pool; 47static struct gen_pool *coherent_pool;
59 48
60 49
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 42a91a7aa2b0..eb0249e37981 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -518,6 +518,7 @@ source "drivers/sn/Kconfig"
518config KEXEC 518config KEXEC
519 bool "kexec system call" 519 bool "kexec system call"
520 depends on !IA64_HP_SIM && (!SMP || HOTPLUG_CPU) 520 depends on !IA64_HP_SIM && (!SMP || HOTPLUG_CPU)
521 select KEXEC_CORE
521 help 522 help
522 kexec is a system call that implements the ability to shutdown your 523 kexec is a system call that implements the ability to shutdown your
523 current kernel, and to start another kernel. It is like a reboot 524 current kernel, and to start another kernel. It is like a reboot
diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h
index cf3ab7e784b5..9beccf8010bd 100644
--- a/arch/ia64/include/asm/dma-mapping.h
+++ b/arch/ia64/include/asm/dma-mapping.h
@@ -23,60 +23,10 @@ extern void machvec_dma_sync_single(struct device *, dma_addr_t, size_t,
23extern void machvec_dma_sync_sg(struct device *, struct scatterlist *, int, 23extern void machvec_dma_sync_sg(struct device *, struct scatterlist *, int,
24 enum dma_data_direction); 24 enum dma_data_direction);
25 25
26#define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL)
27
28static inline void *dma_alloc_attrs(struct device *dev, size_t size,
29 dma_addr_t *daddr, gfp_t gfp,
30 struct dma_attrs *attrs)
31{
32 struct dma_map_ops *ops = platform_dma_get_ops(dev);
33 void *caddr;
34
35 caddr = ops->alloc(dev, size, daddr, gfp, attrs);
36 debug_dma_alloc_coherent(dev, size, *daddr, caddr);
37 return caddr;
38}
39
40#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
41
42static inline void dma_free_attrs(struct device *dev, size_t size,
43 void *caddr, dma_addr_t daddr,
44 struct dma_attrs *attrs)
45{
46 struct dma_map_ops *ops = platform_dma_get_ops(dev);
47 debug_dma_free_coherent(dev, size, caddr, daddr);
48 ops->free(dev, size, caddr, daddr, attrs);
49}
50
51#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
52#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
53
54#define get_dma_ops(dev) platform_dma_get_ops(dev) 26#define get_dma_ops(dev) platform_dma_get_ops(dev)
55 27
56#include <asm-generic/dma-mapping-common.h> 28#include <asm-generic/dma-mapping-common.h>
57 29
58static inline int dma_mapping_error(struct device *dev, dma_addr_t daddr)
59{
60 struct dma_map_ops *ops = platform_dma_get_ops(dev);
61 debug_dma_mapping_error(dev, daddr);
62 return ops->mapping_error(dev, daddr);
63}
64
65static inline int dma_supported(struct device *dev, u64 mask)
66{
67 struct dma_map_ops *ops = platform_dma_get_ops(dev);
68 return ops->dma_supported(dev, mask);
69}
70
71static inline int
72dma_set_mask (struct device *dev, u64 mask)
73{
74 if (!dev->dma_mask || !dma_supported(dev, mask))
75 return -EIO;
76 *dev->dma_mask = mask;
77 return 0;
78}
79
80static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) 30static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
81{ 31{
82 if (!dev->dma_mask) 32 if (!dev->dma_mask)
diff --git a/arch/m32r/boot/compressed/misc.c b/arch/m32r/boot/compressed/misc.c
index 28a09529f206..3a7692745868 100644
--- a/arch/m32r/boot/compressed/misc.c
+++ b/arch/m32r/boot/compressed/misc.c
@@ -86,6 +86,7 @@ decompress_kernel(int mmu_on, unsigned char *zimage_data,
86 free_mem_end_ptr = free_mem_ptr + BOOT_HEAP_SIZE; 86 free_mem_end_ptr = free_mem_ptr + BOOT_HEAP_SIZE;
87 87
88 puts("\nDecompressing Linux... "); 88 puts("\nDecompressing Linux... ");
89 decompress(input_data, input_len, NULL, NULL, output_data, NULL, error); 89 __decompress(input_data, input_len, NULL, NULL, output_data, 0,
90 NULL, error);
90 puts("done.\nBooting the kernel.\n"); 91 puts("done.\nBooting the kernel.\n");
91} 92}
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 2dd8f63bfbbb..498b567f007b 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -95,6 +95,7 @@ config MMU_SUN3
95config KEXEC 95config KEXEC
96 bool "kexec system call" 96 bool "kexec system call"
97 depends on M68KCLASSIC 97 depends on M68KCLASSIC
98 select KEXEC_CORE
98 help 99 help
99 kexec is a system call that implements the ability to shutdown your 100 kexec is a system call that implements the ability to shutdown your
100 current kernel, and to start another kernel. It is like a reboot 101 current kernel, and to start another kernel. It is like a reboot
diff --git a/arch/microblaze/include/asm/dma-mapping.h b/arch/microblaze/include/asm/dma-mapping.h
index ab353723076a..24b12970c9cf 100644
--- a/arch/microblaze/include/asm/dma-mapping.h
+++ b/arch/microblaze/include/asm/dma-mapping.h
@@ -27,7 +27,6 @@
27#include <linux/dma-debug.h> 27#include <linux/dma-debug.h>
28#include <linux/dma-attrs.h> 28#include <linux/dma-attrs.h>
29#include <asm/io.h> 29#include <asm/io.h>
30#include <asm-generic/dma-coherent.h>
31#include <asm/cacheflush.h> 30#include <asm/cacheflush.h>
32 31
33#define DMA_ERROR_CODE (~(dma_addr_t)0x0) 32#define DMA_ERROR_CODE (~(dma_addr_t)0x0)
@@ -45,31 +44,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
45 return &dma_direct_ops; 44 return &dma_direct_ops;
46} 45}
47 46
48static inline int dma_supported(struct device *dev, u64 mask)
49{
50 struct dma_map_ops *ops = get_dma_ops(dev);
51
52 if (unlikely(!ops))
53 return 0;
54 if (!ops->dma_supported)
55 return 1;
56 return ops->dma_supported(dev, mask);
57}
58
59static inline int dma_set_mask(struct device *dev, u64 dma_mask)
60{
61 struct dma_map_ops *ops = get_dma_ops(dev);
62
63 if (unlikely(ops == NULL))
64 return -EIO;
65 if (ops->set_dma_mask)
66 return ops->set_dma_mask(dev, dma_mask);
67 if (!dev->dma_mask || !dma_supported(dev, dma_mask))
68 return -EIO;
69 *dev->dma_mask = dma_mask;
70 return 0;
71}
72
73#include <asm-generic/dma-mapping-common.h> 47#include <asm-generic/dma-mapping-common.h>
74 48
75static inline void __dma_sync(unsigned long paddr, 49static inline void __dma_sync(unsigned long paddr,
@@ -88,50 +62,6 @@ static inline void __dma_sync(unsigned long paddr,
88 } 62 }
89} 63}
90 64
91static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
92{
93 struct dma_map_ops *ops = get_dma_ops(dev);
94
95 debug_dma_mapping_error(dev, dma_addr);
96 if (ops->mapping_error)
97 return ops->mapping_error(dev, dma_addr);
98
99 return (dma_addr == DMA_ERROR_CODE);
100}
101
102#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
103#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
104
105#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
106
107static inline void *dma_alloc_attrs(struct device *dev, size_t size,
108 dma_addr_t *dma_handle, gfp_t flag,
109 struct dma_attrs *attrs)
110{
111 struct dma_map_ops *ops = get_dma_ops(dev);
112 void *memory;
113
114 BUG_ON(!ops);
115
116 memory = ops->alloc(dev, size, dma_handle, flag, attrs);
117
118 debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
119 return memory;
120}
121
122#define dma_free_coherent(d,s,c,h) dma_free_attrs(d, s, c, h, NULL)
123
124static inline void dma_free_attrs(struct device *dev, size_t size,
125 void *cpu_addr, dma_addr_t dma_handle,
126 struct dma_attrs *attrs)
127{
128 struct dma_map_ops *ops = get_dma_ops(dev);
129
130 BUG_ON(!ops);
131 debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
132 ops->free(dev, size, cpu_addr, dma_handle, attrs);
133}
134
135static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, 65static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
136 enum dma_data_direction direction) 66 enum dma_data_direction direction)
137{ 67{
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 752acca8de1f..e3aa5b0b4ef1 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2597,6 +2597,7 @@ source "kernel/Kconfig.preempt"
2597 2597
2598config KEXEC 2598config KEXEC
2599 bool "Kexec system call" 2599 bool "Kexec system call"
2600 select KEXEC_CORE
2600 help 2601 help
2601 kexec is a system call that implements the ability to shutdown your 2602 kexec is a system call that implements the ability to shutdown your
2602 current kernel, and to start another kernel. It is like a reboot 2603 current kernel, and to start another kernel. It is like a reboot
diff --git a/arch/mips/boot/compressed/decompress.c b/arch/mips/boot/compressed/decompress.c
index 54831069a206..080cd53bac36 100644
--- a/arch/mips/boot/compressed/decompress.c
+++ b/arch/mips/boot/compressed/decompress.c
@@ -111,8 +111,8 @@ void decompress_kernel(unsigned long boot_heap_start)
111 puts("\n"); 111 puts("\n");
112 112
113 /* Decompress the kernel with according algorithm */ 113 /* Decompress the kernel with according algorithm */
114 decompress((char *)zimage_start, zimage_size, 0, 0, 114 __decompress((char *)zimage_start, zimage_size, 0, 0,
115 (void *)VMLINUX_LOAD_ADDRESS_ULL, 0, error); 115 (void *)VMLINUX_LOAD_ADDRESS_ULL, 0, 0, error);
116 116
117 /* FIXME: should we flush cache here? */ 117 /* FIXME: should we flush cache here? */
118 puts("Now, booting the kernel...\n"); 118 puts("Now, booting the kernel...\n");
diff --git a/arch/mips/cavium-octeon/dma-octeon.c b/arch/mips/cavium-octeon/dma-octeon.c
index d8960d46417b..2cd45f5f9481 100644
--- a/arch/mips/cavium-octeon/dma-octeon.c
+++ b/arch/mips/cavium-octeon/dma-octeon.c
@@ -161,9 +161,6 @@ static void *octeon_dma_alloc_coherent(struct device *dev, size_t size,
161{ 161{
162 void *ret; 162 void *ret;
163 163
164 if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
165 return ret;
166
167 /* ignore region specifiers */ 164 /* ignore region specifiers */
168 gfp &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); 165 gfp &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
169 166
@@ -194,11 +191,6 @@ static void *octeon_dma_alloc_coherent(struct device *dev, size_t size,
194static void octeon_dma_free_coherent(struct device *dev, size_t size, 191static void octeon_dma_free_coherent(struct device *dev, size_t size,
195 void *vaddr, dma_addr_t dma_handle, struct dma_attrs *attrs) 192 void *vaddr, dma_addr_t dma_handle, struct dma_attrs *attrs)
196{ 193{
197 int order = get_order(size);
198
199 if (dma_release_from_coherent(dev, order, vaddr))
200 return;
201
202 swiotlb_free_coherent(dev, size, vaddr, dma_handle); 194 swiotlb_free_coherent(dev, size, vaddr, dma_handle);
203} 195}
204 196
diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h
index 360b3387182a..e604f760c4a0 100644
--- a/arch/mips/include/asm/dma-mapping.h
+++ b/arch/mips/include/asm/dma-mapping.h
@@ -4,7 +4,6 @@
4#include <linux/scatterlist.h> 4#include <linux/scatterlist.h>
5#include <asm/dma-coherence.h> 5#include <asm/dma-coherence.h>
6#include <asm/cache.h> 6#include <asm/cache.h>
7#include <asm-generic/dma-coherent.h>
8 7
9#ifndef CONFIG_SGI_IP27 /* Kludge to fix 2.6.39 build for IP27 */ 8#ifndef CONFIG_SGI_IP27 /* Kludge to fix 2.6.39 build for IP27 */
10#include <dma-coherence.h> 9#include <dma-coherence.h>
@@ -32,73 +31,7 @@ static inline void dma_mark_clean(void *addr, size_t size) {}
32 31
33#include <asm-generic/dma-mapping-common.h> 32#include <asm-generic/dma-mapping-common.h>
34 33
35static inline int dma_supported(struct device *dev, u64 mask)
36{
37 struct dma_map_ops *ops = get_dma_ops(dev);
38 return ops->dma_supported(dev, mask);
39}
40
41static inline int dma_mapping_error(struct device *dev, u64 mask)
42{
43 struct dma_map_ops *ops = get_dma_ops(dev);
44
45 debug_dma_mapping_error(dev, mask);
46 return ops->mapping_error(dev, mask);
47}
48
49static inline int
50dma_set_mask(struct device *dev, u64 mask)
51{
52 struct dma_map_ops *ops = get_dma_ops(dev);
53
54 if(!dev->dma_mask || !dma_supported(dev, mask))
55 return -EIO;
56
57 if (ops->set_dma_mask)
58 return ops->set_dma_mask(dev, mask);
59
60 *dev->dma_mask = mask;
61
62 return 0;
63}
64
65extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size, 34extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
66 enum dma_data_direction direction); 35 enum dma_data_direction direction);
67 36
68#define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL)
69
70static inline void *dma_alloc_attrs(struct device *dev, size_t size,
71 dma_addr_t *dma_handle, gfp_t gfp,
72 struct dma_attrs *attrs)
73{
74 void *ret;
75 struct dma_map_ops *ops = get_dma_ops(dev);
76
77 ret = ops->alloc(dev, size, dma_handle, gfp, attrs);
78
79 debug_dma_alloc_coherent(dev, size, *dma_handle, ret);
80
81 return ret;
82}
83
84#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
85
86static inline void dma_free_attrs(struct device *dev, size_t size,
87 void *vaddr, dma_addr_t dma_handle,
88 struct dma_attrs *attrs)
89{
90 struct dma_map_ops *ops = get_dma_ops(dev);
91
92 ops->free(dev, size, vaddr, dma_handle, attrs);
93
94 debug_dma_free_coherent(dev, size, vaddr, dma_handle);
95}
96
97
98void *dma_alloc_noncoherent(struct device *dev, size_t size,
99 dma_addr_t *dma_handle, gfp_t flag);
100
101void dma_free_noncoherent(struct device *dev, size_t size,
102 void *vaddr, dma_addr_t dma_handle);
103
104#endif /* _ASM_DMA_MAPPING_H */ 37#endif /* _ASM_DMA_MAPPING_H */
diff --git a/arch/mips/loongson64/common/dma-swiotlb.c b/arch/mips/loongson64/common/dma-swiotlb.c
index 2c6b989c1bc4..4ffa6fc81c8f 100644
--- a/arch/mips/loongson64/common/dma-swiotlb.c
+++ b/arch/mips/loongson64/common/dma-swiotlb.c
@@ -14,9 +14,6 @@ static void *loongson_dma_alloc_coherent(struct device *dev, size_t size,
14{ 14{
15 void *ret; 15 void *ret;
16 16
17 if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
18 return ret;
19
20 /* ignore region specifiers */ 17 /* ignore region specifiers */
21 gfp &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); 18 gfp &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
22 19
@@ -46,11 +43,6 @@ static void *loongson_dma_alloc_coherent(struct device *dev, size_t size,
46static void loongson_dma_free_coherent(struct device *dev, size_t size, 43static void loongson_dma_free_coherent(struct device *dev, size_t size,
47 void *vaddr, dma_addr_t dma_handle, struct dma_attrs *attrs) 44 void *vaddr, dma_addr_t dma_handle, struct dma_attrs *attrs)
48{ 45{
49 int order = get_order(size);
50
51 if (dma_release_from_coherent(dev, order, vaddr))
52 return;
53
54 swiotlb_free_coherent(dev, size, vaddr, dma_handle); 46 swiotlb_free_coherent(dev, size, vaddr, dma_handle);
55} 47}
56 48
@@ -93,6 +85,9 @@ static void loongson_dma_sync_sg_for_device(struct device *dev,
93 85
94static int loongson_dma_set_mask(struct device *dev, u64 mask) 86static int loongson_dma_set_mask(struct device *dev, u64 mask)
95{ 87{
88 if (!dev->dma_mask || !dma_supported(dev, mask))
89 return -EIO;
90
96 if (mask > DMA_BIT_MASK(loongson_sysconf.dma_mask_bits)) { 91 if (mask > DMA_BIT_MASK(loongson_sysconf.dma_mask_bits)) {
97 *dev->dma_mask = DMA_BIT_MASK(loongson_sysconf.dma_mask_bits); 92 *dev->dma_mask = DMA_BIT_MASK(loongson_sysconf.dma_mask_bits);
98 return -EIO; 93 return -EIO;
diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c
index 8f23cf08f4ba..a914dc1cb6d1 100644
--- a/arch/mips/mm/dma-default.c
+++ b/arch/mips/mm/dma-default.c
@@ -112,7 +112,7 @@ static gfp_t massage_gfp_flags(const struct device *dev, gfp_t gfp)
112 return gfp | dma_flag; 112 return gfp | dma_flag;
113} 113}
114 114
115void *dma_alloc_noncoherent(struct device *dev, size_t size, 115static void *mips_dma_alloc_noncoherent(struct device *dev, size_t size,
116 dma_addr_t * dma_handle, gfp_t gfp) 116 dma_addr_t * dma_handle, gfp_t gfp)
117{ 117{
118 void *ret; 118 void *ret;
@@ -128,7 +128,6 @@ void *dma_alloc_noncoherent(struct device *dev, size_t size,
128 128
129 return ret; 129 return ret;
130} 130}
131EXPORT_SYMBOL(dma_alloc_noncoherent);
132 131
133static void *mips_dma_alloc_coherent(struct device *dev, size_t size, 132static void *mips_dma_alloc_coherent(struct device *dev, size_t size,
134 dma_addr_t * dma_handle, gfp_t gfp, struct dma_attrs *attrs) 133 dma_addr_t * dma_handle, gfp_t gfp, struct dma_attrs *attrs)
@@ -137,8 +136,12 @@ static void *mips_dma_alloc_coherent(struct device *dev, size_t size,
137 struct page *page = NULL; 136 struct page *page = NULL;
138 unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; 137 unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
139 138
140 if (dma_alloc_from_coherent(dev, size, dma_handle, &ret)) 139 /*
141 return ret; 140 * XXX: seems like the coherent and non-coherent implementations could
141 * be consolidated.
142 */
143 if (dma_get_attr(DMA_ATTR_NON_CONSISTENT, attrs))
144 return mips_dma_alloc_noncoherent(dev, size, dma_handle, gfp);
142 145
143 gfp = massage_gfp_flags(dev, gfp); 146 gfp = massage_gfp_flags(dev, gfp);
144 147
@@ -164,24 +167,24 @@ static void *mips_dma_alloc_coherent(struct device *dev, size_t size,
164} 167}
165 168
166 169
167void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr, 170static void mips_dma_free_noncoherent(struct device *dev, size_t size,
168 dma_addr_t dma_handle) 171 void *vaddr, dma_addr_t dma_handle)
169{ 172{
170 plat_unmap_dma_mem(dev, dma_handle, size, DMA_BIDIRECTIONAL); 173 plat_unmap_dma_mem(dev, dma_handle, size, DMA_BIDIRECTIONAL);
171 free_pages((unsigned long) vaddr, get_order(size)); 174 free_pages((unsigned long) vaddr, get_order(size));
172} 175}
173EXPORT_SYMBOL(dma_free_noncoherent);
174 176
175static void mips_dma_free_coherent(struct device *dev, size_t size, void *vaddr, 177static void mips_dma_free_coherent(struct device *dev, size_t size, void *vaddr,
176 dma_addr_t dma_handle, struct dma_attrs *attrs) 178 dma_addr_t dma_handle, struct dma_attrs *attrs)
177{ 179{
178 unsigned long addr = (unsigned long) vaddr; 180 unsigned long addr = (unsigned long) vaddr;
179 int order = get_order(size);
180 unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; 181 unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
181 struct page *page = NULL; 182 struct page *page = NULL;
182 183
183 if (dma_release_from_coherent(dev, order, vaddr)) 184 if (dma_get_attr(DMA_ATTR_NON_CONSISTENT, attrs)) {
185 mips_dma_free_noncoherent(dev, size, vaddr, dma_handle);
184 return; 186 return;
187 }
185 188
186 plat_unmap_dma_mem(dev, dma_handle, size, DMA_BIDIRECTIONAL); 189 plat_unmap_dma_mem(dev, dma_handle, size, DMA_BIDIRECTIONAL);
187 190
diff --git a/arch/mips/netlogic/common/nlm-dma.c b/arch/mips/netlogic/common/nlm-dma.c
index f3d4ae87abc7..3758715d4ab6 100644
--- a/arch/mips/netlogic/common/nlm-dma.c
+++ b/arch/mips/netlogic/common/nlm-dma.c
@@ -47,11 +47,6 @@ static char *nlm_swiotlb;
47static void *nlm_dma_alloc_coherent(struct device *dev, size_t size, 47static void *nlm_dma_alloc_coherent(struct device *dev, size_t size,
48 dma_addr_t *dma_handle, gfp_t gfp, struct dma_attrs *attrs) 48 dma_addr_t *dma_handle, gfp_t gfp, struct dma_attrs *attrs)
49{ 49{
50 void *ret;
51
52 if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
53 return ret;
54
55 /* ignore region specifiers */ 50 /* ignore region specifiers */
56 gfp &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); 51 gfp &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
57 52
@@ -69,11 +64,6 @@ static void *nlm_dma_alloc_coherent(struct device *dev, size_t size,
69static void nlm_dma_free_coherent(struct device *dev, size_t size, 64static void nlm_dma_free_coherent(struct device *dev, size_t size,
70 void *vaddr, dma_addr_t dma_handle, struct dma_attrs *attrs) 65 void *vaddr, dma_addr_t dma_handle, struct dma_attrs *attrs)
71{ 66{
72 int order = get_order(size);
73
74 if (dma_release_from_coherent(dev, order, vaddr))
75 return;
76
77 swiotlb_free_coherent(dev, size, vaddr, dma_handle); 67 swiotlb_free_coherent(dev, size, vaddr, dma_handle);
78} 68}
79 69
diff --git a/arch/openrisc/include/asm/dma-mapping.h b/arch/openrisc/include/asm/dma-mapping.h
index fab8628e1b6e..413bfcf86384 100644
--- a/arch/openrisc/include/asm/dma-mapping.h
+++ b/arch/openrisc/include/asm/dma-mapping.h
@@ -23,7 +23,6 @@
23 */ 23 */
24 24
25#include <linux/dma-debug.h> 25#include <linux/dma-debug.h>
26#include <asm-generic/dma-coherent.h>
27#include <linux/kmemcheck.h> 26#include <linux/kmemcheck.h>
28#include <linux/dma-mapping.h> 27#include <linux/dma-mapping.h>
29 28
@@ -36,75 +35,13 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
36 return &or1k_dma_map_ops; 35 return &or1k_dma_map_ops;
37} 36}
38 37
39#include <asm-generic/dma-mapping-common.h> 38#define HAVE_ARCH_DMA_SUPPORTED 1
40
41#define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL)
42
43static inline void *dma_alloc_attrs(struct device *dev, size_t size,
44 dma_addr_t *dma_handle, gfp_t gfp,
45 struct dma_attrs *attrs)
46{
47 struct dma_map_ops *ops = get_dma_ops(dev);
48 void *memory;
49
50 memory = ops->alloc(dev, size, dma_handle, gfp, attrs);
51
52 debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
53
54 return memory;
55}
56
57#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
58
59static inline void dma_free_attrs(struct device *dev, size_t size,
60 void *cpu_addr, dma_addr_t dma_handle,
61 struct dma_attrs *attrs)
62{
63 struct dma_map_ops *ops = get_dma_ops(dev);
64
65 debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
66
67 ops->free(dev, size, cpu_addr, dma_handle, attrs);
68}
69
70static inline void *dma_alloc_noncoherent(struct device *dev, size_t size,
71 dma_addr_t *dma_handle, gfp_t gfp)
72{
73 struct dma_attrs attrs;
74
75 dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs);
76
77 return dma_alloc_attrs(dev, size, dma_handle, gfp, &attrs);
78}
79
80static inline void dma_free_noncoherent(struct device *dev, size_t size,
81 void *cpu_addr, dma_addr_t dma_handle)
82{
83 struct dma_attrs attrs;
84
85 dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs);
86
87 dma_free_attrs(dev, size, cpu_addr, dma_handle, &attrs);
88}
89
90static inline int dma_supported(struct device *dev, u64 dma_mask) 39static inline int dma_supported(struct device *dev, u64 dma_mask)
91{ 40{
92 /* Support 32 bit DMA mask exclusively */ 41 /* Support 32 bit DMA mask exclusively */
93 return dma_mask == DMA_BIT_MASK(32); 42 return dma_mask == DMA_BIT_MASK(32);
94} 43}
95 44
96static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) 45#include <asm-generic/dma-mapping-common.h>
97{
98 return 0;
99}
100
101static inline int dma_set_mask(struct device *dev, u64 dma_mask)
102{
103 if (!dev->dma_mask || !dma_supported(dev, dma_mask))
104 return -EIO;
105
106 *dev->dma_mask = dma_mask;
107 46
108 return 0;
109}
110#endif /* __ASM_OPENRISC_DMA_MAPPING_H */ 47#endif /* __ASM_OPENRISC_DMA_MAPPING_H */
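
openrisc keeps its own dma_supported() (32-bit masks only) and tells the common header not to emit the generic one by defining HAVE_ARCH_DMA_SUPPORTED before the include. A sketch of how the generic header is expected to key off that macro (exact wording in the tree may differ):

#ifndef HAVE_ARCH_DMA_SUPPORTED
static inline int dma_supported(struct device *dev, u64 mask)
{
        struct dma_map_ops *ops = get_dma_ops(dev);

        if (!ops)
                return 0;
        if (!ops->dma_supported)
                return 1;
        return ops->dma_supported(dev, mask);
}
#endif
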
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index b447918b9e2c..9a7057ec2154 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -420,6 +420,7 @@ config PPC64_SUPPORTS_MEMORY_FAILURE
420config KEXEC 420config KEXEC
421 bool "kexec system call" 421 bool "kexec system call"
422 depends on (PPC_BOOK3S || FSL_BOOKE || (44x && !SMP)) 422 depends on (PPC_BOOK3S || FSL_BOOKE || (44x && !SMP))
423 select KEXEC_CORE
423 help 424 help
424 kexec is a system call that implements the ability to shutdown your 425 kexec is a system call that implements the ability to shutdown your
425 current kernel, and to start another kernel. It is like a reboot 426 current kernel, and to start another kernel. It is like a reboot
diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
index 710f60e380e0..7f522c021dc3 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -18,7 +18,9 @@
18#include <asm/io.h> 18#include <asm/io.h>
19#include <asm/swiotlb.h> 19#include <asm/swiotlb.h>
20 20
21#ifdef CONFIG_PPC64
21#define DMA_ERROR_CODE (~(dma_addr_t)0x0) 22#define DMA_ERROR_CODE (~(dma_addr_t)0x0)
23#endif
22 24
23/* Some dma direct funcs must be visible for use in other dma_ops */ 25/* Some dma direct funcs must be visible for use in other dma_ops */
24extern void *__dma_direct_alloc_coherent(struct device *dev, size_t size, 26extern void *__dma_direct_alloc_coherent(struct device *dev, size_t size,
@@ -120,71 +122,14 @@ static inline void set_dma_offset(struct device *dev, dma_addr_t off)
120/* this will be removed soon */ 122/* this will be removed soon */
121#define flush_write_buffers() 123#define flush_write_buffers()
122 124
123#include <asm-generic/dma-mapping-common.h> 125#define HAVE_ARCH_DMA_SET_MASK 1
124 126extern int dma_set_mask(struct device *dev, u64 dma_mask);
125static inline int dma_supported(struct device *dev, u64 mask)
126{
127 struct dma_map_ops *dma_ops = get_dma_ops(dev);
128 127
129 if (unlikely(dma_ops == NULL)) 128#include <asm-generic/dma-mapping-common.h>
130 return 0;
131 if (dma_ops->dma_supported == NULL)
132 return 1;
133 return dma_ops->dma_supported(dev, mask);
134}
135 129
136extern int dma_set_mask(struct device *dev, u64 dma_mask);
137extern int __dma_set_mask(struct device *dev, u64 dma_mask); 130extern int __dma_set_mask(struct device *dev, u64 dma_mask);
138extern u64 __dma_get_required_mask(struct device *dev); 131extern u64 __dma_get_required_mask(struct device *dev);
139 132
140#define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL)
141
142static inline void *dma_alloc_attrs(struct device *dev, size_t size,
143 dma_addr_t *dma_handle, gfp_t flag,
144 struct dma_attrs *attrs)
145{
146 struct dma_map_ops *dma_ops = get_dma_ops(dev);
147 void *cpu_addr;
148
149 BUG_ON(!dma_ops);
150
151 cpu_addr = dma_ops->alloc(dev, size, dma_handle, flag, attrs);
152
153 debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
154
155 return cpu_addr;
156}
157
158#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
159
160static inline void dma_free_attrs(struct device *dev, size_t size,
161 void *cpu_addr, dma_addr_t dma_handle,
162 struct dma_attrs *attrs)
163{
164 struct dma_map_ops *dma_ops = get_dma_ops(dev);
165
166 BUG_ON(!dma_ops);
167
168 debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
169
170 dma_ops->free(dev, size, cpu_addr, dma_handle, attrs);
171}
172
173static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
174{
175 struct dma_map_ops *dma_ops = get_dma_ops(dev);
176
177 debug_dma_mapping_error(dev, dma_addr);
178 if (dma_ops->mapping_error)
179 return dma_ops->mapping_error(dev, dma_addr);
180
181#ifdef CONFIG_PPC64
182 return (dma_addr == DMA_ERROR_CODE);
183#else
184 return 0;
185#endif
186}
187
188static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) 133static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
189{ 134{
190#ifdef CONFIG_SWIOTLB 135#ifdef CONFIG_SWIOTLB
@@ -210,9 +155,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
210 return daddr - get_dma_offset(dev); 155 return daddr - get_dma_offset(dev);
211} 156}
212 157
213#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
214#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
215
216#define ARCH_HAS_DMA_MMAP_COHERENT 158#define ARCH_HAS_DMA_MMAP_COHERENT
217 159
218static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, 160static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
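
powerpc keeps its out-of-line dma_set_mask() (it routes through __dma_set_mask() and platform hooks), so it defines HAVE_ARCH_DMA_SET_MASK before including the common header; architectures that do not define it get a generic fallback instead. A sketch of that fallback, assuming it follows the per-arch copies being deleted in this patch:

#ifndef HAVE_ARCH_DMA_SET_MASK
static inline int dma_set_mask(struct device *dev, u64 mask)
{
        struct dma_map_ops *ops = get_dma_ops(dev);

        if (ops->set_dma_mask)
                return ops->set_dma_mask(dev, mask);

        if (!dev->dma_mask || !dma_supported(dev, mask))
                return -EIO;

        *dev->dma_mask = mask;
        return 0;
}
#endif
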
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 4827870f7a6d..1d57000b1b24 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -48,6 +48,7 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
48 48
49config KEXEC 49config KEXEC
50 def_bool y 50 def_bool y
51 select KEXEC_CORE
51 52
52config AUDIT_ARCH 53config AUDIT_ARCH
53 def_bool y 54 def_bool y
diff --git a/arch/s390/boot/compressed/misc.c b/arch/s390/boot/compressed/misc.c
index 42506b371b74..4da604ebf6fd 100644
--- a/arch/s390/boot/compressed/misc.c
+++ b/arch/s390/boot/compressed/misc.c
@@ -167,7 +167,7 @@ unsigned long decompress_kernel(void)
167#endif 167#endif
168 168
169 puts("Uncompressing Linux... "); 169 puts("Uncompressing Linux... ");
170 decompress(input_data, input_len, NULL, NULL, output, NULL, error); 170 __decompress(input_data, input_len, NULL, NULL, output, 0, NULL, error);
171 puts("Ok, booting the kernel.\n"); 171 puts("Ok, booting the kernel.\n");
172 return (unsigned long) output; 172 return (unsigned long) output;
173} 173}
diff --git a/arch/s390/include/asm/dma-mapping.h b/arch/s390/include/asm/dma-mapping.h
index 9d395961e713..b3fd54d93dd2 100644
--- a/arch/s390/include/asm/dma-mapping.h
+++ b/arch/s390/include/asm/dma-mapping.h
@@ -18,27 +18,13 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
18 return &s390_dma_ops; 18 return &s390_dma_ops;
19} 19}
20 20
21extern int dma_set_mask(struct device *dev, u64 mask);
22
23static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, 21static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
24 enum dma_data_direction direction) 22 enum dma_data_direction direction)
25{ 23{
26} 24}
27 25
28#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
29#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
30
31#include <asm-generic/dma-mapping-common.h> 26#include <asm-generic/dma-mapping-common.h>
32 27
33static inline int dma_supported(struct device *dev, u64 mask)
34{
35 struct dma_map_ops *dma_ops = get_dma_ops(dev);
36
37 if (dma_ops->dma_supported == NULL)
38 return 1;
39 return dma_ops->dma_supported(dev, mask);
40}
41
42static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) 28static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
43{ 29{
44 if (!dev->dma_mask) 30 if (!dev->dma_mask)
@@ -46,45 +32,4 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
46 return addr + size - 1 <= *dev->dma_mask; 32 return addr + size - 1 <= *dev->dma_mask;
47} 33}
48 34
49static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
50{
51 struct dma_map_ops *dma_ops = get_dma_ops(dev);
52
53 debug_dma_mapping_error(dev, dma_addr);
54 if (dma_ops->mapping_error)
55 return dma_ops->mapping_error(dev, dma_addr);
56 return dma_addr == DMA_ERROR_CODE;
57}
58
59#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
60
61static inline void *dma_alloc_attrs(struct device *dev, size_t size,
62 dma_addr_t *dma_handle, gfp_t flags,
63 struct dma_attrs *attrs)
64{
65 struct dma_map_ops *ops = get_dma_ops(dev);
66 void *cpu_addr;
67
68 BUG_ON(!ops);
69
70 cpu_addr = ops->alloc(dev, size, dma_handle, flags, attrs);
71 debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
72
73 return cpu_addr;
74}
75
76#define dma_free_coherent(d, s, c, h) dma_free_attrs(d, s, c, h, NULL)
77
78static inline void dma_free_attrs(struct device *dev, size_t size,
79 void *cpu_addr, dma_addr_t dma_handle,
80 struct dma_attrs *attrs)
81{
82 struct dma_map_ops *ops = get_dma_ops(dev);
83
84 BUG_ON(!ops);
85
86 debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
87 ops->free(dev, size, cpu_addr, dma_handle, attrs);
88}
89
90#endif /* _ASM_S390_DMA_MAPPING_H */ 35#endif /* _ASM_S390_DMA_MAPPING_H */
diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index 42b76580c8b8..37505b8b4093 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -262,16 +262,6 @@ out:
262 spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags); 262 spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags);
263} 263}
264 264
265int dma_set_mask(struct device *dev, u64 mask)
266{
267 if (!dev->dma_mask || !dma_supported(dev, mask))
268 return -EIO;
269
270 *dev->dma_mask = mask;
271 return 0;
272}
273EXPORT_SYMBOL_GPL(dma_set_mask);
274
275static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page, 265static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page,
276 unsigned long offset, size_t size, 266 unsigned long offset, size_t size,
277 enum dma_data_direction direction, 267 enum dma_data_direction direction,
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 50057fed819d..d514df7e04dd 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -602,6 +602,7 @@ source kernel/Kconfig.hz
602config KEXEC 602config KEXEC
603 bool "kexec system call (EXPERIMENTAL)" 603 bool "kexec system call (EXPERIMENTAL)"
604 depends on SUPERH32 && MMU 604 depends on SUPERH32 && MMU
605 select KEXEC_CORE
605 help 606 help
606 kexec is a system call that implements the ability to shutdown your 607 kexec is a system call that implements the ability to shutdown your
607 current kernel, and to start another kernel. It is like a reboot 608 current kernel, and to start another kernel. It is like a reboot
diff --git a/arch/sh/boot/compressed/misc.c b/arch/sh/boot/compressed/misc.c
index 95470a472d2c..208a9753ab38 100644
--- a/arch/sh/boot/compressed/misc.c
+++ b/arch/sh/boot/compressed/misc.c
@@ -132,7 +132,7 @@ void decompress_kernel(void)
132 132
133 puts("Uncompressing Linux... "); 133 puts("Uncompressing Linux... ");
134 cache_control(CACHE_ENABLE); 134 cache_control(CACHE_ENABLE);
135 decompress(input_data, input_len, NULL, NULL, output, NULL, error); 135 __decompress(input_data, input_len, NULL, NULL, output, 0, NULL, error);
136 cache_control(CACHE_DISABLE); 136 cache_control(CACHE_DISABLE);
137 puts("Ok, booting the kernel.\n"); 137 puts("Ok, booting the kernel.\n");
138} 138}
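
The in-place decompressors grew a __decompress() entry point that also takes the size of the output buffer, so the boot stubs switch from decompress() to it; callers that do not know or care about the output size pass 0, as here. The prototype assumed by the converted call sites (the real declaration lives with the decompressors in lib/):

/* assumed prototype; see lib/decompress_*.c */
int __decompress(unsigned char *buf, long len,
                 long (*fill)(void *, unsigned long),
                 long (*flush)(void *, unsigned long),
                 unsigned char *out_buf, long out_len,
                 long *pos,
                 void (*error)(char *x));
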
diff --git a/arch/sh/include/asm/dma-mapping.h b/arch/sh/include/asm/dma-mapping.h
index b437f2c780b8..a3745a3fe029 100644
--- a/arch/sh/include/asm/dma-mapping.h
+++ b/arch/sh/include/asm/dma-mapping.h
@@ -9,86 +9,13 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
9 return dma_ops; 9 return dma_ops;
10} 10}
11 11
12#include <asm-generic/dma-coherent.h> 12#define DMA_ERROR_CODE 0
13#include <asm-generic/dma-mapping-common.h>
14
15static inline int dma_supported(struct device *dev, u64 mask)
16{
17 struct dma_map_ops *ops = get_dma_ops(dev);
18
19 if (ops->dma_supported)
20 return ops->dma_supported(dev, mask);
21
22 return 1;
23}
24
25static inline int dma_set_mask(struct device *dev, u64 mask)
26{
27 struct dma_map_ops *ops = get_dma_ops(dev);
28 13
29 if (!dev->dma_mask || !dma_supported(dev, mask)) 14#include <asm-generic/dma-mapping-common.h>
30 return -EIO;
31 if (ops->set_dma_mask)
32 return ops->set_dma_mask(dev, mask);
33
34 *dev->dma_mask = mask;
35
36 return 0;
37}
38 15
39void dma_cache_sync(struct device *dev, void *vaddr, size_t size, 16void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
40 enum dma_data_direction dir); 17 enum dma_data_direction dir);
41 18
42#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
43#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
44
45static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
46{
47 struct dma_map_ops *ops = get_dma_ops(dev);
48
49 debug_dma_mapping_error(dev, dma_addr);
50 if (ops->mapping_error)
51 return ops->mapping_error(dev, dma_addr);
52
53 return dma_addr == 0;
54}
55
56#define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL)
57
58static inline void *dma_alloc_attrs(struct device *dev, size_t size,
59 dma_addr_t *dma_handle, gfp_t gfp,
60 struct dma_attrs *attrs)
61{
62 struct dma_map_ops *ops = get_dma_ops(dev);
63 void *memory;
64
65 if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
66 return memory;
67 if (!ops->alloc)
68 return NULL;
69
70 memory = ops->alloc(dev, size, dma_handle, gfp, attrs);
71 debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
72
73 return memory;
74}
75
76#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
77
78static inline void dma_free_attrs(struct device *dev, size_t size,
79 void *vaddr, dma_addr_t dma_handle,
80 struct dma_attrs *attrs)
81{
82 struct dma_map_ops *ops = get_dma_ops(dev);
83
84 if (dma_release_from_coherent(dev, get_order(size), vaddr))
85 return;
86
87 debug_dma_free_coherent(dev, size, vaddr, dma_handle);
88 if (ops->free)
89 ops->free(dev, size, vaddr, dma_handle, attrs);
90}
91
92/* arch/sh/mm/consistent.c */ 19/* arch/sh/mm/consistent.c */
93extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, 20extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
94 dma_addr_t *dma_addr, gfp_t flag, 21 dma_addr_t *dma_addr, gfp_t flag,
diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h
index 7e064c68c5ec..a21da597b0b5 100644
--- a/arch/sparc/include/asm/dma-mapping.h
+++ b/arch/sparc/include/asm/dma-mapping.h
@@ -7,11 +7,9 @@
7 7
8#define DMA_ERROR_CODE (~(dma_addr_t)0x0) 8#define DMA_ERROR_CODE (~(dma_addr_t)0x0)
9 9
10#define HAVE_ARCH_DMA_SUPPORTED 1
10int dma_supported(struct device *dev, u64 mask); 11int dma_supported(struct device *dev, u64 mask);
11 12
12#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
13#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
14
15static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, 13static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
16 enum dma_data_direction dir) 14 enum dma_data_direction dir)
17{ 15{
@@ -39,39 +37,7 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
39 return dma_ops; 37 return dma_ops;
40} 38}
41 39
42#include <asm-generic/dma-mapping-common.h> 40#define HAVE_ARCH_DMA_SET_MASK 1
43
44#define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL)
45
46static inline void *dma_alloc_attrs(struct device *dev, size_t size,
47 dma_addr_t *dma_handle, gfp_t flag,
48 struct dma_attrs *attrs)
49{
50 struct dma_map_ops *ops = get_dma_ops(dev);
51 void *cpu_addr;
52
53 cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
54 debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
55 return cpu_addr;
56}
57
58#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
59
60static inline void dma_free_attrs(struct device *dev, size_t size,
61 void *cpu_addr, dma_addr_t dma_handle,
62 struct dma_attrs *attrs)
63{
64 struct dma_map_ops *ops = get_dma_ops(dev);
65
66 debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
67 ops->free(dev, size, cpu_addr, dma_handle, attrs);
68}
69
70static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
71{
72 debug_dma_mapping_error(dev, dma_addr);
73 return (dma_addr == DMA_ERROR_CODE);
74}
75 41
76static inline int dma_set_mask(struct device *dev, u64 mask) 42static inline int dma_set_mask(struct device *dev, u64 mask)
77{ 43{
@@ -86,4 +52,6 @@ static inline int dma_set_mask(struct device *dev, u64 mask)
86 return -EINVAL; 52 return -EINVAL;
87} 53}
88 54
55#include <asm-generic/dma-mapping-common.h>
56
89#endif 57#endif
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 2ba12d761723..106c21bd7f44 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -205,6 +205,7 @@ source "kernel/Kconfig.hz"
205 205
206config KEXEC 206config KEXEC
207 bool "kexec system call" 207 bool "kexec system call"
208 select KEXEC_CORE
208 ---help--- 209 ---help---
209 kexec is a system call that implements the ability to shutdown your 210 kexec is a system call that implements the ability to shutdown your
210 current kernel, and to start another kernel. It is like a reboot 211 current kernel, and to start another kernel. It is like a reboot
diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h
index 1eae359d8315..96ac6cce4a32 100644
--- a/arch/tile/include/asm/dma-mapping.h
+++ b/arch/tile/include/asm/dma-mapping.h
@@ -59,8 +59,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
59 59
60static inline void dma_mark_clean(void *addr, size_t size) {} 60static inline void dma_mark_clean(void *addr, size_t size) {}
61 61
62#include <asm-generic/dma-mapping-common.h>
63
64static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops) 62static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops)
65{ 63{
66 dev->archdata.dma_ops = ops; 64 dev->archdata.dma_ops = ops;
@@ -74,18 +72,9 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
74 return addr + size - 1 <= *dev->dma_mask; 72 return addr + size - 1 <= *dev->dma_mask;
75} 73}
76 74
77static inline int 75#define HAVE_ARCH_DMA_SET_MASK 1
78dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
79{
80 debug_dma_mapping_error(dev, dma_addr);
81 return get_dma_ops(dev)->mapping_error(dev, dma_addr);
82}
83 76
84static inline int 77#include <asm-generic/dma-mapping-common.h>
85dma_supported(struct device *dev, u64 mask)
86{
87 return get_dma_ops(dev)->dma_supported(dev, mask);
88}
89 78
90static inline int 79static inline int
91dma_set_mask(struct device *dev, u64 mask) 80dma_set_mask(struct device *dev, u64 mask)
@@ -116,36 +105,6 @@ dma_set_mask(struct device *dev, u64 mask)
116 return 0; 105 return 0;
117} 106}
118 107
119static inline void *dma_alloc_attrs(struct device *dev, size_t size,
120 dma_addr_t *dma_handle, gfp_t flag,
121 struct dma_attrs *attrs)
122{
123 struct dma_map_ops *dma_ops = get_dma_ops(dev);
124 void *cpu_addr;
125
126 cpu_addr = dma_ops->alloc(dev, size, dma_handle, flag, attrs);
127
128 debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
129
130 return cpu_addr;
131}
132
133static inline void dma_free_attrs(struct device *dev, size_t size,
134 void *cpu_addr, dma_addr_t dma_handle,
135 struct dma_attrs *attrs)
136{
137 struct dma_map_ops *dma_ops = get_dma_ops(dev);
138
139 debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
140
141 dma_ops->free(dev, size, cpu_addr, dma_handle, attrs);
142}
143
144#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
145#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
146#define dma_free_coherent(d, s, v, h) dma_free_attrs(d, s, v, h, NULL)
147#define dma_free_noncoherent(d, s, v, h) dma_free_attrs(d, s, v, h, NULL)
148
149/* 108/*
150 * dma_alloc_noncoherent() is #defined to return coherent memory, 109 * dma_alloc_noncoherent() is #defined to return coherent memory,
151 * so there's no need to do any flushing here. 110 * so there's no need to do any flushing here.
diff --git a/arch/unicore32/boot/compressed/misc.c b/arch/unicore32/boot/compressed/misc.c
index 176d5bda3559..5c65dfee278c 100644
--- a/arch/unicore32/boot/compressed/misc.c
+++ b/arch/unicore32/boot/compressed/misc.c
@@ -119,8 +119,8 @@ unsigned long decompress_kernel(unsigned long output_start,
119 output_ptr = get_unaligned_le32(tmp); 119 output_ptr = get_unaligned_le32(tmp);
120 120
121 arch_decomp_puts("Uncompressing Linux..."); 121 arch_decomp_puts("Uncompressing Linux...");
122 decompress(input_data, input_data_end - input_data, NULL, NULL, 122 __decompress(input_data, input_data_end - input_data, NULL, NULL,
123 output_data, NULL, error); 123 output_data, 0, NULL, error);
124 arch_decomp_puts(" done, booting the kernel.\n"); 124 arch_decomp_puts(" done, booting the kernel.\n");
125 return output_ptr; 125 return output_ptr;
126} 126}
diff --git a/arch/unicore32/include/asm/dma-mapping.h b/arch/unicore32/include/asm/dma-mapping.h
index 366460a81796..8140e053ccd3 100644
--- a/arch/unicore32/include/asm/dma-mapping.h
+++ b/arch/unicore32/include/asm/dma-mapping.h
@@ -18,8 +18,6 @@
18#include <linux/scatterlist.h> 18#include <linux/scatterlist.h>
19#include <linux/swiotlb.h> 19#include <linux/swiotlb.h>
20 20
21#include <asm-generic/dma-coherent.h>
22
23#include <asm/memory.h> 21#include <asm/memory.h>
24#include <asm/cacheflush.h> 22#include <asm/cacheflush.h>
25 23
@@ -30,26 +28,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
30 return &swiotlb_dma_map_ops; 28 return &swiotlb_dma_map_ops;
31} 29}
32 30
33static inline int dma_supported(struct device *dev, u64 mask)
34{
35 struct dma_map_ops *dma_ops = get_dma_ops(dev);
36
37 if (unlikely(dma_ops == NULL))
38 return 0;
39
40 return dma_ops->dma_supported(dev, mask);
41}
42
43static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
44{
45 struct dma_map_ops *dma_ops = get_dma_ops(dev);
46
47 if (dma_ops->mapping_error)
48 return dma_ops->mapping_error(dev, dma_addr);
49
50 return 0;
51}
52
53#include <asm-generic/dma-mapping-common.h> 31#include <asm-generic/dma-mapping-common.h>
54 32
55static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) 33static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
@@ -72,41 +50,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
72 50
73static inline void dma_mark_clean(void *addr, size_t size) {} 51static inline void dma_mark_clean(void *addr, size_t size) {}
74 52
75static inline int dma_set_mask(struct device *dev, u64 dma_mask)
76{
77 if (!dev->dma_mask || !dma_supported(dev, dma_mask))
78 return -EIO;
79
80 *dev->dma_mask = dma_mask;
81
82 return 0;
83}
84
85#define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL)
86
87static inline void *dma_alloc_attrs(struct device *dev, size_t size,
88 dma_addr_t *dma_handle, gfp_t flag,
89 struct dma_attrs *attrs)
90{
91 struct dma_map_ops *dma_ops = get_dma_ops(dev);
92
93 return dma_ops->alloc(dev, size, dma_handle, flag, attrs);
94}
95
96#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
97
98static inline void dma_free_attrs(struct device *dev, size_t size,
99 void *cpu_addr, dma_addr_t dma_handle,
100 struct dma_attrs *attrs)
101{
102 struct dma_map_ops *dma_ops = get_dma_ops(dev);
103
104 dma_ops->free(dev, size, cpu_addr, dma_handle, attrs);
105}
106
107#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
108#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
109
110static inline void dma_cache_sync(struct device *dev, void *vaddr, 53static inline void dma_cache_sync(struct device *dev, void *vaddr,
111 size_t size, enum dma_data_direction direction) 54 size_t size, enum dma_data_direction direction)
112{ 55{
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc0d73eac047..7aef2d52daa0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1754,6 +1754,7 @@ source kernel/Kconfig.hz
1754 1754
1755config KEXEC 1755config KEXEC
1756 bool "kexec system call" 1756 bool "kexec system call"
1757 select KEXEC_CORE
1757 ---help--- 1758 ---help---
1758 kexec is a system call that implements the ability to shutdown your 1759 kexec is a system call that implements the ability to shutdown your
1759 current kernel, and to start another kernel. It is like a reboot 1760 current kernel, and to start another kernel. It is like a reboot
@@ -1770,8 +1771,8 @@ config KEXEC
1770 1771
1771config KEXEC_FILE 1772config KEXEC_FILE
1772 bool "kexec file based system call" 1773 bool "kexec file based system call"
1774 select KEXEC_CORE
1773 select BUILD_BIN2C 1775 select BUILD_BIN2C
1774 depends on KEXEC
1775 depends on X86_64 1776 depends on X86_64
1776 depends on CRYPTO=y 1777 depends on CRYPTO=y
1777 depends on CRYPTO_SHA256=y 1778 depends on CRYPTO_SHA256=y
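
CONFIG_KEXEC_CORE is the new symbol for the shared kexec/crash infrastructure; both KEXEC and the file-based KEXEC_FILE select it, which is why KEXEC_FILE no longer needs "depends on KEXEC" and why the arch code below switches its #ifdefs. Code that only needs the crash machinery now keys off the core symbol, as in this fragment lifted from the pattern applied below:

/* built for both CONFIG_KEXEC=y and CONFIG_KEXEC_FILE=y kernels */
#ifdef CONFIG_KEXEC_CORE
        machine_ops.crash_shutdown = native_machine_crash_shutdown;
#endif
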
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index f63797942bb5..79dac1758e7c 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -448,7 +448,8 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
448#endif 448#endif
449 449
450 debug_putstr("\nDecompressing Linux... "); 450 debug_putstr("\nDecompressing Linux... ");
451 decompress(input_data, input_len, NULL, NULL, output, NULL, error); 451 __decompress(input_data, input_len, NULL, NULL, output, output_len,
452 NULL, error);
452 parse_elf(output); 453 parse_elf(output);
453 /* 454 /*
454 * 32-bit always performs relocations. 64-bit relocations are only 455 * 32-bit always performs relocations. 64-bit relocations are only
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 16ef02596db2..2d6b309c8e9a 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -414,7 +414,7 @@ xloadflags:
414# define XLF23 0 414# define XLF23 0
415#endif 415#endif
416 416
417#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC) 417#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC_CORE)
418# define XLF4 XLF_EFI_KEXEC 418# define XLF4 XLF_EFI_KEXEC
419#else 419#else
420# define XLF4 0 420# define XLF4 0
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 26a46f44e298..b160c0c6baed 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -277,7 +277,7 @@ static const char *gate_vma_name(struct vm_area_struct *vma)
277{ 277{
278 return "[vsyscall]"; 278 return "[vsyscall]";
279} 279}
280static struct vm_operations_struct gate_vma_ops = { 280static const struct vm_operations_struct gate_vma_ops = {
281 .name = gate_vma_name, 281 .name = gate_vma_name,
282}; 282};
283static struct vm_area_struct gate_vma = { 283static struct vm_area_struct gate_vma = {
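
The driver hunks later in this patch apply the same one-line change as gate_vma_ops here: a vm_operations_struct table is never written at run time and vma->vm_ops is already a const-qualified pointer, so the table can be declared const and placed in .rodata. A minimal sketch of the pattern in a hypothetical driver (all names invented; the fault handler signature matches this kernel generation):

#include <linux/fs.h>
#include <linux/mm.h>

static int mydrv_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        return VM_FAULT_SIGBUS;         /* nothing to map in this sketch */
}

/* const: read-only table, ends up in .rodata */
static const struct vm_operations_struct mydrv_vm_ops = {
        .fault = mydrv_vm_fault,
};

static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
        vma->vm_ops = &mydrv_vm_ops;    /* vm_ops is a const pointer member */
        return 0;
}
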
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 1f5b7287d1ad..953b7263f844 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -12,7 +12,6 @@
12#include <linux/dma-attrs.h> 12#include <linux/dma-attrs.h>
13#include <asm/io.h> 13#include <asm/io.h>
14#include <asm/swiotlb.h> 14#include <asm/swiotlb.h>
15#include <asm-generic/dma-coherent.h>
16#include <linux/dma-contiguous.h> 15#include <linux/dma-contiguous.h>
17 16
18#ifdef CONFIG_ISA 17#ifdef CONFIG_ISA
@@ -41,24 +40,13 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
41#endif 40#endif
42} 41}
43 42
44#include <asm-generic/dma-mapping-common.h> 43bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp);
45 44#define arch_dma_alloc_attrs arch_dma_alloc_attrs
46/* Make sure we keep the same behaviour */
47static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
48{
49 struct dma_map_ops *ops = get_dma_ops(dev);
50 debug_dma_mapping_error(dev, dma_addr);
51 if (ops->mapping_error)
52 return ops->mapping_error(dev, dma_addr);
53
54 return (dma_addr == DMA_ERROR_CODE);
55}
56
57#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
58#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
59 45
46#define HAVE_ARCH_DMA_SUPPORTED 1
60extern int dma_supported(struct device *hwdev, u64 mask); 47extern int dma_supported(struct device *hwdev, u64 mask);
61extern int dma_set_mask(struct device *dev, u64 mask); 48
49#include <asm-generic/dma-mapping-common.h>
62 50
63extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, 51extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
64 dma_addr_t *dma_addr, gfp_t flag, 52 dma_addr_t *dma_addr, gfp_t flag,
@@ -125,16 +113,4 @@ static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp)
125 return gfp; 113 return gfp;
126} 114}
127 115
128#define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL)
129
130void *
131dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
132 gfp_t gfp, struct dma_attrs *attrs);
133
134#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
135
136void dma_free_attrs(struct device *dev, size_t size,
137 void *vaddr, dma_addr_t bus,
138 struct dma_attrs *attrs);
139
140#endif 116#endif
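
Instead of a full private dma_alloc_attrs(), x86 now only supplies the arch_dma_alloc_attrs() hook; the common allocator calls it to substitute the fallback device, check DMA capability and massage the gfp flags, and skips the allocation when it returns false. The "#define arch_dma_alloc_attrs arch_dma_alloc_attrs" line is what tells the generic header an override exists; otherwise it presumably falls back to a no-op along these lines:

/* expected generic fallback when an architecture provides no hook */
#ifndef arch_dma_alloc_attrs
#define arch_dma_alloc_attrs(dev, gfp)  (true)
#endif
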
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index 32ce71375b21..b130d59406fb 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -29,7 +29,7 @@ extern void show_trace(struct task_struct *t, struct pt_regs *regs,
29extern void __show_regs(struct pt_regs *regs, int all); 29extern void __show_regs(struct pt_regs *regs, int all);
30extern unsigned long oops_begin(void); 30extern unsigned long oops_begin(void);
31extern void oops_end(unsigned long, struct pt_regs *, int signr); 31extern void oops_end(unsigned long, struct pt_regs *, int signr);
32#ifdef CONFIG_KEXEC 32#ifdef CONFIG_KEXEC_CORE
33extern int in_crash_kexec; 33extern int in_crash_kexec;
34#else 34#else
35/* no crash dump is ever in progress if no crash kernel can be kexec'd */ 35/* no crash dump is ever in progress if no crash kernel can be kexec'd */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 9ffdf25e5b86..b1b78ffe01d0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -71,8 +71,8 @@ obj-$(CONFIG_LIVEPATCH) += livepatch.o
71obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o 71obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
72obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o 72obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
73obj-$(CONFIG_X86_TSC) += trace_clock.o 73obj-$(CONFIG_X86_TSC) += trace_clock.o
74obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o 74obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o
75obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o 75obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o
76obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o 76obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o
77obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 77obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
78obj-y += kprobes/ 78obj-y += kprobes/
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 49487b488061..2c7aafa70702 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -200,7 +200,7 @@ static void kvm_setup_secondary_clock(void)
200 * kind of shutdown from our side, we unregister the clock by writing anything 200 * kind of shutdown from our side, we unregister the clock by writing anything
201 * that does not have the 'enable' bit set in the msr 201 * that does not have the 'enable' bit set in the msr
202 */ 202 */
203#ifdef CONFIG_KEXEC 203#ifdef CONFIG_KEXEC_CORE
204static void kvm_crash_shutdown(struct pt_regs *regs) 204static void kvm_crash_shutdown(struct pt_regs *regs)
205{ 205{
206 native_write_msr(msr_kvm_system_time, 0, 0); 206 native_write_msr(msr_kvm_system_time, 0, 0);
@@ -259,7 +259,7 @@ void __init kvmclock_init(void)
259 x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; 259 x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
260 x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; 260 x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
261 machine_ops.shutdown = kvm_shutdown; 261 machine_ops.shutdown = kvm_shutdown;
262#ifdef CONFIG_KEXEC 262#ifdef CONFIG_KEXEC_CORE
263 machine_ops.crash_shutdown = kvm_crash_shutdown; 263 machine_ops.crash_shutdown = kvm_crash_shutdown;
264#endif 264#endif
265 kvm_get_preset_lpj(); 265 kvm_get_preset_lpj();
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 353972c1946c..84b8ef82a159 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -58,17 +58,6 @@ EXPORT_SYMBOL(x86_dma_fallback_dev);
58/* Number of entries preallocated for DMA-API debugging */ 58/* Number of entries preallocated for DMA-API debugging */
59#define PREALLOC_DMA_DEBUG_ENTRIES 65536 59#define PREALLOC_DMA_DEBUG_ENTRIES 65536
60 60
61int dma_set_mask(struct device *dev, u64 mask)
62{
63 if (!dev->dma_mask || !dma_supported(dev, mask))
64 return -EIO;
65
66 *dev->dma_mask = mask;
67
68 return 0;
69}
70EXPORT_SYMBOL(dma_set_mask);
71
72void __init pci_iommu_alloc(void) 61void __init pci_iommu_alloc(void)
73{ 62{
74 struct iommu_table_entry *p; 63 struct iommu_table_entry *p;
@@ -140,50 +129,19 @@ void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr,
140 free_pages((unsigned long)vaddr, get_order(size)); 129 free_pages((unsigned long)vaddr, get_order(size));
141} 130}
142 131
143void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, 132bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp)
144 gfp_t gfp, struct dma_attrs *attrs)
145{ 133{
146 struct dma_map_ops *ops = get_dma_ops(dev); 134 *gfp = dma_alloc_coherent_gfp_flags(*dev, *gfp);
147 void *memory; 135 *gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
148
149 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
150
151 if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
152 return memory;
153
154 if (!dev)
155 dev = &x86_dma_fallback_dev;
156
157 if (!is_device_dma_capable(dev))
158 return NULL;
159
160 if (!ops->alloc)
161 return NULL;
162
163 memory = ops->alloc(dev, size, dma_handle,
164 dma_alloc_coherent_gfp_flags(dev, gfp), attrs);
165 debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
166
167 return memory;
168}
169EXPORT_SYMBOL(dma_alloc_attrs);
170
171void dma_free_attrs(struct device *dev, size_t size,
172 void *vaddr, dma_addr_t bus,
173 struct dma_attrs *attrs)
174{
175 struct dma_map_ops *ops = get_dma_ops(dev);
176
177 WARN_ON(irqs_disabled()); /* for portability */
178 136
179 if (dma_release_from_coherent(dev, get_order(size), vaddr)) 137 if (!*dev)
180 return; 138 *dev = &x86_dma_fallback_dev;
139 if (!is_device_dma_capable(*dev))
140 return false;
141 return true;
181 142
182 debug_dma_free_coherent(dev, size, vaddr, bus);
183 if (ops->free)
184 ops->free(dev, size, vaddr, bus, attrs);
185} 143}
186EXPORT_SYMBOL(dma_free_attrs); 144EXPORT_SYMBOL(arch_dma_alloc_attrs);
187 145
188/* 146/*
189 * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel 147 * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 86db4bcd7ce5..02693dd9a079 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -673,7 +673,7 @@ struct machine_ops machine_ops = {
673 .emergency_restart = native_machine_emergency_restart, 673 .emergency_restart = native_machine_emergency_restart,
674 .restart = native_machine_restart, 674 .restart = native_machine_restart,
675 .halt = native_machine_halt, 675 .halt = native_machine_halt,
676#ifdef CONFIG_KEXEC 676#ifdef CONFIG_KEXEC_CORE
677 .crash_shutdown = native_machine_crash_shutdown, 677 .crash_shutdown = native_machine_crash_shutdown,
678#endif 678#endif
679}; 679};
@@ -703,7 +703,7 @@ void machine_halt(void)
703 machine_ops.halt(); 703 machine_ops.halt();
704} 704}
705 705
706#ifdef CONFIG_KEXEC 706#ifdef CONFIG_KEXEC_CORE
707void machine_crash_shutdown(struct pt_regs *regs) 707void machine_crash_shutdown(struct pt_regs *regs)
708{ 708{
709 machine_ops.crash_shutdown(regs); 709 machine_ops.crash_shutdown(regs);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index baadbf90a7c5..fdb7f2a2d328 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -478,7 +478,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
478 * --------- Crashkernel reservation ------------------------------ 478 * --------- Crashkernel reservation ------------------------------
479 */ 479 */
480 480
481#ifdef CONFIG_KEXEC 481#ifdef CONFIG_KEXEC_CORE
482 482
483/* 483/*
484 * Keep the crash kernel below this limit. On 32 bits earlier kernels 484 * Keep the crash kernel below this limit. On 32 bits earlier kernels
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 00bf300fd846..74e4bf11f562 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -364,7 +364,7 @@ INIT_PER_CPU(irq_stack_union);
364 364
365#endif /* CONFIG_X86_32 */ 365#endif /* CONFIG_X86_32 */
366 366
367#ifdef CONFIG_KEXEC 367#ifdef CONFIG_KEXEC_CORE
368#include <asm/kexec.h> 368#include <asm/kexec.h>
369 369
370. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, 370. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 148ea2016022..d01986832afc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1264,7 +1264,7 @@ static void vmcs_load(struct vmcs *vmcs)
1264 vmcs, phys_addr); 1264 vmcs, phys_addr);
1265} 1265}
1266 1266
1267#ifdef CONFIG_KEXEC 1267#ifdef CONFIG_KEXEC_CORE
1268/* 1268/*
1269 * This bitmap is used to indicate whether the vmclear 1269 * This bitmap is used to indicate whether the vmclear
1270 * operation is enabled on all cpus. All disabled by 1270 * operation is enabled on all cpus. All disabled by
@@ -1302,7 +1302,7 @@ static void crash_vmclear_local_loaded_vmcss(void)
1302#else 1302#else
1303static inline void crash_enable_local_vmclear(int cpu) { } 1303static inline void crash_enable_local_vmclear(int cpu) { }
1304static inline void crash_disable_local_vmclear(int cpu) { } 1304static inline void crash_disable_local_vmclear(int cpu) { }
1305#endif /* CONFIG_KEXEC */ 1305#endif /* CONFIG_KEXEC_CORE */
1306 1306
1307static void __loaded_vmcs_clear(void *arg) 1307static void __loaded_vmcs_clear(void *arg)
1308{ 1308{
@@ -10411,7 +10411,7 @@ static int __init vmx_init(void)
10411 if (r) 10411 if (r)
10412 return r; 10412 return r;
10413 10413
10414#ifdef CONFIG_KEXEC 10414#ifdef CONFIG_KEXEC_CORE
10415 rcu_assign_pointer(crash_vmclear_loaded_vmcss, 10415 rcu_assign_pointer(crash_vmclear_loaded_vmcss,
10416 crash_vmclear_local_loaded_vmcss); 10416 crash_vmclear_local_loaded_vmcss);
10417#endif 10417#endif
@@ -10421,7 +10421,7 @@ static int __init vmx_init(void)
10421 10421
10422static void __exit vmx_exit(void) 10422static void __exit vmx_exit(void)
10423{ 10423{
10424#ifdef CONFIG_KEXEC 10424#ifdef CONFIG_KEXEC_CORE
10425 RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); 10425 RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
10426 synchronize_rcu(); 10426 synchronize_rcu();
10427#endif 10427#endif
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index db1b0bc5017c..134948b0926f 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -42,58 +42,21 @@ static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm)
42 */ 42 */
43static unsigned long mpx_mmap(unsigned long len) 43static unsigned long mpx_mmap(unsigned long len)
44{ 44{
45 unsigned long ret;
46 unsigned long addr, pgoff;
47 struct mm_struct *mm = current->mm; 45 struct mm_struct *mm = current->mm;
48 vm_flags_t vm_flags; 46 unsigned long addr, populate;
49 struct vm_area_struct *vma;
50 47
51 /* Only bounds table can be allocated here */ 48 /* Only bounds table can be allocated here */
52 if (len != mpx_bt_size_bytes(mm)) 49 if (len != mpx_bt_size_bytes(mm))
53 return -EINVAL; 50 return -EINVAL;
54 51
55 down_write(&mm->mmap_sem); 52 down_write(&mm->mmap_sem);
56 53 addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE,
57 /* Too many mappings? */ 54 MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate);
58 if (mm->map_count > sysctl_max_map_count) {
59 ret = -ENOMEM;
60 goto out;
61 }
62
63 /* Obtain the address to map to. we verify (or select) it and ensure
64 * that it represents a valid section of the address space.
65 */
66 addr = get_unmapped_area(NULL, 0, len, 0, MAP_ANONYMOUS | MAP_PRIVATE);
67 if (addr & ~PAGE_MASK) {
68 ret = addr;
69 goto out;
70 }
71
72 vm_flags = VM_READ | VM_WRITE | VM_MPX |
73 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
74
75 /* Set pgoff according to addr for anon_vma */
76 pgoff = addr >> PAGE_SHIFT;
77
78 ret = mmap_region(NULL, addr, len, vm_flags, pgoff);
79 if (IS_ERR_VALUE(ret))
80 goto out;
81
82 vma = find_vma(mm, ret);
83 if (!vma) {
84 ret = -ENOMEM;
85 goto out;
86 }
87
88 if (vm_flags & VM_LOCKED) {
89 up_write(&mm->mmap_sem);
90 mm_populate(ret, len);
91 return ret;
92 }
93
94out:
95 up_write(&mm->mmap_sem); 55 up_write(&mm->mmap_sem);
96 return ret; 56 if (populate)
57 mm_populate(addr, populate);
58
59 return addr;
97} 60}
98 61
99enum reg_type { 62enum reg_type {
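
The open-coded sequence removed from mpx_mmap() (map_count check, get_unmapped_area(), mmap_region(), find_vma(), VM_LOCKED handling) is exactly what do_mmap() already does; the only thing MPX needs beyond a plain anonymous mapping is the extra VM_MPX flag, which do_mmap() now accepts directly. The populate out-parameter reports how many bytes should be faulted in after mmap_sem is dropped, hence the mm_populate() call above. The signature assumed by that call:

/* signature assumed by the call above; see mm/mmap.c */
unsigned long do_mmap(struct file *file, unsigned long addr,
                      unsigned long len, unsigned long prot,
                      unsigned long flags, vm_flags_t vm_flags,
                      unsigned long pgoff, unsigned long *populate);
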
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index e4308fe6afe8..1db84c0758b7 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -650,7 +650,7 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md)
650 650
651static void __init save_runtime_map(void) 651static void __init save_runtime_map(void)
652{ 652{
653#ifdef CONFIG_KEXEC 653#ifdef CONFIG_KEXEC_CORE
654 efi_memory_desc_t *md; 654 efi_memory_desc_t *md;
655 void *tmp, *p, *q = NULL; 655 void *tmp, *p, *q = NULL;
656 int count = 0; 656 int count = 0;
@@ -748,7 +748,7 @@ static void * __init efi_map_regions(int *count, int *pg_shift)
748 748
749static void __init kexec_enter_virtual_mode(void) 749static void __init kexec_enter_virtual_mode(void)
750{ 750{
751#ifdef CONFIG_KEXEC 751#ifdef CONFIG_KEXEC_CORE
752 efi_memory_desc_t *md; 752 efi_memory_desc_t *md;
753 void *p; 753 void *p;
754 754
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index 020c101c255f..5c9f63fa6abf 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -492,7 +492,7 @@ static void uv_nmi_touch_watchdogs(void)
492 touch_nmi_watchdog(); 492 touch_nmi_watchdog();
493} 493}
494 494
495#if defined(CONFIG_KEXEC) 495#if defined(CONFIG_KEXEC_CORE)
496static atomic_t uv_nmi_kexec_failed; 496static atomic_t uv_nmi_kexec_failed;
497static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) 497static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
498{ 498{
@@ -519,13 +519,13 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
519 uv_nmi_sync_exit(0); 519 uv_nmi_sync_exit(0);
520} 520}
521 521
522#else /* !CONFIG_KEXEC */ 522#else /* !CONFIG_KEXEC_CORE */
523static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) 523static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
524{ 524{
525 if (master) 525 if (master)
526 pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n"); 526 pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n");
527} 527}
528#endif /* !CONFIG_KEXEC */ 528#endif /* !CONFIG_KEXEC_CORE */
529 529
530#ifdef CONFIG_KGDB 530#ifdef CONFIG_KGDB
531#ifdef CONFIG_KGDB_KDB 531#ifdef CONFIG_KGDB_KDB
diff --git a/arch/xtensa/include/asm/dma-mapping.h b/arch/xtensa/include/asm/dma-mapping.h
index f01cb3044e50..4427f38b634e 100644
--- a/arch/xtensa/include/asm/dma-mapping.h
+++ b/arch/xtensa/include/asm/dma-mapping.h
@@ -32,66 +32,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
32 32
33#include <asm-generic/dma-mapping-common.h> 33#include <asm-generic/dma-mapping-common.h>
34 34
35#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
36#define dma_free_noncoherent(d, s, v, h) dma_free_attrs(d, s, v, h, NULL)
37#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
38#define dma_free_coherent(d, s, c, h) dma_free_attrs(d, s, c, h, NULL)
39
40static inline void *dma_alloc_attrs(struct device *dev, size_t size,
41 dma_addr_t *dma_handle, gfp_t gfp,
42 struct dma_attrs *attrs)
43{
44 void *ret;
45 struct dma_map_ops *ops = get_dma_ops(dev);
46
47 if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
48 return ret;
49
50 ret = ops->alloc(dev, size, dma_handle, gfp, attrs);
51 debug_dma_alloc_coherent(dev, size, *dma_handle, ret);
52
53 return ret;
54}
55
56static inline void dma_free_attrs(struct device *dev, size_t size,
57 void *vaddr, dma_addr_t dma_handle,
58 struct dma_attrs *attrs)
59{
60 struct dma_map_ops *ops = get_dma_ops(dev);
61
62 if (dma_release_from_coherent(dev, get_order(size), vaddr))
63 return;
64
65 ops->free(dev, size, vaddr, dma_handle, attrs);
66 debug_dma_free_coherent(dev, size, vaddr, dma_handle);
67}
68
69static inline int
70dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
71{
72 struct dma_map_ops *ops = get_dma_ops(dev);
73
74 debug_dma_mapping_error(dev, dma_addr);
75 return ops->mapping_error(dev, dma_addr);
76}
77
78static inline int
79dma_supported(struct device *dev, u64 mask)
80{
81 return 1;
82}
83
84static inline int
85dma_set_mask(struct device *dev, u64 mask)
86{
87 if(!dev->dma_mask || !dma_supported(dev, mask))
88 return -EIO;
89
90 *dev->dma_mask = mask;
91
92 return 0;
93}
94
95void dma_cache_sync(struct device *dev, void *vaddr, size_t size, 35void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
96 enum dma_data_direction direction); 36 enum dma_data_direction direction);
97 37
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 6607f3c6ace1..a39e85f9efa9 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -2834,7 +2834,7 @@ static int binder_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2834 return VM_FAULT_SIGBUS; 2834 return VM_FAULT_SIGBUS;
2835} 2835}
2836 2836
2837static struct vm_operations_struct binder_vm_ops = { 2837static const struct vm_operations_struct binder_vm_ops = {
2838 .open = binder_vma_open, 2838 .open = binder_vma_open,
2839 .close = binder_vma_close, 2839 .close = binder_vma_close,
2840 .fault = binder_vm_fault, 2840 .fault = binder_vm_fault,
diff --git a/drivers/crypto/qat/qat_common/adf_transport_debug.c b/drivers/crypto/qat/qat_common/adf_transport_debug.c
index e41986967294..52340b9bb387 100644
--- a/drivers/crypto/qat/qat_common/adf_transport_debug.c
+++ b/drivers/crypto/qat/qat_common/adf_transport_debug.c
@@ -86,9 +86,7 @@ static int adf_ring_show(struct seq_file *sfile, void *v)
86{ 86{
87 struct adf_etr_ring_data *ring = sfile->private; 87 struct adf_etr_ring_data *ring = sfile->private;
88 struct adf_etr_bank_data *bank = ring->bank; 88 struct adf_etr_bank_data *bank = ring->bank;
89 uint32_t *msg = v;
90 void __iomem *csr = ring->bank->csr_addr; 89 void __iomem *csr = ring->bank->csr_addr;
91 int i, x;
92 90
93 if (v == SEQ_START_TOKEN) { 91 if (v == SEQ_START_TOKEN) {
94 int head, tail, empty; 92 int head, tail, empty;
@@ -113,18 +111,8 @@ static int adf_ring_show(struct seq_file *sfile, void *v)
113 seq_puts(sfile, "----------- Ring data ------------\n"); 111 seq_puts(sfile, "----------- Ring data ------------\n");
114 return 0; 112 return 0;
115 } 113 }
116 seq_printf(sfile, "%p:", msg); 114 seq_hex_dump(sfile, "", DUMP_PREFIX_ADDRESS, 32, 4,
117 x = 0; 115 v, ADF_MSG_SIZE_TO_BYTES(ring->msg_size), false);
118 i = 0;
119 for (; i < (ADF_MSG_SIZE_TO_BYTES(ring->msg_size) >> 2); i++) {
120 seq_printf(sfile, " %08X", *(msg + i));
121 if ((ADF_MSG_SIZE_TO_BYTES(ring->msg_size) >> 2) != i + 1 &&
122 (++x == 8)) {
123 seq_printf(sfile, "\n%p:", msg + i + 1);
124 x = 0;
125 }
126 }
127 seq_puts(sfile, "\n");
128 return 0; 116 return 0;
129} 117}
130 118
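
seq_hex_dump() is the seq_file counterpart of print_hex_dump() and replaces the hand-rolled loop above; here it prints each ring message as address-prefixed rows of 32 bytes in 4-byte groups, with the ASCII column disabled. The signature the converted call assumes:

/* signature assumed by the call above; see fs/seq_file.c */
void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
                  int rowsize, int groupsize, const void *buf, size_t len,
                  bool ascii);
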
diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index 54071c148340..84533e02fbf8 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -43,7 +43,7 @@ config EFI_VARS_PSTORE_DEFAULT_DISABLE
43 43
44config EFI_RUNTIME_MAP 44config EFI_RUNTIME_MAP
45 bool "Export efi runtime maps to sysfs" 45 bool "Export efi runtime maps to sysfs"
46 depends on X86 && EFI && KEXEC 46 depends on X86 && EFI && KEXEC_CORE
47 default y 47 default y
48 help 48 help
49 Export efi runtime memory maps to /sys/firmware/efi/runtime-map. 49 Export efi runtime memory maps to /sys/firmware/efi/runtime-map.
diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c
index 6394547cf67a..860062ef8814 100644
--- a/drivers/gpu/drm/vgem/vgem_drv.c
+++ b/drivers/gpu/drm/vgem/vgem_drv.c
@@ -125,7 +125,7 @@ static int vgem_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
125 } 125 }
126} 126}
127 127
128static struct vm_operations_struct vgem_gem_vm_ops = { 128static const struct vm_operations_struct vgem_gem_vm_ops = {
129 .fault = vgem_gem_fault, 129 .fault = vgem_gem_fault,
130 .open = drm_gem_vm_open, 130 .open = drm_gem_vm_open,
131 .close = drm_gem_vm_close, 131 .close = drm_gem_vm_close,
diff --git a/drivers/hsi/clients/cmt_speech.c b/drivers/hsi/clients/cmt_speech.c
index d04643f9548b..95638df73d1c 100644
--- a/drivers/hsi/clients/cmt_speech.c
+++ b/drivers/hsi/clients/cmt_speech.c
@@ -1110,7 +1110,7 @@ static int cs_char_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1110 return 0; 1110 return 0;
1111} 1111}
1112 1112
1113static struct vm_operations_struct cs_char_vm_ops = { 1113static const struct vm_operations_struct cs_char_vm_ops = {
1114 .fault = cs_char_vma_fault, 1114 .fault = cs_char_vma_fault,
1115}; 1115};
1116 1116
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index 725881890c4a..e449e394963f 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -908,7 +908,7 @@ static int qib_file_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
908 return 0; 908 return 0;
909} 909}
910 910
911static struct vm_operations_struct qib_file_vm_ops = { 911static const struct vm_operations_struct qib_file_vm_ops = {
912 .fault = qib_file_vma_fault, 912 .fault = qib_file_vma_fault,
913}; 913};
914 914
diff --git a/drivers/infiniband/hw/qib/qib_mmap.c b/drivers/infiniband/hw/qib/qib_mmap.c
index 146cf29a2e1d..34927b700b0e 100644
--- a/drivers/infiniband/hw/qib/qib_mmap.c
+++ b/drivers/infiniband/hw/qib/qib_mmap.c
@@ -75,7 +75,7 @@ static void qib_vma_close(struct vm_area_struct *vma)
75 kref_put(&ip->ref, qib_release_mmap_info); 75 kref_put(&ip->ref, qib_release_mmap_info);
76} 76}
77 77
78static struct vm_operations_struct qib_vm_ops = { 78static const struct vm_operations_struct qib_vm_ops = {
79 .open = qib_vma_open, 79 .open = qib_vma_open,
80 .close = qib_vma_close, 80 .close = qib_vma_close,
81}; 81};
diff --git a/drivers/media/platform/omap/omap_vout.c b/drivers/media/platform/omap/omap_vout.c
index f09c5f17a42f..de2474e1132d 100644
--- a/drivers/media/platform/omap/omap_vout.c
+++ b/drivers/media/platform/omap/omap_vout.c
@@ -872,7 +872,7 @@ static void omap_vout_vm_close(struct vm_area_struct *vma)
872 vout->mmap_count--; 872 vout->mmap_count--;
873} 873}
874 874
875static struct vm_operations_struct omap_vout_vm_ops = { 875static const struct vm_operations_struct omap_vout_vm_ops = {
876 .open = omap_vout_vm_open, 876 .open = omap_vout_vm_open,
877 .close = omap_vout_vm_close, 877 .close = omap_vout_vm_close,
878}; 878};
diff --git a/drivers/misc/genwqe/card_dev.c b/drivers/misc/genwqe/card_dev.c
index c49d244265ec..70e62d6a3231 100644
--- a/drivers/misc/genwqe/card_dev.c
+++ b/drivers/misc/genwqe/card_dev.c
@@ -418,7 +418,7 @@ static void genwqe_vma_close(struct vm_area_struct *vma)
418 kfree(dma_map); 418 kfree(dma_map);
419} 419}
420 420
421static struct vm_operations_struct genwqe_vma_ops = { 421static const struct vm_operations_struct genwqe_vma_ops = {
422 .open = genwqe_vma_open, 422 .open = genwqe_vma_open,
423 .close = genwqe_vma_close, 423 .close = genwqe_vma_close,
424}; 424};
diff --git a/drivers/net/wireless/ath/wil6210/debugfs.c b/drivers/net/wireless/ath/wil6210/debugfs.c
index 613ca2b2527b..d1a1e160ef31 100644
--- a/drivers/net/wireless/ath/wil6210/debugfs.c
+++ b/drivers/net/wireless/ath/wil6210/debugfs.c
@@ -156,6 +156,12 @@ static const struct file_operations fops_vring = {
156 .llseek = seq_lseek, 156 .llseek = seq_lseek,
157}; 157};
158 158
159static void wil_seq_hexdump(struct seq_file *s, void *p, int len,
160 const char *prefix)
161{
162 seq_hex_dump(s, prefix, DUMP_PREFIX_NONE, 16, 1, p, len, false);
163}
164
159static void wil_print_ring(struct seq_file *s, const char *prefix, 165static void wil_print_ring(struct seq_file *s, const char *prefix,
160 void __iomem *off) 166 void __iomem *off)
161{ 167{
@@ -212,8 +218,6 @@ static void wil_print_ring(struct seq_file *s, const char *prefix,
212 le16_to_cpu(hdr.seq), len, 218 le16_to_cpu(hdr.seq), len,
213 le16_to_cpu(hdr.type), hdr.flags); 219 le16_to_cpu(hdr.type), hdr.flags);
214 if (len <= MAX_MBOXITEM_SIZE) { 220 if (len <= MAX_MBOXITEM_SIZE) {
215 int n = 0;
216 char printbuf[16 * 3 + 2];
217 unsigned char databuf[MAX_MBOXITEM_SIZE]; 221 unsigned char databuf[MAX_MBOXITEM_SIZE];
218 void __iomem *src = wmi_buffer(wil, d.addr) + 222 void __iomem *src = wmi_buffer(wil, d.addr) +
219 sizeof(struct wil6210_mbox_hdr); 223 sizeof(struct wil6210_mbox_hdr);
@@ -223,16 +227,7 @@ static void wil_print_ring(struct seq_file *s, const char *prefix,
223 * reading header 227 * reading header
224 */ 228 */
225 wil_memcpy_fromio_32(databuf, src, len); 229 wil_memcpy_fromio_32(databuf, src, len);
226 while (n < len) { 230 wil_seq_hexdump(s, databuf, len, " : ");
227 int l = min(len - n, 16);
228
229 hex_dump_to_buffer(databuf + n, l,
230 16, 1, printbuf,
231 sizeof(printbuf),
232 false);
233 seq_printf(s, " : %s\n", printbuf);
234 n += l;
235 }
236 } 231 }
237 } else { 232 } else {
238 seq_puts(s, "\n"); 233 seq_puts(s, "\n");
@@ -867,22 +862,6 @@ static const struct file_operations fops_wmi = {
867 .open = simple_open, 862 .open = simple_open,
868}; 863};
869 864
870static void wil_seq_hexdump(struct seq_file *s, void *p, int len,
871 const char *prefix)
872{
873 char printbuf[16 * 3 + 2];
874 int i = 0;
875
876 while (i < len) {
877 int l = min(len - i, 16);
878
879 hex_dump_to_buffer(p + i, l, 16, 1, printbuf,
880 sizeof(printbuf), false);
881 seq_printf(s, "%s%s\n", prefix, printbuf);
882 i += l;
883 }
884}
885
886static void wil_seq_print_skb(struct seq_file *s, struct sk_buff *skb) 865static void wil_seq_print_skb(struct seq_file *s, struct sk_buff *skb)
887{ 866{
888 int i = 0; 867 int i = 0;
diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c
index 02ff84fcfa61..957b42198328 100644
--- a/drivers/parisc/ccio-dma.c
+++ b/drivers/parisc/ccio-dma.c
@@ -1103,16 +1103,9 @@ static int ccio_proc_bitmap_info(struct seq_file *m, void *p)
1103 struct ioc *ioc = ioc_list; 1103 struct ioc *ioc = ioc_list;
1104 1104
1105 while (ioc != NULL) { 1105 while (ioc != NULL) {
1106 u32 *res_ptr = (u32 *)ioc->res_map; 1106 seq_hex_dump(m, " ", DUMP_PREFIX_NONE, 32, 4, ioc->res_map,
1107 int j; 1107 ioc->res_size, false);
1108 1108 seq_putc(m, '\n');
1109 for (j = 0; j < (ioc->res_size / sizeof(u32)); j++) {
1110 if ((j & 7) == 0)
1111 seq_puts(m, "\n ");
1112 seq_printf(m, "%08x", *res_ptr);
1113 res_ptr++;
1114 }
1115 seq_puts(m, "\n\n");
1116 ioc = ioc->next; 1109 ioc = ioc->next;
1117 break; /* XXX - remove me */ 1110 break; /* XXX - remove me */
1118 } 1111 }
diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c
index f1441e466c06..225049b492e5 100644
--- a/drivers/parisc/sba_iommu.c
+++ b/drivers/parisc/sba_iommu.c
@@ -1854,14 +1854,9 @@ sba_proc_bitmap_info(struct seq_file *m, void *p)
1854{ 1854{
1855 struct sba_device *sba_dev = sba_list; 1855 struct sba_device *sba_dev = sba_list;
1856 struct ioc *ioc = &sba_dev->ioc[0]; /* FIXME: Multi-IOC support! */ 1856 struct ioc *ioc = &sba_dev->ioc[0]; /* FIXME: Multi-IOC support! */
1857 unsigned int *res_ptr = (unsigned int *)ioc->res_map;
1858 int i;
1859 1857
1860 for (i = 0; i < (ioc->res_size/sizeof(unsigned int)); ++i, ++res_ptr) { 1858 seq_hex_dump(m, " ", DUMP_PREFIX_NONE, 32, 4, ioc->res_map,
1861 if ((i & 7) == 0) 1859 ioc->res_size, false);
1862 seq_puts(m, "\n ");
1863 seq_printf(m, " %08x", *res_ptr);
1864 }
1865 seq_putc(m, '\n'); 1860 seq_putc(m, '\n');
1866 1861
1867 return 0; 1862 return 0;
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 52a880ca1768..dd652f2ae03d 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -467,7 +467,7 @@ static void pci_device_shutdown(struct device *dev)
467 pci_msi_shutdown(pci_dev); 467 pci_msi_shutdown(pci_dev);
468 pci_msix_shutdown(pci_dev); 468 pci_msix_shutdown(pci_dev);
469 469
470#ifdef CONFIG_KEXEC 470#ifdef CONFIG_KEXEC_CORE
471 /* 471 /*
472 * If this is a kexec reboot, turn off Bus Master bit on the 472 * If this is a kexec reboot, turn off Bus Master bit on the
473 * device to tell it to not continue to do DMA. Don't touch 473 * device to tell it to not continue to do DMA. Don't touch
diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c
index 01bf1f5cf2e9..4eb45546a3aa 100644
--- a/drivers/s390/crypto/zcrypt_api.c
+++ b/drivers/s390/crypto/zcrypt_api.c
@@ -1206,16 +1206,8 @@ static void sprinthx(unsigned char *title, struct seq_file *m,
1206static void sprinthx4(unsigned char *title, struct seq_file *m, 1206static void sprinthx4(unsigned char *title, struct seq_file *m,
1207 unsigned int *array, unsigned int len) 1207 unsigned int *array, unsigned int len)
1208{ 1208{
1209 int r;
1210
1211 seq_printf(m, "\n%s\n", title); 1209 seq_printf(m, "\n%s\n", title);
1212 for (r = 0; r < len; r++) { 1210 seq_hex_dump(m, " ", DUMP_PREFIX_NONE, 32, 4, array, len, false);
1213 if ((r % 8) == 0)
1214 seq_printf(m, " ");
1215 seq_printf(m, "%08X ", array[r]);
1216 if ((r % 8) == 7)
1217 seq_putc(m, '\n');
1218 }
1219 seq_putc(m, '\n'); 1211 seq_putc(m, '\n');
1220} 1212}
1221 1213
diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c
index eec878e183f5..217aa537c4eb 100644
--- a/drivers/staging/android/ion/ion.c
+++ b/drivers/staging/android/ion/ion.c
@@ -997,7 +997,7 @@ static void ion_vm_close(struct vm_area_struct *vma)
997 mutex_unlock(&buffer->lock); 997 mutex_unlock(&buffer->lock);
998} 998}
999 999
1000static struct vm_operations_struct ion_vma_ops = { 1000static const struct vm_operations_struct ion_vma_ops = {
1001 .open = ion_vm_open, 1001 .open = ion_vm_open,
1002 .close = ion_vm_close, 1002 .close = ion_vm_close,
1003 .fault = ion_vm_fault, 1003 .fault = ion_vm_fault,
diff --git a/drivers/staging/comedi/comedi_fops.c b/drivers/staging/comedi/comedi_fops.c
index fd54d098ab02..0e8a45102933 100644
--- a/drivers/staging/comedi/comedi_fops.c
+++ b/drivers/staging/comedi/comedi_fops.c
@@ -2156,7 +2156,7 @@ static void comedi_vm_close(struct vm_area_struct *area)
2156 comedi_buf_map_put(bm); 2156 comedi_buf_map_put(bm);
2157} 2157}
2158 2158
2159static struct vm_operations_struct comedi_vm_ops = { 2159static const struct vm_operations_struct comedi_vm_ops = {
2160 .open = comedi_vm_open, 2160 .open = comedi_vm_open,
2161 .close = comedi_vm_close, 2161 .close = comedi_vm_close,
2162}; 2162};
diff --git a/drivers/video/fbdev/omap2/omapfb/omapfb-main.c b/drivers/video/fbdev/omap2/omapfb/omapfb-main.c
index 4f0cbb54d4db..d3af01c94a58 100644
--- a/drivers/video/fbdev/omap2/omapfb/omapfb-main.c
+++ b/drivers/video/fbdev/omap2/omapfb/omapfb-main.c
@@ -1091,7 +1091,7 @@ static void mmap_user_close(struct vm_area_struct *vma)
1091 omapfb_put_mem_region(rg); 1091 omapfb_put_mem_region(rg);
1092} 1092}
1093 1093
1094static struct vm_operations_struct mmap_user_ops = { 1094static const struct vm_operations_struct mmap_user_ops = {
1095 .open = mmap_user_open, 1095 .open = mmap_user_open,
1096 .close = mmap_user_close, 1096 .close = mmap_user_close,
1097}; 1097};
diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c
index 14370df9ac1c..4547a91bca67 100644
--- a/drivers/xen/gntalloc.c
+++ b/drivers/xen/gntalloc.c
@@ -494,7 +494,7 @@ static void gntalloc_vma_close(struct vm_area_struct *vma)
494 mutex_unlock(&gref_mutex); 494 mutex_unlock(&gref_mutex);
495} 495}
496 496
497static struct vm_operations_struct gntalloc_vmops = { 497static const struct vm_operations_struct gntalloc_vmops = {
498 .open = gntalloc_vma_open, 498 .open = gntalloc_vma_open,
499 .close = gntalloc_vma_close, 499 .close = gntalloc_vma_close,
500}; 500};
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 0dbb222daaf1..2ea0b3b2a91d 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -433,7 +433,7 @@ static struct page *gntdev_vma_find_special_page(struct vm_area_struct *vma,
433 return map->pages[(addr - map->pages_vm_start) >> PAGE_SHIFT]; 433 return map->pages[(addr - map->pages_vm_start) >> PAGE_SHIFT];
434} 434}
435 435
436static struct vm_operations_struct gntdev_vmops = { 436static const struct vm_operations_struct gntdev_vmops = {
437 .open = gntdev_vma_open, 437 .open = gntdev_vma_open,
438 .close = gntdev_vma_close, 438 .close = gntdev_vma_close,
439 .find_special_page = gntdev_vma_find_special_page, 439 .find_special_page = gntdev_vma_find_special_page,
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index c6deb87c5c69..5e9adac928e6 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -414,7 +414,7 @@ static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs)
414 return 0; 414 return 0;
415} 415}
416 416
417static struct vm_operations_struct privcmd_vm_ops; 417static const struct vm_operations_struct privcmd_vm_ops;
418 418
419static long privcmd_ioctl_mmap_batch(void __user *udata, int version) 419static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
420{ 420{
@@ -605,7 +605,7 @@ static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
605 return VM_FAULT_SIGBUS; 605 return VM_FAULT_SIGBUS;
606} 606}
607 607
608static struct vm_operations_struct privcmd_vm_ops = { 608static const struct vm_operations_struct privcmd_vm_ops = {
609 .close = privcmd_close, 609 .close = privcmd_close,
610 .fault = privcmd_fault 610 .fault = privcmd_fault
611}; 611};
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index d757a3e610c6..79bc4933b13e 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -311,9 +311,6 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
311 */ 311 */
312 flags &= ~(__GFP_DMA | __GFP_HIGHMEM); 312 flags &= ~(__GFP_DMA | __GFP_HIGHMEM);
313 313
314 if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret))
315 return ret;
316
317 /* On ARM this function returns an ioremap'ped virtual address for 314 /* On ARM this function returns an ioremap'ped virtual address for
318 * which virt_to_phys doesn't return the corresponding physical 315 * which virt_to_phys doesn't return the corresponding physical
319 * address. In fact on ARM virt_to_phys only works for kernel direct 316 * address. In fact on ARM virt_to_phys only works for kernel direct
@@ -356,9 +353,6 @@ xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
356 phys_addr_t phys; 353 phys_addr_t phys;
357 u64 dma_mask = DMA_BIT_MASK(32); 354 u64 dma_mask = DMA_BIT_MASK(32);
358 355
359 if (dma_release_from_coherent(hwdev, order, vaddr))
360 return;
361
362 if (hwdev && hwdev->coherent_dma_mask) 356 if (hwdev && hwdev->coherent_dma_mask)
363 dma_mask = hwdev->coherent_dma_mask; 357 dma_mask = hwdev->coherent_dma_mask;
364 358
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 3f89c9e05b40..5b50c4ca43a7 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -18,6 +18,7 @@
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/writeback.h> 20#include <linux/writeback.h>
21#include <linux/blkdev.h>
21#include "affs.h" 22#include "affs.h"
22 23
23static int affs_statfs(struct dentry *dentry, struct kstatfs *buf); 24static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
@@ -352,18 +353,19 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
352 * blocks, we will have to change it. 353 * blocks, we will have to change it.
353 */ 354 */
354 355
355 size = sb->s_bdev->bd_inode->i_size >> 9; 356 size = i_size_read(sb->s_bdev->bd_inode) >> 9;
356 pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size); 357 pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size);
357 358
358 affs_set_blocksize(sb, PAGE_SIZE); 359 affs_set_blocksize(sb, PAGE_SIZE);
359 /* Try to find root block. Its location depends on the block size. */ 360 /* Try to find root block. Its location depends on the block size. */
360 361
361 i = 512; 362 i = bdev_logical_block_size(sb->s_bdev);
362 j = 4096; 363 j = PAGE_SIZE;
363 if (blocksize > 0) { 364 if (blocksize > 0) {
364 i = j = blocksize; 365 i = j = blocksize;
365 size = size / (blocksize / 512); 366 size = size / (blocksize / 512);
366 } 367 }
368
367 for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) { 369 for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) {
368 sbi->s_root_block = root_block; 370 sbi->s_root_block = root_block;
369 if (root_block < 0) 371 if (root_block < 0)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 890c50971a69..a268abfe60ac 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1593,7 +1593,7 @@ out:
1593 return err; 1593 return err;
1594} 1594}
1595 1595
1596static struct vm_operations_struct ceph_vmops = { 1596static const struct vm_operations_struct ceph_vmops = {
1597 .fault = ceph_filemap_fault, 1597 .fault = ceph_filemap_fault,
1598 .page_mkwrite = ceph_page_mkwrite, 1598 .page_mkwrite = ceph_page_mkwrite,
1599}; 1599};
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 3f50cee79df9..e2a6af1508af 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3216,7 +3216,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
3216 return VM_FAULT_LOCKED; 3216 return VM_FAULT_LOCKED;
3217} 3217}
3218 3218
3219static struct vm_operations_struct cifs_file_vm_ops = { 3219static const struct vm_operations_struct cifs_file_vm_ops = {
3220 .fault = filemap_fault, 3220 .fault = filemap_fault,
3221 .map_pages = filemap_map_pages, 3221 .map_pages = filemap_map_pages,
3222 .page_mkwrite = cifs_page_mkwrite, 3222 .page_mkwrite = cifs_page_mkwrite,
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 9b1ffaa0572e..f6c6c8adbc01 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -353,7 +353,7 @@ int venus_readlink(struct super_block *sb, struct CodaFid *fid,
353 char *result; 353 char *result;
354 354
355 insize = max_t(unsigned int, 355 insize = max_t(unsigned int,
356 INSIZE(readlink), OUTSIZE(readlink)+ *length + 1); 356 INSIZE(readlink), OUTSIZE(readlink)+ *length);
357 UPARG(CODA_READLINK); 357 UPARG(CODA_READLINK);
358 358
359 inp->coda_readlink.VFid = *fid; 359 inp->coda_readlink.VFid = *fid;
@@ -361,8 +361,8 @@ int venus_readlink(struct super_block *sb, struct CodaFid *fid,
361 error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); 361 error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
362 if (!error) { 362 if (!error) {
363 retlen = outp->coda_readlink.count; 363 retlen = outp->coda_readlink.count;
364 if ( retlen > *length ) 364 if (retlen >= *length)
365 retlen = *length; 365 retlen = *length - 1;
366 *length = retlen; 366 *length = retlen;
367 result = (char *)outp + (long)outp->coda_readlink.data; 367 result = (char *)outp + (long)outp->coda_readlink.data;
368 memcpy(buffer, result, retlen); 368 memcpy(buffer, result, retlen);
diff --git a/fs/coredump.c b/fs/coredump.c
index c5ecde6f3eed..a8f75640ac86 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -513,10 +513,10 @@ void do_coredump(const siginfo_t *siginfo)
513 const struct cred *old_cred; 513 const struct cred *old_cred;
514 struct cred *cred; 514 struct cred *cred;
515 int retval = 0; 515 int retval = 0;
516 int flag = 0;
517 int ispipe; 516 int ispipe;
518 struct files_struct *displaced; 517 struct files_struct *displaced;
519 bool need_nonrelative = false; 518 /* require nonrelative corefile path and be extra careful */
519 bool need_suid_safe = false;
520 bool core_dumped = false; 520 bool core_dumped = false;
521 static atomic_t core_dump_count = ATOMIC_INIT(0); 521 static atomic_t core_dump_count = ATOMIC_INIT(0);
522 struct coredump_params cprm = { 522 struct coredump_params cprm = {
@@ -550,9 +550,8 @@ void do_coredump(const siginfo_t *siginfo)
550 */ 550 */
551 if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) { 551 if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) {
552 /* Setuid core dump mode */ 552 /* Setuid core dump mode */
553 flag = O_EXCL; /* Stop rewrite attacks */
554 cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */ 553 cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */
555 need_nonrelative = true; 554 need_suid_safe = true;
556 } 555 }
557 556
558 retval = coredump_wait(siginfo->si_signo, &core_state); 557 retval = coredump_wait(siginfo->si_signo, &core_state);
@@ -633,7 +632,7 @@ void do_coredump(const siginfo_t *siginfo)
633 if (cprm.limit < binfmt->min_coredump) 632 if (cprm.limit < binfmt->min_coredump)
634 goto fail_unlock; 633 goto fail_unlock;
635 634
636 if (need_nonrelative && cn.corename[0] != '/') { 635 if (need_suid_safe && cn.corename[0] != '/') {
637 printk(KERN_WARNING "Pid %d(%s) can only dump core "\ 636 printk(KERN_WARNING "Pid %d(%s) can only dump core "\
638 "to fully qualified path!\n", 637 "to fully qualified path!\n",
639 task_tgid_vnr(current), current->comm); 638 task_tgid_vnr(current), current->comm);
@@ -641,8 +640,35 @@ void do_coredump(const siginfo_t *siginfo)
641 goto fail_unlock; 640 goto fail_unlock;
642 } 641 }
643 642
643 /*
644 * Unlink the file if it exists unless this is a SUID
645 * binary - in that case, we're running around with root
646 * privs and don't want to unlink another user's coredump.
647 */
648 if (!need_suid_safe) {
649 mm_segment_t old_fs;
650
651 old_fs = get_fs();
652 set_fs(KERNEL_DS);
653 /*
654 * If it doesn't exist, that's fine. If there's some
655 * other problem, we'll catch it at the filp_open().
656 */
657 (void) sys_unlink((const char __user *)cn.corename);
658 set_fs(old_fs);
659 }
660
661 /*
662 * There is a race between unlinking and creating the
663 * file, but if that causes an EEXIST here, that's
664 * fine - another process raced with us while creating
665 * the corefile, and the other process won. To userspace,
666 * what matters is that at least one of the two processes
667 * writes its coredump successfully, not which one.
668 */
644 cprm.file = filp_open(cn.corename, 669 cprm.file = filp_open(cn.corename,
645 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 670 O_CREAT | 2 | O_NOFOLLOW |
671 O_LARGEFILE | O_EXCL,
646 0600); 672 0600);
647 if (IS_ERR(cprm.file)) 673 if (IS_ERR(cprm.file))
648 goto fail_unlock; 674 goto fail_unlock;
@@ -659,11 +685,15 @@ void do_coredump(const siginfo_t *siginfo)
659 if (!S_ISREG(inode->i_mode)) 685 if (!S_ISREG(inode->i_mode))
660 goto close_fail; 686 goto close_fail;
661 /* 687 /*
662 * Dont allow local users get cute and trick others to coredump 688 * Don't dump core if the filesystem changed owner or mode
663 * into their pre-created files. 689 * of the file during file creation. This is an issue when
690 * a process dumps core while its cwd is e.g. on a vfat
691 * filesystem.
664 */ 692 */
665 if (!uid_eq(inode->i_uid, current_fsuid())) 693 if (!uid_eq(inode->i_uid, current_fsuid()))
666 goto close_fail; 694 goto close_fail;
695 if ((inode->i_mode & 0677) != 0600)
696 goto close_fail;
667 if (!(cprm.file->f_mode & FMODE_CAN_WRITE)) 697 if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
668 goto close_fail; 698 goto close_fail;
669 if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file)) 699 if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
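The coredump hunks above make O_EXCL unconditional and unlink any pre-existing corefile first, unless the dump is suid-sensitive. The same "remove, then create exclusively" idiom, sketched here as plain userspace C rather than the kernel code itself:

#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

static int create_fresh(const char *path)
{
	/* best effort: a missing file is fine, any other error is not */
	if (unlink(path) < 0 && errno != ENOENT)
		return -1;

	/*
	 * O_EXCL means we never reuse a file (or follow a symlink) that
	 * someone else pre-created; losing the create race just means
	 * another writer got there first.
	 */
	return open(path, O_CREAT | O_EXCL | O_WRONLY | O_NOFOLLOW, 0600);
}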
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index d3fa6bd9503e..221719eac5de 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -288,7 +288,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
288 page_cache_release(page); 288 page_cache_release(page);
289 goto fail; 289 goto fail;
290 } 290 }
291 page_cache_release(page);
292 node->page[i] = page; 291 node->page[i] = page;
293 } 292 }
294 293
@@ -398,11 +397,11 @@ node_error:
398 397
399void hfs_bnode_free(struct hfs_bnode *node) 398void hfs_bnode_free(struct hfs_bnode *node)
400{ 399{
401 //int i; 400 int i;
402 401
403 //for (i = 0; i < node->tree->pages_per_bnode; i++) 402 for (i = 0; i < node->tree->pages_per_bnode; i++)
404 // if (node->page[i]) 403 if (node->page[i])
405 // page_cache_release(node->page[i]); 404 page_cache_release(node->page[i]);
406 kfree(node); 405 kfree(node);
407} 406}
408 407
diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index 9f4ee7f52026..6fc766df0461 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -131,13 +131,16 @@ skip:
131 hfs_bnode_write(node, entry, data_off + key_len, entry_len); 131 hfs_bnode_write(node, entry, data_off + key_len, entry_len);
132 hfs_bnode_dump(node); 132 hfs_bnode_dump(node);
133 133
134 if (new_node) { 134 /*
135 /* update parent key if we inserted a key 135 * update parent key if we inserted a key
136 * at the start of the first node 136 * at the start of the node and it is not the new node
137 */ 137 */
138 if (!rec && new_node != node) 138 if (!rec && new_node != node) {
139 hfs_brec_update_parent(fd); 139 hfs_bnode_read_key(node, fd->search_key, data_off + size);
140 hfs_brec_update_parent(fd);
141 }
140 142
143 if (new_node) {
141 hfs_bnode_put(fd->bnode); 144 hfs_bnode_put(fd->bnode);
142 if (!new_node->parent) { 145 if (!new_node->parent) {
143 hfs_btree_inc_height(tree); 146 hfs_btree_inc_height(tree);
@@ -166,9 +169,6 @@ skip:
166 goto again; 169 goto again;
167 } 170 }
168 171
169 if (!rec)
170 hfs_brec_update_parent(fd);
171
172 return 0; 172 return 0;
173} 173}
174 174
@@ -366,6 +366,8 @@ again:
366 if (IS_ERR(parent)) 366 if (IS_ERR(parent))
367 return PTR_ERR(parent); 367 return PTR_ERR(parent);
368 __hfs_brec_find(parent, fd); 368 __hfs_brec_find(parent, fd);
369 if (fd->record < 0)
370 return -ENOENT;
369 hfs_bnode_dump(parent); 371 hfs_bnode_dump(parent);
370 rec = fd->record; 372 rec = fd->record;
371 373
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 759708fd9331..63924662aaf3 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -454,7 +454,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
454 page_cache_release(page); 454 page_cache_release(page);
455 goto fail; 455 goto fail;
456 } 456 }
457 page_cache_release(page);
458 node->page[i] = page; 457 node->page[i] = page;
459 } 458 }
460 459
@@ -566,13 +565,11 @@ node_error:
566 565
567void hfs_bnode_free(struct hfs_bnode *node) 566void hfs_bnode_free(struct hfs_bnode *node)
568{ 567{
569#if 0
570 int i; 568 int i;
571 569
572 for (i = 0; i < node->tree->pages_per_bnode; i++) 570 for (i = 0; i < node->tree->pages_per_bnode; i++)
573 if (node->page[i]) 571 if (node->page[i])
574 page_cache_release(node->page[i]); 572 page_cache_release(node->page[i]);
575#endif
576 kfree(node); 573 kfree(node);
577} 574}
578 575
diff --git a/fs/namei.c b/fs/namei.c
index 29b927938b8c..726d211db484 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2438,7 +2438,7 @@ done:
2438 2438
2439/** 2439/**
2440 * path_mountpoint - look up a path to be umounted 2440 * path_mountpoint - look up a path to be umounted
2441 * @nameidata: lookup context 2441 * @nd: lookup context
2442 * @flags: lookup flags 2442 * @flags: lookup flags
2443 * @path: pointer to container for result 2443 * @path: pointer to container for result
2444 * 2444 *
diff --git a/fs/proc/base.c b/fs/proc/base.c
index aa50d1ac28fc..b25eee4cead5 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1230,10 +1230,9 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1230 size_t count, loff_t *ppos) 1230 size_t count, loff_t *ppos)
1231{ 1231{
1232 struct inode * inode = file_inode(file); 1232 struct inode * inode = file_inode(file);
1233 char *page, *tmp;
1234 ssize_t length;
1235 uid_t loginuid; 1233 uid_t loginuid;
1236 kuid_t kloginuid; 1234 kuid_t kloginuid;
1235 int rv;
1237 1236
1238 rcu_read_lock(); 1237 rcu_read_lock();
1239 if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { 1238 if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
@@ -1242,46 +1241,28 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1242 } 1241 }
1243 rcu_read_unlock(); 1242 rcu_read_unlock();
1244 1243
1245 if (count >= PAGE_SIZE)
1246 count = PAGE_SIZE - 1;
1247
1248 if (*ppos != 0) { 1244 if (*ppos != 0) {
1249 /* No partial writes. */ 1245 /* No partial writes. */
1250 return -EINVAL; 1246 return -EINVAL;
1251 } 1247 }
1252 page = (char*)__get_free_page(GFP_TEMPORARY);
1253 if (!page)
1254 return -ENOMEM;
1255 length = -EFAULT;
1256 if (copy_from_user(page, buf, count))
1257 goto out_free_page;
1258
1259 page[count] = '\0';
1260 loginuid = simple_strtoul(page, &tmp, 10);
1261 if (tmp == page) {
1262 length = -EINVAL;
1263 goto out_free_page;
1264 1248
1265 } 1249 rv = kstrtou32_from_user(buf, count, 10, &loginuid);
1250 if (rv < 0)
1251 return rv;
1266 1252
1267 /* is userspace trying to explicitly UNSET the loginuid? */ 1253 /* is userspace trying to explicitly UNSET the loginuid? */
1268 if (loginuid == AUDIT_UID_UNSET) { 1254 if (loginuid == AUDIT_UID_UNSET) {
1269 kloginuid = INVALID_UID; 1255 kloginuid = INVALID_UID;
1270 } else { 1256 } else {
1271 kloginuid = make_kuid(file->f_cred->user_ns, loginuid); 1257 kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
1272 if (!uid_valid(kloginuid)) { 1258 if (!uid_valid(kloginuid))
1273 length = -EINVAL; 1259 return -EINVAL;
1274 goto out_free_page;
1275 }
1276 } 1260 }
1277 1261
1278 length = audit_set_loginuid(kloginuid); 1262 rv = audit_set_loginuid(kloginuid);
1279 if (likely(length == 0)) 1263 if (rv < 0)
1280 length = count; 1264 return rv;
1281 1265 return count;
1282out_free_page:
1283 free_page((unsigned long) page);
1284 return length;
1285} 1266}
1286 1267
1287static const struct file_operations proc_loginuid_operations = { 1268static const struct file_operations proc_loginuid_operations = {
@@ -1335,8 +1316,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
1335 const char __user * buf, size_t count, loff_t *ppos) 1316 const char __user * buf, size_t count, loff_t *ppos)
1336{ 1317{
1337 struct task_struct *task; 1318 struct task_struct *task;
1338 char buffer[PROC_NUMBUF], *end; 1319 char buffer[PROC_NUMBUF];
1339 int make_it_fail; 1320 int make_it_fail;
1321 int rv;
1340 1322
1341 if (!capable(CAP_SYS_RESOURCE)) 1323 if (!capable(CAP_SYS_RESOURCE))
1342 return -EPERM; 1324 return -EPERM;
@@ -1345,9 +1327,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
1345 count = sizeof(buffer) - 1; 1327 count = sizeof(buffer) - 1;
1346 if (copy_from_user(buffer, buf, count)) 1328 if (copy_from_user(buffer, buf, count))
1347 return -EFAULT; 1329 return -EFAULT;
1348 make_it_fail = simple_strtol(strstrip(buffer), &end, 0); 1330 rv = kstrtoint(strstrip(buffer), 0, &make_it_fail);
1349 if (*end) 1331 if (rv < 0)
1350 return -EINVAL; 1332 return rv;
1351 if (make_it_fail < 0 || make_it_fail > 1) 1333 if (make_it_fail < 0 || make_it_fail > 1)
1352 return -EINVAL; 1334 return -EINVAL;
1353 1335
@@ -1836,8 +1818,6 @@ end_instantiate:
1836 return dir_emit(ctx, name, len, 1, DT_UNKNOWN); 1818 return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
1837} 1819}
1838 1820
1839#ifdef CONFIG_CHECKPOINT_RESTORE
1840
1841/* 1821/*
1842 * dname_to_vma_addr - maps a dentry name into two unsigned longs 1822 * dname_to_vma_addr - maps a dentry name into two unsigned longs
1843 * which represent vma start and end addresses. 1823 * which represent vma start and end addresses.
@@ -1864,11 +1844,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
1864 if (flags & LOOKUP_RCU) 1844 if (flags & LOOKUP_RCU)
1865 return -ECHILD; 1845 return -ECHILD;
1866 1846
1867 if (!capable(CAP_SYS_ADMIN)) {
1868 status = -EPERM;
1869 goto out_notask;
1870 }
1871
1872 inode = d_inode(dentry); 1847 inode = d_inode(dentry);
1873 task = get_proc_task(inode); 1848 task = get_proc_task(inode);
1874 if (!task) 1849 if (!task)
@@ -1957,6 +1932,29 @@ struct map_files_info {
1957 unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ 1932 unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
1958}; 1933};
1959 1934
1935/*
1936 * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the
1937 * symlinks may be used to bypass permissions on ancestor directories in the
1938 * path to the file in question.
1939 */
1940static const char *
1941proc_map_files_follow_link(struct dentry *dentry, void **cookie)
1942{
1943 if (!capable(CAP_SYS_ADMIN))
1944 return ERR_PTR(-EPERM);
1945
1946 return proc_pid_follow_link(dentry, NULL);
1947}
1948
1949/*
1950 * Identical to proc_pid_link_inode_operations except for follow_link()
1951 */
1952static const struct inode_operations proc_map_files_link_inode_operations = {
1953 .readlink = proc_pid_readlink,
1954 .follow_link = proc_map_files_follow_link,
1955 .setattr = proc_setattr,
1956};
1957
1960static int 1958static int
1961proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, 1959proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
1962 struct task_struct *task, const void *ptr) 1960 struct task_struct *task, const void *ptr)
@@ -1972,7 +1970,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
1972 ei = PROC_I(inode); 1970 ei = PROC_I(inode);
1973 ei->op.proc_get_link = proc_map_files_get_link; 1971 ei->op.proc_get_link = proc_map_files_get_link;
1974 1972
1975 inode->i_op = &proc_pid_link_inode_operations; 1973 inode->i_op = &proc_map_files_link_inode_operations;
1976 inode->i_size = 64; 1974 inode->i_size = 64;
1977 inode->i_mode = S_IFLNK; 1975 inode->i_mode = S_IFLNK;
1978 1976
@@ -1996,10 +1994,6 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
1996 int result; 1994 int result;
1997 struct mm_struct *mm; 1995 struct mm_struct *mm;
1998 1996
1999 result = -EPERM;
2000 if (!capable(CAP_SYS_ADMIN))
2001 goto out;
2002
2003 result = -ENOENT; 1997 result = -ENOENT;
2004 task = get_proc_task(dir); 1998 task = get_proc_task(dir);
2005 if (!task) 1999 if (!task)
@@ -2053,10 +2047,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
2053 struct map_files_info *p; 2047 struct map_files_info *p;
2054 int ret; 2048 int ret;
2055 2049
2056 ret = -EPERM;
2057 if (!capable(CAP_SYS_ADMIN))
2058 goto out;
2059
2060 ret = -ENOENT; 2050 ret = -ENOENT;
2061 task = get_proc_task(file_inode(file)); 2051 task = get_proc_task(file_inode(file));
2062 if (!task) 2052 if (!task)
@@ -2245,7 +2235,6 @@ static const struct file_operations proc_timers_operations = {
2245 .llseek = seq_lseek, 2235 .llseek = seq_lseek,
2246 .release = seq_release_private, 2236 .release = seq_release_private,
2247}; 2237};
2248#endif /* CONFIG_CHECKPOINT_RESTORE */
2249 2238
2250static int proc_pident_instantiate(struct inode *dir, 2239static int proc_pident_instantiate(struct inode *dir,
2251 struct dentry *dentry, struct task_struct *task, const void *ptr) 2240 struct dentry *dentry, struct task_struct *task, const void *ptr)
@@ -2481,32 +2470,20 @@ static ssize_t proc_coredump_filter_write(struct file *file,
2481{ 2470{
2482 struct task_struct *task; 2471 struct task_struct *task;
2483 struct mm_struct *mm; 2472 struct mm_struct *mm;
2484 char buffer[PROC_NUMBUF], *end;
2485 unsigned int val; 2473 unsigned int val;
2486 int ret; 2474 int ret;
2487 int i; 2475 int i;
2488 unsigned long mask; 2476 unsigned long mask;
2489 2477
2490 ret = -EFAULT; 2478 ret = kstrtouint_from_user(buf, count, 0, &val);
2491 memset(buffer, 0, sizeof(buffer)); 2479 if (ret < 0)
2492 if (count > sizeof(buffer) - 1) 2480 return ret;
2493 count = sizeof(buffer) - 1;
2494 if (copy_from_user(buffer, buf, count))
2495 goto out_no_task;
2496
2497 ret = -EINVAL;
2498 val = (unsigned int)simple_strtoul(buffer, &end, 0);
2499 if (*end == '\n')
2500 end++;
2501 if (end - buffer == 0)
2502 goto out_no_task;
2503 2481
2504 ret = -ESRCH; 2482 ret = -ESRCH;
2505 task = get_proc_task(file_inode(file)); 2483 task = get_proc_task(file_inode(file));
2506 if (!task) 2484 if (!task)
2507 goto out_no_task; 2485 goto out_no_task;
2508 2486
2509 ret = end - buffer;
2510 mm = get_task_mm(task); 2487 mm = get_task_mm(task);
2511 if (!mm) 2488 if (!mm)
2512 goto out_no_mm; 2489 goto out_no_mm;
@@ -2522,7 +2499,9 @@ static ssize_t proc_coredump_filter_write(struct file *file,
2522 out_no_mm: 2499 out_no_mm:
2523 put_task_struct(task); 2500 put_task_struct(task);
2524 out_no_task: 2501 out_no_task:
2525 return ret; 2502 if (ret < 0)
2503 return ret;
2504 return count;
2526} 2505}
2527 2506
2528static const struct file_operations proc_coredump_filter_operations = { 2507static const struct file_operations proc_coredump_filter_operations = {
@@ -2744,9 +2723,7 @@ static const struct inode_operations proc_task_inode_operations;
2744static const struct pid_entry tgid_base_stuff[] = { 2723static const struct pid_entry tgid_base_stuff[] = {
2745 DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), 2724 DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
2746 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), 2725 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
2747#ifdef CONFIG_CHECKPOINT_RESTORE
2748 DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), 2726 DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
2749#endif
2750 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), 2727 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
2751 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), 2728 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
2752#ifdef CONFIG_NET 2729#ifdef CONFIG_NET
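The fs/proc/base.c hunks above convert hand-rolled simple_strtoul() parsing (page allocation, manual copy_from_user, trailing-character checks) to the kstrto*_from_user() helpers, which do the copy and the strict parse in one call. A minimal write handler in that style (the value consumer is hypothetical):

#include <linux/kernel.h>
#include <linux/fs.h>

static ssize_t foo_write(struct file *file, const char __user *buf,
			 size_t count, loff_t *ppos)
{
	unsigned int val;
	int rv;

	rv = kstrtouint_from_user(buf, count, 0, &val);
	if (rv < 0)
		return rv;		/* -EINVAL, -ERANGE or -EFAULT */

	foo_set_value(val);		/* hypothetical consumer */
	return count;			/* no partial writes */
}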
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index e5dee5c3188e..ff3ffc76a937 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -26,7 +26,7 @@
26 26
27#include "internal.h" 27#include "internal.h"
28 28
29static DEFINE_SPINLOCK(proc_subdir_lock); 29static DEFINE_RWLOCK(proc_subdir_lock);
30 30
31static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) 31static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
32{ 32{
@@ -172,9 +172,9 @@ static int xlate_proc_name(const char *name, struct proc_dir_entry **ret,
172{ 172{
173 int rv; 173 int rv;
174 174
175 spin_lock(&proc_subdir_lock); 175 read_lock(&proc_subdir_lock);
176 rv = __xlate_proc_name(name, ret, residual); 176 rv = __xlate_proc_name(name, ret, residual);
177 spin_unlock(&proc_subdir_lock); 177 read_unlock(&proc_subdir_lock);
178 return rv; 178 return rv;
179} 179}
180 180
@@ -231,11 +231,11 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
231{ 231{
232 struct inode *inode; 232 struct inode *inode;
233 233
234 spin_lock(&proc_subdir_lock); 234 read_lock(&proc_subdir_lock);
235 de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len); 235 de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len);
236 if (de) { 236 if (de) {
237 pde_get(de); 237 pde_get(de);
238 spin_unlock(&proc_subdir_lock); 238 read_unlock(&proc_subdir_lock);
239 inode = proc_get_inode(dir->i_sb, de); 239 inode = proc_get_inode(dir->i_sb, de);
240 if (!inode) 240 if (!inode)
241 return ERR_PTR(-ENOMEM); 241 return ERR_PTR(-ENOMEM);
@@ -243,7 +243,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
243 d_add(dentry, inode); 243 d_add(dentry, inode);
244 return NULL; 244 return NULL;
245 } 245 }
246 spin_unlock(&proc_subdir_lock); 246 read_unlock(&proc_subdir_lock);
247 return ERR_PTR(-ENOENT); 247 return ERR_PTR(-ENOENT);
248} 248}
249 249
@@ -270,12 +270,12 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
270 if (!dir_emit_dots(file, ctx)) 270 if (!dir_emit_dots(file, ctx))
271 return 0; 271 return 0;
272 272
273 spin_lock(&proc_subdir_lock); 273 read_lock(&proc_subdir_lock);
274 de = pde_subdir_first(de); 274 de = pde_subdir_first(de);
275 i = ctx->pos - 2; 275 i = ctx->pos - 2;
276 for (;;) { 276 for (;;) {
277 if (!de) { 277 if (!de) {
278 spin_unlock(&proc_subdir_lock); 278 read_unlock(&proc_subdir_lock);
279 return 0; 279 return 0;
280 } 280 }
281 if (!i) 281 if (!i)
@@ -287,19 +287,19 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
287 do { 287 do {
288 struct proc_dir_entry *next; 288 struct proc_dir_entry *next;
289 pde_get(de); 289 pde_get(de);
290 spin_unlock(&proc_subdir_lock); 290 read_unlock(&proc_subdir_lock);
291 if (!dir_emit(ctx, de->name, de->namelen, 291 if (!dir_emit(ctx, de->name, de->namelen,
292 de->low_ino, de->mode >> 12)) { 292 de->low_ino, de->mode >> 12)) {
293 pde_put(de); 293 pde_put(de);
294 return 0; 294 return 0;
295 } 295 }
296 spin_lock(&proc_subdir_lock); 296 read_lock(&proc_subdir_lock);
297 ctx->pos++; 297 ctx->pos++;
298 next = pde_subdir_next(de); 298 next = pde_subdir_next(de);
299 pde_put(de); 299 pde_put(de);
300 de = next; 300 de = next;
301 } while (de); 301 } while (de);
302 spin_unlock(&proc_subdir_lock); 302 read_unlock(&proc_subdir_lock);
303 return 1; 303 return 1;
304} 304}
305 305
@@ -338,16 +338,16 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
338 if (ret) 338 if (ret)
339 return ret; 339 return ret;
340 340
341 spin_lock(&proc_subdir_lock); 341 write_lock(&proc_subdir_lock);
342 dp->parent = dir; 342 dp->parent = dir;
343 if (pde_subdir_insert(dir, dp) == false) { 343 if (pde_subdir_insert(dir, dp) == false) {
344 WARN(1, "proc_dir_entry '%s/%s' already registered\n", 344 WARN(1, "proc_dir_entry '%s/%s' already registered\n",
345 dir->name, dp->name); 345 dir->name, dp->name);
346 spin_unlock(&proc_subdir_lock); 346 write_unlock(&proc_subdir_lock);
347 proc_free_inum(dp->low_ino); 347 proc_free_inum(dp->low_ino);
348 return -EEXIST; 348 return -EEXIST;
349 } 349 }
350 spin_unlock(&proc_subdir_lock); 350 write_unlock(&proc_subdir_lock);
351 351
352 return 0; 352 return 0;
353} 353}
@@ -549,9 +549,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
549 const char *fn = name; 549 const char *fn = name;
550 unsigned int len; 550 unsigned int len;
551 551
552 spin_lock(&proc_subdir_lock); 552 write_lock(&proc_subdir_lock);
553 if (__xlate_proc_name(name, &parent, &fn) != 0) { 553 if (__xlate_proc_name(name, &parent, &fn) != 0) {
554 spin_unlock(&proc_subdir_lock); 554 write_unlock(&proc_subdir_lock);
555 return; 555 return;
556 } 556 }
557 len = strlen(fn); 557 len = strlen(fn);
@@ -559,7 +559,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
559 de = pde_subdir_find(parent, fn, len); 559 de = pde_subdir_find(parent, fn, len);
560 if (de) 560 if (de)
561 rb_erase(&de->subdir_node, &parent->subdir); 561 rb_erase(&de->subdir_node, &parent->subdir);
562 spin_unlock(&proc_subdir_lock); 562 write_unlock(&proc_subdir_lock);
563 if (!de) { 563 if (!de) {
564 WARN(1, "name '%s'\n", name); 564 WARN(1, "name '%s'\n", name);
565 return; 565 return;
@@ -583,16 +583,16 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
583 const char *fn = name; 583 const char *fn = name;
584 unsigned int len; 584 unsigned int len;
585 585
586 spin_lock(&proc_subdir_lock); 586 write_lock(&proc_subdir_lock);
587 if (__xlate_proc_name(name, &parent, &fn) != 0) { 587 if (__xlate_proc_name(name, &parent, &fn) != 0) {
588 spin_unlock(&proc_subdir_lock); 588 write_unlock(&proc_subdir_lock);
589 return -ENOENT; 589 return -ENOENT;
590 } 590 }
591 len = strlen(fn); 591 len = strlen(fn);
592 592
593 root = pde_subdir_find(parent, fn, len); 593 root = pde_subdir_find(parent, fn, len);
594 if (!root) { 594 if (!root) {
595 spin_unlock(&proc_subdir_lock); 595 write_unlock(&proc_subdir_lock);
596 return -ENOENT; 596 return -ENOENT;
597 } 597 }
598 rb_erase(&root->subdir_node, &parent->subdir); 598 rb_erase(&root->subdir_node, &parent->subdir);
@@ -605,7 +605,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
605 de = next; 605 de = next;
606 continue; 606 continue;
607 } 607 }
608 spin_unlock(&proc_subdir_lock); 608 write_unlock(&proc_subdir_lock);
609 609
610 proc_entry_rundown(de); 610 proc_entry_rundown(de);
611 next = de->parent; 611 next = de->parent;
@@ -616,7 +616,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
616 break; 616 break;
617 pde_put(de); 617 pde_put(de);
618 618
619 spin_lock(&proc_subdir_lock); 619 write_lock(&proc_subdir_lock);
620 de = next; 620 de = next;
621 } 621 }
622 pde_put(root); 622 pde_put(root);
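The fs/proc/generic.c hunks switch proc_subdir_lock from a spinlock to an rwlock, so lookups and readdirs can run in parallel and only registration/removal takes the lock exclusively. The basic pattern, on an illustrative structure:

#include <linux/spinlock.h>

static DEFINE_RWLOCK(foo_lock);

/* readers only exclude writers, not each other */
static struct foo_entry *foo_lookup(const char *name)
{
	struct foo_entry *e;

	read_lock(&foo_lock);
	e = __foo_find(name);		/* hypothetical helper */
	read_unlock(&foo_lock);
	return e;
}

/* writers exclude both readers and other writers */
static void foo_insert(struct foo_entry *e)
{
	write_lock(&foo_lock);
	__foo_add(e);			/* hypothetical helper */
	write_unlock(&foo_lock);
}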
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 7eee2d8b97d9..93484034a03d 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -9,12 +9,16 @@
9#include <linux/proc_fs.h> 9#include <linux/proc_fs.h>
10#include <linux/seq_file.h> 10#include <linux/seq_file.h>
11#include <linux/hugetlb.h> 11#include <linux/hugetlb.h>
12#include <linux/memcontrol.h>
13#include <linux/mmu_notifier.h>
14#include <linux/page_idle.h>
12#include <linux/kernel-page-flags.h> 15#include <linux/kernel-page-flags.h>
13#include <asm/uaccess.h> 16#include <asm/uaccess.h>
14#include "internal.h" 17#include "internal.h"
15 18
16#define KPMSIZE sizeof(u64) 19#define KPMSIZE sizeof(u64)
17#define KPMMASK (KPMSIZE - 1) 20#define KPMMASK (KPMSIZE - 1)
21#define KPMBITS (KPMSIZE * BITS_PER_BYTE)
18 22
19/* /proc/kpagecount - an array exposing page counts 23/* /proc/kpagecount - an array exposing page counts
20 * 24 *
@@ -54,6 +58,8 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
54 pfn++; 58 pfn++;
55 out++; 59 out++;
56 count -= KPMSIZE; 60 count -= KPMSIZE;
61
62 cond_resched();
57 } 63 }
58 64
59 *ppos += (char __user *)out - buf; 65 *ppos += (char __user *)out - buf;
@@ -146,6 +152,9 @@ u64 stable_page_flags(struct page *page)
146 if (PageBalloon(page)) 152 if (PageBalloon(page))
147 u |= 1 << KPF_BALLOON; 153 u |= 1 << KPF_BALLOON;
148 154
155 if (page_is_idle(page))
156 u |= 1 << KPF_IDLE;
157
149 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); 158 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
150 159
151 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); 160 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
@@ -212,6 +221,8 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
212 pfn++; 221 pfn++;
213 out++; 222 out++;
214 count -= KPMSIZE; 223 count -= KPMSIZE;
224
225 cond_resched();
215 } 226 }
216 227
217 *ppos += (char __user *)out - buf; 228 *ppos += (char __user *)out - buf;
@@ -225,10 +236,64 @@ static const struct file_operations proc_kpageflags_operations = {
225 .read = kpageflags_read, 236 .read = kpageflags_read,
226}; 237};
227 238
239#ifdef CONFIG_MEMCG
240static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
241 size_t count, loff_t *ppos)
242{
243 u64 __user *out = (u64 __user *)buf;
244 struct page *ppage;
245 unsigned long src = *ppos;
246 unsigned long pfn;
247 ssize_t ret = 0;
248 u64 ino;
249
250 pfn = src / KPMSIZE;
251 count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
252 if (src & KPMMASK || count & KPMMASK)
253 return -EINVAL;
254
255 while (count > 0) {
256 if (pfn_valid(pfn))
257 ppage = pfn_to_page(pfn);
258 else
259 ppage = NULL;
260
261 if (ppage)
262 ino = page_cgroup_ino(ppage);
263 else
264 ino = 0;
265
266 if (put_user(ino, out)) {
267 ret = -EFAULT;
268 break;
269 }
270
271 pfn++;
272 out++;
273 count -= KPMSIZE;
274
275 cond_resched();
276 }
277
278 *ppos += (char __user *)out - buf;
279 if (!ret)
280 ret = (char __user *)out - buf;
281 return ret;
282}
283
284static const struct file_operations proc_kpagecgroup_operations = {
285 .llseek = mem_lseek,
286 .read = kpagecgroup_read,
287};
288#endif /* CONFIG_MEMCG */
289
228static int __init proc_page_init(void) 290static int __init proc_page_init(void)
229{ 291{
230 proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations); 292 proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
231 proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); 293 proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
294#ifdef CONFIG_MEMCG
295 proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations);
296#endif
232 return 0; 297 return 0;
233} 298}
234fs_initcall(proc_page_init); 299fs_initcall(proc_page_init);
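The new /proc/kpagecgroup file follows the same layout as kpagecount and kpageflags: one u64 per page frame, indexed by PFN, holding the inode number of the memory cgroup the page is charged to (or 0). A userspace sketch that looks up one PFN (error handling trimmed):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	unsigned long pfn;
	uint64_t ino;
	int fd;

	if (argc < 2)
		return 1;
	pfn = strtoul(argv[1], NULL, 0);
	fd = open("/proc/kpagecgroup", O_RDONLY);
	if (fd < 0)
		return 1;
	/* 8 bytes per PFN, so the entry for 'pfn' sits at offset pfn * 8 */
	if (pread(fd, &ino, sizeof(ino), pfn * sizeof(ino)) != sizeof(ino))
		return 1;
	printf("pfn %lu -> memcg inode %llu\n", pfn, (unsigned long long)ino);
	return 0;
}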
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 41f1a50c10c9..e2d46adb54b4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -13,6 +13,7 @@
13#include <linux/swap.h> 13#include <linux/swap.h>
14#include <linux/swapops.h> 14#include <linux/swapops.h>
15#include <linux/mmu_notifier.h> 15#include <linux/mmu_notifier.h>
16#include <linux/page_idle.h>
16 17
17#include <asm/elf.h> 18#include <asm/elf.h>
18#include <asm/uaccess.h> 19#include <asm/uaccess.h>
@@ -459,7 +460,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
459 460
460 mss->resident += size; 461 mss->resident += size;
461 /* Accumulate the size in pages that have been accessed. */ 462 /* Accumulate the size in pages that have been accessed. */
462 if (young || PageReferenced(page)) 463 if (young || page_is_young(page) || PageReferenced(page))
463 mss->referenced += size; 464 mss->referenced += size;
464 mapcount = page_mapcount(page); 465 mapcount = page_mapcount(page);
465 if (mapcount >= 2) { 466 if (mapcount >= 2) {
@@ -807,6 +808,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
807 808
808 /* Clear accessed and referenced bits. */ 809 /* Clear accessed and referenced bits. */
809 pmdp_test_and_clear_young(vma, addr, pmd); 810 pmdp_test_and_clear_young(vma, addr, pmd);
811 test_and_clear_page_young(page);
810 ClearPageReferenced(page); 812 ClearPageReferenced(page);
811out: 813out:
812 spin_unlock(ptl); 814 spin_unlock(ptl);
@@ -834,6 +836,7 @@ out:
834 836
835 /* Clear accessed and referenced bits. */ 837 /* Clear accessed and referenced bits. */
836 ptep_test_and_clear_young(vma, addr, pte); 838 ptep_test_and_clear_young(vma, addr, pte);
839 test_and_clear_page_young(page);
837 ClearPageReferenced(page); 840 ClearPageReferenced(page);
838 } 841 }
839 pte_unmap_unlock(pte - 1, ptl); 842 pte_unmap_unlock(pte - 1, ptl);
diff --git a/fs/seq_file.c b/fs/seq_file.c
index ce9e39fd5daf..263b125dbcf4 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -12,6 +12,7 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/cred.h> 13#include <linux/cred.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/printk.h>
15 16
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17#include <asm/page.h> 18#include <asm/page.h>
@@ -773,6 +774,47 @@ void seq_pad(struct seq_file *m, char c)
773} 774}
774EXPORT_SYMBOL(seq_pad); 775EXPORT_SYMBOL(seq_pad);
775 776
777/* A complete analogue of print_hex_dump() */
778void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
779 int rowsize, int groupsize, const void *buf, size_t len,
780 bool ascii)
781{
782 const u8 *ptr = buf;
783 int i, linelen, remaining = len;
784 int ret;
785
786 if (rowsize != 16 && rowsize != 32)
787 rowsize = 16;
788
789 for (i = 0; i < len && !seq_has_overflowed(m); i += rowsize) {
790 linelen = min(remaining, rowsize);
791 remaining -= rowsize;
792
793 switch (prefix_type) {
794 case DUMP_PREFIX_ADDRESS:
795 seq_printf(m, "%s%p: ", prefix_str, ptr + i);
796 break;
797 case DUMP_PREFIX_OFFSET:
798 seq_printf(m, "%s%.8x: ", prefix_str, i);
799 break;
800 default:
801 seq_printf(m, "%s", prefix_str);
802 break;
803 }
804
805 ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize,
806 m->buf + m->count, m->size - m->count,
807 ascii);
808 if (ret >= m->size - m->count) {
809 seq_set_overflow(m);
810 } else {
811 m->count += ret;
812 seq_putc(m, '\n');
813 }
814 }
815}
816EXPORT_SYMBOL(seq_hex_dump);
817
776struct list_head *seq_list_start(struct list_head *head, loff_t pos) 818struct list_head *seq_list_start(struct list_head *head, loff_t pos)
777{ 819{
778 struct list_head *lh; 820 struct list_head *lh;
diff --git a/include/asm-generic/dma-mapping-common.h b/include/asm-generic/dma-mapping-common.h
index 940d5ec122c9..b1bc954eccf3 100644
--- a/include/asm-generic/dma-mapping-common.h
+++ b/include/asm-generic/dma-mapping-common.h
@@ -6,6 +6,7 @@
6#include <linux/scatterlist.h> 6#include <linux/scatterlist.h>
7#include <linux/dma-debug.h> 7#include <linux/dma-debug.h>
8#include <linux/dma-attrs.h> 8#include <linux/dma-attrs.h>
9#include <asm-generic/dma-coherent.h>
9 10
10static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, 11static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
11 size_t size, 12 size_t size,
@@ -237,4 +238,121 @@ dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr,
237 238
238#define dma_get_sgtable(d, t, v, h, s) dma_get_sgtable_attrs(d, t, v, h, s, NULL) 239#define dma_get_sgtable(d, t, v, h, s) dma_get_sgtable_attrs(d, t, v, h, s, NULL)
239 240
241#ifndef arch_dma_alloc_attrs
242#define arch_dma_alloc_attrs(dev, flag) (true)
243#endif
244
245static inline void *dma_alloc_attrs(struct device *dev, size_t size,
246 dma_addr_t *dma_handle, gfp_t flag,
247 struct dma_attrs *attrs)
248{
249 struct dma_map_ops *ops = get_dma_ops(dev);
250 void *cpu_addr;
251
252 BUG_ON(!ops);
253
254 if (dma_alloc_from_coherent(dev, size, dma_handle, &cpu_addr))
255 return cpu_addr;
256
257 if (!arch_dma_alloc_attrs(&dev, &flag))
258 return NULL;
259 if (!ops->alloc)
260 return NULL;
261
262 cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
263 debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
264 return cpu_addr;
265}
266
267static inline void dma_free_attrs(struct device *dev, size_t size,
268 void *cpu_addr, dma_addr_t dma_handle,
269 struct dma_attrs *attrs)
270{
271 struct dma_map_ops *ops = get_dma_ops(dev);
272
273 BUG_ON(!ops);
274 WARN_ON(irqs_disabled());
275
276 if (dma_release_from_coherent(dev, get_order(size), cpu_addr))
277 return;
278
279 if (!ops->free)
280 return;
281
282 debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
283 ops->free(dev, size, cpu_addr, dma_handle, attrs);
284}
285
286static inline void *dma_alloc_coherent(struct device *dev, size_t size,
287 dma_addr_t *dma_handle, gfp_t flag)
288{
289 return dma_alloc_attrs(dev, size, dma_handle, flag, NULL);
290}
291
292static inline void dma_free_coherent(struct device *dev, size_t size,
293 void *cpu_addr, dma_addr_t dma_handle)
294{
295 return dma_free_attrs(dev, size, cpu_addr, dma_handle, NULL);
296}
297
298static inline void *dma_alloc_noncoherent(struct device *dev, size_t size,
299 dma_addr_t *dma_handle, gfp_t gfp)
300{
301 DEFINE_DMA_ATTRS(attrs);
302
303 dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs);
304 return dma_alloc_attrs(dev, size, dma_handle, gfp, &attrs);
305}
306
307static inline void dma_free_noncoherent(struct device *dev, size_t size,
308 void *cpu_addr, dma_addr_t dma_handle)
309{
310 DEFINE_DMA_ATTRS(attrs);
311
312 dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs);
313 dma_free_attrs(dev, size, cpu_addr, dma_handle, &attrs);
314}
315
316static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
317{
318 debug_dma_mapping_error(dev, dma_addr);
319
320 if (get_dma_ops(dev)->mapping_error)
321 return get_dma_ops(dev)->mapping_error(dev, dma_addr);
322
323#ifdef DMA_ERROR_CODE
324 return dma_addr == DMA_ERROR_CODE;
325#else
326 return 0;
327#endif
328}
329
330#ifndef HAVE_ARCH_DMA_SUPPORTED
331static inline int dma_supported(struct device *dev, u64 mask)
332{
333 struct dma_map_ops *ops = get_dma_ops(dev);
334
335 if (!ops)
336 return 0;
337 if (!ops->dma_supported)
338 return 1;
339 return ops->dma_supported(dev, mask);
340}
341#endif
342
343#ifndef HAVE_ARCH_DMA_SET_MASK
344static inline int dma_set_mask(struct device *dev, u64 mask)
345{
346 struct dma_map_ops *ops = get_dma_ops(dev);
347
348 if (ops->set_dma_mask)
349 return ops->set_dma_mask(dev, mask);
350
351 if (!dev->dma_mask || !dma_supported(dev, mask))
352 return -EIO;
353 *dev->dma_mask = mask;
354 return 0;
355}
356#endif
357
240#endif 358#endif
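With dma_alloc_attrs()/dma_free_attrs() consolidated into this common header, the per-device coherent-pool check (dma_alloc_from_coherent / dma_release_from_coherent) happens once in the shared path instead of in every architecture and in xen-swiotlb, as the earlier hunks show. Driver-side usage is unchanged; a typical coherent ring allocation looks like this (struct and size names are illustrative):

#include <linux/dma-mapping.h>

static int foo_alloc_ring(struct device *dev, struct foo_ring *ring)
{
	ring->virt = dma_alloc_coherent(dev, FOO_RING_BYTES,
					&ring->dma, GFP_KERNEL);
	if (!ring->virt)
		return -ENOMEM;
	return 0;
}

static void foo_free_ring(struct device *dev, struct foo_ring *ring)
{
	dma_free_coherent(dev, FOO_RING_BYTES, ring->virt, ring->dma);
}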
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index b63218f68c4b..d140b1e9faa7 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -16,7 +16,7 @@
16 16
17#include <uapi/linux/kexec.h> 17#include <uapi/linux/kexec.h>
18 18
19#ifdef CONFIG_KEXEC 19#ifdef CONFIG_KEXEC_CORE
20#include <linux/list.h> 20#include <linux/list.h>
21#include <linux/linkage.h> 21#include <linux/linkage.h>
22#include <linux/compat.h> 22#include <linux/compat.h>
@@ -318,13 +318,24 @@ int crash_shrink_memory(unsigned long new_size);
318size_t crash_get_memory_size(void); 318size_t crash_get_memory_size(void);
319void crash_free_reserved_phys_range(unsigned long begin, unsigned long end); 319void crash_free_reserved_phys_range(unsigned long begin, unsigned long end);
320 320
321#else /* !CONFIG_KEXEC */ 321int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
322 unsigned long buf_len);
323void * __weak arch_kexec_kernel_image_load(struct kimage *image);
324int __weak arch_kimage_file_post_load_cleanup(struct kimage *image);
325int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
326 unsigned long buf_len);
327int __weak arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr,
328 Elf_Shdr *sechdrs, unsigned int relsec);
329int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
330 unsigned int relsec);
331
332#else /* !CONFIG_KEXEC_CORE */
322struct pt_regs; 333struct pt_regs;
323struct task_struct; 334struct task_struct;
324static inline void crash_kexec(struct pt_regs *regs) { } 335static inline void crash_kexec(struct pt_regs *regs) { }
325static inline int kexec_should_crash(struct task_struct *p) { return 0; } 336static inline int kexec_should_crash(struct task_struct *p) { return 0; }
326#define kexec_in_progress false 337#define kexec_in_progress false
327#endif /* CONFIG_KEXEC */ 338#endif /* CONFIG_KEXEC_CORE */
328 339
329#endif /* !defined(__ASSEBMLY__) */ 340#endif /* !defined(__ASSEBMLY__) */
330 341
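The __weak declarations above are the hooks an architecture overrides when it supports kexec_file_load(). A hedged sketch of the override pattern (the body is a placeholder, not any architecture's real probe):

int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
				  unsigned long buf_len)
{
	/* A real implementation would walk its table of image loaders;
	 * returning -ENOEXEC means this arch cannot load that image. */
	return -ENOEXEC;
}
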
diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index 0555cc66a15b..fcfd2bf14d3f 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -85,8 +85,6 @@ enum umh_disable_depth {
85 UMH_DISABLED, 85 UMH_DISABLED,
86}; 86};
87 87
88extern void usermodehelper_init(void);
89
90extern int __usermodehelper_disable(enum umh_disable_depth depth); 88extern int __usermodehelper_disable(enum umh_disable_depth depth);
91extern void __usermodehelper_set_disable_depth(enum umh_disable_depth depth); 89extern void __usermodehelper_set_disable_depth(enum umh_disable_depth depth);
92 90
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d92b80b63c5c..ad800e62cb7a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -305,11 +305,9 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
305struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); 305struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
306 306
307bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); 307bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
308
309struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
310struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); 308struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
311
312struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); 309struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
310
313static inline 311static inline
314struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ 312struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
315 return css ? container_of(css, struct mem_cgroup, css) : NULL; 313 return css ? container_of(css, struct mem_cgroup, css) : NULL;
@@ -345,6 +343,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
345} 343}
346 344
347struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page); 345struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
346ino_t page_cgroup_ino(struct page *page);
348 347
349static inline bool mem_cgroup_disabled(void) 348static inline bool mem_cgroup_disabled(void)
350{ 349{
@@ -555,11 +554,6 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
555 return &zone->lruvec; 554 return &zone->lruvec;
556} 555}
557 556
558static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
559{
560 return NULL;
561}
562
563static inline bool mm_match_cgroup(struct mm_struct *mm, 557static inline bool mm_match_cgroup(struct mm_struct *mm,
564 struct mem_cgroup *memcg) 558 struct mem_cgroup *memcg)
565{ 559{
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f25a957bf0ab..fda728e3c27d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1873,11 +1873,19 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo
1873 1873
1874extern unsigned long mmap_region(struct file *file, unsigned long addr, 1874extern unsigned long mmap_region(struct file *file, unsigned long addr,
1875 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff); 1875 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff);
1876extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, 1876extern unsigned long do_mmap(struct file *file, unsigned long addr,
1877 unsigned long len, unsigned long prot, unsigned long flags, 1877 unsigned long len, unsigned long prot, unsigned long flags,
1878 unsigned long pgoff, unsigned long *populate); 1878 vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate);
1879extern int do_munmap(struct mm_struct *, unsigned long, size_t); 1879extern int do_munmap(struct mm_struct *, unsigned long, size_t);
1880 1880
1881static inline unsigned long
1882do_mmap_pgoff(struct file *file, unsigned long addr,
1883 unsigned long len, unsigned long prot, unsigned long flags,
1884 unsigned long pgoff, unsigned long *populate)
1885{
1886 return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate);
1887}
1888
1881#ifdef CONFIG_MMU 1889#ifdef CONFIG_MMU
1882extern int __mm_populate(unsigned long addr, unsigned long len, 1890extern int __mm_populate(unsigned long addr, unsigned long len,
1883 int ignore_errors); 1891 int ignore_errors);
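do_mmap() now takes vm_flags directly, while do_mmap_pgoff() stays behind as a thin wrapper passing vm_flags = 0, so existing callers are unchanged. A hedged sketch of an in-kernel caller that needs an extra VM flag (VM_DONTCOPY here is only an example; as with do_mmap_pgoff(), the caller is expected to hold mmap_sem):

static unsigned long example_map(struct file *file, unsigned long len,
				 unsigned long pgoff)
{
	unsigned long populate = 0;

	/* Same request the old interface could make, plus one vm_flags
	 * bit that do_mmap_pgoff() could not express. */
	return do_mmap(file, 0, len, PROT_READ | PROT_WRITE,
		       MAP_SHARED, VM_DONTCOPY, pgoff, &populate);
}
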
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 61cd67f4d788..a1a210d59961 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -66,6 +66,16 @@ struct mmu_notifier_ops {
66 unsigned long end); 66 unsigned long end);
67 67
68 /* 68 /*
69 * clear_young is a lightweight version of clear_flush_young. Like the
70 * latter, it is supposed to test-and-clear the young/accessed bitflag
71 * in the secondary pte, but it may omit flushing the secondary tlb.
72 */
73 int (*clear_young)(struct mmu_notifier *mn,
74 struct mm_struct *mm,
75 unsigned long start,
76 unsigned long end);
77
78 /*
69 * test_young is called to check the young/accessed bitflag in 79 * test_young is called to check the young/accessed bitflag in
70 * the secondary pte. This is used to know if the page is 80 * the secondary pte. This is used to know if the page is
71 * frequently used without actually clearing the flag or tearing 81 * frequently used without actually clearing the flag or tearing
@@ -203,6 +213,9 @@ extern void __mmu_notifier_release(struct mm_struct *mm);
203extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, 213extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
204 unsigned long start, 214 unsigned long start,
205 unsigned long end); 215 unsigned long end);
216extern int __mmu_notifier_clear_young(struct mm_struct *mm,
217 unsigned long start,
218 unsigned long end);
206extern int __mmu_notifier_test_young(struct mm_struct *mm, 219extern int __mmu_notifier_test_young(struct mm_struct *mm,
207 unsigned long address); 220 unsigned long address);
208extern void __mmu_notifier_change_pte(struct mm_struct *mm, 221extern void __mmu_notifier_change_pte(struct mm_struct *mm,
@@ -231,6 +244,15 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
231 return 0; 244 return 0;
232} 245}
233 246
247static inline int mmu_notifier_clear_young(struct mm_struct *mm,
248 unsigned long start,
249 unsigned long end)
250{
251 if (mm_has_notifiers(mm))
252 return __mmu_notifier_clear_young(mm, start, end);
253 return 0;
254}
255
234static inline int mmu_notifier_test_young(struct mm_struct *mm, 256static inline int mmu_notifier_test_young(struct mm_struct *mm,
235 unsigned long address) 257 unsigned long address)
236{ 258{
@@ -311,6 +333,28 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
311 __young; \ 333 __young; \
312}) 334})
313 335
336#define ptep_clear_young_notify(__vma, __address, __ptep) \
337({ \
338 int __young; \
339 struct vm_area_struct *___vma = __vma; \
340 unsigned long ___address = __address; \
341 __young = ptep_test_and_clear_young(___vma, ___address, __ptep);\
342 __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \
343 ___address + PAGE_SIZE); \
344 __young; \
345})
346
347#define pmdp_clear_young_notify(__vma, __address, __pmdp) \
348({ \
349 int __young; \
350 struct vm_area_struct *___vma = __vma; \
351 unsigned long ___address = __address; \
352 __young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\
353 __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \
354 ___address + PMD_SIZE); \
355 __young; \
356})
357
314#define ptep_clear_flush_notify(__vma, __address, __ptep) \ 358#define ptep_clear_flush_notify(__vma, __address, __ptep) \
315({ \ 359({ \
316 unsigned long ___addr = __address & PAGE_MASK; \ 360 unsigned long ___addr = __address & PAGE_MASK; \
@@ -427,6 +471,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
427 471
428#define ptep_clear_flush_young_notify ptep_clear_flush_young 472#define ptep_clear_flush_young_notify ptep_clear_flush_young
429#define pmdp_clear_flush_young_notify pmdp_clear_flush_young 473#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
474#define ptep_clear_young_notify ptep_test_and_clear_young
475#define pmdp_clear_young_notify pmdp_test_and_clear_young
430#define ptep_clear_flush_notify ptep_clear_flush 476#define ptep_clear_flush_notify ptep_clear_flush
431#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush 477#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
432#define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear 478#define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear
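ptep_clear_young_notify() and pmdp_clear_young_notify() age a mapping in the primary page table and, through the new clear_young callback, in any secondary MMU, while skipping the TLB flush that the *_flush_* variants force. A minimal sketch of the intended call pattern (the helper name is illustrative):

static bool example_page_referenced_one(struct vm_area_struct *vma,
					unsigned long addr, pte_t *pte)
{
	/* True if either the CPU or a secondary MMU touched the page
	 * since the accessed bit was last cleared; no TLB flush. */
	return ptep_clear_young_notify(vma, addr, pte);
}
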
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 41c93844fb1d..416509e26d6d 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -109,6 +109,10 @@ enum pageflags {
109#ifdef CONFIG_TRANSPARENT_HUGEPAGE 109#ifdef CONFIG_TRANSPARENT_HUGEPAGE
110 PG_compound_lock, 110 PG_compound_lock,
111#endif 111#endif
112#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
113 PG_young,
114 PG_idle,
115#endif
112 __NR_PAGEFLAGS, 116 __NR_PAGEFLAGS,
113 117
114 /* Filesystems */ 118 /* Filesystems */
@@ -289,6 +293,13 @@ PAGEFLAG_FALSE(HWPoison)
289#define __PG_HWPOISON 0 293#define __PG_HWPOISON 0
290#endif 294#endif
291 295
296#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
297TESTPAGEFLAG(Young, young)
298SETPAGEFLAG(Young, young)
299TESTCLEARFLAG(Young, young)
300PAGEFLAG(Idle, idle)
301#endif
302
292/* 303/*
293 * On an anonymous page mapped into a user virtual memory area, 304 * On an anonymous page mapped into a user virtual memory area,
294 * page->mapping points to its anon_vma, not to a struct address_space; 305 * page->mapping points to its anon_vma, not to a struct address_space;
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index c42981cd99aa..17f118a82854 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -26,6 +26,10 @@ enum page_ext_flags {
26 PAGE_EXT_DEBUG_POISON, /* Page is poisoned */ 26 PAGE_EXT_DEBUG_POISON, /* Page is poisoned */
27 PAGE_EXT_DEBUG_GUARD, 27 PAGE_EXT_DEBUG_GUARD,
28 PAGE_EXT_OWNER, 28 PAGE_EXT_OWNER,
29#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
30 PAGE_EXT_YOUNG,
31 PAGE_EXT_IDLE,
32#endif
29}; 33};
30 34
31/* 35/*
diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
new file mode 100644
index 000000000000..bf268fa92c5b
--- /dev/null
+++ b/include/linux/page_idle.h
@@ -0,0 +1,110 @@
1#ifndef _LINUX_MM_PAGE_IDLE_H
2#define _LINUX_MM_PAGE_IDLE_H
3
4#include <linux/bitops.h>
5#include <linux/page-flags.h>
6#include <linux/page_ext.h>
7
8#ifdef CONFIG_IDLE_PAGE_TRACKING
9
10#ifdef CONFIG_64BIT
11static inline bool page_is_young(struct page *page)
12{
13 return PageYoung(page);
14}
15
16static inline void set_page_young(struct page *page)
17{
18 SetPageYoung(page);
19}
20
21static inline bool test_and_clear_page_young(struct page *page)
22{
23 return TestClearPageYoung(page);
24}
25
26static inline bool page_is_idle(struct page *page)
27{
28 return PageIdle(page);
29}
30
31static inline void set_page_idle(struct page *page)
32{
33 SetPageIdle(page);
34}
35
36static inline void clear_page_idle(struct page *page)
37{
38 ClearPageIdle(page);
39}
40#else /* !CONFIG_64BIT */
41/*
42 * If there is not enough space to store Idle and Young bits in page flags, use
43 * page ext flags instead.
44 */
45extern struct page_ext_operations page_idle_ops;
46
47static inline bool page_is_young(struct page *page)
48{
49 return test_bit(PAGE_EXT_YOUNG, &lookup_page_ext(page)->flags);
50}
51
52static inline void set_page_young(struct page *page)
53{
54 set_bit(PAGE_EXT_YOUNG, &lookup_page_ext(page)->flags);
55}
56
57static inline bool test_and_clear_page_young(struct page *page)
58{
59 return test_and_clear_bit(PAGE_EXT_YOUNG,
60 &lookup_page_ext(page)->flags);
61}
62
63static inline bool page_is_idle(struct page *page)
64{
65 return test_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
66}
67
68static inline void set_page_idle(struct page *page)
69{
70 set_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
71}
72
73static inline void clear_page_idle(struct page *page)
74{
75 clear_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
76}
77#endif /* CONFIG_64BIT */
78
79#else /* !CONFIG_IDLE_PAGE_TRACKING */
80
81static inline bool page_is_young(struct page *page)
82{
83 return false;
84}
85
86static inline void set_page_young(struct page *page)
87{
88}
89
90static inline bool test_and_clear_page_young(struct page *page)
91{
92 return false;
93}
94
95static inline bool page_is_idle(struct page *page)
96{
97 return false;
98}
99
100static inline void set_page_idle(struct page *page)
101{
102}
103
104static inline void clear_page_idle(struct page *page)
105{
106}
107
108#endif /* CONFIG_IDLE_PAGE_TRACKING */
109
110#endif /* _LINUX_MM_PAGE_IDLE_H */
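The accessors above hide where the Idle and Young bits live (page flags on 64-bit, page_ext otherwise). A hedged sketch of the mark-then-test cycle the idle page tracking interface is built on (the surrounding function is illustrative):

static void example_idle_cycle(struct page *page)
{
	set_page_idle(page);		/* arm the idle bit */

	/* ... let the workload run for a while ... */

	if (page_is_young(page) || !page_is_idle(page))
		pr_info("page was referenced while marked idle\n");

	clear_page_idle(page);
}
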
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 2110a81c5e2a..317e16de09e5 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -19,8 +19,8 @@
19 * under normal circumstances, used to verify that nobody uses 19 * under normal circumstances, used to verify that nobody uses
20 * non-initialized list entries. 20 * non-initialized list entries.
21 */ 21 */
22#define LIST_POISON1 ((void *) 0x00100100 + POISON_POINTER_DELTA) 22#define LIST_POISON1 ((void *) 0x100 + POISON_POINTER_DELTA)
23#define LIST_POISON2 ((void *) 0x00200200 + POISON_POINTER_DELTA) 23#define LIST_POISON2 ((void *) 0x200 + POISON_POINTER_DELTA)
24 24
25/********** include/linux/timer.h **********/ 25/********** include/linux/timer.h **********/
26/* 26/*
@@ -69,10 +69,6 @@
69#define ATM_POISON_FREE 0x12 69#define ATM_POISON_FREE 0x12
70#define ATM_POISON 0xdeadbeef 70#define ATM_POISON 0xdeadbeef
71 71
72/********** net/ **********/
73#define NEIGHBOR_DEAD 0xdeadbeef
74#define NETFILTER_LINK_POISON 0xdead57ac
75
76/********** kernel/mutexes **********/ 72/********** kernel/mutexes **********/
77#define MUTEX_DEBUG_INIT 0x11 73#define MUTEX_DEBUG_INIT 0x11
78#define MUTEX_DEBUG_FREE 0x22 74#define MUTEX_DEBUG_FREE 0x22
@@ -83,7 +79,4 @@
83/********** security/ **********/ 79/********** security/ **********/
84#define KEY_DESTROY 0xbd 80#define KEY_DESTROY 0xbd
85 81
86/********** sound/oss/ **********/
87#define OSS_POISON_FREE 0xAB
88
89#endif 82#endif
diff --git a/include/linux/printk.h b/include/linux/printk.h
index a6298b27ac99..9729565c25ff 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -404,10 +404,10 @@ do { \
404 static DEFINE_RATELIMIT_STATE(_rs, \ 404 static DEFINE_RATELIMIT_STATE(_rs, \
405 DEFAULT_RATELIMIT_INTERVAL, \ 405 DEFAULT_RATELIMIT_INTERVAL, \
406 DEFAULT_RATELIMIT_BURST); \ 406 DEFAULT_RATELIMIT_BURST); \
407 DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ 407 DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, pr_fmt(fmt)); \
408 if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT) && \ 408 if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT) && \
409 __ratelimit(&_rs)) \ 409 __ratelimit(&_rs)) \
410 __dynamic_pr_debug(&descriptor, fmt, ##__VA_ARGS__); \ 410 __dynamic_pr_debug(&descriptor, pr_fmt(fmt), ##__VA_ARGS__); \
411} while (0) 411} while (0)
412#elif defined(DEBUG) 412#elif defined(DEBUG)
413#define pr_debug_ratelimited(fmt, ...) \ 413#define pr_debug_ratelimited(fmt, ...) \
@@ -456,11 +456,17 @@ static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type,
456 groupsize, buf, len, ascii) \ 456 groupsize, buf, len, ascii) \
457 dynamic_hex_dump(prefix_str, prefix_type, rowsize, \ 457 dynamic_hex_dump(prefix_str, prefix_type, rowsize, \
458 groupsize, buf, len, ascii) 458 groupsize, buf, len, ascii)
459#else 459#elif defined(DEBUG)
460#define print_hex_dump_debug(prefix_str, prefix_type, rowsize, \ 460#define print_hex_dump_debug(prefix_str, prefix_type, rowsize, \
461 groupsize, buf, len, ascii) \ 461 groupsize, buf, len, ascii) \
462 print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, rowsize, \ 462 print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, rowsize, \
463 groupsize, buf, len, ascii) 463 groupsize, buf, len, ascii)
464#endif /* defined(CONFIG_DYNAMIC_DEBUG) */ 464#else
465static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type,
466 int rowsize, int groupsize,
467 const void *buf, size_t len, bool ascii)
468{
469}
470#endif
465 471
466#endif 472#endif
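Passing pr_fmt(fmt) into DEFINE_DYNAMIC_DEBUG_METADATA means the stored format string now carries the module's prefix, so it should also appear in the dynamic debug control listing. A short sketch of the usual pattern (prefix and message are illustrative):

/* Defined before including kernel headers in the .c file, as usual. */
#define pr_fmt(fmt) "mydrv: " fmt
#include <linux/printk.h>

static void example_warn_about_drop(int seq)
{
	pr_debug_ratelimited("dropping frame %d\n", seq);
}
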
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index d4c7271382cb..adeadbd6d7bf 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -122,6 +122,10 @@ int seq_write(struct seq_file *seq, const void *data, size_t len);
122__printf(2, 3) int seq_printf(struct seq_file *, const char *, ...); 122__printf(2, 3) int seq_printf(struct seq_file *, const char *, ...);
123__printf(2, 0) int seq_vprintf(struct seq_file *, const char *, va_list args); 123__printf(2, 0) int seq_vprintf(struct seq_file *, const char *, va_list args);
124 124
125void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
126 int rowsize, int groupsize, const void *buf, size_t len,
127 bool ascii);
128
125int seq_path(struct seq_file *, const struct path *, const char *); 129int seq_path(struct seq_file *, const struct path *, const char *);
126int seq_file_path(struct seq_file *, struct file *, const char *); 130int seq_file_path(struct seq_file *, struct file *, const char *);
127int seq_dentry(struct seq_file *, struct dentry *, const char *); 131int seq_dentry(struct seq_file *, struct dentry *, const char *);
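seq_hex_dump() gives seq_file users a hex dump helper with the same prefix/rowsize/groupsize semantics as print_hex_dump(). A short usage sketch for a show() method (the buffer contents are illustrative):

static int example_show(struct seq_file *m, void *v)
{
	static const u8 buf[32] = { 0xde, 0xad, 0xbe, 0xef };

	seq_hex_dump(m, "ex: ", DUMP_PREFIX_OFFSET, 16, 1,
		     buf, sizeof(buf), true);
	return 0;
}
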
diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index 71f711db4500..dabe643eb5fa 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -48,24 +48,24 @@ static inline int string_unescape_any_inplace(char *buf)
48#define ESCAPE_HEX 0x20 48#define ESCAPE_HEX 0x20
49 49
50int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz, 50int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
51 unsigned int flags, const char *esc); 51 unsigned int flags, const char *only);
52 52
53static inline int string_escape_mem_any_np(const char *src, size_t isz, 53static inline int string_escape_mem_any_np(const char *src, size_t isz,
54 char *dst, size_t osz, const char *esc) 54 char *dst, size_t osz, const char *only)
55{ 55{
56 return string_escape_mem(src, isz, dst, osz, ESCAPE_ANY_NP, esc); 56 return string_escape_mem(src, isz, dst, osz, ESCAPE_ANY_NP, only);
57} 57}
58 58
59static inline int string_escape_str(const char *src, char *dst, size_t sz, 59static inline int string_escape_str(const char *src, char *dst, size_t sz,
60 unsigned int flags, const char *esc) 60 unsigned int flags, const char *only)
61{ 61{
62 return string_escape_mem(src, strlen(src), dst, sz, flags, esc); 62 return string_escape_mem(src, strlen(src), dst, sz, flags, only);
63} 63}
64 64
65static inline int string_escape_str_any_np(const char *src, char *dst, 65static inline int string_escape_str_any_np(const char *src, char *dst,
66 size_t sz, const char *esc) 66 size_t sz, const char *only)
67{ 67{
68 return string_escape_str(src, dst, sz, ESCAPE_ANY_NP, esc); 68 return string_escape_str(src, dst, sz, ESCAPE_ANY_NP, only);
69} 69}
70 70
71#endif 71#endif
diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index c924a28d9805..42f8ec992452 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -36,6 +36,8 @@ enum zpool_mapmode {
36 ZPOOL_MM_DEFAULT = ZPOOL_MM_RW 36 ZPOOL_MM_DEFAULT = ZPOOL_MM_RW
37}; 37};
38 38
39bool zpool_has_pool(char *type);
40
39struct zpool *zpool_create_pool(char *type, char *name, 41struct zpool *zpool_create_pool(char *type, char *name,
40 gfp_t gfp, const struct zpool_ops *ops); 42 gfp_t gfp, const struct zpool_ops *ops);
41 43
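zpool_has_pool() lets a user such as zswap confirm that a compressed-memory backend can actually be created before committing to it. A minimal sketch (the pool type name is only an example):

static bool example_backend_available(void)
{
	/* Checks that the backend driver can be found or loaded without
	 * allocating a pool. */
	return zpool_has_pool("zbud");
}
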
diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h
index a6c4962e5d46..5da5f8751ce7 100644
--- a/include/uapi/linux/kernel-page-flags.h
+++ b/include/uapi/linux/kernel-page-flags.h
@@ -33,6 +33,7 @@
33#define KPF_THP 22 33#define KPF_THP 22
34#define KPF_BALLOON 23 34#define KPF_BALLOON 23
35#define KPF_ZERO_PAGE 24 35#define KPF_ZERO_PAGE 24
36#define KPF_IDLE 25
36 37
37 38
38#endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */ 39#endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */
diff --git a/init/initramfs.c b/init/initramfs.c
index ad1bd7787bbb..b32ad7d97ac9 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -526,14 +526,14 @@ extern unsigned long __initramfs_size;
526 526
527static void __init free_initrd(void) 527static void __init free_initrd(void)
528{ 528{
529#ifdef CONFIG_KEXEC 529#ifdef CONFIG_KEXEC_CORE
530 unsigned long crashk_start = (unsigned long)__va(crashk_res.start); 530 unsigned long crashk_start = (unsigned long)__va(crashk_res.start);
531 unsigned long crashk_end = (unsigned long)__va(crashk_res.end); 531 unsigned long crashk_end = (unsigned long)__va(crashk_res.end);
532#endif 532#endif
533 if (do_retain_initrd) 533 if (do_retain_initrd)
534 goto skip; 534 goto skip;
535 535
536#ifdef CONFIG_KEXEC 536#ifdef CONFIG_KEXEC_CORE
537 /* 537 /*
538 * If the initrd region is overlapped with crashkernel reserved region, 538 * If the initrd region is overlapped with crashkernel reserved region,
539 * free only memory that is not part of crashkernel region. 539 * free only memory that is not part of crashkernel region.
diff --git a/init/main.c b/init/main.c
index 56506553d4d8..9e64d7097f1a 100644
--- a/init/main.c
+++ b/init/main.c
@@ -877,7 +877,6 @@ static void __init do_initcalls(void)
877static void __init do_basic_setup(void) 877static void __init do_basic_setup(void)
878{ 878{
879 cpuset_init_smp(); 879 cpuset_init_smp();
880 usermodehelper_init();
881 shmem_init(); 880 shmem_init();
882 driver_init(); 881 driver_init();
883 init_irq_proc(); 882 init_irq_proc();
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
index 2b491590ebab..71f448e5e927 100644
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -123,7 +123,7 @@ struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst)
123 size_t len = src->m_ts; 123 size_t len = src->m_ts;
124 size_t alen; 124 size_t alen;
125 125
126 BUG_ON(dst == NULL); 126 WARN_ON(dst == NULL);
127 if (src->m_ts > dst->m_ts) 127 if (src->m_ts > dst->m_ts)
128 return ERR_PTR(-EINVAL); 128 return ERR_PTR(-EINVAL);
129 129
diff --git a/ipc/shm.c b/ipc/shm.c
index 4aef24d91b63..222131e8e38f 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -159,7 +159,7 @@ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
159 * We raced in the idr lookup or with shm_destroy(). Either way, the 159 * We raced in the idr lookup or with shm_destroy(). Either way, the
160 * ID is busted. 160 * ID is busted.
161 */ 161 */
162 BUG_ON(IS_ERR(ipcp)); 162 WARN_ON(IS_ERR(ipcp));
163 163
164 return container_of(ipcp, struct shmid_kernel, shm_perm); 164 return container_of(ipcp, struct shmid_kernel, shm_perm);
165} 165}
@@ -393,7 +393,7 @@ static int shm_mmap(struct file *file, struct vm_area_struct *vma)
393 return ret; 393 return ret;
394 sfd->vm_ops = vma->vm_ops; 394 sfd->vm_ops = vma->vm_ops;
395#ifdef CONFIG_MMU 395#ifdef CONFIG_MMU
396 BUG_ON(!sfd->vm_ops->fault); 396 WARN_ON(!sfd->vm_ops->fault);
397#endif 397#endif
398 vma->vm_ops = &shm_vm_ops; 398 vma->vm_ops = &shm_vm_ops;
399 shm_open(vma); 399 shm_open(vma);
diff --git a/kernel/Makefile b/kernel/Makefile
index e0d7587e7684..d4988410b410 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -49,7 +49,9 @@ obj-$(CONFIG_MODULES) += module.o
49obj-$(CONFIG_MODULE_SIG) += module_signing.o 49obj-$(CONFIG_MODULE_SIG) += module_signing.o
50obj-$(CONFIG_KALLSYMS) += kallsyms.o 50obj-$(CONFIG_KALLSYMS) += kallsyms.o
51obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 51obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
52obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
52obj-$(CONFIG_KEXEC) += kexec.o 53obj-$(CONFIG_KEXEC) += kexec.o
54obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
53obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o 55obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
54obj-$(CONFIG_COMPAT) += compat.o 56obj-$(CONFIG_COMPAT) += compat.o
55obj-$(CONFIG_CGROUPS) += cgroup.o 57obj-$(CONFIG_CGROUPS) += cgroup.o
diff --git a/kernel/cred.c b/kernel/cred.c
index ec1c07667ec1..71179a09c1d6 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -20,11 +20,16 @@
20#include <linux/cn_proc.h> 20#include <linux/cn_proc.h>
21 21
22#if 0 22#if 0
23#define kdebug(FMT, ...) \ 23#define kdebug(FMT, ...) \
24 printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) 24 printk("[%-5.5s%5u] " FMT "\n", \
25 current->comm, current->pid, ##__VA_ARGS__)
25#else 26#else
26#define kdebug(FMT, ...) \ 27#define kdebug(FMT, ...) \
27 no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) 28do { \
29 if (0) \
30 no_printk("[%-5.5s%5u] " FMT "\n", \
31 current->comm, current->pid, ##__VA_ARGS__); \
32} while (0)
28#endif 33#endif
29 34
30static struct kmem_cache *cred_jar; 35static struct kmem_cache *cred_jar;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e8183895691c..f548f69c4299 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9094,7 +9094,7 @@ static void perf_event_init_cpu(int cpu)
9094 mutex_unlock(&swhash->hlist_mutex); 9094 mutex_unlock(&swhash->hlist_mutex);
9095} 9095}
9096 9096
9097#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC 9097#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
9098static void __perf_event_exit_context(void *__info) 9098static void __perf_event_exit_context(void *__info)
9099{ 9099{
9100 struct remove_event re = { .detach_group = true }; 9100 struct remove_event re = { .detach_group = true };
diff --git a/kernel/extable.c b/kernel/extable.c
index c98f926277a8..e820ccee9846 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -18,7 +18,6 @@
18#include <linux/ftrace.h> 18#include <linux/ftrace.h>
19#include <linux/memory.h> 19#include <linux/memory.h>
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/ftrace.h>
22#include <linux/mutex.h> 21#include <linux/mutex.h>
23#include <linux/init.h> 22#include <linux/init.h>
24 23
diff --git a/kernel/kexec.c b/kernel/kexec.c
index a785c1015e25..4c5edc357923 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1,156 +1,22 @@
1/* 1/*
2 * kexec.c - kexec system call 2 * kexec.c - kexec_load system call
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> 3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
4 * 4 *
5 * This source code is licensed under the GNU General Public License, 5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details. 6 * Version 2. See the file COPYING for more details.
7 */ 7 */
8 8
9#define pr_fmt(fmt) "kexec: " fmt
10
11#include <linux/capability.h> 9#include <linux/capability.h>
12#include <linux/mm.h> 10#include <linux/mm.h>
13#include <linux/file.h> 11#include <linux/file.h>
14#include <linux/slab.h>
15#include <linux/fs.h>
16#include <linux/kexec.h> 12#include <linux/kexec.h>
17#include <linux/mutex.h> 13#include <linux/mutex.h>
18#include <linux/list.h> 14#include <linux/list.h>
19#include <linux/highmem.h>
20#include <linux/syscalls.h> 15#include <linux/syscalls.h>
21#include <linux/reboot.h>
22#include <linux/ioport.h>
23#include <linux/hardirq.h>
24#include <linux/elf.h>
25#include <linux/elfcore.h>
26#include <linux/utsname.h>
27#include <linux/numa.h>
28#include <linux/suspend.h>
29#include <linux/device.h>
30#include <linux/freezer.h>
31#include <linux/pm.h>
32#include <linux/cpu.h>
33#include <linux/console.h>
34#include <linux/vmalloc.h> 16#include <linux/vmalloc.h>
35#include <linux/swap.h> 17#include <linux/slab.h>
36#include <linux/syscore_ops.h>
37#include <linux/compiler.h>
38#include <linux/hugetlb.h>
39
40#include <asm/page.h>
41#include <asm/uaccess.h>
42#include <asm/io.h>
43#include <asm/sections.h>
44
45#include <crypto/hash.h>
46#include <crypto/sha.h>
47
48/* Per cpu memory for storing cpu states in case of system crash. */
49note_buf_t __percpu *crash_notes;
50
51/* vmcoreinfo stuff */
52static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
53u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
54size_t vmcoreinfo_size;
55size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
56
57/* Flag to indicate we are going to kexec a new kernel */
58bool kexec_in_progress = false;
59
60/*
61 * Declare these symbols weak so that if architecture provides a purgatory,
62 * these will be overridden.
63 */
64char __weak kexec_purgatory[0];
65size_t __weak kexec_purgatory_size = 0;
66
67#ifdef CONFIG_KEXEC_FILE
68static int kexec_calculate_store_digests(struct kimage *image);
69#endif
70
71/* Location of the reserved area for the crash kernel */
72struct resource crashk_res = {
73 .name = "Crash kernel",
74 .start = 0,
75 .end = 0,
76 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
77};
78struct resource crashk_low_res = {
79 .name = "Crash kernel",
80 .start = 0,
81 .end = 0,
82 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
83};
84
85int kexec_should_crash(struct task_struct *p)
86{
87 /*
88 * If crash_kexec_post_notifiers is enabled, don't run
89 * crash_kexec() here yet, which must be run after panic
90 * notifiers in panic().
91 */
92 if (crash_kexec_post_notifiers)
93 return 0;
94 /*
95 * There are 4 panic() calls in do_exit() path, each of which
96 * corresponds to each of these 4 conditions.
97 */
98 if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
99 return 1;
100 return 0;
101}
102
103/*
104 * When kexec transitions to the new kernel there is a one-to-one
105 * mapping between physical and virtual addresses. On processors
106 * where you can disable the MMU this is trivial, and easy. For
107 * others it is still a simple predictable page table to setup.
108 *
109 * In that environment kexec copies the new kernel to its final
110 * resting place. This means I can only support memory whose
111 * physical address can fit in an unsigned long. In particular
112 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
113 * If the assembly stub has more restrictive requirements
114 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
115 * defined more restrictively in <asm/kexec.h>.
116 *
117 * The code for the transition from the current kernel to the
118 * the new kernel is placed in the control_code_buffer, whose size
119 * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
120 * page of memory is necessary, but some architectures require more.
121 * Because this memory must be identity mapped in the transition from
122 * virtual to physical addresses it must live in the range
123 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
124 * modifiable.
125 *
126 * The assembly stub in the control code buffer is passed a linked list
127 * of descriptor pages detailing the source pages of the new kernel,
128 * and the destination addresses of those source pages. As this data
129 * structure is not used in the context of the current OS, it must
130 * be self-contained.
131 *
132 * The code has been made to work with highmem pages and will use a
133 * destination page in its final resting place (if it happens
134 * to allocate it). The end product of this is that most of the
135 * physical address space, and most of RAM can be used.
136 *
137 * Future directions include:
138 * - allocating a page table with the control code buffer identity
139 * mapped, to simplify machine_kexec and make kexec_on_panic more
140 * reliable.
141 */
142
143/*
144 * KIMAGE_NO_DEST is an impossible destination address..., for
145 * allocating pages whose destination address we do not care about.
146 */
147#define KIMAGE_NO_DEST (-1UL)
148 18
149static int kimage_is_destination_range(struct kimage *image, 19#include "kexec_internal.h"
150 unsigned long start, unsigned long end);
151static struct page *kimage_alloc_page(struct kimage *image,
152 gfp_t gfp_mask,
153 unsigned long dest);
154 20
155static int copy_user_segment_list(struct kimage *image, 21static int copy_user_segment_list(struct kimage *image,
156 unsigned long nr_segments, 22 unsigned long nr_segments,
@@ -169,125 +35,6 @@ static int copy_user_segment_list(struct kimage *image,
169 return ret; 35 return ret;
170} 36}
171 37
172static int sanity_check_segment_list(struct kimage *image)
173{
174 int result, i;
175 unsigned long nr_segments = image->nr_segments;
176
177 /*
178 * Verify we have good destination addresses. The caller is
179 * responsible for making certain we don't attempt to load
180 * the new image into invalid or reserved areas of RAM. This
181 * just verifies it is an address we can use.
182 *
183 * Since the kernel does everything in page size chunks ensure
184 * the destination addresses are page aligned. Too many
185 * special cases crop of when we don't do this. The most
186 * insidious is getting overlapping destination addresses
187 * simply because addresses are changed to page size
188 * granularity.
189 */
190 result = -EADDRNOTAVAIL;
191 for (i = 0; i < nr_segments; i++) {
192 unsigned long mstart, mend;
193
194 mstart = image->segment[i].mem;
195 mend = mstart + image->segment[i].memsz;
196 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
197 return result;
198 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
199 return result;
200 }
201
202 /* Verify our destination addresses do not overlap.
203 * If we alloed overlapping destination addresses
204 * through very weird things can happen with no
205 * easy explanation as one segment stops on another.
206 */
207 result = -EINVAL;
208 for (i = 0; i < nr_segments; i++) {
209 unsigned long mstart, mend;
210 unsigned long j;
211
212 mstart = image->segment[i].mem;
213 mend = mstart + image->segment[i].memsz;
214 for (j = 0; j < i; j++) {
215 unsigned long pstart, pend;
216 pstart = image->segment[j].mem;
217 pend = pstart + image->segment[j].memsz;
218 /* Do the segments overlap ? */
219 if ((mend > pstart) && (mstart < pend))
220 return result;
221 }
222 }
223
224 /* Ensure our buffer sizes are strictly less than
225 * our memory sizes. This should always be the case,
226 * and it is easier to check up front than to be surprised
227 * later on.
228 */
229 result = -EINVAL;
230 for (i = 0; i < nr_segments; i++) {
231 if (image->segment[i].bufsz > image->segment[i].memsz)
232 return result;
233 }
234
235 /*
236 * Verify we have good destination addresses. Normally
237 * the caller is responsible for making certain we don't
238 * attempt to load the new image into invalid or reserved
239 * areas of RAM. But crash kernels are preloaded into a
240 * reserved area of ram. We must ensure the addresses
241 * are in the reserved area otherwise preloading the
242 * kernel could corrupt things.
243 */
244
245 if (image->type == KEXEC_TYPE_CRASH) {
246 result = -EADDRNOTAVAIL;
247 for (i = 0; i < nr_segments; i++) {
248 unsigned long mstart, mend;
249
250 mstart = image->segment[i].mem;
251 mend = mstart + image->segment[i].memsz - 1;
252 /* Ensure we are within the crash kernel limits */
253 if ((mstart < crashk_res.start) ||
254 (mend > crashk_res.end))
255 return result;
256 }
257 }
258
259 return 0;
260}
261
262static struct kimage *do_kimage_alloc_init(void)
263{
264 struct kimage *image;
265
266 /* Allocate a controlling structure */
267 image = kzalloc(sizeof(*image), GFP_KERNEL);
268 if (!image)
269 return NULL;
270
271 image->head = 0;
272 image->entry = &image->head;
273 image->last_entry = &image->head;
274 image->control_page = ~0; /* By default this does not apply */
275 image->type = KEXEC_TYPE_DEFAULT;
276
277 /* Initialize the list of control pages */
278 INIT_LIST_HEAD(&image->control_pages);
279
280 /* Initialize the list of destination pages */
281 INIT_LIST_HEAD(&image->dest_pages);
282
283 /* Initialize the list of unusable pages */
284 INIT_LIST_HEAD(&image->unusable_pages);
285
286 return image;
287}
288
289static void kimage_free_page_list(struct list_head *list);
290
291static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, 38static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
292 unsigned long nr_segments, 39 unsigned long nr_segments,
293 struct kexec_segment __user *segments, 40 struct kexec_segment __user *segments,
@@ -354,873 +101,6 @@ out_free_image:
354 return ret; 101 return ret;
355} 102}
356 103
357#ifdef CONFIG_KEXEC_FILE
358static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
359{
360 struct fd f = fdget(fd);
361 int ret;
362 struct kstat stat;
363 loff_t pos;
364 ssize_t bytes = 0;
365
366 if (!f.file)
367 return -EBADF;
368
369 ret = vfs_getattr(&f.file->f_path, &stat);
370 if (ret)
371 goto out;
372
373 if (stat.size > INT_MAX) {
374 ret = -EFBIG;
375 goto out;
376 }
377
378 /* Don't hand 0 to vmalloc, it whines. */
379 if (stat.size == 0) {
380 ret = -EINVAL;
381 goto out;
382 }
383
384 *buf = vmalloc(stat.size);
385 if (!*buf) {
386 ret = -ENOMEM;
387 goto out;
388 }
389
390 pos = 0;
391 while (pos < stat.size) {
392 bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
393 stat.size - pos);
394 if (bytes < 0) {
395 vfree(*buf);
396 ret = bytes;
397 goto out;
398 }
399
400 if (bytes == 0)
401 break;
402 pos += bytes;
403 }
404
405 if (pos != stat.size) {
406 ret = -EBADF;
407 vfree(*buf);
408 goto out;
409 }
410
411 *buf_len = pos;
412out:
413 fdput(f);
414 return ret;
415}
416
417/* Architectures can provide this probe function */
418int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
419 unsigned long buf_len)
420{
421 return -ENOEXEC;
422}
423
424void * __weak arch_kexec_kernel_image_load(struct kimage *image)
425{
426 return ERR_PTR(-ENOEXEC);
427}
428
429void __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
430{
431}
432
433int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
434 unsigned long buf_len)
435{
436 return -EKEYREJECTED;
437}
438
439/* Apply relocations of type RELA */
440int __weak
441arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
442 unsigned int relsec)
443{
444 pr_err("RELA relocation unsupported.\n");
445 return -ENOEXEC;
446}
447
448/* Apply relocations of type REL */
449int __weak
450arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
451 unsigned int relsec)
452{
453 pr_err("REL relocation unsupported.\n");
454 return -ENOEXEC;
455}
456
457/*
458 * Free up memory used by kernel, initrd, and command line. This is temporary
459 * memory allocation which is not needed any more after these buffers have
460 * been loaded into separate segments and have been copied elsewhere.
461 */
462static void kimage_file_post_load_cleanup(struct kimage *image)
463{
464 struct purgatory_info *pi = &image->purgatory_info;
465
466 vfree(image->kernel_buf);
467 image->kernel_buf = NULL;
468
469 vfree(image->initrd_buf);
470 image->initrd_buf = NULL;
471
472 kfree(image->cmdline_buf);
473 image->cmdline_buf = NULL;
474
475 vfree(pi->purgatory_buf);
476 pi->purgatory_buf = NULL;
477
478 vfree(pi->sechdrs);
479 pi->sechdrs = NULL;
480
481 /* See if architecture has anything to cleanup post load */
482 arch_kimage_file_post_load_cleanup(image);
483
484 /*
485 * Above call should have called into bootloader to free up
486 * any data stored in kimage->image_loader_data. It should
487 * be ok now to free it up.
488 */
489 kfree(image->image_loader_data);
490 image->image_loader_data = NULL;
491}
492
493/*
494 * In file mode list of segments is prepared by kernel. Copy relevant
495 * data from user space, do error checking, prepare segment list
496 */
497static int
498kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
499 const char __user *cmdline_ptr,
500 unsigned long cmdline_len, unsigned flags)
501{
502 int ret = 0;
503 void *ldata;
504
505 ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
506 &image->kernel_buf_len);
507 if (ret)
508 return ret;
509
510 /* Call arch image probe handlers */
511 ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
512 image->kernel_buf_len);
513
514 if (ret)
515 goto out;
516
517#ifdef CONFIG_KEXEC_VERIFY_SIG
518 ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
519 image->kernel_buf_len);
520 if (ret) {
521 pr_debug("kernel signature verification failed.\n");
522 goto out;
523 }
524 pr_debug("kernel signature verification successful.\n");
525#endif
526 /* It is possible that there no initramfs is being loaded */
527 if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
528 ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
529 &image->initrd_buf_len);
530 if (ret)
531 goto out;
532 }
533
534 if (cmdline_len) {
535 image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
536 if (!image->cmdline_buf) {
537 ret = -ENOMEM;
538 goto out;
539 }
540
541 ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
542 cmdline_len);
543 if (ret) {
544 ret = -EFAULT;
545 goto out;
546 }
547
548 image->cmdline_buf_len = cmdline_len;
549
550 /* command line should be a string with last byte null */
551 if (image->cmdline_buf[cmdline_len - 1] != '\0') {
552 ret = -EINVAL;
553 goto out;
554 }
555 }
556
557 /* Call arch image load handlers */
558 ldata = arch_kexec_kernel_image_load(image);
559
560 if (IS_ERR(ldata)) {
561 ret = PTR_ERR(ldata);
562 goto out;
563 }
564
565 image->image_loader_data = ldata;
566out:
567 /* In case of error, free up all allocated memory in this function */
568 if (ret)
569 kimage_file_post_load_cleanup(image);
570 return ret;
571}
572
573static int
574kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
575 int initrd_fd, const char __user *cmdline_ptr,
576 unsigned long cmdline_len, unsigned long flags)
577{
578 int ret;
579 struct kimage *image;
580 bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
581
582 image = do_kimage_alloc_init();
583 if (!image)
584 return -ENOMEM;
585
586 image->file_mode = 1;
587
588 if (kexec_on_panic) {
589 /* Enable special crash kernel control page alloc policy. */
590 image->control_page = crashk_res.start;
591 image->type = KEXEC_TYPE_CRASH;
592 }
593
594 ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
595 cmdline_ptr, cmdline_len, flags);
596 if (ret)
597 goto out_free_image;
598
599 ret = sanity_check_segment_list(image);
600 if (ret)
601 goto out_free_post_load_bufs;
602
603 ret = -ENOMEM;
604 image->control_code_page = kimage_alloc_control_pages(image,
605 get_order(KEXEC_CONTROL_PAGE_SIZE));
606 if (!image->control_code_page) {
607 pr_err("Could not allocate control_code_buffer\n");
608 goto out_free_post_load_bufs;
609 }
610
611 if (!kexec_on_panic) {
612 image->swap_page = kimage_alloc_control_pages(image, 0);
613 if (!image->swap_page) {
614 pr_err("Could not allocate swap buffer\n");
615 goto out_free_control_pages;
616 }
617 }
618
619 *rimage = image;
620 return 0;
621out_free_control_pages:
622 kimage_free_page_list(&image->control_pages);
623out_free_post_load_bufs:
624 kimage_file_post_load_cleanup(image);
625out_free_image:
626 kfree(image);
627 return ret;
628}
629#else /* CONFIG_KEXEC_FILE */
630static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
631#endif /* CONFIG_KEXEC_FILE */
632
633static int kimage_is_destination_range(struct kimage *image,
634 unsigned long start,
635 unsigned long end)
636{
637 unsigned long i;
638
639 for (i = 0; i < image->nr_segments; i++) {
640 unsigned long mstart, mend;
641
642 mstart = image->segment[i].mem;
643 mend = mstart + image->segment[i].memsz;
644 if ((end > mstart) && (start < mend))
645 return 1;
646 }
647
648 return 0;
649}
650
651static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
652{
653 struct page *pages;
654
655 pages = alloc_pages(gfp_mask, order);
656 if (pages) {
657 unsigned int count, i;
658 pages->mapping = NULL;
659 set_page_private(pages, order);
660 count = 1 << order;
661 for (i = 0; i < count; i++)
662 SetPageReserved(pages + i);
663 }
664
665 return pages;
666}
667
668static void kimage_free_pages(struct page *page)
669{
670 unsigned int order, count, i;
671
672 order = page_private(page);
673 count = 1 << order;
674 for (i = 0; i < count; i++)
675 ClearPageReserved(page + i);
676 __free_pages(page, order);
677}
678
679static void kimage_free_page_list(struct list_head *list)
680{
681 struct list_head *pos, *next;
682
683 list_for_each_safe(pos, next, list) {
684 struct page *page;
685
686 page = list_entry(pos, struct page, lru);
687 list_del(&page->lru);
688 kimage_free_pages(page);
689 }
690}
691
692static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
693 unsigned int order)
694{
695 /* Control pages are special, they are the intermediaries
696 * that are needed while we copy the rest of the pages
697 * to their final resting place. As such they must
698 * not conflict with either the destination addresses
699 * or memory the kernel is already using.
700 *
701 * The only case where we really need more than one of
702 * these are for architectures where we cannot disable
703 * the MMU and must instead generate an identity mapped
704 * page table for all of the memory.
705 *
706 * At worst this runs in O(N) of the image size.
707 */
708 struct list_head extra_pages;
709 struct page *pages;
710 unsigned int count;
711
712 count = 1 << order;
713 INIT_LIST_HEAD(&extra_pages);
714
715 /* Loop while I can allocate a page and the page allocated
716 * is a destination page.
717 */
718 do {
719 unsigned long pfn, epfn, addr, eaddr;
720
721 pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
722 if (!pages)
723 break;
724 pfn = page_to_pfn(pages);
725 epfn = pfn + count;
726 addr = pfn << PAGE_SHIFT;
727 eaddr = epfn << PAGE_SHIFT;
728 if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
729 kimage_is_destination_range(image, addr, eaddr)) {
730 list_add(&pages->lru, &extra_pages);
731 pages = NULL;
732 }
733 } while (!pages);
734
735 if (pages) {
736 /* Remember the allocated page... */
737 list_add(&pages->lru, &image->control_pages);
738
739 /* Because the page is already in it's destination
740 * location we will never allocate another page at
741 * that address. Therefore kimage_alloc_pages
742 * will not return it (again) and we don't need
743 * to give it an entry in image->segment[].
744 */
745 }
746 /* Deal with the destination pages I have inadvertently allocated.
747 *
748 * Ideally I would convert multi-page allocations into single
749 * page allocations, and add everything to image->dest_pages.
750 *
751 * For now it is simpler to just free the pages.
752 */
753 kimage_free_page_list(&extra_pages);
754
755 return pages;
756}
757
758static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
759 unsigned int order)
760{
761 /* Control pages are special, they are the intermediaries
762 * that are needed while we copy the rest of the pages
763 * to their final resting place. As such they must
764 * not conflict with either the destination addresses
765 * or memory the kernel is already using.
766 *
767 * Control pages are also the only pags we must allocate
768 * when loading a crash kernel. All of the other pages
769 * are specified by the segments and we just memcpy
770 * into them directly.
771 *
772 * The only case where we really need more than one of
773 * these are for architectures where we cannot disable
774 * the MMU and must instead generate an identity mapped
775 * page table for all of the memory.
776 *
777 * Given the low demand this implements a very simple
778 * allocator that finds the first hole of the appropriate
779 * size in the reserved memory region, and allocates all
780 * of the memory up to and including the hole.
781 */
782 unsigned long hole_start, hole_end, size;
783 struct page *pages;
784
785 pages = NULL;
786 size = (1 << order) << PAGE_SHIFT;
787 hole_start = (image->control_page + (size - 1)) & ~(size - 1);
788 hole_end = hole_start + size - 1;
789 while (hole_end <= crashk_res.end) {
790 unsigned long i;
791
792 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
793 break;
794 /* See if I overlap any of the segments */
795 for (i = 0; i < image->nr_segments; i++) {
796 unsigned long mstart, mend;
797
798 mstart = image->segment[i].mem;
799 mend = mstart + image->segment[i].memsz - 1;
800 if ((hole_end >= mstart) && (hole_start <= mend)) {
801 /* Advance the hole to the end of the segment */
802 hole_start = (mend + (size - 1)) & ~(size - 1);
803 hole_end = hole_start + size - 1;
804 break;
805 }
806 }
807 /* If I don't overlap any segments I have found my hole! */
808 if (i == image->nr_segments) {
809 pages = pfn_to_page(hole_start >> PAGE_SHIFT);
810 break;
811 }
812 }
813 if (pages)
814 image->control_page = hole_end;
815
816 return pages;
817}
818
819
820struct page *kimage_alloc_control_pages(struct kimage *image,
821 unsigned int order)
822{
823 struct page *pages = NULL;
824
825 switch (image->type) {
826 case KEXEC_TYPE_DEFAULT:
827 pages = kimage_alloc_normal_control_pages(image, order);
828 break;
829 case KEXEC_TYPE_CRASH:
830 pages = kimage_alloc_crash_control_pages(image, order);
831 break;
832 }
833
834 return pages;
835}
836
837static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
838{
839 if (*image->entry != 0)
840 image->entry++;
841
842 if (image->entry == image->last_entry) {
843 kimage_entry_t *ind_page;
844 struct page *page;
845
846 page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
847 if (!page)
848 return -ENOMEM;
849
850 ind_page = page_address(page);
851 *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
852 image->entry = ind_page;
853 image->last_entry = ind_page +
854 ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
855 }
856 *image->entry = entry;
857 image->entry++;
858 *image->entry = 0;
859
860 return 0;
861}
862
863static int kimage_set_destination(struct kimage *image,
864 unsigned long destination)
865{
866 int result;
867
868 destination &= PAGE_MASK;
869 result = kimage_add_entry(image, destination | IND_DESTINATION);
870
871 return result;
872}
873
874
875static int kimage_add_page(struct kimage *image, unsigned long page)
876{
877 int result;
878
879 page &= PAGE_MASK;
880 result = kimage_add_entry(image, page | IND_SOURCE);
881
882 return result;
883}
884
885
886static void kimage_free_extra_pages(struct kimage *image)
887{
888 /* Walk through and free any extra destination pages I may have */
889 kimage_free_page_list(&image->dest_pages);
890
891 /* Walk through and free any unusable pages I have cached */
892 kimage_free_page_list(&image->unusable_pages);
893
894}
895static void kimage_terminate(struct kimage *image)
896{
897 if (*image->entry != 0)
898 image->entry++;
899
900 *image->entry = IND_DONE;
901}
902
903#define for_each_kimage_entry(image, ptr, entry) \
904 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
905 ptr = (entry & IND_INDIRECTION) ? \
906 phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
907
908static void kimage_free_entry(kimage_entry_t entry)
909{
910 struct page *page;
911
912 page = pfn_to_page(entry >> PAGE_SHIFT);
913 kimage_free_pages(page);
914}
915
916static void kimage_free(struct kimage *image)
917{
918 kimage_entry_t *ptr, entry;
919 kimage_entry_t ind = 0;
920
921 if (!image)
922 return;
923
924 kimage_free_extra_pages(image);
925 for_each_kimage_entry(image, ptr, entry) {
926 if (entry & IND_INDIRECTION) {
927 /* Free the previous indirection page */
928 if (ind & IND_INDIRECTION)
929 kimage_free_entry(ind);
930 /* Save this indirection page until we are
931 * done with it.
932 */
933 ind = entry;
934 } else if (entry & IND_SOURCE)
935 kimage_free_entry(entry);
936 }
937 /* Free the final indirection page */
938 if (ind & IND_INDIRECTION)
939 kimage_free_entry(ind);
940
941 /* Handle any machine specific cleanup */
942 machine_kexec_cleanup(image);
943
944 /* Free the kexec control pages... */
945 kimage_free_page_list(&image->control_pages);
946
947 /*
948 * Free up any temporary buffers allocated. This might hit if
949 * error occurred much later after buffer allocation.
950 */
951 if (image->file_mode)
952 kimage_file_post_load_cleanup(image);
953
954 kfree(image);
955}
956
957static kimage_entry_t *kimage_dst_used(struct kimage *image,
958 unsigned long page)
959{
960 kimage_entry_t *ptr, entry;
961 unsigned long destination = 0;
962
963 for_each_kimage_entry(image, ptr, entry) {
964 if (entry & IND_DESTINATION)
965 destination = entry & PAGE_MASK;
966 else if (entry & IND_SOURCE) {
967 if (page == destination)
968 return ptr;
969 destination += PAGE_SIZE;
970 }
971 }
972
973 return NULL;
974}
975
976static struct page *kimage_alloc_page(struct kimage *image,
977 gfp_t gfp_mask,
978 unsigned long destination)
979{
980 /*
981 * Here we implement safeguards to ensure that a source page
982 * is not copied to its destination page before the data on
983 * the destination page is no longer useful.
984 *
985 * To do this we maintain the invariant that a source page is
986 * either its own destination page, or it is not a
987 * destination page at all.
988 *
989 * That is slightly stronger than required, but the proof
990 * that no problems will not occur is trivial, and the
991 * implementation is simply to verify.
992 *
993 * When allocating all pages normally this algorithm will run
994 * in O(N) time, but in the worst case it will run in O(N^2)
995 * time. If the runtime is a problem the data structures can
996 * be fixed.
997 */
998 struct page *page;
999 unsigned long addr;
1000
1001 /*
1002 * Walk through the list of destination pages, and see if I
1003 * have a match.
1004 */
1005 list_for_each_entry(page, &image->dest_pages, lru) {
1006 addr = page_to_pfn(page) << PAGE_SHIFT;
1007 if (addr == destination) {
1008 list_del(&page->lru);
1009 return page;
1010 }
1011 }
1012 page = NULL;
1013 while (1) {
1014 kimage_entry_t *old;
1015
1016 /* Allocate a page, if we run out of memory give up */
1017 page = kimage_alloc_pages(gfp_mask, 0);
1018 if (!page)
1019 return NULL;
1020 /* If the page cannot be used file it away */
1021 if (page_to_pfn(page) >
1022 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
1023 list_add(&page->lru, &image->unusable_pages);
1024 continue;
1025 }
1026 addr = page_to_pfn(page) << PAGE_SHIFT;
1027
1028 /* If it is the destination page we want use it */
1029 if (addr == destination)
1030 break;
1031
1032 /* If the page is not a destination page use it */
1033 if (!kimage_is_destination_range(image, addr,
1034 addr + PAGE_SIZE))
1035 break;
1036
1037 /*
1038 * I know that the page is someones destination page.
1039 * See if there is already a source page for this
1040 * destination page. And if so swap the source pages.
1041 */
1042 old = kimage_dst_used(image, addr);
1043 if (old) {
1044 /* If so move it */
1045 unsigned long old_addr;
1046 struct page *old_page;
1047
1048 old_addr = *old & PAGE_MASK;
1049 old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
1050 copy_highpage(page, old_page);
1051 *old = addr | (*old & ~PAGE_MASK);
1052
1053 /* The old page I have found cannot be a
1054 * destination page, so return it if it's
1055 * gfp_flags honor the ones passed in.
1056 */
1057 if (!(gfp_mask & __GFP_HIGHMEM) &&
1058 PageHighMem(old_page)) {
1059 kimage_free_pages(old_page);
1060 continue;
1061 }
1062 addr = old_addr;
1063 page = old_page;
1064 break;
1065 } else {
1066 /* Place the page on the destination list I
1067 * will use it later.
1068 */
1069 list_add(&page->lru, &image->dest_pages);
1070 }
1071 }
1072
1073 return page;
1074}
1075
1076static int kimage_load_normal_segment(struct kimage *image,
1077 struct kexec_segment *segment)
1078{
1079 unsigned long maddr;
1080 size_t ubytes, mbytes;
1081 int result;
1082 unsigned char __user *buf = NULL;
1083 unsigned char *kbuf = NULL;
1084
1085 result = 0;
1086 if (image->file_mode)
1087 kbuf = segment->kbuf;
1088 else
1089 buf = segment->buf;
1090 ubytes = segment->bufsz;
1091 mbytes = segment->memsz;
1092 maddr = segment->mem;
1093
1094 result = kimage_set_destination(image, maddr);
1095 if (result < 0)
1096 goto out;
1097
1098 while (mbytes) {
1099 struct page *page;
1100 char *ptr;
1101 size_t uchunk, mchunk;
1102
1103 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
1104 if (!page) {
1105 result = -ENOMEM;
1106 goto out;
1107 }
1108 result = kimage_add_page(image, page_to_pfn(page)
1109 << PAGE_SHIFT);
1110 if (result < 0)
1111 goto out;
1112
1113 ptr = kmap(page);
1114 /* Start with a clear page */
1115 clear_page(ptr);
1116 ptr += maddr & ~PAGE_MASK;
1117 mchunk = min_t(size_t, mbytes,
1118 PAGE_SIZE - (maddr & ~PAGE_MASK));
1119 uchunk = min(ubytes, mchunk);
1120
1121 /* For file based kexec, source pages are in kernel memory */
1122 if (image->file_mode)
1123 memcpy(ptr, kbuf, uchunk);
1124 else
1125 result = copy_from_user(ptr, buf, uchunk);
1126 kunmap(page);
1127 if (result) {
1128 result = -EFAULT;
1129 goto out;
1130 }
1131 ubytes -= uchunk;
1132 maddr += mchunk;
1133 if (image->file_mode)
1134 kbuf += mchunk;
1135 else
1136 buf += mchunk;
1137 mbytes -= mchunk;
1138 }
1139out:
1140 return result;
1141}
1142
1143static int kimage_load_crash_segment(struct kimage *image,
1144 struct kexec_segment *segment)
1145{
1146	/* For crash dump kernels we simply copy the data from
1147	 * user space to its destination.
1148 * We do things a page at a time for the sake of kmap.
1149 */
1150 unsigned long maddr;
1151 size_t ubytes, mbytes;
1152 int result;
1153 unsigned char __user *buf = NULL;
1154 unsigned char *kbuf = NULL;
1155
1156 result = 0;
1157 if (image->file_mode)
1158 kbuf = segment->kbuf;
1159 else
1160 buf = segment->buf;
1161 ubytes = segment->bufsz;
1162 mbytes = segment->memsz;
1163 maddr = segment->mem;
1164 while (mbytes) {
1165 struct page *page;
1166 char *ptr;
1167 size_t uchunk, mchunk;
1168
1169 page = pfn_to_page(maddr >> PAGE_SHIFT);
1170 if (!page) {
1171 result = -ENOMEM;
1172 goto out;
1173 }
1174 ptr = kmap(page);
1175 ptr += maddr & ~PAGE_MASK;
1176 mchunk = min_t(size_t, mbytes,
1177 PAGE_SIZE - (maddr & ~PAGE_MASK));
1178 uchunk = min(ubytes, mchunk);
1179 if (mchunk > uchunk) {
1180 /* Zero the trailing part of the page */
1181 memset(ptr + uchunk, 0, mchunk - uchunk);
1182 }
1183
1184 /* For file based kexec, source pages are in kernel memory */
1185 if (image->file_mode)
1186 memcpy(ptr, kbuf, uchunk);
1187 else
1188 result = copy_from_user(ptr, buf, uchunk);
1189 kexec_flush_icache_page(page);
1190 kunmap(page);
1191 if (result) {
1192 result = -EFAULT;
1193 goto out;
1194 }
1195 ubytes -= uchunk;
1196 maddr += mchunk;
1197 if (image->file_mode)
1198 kbuf += mchunk;
1199 else
1200 buf += mchunk;
1201 mbytes -= mchunk;
1202 }
1203out:
1204 return result;
1205}
1206
1207static int kimage_load_segment(struct kimage *image,
1208 struct kexec_segment *segment)
1209{
1210 int result = -ENOMEM;
1211
1212 switch (image->type) {
1213 case KEXEC_TYPE_DEFAULT:
1214 result = kimage_load_normal_segment(image, segment);
1215 break;
1216 case KEXEC_TYPE_CRASH:
1217 result = kimage_load_crash_segment(image, segment);
1218 break;
1219 }
1220
1221 return result;
1222}
1223
1224/*
1225 * Exec Kernel system call: for obvious reasons only root may call it.
1226 *
@@ -1241,11 +121,6 @@ static int kimage_load_segment(struct kimage *image,
1241 * kexec does not sync, or unmount filesystems so if you need
1242 * that to happen you need to do that yourself.
1243 */
1244struct kimage *kexec_image;
1245struct kimage *kexec_crash_image;
1246int kexec_load_disabled;
1247
1248static DEFINE_MUTEX(kexec_mutex);
1249
1250SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
1251		struct kexec_segment __user *, segments, unsigned long, flags)
@@ -1340,18 +215,6 @@ out:
1340	return result;
1341}
1342
1343/*
1344 * Add and remove page tables for crashkernel memory
1345 *
1346 * Provide an empty default implementation here -- architecture
1347 * code may override this
1348 */
1349void __weak crash_map_reserved_pages(void)
1350{}
1351
1352void __weak crash_unmap_reserved_pages(void)
1353{}
1354
1355#ifdef CONFIG_COMPAT
1356COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
1357		       compat_ulong_t, nr_segments,
@@ -1390,1391 +253,3 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
1390	return sys_kexec_load(entry, nr_segments, ksegments, flags);
1391}
1392#endif
1393
1394#ifdef CONFIG_KEXEC_FILE
1395SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
1396 unsigned long, cmdline_len, const char __user *, cmdline_ptr,
1397 unsigned long, flags)
1398{
1399 int ret = 0, i;
1400 struct kimage **dest_image, *image;
1401
1402 /* We only trust the superuser with rebooting the system. */
1403 if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
1404 return -EPERM;
1405
1406 /* Make sure we have a legal set of flags */
1407 if (flags != (flags & KEXEC_FILE_FLAGS))
1408 return -EINVAL;
1409
1410 image = NULL;
1411
1412 if (!mutex_trylock(&kexec_mutex))
1413 return -EBUSY;
1414
1415 dest_image = &kexec_image;
1416 if (flags & KEXEC_FILE_ON_CRASH)
1417 dest_image = &kexec_crash_image;
1418
1419 if (flags & KEXEC_FILE_UNLOAD)
1420 goto exchange;
1421
1422 /*
1423	 * In case of crash, the new kernel gets loaded in the reserved region.
1424	 * It is the same memory where an old crash kernel might be loaded. Free any
1425 * current crash dump kernel before we corrupt it.
1426 */
1427 if (flags & KEXEC_FILE_ON_CRASH)
1428 kimage_free(xchg(&kexec_crash_image, NULL));
1429
1430 ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
1431 cmdline_len, flags);
1432 if (ret)
1433 goto out;
1434
1435 ret = machine_kexec_prepare(image);
1436 if (ret)
1437 goto out;
1438
1439 ret = kexec_calculate_store_digests(image);
1440 if (ret)
1441 goto out;
1442
1443 for (i = 0; i < image->nr_segments; i++) {
1444 struct kexec_segment *ksegment;
1445
1446 ksegment = &image->segment[i];
1447 pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
1448 i, ksegment->buf, ksegment->bufsz, ksegment->mem,
1449 ksegment->memsz);
1450
1451 ret = kimage_load_segment(image, &image->segment[i]);
1452 if (ret)
1453 goto out;
1454 }
1455
1456 kimage_terminate(image);
1457
1458 /*
1459 * Free up any temporary buffers allocated which are not needed
1460 * after image has been loaded
1461 */
1462 kimage_file_post_load_cleanup(image);
1463exchange:
1464 image = xchg(dest_image, image);
1465out:
1466 mutex_unlock(&kexec_mutex);
1467 kimage_free(image);
1468 return ret;
1469}
1470
1471#endif /* CONFIG_KEXEC_FILE */
1472
1473void crash_kexec(struct pt_regs *regs)
1474{
1475 /* Take the kexec_mutex here to prevent sys_kexec_load
1476 * running on one cpu from replacing the crash kernel
1477 * we are using after a panic on a different cpu.
1478 *
1479 * If the crash kernel was not located in a fixed area
1480 * of memory the xchg(&kexec_crash_image) would be
1481 * sufficient. But since I reuse the memory...
1482 */
1483 if (mutex_trylock(&kexec_mutex)) {
1484 if (kexec_crash_image) {
1485 struct pt_regs fixed_regs;
1486
1487 crash_setup_regs(&fixed_regs, regs);
1488 crash_save_vmcoreinfo();
1489 machine_crash_shutdown(&fixed_regs);
1490 machine_kexec(kexec_crash_image);
1491 }
1492 mutex_unlock(&kexec_mutex);
1493 }
1494}
1495
1496size_t crash_get_memory_size(void)
1497{
1498 size_t size = 0;
1499 mutex_lock(&kexec_mutex);
1500 if (crashk_res.end != crashk_res.start)
1501 size = resource_size(&crashk_res);
1502 mutex_unlock(&kexec_mutex);
1503 return size;
1504}
1505
1506void __weak crash_free_reserved_phys_range(unsigned long begin,
1507 unsigned long end)
1508{
1509 unsigned long addr;
1510
1511 for (addr = begin; addr < end; addr += PAGE_SIZE)
1512 free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
1513}
1514
1515int crash_shrink_memory(unsigned long new_size)
1516{
1517 int ret = 0;
1518 unsigned long start, end;
1519 unsigned long old_size;
1520 struct resource *ram_res;
1521
1522 mutex_lock(&kexec_mutex);
1523
1524 if (kexec_crash_image) {
1525 ret = -ENOENT;
1526 goto unlock;
1527 }
1528 start = crashk_res.start;
1529 end = crashk_res.end;
1530 old_size = (end == 0) ? 0 : end - start + 1;
1531 if (new_size >= old_size) {
1532 ret = (new_size == old_size) ? 0 : -EINVAL;
1533 goto unlock;
1534 }
1535
1536 ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
1537 if (!ram_res) {
1538 ret = -ENOMEM;
1539 goto unlock;
1540 }
1541
1542 start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
1543 end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
1544
1545 crash_map_reserved_pages();
1546 crash_free_reserved_phys_range(end, crashk_res.end);
1547
1548 if ((start == end) && (crashk_res.parent != NULL))
1549 release_resource(&crashk_res);
1550
1551 ram_res->start = end;
1552 ram_res->end = crashk_res.end;
1553 ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
1554 ram_res->name = "System RAM";
1555
1556 crashk_res.end = end - 1;
1557
1558 insert_resource(&iomem_resource, ram_res);
1559 crash_unmap_reserved_pages();
1560
1561unlock:
1562 mutex_unlock(&kexec_mutex);
1563 return ret;
1564}
1565
1566static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
1567 size_t data_len)
1568{
1569 struct elf_note note;
1570
1571 note.n_namesz = strlen(name) + 1;
1572 note.n_descsz = data_len;
1573 note.n_type = type;
1574 memcpy(buf, &note, sizeof(note));
1575 buf += (sizeof(note) + 3)/4;
1576 memcpy(buf, name, note.n_namesz);
1577 buf += (note.n_namesz + 3)/4;
1578 memcpy(buf, data, note.n_descsz);
1579 buf += (note.n_descsz + 3)/4;
1580
1581 return buf;
1582}
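/*
 * Note on the (x + 3)/4 arithmetic above: buf is a u32 cursor, so each of
 * the note header, the name and the descriptor is advanced in whole 32-bit
 * words, i.e. padded to a 4-byte boundary as the ELF note format expects.
 */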
1583
1584static void final_note(u32 *buf)
1585{
1586 struct elf_note note;
1587
1588 note.n_namesz = 0;
1589 note.n_descsz = 0;
1590 note.n_type = 0;
1591 memcpy(buf, &note, sizeof(note));
1592}
1593
1594void crash_save_cpu(struct pt_regs *regs, int cpu)
1595{
1596 struct elf_prstatus prstatus;
1597 u32 *buf;
1598
1599 if ((cpu < 0) || (cpu >= nr_cpu_ids))
1600 return;
1601
1602 /* Using ELF notes here is opportunistic.
1603 * I need a well defined structure format
1604 * for the data I pass, and I need tags
1605 * on the data to indicate what information I have
1606 * squirrelled away. ELF notes happen to provide
1607 * all of that, so there is no need to invent something new.
1608 */
1609 buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
1610 if (!buf)
1611 return;
1612 memset(&prstatus, 0, sizeof(prstatus));
1613 prstatus.pr_pid = current->pid;
1614 elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
1615 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
1616 &prstatus, sizeof(prstatus));
1617 final_note(buf);
1618}
1619
1620static int __init crash_notes_memory_init(void)
1621{
1622 /* Allocate memory for saving cpu registers. */
1623 crash_notes = alloc_percpu(note_buf_t);
1624 if (!crash_notes) {
1625 pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
1626 return -ENOMEM;
1627 }
1628 return 0;
1629}
1630subsys_initcall(crash_notes_memory_init);
1631
1632
1633/*
1634 * parsing the "crashkernel" commandline
1635 *
1636 * this code is intended to be called from architecture specific code
1637 */
1638
1639
1640/*
1641 * This function parses command lines in the format
1642 *
1643 * crashkernel=ramsize-range:size[,...][@offset]
1644 *
1645 * The function returns 0 on success and -EINVAL on failure.
1646 */
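/*
 * Illustrative example (values made up):
 *
 *	crashkernel=512M-2G:64M,2G-:128M@16M
 *
 * reserves 64M when system RAM is in [512M, 2G), 128M when it is 2G or
 * more, and asks for the reservation to start at physical address 16M.
 */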
1647static int __init parse_crashkernel_mem(char *cmdline,
1648 unsigned long long system_ram,
1649 unsigned long long *crash_size,
1650 unsigned long long *crash_base)
1651{
1652 char *cur = cmdline, *tmp;
1653
1654 /* for each entry of the comma-separated list */
1655 do {
1656 unsigned long long start, end = ULLONG_MAX, size;
1657
1658 /* get the start of the range */
1659 start = memparse(cur, &tmp);
1660 if (cur == tmp) {
1661 pr_warn("crashkernel: Memory value expected\n");
1662 return -EINVAL;
1663 }
1664 cur = tmp;
1665 if (*cur != '-') {
1666 pr_warn("crashkernel: '-' expected\n");
1667 return -EINVAL;
1668 }
1669 cur++;
1670
1671		/* if no ':' is here, then we read the end */
1672 if (*cur != ':') {
1673 end = memparse(cur, &tmp);
1674 if (cur == tmp) {
1675 pr_warn("crashkernel: Memory value expected\n");
1676 return -EINVAL;
1677 }
1678 cur = tmp;
1679 if (end <= start) {
1680 pr_warn("crashkernel: end <= start\n");
1681 return -EINVAL;
1682 }
1683 }
1684
1685 if (*cur != ':') {
1686 pr_warn("crashkernel: ':' expected\n");
1687 return -EINVAL;
1688 }
1689 cur++;
1690
1691 size = memparse(cur, &tmp);
1692 if (cur == tmp) {
1693 pr_warn("Memory value expected\n");
1694 return -EINVAL;
1695 }
1696 cur = tmp;
1697 if (size >= system_ram) {
1698 pr_warn("crashkernel: invalid size\n");
1699 return -EINVAL;
1700 }
1701
1702		/* does this range cover the system RAM size? */
1703 if (system_ram >= start && system_ram < end) {
1704 *crash_size = size;
1705 break;
1706 }
1707 } while (*cur++ == ',');
1708
1709 if (*crash_size > 0) {
1710 while (*cur && *cur != ' ' && *cur != '@')
1711 cur++;
1712 if (*cur == '@') {
1713 cur++;
1714 *crash_base = memparse(cur, &tmp);
1715 if (cur == tmp) {
1716 pr_warn("Memory value expected after '@'\n");
1717 return -EINVAL;
1718 }
1719 }
1720 }
1721
1722 return 0;
1723}
1724
1725/*
1726 * This function parses "simple" (old) crashkernel command lines like
1727 *
1728 * crashkernel=size[@offset]
1729 *
1730 * It returns 0 on success and -EINVAL on failure.
1731 */
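/*
 * Illustrative example (values made up): "crashkernel=128M@16M" reserves
 * 128M at physical address 16M; the "@offset" part may be omitted.
 */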
1732static int __init parse_crashkernel_simple(char *cmdline,
1733 unsigned long long *crash_size,
1734 unsigned long long *crash_base)
1735{
1736 char *cur = cmdline;
1737
1738 *crash_size = memparse(cmdline, &cur);
1739 if (cmdline == cur) {
1740 pr_warn("crashkernel: memory value expected\n");
1741 return -EINVAL;
1742 }
1743
1744 if (*cur == '@')
1745 *crash_base = memparse(cur+1, &cur);
1746 else if (*cur != ' ' && *cur != '\0') {
1747 pr_warn("crashkernel: unrecognized char\n");
1748 return -EINVAL;
1749 }
1750
1751 return 0;
1752}
1753
1754#define SUFFIX_HIGH 0
1755#define SUFFIX_LOW 1
1756#define SUFFIX_NULL 2
1757static __initdata char *suffix_tbl[] = {
1758 [SUFFIX_HIGH] = ",high",
1759 [SUFFIX_LOW] = ",low",
1760 [SUFFIX_NULL] = NULL,
1761};
1762
1763/*
1764 * This function parses "suffix" crashkernel command lines like
1765 *
1766 * crashkernel=size,[high|low]
1767 *
1768 * It returns 0 on success and -EINVAL on failure.
1769 */
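/*
 * Illustrative example (values made up): "crashkernel=256M,high" is parsed
 * with suffix ",high" and yields a crash_size of 256M; where the reservation
 * actually lands is decided by the architecture code that requested this
 * suffix.
 */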
1770static int __init parse_crashkernel_suffix(char *cmdline,
1771 unsigned long long *crash_size,
1772 const char *suffix)
1773{
1774 char *cur = cmdline;
1775
1776 *crash_size = memparse(cmdline, &cur);
1777 if (cmdline == cur) {
1778 pr_warn("crashkernel: memory value expected\n");
1779 return -EINVAL;
1780 }
1781
1782 /* check with suffix */
1783 if (strncmp(cur, suffix, strlen(suffix))) {
1784 pr_warn("crashkernel: unrecognized char\n");
1785 return -EINVAL;
1786 }
1787 cur += strlen(suffix);
1788 if (*cur != ' ' && *cur != '\0') {
1789 pr_warn("crashkernel: unrecognized char\n");
1790 return -EINVAL;
1791 }
1792
1793 return 0;
1794}
1795
1796static __init char *get_last_crashkernel(char *cmdline,
1797 const char *name,
1798 const char *suffix)
1799{
1800 char *p = cmdline, *ck_cmdline = NULL;
1801
1802 /* find crashkernel and use the last one if there are more */
1803 p = strstr(p, name);
1804 while (p) {
1805 char *end_p = strchr(p, ' ');
1806 char *q;
1807
1808 if (!end_p)
1809 end_p = p + strlen(p);
1810
1811 if (!suffix) {
1812 int i;
1813
1814 /* skip the one with any known suffix */
1815 for (i = 0; suffix_tbl[i]; i++) {
1816 q = end_p - strlen(suffix_tbl[i]);
1817 if (!strncmp(q, suffix_tbl[i],
1818 strlen(suffix_tbl[i])))
1819 goto next;
1820 }
1821 ck_cmdline = p;
1822 } else {
1823 q = end_p - strlen(suffix);
1824 if (!strncmp(q, suffix, strlen(suffix)))
1825 ck_cmdline = p;
1826 }
1827next:
1828 p = strstr(p+1, name);
1829 }
1830
1831 if (!ck_cmdline)
1832 return NULL;
1833
1834 return ck_cmdline;
1835}
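/*
 * For example, with "crashkernel=64M@16M crashkernel=128M" on the command
 * line the second entry wins.  When called with suffix == NULL, entries
 * ending in a known suffix (",high" or ",low") are skipped, so a plain
 * entry is still found even if a suffixed one appears later.
 */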
1836
1837static int __init __parse_crashkernel(char *cmdline,
1838 unsigned long long system_ram,
1839 unsigned long long *crash_size,
1840 unsigned long long *crash_base,
1841 const char *name,
1842 const char *suffix)
1843{
1844 char *first_colon, *first_space;
1845 char *ck_cmdline;
1846
1847 BUG_ON(!crash_size || !crash_base);
1848 *crash_size = 0;
1849 *crash_base = 0;
1850
1851 ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
1852
1853 if (!ck_cmdline)
1854 return -EINVAL;
1855
1856 ck_cmdline += strlen(name);
1857
1858 if (suffix)
1859 return parse_crashkernel_suffix(ck_cmdline, crash_size,
1860 suffix);
1861 /*
1862 * if the commandline contains a ':', then that's the extended
1863 * syntax -- if not, it must be the classic syntax
1864 */
1865 first_colon = strchr(ck_cmdline, ':');
1866 first_space = strchr(ck_cmdline, ' ');
1867 if (first_colon && (!first_space || first_colon < first_space))
1868 return parse_crashkernel_mem(ck_cmdline, system_ram,
1869 crash_size, crash_base);
1870
1871 return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
1872}
1873
1874/*
1875 * This function is the entry point for command line parsing and should be
1876 * called from the arch-specific code.
1877 */
1878int __init parse_crashkernel(char *cmdline,
1879 unsigned long long system_ram,
1880 unsigned long long *crash_size,
1881 unsigned long long *crash_base)
1882{
1883 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1884 "crashkernel=", NULL);
1885}
1886
1887int __init parse_crashkernel_high(char *cmdline,
1888 unsigned long long system_ram,
1889 unsigned long long *crash_size,
1890 unsigned long long *crash_base)
1891{
1892 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1893 "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
1894}
1895
1896int __init parse_crashkernel_low(char *cmdline,
1897 unsigned long long system_ram,
1898 unsigned long long *crash_size,
1899 unsigned long long *crash_base)
1900{
1901 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1902 "crashkernel=", suffix_tbl[SUFFIX_LOW]);
1903}
1904
1905static void update_vmcoreinfo_note(void)
1906{
1907 u32 *buf = vmcoreinfo_note;
1908
1909 if (!vmcoreinfo_size)
1910 return;
1911 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
1912 vmcoreinfo_size);
1913 final_note(buf);
1914}
1915
1916void crash_save_vmcoreinfo(void)
1917{
1918 vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
1919 update_vmcoreinfo_note();
1920}
1921
1922void vmcoreinfo_append_str(const char *fmt, ...)
1923{
1924 va_list args;
1925 char buf[0x50];
1926 size_t r;
1927
1928 va_start(args, fmt);
1929 r = vscnprintf(buf, sizeof(buf), fmt, args);
1930 va_end(args);
1931
1932 r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
1933
1934 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
1935
1936 vmcoreinfo_size += r;
1937}
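/*
 * Callers are normally the VMCOREINFO_* helper macros from the kexec
 * headers; e.g. VMCOREINFO_SYMBOL(foo) appends a line of the form
 * "SYMBOL(foo)=<hex address>" through this function.
 */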
1938
1939/*
1940 * provide an empty default implementation here -- architecture
1941 * code may override this
1942 */
1943void __weak arch_crash_save_vmcoreinfo(void)
1944{}
1945
1946unsigned long __weak paddr_vmcoreinfo_note(void)
1947{
1948 return __pa((unsigned long)(char *)&vmcoreinfo_note);
1949}
1950
1951static int __init crash_save_vmcoreinfo_init(void)
1952{
1953 VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
1954 VMCOREINFO_PAGESIZE(PAGE_SIZE);
1955
1956 VMCOREINFO_SYMBOL(init_uts_ns);
1957 VMCOREINFO_SYMBOL(node_online_map);
1958#ifdef CONFIG_MMU
1959 VMCOREINFO_SYMBOL(swapper_pg_dir);
1960#endif
1961 VMCOREINFO_SYMBOL(_stext);
1962 VMCOREINFO_SYMBOL(vmap_area_list);
1963
1964#ifndef CONFIG_NEED_MULTIPLE_NODES
1965 VMCOREINFO_SYMBOL(mem_map);
1966 VMCOREINFO_SYMBOL(contig_page_data);
1967#endif
1968#ifdef CONFIG_SPARSEMEM
1969 VMCOREINFO_SYMBOL(mem_section);
1970 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
1971 VMCOREINFO_STRUCT_SIZE(mem_section);
1972 VMCOREINFO_OFFSET(mem_section, section_mem_map);
1973#endif
1974 VMCOREINFO_STRUCT_SIZE(page);
1975 VMCOREINFO_STRUCT_SIZE(pglist_data);
1976 VMCOREINFO_STRUCT_SIZE(zone);
1977 VMCOREINFO_STRUCT_SIZE(free_area);
1978 VMCOREINFO_STRUCT_SIZE(list_head);
1979 VMCOREINFO_SIZE(nodemask_t);
1980 VMCOREINFO_OFFSET(page, flags);
1981 VMCOREINFO_OFFSET(page, _count);
1982 VMCOREINFO_OFFSET(page, mapping);
1983 VMCOREINFO_OFFSET(page, lru);
1984 VMCOREINFO_OFFSET(page, _mapcount);
1985 VMCOREINFO_OFFSET(page, private);
1986 VMCOREINFO_OFFSET(pglist_data, node_zones);
1987 VMCOREINFO_OFFSET(pglist_data, nr_zones);
1988#ifdef CONFIG_FLAT_NODE_MEM_MAP
1989 VMCOREINFO_OFFSET(pglist_data, node_mem_map);
1990#endif
1991 VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
1992 VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
1993 VMCOREINFO_OFFSET(pglist_data, node_id);
1994 VMCOREINFO_OFFSET(zone, free_area);
1995 VMCOREINFO_OFFSET(zone, vm_stat);
1996 VMCOREINFO_OFFSET(zone, spanned_pages);
1997 VMCOREINFO_OFFSET(free_area, free_list);
1998 VMCOREINFO_OFFSET(list_head, next);
1999 VMCOREINFO_OFFSET(list_head, prev);
2000 VMCOREINFO_OFFSET(vmap_area, va_start);
2001 VMCOREINFO_OFFSET(vmap_area, list);
2002 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
2003 log_buf_kexec_setup();
2004 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
2005 VMCOREINFO_NUMBER(NR_FREE_PAGES);
2006 VMCOREINFO_NUMBER(PG_lru);
2007 VMCOREINFO_NUMBER(PG_private);
2008 VMCOREINFO_NUMBER(PG_swapcache);
2009 VMCOREINFO_NUMBER(PG_slab);
2010#ifdef CONFIG_MEMORY_FAILURE
2011 VMCOREINFO_NUMBER(PG_hwpoison);
2012#endif
2013 VMCOREINFO_NUMBER(PG_head_mask);
2014 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
2015#ifdef CONFIG_HUGETLBFS
2016 VMCOREINFO_SYMBOL(free_huge_page);
2017#endif
2018
2019 arch_crash_save_vmcoreinfo();
2020 update_vmcoreinfo_note();
2021
2022 return 0;
2023}
2024
2025subsys_initcall(crash_save_vmcoreinfo_init);
2026
2027#ifdef CONFIG_KEXEC_FILE
2028static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
2029 struct kexec_buf *kbuf)
2030{
2031 struct kimage *image = kbuf->image;
2032 unsigned long temp_start, temp_end;
2033
2034 temp_end = min(end, kbuf->buf_max);
2035 temp_start = temp_end - kbuf->memsz;
2036
2037 do {
2038 /* align down start */
2039 temp_start = temp_start & (~(kbuf->buf_align - 1));
2040
2041 if (temp_start < start || temp_start < kbuf->buf_min)
2042 return 0;
2043
2044 temp_end = temp_start + kbuf->memsz - 1;
2045
2046 /*
2047		 * Make sure this does not conflict with any of the existing
2048 * segments
2049 */
2050 if (kimage_is_destination_range(image, temp_start, temp_end)) {
2051 temp_start = temp_start - PAGE_SIZE;
2052 continue;
2053 }
2054
2055 /* We found a suitable memory range */
2056 break;
2057 } while (1);
2058
2059 /* If we are here, we found a suitable memory range */
2060 kbuf->mem = temp_start;
2061
2062 /* Success, stop navigating through remaining System RAM ranges */
2063 return 1;
2064}
2065
2066static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
2067 struct kexec_buf *kbuf)
2068{
2069 struct kimage *image = kbuf->image;
2070 unsigned long temp_start, temp_end;
2071
2072 temp_start = max(start, kbuf->buf_min);
2073
2074 do {
2075 temp_start = ALIGN(temp_start, kbuf->buf_align);
2076 temp_end = temp_start + kbuf->memsz - 1;
2077
2078 if (temp_end > end || temp_end > kbuf->buf_max)
2079 return 0;
2080 /*
2081		 * Make sure this does not conflict with any of the existing
2082 * segments
2083 */
2084 if (kimage_is_destination_range(image, temp_start, temp_end)) {
2085 temp_start = temp_start + PAGE_SIZE;
2086 continue;
2087 }
2088
2089 /* We found a suitable memory range */
2090 break;
2091 } while (1);
2092
2093 /* If we are here, we found a suitable memory range */
2094 kbuf->mem = temp_start;
2095
2096 /* Success, stop navigating through remaining System RAM ranges */
2097 return 1;
2098}
2099
2100static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
2101{
2102 struct kexec_buf *kbuf = (struct kexec_buf *)arg;
2103 unsigned long sz = end - start + 1;
2104
2105	/* Returning 0 moves on to the next memory range */
2106 if (sz < kbuf->memsz)
2107 return 0;
2108
2109 if (end < kbuf->buf_min || start > kbuf->buf_max)
2110 return 0;
2111
2112 /*
2113	 * Allocate memory top-down within the RAM range; otherwise allocate
2114	 * bottom-up.
2115 */
2116 if (kbuf->top_down)
2117 return locate_mem_hole_top_down(start, end, kbuf);
2118 return locate_mem_hole_bottom_up(start, end, kbuf);
2119}
2120
2121/*
2122 * Helper function for placing a buffer in a kexec segment. This assumes
2123 * that kexec_mutex is held.
2124 */
2125int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
2126 unsigned long memsz, unsigned long buf_align,
2127 unsigned long buf_min, unsigned long buf_max,
2128 bool top_down, unsigned long *load_addr)
2129{
2130
2131 struct kexec_segment *ksegment;
2132 struct kexec_buf buf, *kbuf;
2133 int ret;
2134
2135 /* Currently adding segment this way is allowed only in file mode */
2136 if (!image->file_mode)
2137 return -EINVAL;
2138
2139 if (image->nr_segments >= KEXEC_SEGMENT_MAX)
2140 return -EINVAL;
2141
2142 /*
2143	 * Make sure we are not trying to add a buffer after allocating
2144	 * control pages. All segments need to be placed before any
2145	 * control pages are allocated, since the control page allocation
2146	 * logic goes through the list of segments to make sure there are
2147	 * no destination overlaps.
2148 */
2149 if (!list_empty(&image->control_pages)) {
2150 WARN_ON(1);
2151 return -EINVAL;
2152 }
2153
2154 memset(&buf, 0, sizeof(struct kexec_buf));
2155 kbuf = &buf;
2156 kbuf->image = image;
2157 kbuf->buffer = buffer;
2158 kbuf->bufsz = bufsz;
2159
2160 kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
2161 kbuf->buf_align = max(buf_align, PAGE_SIZE);
2162 kbuf->buf_min = buf_min;
2163 kbuf->buf_max = buf_max;
2164 kbuf->top_down = top_down;
2165
2166 /* Walk the RAM ranges and allocate a suitable range for the buffer */
2167 if (image->type == KEXEC_TYPE_CRASH)
2168 ret = walk_iomem_res("Crash kernel",
2169 IORESOURCE_MEM | IORESOURCE_BUSY,
2170 crashk_res.start, crashk_res.end, kbuf,
2171 locate_mem_hole_callback);
2172 else
2173 ret = walk_system_ram_res(0, -1, kbuf,
2174 locate_mem_hole_callback);
2175 if (ret != 1) {
2176 /* A suitable memory range could not be found for buffer */
2177 return -EADDRNOTAVAIL;
2178 }
2179
2180 /* Found a suitable memory range */
2181 ksegment = &image->segment[image->nr_segments];
2182 ksegment->kbuf = kbuf->buffer;
2183 ksegment->bufsz = kbuf->bufsz;
2184 ksegment->mem = kbuf->mem;
2185 ksegment->memsz = kbuf->memsz;
2186 image->nr_segments++;
2187 *load_addr = ksegment->mem;
2188 return 0;
2189}
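/*
 * Rough usage sketch (hypothetical names, not taken from this file): an
 * architecture's kexec_file loader might place its kernel buffer with
 * something like
 *
 *	unsigned long load_addr;
 *	int ret = kexec_add_buffer(image, kernel_buf, kernel_len,
 *				   kernel_memsz, PAGE_SIZE, min_addr,
 *				   ULONG_MAX, true, &load_addr);
 *
 * where kernel_buf, kernel_len, kernel_memsz and min_addr are whatever the
 * loader prepared, and load_addr returns the destination that was chosen.
 */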
2190
2191/* Calculate and store the digest of segments */
2192static int kexec_calculate_store_digests(struct kimage *image)
2193{
2194 struct crypto_shash *tfm;
2195 struct shash_desc *desc;
2196 int ret = 0, i, j, zero_buf_sz, sha_region_sz;
2197 size_t desc_size, nullsz;
2198 char *digest;
2199 void *zero_buf;
2200 struct kexec_sha_region *sha_regions;
2201 struct purgatory_info *pi = &image->purgatory_info;
2202
2203 zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
2204 zero_buf_sz = PAGE_SIZE;
2205
2206 tfm = crypto_alloc_shash("sha256", 0, 0);
2207 if (IS_ERR(tfm)) {
2208 ret = PTR_ERR(tfm);
2209 goto out;
2210 }
2211
2212 desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
2213 desc = kzalloc(desc_size, GFP_KERNEL);
2214 if (!desc) {
2215 ret = -ENOMEM;
2216 goto out_free_tfm;
2217 }
2218
2219 sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
2220 sha_regions = vzalloc(sha_region_sz);
2221 if (!sha_regions)
2222 goto out_free_desc;
2223
2224 desc->tfm = tfm;
2225 desc->flags = 0;
2226
2227 ret = crypto_shash_init(desc);
2228 if (ret < 0)
2229 goto out_free_sha_regions;
2230
2231 digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
2232 if (!digest) {
2233 ret = -ENOMEM;
2234 goto out_free_sha_regions;
2235 }
2236
2237 for (j = i = 0; i < image->nr_segments; i++) {
2238 struct kexec_segment *ksegment;
2239
2240 ksegment = &image->segment[i];
2241 /*
2242 * Skip purgatory as it will be modified once we put digest
2243 * info in purgatory.
2244 */
2245 if (ksegment->kbuf == pi->purgatory_buf)
2246 continue;
2247
2248 ret = crypto_shash_update(desc, ksegment->kbuf,
2249 ksegment->bufsz);
2250 if (ret)
2251 break;
2252
2253 /*
2254 * Assume rest of the buffer is filled with zero and
2255 * update digest accordingly.
2256 */
2257 nullsz = ksegment->memsz - ksegment->bufsz;
2258 while (nullsz) {
2259 unsigned long bytes = nullsz;
2260
2261 if (bytes > zero_buf_sz)
2262 bytes = zero_buf_sz;
2263 ret = crypto_shash_update(desc, zero_buf, bytes);
2264 if (ret)
2265 break;
2266 nullsz -= bytes;
2267 }
2268
2269 if (ret)
2270 break;
2271
2272 sha_regions[j].start = ksegment->mem;
2273 sha_regions[j].len = ksegment->memsz;
2274 j++;
2275 }
2276
2277 if (!ret) {
2278 ret = crypto_shash_final(desc, digest);
2279 if (ret)
2280 goto out_free_digest;
2281 ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
2282 sha_regions, sha_region_sz, 0);
2283 if (ret)
2284 goto out_free_digest;
2285
2286 ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
2287 digest, SHA256_DIGEST_SIZE, 0);
2288 if (ret)
2289 goto out_free_digest;
2290 }
2291
2292out_free_digest:
2293 kfree(digest);
2294out_free_sha_regions:
2295 vfree(sha_regions);
2296out_free_desc:
2297 kfree(desc);
2298out_free_tfm:
2299 kfree(tfm);
2300out:
2301 return ret;
2302}
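/*
 * The counterpart of this digest is in the purgatory blob: on architectures
 * that ship one (e.g. x86), purgatory recomputes SHA-256 over the
 * sha_regions stored above and compares the result against sha256_digest
 * before jumping into the new kernel.
 */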
2303
2304/* Actually load purgatory. Lot of code taken from kexec-tools */
2305static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
2306 unsigned long max, int top_down)
2307{
2308 struct purgatory_info *pi = &image->purgatory_info;
2309 unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
2310 unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
2311 unsigned char *buf_addr, *src;
2312 int i, ret = 0, entry_sidx = -1;
2313 const Elf_Shdr *sechdrs_c;
2314 Elf_Shdr *sechdrs = NULL;
2315 void *purgatory_buf = NULL;
2316
2317 /*
2318	 * sechdrs_c points to the section headers in purgatory, which are
2319	 * read-only. No modifications allowed.
2320 */
2321 sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
2322
2323 /*
2324	 * We cannot modify sechdrs_c[] and its fields; they are read-only.
2325 * Copy it over to a local copy where one can store some temporary
2326 * data and free it at the end. We need to modify ->sh_addr and
2327 * ->sh_offset fields to keep track of permanent and temporary
2328 * locations of sections.
2329 */
2330 sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
2331 if (!sechdrs)
2332 return -ENOMEM;
2333
2334 memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
2335
2336 /*
2337	 * We seem to have multiple copies of sections. The first copy is the
2338	 * one embedded in the kernel's read-only section. Some of these sections
2339 * will be copied to a temporary buffer and relocated. And these
2340 * sections will finally be copied to their final destination at
2341 * segment load time.
2342 *
2343 * Use ->sh_offset to reflect section address in memory. It will
2344 * point to original read only copy if section is not allocatable.
2345 * Otherwise it will point to temporary copy which will be relocated.
2346 *
2347 * Use ->sh_addr to contain final address of the section where it
2348 * will go during execution time.
2349 */
2350 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2351 if (sechdrs[i].sh_type == SHT_NOBITS)
2352 continue;
2353
2354 sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
2355 sechdrs[i].sh_offset;
2356 }
2357
2358 /*
2359 * Identify entry point section and make entry relative to section
2360 * start.
2361 */
2362 entry = pi->ehdr->e_entry;
2363 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2364 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
2365 continue;
2366
2367 if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
2368 continue;
2369
2370 /* Make entry section relative */
2371 if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
2372 ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
2373 pi->ehdr->e_entry)) {
2374 entry_sidx = i;
2375 entry -= sechdrs[i].sh_addr;
2376 break;
2377 }
2378 }
2379
2380 /* Determine how much memory is needed to load relocatable object. */
2381 buf_align = 1;
2382 bss_align = 1;
2383 buf_sz = 0;
2384 bss_sz = 0;
2385
2386 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2387 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
2388 continue;
2389
2390 align = sechdrs[i].sh_addralign;
2391 if (sechdrs[i].sh_type != SHT_NOBITS) {
2392 if (buf_align < align)
2393 buf_align = align;
2394 buf_sz = ALIGN(buf_sz, align);
2395 buf_sz += sechdrs[i].sh_size;
2396 } else {
2397 /* bss section */
2398 if (bss_align < align)
2399 bss_align = align;
2400 bss_sz = ALIGN(bss_sz, align);
2401 bss_sz += sechdrs[i].sh_size;
2402 }
2403 }
2404
2405 /* Determine the bss padding required to align bss properly */
2406 bss_pad = 0;
2407 if (buf_sz & (bss_align - 1))
2408 bss_pad = bss_align - (buf_sz & (bss_align - 1));
2409
2410 memsz = buf_sz + bss_pad + bss_sz;
2411
2412 /* Allocate buffer for purgatory */
2413 purgatory_buf = vzalloc(buf_sz);
2414 if (!purgatory_buf) {
2415 ret = -ENOMEM;
2416 goto out;
2417 }
2418
2419 if (buf_align < bss_align)
2420 buf_align = bss_align;
2421
2422 /* Add buffer to segment list */
2423 ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
2424 buf_align, min, max, top_down,
2425 &pi->purgatory_load_addr);
2426 if (ret)
2427 goto out;
2428
2429 /* Load SHF_ALLOC sections */
2430 buf_addr = purgatory_buf;
2431 load_addr = curr_load_addr = pi->purgatory_load_addr;
2432 bss_addr = load_addr + buf_sz + bss_pad;
2433
2434 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2435 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
2436 continue;
2437
2438 align = sechdrs[i].sh_addralign;
2439 if (sechdrs[i].sh_type != SHT_NOBITS) {
2440 curr_load_addr = ALIGN(curr_load_addr, align);
2441 offset = curr_load_addr - load_addr;
2442			/* We already modified ->sh_offset to keep the src addr */
2443 src = (char *) sechdrs[i].sh_offset;
2444 memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
2445
2446 /* Store load address and source address of section */
2447 sechdrs[i].sh_addr = curr_load_addr;
2448
2449 /*
2450 * This section got copied to temporary buffer. Update
2451 * ->sh_offset accordingly.
2452 */
2453 sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
2454
2455 /* Advance to the next address */
2456 curr_load_addr += sechdrs[i].sh_size;
2457 } else {
2458 bss_addr = ALIGN(bss_addr, align);
2459 sechdrs[i].sh_addr = bss_addr;
2460 bss_addr += sechdrs[i].sh_size;
2461 }
2462 }
2463
2464 /* Update entry point based on load address of text section */
2465 if (entry_sidx >= 0)
2466 entry += sechdrs[entry_sidx].sh_addr;
2467
2468 /* Make kernel jump to purgatory after shutdown */
2469 image->start = entry;
2470
2471 /* Used later to get/set symbol values */
2472 pi->sechdrs = sechdrs;
2473
2474 /*
2475	 * Used later to identify the purgatory buffer and exclude it
2476	 * from checksumming.
2477 */
2478 pi->purgatory_buf = purgatory_buf;
2479 return ret;
2480out:
2481 vfree(sechdrs);
2482 vfree(purgatory_buf);
2483 return ret;
2484}
2485
2486static int kexec_apply_relocations(struct kimage *image)
2487{
2488 int i, ret;
2489 struct purgatory_info *pi = &image->purgatory_info;
2490 Elf_Shdr *sechdrs = pi->sechdrs;
2491
2492 /* Apply relocations */
2493 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2494 Elf_Shdr *section, *symtab;
2495
2496 if (sechdrs[i].sh_type != SHT_RELA &&
2497 sechdrs[i].sh_type != SHT_REL)
2498 continue;
2499
2500 /*
2501		 * For sections of type SHT_RELA/SHT_REL,
2502		 * ->sh_link contains the section header index of the associated
2503		 * symbol table, and ->sh_info contains the section header
2504		 * index of the section to which the relocations apply.
2505 */
2506 if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
2507 sechdrs[i].sh_link >= pi->ehdr->e_shnum)
2508 return -ENOEXEC;
2509
2510 section = &sechdrs[sechdrs[i].sh_info];
2511 symtab = &sechdrs[sechdrs[i].sh_link];
2512
2513 if (!(section->sh_flags & SHF_ALLOC))
2514 continue;
2515
2516 /*
2517		 * symtab->sh_link contains the section header index of the
2518		 * associated string table.
2519 */
2520 if (symtab->sh_link >= pi->ehdr->e_shnum)
2521 /* Invalid section number? */
2522 continue;
2523
2524 /*
2525		 * The respective architecture needs to provide support for applying
2526 * relocations of type SHT_RELA/SHT_REL.
2527 */
2528 if (sechdrs[i].sh_type == SHT_RELA)
2529 ret = arch_kexec_apply_relocations_add(pi->ehdr,
2530 sechdrs, i);
2531 else if (sechdrs[i].sh_type == SHT_REL)
2532 ret = arch_kexec_apply_relocations(pi->ehdr,
2533 sechdrs, i);
2534 if (ret)
2535 return ret;
2536 }
2537
2538 return 0;
2539}
2540
2541/* Load relocatable purgatory object and relocate it appropriately */
2542int kexec_load_purgatory(struct kimage *image, unsigned long min,
2543 unsigned long max, int top_down,
2544 unsigned long *load_addr)
2545{
2546 struct purgatory_info *pi = &image->purgatory_info;
2547 int ret;
2548
2549 if (kexec_purgatory_size <= 0)
2550 return -EINVAL;
2551
2552 if (kexec_purgatory_size < sizeof(Elf_Ehdr))
2553 return -ENOEXEC;
2554
2555 pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
2556
2557 if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
2558 || pi->ehdr->e_type != ET_REL
2559 || !elf_check_arch(pi->ehdr)
2560 || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
2561 return -ENOEXEC;
2562
2563 if (pi->ehdr->e_shoff >= kexec_purgatory_size
2564 || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
2565 kexec_purgatory_size - pi->ehdr->e_shoff))
2566 return -ENOEXEC;
2567
2568 ret = __kexec_load_purgatory(image, min, max, top_down);
2569 if (ret)
2570 return ret;
2571
2572 ret = kexec_apply_relocations(image);
2573 if (ret)
2574 goto out;
2575
2576 *load_addr = pi->purgatory_load_addr;
2577 return 0;
2578out:
2579 vfree(pi->sechdrs);
2580 vfree(pi->purgatory_buf);
2581 return ret;
2582}
2583
2584static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
2585 const char *name)
2586{
2587 Elf_Sym *syms;
2588 Elf_Shdr *sechdrs;
2589 Elf_Ehdr *ehdr;
2590 int i, k;
2591 const char *strtab;
2592
2593 if (!pi->sechdrs || !pi->ehdr)
2594 return NULL;
2595
2596 sechdrs = pi->sechdrs;
2597 ehdr = pi->ehdr;
2598
2599 for (i = 0; i < ehdr->e_shnum; i++) {
2600 if (sechdrs[i].sh_type != SHT_SYMTAB)
2601 continue;
2602
2603 if (sechdrs[i].sh_link >= ehdr->e_shnum)
2604 /* Invalid strtab section number */
2605 continue;
2606 strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
2607 syms = (Elf_Sym *)sechdrs[i].sh_offset;
2608
2609 /* Go through symbols for a match */
2610 for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
2611 if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
2612 continue;
2613
2614 if (strcmp(strtab + syms[k].st_name, name) != 0)
2615 continue;
2616
2617 if (syms[k].st_shndx == SHN_UNDEF ||
2618 syms[k].st_shndx >= ehdr->e_shnum) {
2619 pr_debug("Symbol: %s has bad section index %d.\n",
2620 name, syms[k].st_shndx);
2621 return NULL;
2622 }
2623
2624 /* Found the symbol we are looking for */
2625 return &syms[k];
2626 }
2627 }
2628
2629 return NULL;
2630}
2631
2632void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
2633{
2634 struct purgatory_info *pi = &image->purgatory_info;
2635 Elf_Sym *sym;
2636 Elf_Shdr *sechdr;
2637
2638 sym = kexec_purgatory_find_symbol(pi, name);
2639 if (!sym)
2640 return ERR_PTR(-EINVAL);
2641
2642 sechdr = &pi->sechdrs[sym->st_shndx];
2643
2644 /*
2645 * Returns the address where symbol will finally be loaded after
2646 * kexec_load_segment()
2647 */
2648 return (void *)(sechdr->sh_addr + sym->st_value);
2649}
2650
2651/*
2652 * Get or set the value of a symbol. If "get_value" is true, the symbol value
2653 * is returned in buf; otherwise the symbol value is set from the value in buf.
2654 */
2655int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
2656 void *buf, unsigned int size, bool get_value)
2657{
2658 Elf_Sym *sym;
2659 Elf_Shdr *sechdrs;
2660 struct purgatory_info *pi = &image->purgatory_info;
2661 char *sym_buf;
2662
2663 sym = kexec_purgatory_find_symbol(pi, name);
2664 if (!sym)
2665 return -EINVAL;
2666
2667 if (sym->st_size != size) {
2668 pr_err("symbol %s size mismatch: expected %lu actual %u\n",
2669 name, (unsigned long)sym->st_size, size);
2670 return -EINVAL;
2671 }
2672
2673 sechdrs = pi->sechdrs;
2674
2675 if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
2676 pr_err("symbol %s is in a bss section. Cannot %s\n", name,
2677 get_value ? "get" : "set");
2678 return -EINVAL;
2679 }
2680
2681 sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
2682 sym->st_value;
2683
2684 if (get_value)
2685 memcpy((void *)buf, sym_buf, size);
2686 else
2687 memcpy((void *)sym_buf, buf, size);
2688
2689 return 0;
2690}
2691#endif /* CONFIG_KEXEC_FILE */
2692
2693/*
2694 * Move into place and start executing a preloaded standalone
2695 * executable. If nothing was preloaded return an error.
2696 */
2697int kernel_kexec(void)
2698{
2699 int error = 0;
2700
2701 if (!mutex_trylock(&kexec_mutex))
2702 return -EBUSY;
2703 if (!kexec_image) {
2704 error = -EINVAL;
2705 goto Unlock;
2706 }
2707
2708#ifdef CONFIG_KEXEC_JUMP
2709 if (kexec_image->preserve_context) {
2710 lock_system_sleep();
2711 pm_prepare_console();
2712 error = freeze_processes();
2713 if (error) {
2714 error = -EBUSY;
2715 goto Restore_console;
2716 }
2717 suspend_console();
2718 error = dpm_suspend_start(PMSG_FREEZE);
2719 if (error)
2720 goto Resume_console;
2721 /* At this point, dpm_suspend_start() has been called,
2722 * but *not* dpm_suspend_end(). We *must* call
2723 * dpm_suspend_end() now. Otherwise, drivers for
2724 * some devices (e.g. interrupt controllers) become
2725 * desynchronized with the actual state of the
2726 * hardware at resume time, and evil weirdness ensues.
2727 */
2728 error = dpm_suspend_end(PMSG_FREEZE);
2729 if (error)
2730 goto Resume_devices;
2731 error = disable_nonboot_cpus();
2732 if (error)
2733 goto Enable_cpus;
2734 local_irq_disable();
2735 error = syscore_suspend();
2736 if (error)
2737 goto Enable_irqs;
2738 } else
2739#endif
2740 {
2741 kexec_in_progress = true;
2742 kernel_restart_prepare(NULL);
2743 migrate_to_reboot_cpu();
2744
2745 /*
2746 * migrate_to_reboot_cpu() disables CPU hotplug assuming that
2747 * no further code needs to use CPU hotplug (which is true in
2748 * the reboot case). However, the kexec path depends on using
2749 * CPU hotplug again; so re-enable it here.
2750 */
2751 cpu_hotplug_enable();
2752 pr_emerg("Starting new kernel\n");
2753 machine_shutdown();
2754 }
2755
2756 machine_kexec(kexec_image);
2757
2758#ifdef CONFIG_KEXEC_JUMP
2759 if (kexec_image->preserve_context) {
2760 syscore_resume();
2761 Enable_irqs:
2762 local_irq_enable();
2763 Enable_cpus:
2764 enable_nonboot_cpus();
2765 dpm_resume_start(PMSG_RESTORE);
2766 Resume_devices:
2767 dpm_resume_end(PMSG_RESTORE);
2768 Resume_console:
2769 resume_console();
2770 thaw_processes();
2771 Restore_console:
2772 pm_restore_console();
2773 unlock_system_sleep();
2774 }
2775#endif
2776
2777 Unlock:
2778 mutex_unlock(&kexec_mutex);
2779 return error;
2780}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
new file mode 100644
index 000000000000..201b45327804
--- /dev/null
+++ b/kernel/kexec_core.c
@@ -0,0 +1,1534 @@
1/*
2 * kexec.c - kexec system call core code.
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#define pr_fmt(fmt) "kexec: " fmt
10
11#include <linux/capability.h>
12#include <linux/mm.h>
13#include <linux/file.h>
14#include <linux/slab.h>
15#include <linux/fs.h>
16#include <linux/kexec.h>
17#include <linux/mutex.h>
18#include <linux/list.h>
19#include <linux/highmem.h>
20#include <linux/syscalls.h>
21#include <linux/reboot.h>
22#include <linux/ioport.h>
23#include <linux/hardirq.h>
24#include <linux/elf.h>
25#include <linux/elfcore.h>
26#include <linux/utsname.h>
27#include <linux/numa.h>
28#include <linux/suspend.h>
29#include <linux/device.h>
30#include <linux/freezer.h>
31#include <linux/pm.h>
32#include <linux/cpu.h>
33#include <linux/uaccess.h>
34#include <linux/io.h>
35#include <linux/console.h>
36#include <linux/vmalloc.h>
37#include <linux/swap.h>
38#include <linux/syscore_ops.h>
39#include <linux/compiler.h>
40#include <linux/hugetlb.h>
41
42#include <asm/page.h>
43#include <asm/sections.h>
44
45#include <crypto/hash.h>
46#include <crypto/sha.h>
47#include "kexec_internal.h"
48
49DEFINE_MUTEX(kexec_mutex);
50
51/* Per cpu memory for storing cpu states in case of system crash. */
52note_buf_t __percpu *crash_notes;
53
54/* vmcoreinfo stuff */
55static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
56u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
57size_t vmcoreinfo_size;
58size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
59
60/* Flag to indicate we are going to kexec a new kernel */
61bool kexec_in_progress = false;
62
63
64/* Location of the reserved area for the crash kernel */
65struct resource crashk_res = {
66 .name = "Crash kernel",
67 .start = 0,
68 .end = 0,
69 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
70};
71struct resource crashk_low_res = {
72 .name = "Crash kernel",
73 .start = 0,
74 .end = 0,
75 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
76};
77
78int kexec_should_crash(struct task_struct *p)
79{
80 /*
81 * If crash_kexec_post_notifiers is enabled, don't run
82 * crash_kexec() here yet, which must be run after panic
83 * notifiers in panic().
84 */
85 if (crash_kexec_post_notifiers)
86 return 0;
87 /*
88 * There are 4 panic() calls in do_exit() path, each of which
89 * corresponds to each of these 4 conditions.
90 */
91 if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
92 return 1;
93 return 0;
94}
95
96/*
97 * When kexec transitions to the new kernel there is a one-to-one
98 * mapping between physical and virtual addresses. On processors
99 * where you can disable the MMU this is trivial, and easy. For
100 * others it is still a simple predictable page table to setup.
101 *
102 * In that environment kexec copies the new kernel to its final
103 * resting place. This means I can only support memory whose
104 * physical address can fit in an unsigned long. In particular
105 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
106 * If the assembly stub has more restrictive requirements
107 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
108 * defined more restrictively in <asm/kexec.h>.
109 *
110 * The code for the transition from the current kernel to
111 * the new kernel is placed in the control_code_buffer, whose size
112 * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
113 * page of memory is necessary, but some architectures require more.
114 * Because this memory must be identity mapped in the transition from
115 * virtual to physical addresses it must live in the range
116 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
117 * modifiable.
118 *
119 * The assembly stub in the control code buffer is passed a linked list
120 * of descriptor pages detailing the source pages of the new kernel,
121 * and the destination addresses of those source pages. As this data
122 * structure is not used in the context of the current OS, it must
123 * be self-contained.
124 *
125 * The code has been made to work with highmem pages and will use a
126 * destination page in its final resting place (if it happens
127 * to allocate it). The end product of this is that most of the
128 * physical address space, and most of RAM can be used.
129 *
130 * Future directions include:
131 * - allocating a page table with the control code buffer identity
132 * mapped, to simplify machine_kexec and make kexec_on_panic more
133 * reliable.
134 */
135
136/*
137 * KIMAGE_NO_DEST is an impossible destination address..., for
138 * allocating pages whose destination address we do not care about.
139 */
140#define KIMAGE_NO_DEST (-1UL)
141
142static struct page *kimage_alloc_page(struct kimage *image,
143 gfp_t gfp_mask,
144 unsigned long dest);
145
146int sanity_check_segment_list(struct kimage *image)
147{
148 int result, i;
149 unsigned long nr_segments = image->nr_segments;
150
151 /*
152 * Verify we have good destination addresses. The caller is
153 * responsible for making certain we don't attempt to load
154 * the new image into invalid or reserved areas of RAM. This
155 * just verifies it is an address we can use.
156 *
157 * Since the kernel does everything in page size chunks ensure
158 * the destination addresses are page aligned. Too many
159	 * special cases crop up when we don't do this. The most
160 * insidious is getting overlapping destination addresses
161 * simply because addresses are changed to page size
162 * granularity.
163 */
164 result = -EADDRNOTAVAIL;
165 for (i = 0; i < nr_segments; i++) {
166 unsigned long mstart, mend;
167
168 mstart = image->segment[i].mem;
169 mend = mstart + image->segment[i].memsz;
170 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
171 return result;
172 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
173 return result;
174 }
175
176 /* Verify our destination addresses do not overlap.
177	 * If we allowed overlapping destination addresses
178	 * through, very weird things can happen with no
179	 * easy explanation as one segment stomps on another.
180 */
181 result = -EINVAL;
182 for (i = 0; i < nr_segments; i++) {
183 unsigned long mstart, mend;
184 unsigned long j;
185
186 mstart = image->segment[i].mem;
187 mend = mstart + image->segment[i].memsz;
188 for (j = 0; j < i; j++) {
189 unsigned long pstart, pend;
190
191 pstart = image->segment[j].mem;
192 pend = pstart + image->segment[j].memsz;
193 /* Do the segments overlap ? */
194 if ((mend > pstart) && (mstart < pend))
195 return result;
196 }
197 }
198
199	/* Ensure our buffer sizes do not exceed our
200	 * memory sizes. This should always be the case,
201 * and it is easier to check up front than to be surprised
202 * later on.
203 */
204 result = -EINVAL;
205 for (i = 0; i < nr_segments; i++) {
206 if (image->segment[i].bufsz > image->segment[i].memsz)
207 return result;
208 }
209
210 /*
211 * Verify we have good destination addresses. Normally
212 * the caller is responsible for making certain we don't
213 * attempt to load the new image into invalid or reserved
214 * areas of RAM. But crash kernels are preloaded into a
215	 * reserved area of RAM. We must ensure the addresses
216	 * are in the reserved area, otherwise preloading the
217 * kernel could corrupt things.
218 */
219
220 if (image->type == KEXEC_TYPE_CRASH) {
221 result = -EADDRNOTAVAIL;
222 for (i = 0; i < nr_segments; i++) {
223 unsigned long mstart, mend;
224
225 mstart = image->segment[i].mem;
226 mend = mstart + image->segment[i].memsz - 1;
227 /* Ensure we are within the crash kernel limits */
228 if ((mstart < crashk_res.start) ||
229 (mend > crashk_res.end))
230 return result;
231 }
232 }
233
234 return 0;
235}
236
237struct kimage *do_kimage_alloc_init(void)
238{
239 struct kimage *image;
240
241 /* Allocate a controlling structure */
242 image = kzalloc(sizeof(*image), GFP_KERNEL);
243 if (!image)
244 return NULL;
245
246 image->head = 0;
247 image->entry = &image->head;
248 image->last_entry = &image->head;
249 image->control_page = ~0; /* By default this does not apply */
250 image->type = KEXEC_TYPE_DEFAULT;
251
252 /* Initialize the list of control pages */
253 INIT_LIST_HEAD(&image->control_pages);
254
255 /* Initialize the list of destination pages */
256 INIT_LIST_HEAD(&image->dest_pages);
257
258 /* Initialize the list of unusable pages */
259 INIT_LIST_HEAD(&image->unusable_pages);
260
261 return image;
262}
263
264int kimage_is_destination_range(struct kimage *image,
265 unsigned long start,
266 unsigned long end)
267{
268 unsigned long i;
269
270 for (i = 0; i < image->nr_segments; i++) {
271 unsigned long mstart, mend;
272
273 mstart = image->segment[i].mem;
274 mend = mstart + image->segment[i].memsz;
275 if ((end > mstart) && (start < mend))
276 return 1;
277 }
278
279 return 0;
280}
281
282static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
283{
284 struct page *pages;
285
286 pages = alloc_pages(gfp_mask, order);
287 if (pages) {
288 unsigned int count, i;
289
290 pages->mapping = NULL;
291 set_page_private(pages, order);
292 count = 1 << order;
293 for (i = 0; i < count; i++)
294 SetPageReserved(pages + i);
295 }
296
297 return pages;
298}
299
300static void kimage_free_pages(struct page *page)
301{
302 unsigned int order, count, i;
303
304 order = page_private(page);
305 count = 1 << order;
306 for (i = 0; i < count; i++)
307 ClearPageReserved(page + i);
308 __free_pages(page, order);
309}
310
311void kimage_free_page_list(struct list_head *list)
312{
313 struct list_head *pos, *next;
314
315 list_for_each_safe(pos, next, list) {
316 struct page *page;
317
318 page = list_entry(pos, struct page, lru);
319 list_del(&page->lru);
320 kimage_free_pages(page);
321 }
322}
323
324static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
325 unsigned int order)
326{
327 /* Control pages are special, they are the intermediaries
328 * that are needed while we copy the rest of the pages
329 * to their final resting place. As such they must
330 * not conflict with either the destination addresses
331 * or memory the kernel is already using.
332 *
333 * The only case where we really need more than one of
334 * these are for architectures where we cannot disable
335 * the MMU and must instead generate an identity mapped
336 * page table for all of the memory.
337 *
338 * At worst this runs in O(N) of the image size.
339 */
340 struct list_head extra_pages;
341 struct page *pages;
342 unsigned int count;
343
344 count = 1 << order;
345 INIT_LIST_HEAD(&extra_pages);
346
347 /* Loop while I can allocate a page and the page allocated
348 * is a destination page.
349 */
350 do {
351 unsigned long pfn, epfn, addr, eaddr;
352
353 pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
354 if (!pages)
355 break;
356 pfn = page_to_pfn(pages);
357 epfn = pfn + count;
358 addr = pfn << PAGE_SHIFT;
359 eaddr = epfn << PAGE_SHIFT;
360 if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
361 kimage_is_destination_range(image, addr, eaddr)) {
362 list_add(&pages->lru, &extra_pages);
363 pages = NULL;
364 }
365 } while (!pages);
366
367 if (pages) {
368 /* Remember the allocated page... */
369 list_add(&pages->lru, &image->control_pages);
370
371		/* Because the page is already in its destination
372 * location we will never allocate another page at
373 * that address. Therefore kimage_alloc_pages
374 * will not return it (again) and we don't need
375 * to give it an entry in image->segment[].
376 */
377 }
378 /* Deal with the destination pages I have inadvertently allocated.
379 *
380 * Ideally I would convert multi-page allocations into single
381 * page allocations, and add everything to image->dest_pages.
382 *
383 * For now it is simpler to just free the pages.
384 */
385 kimage_free_page_list(&extra_pages);
386
387 return pages;
388}
389
390static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
391 unsigned int order)
392{
393 /* Control pages are special, they are the intermediaries
394 * that are needed while we copy the rest of the pages
395 * to their final resting place. As such they must
396 * not conflict with either the destination addresses
397 * or memory the kernel is already using.
398 *
399	 * Control pages are also the only pages we must allocate
400 * when loading a crash kernel. All of the other pages
401 * are specified by the segments and we just memcpy
402 * into them directly.
403 *
404 * The only case where we really need more than one of
405 * these are for architectures where we cannot disable
406 * the MMU and must instead generate an identity mapped
407 * page table for all of the memory.
408 *
409 * Given the low demand this implements a very simple
410 * allocator that finds the first hole of the appropriate
411 * size in the reserved memory region, and allocates all
412 * of the memory up to and including the hole.
413 */
414 unsigned long hole_start, hole_end, size;
415 struct page *pages;
416
417 pages = NULL;
418 size = (1 << order) << PAGE_SHIFT;
419 hole_start = (image->control_page + (size - 1)) & ~(size - 1);
420 hole_end = hole_start + size - 1;
421 while (hole_end <= crashk_res.end) {
422 unsigned long i;
423
424 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
425 break;
426 /* See if I overlap any of the segments */
427 for (i = 0; i < image->nr_segments; i++) {
428 unsigned long mstart, mend;
429
430 mstart = image->segment[i].mem;
431 mend = mstart + image->segment[i].memsz - 1;
432 if ((hole_end >= mstart) && (hole_start <= mend)) {
433 /* Advance the hole to the end of the segment */
434 hole_start = (mend + (size - 1)) & ~(size - 1);
435 hole_end = hole_start + size - 1;
436 break;
437 }
438 }
439 /* If I don't overlap any segments I have found my hole! */
440 if (i == image->nr_segments) {
441 pages = pfn_to_page(hole_start >> PAGE_SHIFT);
442 image->control_page = hole_end;
443 break;
444 }
445 }
446
447 return pages;
448}
449
450
451struct page *kimage_alloc_control_pages(struct kimage *image,
452 unsigned int order)
453{
454 struct page *pages = NULL;
455
456 switch (image->type) {
457 case KEXEC_TYPE_DEFAULT:
458 pages = kimage_alloc_normal_control_pages(image, order);
459 break;
460 case KEXEC_TYPE_CRASH:
461 pages = kimage_alloc_crash_control_pages(image, order);
462 break;
463 }
464
465 return pages;
466}
467
468static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
469{
470 if (*image->entry != 0)
471 image->entry++;
472
473 if (image->entry == image->last_entry) {
474 kimage_entry_t *ind_page;
475 struct page *page;
476
477 page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
478 if (!page)
479 return -ENOMEM;
480
481 ind_page = page_address(page);
482 *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
483 image->entry = ind_page;
484 image->last_entry = ind_page +
485 ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
486 }
487 *image->entry = entry;
488 image->entry++;
489 *image->entry = 0;
490
491 return 0;
492}
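/*
 * Illustrative sketch (not part of the original source): the entry list that
 * kimage_add_entry() builds is a flat array of kimage_entry_t words, each a
 * page-aligned physical address tagged in its low bits, for example
 *
 *	dest | IND_DESTINATION	start copying at physical address 'dest'
 *	src0 | IND_SOURCE	copy page 'src0' to dest
 *	src1 | IND_SOURCE	copy page 'src1' to dest + PAGE_SIZE
 *	next | IND_INDIRECTION	continue reading entries from page 'next'
 *	IND_DONE		end of the list, written by kimage_terminate()
 *
 * for_each_kimage_entry() below walks exactly this structure.
 */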
493
494static int kimage_set_destination(struct kimage *image,
495 unsigned long destination)
496{
497 int result;
498
499 destination &= PAGE_MASK;
500 result = kimage_add_entry(image, destination | IND_DESTINATION);
501
502 return result;
503}
504
505
506static int kimage_add_page(struct kimage *image, unsigned long page)
507{
508 int result;
509
510 page &= PAGE_MASK;
511 result = kimage_add_entry(image, page | IND_SOURCE);
512
513 return result;
514}
515
516
517static void kimage_free_extra_pages(struct kimage *image)
518{
519 /* Walk through and free any extra destination pages I may have */
520 kimage_free_page_list(&image->dest_pages);
521
522 /* Walk through and free any unusable pages I have cached */
523 kimage_free_page_list(&image->unusable_pages);
524
525}
526void kimage_terminate(struct kimage *image)
527{
528 if (*image->entry != 0)
529 image->entry++;
530
531 *image->entry = IND_DONE;
532}
533
534#define for_each_kimage_entry(image, ptr, entry) \
535 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
536 ptr = (entry & IND_INDIRECTION) ? \
537 phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
538
539static void kimage_free_entry(kimage_entry_t entry)
540{
541 struct page *page;
542
543 page = pfn_to_page(entry >> PAGE_SHIFT);
544 kimage_free_pages(page);
545}
546
547void kimage_free(struct kimage *image)
548{
549 kimage_entry_t *ptr, entry;
550 kimage_entry_t ind = 0;
551
552 if (!image)
553 return;
554
555 kimage_free_extra_pages(image);
556 for_each_kimage_entry(image, ptr, entry) {
557 if (entry & IND_INDIRECTION) {
558 /* Free the previous indirection page */
559 if (ind & IND_INDIRECTION)
560 kimage_free_entry(ind);
561 /* Save this indirection page until we are
562 * done with it.
563 */
564 ind = entry;
565 } else if (entry & IND_SOURCE)
566 kimage_free_entry(entry);
567 }
568 /* Free the final indirection page */
569 if (ind & IND_INDIRECTION)
570 kimage_free_entry(ind);
571
572 /* Handle any machine specific cleanup */
573 machine_kexec_cleanup(image);
574
575 /* Free the kexec control pages... */
576 kimage_free_page_list(&image->control_pages);
577
578 /*
579 * Free up any temporary buffers allocated. This might hit if an
580 * error occurred much later, after buffer allocation.
581 */
582 if (image->file_mode)
583 kimage_file_post_load_cleanup(image);
584
585 kfree(image);
586}
587
588static kimage_entry_t *kimage_dst_used(struct kimage *image,
589 unsigned long page)
590{
591 kimage_entry_t *ptr, entry;
592 unsigned long destination = 0;
593
594 for_each_kimage_entry(image, ptr, entry) {
595 if (entry & IND_DESTINATION)
596 destination = entry & PAGE_MASK;
597 else if (entry & IND_SOURCE) {
598 if (page == destination)
599 return ptr;
600 destination += PAGE_SIZE;
601 }
602 }
603
604 return NULL;
605}
606
607static struct page *kimage_alloc_page(struct kimage *image,
608 gfp_t gfp_mask,
609 unsigned long destination)
610{
611 /*
612 * Here we implement safeguards to ensure that a source page
613 * is not copied to its destination page before the data on
614 * the destination page is no longer useful.
615 *
616 * To do this we maintain the invariant that a source page is
617 * either its own destination page, or it is not a
618 * destination page at all.
619 *
620 * That is slightly stronger than required, but the proof
621 * that no problems will occur is trivial, and the
622 * implementation is simple to verify.
623 *
624 * When allocating all pages normally this algorithm will run
625 * in O(N) time, but in the worst case it will run in O(N^2)
626 * time. If the runtime is a problem the data structures can
627 * be fixed.
628 */
629 struct page *page;
630 unsigned long addr;
631
632 /*
633 * Walk through the list of destination pages, and see if I
634 * have a match.
635 */
636 list_for_each_entry(page, &image->dest_pages, lru) {
637 addr = page_to_pfn(page) << PAGE_SHIFT;
638 if (addr == destination) {
639 list_del(&page->lru);
640 return page;
641 }
642 }
643 page = NULL;
644 while (1) {
645 kimage_entry_t *old;
646
647 /* Allocate a page, if we run out of memory give up */
648 page = kimage_alloc_pages(gfp_mask, 0);
649 if (!page)
650 return NULL;
651 /* If the page cannot be used, file it away */
652 if (page_to_pfn(page) >
653 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
654 list_add(&page->lru, &image->unusable_pages);
655 continue;
656 }
657 addr = page_to_pfn(page) << PAGE_SHIFT;
658
659 /* If it is the destination page we want, use it */
660 if (addr == destination)
661 break;
662
663 /* If the page is not a destination page, use it */
664 if (!kimage_is_destination_range(image, addr,
665 addr + PAGE_SIZE))
666 break;
667
668 /*
669 * I know that the page is someone's destination page.
670 * See if there is already a source page for this
671 * destination page, and if so swap the source pages.
672 */
673 old = kimage_dst_used(image, addr);
674 if (old) {
675 /* If so move it */
676 unsigned long old_addr;
677 struct page *old_page;
678
679 old_addr = *old & PAGE_MASK;
680 old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
681 copy_highpage(page, old_page);
682 *old = addr | (*old & ~PAGE_MASK);
683
684 /* The old page I have found cannot be a
685 * destination page, so return it if its
686 * gfp_flags honor the ones passed in.
687 */
688 if (!(gfp_mask & __GFP_HIGHMEM) &&
689 PageHighMem(old_page)) {
690 kimage_free_pages(old_page);
691 continue;
692 }
693 addr = old_addr;
694 page = old_page;
695 break;
696 }
697 /* Place the page on the destination list, to be used later */
698 list_add(&page->lru, &image->dest_pages);
699 }
700
701 return page;
702}
703
704static int kimage_load_normal_segment(struct kimage *image,
705 struct kexec_segment *segment)
706{
707 unsigned long maddr;
708 size_t ubytes, mbytes;
709 int result;
710 unsigned char __user *buf = NULL;
711 unsigned char *kbuf = NULL;
712
713 result = 0;
714 if (image->file_mode)
715 kbuf = segment->kbuf;
716 else
717 buf = segment->buf;
718 ubytes = segment->bufsz;
719 mbytes = segment->memsz;
720 maddr = segment->mem;
721
722 result = kimage_set_destination(image, maddr);
723 if (result < 0)
724 goto out;
725
726 while (mbytes) {
727 struct page *page;
728 char *ptr;
729 size_t uchunk, mchunk;
730
731 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
732 if (!page) {
733 result = -ENOMEM;
734 goto out;
735 }
736 result = kimage_add_page(image, page_to_pfn(page)
737 << PAGE_SHIFT);
738 if (result < 0)
739 goto out;
740
741 ptr = kmap(page);
742 /* Start with a clear page */
743 clear_page(ptr);
744 ptr += maddr & ~PAGE_MASK;
745 mchunk = min_t(size_t, mbytes,
746 PAGE_SIZE - (maddr & ~PAGE_MASK));
747 uchunk = min(ubytes, mchunk);
748
749 /* For file based kexec, source pages are in kernel memory */
750 if (image->file_mode)
751 memcpy(ptr, kbuf, uchunk);
752 else
753 result = copy_from_user(ptr, buf, uchunk);
754 kunmap(page);
755 if (result) {
756 result = -EFAULT;
757 goto out;
758 }
759 ubytes -= uchunk;
760 maddr += mchunk;
761 if (image->file_mode)
762 kbuf += mchunk;
763 else
764 buf += mchunk;
765 mbytes -= mchunk;
766 }
767out:
768 return result;
769}
770
771static int kimage_load_crash_segment(struct kimage *image,
772 struct kexec_segment *segment)
773{
774 /* For crash dump kernels we simply copy the data from
775 * user space to its destination.
776 * We do things a page at a time for the sake of kmap.
777 */
778 unsigned long maddr;
779 size_t ubytes, mbytes;
780 int result;
781 unsigned char __user *buf = NULL;
782 unsigned char *kbuf = NULL;
783
784 result = 0;
785 if (image->file_mode)
786 kbuf = segment->kbuf;
787 else
788 buf = segment->buf;
789 ubytes = segment->bufsz;
790 mbytes = segment->memsz;
791 maddr = segment->mem;
792 while (mbytes) {
793 struct page *page;
794 char *ptr;
795 size_t uchunk, mchunk;
796
797 page = pfn_to_page(maddr >> PAGE_SHIFT);
798 if (!page) {
799 result = -ENOMEM;
800 goto out;
801 }
802 ptr = kmap(page);
803 ptr += maddr & ~PAGE_MASK;
804 mchunk = min_t(size_t, mbytes,
805 PAGE_SIZE - (maddr & ~PAGE_MASK));
806 uchunk = min(ubytes, mchunk);
807 if (mchunk > uchunk) {
808 /* Zero the trailing part of the page */
809 memset(ptr + uchunk, 0, mchunk - uchunk);
810 }
811
812 /* For file based kexec, source pages are in kernel memory */
813 if (image->file_mode)
814 memcpy(ptr, kbuf, uchunk);
815 else
816 result = copy_from_user(ptr, buf, uchunk);
817 kexec_flush_icache_page(page);
818 kunmap(page);
819 if (result) {
820 result = -EFAULT;
821 goto out;
822 }
823 ubytes -= uchunk;
824 maddr += mchunk;
825 if (image->file_mode)
826 kbuf += mchunk;
827 else
828 buf += mchunk;
829 mbytes -= mchunk;
830 }
831out:
832 return result;
833}
834
835int kimage_load_segment(struct kimage *image,
836 struct kexec_segment *segment)
837{
838 int result = -ENOMEM;
839
840 switch (image->type) {
841 case KEXEC_TYPE_DEFAULT:
842 result = kimage_load_normal_segment(image, segment);
843 break;
844 case KEXEC_TYPE_CRASH:
845 result = kimage_load_crash_segment(image, segment);
846 break;
847 }
848
849 return result;
850}
851
852struct kimage *kexec_image;
853struct kimage *kexec_crash_image;
854int kexec_load_disabled;
855
856void crash_kexec(struct pt_regs *regs)
857{
858 /* Take the kexec_mutex here to prevent sys_kexec_load
859 * running on one cpu from replacing the crash kernel
860 * we are using after a panic on a different cpu.
861 *
862 * If the crash kernel was not located in a fixed area
863 * of memory the xchg(&kexec_crash_image) would be
864 * sufficient. But since I reuse the memory...
865 */
866 if (mutex_trylock(&kexec_mutex)) {
867 if (kexec_crash_image) {
868 struct pt_regs fixed_regs;
869
870 crash_setup_regs(&fixed_regs, regs);
871 crash_save_vmcoreinfo();
872 machine_crash_shutdown(&fixed_regs);
873 machine_kexec(kexec_crash_image);
874 }
875 mutex_unlock(&kexec_mutex);
876 }
877}
878
879size_t crash_get_memory_size(void)
880{
881 size_t size = 0;
882
883 mutex_lock(&kexec_mutex);
884 if (crashk_res.end != crashk_res.start)
885 size = resource_size(&crashk_res);
886 mutex_unlock(&kexec_mutex);
887 return size;
888}
889
890void __weak crash_free_reserved_phys_range(unsigned long begin,
891 unsigned long end)
892{
893 unsigned long addr;
894
895 for (addr = begin; addr < end; addr += PAGE_SIZE)
896 free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
897}
898
899int crash_shrink_memory(unsigned long new_size)
900{
901 int ret = 0;
902 unsigned long start, end;
903 unsigned long old_size;
904 struct resource *ram_res;
905
906 mutex_lock(&kexec_mutex);
907
908 if (kexec_crash_image) {
909 ret = -ENOENT;
910 goto unlock;
911 }
912 start = crashk_res.start;
913 end = crashk_res.end;
914 old_size = (end == 0) ? 0 : end - start + 1;
915 if (new_size >= old_size) {
916 ret = (new_size == old_size) ? 0 : -EINVAL;
917 goto unlock;
918 }
919
920 ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
921 if (!ram_res) {
922 ret = -ENOMEM;
923 goto unlock;
924 }
925
926 start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
927 end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
928
929 crash_map_reserved_pages();
930 crash_free_reserved_phys_range(end, crashk_res.end);
931
932 if ((start == end) && (crashk_res.parent != NULL))
933 release_resource(&crashk_res);
934
935 ram_res->start = end;
936 ram_res->end = crashk_res.end;
937 ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
938 ram_res->name = "System RAM";
939
940 crashk_res.end = end - 1;
941
942 insert_resource(&iomem_resource, ram_res);
943 crash_unmap_reserved_pages();
944
945unlock:
946 mutex_unlock(&kexec_mutex);
947 return ret;
948}
949
950static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
951 size_t data_len)
952{
953 struct elf_note note;
954
955 note.n_namesz = strlen(name) + 1;
956 note.n_descsz = data_len;
957 note.n_type = type;
958 memcpy(buf, &note, sizeof(note));
959 buf += (sizeof(note) + 3)/4;
960 memcpy(buf, name, note.n_namesz);
961 buf += (note.n_namesz + 3)/4;
962 memcpy(buf, data, note.n_descsz);
963 buf += (note.n_descsz + 3)/4;
964
965 return buf;
966}
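/*
 * Illustrative sketch (not part of the original source): the words written
 * by append_elf_note() form a standard ELF note record,
 *
 *	u32  n_namesz;		length of name, including the trailing NUL
 *	u32  n_descsz;		length of data
 *	u32  n_type;		e.g. NT_PRSTATUS
 *	char name[n_namesz];	padded to a 4-byte boundary
 *	char desc[n_descsz];	padded to a 4-byte boundary
 *
 * which is why buf, a u32 pointer, advances by (len + 3)/4 words after each
 * field.
 */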
967
968static void final_note(u32 *buf)
969{
970 struct elf_note note;
971
972 note.n_namesz = 0;
973 note.n_descsz = 0;
974 note.n_type = 0;
975 memcpy(buf, &note, sizeof(note));
976}
977
978void crash_save_cpu(struct pt_regs *regs, int cpu)
979{
980 struct elf_prstatus prstatus;
981 u32 *buf;
982
983 if ((cpu < 0) || (cpu >= nr_cpu_ids))
984 return;
985
986 /* Using ELF notes here is opportunistic.
987 * I need a well defined structure format
988 * for the data I pass, and I need tags
989 * on the data to indicate what information I have
990 * squirrelled away. ELF notes happen to provide
991 * all of that, so there is no need to invent something new.
992 */
993 buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
994 if (!buf)
995 return;
996 memset(&prstatus, 0, sizeof(prstatus));
997 prstatus.pr_pid = current->pid;
998 elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
999 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
1000 &prstatus, sizeof(prstatus));
1001 final_note(buf);
1002}
1003
1004static int __init crash_notes_memory_init(void)
1005{
1006 /* Allocate memory for saving cpu registers. */
1007 size_t size, align;
1008
1009 /*
1010 * crash_notes could be allocated across 2 vmalloc pages when percpu
1011 * is vmalloc based. vmalloc doesn't guarantee that 2 contiguous vmalloc
1012 * pages are also on 2 contiguous physical pages. In this case the
1013 * 2nd part of crash_notes in 2nd page could be lost since only the
1014 * starting address and size of crash_notes are exported through sysfs.
1015 * Here round up the size of crash_notes to the nearest power of two
1016 * and pass it to __alloc_percpu as align value. This can make sure
1017 * crash_notes is allocated inside one physical page.
1018 */
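	/*
	 * Worked example (hypothetical size, not from the original source):
	 * if sizeof(note_buf_t) were 0x150 bytes, align would become 0x200,
	 * so every per-cpu crash_notes copy starts on a 0x200-byte boundary
	 * and can never straddle a page boundary.
	 */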
1019 size = sizeof(note_buf_t);
1020 align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE);
1021
1022 /*
1023 * Break the compile if size is bigger than PAGE_SIZE, since crash_notes
1024 * would then definitely span 2 pages.
1025 */
1026 BUILD_BUG_ON(size > PAGE_SIZE);
1027
1028 crash_notes = __alloc_percpu(size, align);
1029 if (!crash_notes) {
1030 pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
1031 return -ENOMEM;
1032 }
1033 return 0;
1034}
1035subsys_initcall(crash_notes_memory_init);
1036
1037
1038/*
1039 * parsing the "crashkernel" commandline
1040 *
1041 * this code is intended to be called from architecture specific code
1042 */
1043
1044
1045/*
1046 * This function parses command lines in the format
1047 *
1048 * crashkernel=ramsize-range:size[,...][@offset]
1049 *
1050 * The function returns 0 on success and -EINVAL on failure.
1051 */
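/*
 * Example (illustrative command line, not from the original source): on a
 * machine with 4G of RAM,
 *
 *	crashkernel=512M-2G:64M,2G-:128M@16M
 *
 * matches the "2G-" range and therefore reserves 128M at physical address
 * 16M; the 512M-2G:64M entry is skipped.
 */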
1052static int __init parse_crashkernel_mem(char *cmdline,
1053 unsigned long long system_ram,
1054 unsigned long long *crash_size,
1055 unsigned long long *crash_base)
1056{
1057 char *cur = cmdline, *tmp;
1058
1059 /* for each entry of the comma-separated list */
1060 do {
1061 unsigned long long start, end = ULLONG_MAX, size;
1062
1063 /* get the start of the range */
1064 start = memparse(cur, &tmp);
1065 if (cur == tmp) {
1066 pr_warn("crashkernel: Memory value expected\n");
1067 return -EINVAL;
1068 }
1069 cur = tmp;
1070 if (*cur != '-') {
1071 pr_warn("crashkernel: '-' expected\n");
1072 return -EINVAL;
1073 }
1074 cur++;
1075
1076 /* if no ':' is here, then we read the end */
1077 if (*cur != ':') {
1078 end = memparse(cur, &tmp);
1079 if (cur == tmp) {
1080 pr_warn("crashkernel: Memory value expected\n");
1081 return -EINVAL;
1082 }
1083 cur = tmp;
1084 if (end <= start) {
1085 pr_warn("crashkernel: end <= start\n");
1086 return -EINVAL;
1087 }
1088 }
1089
1090 if (*cur != ':') {
1091 pr_warn("crashkernel: ':' expected\n");
1092 return -EINVAL;
1093 }
1094 cur++;
1095
1096 size = memparse(cur, &tmp);
1097 if (cur == tmp) {
1098 pr_warn("Memory value expected\n");
1099 return -EINVAL;
1100 }
1101 cur = tmp;
1102 if (size >= system_ram) {
1103 pr_warn("crashkernel: invalid size\n");
1104 return -EINVAL;
1105 }
1106
1107 /* match ? */
1108 if (system_ram >= start && system_ram < end) {
1109 *crash_size = size;
1110 break;
1111 }
1112 } while (*cur++ == ',');
1113
1114 if (*crash_size > 0) {
1115 while (*cur && *cur != ' ' && *cur != '@')
1116 cur++;
1117 if (*cur == '@') {
1118 cur++;
1119 *crash_base = memparse(cur, &tmp);
1120 if (cur == tmp) {
1121 pr_warn("Memory value expected after '@'\n");
1122 return -EINVAL;
1123 }
1124 }
1125 }
1126
1127 return 0;
1128}
1129
1130/*
1131 * This function parses "simple" (old) crashkernel command lines like
1132 *
1133 * crashkernel=size[@offset]
1134 *
1135 * It returns 0 on success and -EINVAL on failure.
1136 */
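/*
 * Example (illustrative, not from the original source):
 *
 *	crashkernel=128M@16M
 *
 * yields *crash_size = 128M and *crash_base = 16M; without "@offset" the
 * base stays 0 and the architecture picks a suitable location.
 */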
1137static int __init parse_crashkernel_simple(char *cmdline,
1138 unsigned long long *crash_size,
1139 unsigned long long *crash_base)
1140{
1141 char *cur = cmdline;
1142
1143 *crash_size = memparse(cmdline, &cur);
1144 if (cmdline == cur) {
1145 pr_warn("crashkernel: memory value expected\n");
1146 return -EINVAL;
1147 }
1148
1149 if (*cur == '@')
1150 *crash_base = memparse(cur+1, &cur);
1151 else if (*cur != ' ' && *cur != '\0') {
1152 pr_warn("crashkernel: unrecognized char\n");
1153 return -EINVAL;
1154 }
1155
1156 return 0;
1157}
1158
1159#define SUFFIX_HIGH 0
1160#define SUFFIX_LOW 1
1161#define SUFFIX_NULL 2
1162static __initdata char *suffix_tbl[] = {
1163 [SUFFIX_HIGH] = ",high",
1164 [SUFFIX_LOW] = ",low",
1165 [SUFFIX_NULL] = NULL,
1166};
1167
1168/*
1169 * This function parses "suffix" crashkernel command lines like
1170 *
1171 * crashkernel=size,[high|low]
1172 *
1173 * It returns 0 on success and -EINVAL on failure.
1174 */
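/*
 * Example (illustrative, not from the original source): with suffix ",high",
 *
 *	crashkernel=256M,high
 *
 * yields *crash_size = 256M. An "@offset" is not accepted in this form,
 * since anything other than ' ' or '\0' after the suffix is rejected.
 */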
1175static int __init parse_crashkernel_suffix(char *cmdline,
1176 unsigned long long *crash_size,
1177 const char *suffix)
1178{
1179 char *cur = cmdline;
1180
1181 *crash_size = memparse(cmdline, &cur);
1182 if (cmdline == cur) {
1183 pr_warn("crashkernel: memory value expected\n");
1184 return -EINVAL;
1185 }
1186
1187 /* check with suffix */
1188 if (strncmp(cur, suffix, strlen(suffix))) {
1189 pr_warn("crashkernel: unrecognized char\n");
1190 return -EINVAL;
1191 }
1192 cur += strlen(suffix);
1193 if (*cur != ' ' && *cur != '\0') {
1194 pr_warn("crashkernel: unrecognized char\n");
1195 return -EINVAL;
1196 }
1197
1198 return 0;
1199}
1200
1201static __init char *get_last_crashkernel(char *cmdline,
1202 const char *name,
1203 const char *suffix)
1204{
1205 char *p = cmdline, *ck_cmdline = NULL;
1206
1207 /* find crashkernel and use the last one if there are more */
1208 p = strstr(p, name);
1209 while (p) {
1210 char *end_p = strchr(p, ' ');
1211 char *q;
1212
1213 if (!end_p)
1214 end_p = p + strlen(p);
1215
1216 if (!suffix) {
1217 int i;
1218
1219 /* skip the one with any known suffix */
1220 for (i = 0; suffix_tbl[i]; i++) {
1221 q = end_p - strlen(suffix_tbl[i]);
1222 if (!strncmp(q, suffix_tbl[i],
1223 strlen(suffix_tbl[i])))
1224 goto next;
1225 }
1226 ck_cmdline = p;
1227 } else {
1228 q = end_p - strlen(suffix);
1229 if (!strncmp(q, suffix, strlen(suffix)))
1230 ck_cmdline = p;
1231 }
1232next:
1233 p = strstr(p+1, name);
1234 }
1235
1236 if (!ck_cmdline)
1237 return NULL;
1238
1239 return ck_cmdline;
1240}
1241
1242static int __init __parse_crashkernel(char *cmdline,
1243 unsigned long long system_ram,
1244 unsigned long long *crash_size,
1245 unsigned long long *crash_base,
1246 const char *name,
1247 const char *suffix)
1248{
1249 char *first_colon, *first_space;
1250 char *ck_cmdline;
1251
1252 BUG_ON(!crash_size || !crash_base);
1253 *crash_size = 0;
1254 *crash_base = 0;
1255
1256 ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
1257
1258 if (!ck_cmdline)
1259 return -EINVAL;
1260
1261 ck_cmdline += strlen(name);
1262
1263 if (suffix)
1264 return parse_crashkernel_suffix(ck_cmdline, crash_size,
1265 suffix);
1266 /*
1267 * if the commandline contains a ':', then that's the extended
1268 * syntax -- if not, it must be the classic syntax
1269 */
1270 first_colon = strchr(ck_cmdline, ':');
1271 first_space = strchr(ck_cmdline, ' ');
1272 if (first_colon && (!first_space || first_colon < first_space))
1273 return parse_crashkernel_mem(ck_cmdline, system_ram,
1274 crash_size, crash_base);
1275
1276 return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
1277}
1278
1279/*
1280 * This function is the entry point for command line parsing and should be
1281 * called from the arch-specific code.
1282 */
1283int __init parse_crashkernel(char *cmdline,
1284 unsigned long long system_ram,
1285 unsigned long long *crash_size,
1286 unsigned long long *crash_base)
1287{
1288 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1289 "crashkernel=", NULL);
1290}
1291
1292int __init parse_crashkernel_high(char *cmdline,
1293 unsigned long long system_ram,
1294 unsigned long long *crash_size,
1295 unsigned long long *crash_base)
1296{
1297 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1298 "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
1299}
1300
1301int __init parse_crashkernel_low(char *cmdline,
1302 unsigned long long system_ram,
1303 unsigned long long *crash_size,
1304 unsigned long long *crash_base)
1305{
1306 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1307 "crashkernel=", suffix_tbl[SUFFIX_LOW]);
1308}
1309
1310static void update_vmcoreinfo_note(void)
1311{
1312 u32 *buf = vmcoreinfo_note;
1313
1314 if (!vmcoreinfo_size)
1315 return;
1316 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
1317 vmcoreinfo_size);
1318 final_note(buf);
1319}
1320
1321void crash_save_vmcoreinfo(void)
1322{
1323 vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
1324 update_vmcoreinfo_note();
1325}
1326
1327void vmcoreinfo_append_str(const char *fmt, ...)
1328{
1329 va_list args;
1330 char buf[0x50];
1331 size_t r;
1332
1333 va_start(args, fmt);
1334 r = vscnprintf(buf, sizeof(buf), fmt, args);
1335 va_end(args);
1336
1337 r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
1338
1339 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
1340
1341 vmcoreinfo_size += r;
1342}
1343
1344/*
1345 * provide an empty default implementation here -- architecture
1346 * code may override this
1347 */
1348void __weak arch_crash_save_vmcoreinfo(void)
1349{}
1350
1351unsigned long __weak paddr_vmcoreinfo_note(void)
1352{
1353 return __pa((unsigned long)(char *)&vmcoreinfo_note);
1354}
1355
1356static int __init crash_save_vmcoreinfo_init(void)
1357{
1358 VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
1359 VMCOREINFO_PAGESIZE(PAGE_SIZE);
1360
1361 VMCOREINFO_SYMBOL(init_uts_ns);
1362 VMCOREINFO_SYMBOL(node_online_map);
1363#ifdef CONFIG_MMU
1364 VMCOREINFO_SYMBOL(swapper_pg_dir);
1365#endif
1366 VMCOREINFO_SYMBOL(_stext);
1367 VMCOREINFO_SYMBOL(vmap_area_list);
1368
1369#ifndef CONFIG_NEED_MULTIPLE_NODES
1370 VMCOREINFO_SYMBOL(mem_map);
1371 VMCOREINFO_SYMBOL(contig_page_data);
1372#endif
1373#ifdef CONFIG_SPARSEMEM
1374 VMCOREINFO_SYMBOL(mem_section);
1375 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
1376 VMCOREINFO_STRUCT_SIZE(mem_section);
1377 VMCOREINFO_OFFSET(mem_section, section_mem_map);
1378#endif
1379 VMCOREINFO_STRUCT_SIZE(page);
1380 VMCOREINFO_STRUCT_SIZE(pglist_data);
1381 VMCOREINFO_STRUCT_SIZE(zone);
1382 VMCOREINFO_STRUCT_SIZE(free_area);
1383 VMCOREINFO_STRUCT_SIZE(list_head);
1384 VMCOREINFO_SIZE(nodemask_t);
1385 VMCOREINFO_OFFSET(page, flags);
1386 VMCOREINFO_OFFSET(page, _count);
1387 VMCOREINFO_OFFSET(page, mapping);
1388 VMCOREINFO_OFFSET(page, lru);
1389 VMCOREINFO_OFFSET(page, _mapcount);
1390 VMCOREINFO_OFFSET(page, private);
1391 VMCOREINFO_OFFSET(pglist_data, node_zones);
1392 VMCOREINFO_OFFSET(pglist_data, nr_zones);
1393#ifdef CONFIG_FLAT_NODE_MEM_MAP
1394 VMCOREINFO_OFFSET(pglist_data, node_mem_map);
1395#endif
1396 VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
1397 VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
1398 VMCOREINFO_OFFSET(pglist_data, node_id);
1399 VMCOREINFO_OFFSET(zone, free_area);
1400 VMCOREINFO_OFFSET(zone, vm_stat);
1401 VMCOREINFO_OFFSET(zone, spanned_pages);
1402 VMCOREINFO_OFFSET(free_area, free_list);
1403 VMCOREINFO_OFFSET(list_head, next);
1404 VMCOREINFO_OFFSET(list_head, prev);
1405 VMCOREINFO_OFFSET(vmap_area, va_start);
1406 VMCOREINFO_OFFSET(vmap_area, list);
1407 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1408 log_buf_kexec_setup();
1409 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1410 VMCOREINFO_NUMBER(NR_FREE_PAGES);
1411 VMCOREINFO_NUMBER(PG_lru);
1412 VMCOREINFO_NUMBER(PG_private);
1413 VMCOREINFO_NUMBER(PG_swapcache);
1414 VMCOREINFO_NUMBER(PG_slab);
1415#ifdef CONFIG_MEMORY_FAILURE
1416 VMCOREINFO_NUMBER(PG_hwpoison);
1417#endif
1418 VMCOREINFO_NUMBER(PG_head_mask);
1419 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
1420#ifdef CONFIG_X86
1421 VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
1422#endif
1423#ifdef CONFIG_HUGETLBFS
1424 VMCOREINFO_SYMBOL(free_huge_page);
1425#endif
1426
1427 arch_crash_save_vmcoreinfo();
1428 update_vmcoreinfo_note();
1429
1430 return 0;
1431}
1432
1433subsys_initcall(crash_save_vmcoreinfo_init);
1434
1435/*
1436 * Move into place and start executing a preloaded standalone
1437 * executable. If nothing was preloaded return an error.
1438 */
1439int kernel_kexec(void)
1440{
1441 int error = 0;
1442
1443 if (!mutex_trylock(&kexec_mutex))
1444 return -EBUSY;
1445 if (!kexec_image) {
1446 error = -EINVAL;
1447 goto Unlock;
1448 }
1449
1450#ifdef CONFIG_KEXEC_JUMP
1451 if (kexec_image->preserve_context) {
1452 lock_system_sleep();
1453 pm_prepare_console();
1454 error = freeze_processes();
1455 if (error) {
1456 error = -EBUSY;
1457 goto Restore_console;
1458 }
1459 suspend_console();
1460 error = dpm_suspend_start(PMSG_FREEZE);
1461 if (error)
1462 goto Resume_console;
1463 /* At this point, dpm_suspend_start() has been called,
1464 * but *not* dpm_suspend_end(). We *must* call
1465 * dpm_suspend_end() now. Otherwise, drivers for
1466 * some devices (e.g. interrupt controllers) become
1467 * desynchronized with the actual state of the
1468 * hardware at resume time, and evil weirdness ensues.
1469 */
1470 error = dpm_suspend_end(PMSG_FREEZE);
1471 if (error)
1472 goto Resume_devices;
1473 error = disable_nonboot_cpus();
1474 if (error)
1475 goto Enable_cpus;
1476 local_irq_disable();
1477 error = syscore_suspend();
1478 if (error)
1479 goto Enable_irqs;
1480 } else
1481#endif
1482 {
1483 kexec_in_progress = true;
1484 kernel_restart_prepare(NULL);
1485 migrate_to_reboot_cpu();
1486
1487 /*
1488 * migrate_to_reboot_cpu() disables CPU hotplug assuming that
1489 * no further code needs to use CPU hotplug (which is true in
1490 * the reboot case). However, the kexec path depends on using
1491 * CPU hotplug again; so re-enable it here.
1492 */
1493 cpu_hotplug_enable();
1494 pr_emerg("Starting new kernel\n");
1495 machine_shutdown();
1496 }
1497
1498 machine_kexec(kexec_image);
1499
1500#ifdef CONFIG_KEXEC_JUMP
1501 if (kexec_image->preserve_context) {
1502 syscore_resume();
1503 Enable_irqs:
1504 local_irq_enable();
1505 Enable_cpus:
1506 enable_nonboot_cpus();
1507 dpm_resume_start(PMSG_RESTORE);
1508 Resume_devices:
1509 dpm_resume_end(PMSG_RESTORE);
1510 Resume_console:
1511 resume_console();
1512 thaw_processes();
1513 Restore_console:
1514 pm_restore_console();
1515 unlock_system_sleep();
1516 }
1517#endif
1518
1519 Unlock:
1520 mutex_unlock(&kexec_mutex);
1521 return error;
1522}
1523
1524/*
1525 * Add and remove page tables for crashkernel memory
1526 *
1527 * Provide an empty default implementation here -- architecture
1528 * code may override this
1529 */
1530void __weak crash_map_reserved_pages(void)
1531{}
1532
1533void __weak crash_unmap_reserved_pages(void)
1534{}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
new file mode 100644
index 000000000000..6a9a3f2a0e8e
--- /dev/null
+++ b/kernel/kexec_file.c
@@ -0,0 +1,1045 @@
1/*
2 * kexec: kexec_file_load system call
3 *
4 * Copyright (C) 2014 Red Hat Inc.
5 * Authors:
6 * Vivek Goyal <vgoyal@redhat.com>
7 *
8 * This source code is licensed under the GNU General Public License,
9 * Version 2. See the file COPYING for more details.
10 */
11
12#include <linux/capability.h>
13#include <linux/mm.h>
14#include <linux/file.h>
15#include <linux/slab.h>
16#include <linux/kexec.h>
17#include <linux/mutex.h>
18#include <linux/list.h>
19#include <crypto/hash.h>
20#include <crypto/sha.h>
21#include <linux/syscalls.h>
22#include <linux/vmalloc.h>
23#include "kexec_internal.h"
24
25/*
26 * Declare these symbols weak so that if the architecture provides a purgatory,
27 * they will be overridden.
28 */
29char __weak kexec_purgatory[0];
30size_t __weak kexec_purgatory_size = 0;
31
32static int kexec_calculate_store_digests(struct kimage *image);
33
34static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
35{
36 struct fd f = fdget(fd);
37 int ret;
38 struct kstat stat;
39 loff_t pos;
40 ssize_t bytes = 0;
41
42 if (!f.file)
43 return -EBADF;
44
45 ret = vfs_getattr(&f.file->f_path, &stat);
46 if (ret)
47 goto out;
48
49 if (stat.size > INT_MAX) {
50 ret = -EFBIG;
51 goto out;
52 }
53
54 /* Don't hand 0 to vmalloc, it whines. */
55 if (stat.size == 0) {
56 ret = -EINVAL;
57 goto out;
58 }
59
60 *buf = vmalloc(stat.size);
61 if (!*buf) {
62 ret = -ENOMEM;
63 goto out;
64 }
65
66 pos = 0;
67 while (pos < stat.size) {
68 bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
69 stat.size - pos);
70 if (bytes < 0) {
71 vfree(*buf);
72 ret = bytes;
73 goto out;
74 }
75
76 if (bytes == 0)
77 break;
78 pos += bytes;
79 }
80
81 if (pos != stat.size) {
82 ret = -EBADF;
83 vfree(*buf);
84 goto out;
85 }
86
87 *buf_len = pos;
88out:
89 fdput(f);
90 return ret;
91}
92
93/* Architectures can provide this probe function */
94int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
95 unsigned long buf_len)
96{
97 return -ENOEXEC;
98}
99
100void * __weak arch_kexec_kernel_image_load(struct kimage *image)
101{
102 return ERR_PTR(-ENOEXEC);
103}
104
105int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
106{
107 return -EINVAL;
108}
109
110int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
111 unsigned long buf_len)
112{
113 return -EKEYREJECTED;
114}
115
116/* Apply relocations of type RELA */
117int __weak
118arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
119 unsigned int relsec)
120{
121 pr_err("RELA relocation unsupported.\n");
122 return -ENOEXEC;
123}
124
125/* Apply relocations of type REL */
126int __weak
127arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
128 unsigned int relsec)
129{
130 pr_err("REL relocation unsupported.\n");
131 return -ENOEXEC;
132}
133
134/*
135 * Free up memory used by the kernel, initrd, and command line. This is a temporary
136 * memory allocation which is no longer needed after these buffers have
137 * been loaded into separate segments and have been copied elsewhere.
138 */
139void kimage_file_post_load_cleanup(struct kimage *image)
140{
141 struct purgatory_info *pi = &image->purgatory_info;
142
143 vfree(image->kernel_buf);
144 image->kernel_buf = NULL;
145
146 vfree(image->initrd_buf);
147 image->initrd_buf = NULL;
148
149 kfree(image->cmdline_buf);
150 image->cmdline_buf = NULL;
151
152 vfree(pi->purgatory_buf);
153 pi->purgatory_buf = NULL;
154
155 vfree(pi->sechdrs);
156 pi->sechdrs = NULL;
157
158 /* See if architecture has anything to cleanup post load */
159 arch_kimage_file_post_load_cleanup(image);
160
161 /*
162 * Above call should have called into bootloader to free up
163 * any data stored in kimage->image_loader_data. It should
164 * be ok now to free it up.
165 */
166 kfree(image->image_loader_data);
167 image->image_loader_data = NULL;
168}
169
170/*
171 * In file mode, the list of segments is prepared by the kernel. Copy relevant
172 * data from user space, do error checking and prepare the segment list
173 */
174static int
175kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
176 const char __user *cmdline_ptr,
177 unsigned long cmdline_len, unsigned flags)
178{
179 int ret = 0;
180 void *ldata;
181
182 ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
183 &image->kernel_buf_len);
184 if (ret)
185 return ret;
186
187 /* Call arch image probe handlers */
188 ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
189 image->kernel_buf_len);
190
191 if (ret)
192 goto out;
193
194#ifdef CONFIG_KEXEC_VERIFY_SIG
195 ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
196 image->kernel_buf_len);
197 if (ret) {
198 pr_debug("kernel signature verification failed.\n");
199 goto out;
200 }
201 pr_debug("kernel signature verification successful.\n");
202#endif
203 /* It is possible that no initramfs is being loaded */
204 if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
205 ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
206 &image->initrd_buf_len);
207 if (ret)
208 goto out;
209 }
210
211 if (cmdline_len) {
212 image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
213 if (!image->cmdline_buf) {
214 ret = -ENOMEM;
215 goto out;
216 }
217
218 ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
219 cmdline_len);
220 if (ret) {
221 ret = -EFAULT;
222 goto out;
223 }
224
225 image->cmdline_buf_len = cmdline_len;
226
227 /* The command line should be a string with the last byte null */
228 if (image->cmdline_buf[cmdline_len - 1] != '\0') {
229 ret = -EINVAL;
230 goto out;
231 }
232 }
233
234 /* Call arch image load handlers */
235 ldata = arch_kexec_kernel_image_load(image);
236
237 if (IS_ERR(ldata)) {
238 ret = PTR_ERR(ldata);
239 goto out;
240 }
241
242 image->image_loader_data = ldata;
243out:
244 /* In case of error, free up all allocated memory in this function */
245 if (ret)
246 kimage_file_post_load_cleanup(image);
247 return ret;
248}
249
250static int
251kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
252 int initrd_fd, const char __user *cmdline_ptr,
253 unsigned long cmdline_len, unsigned long flags)
254{
255 int ret;
256 struct kimage *image;
257 bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
258
259 image = do_kimage_alloc_init();
260 if (!image)
261 return -ENOMEM;
262
263 image->file_mode = 1;
264
265 if (kexec_on_panic) {
266 /* Enable special crash kernel control page alloc policy. */
267 image->control_page = crashk_res.start;
268 image->type = KEXEC_TYPE_CRASH;
269 }
270
271 ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
272 cmdline_ptr, cmdline_len, flags);
273 if (ret)
274 goto out_free_image;
275
276 ret = sanity_check_segment_list(image);
277 if (ret)
278 goto out_free_post_load_bufs;
279
280 ret = -ENOMEM;
281 image->control_code_page = kimage_alloc_control_pages(image,
282 get_order(KEXEC_CONTROL_PAGE_SIZE));
283 if (!image->control_code_page) {
284 pr_err("Could not allocate control_code_buffer\n");
285 goto out_free_post_load_bufs;
286 }
287
288 if (!kexec_on_panic) {
289 image->swap_page = kimage_alloc_control_pages(image, 0);
290 if (!image->swap_page) {
291 pr_err("Could not allocate swap buffer\n");
292 goto out_free_control_pages;
293 }
294 }
295
296 *rimage = image;
297 return 0;
298out_free_control_pages:
299 kimage_free_page_list(&image->control_pages);
300out_free_post_load_bufs:
301 kimage_file_post_load_cleanup(image);
302out_free_image:
303 kfree(image);
304 return ret;
305}
306
307SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
308 unsigned long, cmdline_len, const char __user *, cmdline_ptr,
309 unsigned long, flags)
310{
311 int ret = 0, i;
312 struct kimage **dest_image, *image;
313
314 /* We only trust the superuser with rebooting the system. */
315 if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
316 return -EPERM;
317
318 /* Make sure we have a legal set of flags */
319 if (flags != (flags & KEXEC_FILE_FLAGS))
320 return -EINVAL;
321
322 image = NULL;
323
324 if (!mutex_trylock(&kexec_mutex))
325 return -EBUSY;
326
327 dest_image = &kexec_image;
328 if (flags & KEXEC_FILE_ON_CRASH)
329 dest_image = &kexec_crash_image;
330
331 if (flags & KEXEC_FILE_UNLOAD)
332 goto exchange;
333
334 /*
335 * In case of crash, the new kernel gets loaded in the reserved region. It is
336 * the same memory where an old crash kernel might be loaded. Free any
337 * current crash dump kernel before we corrupt it.
338 */
339 if (flags & KEXEC_FILE_ON_CRASH)
340 kimage_free(xchg(&kexec_crash_image, NULL));
341
342 ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
343 cmdline_len, flags);
344 if (ret)
345 goto out;
346
347 ret = machine_kexec_prepare(image);
348 if (ret)
349 goto out;
350
351 ret = kexec_calculate_store_digests(image);
352 if (ret)
353 goto out;
354
355 for (i = 0; i < image->nr_segments; i++) {
356 struct kexec_segment *ksegment;
357
358 ksegment = &image->segment[i];
359 pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
360 i, ksegment->buf, ksegment->bufsz, ksegment->mem,
361 ksegment->memsz);
362
363 ret = kimage_load_segment(image, &image->segment[i]);
364 if (ret)
365 goto out;
366 }
367
368 kimage_terminate(image);
369
370 /*
371 * Free up any temporary buffers allocated which are not needed
372 * after image has been loaded
373 */
374 kimage_file_post_load_cleanup(image);
375exchange:
376 image = xchg(dest_image, image);
377out:
378 mutex_unlock(&kexec_mutex);
379 kimage_free(image);
380 return ret;
381}
382
383static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
384 struct kexec_buf *kbuf)
385{
386 struct kimage *image = kbuf->image;
387 unsigned long temp_start, temp_end;
388
389 temp_end = min(end, kbuf->buf_max);
390 temp_start = temp_end - kbuf->memsz;
391
392 do {
393 /* align down start */
394 temp_start = temp_start & (~(kbuf->buf_align - 1));
395
396 if (temp_start < start || temp_start < kbuf->buf_min)
397 return 0;
398
399 temp_end = temp_start + kbuf->memsz - 1;
400
401 /*
402 * Make sure this does not conflict with any of existing
403 * segments
404 */
405 if (kimage_is_destination_range(image, temp_start, temp_end)) {
406 temp_start = temp_start - PAGE_SIZE;
407 continue;
408 }
409
410 /* We found a suitable memory range */
411 break;
412 } while (1);
413
414 /* If we are here, we found a suitable memory range */
415 kbuf->mem = temp_start;
416
417 /* Success, stop navigating through remaining System RAM ranges */
418 return 1;
419}
420
421static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
422 struct kexec_buf *kbuf)
423{
424 struct kimage *image = kbuf->image;
425 unsigned long temp_start, temp_end;
426
427 temp_start = max(start, kbuf->buf_min);
428
429 do {
430 temp_start = ALIGN(temp_start, kbuf->buf_align);
431 temp_end = temp_start + kbuf->memsz - 1;
432
433 if (temp_end > end || temp_end > kbuf->buf_max)
434 return 0;
435 /*
436 * Make sure this does not conflict with any of existing
437 * segments
438 */
439 if (kimage_is_destination_range(image, temp_start, temp_end)) {
440 temp_start = temp_start + PAGE_SIZE;
441 continue;
442 }
443
444 /* We found a suitable memory range */
445 break;
446 } while (1);
447
448 /* If we are here, we found a suitable memory range */
449 kbuf->mem = temp_start;
450
451 /* Success, stop navigating through remaining System RAM ranges */
452 return 1;
453}
454
455static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
456{
457 struct kexec_buf *kbuf = (struct kexec_buf *)arg;
458 unsigned long sz = end - start + 1;
459
460 /* Returning 0 will take us to the next memory range */
461 if (sz < kbuf->memsz)
462 return 0;
463
464 if (end < kbuf->buf_min || start > kbuf->buf_max)
465 return 0;
466
467 /*
468 * Allocate memory top down within the RAM range; otherwise allocate
469 * bottom up.
470 */
471 if (kbuf->top_down)
472 return locate_mem_hole_top_down(start, end, kbuf);
473 return locate_mem_hole_bottom_up(start, end, kbuf);
474}
475
476/*
477 * Helper function for placing a buffer in a kexec segment. This assumes
478 * that kexec_mutex is held.
479 */
480int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
481 unsigned long memsz, unsigned long buf_align,
482 unsigned long buf_min, unsigned long buf_max,
483 bool top_down, unsigned long *load_addr)
484{
485
486 struct kexec_segment *ksegment;
487 struct kexec_buf buf, *kbuf;
488 int ret;
489
490 /* Currently, adding a segment this way is allowed only in file mode */
491 if (!image->file_mode)
492 return -EINVAL;
493
494 if (image->nr_segments >= KEXEC_SEGMENT_MAX)
495 return -EINVAL;
496
497 /*
498 * Make sure we are not trying to add a buffer after allocating
499 * control pages. All segments need to be placed before any
500 * control pages are allocated, because the control page allocation
501 * logic goes through the list of segments to make sure there are
502 * no destination overlaps.
503 */
504 if (!list_empty(&image->control_pages)) {
505 WARN_ON(1);
506 return -EINVAL;
507 }
508
509 memset(&buf, 0, sizeof(struct kexec_buf));
510 kbuf = &buf;
511 kbuf->image = image;
512 kbuf->buffer = buffer;
513 kbuf->bufsz = bufsz;
514
515 kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
516 kbuf->buf_align = max(buf_align, PAGE_SIZE);
517 kbuf->buf_min = buf_min;
518 kbuf->buf_max = buf_max;
519 kbuf->top_down = top_down;
520
521 /* Walk the RAM ranges and allocate a suitable range for the buffer */
522 if (image->type == KEXEC_TYPE_CRASH)
523 ret = walk_iomem_res("Crash kernel",
524 IORESOURCE_MEM | IORESOURCE_BUSY,
525 crashk_res.start, crashk_res.end, kbuf,
526 locate_mem_hole_callback);
527 else
528 ret = walk_system_ram_res(0, -1, kbuf,
529 locate_mem_hole_callback);
530 if (ret != 1) {
531 /* A suitable memory range could not be found for buffer */
532 return -EADDRNOTAVAIL;
533 }
534
535 /* Found a suitable memory range */
536 ksegment = &image->segment[image->nr_segments];
537 ksegment->kbuf = kbuf->buffer;
538 ksegment->bufsz = kbuf->bufsz;
539 ksegment->mem = kbuf->mem;
540 ksegment->memsz = kbuf->memsz;
541 image->nr_segments++;
542 *load_addr = ksegment->mem;
543 return 0;
544}
545
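/*
 * Hedged sketch of a hypothetical caller (not part of this file): an
 * architecture image loader would typically place its kernel buffer
 * roughly like this,
 *
 *	unsigned long kernel_load_addr;
 *	int ret;
 *
 *	ret = kexec_add_buffer(image, kernel_buf, kernel_len, kernel_memsz,
 *			       16 * 1024 * 1024, min_addr, max_addr,
 *			       true, &kernel_load_addr);
 *	if (ret)
 *		return ERR_PTR(ret);
 *
 * kernel_buf, kernel_len, kernel_memsz, min_addr and max_addr are
 * placeholders; the real values come from the arch loader.
 */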
546/* Calculate and store the digest of segments */
547static int kexec_calculate_store_digests(struct kimage *image)
548{
549 struct crypto_shash *tfm;
550 struct shash_desc *desc;
551 int ret = 0, i, j, zero_buf_sz, sha_region_sz;
552 size_t desc_size, nullsz;
553 char *digest;
554 void *zero_buf;
555 struct kexec_sha_region *sha_regions;
556 struct purgatory_info *pi = &image->purgatory_info;
557
558 zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
559 zero_buf_sz = PAGE_SIZE;
560
561 tfm = crypto_alloc_shash("sha256", 0, 0);
562 if (IS_ERR(tfm)) {
563 ret = PTR_ERR(tfm);
564 goto out;
565 }
566
567 desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
568 desc = kzalloc(desc_size, GFP_KERNEL);
569 if (!desc) {
570 ret = -ENOMEM;
571 goto out_free_tfm;
572 }
573
574 sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
575 sha_regions = vzalloc(sha_region_sz);
576 if (!sha_regions)
577 goto out_free_desc;
578
579 desc->tfm = tfm;
580 desc->flags = 0;
581
582 ret = crypto_shash_init(desc);
583 if (ret < 0)
584 goto out_free_sha_regions;
585
586 digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
587 if (!digest) {
588 ret = -ENOMEM;
589 goto out_free_sha_regions;
590 }
591
592 for (j = i = 0; i < image->nr_segments; i++) {
593 struct kexec_segment *ksegment;
594
595 ksegment = &image->segment[i];
596 /*
597 * Skip purgatory as it will be modified once we put digest
598 * info in purgatory.
599 */
600 if (ksegment->kbuf == pi->purgatory_buf)
601 continue;
602
603 ret = crypto_shash_update(desc, ksegment->kbuf,
604 ksegment->bufsz);
605 if (ret)
606 break;
607
608 /*
609 * Assume rest of the buffer is filled with zero and
610 * update digest accordingly.
611 */
612 nullsz = ksegment->memsz - ksegment->bufsz;
613 while (nullsz) {
614 unsigned long bytes = nullsz;
615
616 if (bytes > zero_buf_sz)
617 bytes = zero_buf_sz;
618 ret = crypto_shash_update(desc, zero_buf, bytes);
619 if (ret)
620 break;
621 nullsz -= bytes;
622 }
623
624 if (ret)
625 break;
626
627 sha_regions[j].start = ksegment->mem;
628 sha_regions[j].len = ksegment->memsz;
629 j++;
630 }
631
632 if (!ret) {
633 ret = crypto_shash_final(desc, digest);
634 if (ret)
635 goto out_free_digest;
636 ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
637 sha_regions, sha_region_sz, 0);
638 if (ret)
639 goto out_free_digest;
640
641 ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
642 digest, SHA256_DIGEST_SIZE, 0);
643 if (ret)
644 goto out_free_digest;
645 }
646
647out_free_digest:
648 kfree(digest);
649out_free_sha_regions:
650 vfree(sha_regions);
651out_free_desc:
652 kfree(desc);
653out_free_tfm:
654 crypto_free_shash(tfm);
655out:
656 return ret;
657}
658
659/* Actually load purgatory. A lot of code is taken from kexec-tools */
660static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
661 unsigned long max, int top_down)
662{
663 struct purgatory_info *pi = &image->purgatory_info;
664 unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
665 unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
666 unsigned char *buf_addr, *src;
667 int i, ret = 0, entry_sidx = -1;
668 const Elf_Shdr *sechdrs_c;
669 Elf_Shdr *sechdrs = NULL;
670 void *purgatory_buf = NULL;
671
672 /*
673 * sechdrs_c points to the section headers in purgatory and is read
674 * only. No modifications are allowed.
675 */
676 sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
677
678 /*
679 * We cannot modify sechdrs_c[] and its fields; they are read only.
680 * Copy it over to a local copy where one can store some temporary
681 * data and free it at the end. We need to modify ->sh_addr and
682 * ->sh_offset fields to keep track of permanent and temporary
683 * locations of sections.
684 */
685 sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
686 if (!sechdrs)
687 return -ENOMEM;
688
689 memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
690
691 /*
692 * We seem to have multiple copies of sections. The first copy is the one
693 * embedded in the kernel in a read-only section. Some of these sections
694 * will be copied to a temporary buffer and relocated. And these
695 * sections will finally be copied to their final destination at
696 * segment load time.
697 *
698 * Use ->sh_offset to reflect the section address in memory. It will
699 * point to the original read-only copy if the section is not allocatable.
700 * Otherwise it will point to the temporary copy which will be relocated.
701 *
702 * Use ->sh_addr to contain the final address of the section, where it
703 * will go at execution time.
704 */
705 for (i = 0; i < pi->ehdr->e_shnum; i++) {
706 if (sechdrs[i].sh_type == SHT_NOBITS)
707 continue;
708
709 sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
710 sechdrs[i].sh_offset;
711 }
712
713 /*
714 * Identify entry point section and make entry relative to section
715 * start.
716 */
717 entry = pi->ehdr->e_entry;
718 for (i = 0; i < pi->ehdr->e_shnum; i++) {
719 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
720 continue;
721
722 if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
723 continue;
724
725 /* Make entry section relative */
726 if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
727 ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
728 pi->ehdr->e_entry)) {
729 entry_sidx = i;
730 entry -= sechdrs[i].sh_addr;
731 break;
732 }
733 }
734
735 /* Determine how much memory is needed to load the relocatable object. */
736 buf_align = 1;
737 bss_align = 1;
738 buf_sz = 0;
739 bss_sz = 0;
740
741 for (i = 0; i < pi->ehdr->e_shnum; i++) {
742 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
743 continue;
744
745 align = sechdrs[i].sh_addralign;
746 if (sechdrs[i].sh_type != SHT_NOBITS) {
747 if (buf_align < align)
748 buf_align = align;
749 buf_sz = ALIGN(buf_sz, align);
750 buf_sz += sechdrs[i].sh_size;
751 } else {
752 /* bss section */
753 if (bss_align < align)
754 bss_align = align;
755 bss_sz = ALIGN(bss_sz, align);
756 bss_sz += sechdrs[i].sh_size;
757 }
758 }
759
760 /* Determine the bss padding required to align bss properly */
761 bss_pad = 0;
762 if (buf_sz & (bss_align - 1))
763 bss_pad = bss_align - (buf_sz & (bss_align - 1));
764
765 memsz = buf_sz + bss_pad + bss_sz;
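	/*
	 * Illustration (hypothetical numbers, not from the original source):
	 * with buf_sz = 0x3200 and bss_align = 0x1000, bss_pad is 0xe00, so
	 * the progbits sections occupy [load_addr, load_addr + 0x3200), the
	 * bss starts at load_addr + 0x4000, and memsz covers all three
	 * pieces.
	 */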
766
767 /* Allocate buffer for purgatory */
768 purgatory_buf = vzalloc(buf_sz);
769 if (!purgatory_buf) {
770 ret = -ENOMEM;
771 goto out;
772 }
773
774 if (buf_align < bss_align)
775 buf_align = bss_align;
776
777 /* Add buffer to segment list */
778 ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
779 buf_align, min, max, top_down,
780 &pi->purgatory_load_addr);
781 if (ret)
782 goto out;
783
784 /* Load SHF_ALLOC sections */
785 buf_addr = purgatory_buf;
786 load_addr = curr_load_addr = pi->purgatory_load_addr;
787 bss_addr = load_addr + buf_sz + bss_pad;
788
789 for (i = 0; i < pi->ehdr->e_shnum; i++) {
790 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
791 continue;
792
793 align = sechdrs[i].sh_addralign;
794 if (sechdrs[i].sh_type != SHT_NOBITS) {
795 curr_load_addr = ALIGN(curr_load_addr, align);
796 offset = curr_load_addr - load_addr;
797 /* We already modified ->sh_offset to keep the src addr */
798 src = (char *) sechdrs[i].sh_offset;
799 memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
800
801 /* Store load address and source address of section */
802 sechdrs[i].sh_addr = curr_load_addr;
803
804 /*
805 * This section got copied to temporary buffer. Update
806 * ->sh_offset accordingly.
807 */
808 sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
809
810 /* Advance to the next address */
811 curr_load_addr += sechdrs[i].sh_size;
812 } else {
813 bss_addr = ALIGN(bss_addr, align);
814 sechdrs[i].sh_addr = bss_addr;
815 bss_addr += sechdrs[i].sh_size;
816 }
817 }
818
819 /* Update entry point based on load address of text section */
820 if (entry_sidx >= 0)
821 entry += sechdrs[entry_sidx].sh_addr;
822
823 /* Make kernel jump to purgatory after shutdown */
824 image->start = entry;
825
826 /* Used later to get/set symbol values */
827 pi->sechdrs = sechdrs;
828
829 /*
830 * Used later to identify which segment is the purgatory and exclude
831 * it from checksumming.
832 */
833 pi->purgatory_buf = purgatory_buf;
834 return ret;
835out:
836 vfree(sechdrs);
837 vfree(purgatory_buf);
838 return ret;
839}
840
841static int kexec_apply_relocations(struct kimage *image)
842{
843 int i, ret;
844 struct purgatory_info *pi = &image->purgatory_info;
845 Elf_Shdr *sechdrs = pi->sechdrs;
846
847 /* Apply relocations */
848 for (i = 0; i < pi->ehdr->e_shnum; i++) {
849 Elf_Shdr *section, *symtab;
850
851 if (sechdrs[i].sh_type != SHT_RELA &&
852 sechdrs[i].sh_type != SHT_REL)
853 continue;
854
855 /*
856 * For sections of type SHT_RELA/SHT_REL,
857 * ->sh_link contains the section header index of the associated
858 * symbol table, and ->sh_info contains the section header
859 * index of the section to which the relocations apply.
860 */
861 if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
862 sechdrs[i].sh_link >= pi->ehdr->e_shnum)
863 return -ENOEXEC;
864
865 section = &sechdrs[sechdrs[i].sh_info];
866 symtab = &sechdrs[sechdrs[i].sh_link];
867
868 if (!(section->sh_flags & SHF_ALLOC))
869 continue;
870
871 /*
872 * symtab->sh_link contains the section header index of the associated
873 * string table.
874 */
875 if (symtab->sh_link >= pi->ehdr->e_shnum)
876 /* Invalid section number? */
877 continue;
878
879 /*
880 * The respective architecture needs to provide support for applying
881 * relocations of type SHT_RELA/SHT_REL.
882 */
883 if (sechdrs[i].sh_type == SHT_RELA)
884 ret = arch_kexec_apply_relocations_add(pi->ehdr,
885 sechdrs, i);
886 else if (sechdrs[i].sh_type == SHT_REL)
887 ret = arch_kexec_apply_relocations(pi->ehdr,
888 sechdrs, i);
889 if (ret)
890 return ret;
891 }
892
893 return 0;
894}
895
896/* Load relocatable purgatory object and relocate it appropriately */
897int kexec_load_purgatory(struct kimage *image, unsigned long min,
898 unsigned long max, int top_down,
899 unsigned long *load_addr)
900{
901 struct purgatory_info *pi = &image->purgatory_info;
902 int ret;
903
904 if (kexec_purgatory_size <= 0)
905 return -EINVAL;
906
907 if (kexec_purgatory_size < sizeof(Elf_Ehdr))
908 return -ENOEXEC;
909
910 pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
911
912 if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
913 || pi->ehdr->e_type != ET_REL
914 || !elf_check_arch(pi->ehdr)
915 || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
916 return -ENOEXEC;
917
918 if (pi->ehdr->e_shoff >= kexec_purgatory_size
919 || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
920 kexec_purgatory_size - pi->ehdr->e_shoff))
921 return -ENOEXEC;
922
923 ret = __kexec_load_purgatory(image, min, max, top_down);
924 if (ret)
925 return ret;
926
927 ret = kexec_apply_relocations(image);
928 if (ret)
929 goto out;
930
931 *load_addr = pi->purgatory_load_addr;
932 return 0;
933out:
934 vfree(pi->sechdrs);
935 vfree(pi->purgatory_buf);
936 return ret;
937}
938
939static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
940 const char *name)
941{
942 Elf_Sym *syms;
943 Elf_Shdr *sechdrs;
944 Elf_Ehdr *ehdr;
945 int i, k;
946 const char *strtab;
947
948 if (!pi->sechdrs || !pi->ehdr)
949 return NULL;
950
951 sechdrs = pi->sechdrs;
952 ehdr = pi->ehdr;
953
954 for (i = 0; i < ehdr->e_shnum; i++) {
955 if (sechdrs[i].sh_type != SHT_SYMTAB)
956 continue;
957
958 if (sechdrs[i].sh_link >= ehdr->e_shnum)
959 /* Invalid strtab section number */
960 continue;
961 strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
962 syms = (Elf_Sym *)sechdrs[i].sh_offset;
963
964 /* Go through symbols for a match */
965 for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
966 if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
967 continue;
968
969 if (strcmp(strtab + syms[k].st_name, name) != 0)
970 continue;
971
972 if (syms[k].st_shndx == SHN_UNDEF ||
973 syms[k].st_shndx >= ehdr->e_shnum) {
974 pr_debug("Symbol: %s has bad section index %d.\n",
975 name, syms[k].st_shndx);
976 return NULL;
977 }
978
979 /* Found the symbol we are looking for */
980 return &syms[k];
981 }
982 }
983
984 return NULL;
985}
986
987void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
988{
989 struct purgatory_info *pi = &image->purgatory_info;
990 Elf_Sym *sym;
991 Elf_Shdr *sechdr;
992
993 sym = kexec_purgatory_find_symbol(pi, name);
994 if (!sym)
995 return ERR_PTR(-EINVAL);
996
997 sechdr = &pi->sechdrs[sym->st_shndx];
998
999 /*
1000 * Returns the address where symbol will finally be loaded after
1001 * kexec_load_segment()
1002 */
1003 return (void *)(sechdr->sh_addr + sym->st_value);
1004}
1005
1006/*
1007 * Get or set the value of a symbol. If "get_value" is true, the symbol value is
1008 * returned in buf; otherwise the symbol value is set based on the value in buf.
1009 */
1010int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
1011 void *buf, unsigned int size, bool get_value)
1012{
1013 Elf_Sym *sym;
1014 Elf_Shdr *sechdrs;
1015 struct purgatory_info *pi = &image->purgatory_info;
1016 char *sym_buf;
1017
1018 sym = kexec_purgatory_find_symbol(pi, name);
1019 if (!sym)
1020 return -EINVAL;
1021
1022 if (sym->st_size != size) {
1023 pr_err("symbol %s size mismatch: expected %lu actual %u\n",
1024 name, (unsigned long)sym->st_size, size);
1025 return -EINVAL;
1026 }
1027
1028 sechdrs = pi->sechdrs;
1029
1030 if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
1031 pr_err("symbol %s is in a bss section. Cannot %s\n", name,
1032 get_value ? "get" : "set");
1033 return -EINVAL;
1034 }
1035
1036 sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
1037 sym->st_value;
1038
1039 if (get_value)
1040 memcpy((void *)buf, sym_buf, size);
1041 else
1042 memcpy((void *)sym_buf, buf, size);
1043
1044 return 0;
1045}
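
A minimal sketch of how an architecture-specific kexec_file loader might drive the two purgatory helpers above; the wrapper function and the symbol names ("backup_dest", "purgatory_start") are illustrative, not taken from this series.

	static int example_setup_purgatory(struct kimage *image,
					   unsigned long backup_dest)
	{
		void *entry;
		int ret;

		/* push a value into a global variable inside the purgatory blob */
		ret = kexec_purgatory_get_set_symbol(image, "backup_dest",
						     &backup_dest,
						     sizeof(backup_dest),
						     false /* set, don't get */);
		if (ret)
			return ret;

		/* resolve where a purgatory symbol will sit once the segment loads */
		entry = kexec_purgatory_get_symbol_addr(image, "purgatory_start");
		if (IS_ERR(entry))
			return PTR_ERR(entry);

		image->start = (unsigned long)entry;
		return 0;
	}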
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
new file mode 100644
index 000000000000..e4392a698ad4
--- /dev/null
+++ b/kernel/kexec_internal.h
@@ -0,0 +1,22 @@
1#ifndef LINUX_KEXEC_INTERNAL_H
2#define LINUX_KEXEC_INTERNAL_H
3
4#include <linux/kexec.h>
5
6struct kimage *do_kimage_alloc_init(void);
7int sanity_check_segment_list(struct kimage *image);
8void kimage_free_page_list(struct list_head *list);
9void kimage_free(struct kimage *image);
10int kimage_load_segment(struct kimage *image, struct kexec_segment *segment);
11void kimage_terminate(struct kimage *image);
12int kimage_is_destination_range(struct kimage *image,
13 unsigned long start, unsigned long end);
14
15extern struct mutex kexec_mutex;
16
17#ifdef CONFIG_KEXEC_FILE
18void kimage_file_post_load_cleanup(struct kimage *image);
19#else /* CONFIG_KEXEC_FILE */
20static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
21#endif /* CONFIG_KEXEC_FILE */
22#endif /* LINUX_KEXEC_INTERNAL_H */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 2777f40a9c7b..da98d0593de2 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -45,8 +45,6 @@
45 45
46extern int max_threads; 46extern int max_threads;
47 47
48static struct workqueue_struct *khelper_wq;
49
50#define CAP_BSET (void *)1 48#define CAP_BSET (void *)1
51#define CAP_PI (void *)2 49#define CAP_PI (void *)2
52 50
@@ -114,10 +112,11 @@ out:
114 * @...: arguments as specified in the format string 112 * @...: arguments as specified in the format string
115 * 113 *
116 * Load a module using the user mode module loader. The function returns 114 * Load a module using the user mode module loader. The function returns
117 * zero on success or a negative errno code on failure. Note that a 115 * zero on success or a negative errno code or positive exit code from
118 * successful module load does not mean the module did not then unload 116 * "modprobe" on failure. Note that a successful module load does not mean
119 * and exit on an error of its own. Callers must check that the service 117 * the module did not then unload and exit on an error of its own. Callers
120 * they requested is now available not blindly invoke it. 118 * must check that the service they requested is now available not blindly
119 * invoke it.
121 * 120 *
122 * If module auto-loading support is disabled then this function 121 * If module auto-loading support is disabled then this function
123 * becomes a no-operation. 122 * becomes a no-operation.
@@ -213,7 +212,7 @@ static void umh_complete(struct subprocess_info *sub_info)
213/* 212/*
214 * This is the task which runs the usermode application 213 * This is the task which runs the usermode application
215 */ 214 */
216static int ____call_usermodehelper(void *data) 215static int call_usermodehelper_exec_async(void *data)
217{ 216{
218 struct subprocess_info *sub_info = data; 217 struct subprocess_info *sub_info = data;
219 struct cred *new; 218 struct cred *new;
@@ -223,12 +222,9 @@ static int ____call_usermodehelper(void *data)
223 flush_signal_handlers(current, 1); 222 flush_signal_handlers(current, 1);
224 spin_unlock_irq(&current->sighand->siglock); 223 spin_unlock_irq(&current->sighand->siglock);
225 224
226 /* We can run anywhere, unlike our parent keventd(). */
227 set_cpus_allowed_ptr(current, cpu_all_mask);
228
229 /* 225 /*
230 * Our parent is keventd, which runs with elevated scheduling priority. 226 * Our parent (unbound workqueue) runs with elevated scheduling
231 * Avoid propagating that into the userspace child. 227 * priority. Avoid propagating that into the userspace child.
232 */ 228 */
233 set_user_nice(current, 0); 229 set_user_nice(current, 0);
234 230
@@ -258,7 +254,10 @@ static int ____call_usermodehelper(void *data)
258 (const char __user *const __user *)sub_info->envp); 254 (const char __user *const __user *)sub_info->envp);
259out: 255out:
260 sub_info->retval = retval; 256 sub_info->retval = retval;
261 /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */ 257 /*
258 * call_usermodehelper_exec_sync() will call umh_complete
259 * if UHM_WAIT_PROC.
260 */
262 if (!(sub_info->wait & UMH_WAIT_PROC)) 261 if (!(sub_info->wait & UMH_WAIT_PROC))
263 umh_complete(sub_info); 262 umh_complete(sub_info);
264 if (!retval) 263 if (!retval)
@@ -266,15 +265,14 @@ out:
266 do_exit(0); 265 do_exit(0);
267} 266}
268 267
269/* Keventd can't block, but this (a child) can. */ 268/* Handles UMH_WAIT_PROC. */
270static int wait_for_helper(void *data) 269static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
271{ 270{
272 struct subprocess_info *sub_info = data;
273 pid_t pid; 271 pid_t pid;
274 272
275 /* If SIGCLD is ignored sys_wait4 won't populate the status. */ 273 /* If SIGCLD is ignored sys_wait4 won't populate the status. */
276 kernel_sigaction(SIGCHLD, SIG_DFL); 274 kernel_sigaction(SIGCHLD, SIG_DFL);
277 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); 275 pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
278 if (pid < 0) { 276 if (pid < 0) {
279 sub_info->retval = pid; 277 sub_info->retval = pid;
280 } else { 278 } else {
@@ -282,44 +280,60 @@ static int wait_for_helper(void *data)
282 /* 280 /*
283 * Normally it is bogus to call wait4() from in-kernel because 281 * Normally it is bogus to call wait4() from in-kernel because
284 * wait4() wants to write the exit code to a userspace address. 282 * wait4() wants to write the exit code to a userspace address.
285 * But wait_for_helper() always runs as keventd, and put_user() 283 * But call_usermodehelper_exec_sync() always runs as kernel
286 * to a kernel address works OK for kernel threads, due to their 284 * thread (workqueue) and put_user() to a kernel address works
287 * having an mm_segment_t which spans the entire address space. 285 * OK for kernel threads, due to their having an mm_segment_t
286 * which spans the entire address space.
288 * 287 *
289 * Thus the __user pointer cast is valid here. 288 * Thus the __user pointer cast is valid here.
290 */ 289 */
291 sys_wait4(pid, (int __user *)&ret, 0, NULL); 290 sys_wait4(pid, (int __user *)&ret, 0, NULL);
292 291
293 /* 292 /*
294 * If ret is 0, either ____call_usermodehelper failed and the 293 * If ret is 0, either call_usermodehelper_exec_async failed and
295 * real error code is already in sub_info->retval or 294 * the real error code is already in sub_info->retval or
296 * sub_info->retval is 0 anyway, so don't mess with it then. 295 * sub_info->retval is 0 anyway, so don't mess with it then.
297 */ 296 */
298 if (ret) 297 if (ret)
299 sub_info->retval = ret; 298 sub_info->retval = ret;
300 } 299 }
301 300
301 /* Restore default kernel sig handler */
302 kernel_sigaction(SIGCHLD, SIG_IGN);
303
302 umh_complete(sub_info); 304 umh_complete(sub_info);
303 do_exit(0);
304} 305}
305 306
306/* This is run by khelper thread */ 307/*
307static void __call_usermodehelper(struct work_struct *work) 308 * We need to create the usermodehelper kernel thread from a task that is affine
309 * to an optimized set of CPUs (or nohz housekeeping ones) such that they
 310 * inherit the widest possible affinity irrespective of call_usermodehelper() callers with
311 * possibly reduced affinity (eg: per-cpu workqueues). We don't want
 312 * usermodehelper targets to contend for a busy CPU.
313 *
314 * Unbound workqueues provide such wide affinity and allow to block on
315 * UMH_WAIT_PROC requests without blocking pending request (up to some limit).
316 *
 317 * Besides, workqueues provide the privilege level that the caller might not have
318 * to perform the usermodehelper request.
319 *
320 */
321static void call_usermodehelper_exec_work(struct work_struct *work)
308{ 322{
309 struct subprocess_info *sub_info = 323 struct subprocess_info *sub_info =
310 container_of(work, struct subprocess_info, work); 324 container_of(work, struct subprocess_info, work);
311 pid_t pid;
312 325
313 if (sub_info->wait & UMH_WAIT_PROC) 326 if (sub_info->wait & UMH_WAIT_PROC) {
314 pid = kernel_thread(wait_for_helper, sub_info, 327 call_usermodehelper_exec_sync(sub_info);
315 CLONE_FS | CLONE_FILES | SIGCHLD); 328 } else {
316 else 329 pid_t pid;
317 pid = kernel_thread(____call_usermodehelper, sub_info,
318 SIGCHLD);
319 330
320 if (pid < 0) { 331 pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
321 sub_info->retval = pid; 332 SIGCHLD);
322 umh_complete(sub_info); 333 if (pid < 0) {
334 sub_info->retval = pid;
335 umh_complete(sub_info);
336 }
323 } 337 }
324} 338}
325 339
@@ -509,7 +523,7 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
509 if (!sub_info) 523 if (!sub_info)
510 goto out; 524 goto out;
511 525
512 INIT_WORK(&sub_info->work, __call_usermodehelper); 526 INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
513 sub_info->path = path; 527 sub_info->path = path;
514 sub_info->argv = argv; 528 sub_info->argv = argv;
515 sub_info->envp = envp; 529 sub_info->envp = envp;
@@ -531,8 +545,8 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
531 * from interrupt context. 545 * from interrupt context.
532 * 546 *
533 * Runs a user-space application. The application is started 547 * Runs a user-space application. The application is started
534 * asynchronously if wait is not set, and runs as a child of keventd. 548 * asynchronously if wait is not set, and runs as a child of system workqueues.
535 * (ie. it runs with full root capabilities). 549 * (ie. it runs with full root capabilities and optimized affinity).
536 */ 550 */
537int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) 551int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
538{ 552{
@@ -544,7 +558,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
544 return -EINVAL; 558 return -EINVAL;
545 } 559 }
546 helper_lock(); 560 helper_lock();
547 if (!khelper_wq || usermodehelper_disabled) { 561 if (usermodehelper_disabled) {
548 retval = -EBUSY; 562 retval = -EBUSY;
549 goto out; 563 goto out;
550 } 564 }
@@ -556,7 +570,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
556 sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; 570 sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
557 sub_info->wait = wait; 571 sub_info->wait = wait;
558 572
559 queue_work(khelper_wq, &sub_info->work); 573 queue_work(system_unbound_wq, &sub_info->work);
560 if (wait == UMH_NO_WAIT) /* task has freed sub_info */ 574 if (wait == UMH_NO_WAIT) /* task has freed sub_info */
561 goto unlock; 575 goto unlock;
562 576
@@ -686,9 +700,3 @@ struct ctl_table usermodehelper_table[] = {
686 }, 700 },
687 { } 701 { }
688}; 702};
689
690void __init usermodehelper_init(void)
691{
692 khelper_wq = create_singlethread_workqueue("khelper");
693 BUG_ON(!khelper_wq);
694}
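
For context, a hedged sketch of a typical call_usermodehelper() caller; the helper path and arguments are made up. With this patch the work item lands on system_unbound_wq instead of the removed khelper workqueue, and the UMH_WAIT_PROC case is handled by call_usermodehelper_exec_sync() directly on the worker.

	static int example_run_helper(void)
	{
		char *argv[] = { "/sbin/example-helper", "--oneshot", NULL };
		char *envp[] = { "HOME=/",
				 "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };

		/* UMH_WAIT_PROC: block until the helper process has exited */
		return call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
	}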
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 6683ccef9fff..e83b26464061 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -90,7 +90,7 @@ static ssize_t profiling_store(struct kobject *kobj,
90KERNEL_ATTR_RW(profiling); 90KERNEL_ATTR_RW(profiling);
91#endif 91#endif
92 92
93#ifdef CONFIG_KEXEC 93#ifdef CONFIG_KEXEC_CORE
94static ssize_t kexec_loaded_show(struct kobject *kobj, 94static ssize_t kexec_loaded_show(struct kobject *kobj,
95 struct kobj_attribute *attr, char *buf) 95 struct kobj_attribute *attr, char *buf)
96{ 96{
@@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
134} 134}
135KERNEL_ATTR_RO(vmcoreinfo); 135KERNEL_ATTR_RO(vmcoreinfo);
136 136
137#endif /* CONFIG_KEXEC */ 137#endif /* CONFIG_KEXEC_CORE */
138 138
139/* whether file capabilities are enabled */ 139/* whether file capabilities are enabled */
140static ssize_t fscaps_show(struct kobject *kobj, 140static ssize_t fscaps_show(struct kobject *kobj,
@@ -196,7 +196,7 @@ static struct attribute * kernel_attrs[] = {
196#ifdef CONFIG_PROFILING 196#ifdef CONFIG_PROFILING
197 &profiling_attr.attr, 197 &profiling_attr.attr,
198#endif 198#endif
199#ifdef CONFIG_KEXEC 199#ifdef CONFIG_KEXEC_CORE
200 &kexec_loaded_attr.attr, 200 &kexec_loaded_attr.attr,
201 &kexec_crash_loaded_attr.attr, 201 &kexec_crash_loaded_attr.attr,
202 &kexec_crash_size_attr.attr, 202 &kexec_crash_size_attr.attr,
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index cf8c24203368..8f0324ef72ab 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -835,7 +835,7 @@ const struct file_operations kmsg_fops = {
835 .release = devkmsg_release, 835 .release = devkmsg_release,
836}; 836};
837 837
838#ifdef CONFIG_KEXEC 838#ifdef CONFIG_KEXEC_CORE
839/* 839/*
840 * This appends the listed symbols to /proc/vmcore 840 * This appends the listed symbols to /proc/vmcore
841 * 841 *
diff --git a/kernel/reboot.c b/kernel/reboot.c
index d20c85d9f8c0..bd30a973fe94 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -346,7 +346,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
346 kernel_restart(buffer); 346 kernel_restart(buffer);
347 break; 347 break;
348 348
349#ifdef CONFIG_KEXEC 349#ifdef CONFIG_KEXEC_CORE
350 case LINUX_REBOOT_CMD_KEXEC: 350 case LINUX_REBOOT_CMD_KEXEC:
351 ret = kernel_kexec(); 351 ret = kernel_kexec();
352 break; 352 break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 19b62b522158..e69201d8094e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -621,7 +621,7 @@ static struct ctl_table kern_table[] = {
621 .proc_handler = proc_dointvec, 621 .proc_handler = proc_dointvec,
622 }, 622 },
623#endif 623#endif
624#ifdef CONFIG_KEXEC 624#ifdef CONFIG_KEXEC_CORE
625 { 625 {
626 .procname = "kexec_load_disabled", 626 .procname = "kexec_load_disabled",
627 .data = &kexec_load_disabled, 627 .data = &kexec_load_disabled,
@@ -1995,7 +1995,7 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
1995 int val = *valp; 1995 int val = *valp;
1996 if (val < 0) { 1996 if (val < 0) {
1997 *negp = true; 1997 *negp = true;
1998 *lvalp = (unsigned long)-val; 1998 *lvalp = -(unsigned long)val;
1999 } else { 1999 } else {
2000 *negp = false; 2000 *negp = false;
2001 *lvalp = (unsigned long)val; 2001 *lvalp = (unsigned long)val;
@@ -2201,7 +2201,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
2201 int val = *valp; 2201 int val = *valp;
2202 if (val < 0) { 2202 if (val < 0) {
2203 *negp = true; 2203 *negp = true;
2204 *lvalp = (unsigned long)-val; 2204 *lvalp = -(unsigned long)val;
2205 } else { 2205 } else {
2206 *negp = false; 2206 *negp = false;
2207 *lvalp = (unsigned long)val; 2207 *lvalp = (unsigned long)val;
@@ -2436,7 +2436,7 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
2436 unsigned long lval; 2436 unsigned long lval;
2437 if (val < 0) { 2437 if (val < 0) {
2438 *negp = true; 2438 *negp = true;
2439 lval = (unsigned long)-val; 2439 lval = -(unsigned long)val;
2440 } else { 2440 } else {
2441 *negp = false; 2441 *negp = false;
2442 lval = (unsigned long)val; 2442 lval = (unsigned long)val;
@@ -2459,7 +2459,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp
2459 unsigned long lval; 2459 unsigned long lval;
2460 if (val < 0) { 2460 if (val < 0) {
2461 *negp = true; 2461 *negp = true;
2462 lval = (unsigned long)-val; 2462 lval = -(unsigned long)val;
2463 } else { 2463 } else {
2464 *negp = false; 2464 *negp = false;
2465 lval = (unsigned long)val; 2465 lval = (unsigned long)val;
@@ -2484,7 +2484,7 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
2484 unsigned long lval; 2484 unsigned long lval;
2485 if (val < 0) { 2485 if (val < 0) {
2486 *negp = true; 2486 *negp = true;
2487 lval = (unsigned long)-val; 2487 lval = -(unsigned long)val;
2488 } else { 2488 } else {
2489 *negp = false; 2489 *negp = false;
2490 lval = (unsigned long)val; 2490 lval = (unsigned long)val;
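
The repeated one-character change above swaps (unsigned long)-val for -(unsigned long)val. A short sketch of the reasoning (added here, not part of the patch): negating first happens in signed int arithmetic, which overflows for INT_MIN; converting first makes the negation well defined.

	static unsigned long magnitude_of(int val)
	{
		/*
		 * (unsigned long)-val negates in int first; for val == INT_MIN
		 * that signed negation overflows (undefined behaviour).
		 * Converting to unsigned long before negating is well defined
		 * and yields 2147483648 for INT_MIN on 64-bit.
		 */
		return -(unsigned long)val;
	}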
diff --git a/lib/bitmap.c b/lib/bitmap.c
index a578a0189199..814814397cce 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -367,7 +367,8 @@ int __bitmap_parse(const char *buf, unsigned int buflen,
367 367
368 nchunks = nbits = totaldigits = c = 0; 368 nchunks = nbits = totaldigits = c = 0;
369 do { 369 do {
370 chunk = ndigits = 0; 370 chunk = 0;
371 ndigits = totaldigits;
371 372
372 /* Get the next chunk of the bitmap */ 373 /* Get the next chunk of the bitmap */
373 while (buflen) { 374 while (buflen) {
@@ -406,9 +407,9 @@ int __bitmap_parse(const char *buf, unsigned int buflen,
406 return -EOVERFLOW; 407 return -EOVERFLOW;
407 408
408 chunk = (chunk << 4) | hex_to_bin(c); 409 chunk = (chunk << 4) | hex_to_bin(c);
409 ndigits++; totaldigits++; 410 totaldigits++;
410 } 411 }
411 if (ndigits == 0) 412 if (ndigits == totaldigits)
412 return -EINVAL; 413 return -EINVAL;
413 if (nchunks == 0 && chunk == 0) 414 if (nchunks == 0 && chunk == 0)
414 continue; 415 continue;
@@ -505,7 +506,7 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
505 int nmaskbits) 506 int nmaskbits)
506{ 507{
507 unsigned a, b; 508 unsigned a, b;
508 int c, old_c, totaldigits; 509 int c, old_c, totaldigits, ndigits;
509 const char __user __force *ubuf = (const char __user __force *)buf; 510 const char __user __force *ubuf = (const char __user __force *)buf;
510 int at_start, in_range; 511 int at_start, in_range;
511 512
@@ -515,6 +516,7 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
515 at_start = 1; 516 at_start = 1;
516 in_range = 0; 517 in_range = 0;
517 a = b = 0; 518 a = b = 0;
519 ndigits = totaldigits;
518 520
519 /* Get the next cpu# or a range of cpu#'s */ 521 /* Get the next cpu# or a range of cpu#'s */
520 while (buflen) { 522 while (buflen) {
@@ -528,23 +530,27 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
528 if (isspace(c)) 530 if (isspace(c))
529 continue; 531 continue;
530 532
531 /*
532 * If the last character was a space and the current
533 * character isn't '\0', we've got embedded whitespace.
534 * This is a no-no, so throw an error.
535 */
536 if (totaldigits && c && isspace(old_c))
537 return -EINVAL;
538
539 /* A '\0' or a ',' signal the end of a cpu# or range */ 533 /* A '\0' or a ',' signal the end of a cpu# or range */
540 if (c == '\0' || c == ',') 534 if (c == '\0' || c == ',')
541 break; 535 break;
536 /*
 537		 * Whitespace between digits is not allowed,
 538		 * but leading or trailing whitespace is fine.
 539		 * When old_c is whitespace:
 540		 * if totaldigits == ndigits, the whitespace is leading.
 541		 * Trailing whitespace cannot reach this point, because
 542		 * c would then be ',' or '\0' and the break above
 543		 * has already left the loop.
544 */
545 if ((totaldigits != ndigits) && isspace(old_c))
546 return -EINVAL;
542 547
543 if (c == '-') { 548 if (c == '-') {
544 if (at_start || in_range) 549 if (at_start || in_range)
545 return -EINVAL; 550 return -EINVAL;
546 b = 0; 551 b = 0;
547 in_range = 1; 552 in_range = 1;
553 at_start = 1;
548 continue; 554 continue;
549 } 555 }
550 556
@@ -557,15 +563,18 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
557 at_start = 0; 563 at_start = 0;
558 totaldigits++; 564 totaldigits++;
559 } 565 }
566 if (ndigits == totaldigits)
567 continue;
 568		/* if no digit is after '-', it's wrong */
569 if (at_start && in_range)
570 return -EINVAL;
560 if (!(a <= b)) 571 if (!(a <= b))
561 return -EINVAL; 572 return -EINVAL;
562 if (b >= nmaskbits) 573 if (b >= nmaskbits)
563 return -ERANGE; 574 return -ERANGE;
564 if (!at_start) { 575 while (a <= b) {
565 while (a <= b) { 576 set_bit(a, maskp);
566 set_bit(a, maskp); 577 a++;
567 a++;
568 }
569 } 578 }
570 } while (buflen && c == ','); 579 } while (buflen && c == ',');
571 return 0; 580 return 0;
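
A sketch of how the reworked list parser behaves, using bitmap_parselist() as the entry point; the example strings are illustrative, not taken from the patch or its tests.

	static int example_parse_list(void)
	{
		DECLARE_BITMAP(mask, 16);

		/*
		 *   "0-3,8"   -> bits 0,1,2,3,8 set
		 *   " 1, 2 "  -> ok: whitespace at the head or tail of a term
		 *   "1 2"     -> -EINVAL: whitespace between digits
		 *   "1-"      -> -EINVAL: no digit after '-'
		 */
		return bitmap_parselist("0-3,8", mask, 16);
	}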
diff --git a/lib/decompress_bunzip2.c b/lib/decompress_bunzip2.c
index 6dd0335ea61b..0234361b24b8 100644
--- a/lib/decompress_bunzip2.c
+++ b/lib/decompress_bunzip2.c
@@ -743,12 +743,12 @@ exit_0:
743} 743}
744 744
745#ifdef PREBOOT 745#ifdef PREBOOT
746STATIC int INIT decompress(unsigned char *buf, long len, 746STATIC int INIT __decompress(unsigned char *buf, long len,
747 long (*fill)(void*, unsigned long), 747 long (*fill)(void*, unsigned long),
748 long (*flush)(void*, unsigned long), 748 long (*flush)(void*, unsigned long),
749 unsigned char *outbuf, 749 unsigned char *outbuf, long olen,
750 long *pos, 750 long *pos,
751 void(*error)(char *x)) 751 void (*error)(char *x))
752{ 752{
753 return bunzip2(buf, len - 4, fill, flush, outbuf, pos, error); 753 return bunzip2(buf, len - 4, fill, flush, outbuf, pos, error);
754} 754}
diff --git a/lib/decompress_inflate.c b/lib/decompress_inflate.c
index d4c7891635ec..555c06bf20da 100644
--- a/lib/decompress_inflate.c
+++ b/lib/decompress_inflate.c
@@ -1,4 +1,5 @@
1#ifdef STATIC 1#ifdef STATIC
2#define PREBOOT
2/* Pre-boot environment: included */ 3/* Pre-boot environment: included */
3 4
4/* prevent inclusion of _LINUX_KERNEL_H in pre-boot environment: lots 5/* prevent inclusion of _LINUX_KERNEL_H in pre-boot environment: lots
@@ -33,23 +34,23 @@ static long INIT nofill(void *buffer, unsigned long len)
33} 34}
34 35
35/* Included from initramfs et al code */ 36/* Included from initramfs et al code */
36STATIC int INIT gunzip(unsigned char *buf, long len, 37STATIC int INIT __gunzip(unsigned char *buf, long len,
37 long (*fill)(void*, unsigned long), 38 long (*fill)(void*, unsigned long),
38 long (*flush)(void*, unsigned long), 39 long (*flush)(void*, unsigned long),
39 unsigned char *out_buf, 40 unsigned char *out_buf, long out_len,
40 long *pos, 41 long *pos,
41 void(*error)(char *x)) { 42 void(*error)(char *x)) {
42 u8 *zbuf; 43 u8 *zbuf;
43 struct z_stream_s *strm; 44 struct z_stream_s *strm;
44 int rc; 45 int rc;
45 size_t out_len;
46 46
47 rc = -1; 47 rc = -1;
48 if (flush) { 48 if (flush) {
49 out_len = 0x8000; /* 32 K */ 49 out_len = 0x8000; /* 32 K */
50 out_buf = malloc(out_len); 50 out_buf = malloc(out_len);
51 } else { 51 } else {
52 out_len = ((size_t)~0) - (size_t)out_buf; /* no limit */ 52 if (!out_len)
53 out_len = ((size_t)~0) - (size_t)out_buf; /* no limit */
53 } 54 }
54 if (!out_buf) { 55 if (!out_buf) {
55 error("Out of memory while allocating output buffer"); 56 error("Out of memory while allocating output buffer");
@@ -181,4 +182,24 @@ gunzip_nomem1:
181 return rc; /* returns Z_OK (0) if successful */ 182 return rc; /* returns Z_OK (0) if successful */
182} 183}
183 184
184#define decompress gunzip 185#ifndef PREBOOT
186STATIC int INIT gunzip(unsigned char *buf, long len,
187 long (*fill)(void*, unsigned long),
188 long (*flush)(void*, unsigned long),
189 unsigned char *out_buf,
190 long *pos,
191 void (*error)(char *x))
192{
193 return __gunzip(buf, len, fill, flush, out_buf, 0, pos, error);
194}
195#else
196STATIC int INIT __decompress(unsigned char *buf, long len,
197 long (*fill)(void*, unsigned long),
198 long (*flush)(void*, unsigned long),
199 unsigned char *out_buf, long out_len,
200 long *pos,
201 void (*error)(char *x))
202{
203 return __gunzip(buf, len, fill, flush, out_buf, out_len, pos, error);
204}
205#endif
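
Presumably a pre-boot caller of the new __decompress() entry point looks roughly like the sketch below; the buffer and function names are illustrative, and the actual arch boot stubs are updated elsewhere in this series. Passing the output length lets the no-flush path bound the in-place output window instead of assuming an unlimited one.

	static void example_decompress_kernel(unsigned char *input_data, long input_len,
					      unsigned char *output, long output_len,
					      void (*error)(char *))
	{
		__decompress(input_data, input_len,	/* compressed payload */
			     NULL, NULL,		/* no fill/flush callbacks */
			     output, output_len,	/* bounded output window */
			     NULL,			/* consumed-bytes pointer unused */
			     error);			/* error reporter */
	}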
diff --git a/lib/decompress_unlz4.c b/lib/decompress_unlz4.c
index 40f66ebe57b7..036fc882cd72 100644
--- a/lib/decompress_unlz4.c
+++ b/lib/decompress_unlz4.c
@@ -196,12 +196,12 @@ exit_0:
196} 196}
197 197
198#ifdef PREBOOT 198#ifdef PREBOOT
199STATIC int INIT decompress(unsigned char *buf, long in_len, 199STATIC int INIT __decompress(unsigned char *buf, long in_len,
200 long (*fill)(void*, unsigned long), 200 long (*fill)(void*, unsigned long),
201 long (*flush)(void*, unsigned long), 201 long (*flush)(void*, unsigned long),
202 unsigned char *output, 202 unsigned char *output, long out_len,
203 long *posp, 203 long *posp,
204 void(*error)(char *x) 204 void (*error)(char *x)
205 ) 205 )
206{ 206{
207 return unlz4(buf, in_len - 4, fill, flush, output, posp, error); 207 return unlz4(buf, in_len - 4, fill, flush, output, posp, error);
diff --git a/lib/decompress_unlzma.c b/lib/decompress_unlzma.c
index 0be83af62b88..ed7a1fd819f2 100644
--- a/lib/decompress_unlzma.c
+++ b/lib/decompress_unlzma.c
@@ -620,7 +620,7 @@ STATIC inline int INIT unlzma(unsigned char *buf, long in_len,
620 620
621 num_probs = LZMA_BASE_SIZE + (LZMA_LIT_SIZE << (lc + lp)); 621 num_probs = LZMA_BASE_SIZE + (LZMA_LIT_SIZE << (lc + lp));
622 p = (uint16_t *) large_malloc(num_probs * sizeof(*p)); 622 p = (uint16_t *) large_malloc(num_probs * sizeof(*p));
623 if (p == 0) 623 if (p == NULL)
624 goto exit_2; 624 goto exit_2;
625 num_probs = LZMA_LITERAL + (LZMA_LIT_SIZE << (lc + lp)); 625 num_probs = LZMA_LITERAL + (LZMA_LIT_SIZE << (lc + lp));
626 for (i = 0; i < num_probs; i++) 626 for (i = 0; i < num_probs; i++)
@@ -667,13 +667,12 @@ exit_0:
667} 667}
668 668
669#ifdef PREBOOT 669#ifdef PREBOOT
670STATIC int INIT decompress(unsigned char *buf, long in_len, 670STATIC int INIT __decompress(unsigned char *buf, long in_len,
671 long (*fill)(void*, unsigned long), 671 long (*fill)(void*, unsigned long),
672 long (*flush)(void*, unsigned long), 672 long (*flush)(void*, unsigned long),
673 unsigned char *output, 673 unsigned char *output, long out_len,
674 long *posp, 674 long *posp,
675 void(*error)(char *x) 675 void (*error)(char *x))
676 )
677{ 676{
678 return unlzma(buf, in_len - 4, fill, flush, output, posp, error); 677 return unlzma(buf, in_len - 4, fill, flush, output, posp, error);
679} 678}
diff --git a/lib/decompress_unlzo.c b/lib/decompress_unlzo.c
index b94a31bdd87d..f4c158e3a022 100644
--- a/lib/decompress_unlzo.c
+++ b/lib/decompress_unlzo.c
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#ifdef STATIC 33#ifdef STATIC
34#define PREBOOT
34#include "lzo/lzo1x_decompress_safe.c" 35#include "lzo/lzo1x_decompress_safe.c"
35#else 36#else
36#include <linux/decompress/unlzo.h> 37#include <linux/decompress/unlzo.h>
@@ -287,4 +288,14 @@ exit:
287 return ret; 288 return ret;
288} 289}
289 290
290#define decompress unlzo 291#ifdef PREBOOT
292STATIC int INIT __decompress(unsigned char *buf, long len,
293 long (*fill)(void*, unsigned long),
294 long (*flush)(void*, unsigned long),
295 unsigned char *out_buf, long olen,
296 long *pos,
297 void (*error)(char *x))
298{
299 return unlzo(buf, len, fill, flush, out_buf, pos, error);
300}
301#endif
diff --git a/lib/decompress_unxz.c b/lib/decompress_unxz.c
index b07a78340e9d..25d59a95bd66 100644
--- a/lib/decompress_unxz.c
+++ b/lib/decompress_unxz.c
@@ -394,4 +394,14 @@ error_alloc_state:
394 * This macro is used by architecture-specific files to decompress 394 * This macro is used by architecture-specific files to decompress
395 * the kernel image. 395 * the kernel image.
396 */ 396 */
397#define decompress unxz 397#ifdef XZ_PREBOOT
398STATIC int INIT __decompress(unsigned char *buf, long len,
399 long (*fill)(void*, unsigned long),
400 long (*flush)(void*, unsigned long),
401 unsigned char *out_buf, long olen,
402 long *pos,
403 void (*error)(char *x))
404{
405 return unxz(buf, len, fill, flush, out_buf, pos, error);
406}
407#endif
diff --git a/lib/kstrtox.c b/lib/kstrtox.c
index ec8da78df9be..94be244e8441 100644
--- a/lib/kstrtox.c
+++ b/lib/kstrtox.c
@@ -152,7 +152,7 @@ int kstrtoll(const char *s, unsigned int base, long long *res)
152 rv = _kstrtoull(s + 1, base, &tmp); 152 rv = _kstrtoull(s + 1, base, &tmp);
153 if (rv < 0) 153 if (rv < 0)
154 return rv; 154 return rv;
155 if ((long long)(-tmp) >= 0) 155 if ((long long)-tmp > 0)
156 return -ERANGE; 156 return -ERANGE;
157 *res = -tmp; 157 *res = -tmp;
158 } else { 158 } else {
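
The relaxed comparison above (> 0 instead of >= 0) stops kstrtoll() from rejecting "-0": with tmp == 0, -tmp is also 0, which the old check treated as out of range. A tiny sketch, not from the patch:

	static int example_parse_minus_zero(void)
	{
		long long v;

		/* previously -ERANGE; with the relaxed check this returns 0, v == 0 */
		return kstrtoll("-0", 10, &v);
	}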
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index c98ae818eb4e..54036ce2e2dd 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -410,7 +410,7 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
410 * @dst: destination buffer (escaped) 410 * @dst: destination buffer (escaped)
411 * @osz: destination buffer size 411 * @osz: destination buffer size
412 * @flags: combination of the flags (bitwise OR): 412 * @flags: combination of the flags (bitwise OR):
413 * %ESCAPE_SPACE: 413 * %ESCAPE_SPACE: (special white space, not space itself)
414 * '\f' - form feed 414 * '\f' - form feed
415 * '\n' - new line 415 * '\n' - new line
416 * '\r' - carriage return 416 * '\r' - carriage return
@@ -432,16 +432,18 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
432 * all previous together 432 * all previous together
433 * %ESCAPE_HEX: 433 * %ESCAPE_HEX:
434 * '\xHH' - byte with hexadecimal value HH (2 digits) 434 * '\xHH' - byte with hexadecimal value HH (2 digits)
435 * @esc: NULL-terminated string of characters any of which, if found in 435 * @only: NULL-terminated string containing characters used to limit
436 * the source, has to be escaped 436 * the selected escape class. If characters are included in @only
437 * that would not normally be escaped by the classes selected
438 * in @flags, they will be copied to @dst unescaped.
437 * 439 *
438 * Description: 440 * Description:
439 * The process of escaping byte buffer includes several parts. They are applied 441 * The process of escaping byte buffer includes several parts. They are applied
440 * in the following sequence. 442 * in the following sequence.
441 * 1. The character is matched to the printable class, if asked, and in 443 * 1. The character is matched to the printable class, if asked, and in
442 * case of match it passes through to the output. 444 * case of match it passes through to the output.
443 * 2. The character is not matched to the one from @esc string and thus 445 * 2. The character is not matched to the one from @only string and thus
444 * must go as is to the output. 446 * must go as-is to the output.
445 * 3. The character is checked if it falls into the class given by @flags. 447 * 3. The character is checked if it falls into the class given by @flags.
446 * %ESCAPE_OCTAL and %ESCAPE_HEX are going last since they cover any 448 * %ESCAPE_OCTAL and %ESCAPE_HEX are going last since they cover any
447 * character. Note that they actually can't go together, otherwise 449 * character. Note that they actually can't go together, otherwise
@@ -458,11 +460,11 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
458 * dst for a '\0' terminator if and only if ret < osz. 460 * dst for a '\0' terminator if and only if ret < osz.
459 */ 461 */
460int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz, 462int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
461 unsigned int flags, const char *esc) 463 unsigned int flags, const char *only)
462{ 464{
463 char *p = dst; 465 char *p = dst;
464 char *end = p + osz; 466 char *end = p + osz;
465 bool is_dict = esc && *esc; 467 bool is_dict = only && *only;
466 468
467 while (isz--) { 469 while (isz--) {
468 unsigned char c = *src++; 470 unsigned char c = *src++;
@@ -471,7 +473,7 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
471 * Apply rules in the following sequence: 473 * Apply rules in the following sequence:
472 * - the character is printable, when @flags has 474 * - the character is printable, when @flags has
473 * %ESCAPE_NP bit set 475 * %ESCAPE_NP bit set
474 * - the @esc string is supplied and does not contain a 476 * - the @only string is supplied and does not contain a
475 * character under question 477 * character under question
476 * - the character doesn't fall into a class of symbols 478 * - the character doesn't fall into a class of symbols
477 * defined by given @flags 479 * defined by given @flags
@@ -479,7 +481,7 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
479 * output buffer. 481 * output buffer.
480 */ 482 */
481 if ((flags & ESCAPE_NP && isprint(c)) || 483 if ((flags & ESCAPE_NP && isprint(c)) ||
482 (is_dict && !strchr(esc, c))) { 484 (is_dict && !strchr(only, c))) {
483 /* do nothing */ 485 /* do nothing */
484 } else { 486 } else {
485 if (flags & ESCAPE_SPACE && escape_space(c, &p, end)) 487 if (flags & ESCAPE_SPACE && escape_space(c, &p, end))
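
A hedged sketch of the renamed @only parameter in action: characters listed in @only are candidates for the escape classes in @flags, everything else is copied through verbatim. The buffer and strings here are illustrative.

	static void example_escape(void)
	{
		char out[64];
		int n;

		/* only '\t' and '\r' may be escaped; 'a', 'b', 'c' pass through */
		n = string_escape_mem("a\tb\rc", 5, out, sizeof(out),
				      ESCAPE_HEX, "\t\r");
		/* out now holds "a\x09b\x0dc", n == 11 */
	}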
diff --git a/lib/test-kstrtox.c b/lib/test-kstrtox.c
index 4137bca5f8e8..f355f67169b6 100644
--- a/lib/test-kstrtox.c
+++ b/lib/test-kstrtox.c
@@ -260,6 +260,7 @@ static void __init test_kstrtoll_ok(void)
260 {"4294967297", 10, 4294967297LL}, 260 {"4294967297", 10, 4294967297LL},
261 {"9223372036854775807", 10, 9223372036854775807LL}, 261 {"9223372036854775807", 10, 9223372036854775807LL},
262 262
263 {"-0", 10, 0LL},
263 {"-1", 10, -1LL}, 264 {"-1", 10, -1LL},
264 {"-2", 10, -2LL}, 265 {"-2", 10, -2LL},
265 {"-9223372036854775808", 10, LLONG_MIN}, 266 {"-9223372036854775808", 10, LLONG_MIN},
@@ -277,11 +278,6 @@ static void __init test_kstrtoll_fail(void)
277 {"-9223372036854775809", 10}, 278 {"-9223372036854775809", 10},
278 {"-18446744073709551614", 10}, 279 {"-18446744073709551614", 10},
279 {"-18446744073709551615", 10}, 280 {"-18446744073709551615", 10},
280 /* negative zero isn't an integer in Linux */
281 {"-0", 0},
282 {"-0", 8},
283 {"-0", 10},
284 {"-0", 16},
285 /* sign is first character if any */ 281 /* sign is first character if any */
286 {"-+1", 0}, 282 {"-+1", 0},
287 {"-+1", 8}, 283 {"-+1", 8},
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index 098c08eddfab..c1efb1b61017 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -65,7 +65,7 @@ static noinline void __init kmalloc_node_oob_right(void)
65 kfree(ptr); 65 kfree(ptr);
66} 66}
67 67
68static noinline void __init kmalloc_large_oob_rigth(void) 68static noinline void __init kmalloc_large_oob_right(void)
69{ 69{
70 char *ptr; 70 char *ptr;
71 size_t size = KMALLOC_MAX_CACHE_SIZE + 10; 71 size_t size = KMALLOC_MAX_CACHE_SIZE + 10;
@@ -114,7 +114,7 @@ static noinline void __init kmalloc_oob_krealloc_less(void)
114 kfree(ptr1); 114 kfree(ptr1);
115 return; 115 return;
116 } 116 }
117 ptr2[size1] = 'x'; 117 ptr2[size2] = 'x';
118 kfree(ptr2); 118 kfree(ptr2);
119} 119}
120 120
@@ -259,7 +259,7 @@ static int __init kmalloc_tests_init(void)
259 kmalloc_oob_right(); 259 kmalloc_oob_right();
260 kmalloc_oob_left(); 260 kmalloc_oob_left();
261 kmalloc_node_oob_right(); 261 kmalloc_node_oob_right();
262 kmalloc_large_oob_rigth(); 262 kmalloc_large_oob_right();
263 kmalloc_oob_krealloc_more(); 263 kmalloc_oob_krealloc_more();
264 kmalloc_oob_krealloc_less(); 264 kmalloc_oob_krealloc_less();
265 kmalloc_oob_16(); 265 kmalloc_oob_16();
diff --git a/lib/zlib_deflate/deftree.c b/lib/zlib_deflate/deftree.c
index ddf348299f24..9b1756b12743 100644
--- a/lib/zlib_deflate/deftree.c
+++ b/lib/zlib_deflate/deftree.c
@@ -35,6 +35,7 @@
35/* #include "deflate.h" */ 35/* #include "deflate.h" */
36 36
37#include <linux/zutil.h> 37#include <linux/zutil.h>
38#include <linux/bitrev.h>
38#include "defutil.h" 39#include "defutil.h"
39 40
40#ifdef DEBUG_ZLIB 41#ifdef DEBUG_ZLIB
@@ -146,7 +147,6 @@ static void send_all_trees (deflate_state *s, int lcodes, int dcodes,
146static void compress_block (deflate_state *s, ct_data *ltree, 147static void compress_block (deflate_state *s, ct_data *ltree,
147 ct_data *dtree); 148 ct_data *dtree);
148static void set_data_type (deflate_state *s); 149static void set_data_type (deflate_state *s);
149static unsigned bi_reverse (unsigned value, int length);
150static void bi_windup (deflate_state *s); 150static void bi_windup (deflate_state *s);
151static void bi_flush (deflate_state *s); 151static void bi_flush (deflate_state *s);
152static void copy_block (deflate_state *s, char *buf, unsigned len, 152static void copy_block (deflate_state *s, char *buf, unsigned len,
@@ -284,7 +284,7 @@ static void tr_static_init(void)
284 /* The static distance tree is trivial: */ 284 /* The static distance tree is trivial: */
285 for (n = 0; n < D_CODES; n++) { 285 for (n = 0; n < D_CODES; n++) {
286 static_dtree[n].Len = 5; 286 static_dtree[n].Len = 5;
287 static_dtree[n].Code = bi_reverse((unsigned)n, 5); 287 static_dtree[n].Code = bitrev32((u32)n) >> (32 - 5);
288 } 288 }
289 static_init_done = 1; 289 static_init_done = 1;
290} 290}
@@ -520,7 +520,7 @@ static void gen_codes(
520 int len = tree[n].Len; 520 int len = tree[n].Len;
521 if (len == 0) continue; 521 if (len == 0) continue;
522 /* Now reverse the bits */ 522 /* Now reverse the bits */
523 tree[n].Code = bi_reverse(next_code[len]++, len); 523 tree[n].Code = bitrev32((u32)(next_code[len]++)) >> (32 - len);
524 524
525 Tracecv(tree != static_ltree, (stderr,"\nn %3d %c l %2d c %4x (%x) ", 525 Tracecv(tree != static_ltree, (stderr,"\nn %3d %c l %2d c %4x (%x) ",
526 n, (isgraph(n) ? n : ' '), len, tree[n].Code, next_code[len]-1)); 526 n, (isgraph(n) ? n : ' '), len, tree[n].Code, next_code[len]-1));
diff --git a/lib/zlib_deflate/defutil.h b/lib/zlib_deflate/defutil.h
index b640b6402e99..a8c370897c9f 100644
--- a/lib/zlib_deflate/defutil.h
+++ b/lib/zlib_deflate/defutil.h
@@ -293,22 +293,6 @@ void zlib_tr_stored_type_only (deflate_state *);
293} 293}
294 294
295/* =========================================================================== 295/* ===========================================================================
296 * Reverse the first len bits of a code, using straightforward code (a faster
297 * method would use a table)
298 * IN assertion: 1 <= len <= 15
299 */
300static inline unsigned bi_reverse(unsigned code, /* the value to invert */
301 int len) /* its bit length */
302{
303 register unsigned res = 0;
304 do {
305 res |= code & 1;
306 code >>= 1, res <<= 1;
307 } while (--len > 0);
308 return res >> 1;
309}
310
311/* ===========================================================================
312 * Flush the bit buffer, keeping at most 7 bits in it. 296 * Flush the bit buffer, keeping at most 7 bits in it.
313 */ 297 */
314static inline void bi_flush(deflate_state *s) 298static inline void bi_flush(deflate_state *s)
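
The removed open-coded bi_reverse() and the bitrev32()-based replacement should agree for the 1..15-bit code lengths zlib uses; a small equivalence sketch (the wrapper name is made up):

	static inline unsigned zlib_bit_reverse(unsigned code, int len)
	{
		/* reverse the low 'len' bits of 'code', 1 <= len <= 15 */
		return bitrev32((u32)code) >> (32 - len);
	}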
diff --git a/mm/Kconfig b/mm/Kconfig
index 3a4070f5ab79..6413d027c0b2 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -649,6 +649,18 @@ config DEFERRED_STRUCT_PAGE_INIT
649 processes running early in the lifetime of the system until kswapd 649
650 finishes the initialisation. 650 finishes the initialisation.
651 651
652config IDLE_PAGE_TRACKING
653 bool "Enable idle page tracking"
654 depends on SYSFS && MMU
655 select PAGE_EXTENSION if !64BIT
656 help
 657	  This feature allows estimating the amount of user pages that have
658 not been touched during a given period of time. This information can
659 be useful to tune memory cgroup limits and/or for job placement
660 within a compute cluster.
661
662 See Documentation/vm/idle_page_tracking.txt for more details.
663
652config ZONE_DEVICE 664config ZONE_DEVICE
653 bool "Device memory (pmem, etc...) hotplug support" if EXPERT 665 bool "Device memory (pmem, etc...) hotplug support" if EXPERT
654 default !ZONE_DMA 666 default !ZONE_DMA
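
As a rough userspace sketch of what the new option enables (based on the interface described in Documentation/vm/idle_page_tracking.txt, not on code in this hunk): each u64 read from /sys/kernel/mm/page_idle/bitmap covers 64 page frames, writing set bits marks those pages idle, and bits still set on a later read were not touched in between.

	#include <fcntl.h>
	#include <stdint.h>
	#include <unistd.h>

	/*
	 * bitmap_fd is assumed to be open on /sys/kernel/mm/page_idle/bitmap;
	 * returns 1 if the page at 'pfn' stayed idle over the sampling period.
	 */
	static int page_stayed_idle(int bitmap_fd, unsigned long pfn)
	{
		uint64_t chunk = ~0ULL;
		off_t off = pfn / 64 * sizeof(chunk);	/* one u64 per 64 pfns */

		pwrite(bitmap_fd, &chunk, sizeof(chunk), off);	/* mark chunk idle */
		/* ... let the workload run ... */
		pread(bitmap_fd, &chunk, sizeof(chunk), off);
		return (chunk >> (pfn % 64)) & 1;
	}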
diff --git a/mm/Makefile b/mm/Makefile
index b424d5e5b6ff..56f8eed73f1a 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -79,3 +79,4 @@ obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
79obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o 79obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
80obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o 80obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
81obj-$(CONFIG_USERFAULTFD) += userfaultfd.o 81obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
82obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
diff --git a/mm/debug.c b/mm/debug.c
index 76089ddf99ea..6c1b3ea61bfd 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -48,6 +48,10 @@ static const struct trace_print_flags pageflag_names[] = {
48#ifdef CONFIG_TRANSPARENT_HUGEPAGE 48#ifdef CONFIG_TRANSPARENT_HUGEPAGE
49 {1UL << PG_compound_lock, "compound_lock" }, 49 {1UL << PG_compound_lock, "compound_lock" },
50#endif 50#endif
51#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
52 {1UL << PG_young, "young" },
53 {1UL << PG_idle, "idle" },
54#endif
51}; 55};
52 56
53static void dump_flags(unsigned long flags, 57static void dump_flags(unsigned long flags,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b16279cbd91d..4b06b8db9df2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -25,6 +25,7 @@
25#include <linux/migrate.h> 25#include <linux/migrate.h>
26#include <linux/hashtable.h> 26#include <linux/hashtable.h>
27#include <linux/userfaultfd_k.h> 27#include <linux/userfaultfd_k.h>
28#include <linux/page_idle.h>
28 29
29#include <asm/tlb.h> 30#include <asm/tlb.h>
30#include <asm/pgalloc.h> 31#include <asm/pgalloc.h>
@@ -1757,6 +1758,11 @@ static void __split_huge_page_refcount(struct page *page,
1757 /* clear PageTail before overwriting first_page */ 1758 /* clear PageTail before overwriting first_page */
1758 smp_wmb(); 1759 smp_wmb();
1759 1760
1761 if (page_is_young(page))
1762 set_page_young(page_tail);
1763 if (page_is_idle(page))
1764 set_page_idle(page_tail);
1765
1760 /* 1766 /*
1761 * __split_huge_page_splitting() already set the 1767 * __split_huge_page_splitting() already set the
1762 * splitting bit in all pmd that could map this 1768 * splitting bit in all pmd that could map this
@@ -2262,7 +2268,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2262 VM_BUG_ON_PAGE(PageLRU(page), page); 2268 VM_BUG_ON_PAGE(PageLRU(page), page);
2263 2269
2264 /* If there is no mapped pte young don't collapse the page */ 2270 /* If there is no mapped pte young don't collapse the page */
2265 if (pte_young(pteval) || PageReferenced(page) || 2271 if (pte_young(pteval) ||
2272 page_is_young(page) || PageReferenced(page) ||
2266 mmu_notifier_test_young(vma->vm_mm, address)) 2273 mmu_notifier_test_young(vma->vm_mm, address))
2267 referenced = true; 2274 referenced = true;
2268 } 2275 }
@@ -2693,7 +2700,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2693 */ 2700 */
2694 if (page_count(page) != 1 + !!PageSwapCache(page)) 2701 if (page_count(page) != 1 + !!PageSwapCache(page))
2695 goto out_unmap; 2702 goto out_unmap;
2696 if (pte_young(pteval) || PageReferenced(page) || 2703 if (pte_young(pteval) ||
2704 page_is_young(page) || PageReferenced(page) ||
2697 mmu_notifier_test_young(vma->vm_mm, address)) 2705 mmu_notifier_test_young(vma->vm_mm, address))
2698 referenced = true; 2706 referenced = true;
2699 } 2707 }
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index aeba0edd6e44..9d26fd9fefe4 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -45,12 +45,9 @@ static int hwpoison_inject(void *data, u64 val)
45 /* 45 /*
46 * do a racy check with elevated page count, to make sure PG_hwpoison 46 * do a racy check with elevated page count, to make sure PG_hwpoison
47 * will only be set for the targeted owner (or on a free page). 47 * will only be set for the targeted owner (or on a free page).
48 * We temporarily take page lock for try_get_mem_cgroup_from_page().
49 * memory_failure() will redo the check reliably inside page lock. 48 * memory_failure() will redo the check reliably inside page lock.
50 */ 49 */
51 lock_page(hpage);
52 err = hwpoison_filter(hpage); 50 err = hwpoison_filter(hpage);
53 unlock_page(hpage);
54 if (err) 51 if (err)
55 goto put_out; 52 goto put_out;
56 53
@@ -126,7 +123,7 @@ static int pfn_inject_init(void)
126 if (!dentry) 123 if (!dentry)
127 goto fail; 124 goto fail;
128 125
129#ifdef CONFIG_MEMCG_SWAP 126#ifdef CONFIG_MEMCG
130 dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, 127 dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
131 hwpoison_dir, &hwpoison_filter_memcg); 128 hwpoison_dir, &hwpoison_filter_memcg);
132 if (!dentry) 129 if (!dentry)
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index f532f6a37b55..77191eccdc6f 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -302,23 +302,14 @@ static void hex_dump_object(struct seq_file *seq,
302 struct kmemleak_object *object) 302 struct kmemleak_object *object)
303{ 303{
304 const u8 *ptr = (const u8 *)object->pointer; 304 const u8 *ptr = (const u8 *)object->pointer;
305 int i, len, remaining; 305 size_t len;
306 unsigned char linebuf[HEX_ROW_SIZE * 5];
307 306
308 /* limit the number of lines to HEX_MAX_LINES */ 307 /* limit the number of lines to HEX_MAX_LINES */
309 remaining = len = 308 len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE);
310 min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE)); 309
311 310 seq_printf(seq, " hex dump (first %zu bytes):\n", len);
312 seq_printf(seq, " hex dump (first %d bytes):\n", len); 311 seq_hex_dump(seq, " ", DUMP_PREFIX_NONE, HEX_ROW_SIZE,
313 for (i = 0; i < len; i += HEX_ROW_SIZE) { 312 HEX_GROUP_SIZE, ptr, len, HEX_ASCII);
314 int linelen = min(remaining, HEX_ROW_SIZE);
315
316 remaining -= HEX_ROW_SIZE;
317 hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE,
318 HEX_GROUP_SIZE, linebuf, sizeof(linebuf),
319 HEX_ASCII);
320 seq_printf(seq, " %s\n", linebuf);
321 }
322} 313}
323 314
324/* 315/*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1742a2db89c7..6ddaeba34e09 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -441,6 +441,34 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
441 return &memcg->css; 441 return &memcg->css;
442} 442}
443 443
444/**
445 * page_cgroup_ino - return inode number of the memcg a page is charged to
446 * @page: the page
447 *
448 * Look up the closest online ancestor of the memory cgroup @page is charged to
449 * and return its inode number or 0 if @page is not charged to any cgroup. It
450 * is safe to call this function without holding a reference to @page.
451 *
452 * Note, this function is inherently racy, because there is nothing to prevent
453 * the cgroup inode from getting torn down and potentially reallocated a moment
454 * after page_cgroup_ino() returns, so it only should be used by callers that
455 * do not care (such as procfs interfaces).
456 */
457ino_t page_cgroup_ino(struct page *page)
458{
459 struct mem_cgroup *memcg;
460 unsigned long ino = 0;
461
462 rcu_read_lock();
463 memcg = READ_ONCE(page->mem_cgroup);
464 while (memcg && !(memcg->css.flags & CSS_ONLINE))
465 memcg = parent_mem_cgroup(memcg);
466 if (memcg)
467 ino = cgroup_ino(memcg->css.cgroup);
468 rcu_read_unlock();
469 return ino;
470}
471
444static struct mem_cgroup_per_zone * 472static struct mem_cgroup_per_zone *
445mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) 473mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
446{ 474{
@@ -2071,40 +2099,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2071 css_put_many(&memcg->css, nr_pages); 2099 css_put_many(&memcg->css, nr_pages);
2072} 2100}
2073 2101
2074/*
2075 * try_get_mem_cgroup_from_page - look up page's memcg association
2076 * @page: the page
2077 *
2078 * Look up, get a css reference, and return the memcg that owns @page.
2079 *
2080 * The page must be locked to prevent racing with swap-in and page
2081 * cache charges. If coming from an unlocked page table, the caller
2082 * must ensure the page is on the LRU or this can race with charging.
2083 */
2084struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2085{
2086 struct mem_cgroup *memcg;
2087 unsigned short id;
2088 swp_entry_t ent;
2089
2090 VM_BUG_ON_PAGE(!PageLocked(page), page);
2091
2092 memcg = page->mem_cgroup;
2093 if (memcg) {
2094 if (!css_tryget_online(&memcg->css))
2095 memcg = NULL;
2096 } else if (PageSwapCache(page)) {
2097 ent.val = page_private(page);
2098 id = lookup_swap_cgroup_id(ent);
2099 rcu_read_lock();
2100 memcg = mem_cgroup_from_id(id);
2101 if (memcg && !css_tryget_online(&memcg->css))
2102 memcg = NULL;
2103 rcu_read_unlock();
2104 }
2105 return memcg;
2106}
2107
2108static void lock_page_lru(struct page *page, int *isolated) 2102static void lock_page_lru(struct page *page, int *isolated)
2109{ 2103{
2110 struct zone *zone = page_zone(page); 2104 struct zone *zone = page_zone(page);
@@ -5301,8 +5295,20 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5301 * the page lock, which serializes swap cache removal, which 5295 * the page lock, which serializes swap cache removal, which
5302 * in turn serializes uncharging. 5296 * in turn serializes uncharging.
5303 */ 5297 */
5298 VM_BUG_ON_PAGE(!PageLocked(page), page);
5304 if (page->mem_cgroup) 5299 if (page->mem_cgroup)
5305 goto out; 5300 goto out;
5301
5302 if (do_swap_account) {
5303 swp_entry_t ent = { .val = page_private(page), };
5304 unsigned short id = lookup_swap_cgroup_id(ent);
5305
5306 rcu_read_lock();
5307 memcg = mem_cgroup_from_id(id);
5308 if (memcg && !css_tryget_online(&memcg->css))
5309 memcg = NULL;
5310 rcu_read_unlock();
5311 }
5306 } 5312 }
5307 5313
5308 if (PageTransHuge(page)) { 5314 if (PageTransHuge(page)) {
@@ -5310,8 +5316,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5310 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 5316 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
5311 } 5317 }
5312 5318
5313 if (do_swap_account && PageSwapCache(page))
5314 memcg = try_get_mem_cgroup_from_page(page);
5315 if (!memcg) 5319 if (!memcg)
5316 memcg = get_mem_cgroup_from_mm(mm); 5320 memcg = get_mem_cgroup_from_mm(mm);
5317 5321
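
A sketch of the kind of caller page_cgroup_ino() is aimed at (the function name here is made up): a per-pfn reader that only wants a best-effort inode number and therefore takes no reference on the memcg, much like the hwpoison filter converted below.

	static u64 example_kpagecgroup_lookup(unsigned long pfn)
	{
		if (!pfn_valid(pfn))
			return 0;

		/* racy by design; 0 means "not charged to any cgroup" */
		return page_cgroup_ino(pfn_to_page(pfn));
	}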
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index eeda6485e76c..95882692e747 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -130,27 +130,15 @@ static int hwpoison_filter_flags(struct page *p)
130 * can only guarantee that the page either belongs to the memcg tasks, or is 130 * can only guarantee that the page either belongs to the memcg tasks, or is
131 * a freed page. 131 * a freed page.
132 */ 132 */
133#ifdef CONFIG_MEMCG_SWAP 133#ifdef CONFIG_MEMCG
134u64 hwpoison_filter_memcg; 134u64 hwpoison_filter_memcg;
135EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); 135EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
136static int hwpoison_filter_task(struct page *p) 136static int hwpoison_filter_task(struct page *p)
137{ 137{
138 struct mem_cgroup *mem;
139 struct cgroup_subsys_state *css;
140 unsigned long ino;
141
142 if (!hwpoison_filter_memcg) 138 if (!hwpoison_filter_memcg)
143 return 0; 139 return 0;
144 140
145 mem = try_get_mem_cgroup_from_page(p); 141 if (page_cgroup_ino(p) != hwpoison_filter_memcg)
146 if (!mem)
147 return -EINVAL;
148
149 css = &mem->css;
150 ino = cgroup_ino(css->cgroup);
151 css_put(css);
152
153 if (ino != hwpoison_filter_memcg)
154 return -EINVAL; 142 return -EINVAL;
155 143
156 return 0; 144 return 0;
diff --git a/mm/memory.c b/mm/memory.c
index 6cd0b2160401..9cb27470fee9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3233,7 +3233,7 @@ out:
3233static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, 3233static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
3234 unsigned long address, pmd_t *pmd, unsigned int flags) 3234 unsigned long address, pmd_t *pmd, unsigned int flags)
3235{ 3235{
3236 if (!vma->vm_ops) 3236 if (vma_is_anonymous(vma))
3237 return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags); 3237 return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags);
3238 if (vma->vm_ops->pmd_fault) 3238 if (vma->vm_ops->pmd_fault)
3239 return vma->vm_ops->pmd_fault(vma, address, pmd, flags); 3239 return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
@@ -3244,7 +3244,7 @@ static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
3244 unsigned long address, pmd_t *pmd, pmd_t orig_pmd, 3244 unsigned long address, pmd_t *pmd, pmd_t orig_pmd,
3245 unsigned int flags) 3245 unsigned int flags)
3246{ 3246{
3247 if (!vma->vm_ops) 3247 if (vma_is_anonymous(vma))
3248 return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); 3248 return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd);
3249 if (vma->vm_ops->pmd_fault) 3249 if (vma->vm_ops->pmd_fault)
3250 return vma->vm_ops->pmd_fault(vma, address, pmd, flags); 3250 return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
diff --git a/mm/migrate.c b/mm/migrate.c
index 02ce25df16c2..c3cb566af3e2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -37,6 +37,7 @@
37#include <linux/gfp.h> 37#include <linux/gfp.h>
38#include <linux/balloon_compaction.h> 38#include <linux/balloon_compaction.h>
39#include <linux/mmu_notifier.h> 39#include <linux/mmu_notifier.h>
40#include <linux/page_idle.h>
40 41
41#include <asm/tlbflush.h> 42#include <asm/tlbflush.h>
42 43
@@ -524,6 +525,11 @@ void migrate_page_copy(struct page *newpage, struct page *page)
524 __set_page_dirty_nobuffers(newpage); 525 __set_page_dirty_nobuffers(newpage);
525 } 526 }
526 527
528 if (page_is_young(page))
529 set_page_young(newpage);
530 if (page_is_idle(page))
531 set_page_idle(newpage);
532
527 /* 533 /*
528 * Copy NUMA information to the new page, to prevent over-eager 534 * Copy NUMA information to the new page, to prevent over-eager
529 * future migrations of this same page. 535 * future migrations of this same page.
diff --git a/mm/mmap.c b/mm/mmap.c
index b6be3249f0a9..971dd2cb77d2 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -612,6 +612,8 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm,
612void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 612void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
613 struct rb_node **rb_link, struct rb_node *rb_parent) 613 struct rb_node **rb_link, struct rb_node *rb_parent)
614{ 614{
615 WARN_ONCE(vma->vm_file && !vma->vm_ops, "missing vma->vm_ops");
616
615 /* Update tracking information for the gap following the new vma. */ 617 /* Update tracking information for the gap following the new vma. */
616 if (vma->vm_next) 618 if (vma->vm_next)
617 vma_gap_update(vma->vm_next); 619 vma_gap_update(vma->vm_next);
@@ -1260,14 +1262,12 @@ static inline int mlock_future_check(struct mm_struct *mm,
1260/* 1262/*
1261 * The caller must hold down_write(&current->mm->mmap_sem). 1263 * The caller must hold down_write(&current->mm->mmap_sem).
1262 */ 1264 */
1263 1265unsigned long do_mmap(struct file *file, unsigned long addr,
1264unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1265 unsigned long len, unsigned long prot, 1266 unsigned long len, unsigned long prot,
1266 unsigned long flags, unsigned long pgoff, 1267 unsigned long flags, vm_flags_t vm_flags,
1267 unsigned long *populate) 1268 unsigned long pgoff, unsigned long *populate)
1268{ 1269{
1269 struct mm_struct *mm = current->mm; 1270 struct mm_struct *mm = current->mm;
1270 vm_flags_t vm_flags;
1271 1271
1272 *populate = 0; 1272 *populate = 0;
1273 1273
@@ -1311,7 +1311,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1311 * to. we assume access permissions have been handled by the open 1311 * to. we assume access permissions have been handled by the open
1312 * of the memory object, so we don't do any here. 1312 * of the memory object, so we don't do any here.
1313 */ 1313 */
1314 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | 1314 vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
1315 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; 1315 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1316 1316
1317 if (flags & MAP_LOCKED) 1317 if (flags & MAP_LOCKED)
@@ -1638,6 +1638,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1638 */ 1638 */
1639 WARN_ON_ONCE(addr != vma->vm_start); 1639 WARN_ON_ONCE(addr != vma->vm_start);
1640 1640
1641 /* All file mapping must have ->vm_ops set */
1642 if (!vma->vm_ops) {
1643 static const struct vm_operations_struct dummy_ops = {};
1644 vma->vm_ops = &dummy_ops;
1645 }
1646
1641 addr = vma->vm_start; 1647 addr = vma->vm_start;
1642 vm_flags = vma->vm_flags; 1648 vm_flags = vma->vm_flags;
1643 } else if (vm_flags & VM_SHARED) { 1649 } else if (vm_flags & VM_SHARED) {
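
The extra vm_flags argument lets callers seed VM_* bits before the usual calc_vm_prot_bits()/calc_vm_flag_bits() OR-in. Existing callers presumably keep going through a do_mmap_pgoff() wrapper that passes 0, along these lines (the wrapper itself is not part of this hunk):

	static inline unsigned long do_mmap_pgoff(struct file *file,
			unsigned long addr, unsigned long len, unsigned long prot,
			unsigned long flags, unsigned long pgoff,
			unsigned long *populate)
	{
		/* no extra vm_flags: behaviour identical to the old do_mmap_pgoff() */
		return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate);
	}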
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 3b9b3d0741b2..5fbdd367bbed 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -123,6 +123,23 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
123 return young; 123 return young;
124} 124}
125 125
126int __mmu_notifier_clear_young(struct mm_struct *mm,
127 unsigned long start,
128 unsigned long end)
129{
130 struct mmu_notifier *mn;
131 int young = 0, id;
132
133 id = srcu_read_lock(&srcu);
134 hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
135 if (mn->ops->clear_young)
136 young |= mn->ops->clear_young(mn, mm, start, end);
137 }
138 srcu_read_unlock(&srcu, id);
139
140 return young;
141}
142
126int __mmu_notifier_test_young(struct mm_struct *mm, 143int __mmu_notifier_test_young(struct mm_struct *mm,
127 unsigned long address) 144 unsigned long address)
128{ 145{
diff --git a/mm/nommu.c b/mm/nommu.c
index 1cc0709fcaa5..ab14a2014dea 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1233,18 +1233,19 @@ enomem:
1233/* 1233/*
1234 * handle mapping creation for uClinux 1234 * handle mapping creation for uClinux
1235 */ 1235 */
1236unsigned long do_mmap_pgoff(struct file *file, 1236unsigned long do_mmap(struct file *file,
1237 unsigned long addr, 1237 unsigned long addr,
1238 unsigned long len, 1238 unsigned long len,
1239 unsigned long prot, 1239 unsigned long prot,
1240 unsigned long flags, 1240 unsigned long flags,
1241 unsigned long pgoff, 1241 vm_flags_t vm_flags,
1242 unsigned long *populate) 1242 unsigned long pgoff,
1243 unsigned long *populate)
1243{ 1244{
1244 struct vm_area_struct *vma; 1245 struct vm_area_struct *vma;
1245 struct vm_region *region; 1246 struct vm_region *region;
1246 struct rb_node *rb; 1247 struct rb_node *rb;
1247 unsigned long capabilities, vm_flags, result; 1248 unsigned long capabilities, result;
1248 int ret; 1249 int ret;
1249 1250
1250 *populate = 0; 1251 *populate = 0;
@@ -1262,7 +1263,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1262 1263
1263 /* we've determined that we can make the mapping, now translate what we 1264 /* we've determined that we can make the mapping, now translate what we
1264 * now know into VMA flags */ 1265 * now know into VMA flags */
1265 vm_flags = determine_vm_flags(file, prot, flags, capabilities); 1266 vm_flags |= determine_vm_flags(file, prot, flags, capabilities);
1266 1267
1267 /* we're going to need to record the mapping */ 1268 /* we're going to need to record the mapping */
1268 region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); 1269 region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
diff --git a/mm/page_ext.c b/mm/page_ext.c
index d86fd2f5353f..292ca7b8debd 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -6,6 +6,7 @@
6#include <linux/vmalloc.h> 6#include <linux/vmalloc.h>
7#include <linux/kmemleak.h> 7#include <linux/kmemleak.h>
8#include <linux/page_owner.h> 8#include <linux/page_owner.h>
9#include <linux/page_idle.h>
9 10
10/* 11/*
11 * struct page extension 12 * struct page extension
@@ -59,6 +60,9 @@ static struct page_ext_operations *page_ext_ops[] = {
59#ifdef CONFIG_PAGE_OWNER 60#ifdef CONFIG_PAGE_OWNER
60 &page_owner_ops, 61 &page_owner_ops,
61#endif 62#endif
63#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
64 &page_idle_ops,
65#endif
62}; 66};
63 67
64static unsigned long total_usage; 68static unsigned long total_usage;
diff --git a/mm/page_idle.c b/mm/page_idle.c
new file mode 100644
index 000000000000..d5dd79041484
--- /dev/null
+++ b/mm/page_idle.c
@@ -0,0 +1,232 @@
1#include <linux/init.h>
2#include <linux/bootmem.h>
3#include <linux/fs.h>
4#include <linux/sysfs.h>
5#include <linux/kobject.h>
6#include <linux/mm.h>
7#include <linux/mmzone.h>
8#include <linux/pagemap.h>
9#include <linux/rmap.h>
10#include <linux/mmu_notifier.h>
11#include <linux/page_ext.h>
12#include <linux/page_idle.h>
13
14#define BITMAP_CHUNK_SIZE sizeof(u64)
15#define BITMAP_CHUNK_BITS (BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
16
17/*
18 * Idle page tracking only considers user memory pages, for other types of
19 * pages the idle flag is always unset and an attempt to set it is silently
20 * ignored.
21 *
22 * We treat a page as a user memory page if it is on an LRU list, because it is
23 * always safe to pass such a page to rmap_walk(), which is essential for idle
24 * page tracking. With such an indicator of user pages we can skip isolated
25 * pages, but since there are not usually many of them, it will hardly affect
26 * the overall result.
27 *
28 * This function tries to get a user memory page by pfn as described above.
29 */
30static struct page *page_idle_get_page(unsigned long pfn)
31{
32 struct page *page;
33 struct zone *zone;
34
35 if (!pfn_valid(pfn))
36 return NULL;
37
38 page = pfn_to_page(pfn);
39 if (!page || !PageLRU(page) ||
40 !get_page_unless_zero(page))
41 return NULL;
42
43 zone = page_zone(page);
44 spin_lock_irq(&zone->lru_lock);
45 if (unlikely(!PageLRU(page))) {
46 put_page(page);
47 page = NULL;
48 }
49 spin_unlock_irq(&zone->lru_lock);
50 return page;
51}
52
53static int page_idle_clear_pte_refs_one(struct page *page,
54 struct vm_area_struct *vma,
55 unsigned long addr, void *arg)
56{
57 struct mm_struct *mm = vma->vm_mm;
58 spinlock_t *ptl;
59 pmd_t *pmd;
60 pte_t *pte;
61 bool referenced = false;
62
63 if (unlikely(PageTransHuge(page))) {
64 pmd = page_check_address_pmd(page, mm, addr,
65 PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
66 if (pmd) {
67 referenced = pmdp_clear_young_notify(vma, addr, pmd);
68 spin_unlock(ptl);
69 }
70 } else {
71 pte = page_check_address(page, mm, addr, &ptl, 0);
72 if (pte) {
73 referenced = ptep_clear_young_notify(vma, addr, pte);
74 pte_unmap_unlock(pte, ptl);
75 }
76 }
77 if (referenced) {
78 clear_page_idle(page);
79 /*
80 * We cleared the referenced bit in a mapping to this page. To
81 * avoid interference with page reclaim, mark it young so that
82 * page_referenced() will return > 0.
83 */
84 set_page_young(page);
85 }
86 return SWAP_AGAIN;
87}
88
89static void page_idle_clear_pte_refs(struct page *page)
90{
91 /*
92 * Since rwc.arg is unused, rwc is effectively immutable, so we
93 * can make it static const to save some cycles and stack.
94 */
95 static const struct rmap_walk_control rwc = {
96 .rmap_one = page_idle_clear_pte_refs_one,
97 .anon_lock = page_lock_anon_vma_read,
98 };
99 bool need_lock;
100
101 if (!page_mapped(page) ||
102 !page_rmapping(page))
103 return;
104
105 need_lock = !PageAnon(page) || PageKsm(page);
106 if (need_lock && !trylock_page(page))
107 return;
108
109 rmap_walk(page, (struct rmap_walk_control *)&rwc);
110
111 if (need_lock)
112 unlock_page(page);
113}
114
115static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
116 struct bin_attribute *attr, char *buf,
117 loff_t pos, size_t count)
118{
119 u64 *out = (u64 *)buf;
120 struct page *page;
121 unsigned long pfn, end_pfn;
122 int bit;
123
124 if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
125 return -EINVAL;
126
127 pfn = pos * BITS_PER_BYTE;
128 if (pfn >= max_pfn)
129 return 0;
130
131 end_pfn = pfn + count * BITS_PER_BYTE;
132 if (end_pfn > max_pfn)
133 end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
134
135 for (; pfn < end_pfn; pfn++) {
136 bit = pfn % BITMAP_CHUNK_BITS;
137 if (!bit)
138 *out = 0ULL;
139 page = page_idle_get_page(pfn);
140 if (page) {
141 if (page_is_idle(page)) {
142 /*
143 * The page might have been referenced via a
144 * pte, in which case it is not idle. Clear
145 * refs and recheck.
146 */
147 page_idle_clear_pte_refs(page);
148 if (page_is_idle(page))
149 *out |= 1ULL << bit;
150 }
151 put_page(page);
152 }
153 if (bit == BITMAP_CHUNK_BITS - 1)
154 out++;
155 cond_resched();
156 }
157 return (char *)out - buf;
158}
159
160static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
161 struct bin_attribute *attr, char *buf,
162 loff_t pos, size_t count)
163{
164 const u64 *in = (u64 *)buf;
165 struct page *page;
166 unsigned long pfn, end_pfn;
167 int bit;
168
169 if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
170 return -EINVAL;
171
172 pfn = pos * BITS_PER_BYTE;
173 if (pfn >= max_pfn)
174 return -ENXIO;
175
176 end_pfn = pfn + count * BITS_PER_BYTE;
177 if (end_pfn > max_pfn)
178 end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
179
180 for (; pfn < end_pfn; pfn++) {
181 bit = pfn % BITMAP_CHUNK_BITS;
182 if ((*in >> bit) & 1) {
183 page = page_idle_get_page(pfn);
184 if (page) {
185 page_idle_clear_pte_refs(page);
186 set_page_idle(page);
187 put_page(page);
188 }
189 }
190 if (bit == BITMAP_CHUNK_BITS - 1)
191 in++;
192 cond_resched();
193 }
194 return (char *)in - buf;
195}
196
197static struct bin_attribute page_idle_bitmap_attr =
198 __BIN_ATTR(bitmap, S_IRUSR | S_IWUSR,
199 page_idle_bitmap_read, page_idle_bitmap_write, 0);
200
201static struct bin_attribute *page_idle_bin_attrs[] = {
202 &page_idle_bitmap_attr,
203 NULL,
204};
205
206static struct attribute_group page_idle_attr_group = {
207 .bin_attrs = page_idle_bin_attrs,
208 .name = "page_idle",
209};
210
211#ifndef CONFIG_64BIT
212static bool need_page_idle(void)
213{
214 return true;
215}
216struct page_ext_operations page_idle_ops = {
217 .need = need_page_idle,
218};
219#endif
220
221static int __init page_idle_init(void)
222{
223 int err;
224
225 err = sysfs_create_group(mm_kobj, &page_idle_attr_group);
226 if (err) {
227 pr_err("page_idle: register sysfs failed\n");
228 return err;
229 }
230 return 0;
231}
232subsys_initcall(page_idle_init);
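
mm/page_idle.c exposes the tracking state as /sys/kernel/mm/page_idle/bitmap: one bit per PFN, readable and writable only in 8-byte chunks, where writing a set bit marks the page idle and a set bit on read means the page has not been referenced since it was marked. A minimal user-space sketch (assuming root and a PFN obtained elsewhere, e.g. from /proc/<pid>/pagemap) that marks one PFN idle and later re-reads it:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define PAGE_IDLE_BITMAP "/sys/kernel/mm/page_idle/bitmap"

/* Each 8-byte chunk of the bitmap covers 64 consecutive PFNs. */
static off_t chunk_offset(uint64_t pfn) { return (off_t)(pfn / 64) * 8; }
static uint64_t chunk_bit(uint64_t pfn) { return 1ULL << (pfn % 64); }

int main(int argc, char **argv)
{
	uint64_t pfn, chunk;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pfn>\n", argv[0]);
		return 1;
	}
	pfn = strtoull(argv[1], NULL, 0);

	fd = open(PAGE_IDLE_BITMAP, O_RDWR);	/* S_IRUSR|S_IWUSR: root only */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* mark the page idle: only the set bits in the written chunk matter */
	chunk = chunk_bit(pfn);
	if (pwrite(fd, &chunk, sizeof(chunk), chunk_offset(pfn)) != sizeof(chunk)) {
		perror("pwrite");
		return 1;
	}

	/* ... let the workload run for a while, then re-check the bit ... */
	if (pread(fd, &chunk, sizeof(chunk), chunk_offset(pfn)) != sizeof(chunk)) {
		perror("pread");
		return 1;
	}
	printf("pfn %llu: %s\n", (unsigned long long)pfn,
	       (chunk & chunk_bit(pfn)) ? "still idle" :
	       "accessed (or not a user LRU page)");
	close(fd);
	return 0;
}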
diff --git a/mm/rmap.c b/mm/rmap.c
index 0db38e7d0a72..f5b5c1f3dcd7 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -59,6 +59,7 @@
59#include <linux/migrate.h> 59#include <linux/migrate.h>
60#include <linux/hugetlb.h> 60#include <linux/hugetlb.h>
61#include <linux/backing-dev.h> 61#include <linux/backing-dev.h>
62#include <linux/page_idle.h>
62 63
63#include <asm/tlbflush.h> 64#include <asm/tlbflush.h>
64 65
@@ -886,6 +887,11 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
886 pte_unmap_unlock(pte, ptl); 887 pte_unmap_unlock(pte, ptl);
887 } 888 }
888 889
890 if (referenced)
891 clear_page_idle(page);
892 if (test_and_clear_page_young(page))
893 referenced++;
894
889 if (referenced) { 895 if (referenced) {
890 pra->referenced++; 896 pra->referenced++;
891 pra->vm_flags |= vma->vm_flags; 897 pra->vm_flags |= vma->vm_flags;
diff --git a/mm/swap.c b/mm/swap.c
index a3a0a2f1f7c3..983f692a47fd 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -32,6 +32,7 @@
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/uio.h> 33#include <linux/uio.h>
34#include <linux/hugetlb.h> 34#include <linux/hugetlb.h>
35#include <linux/page_idle.h>
35 36
36#include "internal.h" 37#include "internal.h"
37 38
@@ -622,6 +623,8 @@ void mark_page_accessed(struct page *page)
622 } else if (!PageReferenced(page)) { 623 } else if (!PageReferenced(page)) {
623 SetPageReferenced(page); 624 SetPageReferenced(page);
624 } 625 }
626 if (page_is_idle(page))
627 clear_page_idle(page);
625} 628}
626EXPORT_SYMBOL(mark_page_accessed); 629EXPORT_SYMBOL(mark_page_accessed);
627 630
diff --git a/mm/zpool.c b/mm/zpool.c
index 68d2dd8ed2d8..8f670d3e8706 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -100,6 +100,39 @@ static void zpool_put_driver(struct zpool_driver *driver)
100} 100}
101 101
102/** 102/**
103 * zpool_has_pool() - Check if the pool driver is available
104 * @type The type of the zpool to check (e.g. zbud, zsmalloc)
105 *
106 * This checks if the @type pool driver is available. This will try to load
107 * the requested module, if needed, but there is no guarantee the module will
108 * still be loaded and available immediately after calling. If this returns
109 * true, the caller should assume the pool is available, but must be prepared
110 * to handle the @zpool_create_pool() returning failure. However if this
111 * returns false, the caller should assume the requested pool type is not
112 * available; either the requested pool type module does not exist, or could
113 * not be loaded, and calling @zpool_create_pool() with the pool type will
114 * fail.
115 *
116 * Returns: true if @type pool is available, false if not
117 */
118bool zpool_has_pool(char *type)
119{
120 struct zpool_driver *driver = zpool_get_driver(type);
121
122 if (!driver) {
123 request_module("zpool-%s", type);
124 driver = zpool_get_driver(type);
125 }
126
127 if (!driver)
128 return false;
129
130 zpool_put_driver(driver);
131 return true;
132}
133EXPORT_SYMBOL(zpool_has_pool);
134
135/**
103 * zpool_create_pool() - Create a new zpool 136 * zpool_create_pool() - Create a new zpool
104 * @type The type of the zpool to create (e.g. zbud, zsmalloc) 137 * @type The type of the zpool to create (e.g. zbud, zsmalloc)
105 * @name The name of the zpool (e.g. zram0, zswap) 138 * @name The name of the zpool (e.g. zram0, zswap)
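
zpool_has_pool() lets a client check (and, via request_module(), try to load) a compressed-memory allocator before creating a pool; the zswap parameter callbacks further down use it to validate a new zpool type. A short kernel-context sketch; the pool names, the "example" label, and the NULL eviction ops (i.e. a pool that is never shrunk) are assumptions for illustration.

	char *type = "zsmalloc";
	struct zpool *pool;

	if (!zpool_has_pool(type)) {
		pr_warn("example: %s not available, falling back to zbud\n", type);
		type = "zbud";
	}
	pool = zpool_create_pool(type, "example", GFP_KERNEL, NULL);
	if (!pool)
		return -ENOMEM;
	pr_info("example: using %s zpool\n", zpool_get_type(pool));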
diff --git a/mm/zswap.c b/mm/zswap.c
index 48a1d081e2a5..4043df7c672f 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -80,85 +80,54 @@ static u64 zswap_duplicate_entry;
80static bool zswap_enabled; 80static bool zswap_enabled;
81module_param_named(enabled, zswap_enabled, bool, 0644); 81module_param_named(enabled, zswap_enabled, bool, 0644);
82 82
83/* Compressor to be used by zswap (fixed at boot for now) */ 83/* Crypto compressor to use */
84#define ZSWAP_COMPRESSOR_DEFAULT "lzo" 84#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
85static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; 85static char zswap_compressor[CRYPTO_MAX_ALG_NAME] = ZSWAP_COMPRESSOR_DEFAULT;
86module_param_named(compressor, zswap_compressor, charp, 0444); 86static struct kparam_string zswap_compressor_kparam = {
87 87 .string = zswap_compressor,
88/* The maximum percentage of memory that the compressed pool can occupy */ 88 .maxlen = sizeof(zswap_compressor),
89static unsigned int zswap_max_pool_percent = 20; 89};
90module_param_named(max_pool_percent, 90static int zswap_compressor_param_set(const char *,
91 zswap_max_pool_percent, uint, 0644); 91 const struct kernel_param *);
92static struct kernel_param_ops zswap_compressor_param_ops = {
93 .set = zswap_compressor_param_set,
94 .get = param_get_string,
95};
96module_param_cb(compressor, &zswap_compressor_param_ops,
97 &zswap_compressor_kparam, 0644);
92 98
93/* Compressed storage to use */ 99/* Compressed storage zpool to use */
94#define ZSWAP_ZPOOL_DEFAULT "zbud" 100#define ZSWAP_ZPOOL_DEFAULT "zbud"
95static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; 101static char zswap_zpool_type[32 /* arbitrary */] = ZSWAP_ZPOOL_DEFAULT;
96module_param_named(zpool, zswap_zpool_type, charp, 0444); 102static struct kparam_string zswap_zpool_kparam = {
103 .string = zswap_zpool_type,
104 .maxlen = sizeof(zswap_zpool_type),
105};
106static int zswap_zpool_param_set(const char *, const struct kernel_param *);
107static struct kernel_param_ops zswap_zpool_param_ops = {
108 .set = zswap_zpool_param_set,
109 .get = param_get_string,
110};
111module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_kparam, 0644);
97 112
98/* zpool is shared by all of zswap backend */ 113/* The maximum percentage of memory that the compressed pool can occupy */
99static struct zpool *zswap_pool; 114static unsigned int zswap_max_pool_percent = 20;
115module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
100 116
101/********************************* 117/*********************************
102* compression functions 118* data structures
103**********************************/ 119**********************************/
104/* per-cpu compression transforms */
105static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms;
106 120
107enum comp_op { 121struct zswap_pool {
108 ZSWAP_COMPOP_COMPRESS, 122 struct zpool *zpool;
109 ZSWAP_COMPOP_DECOMPRESS 123 struct crypto_comp * __percpu *tfm;
124 struct kref kref;
125 struct list_head list;
126 struct rcu_head rcu_head;
127 struct notifier_block notifier;
128 char tfm_name[CRYPTO_MAX_ALG_NAME];
110}; 129};
111 130
112static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen,
113 u8 *dst, unsigned int *dlen)
114{
115 struct crypto_comp *tfm;
116 int ret;
117
118 tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu());
119 switch (op) {
120 case ZSWAP_COMPOP_COMPRESS:
121 ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
122 break;
123 case ZSWAP_COMPOP_DECOMPRESS:
124 ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
125 break;
126 default:
127 ret = -EINVAL;
128 }
129
130 put_cpu();
131 return ret;
132}
133
134static int __init zswap_comp_init(void)
135{
136 if (!crypto_has_comp(zswap_compressor, 0, 0)) {
137 pr_info("%s compressor not available\n", zswap_compressor);
138 /* fall back to default compressor */
139 zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
140 if (!crypto_has_comp(zswap_compressor, 0, 0))
141 /* can't even load the default compressor */
142 return -ENODEV;
143 }
144 pr_info("using %s compressor\n", zswap_compressor);
145
146 /* alloc percpu transforms */
147 zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
148 if (!zswap_comp_pcpu_tfms)
149 return -ENOMEM;
150 return 0;
151}
152
153static void __init zswap_comp_exit(void)
154{
155 /* free percpu transforms */
156 free_percpu(zswap_comp_pcpu_tfms);
157}
158
159/*********************************
160* data structures
161**********************************/
162/* 131/*
163 * struct zswap_entry 132 * struct zswap_entry
164 * 133 *
@@ -166,22 +135,24 @@ static void __init zswap_comp_exit(void)
166 * page within zswap. 135 * page within zswap.
167 * 136 *
168 * rbnode - links the entry into red-black tree for the appropriate swap type 137 * rbnode - links the entry into red-black tree for the appropriate swap type
138 * offset - the swap offset for the entry. Index into the red-black tree.
169 * refcount - the number of outstanding reference to the entry. This is needed 139 * refcount - the number of outstanding reference to the entry. This is needed
170 * to protect against premature freeing of the entry by code 140 * to protect against premature freeing of the entry by code
171 * concurrent calls to load, invalidate, and writeback. The lock 141 * concurrent calls to load, invalidate, and writeback. The lock
172 * for the zswap_tree structure that contains the entry must 142 * for the zswap_tree structure that contains the entry must
173 * be held while changing the refcount. Since the lock must 143 * be held while changing the refcount. Since the lock must
174 * be held, there is no reason to also make refcount atomic. 144 * be held, there is no reason to also make refcount atomic.
175 * offset - the swap offset for the entry. Index into the red-black tree.
176 * handle - zpool allocation handle that stores the compressed page data
177 * length - the length in bytes of the compressed page data. Needed during 145 * length - the length in bytes of the compressed page data. Needed during
178 * decompression 146 * decompression
147 * pool - the zswap_pool the entry's data is in
148 * handle - zpool allocation handle that stores the compressed page data
179 */ 149 */
180struct zswap_entry { 150struct zswap_entry {
181 struct rb_node rbnode; 151 struct rb_node rbnode;
182 pgoff_t offset; 152 pgoff_t offset;
183 int refcount; 153 int refcount;
184 unsigned int length; 154 unsigned int length;
155 struct zswap_pool *pool;
185 unsigned long handle; 156 unsigned long handle;
186}; 157};
187 158
@@ -201,6 +172,51 @@ struct zswap_tree {
201 172
202static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; 173static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
203 174
175/* RCU-protected iteration */
176static LIST_HEAD(zswap_pools);
177/* protects zswap_pools list modification */
178static DEFINE_SPINLOCK(zswap_pools_lock);
179
180/* used by param callback function */
181static bool zswap_init_started;
182
183/*********************************
184* helpers and fwd declarations
185**********************************/
186
187#define zswap_pool_debug(msg, p) \
188 pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
189 zpool_get_type((p)->zpool))
190
191static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
192static int zswap_pool_get(struct zswap_pool *pool);
193static void zswap_pool_put(struct zswap_pool *pool);
194
195static const struct zpool_ops zswap_zpool_ops = {
196 .evict = zswap_writeback_entry
197};
198
199static bool zswap_is_full(void)
200{
201 return totalram_pages * zswap_max_pool_percent / 100 <
202 DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
203}
204
205static void zswap_update_total_size(void)
206{
207 struct zswap_pool *pool;
208 u64 total = 0;
209
210 rcu_read_lock();
211
212 list_for_each_entry_rcu(pool, &zswap_pools, list)
213 total += zpool_get_total_size(pool->zpool);
214
215 rcu_read_unlock();
216
217 zswap_pool_total_size = total;
218}
219
204/********************************* 220/*********************************
205* zswap entry functions 221* zswap entry functions
206**********************************/ 222**********************************/
@@ -294,10 +310,11 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
294 */ 310 */
295static void zswap_free_entry(struct zswap_entry *entry) 311static void zswap_free_entry(struct zswap_entry *entry)
296{ 312{
297 zpool_free(zswap_pool, entry->handle); 313 zpool_free(entry->pool->zpool, entry->handle);
314 zswap_pool_put(entry->pool);
298 zswap_entry_cache_free(entry); 315 zswap_entry_cache_free(entry);
299 atomic_dec(&zswap_stored_pages); 316 atomic_dec(&zswap_stored_pages);
300 zswap_pool_total_size = zpool_get_total_size(zswap_pool); 317 zswap_update_total_size();
301} 318}
302 319
303/* caller must hold the tree lock */ 320/* caller must hold the tree lock */
@@ -339,35 +356,21 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
339**********************************/ 356**********************************/
340static DEFINE_PER_CPU(u8 *, zswap_dstmem); 357static DEFINE_PER_CPU(u8 *, zswap_dstmem);
341 358
342static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu) 359static int __zswap_cpu_dstmem_notifier(unsigned long action, unsigned long cpu)
343{ 360{
344 struct crypto_comp *tfm;
345 u8 *dst; 361 u8 *dst;
346 362
347 switch (action) { 363 switch (action) {
348 case CPU_UP_PREPARE: 364 case CPU_UP_PREPARE:
349 tfm = crypto_alloc_comp(zswap_compressor, 0, 0);
350 if (IS_ERR(tfm)) {
351 pr_err("can't allocate compressor transform\n");
352 return NOTIFY_BAD;
353 }
354 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm;
355 dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); 365 dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
356 if (!dst) { 366 if (!dst) {
357 pr_err("can't allocate compressor buffer\n"); 367 pr_err("can't allocate compressor buffer\n");
358 crypto_free_comp(tfm);
359 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
360 return NOTIFY_BAD; 368 return NOTIFY_BAD;
361 } 369 }
362 per_cpu(zswap_dstmem, cpu) = dst; 370 per_cpu(zswap_dstmem, cpu) = dst;
363 break; 371 break;
364 case CPU_DEAD: 372 case CPU_DEAD:
365 case CPU_UP_CANCELED: 373 case CPU_UP_CANCELED:
366 tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu);
367 if (tfm) {
368 crypto_free_comp(tfm);
369 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
370 }
371 dst = per_cpu(zswap_dstmem, cpu); 374 dst = per_cpu(zswap_dstmem, cpu);
372 kfree(dst); 375 kfree(dst);
373 per_cpu(zswap_dstmem, cpu) = NULL; 376 per_cpu(zswap_dstmem, cpu) = NULL;
@@ -378,43 +381,398 @@ static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
378 return NOTIFY_OK; 381 return NOTIFY_OK;
379} 382}
380 383
381static int zswap_cpu_notifier(struct notifier_block *nb, 384static int zswap_cpu_dstmem_notifier(struct notifier_block *nb,
382 unsigned long action, void *pcpu) 385 unsigned long action, void *pcpu)
383{ 386{
384 unsigned long cpu = (unsigned long)pcpu; 387 return __zswap_cpu_dstmem_notifier(action, (unsigned long)pcpu);
385 return __zswap_cpu_notifier(action, cpu);
386} 388}
387 389
388static struct notifier_block zswap_cpu_notifier_block = { 390static struct notifier_block zswap_dstmem_notifier = {
389 .notifier_call = zswap_cpu_notifier 391 .notifier_call = zswap_cpu_dstmem_notifier,
390}; 392};
391 393
392static int __init zswap_cpu_init(void) 394static int __init zswap_cpu_dstmem_init(void)
395{
396 unsigned long cpu;
397
398 cpu_notifier_register_begin();
399 for_each_online_cpu(cpu)
400 if (__zswap_cpu_dstmem_notifier(CPU_UP_PREPARE, cpu) ==
401 NOTIFY_BAD)
402 goto cleanup;
403 __register_cpu_notifier(&zswap_dstmem_notifier);
404 cpu_notifier_register_done();
405 return 0;
406
407cleanup:
408 for_each_online_cpu(cpu)
409 __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu);
410 cpu_notifier_register_done();
411 return -ENOMEM;
412}
413
414static void zswap_cpu_dstmem_destroy(void)
415{
416 unsigned long cpu;
417
418 cpu_notifier_register_begin();
419 for_each_online_cpu(cpu)
420 __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu);
421 __unregister_cpu_notifier(&zswap_dstmem_notifier);
422 cpu_notifier_register_done();
423}
424
425static int __zswap_cpu_comp_notifier(struct zswap_pool *pool,
426 unsigned long action, unsigned long cpu)
427{
428 struct crypto_comp *tfm;
429
430 switch (action) {
431 case CPU_UP_PREPARE:
432 if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
433 break;
434 tfm = crypto_alloc_comp(pool->tfm_name, 0, 0);
435 if (IS_ERR_OR_NULL(tfm)) {
436 pr_err("could not alloc crypto comp %s : %ld\n",
437 pool->tfm_name, PTR_ERR(tfm));
438 return NOTIFY_BAD;
439 }
440 *per_cpu_ptr(pool->tfm, cpu) = tfm;
441 break;
442 case CPU_DEAD:
443 case CPU_UP_CANCELED:
444 tfm = *per_cpu_ptr(pool->tfm, cpu);
445 if (!IS_ERR_OR_NULL(tfm))
446 crypto_free_comp(tfm);
447 *per_cpu_ptr(pool->tfm, cpu) = NULL;
448 break;
449 default:
450 break;
451 }
452 return NOTIFY_OK;
453}
454
455static int zswap_cpu_comp_notifier(struct notifier_block *nb,
456 unsigned long action, void *pcpu)
457{
458 unsigned long cpu = (unsigned long)pcpu;
459 struct zswap_pool *pool = container_of(nb, typeof(*pool), notifier);
460
461 return __zswap_cpu_comp_notifier(pool, action, cpu);
462}
463
464static int zswap_cpu_comp_init(struct zswap_pool *pool)
393{ 465{
394 unsigned long cpu; 466 unsigned long cpu;
395 467
468 memset(&pool->notifier, 0, sizeof(pool->notifier));
469 pool->notifier.notifier_call = zswap_cpu_comp_notifier;
470
396 cpu_notifier_register_begin(); 471 cpu_notifier_register_begin();
397 for_each_online_cpu(cpu) 472 for_each_online_cpu(cpu)
398 if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK) 473 if (__zswap_cpu_comp_notifier(pool, CPU_UP_PREPARE, cpu) ==
474 NOTIFY_BAD)
399 goto cleanup; 475 goto cleanup;
400 __register_cpu_notifier(&zswap_cpu_notifier_block); 476 __register_cpu_notifier(&pool->notifier);
401 cpu_notifier_register_done(); 477 cpu_notifier_register_done();
402 return 0; 478 return 0;
403 479
404cleanup: 480cleanup:
405 for_each_online_cpu(cpu) 481 for_each_online_cpu(cpu)
406 __zswap_cpu_notifier(CPU_UP_CANCELED, cpu); 482 __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu);
407 cpu_notifier_register_done(); 483 cpu_notifier_register_done();
408 return -ENOMEM; 484 return -ENOMEM;
409} 485}
410 486
487static void zswap_cpu_comp_destroy(struct zswap_pool *pool)
488{
489 unsigned long cpu;
490
491 cpu_notifier_register_begin();
492 for_each_online_cpu(cpu)
493 __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu);
494 __unregister_cpu_notifier(&pool->notifier);
495 cpu_notifier_register_done();
496}
497
411/********************************* 498/*********************************
412* helpers 499* pool functions
413**********************************/ 500**********************************/
414static bool zswap_is_full(void) 501
502static struct zswap_pool *__zswap_pool_current(void)
415{ 503{
416 return totalram_pages * zswap_max_pool_percent / 100 < 504 struct zswap_pool *pool;
417 DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); 505
506 pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
507 WARN_ON(!pool);
508
509 return pool;
510}
511
512static struct zswap_pool *zswap_pool_current(void)
513{
514 assert_spin_locked(&zswap_pools_lock);
515
516 return __zswap_pool_current();
517}
518
519static struct zswap_pool *zswap_pool_current_get(void)
520{
521 struct zswap_pool *pool;
522
523 rcu_read_lock();
524
525 pool = __zswap_pool_current();
526 if (!pool || !zswap_pool_get(pool))
527 pool = NULL;
528
529 rcu_read_unlock();
530
531 return pool;
532}
533
534static struct zswap_pool *zswap_pool_last_get(void)
535{
536 struct zswap_pool *pool, *last = NULL;
537
538 rcu_read_lock();
539
540 list_for_each_entry_rcu(pool, &zswap_pools, list)
541 last = pool;
542 if (!WARN_ON(!last) && !zswap_pool_get(last))
543 last = NULL;
544
545 rcu_read_unlock();
546
547 return last;
548}
549
550static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
551{
552 struct zswap_pool *pool;
553
554 assert_spin_locked(&zswap_pools_lock);
555
556 list_for_each_entry_rcu(pool, &zswap_pools, list) {
557 if (strncmp(pool->tfm_name, compressor, sizeof(pool->tfm_name)))
558 continue;
559 if (strncmp(zpool_get_type(pool->zpool), type,
560 sizeof(zswap_zpool_type)))
561 continue;
562 /* if we can't get it, it's about to be destroyed */
563 if (!zswap_pool_get(pool))
564 continue;
565 return pool;
566 }
567
568 return NULL;
569}
570
571static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
572{
573 struct zswap_pool *pool;
574 gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
575
576 pool = kzalloc(sizeof(*pool), GFP_KERNEL);
577 if (!pool) {
578 pr_err("pool alloc failed\n");
579 return NULL;
580 }
581
582 pool->zpool = zpool_create_pool(type, "zswap", gfp, &zswap_zpool_ops);
583 if (!pool->zpool) {
584 pr_err("%s zpool not available\n", type);
585 goto error;
586 }
587 pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
588
589 strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
590 pool->tfm = alloc_percpu(struct crypto_comp *);
591 if (!pool->tfm) {
592 pr_err("percpu alloc failed\n");
593 goto error;
594 }
595
596 if (zswap_cpu_comp_init(pool))
597 goto error;
598 pr_debug("using %s compressor\n", pool->tfm_name);
599
600 /* being the current pool takes 1 ref; this func expects the
601 * caller to always add the new pool as the current pool
602 */
603 kref_init(&pool->kref);
604 INIT_LIST_HEAD(&pool->list);
605
606 zswap_pool_debug("created", pool);
607
608 return pool;
609
610error:
611 free_percpu(pool->tfm);
612 if (pool->zpool)
613 zpool_destroy_pool(pool->zpool);
614 kfree(pool);
615 return NULL;
616}
617
618static struct zswap_pool *__zswap_pool_create_fallback(void)
619{
620 if (!crypto_has_comp(zswap_compressor, 0, 0)) {
621 pr_err("compressor %s not available, using default %s\n",
622 zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT);
623 strncpy(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT,
624 sizeof(zswap_compressor));
625 }
626 if (!zpool_has_pool(zswap_zpool_type)) {
627 pr_err("zpool %s not available, using default %s\n",
628 zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT);
629 strncpy(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT,
630 sizeof(zswap_zpool_type));
631 }
632
633 return zswap_pool_create(zswap_zpool_type, zswap_compressor);
634}
635
636static void zswap_pool_destroy(struct zswap_pool *pool)
637{
638 zswap_pool_debug("destroying", pool);
639
640 zswap_cpu_comp_destroy(pool);
641 free_percpu(pool->tfm);
642 zpool_destroy_pool(pool->zpool);
643 kfree(pool);
644}
645
646static int __must_check zswap_pool_get(struct zswap_pool *pool)
647{
648 return kref_get_unless_zero(&pool->kref);
649}
650
651static void __zswap_pool_release(struct rcu_head *head)
652{
653 struct zswap_pool *pool = container_of(head, typeof(*pool), rcu_head);
654
655 /* nobody should have been able to get a kref... */
656 WARN_ON(kref_get_unless_zero(&pool->kref));
657
658 /* pool is now off zswap_pools list and has no references. */
659 zswap_pool_destroy(pool);
660}
661
662static void __zswap_pool_empty(struct kref *kref)
663{
664 struct zswap_pool *pool;
665
666 pool = container_of(kref, typeof(*pool), kref);
667
668 spin_lock(&zswap_pools_lock);
669
670 WARN_ON(pool == zswap_pool_current());
671
672 list_del_rcu(&pool->list);
673 call_rcu(&pool->rcu_head, __zswap_pool_release);
674
675 spin_unlock(&zswap_pools_lock);
676}
677
678static void zswap_pool_put(struct zswap_pool *pool)
679{
680 kref_put(&pool->kref, __zswap_pool_empty);
681}
682
683/*********************************
684* param callbacks
685**********************************/
686
687static int __zswap_param_set(const char *val, const struct kernel_param *kp,
688 char *type, char *compressor)
689{
690 struct zswap_pool *pool, *put_pool = NULL;
691 char str[kp->str->maxlen], *s;
692 int ret;
693
694 /*
695 * kp is either zswap_zpool_kparam or zswap_compressor_kparam, defined
696 * at the top of this file, so maxlen is CRYPTO_MAX_ALG_NAME (64) or
697 * 32 (arbitrary).
698 */
699 strlcpy(str, val, kp->str->maxlen);
700 s = strim(str);
701
702 /* if this is load-time (pre-init) param setting,
703 * don't create a pool; that's done during init.
704 */
705 if (!zswap_init_started)
706 return param_set_copystring(s, kp);
707
708 /* no change required */
709 if (!strncmp(kp->str->string, s, kp->str->maxlen))
710 return 0;
711
712 if (!type) {
713 type = s;
714 if (!zpool_has_pool(type)) {
715 pr_err("zpool %s not available\n", type);
716 return -ENOENT;
717 }
718 } else if (!compressor) {
719 compressor = s;
720 if (!crypto_has_comp(compressor, 0, 0)) {
721 pr_err("compressor %s not available\n", compressor);
722 return -ENOENT;
723 }
724 }
725
726 spin_lock(&zswap_pools_lock);
727
728 pool = zswap_pool_find_get(type, compressor);
729 if (pool) {
730 zswap_pool_debug("using existing", pool);
731 list_del_rcu(&pool->list);
732 } else {
733 spin_unlock(&zswap_pools_lock);
734 pool = zswap_pool_create(type, compressor);
735 spin_lock(&zswap_pools_lock);
736 }
737
738 if (pool)
739 ret = param_set_copystring(s, kp);
740 else
741 ret = -EINVAL;
742
743 if (!ret) {
744 put_pool = zswap_pool_current();
745 list_add_rcu(&pool->list, &zswap_pools);
746 } else if (pool) {
747 /* add the possibly pre-existing pool to the end of the pools
748 * list; if it's new (and empty) then it'll be removed and
749 * destroyed by the put after we drop the lock
750 */
751 list_add_tail_rcu(&pool->list, &zswap_pools);
752 put_pool = pool;
753 }
754
755 spin_unlock(&zswap_pools_lock);
756
757 /* drop the ref from either the old current pool,
758 * or the new pool we failed to add
759 */
760 if (put_pool)
761 zswap_pool_put(put_pool);
762
763 return ret;
764}
765
766static int zswap_compressor_param_set(const char *val,
767 const struct kernel_param *kp)
768{
769 return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
770}
771
772static int zswap_zpool_param_set(const char *val,
773 const struct kernel_param *kp)
774{
775 return __zswap_param_set(val, kp, NULL, zswap_compressor);
418} 776}
419 777
420/********************************* 778/*********************************
@@ -477,6 +835,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
477 pgoff_t offset; 835 pgoff_t offset;
478 struct zswap_entry *entry; 836 struct zswap_entry *entry;
479 struct page *page; 837 struct page *page;
838 struct crypto_comp *tfm;
480 u8 *src, *dst; 839 u8 *src, *dst;
481 unsigned int dlen; 840 unsigned int dlen;
482 int ret; 841 int ret;
@@ -517,13 +876,15 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
517 case ZSWAP_SWAPCACHE_NEW: /* page is locked */ 876 case ZSWAP_SWAPCACHE_NEW: /* page is locked */
518 /* decompress */ 877 /* decompress */
519 dlen = PAGE_SIZE; 878 dlen = PAGE_SIZE;
520 src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, 879 src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
521 ZPOOL_MM_RO) + sizeof(struct zswap_header); 880 ZPOOL_MM_RO) + sizeof(struct zswap_header);
522 dst = kmap_atomic(page); 881 dst = kmap_atomic(page);
523 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, 882 tfm = *get_cpu_ptr(entry->pool->tfm);
524 entry->length, dst, &dlen); 883 ret = crypto_comp_decompress(tfm, src, entry->length,
884 dst, &dlen);
885 put_cpu_ptr(entry->pool->tfm);
525 kunmap_atomic(dst); 886 kunmap_atomic(dst);
526 zpool_unmap_handle(zswap_pool, entry->handle); 887 zpool_unmap_handle(entry->pool->zpool, entry->handle);
527 BUG_ON(ret); 888 BUG_ON(ret);
528 BUG_ON(dlen != PAGE_SIZE); 889 BUG_ON(dlen != PAGE_SIZE);
529 890
@@ -572,6 +933,22 @@ end:
572 return ret; 933 return ret;
573} 934}
574 935
936static int zswap_shrink(void)
937{
938 struct zswap_pool *pool;
939 int ret;
940
941 pool = zswap_pool_last_get();
942 if (!pool)
943 return -ENOENT;
944
945 ret = zpool_shrink(pool->zpool, 1, NULL);
946
947 zswap_pool_put(pool);
948
949 return ret;
950}
951
575/********************************* 952/*********************************
576* frontswap hooks 953* frontswap hooks
577**********************************/ 954**********************************/
@@ -581,6 +958,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
581{ 958{
582 struct zswap_tree *tree = zswap_trees[type]; 959 struct zswap_tree *tree = zswap_trees[type];
583 struct zswap_entry *entry, *dupentry; 960 struct zswap_entry *entry, *dupentry;
961 struct crypto_comp *tfm;
584 int ret; 962 int ret;
585 unsigned int dlen = PAGE_SIZE, len; 963 unsigned int dlen = PAGE_SIZE, len;
586 unsigned long handle; 964 unsigned long handle;
@@ -596,7 +974,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
596 /* reclaim space if needed */ 974 /* reclaim space if needed */
597 if (zswap_is_full()) { 975 if (zswap_is_full()) {
598 zswap_pool_limit_hit++; 976 zswap_pool_limit_hit++;
599 if (zpool_shrink(zswap_pool, 1, NULL)) { 977 if (zswap_shrink()) {
600 zswap_reject_reclaim_fail++; 978 zswap_reject_reclaim_fail++;
601 ret = -ENOMEM; 979 ret = -ENOMEM;
602 goto reject; 980 goto reject;
@@ -611,33 +989,42 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
611 goto reject; 989 goto reject;
612 } 990 }
613 991
992 /* if entry is successfully added, it keeps the reference */
993 entry->pool = zswap_pool_current_get();
994 if (!entry->pool) {
995 ret = -EINVAL;
996 goto freepage;
997 }
998
614 /* compress */ 999 /* compress */
615 dst = get_cpu_var(zswap_dstmem); 1000 dst = get_cpu_var(zswap_dstmem);
1001 tfm = *get_cpu_ptr(entry->pool->tfm);
616 src = kmap_atomic(page); 1002 src = kmap_atomic(page);
617 ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen); 1003 ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen);
618 kunmap_atomic(src); 1004 kunmap_atomic(src);
1005 put_cpu_ptr(entry->pool->tfm);
619 if (ret) { 1006 if (ret) {
620 ret = -EINVAL; 1007 ret = -EINVAL;
621 goto freepage; 1008 goto put_dstmem;
622 } 1009 }
623 1010
624 /* store */ 1011 /* store */
625 len = dlen + sizeof(struct zswap_header); 1012 len = dlen + sizeof(struct zswap_header);
626 ret = zpool_malloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, 1013 ret = zpool_malloc(entry->pool->zpool, len,
627 &handle); 1014 __GFP_NORETRY | __GFP_NOWARN, &handle);
628 if (ret == -ENOSPC) { 1015 if (ret == -ENOSPC) {
629 zswap_reject_compress_poor++; 1016 zswap_reject_compress_poor++;
630 goto freepage; 1017 goto put_dstmem;
631 } 1018 }
632 if (ret) { 1019 if (ret) {
633 zswap_reject_alloc_fail++; 1020 zswap_reject_alloc_fail++;
634 goto freepage; 1021 goto put_dstmem;
635 } 1022 }
636 zhdr = zpool_map_handle(zswap_pool, handle, ZPOOL_MM_RW); 1023 zhdr = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW);
637 zhdr->swpentry = swp_entry(type, offset); 1024 zhdr->swpentry = swp_entry(type, offset);
638 buf = (u8 *)(zhdr + 1); 1025 buf = (u8 *)(zhdr + 1);
639 memcpy(buf, dst, dlen); 1026 memcpy(buf, dst, dlen);
640 zpool_unmap_handle(zswap_pool, handle); 1027 zpool_unmap_handle(entry->pool->zpool, handle);
641 put_cpu_var(zswap_dstmem); 1028 put_cpu_var(zswap_dstmem);
642 1029
643 /* populate entry */ 1030 /* populate entry */
@@ -660,12 +1047,14 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
660 1047
661 /* update stats */ 1048 /* update stats */
662 atomic_inc(&zswap_stored_pages); 1049 atomic_inc(&zswap_stored_pages);
663 zswap_pool_total_size = zpool_get_total_size(zswap_pool); 1050 zswap_update_total_size();
664 1051
665 return 0; 1052 return 0;
666 1053
667freepage: 1054put_dstmem:
668 put_cpu_var(zswap_dstmem); 1055 put_cpu_var(zswap_dstmem);
1056 zswap_pool_put(entry->pool);
1057freepage:
669 zswap_entry_cache_free(entry); 1058 zswap_entry_cache_free(entry);
670reject: 1059reject:
671 return ret; 1060 return ret;
@@ -680,6 +1069,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
680{ 1069{
681 struct zswap_tree *tree = zswap_trees[type]; 1070 struct zswap_tree *tree = zswap_trees[type];
682 struct zswap_entry *entry; 1071 struct zswap_entry *entry;
1072 struct crypto_comp *tfm;
683 u8 *src, *dst; 1073 u8 *src, *dst;
684 unsigned int dlen; 1074 unsigned int dlen;
685 int ret; 1075 int ret;
@@ -696,13 +1086,14 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
696 1086
697 /* decompress */ 1087 /* decompress */
698 dlen = PAGE_SIZE; 1088 dlen = PAGE_SIZE;
699 src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, 1089 src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
700 ZPOOL_MM_RO) + sizeof(struct zswap_header); 1090 ZPOOL_MM_RO) + sizeof(struct zswap_header);
701 dst = kmap_atomic(page); 1091 dst = kmap_atomic(page);
702 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, 1092 tfm = *get_cpu_ptr(entry->pool->tfm);
703 dst, &dlen); 1093 ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
1094 put_cpu_ptr(entry->pool->tfm);
704 kunmap_atomic(dst); 1095 kunmap_atomic(dst);
705 zpool_unmap_handle(zswap_pool, entry->handle); 1096 zpool_unmap_handle(entry->pool->zpool, entry->handle);
706 BUG_ON(ret); 1097 BUG_ON(ret);
707 1098
708 spin_lock(&tree->lock); 1099 spin_lock(&tree->lock);
@@ -755,10 +1146,6 @@ static void zswap_frontswap_invalidate_area(unsigned type)
755 zswap_trees[type] = NULL; 1146 zswap_trees[type] = NULL;
756} 1147}
757 1148
758static const struct zpool_ops zswap_zpool_ops = {
759 .evict = zswap_writeback_entry
760};
761
762static void zswap_frontswap_init(unsigned type) 1149static void zswap_frontswap_init(unsigned type)
763{ 1150{
764 struct zswap_tree *tree; 1151 struct zswap_tree *tree;
@@ -839,49 +1226,40 @@ static void __exit zswap_debugfs_exit(void) { }
839**********************************/ 1226**********************************/
840static int __init init_zswap(void) 1227static int __init init_zswap(void)
841{ 1228{
842 gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; 1229 struct zswap_pool *pool;
843 1230
844 pr_info("loading zswap\n"); 1231 zswap_init_started = true;
845
846 zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
847 &zswap_zpool_ops);
848 if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
849 pr_info("%s zpool not available\n", zswap_zpool_type);
850 zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
851 zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
852 &zswap_zpool_ops);
853 }
854 if (!zswap_pool) {
855 pr_err("%s zpool not available\n", zswap_zpool_type);
856 pr_err("zpool creation failed\n");
857 goto error;
858 }
859 pr_info("using %s pool\n", zswap_zpool_type);
860 1232
861 if (zswap_entry_cache_create()) { 1233 if (zswap_entry_cache_create()) {
862 pr_err("entry cache creation failed\n"); 1234 pr_err("entry cache creation failed\n");
863 goto cachefail; 1235 goto cache_fail;
864 } 1236 }
865 if (zswap_comp_init()) { 1237
866 pr_err("compressor initialization failed\n"); 1238 if (zswap_cpu_dstmem_init()) {
867 goto compfail; 1239 pr_err("dstmem alloc failed\n");
1240 goto dstmem_fail;
868 } 1241 }
869 if (zswap_cpu_init()) { 1242
870 pr_err("per-cpu initialization failed\n"); 1243 pool = __zswap_pool_create_fallback();
871 goto pcpufail; 1244 if (!pool) {
1245 pr_err("pool creation failed\n");
1246 goto pool_fail;
872 } 1247 }
1248 pr_info("loaded using pool %s/%s\n", pool->tfm_name,
1249 zpool_get_type(pool->zpool));
1250
1251 list_add(&pool->list, &zswap_pools);
873 1252
874 frontswap_register_ops(&zswap_frontswap_ops); 1253 frontswap_register_ops(&zswap_frontswap_ops);
875 if (zswap_debugfs_init()) 1254 if (zswap_debugfs_init())
876 pr_warn("debugfs initialization failed\n"); 1255 pr_warn("debugfs initialization failed\n");
877 return 0; 1256 return 0;
878pcpufail: 1257
879 zswap_comp_exit(); 1258pool_fail:
880compfail: 1259 zswap_cpu_dstmem_destroy();
1260dstmem_fail:
881 zswap_entry_cache_destroy(); 1261 zswap_entry_cache_destroy();
882cachefail: 1262cache_fail:
883 zpool_destroy_pool(zswap_pool);
884error:
885 return -ENOMEM; 1263 return -ENOMEM;
886} 1264}
887/* must be late so crypto has time to come up */ 1265/* must be late so crypto has time to come up */
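
Because the compressor and zpool parameters are now writable (mode 0644) and handled by callbacks, both can be changed at runtime: zswap creates (or reuses) a zswap_pool for the new combination, makes it the current pool, and keeps old pools around until their last stored page is freed. A small user-space sketch that switches the compressor through sysfs; "lz4" is only an example and must name a crypto compressor that is actually available, otherwise the write fails with ENOENT.

#include <stdio.h>

int main(void)
{
	const char *param = "/sys/module/zswap/parameters/compressor";
	FILE *f = fopen(param, "w");	/* needs root */

	if (!f) {
		perror(param);
		return 1;
	}
	/* the param callback validates the name and swaps the current pool */
	if (fputs("lz4\n", f) == EOF || fclose(f) == EOF) {
		perror(param);
		return 1;
	}
	return 0;
}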
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index a51ca0e5beef..f2a1131b2f8b 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -264,6 +264,7 @@ our $Sparse = qr{
264 __kernel| 264 __kernel|
265 __force| 265 __force|
266 __iomem| 266 __iomem|
267 __pmem|
267 __must_check| 268 __must_check|
268 __init_refok| 269 __init_refok|
269 __kprobes| 270 __kprobes|
@@ -584,7 +585,7 @@ our $LvalOrFunc = qr{((?:[\&\*]\s*)?$Lval)\s*($balanced_parens{0,1})\s*};
584our $FuncArg = qr{$Typecast{0,1}($LvalOrFunc|$Constant|$String)}; 585our $FuncArg = qr{$Typecast{0,1}($LvalOrFunc|$Constant|$String)};
585 586
586our $declaration_macros = qr{(?x: 587our $declaration_macros = qr{(?x:
587 (?:$Storage\s+)?(?:[A-Z_][A-Z0-9]*_){0,2}(?:DEFINE|DECLARE)(?:_[A-Z0-9]+){1,2}\s*\(| 588 (?:$Storage\s+)?(?:[A-Z_][A-Z0-9]*_){0,2}(?:DEFINE|DECLARE)(?:_[A-Z0-9]+){1,6}\s*\(|
588 (?:$Storage\s+)?LIST_HEAD\s*\(| 589 (?:$Storage\s+)?LIST_HEAD\s*\(|
589 (?:$Storage\s+)?${Type}\s+uninitialized_var\s*\( 590 (?:$Storage\s+)?${Type}\s+uninitialized_var\s*\(
590)}; 591)};
@@ -1953,9 +1954,9 @@ sub process {
1953 our $clean = 1; 1954 our $clean = 1;
1954 my $signoff = 0; 1955 my $signoff = 0;
1955 my $is_patch = 0; 1956 my $is_patch = 0;
1956
1957 my $in_header_lines = $file ? 0 : 1; 1957 my $in_header_lines = $file ? 0 : 1;
1958 my $in_commit_log = 0; #Scanning lines before patch 1958 my $in_commit_log = 0; #Scanning lines before patch
1959 my $commit_log_possible_stack_dump = 0;
1959 my $commit_log_long_line = 0; 1960 my $commit_log_long_line = 0;
1960 my $commit_log_has_diff = 0; 1961 my $commit_log_has_diff = 0;
1961 my $reported_maintainer_file = 0; 1962 my $reported_maintainer_file = 0;
@@ -2166,11 +2167,15 @@ sub process {
2166 if ($showfile) { 2167 if ($showfile) {
2167 $prefix = "$realfile:$realline: " 2168 $prefix = "$realfile:$realline: "
2168 } elsif ($emacs) { 2169 } elsif ($emacs) {
2169 $prefix = "$filename:$linenr: "; 2170 if ($file) {
2171 $prefix = "$filename:$realline: ";
2172 } else {
2173 $prefix = "$filename:$linenr: ";
2174 }
2170 } 2175 }
2171 2176
2172 if ($found_file) { 2177 if ($found_file) {
2173 if ($realfile =~ m@^(drivers/net/|net/)@) { 2178 if ($realfile =~ m@^(?:drivers/net/|net/|drivers/staging/)@) {
2174 $check = 1; 2179 $check = 1;
2175 } else { 2180 } else {
2176 $check = $check_orig; 2181 $check = $check_orig;
@@ -2310,16 +2315,42 @@ sub process {
2310 2315
2311# Check for line lengths > 75 in commit log, warn once 2316# Check for line lengths > 75 in commit log, warn once
2312 if ($in_commit_log && !$commit_log_long_line && 2317 if ($in_commit_log && !$commit_log_long_line &&
2313 length($line) > 75) { 2318 length($line) > 75 &&
2319 !($line =~ /^\s*[a-zA-Z0-9_\/\.]+\s+\|\s+\d+/ ||
2320 # file delta changes
2321 $line =~ /^\s*(?:[\w\.\-]+\/)++[\w\.\-]+:/ ||
2322 # filename then :
2323 $line =~ /^\s*(?:Fixes:|Link:)/i ||
2324 # A Fixes: or Link: line
2325 $commit_log_possible_stack_dump)) {
2314 WARN("COMMIT_LOG_LONG_LINE", 2326 WARN("COMMIT_LOG_LONG_LINE",
2315 "Possible unwrapped commit description (prefer a maximum 75 chars per line)\n" . $herecurr); 2327 "Possible unwrapped commit description (prefer a maximum 75 chars per line)\n" . $herecurr);
2316 $commit_log_long_line = 1; 2328 $commit_log_long_line = 1;
2317 } 2329 }
2318 2330
2331# Check if the commit log is in a possible stack dump
2332 if ($in_commit_log && !$commit_log_possible_stack_dump &&
2333 ($line =~ /^\s*(?:WARNING:|BUG:)/ ||
2334 $line =~ /^\s*\[\s*\d+\.\d{6,6}\s*\]/ ||
2335 # timestamp
2336 $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/)) {
2337 # stack dump address
2338 $commit_log_possible_stack_dump = 1;
2339 }
2340
2341# Reset possible stack dump if a blank line is found
2342 if ($in_commit_log && $commit_log_possible_stack_dump &&
2343 $line =~ /^\s*$/) {
2344 $commit_log_possible_stack_dump = 0;
2345 }
2346
2319# Check for git id commit length and improperly formed commit descriptions 2347# Check for git id commit length and improperly formed commit descriptions
2320 if ($in_commit_log && $line =~ /\b(c)ommit\s+([0-9a-f]{5,})/i) { 2348 if ($in_commit_log &&
2321 my $init_char = $1; 2349 ($line =~ /\bcommit\s+[0-9a-f]{5,}\b/i ||
2322 my $orig_commit = lc($2); 2350 ($line =~ /\b[0-9a-f]{12,40}\b/i &&
2351 $line !~ /\bfixes:\s*[0-9a-f]{12,40}/i))) {
2352 my $init_char = "c";
2353 my $orig_commit = "";
2323 my $short = 1; 2354 my $short = 1;
2324 my $long = 0; 2355 my $long = 0;
2325 my $case = 1; 2356 my $case = 1;
@@ -2330,6 +2361,13 @@ sub process {
2330 my $orig_desc = "commit description"; 2361 my $orig_desc = "commit description";
2331 my $description = ""; 2362 my $description = "";
2332 2363
2364 if ($line =~ /\b(c)ommit\s+([0-9a-f]{5,})\b/i) {
2365 $init_char = $1;
2366 $orig_commit = lc($2);
2367 } elsif ($line =~ /\b([0-9a-f]{12,40})\b/i) {
2368 $orig_commit = lc($1);
2369 }
2370
2333 $short = 0 if ($line =~ /\bcommit\s+[0-9a-f]{12,40}/i); 2371 $short = 0 if ($line =~ /\bcommit\s+[0-9a-f]{12,40}/i);
2334 $long = 1 if ($line =~ /\bcommit\s+[0-9a-f]{41,}/i); 2372 $long = 1 if ($line =~ /\bcommit\s+[0-9a-f]{41,}/i);
2335 $space = 0 if ($line =~ /\bcommit [0-9a-f]/i); 2373 $space = 0 if ($line =~ /\bcommit [0-9a-f]/i);
@@ -2738,6 +2776,8 @@ sub process {
2738 } 2776 }
2739 } 2777 }
2740 2778
2779# Block comment styles
2780# Networking with an initial /*
2741 if ($realfile =~ m@^(drivers/net/|net/)@ && 2781 if ($realfile =~ m@^(drivers/net/|net/)@ &&
2742 $prevrawline =~ /^\+[ \t]*\/\*[ \t]*$/ && 2782 $prevrawline =~ /^\+[ \t]*\/\*[ \t]*$/ &&
2743 $rawline =~ /^\+[ \t]*\*/ && 2783 $rawline =~ /^\+[ \t]*\*/ &&
@@ -2746,22 +2786,23 @@ sub process {
2746 "networking block comments don't use an empty /* line, use /* Comment...\n" . $hereprev); 2786 "networking block comments don't use an empty /* line, use /* Comment...\n" . $hereprev);
2747 } 2787 }
2748 2788
2749 if ($realfile =~ m@^(drivers/net/|net/)@ && 2789# Block comments use * on subsequent lines
2750 $prevrawline =~ /^\+[ \t]*\/\*/ && #starting /* 2790 if ($prevline =~ /$;[ \t]*$/ && #ends in comment
2791 $prevrawline =~ /^\+.*?\/\*/ && #starting /*
2751 $prevrawline !~ /\*\/[ \t]*$/ && #no trailing */ 2792 $prevrawline !~ /\*\/[ \t]*$/ && #no trailing */
2752 $rawline =~ /^\+/ && #line is new 2793 $rawline =~ /^\+/ && #line is new
2753 $rawline !~ /^\+[ \t]*\*/) { #no leading * 2794 $rawline !~ /^\+[ \t]*\*/) { #no leading *
2754 WARN("NETWORKING_BLOCK_COMMENT_STYLE", 2795 WARN("BLOCK_COMMENT_STYLE",
2755 "networking block comments start with * on subsequent lines\n" . $hereprev); 2796 "Block comments use * on subsequent lines\n" . $hereprev);
2756 } 2797 }
2757 2798
2758 if ($realfile =~ m@^(drivers/net/|net/)@ && 2799# Block comments use */ on trailing lines
2759 $rawline !~ m@^\+[ \t]*\*/[ \t]*$@ && #trailing */ 2800 if ($rawline !~ m@^\+[ \t]*\*/[ \t]*$@ && #trailing */
2760 $rawline !~ m@^\+.*/\*.*\*/[ \t]*$@ && #inline /*...*/ 2801 $rawline !~ m@^\+.*/\*.*\*/[ \t]*$@ && #inline /*...*/
2761 $rawline !~ m@^\+.*\*{2,}/[ \t]*$@ && #trailing **/ 2802 $rawline !~ m@^\+.*\*{2,}/[ \t]*$@ && #trailing **/
2762 $rawline =~ m@^\+[ \t]*.+\*\/[ \t]*$@) { #non blank */ 2803 $rawline =~ m@^\+[ \t]*.+\*\/[ \t]*$@) { #non blank */
2763 WARN("NETWORKING_BLOCK_COMMENT_STYLE", 2804 WARN("BLOCK_COMMENT_STYLE",
2764 "networking block comments put the trailing */ on a separate line\n" . $herecurr); 2805 "Block comments use a trailing */ on a separate line\n" . $herecurr);
2765 } 2806 }
2766 2807
2767# check for missing blank lines after struct/union declarations 2808# check for missing blank lines after struct/union declarations
@@ -3067,15 +3108,22 @@ sub process {
3067 3108
3068 substr($s, 0, length($c), ''); 3109 substr($s, 0, length($c), '');
3069 3110
3070 # Make sure we remove the line prefixes as we have 3111 # remove inline comments
3071 # none on the first line, and are going to readd them 3112 $s =~ s/$;/ /g;
3072 # where necessary. 3113 $c =~ s/$;/ /g;
3073 $s =~ s/\n./\n/gs;
3074 3114
3075 # Find out how long the conditional actually is. 3115 # Find out how long the conditional actually is.
3076 my @newlines = ($c =~ /\n/gs); 3116 my @newlines = ($c =~ /\n/gs);
3077 my $cond_lines = 1 + $#newlines; 3117 my $cond_lines = 1 + $#newlines;
3078 3118
3119 # Make sure we remove the line prefixes as we have
3120 # none on the first line, and are going to readd them
3121 # where necessary.
3122 $s =~ s/\n./\n/gs;
3123 while ($s =~ /\n\s+\\\n/) {
3124 $cond_lines += $s =~ s/\n\s+\\\n/\n/g;
3125 }
3126
3079 # We want to check the first line inside the block 3127 # We want to check the first line inside the block
3080 # starting at the end of the conditional, so remove: 3128 # starting at the end of the conditional, so remove:
3081 # 1) any blank line termination 3129 # 1) any blank line termination
@@ -3141,8 +3189,10 @@ sub process {
3141 3189
3142 #print "line<$line> prevline<$prevline> indent<$indent> sindent<$sindent> check<$check> continuation<$continuation> s<$s> cond_lines<$cond_lines> stat_real<$stat_real> stat<$stat>\n"; 3190 #print "line<$line> prevline<$prevline> indent<$indent> sindent<$sindent> check<$check> continuation<$continuation> s<$s> cond_lines<$cond_lines> stat_real<$stat_real> stat<$stat>\n";
3143 3191
3144 if ($check && (($sindent % 8) != 0 || 3192 if ($check && $s ne '' &&
3145 ($sindent <= $indent && $s ne ''))) { 3193 (($sindent % 8) != 0 ||
3194 ($sindent < $indent) ||
3195 ($sindent > $indent + 8))) {
3146 WARN("SUSPECT_CODE_INDENT", 3196 WARN("SUSPECT_CODE_INDENT",
3147 "suspect code indent for conditional statements ($indent, $sindent)\n" . $herecurr . "$stat_real\n"); 3197 "suspect code indent for conditional statements ($indent, $sindent)\n" . $herecurr . "$stat_real\n");
3148 } 3198 }
@@ -3439,13 +3489,15 @@ sub process {
3439 } 3489 }
3440 } 3490 }
3441 3491
3442# # no BUG() or BUG_ON() 3492# avoid BUG() or BUG_ON()
3443# if ($line =~ /\b(BUG|BUG_ON)\b/) { 3493 if ($line =~ /\b(?:BUG|BUG_ON)\b/) {
3444# print "Try to use WARN_ON & Recovery code rather than BUG() or BUG_ON()\n"; 3494 my $msg_type = \&WARN;
3445# print "$herecurr"; 3495 $msg_type = \&CHK if ($file);
3446# $clean = 0; 3496 &{$msg_type}("AVOID_BUG",
3447# } 3497 "Avoid crashing the kernel - try using WARN_ON & recovery code rather than BUG() or BUG_ON()\n" . $herecurr);
3498 }
3448 3499
3500# avoid LINUX_VERSION_CODE
3449 if ($line =~ /\bLINUX_VERSION_CODE\b/) { 3501 if ($line =~ /\bLINUX_VERSION_CODE\b/) {
3450 WARN("LINUX_VERSION_CODE", 3502 WARN("LINUX_VERSION_CODE",
3451 "LINUX_VERSION_CODE should be avoided, code should be for the version to which it is merged\n" . $herecurr); 3503 "LINUX_VERSION_CODE should be avoided, code should be for the version to which it is merged\n" . $herecurr);
@@ -3520,7 +3572,7 @@ sub process {
3520# function brace can't be on same line, except for #defines of do while, 3572# function brace can't be on same line, except for #defines of do while,
3521# or if closed on same line 3573# or if closed on same line
3522 if (($line=~/$Type\s*$Ident\(.*\).*\s*{/) and 3574 if (($line=~/$Type\s*$Ident\(.*\).*\s*{/) and
3523 !($line=~/\#\s*define.*do\s{/) and !($line=~/}/)) { 3575 !($line=~/\#\s*define.*do\s\{/) and !($line=~/}/)) {
3524 if (ERROR("OPEN_BRACE", 3576 if (ERROR("OPEN_BRACE",
3525 "open brace '{' following function declarations go on the next line\n" . $herecurr) && 3577 "open brace '{' following function declarations go on the next line\n" . $herecurr) &&
3526 $fix) { 3578 $fix) {
@@ -4032,8 +4084,8 @@ sub process {
4032## } 4084## }
4033 4085
4034#need space before brace following if, while, etc 4086#need space before brace following if, while, etc
4035 if (($line =~ /\(.*\){/ && $line !~ /\($Type\){/) || 4087 if (($line =~ /\(.*\)\{/ && $line !~ /\($Type\){/) ||
4036 $line =~ /do{/) { 4088 $line =~ /do\{/) {
4037 if (ERROR("SPACING", 4089 if (ERROR("SPACING",
4038 "space required before the open brace '{'\n" . $herecurr) && 4090 "space required before the open brace '{'\n" . $herecurr) &&
4039 $fix) { 4091 $fix) {
@@ -4179,6 +4231,35 @@ sub process {
 		}
 	}
 
+# comparisons with a constant or upper case identifier on the left
+#	avoid cases like "foo + BAR < baz"
+#	only fix matches surrounded by parentheses to avoid incorrect
+#	conversions like "FOO < baz() + 5" being "misfixed" to "baz() > FOO + 5"
+		if ($^V && $^V ge 5.10.0 &&
+		    $line =~ /^\+(.*)\b($Constant|[A-Z_][A-Z0-9_]*)\s*($Compare)\s*($LvalOrFunc)/) {
+			my $lead = $1;
+			my $const = $2;
+			my $comp = $3;
+			my $to = $4;
+			my $newcomp = $comp;
+			if ($lead !~ /$Operators\s*$/ &&
+			    $to !~ /^(?:Constant|[A-Z_][A-Z0-9_]*)$/ &&
+			    WARN("CONSTANT_COMPARISON",
+				 "Comparisons should place the constant on the right side of the test\n" . $herecurr) &&
+			    $fix) {
+				if ($comp eq "<") {
+					$newcomp = ">";
+				} elsif ($comp eq "<=") {
+					$newcomp = ">=";
+				} elsif ($comp eq ">") {
+					$newcomp = "<";
+				} elsif ($comp eq ">=") {
+					$newcomp = "<=";
+				}
+				$fixed[$fixlinenr] =~ s/\(\s*\Q$const\E\s*$Compare\s*\Q$to\E\s*\)/($to $newcomp $const)/;
+			}
+		}
+
 # Return of what appears to be an errno should normally be negative
 		if ($sline =~ /\breturn(?:\s*\(+\s*|\s+)(E[A-Z]+)(?:\s*\)+\s*|\s*)[;:,]/) {
 			my $name = $1;
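A small C sketch of what the new CONSTANT_COMPARISON check, and its --fix transformation of fully parenthesised tests, amounts to; MAX_QUEUE_LEN and queue_full() are invented names:

#define MAX_QUEUE_LEN	128

static int queue_full(int qlen)
{
	/* Warned: constant on the left -> "return (MAX_QUEUE_LEN < qlen);" */
	/* With --fix the operands are swapped and the operator mirrored:   */
	return (qlen > MAX_QUEUE_LEN);
}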
@@ -4480,7 +4561,7 @@ sub process {
 		    $dstat !~ /^for\s*$Constant$/ &&				# for (...)
 		    $dstat !~ /^for\s*$Constant\s+(?:$Ident|-?$Constant)$/ &&	# for (...) bar()
 		    $dstat !~ /^do\s*{/ &&					# do {...
-		    $dstat !~ /^\({/ &&						# ({...
+		    $dstat !~ /^\(\{/ &&					# ({...
 		    $ctx !~ /^.\s*#\s*define\s+TRACE_(?:SYSTEM|INCLUDE_FILE|INCLUDE_PATH)\b/)
 		{
 			$ctx =~ s/\n*$//;
@@ -4789,16 +4870,20 @@ sub process {
4789 "Consecutive strings are generally better as a single string\n" . $herecurr); 4870 "Consecutive strings are generally better as a single string\n" . $herecurr);
4790 } 4871 }
4791 4872
4792# check for %L{u,d,i} in strings 4873# check for %L{u,d,i} and 0x%[udi] in strings
4793 my $string; 4874 my $string;
4794 while ($line =~ /(?:^|")([X\t]*)(?:"|$)/g) { 4875 while ($line =~ /(?:^|")([X\t]*)(?:"|$)/g) {
4795 $string = substr($rawline, $-[1], $+[1] - $-[1]); 4876 $string = substr($rawline, $-[1], $+[1] - $-[1]);
4796 $string =~ s/%%/__/g; 4877 $string =~ s/%%/__/g;
4797 if ($string =~ /(?<!%)%L[udi]/) { 4878 if ($string =~ /(?<!%)%[\*\d\.\$]*L[udi]/) {
4798 WARN("PRINTF_L", 4879 WARN("PRINTF_L",
4799 "\%Ld/%Lu are not-standard C, use %lld/%llu\n" . $herecurr); 4880 "\%Ld/%Lu are not-standard C, use %lld/%llu\n" . $herecurr);
4800 last; 4881 last;
4801 } 4882 }
4883 if ($string =~ /0x%[\*\d\.\$\Llzth]*[udi]/) {
4884 ERROR("PRINTF_0xDECIMAL",
4885 "Prefixing 0x with decimal output is defective\n" . $herecurr);
4886 }
4802 } 4887 }
4803 4888
4804# check for line continuations in quoted strings with odd counts of " 4889# check for line continuations in quoted strings with odd counts of "
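Both string checks look at printk-style format strings. A sketch of what they now catch, with invented values; the replacements shown are the conventional ones:

#include <linux/kernel.h>
#include <linux/types.h>

static void report(u64 bytes, unsigned int flags)
{
	pr_info("copied %Lu bytes\n", bytes);	/* PRINTF_L: %Lu is non-standard, use %llu */
	pr_info("copied %llu bytes\n", bytes);

	pr_info("flags 0x%d\n", flags);		/* PRINTF_0xDECIMAL: decimal output behind a 0x prefix */
	pr_info("flags 0x%x\n", flags);
}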
@@ -4816,10 +4901,34 @@ sub process {
 
 # check for needless "if (<foo>) fn(<foo>)" uses
 		if ($prevline =~ /\bif\s*\(\s*($Lval)\s*\)/) {
-			my $expr = '\s*\(\s*' . quotemeta($1) . '\s*\)\s*;';
-			if ($line =~ /\b(kfree|usb_free_urb|debugfs_remove(?:_recursive)?)$expr/) {
-				WARN('NEEDLESS_IF',
-				     "$1(NULL) is safe and this check is probably not required\n" . $hereprev);
+			my $tested = quotemeta($1);
+			my $expr = '\s*\(\s*' . $tested . '\s*\)\s*;';
+			if ($line =~ /\b(kfree|usb_free_urb|debugfs_remove(?:_recursive)?|(?:kmem_cache|mempool|dma_pool)_destroy)$expr/) {
+				my $func = $1;
+				if (WARN('NEEDLESS_IF',
+					 "$func(NULL) is safe and this check is probably not required\n" . $hereprev) &&
+				    $fix) {
+					my $do_fix = 1;
+					my $leading_tabs = "";
+					my $new_leading_tabs = "";
+					if ($lines[$linenr - 2] =~ /^\+(\t*)if\s*\(\s*$tested\s*\)\s*$/) {
+						$leading_tabs = $1;
+					} else {
+						$do_fix = 0;
+					}
+					if ($lines[$linenr - 1] =~ /^\+(\t+)$func\s*\(\s*$tested\s*\)\s*;\s*$/) {
+						$new_leading_tabs = $1;
+						if (length($leading_tabs) + 1 ne length($new_leading_tabs)) {
+							$do_fix = 0;
+						}
+					} else {
+						$do_fix = 0;
+					}
+					if ($do_fix) {
+						fix_delete_line($fixlinenr - 1, $prevrawline);
+						$fixed[$fixlinenr] =~ s/^\+$new_leading_tabs/\+$leading_tabs/;
+					}
+				}
 			}
 		}
 
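The NEEDLESS_IF list now also covers kmem_cache_destroy(), mempool_destroy() and dma_pool_destroy(), and with --fix the redundant test is dropped when the if and the call sit on consecutive, conventionally indented lines. The C shape being simplified, with invented names:

#include <linux/slab.h>

struct widget_ctx {
	struct kmem_cache *slab;
};

static void widget_teardown(struct widget_ctx *ctx)
{
	/* Warned (and fixable):
	 *	if (ctx->slab)
	 *		kmem_cache_destroy(ctx->slab);
	 * kmem_cache_destroy(NULL), like kfree(NULL), is a no-op, so the test goes away:
	 */
	kmem_cache_destroy(ctx->slab);
	kfree(ctx);
}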
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 3d2201413028..5bed7716f8ab 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -472,7 +472,7 @@ static int sel_mmap_policy_fault(struct vm_area_struct *vma,
 	return 0;
 }
 
-static struct vm_operations_struct sel_mmap_policy_ops = {
+static const struct vm_operations_struct sel_mmap_policy_ops = {
 	.fault = sel_mmap_policy_fault,
 	.page_mkwrite = sel_mmap_policy_fault,
 };
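vma->vm_ops is a pointer to const, so an ops table that is only ever assigned there can live in read-only data; the same one-word change applies to any such table. A minimal sketch under that assumption, with a hypothetical driver (the fault handler signature matches the kernel version this diff targets):

#include <linux/fs.h>
#include <linux/mm.h>

static int widget_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	return VM_FAULT_SIGBUS;			/* placeholder: no backing pages in this sketch */
}

static const struct vm_operations_struct widget_vm_ops = {
	.fault = widget_vm_fault,
};

static int widget_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &widget_vm_ops;		/* const-qualified target, the assignment stays valid */
	return 0;
}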
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4662a8877f6c..a25a73147f71 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -397,6 +397,36 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 	return young;
 }
 
+static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
+					struct mm_struct *mm,
+					unsigned long start,
+					unsigned long end)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	int young, idx;
+
+	idx = srcu_read_lock(&kvm->srcu);
+	spin_lock(&kvm->mmu_lock);
+	/*
+	 * Even though we do not flush TLB, this will still adversely
+	 * affect performance on pre-Haswell Intel EPT, where there is
+	 * no EPT Access Bit to clear so that we have to tear down EPT
+	 * tables instead. If we find this unacceptable, we can always
+	 * add a parameter to kvm_age_hva so that it effectively doesn't
+	 * do anything on clear_young.
+	 *
+	 * Also note that currently we never issue secondary TLB flushes
+	 * from clear_young, leaving this job up to the regular system
+	 * cadence. If we find this inaccurate, we might come up with a
+	 * more sophisticated heuristic later.
+	 */
+	young = kvm_age_hva(kvm, start, end);
+	spin_unlock(&kvm->mmu_lock);
+	srcu_read_unlock(&kvm->srcu, idx);
+
+	return young;
+}
+
 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
 				       unsigned long address)
@@ -429,6 +459,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
+	.clear_young		= kvm_mmu_notifier_clear_young,
 	.test_young		= kvm_mmu_notifier_test_young,
 	.change_pte		= kvm_mmu_notifier_change_pte,
 	.release		= kvm_mmu_notifier_release,
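clear_young is the non-flushing counterpart of clear_flush_young, added so idle page tracking can age pages mapped by a secondary MMU without forcing TLB flushes. A minimal sketch of another consumer wiring up the new callback; the widget_* names are hypothetical and the body is a stub rather than real aging logic:

#include <linux/mmu_notifier.h>

/* Hypothetical secondary-MMU driver: clear "accessed" state for [start, end)
 * in its own page tables, deliberately without a secondary TLB flush. */
static int widget_mn_clear_young(struct mmu_notifier *mn, struct mm_struct *mm,
				 unsigned long start, unsigned long end)
{
	return 0;	/* stub: report "not young"; a real driver ages its mappings here */
}

static const struct mmu_notifier_ops widget_mn_ops = {
	.clear_young = widget_mn_clear_young,
};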