-rw-r--r--  Documentation/aoe/udev.txt | 2
-rw-r--r--  Documentation/block/cmdline-partition.txt | 39
-rw-r--r--  Documentation/devicetree/bindings/rtc/moxa,moxart-rtc.txt | 17
-rw-r--r--  Documentation/devicetree/bindings/rtc/rtc-omap.txt | 6
-rw-r--r--  Documentation/devicetree/bindings/rtc/rtc-palmas.txt | 33
-rw-r--r--  Documentation/filesystems/proc.txt | 19
-rw-r--r--  Documentation/filesystems/ramfs-rootfs-initramfs.txt | 4
-rw-r--r--  Documentation/sysctl/kernel.txt | 1
-rw-r--r--  Documentation/sysctl/vm.txt | 30
-rw-r--r--  Documentation/vm/hugetlbpage.txt | 25
-rw-r--r--  Documentation/vm/soft-dirty.txt | 7
-rw-r--r--  MAINTAINERS | 125
-rw-r--r--  arch/alpha/lib/csum_partial_copy.c | 5
-rw-r--r--  arch/arm/mm/hugetlbpage.c | 5
-rw-r--r--  arch/arm64/mm/hugetlbpage.c | 5
-rw-r--r--  arch/ia64/mm/hugetlbpage.c | 5
-rw-r--r--  arch/metag/mm/hugetlbpage.c | 5
-rw-r--r--  arch/mips/mm/hugetlbpage.c | 5
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c | 10
-rw-r--r--  arch/s390/Kconfig | 3
-rw-r--r--  arch/s390/include/asm/kprobes.h | 4
-rw-r--r--  arch/s390/include/asm/sclp.h | 1
-rw-r--r--  arch/s390/kernel/crash_dump.c | 219
-rw-r--r--  arch/s390/kernel/kprobes.c | 144
-rw-r--r--  arch/s390/mm/hugetlbpage.c | 5
-rw-r--r--  arch/sh/mm/hugetlbpage.c | 5
-rw-r--r--  arch/sparc/kernel/sys_sparc32.c | 12
-rw-r--r--  arch/sparc/mm/hugetlbpage.c | 5
-rw-r--r--  arch/tile/mm/hugetlbpage.c | 5
-rw-r--r--  arch/x86/include/asm/pgtable.h | 34
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 3
-rw-r--r--  arch/x86/include/asm/tlbflush.h | 37
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 2
-rw-r--r--  arch/x86/mm/hugetlbpage.c | 8
-rw-r--r--  arch/x86/mm/tlb.c | 14
-rw-r--r--  block/Kconfig | 6
-rw-r--r--  block/Makefile | 1
-rw-r--r--  block/blk-ioc.c | 2
-rw-r--r--  block/blk-sysfs.c | 2
-rw-r--r--  block/cmdline-parser.c | 250
-rw-r--r--  block/compat_ioctl.c | 2
-rw-r--r--  block/partitions/Kconfig | 7
-rw-r--r--  block/partitions/Makefile | 1
-rw-r--r--  block/partitions/check.c | 4
-rw-r--r--  block/partitions/cmdline.c | 99
-rw-r--r--  block/partitions/cmdline.h | 2
-rw-r--r--  block/partitions/efi.c | 171
-rw-r--r--  block/partitions/efi.h | 38
-rw-r--r--  drivers/block/aoe/aoe.h | 4
-rw-r--r--  drivers/block/aoe/aoeblk.c | 100
-rw-r--r--  drivers/block/aoe/aoecmd.c | 4
-rw-r--r--  drivers/block/aoe/aoedev.c | 10
-rw-r--r--  drivers/block/cciss.c | 7
-rw-r--r--  drivers/block/mg_disk.c | 2
-rw-r--r--  drivers/block/osdblk.c | 2
-rw-r--r--  drivers/block/pktcdvd.c | 278
-rw-r--r--  drivers/block/rbd.c | 2
-rw-r--r--  drivers/block/swim.c | 2
-rw-r--r--  drivers/block/xen-blkback/xenbus.c | 2
-rw-r--r--  drivers/char/tpm/tpm_tis.c | 60
-rw-r--r--  drivers/firmware/dmi_scan.c | 73
-rw-r--r--  drivers/firmware/google/gsmi.c | 2
-rw-r--r--  drivers/iommu/msm_iommu_dev.c | 2
-rw-r--r--  drivers/iommu/omap-iommu.c | 2
-rw-r--r--  drivers/memstick/core/Kconfig | 12
-rw-r--r--  drivers/memstick/core/Makefile | 2
-rw-r--r--  drivers/memstick/core/ms_block.c | 2385
-rw-r--r--  drivers/memstick/core/ms_block.h | 290
-rw-r--r--  drivers/memstick/host/rtsx_pci_ms.c | 2
-rw-r--r--  drivers/platform/x86/apple-gmux.c | 18
-rw-r--r--  drivers/pnp/driver.c | 13
-rw-r--r--  drivers/pps/clients/pps-gpio.c | 1
-rw-r--r--  drivers/rtc/Kconfig | 9
-rw-r--r--  drivers/rtc/Makefile | 1
-rw-r--r--  drivers/rtc/rtc-cmos.c | 24
-rw-r--r--  drivers/rtc/rtc-ds1511.c | 17
-rw-r--r--  drivers/rtc/rtc-ds1553.c | 13
-rw-r--r--  drivers/rtc/rtc-ds1742.c | 26
-rw-r--r--  drivers/rtc/rtc-ep93xx.c | 14
-rw-r--r--  drivers/rtc/rtc-hid-sensor-time.c | 22
-rw-r--r--  drivers/rtc/rtc-imxdi.c | 16
-rw-r--r--  drivers/rtc/rtc-lpc32xx.c | 24
-rw-r--r--  drivers/rtc/rtc-max77686.c | 4
-rw-r--r--  drivers/rtc/rtc-moxart.c | 330
-rw-r--r--  drivers/rtc/rtc-mv.c | 17
-rw-r--r--  drivers/rtc/rtc-mxc.c | 14
-rw-r--r--  drivers/rtc/rtc-nuc900.c | 2
-rw-r--r--  drivers/rtc/rtc-omap.c | 60
-rw-r--r--  drivers/rtc/rtc-palmas.c | 35
-rw-r--r--  drivers/rtc/rtc-pcf2127.c | 6
-rw-r--r--  drivers/rtc/rtc-sirfsoc.c | 13
-rw-r--r--  drivers/rtc/rtc-stk17ta8.c | 15
-rw-r--r--  drivers/rtc/rtc-tx4939.c | 14
-rw-r--r--  drivers/s390/char/zcore.c | 6
-rw-r--r--  drivers/video/acornfb.c | 266
-rw-r--r--  drivers/video/acornfb.h | 29
-rw-r--r--  drivers/w1/masters/mxc_w1.c | 2
-rw-r--r--  drivers/w1/w1.c | 12
-rw-r--r--  drivers/watchdog/hpwdt.c | 6
-rw-r--r--  fs/affs/file.c | 2
-rw-r--r--  fs/bio-integrity.c | 9
-rw-r--r--  fs/coredump.c | 5
-rw-r--r--  fs/eventpoll.c | 2
-rw-r--r--  fs/exec.c | 122
-rw-r--r--  fs/file_table.c | 3
-rw-r--r--  fs/fs-writeback.c | 12
-rw-r--r--  fs/fscache/page.c | 2
-rw-r--r--  fs/fuse/inode.c | 2
-rw-r--r--  fs/hfsplus/Kconfig | 18
-rw-r--r--  fs/hfsplus/Makefile | 2
-rw-r--r--  fs/hfsplus/acl.h | 30
-rw-r--r--  fs/hfsplus/dir.c | 4
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 1
-rw-r--r--  fs/hfsplus/inode.c | 11
-rw-r--r--  fs/hfsplus/posix_acl.c | 274
-rw-r--r--  fs/hfsplus/xattr.c | 62
-rw-r--r--  fs/hfsplus/xattr.h | 33
-rw-r--r--  fs/hfsplus/xattr_security.c | 13
-rw-r--r--  fs/namespace.c | 2
-rw-r--r--  fs/ocfs2/acl.c | 4
-rw-r--r--  fs/ocfs2/aops.c | 2
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 32
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 60
-rw-r--r--  fs/ocfs2/dlm/dlmast.c | 8
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 4
-rw-r--r--  fs/ocfs2/dlm/dlmconvert.c | 18
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 15
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 35
-rw-r--r--  fs/ocfs2/dlm/dlmlock.c | 9
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 18
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 13
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 19
-rw-r--r--  fs/ocfs2/dlm/dlmunlock.c | 4
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c | 3
-rw-r--r--  fs/ocfs2/extent_map.c | 11
-rw-r--r--  fs/ocfs2/file.c | 7
-rw-r--r--  fs/ocfs2/ioctl.c | 2
-rw-r--r--  fs/ocfs2/journal.c | 43
-rw-r--r--  fs/ocfs2/journal.h | 11
-rw-r--r--  fs/ocfs2/localalloc.c | 4
-rw-r--r--  fs/ocfs2/move_extents.c | 3
-rw-r--r--  fs/ocfs2/ocfs2_trace.h | 2
-rw-r--r--  fs/ocfs2/quota_global.c | 6
-rw-r--r--  fs/ocfs2/quota_local.c | 12
-rw-r--r--  fs/ocfs2/refcounttree.c | 10
-rw-r--r--  fs/ocfs2/xattr.c | 11
-rw-r--r--  fs/proc/fd.c | 2
-rw-r--r--  fs/proc/task_mmu.c | 50
-rw-r--r--  fs/proc/vmcore.c | 154
-rw-r--r--  fs/ramfs/inode.c | 26
-rw-r--r--  include/linux/backing-dev.h | 3
-rw-r--r--  include/linux/binfmts.h | 2
-rw-r--r--  include/linux/cmdline-parser.h | 43
-rw-r--r--  include/linux/compat.h | 1
-rw-r--r--  include/linux/crash_dump.h | 9
-rw-r--r--  include/linux/genalloc.h | 4
-rw-r--r--  include/linux/hugetlb.h | 25
-rw-r--r--  include/linux/init.h | 1
-rw-r--r--  include/linux/ipc_namespace.h | 2
-rw-r--r--  include/linux/kprobes.h | 34
-rw-r--r--  include/linux/lz4.h | 8
-rw-r--r--  include/linux/memblock.h | 2
-rw-r--r--  include/linux/mempolicy.h | 11
-rw-r--r--  include/linux/migrate.h | 5
-rw-r--r--  include/linux/mm.h | 32
-rw-r--r--  include/linux/mm_inline.h | 1
-rw-r--r--  include/linux/mmzone.h | 2
-rw-r--r--  include/linux/radix-tree.h | 1
-rw-r--r--  include/linux/ramfs.h | 2
-rw-r--r--  include/linux/rbtree.h | 22
-rw-r--r--  include/linux/sched.h | 8
-rw-r--r--  include/linux/smp.h | 79
-rw-r--r--  include/linux/swap.h | 52
-rw-r--r--  include/linux/syscalls.h | 1
-rw-r--r--  include/linux/vm_event_item.h | 6
-rw-r--r--  include/linux/vmstat.h | 4
-rw-r--r--  include/linux/writeback.h | 2
-rw-r--r--  include/trace/events/kmem.h | 10
-rw-r--r--  init/do_mounts.c | 45
-rw-r--r--  ipc/msg.c | 25
-rw-r--r--  ipc/namespace.c | 7
-rw-r--r--  ipc/sem.c | 24
-rw-r--r--  ipc/shm.c | 255
-rw-r--r--  ipc/util.c | 82
-rw-r--r--  ipc/util.h | 14
-rw-r--r--  kernel/extable.c | 2
-rw-r--r--  kernel/fork.c | 33
-rw-r--r--  kernel/kexec.c | 5
-rw-r--r--  kernel/kprobes.c | 95
-rw-r--r--  kernel/modsign_pubkey.c | 6
-rw-r--r--  kernel/panic.c | 8
-rw-r--r--  kernel/power/snapshot.c | 12
-rw-r--r--  kernel/ptrace.c | 2
-rw-r--r--  kernel/signal.c | 4
-rw-r--r--  kernel/smp.c | 16
-rw-r--r--  kernel/spinlock.c | 14
-rw-r--r--  kernel/sysctl.c | 2
-rw-r--r--  kernel/task_work.c | 40
-rw-r--r--  kernel/up.c | 58
-rw-r--r--  lib/Kconfig.debug | 2
-rw-r--r--  lib/crc32.c | 17
-rw-r--r--  lib/decompress_inflate.c | 2
-rw-r--r--  lib/genalloc.c | 22
-rw-r--r--  lib/lz4/lz4_decompress.c | 8
-rw-r--r--  lib/radix-tree.c | 41
-rw-r--r--  lib/rbtree.c | 40
-rw-r--r--  lib/rbtree_test.c | 12
-rw-r--r--  mm/backing-dev.c | 2
-rw-r--r--  mm/compaction.c | 3
-rw-r--r--  mm/filemap.c | 2
-rw-r--r--  mm/huge_memory.c | 10
-rw-r--r--  mm/hugetlb.c | 447
-rw-r--r--  mm/hwpoison-inject.c | 4
-rw-r--r--  mm/internal.h | 2
-rw-r--r--  mm/kmemleak.c | 2
-rw-r--r--  mm/ksm.c | 6
-rw-r--r--  mm/madvise.c | 33
-rw-r--r--  mm/memblock.c | 18
-rw-r--r--  mm/memcontrol.c | 17
-rw-r--r--  mm/memory-failure.c | 174
-rw-r--r--  mm/memory.c | 41
-rw-r--r--  mm/memory_hotplug.c | 112
-rw-r--r--  mm/mempolicy.c | 116
-rw-r--r--  mm/mempool.c | 2
-rw-r--r--  mm/migrate.c | 63
-rw-r--r--  mm/mlock.c | 316
-rw-r--r--  mm/mmap.c | 59
-rw-r--r--  mm/mremap.c | 5
-rw-r--r--  mm/page-writeback.c | 269
-rw-r--r--  mm/page_alloc.c | 308
-rw-r--r--  mm/page_isolation.c | 14
-rw-r--r--  mm/pgtable-generic.c | 24
-rw-r--r--  mm/readahead.c | 8
-rw-r--r--  mm/shmem.c | 6
-rw-r--r--  mm/slub.c | 8
-rw-r--r--  mm/sparse.c | 133
-rw-r--r--  mm/swap.c | 77
-rw-r--r--  mm/swap_state.c | 4
-rw-r--r--  mm/swapfile.c | 596
-rw-r--r--  mm/util.c | 5
-rw-r--r--  mm/vmalloc.c | 29
-rw-r--r--  mm/vmscan.c | 80
-rw-r--r--  mm/vmstat.c | 95
-rw-r--r--  mm/zbud.c | 4
-rw-r--r--  mm/zswap.c | 18
-rw-r--r--  net/socket.c | 50
-rwxr-xr-x  scripts/checkpatch.pl | 345
247 files changed, 8796 insertions(+), 2889 deletions(-)
diff --git a/Documentation/aoe/udev.txt b/Documentation/aoe/udev.txt
index 8686e789542e..1f06daf03f5b 100644
--- a/Documentation/aoe/udev.txt
+++ b/Documentation/aoe/udev.txt
@@ -23,4 +23,4 @@ SUBSYSTEM=="aoe", KERNEL=="revalidate", NAME="etherd/%k", GROUP="disk", MODE="02
 SUBSYSTEM=="aoe", KERNEL=="flush",	NAME="etherd/%k", GROUP="disk", MODE="0220"
 
 # aoe block devices
-KERNEL=="etherd*", NAME="%k", GROUP="disk"
+KERNEL=="etherd*", GROUP="disk"
diff --git a/Documentation/block/cmdline-partition.txt b/Documentation/block/cmdline-partition.txt
new file mode 100644
index 000000000000..2bbf4cc40c3f
--- /dev/null
+++ b/Documentation/block/cmdline-partition.txt
@@ -0,0 +1,39 @@
+Embedded device command line partition
+=====================================================================
+
+Read the block device partition table from the command line.
+This is used for fixed block devices (eMMC) on embedded devices.
+No MBR is needed, which saves storage space, and the bootloader can
+access data on the block device by absolute address.
+Users can easily change the partitioning.
+
+The format for the command line is just like mtdparts:
+
+blkdevparts=<blkdev-def>[;<blkdev-def>]
+  <blkdev-def> := <blkdev-id>:<partdef>[,<partdef>]
+    <partdef> := <size>[@<offset>](part-name)
+
+<blkdev-id>
+    block device disk name. Embedded devices use fixed block devices,
+    so the disk name is also fixed, such as: mmcblk0, mmcblk1, mmcblk0boot0.
+
+<size>
+    partition size, in bytes, such as: 512, 1m, 1G.
+
+<offset>
+    partition start address, in bytes.
+
+(part-name)
+    partition name. The kernel sends a uevent with "PARTNAME". An application
+    can create a link to the block device partition with the name "PARTNAME",
+    so user space can access the partition by partition name.
+
+Example:
+    eMMC disk names are "mmcblk0" and "mmcblk0boot0"
+
+  bootargs:
+    'blkdevparts=mmcblk0:1G(data0),1G(data1),-;mmcblk0boot0:1m(boot),-(kernel)'
+
+  dmesg:
+    mmcblk0: p1(data0) p2(data1) p3()
+    mmcblk0boot0: p1(boot) p2(kernel)
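
The blkdevparts= grammar above maps directly onto a small parser. The sketch below is illustrative only — it is not the kernel's parser in block/cmdline-parser.c, and the helper name parse_size is made up. It decodes a single <size>[@<offset>](part-name) token in user space:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* memparse-like helper: number with an optional k/m/g suffix */
static unsigned long long parse_size(const char *s, char **end)
{
	unsigned long long v = strtoull(s, end, 0);

	switch (**end) {
	case 'g': case 'G': v <<= 10; /* fall through */
	case 'm': case 'M': v <<= 10; /* fall through */
	case 'k': case 'K': v <<= 10; (*end)++; break;
	}
	return v;
}

int main(void)
{
	char def[] = "1G@0(data0)";	/* one <partdef> token */
	char *p = def, *name;
	unsigned long long size, offset = 0;

	if (*p == '-') {		/* '-' means "use the remaining space" */
		size = ~0ULL;
		p++;
	} else {
		size = parse_size(p, &p);
	}
	if (*p == '@')
		offset = parse_size(p + 1, &p);
	if (*p == '(' && (name = strtok(p + 1, ")")))
		printf("name=%s size=%llu offset=%llu\n", name, size, offset);
	return 0;
}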
diff --git a/Documentation/devicetree/bindings/rtc/moxa,moxart-rtc.txt b/Documentation/devicetree/bindings/rtc/moxa,moxart-rtc.txt
new file mode 100644
index 000000000000..c9d3ac1477fe
--- /dev/null
+++ b/Documentation/devicetree/bindings/rtc/moxa,moxart-rtc.txt
@@ -0,0 +1,17 @@
+MOXA ART real-time clock
+
+Required properties:
+
+- compatible : Should be "moxa,moxart-rtc"
+- gpio-rtc-sclk : RTC sclk gpio, with zero flags
+- gpio-rtc-data : RTC data gpio, with zero flags
+- gpio-rtc-reset : RTC reset gpio, with zero flags
+
+Example:
+
+	rtc: rtc {
+		compatible = "moxa,moxart-rtc";
+		gpio-rtc-sclk = <&gpio 5 0>;
+		gpio-rtc-data = <&gpio 6 0>;
+		gpio-rtc-reset = <&gpio 7 0>;
+	};
diff --git a/Documentation/devicetree/bindings/rtc/rtc-omap.txt b/Documentation/devicetree/bindings/rtc/rtc-omap.txt
index b47aa415c820..5a0f02d34d95 100644
--- a/Documentation/devicetree/bindings/rtc/rtc-omap.txt
+++ b/Documentation/devicetree/bindings/rtc/rtc-omap.txt
@@ -1,7 +1,11 @@
 TI Real Time Clock
 
 Required properties:
-- compatible: "ti,da830-rtc"
+- compatible:
+	- "ti,da830-rtc"  - for RTC IP used similar to that on DA8xx SoC family.
+	- "ti,am3352-rtc" - for RTC IP used similar to that on AM335x SoC family.
+	  This RTC IP has special WAKE-EN Register to enable
+	  Wakeup generation for event Alarm.
 - reg: Address range of rtc register set
 - interrupts: rtc timer, alarm interrupts in order
 - interrupt-parent: phandle for the interrupt controller
diff --git a/Documentation/devicetree/bindings/rtc/rtc-palmas.txt b/Documentation/devicetree/bindings/rtc/rtc-palmas.txt
new file mode 100644
index 000000000000..adbccc0a51e1
--- /dev/null
+++ b/Documentation/devicetree/bindings/rtc/rtc-palmas.txt
@@ -0,0 +1,33 @@
+Palmas RTC controller bindings
+
+Required properties:
+- compatible:
+  - "ti,palmas-rtc" for the Palmas series RTC controller
+- interrupt-parent: Parent interrupt device, must be handle of palmas node.
+- interrupts: Interrupt number of RTC submodule on device.
+
+Optional properties:
+
+- ti,backup-battery-chargeable: Palmas series devices like the TPS65913 or
+	TPS80036 support a backup battery for powering the RTC when the main
+	battery is removed or in a very low power state. The backup battery
+	can be chargeable or non-chargeable. This flag tells whether the
+	battery is chargeable or not. If it is chargeable, the driver can
+	enable charging.
+- ti,backup-battery-charge-high-current: Enable high-current charging of the
+	backup battery. The device supports < 100mA and > 100mA charging.
+	High current is > 100mA. If this property is absent, the battery is
+	charged at the lower current, i.e. < 100mA.
+
+Example:
+	palmas: tps65913@58 {
+		...
+		palmas_rtc: rtc {
+			compatible = "ti,palmas-rtc";
+			interrupt-parent = <&palmas>;
+			interrupts = <8 0>;
+			ti,backup-battery-chargeable;
+			ti,backup-battery-charge-high-current;
+		};
+		...
+	};
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index fcc22c982a25..823c95faebd2 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -854,16 +854,15 @@ Committed_AS: The amount of memory presently allocated on the system.
               The committed memory is a sum of all of the memory which
               has been allocated by processes, even if it has not been
               "used" by them as of yet. A process which malloc()'s 1G
-              of memory, but only touches 300M of it will only show up
-              as using 300M of memory even if it has the address space
-              allocated for the entire 1G. This 1G is memory which has
-              been "committed" to by the VM and can be used at any time
-              by the allocating application. With strict overcommit
-              enabled on the system (mode 2 in 'vm.overcommit_memory'),
-              allocations which would exceed the CommitLimit (detailed
-              above) will not be permitted. This is useful if one needs
-              to guarantee that processes will not fail due to lack of
-              memory once that memory has been successfully allocated.
+              of memory, but only touches 300M of it will show up as
+              using 1G. This 1G is memory which has been "committed" to
+              by the VM and can be used at any time by the allocating
+              application. With strict overcommit enabled on the system
+              (mode 2 in 'vm.overcommit_memory'), allocations which would
+              exceed the CommitLimit (detailed above) will not be permitted.
+              This is useful if one needs to guarantee that processes will
+              not fail due to lack of memory once that memory has been
+              successfully allocated.
 VmallocTotal: total size of vmalloc memory area
 VmallocUsed:  amount of vmalloc area which is used
 VmallocChunk: largest contiguous block of vmalloc area which is free
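
The reworded paragraph is easy to verify with a toy program: reserve a large allocation but touch only part of it, then compare Committed_AS in /proc/meminfo with the process RSS. This sketch is not part of the patch; the 1G/300M figures simply mirror the example in the text:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define GIG	(1024UL * 1024 * 1024)
#define TOUCH	(300UL * 1024 * 1024)

int main(void)
{
	char *p = malloc(GIG);		/* counted against Committed_AS in full */

	if (!p)
		return 1;
	memset(p, 1, TOUCH);		/* only ~300M becomes resident (RSS) */
	printf("pid %d: compare Committed_AS in /proc/meminfo with VmRSS in /proc/%d/status\n",
	       getpid(), getpid());
	pause();			/* keep the mapping alive for inspection */
	return 0;
}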
diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.txt b/Documentation/filesystems/ramfs-rootfs-initramfs.txt
index 59b4a0962e0f..b176928e6963 100644
--- a/Documentation/filesystems/ramfs-rootfs-initramfs.txt
+++ b/Documentation/filesystems/ramfs-rootfs-initramfs.txt
@@ -79,6 +79,10 @@ to just make sure certain lists can't become empty.
 Most systems just mount another filesystem over rootfs and ignore it. The
 amount of space an empty instance of ramfs takes up is tiny.
 
+If CONFIG_TMPFS is enabled, rootfs will use tmpfs instead of ramfs by
+default. To force ramfs, add "rootfstype=ramfs" to the kernel command
+line.
+
 What is initramfs?
 ------------------
 
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index ab7d16efa96b..9d4c1d18ad44 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -182,6 +182,7 @@ core_pattern is used to specify a core dumpfile pattern name.
 	%<NUL>	'%' is dropped
 	%%	output one '%'
 	%p	pid
+	%P	global pid (init PID namespace)
 	%u	uid
 	%g	gid
 	%d	dump mode, matches PR_SET_DUMPABLE and
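
A quick way to try the new %P specifier is to install a pattern that records both pids. This is only an illustration (requires root; the file name layout is arbitrary):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/core_pattern", "w");

	if (!f)
		return 1;
	/* e.g. core.1234.5678: 1234 is the in-namespace pid, 5678 the global one */
	fprintf(f, "core.%%p.%%P\n");
	return fclose(f) ? 1 : 0;
}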
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 36ecc26c7433..79a797eb3e87 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -200,17 +200,25 @@ fragmentation index is <= extfrag_threshold. The default value is 500.
 
 hugepages_treat_as_movable
 
-This parameter is only useful when kernelcore= is specified at boot time to
-create ZONE_MOVABLE for pages that may be reclaimed or migrated. Huge pages
-are not movable so are not normally allocated from ZONE_MOVABLE. A non-zero
-value written to hugepages_treat_as_movable allows huge pages to be allocated
-from ZONE_MOVABLE.
-
-Once enabled, the ZONE_MOVABLE is treated as an area of memory the huge
-pages pool can easily grow or shrink within. Assuming that applications are
-not running that mlock() a lot of memory, it is likely the huge pages pool
-can grow to the size of ZONE_MOVABLE by repeatedly entering the desired value
-into nr_hugepages and triggering page reclaim.
+This parameter controls whether we can allocate hugepages from ZONE_MOVABLE
+or not. If set to non-zero, hugepages can be allocated from ZONE_MOVABLE.
+ZONE_MOVABLE is created when kernel boot parameter kernelcore= is specified,
+so this parameter has no effect if used without kernelcore=.
+
+Hugepage migration is now available in some situations which depend on the
+architecture and/or the hugepage size. If a hugepage supports migration,
+allocation from ZONE_MOVABLE is always enabled for the hugepage regardless
+of the value of this parameter.
+IOW, this parameter affects only non-migratable hugepages.
+
+Assuming that hugepages are not migratable in your system, one use case for
+this parameter is making the hugepage pool more extensible by enabling
+allocation from ZONE_MOVABLE. On ZONE_MOVABLE, page reclaim, migration and
+compaction do more work, so contiguous memory is more likely to be found.
+Note that using ZONE_MOVABLE for non-migratable hugepages can harm other
+features such as memory hot-remove (which expects that memory blocks in
+ZONE_MOVABLE are always removable), so this is a trade-off the user is
+responsible for.
 
 ==============================================================
 
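
As a rough sketch of the use case in the added text (not part of the patch; it assumes the kernel was booted with kernelcore= so that ZONE_MOVABLE exists, and requires root), the knob can be flipped and the pool grown like this:

#include <stdio.h>

static int write_knob(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	/* allow hugepage allocation from ZONE_MOVABLE, then grow the pool */
	write_knob("/proc/sys/vm/hugepages_treat_as_movable", "1");
	return write_knob("/proc/sys/vm/nr_hugepages", "512");
}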
diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt
index 4ac359b7aa17..bdd4bb97fff7 100644
--- a/Documentation/vm/hugetlbpage.txt
+++ b/Documentation/vm/hugetlbpage.txt
@@ -165,6 +165,7 @@ which function as described above for the default huge page-sized case.
 
 
 Interaction of Task Memory Policy with Huge Page Allocation/Freeing
+===================================================================
 
 Whether huge pages are allocated and freed via the /proc interface or
 the /sysfs interface using the nr_hugepages_mempolicy attribute, the NUMA
@@ -229,6 +230,7 @@ resulting effect on persistent huge page allocation is as follows:
   of huge pages over all on-lines nodes with memory.
 
 Per Node Hugepages Attributes
+=============================
 
 A subset of the contents of the root huge page control directory in sysfs,
 described above, will be replicated under each the system device of each
@@ -258,6 +260,7 @@ applied, from which node the huge page allocation will be attempted.
 
 
 Using Huge Pages
+================
 
 If the user applications are going to request huge pages using mmap system
 call, then it is required that system administrator mount a file system of
@@ -296,20 +299,16 @@ calls, though the mount of filesystem will be required for using mmap calls
 without MAP_HUGETLB. For an example of how to use mmap with MAP_HUGETLB see
 map_hugetlb.c.
 
-*******************************************************************
+Examples
+========
 
-/*
- * map_hugetlb: see tools/testing/selftests/vm/map_hugetlb.c
- */
+1) map_hugetlb: see tools/testing/selftests/vm/map_hugetlb.c
 
-*******************************************************************
+2) hugepage-shm: see tools/testing/selftests/vm/hugepage-shm.c
 
-/*
- * hugepage-shm: see tools/testing/selftests/vm/hugepage-shm.c
- */
+3) hugepage-mmap: see tools/testing/selftests/vm/hugepage-mmap.c
 
-*******************************************************************
-
-/*
- * hugepage-mmap: see tools/testing/selftests/vm/hugepage-mmap.c
- */
+4) The libhugetlbfs (http://libhugetlbfs.sourceforge.net) library provides a
+   wide range of userspace tools to help with huge page usability, environment
+   setup, and control. Furthermore it provides useful test cases that should be
+   used when modifying code to ensure no regressions are introduced.
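
For convenience, here is a stripped-down version of what the map_hugetlb selftest referenced in example 1 does. It is a sketch, not the selftest itself; it assumes an x86-style MAP_HUGETLB value, a 2 MB default huge page size, and that huge pages have been reserved via nr_hugepages:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000
#endif

#define LENGTH (2UL * 1024 * 1024)	/* one default-sized huge page on x86 */

int main(void)
{
	void *addr = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (addr == MAP_FAILED) {
		perror("mmap");	/* usually means no huge pages are reserved */
		return 1;
	}
	memset(addr, 0, LENGTH);	/* touch the huge page */
	munmap(addr, LENGTH);
	return 0;
}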
diff --git a/Documentation/vm/soft-dirty.txt b/Documentation/vm/soft-dirty.txt
index 9a12a5956bc0..55684d11a1e8 100644
--- a/Documentation/vm/soft-dirty.txt
+++ b/Documentation/vm/soft-dirty.txt
@@ -28,6 +28,13 @@ This is so, since the pages are still mapped to physical memory, and thus all
 the kernel does is finds this fact out and puts both writable and soft-dirty
 bits on the PTE.
 
+  While in most cases tracking memory changes by #PF-s is more than enough
+there is still a scenario when we can lose soft dirty bits -- a task
+unmaps a previously mapped memory region and then maps a new one at exactly
+the same place. When unmap is called, the kernel internally clears PTE values
+including soft dirty bits. To notify user space application about such
+memory region renewal the kernel always marks new memory regions (and
+expanded regions) as soft dirty.
 
   This feature is actively used by the checkpoint-restore project. You
 can find more details about it on http://criu.org
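
A small user-space sketch (not part of the patch) shows how a checkpoint-restore style tool consumes this: clear the soft-dirty bits through clear_refs, dirty a page, and read bit 55 of the corresponding pagemap entry:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	long psize = sysconf(_SC_PAGESIZE);
	char *page = malloc(psize);
	uint64_t entry;
	int fd;

	fd = open("/proc/self/clear_refs", O_WRONLY);
	write(fd, "4", 1);		/* "4" clears the soft-dirty bits */
	close(fd);

	page[0] = 1;			/* dirty (or newly map) the page again */

	fd = open("/proc/self/pagemap", O_RDONLY);
	pread(fd, &entry, sizeof(entry),
	      ((uintptr_t)page / psize) * sizeof(entry));
	close(fd);

	printf("soft-dirty: %d\n", (int)((entry >> 55) & 1));
	return 0;
}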
diff --git a/MAINTAINERS b/MAINTAINERS
index be70759e51c5..e61c2e83fc2b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1028,7 +1028,7 @@ F: arch/arm/mach-orion5x/ts78xx-*
 ARM/MICREL KS8695 ARCHITECTURE
 M:	Greg Ungerer <gerg@uclinux.org>
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
-F:	arch/arm/mach-ks8695
+F:	arch/arm/mach-ks8695/
 S:	Odd Fixes
 
 ARM/MIOA701 MACHINE SUPPORT
@@ -1048,7 +1048,6 @@ M: STEricsson <STEricsson_nomadik_linux@list.st.com>
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:	Maintained
 F:	arch/arm/mach-nomadik/
-F:	arch/arm/plat-nomadik/
 F:	drivers/i2c/busses/i2c-nomadik.c
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-nomadik.git
 
@@ -1070,7 +1069,7 @@ F: drivers/mmc/host/msm_sdcc.h
 F:	drivers/tty/serial/msm_serial.h
 F:	drivers/tty/serial/msm_serial.c
 F:	drivers/*/pm8???-*
-F:	drivers/ssbi/
+F:	drivers/mfd/ssbi/
 F:	include/linux/mfd/pm8xxx/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/davidb/linux-msm.git
 S:	Maintained
@@ -1156,7 +1155,6 @@ L: linux-samsung-soc@vger.kernel.org (moderated for non-subscribers)
 W:	http://www.fluff.org/ben/linux/
 S:	Maintained
 F:	arch/arm/plat-samsung/
-F:	arch/arm/plat-s3c24xx/
 F:	arch/arm/mach-s3c24*/
 F:	arch/arm/mach-s3c64xx/
 F:	drivers/*/*s3c2410*
@@ -1179,8 +1177,6 @@ L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:	Maintained
 F:	arch/arm/mach-s5pv210/mach-aquila.c
 F:	arch/arm/mach-s5pv210/mach-goni.c
-F:	arch/arm/mach-exynos/mach-universal_c210.c
-F:	arch/arm/mach-exynos/mach-nuri.c
 
 ARM/SAMSUNG S5P SERIES 2D GRAPHICS ACCELERATION (G2D) SUPPORT
 M:	Kyungmin Park <kyungmin.park@samsung.com>
@@ -1325,7 +1321,7 @@ F: drivers/mmc/host/wmt-sdmmc.c
 F:	drivers/pwm/pwm-vt8500.c
 F:	drivers/rtc/rtc-vt8500.c
 F:	drivers/tty/serial/vt8500_serial.c
-F:	drivers/usb/host/ehci-vt8500.c
+F:	drivers/usb/host/ehci-platform.c
 F:	drivers/usb/host/uhci-platform.c
 F:	drivers/video/vt8500lcdfb.*
 F:	drivers/video/wm8505fb*
@@ -1815,6 +1811,17 @@ L: netdev@vger.kernel.org
 S:	Supported
 F:	drivers/net/ethernet/broadcom/bnx2x/
 
+BROADCOM BCM281XX/BCM11XXX ARM ARCHITECTURE
+M:	Christian Daudt <csd@broadcom.com>
+T:	git git://git.github.com/broadcom/bcm11351
+S:	Maintained
+F:	arch/arm/mach-bcm/
+F:	arch/arm/boot/dts/bcm113*
+F:	arch/arm/boot/dts/bcm281*
+F:	arch/arm/configs/bcm_defconfig
+F:	drivers/mmc/host/sdhci_bcm_kona.c
+F:	drivers/clocksource/bcm_kona_timer.c
+
 BROADCOM BCM2835 ARM ARCHICTURE
 M:	Stephen Warren <swarren@wwwdotorg.org>
 L:	linux-rpi-kernel@lists.infradead.org (moderated for non-subscribers)
@@ -2035,10 +2042,10 @@ W: http://ceph.com/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
 S:	Supported
 F:	Documentation/filesystems/ceph.txt
-F:	fs/ceph
-F:	net/ceph
-F:	include/linux/ceph
-F:	include/linux/crush
+F:	fs/ceph/
+F:	net/ceph/
+F:	include/linux/ceph/
+F:	include/linux/crush/
 
 CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
 L:	linux-usb@vger.kernel.org
@@ -2335,7 +2342,7 @@ CPU POWER MONITORING SUBSYSTEM
 M:	Dominik Brodowski <linux@dominikbrodowski.net>
 M:	Thomas Renninger <trenn@suse.de>
 S:	Maintained
-F:	tools/power/cpupower
+F:	tools/power/cpupower/
 
 CPUSETS
 M:	Li Zefan <lizefan@huawei.com>
@@ -2773,7 +2780,7 @@ L: intel-gfx@lists.freedesktop.org
 L:	dri-devel@lists.freedesktop.org
 T:	git git://people.freedesktop.org/~danvet/drm-intel
 S:	Supported
-F:	drivers/gpu/drm/i915
+F:	drivers/gpu/drm/i915/
 F:	include/drm/i915*
 F:	include/uapi/drm/i915*
 
@@ -2785,7 +2792,7 @@ M: Kyungmin Park <kyungmin.park@samsung.com>
 L:	dri-devel@lists.freedesktop.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/daeinki/drm-exynos.git
 S:	Supported
-F:	drivers/gpu/drm/exynos
+F:	drivers/gpu/drm/exynos/
 F:	include/drm/exynos*
 F:	include/uapi/drm/exynos*
 
@@ -3038,7 +3045,7 @@ M: Mauro Carvalho Chehab <m.chehab@samsung.com>
 L:	linux-edac@vger.kernel.org
 W:	bluesmoke.sourceforge.net
 S:	Maintained
-F:	drivers/edac/ghes-edac.c
+F:	drivers/edac/ghes_edac.c
 
 EDAC-I82443BXGX
 M:	Tim Small <tim@buttersideup.com>
@@ -3644,8 +3651,8 @@ M: Arnd Bergmann <arnd@arndb.de>
 L:	linux-arch@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git
 S:	Maintained
-F:	include/asm-generic
-F:	include/uapi/asm-generic
+F:	include/asm-generic/
+F:	include/uapi/asm-generic/
 
 GENERIC UIO DRIVER FOR PCI DEVICES
 M:	"Michael S. Tsirkin" <mst@redhat.com>
@@ -3687,7 +3694,8 @@ GRE DEMULTIPLEXER DRIVER
 M:	Dmitry Kozlov <xeb@mail.ru>
 L:	netdev@vger.kernel.org
 S:	Maintained
-F:	net/ipv4/gre.c
+F:	net/ipv4/gre_demux.c
+F:	net/ipv4/gre_offload.c
 F:	include/net/gre.h
 
 GRETH 10/100/1G Ethernet MAC device driver
@@ -3765,7 +3773,7 @@ L: linux-media@vger.kernel.org
 T:	git git://linuxtv.org/media_tree.git
 W:	http://linuxtv.org
 S:	Odd Fixes
-F:	drivers/media/usb/hdpvr
+F:	drivers/media/usb/hdpvr/
 
 HWPOISON MEMORY FAILURE HANDLING
 M:	Andi Kleen <andi@firstfloor.org>
@@ -4574,7 +4582,7 @@ S: Supported
 W:	http://www.openfabrics.org
 W:	www.open-iscsi.org
 Q:	http://patchwork.kernel.org/project/linux-rdma/list/
-F:	drivers/infiniband/ulp/iser
+F:	drivers/infiniband/ulp/iser/
 
 ISDN SUBSYSTEM
 M:	Karsten Keil <isdn@linux-pingi.de>
@@ -4628,7 +4636,7 @@ W: http://palosaari.fi/linux/
 Q:	http://patchwork.linuxtv.org/project/linux-media/list/
 T:	git git://linuxtv.org/anttip/media_tree.git
 S:	Maintained
-F:	drivers/media/tuners/it913x*
+F:	drivers/media/tuners/tuner_it913x*
 
 IVTV VIDEO4LINUX DRIVER
 M:	Andy Walls <awalls@md.metrocast.net>
@@ -5964,15 +5972,12 @@ S: Maintained
 F:	arch/arm/*omap*/*pm*
 F:	drivers/cpufreq/omap-cpufreq.c
 
-OMAP POWERDOMAIN/CLOCKDOMAIN SOC ADAPTATION LAYER SUPPORT
+OMAP POWERDOMAIN SOC ADAPTATION LAYER SUPPORT
 M:	Rajendra Nayak <rnayak@ti.com>
 M:	Paul Walmsley <paul@pwsan.com>
 L:	linux-omap@vger.kernel.org
 S:	Maintained
-F:	arch/arm/mach-omap2/powerdomain2xxx_3xxx.c
-F:	arch/arm/mach-omap2/powerdomain44xx.c
-F:	arch/arm/mach-omap2/clockdomain2xxx_3xxx.c
-F:	arch/arm/mach-omap2/clockdomain44xx.c
+F:	arch/arm/mach-omap2/prm*
 
 OMAP AUDIO SUPPORT
 M:	Peter Ujfalusi <peter.ujfalusi@ti.com>
@@ -6138,7 +6143,7 @@ W: http://openrisc.net
 L:	linux@lists.openrisc.net (moderated for non-subscribers)
 S:	Maintained
 T:	git git://openrisc.net/~jonas/linux
-F:	arch/openrisc
+F:	arch/openrisc/
 
 OPENVSWITCH
 M:	Jesse Gross <jesse@nicira.com>
@@ -6429,7 +6434,7 @@ M: Jamie Iles <jamie@jamieiles.com>
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 T:	git git://github.com/jamieiles/linux-2.6-ji.git
 S:	Supported
-F:	arch/arm/mach-picoxcell
+F:	arch/arm/mach-picoxcell/
 F:	drivers/*/picoxcell*
 F:	drivers/*/*/picoxcell*
 
@@ -6702,7 +6707,7 @@ F: drivers/spi/spi-pxa2xx*
 F:	drivers/usb/gadget/pxa2*
 F:	include/sound/pxa2xx-lib.h
 F:	sound/arm/pxa*
-F:	sound/soc/pxa
+F:	sound/soc/pxa/
 
 MMP SUPPORT
 M:	Eric Miao <eric.y.miao@gmail.com>
@@ -7155,7 +7160,7 @@ SAMSUNG AUDIO (ASoC) DRIVERS
 M:	Sangbeom Kim <sbkim73@samsung.com>
 L:	alsa-devel@alsa-project.org (moderated for non-subscribers)
 S:	Supported
-F:	sound/soc/samsung
+F:	sound/soc/samsung/
 
 SAMSUNG FRAMEBUFFER DRIVER
 M:	Jingoo Han <jg1.han@samsung.com>
@@ -7201,7 +7206,7 @@ SERIAL DRIVERS
 M:	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
 L:	linux-serial@vger.kernel.org
 S:	Maintained
-F:	drivers/tty/serial
+F:	drivers/tty/serial/
 
 SYNOPSYS DESIGNWARE DMAC DRIVER
 M:	Viresh Kumar <viresh.linux@gmail.com>
@@ -7236,7 +7241,7 @@ TLG2300 VIDEO4LINUX-2 DRIVER
 M:	Huang Shijie <shijie8@gmail.com>
 M:	Hans Verkuil <hverkuil@xs4all.nl>
 S:	Odd Fixes
-F:	drivers/media/usb/tlg2300
+F:	drivers/media/usb/tlg2300/
 
 SC1200 WDT DRIVER
 M:	Zwane Mwaikambo <zwane@arm.linux.org.uk>
@@ -7497,7 +7502,7 @@ L: linux-media@vger.kernel.org
 T:	git git://linuxtv.org/media_tree.git
 W:	http://linuxtv.org
 S:	Odd Fixes
-F:	drivers/media/radio/radio-si4713.h
+F:	drivers/media/radio/radio-si4713.c
 
 SIANO DVB DRIVER
 M:	Mauro Carvalho Chehab <m.chehab@samsung.com>
@@ -7506,9 +7511,9 @@ W: http://linuxtv.org
 T:	git git://linuxtv.org/media_tree.git
 S:	Odd fixes
 F:	drivers/media/common/siano/
-F:	drivers/media/dvb/siano/
 F:	drivers/media/usb/siano/
-F:	drivers/media/mmc/siano
+F:	drivers/media/usb/siano/
+F:	drivers/media/mmc/siano/
 
 SH_VEU V4L2 MEM2MEM DRIVER
 M:	Guennadi Liakhovetski <g.liakhovetski@gmx.de>
@@ -7546,9 +7551,9 @@ P: Vincent Sanders <vince@simtec.co.uk>
 M:	Simtec Linux Team <linux@simtec.co.uk>
 W:	http://www.simtec.co.uk/products/EB2410ITX/
 S:	Supported
-F:	arch/arm/mach-s3c2410/mach-bast.c
-F:	arch/arm/mach-s3c2410/bast-ide.c
-F:	arch/arm/mach-s3c2410/bast-irq.c
+F:	arch/arm/mach-s3c24xx/mach-bast.c
+F:	arch/arm/mach-s3c24xx/bast-ide.c
+F:	arch/arm/mach-s3c24xx/bast-irq.c
 
 TI DAVINCI MACHINE SUPPORT
 M:	Sekhar Nori <nsekhar@ti.com>
@@ -7557,7 +7562,7 @@ L: davinci-linux-open-source@linux.davincidsp.com (moderated for non-subscribers
 T:	git git://gitorious.org/linux-davinci/linux-davinci.git
 Q:	http://patchwork.kernel.org/project/linux-davinci/list/
 S:	Supported
-F:	arch/arm/mach-davinci
+F:	arch/arm/mach-davinci/
 F:	drivers/i2c/busses/i2c-davinci.c
 
 TI DAVINCI SERIES MEDIA DRIVER
@@ -7642,7 +7647,7 @@ SMIA AND SMIA++ IMAGE SENSOR DRIVER
 M:	Sakari Ailus <sakari.ailus@iki.fi>
 L:	linux-media@vger.kernel.org
 S:	Maintained
-F:	drivers/media/i2c/smiapp
+F:	drivers/media/i2c/smiapp/
 F:	include/media/smiapp.h
 F:	drivers/media/i2c/smiapp-pll.c
 F:	drivers/media/i2c/smiapp-pll.h
@@ -7745,6 +7750,11 @@ W: http://tifmxx.berlios.de/
 S:	Maintained
 F:	drivers/memstick/host/tifm_ms.c
 
+SONY MEMORYSTICK STANDARD SUPPORT
+M:	Maxim Levitsky <maximlevitsky@gmail.com>
+S:	Maintained
+F:	drivers/memstick/core/ms_block.*
+
 SOUND
 M:	Jaroslav Kysela <perex@perex.cz>
 M:	Takashi Iwai <tiwai@suse.de>
@@ -7821,35 +7831,7 @@ L: spear-devel@list.st.com
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 W:	http://www.st.com/spear
 S:	Maintained
-F:	arch/arm/plat-spear/
-
-SPEAR13XX MACHINE SUPPORT
-M:	Viresh Kumar <viresh.linux@gmail.com>
-M:	Shiraz Hashim <shiraz.hashim@st.com>
-L:	spear-devel@list.st.com
-L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
-W:	http://www.st.com/spear
-S:	Maintained
-F:	arch/arm/mach-spear13xx/
-
-SPEAR3XX MACHINE SUPPORT
-M:	Viresh Kumar <viresh.linux@gmail.com>
-M:	Shiraz Hashim <shiraz.hashim@st.com>
-L:	spear-devel@list.st.com
-L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
-W:	http://www.st.com/spear
-S:	Maintained
-F:	arch/arm/mach-spear3xx/
-
-SPEAR6XX MACHINE SUPPORT
-M:	Rajeev Kumar <rajeev-dlh.kumar@st.com>
-M:	Shiraz Hashim <shiraz.hashim@st.com>
-M:	Viresh Kumar <viresh.linux@gmail.com>
-L:	spear-devel@list.st.com
-L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
-W:	http://www.st.com/spear
-S:	Maintained
-F:	arch/arm/mach-spear6xx/
+F:	arch/arm/mach-spear/
 
 SPEAR CLOCK FRAMEWORK SUPPORT
 M:	Viresh Kumar <viresh.linux@gmail.com>
@@ -8118,7 +8100,7 @@ M: Vineet Gupta <vgupta@synopsys.com>
 S:	Supported
 F:	arch/arc/
 F:	Documentation/devicetree/bindings/arc/
-F:	drivers/tty/serial/arc-uart.c
+F:	drivers/tty/serial/arc_uart.c
 
 SYSV FILESYSTEM
 M:	Christoph Hellwig <hch@infradead.org>
@@ -8808,7 +8790,6 @@ L: linux-usb@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/balbi/usb.git
 S:	Maintained
 F:	drivers/usb/phy/
-F:	drivers/usb/otg/
 
 USB PRINTER DRIVER (usblp)
 M:	Pete Zaitcev <zaitcev@redhat.com>
@@ -9339,7 +9320,7 @@ M: Matthew Garrett <matthew.garrett@nebula.com>
 L:	platform-driver-x86@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/mjg59/platform-drivers-x86.git
 S:	Maintained
-F:	drivers/platform/x86
+F:	drivers/platform/x86/
 
 X86 MCE INFRASTRUCTURE
 M:	Tony Luck <tony.luck@intel.com>
diff --git a/arch/alpha/lib/csum_partial_copy.c b/arch/alpha/lib/csum_partial_copy.c
index 40736da9bea8..ffb19b7da999 100644
--- a/arch/alpha/lib/csum_partial_copy.c
+++ b/arch/alpha/lib/csum_partial_copy.c
@@ -338,6 +338,11 @@ csum_partial_copy_from_user(const void __user *src, void *dst, int len,
 	unsigned long doff = 7 & (unsigned long) dst;
 
 	if (len) {
+		if (!access_ok(VERIFY_READ, src, len)) {
+			*errp = -EFAULT;
+			memset(dst, 0, len);
+			return sum;
+		}
 		if (!doff) {
 			if (!soff)
 				checksum = csum_partial_cfu_aligned(
diff --git a/arch/arm/mm/hugetlbpage.c b/arch/arm/mm/hugetlbpage.c
index 66781bf34077..54ee6163c181 100644
--- a/arch/arm/mm/hugetlbpage.c
+++ b/arch/arm/mm/hugetlbpage.c
@@ -56,3 +56,8 @@ int pmd_huge(pmd_t pmd)
 {
 	return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
 }
+
+int pmd_huge_support(void)
+{
+	return 1;
+}
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 2fc8258bab2d..5e9aec358306 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -54,6 +54,11 @@ int pud_huge(pud_t pud)
 	return !(pud_val(pud) & PUD_TABLE_BIT);
 }
 
+int pmd_huge_support(void)
+{
+	return 1;
+}
+
 static __init int setup_hugepagesz(char *opt)
 {
 	unsigned long ps = memparse(opt, &opt);
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index 76069c18ee42..68232db98baa 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -114,6 +114,11 @@ int pud_huge(pud_t pud)
 	return 0;
 }
 
+int pmd_huge_support(void)
+{
+	return 0;
+}
+
 struct page *
 follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write)
 {
diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c
index 3c52fa6d0f8e..042431509b56 100644
--- a/arch/metag/mm/hugetlbpage.c
+++ b/arch/metag/mm/hugetlbpage.c
@@ -110,6 +110,11 @@ int pud_huge(pud_t pud)
 	return 0;
 }
 
+int pmd_huge_support(void)
+{
+	return 1;
+}
+
 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 			     pmd_t *pmd, int write)
 {
diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c
index a7fee0dfb7a9..01fda4419ed0 100644
--- a/arch/mips/mm/hugetlbpage.c
+++ b/arch/mips/mm/hugetlbpage.c
@@ -85,6 +85,11 @@ int pud_huge(pud_t pud)
 	return (pud_val(pud) & _PAGE_HUGE) != 0;
 }
 
+int pmd_huge_support(void)
+{
+	return 1;
+}
+
 struct page *
 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 		pmd_t *pmd, int write)
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 834ca8eb38f2..d67db4bd672d 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -86,6 +86,11 @@ int pgd_huge(pgd_t pgd)
 	 */
 	return ((pgd_val(pgd) & 0x3) != 0x0);
 }
+
+int pmd_huge_support(void)
+{
+	return 1;
+}
 #else
 int pmd_huge(pmd_t pmd)
 {
@@ -101,6 +106,11 @@ int pgd_huge(pgd_t pgd)
 {
 	return 0;
 }
+
+int pmd_huge_support(void)
+{
+	return 0;
+}
 #endif
 
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index fb2723e8ba65..3ec272859e1e 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -526,6 +526,7 @@ config CRASH_DUMP
 	bool "kernel crash dumps"
 	depends on 64BIT && SMP
 	select KEXEC
+	select ZFCPDUMP
 	help
 	  Generate crash dump after being started by kexec.
 	  Crash dump kernels are loaded in the main kernel with kexec-tools
@@ -536,7 +537,7 @@ config CRASH_DUMP
 config ZFCPDUMP
 	def_bool n
 	prompt "zfcpdump support"
-	select SMP
+	depends on SMP
 	help
 	  Select this option if you want to build an zfcpdump enabled kernel.
 	  Refer to <file:Documentation/s390/zfcpdump.txt> for more details on this.
diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h
index dcf6948a875c..4176dfe0fba1 100644
--- a/arch/s390/include/asm/kprobes.h
+++ b/arch/s390/include/asm/kprobes.h
@@ -31,6 +31,8 @@
 #include <linux/ptrace.h>
 #include <linux/percpu.h>
 
+#define __ARCH_WANT_KPROBES_INSN_SLOT
+
 struct pt_regs;
 struct kprobe;
 
@@ -57,7 +59,7 @@ typedef u16 kprobe_opcode_t;
 /* Architecture specific copy of original instruction */
 struct arch_specific_insn {
 	/* copy of original instruction */
-	kprobe_opcode_t insn[MAX_INSN_SIZE];
+	kprobe_opcode_t *insn;
 };
 
 struct prev_kprobe {
diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index 06a136136047..7dc7f9c63b65 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -56,5 +56,6 @@ bool sclp_has_linemode(void);
 bool sclp_has_vt220(void);
 int sclp_pci_configure(u32 fid);
 int sclp_pci_deconfigure(u32 fid);
+int memcpy_hsa(void *dest, unsigned long src, size_t count, int mode);
 
 #endif /* _ASM_S390_SCLP_H */
diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index d8f355657171..c84f33d51f7b 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -16,6 +16,7 @@
16#include <asm/os_info.h> 16#include <asm/os_info.h>
17#include <asm/elf.h> 17#include <asm/elf.h>
18#include <asm/ipl.h> 18#include <asm/ipl.h>
19#include <asm/sclp.h>
19 20
20#define PTR_ADD(x, y) (((char *) (x)) + ((unsigned long) (y))) 21#define PTR_ADD(x, y) (((char *) (x)) + ((unsigned long) (y)))
21#define PTR_SUB(x, y) (((char *) (x)) - ((unsigned long) (y))) 22#define PTR_SUB(x, y) (((char *) (x)) - ((unsigned long) (y)))
@@ -64,22 +65,46 @@ static ssize_t copy_page_real(void *buf, void *src, size_t csize)
64} 65}
65 66
66/* 67/*
67 * Copy one page from "oldmem" 68 * Pointer to ELF header in new kernel
69 */
70static void *elfcorehdr_newmem;
71
72/*
73 * Copy one page from zfcpdump "oldmem"
74 *
75 * For pages below ZFCPDUMP_HSA_SIZE memory from the HSA is copied. Otherwise
76 * real memory copy is used.
77 */
78static ssize_t copy_oldmem_page_zfcpdump(char *buf, size_t csize,
79 unsigned long src, int userbuf)
80{
81 int rc;
82
83 if (src < ZFCPDUMP_HSA_SIZE) {
84 rc = memcpy_hsa(buf, src, csize, userbuf);
85 } else {
86 if (userbuf)
87 rc = copy_to_user_real((void __force __user *) buf,
88 (void *) src, csize);
89 else
90 rc = memcpy_real(buf, (void *) src, csize);
91 }
92 return rc ? rc : csize;
93}
94
95/*
96 * Copy one page from kdump "oldmem"
68 * 97 *
69 * For the kdump reserved memory this functions performs a swap operation: 98 * For the kdump reserved memory this functions performs a swap operation:
70 * - [OLDMEM_BASE - OLDMEM_BASE + OLDMEM_SIZE] is mapped to [0 - OLDMEM_SIZE]. 99 * - [OLDMEM_BASE - OLDMEM_BASE + OLDMEM_SIZE] is mapped to [0 - OLDMEM_SIZE].
71 * - [0 - OLDMEM_SIZE] is mapped to [OLDMEM_BASE - OLDMEM_BASE + OLDMEM_SIZE] 100 * - [0 - OLDMEM_SIZE] is mapped to [OLDMEM_BASE - OLDMEM_BASE + OLDMEM_SIZE]
72 */ 101 */
73ssize_t copy_oldmem_page(unsigned long pfn, char *buf, 102static ssize_t copy_oldmem_page_kdump(char *buf, size_t csize,
74 size_t csize, unsigned long offset, int userbuf) 103 unsigned long src, int userbuf)
104
75{ 105{
76 unsigned long src;
77 int rc; 106 int rc;
78 107
79 if (!csize)
80 return 0;
81
82 src = (pfn << PAGE_SHIFT) + offset;
83 if (src < OLDMEM_SIZE) 108 if (src < OLDMEM_SIZE)
84 src += OLDMEM_BASE; 109 src += OLDMEM_BASE;
85 else if (src > OLDMEM_BASE && 110 else if (src > OLDMEM_BASE &&
@@ -90,7 +115,88 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
90 (void *) src, csize); 115 (void *) src, csize);
91 else 116 else
92 rc = copy_page_real(buf, (void *) src, csize); 117 rc = copy_page_real(buf, (void *) src, csize);
93 return (rc == 0) ? csize : rc; 118 return (rc == 0) ? rc : csize;
119}
120
121/*
122 * Copy one page from "oldmem"
123 */
124ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
125 unsigned long offset, int userbuf)
126{
127 unsigned long src;
128
129 if (!csize)
130 return 0;
131 src = (pfn << PAGE_SHIFT) + offset;
132 if (OLDMEM_BASE)
133 return copy_oldmem_page_kdump(buf, csize, src, userbuf);
134 else
135 return copy_oldmem_page_zfcpdump(buf, csize, src, userbuf);
136}
137
138/*
139 * Remap "oldmem" for kdump
140 *
141 * For the kdump reserved memory this functions performs a swap operation:
142 * [0 - OLDMEM_SIZE] is mapped to [OLDMEM_BASE - OLDMEM_BASE + OLDMEM_SIZE]
143 */
144static int remap_oldmem_pfn_range_kdump(struct vm_area_struct *vma,
145 unsigned long from, unsigned long pfn,
146 unsigned long size, pgprot_t prot)
147{
148 unsigned long size_old;
149 int rc;
150
151 if (pfn < OLDMEM_SIZE >> PAGE_SHIFT) {
152 size_old = min(size, OLDMEM_SIZE - (pfn << PAGE_SHIFT));
153 rc = remap_pfn_range(vma, from,
154 pfn + (OLDMEM_BASE >> PAGE_SHIFT),
155 size_old, prot);
156 if (rc || size == size_old)
157 return rc;
158 size -= size_old;
159 from += size_old;
160 pfn += size_old >> PAGE_SHIFT;
161 }
162 return remap_pfn_range(vma, from, pfn, size, prot);
163}
164
165/*
166 * Remap "oldmem" for zfcpdump
167 *
168 * We only map available memory above ZFCPDUMP_HSA_SIZE. Memory below
169 * ZFCPDUMP_HSA_SIZE is read on demand using the copy_oldmem_page() function.
170 */
171static int remap_oldmem_pfn_range_zfcpdump(struct vm_area_struct *vma,
172 unsigned long from,
173 unsigned long pfn,
174 unsigned long size, pgprot_t prot)
175{
176 unsigned long size_hsa;
177
178 if (pfn < ZFCPDUMP_HSA_SIZE >> PAGE_SHIFT) {
179 size_hsa = min(size, ZFCPDUMP_HSA_SIZE - (pfn << PAGE_SHIFT));
180 if (size == size_hsa)
181 return 0;
182 size -= size_hsa;
183 from += size_hsa;
184 pfn += size_hsa >> PAGE_SHIFT;
185 }
186 return remap_pfn_range(vma, from, pfn, size, prot);
187}
188
189/*
190 * Remap "oldmem" for kdump or zfcpdump
191 */
192int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from,
193 unsigned long pfn, unsigned long size, pgprot_t prot)
194{
195 if (OLDMEM_BASE)
196 return remap_oldmem_pfn_range_kdump(vma, from, pfn, size, prot);
197 else
198 return remap_oldmem_pfn_range_zfcpdump(vma, from, pfn, size,
199 prot);
94} 200}
95 201
96/* 202/*
@@ -101,11 +207,21 @@ int copy_from_oldmem(void *dest, void *src, size_t count)
101 unsigned long copied = 0; 207 unsigned long copied = 0;
102 int rc; 208 int rc;
103 209
104 if ((unsigned long) src < OLDMEM_SIZE) { 210 if (OLDMEM_BASE) {
105 copied = min(count, OLDMEM_SIZE - (unsigned long) src); 211 if ((unsigned long) src < OLDMEM_SIZE) {
106 rc = memcpy_real(dest, src + OLDMEM_BASE, copied); 212 copied = min(count, OLDMEM_SIZE - (unsigned long) src);
107 if (rc) 213 rc = memcpy_real(dest, src + OLDMEM_BASE, copied);
108 return rc; 214 if (rc)
215 return rc;
216 }
217 } else {
218 if ((unsigned long) src < ZFCPDUMP_HSA_SIZE) {
219 copied = min(count,
220 ZFCPDUMP_HSA_SIZE - (unsigned long) src);
221 rc = memcpy_hsa(dest, (unsigned long) src, copied, 0);
222 if (rc)
223 return rc;
224 }
109 } 225 }
110 return memcpy_real(dest + copied, src + copied, count - copied); 226 return memcpy_real(dest + copied, src + copied, count - copied);
111} 227}
@@ -368,14 +484,6 @@ static int get_mem_chunk_cnt(void)
368} 484}
369 485
370/* 486/*
371 * Relocate pointer in order to allow vmcore code access the data
372 */
373static inline unsigned long relocate(unsigned long addr)
374{
375 return OLDMEM_BASE + addr;
376}
377
378/*
379 * Initialize ELF loads (new kernel) 487 * Initialize ELF loads (new kernel)
380 */ 488 */
381static int loads_init(Elf64_Phdr *phdr, u64 loads_offset) 489static int loads_init(Elf64_Phdr *phdr, u64 loads_offset)
@@ -426,7 +534,7 @@ static void *notes_init(Elf64_Phdr *phdr, void *ptr, u64 notes_offset)
426 ptr = nt_vmcoreinfo(ptr); 534 ptr = nt_vmcoreinfo(ptr);
427 memset(phdr, 0, sizeof(*phdr)); 535 memset(phdr, 0, sizeof(*phdr));
428 phdr->p_type = PT_NOTE; 536 phdr->p_type = PT_NOTE;
429 phdr->p_offset = relocate(notes_offset); 537 phdr->p_offset = notes_offset;
430 phdr->p_filesz = (unsigned long) PTR_SUB(ptr, ptr_start); 538 phdr->p_filesz = (unsigned long) PTR_SUB(ptr, ptr_start);
431 phdr->p_memsz = phdr->p_filesz; 539 phdr->p_memsz = phdr->p_filesz;
432 return ptr; 540 return ptr;
@@ -435,7 +543,7 @@ static void *notes_init(Elf64_Phdr *phdr, void *ptr, u64 notes_offset)
435/* 543/*
436 * Create ELF core header (new kernel) 544 * Create ELF core header (new kernel)
437 */ 545 */
438static void s390_elf_corehdr_create(char **elfcorebuf, size_t *elfcorebuf_sz) 546int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size)
439{ 547{
440 Elf64_Phdr *phdr_notes, *phdr_loads; 548 Elf64_Phdr *phdr_notes, *phdr_loads;
441 int mem_chunk_cnt; 549 int mem_chunk_cnt;
@@ -443,6 +551,12 @@ static void s390_elf_corehdr_create(char **elfcorebuf, size_t *elfcorebuf_sz)
443 u32 alloc_size; 551 u32 alloc_size;
444 u64 hdr_off; 552 u64 hdr_off;
445 553
 554 /* If we are not in kdump or zfcpdump mode, return */
555 if (!OLDMEM_BASE && ipl_info.type != IPL_TYPE_FCP_DUMP)
556 return 0;
557 /* If elfcorehdr= has been passed via cmdline, we use that one */
558 if (elfcorehdr_addr != ELFCORE_ADDR_MAX)
559 return 0;
446 mem_chunk_cnt = get_mem_chunk_cnt(); 560 mem_chunk_cnt = get_mem_chunk_cnt();
447 561
448 alloc_size = 0x1000 + get_cpu_cnt() * 0x300 + 562 alloc_size = 0x1000 + get_cpu_cnt() * 0x300 +
@@ -460,27 +574,52 @@ static void s390_elf_corehdr_create(char **elfcorebuf, size_t *elfcorebuf_sz)
460 ptr = notes_init(phdr_notes, ptr, ((unsigned long) hdr) + hdr_off); 574 ptr = notes_init(phdr_notes, ptr, ((unsigned long) hdr) + hdr_off);
461 /* Init loads */ 575 /* Init loads */
462 hdr_off = PTR_DIFF(ptr, hdr); 576 hdr_off = PTR_DIFF(ptr, hdr);
463 loads_init(phdr_loads, ((unsigned long) hdr) + hdr_off); 577 loads_init(phdr_loads, hdr_off);
464 *elfcorebuf_sz = hdr_off; 578 *addr = (unsigned long long) hdr;
465 *elfcorebuf = (void *) relocate((unsigned long) hdr); 579 elfcorehdr_newmem = hdr;
466 BUG_ON(*elfcorebuf_sz > alloc_size); 580 *size = (unsigned long long) hdr_off;
581 BUG_ON(elfcorehdr_size > alloc_size);
582 return 0;
467} 583}
468 584
469/* 585/*
470 * Create kdump ELF core header in new kernel, if it has not been passed via 586 * Free ELF core header (new kernel)
471 * the "elfcorehdr" kernel parameter
472 */ 587 */
473static int setup_kdump_elfcorehdr(void) 588void elfcorehdr_free(unsigned long long addr)
474{ 589{
475 size_t elfcorebuf_sz; 590 if (!elfcorehdr_newmem)
476 char *elfcorebuf; 591 return;
592 kfree((void *)(unsigned long)addr);
593}
477 594
478 if (!OLDMEM_BASE || is_kdump_kernel()) 595/*
479 return -EINVAL; 596 * Read from ELF header
480 s390_elf_corehdr_create(&elfcorebuf, &elfcorebuf_sz); 597 */
481 elfcorehdr_addr = (unsigned long long) elfcorebuf; 598ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos)
482 elfcorehdr_size = elfcorebuf_sz; 599{
483 return 0; 600 void *src = (void *)(unsigned long)*ppos;
601
602 src = elfcorehdr_newmem ? src : src - OLDMEM_BASE;
603 memcpy(buf, src, count);
604 *ppos += count;
605 return count;
484} 606}
485 607
486subsys_initcall(setup_kdump_elfcorehdr); 608/*
609 * Read from ELF notes data
610 */
611ssize_t elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
612{
613 void *src = (void *)(unsigned long)*ppos;
614 int rc;
615
616 if (elfcorehdr_newmem) {
617 memcpy(buf, src, count);
618 } else {
619 rc = copy_from_oldmem(buf, src, count);
620 if (rc)
621 return rc;
622 }
623 *ppos += count;
624 return count;
625}
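
The copy_from_oldmem() and remap_oldmem_pfn_range() hunks above rely on the same address swap: for kdump, the first OLDMEM_SIZE bytes of the crashed kernel were moved up to OLDMEM_BASE, so any access below OLDMEM_SIZE is redirected by adding OLDMEM_BASE while everything else is read in place. A minimal user-space sketch of that translation and of the split-copy arithmetic (OLDMEM_BASE and OLDMEM_SIZE are invented values; memcpy_real()/memcpy_hsa() are not modelled):

#include <stdio.h>

#define OLDMEM_BASE 0x10000000UL        /* hypothetical relocation target */
#define OLDMEM_SIZE 0x08000000UL        /* hypothetical size of the swapped area */

/* Where does byte "src" of the old kernel's address space live now? */
static unsigned long oldmem_translate(unsigned long src)
{
        return src < OLDMEM_SIZE ? src + OLDMEM_BASE : src;
}

/* How many bytes of a copy starting at "src" fall into the swapped region? */
static unsigned long oldmem_swapped_part(unsigned long src, unsigned long count)
{
        if (src >= OLDMEM_SIZE)
                return 0;
        return count < OLDMEM_SIZE - src ? count : OLDMEM_SIZE - src;
}

int main(void)
{
        unsigned long src = OLDMEM_SIZE - 0x800, count = 0x1000;
        unsigned long swapped = oldmem_swapped_part(src, count);

        printf("0x%lx reads from 0x%lx\n", src, oldmem_translate(src));
        printf("first 0x%lx bytes come from the swapped copy, 0x%lx read in place\n",
               swapped, count - swapped);
        return 0;
}
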
diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c
index adbbe7f1cb0d..0ce9fb245034 100644
--- a/arch/s390/kernel/kprobes.c
+++ b/arch/s390/kernel/kprobes.c
@@ -37,6 +37,26 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
37 37
38struct kretprobe_blackpoint kretprobe_blacklist[] = { }; 38struct kretprobe_blackpoint kretprobe_blacklist[] = { };
39 39
40DEFINE_INSN_CACHE_OPS(dmainsn);
41
42static void *alloc_dmainsn_page(void)
43{
44 return (void *)__get_free_page(GFP_KERNEL | GFP_DMA);
45}
46
47static void free_dmainsn_page(void *page)
48{
49 free_page((unsigned long)page);
50}
51
52struct kprobe_insn_cache kprobe_dmainsn_slots = {
53 .mutex = __MUTEX_INITIALIZER(kprobe_dmainsn_slots.mutex),
54 .alloc = alloc_dmainsn_page,
55 .free = free_dmainsn_page,
56 .pages = LIST_HEAD_INIT(kprobe_dmainsn_slots.pages),
57 .insn_size = MAX_INSN_SIZE,
58};
59
40static int __kprobes is_prohibited_opcode(kprobe_opcode_t *insn) 60static int __kprobes is_prohibited_opcode(kprobe_opcode_t *insn)
41{ 61{
42 switch (insn[0] >> 8) { 62 switch (insn[0] >> 8) {
@@ -100,9 +120,8 @@ static int __kprobes get_fixup_type(kprobe_opcode_t *insn)
100 fixup |= FIXUP_RETURN_REGISTER; 120 fixup |= FIXUP_RETURN_REGISTER;
101 break; 121 break;
102 case 0xc0: 122 case 0xc0:
103 if ((insn[0] & 0x0f) == 0x00 || /* larl */ 123 if ((insn[0] & 0x0f) == 0x05) /* brasl */
104 (insn[0] & 0x0f) == 0x05) /* brasl */ 124 fixup |= FIXUP_RETURN_REGISTER;
105 fixup |= FIXUP_RETURN_REGISTER;
106 break; 125 break;
107 case 0xeb: 126 case 0xeb:
108 switch (insn[2] & 0xff) { 127 switch (insn[2] & 0xff) {
@@ -134,18 +153,128 @@ static int __kprobes get_fixup_type(kprobe_opcode_t *insn)
134 return fixup; 153 return fixup;
135} 154}
136 155
156static int __kprobes is_insn_relative_long(kprobe_opcode_t *insn)
157{
158 /* Check if we have a RIL-b or RIL-c format instruction which
159 * we need to modify in order to avoid instruction emulation. */
160 switch (insn[0] >> 8) {
161 case 0xc0:
162 if ((insn[0] & 0x0f) == 0x00) /* larl */
163 return true;
164 break;
165 case 0xc4:
166 switch (insn[0] & 0x0f) {
167 case 0x02: /* llhrl */
168 case 0x04: /* lghrl */
169 case 0x05: /* lhrl */
170 case 0x06: /* llghrl */
171 case 0x07: /* sthrl */
172 case 0x08: /* lgrl */
173 case 0x0b: /* stgrl */
174 case 0x0c: /* lgfrl */
175 case 0x0d: /* lrl */
176 case 0x0e: /* llgfrl */
177 case 0x0f: /* strl */
178 return true;
179 }
180 break;
181 case 0xc6:
182 switch (insn[0] & 0x0f) {
183 case 0x00: /* exrl */
184 case 0x02: /* pfdrl */
185 case 0x04: /* cghrl */
186 case 0x05: /* chrl */
187 case 0x06: /* clghrl */
188 case 0x07: /* clhrl */
189 case 0x08: /* cgrl */
190 case 0x0a: /* clgrl */
191 case 0x0c: /* cgfrl */
192 case 0x0d: /* crl */
193 case 0x0e: /* clgfrl */
194 case 0x0f: /* clrl */
195 return true;
196 }
197 break;
198 }
199 return false;
200}
201
202static void __kprobes copy_instruction(struct kprobe *p)
203{
204 s64 disp, new_disp;
205 u64 addr, new_addr;
206
207 memcpy(p->ainsn.insn, p->addr, ((p->opcode >> 14) + 3) & -2);
208 if (!is_insn_relative_long(p->ainsn.insn))
209 return;
210 /*
211 * For pc-relative instructions in RIL-b or RIL-c format patch the
212 * RI2 displacement field. We have already made sure that the insn
213 * slot for the patched instruction is within the same 2GB area
214 * as the original instruction (either kernel image or module area).
215 * Therefore the new displacement will always fit.
216 */
217 disp = *(s32 *)&p->ainsn.insn[1];
218 addr = (u64)(unsigned long)p->addr;
219 new_addr = (u64)(unsigned long)p->ainsn.insn;
220 new_disp = ((addr + (disp * 2)) - new_addr) / 2;
221 *(s32 *)&p->ainsn.insn[1] = new_disp;
222}
223
224static inline int is_kernel_addr(void *addr)
225{
226 return addr < (void *)_end;
227}
228
229static inline int is_module_addr(void *addr)
230{
231#ifdef CONFIG_64BIT
232 BUILD_BUG_ON(MODULES_LEN > (1UL << 31));
233 if (addr < (void *)MODULES_VADDR)
234 return 0;
235 if (addr > (void *)MODULES_END)
236 return 0;
237#endif
238 return 1;
239}
240
241static int __kprobes s390_get_insn_slot(struct kprobe *p)
242{
243 /*
244 * Get an insn slot that is within the same 2GB area like the original
245 * instruction. That way instructions with a 32bit signed displacement
246 * field can be patched and executed within the insn slot.
247 */
248 p->ainsn.insn = NULL;
249 if (is_kernel_addr(p->addr))
250 p->ainsn.insn = get_dmainsn_slot();
251 if (is_module_addr(p->addr))
252 p->ainsn.insn = get_insn_slot();
253 return p->ainsn.insn ? 0 : -ENOMEM;
254}
255
256static void __kprobes s390_free_insn_slot(struct kprobe *p)
257{
258 if (!p->ainsn.insn)
259 return;
260 if (is_kernel_addr(p->addr))
261 free_dmainsn_slot(p->ainsn.insn, 0);
262 else
263 free_insn_slot(p->ainsn.insn, 0);
264 p->ainsn.insn = NULL;
265}
266
137int __kprobes arch_prepare_kprobe(struct kprobe *p) 267int __kprobes arch_prepare_kprobe(struct kprobe *p)
138{ 268{
139 if ((unsigned long) p->addr & 0x01) 269 if ((unsigned long) p->addr & 0x01)
140 return -EINVAL; 270 return -EINVAL;
141
142 /* Make sure the probe isn't going on a difficult instruction */ 271 /* Make sure the probe isn't going on a difficult instruction */
143 if (is_prohibited_opcode(p->addr)) 272 if (is_prohibited_opcode(p->addr))
144 return -EINVAL; 273 return -EINVAL;
145 274 if (s390_get_insn_slot(p))
275 return -ENOMEM;
146 p->opcode = *p->addr; 276 p->opcode = *p->addr;
147 memcpy(p->ainsn.insn, p->addr, ((p->opcode >> 14) + 3) & -2); 277 copy_instruction(p);
148
149 return 0; 278 return 0;
150} 279}
151 280
@@ -186,6 +315,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
186 315
187void __kprobes arch_remove_kprobe(struct kprobe *p) 316void __kprobes arch_remove_kprobe(struct kprobe *p)
188{ 317{
318 s390_free_insn_slot(p);
189} 319}
190 320
191static void __kprobes enable_singlestep(struct kprobe_ctlblk *kcb, 321static void __kprobes enable_singlestep(struct kprobe_ctlblk *kcb,
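
The core of the kprobes rework is copy_instruction(): when the probed instruction is a pc-relative RIL-b/RIL-c instruction, its RI2 halfword displacement is rewritten so that, executed from the insn slot, it still reaches the original target. s390_get_insn_slot() guarantees the slot sits in the same 2GB area (DMA page for kernel text, regular slot for modules), so the new displacement always fits. A user-space sketch of the recomputation with invented addresses:

#include <stdio.h>
#include <stdint.h>

/* RI2 counts halfwords, hence the *2 when resolving and /2 when re-encoding. */
static int32_t refit_disp(uint64_t orig_insn, uint64_t slot, int32_t disp)
{
        uint64_t target = orig_insn + (int64_t)disp * 2;        /* absolute target */

        return (int32_t)(((int64_t)target - (int64_t)slot) / 2);
}

int main(void)
{
        uint64_t orig = 0x3ffe0001000ULL;       /* hypothetical probed address */
        uint64_t slot = 0x3ffe0100000ULL;       /* hypothetical slot, same 2GB area */
        int32_t disp = 0x200;                   /* original RI2 displacement */

        printf("new RI2 displacement = %d\n", refit_disp(orig, slot, disp));
        return 0;
}
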
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index 248445f92604..d261c62e40a6 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -223,6 +223,11 @@ int pud_huge(pud_t pud)
223 return 0; 223 return 0;
224} 224}
225 225
226int pmd_huge_support(void)
227{
228 return 1;
229}
230
226struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, 231struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
227 pmd_t *pmdp, int write) 232 pmd_t *pmdp, int write)
228{ 233{
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index d7762349ea48..0d676a41081e 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -83,6 +83,11 @@ int pud_huge(pud_t pud)
83 return 0; 83 return 0;
84} 84}
85 85
86int pmd_huge_support(void)
87{
88 return 0;
89}
90
86struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, 91struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
87 pmd_t *pmd, int write) 92 pmd_t *pmd, int write)
88{ 93{
diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c
index 3d0ddbc005fe..71368850dfc0 100644
--- a/arch/sparc/kernel/sys_sparc32.c
+++ b/arch/sparc/kernel/sys_sparc32.c
@@ -169,10 +169,10 @@ COMPAT_SYSCALL_DEFINE5(rt_sigaction, int, sig,
169 new_ka.ka_restorer = restorer; 169 new_ka.ka_restorer = restorer;
170 ret = get_user(u_handler, &act->sa_handler); 170 ret = get_user(u_handler, &act->sa_handler);
171 new_ka.sa.sa_handler = compat_ptr(u_handler); 171 new_ka.sa.sa_handler = compat_ptr(u_handler);
172 ret |= __copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t)); 172 ret |= copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t));
173 sigset_from_compat(&new_ka.sa.sa_mask, &set32); 173 sigset_from_compat(&new_ka.sa.sa_mask, &set32);
174 ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); 174 ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags);
175 ret |= __get_user(u_restorer, &act->sa_restorer); 175 ret |= get_user(u_restorer, &act->sa_restorer);
176 new_ka.sa.sa_restorer = compat_ptr(u_restorer); 176 new_ka.sa.sa_restorer = compat_ptr(u_restorer);
177 if (ret) 177 if (ret)
178 return -EFAULT; 178 return -EFAULT;
@@ -183,9 +183,9 @@ COMPAT_SYSCALL_DEFINE5(rt_sigaction, int, sig,
183 if (!ret && oact) { 183 if (!ret && oact) {
184 sigset_to_compat(&set32, &old_ka.sa.sa_mask); 184 sigset_to_compat(&set32, &old_ka.sa.sa_mask);
185 ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler); 185 ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler);
186 ret |= __copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t)); 186 ret |= copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t));
187 ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); 187 ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags);
188 ret |= __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer); 188 ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer);
189 if (ret) 189 if (ret)
190 ret = -EFAULT; 190 ret = -EFAULT;
191 } 191 }
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index d2b59441ebdd..96399646570a 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -234,6 +234,11 @@ int pud_huge(pud_t pud)
234 return 0; 234 return 0;
235} 235}
236 236
237int pmd_huge_support(void)
238{
239 return 0;
240}
241
237struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, 242struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
238 pmd_t *pmd, int write) 243 pmd_t *pmd, int write)
239{ 244{
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c
index e514899e1100..0cb3bbaa580c 100644
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -166,6 +166,11 @@ int pud_huge(pud_t pud)
166 return !!(pud_val(pud) & _PAGE_HUGE_PAGE); 166 return !!(pud_val(pud) & _PAGE_HUGE_PAGE);
167} 167}
168 168
169int pmd_huge_support(void)
170{
171 return 1;
172}
173
169struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, 174struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
170 pmd_t *pmd, int write) 175 pmd_t *pmd, int write)
171{ 176{
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 8d16befdec88..3d1999458709 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -315,21 +315,6 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
315 return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); 315 return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);
316} 316}
317 317
318static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
319{
320 return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
321}
322
323static inline int pte_swp_soft_dirty(pte_t pte)
324{
325 return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
326}
327
328static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
329{
330 return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
331}
332
333static inline pte_t pte_file_clear_soft_dirty(pte_t pte) 318static inline pte_t pte_file_clear_soft_dirty(pte_t pte)
334{ 319{
335 return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); 320 return pte_clear_flags(pte, _PAGE_SOFT_DIRTY);
@@ -446,6 +431,7 @@ pte_t *populate_extra_pte(unsigned long vaddr);
446 431
447#ifndef __ASSEMBLY__ 432#ifndef __ASSEMBLY__
448#include <linux/mm_types.h> 433#include <linux/mm_types.h>
434#include <linux/mmdebug.h>
449#include <linux/log2.h> 435#include <linux/log2.h>
450 436
451static inline int pte_none(pte_t pte) 437static inline int pte_none(pte_t pte)
@@ -864,6 +850,24 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
864{ 850{
865} 851}
866 852
853static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
854{
855 VM_BUG_ON(pte_present(pte));
856 return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
857}
858
859static inline int pte_swp_soft_dirty(pte_t pte)
860{
861 VM_BUG_ON(pte_present(pte));
862 return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
863}
864
865static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
866{
867 VM_BUG_ON(pte_present(pte));
868 return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
869}
870
867#include <asm-generic/pgtable.h> 871#include <asm-generic/pgtable.h>
868#endif /* __ASSEMBLY__ */ 872#endif /* __ASSEMBLY__ */
869 873
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index f4843e031131..0ecac257fb26 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -75,6 +75,9 @@
75 * with swap entry format. On x86 bits 6 and 7 are *not* involved 75 * with swap entry format. On x86 bits 6 and 7 are *not* involved
76 * into swap entry computation, but bit 6 is used for nonlinear 76 * into swap entry computation, but bit 6 is used for nonlinear
77 * file mapping, so we borrow bit 7 for soft dirty tracking. 77 * file mapping, so we borrow bit 7 for soft dirty tracking.
78 *
79 * Please note that this bit must be treated as swap dirty page
80 * mark if and only if the PTE has present bit clear!
78 */ 81 */
79#ifdef CONFIG_MEM_SOFT_DIRTY 82#ifdef CONFIG_MEM_SOFT_DIRTY
80#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE 83#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE
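
The comment added above states the key constraint: _PAGE_SWP_SOFT_DIRTY reuses a bit (_PAGE_PSE) whose meaning is completely different for present PTEs, which is why the relocated pte_swp_*_soft_dirty() helpers in pgtable.h now assert !pte_present(). A rough user-space illustration of the aliasing and of the rule (bit values are simplified stand-ins, not the exact x86 layout):

#include <assert.h>
#include <stdio.h>

#define PAGE_PRESENT            0x001UL
#define PAGE_PSE                0x080UL         /* "huge page" when present */
#define PAGE_SWP_SOFT_DIRTY     PAGE_PSE        /* "soft dirty" when swapped out */

static unsigned long swp_mksoft_dirty(unsigned long pte)
{
        assert(!(pte & PAGE_PRESENT));          /* mirrors the VM_BUG_ON() */
        return pte | PAGE_SWP_SOFT_DIRTY;
}

int main(void)
{
        unsigned long swap_pte = 0x1234UL << 8; /* fake swap entry, not present */

        printf("swap pte with soft-dirty mark: 0x%lx\n", swp_mksoft_dirty(swap_pte));
        return 0;
}
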
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index cf512003e663..e6d90babc245 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -62,6 +62,7 @@ static inline void __flush_tlb_all(void)
62 62
63static inline void __flush_tlb_one(unsigned long addr) 63static inline void __flush_tlb_one(unsigned long addr)
64{ 64{
65 count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
65 __flush_tlb_single(addr); 66 __flush_tlb_single(addr);
66} 67}
67 68
@@ -84,14 +85,38 @@ static inline void __flush_tlb_one(unsigned long addr)
84 85
85#ifndef CONFIG_SMP 86#ifndef CONFIG_SMP
86 87
87#define flush_tlb() __flush_tlb() 88/* "_up" is for UniProcessor.
88#define flush_tlb_all() __flush_tlb_all() 89 *
89#define local_flush_tlb() __flush_tlb() 90 * This is a helper for other header functions. *Not* intended to be called
91 * directly. All global TLB flushes need to either call this, or to bump the
92 * vm statistics themselves.
93 */
94static inline void __flush_tlb_up(void)
95{
96 count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
97 __flush_tlb();
98}
99
100static inline void flush_tlb_all(void)
101{
102 count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
103 __flush_tlb_all();
104}
105
106static inline void flush_tlb(void)
107{
108 __flush_tlb_up();
109}
110
111static inline void local_flush_tlb(void)
112{
113 __flush_tlb_up();
114}
90 115
91static inline void flush_tlb_mm(struct mm_struct *mm) 116static inline void flush_tlb_mm(struct mm_struct *mm)
92{ 117{
93 if (mm == current->active_mm) 118 if (mm == current->active_mm)
94 __flush_tlb(); 119 __flush_tlb_up();
95} 120}
96 121
97static inline void flush_tlb_page(struct vm_area_struct *vma, 122static inline void flush_tlb_page(struct vm_area_struct *vma,
@@ -105,14 +130,14 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
105 unsigned long start, unsigned long end) 130 unsigned long start, unsigned long end)
106{ 131{
107 if (vma->vm_mm == current->active_mm) 132 if (vma->vm_mm == current->active_mm)
108 __flush_tlb(); 133 __flush_tlb_up();
109} 134}
110 135
111static inline void flush_tlb_mm_range(struct mm_struct *mm, 136static inline void flush_tlb_mm_range(struct mm_struct *mm,
112 unsigned long start, unsigned long end, unsigned long vmflag) 137 unsigned long start, unsigned long end, unsigned long vmflag)
113{ 138{
114 if (mm == current->active_mm) 139 if (mm == current->active_mm)
115 __flush_tlb(); 140 __flush_tlb_up();
116} 141}
117 142
118static inline void native_flush_tlb_others(const struct cpumask *cpumask, 143static inline void native_flush_tlb_others(const struct cpumask *cpumask,
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index d4cdfa67509e..ce2d0a2c3e4f 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -683,6 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
683 } 683 }
684 684
685 /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ 685 /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
686 count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
686 __flush_tlb(); 687 __flush_tlb();
687 688
688 /* Save MTRR state */ 689 /* Save MTRR state */
@@ -696,6 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
696static void post_set(void) __releases(set_atomicity_lock) 697static void post_set(void) __releases(set_atomicity_lock)
697{ 698{
698 /* Flush TLBs (no need to flush caches - they are disabled) */ 699 /* Flush TLBs (no need to flush caches - they are disabled) */
700 count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
699 __flush_tlb(); 701 __flush_tlb();
700 702
701 /* Intel (P6) standard MTRRs */ 703 /* Intel (P6) standard MTRRs */
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 7e73e8c69096..9d980d88b747 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -59,6 +59,10 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
59 return NULL; 59 return NULL;
60} 60}
61 61
62int pmd_huge_support(void)
63{
64 return 0;
65}
62#else 66#else
63 67
64struct page * 68struct page *
@@ -77,6 +81,10 @@ int pud_huge(pud_t pud)
77 return !!(pud_val(pud) & _PAGE_PSE); 81 return !!(pud_val(pud) & _PAGE_PSE);
78} 82}
79 83
84int pmd_huge_support(void)
85{
86 return 1;
87}
80#endif 88#endif
81 89
82/* x86_64 also uses this file */ 90/* x86_64 also uses this file */
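
Each architecture now answers, via pmd_huge_support(), whether it implements huge pages at the PMD level. The consumer is not part of this diff; presumably generic code (for instance the hugepage-migration path) uses it to gate PMD-sized operations, roughly like this sketch:

#include <stdbool.h>
#include <stdio.h>

static int pmd_huge_support(void)               /* stand-in for the arch hook */
{
        return 1;                               /* e.g. x86 with PSE, s390, tile */
}

static bool can_handle_pmd_hugepages(void)
{
        return pmd_huge_support() != 0;
}

int main(void)
{
        printf("PMD hugepage handling %s\n",
               can_handle_pmd_hugepages() ? "available" : "unavailable");
        return 0;
}
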
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 282375f13c7e..ae699b3bbac8 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -103,6 +103,7 @@ static void flush_tlb_func(void *info)
103 if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) 103 if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
104 return; 104 return;
105 105
106 count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
106 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { 107 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
107 if (f->flush_end == TLB_FLUSH_ALL) 108 if (f->flush_end == TLB_FLUSH_ALL)
108 local_flush_tlb(); 109 local_flush_tlb();
@@ -130,6 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
130 info.flush_start = start; 131 info.flush_start = start;
131 info.flush_end = end; 132 info.flush_end = end;
132 133
134 count_vm_event(NR_TLB_REMOTE_FLUSH);
133 if (is_uv_system()) { 135 if (is_uv_system()) {
134 unsigned int cpu; 136 unsigned int cpu;
135 137
@@ -149,6 +151,7 @@ void flush_tlb_current_task(void)
149 151
150 preempt_disable(); 152 preempt_disable();
151 153
154 count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
152 local_flush_tlb(); 155 local_flush_tlb();
153 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) 156 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
154 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); 157 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
@@ -211,16 +214,19 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
211 act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm; 214 act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;
212 215
213 /* tlb_flushall_shift is on balance point, details in commit log */ 216 /* tlb_flushall_shift is on balance point, details in commit log */
214 if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) 217 if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) {
218 count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
215 local_flush_tlb(); 219 local_flush_tlb();
216 else { 220 } else {
217 if (has_large_page(mm, start, end)) { 221 if (has_large_page(mm, start, end)) {
218 local_flush_tlb(); 222 local_flush_tlb();
219 goto flush_all; 223 goto flush_all;
220 } 224 }
221 /* flush range by one by one 'invlpg' */ 225 /* flush range by one by one 'invlpg' */
222 for (addr = start; addr < end; addr += PAGE_SIZE) 226 for (addr = start; addr < end; addr += PAGE_SIZE) {
227 count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
223 __flush_tlb_single(addr); 228 __flush_tlb_single(addr);
229 }
224 230
225 if (cpumask_any_but(mm_cpumask(mm), 231 if (cpumask_any_but(mm_cpumask(mm),
226 smp_processor_id()) < nr_cpu_ids) 232 smp_processor_id()) < nr_cpu_ids)
@@ -256,6 +262,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
256 262
257static void do_flush_tlb_all(void *info) 263static void do_flush_tlb_all(void *info)
258{ 264{
265 count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
259 __flush_tlb_all(); 266 __flush_tlb_all();
260 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) 267 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
261 leave_mm(smp_processor_id()); 268 leave_mm(smp_processor_id());
@@ -263,6 +270,7 @@ static void do_flush_tlb_all(void *info)
263 270
264void flush_tlb_all(void) 271void flush_tlb_all(void)
265{ 272{
273 count_vm_event(NR_TLB_REMOTE_FLUSH);
266 on_each_cpu(do_flush_tlb_all, NULL, 1); 274 on_each_cpu(do_flush_tlb_all, NULL, 1);
267} 275}
268 276
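
All of the count_vm_event(NR_TLB_*) calls added to the x86 flush paths surface as ordinary event counters, so the TLB-flush behaviour of a workload can be observed from user space via /proc/vmstat (with CONFIG_VM_EVENT_COUNTERS). The exact counter names are not spelled out in this diff, so the sketch below simply filters vmstat lines containing "tlb":

#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[128];
        FILE *f = fopen("/proc/vmstat", "r");

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))
                if (strstr(line, "tlb"))
                        fputs(line, stdout);
        fclose(f);
        return 0;
}
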
diff --git a/block/Kconfig b/block/Kconfig
index a7e40a7c8214..7f38e40fee08 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -99,6 +99,12 @@ config BLK_DEV_THROTTLING
99 99
100 See Documentation/cgroups/blkio-controller.txt for more information. 100 See Documentation/cgroups/blkio-controller.txt for more information.
101 101
102config CMDLINE_PARSER
103 bool "Block device command line partition parser"
104 default n
105 ---help---
106	  Parse the kernel command line to get block device partition information.
107
102menu "Partition Types" 108menu "Partition Types"
103 109
104source "block/partitions/Kconfig" 110source "block/partitions/Kconfig"
diff --git a/block/Makefile b/block/Makefile
index 39b76ba66ffd..4fa4be544ece 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -18,3 +18,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
18 18
19obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o 19obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
20obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o 20obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
21obj-$(CONFIG_CMDLINE_PARSER) += cmdline-parser.o
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 4464c823cff2..46cd7bd18b34 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -367,7 +367,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
367 if (!icq) 367 if (!icq)
368 return NULL; 368 return NULL;
369 369
370 if (radix_tree_preload(gfp_mask) < 0) { 370 if (radix_tree_maybe_preload(gfp_mask) < 0) {
371 kmem_cache_free(et->icq_cache, icq); 371 kmem_cache_free(et->icq_cache, icq);
372 return NULL; 372 return NULL;
373 } 373 }
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 5efc5a647183..3aa5b195f4dd 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -29,7 +29,7 @@ queue_var_store(unsigned long *var, const char *page, size_t count)
29 int err; 29 int err;
30 unsigned long v; 30 unsigned long v;
31 31
32 err = strict_strtoul(page, 10, &v); 32 err = kstrtoul(page, 10, &v);
33 if (err || v > UINT_MAX) 33 if (err || v > UINT_MAX)
34 return -EINVAL; 34 return -EINVAL;
35 35
diff --git a/block/cmdline-parser.c b/block/cmdline-parser.c
new file mode 100644
index 000000000000..cc2637f8674e
--- /dev/null
+++ b/block/cmdline-parser.c
@@ -0,0 +1,250 @@
1/*
2 * Parse command line, get partition information
3 *
4 * Written by Cai Zhiyong <caizhiyong@huawei.com>
5 *
6 */
7#include <linux/buffer_head.h>
8#include <linux/module.h>
9#include <linux/cmdline-parser.h>
10
11static int parse_subpart(struct cmdline_subpart **subpart, char *partdef)
12{
13 int ret = 0;
14 struct cmdline_subpart *new_subpart;
15
16 *subpart = NULL;
17
18 new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL);
19 if (!new_subpart)
20 return -ENOMEM;
21
22 if (*partdef == '-') {
23 new_subpart->size = (sector_t)(~0ULL);
24 partdef++;
25 } else {
26 new_subpart->size = (sector_t)memparse(partdef, &partdef);
27 if (new_subpart->size < (sector_t)PAGE_SIZE) {
28 pr_warn("cmdline partition size is invalid.");
29 ret = -EINVAL;
30 goto fail;
31 }
32 }
33
34 if (*partdef == '@') {
35 partdef++;
36 new_subpart->from = (sector_t)memparse(partdef, &partdef);
37 } else {
38 new_subpart->from = (sector_t)(~0ULL);
39 }
40
41 if (*partdef == '(') {
42 int length;
43 char *next = strchr(++partdef, ')');
44
45 if (!next) {
46 pr_warn("cmdline partition format is invalid.");
47 ret = -EINVAL;
48 goto fail;
49 }
50
51 length = min_t(int, next - partdef,
52 sizeof(new_subpart->name) - 1);
53 strncpy(new_subpart->name, partdef, length);
54 new_subpart->name[length] = '\0';
55
56 partdef = ++next;
57 } else
58 new_subpart->name[0] = '\0';
59
60 new_subpart->flags = 0;
61
62 if (!strncmp(partdef, "ro", 2)) {
63 new_subpart->flags |= PF_RDONLY;
64 partdef += 2;
65 }
66
67 if (!strncmp(partdef, "lk", 2)) {
68 new_subpart->flags |= PF_POWERUP_LOCK;
69 partdef += 2;
70 }
71
72 *subpart = new_subpart;
73 return 0;
74fail:
75 kfree(new_subpart);
76 return ret;
77}
78
79static void free_subpart(struct cmdline_parts *parts)
80{
81 struct cmdline_subpart *subpart;
82
83 while (parts->subpart) {
84 subpart = parts->subpart;
85 parts->subpart = subpart->next_subpart;
86 kfree(subpart);
87 }
88}
89
90static int parse_parts(struct cmdline_parts **parts, const char *bdevdef)
91{
92 int ret = -EINVAL;
93 char *next;
94 int length;
95 struct cmdline_subpart **next_subpart;
96 struct cmdline_parts *newparts;
97 char buf[BDEVNAME_SIZE + 32 + 4];
98
99 *parts = NULL;
100
101 newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL);
102 if (!newparts)
103 return -ENOMEM;
104
105 next = strchr(bdevdef, ':');
106 if (!next) {
107 pr_warn("cmdline partition has no block device.");
108 goto fail;
109 }
110
111 length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1);
112 strncpy(newparts->name, bdevdef, length);
113 newparts->name[length] = '\0';
114 newparts->nr_subparts = 0;
115
116 next_subpart = &newparts->subpart;
117
118 while (next && *(++next)) {
119 bdevdef = next;
120 next = strchr(bdevdef, ',');
121
122 length = (!next) ? (sizeof(buf) - 1) :
123 min_t(int, next - bdevdef, sizeof(buf) - 1);
124
125 strncpy(buf, bdevdef, length);
126 buf[length] = '\0';
127
128 ret = parse_subpart(next_subpart, buf);
129 if (ret)
130 goto fail;
131
132 newparts->nr_subparts++;
133 next_subpart = &(*next_subpart)->next_subpart;
134 }
135
136 if (!newparts->subpart) {
137 pr_warn("cmdline partition has no valid partition.");
138 ret = -EINVAL;
139 goto fail;
140 }
141
142 *parts = newparts;
143
144 return 0;
145fail:
146 free_subpart(newparts);
147 kfree(newparts);
148 return ret;
149}
150
151void cmdline_parts_free(struct cmdline_parts **parts)
152{
153 struct cmdline_parts *next_parts;
154
155 while (*parts) {
156 next_parts = (*parts)->next_parts;
157 free_subpart(*parts);
158 kfree(*parts);
159 *parts = next_parts;
160 }
161}
162
163int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline)
164{
165 int ret;
166 char *buf;
167 char *pbuf;
168 char *next;
169 struct cmdline_parts **next_parts;
170
171 *parts = NULL;
172
173 next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL);
174 if (!buf)
175 return -ENOMEM;
176
177 next_parts = parts;
178
179 while (next && *pbuf) {
180 next = strchr(pbuf, ';');
181 if (next)
182 *next = '\0';
183
184 ret = parse_parts(next_parts, pbuf);
185 if (ret)
186 goto fail;
187
188 if (next)
189 pbuf = ++next;
190
191 next_parts = &(*next_parts)->next_parts;
192 }
193
194 if (!*parts) {
195 pr_warn("cmdline partition has no valid partition.");
196 ret = -EINVAL;
197 goto fail;
198 }
199
200 ret = 0;
201done:
202 kfree(buf);
203 return ret;
204
205fail:
206 cmdline_parts_free(parts);
207 goto done;
208}
209
210struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts,
211 const char *bdev)
212{
213 while (parts && strncmp(bdev, parts->name, sizeof(parts->name)))
214 parts = parts->next_parts;
215 return parts;
216}
217
218/*
219 * add_part()
220 * 0 on success.
221 * 1 if no more partitions can be added.
222 */
223void cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size,
224 int slot,
225 int (*add_part)(int, struct cmdline_subpart *, void *),
226 void *param)
227
228{
229 sector_t from = 0;
230 struct cmdline_subpart *subpart;
231
232 for (subpart = parts->subpart; subpart;
233 subpart = subpart->next_subpart, slot++) {
234 if (subpart->from == (sector_t)(~0ULL))
235 subpart->from = from;
236 else
237 from = subpart->from;
238
239 if (from >= disk_size)
240 break;
241
242 if (subpart->size > (disk_size - from))
243 subpart->size = disk_size - from;
244
245 from += subpart->size;
246
247 if (add_part(slot, subpart, param))
248 break;
249 }
250}
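
parse_subpart() above accepts sub-partition definitions of the form <size>[@<from>](<name>) followed by the optional "ro" and "lk" flags, with sizes and offsets parsed memparse-style. A cut-down user-space sketch of that grammar for a single sub-partition (only K/M/G suffixes and the "ro" flag are handled; "-" for remaining space and "lk" are left out):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Simplified memparse(): number with optional K/M/G suffix. */
static unsigned long long memparse_ish(const char *s, char **end)
{
        unsigned long long v = strtoull(s, end, 0);

        switch (**end) {
        case 'G': v <<= 10;     /* fall through */
        case 'M': v <<= 10;     /* fall through */
        case 'K': v <<= 10; (*end)++; break;
        }
        return v;
}

int main(void)
{
        char def[] = "256M@0x100000(kernel)ro";
        char *p = def, name[32] = "";
        unsigned long long size, from = 0;
        int rdonly = 0;

        size = memparse_ish(p, &p);
        if (*p == '@')
                from = memparse_ish(p + 1, &p);
        if (*p == '(') {
                char *close = strchr(++p, ')');

                if (close) {
                        snprintf(name, sizeof(name), "%.*s", (int)(close - p), p);
                        p = close + 1;
                }
        }
        if (!strncmp(p, "ro", 2))
                rdonly = 1;

        printf("%s: size=%llu from=%llu ro=%d\n", name, size, from, rdonly);
        return 0;
}
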
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 7e5d474dc6ba..fbd5a67cb773 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -70,7 +70,7 @@ static int compat_hdio_getgeo(struct gendisk *disk, struct block_device *bdev,
70 return ret; 70 return ret;
71 71
72 ret = copy_to_user(ugeo, &geo, 4); 72 ret = copy_to_user(ugeo, &geo, 4);
73 ret |= __put_user(geo.start, &ugeo->start); 73 ret |= put_user(geo.start, &ugeo->start);
74 if (ret) 74 if (ret)
75 ret = -EFAULT; 75 ret = -EFAULT;
76 76
diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig
index 4cebb2f0d2f4..87a32086535d 100644
--- a/block/partitions/Kconfig
+++ b/block/partitions/Kconfig
@@ -260,3 +260,10 @@ config SYSV68_PARTITION
260 partition table format used by Motorola Delta machines (using 260 partition table format used by Motorola Delta machines (using
261 sysv68). 261 sysv68).
262 Otherwise, say N. 262 Otherwise, say N.
263
264config CMDLINE_PARTITION
265 bool "Command line partition support" if PARTITION_ADVANCED
266 select CMDLINE_PARSER
267 help
268	  Say Y here if you want to read the partition table from the kernel
	  command line (bootargs).
269 The format for the command line is just like mtdparts.
diff --git a/block/partitions/Makefile b/block/partitions/Makefile
index 2be4d7ba4e3a..37a95270503c 100644
--- a/block/partitions/Makefile
+++ b/block/partitions/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_ACORN_PARTITION) += acorn.o
8obj-$(CONFIG_AMIGA_PARTITION) += amiga.o 8obj-$(CONFIG_AMIGA_PARTITION) += amiga.o
9obj-$(CONFIG_ATARI_PARTITION) += atari.o 9obj-$(CONFIG_ATARI_PARTITION) += atari.o
10obj-$(CONFIG_AIX_PARTITION) += aix.o 10obj-$(CONFIG_AIX_PARTITION) += aix.o
11obj-$(CONFIG_CMDLINE_PARTITION) += cmdline.o
11obj-$(CONFIG_MAC_PARTITION) += mac.o 12obj-$(CONFIG_MAC_PARTITION) += mac.o
12obj-$(CONFIG_LDM_PARTITION) += ldm.o 13obj-$(CONFIG_LDM_PARTITION) += ldm.o
13obj-$(CONFIG_MSDOS_PARTITION) += msdos.o 14obj-$(CONFIG_MSDOS_PARTITION) += msdos.o
diff --git a/block/partitions/check.c b/block/partitions/check.c
index 19ba207ea7d1..9ac1df74f699 100644
--- a/block/partitions/check.c
+++ b/block/partitions/check.c
@@ -34,6 +34,7 @@
34#include "efi.h" 34#include "efi.h"
35#include "karma.h" 35#include "karma.h"
36#include "sysv68.h" 36#include "sysv68.h"
37#include "cmdline.h"
37 38
38int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ 39int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/
39 40
@@ -65,6 +66,9 @@ static int (*check_part[])(struct parsed_partitions *) = {
65 adfspart_check_ADFS, 66 adfspart_check_ADFS,
66#endif 67#endif
67 68
69#ifdef CONFIG_CMDLINE_PARTITION
70 cmdline_partition,
71#endif
68#ifdef CONFIG_EFI_PARTITION 72#ifdef CONFIG_EFI_PARTITION
69 efi_partition, /* this must come before msdos */ 73 efi_partition, /* this must come before msdos */
70#endif 74#endif
diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c
new file mode 100644
index 000000000000..56cf4ffad51e
--- /dev/null
+++ b/block/partitions/cmdline.c
@@ -0,0 +1,99 @@
1/*
2 * Copyright (C) 2013 HUAWEI
3 * Author: Cai Zhiyong <caizhiyong@huawei.com>
4 *
5 * Read block device partition table from command line.
 6 * The partitions are intended for fixed block devices (such as eMMC) in
 7 * embedded devices. No MBR is needed, which saves storage space. The
 8 * bootloader can easily access data on the block device by absolute address.
 9 * Users can easily change the partition layout.
10 *
11 * The format for the command line is just like mtdparts.
12 *
 13 * For detailed configuration, see "Documentation/block/cmdline-partition.txt"
14 *
15 */
16
17#include <linux/cmdline-parser.h>
18
19#include "check.h"
20#include "cmdline.h"
21
22static char *cmdline;
23static struct cmdline_parts *bdev_parts;
24
25static int add_part(int slot, struct cmdline_subpart *subpart, void *param)
26{
27 int label_min;
28 struct partition_meta_info *info;
29 char tmp[sizeof(info->volname) + 4];
30 struct parsed_partitions *state = (struct parsed_partitions *)param;
31
32 if (slot >= state->limit)
33 return 1;
34
35 put_partition(state, slot, subpart->from >> 9,
36 subpart->size >> 9);
37
38 info = &state->parts[slot].info;
39
40 label_min = min_t(int, sizeof(info->volname) - 1,
41 sizeof(subpart->name));
42 strncpy(info->volname, subpart->name, label_min);
43 info->volname[label_min] = '\0';
44
45 snprintf(tmp, sizeof(tmp), "(%s)", info->volname);
46 strlcat(state->pp_buf, tmp, PAGE_SIZE);
47
48 state->parts[slot].has_info = true;
49
50 return 0;
51}
52
53static int __init cmdline_parts_setup(char *s)
54{
55 cmdline = s;
56 return 1;
57}
58__setup("blkdevparts=", cmdline_parts_setup);
59
60/*
61 * Purpose: allocate cmdline partitions.
62 * Returns:
63 * -1 if unable to read the partition table
64 * 0 if this isn't our partition table
65 * 1 if successful
66 */
67int cmdline_partition(struct parsed_partitions *state)
68{
69 sector_t disk_size;
70 char bdev[BDEVNAME_SIZE];
71 struct cmdline_parts *parts;
72
73 if (cmdline) {
74 if (bdev_parts)
75 cmdline_parts_free(&bdev_parts);
76
77 if (cmdline_parts_parse(&bdev_parts, cmdline)) {
78 cmdline = NULL;
79 return -1;
80 }
81 cmdline = NULL;
82 }
83
84 if (!bdev_parts)
85 return 0;
86
87 bdevname(state->bdev, bdev);
88 parts = cmdline_parts_find(bdev_parts, bdev);
89 if (!parts)
90 return 0;
91
92 disk_size = get_capacity(state->bdev->bd_disk) << 9;
93
94 cmdline_parts_set(parts, disk_size, 1, add_part, (void *)state);
95
96 strlcat(state->pp_buf, "\n", PAGE_SIZE);
97
98 return 1;
99}
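
Taken together, the parser is driven by a boot parameter of the form blkdevparts=<disk>:<partdef>[,<partdef>]...[;<disk>:...]. Purely for illustration (device names and sizes invented here), a line such as

    blkdevparts=mmcblk0:1G(data0),1G(data1),-;mmcblk0boot0:1m(boot),-(kernel)

would yield three partitions on mmcblk0 (the final "-" taking the remaining space) and two on mmcblk0boot0; Documentation/block/cmdline-partition.txt is the authoritative description of the syntax.
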
diff --git a/block/partitions/cmdline.h b/block/partitions/cmdline.h
new file mode 100644
index 000000000000..26e0f8da1414
--- /dev/null
+++ b/block/partitions/cmdline.h
@@ -0,0 +1,2 @@
1
2int cmdline_partition(struct parsed_partitions *state);
diff --git a/block/partitions/efi.c b/block/partitions/efi.c
index c85fc895ecdb..1a5ec9a03c00 100644
--- a/block/partitions/efi.c
+++ b/block/partitions/efi.c
@@ -25,6 +25,9 @@
25 * TODO: 25 * TODO:
26 * 26 *
27 * Changelog: 27 * Changelog:
28 * Mon August 5th, 2013 Davidlohr Bueso <davidlohr@hp.com>
29 * - detect hybrid MBRs, tighter pMBR checking & cleanups.
30 *
28 * Mon Nov 09 2004 Matt Domsch <Matt_Domsch@dell.com> 31 * Mon Nov 09 2004 Matt Domsch <Matt_Domsch@dell.com>
29 * - test for valid PMBR and valid PGPT before ever reading 32 * - test for valid PMBR and valid PGPT before ever reading
30 * AGPT, allow override with 'gpt' kernel command line option. 33 * AGPT, allow override with 'gpt' kernel command line option.
@@ -149,34 +152,80 @@ static u64 last_lba(struct block_device *bdev)
149 bdev_logical_block_size(bdev)) - 1ULL; 152 bdev_logical_block_size(bdev)) - 1ULL;
150} 153}
151 154
152static inline int 155static inline int pmbr_part_valid(gpt_mbr_record *part)
153pmbr_part_valid(struct partition *part)
154{ 156{
155 if (part->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT && 157 if (part->os_type != EFI_PMBR_OSTYPE_EFI_GPT)
156 le32_to_cpu(part->start_sect) == 1UL) 158 goto invalid;
157 return 1; 159
158 return 0; 160 /* set to 0x00000001 (i.e., the LBA of the GPT Partition Header) */
161 if (le32_to_cpu(part->starting_lba) != GPT_PRIMARY_PARTITION_TABLE_LBA)
162 goto invalid;
163
164 return GPT_MBR_PROTECTIVE;
165invalid:
166 return 0;
159} 167}
160 168
161/** 169/**
162 * is_pmbr_valid(): test Protective MBR for validity 170 * is_pmbr_valid(): test Protective MBR for validity
163 * @mbr: pointer to a legacy mbr structure 171 * @mbr: pointer to a legacy mbr structure
172 * @total_sectors: amount of sectors in the device
164 * 173 *
165 * Description: Returns 1 if PMBR is valid, 0 otherwise. 174 * Description: Checks for a valid protective or hybrid
166 * Validity depends on two things: 175 * master boot record (MBR). The validity of a pMBR depends
176 * on all of the following properties:
167 * 1) MSDOS signature is in the last two bytes of the MBR 177 * 1) MSDOS signature is in the last two bytes of the MBR
168 * 2) One partition of type 0xEE is found 178 * 2) One partition of type 0xEE is found
179 *
180 * In addition, a hybrid MBR will have up to three additional
181 * primary partitions, which point to the same space that's
182 * marked out by up to three GPT partitions.
183 *
184 * Returns 0 upon invalid MBR, or GPT_MBR_PROTECTIVE or
185 * GPT_MBR_HYBRID depending on the device layout.
169 */ 186 */
170static int 187static int is_pmbr_valid(legacy_mbr *mbr, sector_t total_sectors)
171is_pmbr_valid(legacy_mbr *mbr)
172{ 188{
173 int i; 189 int i, part = 0, ret = 0; /* invalid by default */
190
174 if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE) 191 if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE)
175 return 0; 192 goto done;
193
194 for (i = 0; i < 4; i++) {
195 ret = pmbr_part_valid(&mbr->partition_record[i]);
196 if (ret == GPT_MBR_PROTECTIVE) {
197 part = i;
198 /*
199 * Ok, we at least know that there's a protective MBR,
200 * now check if there are other partition types for
201 * hybrid MBR.
202 */
203 goto check_hybrid;
204 }
205 }
206
207 if (ret != GPT_MBR_PROTECTIVE)
208 goto done;
209check_hybrid:
176 for (i = 0; i < 4; i++) 210 for (i = 0; i < 4; i++)
177 if (pmbr_part_valid(&mbr->partition_record[i])) 211 if ((mbr->partition_record[i].os_type !=
178 return 1; 212 EFI_PMBR_OSTYPE_EFI_GPT) &&
179 return 0; 213 (mbr->partition_record[i].os_type != 0x00))
214 ret = GPT_MBR_HYBRID;
215
216 /*
217 * Protective MBRs take up the lesser of the whole disk
218 * or 2 TiB (32bit LBA), ignoring the rest of the disk.
219 *
220 * Hybrid MBRs do not necessarily comply with this.
221 */
222 if (ret == GPT_MBR_PROTECTIVE) {
223 if (le32_to_cpu(mbr->partition_record[part].size_in_lba) !=
224 min((uint32_t) total_sectors - 1, 0xFFFFFFFF))
225 ret = 0;
226 }
227done:
228 return ret;
180} 229}
181 230
182/** 231/**
@@ -243,8 +292,7 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
243 return NULL; 292 return NULL;
244 293
245 if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba), 294 if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba),
246 (u8 *) pte, 295 (u8 *) pte, count) < count) {
247 count) < count) {
248 kfree(pte); 296 kfree(pte);
249 pte=NULL; 297 pte=NULL;
250 return NULL; 298 return NULL;
@@ -364,7 +412,12 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
364 (unsigned long long)lastlba); 412 (unsigned long long)lastlba);
365 goto fail; 413 goto fail;
366 } 414 }
367 415 if (le64_to_cpu((*gpt)->last_usable_lba) < le64_to_cpu((*gpt)->first_usable_lba)) {
416 pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n",
417 (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba),
418 (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba));
419 goto fail;
420 }
368 /* Check that sizeof_partition_entry has the correct value */ 421 /* Check that sizeof_partition_entry has the correct value */
369 if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) { 422 if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) {
370 pr_debug("GUID Partitition Entry Size check failed.\n"); 423 pr_debug("GUID Partitition Entry Size check failed.\n");
@@ -429,44 +482,42 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
429 if (!pgpt || !agpt) 482 if (!pgpt || !agpt)
430 return; 483 return;
431 if (le64_to_cpu(pgpt->my_lba) != le64_to_cpu(agpt->alternate_lba)) { 484 if (le64_to_cpu(pgpt->my_lba) != le64_to_cpu(agpt->alternate_lba)) {
432 printk(KERN_WARNING 485 pr_warn("GPT:Primary header LBA != Alt. header alternate_lba\n");
433 "GPT:Primary header LBA != Alt. header alternate_lba\n"); 486 pr_warn("GPT:%lld != %lld\n",
434 printk(KERN_WARNING "GPT:%lld != %lld\n",
435 (unsigned long long)le64_to_cpu(pgpt->my_lba), 487 (unsigned long long)le64_to_cpu(pgpt->my_lba),
436 (unsigned long long)le64_to_cpu(agpt->alternate_lba)); 488 (unsigned long long)le64_to_cpu(agpt->alternate_lba));
437 error_found++; 489 error_found++;
438 } 490 }
439 if (le64_to_cpu(pgpt->alternate_lba) != le64_to_cpu(agpt->my_lba)) { 491 if (le64_to_cpu(pgpt->alternate_lba) != le64_to_cpu(agpt->my_lba)) {
440 printk(KERN_WARNING 492 pr_warn("GPT:Primary header alternate_lba != Alt. header my_lba\n");
441 "GPT:Primary header alternate_lba != Alt. header my_lba\n"); 493 pr_warn("GPT:%lld != %lld\n",
442 printk(KERN_WARNING "GPT:%lld != %lld\n",
443 (unsigned long long)le64_to_cpu(pgpt->alternate_lba), 494 (unsigned long long)le64_to_cpu(pgpt->alternate_lba),
444 (unsigned long long)le64_to_cpu(agpt->my_lba)); 495 (unsigned long long)le64_to_cpu(agpt->my_lba));
445 error_found++; 496 error_found++;
446 } 497 }
447 if (le64_to_cpu(pgpt->first_usable_lba) != 498 if (le64_to_cpu(pgpt->first_usable_lba) !=
448 le64_to_cpu(agpt->first_usable_lba)) { 499 le64_to_cpu(agpt->first_usable_lba)) {
449 printk(KERN_WARNING "GPT:first_usable_lbas don't match.\n"); 500 pr_warn("GPT:first_usable_lbas don't match.\n");
450 printk(KERN_WARNING "GPT:%lld != %lld\n", 501 pr_warn("GPT:%lld != %lld\n",
451 (unsigned long long)le64_to_cpu(pgpt->first_usable_lba), 502 (unsigned long long)le64_to_cpu(pgpt->first_usable_lba),
452 (unsigned long long)le64_to_cpu(agpt->first_usable_lba)); 503 (unsigned long long)le64_to_cpu(agpt->first_usable_lba));
453 error_found++; 504 error_found++;
454 } 505 }
455 if (le64_to_cpu(pgpt->last_usable_lba) != 506 if (le64_to_cpu(pgpt->last_usable_lba) !=
456 le64_to_cpu(agpt->last_usable_lba)) { 507 le64_to_cpu(agpt->last_usable_lba)) {
457 printk(KERN_WARNING "GPT:last_usable_lbas don't match.\n"); 508 pr_warn("GPT:last_usable_lbas don't match.\n");
458 printk(KERN_WARNING "GPT:%lld != %lld\n", 509 pr_warn("GPT:%lld != %lld\n",
459 (unsigned long long)le64_to_cpu(pgpt->last_usable_lba), 510 (unsigned long long)le64_to_cpu(pgpt->last_usable_lba),
460 (unsigned long long)le64_to_cpu(agpt->last_usable_lba)); 511 (unsigned long long)le64_to_cpu(agpt->last_usable_lba));
461 error_found++; 512 error_found++;
462 } 513 }
463 if (efi_guidcmp(pgpt->disk_guid, agpt->disk_guid)) { 514 if (efi_guidcmp(pgpt->disk_guid, agpt->disk_guid)) {
464 printk(KERN_WARNING "GPT:disk_guids don't match.\n"); 515 pr_warn("GPT:disk_guids don't match.\n");
465 error_found++; 516 error_found++;
466 } 517 }
467 if (le32_to_cpu(pgpt->num_partition_entries) != 518 if (le32_to_cpu(pgpt->num_partition_entries) !=
468 le32_to_cpu(agpt->num_partition_entries)) { 519 le32_to_cpu(agpt->num_partition_entries)) {
469 printk(KERN_WARNING "GPT:num_partition_entries don't match: " 520 pr_warn("GPT:num_partition_entries don't match: "
470 "0x%x != 0x%x\n", 521 "0x%x != 0x%x\n",
471 le32_to_cpu(pgpt->num_partition_entries), 522 le32_to_cpu(pgpt->num_partition_entries),
472 le32_to_cpu(agpt->num_partition_entries)); 523 le32_to_cpu(agpt->num_partition_entries));
@@ -474,8 +525,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
474 } 525 }
475 if (le32_to_cpu(pgpt->sizeof_partition_entry) != 526 if (le32_to_cpu(pgpt->sizeof_partition_entry) !=
476 le32_to_cpu(agpt->sizeof_partition_entry)) { 527 le32_to_cpu(agpt->sizeof_partition_entry)) {
477 printk(KERN_WARNING 528 pr_warn("GPT:sizeof_partition_entry values don't match: "
478 "GPT:sizeof_partition_entry values don't match: "
479 "0x%x != 0x%x\n", 529 "0x%x != 0x%x\n",
480 le32_to_cpu(pgpt->sizeof_partition_entry), 530 le32_to_cpu(pgpt->sizeof_partition_entry),
481 le32_to_cpu(agpt->sizeof_partition_entry)); 531 le32_to_cpu(agpt->sizeof_partition_entry));
@@ -483,34 +533,30 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
483 } 533 }
484 if (le32_to_cpu(pgpt->partition_entry_array_crc32) != 534 if (le32_to_cpu(pgpt->partition_entry_array_crc32) !=
485 le32_to_cpu(agpt->partition_entry_array_crc32)) { 535 le32_to_cpu(agpt->partition_entry_array_crc32)) {
486 printk(KERN_WARNING 536 pr_warn("GPT:partition_entry_array_crc32 values don't match: "
487 "GPT:partition_entry_array_crc32 values don't match: "
488 "0x%x != 0x%x\n", 537 "0x%x != 0x%x\n",
489 le32_to_cpu(pgpt->partition_entry_array_crc32), 538 le32_to_cpu(pgpt->partition_entry_array_crc32),
490 le32_to_cpu(agpt->partition_entry_array_crc32)); 539 le32_to_cpu(agpt->partition_entry_array_crc32));
491 error_found++; 540 error_found++;
492 } 541 }
493 if (le64_to_cpu(pgpt->alternate_lba) != lastlba) { 542 if (le64_to_cpu(pgpt->alternate_lba) != lastlba) {
494 printk(KERN_WARNING 543 pr_warn("GPT:Primary header thinks Alt. header is not at the end of the disk.\n");
495 "GPT:Primary header thinks Alt. header is not at the end of the disk.\n"); 544 pr_warn("GPT:%lld != %lld\n",
496 printk(KERN_WARNING "GPT:%lld != %lld\n",
497 (unsigned long long)le64_to_cpu(pgpt->alternate_lba), 545 (unsigned long long)le64_to_cpu(pgpt->alternate_lba),
498 (unsigned long long)lastlba); 546 (unsigned long long)lastlba);
499 error_found++; 547 error_found++;
500 } 548 }
501 549
502 if (le64_to_cpu(agpt->my_lba) != lastlba) { 550 if (le64_to_cpu(agpt->my_lba) != lastlba) {
503 printk(KERN_WARNING 551 pr_warn("GPT:Alternate GPT header not at the end of the disk.\n");
504 "GPT:Alternate GPT header not at the end of the disk.\n"); 552 pr_warn("GPT:%lld != %lld\n",
505 printk(KERN_WARNING "GPT:%lld != %lld\n",
506 (unsigned long long)le64_to_cpu(agpt->my_lba), 553 (unsigned long long)le64_to_cpu(agpt->my_lba),
507 (unsigned long long)lastlba); 554 (unsigned long long)lastlba);
508 error_found++; 555 error_found++;
509 } 556 }
510 557
511 if (error_found) 558 if (error_found)
512 printk(KERN_WARNING 559 pr_warn("GPT: Use GNU Parted to correct GPT errors.\n");
513 "GPT: Use GNU Parted to correct GPT errors.\n");
514 return; 560 return;
515} 561}
516 562
@@ -536,6 +582,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
536 gpt_header *pgpt = NULL, *agpt = NULL; 582 gpt_header *pgpt = NULL, *agpt = NULL;
537 gpt_entry *pptes = NULL, *aptes = NULL; 583 gpt_entry *pptes = NULL, *aptes = NULL;
538 legacy_mbr *legacymbr; 584 legacy_mbr *legacymbr;
585 sector_t total_sectors = i_size_read(state->bdev->bd_inode) >> 9;
539 u64 lastlba; 586 u64 lastlba;
540 587
541 if (!ptes) 588 if (!ptes)
@@ -543,17 +590,22 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
543 590
544 lastlba = last_lba(state->bdev); 591 lastlba = last_lba(state->bdev);
545 if (!force_gpt) { 592 if (!force_gpt) {
546 /* This will be added to the EFI Spec. per Intel after v1.02. */ 593 /* This will be added to the EFI Spec. per Intel after v1.02. */
547 legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL); 594 legacymbr = kzalloc(sizeof(*legacymbr), GFP_KERNEL);
548 if (legacymbr) { 595 if (!legacymbr)
549 read_lba(state, 0, (u8 *) legacymbr, 596 goto fail;
550 sizeof (*legacymbr)); 597
551 good_pmbr = is_pmbr_valid(legacymbr); 598 read_lba(state, 0, (u8 *)legacymbr, sizeof(*legacymbr));
552 kfree(legacymbr); 599 good_pmbr = is_pmbr_valid(legacymbr, total_sectors);
553 } 600 kfree(legacymbr);
554 if (!good_pmbr) 601
555 goto fail; 602 if (!good_pmbr)
556 } 603 goto fail;
604
605 pr_debug("Device has a %s MBR\n",
606 good_pmbr == GPT_MBR_PROTECTIVE ?
607 "protective" : "hybrid");
608 }
557 609
558 good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA, 610 good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA,
559 &pgpt, &pptes); 611 &pgpt, &pptes);
@@ -576,11 +628,8 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
576 *ptes = pptes; 628 *ptes = pptes;
577 kfree(agpt); 629 kfree(agpt);
578 kfree(aptes); 630 kfree(aptes);
579 if (!good_agpt) { 631 if (!good_agpt)
580 printk(KERN_WARNING 632 pr_warn("Alternate GPT is invalid, using primary GPT.\n");
581 "Alternate GPT is invalid, "
582 "using primary GPT.\n");
583 }
584 return 1; 633 return 1;
585 } 634 }
586 else if (good_agpt) { 635 else if (good_agpt) {
@@ -588,8 +637,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
588 *ptes = aptes; 637 *ptes = aptes;
589 kfree(pgpt); 638 kfree(pgpt);
590 kfree(pptes); 639 kfree(pptes);
591 printk(KERN_WARNING 640 pr_warn("Primary GPT is invalid, using alternate GPT.\n");
592 "Primary GPT is invalid, using alternate GPT.\n");
593 return 1; 641 return 1;
594 } 642 }
595 643
@@ -651,8 +699,7 @@ int efi_partition(struct parsed_partitions *state)
651 put_partition(state, i+1, start * ssz, size * ssz); 699 put_partition(state, i+1, start * ssz, size * ssz);
652 700
653 /* If this is a RAID volume, tell md */ 701 /* If this is a RAID volume, tell md */
654 if (!efi_guidcmp(ptes[i].partition_type_guid, 702 if (!efi_guidcmp(ptes[i].partition_type_guid, PARTITION_LINUX_RAID_GUID))
655 PARTITION_LINUX_RAID_GUID))
656 state->parts[i + 1].flags = ADDPART_FLAG_RAID; 703 state->parts[i + 1].flags = ADDPART_FLAG_RAID;
657 704
658 info = &state->parts[i + 1].info; 705 info = &state->parts[i + 1].info;
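
The stricter pMBR handling above reduces, for the protective case, to one size rule: the single 0xEE record must cover the lesser of the whole disk (total_sectors - 1) and the 32-bit LBA ceiling, while hybrid MBRs are exempt from it. A stand-alone sketch of that rule with invented sector counts:

#include <stdio.h>
#include <stdint.h>

static int pmbr_size_ok(uint32_t size_in_lba, uint64_t total_sectors)
{
        uint32_t expect = (total_sectors - 1 > 0xFFFFFFFFULL)
                          ? 0xFFFFFFFFU : (uint32_t)(total_sectors - 1);

        return size_in_lba == expect;
}

int main(void)
{
        /* 2 GiB disk with 512-byte sectors: 4194304 sectors in total. */
        printf("%d\n", pmbr_size_ok(4194303U, 4194304ULL));    /* 1: protective */
        printf("%d\n", pmbr_size_ok(1048576U, 4194304ULL));    /* 0: too small */
        return 0;
}
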
diff --git a/block/partitions/efi.h b/block/partitions/efi.h
index b69ab729558f..4efcafba7e64 100644
--- a/block/partitions/efi.h
+++ b/block/partitions/efi.h
@@ -37,6 +37,9 @@
37#define EFI_PMBR_OSTYPE_EFI 0xEF 37#define EFI_PMBR_OSTYPE_EFI 0xEF
38#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE 38#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE
39 39
40#define GPT_MBR_PROTECTIVE 1
41#define GPT_MBR_HYBRID 2
42
40#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL 43#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL
41#define GPT_HEADER_REVISION_V1 0x00010000 44#define GPT_HEADER_REVISION_V1 0x00010000
42#define GPT_PRIMARY_PARTITION_TABLE_LBA 1 45#define GPT_PRIMARY_PARTITION_TABLE_LBA 1
@@ -101,11 +104,25 @@ typedef struct _gpt_entry {
101 efi_char16_t partition_name[72 / sizeof (efi_char16_t)]; 104 efi_char16_t partition_name[72 / sizeof (efi_char16_t)];
102} __attribute__ ((packed)) gpt_entry; 105} __attribute__ ((packed)) gpt_entry;
103 106
107typedef struct _gpt_mbr_record {
108 u8 boot_indicator; /* unused by EFI, set to 0x80 for bootable */
109 u8 start_head; /* unused by EFI, pt start in CHS */
110 u8 start_sector; /* unused by EFI, pt start in CHS */
111 u8 start_track;
112 u8 os_type; /* EFI and legacy non-EFI OS types */
113 u8 end_head; /* unused by EFI, pt end in CHS */
114 u8 end_sector; /* unused by EFI, pt end in CHS */
115 u8 end_track; /* unused by EFI, pt end in CHS */
116 __le32 starting_lba; /* used by EFI - start addr of the on disk pt */
117 __le32 size_in_lba; /* used by EFI - size of pt in LBA */
118} __packed gpt_mbr_record;
119
120
104typedef struct _legacy_mbr { 121typedef struct _legacy_mbr {
105 u8 boot_code[440]; 122 u8 boot_code[440];
106 __le32 unique_mbr_signature; 123 __le32 unique_mbr_signature;
107 __le16 unknown; 124 __le16 unknown;
108 struct partition partition_record[4]; 125 gpt_mbr_record partition_record[4];
109 __le16 signature; 126 __le16 signature;
110} __attribute__ ((packed)) legacy_mbr; 127} __attribute__ ((packed)) legacy_mbr;
111 128
@@ -113,22 +130,3 @@ typedef struct _legacy_mbr {
113extern int efi_partition(struct parsed_partitions *state); 130extern int efi_partition(struct parsed_partitions *state);
114 131
115#endif 132#endif
116
117/*
118 * Overrides for Emacs so that we follow Linus's tabbing style.
119 * Emacs will notice this stuff at the end of the file and automatically
120 * adjust the settings for this buffer only. This must remain at the end
121 * of the file.
122 * --------------------------------------------------------------------------
123 * Local variables:
124 * c-indent-level: 4
125 * c-brace-imaginary-offset: 0
126 * c-brace-offset: -4
127 * c-argdecl-indent: 4
128 * c-label-offset: -4
129 * c-continued-statement-offset: 4
130 * c-continued-brace-offset: 0
131 * indent-tabs-mode: nil
132 * tab-width: 8
133 * End:
134 */
diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
index 025c41d3cb33..14a9d1912318 100644
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -1,5 +1,5 @@
1/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */ 1/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */
2#define VERSION "83" 2#define VERSION "85"
3#define AOE_MAJOR 152 3#define AOE_MAJOR 152
4#define DEVICE_NAME "aoe" 4#define DEVICE_NAME "aoe"
5 5
@@ -169,6 +169,7 @@ struct aoedev {
169 ulong ref; 169 ulong ref;
170 struct work_struct work;/* disk create work struct */ 170 struct work_struct work;/* disk create work struct */
171 struct gendisk *gd; 171 struct gendisk *gd;
172 struct dentry *debugfs;
172 struct request_queue *blkq; 173 struct request_queue *blkq;
173 struct hd_geometry geo; 174 struct hd_geometry geo;
174 sector_t ssize; 175 sector_t ssize;
@@ -206,6 +207,7 @@ struct ktstate {
206int aoeblk_init(void); 207int aoeblk_init(void);
207void aoeblk_exit(void); 208void aoeblk_exit(void);
208void aoeblk_gdalloc(void *); 209void aoeblk_gdalloc(void *);
210void aoedisk_rm_debugfs(struct aoedev *d);
209void aoedisk_rm_sysfs(struct aoedev *d); 211void aoedisk_rm_sysfs(struct aoedev *d);
210 212
211int aoechr_init(void); 213int aoechr_init(void);
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 916d9ed5c8aa..dd73e1ff1759 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -1,4 +1,4 @@
1/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */ 1/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */
2/* 2/*
3 * aoeblk.c 3 * aoeblk.c
4 * block device routines 4 * block device routines
@@ -17,11 +17,13 @@
17#include <linux/mutex.h> 17#include <linux/mutex.h>
18#include <linux/export.h> 18#include <linux/export.h>
19#include <linux/moduleparam.h> 19#include <linux/moduleparam.h>
20#include <linux/debugfs.h>
20#include <scsi/sg.h> 21#include <scsi/sg.h>
21#include "aoe.h" 22#include "aoe.h"
22 23
23static DEFINE_MUTEX(aoeblk_mutex); 24static DEFINE_MUTEX(aoeblk_mutex);
24static struct kmem_cache *buf_pool_cache; 25static struct kmem_cache *buf_pool_cache;
26static struct dentry *aoe_debugfs_dir;
25 27
26/* GPFS needs a larger value than the default. */ 28/* GPFS needs a larger value than the default. */
27static int aoe_maxsectors; 29static int aoe_maxsectors;
@@ -108,6 +110,55 @@ static ssize_t aoedisk_show_payload(struct device *dev,
108 return snprintf(page, PAGE_SIZE, "%lu\n", d->maxbcnt); 110 return snprintf(page, PAGE_SIZE, "%lu\n", d->maxbcnt);
109} 111}
110 112
113static int aoedisk_debugfs_show(struct seq_file *s, void *ignored)
114{
115 struct aoedev *d;
116 struct aoetgt **t, **te;
117 struct aoeif *ifp, *ife;
118 unsigned long flags;
119 char c;
120
121 d = s->private;
122 seq_printf(s, "rttavg: %d rttdev: %d\n",
123 d->rttavg >> RTTSCALE,
124 d->rttdev >> RTTDSCALE);
125 seq_printf(s, "nskbpool: %d\n", skb_queue_len(&d->skbpool));
126 seq_printf(s, "kicked: %ld\n", d->kicked);
127 seq_printf(s, "maxbcnt: %ld\n", d->maxbcnt);
128 seq_printf(s, "ref: %ld\n", d->ref);
129
130 spin_lock_irqsave(&d->lock, flags);
131 t = d->targets;
132 te = t + d->ntargets;
133 for (; t < te && *t; t++) {
134 c = '\t';
135 seq_printf(s, "falloc: %ld\n", (*t)->falloc);
136 seq_printf(s, "ffree: %p\n",
137 list_empty(&(*t)->ffree) ? NULL : (*t)->ffree.next);
138 seq_printf(s, "%pm:%d:%d:%d\n", (*t)->addr, (*t)->nout,
139 (*t)->maxout, (*t)->nframes);
140 seq_printf(s, "\tssthresh:%d\n", (*t)->ssthresh);
141 seq_printf(s, "\ttaint:%d\n", (*t)->taint);
142 seq_printf(s, "\tr:%d\n", (*t)->rpkts);
143 seq_printf(s, "\tw:%d\n", (*t)->wpkts);
144 ifp = (*t)->ifs;
145 ife = ifp + ARRAY_SIZE((*t)->ifs);
146 for (; ifp->nd && ifp < ife; ifp++) {
147 seq_printf(s, "%c%s", c, ifp->nd->name);
148 c = ',';
149 }
150 seq_puts(s, "\n");
151 }
152 spin_unlock_irqrestore(&d->lock, flags);
153
154 return 0;
155}
156
157static int aoe_debugfs_open(struct inode *inode, struct file *file)
158{
159 return single_open(file, aoedisk_debugfs_show, inode->i_private);
160}
161
111static DEVICE_ATTR(state, S_IRUGO, aoedisk_show_state, NULL); 162static DEVICE_ATTR(state, S_IRUGO, aoedisk_show_state, NULL);
112static DEVICE_ATTR(mac, S_IRUGO, aoedisk_show_mac, NULL); 163static DEVICE_ATTR(mac, S_IRUGO, aoedisk_show_mac, NULL);
113static DEVICE_ATTR(netif, S_IRUGO, aoedisk_show_netif, NULL); 164static DEVICE_ATTR(netif, S_IRUGO, aoedisk_show_netif, NULL);
@@ -130,6 +181,44 @@ static const struct attribute_group attr_group = {
130 .attrs = aoe_attrs, 181 .attrs = aoe_attrs,
131}; 182};
132 183
184static const struct file_operations aoe_debugfs_fops = {
185 .open = aoe_debugfs_open,
186 .read = seq_read,
187 .llseek = seq_lseek,
188 .release = single_release,
189};
190
191static void
192aoedisk_add_debugfs(struct aoedev *d)
193{
194 struct dentry *entry;
195 char *p;
196
197 if (aoe_debugfs_dir == NULL)
198 return;
199 p = strchr(d->gd->disk_name, '/');
200 if (p == NULL)
201 p = d->gd->disk_name;
202 else
203 p++;
204 BUG_ON(*p == '\0');
205 entry = debugfs_create_file(p, 0444, aoe_debugfs_dir, d,
206 &aoe_debugfs_fops);
207 if (IS_ERR_OR_NULL(entry)) {
208 pr_info("aoe: cannot create debugfs file for %s\n",
209 d->gd->disk_name);
210 return;
211 }
212 BUG_ON(d->debugfs);
213 d->debugfs = entry;
214}
215void
216aoedisk_rm_debugfs(struct aoedev *d)
217{
218 debugfs_remove(d->debugfs);
219 d->debugfs = NULL;
220}
221
133static int 222static int
134aoedisk_add_sysfs(struct aoedev *d) 223aoedisk_add_sysfs(struct aoedev *d)
135{ 224{
@@ -330,6 +419,7 @@ aoeblk_gdalloc(void *vp)
330 419
331 add_disk(gd); 420 add_disk(gd);
332 aoedisk_add_sysfs(d); 421 aoedisk_add_sysfs(d);
422 aoedisk_add_debugfs(d);
333 423
334 spin_lock_irqsave(&d->lock, flags); 424 spin_lock_irqsave(&d->lock, flags);
335 WARN_ON(!(d->flags & DEVFL_GD_NOW)); 425 WARN_ON(!(d->flags & DEVFL_GD_NOW));
@@ -351,6 +441,8 @@ err:
351void 441void
352aoeblk_exit(void) 442aoeblk_exit(void)
353{ 443{
444 debugfs_remove_recursive(aoe_debugfs_dir);
445 aoe_debugfs_dir = NULL;
354 kmem_cache_destroy(buf_pool_cache); 446 kmem_cache_destroy(buf_pool_cache);
355} 447}
356 448
@@ -362,7 +454,11 @@ aoeblk_init(void)
362 0, 0, NULL); 454 0, 0, NULL);
363 if (buf_pool_cache == NULL) 455 if (buf_pool_cache == NULL)
364 return -ENOMEM; 456 return -ENOMEM;
365 457 aoe_debugfs_dir = debugfs_create_dir("aoe", NULL);
458 if (IS_ERR_OR_NULL(aoe_debugfs_dir)) {
459 pr_info("aoe: cannot create debugfs directory\n");
460 aoe_debugfs_dir = NULL;
461 }
366 return 0; 462 return 0;
367} 463}
368 464
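The debugfs hooks above create one read-only file per AoE disk under an "aoe" directory, dumping rttavg/rttdev and per-target counters via aoedisk_debugfs_show(). A minimal userspace reader, assuming debugfs is mounted at /sys/kernel/debug and a disk named etherd/e0.0 (whose entry is created under the basename "e0.0"), might look like this; the path and disk name are illustrative only.

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/kernel/debug/aoe/e0.0", "r");	/* illustrative path */

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* rttavg/rttdev, nskbpool, per-target lines */
	fclose(f);
	return 0;
}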
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 4d45dba7fb8f..d2515435e23f 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -380,7 +380,6 @@ aoecmd_ata_rw(struct aoedev *d)
380{ 380{
381 struct frame *f; 381 struct frame *f;
382 struct buf *buf; 382 struct buf *buf;
383 struct aoetgt *t;
384 struct sk_buff *skb; 383 struct sk_buff *skb;
385 struct sk_buff_head queue; 384 struct sk_buff_head queue;
386 ulong bcnt, fbcnt; 385 ulong bcnt, fbcnt;
@@ -391,7 +390,6 @@ aoecmd_ata_rw(struct aoedev *d)
391 f = newframe(d); 390 f = newframe(d);
392 if (f == NULL) 391 if (f == NULL)
393 return 0; 392 return 0;
394 t = *d->tgt;
395 bcnt = d->maxbcnt; 393 bcnt = d->maxbcnt;
396 if (bcnt == 0) 394 if (bcnt == 0)
397 bcnt = DEFAULTBCNT; 395 bcnt = DEFAULTBCNT;
@@ -485,7 +483,6 @@ resend(struct aoedev *d, struct frame *f)
485 struct sk_buff *skb; 483 struct sk_buff *skb;
486 struct sk_buff_head queue; 484 struct sk_buff_head queue;
487 struct aoe_hdr *h; 485 struct aoe_hdr *h;
488 struct aoe_atahdr *ah;
489 struct aoetgt *t; 486 struct aoetgt *t;
490 char buf[128]; 487 char buf[128];
491 u32 n; 488 u32 n;
@@ -500,7 +497,6 @@ resend(struct aoedev *d, struct frame *f)
500 return; 497 return;
501 } 498 }
502 h = (struct aoe_hdr *) skb_mac_header(skb); 499 h = (struct aoe_hdr *) skb_mac_header(skb);
503 ah = (struct aoe_atahdr *) (h+1);
504 500
505 if (!(f->flags & FFL_PROBE)) { 501 if (!(f->flags & FFL_PROBE)) {
506 snprintf(buf, sizeof(buf), 502 snprintf(buf, sizeof(buf),
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index 784c92e038d1..e774c50b6842 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -12,6 +12,7 @@
12#include <linux/bitmap.h> 12#include <linux/bitmap.h>
13#include <linux/kdev_t.h> 13#include <linux/kdev_t.h>
14#include <linux/moduleparam.h> 14#include <linux/moduleparam.h>
15#include <linux/string.h>
15#include "aoe.h" 16#include "aoe.h"
16 17
17static void dummy_timer(ulong); 18static void dummy_timer(ulong);
@@ -241,16 +242,12 @@ aoedev_downdev(struct aoedev *d)
241static int 242static int
242user_req(char *s, size_t slen, struct aoedev *d) 243user_req(char *s, size_t slen, struct aoedev *d)
243{ 244{
244 char *p; 245 const char *p;
245 size_t lim; 246 size_t lim;
246 247
247 if (!d->gd) 248 if (!d->gd)
248 return 0; 249 return 0;
249 p = strrchr(d->gd->disk_name, '/'); 250 p = kbasename(d->gd->disk_name);
250 if (!p)
251 p = d->gd->disk_name;
252 else
253 p += 1;
254 lim = sizeof(d->gd->disk_name); 251 lim = sizeof(d->gd->disk_name);
255 lim -= p - d->gd->disk_name; 252 lim -= p - d->gd->disk_name;
256 if (slen < lim) 253 if (slen < lim)
@@ -278,6 +275,7 @@ freedev(struct aoedev *d)
278 275
279 del_timer_sync(&d->timer); 276 del_timer_sync(&d->timer);
280 if (d->gd) { 277 if (d->gd) {
278 aoedisk_rm_debugfs(d);
281 aoedisk_rm_sysfs(d); 279 aoedisk_rm_sysfs(d);
282 del_gendisk(d->gd); 280 del_gendisk(d->gd);
283 put_disk(d->gd); 281 put_disk(d->gd);
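The user_req() change swaps the open-coded strrchr()-based basename extraction for kbasename() from <linux/string.h> (hence the new include). The helper behaves essentially like the removed code; a behavioural sketch:

#include <linux/string.h>

/* kbasename("etherd/e0.0") -> "e0.0";  kbasename("e0.0") -> "e0.0" */
static inline const char *kbasename_sketch(const char *path)
{
	const char *tail = strrchr(path, '/');

	return tail ? tail + 1 : path;
}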
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 62b6c2cc80b5..d2d95ff5353b 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -4258,6 +4258,13 @@ static void cciss_find_board_params(ctlr_info_t *h)
4258 h->nr_cmds = h->max_commands - 4 - cciss_tape_cmds; 4258 h->nr_cmds = h->max_commands - 4 - cciss_tape_cmds;
4259 h->maxsgentries = readl(&(h->cfgtable->MaxSGElements)); 4259 h->maxsgentries = readl(&(h->cfgtable->MaxSGElements));
4260 /* 4260 /*
4261	 * The P600 may exhibit poor performance under some workloads
4262 * if we use the value in the configuration table. Limit this
4263 * controller to MAXSGENTRIES (32) instead.
4264 */
4265 if (h->board_id == 0x3225103C)
4266 h->maxsgentries = MAXSGENTRIES;
4267 /*
4261	 * Limit in-command s/g elements to 32 to save dma'able memory.	4268	 * Limit in-command s/g elements to 32 to save dma'able memory.
4262	 * However spec says if 0, use 31	4269	 * However spec says if 0, use 31
4263 */ 4270 */
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index a56cfcd5d648..77a60bedd7a3 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -636,7 +636,7 @@ ok_to_write:
636 mg_request(host->breq); 636 mg_request(host->breq);
637} 637}
638 638
639void mg_times_out(unsigned long data) 639static void mg_times_out(unsigned long data)
640{ 640{
641 struct mg_host *host = (struct mg_host *)data; 641 struct mg_host *host = (struct mg_host *)data;
642 char *name; 642 char *name;
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
index 1bbc681688e4..79aa179305b5 100644
--- a/drivers/block/osdblk.c
+++ b/drivers/block/osdblk.c
@@ -598,7 +598,7 @@ static ssize_t class_osdblk_remove(struct class *c,
598 unsigned long ul; 598 unsigned long ul;
599 struct list_head *tmp; 599 struct list_head *tmp;
600 600
601 rc = strict_strtoul(buf, 10, &ul); 601 rc = kstrtoul(buf, 10, &ul);
602 if (rc) 602 if (rc)
603 return rc; 603 return rc;
604 604
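strict_strtoul() was a deprecated alias for kstrtoul() with the same signature and return convention, so this hunk and the later rbd/xen-blkback ones are mechanical substitutions. A minimal usage sketch (the helper name is made up):

#include <linux/kernel.h>

/* Hypothetical helper: parse a decimal index from a sysfs buffer. */
static int parse_index(const char *buf, unsigned long *out)
{
	return kstrtoul(buf, 10, out);	/* 0 on success, -EINVAL/-ERANGE on failure */
}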
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index f5d0ea11d9fd..56188475cfd3 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -44,6 +44,8 @@
44 * 44 *
45 *************************************************************************/ 45 *************************************************************************/
46 46
47#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
48
47#include <linux/pktcdvd.h> 49#include <linux/pktcdvd.h>
48#include <linux/module.h> 50#include <linux/module.h>
49#include <linux/types.h> 51#include <linux/types.h>
@@ -69,23 +71,24 @@
69 71
70#define DRIVER_NAME "pktcdvd" 72#define DRIVER_NAME "pktcdvd"
71 73
72#if PACKET_DEBUG 74#define pkt_err(pd, fmt, ...) \
73#define DPRINTK(fmt, args...) printk(KERN_NOTICE fmt, ##args) 75 pr_err("%s: " fmt, pd->name, ##__VA_ARGS__)
74#else 76#define pkt_notice(pd, fmt, ...) \
75#define DPRINTK(fmt, args...) 77 pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__)
76#endif 78#define pkt_info(pd, fmt, ...) \
77 79 pr_info("%s: " fmt, pd->name, ##__VA_ARGS__)
78#if PACKET_DEBUG > 1 80
79#define VPRINTK(fmt, args...) printk(KERN_NOTICE fmt, ##args) 81#define pkt_dbg(level, pd, fmt, ...) \
80#else 82do { \
81#define VPRINTK(fmt, args...) 83 if (level == 2 && PACKET_DEBUG >= 2) \
82#endif 84 pr_notice("%s: %s():" fmt, \
85 pd->name, __func__, ##__VA_ARGS__); \
86 else if (level == 1 && PACKET_DEBUG >= 1) \
87 pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__); \
88} while (0)
83 89
84#define MAX_SPEED 0xffff 90#define MAX_SPEED 0xffff
85 91
86#define ZONE(sector, pd) (((sector) + (pd)->offset) & \
87 ~(sector_t)((pd)->settings.size - 1))
88
89static DEFINE_MUTEX(pktcdvd_mutex); 92static DEFINE_MUTEX(pktcdvd_mutex);
90static struct pktcdvd_device *pkt_devs[MAX_WRITERS]; 93static struct pktcdvd_device *pkt_devs[MAX_WRITERS];
91static struct proc_dir_entry *pkt_proc; 94static struct proc_dir_entry *pkt_proc;
@@ -103,7 +106,10 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev);
103static int pkt_remove_dev(dev_t pkt_dev); 106static int pkt_remove_dev(dev_t pkt_dev);
104static int pkt_seq_show(struct seq_file *m, void *p); 107static int pkt_seq_show(struct seq_file *m, void *p);
105 108
106 109static sector_t get_zone(sector_t sector, struct pktcdvd_device *pd)
110{
111 return (sector + pd->offset) & ~(sector_t)(pd->settings.size - 1);
112}
107 113
108/* 114/*
109 * create and register a pktcdvd kernel object. 115 * create and register a pktcdvd kernel object.
@@ -424,7 +430,7 @@ static int pkt_sysfs_init(void)
424 if (ret) { 430 if (ret) {
425 kfree(class_pktcdvd); 431 kfree(class_pktcdvd);
426 class_pktcdvd = NULL; 432 class_pktcdvd = NULL;
427 printk(DRIVER_NAME": failed to create class pktcdvd\n"); 433 pr_err("failed to create class pktcdvd\n");
428 return ret; 434 return ret;
429 } 435 }
430 return 0; 436 return 0;
@@ -517,7 +523,7 @@ static void pkt_bio_finished(struct pktcdvd_device *pd)
517{ 523{
518 BUG_ON(atomic_read(&pd->cdrw.pending_bios) <= 0); 524 BUG_ON(atomic_read(&pd->cdrw.pending_bios) <= 0);
519 if (atomic_dec_and_test(&pd->cdrw.pending_bios)) { 525 if (atomic_dec_and_test(&pd->cdrw.pending_bios)) {
520 VPRINTK(DRIVER_NAME": queue empty\n"); 526 pkt_dbg(2, pd, "queue empty\n");
521 atomic_set(&pd->iosched.attention, 1); 527 atomic_set(&pd->iosched.attention, 1);
522 wake_up(&pd->wqueue); 528 wake_up(&pd->wqueue);
523 } 529 }
@@ -734,36 +740,33 @@ out:
734 return ret; 740 return ret;
735} 741}
736 742
743static const char *sense_key_string(__u8 index)
744{
745 static const char * const info[] = {
746 "No sense", "Recovered error", "Not ready",
747 "Medium error", "Hardware error", "Illegal request",
748 "Unit attention", "Data protect", "Blank check",
749 };
750
751 return index < ARRAY_SIZE(info) ? info[index] : "INVALID";
752}
753
737/* 754/*
738 * A generic sense dump / resolve mechanism should be implemented across 755 * A generic sense dump / resolve mechanism should be implemented across
739 * all ATAPI + SCSI devices. 756 * all ATAPI + SCSI devices.
740 */ 757 */
741static void pkt_dump_sense(struct packet_command *cgc) 758static void pkt_dump_sense(struct pktcdvd_device *pd,
759 struct packet_command *cgc)
742{ 760{
743 static char *info[9] = { "No sense", "Recovered error", "Not ready",
744 "Medium error", "Hardware error", "Illegal request",
745 "Unit attention", "Data protect", "Blank check" };
746 int i;
747 struct request_sense *sense = cgc->sense; 761 struct request_sense *sense = cgc->sense;
748 762
749 printk(DRIVER_NAME":"); 763 if (sense)
750 for (i = 0; i < CDROM_PACKET_SIZE; i++) 764 pkt_err(pd, "%*ph - sense %02x.%02x.%02x (%s)\n",
751 printk(" %02x", cgc->cmd[i]); 765 CDROM_PACKET_SIZE, cgc->cmd,
752 printk(" - "); 766 sense->sense_key, sense->asc, sense->ascq,
753 767 sense_key_string(sense->sense_key));
754 if (sense == NULL) { 768 else
755 printk("no sense\n"); 769 pkt_err(pd, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd);
756 return;
757 }
758
759 printk("sense %02x.%02x.%02x", sense->sense_key, sense->asc, sense->ascq);
760
761 if (sense->sense_key > 8) {
762 printk(" (INVALID)\n");
763 return;
764 }
765
766 printk(" (%s)\n", info[sense->sense_key]);
767} 770}
768 771
769/* 772/*
@@ -806,7 +809,7 @@ static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd,
806 cgc.cmd[5] = write_speed & 0xff; 809 cgc.cmd[5] = write_speed & 0xff;
807 810
808 if ((ret = pkt_generic_packet(pd, &cgc))) 811 if ((ret = pkt_generic_packet(pd, &cgc)))
809 pkt_dump_sense(&cgc); 812 pkt_dump_sense(pd, &cgc);
810 813
811 return ret; 814 return ret;
812} 815}
@@ -872,7 +875,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
872 need_write_seek = 0; 875 need_write_seek = 0;
873 if (need_write_seek && reads_queued) { 876 if (need_write_seek && reads_queued) {
874 if (atomic_read(&pd->cdrw.pending_bios) > 0) { 877 if (atomic_read(&pd->cdrw.pending_bios) > 0) {
875 VPRINTK(DRIVER_NAME": write, waiting\n"); 878 pkt_dbg(2, pd, "write, waiting\n");
876 break; 879 break;
877 } 880 }
878 pkt_flush_cache(pd); 881 pkt_flush_cache(pd);
@@ -881,7 +884,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
881 } else { 884 } else {
882 if (!reads_queued && writes_queued) { 885 if (!reads_queued && writes_queued) {
883 if (atomic_read(&pd->cdrw.pending_bios) > 0) { 886 if (atomic_read(&pd->cdrw.pending_bios) > 0) {
884 VPRINTK(DRIVER_NAME": read, waiting\n"); 887 pkt_dbg(2, pd, "read, waiting\n");
885 break; 888 break;
886 } 889 }
887 pd->iosched.writing = 1; 890 pd->iosched.writing = 1;
@@ -943,7 +946,7 @@ static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_que
943 set_bit(PACKET_MERGE_SEGS, &pd->flags); 946 set_bit(PACKET_MERGE_SEGS, &pd->flags);
944 return 0; 947 return 0;
945 } else { 948 } else {
946 printk(DRIVER_NAME": cdrom max_phys_segments too small\n"); 949 pkt_err(pd, "cdrom max_phys_segments too small\n");
947 return -EIO; 950 return -EIO;
948 } 951 }
949} 952}
@@ -987,8 +990,9 @@ static void pkt_end_io_read(struct bio *bio, int err)
987 struct pktcdvd_device *pd = pkt->pd; 990 struct pktcdvd_device *pd = pkt->pd;
988 BUG_ON(!pd); 991 BUG_ON(!pd);
989 992
990 VPRINTK("pkt_end_io_read: bio=%p sec0=%llx sec=%llx err=%d\n", bio, 993 pkt_dbg(2, pd, "bio=%p sec0=%llx sec=%llx err=%d\n",
991 (unsigned long long)pkt->sector, (unsigned long long)bio->bi_sector, err); 994 bio, (unsigned long long)pkt->sector,
995 (unsigned long long)bio->bi_sector, err);
992 996
993 if (err) 997 if (err)
994 atomic_inc(&pkt->io_errors); 998 atomic_inc(&pkt->io_errors);
@@ -1005,7 +1009,7 @@ static void pkt_end_io_packet_write(struct bio *bio, int err)
1005 struct pktcdvd_device *pd = pkt->pd; 1009 struct pktcdvd_device *pd = pkt->pd;
1006 BUG_ON(!pd); 1010 BUG_ON(!pd);
1007 1011
1008 VPRINTK("pkt_end_io_packet_write: id=%d, err=%d\n", pkt->id, err); 1012 pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, err);
1009 1013
1010 pd->stats.pkt_ended++; 1014 pd->stats.pkt_ended++;
1011 1015
@@ -1047,7 +1051,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt)
1047 spin_unlock(&pkt->lock); 1051 spin_unlock(&pkt->lock);
1048 1052
1049 if (pkt->cache_valid) { 1053 if (pkt->cache_valid) {
1050 VPRINTK("pkt_gather_data: zone %llx cached\n", 1054 pkt_dbg(2, pd, "zone %llx cached\n",
1051 (unsigned long long)pkt->sector); 1055 (unsigned long long)pkt->sector);
1052 goto out_account; 1056 goto out_account;
1053 } 1057 }
@@ -1070,7 +1074,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt)
1070 1074
1071 p = (f * CD_FRAMESIZE) / PAGE_SIZE; 1075 p = (f * CD_FRAMESIZE) / PAGE_SIZE;
1072 offset = (f * CD_FRAMESIZE) % PAGE_SIZE; 1076 offset = (f * CD_FRAMESIZE) % PAGE_SIZE;
1073 VPRINTK("pkt_gather_data: Adding frame %d, page:%p offs:%d\n", 1077 pkt_dbg(2, pd, "Adding frame %d, page:%p offs:%d\n",
1074 f, pkt->pages[p], offset); 1078 f, pkt->pages[p], offset);
1075 if (!bio_add_page(bio, pkt->pages[p], CD_FRAMESIZE, offset)) 1079 if (!bio_add_page(bio, pkt->pages[p], CD_FRAMESIZE, offset))
1076 BUG(); 1080 BUG();
@@ -1082,7 +1086,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt)
1082 } 1086 }
1083 1087
1084out_account: 1088out_account:
1085 VPRINTK("pkt_gather_data: need %d frames for zone %llx\n", 1089 pkt_dbg(2, pd, "need %d frames for zone %llx\n",
1086 frames_read, (unsigned long long)pkt->sector); 1090 frames_read, (unsigned long long)pkt->sector);
1087 pd->stats.pkt_started++; 1091 pd->stats.pkt_started++;
1088 pd->stats.secs_rg += frames_read * (CD_FRAMESIZE >> 9); 1092 pd->stats.secs_rg += frames_read * (CD_FRAMESIZE >> 9);
@@ -1183,7 +1187,8 @@ static inline void pkt_set_state(struct packet_data *pkt, enum packet_data_state
1183 "IDLE", "WAITING", "READ_WAIT", "WRITE_WAIT", "RECOVERY", "FINISHED" 1187 "IDLE", "WAITING", "READ_WAIT", "WRITE_WAIT", "RECOVERY", "FINISHED"
1184 }; 1188 };
1185 enum packet_data_state old_state = pkt->state; 1189 enum packet_data_state old_state = pkt->state;
1186 VPRINTK("pkt %2d : s=%6llx %s -> %s\n", pkt->id, (unsigned long long)pkt->sector, 1190 pkt_dbg(2, pd, "pkt %2d : s=%6llx %s -> %s\n",
1191 pkt->id, (unsigned long long)pkt->sector,
1187 state_name[old_state], state_name[state]); 1192 state_name[old_state], state_name[state]);
1188#endif 1193#endif
1189 pkt->state = state; 1194 pkt->state = state;
@@ -1202,12 +1207,10 @@ static int pkt_handle_queue(struct pktcdvd_device *pd)
1202 struct rb_node *n; 1207 struct rb_node *n;
1203 int wakeup; 1208 int wakeup;
1204 1209
1205 VPRINTK("handle_queue\n");
1206
1207 atomic_set(&pd->scan_queue, 0); 1210 atomic_set(&pd->scan_queue, 0);
1208 1211
1209 if (list_empty(&pd->cdrw.pkt_free_list)) { 1212 if (list_empty(&pd->cdrw.pkt_free_list)) {
1210 VPRINTK("handle_queue: no pkt\n"); 1213 pkt_dbg(2, pd, "no pkt\n");
1211 return 0; 1214 return 0;
1212 } 1215 }
1213 1216
@@ -1224,7 +1227,7 @@ static int pkt_handle_queue(struct pktcdvd_device *pd)
1224 node = first_node; 1227 node = first_node;
1225 while (node) { 1228 while (node) {
1226 bio = node->bio; 1229 bio = node->bio;
1227 zone = ZONE(bio->bi_sector, pd); 1230 zone = get_zone(bio->bi_sector, pd);
1228 list_for_each_entry(p, &pd->cdrw.pkt_active_list, list) { 1231 list_for_each_entry(p, &pd->cdrw.pkt_active_list, list) {
1229 if (p->sector == zone) { 1232 if (p->sector == zone) {
1230 bio = NULL; 1233 bio = NULL;
@@ -1244,7 +1247,7 @@ try_next_bio:
1244 } 1247 }
1245 spin_unlock(&pd->lock); 1248 spin_unlock(&pd->lock);
1246 if (!bio) { 1249 if (!bio) {
1247 VPRINTK("handle_queue: no bio\n"); 1250 pkt_dbg(2, pd, "no bio\n");
1248 return 0; 1251 return 0;
1249 } 1252 }
1250 1253
@@ -1260,12 +1263,12 @@ try_next_bio:
1260 * to this packet. 1263 * to this packet.
1261 */ 1264 */
1262 spin_lock(&pd->lock); 1265 spin_lock(&pd->lock);
1263 VPRINTK("pkt_handle_queue: looking for zone %llx\n", (unsigned long long)zone); 1266 pkt_dbg(2, pd, "looking for zone %llx\n", (unsigned long long)zone);
1264 while ((node = pkt_rbtree_find(pd, zone)) != NULL) { 1267 while ((node = pkt_rbtree_find(pd, zone)) != NULL) {
1265 bio = node->bio; 1268 bio = node->bio;
1266 VPRINTK("pkt_handle_queue: found zone=%llx\n", 1269 pkt_dbg(2, pd, "found zone=%llx\n",
1267 (unsigned long long)ZONE(bio->bi_sector, pd)); 1270 (unsigned long long)get_zone(bio->bi_sector, pd));
1268 if (ZONE(bio->bi_sector, pd) != zone) 1271 if (get_zone(bio->bi_sector, pd) != zone)
1269 break; 1272 break;
1270 pkt_rbtree_erase(pd, node); 1273 pkt_rbtree_erase(pd, node);
1271 spin_lock(&pkt->lock); 1274 spin_lock(&pkt->lock);
@@ -1316,7 +1319,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
1316 if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset)) 1319 if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset))
1317 BUG(); 1320 BUG();
1318 } 1321 }
1319 VPRINTK(DRIVER_NAME": vcnt=%d\n", pkt->w_bio->bi_vcnt); 1322 pkt_dbg(2, pd, "vcnt=%d\n", pkt->w_bio->bi_vcnt);
1320 1323
1321 /* 1324 /*
1322 * Fill-in bvec with data from orig_bios. 1325 * Fill-in bvec with data from orig_bios.
@@ -1327,7 +1330,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
1327 pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE); 1330 pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE);
1328 spin_unlock(&pkt->lock); 1331 spin_unlock(&pkt->lock);
1329 1332
1330 VPRINTK("pkt_start_write: Writing %d frames for zone %llx\n", 1333 pkt_dbg(2, pd, "Writing %d frames for zone %llx\n",
1331 pkt->write_size, (unsigned long long)pkt->sector); 1334 pkt->write_size, (unsigned long long)pkt->sector);
1332 1335
1333 if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) { 1336 if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) {
@@ -1359,7 +1362,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data
1359{ 1362{
1360 int uptodate; 1363 int uptodate;
1361 1364
1362 VPRINTK("run_state_machine: pkt %d\n", pkt->id); 1365 pkt_dbg(2, pd, "pkt %d\n", pkt->id);
1363 1366
1364 for (;;) { 1367 for (;;) {
1365 switch (pkt->state) { 1368 switch (pkt->state) {
@@ -1398,7 +1401,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data
1398 if (pkt_start_recovery(pkt)) { 1401 if (pkt_start_recovery(pkt)) {
1399 pkt_start_write(pd, pkt); 1402 pkt_start_write(pd, pkt);
1400 } else { 1403 } else {
1401 VPRINTK("No recovery possible\n"); 1404 pkt_dbg(2, pd, "No recovery possible\n");
1402 pkt_set_state(pkt, PACKET_FINISHED_STATE); 1405 pkt_set_state(pkt, PACKET_FINISHED_STATE);
1403 } 1406 }
1404 break; 1407 break;
@@ -1419,8 +1422,6 @@ static void pkt_handle_packets(struct pktcdvd_device *pd)
1419{ 1422{
1420 struct packet_data *pkt, *next; 1423 struct packet_data *pkt, *next;
1421 1424
1422 VPRINTK("pkt_handle_packets\n");
1423
1424 /* 1425 /*
1425 * Run state machine for active packets 1426 * Run state machine for active packets
1426 */ 1427 */
@@ -1502,9 +1503,9 @@ static int kcdrwd(void *foobar)
1502 if (PACKET_DEBUG > 1) { 1503 if (PACKET_DEBUG > 1) {
1503 int states[PACKET_NUM_STATES]; 1504 int states[PACKET_NUM_STATES];
1504 pkt_count_states(pd, states); 1505 pkt_count_states(pd, states);
1505 VPRINTK("kcdrwd: i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", 1506 pkt_dbg(2, pd, "i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n",
1506 states[0], states[1], states[2], states[3], 1507 states[0], states[1], states[2],
1507 states[4], states[5]); 1508 states[3], states[4], states[5]);
1508 } 1509 }
1509 1510
1510 min_sleep_time = MAX_SCHEDULE_TIMEOUT; 1511 min_sleep_time = MAX_SCHEDULE_TIMEOUT;
@@ -1513,9 +1514,9 @@ static int kcdrwd(void *foobar)
1513 min_sleep_time = pkt->sleep_time; 1514 min_sleep_time = pkt->sleep_time;
1514 } 1515 }
1515 1516
1516 VPRINTK("kcdrwd: sleeping\n"); 1517 pkt_dbg(2, pd, "sleeping\n");
1517 residue = schedule_timeout(min_sleep_time); 1518 residue = schedule_timeout(min_sleep_time);
1518 VPRINTK("kcdrwd: wake up\n"); 1519 pkt_dbg(2, pd, "wake up\n");
1519 1520
1520 /* make swsusp happy with our thread */ 1521 /* make swsusp happy with our thread */
1521 try_to_freeze(); 1522 try_to_freeze();
@@ -1563,9 +1564,10 @@ work_to_do:
1563 1564
1564static void pkt_print_settings(struct pktcdvd_device *pd) 1565static void pkt_print_settings(struct pktcdvd_device *pd)
1565{ 1566{
1566 printk(DRIVER_NAME": %s packets, ", pd->settings.fp ? "Fixed" : "Variable"); 1567 pkt_info(pd, "%s packets, %u blocks, Mode-%c disc\n",
1567 printk("%u blocks, ", pd->settings.size >> 2); 1568 pd->settings.fp ? "Fixed" : "Variable",
1568 printk("Mode-%c disc\n", pd->settings.block_mode == 8 ? '1' : '2'); 1569 pd->settings.size >> 2,
1570 pd->settings.block_mode == 8 ? '1' : '2');
1569} 1571}
1570 1572
1571static int pkt_mode_sense(struct pktcdvd_device *pd, struct packet_command *cgc, int page_code, int page_control) 1573static int pkt_mode_sense(struct pktcdvd_device *pd, struct packet_command *cgc, int page_code, int page_control)
@@ -1699,7 +1701,7 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
1699 init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ); 1701 init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ);
1700 cgc.sense = &sense; 1702 cgc.sense = &sense;
1701 if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) { 1703 if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) {
1702 pkt_dump_sense(&cgc); 1704 pkt_dump_sense(pd, &cgc);
1703 return ret; 1705 return ret;
1704 } 1706 }
1705 1707
@@ -1714,7 +1716,7 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
1714 init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ); 1716 init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ);
1715 cgc.sense = &sense; 1717 cgc.sense = &sense;
1716 if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) { 1718 if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) {
1717 pkt_dump_sense(&cgc); 1719 pkt_dump_sense(pd, &cgc);
1718 return ret; 1720 return ret;
1719 } 1721 }
1720 1722
@@ -1749,14 +1751,14 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
1749 /* 1751 /*
1750 * paranoia 1752 * paranoia
1751 */ 1753 */
1752 printk(DRIVER_NAME": write mode wrong %d\n", wp->data_block_type); 1754 pkt_err(pd, "write mode wrong %d\n", wp->data_block_type);
1753 return 1; 1755 return 1;
1754 } 1756 }
1755 wp->packet_size = cpu_to_be32(pd->settings.size >> 2); 1757 wp->packet_size = cpu_to_be32(pd->settings.size >> 2);
1756 1758
1757 cgc.buflen = cgc.cmd[8] = size; 1759 cgc.buflen = cgc.cmd[8] = size;
1758 if ((ret = pkt_mode_select(pd, &cgc))) { 1760 if ((ret = pkt_mode_select(pd, &cgc))) {
1759 pkt_dump_sense(&cgc); 1761 pkt_dump_sense(pd, &cgc);
1760 return ret; 1762 return ret;
1761 } 1763 }
1762 1764
@@ -1793,7 +1795,7 @@ static int pkt_writable_track(struct pktcdvd_device *pd, track_information *ti)
1793 if (ti->rt == 1 && ti->blank == 0) 1795 if (ti->rt == 1 && ti->blank == 0)
1794 return 1; 1796 return 1;
1795 1797
1796 printk(DRIVER_NAME": bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet); 1798 pkt_err(pd, "bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet);
1797 return 0; 1799 return 0;
1798} 1800}
1799 1801
@@ -1811,7 +1813,8 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di)
1811 case 0x12: /* DVD-RAM */ 1813 case 0x12: /* DVD-RAM */
1812 return 1; 1814 return 1;
1813 default: 1815 default:
1814 VPRINTK(DRIVER_NAME": Wrong disc profile (%x)\n", pd->mmc3_profile); 1816 pkt_dbg(2, pd, "Wrong disc profile (%x)\n",
1817 pd->mmc3_profile);
1815 return 0; 1818 return 0;
1816 } 1819 }
1817 1820
@@ -1820,22 +1823,22 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di)
1820 * but i'm not sure, should we leave this to user apps? probably. 1823 * but i'm not sure, should we leave this to user apps? probably.
1821 */ 1824 */
1822 if (di->disc_type == 0xff) { 1825 if (di->disc_type == 0xff) {
1823 printk(DRIVER_NAME": Unknown disc. No track?\n"); 1826 pkt_notice(pd, "unknown disc - no track?\n");
1824 return 0; 1827 return 0;
1825 } 1828 }
1826 1829
1827 if (di->disc_type != 0x20 && di->disc_type != 0) { 1830 if (di->disc_type != 0x20 && di->disc_type != 0) {
1828 printk(DRIVER_NAME": Wrong disc type (%x)\n", di->disc_type); 1831 pkt_err(pd, "wrong disc type (%x)\n", di->disc_type);
1829 return 0; 1832 return 0;
1830 } 1833 }
1831 1834
1832 if (di->erasable == 0) { 1835 if (di->erasable == 0) {
1833 printk(DRIVER_NAME": Disc not erasable\n"); 1836 pkt_notice(pd, "disc not erasable\n");
1834 return 0; 1837 return 0;
1835 } 1838 }
1836 1839
1837 if (di->border_status == PACKET_SESSION_RESERVED) { 1840 if (di->border_status == PACKET_SESSION_RESERVED) {
1838 printk(DRIVER_NAME": Can't write to last track (reserved)\n"); 1841 pkt_err(pd, "can't write to last track (reserved)\n");
1839 return 0; 1842 return 0;
1840 } 1843 }
1841 1844
@@ -1860,7 +1863,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd)
1860 memset(&ti, 0, sizeof(track_information)); 1863 memset(&ti, 0, sizeof(track_information));
1861 1864
1862 if ((ret = pkt_get_disc_info(pd, &di))) { 1865 if ((ret = pkt_get_disc_info(pd, &di))) {
1863 printk("failed get_disc\n"); 1866 pkt_err(pd, "failed get_disc\n");
1864 return ret; 1867 return ret;
1865 } 1868 }
1866 1869
@@ -1871,12 +1874,12 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd)
1871 1874
1872 track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */ 1875 track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */
1873 if ((ret = pkt_get_track_info(pd, track, 1, &ti))) { 1876 if ((ret = pkt_get_track_info(pd, track, 1, &ti))) {
1874 printk(DRIVER_NAME": failed get_track\n"); 1877 pkt_err(pd, "failed get_track\n");
1875 return ret; 1878 return ret;
1876 } 1879 }
1877 1880
1878 if (!pkt_writable_track(pd, &ti)) { 1881 if (!pkt_writable_track(pd, &ti)) {
1879 printk(DRIVER_NAME": can't write to this track\n"); 1882 pkt_err(pd, "can't write to this track\n");
1880 return -EROFS; 1883 return -EROFS;
1881 } 1884 }
1882 1885
@@ -1886,11 +1889,11 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd)
1886 */ 1889 */
1887 pd->settings.size = be32_to_cpu(ti.fixed_packet_size) << 2; 1890 pd->settings.size = be32_to_cpu(ti.fixed_packet_size) << 2;
1888 if (pd->settings.size == 0) { 1891 if (pd->settings.size == 0) {
1889 printk(DRIVER_NAME": detected zero packet size!\n"); 1892 pkt_notice(pd, "detected zero packet size!\n");
1890 return -ENXIO; 1893 return -ENXIO;
1891 } 1894 }
1892 if (pd->settings.size > PACKET_MAX_SECTORS) { 1895 if (pd->settings.size > PACKET_MAX_SECTORS) {
1893 printk(DRIVER_NAME": packet size is too big\n"); 1896 pkt_err(pd, "packet size is too big\n");
1894 return -EROFS; 1897 return -EROFS;
1895 } 1898 }
1896 pd->settings.fp = ti.fp; 1899 pd->settings.fp = ti.fp;
@@ -1932,7 +1935,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd)
1932 pd->settings.block_mode = PACKET_BLOCK_MODE2; 1935 pd->settings.block_mode = PACKET_BLOCK_MODE2;
1933 break; 1936 break;
1934 default: 1937 default:
1935 printk(DRIVER_NAME": unknown data mode\n"); 1938 pkt_err(pd, "unknown data mode\n");
1936 return -EROFS; 1939 return -EROFS;
1937 } 1940 }
1938 return 0; 1941 return 0;
@@ -1966,10 +1969,10 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd,
1966 cgc.buflen = cgc.cmd[8] = 2 + ((buf[0] << 8) | (buf[1] & 0xff)); 1969 cgc.buflen = cgc.cmd[8] = 2 + ((buf[0] << 8) | (buf[1] & 0xff));
1967 ret = pkt_mode_select(pd, &cgc); 1970 ret = pkt_mode_select(pd, &cgc);
1968 if (ret) { 1971 if (ret) {
1969 printk(DRIVER_NAME": write caching control failed\n"); 1972 pkt_err(pd, "write caching control failed\n");
1970 pkt_dump_sense(&cgc); 1973 pkt_dump_sense(pd, &cgc);
1971 } else if (!ret && set) 1974 } else if (!ret && set)
1972 printk(DRIVER_NAME": enabled write caching on %s\n", pd->name); 1975 pkt_notice(pd, "enabled write caching\n");
1973 return ret; 1976 return ret;
1974} 1977}
1975 1978
@@ -2005,7 +2008,7 @@ static noinline_for_stack int pkt_get_max_speed(struct pktcdvd_device *pd,
2005 sizeof(struct mode_page_header); 2008 sizeof(struct mode_page_header);
2006 ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0); 2009 ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0);
2007 if (ret) { 2010 if (ret) {
2008 pkt_dump_sense(&cgc); 2011 pkt_dump_sense(pd, &cgc);
2009 return ret; 2012 return ret;
2010 } 2013 }
2011 } 2014 }
@@ -2064,7 +2067,7 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
2064 cgc.cmd[8] = 2; 2067 cgc.cmd[8] = 2;
2065 ret = pkt_generic_packet(pd, &cgc); 2068 ret = pkt_generic_packet(pd, &cgc);
2066 if (ret) { 2069 if (ret) {
2067 pkt_dump_sense(&cgc); 2070 pkt_dump_sense(pd, &cgc);
2068 return ret; 2071 return ret;
2069 } 2072 }
2070 size = ((unsigned int) buf[0]<<8) + buf[1] + 2; 2073 size = ((unsigned int) buf[0]<<8) + buf[1] + 2;
@@ -2079,16 +2082,16 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
2079 cgc.cmd[8] = size; 2082 cgc.cmd[8] = size;
2080 ret = pkt_generic_packet(pd, &cgc); 2083 ret = pkt_generic_packet(pd, &cgc);
2081 if (ret) { 2084 if (ret) {
2082 pkt_dump_sense(&cgc); 2085 pkt_dump_sense(pd, &cgc);
2083 return ret; 2086 return ret;
2084 } 2087 }
2085 2088
2086 if (!(buf[6] & 0x40)) { 2089 if (!(buf[6] & 0x40)) {
2087 printk(DRIVER_NAME": Disc type is not CD-RW\n"); 2090 pkt_notice(pd, "disc type is not CD-RW\n");
2088 return 1; 2091 return 1;
2089 } 2092 }
2090 if (!(buf[6] & 0x4)) { 2093 if (!(buf[6] & 0x4)) {
2091 printk(DRIVER_NAME": A1 values on media are not valid, maybe not CDRW?\n"); 2094 pkt_notice(pd, "A1 values on media are not valid, maybe not CDRW?\n");
2092 return 1; 2095 return 1;
2093 } 2096 }
2094 2097
@@ -2108,14 +2111,14 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
2108 *speed = us_clv_to_speed[sp]; 2111 *speed = us_clv_to_speed[sp];
2109 break; 2112 break;
2110 default: 2113 default:
2111 printk(DRIVER_NAME": Unknown disc sub-type %d\n",st); 2114 pkt_notice(pd, "unknown disc sub-type %d\n", st);
2112 return 1; 2115 return 1;
2113 } 2116 }
2114 if (*speed) { 2117 if (*speed) {
2115 printk(DRIVER_NAME": Max. media speed: %d\n",*speed); 2118 pkt_info(pd, "maximum media speed: %d\n", *speed);
2116 return 0; 2119 return 0;
2117 } else { 2120 } else {
2118 printk(DRIVER_NAME": Unknown speed %d for sub-type %d\n",sp,st); 2121 pkt_notice(pd, "unknown speed %d for sub-type %d\n", sp, st);
2119 return 1; 2122 return 1;
2120 } 2123 }
2121} 2124}
@@ -2126,7 +2129,7 @@ static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd)
2126 struct request_sense sense; 2129 struct request_sense sense;
2127 int ret; 2130 int ret;
2128 2131
2129 VPRINTK(DRIVER_NAME": Performing OPC\n"); 2132 pkt_dbg(2, pd, "Performing OPC\n");
2130 2133
2131 init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); 2134 init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
2132 cgc.sense = &sense; 2135 cgc.sense = &sense;
@@ -2134,7 +2137,7 @@ static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd)
2134 cgc.cmd[0] = GPCMD_SEND_OPC; 2137 cgc.cmd[0] = GPCMD_SEND_OPC;
2135 cgc.cmd[1] = 1; 2138 cgc.cmd[1] = 1;
2136 if ((ret = pkt_generic_packet(pd, &cgc))) 2139 if ((ret = pkt_generic_packet(pd, &cgc)))
2137 pkt_dump_sense(&cgc); 2140 pkt_dump_sense(pd, &cgc);
2138 return ret; 2141 return ret;
2139} 2142}
2140 2143
@@ -2144,12 +2147,12 @@ static int pkt_open_write(struct pktcdvd_device *pd)
2144 unsigned int write_speed, media_write_speed, read_speed; 2147 unsigned int write_speed, media_write_speed, read_speed;
2145 2148
2146 if ((ret = pkt_probe_settings(pd))) { 2149 if ((ret = pkt_probe_settings(pd))) {
2147 VPRINTK(DRIVER_NAME": %s failed probe\n", pd->name); 2150 pkt_dbg(2, pd, "failed probe\n");
2148 return ret; 2151 return ret;
2149 } 2152 }
2150 2153
2151 if ((ret = pkt_set_write_settings(pd))) { 2154 if ((ret = pkt_set_write_settings(pd))) {
2152 DPRINTK(DRIVER_NAME": %s failed saving write settings\n", pd->name); 2155 pkt_dbg(1, pd, "failed saving write settings\n");
2153 return -EIO; 2156 return -EIO;
2154 } 2157 }
2155 2158
@@ -2161,26 +2164,26 @@ static int pkt_open_write(struct pktcdvd_device *pd)
2161 case 0x13: /* DVD-RW */ 2164 case 0x13: /* DVD-RW */
2162 case 0x1a: /* DVD+RW */ 2165 case 0x1a: /* DVD+RW */
2163 case 0x12: /* DVD-RAM */ 2166 case 0x12: /* DVD-RAM */
2164 DPRINTK(DRIVER_NAME": write speed %ukB/s\n", write_speed); 2167 pkt_dbg(1, pd, "write speed %ukB/s\n", write_speed);
2165 break; 2168 break;
2166 default: 2169 default:
2167 if ((ret = pkt_media_speed(pd, &media_write_speed))) 2170 if ((ret = pkt_media_speed(pd, &media_write_speed)))
2168 media_write_speed = 16; 2171 media_write_speed = 16;
2169 write_speed = min(write_speed, media_write_speed * 177); 2172 write_speed = min(write_speed, media_write_speed * 177);
2170 DPRINTK(DRIVER_NAME": write speed %ux\n", write_speed / 176); 2173 pkt_dbg(1, pd, "write speed %ux\n", write_speed / 176);
2171 break; 2174 break;
2172 } 2175 }
2173 read_speed = write_speed; 2176 read_speed = write_speed;
2174 2177
2175 if ((ret = pkt_set_speed(pd, write_speed, read_speed))) { 2178 if ((ret = pkt_set_speed(pd, write_speed, read_speed))) {
2176 DPRINTK(DRIVER_NAME": %s couldn't set write speed\n", pd->name); 2179 pkt_dbg(1, pd, "couldn't set write speed\n");
2177 return -EIO; 2180 return -EIO;
2178 } 2181 }
2179 pd->write_speed = write_speed; 2182 pd->write_speed = write_speed;
2180 pd->read_speed = read_speed; 2183 pd->read_speed = read_speed;
2181 2184
2182 if ((ret = pkt_perform_opc(pd))) { 2185 if ((ret = pkt_perform_opc(pd))) {
2183 DPRINTK(DRIVER_NAME": %s Optimum Power Calibration failed\n", pd->name); 2186 pkt_dbg(1, pd, "Optimum Power Calibration failed\n");
2184 } 2187 }
2185 2188
2186 return 0; 2189 return 0;
@@ -2205,7 +2208,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
2205 goto out; 2208 goto out;
2206 2209
2207 if ((ret = pkt_get_last_written(pd, &lba))) { 2210 if ((ret = pkt_get_last_written(pd, &lba))) {
2208 printk(DRIVER_NAME": pkt_get_last_written failed\n"); 2211 pkt_err(pd, "pkt_get_last_written failed\n");
2209 goto out_putdev; 2212 goto out_putdev;
2210 } 2213 }
2211 2214
@@ -2235,11 +2238,11 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
2235 2238
2236 if (write) { 2239 if (write) {
2237 if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) { 2240 if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) {
2238 printk(DRIVER_NAME": not enough memory for buffers\n"); 2241 pkt_err(pd, "not enough memory for buffers\n");
2239 ret = -ENOMEM; 2242 ret = -ENOMEM;
2240 goto out_putdev; 2243 goto out_putdev;
2241 } 2244 }
2242 printk(DRIVER_NAME": %lukB available on disc\n", lba << 1); 2245 pkt_info(pd, "%lukB available on disc\n", lba << 1);
2243 } 2246 }
2244 2247
2245 return 0; 2248 return 0;
@@ -2257,7 +2260,7 @@ out:
2257static void pkt_release_dev(struct pktcdvd_device *pd, int flush) 2260static void pkt_release_dev(struct pktcdvd_device *pd, int flush)
2258{ 2261{
2259 if (flush && pkt_flush_cache(pd)) 2262 if (flush && pkt_flush_cache(pd))
2260 DPRINTK(DRIVER_NAME": %s not flushing cache\n", pd->name); 2263 pkt_dbg(1, pd, "not flushing cache\n");
2261 2264
2262 pkt_lock_door(pd, 0); 2265 pkt_lock_door(pd, 0);
2263 2266
@@ -2279,8 +2282,6 @@ static int pkt_open(struct block_device *bdev, fmode_t mode)
2279 struct pktcdvd_device *pd = NULL; 2282 struct pktcdvd_device *pd = NULL;
2280 int ret; 2283 int ret;
2281 2284
2282 VPRINTK(DRIVER_NAME": entering open\n");
2283
2284 mutex_lock(&pktcdvd_mutex); 2285 mutex_lock(&pktcdvd_mutex);
2285 mutex_lock(&ctl_mutex); 2286 mutex_lock(&ctl_mutex);
2286 pd = pkt_find_dev_from_minor(MINOR(bdev->bd_dev)); 2287 pd = pkt_find_dev_from_minor(MINOR(bdev->bd_dev));
@@ -2315,7 +2316,6 @@ static int pkt_open(struct block_device *bdev, fmode_t mode)
2315out_dec: 2316out_dec:
2316 pd->refcnt--; 2317 pd->refcnt--;
2317out: 2318out:
2318 VPRINTK(DRIVER_NAME": failed open (%d)\n", ret);
2319 mutex_unlock(&ctl_mutex); 2319 mutex_unlock(&ctl_mutex);
2320 mutex_unlock(&pktcdvd_mutex); 2320 mutex_unlock(&pktcdvd_mutex);
2321 return ret; 2321 return ret;
@@ -2360,7 +2360,8 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio)
2360 2360
2361 pd = q->queuedata; 2361 pd = q->queuedata;
2362 if (!pd) { 2362 if (!pd) {
2363 printk(DRIVER_NAME": %s incorrect request queue\n", bdevname(bio->bi_bdev, b)); 2363 pr_err("%s incorrect request queue\n",
2364 bdevname(bio->bi_bdev, b));
2364 goto end_io; 2365 goto end_io;
2365 } 2366 }
2366 2367
@@ -2382,20 +2383,20 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio)
2382 } 2383 }
2383 2384
2384 if (!test_bit(PACKET_WRITABLE, &pd->flags)) { 2385 if (!test_bit(PACKET_WRITABLE, &pd->flags)) {
2385 printk(DRIVER_NAME": WRITE for ro device %s (%llu)\n", 2386 pkt_notice(pd, "WRITE for ro device (%llu)\n",
2386 pd->name, (unsigned long long)bio->bi_sector); 2387 (unsigned long long)bio->bi_sector);
2387 goto end_io; 2388 goto end_io;
2388 } 2389 }
2389 2390
2390 if (!bio->bi_size || (bio->bi_size % CD_FRAMESIZE)) { 2391 if (!bio->bi_size || (bio->bi_size % CD_FRAMESIZE)) {
2391 printk(DRIVER_NAME": wrong bio size\n"); 2392 pkt_err(pd, "wrong bio size\n");
2392 goto end_io; 2393 goto end_io;
2393 } 2394 }
2394 2395
2395 blk_queue_bounce(q, &bio); 2396 blk_queue_bounce(q, &bio);
2396 2397
2397 zone = ZONE(bio->bi_sector, pd); 2398 zone = get_zone(bio->bi_sector, pd);
2398 VPRINTK("pkt_make_request: start = %6llx stop = %6llx\n", 2399 pkt_dbg(2, pd, "start = %6llx stop = %6llx\n",
2399 (unsigned long long)bio->bi_sector, 2400 (unsigned long long)bio->bi_sector,
2400 (unsigned long long)bio_end_sector(bio)); 2401 (unsigned long long)bio_end_sector(bio));
2401 2402
@@ -2405,7 +2406,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio)
2405 sector_t last_zone; 2406 sector_t last_zone;
2406 int first_sectors; 2407 int first_sectors;
2407 2408
2408 last_zone = ZONE(bio_end_sector(bio) - 1, pd); 2409 last_zone = get_zone(bio_end_sector(bio) - 1, pd);
2409 if (last_zone != zone) { 2410 if (last_zone != zone) {
2410 BUG_ON(last_zone != zone + pd->settings.size); 2411 BUG_ON(last_zone != zone + pd->settings.size);
2411 first_sectors = last_zone - bio->bi_sector; 2412 first_sectors = last_zone - bio->bi_sector;
@@ -2500,7 +2501,7 @@ static int pkt_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2500 struct bio_vec *bvec) 2501 struct bio_vec *bvec)
2501{ 2502{
2502 struct pktcdvd_device *pd = q->queuedata; 2503 struct pktcdvd_device *pd = q->queuedata;
2503 sector_t zone = ZONE(bmd->bi_sector, pd); 2504 sector_t zone = get_zone(bmd->bi_sector, pd);
2504 int used = ((bmd->bi_sector - zone) << 9) + bmd->bi_size; 2505 int used = ((bmd->bi_sector - zone) << 9) + bmd->bi_size;
2505 int remaining = (pd->settings.size << 9) - used; 2506 int remaining = (pd->settings.size << 9) - used;
2506 int remaining2; 2507 int remaining2;
@@ -2609,7 +2610,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
2609 struct block_device *bdev; 2610 struct block_device *bdev;
2610 2611
2611 if (pd->pkt_dev == dev) { 2612 if (pd->pkt_dev == dev) {
2612 printk(DRIVER_NAME": Recursive setup not allowed\n"); 2613 pkt_err(pd, "recursive setup not allowed\n");
2613 return -EBUSY; 2614 return -EBUSY;
2614 } 2615 }
2615 for (i = 0; i < MAX_WRITERS; i++) { 2616 for (i = 0; i < MAX_WRITERS; i++) {
@@ -2617,11 +2618,12 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
2617 if (!pd2) 2618 if (!pd2)
2618 continue; 2619 continue;
2619 if (pd2->bdev->bd_dev == dev) { 2620 if (pd2->bdev->bd_dev == dev) {
2620 printk(DRIVER_NAME": %s already setup\n", bdevname(pd2->bdev, b)); 2621 pkt_err(pd, "%s already setup\n",
2622 bdevname(pd2->bdev, b));
2621 return -EBUSY; 2623 return -EBUSY;
2622 } 2624 }
2623 if (pd2->pkt_dev == dev) { 2625 if (pd2->pkt_dev == dev) {
2624 printk(DRIVER_NAME": Can't chain pktcdvd devices\n"); 2626 pkt_err(pd, "can't chain pktcdvd devices\n");
2625 return -EBUSY; 2627 return -EBUSY;
2626 } 2628 }
2627 } 2629 }
@@ -2644,13 +2646,13 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
2644 atomic_set(&pd->cdrw.pending_bios, 0); 2646 atomic_set(&pd->cdrw.pending_bios, 0);
2645 pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->name); 2647 pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->name);
2646 if (IS_ERR(pd->cdrw.thread)) { 2648 if (IS_ERR(pd->cdrw.thread)) {
2647 printk(DRIVER_NAME": can't start kernel thread\n"); 2649 pkt_err(pd, "can't start kernel thread\n");
2648 ret = -ENOMEM; 2650 ret = -ENOMEM;
2649 goto out_mem; 2651 goto out_mem;
2650 } 2652 }
2651 2653
2652 proc_create_data(pd->name, 0, pkt_proc, &pkt_proc_fops, pd); 2654 proc_create_data(pd->name, 0, pkt_proc, &pkt_proc_fops, pd);
2653 DPRINTK(DRIVER_NAME": writer %s mapped to %s\n", pd->name, bdevname(bdev, b)); 2655 pkt_dbg(1, pd, "writer mapped to %s\n", bdevname(bdev, b));
2654 return 0; 2656 return 0;
2655 2657
2656out_mem: 2658out_mem:
@@ -2665,8 +2667,8 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
2665 struct pktcdvd_device *pd = bdev->bd_disk->private_data; 2667 struct pktcdvd_device *pd = bdev->bd_disk->private_data;
2666 int ret; 2668 int ret;
2667 2669
2668 VPRINTK("pkt_ioctl: cmd %x, dev %d:%d\n", cmd, 2670 pkt_dbg(2, pd, "cmd %x, dev %d:%d\n",
2669 MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); 2671 cmd, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
2670 2672
2671 mutex_lock(&pktcdvd_mutex); 2673 mutex_lock(&pktcdvd_mutex);
2672 switch (cmd) { 2674 switch (cmd) {
@@ -2690,7 +2692,7 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
2690 break; 2692 break;
2691 2693
2692 default: 2694 default:
2693 VPRINTK(DRIVER_NAME": Unknown ioctl for %s (%x)\n", pd->name, cmd); 2695 pkt_dbg(2, pd, "Unknown ioctl (%x)\n", cmd);
2694 ret = -ENOTTY; 2696 ret = -ENOTTY;
2695 } 2697 }
2696 mutex_unlock(&pktcdvd_mutex); 2698 mutex_unlock(&pktcdvd_mutex);
@@ -2743,7 +2745,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
2743 if (!pkt_devs[idx]) 2745 if (!pkt_devs[idx])
2744 break; 2746 break;
2745 if (idx == MAX_WRITERS) { 2747 if (idx == MAX_WRITERS) {
2746 printk(DRIVER_NAME": max %d writers supported\n", MAX_WRITERS); 2748 pr_err("max %d writers supported\n", MAX_WRITERS);
2747 ret = -EBUSY; 2749 ret = -EBUSY;
2748 goto out_mutex; 2750 goto out_mutex;
2749 } 2751 }
@@ -2818,7 +2820,7 @@ out_mem:
2818 kfree(pd); 2820 kfree(pd);
2819out_mutex: 2821out_mutex:
2820 mutex_unlock(&ctl_mutex); 2822 mutex_unlock(&ctl_mutex);
2821 printk(DRIVER_NAME": setup of pktcdvd device failed\n"); 2823 pr_err("setup of pktcdvd device failed\n");
2822 return ret; 2824 return ret;
2823} 2825}
2824 2826
@@ -2839,7 +2841,7 @@ static int pkt_remove_dev(dev_t pkt_dev)
2839 break; 2841 break;
2840 } 2842 }
2841 if (idx == MAX_WRITERS) { 2843 if (idx == MAX_WRITERS) {
2842 DPRINTK(DRIVER_NAME": dev not setup\n"); 2844 pr_debug("dev not setup\n");
2843 ret = -ENXIO; 2845 ret = -ENXIO;
2844 goto out; 2846 goto out;
2845 } 2847 }
@@ -2859,7 +2861,7 @@ static int pkt_remove_dev(dev_t pkt_dev)
2859 blkdev_put(pd->bdev, FMODE_READ | FMODE_NDELAY); 2861 blkdev_put(pd->bdev, FMODE_READ | FMODE_NDELAY);
2860 2862
2861 remove_proc_entry(pd->name, pkt_proc); 2863 remove_proc_entry(pd->name, pkt_proc);
2862 DPRINTK(DRIVER_NAME": writer %s unmapped\n", pd->name); 2864 pkt_dbg(1, pd, "writer unmapped\n");
2863 2865
2864 del_gendisk(pd->disk); 2866 del_gendisk(pd->disk);
2865 blk_cleanup_queue(pd->disk->queue); 2867 blk_cleanup_queue(pd->disk->queue);
@@ -2969,7 +2971,7 @@ static int __init pkt_init(void)
2969 2971
2970 ret = register_blkdev(pktdev_major, DRIVER_NAME); 2972 ret = register_blkdev(pktdev_major, DRIVER_NAME);
2971 if (ret < 0) { 2973 if (ret < 0) {
2972 printk(DRIVER_NAME": Unable to register block device\n"); 2974 pr_err("unable to register block device\n");
2973 goto out2; 2975 goto out2;
2974 } 2976 }
2975 if (!pktdev_major) 2977 if (!pktdev_major)
@@ -2983,7 +2985,7 @@ static int __init pkt_init(void)
2983 2985
2984 ret = misc_register(&pkt_misc); 2986 ret = misc_register(&pkt_misc);
2985 if (ret) { 2987 if (ret) {
2986 printk(DRIVER_NAME": Unable to register misc device\n"); 2988 pr_err("unable to register misc device\n");
2987 goto out_misc; 2989 goto out_misc;
2988 } 2990 }
2989 2991
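Besides the pkt_err/pkt_notice/pkt_info/pkt_dbg logging conversion, the ZONE() macro becomes the get_zone() helper. The arithmetic is unchanged: a sector is rounded down to the start of its fixed packet by masking with the power-of-two packet size. A quick standalone check with an illustrative 32-sector packet size and zero offset:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;	/* stand-in for the kernel type */

static sector_t get_zone_demo(sector_t sector, sector_t offset, uint32_t pkt_size)
{
	return (sector + offset) & ~(sector_t)(pkt_size - 1);
}

int main(void)
{
	/* 32-sector packets, no offset: sectors 0..31 map to zone 0,
	 * 32..63 to zone 32, and so on. */
	printf("%llu\n", (unsigned long long)get_zone_demo(100, 0, 32));	/* 96 */
	printf("%llu\n", (unsigned long long)get_zone_demo(31, 0, 32));		/* 0 */
	return 0;
}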
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 39c51cc7fabc..b22a7d0fe5b7 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -5132,7 +5132,7 @@ static ssize_t rbd_remove(struct bus_type *bus,
5132 bool already = false; 5132 bool already = false;
5133 int ret; 5133 int ret;
5134 5134
5135 ret = strict_strtoul(buf, 10, &ul); 5135 ret = kstrtoul(buf, 10, &ul);
5136 if (ret) 5136 if (ret)
5137 return ret; 5137 return ret;
5138 5138
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index 8ed6ccb748cf..b02d53a399f3 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -924,7 +924,6 @@ static int swim_probe(struct platform_device *dev)
924 return 0; 924 return 0;
925 925
926out_kfree: 926out_kfree:
927 platform_set_drvdata(dev, NULL);
928 kfree(swd); 927 kfree(swd);
929out_iounmap: 928out_iounmap:
930 iounmap(swim_base); 929 iounmap(swim_base);
@@ -962,7 +961,6 @@ static int swim_remove(struct platform_device *dev)
962 if (res) 961 if (res)
963 release_mem_region(res->start, resource_size(res)); 962 release_mem_region(res->start, resource_size(res));
964 963
965 platform_set_drvdata(dev, NULL);
966 kfree(swd); 964 kfree(swd);
967 965
968 return 0; 966 return 0;
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index fe5c3cd10c34..c2014a0aa206 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -620,7 +620,7 @@ static void backend_changed(struct xenbus_watch *watch,
620 } 620 }
621 621
622 /* Front end dir is a number, which is used as the handle. */ 622 /* Front end dir is a number, which is used as the handle. */
623 err = strict_strtoul(strrchr(dev->otherend, '/') + 1, 0, &handle); 623 err = kstrtoul(strrchr(dev->otherend, '/') + 1, 0, &handle);
624 if (err) 624 if (err)
625 return; 625 return;
626 626
diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c
index 4519cb332987..5796d0157ce0 100644
--- a/drivers/char/tpm/tpm_tis.c
+++ b/drivers/char/tpm/tpm_tis.c
@@ -766,6 +766,25 @@ static void tpm_tis_reenable_interrupts(struct tpm_chip *chip)
766} 766}
767#endif 767#endif
768 768
769#ifdef CONFIG_PM_SLEEP
770static int tpm_tis_resume(struct device *dev)
771{
772 struct tpm_chip *chip = dev_get_drvdata(dev);
773 int ret;
774
775 if (chip->vendor.irq)
776 tpm_tis_reenable_interrupts(chip);
777
778 ret = tpm_pm_resume(dev);
779 if (!ret)
780 tpm_do_selftest(chip);
781
782 return ret;
783}
784#endif
785
786static SIMPLE_DEV_PM_OPS(tpm_tis_pm, tpm_pm_suspend, tpm_tis_resume);
787
769#ifdef CONFIG_PNP 788#ifdef CONFIG_PNP
770static int tpm_tis_pnp_init(struct pnp_dev *pnp_dev, 789static int tpm_tis_pnp_init(struct pnp_dev *pnp_dev,
771 const struct pnp_device_id *pnp_id) 790 const struct pnp_device_id *pnp_id)
@@ -787,26 +806,6 @@ static int tpm_tis_pnp_init(struct pnp_dev *pnp_dev,
787 return tpm_tis_init(&pnp_dev->dev, start, len, irq); 806 return tpm_tis_init(&pnp_dev->dev, start, len, irq);
788} 807}
789 808
790static int tpm_tis_pnp_suspend(struct pnp_dev *dev, pm_message_t msg)
791{
792 return tpm_pm_suspend(&dev->dev);
793}
794
795static int tpm_tis_pnp_resume(struct pnp_dev *dev)
796{
797 struct tpm_chip *chip = pnp_get_drvdata(dev);
798 int ret;
799
800 if (chip->vendor.irq)
801 tpm_tis_reenable_interrupts(chip);
802
803 ret = tpm_pm_resume(&dev->dev);
804 if (!ret)
805 tpm_do_selftest(chip);
806
807 return ret;
808}
809
810static struct pnp_device_id tpm_pnp_tbl[] = { 809static struct pnp_device_id tpm_pnp_tbl[] = {
811 {"PNP0C31", 0}, /* TPM */ 810 {"PNP0C31", 0}, /* TPM */
812 {"ATM1200", 0}, /* Atmel */ 811 {"ATM1200", 0}, /* Atmel */
@@ -835,9 +834,12 @@ static struct pnp_driver tis_pnp_driver = {
835 .name = "tpm_tis", 834 .name = "tpm_tis",
836 .id_table = tpm_pnp_tbl, 835 .id_table = tpm_pnp_tbl,
837 .probe = tpm_tis_pnp_init, 836 .probe = tpm_tis_pnp_init,
838 .suspend = tpm_tis_pnp_suspend,
839 .resume = tpm_tis_pnp_resume,
840 .remove = tpm_tis_pnp_remove, 837 .remove = tpm_tis_pnp_remove,
838#ifdef CONFIG_PM_SLEEP
839 .driver = {
840 .pm = &tpm_tis_pm,
841 },
842#endif
841}; 843};
842 844
843#define TIS_HID_USR_IDX sizeof(tpm_pnp_tbl)/sizeof(struct pnp_device_id) -2 845#define TIS_HID_USR_IDX sizeof(tpm_pnp_tbl)/sizeof(struct pnp_device_id) -2
@@ -846,20 +848,6 @@ module_param_string(hid, tpm_pnp_tbl[TIS_HID_USR_IDX].id,
846MODULE_PARM_DESC(hid, "Set additional specific HID for this driver to probe"); 848MODULE_PARM_DESC(hid, "Set additional specific HID for this driver to probe");
847#endif 849#endif
848 850
849#ifdef CONFIG_PM_SLEEP
850static int tpm_tis_resume(struct device *dev)
851{
852 struct tpm_chip *chip = dev_get_drvdata(dev);
853
854 if (chip->vendor.irq)
855 tpm_tis_reenable_interrupts(chip);
856
857 return tpm_pm_resume(dev);
858}
859#endif
860
861static SIMPLE_DEV_PM_OPS(tpm_tis_pm, tpm_pm_suspend, tpm_tis_resume);
862
863static struct platform_driver tis_drv = { 851static struct platform_driver tis_drv = {
864 .driver = { 852 .driver = {
865 .name = "tpm_tis", 853 .name = "tpm_tis",
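The tpm_tis change drops the PNP-bus-specific suspend/resume callbacks and moves the SIMPLE_DEV_PM_OPS table up so the same dev_pm_ops serves both the PNP and platform drivers, with the self-test re-run folded into tpm_tis_resume(). The general shape of such a conversion, sketched with made-up foo_* names:

#include <linux/device.h>
#include <linux/platform_device.h>
#include <linux/pm.h>

#ifdef CONFIG_PM_SLEEP
static int foo_suspend(struct device *dev)
{
	return 0;	/* quiesce the hardware here */
}

static int foo_resume(struct device *dev)
{
	return 0;	/* re-enable interrupts, re-run self tests, ... */
}
#endif

static SIMPLE_DEV_PM_OPS(foo_pm, foo_suspend, foo_resume);

static struct platform_driver foo_driver = {
	.driver = {
		.name	= "foo",
		.pm	= &foo_pm,	/* replaces bus-specific .suspend/.resume */
	},
	/* .probe / .remove omitted for brevity */
};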
diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c
index 232fa8fce26a..fa0affb699b4 100644
--- a/drivers/firmware/dmi_scan.c
+++ b/drivers/firmware/dmi_scan.c
@@ -14,7 +14,7 @@
14 * of and an antecedent to, SMBIOS, which stands for System 14 * of and an antecedent to, SMBIOS, which stands for System
15 * Management BIOS. See further: http://www.dmtf.org/standards 15 * Management BIOS. See further: http://www.dmtf.org/standards
16 */ 16 */
17static char dmi_empty_string[] = " "; 17static const char dmi_empty_string[] = " ";
18 18
19static u16 __initdata dmi_ver; 19static u16 __initdata dmi_ver;
20/* 20/*
@@ -49,7 +49,7 @@ static const char * __init dmi_string_nosave(const struct dmi_header *dm, u8 s)
49 return ""; 49 return "";
50} 50}
51 51
52static char * __init dmi_string(const struct dmi_header *dm, u8 s) 52static const char * __init dmi_string(const struct dmi_header *dm, u8 s)
53{ 53{
54 const char *bp = dmi_string_nosave(dm, s); 54 const char *bp = dmi_string_nosave(dm, s);
55 char *str; 55 char *str;
@@ -62,8 +62,6 @@ static char * __init dmi_string(const struct dmi_header *dm, u8 s)
62 str = dmi_alloc(len); 62 str = dmi_alloc(len);
63 if (str != NULL) 63 if (str != NULL)
64 strcpy(str, bp); 64 strcpy(str, bp);
65 else
66 printk(KERN_ERR "dmi_string: cannot allocate %Zu bytes.\n", len);
67 65
68 return str; 66 return str;
69} 67}
@@ -133,17 +131,18 @@ static int __init dmi_checksum(const u8 *buf, u8 len)
133 return sum == 0; 131 return sum == 0;
134} 132}
135 133
136static char *dmi_ident[DMI_STRING_MAX]; 134static const char *dmi_ident[DMI_STRING_MAX];
137static LIST_HEAD(dmi_devices); 135static LIST_HEAD(dmi_devices);
138int dmi_available; 136int dmi_available;
139 137
140/* 138/*
141 * Save a DMI string 139 * Save a DMI string
142 */ 140 */
143static void __init dmi_save_ident(const struct dmi_header *dm, int slot, int string) 141static void __init dmi_save_ident(const struct dmi_header *dm, int slot,
142 int string)
144{ 143{
145 const char *d = (const char*) dm; 144 const char *d = (const char *) dm;
146 char *p; 145 const char *p;
147 146
148 if (dmi_ident[slot]) 147 if (dmi_ident[slot])
149 return; 148 return;
@@ -155,9 +154,10 @@ static void __init dmi_save_ident(const struct dmi_header *dm, int slot, int str
155 dmi_ident[slot] = p; 154 dmi_ident[slot] = p;
156} 155}
157 156
158static void __init dmi_save_uuid(const struct dmi_header *dm, int slot, int index) 157static void __init dmi_save_uuid(const struct dmi_header *dm, int slot,
158 int index)
159{ 159{
160 const u8 *d = (u8*) dm + index; 160 const u8 *d = (u8 *) dm + index;
161 char *s; 161 char *s;
162 int is_ff = 1, is_00 = 1, i; 162 int is_ff = 1, is_00 = 1, i;
163 163
@@ -188,12 +188,13 @@ static void __init dmi_save_uuid(const struct dmi_header *dm, int slot, int inde
188 else 188 else
189 sprintf(s, "%pUB", d); 189 sprintf(s, "%pUB", d);
190 190
191 dmi_ident[slot] = s; 191 dmi_ident[slot] = s;
192} 192}
193 193
194static void __init dmi_save_type(const struct dmi_header *dm, int slot, int index) 194static void __init dmi_save_type(const struct dmi_header *dm, int slot,
195 int index)
195{ 196{
196 const u8 *d = (u8*) dm + index; 197 const u8 *d = (u8 *) dm + index;
197 char *s; 198 char *s;
198 199
199 if (dmi_ident[slot]) 200 if (dmi_ident[slot])
@@ -216,10 +217,8 @@ static void __init dmi_save_one_device(int type, const char *name)
216 return; 217 return;
217 218
218 dev = dmi_alloc(sizeof(*dev) + strlen(name) + 1); 219 dev = dmi_alloc(sizeof(*dev) + strlen(name) + 1);
219 if (!dev) { 220 if (!dev)
220 printk(KERN_ERR "dmi_save_one_device: out of memory.\n");
221 return; 221 return;
222 }
223 222
224 dev->type = type; 223 dev->type = type;
225 strcpy((char *)(dev + 1), name); 224 strcpy((char *)(dev + 1), name);
@@ -249,17 +248,14 @@ static void __init dmi_save_oem_strings_devices(const struct dmi_header *dm)
249 struct dmi_device *dev; 248 struct dmi_device *dev;
250 249
251 for (i = 1; i <= count; i++) { 250 for (i = 1; i <= count; i++) {
252 char *devname = dmi_string(dm, i); 251 const char *devname = dmi_string(dm, i);
253 252
254 if (devname == dmi_empty_string) 253 if (devname == dmi_empty_string)
255 continue; 254 continue;
256 255
257 dev = dmi_alloc(sizeof(*dev)); 256 dev = dmi_alloc(sizeof(*dev));
258 if (!dev) { 257 if (!dev)
259 printk(KERN_ERR
260 "dmi_save_oem_strings_devices: out of memory.\n");
261 break; 258 break;
262 }
263 259
264 dev->type = DMI_DEV_TYPE_OEM_STRING; 260 dev->type = DMI_DEV_TYPE_OEM_STRING;
265 dev->name = devname; 261 dev->name = devname;
@@ -272,21 +268,17 @@ static void __init dmi_save_oem_strings_devices(const struct dmi_header *dm)
272static void __init dmi_save_ipmi_device(const struct dmi_header *dm) 268static void __init dmi_save_ipmi_device(const struct dmi_header *dm)
273{ 269{
274 struct dmi_device *dev; 270 struct dmi_device *dev;
275 void * data; 271 void *data;
276 272
277 data = dmi_alloc(dm->length); 273 data = dmi_alloc(dm->length);
278 if (data == NULL) { 274 if (data == NULL)
279 printk(KERN_ERR "dmi_save_ipmi_device: out of memory.\n");
280 return; 275 return;
281 }
282 276
283 memcpy(data, dm, dm->length); 277 memcpy(data, dm, dm->length);
284 278
285 dev = dmi_alloc(sizeof(*dev)); 279 dev = dmi_alloc(sizeof(*dev));
286 if (!dev) { 280 if (!dev)
287 printk(KERN_ERR "dmi_save_ipmi_device: out of memory.\n");
288 return; 281 return;
289 }
290 282
291 dev->type = DMI_DEV_TYPE_IPMI; 283 dev->type = DMI_DEV_TYPE_IPMI;
292 dev->name = "IPMI controller"; 284 dev->name = "IPMI controller";
@@ -301,10 +293,9 @@ static void __init dmi_save_dev_onboard(int instance, int segment, int bus,
301 struct dmi_dev_onboard *onboard_dev; 293 struct dmi_dev_onboard *onboard_dev;
302 294
303 onboard_dev = dmi_alloc(sizeof(*onboard_dev) + strlen(name) + 1); 295 onboard_dev = dmi_alloc(sizeof(*onboard_dev) + strlen(name) + 1);
304 if (!onboard_dev) { 296 if (!onboard_dev)
305 printk(KERN_ERR "dmi_save_dev_onboard: out of memory.\n");
306 return; 297 return;
307 } 298
308 onboard_dev->instance = instance; 299 onboard_dev->instance = instance;
309 onboard_dev->segment = segment; 300 onboard_dev->segment = segment;
310 onboard_dev->bus = bus; 301 onboard_dev->bus = bus;
@@ -320,7 +311,7 @@ static void __init dmi_save_dev_onboard(int instance, int segment, int bus,
320 311
321static void __init dmi_save_extended_devices(const struct dmi_header *dm) 312static void __init dmi_save_extended_devices(const struct dmi_header *dm)
322{ 313{
323 const u8 *d = (u8*) dm + 5; 314 const u8 *d = (u8 *) dm + 5;
324 315
325 /* Skip disabled device */ 316 /* Skip disabled device */
326 if ((*d & 0x80) == 0) 317 if ((*d & 0x80) == 0)
@@ -338,7 +329,7 @@ static void __init dmi_save_extended_devices(const struct dmi_header *dm)
338 */ 329 */
339static void __init dmi_decode(const struct dmi_header *dm, void *dummy) 330static void __init dmi_decode(const struct dmi_header *dm, void *dummy)
340{ 331{
341 switch(dm->type) { 332 switch (dm->type) {
342 case 0: /* BIOS Information */ 333 case 0: /* BIOS Information */
343 dmi_save_ident(dm, DMI_BIOS_VENDOR, 4); 334 dmi_save_ident(dm, DMI_BIOS_VENDOR, 4);
344 dmi_save_ident(dm, DMI_BIOS_VERSION, 5); 335 dmi_save_ident(dm, DMI_BIOS_VERSION, 5);
@@ -502,13 +493,7 @@ void __init dmi_scan_machine(void)
502 dmi_available = 1; 493 dmi_available = 1;
503 goto out; 494 goto out;
504 } 495 }
505 } 496 } else {
506 else {
507 /*
508 * no iounmap() for that ioremap(); it would be a no-op, but
509 * it's so early in setup that sucker gets confused into doing
510 * what it shouldn't if we actually call it.
511 */
512 p = dmi_ioremap(0xF0000, 0x10000); 497 p = dmi_ioremap(0xF0000, 0x10000);
513 if (p == NULL) 498 if (p == NULL)
514 goto error; 499 goto error;
@@ -533,7 +518,7 @@ void __init dmi_scan_machine(void)
533 dmi_iounmap(p, 0x10000); 518 dmi_iounmap(p, 0x10000);
534 } 519 }
535 error: 520 error:
536 printk(KERN_INFO "DMI not present or invalid.\n"); 521 pr_info("DMI not present or invalid.\n");
537 out: 522 out:
538 dmi_initialized = 1; 523 dmi_initialized = 1;
539} 524}
@@ -669,7 +654,7 @@ int dmi_name_in_serial(const char *str)
669 654
670/** 655/**
671 * dmi_name_in_vendors - Check if string is in the DMI system or board vendor name 656 * dmi_name_in_vendors - Check if string is in the DMI system or board vendor name
672 * @str: Case sensitive Name 657 * @str: Case sensitive Name
673 */ 658 */
674int dmi_name_in_vendors(const char *str) 659int dmi_name_in_vendors(const char *str)
675{ 660{
@@ -696,13 +681,13 @@ EXPORT_SYMBOL(dmi_name_in_vendors);
696 * A new search is initiated by passing %NULL as the @from argument. 681 * A new search is initiated by passing %NULL as the @from argument.
697 * If @from is not %NULL, searches continue from next device. 682 * If @from is not %NULL, searches continue from next device.
698 */ 683 */
699const struct dmi_device * dmi_find_device(int type, const char *name, 684const struct dmi_device *dmi_find_device(int type, const char *name,
700 const struct dmi_device *from) 685 const struct dmi_device *from)
701{ 686{
702 const struct list_head *head = from ? &from->list : &dmi_devices; 687 const struct list_head *head = from ? &from->list : &dmi_devices;
703 struct list_head *d; 688 struct list_head *d;
704 689
705 for(d = head->next; d != &dmi_devices; d = d->next) { 690 for (d = head->next; d != &dmi_devices; d = d->next) {
706 const struct dmi_device *dev = 691 const struct dmi_device *dev =
707 list_entry(d, struct dmi_device, list); 692 list_entry(d, struct dmi_device, list);
708 693
diff --git a/drivers/firmware/google/gsmi.c b/drivers/firmware/google/gsmi.c
index acba0b9f4406..6eb535ffeddc 100644
--- a/drivers/firmware/google/gsmi.c
+++ b/drivers/firmware/google/gsmi.c
@@ -525,7 +525,7 @@ static ssize_t gsmi_clear_eventlog_store(struct kobject *kobj,
525 u32 data_type; 525 u32 data_type;
526 } param; 526 } param;
527 527
528 rc = strict_strtoul(buf, 0, &val); 528 rc = kstrtoul(buf, 0, &val);
529 if (rc) 529 if (rc)
530 return rc; 530 return rc;
531 531
diff --git a/drivers/iommu/msm_iommu_dev.c b/drivers/iommu/msm_iommu_dev.c
index 0a1c9626aa9e..08ba4972da9d 100644
--- a/drivers/iommu/msm_iommu_dev.c
+++ b/drivers/iommu/msm_iommu_dev.c
@@ -282,7 +282,6 @@ static int msm_iommu_remove(struct platform_device *pdev)
282 clk_put(drv->pclk); 282 clk_put(drv->pclk);
283 memset(drv, 0, sizeof(*drv)); 283 memset(drv, 0, sizeof(*drv));
284 kfree(drv); 284 kfree(drv);
285 platform_set_drvdata(pdev, NULL);
286 } 285 }
287 return 0; 286 return 0;
288} 287}
@@ -366,7 +365,6 @@ static int msm_iommu_ctx_remove(struct platform_device *pdev)
366 if (drv) { 365 if (drv) {
367 memset(drv, 0, sizeof(struct msm_iommu_ctx_drvdata)); 366 memset(drv, 0, sizeof(struct msm_iommu_ctx_drvdata));
368 kfree(drv); 367 kfree(drv);
369 platform_set_drvdata(pdev, NULL);
370 } 368 }
371 return 0; 369 return 0;
372} 370}
diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c
index 0ba3766240d5..bcd78a720630 100644
--- a/drivers/iommu/omap-iommu.c
+++ b/drivers/iommu/omap-iommu.c
@@ -1008,8 +1008,6 @@ static int omap_iommu_remove(struct platform_device *pdev)
1008 struct resource *res; 1008 struct resource *res;
1009 struct omap_iommu *obj = platform_get_drvdata(pdev); 1009 struct omap_iommu *obj = platform_get_drvdata(pdev);
1010 1010
1011 platform_set_drvdata(pdev, NULL);
1012
1013 iopgtable_clear_entry_all(obj); 1011 iopgtable_clear_entry_all(obj);
1014 1012
1015 irq = platform_get_irq(pdev, 0); 1013 irq = platform_get_irq(pdev, 0);
diff --git a/drivers/memstick/core/Kconfig b/drivers/memstick/core/Kconfig
index 95f1814b5368..1d389491d5fd 100644
--- a/drivers/memstick/core/Kconfig
+++ b/drivers/memstick/core/Kconfig
@@ -24,3 +24,15 @@ config MSPRO_BLOCK
24 support. This provides a block device driver, which you can use 24 support. This provides a block device driver, which you can use
25 to mount the filesystem. Almost everyone wishing MemoryStick 25 to mount the filesystem. Almost everyone wishing MemoryStick
26 support should say Y or M here. 26 support should say Y or M here.
27
28config MS_BLOCK
29 tristate "MemoryStick Standard device driver"
30 depends on BLOCK
31 help
32 Say Y here to enable the MemoryStick Standard device driver
33 support. This provides a block device driver, which you can use
34 to mount the filesystem.
35 This driver works with old (bulky) MemoryStick and MemoryStick Duo
36 but not PRO. Say Y if you have such card.
37 Driver is new and not yet well tested, thus it can damage your card
38 (even permanently)
diff --git a/drivers/memstick/core/Makefile b/drivers/memstick/core/Makefile
index ecd029937738..0d7f90c0ff25 100644
--- a/drivers/memstick/core/Makefile
+++ b/drivers/memstick/core/Makefile
@@ -3,5 +3,5 @@
3# 3#
4 4
5obj-$(CONFIG_MEMSTICK) += memstick.o 5obj-$(CONFIG_MEMSTICK) += memstick.o
6 6obj-$(CONFIG_MS_BLOCK) += ms_block.o
7obj-$(CONFIG_MSPRO_BLOCK) += mspro_block.o 7obj-$(CONFIG_MSPRO_BLOCK) += mspro_block.o
diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c
new file mode 100644
index 000000000000..08e70232062f
--- /dev/null
+++ b/drivers/memstick/core/ms_block.c
@@ -0,0 +1,2385 @@
1/*
2 * ms_block.c - Sony MemoryStick (legacy) storage support
 3 *
4 * Copyright (C) 2013 Maxim Levitsky <maximlevitsky@gmail.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * Minor portions of the driver were copied from mspro_block.c which is
11 * Copyright (C) 2007 Alex Dubov <oakad@yahoo.com>
12 *
13 */
14#define DRIVER_NAME "ms_block"
15#define pr_fmt(fmt) DRIVER_NAME ": " fmt
16
17#include <linux/module.h>
18#include <linux/blkdev.h>
19#include <linux/memstick.h>
20#include <linux/idr.h>
21#include <linux/hdreg.h>
22#include <linux/delay.h>
23#include <linux/slab.h>
24#include <linux/random.h>
25#include <linux/bitmap.h>
26#include <linux/scatterlist.h>
27#include <linux/jiffies.h>
28#include <linux/workqueue.h>
29#include <linux/mutex.h>
30#include "ms_block.h"
31
32static int debug;
33static int cache_flush_timeout = 1000;
34static bool verify_writes;
35
36/*
 37 * Copies a section of 'sg_from', starting at offset 'offset' and of length
 38 * 'len', to another scatterlist of 'to_nents' entries
39 */
40static size_t msb_sg_copy(struct scatterlist *sg_from,
41 struct scatterlist *sg_to, int to_nents, size_t offset, size_t len)
42{
43 size_t copied = 0;
44
45 while (offset > 0) {
46 if (offset >= sg_from->length) {
47 if (sg_is_last(sg_from))
48 return 0;
49
50 offset -= sg_from->length;
51 sg_from = sg_next(sg_from);
52 continue;
53 }
54
55 copied = min(len, sg_from->length - offset);
56 sg_set_page(sg_to, sg_page(sg_from),
57 copied, sg_from->offset + offset);
58
59 len -= copied;
60 offset = 0;
61
62 if (sg_is_last(sg_from) || !len)
63 goto out;
64
65 sg_to = sg_next(sg_to);
66 to_nents--;
67 sg_from = sg_next(sg_from);
68 }
69
70 while (len > sg_from->length && to_nents--) {
71 len -= sg_from->length;
72 copied += sg_from->length;
73
74 sg_set_page(sg_to, sg_page(sg_from),
75 sg_from->length, sg_from->offset);
76
77 if (sg_is_last(sg_from) || !len)
78 goto out;
79
80 sg_from = sg_next(sg_from);
81 sg_to = sg_next(sg_to);
82 }
83
84 if (len && to_nents) {
85 sg_set_page(sg_to, sg_page(sg_from), len, sg_from->offset);
86 copied += len;
87 }
88out:
89 sg_mark_end(sg_to);
90 return copied;
91}
92
93/*
 94 * Compares a section of 'sg', starting at offset 'offset' and of length 'len',
 95 * to a linear buffer of length 'len' at address 'buffer'.
 96 * Returns 0 if equal and -1 otherwise
97 */
98static int msb_sg_compare_to_buffer(struct scatterlist *sg,
99 size_t offset, u8 *buffer, size_t len)
100{
101 int retval = 0, cmplen;
102 struct sg_mapping_iter miter;
103
104 sg_miter_start(&miter, sg, sg_nents(sg),
105 SG_MITER_ATOMIC | SG_MITER_FROM_SG);
106
107 while (sg_miter_next(&miter) && len > 0) {
108 if (offset >= miter.length) {
109 offset -= miter.length;
110 continue;
111 }
112
113 cmplen = min(miter.length - offset, len);
114 retval = memcmp(miter.addr + offset, buffer, cmplen) ? -1 : 0;
115 if (retval)
116 break;
117
118 buffer += cmplen;
119 len -= cmplen;
120 offset = 0;
121 }
122
123 if (!retval && len)
124 retval = -1;
125
126 sg_miter_stop(&miter);
127 return retval;
128}
129
130
131/* Get the zone in which the block with logical address 'lba' lives.
132 * Flash is broken into zones.
133 * Each zone consists of 512 eraseblocks; the first zone exposes
134 * 494 logical blocks and every following zone exposes 496.
135 * Therefore zone #0 hosts blocks 0-493, zone #1 blocks 494-989, etc...
136 */
137static int msb_get_zone_from_lba(int lba)
138{
139 if (lba < 494)
140 return 0;
141 return ((lba - 494) / 496) + 1;
142}
143
144/* Get zone of physical block. Trivial */
145static int msb_get_zone_from_pba(int pba)
146{
147 return pba / MS_BLOCKS_IN_ZONE;
148}
149
150/* Debug test to validate free block counts */
151static int msb_validate_used_block_bitmap(struct msb_data *msb)
152{
153 int total_free_blocks = 0;
154 int i;
155
156 if (!debug)
157 return 0;
158
159 for (i = 0; i < msb->zone_count; i++)
160 total_free_blocks += msb->free_block_count[i];
161
162 if (msb->block_count - bitmap_weight(msb->used_blocks_bitmap,
163 msb->block_count) == total_free_blocks)
164 return 0;
165
166 pr_err("BUG: free block counts don't match the bitmap");
167 msb->read_only = true;
168 return -EINVAL;
169}
170
171/* Mark physical block as used */
172static void msb_mark_block_used(struct msb_data *msb, int pba)
173{
174 int zone = msb_get_zone_from_pba(pba);
175
176 if (test_bit(pba, msb->used_blocks_bitmap)) {
177 pr_err(
178 "BUG: attempt to mark already used pba %d as used", pba);
179 msb->read_only = true;
180 return;
181 }
182
183 if (msb_validate_used_block_bitmap(msb))
184 return;
185
186 /* No races because all IO is single threaded */
187 __set_bit(pba, msb->used_blocks_bitmap);
188 msb->free_block_count[zone]--;
189}
190
191/* Mark physical block as free */
192static void msb_mark_block_unused(struct msb_data *msb, int pba)
193{
194 int zone = msb_get_zone_from_pba(pba);
195
196 if (!test_bit(pba, msb->used_blocks_bitmap)) {
197 pr_err("BUG: attempt to mark already unused pba %d as unused" , pba);
198 msb->read_only = true;
199 return;
200 }
201
202 if (msb_validate_used_block_bitmap(msb))
203 return;
204
205 /* No races because all IO is single threaded */
206 __clear_bit(pba, msb->used_blocks_bitmap);
207 msb->free_block_count[zone]++;
208}
209
210/* Invalidate current register window */
211static void msb_invalidate_reg_window(struct msb_data *msb)
212{
213 msb->reg_addr.w_offset = offsetof(struct ms_register, id);
214 msb->reg_addr.w_length = sizeof(struct ms_id_register);
215 msb->reg_addr.r_offset = offsetof(struct ms_register, id);
216 msb->reg_addr.r_length = sizeof(struct ms_id_register);
217 msb->addr_valid = false;
218}
219
220/* Start a state machine */
221static int msb_run_state_machine(struct msb_data *msb, int (*state_func)
222 (struct memstick_dev *card, struct memstick_request **req))
223{
224 struct memstick_dev *card = msb->card;
225
226 WARN_ON(msb->state != -1);
227 msb->int_polling = false;
228 msb->state = 0;
229 msb->exit_error = 0;
230
231 memset(&card->current_mrq, 0, sizeof(card->current_mrq));
232
233 card->next_request = state_func;
234 memstick_new_req(card->host);
235 wait_for_completion(&card->mrq_complete);
236
237 WARN_ON(msb->state != -1);
238 return msb->exit_error;
239}
240
241/* State machines call this to exit */
242static int msb_exit_state_machine(struct msb_data *msb, int error)
243{
244 WARN_ON(msb->state == -1);
245
246 msb->state = -1;
247 msb->exit_error = error;
248 msb->card->next_request = h_msb_default_bad;
249
250 /* Invalidate reg window on errors */
251 if (error)
252 msb_invalidate_reg_window(msb);
253
254 complete(&msb->card->mrq_complete);
255 return -ENXIO;
256}
257
258/* read INT register */
259static int msb_read_int_reg(struct msb_data *msb, long timeout)
260{
261 struct memstick_request *mrq = &msb->card->current_mrq;
262
263 WARN_ON(msb->state == -1);
264
265 if (!msb->int_polling) {
266 msb->int_timeout = jiffies +
267 msecs_to_jiffies(timeout == -1 ? 500 : timeout);
268 msb->int_polling = true;
269 } else if (time_after(jiffies, msb->int_timeout)) {
270 mrq->data[0] = MEMSTICK_INT_CMDNAK;
271 return 0;
272 }
273
274 if ((msb->caps & MEMSTICK_CAP_AUTO_GET_INT) &&
275 mrq->need_card_int && !mrq->error) {
276 mrq->data[0] = mrq->int_reg;
277 mrq->need_card_int = false;
278 return 0;
279 } else {
280 memstick_init_req(mrq, MS_TPC_GET_INT, NULL, 1);
281 return 1;
282 }
283}
284
285/* Read a register */
286static int msb_read_regs(struct msb_data *msb, int offset, int len)
287{
288 struct memstick_request *req = &msb->card->current_mrq;
289
290 if (msb->reg_addr.r_offset != offset ||
291 msb->reg_addr.r_length != len || !msb->addr_valid) {
292
293 msb->reg_addr.r_offset = offset;
294 msb->reg_addr.r_length = len;
295 msb->addr_valid = true;
296
297 memstick_init_req(req, MS_TPC_SET_RW_REG_ADRS,
298 &msb->reg_addr, sizeof(msb->reg_addr));
299 return 0;
300 }
301
302 memstick_init_req(req, MS_TPC_READ_REG, NULL, len);
303 return 1;
304}
305
306/* Write a card register */
307static int msb_write_regs(struct msb_data *msb, int offset, int len, void *buf)
308{
309 struct memstick_request *req = &msb->card->current_mrq;
310
311 if (msb->reg_addr.w_offset != offset ||
312 msb->reg_addr.w_length != len || !msb->addr_valid) {
313
314 msb->reg_addr.w_offset = offset;
315 msb->reg_addr.w_length = len;
316 msb->addr_valid = true;
317
318 memstick_init_req(req, MS_TPC_SET_RW_REG_ADRS,
319 &msb->reg_addr, sizeof(msb->reg_addr));
320 return 0;
321 }
322
323 memstick_init_req(req, MS_TPC_WRITE_REG, buf, len);
324 return 1;
325}
326
327/* Handler for absence of IO */
328static int h_msb_default_bad(struct memstick_dev *card,
329 struct memstick_request **mrq)
330{
331 return -ENXIO;
332}
333
334/*
335 * This function is a handler for reads of one page from the device.
336 * Writes output to msb->current_sg, takes sector address from msb->regs.param.
337 * Can also be used to read extra data only. Set params accordingly.
338 */
339static int h_msb_read_page(struct memstick_dev *card,
340 struct memstick_request **out_mrq)
341{
342 struct msb_data *msb = memstick_get_drvdata(card);
343 struct memstick_request *mrq = *out_mrq = &card->current_mrq;
344 struct scatterlist sg[2];
345 u8 command, intreg;
346
347 if (mrq->error) {
348 dbg("read_page, unknown error");
349 return msb_exit_state_machine(msb, mrq->error);
350 }
351again:
352 switch (msb->state) {
353 case MSB_RP_SEND_BLOCK_ADDRESS:
354 /* msb_write_regs sometimes "fails" because it needs to update
355		   the reg window, and thus it returns a request for that.
356 Then we stay in this state and retry */
357 if (!msb_write_regs(msb,
358 offsetof(struct ms_register, param),
359 sizeof(struct ms_param_register),
360 (unsigned char *)&msb->regs.param))
361 return 0;
362
363 msb->state = MSB_RP_SEND_READ_COMMAND;
364 return 0;
365
366 case MSB_RP_SEND_READ_COMMAND:
367 command = MS_CMD_BLOCK_READ;
368 memstick_init_req(mrq, MS_TPC_SET_CMD, &command, 1);
369 msb->state = MSB_RP_SEND_INT_REQ;
370 return 0;
371
372 case MSB_RP_SEND_INT_REQ:
373 msb->state = MSB_RP_RECEIVE_INT_REQ_RESULT;
374		/* If we don't actually need to send the INT read request (only in
375		   serial mode), then just fall through */
376 if (msb_read_int_reg(msb, -1))
377 return 0;
378 /* fallthrough */
379
380 case MSB_RP_RECEIVE_INT_REQ_RESULT:
381 intreg = mrq->data[0];
382 msb->regs.status.interrupt = intreg;
383
384 if (intreg & MEMSTICK_INT_CMDNAK)
385 return msb_exit_state_machine(msb, -EIO);
386
387 if (!(intreg & MEMSTICK_INT_CED)) {
388 msb->state = MSB_RP_SEND_INT_REQ;
389 goto again;
390 }
391
392 msb->int_polling = false;
393 msb->state = (intreg & MEMSTICK_INT_ERR) ?
394 MSB_RP_SEND_READ_STATUS_REG : MSB_RP_SEND_OOB_READ;
395 goto again;
396
397 case MSB_RP_SEND_READ_STATUS_REG:
398 /* read the status register to understand source of the INT_ERR */
399 if (!msb_read_regs(msb,
400 offsetof(struct ms_register, status),
401 sizeof(struct ms_status_register)))
402 return 0;
403
404 msb->state = MSB_RP_RECEIVE_OOB_READ;
405 return 0;
406
407 case MSB_RP_RECIVE_STATUS_REG:
408 msb->regs.status = *(struct ms_status_register *)mrq->data;
409 msb->state = MSB_RP_SEND_OOB_READ;
410 /* fallthrough */
411
412 case MSB_RP_SEND_OOB_READ:
413 if (!msb_read_regs(msb,
414 offsetof(struct ms_register, extra_data),
415 sizeof(struct ms_extra_data_register)))
416 return 0;
417
418 msb->state = MSB_RP_RECEIVE_OOB_READ;
419 return 0;
420
421 case MSB_RP_RECEIVE_OOB_READ:
422 msb->regs.extra_data =
423 *(struct ms_extra_data_register *) mrq->data;
424 msb->state = MSB_RP_SEND_READ_DATA;
425 /* fallthrough */
426
427 case MSB_RP_SEND_READ_DATA:
428 /* Skip that state if we only read the oob */
429 if (msb->regs.param.cp == MEMSTICK_CP_EXTRA) {
430 msb->state = MSB_RP_RECEIVE_READ_DATA;
431 goto again;
432 }
433
434 sg_init_table(sg, ARRAY_SIZE(sg));
435 msb_sg_copy(msb->current_sg, sg, ARRAY_SIZE(sg),
436 msb->current_sg_offset,
437 msb->page_size);
438
439 memstick_init_req_sg(mrq, MS_TPC_READ_LONG_DATA, sg);
440 msb->state = MSB_RP_RECEIVE_READ_DATA;
441 return 0;
442
443 case MSB_RP_RECEIVE_READ_DATA:
444 if (!(msb->regs.status.interrupt & MEMSTICK_INT_ERR)) {
445 msb->current_sg_offset += msb->page_size;
446 return msb_exit_state_machine(msb, 0);
447 }
448
449 if (msb->regs.status.status1 & MEMSTICK_UNCORR_ERROR) {
450 dbg("read_page: uncorrectable error");
451 return msb_exit_state_machine(msb, -EBADMSG);
452 }
453
454 if (msb->regs.status.status1 & MEMSTICK_CORR_ERROR) {
455 dbg("read_page: correctable error");
456 msb->current_sg_offset += msb->page_size;
457 return msb_exit_state_machine(msb, -EUCLEAN);
458 } else {
459 dbg("read_page: INT error, but no status error bits");
460 return msb_exit_state_machine(msb, -EIO);
461 }
462 }
463
464 BUG();
465}
466
467/*
468 * Handler of writes of exactly one block.
469 * Takes address from msb->regs.param.
470 * Writes same extra data to blocks, also taken
471 * from msb->regs.extra
472 * Returns -EBADMSG if write fails due to uncorrectable error, or -EIO if
473 * device refuses to take the command or something else
474 */
475static int h_msb_write_block(struct memstick_dev *card,
476 struct memstick_request **out_mrq)
477{
478 struct msb_data *msb = memstick_get_drvdata(card);
479 struct memstick_request *mrq = *out_mrq = &card->current_mrq;
480 struct scatterlist sg[2];
481 u8 intreg, command;
482
483 if (mrq->error)
484 return msb_exit_state_machine(msb, mrq->error);
485
486again:
487 switch (msb->state) {
488
489	/* HACK: JMicron handling of TPCs between 8 and
490 * sizeof(memstick_request.data) is broken due to hardware
491 * bug in PIO mode that is used for these TPCs
492 * Therefore split the write
493 */
494
495 case MSB_WB_SEND_WRITE_PARAMS:
496 if (!msb_write_regs(msb,
497 offsetof(struct ms_register, param),
498 sizeof(struct ms_param_register),
499 &msb->regs.param))
500 return 0;
501
502 msb->state = MSB_WB_SEND_WRITE_OOB;
503 return 0;
504
505 case MSB_WB_SEND_WRITE_OOB:
506 if (!msb_write_regs(msb,
507 offsetof(struct ms_register, extra_data),
508 sizeof(struct ms_extra_data_register),
509 &msb->regs.extra_data))
510 return 0;
511 msb->state = MSB_WB_SEND_WRITE_COMMAND;
512 return 0;
513
514
515 case MSB_WB_SEND_WRITE_COMMAND:
516 command = MS_CMD_BLOCK_WRITE;
517 memstick_init_req(mrq, MS_TPC_SET_CMD, &command, 1);
518 msb->state = MSB_WB_SEND_INT_REQ;
519 return 0;
520
521 case MSB_WB_SEND_INT_REQ:
522 msb->state = MSB_WB_RECEIVE_INT_REQ;
523 if (msb_read_int_reg(msb, -1))
524 return 0;
525 /* fallthrough */
526
527 case MSB_WB_RECEIVE_INT_REQ:
528 intreg = mrq->data[0];
529 msb->regs.status.interrupt = intreg;
530
531 /* errors mean out of here, and fast... */
532 if (intreg & (MEMSTICK_INT_CMDNAK))
533 return msb_exit_state_machine(msb, -EIO);
534
535 if (intreg & MEMSTICK_INT_ERR)
536 return msb_exit_state_machine(msb, -EBADMSG);
537
538
539 /* for last page we need to poll CED */
540 if (msb->current_page == msb->pages_in_block) {
541 if (intreg & MEMSTICK_INT_CED)
542 return msb_exit_state_machine(msb, 0);
543 msb->state = MSB_WB_SEND_INT_REQ;
544 goto again;
545
546 }
547
548 /* for non-last page we need BREQ before writing next chunk */
549 if (!(intreg & MEMSTICK_INT_BREQ)) {
550 msb->state = MSB_WB_SEND_INT_REQ;
551 goto again;
552 }
553
554 msb->int_polling = false;
555 msb->state = MSB_WB_SEND_WRITE_DATA;
556 /* fallthrough */
557
558 case MSB_WB_SEND_WRITE_DATA:
559 sg_init_table(sg, ARRAY_SIZE(sg));
560
561 if (msb_sg_copy(msb->current_sg, sg, ARRAY_SIZE(sg),
562 msb->current_sg_offset,
563 msb->page_size) < msb->page_size)
564 return msb_exit_state_machine(msb, -EIO);
565
566 memstick_init_req_sg(mrq, MS_TPC_WRITE_LONG_DATA, sg);
567 mrq->need_card_int = 1;
568 msb->state = MSB_WB_RECEIVE_WRITE_CONFIRMATION;
569 return 0;
570
571 case MSB_WB_RECEIVE_WRITE_CONFIRMATION:
572 msb->current_page++;
573 msb->current_sg_offset += msb->page_size;
574 msb->state = MSB_WB_SEND_INT_REQ;
575 goto again;
576 default:
577 BUG();
578 }
579
580 return 0;
581}
582
583/*
584 * This function is used to send simple IO requests to the device that
585 * consist of a register write + command
586 */
587static int h_msb_send_command(struct memstick_dev *card,
588 struct memstick_request **out_mrq)
589{
590 struct msb_data *msb = memstick_get_drvdata(card);
591 struct memstick_request *mrq = *out_mrq = &card->current_mrq;
592 u8 intreg;
593
594 if (mrq->error) {
595 dbg("send_command: unknown error");
596 return msb_exit_state_machine(msb, mrq->error);
597 }
598again:
599 switch (msb->state) {
600
601 /* HACK: see h_msb_write_block */
602 case MSB_SC_SEND_WRITE_PARAMS: /* write param register*/
603 if (!msb_write_regs(msb,
604 offsetof(struct ms_register, param),
605 sizeof(struct ms_param_register),
606 &msb->regs.param))
607 return 0;
608 msb->state = MSB_SC_SEND_WRITE_OOB;
609 return 0;
610
611 case MSB_SC_SEND_WRITE_OOB:
612 if (!msb->command_need_oob) {
613 msb->state = MSB_SC_SEND_COMMAND;
614 goto again;
615 }
616
617 if (!msb_write_regs(msb,
618 offsetof(struct ms_register, extra_data),
619 sizeof(struct ms_extra_data_register),
620 &msb->regs.extra_data))
621 return 0;
622
623 msb->state = MSB_SC_SEND_COMMAND;
624 return 0;
625
626 case MSB_SC_SEND_COMMAND:
627 memstick_init_req(mrq, MS_TPC_SET_CMD, &msb->command_value, 1);
628 msb->state = MSB_SC_SEND_INT_REQ;
629 return 0;
630
631 case MSB_SC_SEND_INT_REQ:
632 msb->state = MSB_SC_RECEIVE_INT_REQ;
633 if (msb_read_int_reg(msb, -1))
634 return 0;
635 /* fallthrough */
636
637 case MSB_SC_RECEIVE_INT_REQ:
638 intreg = mrq->data[0];
639
640 if (intreg & MEMSTICK_INT_CMDNAK)
641 return msb_exit_state_machine(msb, -EIO);
642 if (intreg & MEMSTICK_INT_ERR)
643 return msb_exit_state_machine(msb, -EBADMSG);
644
645 if (!(intreg & MEMSTICK_INT_CED)) {
646 msb->state = MSB_SC_SEND_INT_REQ;
647 goto again;
648 }
649
650 return msb_exit_state_machine(msb, 0);
651 }
652
653 BUG();
654}
655
656/* Small handler for card reset */
657static int h_msb_reset(struct memstick_dev *card,
658 struct memstick_request **out_mrq)
659{
660 u8 command = MS_CMD_RESET;
661 struct msb_data *msb = memstick_get_drvdata(card);
662 struct memstick_request *mrq = *out_mrq = &card->current_mrq;
663
664 if (mrq->error)
665 return msb_exit_state_machine(msb, mrq->error);
666
667 switch (msb->state) {
668 case MSB_RS_SEND:
669 memstick_init_req(mrq, MS_TPC_SET_CMD, &command, 1);
670 mrq->need_card_int = 0;
671 msb->state = MSB_RS_CONFIRM;
672 return 0;
673 case MSB_RS_CONFIRM:
674 return msb_exit_state_machine(msb, 0);
675 }
676 BUG();
677}
678
679/* This handler is used to do serial->parallel switch */
680static int h_msb_parallel_switch(struct memstick_dev *card,
681 struct memstick_request **out_mrq)
682{
683 struct msb_data *msb = memstick_get_drvdata(card);
684 struct memstick_request *mrq = *out_mrq = &card->current_mrq;
685 struct memstick_host *host = card->host;
686
687 if (mrq->error) {
688 dbg("parallel_switch: error");
689 msb->regs.param.system &= ~MEMSTICK_SYS_PAM;
690 return msb_exit_state_machine(msb, mrq->error);
691 }
692
693 switch (msb->state) {
694 case MSB_PS_SEND_SWITCH_COMMAND:
695 /* Set the parallel interface on memstick side */
696 msb->regs.param.system |= MEMSTICK_SYS_PAM;
697
698 if (!msb_write_regs(msb,
699 offsetof(struct ms_register, param),
700 1,
701 (unsigned char *)&msb->regs.param))
702 return 0;
703
704 msb->state = MSB_PS_SWICH_HOST;
705 return 0;
706
707 case MSB_PS_SWICH_HOST:
708 /* Set parallel interface on our side + send a dummy request
709 to see if card responds */
710 host->set_param(host, MEMSTICK_INTERFACE, MEMSTICK_PAR4);
711 memstick_init_req(mrq, MS_TPC_GET_INT, NULL, 1);
712 msb->state = MSB_PS_CONFIRM;
713 return 0;
714
715 case MSB_PS_CONFIRM:
716 return msb_exit_state_machine(msb, 0);
717 }
718
719 BUG();
720}
721
722static int msb_switch_to_parallel(struct msb_data *msb);
723
724/* Reset the card, to guard against hw errors being treated as bad blocks */
725static int msb_reset(struct msb_data *msb, bool full)
726{
727
728 bool was_parallel = msb->regs.param.system & MEMSTICK_SYS_PAM;
729 struct memstick_dev *card = msb->card;
730 struct memstick_host *host = card->host;
731 int error;
732
733 /* Reset the card */
734 msb->regs.param.system = MEMSTICK_SYS_BAMD;
735
736 if (full) {
737 error = host->set_param(host,
738 MEMSTICK_POWER, MEMSTICK_POWER_OFF);
739 if (error)
740 goto out_error;
741
742 msb_invalidate_reg_window(msb);
743
744 error = host->set_param(host,
745 MEMSTICK_POWER, MEMSTICK_POWER_ON);
746 if (error)
747 goto out_error;
748
749 error = host->set_param(host,
750 MEMSTICK_INTERFACE, MEMSTICK_SERIAL);
751 if (error) {
752out_error:
753 dbg("Failed to reset the host controller");
754 msb->read_only = true;
755 return -EFAULT;
756 }
757 }
758
759 error = msb_run_state_machine(msb, h_msb_reset);
760 if (error) {
761 dbg("Failed to reset the card");
762 msb->read_only = true;
763 return -ENODEV;
764 }
765
766 /* Set parallel mode */
767 if (was_parallel)
768 msb_switch_to_parallel(msb);
769 return 0;
770}
771
772/* Attempts to switch interface to parallel mode */
773static int msb_switch_to_parallel(struct msb_data *msb)
774{
775 int error;
776
777 error = msb_run_state_machine(msb, h_msb_parallel_switch);
778 if (error) {
779 pr_err("Switch to parallel failed");
780 msb->regs.param.system &= ~MEMSTICK_SYS_PAM;
781 msb_reset(msb, true);
782 return -EFAULT;
783 }
784
785 msb->caps |= MEMSTICK_CAP_AUTO_GET_INT;
786 return 0;
787}
788
789/* Changes overwrite flag on a page */
790static int msb_set_overwrite_flag(struct msb_data *msb,
791 u16 pba, u8 page, u8 flag)
792{
793 if (msb->read_only)
794 return -EROFS;
795
796 msb->regs.param.block_address = cpu_to_be16(pba);
797 msb->regs.param.page_address = page;
798 msb->regs.param.cp = MEMSTICK_CP_OVERWRITE;
799 msb->regs.extra_data.overwrite_flag = flag;
800 msb->command_value = MS_CMD_BLOCK_WRITE;
801 msb->command_need_oob = true;
802
803 dbg_verbose("changing overwrite flag to %02x for sector %d, page %d",
804 flag, pba, page);
805 return msb_run_state_machine(msb, h_msb_send_command);
806}
807
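/* Marks a whole physical block as bad by clearing the block-status (BKST)
 * bit in the overwrite flag of its first page */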
808static int msb_mark_bad(struct msb_data *msb, int pba)
809{
810 pr_notice("marking pba %d as bad", pba);
811 msb_reset(msb, true);
812 return msb_set_overwrite_flag(
813 msb, pba, 0, 0xFF & ~MEMSTICK_OVERWRITE_BKST);
814}
815
816static int msb_mark_page_bad(struct msb_data *msb, int pba, int page)
817{
818 dbg("marking page %d of pba %d as bad", page, pba);
819 msb_reset(msb, true);
820 return msb_set_overwrite_flag(msb,
821 pba, page, ~MEMSTICK_OVERWRITE_PGST0);
822}
823
824/* Erases one physical block */
825static int msb_erase_block(struct msb_data *msb, u16 pba)
826{
827 int error, try;
828 if (msb->read_only)
829 return -EROFS;
830
831 dbg_verbose("erasing pba %d", pba);
832
833 for (try = 1; try < 3; try++) {
834 msb->regs.param.block_address = cpu_to_be16(pba);
835 msb->regs.param.page_address = 0;
836 msb->regs.param.cp = MEMSTICK_CP_BLOCK;
837 msb->command_value = MS_CMD_BLOCK_ERASE;
838 msb->command_need_oob = false;
839
840
841 error = msb_run_state_machine(msb, h_msb_send_command);
842 if (!error || msb_reset(msb, true))
843 break;
844 }
845
846 if (error) {
847 pr_err("erase failed, marking pba %d as bad", pba);
848 msb_mark_bad(msb, pba);
849 }
850
851 dbg_verbose("erase success, marking pba %d as unused", pba);
852 msb_mark_block_unused(msb, pba);
853 __set_bit(pba, msb->erased_blocks_bitmap);
854 return error;
855}
856
857/* Reads one page from device */
858static int msb_read_page(struct msb_data *msb,
859 u16 pba, u8 page, struct ms_extra_data_register *extra,
860 struct scatterlist *sg, int offset)
861{
862 int try, error;
863
864 if (pba == MS_BLOCK_INVALID) {
865 unsigned long flags;
866 struct sg_mapping_iter miter;
867 size_t len = msb->page_size;
868
869 dbg_verbose("read unmapped sector. returning 0xFF");
870
871 local_irq_save(flags);
872 sg_miter_start(&miter, sg, sg_nents(sg),
873 SG_MITER_ATOMIC | SG_MITER_TO_SG);
874
875 while (sg_miter_next(&miter) && len > 0) {
876
877 int chunklen;
878
879 if (offset && offset >= miter.length) {
880 offset -= miter.length;
881 continue;
882 }
883
884 chunklen = min(miter.length - offset, len);
885 memset(miter.addr + offset, 0xFF, chunklen);
886 len -= chunklen;
887 offset = 0;
888 }
889
890 sg_miter_stop(&miter);
891 local_irq_restore(flags);
892
893 if (offset)
894 return -EFAULT;
895
896 if (extra)
897 memset(extra, 0xFF, sizeof(*extra));
898 return 0;
899 }
900
901 if (pba >= msb->block_count) {
902 pr_err("BUG: attempt to read beyond the end of the card at pba %d", pba);
903 return -EINVAL;
904 }
905
906 for (try = 1; try < 3; try++) {
907 msb->regs.param.block_address = cpu_to_be16(pba);
908 msb->regs.param.page_address = page;
909 msb->regs.param.cp = MEMSTICK_CP_PAGE;
910
911 msb->current_sg = sg;
912 msb->current_sg_offset = offset;
913 error = msb_run_state_machine(msb, h_msb_read_page);
914
915
916 if (error == -EUCLEAN) {
917 pr_notice("correctable error on pba %d, page %d",
918 pba, page);
919 error = 0;
920 }
921
922 if (!error && extra)
923 *extra = msb->regs.extra_data;
924
925 if (!error || msb_reset(msb, true))
926 break;
927
928 }
929
930 /* Mark bad pages */
931 if (error == -EBADMSG) {
932 pr_err("uncorrectable error on read of pba %d, page %d",
933 pba, page);
934
935 if (msb->regs.extra_data.overwrite_flag &
936 MEMSTICK_OVERWRITE_PGST0)
937 msb_mark_page_bad(msb, pba, page);
938 return -EBADMSG;
939 }
940
941 if (error)
942 pr_err("read of pba %d, page %d failed with error %d",
943 pba, page, error);
944 return error;
945}
946
947/* Reads oob of page only */
948static int msb_read_oob(struct msb_data *msb, u16 pba, u16 page,
949 struct ms_extra_data_register *extra)
950{
951 int error;
952
953 BUG_ON(!extra);
954 msb->regs.param.block_address = cpu_to_be16(pba);
955 msb->regs.param.page_address = page;
956 msb->regs.param.cp = MEMSTICK_CP_EXTRA;
957
958 if (pba > msb->block_count) {
959 pr_err("BUG: attempt to read beyond the end of card at pba %d", pba);
960 return -EINVAL;
961 }
962
963 error = msb_run_state_machine(msb, h_msb_read_page);
964 *extra = msb->regs.extra_data;
965
966 if (error == -EUCLEAN) {
967 pr_notice("correctable error on pba %d, page %d",
968 pba, page);
969 return 0;
970 }
971
972 return error;
973}
974
975/* Reads a block and compares it with data contained in scatterlist orig_sg */
976static int msb_verify_block(struct msb_data *msb, u16 pba,
977 struct scatterlist *orig_sg, int offset)
978{
979 struct scatterlist sg;
980 int page = 0, error;
981
982 sg_init_one(&sg, msb->block_buffer, msb->block_size);
983
984 while (page < msb->pages_in_block) {
985
986 error = msb_read_page(msb, pba, page,
987 NULL, &sg, page * msb->page_size);
988 if (error)
989 return error;
990 page++;
991 }
992
993 if (msb_sg_compare_to_buffer(orig_sg, offset,
994 msb->block_buffer, msb->block_size))
995 return -EIO;
996 return 0;
997}
998
999/* Writes exactly one block + oob */
1000static int msb_write_block(struct msb_data *msb,
1001 u16 pba, u32 lba, struct scatterlist *sg, int offset)
1002{
1003 int error, current_try = 1;
1004 BUG_ON(sg->length < msb->page_size);
1005
1006 if (msb->read_only)
1007 return -EROFS;
1008
1009 if (pba == MS_BLOCK_INVALID) {
1010 pr_err(
1011 "BUG: write: attempt to write MS_BLOCK_INVALID block");
1012 return -EINVAL;
1013 }
1014
1015 if (pba >= msb->block_count || lba >= msb->logical_block_count) {
1016 pr_err(
1017 "BUG: write: attempt to write beyond the end of device");
1018 return -EINVAL;
1019 }
1020
1021 if (msb_get_zone_from_lba(lba) != msb_get_zone_from_pba(pba)) {
1022 pr_err("BUG: write: lba zone mismatch");
1023 return -EINVAL;
1024 }
1025
1026 if (pba == msb->boot_block_locations[0] ||
1027 pba == msb->boot_block_locations[1]) {
1028 pr_err("BUG: write: attempt to write to boot blocks!");
1029 return -EINVAL;
1030 }
1031
1032 while (1) {
1033
1034 if (msb->read_only)
1035 return -EROFS;
1036
1037 msb->regs.param.cp = MEMSTICK_CP_BLOCK;
1038 msb->regs.param.page_address = 0;
1039 msb->regs.param.block_address = cpu_to_be16(pba);
1040
1041 msb->regs.extra_data.management_flag = 0xFF;
1042 msb->regs.extra_data.overwrite_flag = 0xF8;
1043 msb->regs.extra_data.logical_address = cpu_to_be16(lba);
1044
1045 msb->current_sg = sg;
1046 msb->current_sg_offset = offset;
1047 msb->current_page = 0;
1048
1049 error = msb_run_state_machine(msb, h_msb_write_block);
1050
1051		/* The block we just wrote to is assumed to be erased, since its
1052		   pba was erased. If it wasn't actually erased, the write will
1053		   still succeed but only clear bits that were already set in the
1054		   block, so verify that what we have written matches what we
1055		   expect. Blocks that we erased ourselves are trusted and only
1056		   verified when verify_writes is set */
1057 if (!error && (verify_writes ||
1058 !test_bit(pba, msb->erased_blocks_bitmap)))
1059 error = msb_verify_block(msb, pba, sg, offset);
1060
1061 if (!error)
1062 break;
1063
1064 if (current_try > 1 || msb_reset(msb, true))
1065 break;
1066
1067 pr_err("write failed, trying to erase the pba %d", pba);
1068 error = msb_erase_block(msb, pba);
1069 if (error)
1070 break;
1071
1072 current_try++;
1073 }
1074 return error;
1075}
1076
1077/* Finds a free block for write replacement */
1078static u16 msb_get_free_block(struct msb_data *msb, int zone)
1079{
1080 u16 pos;
1081 int pba = zone * MS_BLOCKS_IN_ZONE;
1082 int i;
1083
1084 get_random_bytes(&pos, sizeof(pos));
1085
1086 if (!msb->free_block_count[zone]) {
1087 pr_err("NO free blocks in the zone %d, to use for a write, (media is WORN out) switching to RO mode", zone);
1088 msb->read_only = true;
1089 return MS_BLOCK_INVALID;
1090 }
1091
1092 pos %= msb->free_block_count[zone];
1093
1094 dbg_verbose("have %d choices for a free block, selected randomally: %d",
1095 msb->free_block_count[zone], pos);
1096
1097 pba = find_next_zero_bit(msb->used_blocks_bitmap,
1098 msb->block_count, pba);
1099 for (i = 0; i < pos; ++i)
1100 pba = find_next_zero_bit(msb->used_blocks_bitmap,
1101 msb->block_count, pba + 1);
1102
1103 dbg_verbose("result of the free blocks scan: pba %d", pba);
1104
1105 if (pba == msb->block_count || (msb_get_zone_from_pba(pba)) != zone) {
1106 pr_err("BUG: cant get a free block");
1107 msb->read_only = true;
1108 return MS_BLOCK_INVALID;
1109 }
1110
1111 msb_mark_block_used(msb, pba);
1112 return pba;
1113}
1114
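/* Rewrites one logical block: the old pba (if any) is flagged as being
 * updated, the data is written to a freshly allocated pba in the same zone,
 * and only then is the old pba erased and the lba_to_pba_table updated */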
1115static int msb_update_block(struct msb_data *msb, u16 lba,
1116 struct scatterlist *sg, int offset)
1117{
1118 u16 pba, new_pba;
1119 int error, try;
1120
1121 pba = msb->lba_to_pba_table[lba];
1122 dbg_verbose("start of a block update at lba %d, pba %d", lba, pba);
1123
1124 if (pba != MS_BLOCK_INVALID) {
1125 dbg_verbose("setting the update flag on the block");
1126 msb_set_overwrite_flag(msb, pba, 0,
1127 0xFF & ~MEMSTICK_OVERWRITE_UDST);
1128 }
1129
1130 for (try = 0; try < 3; try++) {
1131 new_pba = msb_get_free_block(msb,
1132 msb_get_zone_from_lba(lba));
1133
1134 if (new_pba == MS_BLOCK_INVALID) {
1135 error = -EIO;
1136 goto out;
1137 }
1138
1139 dbg_verbose("block update: writing updated block to the pba %d",
1140 new_pba);
1141 error = msb_write_block(msb, new_pba, lba, sg, offset);
1142 if (error == -EBADMSG) {
1143 msb_mark_bad(msb, new_pba);
1144 continue;
1145 }
1146
1147 if (error)
1148 goto out;
1149
1150 dbg_verbose("block update: erasing the old block");
1151 msb_erase_block(msb, pba);
1152 msb->lba_to_pba_table[lba] = new_pba;
1153 return 0;
1154 }
1155out:
1156 if (error) {
1157 pr_err("block update error after %d tries, switching to r/o mode", try);
1158 msb->read_only = true;
1159 }
1160 return error;
1161}
1162
1163/* Converts endianness in the boot block for easy use */
1164static void msb_fix_boot_page_endianness(struct ms_boot_page *p)
1165{
1166 p->header.block_id = be16_to_cpu(p->header.block_id);
1167 p->header.format_reserved = be16_to_cpu(p->header.format_reserved);
1168 p->entry.disabled_block.start_addr
1169 = be32_to_cpu(p->entry.disabled_block.start_addr);
1170 p->entry.disabled_block.data_size
1171 = be32_to_cpu(p->entry.disabled_block.data_size);
1172 p->entry.cis_idi.start_addr
1173 = be32_to_cpu(p->entry.cis_idi.start_addr);
1174 p->entry.cis_idi.data_size
1175 = be32_to_cpu(p->entry.cis_idi.data_size);
1176 p->attr.block_size = be16_to_cpu(p->attr.block_size);
1177 p->attr.number_of_blocks = be16_to_cpu(p->attr.number_of_blocks);
1178 p->attr.number_of_effective_blocks
1179 = be16_to_cpu(p->attr.number_of_effective_blocks);
1180 p->attr.page_size = be16_to_cpu(p->attr.page_size);
1181 p->attr.memory_manufacturer_code
1182 = be16_to_cpu(p->attr.memory_manufacturer_code);
1183 p->attr.memory_device_code = be16_to_cpu(p->attr.memory_device_code);
1184 p->attr.implemented_capacity
1185 = be16_to_cpu(p->attr.implemented_capacity);
1186 p->attr.controller_number = be16_to_cpu(p->attr.controller_number);
1187 p->attr.controller_function = be16_to_cpu(p->attr.controller_function);
1188}
1189
1190static int msb_read_boot_blocks(struct msb_data *msb)
1191{
1192 int pba = 0;
1193 struct scatterlist sg;
1194 struct ms_extra_data_register extra;
1195 struct ms_boot_page *page;
1196
1197 msb->boot_block_locations[0] = MS_BLOCK_INVALID;
1198 msb->boot_block_locations[1] = MS_BLOCK_INVALID;
1199 msb->boot_block_count = 0;
1200
1201 dbg_verbose("Start of a scan for the boot blocks");
1202
1203 if (!msb->boot_page) {
1204 page = kmalloc(sizeof(struct ms_boot_page)*2, GFP_KERNEL);
1205 if (!page)
1206 return -ENOMEM;
1207
1208 msb->boot_page = page;
1209 } else
1210 page = msb->boot_page;
1211
1212 msb->block_count = MS_BLOCK_MAX_BOOT_ADDR;
1213
1214 for (pba = 0; pba < MS_BLOCK_MAX_BOOT_ADDR; pba++) {
1215
1216 sg_init_one(&sg, page, sizeof(*page));
1217 if (msb_read_page(msb, pba, 0, &extra, &sg, 0)) {
1218 dbg("boot scan: can't read pba %d", pba);
1219 continue;
1220 }
1221
1222 if (extra.management_flag & MEMSTICK_MANAGEMENT_SYSFLG) {
1223 dbg("managment flag doesn't indicate boot block %d",
1224 pba);
1225 continue;
1226 }
1227
1228 if (be16_to_cpu(page->header.block_id) != MS_BLOCK_BOOT_ID) {
1229 dbg("the pba at %d doesn' contain boot block ID", pba);
1230 continue;
1231 }
1232
1233 msb_fix_boot_page_endianness(page);
1234 msb->boot_block_locations[msb->boot_block_count] = pba;
1235
1236 page++;
1237 msb->boot_block_count++;
1238
1239 if (msb->boot_block_count == 2)
1240 break;
1241 }
1242
1243 if (!msb->boot_block_count) {
1244 pr_err("media doesn't contain master page, aborting");
1245 return -EIO;
1246 }
1247
1248 dbg_verbose("End of scan for boot blocks");
1249 return 0;
1250}
1251
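/* Reads the factory bad block table referenced by the boot block and marks
 * every listed pba as used so the FTL never allocates it */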
1252static int msb_read_bad_block_table(struct msb_data *msb, int block_nr)
1253{
1254 struct ms_boot_page *boot_block;
1255 struct scatterlist sg;
1256 u16 *buffer = NULL;
1257 int offset = 0;
1258 int i, error = 0;
1259 int data_size, data_offset, page, page_offset, size_to_read;
1260 u16 pba;
1261
1262 BUG_ON(block_nr > 1);
1263 boot_block = &msb->boot_page[block_nr];
1264 pba = msb->boot_block_locations[block_nr];
1265
1266 if (msb->boot_block_locations[block_nr] == MS_BLOCK_INVALID)
1267 return -EINVAL;
1268
1269 data_size = boot_block->entry.disabled_block.data_size;
1270 data_offset = sizeof(struct ms_boot_page) +
1271 boot_block->entry.disabled_block.start_addr;
1272 if (!data_size)
1273 return 0;
1274
1275 page = data_offset / msb->page_size;
1276 page_offset = data_offset % msb->page_size;
1277 size_to_read =
1278 DIV_ROUND_UP(data_size + page_offset, msb->page_size) *
1279 msb->page_size;
1280
1281 dbg("reading bad block of boot block at pba %d, offset %d len %d",
1282 pba, data_offset, data_size);
1283
1284 buffer = kzalloc(size_to_read, GFP_KERNEL);
1285 if (!buffer)
1286 return -ENOMEM;
1287
1288 /* Read the buffer */
1289 sg_init_one(&sg, buffer, size_to_read);
1290
1291 while (offset < size_to_read) {
1292 error = msb_read_page(msb, pba, page, NULL, &sg, offset);
1293 if (error)
1294 goto out;
1295
1296 page++;
1297 offset += msb->page_size;
1298
1299 if (page == msb->pages_in_block) {
1300 pr_err(
1301 "bad block table extends beyond the boot block");
1302 break;
1303 }
1304 }
1305
1306 /* Process the bad block table */
1307 for (i = page_offset; i < data_size / sizeof(u16); i++) {
1308
1309 u16 bad_block = be16_to_cpu(buffer[i]);
1310
1311 if (bad_block >= msb->block_count) {
1312 dbg("bad block table contains invalid block %d",
1313 bad_block);
1314 continue;
1315 }
1316
1317 if (test_bit(bad_block, msb->used_blocks_bitmap)) {
1318 dbg("duplicate bad block %d in the table",
1319 bad_block);
1320 continue;
1321 }
1322
1323 dbg("block %d is marked as factory bad", bad_block);
1324 msb_mark_block_used(msb, bad_block);
1325 }
1326out:
1327 kfree(buffer);
1328 return error;
1329}
1330
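/* Allocates the FTL bookkeeping structures: the used/erased block bitmaps,
 * the per-zone free block counters and the lba_to_pba_table */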
1331static int msb_ftl_initialize(struct msb_data *msb)
1332{
1333 int i;
1334
1335 if (msb->ftl_initialized)
1336 return 0;
1337
1338 msb->zone_count = msb->block_count / MS_BLOCKS_IN_ZONE;
1339 msb->logical_block_count = msb->zone_count * 496 - 2;
1340
1341 msb->used_blocks_bitmap = kzalloc(msb->block_count / 8, GFP_KERNEL);
1342 msb->erased_blocks_bitmap = kzalloc(msb->block_count / 8, GFP_KERNEL);
1343 msb->lba_to_pba_table =
1344 kmalloc(msb->logical_block_count * sizeof(u16), GFP_KERNEL);
1345
1346 if (!msb->used_blocks_bitmap || !msb->lba_to_pba_table ||
1347 !msb->erased_blocks_bitmap) {
1348 kfree(msb->used_blocks_bitmap);
1349 kfree(msb->lba_to_pba_table);
1350 kfree(msb->erased_blocks_bitmap);
1351 return -ENOMEM;
1352 }
1353
1354 for (i = 0; i < msb->zone_count; i++)
1355 msb->free_block_count[i] = MS_BLOCKS_IN_ZONE;
1356
1357 memset(msb->lba_to_pba_table, MS_BLOCK_INVALID,
1358 msb->logical_block_count * sizeof(u16));
1359
1360 dbg("initial FTL tables created. Zone count = %d, Logical block count = %d",
1361 msb->zone_count, msb->logical_block_count);
1362
1363 msb->ftl_initialized = true;
1364 return 0;
1365}
1366
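/* Full media scan: reads the OOB area of every physical block to rebuild
 * the lba_to_pba_table, erasing temporary tables and resolving collisions
 * between blocks that claim the same lba */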
1367static int msb_ftl_scan(struct msb_data *msb)
1368{
1369 u16 pba, lba, other_block;
1370 u8 overwrite_flag, managment_flag, other_overwrite_flag;
1371 int error;
1372 struct ms_extra_data_register extra;
1373 u8 *overwrite_flags = kzalloc(msb->block_count, GFP_KERNEL);
1374
1375 if (!overwrite_flags)
1376 return -ENOMEM;
1377
1378 dbg("Start of media scanning");
1379 for (pba = 0; pba < msb->block_count; pba++) {
1380
1381 if (pba == msb->boot_block_locations[0] ||
1382 pba == msb->boot_block_locations[1]) {
1383 dbg_verbose("pba %05d -> [boot block]", pba);
1384 msb_mark_block_used(msb, pba);
1385 continue;
1386 }
1387
1388 if (test_bit(pba, msb->used_blocks_bitmap)) {
1389 dbg_verbose("pba %05d -> [factory bad]", pba);
1390 continue;
1391 }
1392
1393 memset(&extra, 0, sizeof(extra));
1394 error = msb_read_oob(msb, pba, 0, &extra);
1395
1396 /* can't trust the page if we can't read the oob */
1397 if (error == -EBADMSG) {
1398 pr_notice(
1399 "oob of pba %d damaged, will try to erase it", pba);
1400 msb_mark_block_used(msb, pba);
1401 msb_erase_block(msb, pba);
1402 continue;
1403 } else if (error) {
1404 pr_err("unknown error %d on read of oob of pba %d - aborting",
1405 error, pba);
1406
1407 kfree(overwrite_flags);
1408 return error;
1409 }
1410
1411 lba = be16_to_cpu(extra.logical_address);
1412 managment_flag = extra.management_flag;
1413 overwrite_flag = extra.overwrite_flag;
1414 overwrite_flags[pba] = overwrite_flag;
1415
1416 /* Skip bad blocks */
1417 if (!(overwrite_flag & MEMSTICK_OVERWRITE_BKST)) {
1418 dbg("pba %05d -> [BAD]", pba);
1419 msb_mark_block_used(msb, pba);
1420 continue;
1421 }
1422
1423 /* Skip system/drm blocks */
1424 if ((managment_flag & MEMSTICK_MANAGMENT_FLAG_NORMAL) !=
1425 MEMSTICK_MANAGMENT_FLAG_NORMAL) {
1426 dbg("pba %05d -> [reserved managment flag %02x]",
1427 pba, managment_flag);
1428 msb_mark_block_used(msb, pba);
1429 continue;
1430 }
1431
1432 /* Erase temporary tables */
1433 if (!(managment_flag & MEMSTICK_MANAGEMENT_ATFLG)) {
1434 dbg("pba %05d -> [temp table] - will erase", pba);
1435
1436 msb_mark_block_used(msb, pba);
1437 msb_erase_block(msb, pba);
1438 continue;
1439 }
1440
1441 if (lba == MS_BLOCK_INVALID) {
1442 dbg_verbose("pba %05d -> [free]", pba);
1443 continue;
1444 }
1445
1446 msb_mark_block_used(msb, pba);
1447
1448		/* Block's LBA doesn't match its zone */
1449 if (msb_get_zone_from_lba(lba) != msb_get_zone_from_pba(pba)) {
1450 pr_notice("pba %05d -> [bad lba %05d] - will erase",
1451 pba, lba);
1452 msb_erase_block(msb, pba);
1453 continue;
1454 }
1455
1456 /* No collisions - great */
1457 if (msb->lba_to_pba_table[lba] == MS_BLOCK_INVALID) {
1458 dbg_verbose("pba %05d -> [lba %05d]", pba, lba);
1459 msb->lba_to_pba_table[lba] = pba;
1460 continue;
1461 }
1462
1463 other_block = msb->lba_to_pba_table[lba];
1464 other_overwrite_flag = overwrite_flags[other_block];
1465
1466 pr_notice("Collision between pba %d and pba %d",
1467 pba, other_block);
1468
1469 if (!(overwrite_flag & MEMSTICK_OVERWRITE_UDST)) {
1470 pr_notice("pba %d is marked as stable, use it", pba);
1471 msb_erase_block(msb, other_block);
1472 msb->lba_to_pba_table[lba] = pba;
1473 continue;
1474 }
1475
1476 if (!(other_overwrite_flag & MEMSTICK_OVERWRITE_UDST)) {
1477 pr_notice("pba %d is marked as stable, use it",
1478 other_block);
1479 msb_erase_block(msb, pba);
1480 continue;
1481 }
1482
1483 pr_notice("collision between blocks %d and %d, without stable flag set on both, erasing pba %d",
1484 pba, other_block, other_block);
1485
1486 msb_erase_block(msb, other_block);
1487 msb->lba_to_pba_table[lba] = pba;
1488 }
1489
1490 dbg("End of media scanning");
1491 kfree(overwrite_flags);
1492 return 0;
1493}
1494
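/* Timer callback: requests a cache flush from the IO work item once the
 * cache_flush_timeout expires */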
1495static void msb_cache_flush_timer(unsigned long data)
1496{
1497 struct msb_data *msb = (struct msb_data *)data;
1498 msb->need_flush_cache = true;
1499 queue_work(msb->io_queue, &msb->io_work);
1500}
1501
1502
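/* Drops the cached block without writing it back and stops the flush timer */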
1503static void msb_cache_discard(struct msb_data *msb)
1504{
1505 if (msb->cache_block_lba == MS_BLOCK_INVALID)
1506 return;
1507
1508 del_timer_sync(&msb->cache_flush_timer);
1509
1510 dbg_verbose("Discarding the write cache");
1511 msb->cache_block_lba = MS_BLOCK_INVALID;
1512 bitmap_zero(&msb->valid_cache_bitmap, msb->pages_in_block);
1513}
1514
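/* Sets up the flush timer and allocates the one-block write cache */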
1515static int msb_cache_init(struct msb_data *msb)
1516{
1517 setup_timer(&msb->cache_flush_timer, msb_cache_flush_timer,
1518 (unsigned long)msb);
1519
1520 if (!msb->cache)
1521 msb->cache = kzalloc(msb->block_size, GFP_KERNEL);
1522 if (!msb->cache)
1523 return -ENOMEM;
1524
1525 msb_cache_discard(msb);
1526 return 0;
1527}
1528
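/* Writes the cached block back to the media: pages that were never written
 * into the cache are first read back from the old pba so that the whole
 * block can be rewritten in one go */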
1529static int msb_cache_flush(struct msb_data *msb)
1530{
1531 struct scatterlist sg;
1532 struct ms_extra_data_register extra;
1533 int page, offset, error;
1534 u16 pba, lba;
1535
1536 if (msb->read_only)
1537 return -EROFS;
1538
1539 if (msb->cache_block_lba == MS_BLOCK_INVALID)
1540 return 0;
1541
1542 lba = msb->cache_block_lba;
1543 pba = msb->lba_to_pba_table[lba];
1544
1545 dbg_verbose("Flushing the write cache of pba %d (LBA %d)",
1546 pba, msb->cache_block_lba);
1547
1548 sg_init_one(&sg, msb->cache , msb->block_size);
1549
1550 /* Read all missing pages in cache */
1551 for (page = 0; page < msb->pages_in_block; page++) {
1552
1553 if (test_bit(page, &msb->valid_cache_bitmap))
1554 continue;
1555
1556 offset = page * msb->page_size;
1557
1558 dbg_verbose("reading non-present sector %d of cache block %d",
1559 page, lba);
1560 error = msb_read_page(msb, pba, page, &extra, &sg, offset);
1561
1562 /* Bad pages are copied with 00 page status */
1563 if (error == -EBADMSG) {
1564 pr_err("read error on sector %d, contents probably damaged", page);
1565 continue;
1566 }
1567
1568 if (error)
1569 return error;
1570
1571 if ((extra.overwrite_flag & MEMSTICK_OV_PG_NORMAL) !=
1572 MEMSTICK_OV_PG_NORMAL) {
1573 dbg("page %d is marked as bad", page);
1574 continue;
1575 }
1576
1577 set_bit(page, &msb->valid_cache_bitmap);
1578 }
1579
1580 /* Write the cache now */
1581 error = msb_update_block(msb, msb->cache_block_lba, &sg, 0);
1582 pba = msb->lba_to_pba_table[msb->cache_block_lba];
1583
1584 /* Mark invalid pages */
1585 if (!error) {
1586 for (page = 0; page < msb->pages_in_block; page++) {
1587
1588 if (test_bit(page, &msb->valid_cache_bitmap))
1589 continue;
1590
1591 dbg("marking page %d as containing damaged data",
1592 page);
1593 msb_set_overwrite_flag(msb,
1594 pba , page, 0xFF & ~MEMSTICK_OV_PG_NORMAL);
1595 }
1596 }
1597
1598 msb_cache_discard(msb);
1599 return error;
1600}
1601
1602static int msb_cache_write(struct msb_data *msb, int lba,
1603 int page, bool add_to_cache_only, struct scatterlist *sg, int offset)
1604{
1605 int error;
1606 struct scatterlist sg_tmp[10];
1607
1608 if (msb->read_only)
1609 return -EROFS;
1610
1611 if (msb->cache_block_lba == MS_BLOCK_INVALID ||
1612 lba != msb->cache_block_lba)
1613 if (add_to_cache_only)
1614 return 0;
1615
1616 /* If we need to write a different block */
1617 if (msb->cache_block_lba != MS_BLOCK_INVALID &&
1618 lba != msb->cache_block_lba) {
1619 dbg_verbose("first flush the cache");
1620 error = msb_cache_flush(msb);
1621 if (error)
1622 return error;
1623 }
1624
1625 if (msb->cache_block_lba == MS_BLOCK_INVALID) {
1626 msb->cache_block_lba = lba;
1627 mod_timer(&msb->cache_flush_timer,
1628 jiffies + msecs_to_jiffies(cache_flush_timeout));
1629 }
1630
1631 dbg_verbose("Write of LBA %d page %d to cache ", lba, page);
1632
1633 sg_init_table(sg_tmp, ARRAY_SIZE(sg_tmp));
1634 msb_sg_copy(sg, sg_tmp, ARRAY_SIZE(sg_tmp), offset, msb->page_size);
1635
1636 sg_copy_to_buffer(sg_tmp, sg_nents(sg_tmp),
1637 msb->cache + page * msb->page_size, msb->page_size);
1638
1639 set_bit(page, &msb->valid_cache_bitmap);
1640 return 0;
1641}
1642
1643static int msb_cache_read(struct msb_data *msb, int lba,
1644 int page, struct scatterlist *sg, int offset)
1645{
1646 int pba = msb->lba_to_pba_table[lba];
1647 struct scatterlist sg_tmp[10];
1648 int error = 0;
1649
1650 if (lba == msb->cache_block_lba &&
1651 test_bit(page, &msb->valid_cache_bitmap)) {
1652
1653 dbg_verbose("Read of LBA %d (pba %d) sector %d from cache",
1654 lba, pba, page);
1655
1656 sg_init_table(sg_tmp, ARRAY_SIZE(sg_tmp));
1657 msb_sg_copy(sg, sg_tmp, ARRAY_SIZE(sg_tmp),
1658 offset, msb->page_size);
1659 sg_copy_from_buffer(sg_tmp, sg_nents(sg_tmp),
1660 msb->cache + msb->page_size * page,
1661 msb->page_size);
1662 } else {
1663 dbg_verbose("Read of LBA %d (pba %d) sector %d from device",
1664 lba, pba, page);
1665
1666 error = msb_read_page(msb, pba, page, NULL, sg, offset);
1667 if (error)
1668 return error;
1669
1670 msb_cache_write(msb, lba, page, true, sg, offset);
1671 }
1672 return error;
1673}
1674
1675/* Emulated geometry table
1676 * This table's content isn't that important;
1677 * one could put different values here, provided they still
1678 * cover the whole disk.
1679 * The 64 MB entry is what Windows reports for my 64M memstick */
1680
1681static const struct chs_entry chs_table[] = {
1682/* size sectors cylinders heads */
1683 { 4, 16, 247, 2 },
1684 { 8, 16, 495, 2 },
1685 { 16, 16, 495, 4 },
1686 { 32, 16, 991, 4 },
1687 { 64, 16, 991, 8 },
1688 {128, 16, 991, 16 },
1689 { 0 }
1690};
1691
1692/* Load information about the card */
1693static int msb_init_card(struct memstick_dev *card)
1694{
1695 struct msb_data *msb = memstick_get_drvdata(card);
1696 struct memstick_host *host = card->host;
1697 struct ms_boot_page *boot_block;
1698 int error = 0, i, raw_size_in_megs;
1699
1700 msb->caps = 0;
1701
1702 if (card->id.class >= MEMSTICK_CLASS_ROM &&
1703 card->id.class <= MEMSTICK_CLASS_ROM)
1704 msb->read_only = true;
1705
1706 msb->state = -1;
1707 error = msb_reset(msb, false);
1708 if (error)
1709 return error;
1710
1711 /* Due to a bug in the JMicron driver written by Alex Dubov,
1712 its serial mode barely works,
1713 so we switch to parallel mode right away */
1714 if (host->caps & MEMSTICK_CAP_PAR4)
1715 msb_switch_to_parallel(msb);
1716
1717 msb->page_size = sizeof(struct ms_boot_page);
1718
1719 /* Read the boot page */
1720 error = msb_read_boot_blocks(msb);
1721 if (error)
1722 return -EIO;
1723
1724 boot_block = &msb->boot_page[0];
1725
1726 /* Save interesting attributes from the boot page */
1727 msb->block_count = boot_block->attr.number_of_blocks;
1728 msb->page_size = boot_block->attr.page_size;
1729
1730 msb->pages_in_block = boot_block->attr.block_size * 2;
1731 msb->block_size = msb->page_size * msb->pages_in_block;
1732
1733 if (msb->page_size > PAGE_SIZE) {
1734 /* this isn't supported by Linux at all, anyway */
1735 dbg("device page size %d isn't supported", msb->page_size);
1736 return -EINVAL;
1737 }
1738
1739 msb->block_buffer = kzalloc(msb->block_size, GFP_KERNEL);
1740 if (!msb->block_buffer)
1741 return -ENOMEM;
1742
1743 raw_size_in_megs = (msb->block_size * msb->block_count) >> 20;
1744
1745 for (i = 0; chs_table[i].size; i++) {
1746
1747 if (chs_table[i].size != raw_size_in_megs)
1748 continue;
1749
1750 msb->geometry.cylinders = chs_table[i].cyl;
1751 msb->geometry.heads = chs_table[i].head;
1752 msb->geometry.sectors = chs_table[i].sec;
1753 break;
1754 }
1755
1756 if (boot_block->attr.transfer_supporting == 1)
1757 msb->caps |= MEMSTICK_CAP_PAR4;
1758
1759 if (boot_block->attr.device_type & 0x03)
1760 msb->read_only = true;
1761
1762 dbg("Total block count = %d", msb->block_count);
1763 dbg("Each block consists of %d pages", msb->pages_in_block);
1764 dbg("Page size = %d bytes", msb->page_size);
1765 dbg("Parallel mode supported: %d", !!(msb->caps & MEMSTICK_CAP_PAR4));
1766 dbg("Read only: %d", msb->read_only);
1767
1768#if 0
1769 /* Now we can switch the interface */
1770 if (host->caps & msb->caps & MEMSTICK_CAP_PAR4)
1771 msb_switch_to_parallel(msb);
1772#endif
1773
1774 error = msb_cache_init(msb);
1775 if (error)
1776 return error;
1777
1778 error = msb_ftl_initialize(msb);
1779 if (error)
1780 return error;
1781
1782
1783 /* Read the bad block table */
1784 error = msb_read_bad_block_table(msb, 0);
1785
1786 if (error && error != -ENOMEM) {
1787 dbg("failed to read bad block table from primary boot block, trying from backup");
1788 error = msb_read_bad_block_table(msb, 1);
1789 }
1790
1791 if (error)
1792 return error;
1793
1794 /* *drum roll* Scan the media */
1795 error = msb_ftl_scan(msb);
1796 if (error) {
1797 pr_err("Scan of media failed");
1798 return error;
1799 }
1800
1801 return 0;
1802
1803}
1804
1805static int msb_do_write_request(struct msb_data *msb, int lba,
1806 int page, struct scatterlist *sg, size_t len, int *successfully_written)
1807{
1808 int error = 0;
1809 off_t offset = 0;
1810 *successfully_written = 0;
1811
1812 while (offset < len) {
1813 if (page == 0 && len - offset >= msb->block_size) {
1814
1815 if (msb->cache_block_lba == lba)
1816 msb_cache_discard(msb);
1817
1818 dbg_verbose("Writing whole lba %d", lba);
1819 error = msb_update_block(msb, lba, sg, offset);
1820 if (error)
1821 return error;
1822
1823 offset += msb->block_size;
1824 *successfully_written += msb->block_size;
1825 lba++;
1826 continue;
1827 }
1828
1829 error = msb_cache_write(msb, lba, page, false, sg, offset);
1830 if (error)
1831 return error;
1832
1833 offset += msb->page_size;
1834 *successfully_written += msb->page_size;
1835
1836 page++;
1837 if (page == msb->pages_in_block) {
1838 page = 0;
1839 lba++;
1840 }
1841 }
1842 return 0;
1843}
1844
1845static int msb_do_read_request(struct msb_data *msb, int lba,
1846 int page, struct scatterlist *sg, int len, int *successfully_read)
1847{
1848 int error = 0;
1849 int offset = 0;
1850 *successfully_read = 0;
1851
1852 while (offset < len) {
1853
1854 error = msb_cache_read(msb, lba, page, sg, offset);
1855 if (error)
1856 return error;
1857
1858 offset += msb->page_size;
1859 *successfully_read += msb->page_size;
1860
1861 page++;
1862 if (page == msb->pages_in_block) {
1863 page = 0;
1864 lba++;
1865 }
1866 }
1867 return 0;
1868}
1869
1870static void msb_io_work(struct work_struct *work)
1871{
1872 struct msb_data *msb = container_of(work, struct msb_data, io_work);
1873 int page, error, len;
1874 sector_t lba;
1875 unsigned long flags;
1876 struct scatterlist *sg = msb->prealloc_sg;
1877
1878 dbg_verbose("IO: work started");
1879
1880 while (1) {
1881 spin_lock_irqsave(&msb->q_lock, flags);
1882
1883 if (msb->need_flush_cache) {
1884 msb->need_flush_cache = false;
1885 spin_unlock_irqrestore(&msb->q_lock, flags);
1886 msb_cache_flush(msb);
1887 continue;
1888 }
1889
1890 if (!msb->req) {
1891 msb->req = blk_fetch_request(msb->queue);
1892 if (!msb->req) {
1893 dbg_verbose("IO: no more requests, exiting");
1894 spin_unlock_irqrestore(&msb->q_lock, flags);
1895 return;
1896 }
1897 }
1898
1899 spin_unlock_irqrestore(&msb->q_lock, flags);
1900
1901 /* If the card was removed in the meantime */
1902 if (!msb->req)
1903 return;
1904
1905 /* process the request */
1906 dbg_verbose("IO: processing new request");
1907 blk_rq_map_sg(msb->queue, msb->req, sg);
1908
1909 lba = blk_rq_pos(msb->req);
1910
1911 sector_div(lba, msb->page_size / 512);
1912 page = do_div(lba, msb->pages_in_block);
1913
1914 if (rq_data_dir(msb->req) == READ)
1915 error = msb_do_read_request(msb, lba, page, sg,
1916 blk_rq_bytes(msb->req), &len);
1917 else
1918 error = msb_do_write_request(msb, lba, page, sg,
1919 blk_rq_bytes(msb->req), &len);
1920
1921 spin_lock_irqsave(&msb->q_lock, flags);
1922
1923 if (len)
1924 if (!__blk_end_request(msb->req, 0, len))
1925 msb->req = NULL;
1926
1927 if (error && msb->req) {
1928 dbg_verbose("IO: ending one sector of the request with error");
1929 if (!__blk_end_request(msb->req, error, msb->page_size))
1930 msb->req = NULL;
1931 }
1932
1933 if (msb->req)
1934 dbg_verbose("IO: request still pending");
1935
1936 spin_unlock_irqrestore(&msb->q_lock, flags);
1937 }
1938}
1939
1940static DEFINE_IDR(msb_disk_idr); /* set of used disk numbers */
1941static DEFINE_MUTEX(msb_disk_lock); /* protects against races in open/release */
1942
1943static int msb_bd_open(struct block_device *bdev, fmode_t mode)
1944{
1945 struct gendisk *disk = bdev->bd_disk;
1946 struct msb_data *msb = disk->private_data;
1947
1948 dbg_verbose("block device open");
1949
1950 mutex_lock(&msb_disk_lock);
1951
1952 if (msb && msb->card)
1953 msb->usage_count++;
1954
1955 mutex_unlock(&msb_disk_lock);
1956 return 0;
1957}
1958
1959static void msb_data_clear(struct msb_data *msb)
1960{
1961 kfree(msb->boot_page);
1962 kfree(msb->used_blocks_bitmap);
1963 kfree(msb->lba_to_pba_table);
1964 kfree(msb->cache);
1965 msb->card = NULL;
1966}
1967
1968static int msb_disk_release(struct gendisk *disk)
1969{
1970 struct msb_data *msb = disk->private_data;
1971
1972 dbg_verbose("block device release");
1973 mutex_lock(&msb_disk_lock);
1974
1975 if (msb) {
1976 if (msb->usage_count)
1977 msb->usage_count--;
1978
1979 if (!msb->usage_count) {
1980 disk->private_data = NULL;
1981 idr_remove(&msb_disk_idr, msb->disk_id);
1982 put_disk(disk);
1983 kfree(msb);
1984 }
1985 }
1986 mutex_unlock(&msb_disk_lock);
1987 return 0;
1988}
1989
1990static void msb_bd_release(struct gendisk *disk, fmode_t mode)
1991{
1992 msb_disk_release(disk);
1993}
1994
1995static int msb_bd_getgeo(struct block_device *bdev,
1996 struct hd_geometry *geo)
1997{
1998 struct msb_data *msb = bdev->bd_disk->private_data;
1999 *geo = msb->geometry;
2000 return 0;
2001}
2002
2003static int msb_prepare_req(struct request_queue *q, struct request *req)
2004{
2005 if (req->cmd_type != REQ_TYPE_FS &&
2006 req->cmd_type != REQ_TYPE_BLOCK_PC) {
2007 blk_dump_rq_flags(req, "MS unsupported request");
2008 return BLKPREP_KILL;
2009 }
2010 req->cmd_flags |= REQ_DONTPREP;
2011 return BLKPREP_OK;
2012}
2013
2014static void msb_submit_req(struct request_queue *q)
2015{
2016 struct memstick_dev *card = q->queuedata;
2017 struct msb_data *msb = memstick_get_drvdata(card);
2018 struct request *req = NULL;
2019
2020 dbg_verbose("Submit request");
2021
2022 if (msb->card_dead) {
2023 dbg("Refusing requests on removed card");
2024
2025 WARN_ON(!msb->io_queue_stopped);
2026
2027 while ((req = blk_fetch_request(q)) != NULL)
2028 __blk_end_request_all(req, -ENODEV);
2029 return;
2030 }
2031
2032 if (msb->req)
2033 return;
2034
2035 if (!msb->io_queue_stopped)
2036 queue_work(msb->io_queue, &msb->io_work);
2037}
2038
2039static int msb_check_card(struct memstick_dev *card)
2040{
2041 struct msb_data *msb = memstick_get_drvdata(card);
2042 return (msb->card_dead == 0);
2043}
2044
2045static void msb_stop(struct memstick_dev *card)
2046{
2047 struct msb_data *msb = memstick_get_drvdata(card);
2048 unsigned long flags;
2049
2050 dbg("Stopping all msblock IO");
2051
2052 spin_lock_irqsave(&msb->q_lock, flags);
2053 blk_stop_queue(msb->queue);
2054 msb->io_queue_stopped = true;
2055 spin_unlock_irqrestore(&msb->q_lock, flags);
2056
2057 del_timer_sync(&msb->cache_flush_timer);
2058 flush_workqueue(msb->io_queue);
2059
2060 if (msb->req) {
2061 spin_lock_irqsave(&msb->q_lock, flags);
2062 blk_requeue_request(msb->queue, msb->req);
2063 msb->req = NULL;
2064 spin_unlock_irqrestore(&msb->q_lock, flags);
2065 }
2066
2067}
2068
2069static void msb_start(struct memstick_dev *card)
2070{
2071 struct msb_data *msb = memstick_get_drvdata(card);
2072 unsigned long flags;
2073
2074 dbg("Resuming IO from msblock");
2075
2076 msb_invalidate_reg_window(msb);
2077
2078 spin_lock_irqsave(&msb->q_lock, flags);
2079 if (!msb->io_queue_stopped || msb->card_dead) {
2080 spin_unlock_irqrestore(&msb->q_lock, flags);
2081 return;
2082 }
2083 spin_unlock_irqrestore(&msb->q_lock, flags);
2084
2085 /* Kick the cache flush anyway; it's harmless */
2086 msb->need_flush_cache = true;
2087 msb->io_queue_stopped = false;
2088
2089 spin_lock_irqsave(&msb->q_lock, flags);
2090 blk_start_queue(msb->queue);
2091 spin_unlock_irqrestore(&msb->q_lock, flags);
2092
2093 queue_work(msb->io_queue, &msb->io_work);
2094
2095}
2096
2097static const struct block_device_operations msb_bdops = {
2098 .open = msb_bd_open,
2099 .release = msb_bd_release,
2100 .getgeo = msb_bd_getgeo,
2101 .owner = THIS_MODULE
2102};
2103
2104/* Registers the block device */
2105static int msb_init_disk(struct memstick_dev *card)
2106{
2107 struct msb_data *msb = memstick_get_drvdata(card);
2108 struct memstick_host *host = card->host;
2109 int rc;
2110 u64 limit = BLK_BOUNCE_HIGH;
2111 unsigned long capacity;
2112
2113 if (host->dev.dma_mask && *(host->dev.dma_mask))
2114 limit = *(host->dev.dma_mask);
2115
2116 mutex_lock(&msb_disk_lock);
2117 msb->disk_id = idr_alloc(&msb_disk_idr, card, 0, 256, GFP_KERNEL);
2118 mutex_unlock(&msb_disk_lock);
2119
2120 if (msb->disk_id < 0)
2121 return msb->disk_id;
2122
2123 msb->disk = alloc_disk(0);
2124 if (!msb->disk) {
2125 rc = -ENOMEM;
2126 goto out_release_id;
2127 }
2128
2129 msb->queue = blk_init_queue(msb_submit_req, &msb->q_lock);
2130 if (!msb->queue) {
2131 rc = -ENOMEM;
2132 goto out_put_disk;
2133 }
2134
2135 msb->queue->queuedata = card;
2136 blk_queue_prep_rq(msb->queue, msb_prepare_req);
2137
2138 blk_queue_bounce_limit(msb->queue, limit);
2139 blk_queue_max_hw_sectors(msb->queue, MS_BLOCK_MAX_PAGES);
2140 blk_queue_max_segments(msb->queue, MS_BLOCK_MAX_SEGS);
2141 blk_queue_max_segment_size(msb->queue,
2142 MS_BLOCK_MAX_PAGES * msb->page_size);
2143 blk_queue_logical_block_size(msb->queue, msb->page_size);
2144
2145 sprintf(msb->disk->disk_name, "msblk%d", msb->disk_id);
2146 msb->disk->fops = &msb_bdops;
2147 msb->disk->private_data = msb;
2148 msb->disk->queue = msb->queue;
2149 msb->disk->driverfs_dev = &card->dev;
2150 msb->disk->flags |= GENHD_FL_EXT_DEVT;
2151
2152 capacity = msb->pages_in_block * msb->logical_block_count;
2153 capacity *= (msb->page_size / 512);
2154 set_capacity(msb->disk, capacity);
2155 dbg("Set total disk size to %lu sectors", capacity);
2156
2157 msb->usage_count = 1;
2158 msb->io_queue = alloc_ordered_workqueue("ms_block", WQ_MEM_RECLAIM);
2159 INIT_WORK(&msb->io_work, msb_io_work);
2160 sg_init_table(msb->prealloc_sg, MS_BLOCK_MAX_SEGS+1);
2161
2162 if (msb->read_only)
2163 set_disk_ro(msb->disk, 1);
2164
2165 msb_start(card);
2166 add_disk(msb->disk);
2167 dbg("Disk added");
2168 return 0;
2169
2170out_put_disk:
2171 put_disk(msb->disk);
2172out_release_id:
2173 mutex_lock(&msb_disk_lock);
2174 idr_remove(&msb_disk_idr, msb->disk_id);
2175 mutex_unlock(&msb_disk_lock);
2176 return rc;
2177}
2178
2179static int msb_probe(struct memstick_dev *card)
2180{
2181 struct msb_data *msb;
2182 int rc = 0;
2183
2184 msb = kzalloc(sizeof(struct msb_data), GFP_KERNEL);
2185 if (!msb)
2186 return -ENOMEM;
2187 memstick_set_drvdata(card, msb);
2188 msb->card = card;
2189 spin_lock_init(&msb->q_lock);
2190
2191 rc = msb_init_card(card);
2192 if (rc)
2193 goto out_free;
2194
2195 rc = msb_init_disk(card);
2196 if (!rc) {
2197 card->check = msb_check_card;
2198 card->stop = msb_stop;
2199 card->start = msb_start;
2200 return 0;
2201 }
2202out_free:
2203 memstick_set_drvdata(card, NULL);
2204 msb_data_clear(msb);
2205 kfree(msb);
2206 return rc;
2207}
2208
2209static void msb_remove(struct memstick_dev *card)
2210{
2211 struct msb_data *msb = memstick_get_drvdata(card);
2212 unsigned long flags;
2213
2214 if (!msb->io_queue_stopped)
2215 msb_stop(card);
2216
2217 dbg("Removing the disk device");
2218
2219 /* Take care of unhandled + new requests from now on */
2220 spin_lock_irqsave(&msb->q_lock, flags);
2221 msb->card_dead = true;
2222 blk_start_queue(msb->queue);
2223 spin_unlock_irqrestore(&msb->q_lock, flags);
2224
2225 /* Remove the disk */
2226 del_gendisk(msb->disk);
2227 blk_cleanup_queue(msb->queue);
2228 msb->queue = NULL;
2229
2230 mutex_lock(&msb_disk_lock);
2231 msb_data_clear(msb);
2232 mutex_unlock(&msb_disk_lock);
2233
2234 msb_disk_release(msb->disk);
2235 memstick_set_drvdata(card, NULL);
2236}
2237
2238#ifdef CONFIG_PM
2239
2240static int msb_suspend(struct memstick_dev *card, pm_message_t state)
2241{
2242 msb_stop(card);
2243 return 0;
2244}
2245
2246static int msb_resume(struct memstick_dev *card)
2247{
2248 struct msb_data *msb = memstick_get_drvdata(card);
2249 struct msb_data *new_msb = NULL;
2250 bool card_dead = true;
2251
2252#ifndef CONFIG_MEMSTICK_UNSAFE_RESUME
2253 msb->card_dead = true;
2254 return 0;
2255#endif
2256 mutex_lock(&card->host->lock);
2257
2258 new_msb = kzalloc(sizeof(struct msb_data), GFP_KERNEL);
2259 if (!new_msb)
2260 goto out;
2261
2262 new_msb->card = card;
2263 memstick_set_drvdata(card, new_msb);
2264 spin_lock_init(&new_msb->q_lock);
2265 sg_init_table(msb->prealloc_sg, MS_BLOCK_MAX_SEGS+1);
2266
2267 if (msb_init_card(card))
2268 goto out;
2269
2270 if (msb->block_size != new_msb->block_size)
2271 goto out;
2272
2273 if (memcmp(msb->boot_page, new_msb->boot_page,
2274 sizeof(struct ms_boot_page)))
2275 goto out;
2276
2277 if (msb->logical_block_count != new_msb->logical_block_count ||
2278 memcmp(msb->lba_to_pba_table, new_msb->lba_to_pba_table,
2279 msb->logical_block_count))
2280 goto out;
2281
2282 if (msb->block_count != new_msb->block_count ||
2283 memcmp(msb->used_blocks_bitmap, new_msb->used_blocks_bitmap,
2284 msb->block_count / 8))
2285 goto out;
2286
2287 card_dead = false;
2288out:
2289 if (card_dead)
2290 dbg("Card was removed/replaced during suspend");
2291
2292 msb->card_dead = card_dead;
2293 memstick_set_drvdata(card, msb);
2294
2295 if (new_msb) {
2296 msb_data_clear(new_msb);
2297 kfree(new_msb);
2298 }
2299
2300 msb_start(card);
2301 mutex_unlock(&card->host->lock);
2302 return 0;
2303}
2304#else
2305
2306#define msb_suspend NULL
2307#define msb_resume NULL
2308
2309#endif /* CONFIG_PM */
2310
2311static struct memstick_device_id msb_id_tbl[] = {
2312 {MEMSTICK_MATCH_ALL, MEMSTICK_TYPE_LEGACY, MEMSTICK_CATEGORY_STORAGE,
2313 MEMSTICK_CLASS_FLASH},
2314
2315 {MEMSTICK_MATCH_ALL, MEMSTICK_TYPE_LEGACY, MEMSTICK_CATEGORY_STORAGE,
2316 MEMSTICK_CLASS_ROM},
2317
2318 {MEMSTICK_MATCH_ALL, MEMSTICK_TYPE_LEGACY, MEMSTICK_CATEGORY_STORAGE,
2319 MEMSTICK_CLASS_RO},
2320
2321 {MEMSTICK_MATCH_ALL, MEMSTICK_TYPE_LEGACY, MEMSTICK_CATEGORY_STORAGE,
2322 MEMSTICK_CLASS_WP},
2323
2324 {MEMSTICK_MATCH_ALL, MEMSTICK_TYPE_DUO, MEMSTICK_CATEGORY_STORAGE_DUO,
2325 MEMSTICK_CLASS_DUO},
2326 {}
2327};
2328MODULE_DEVICE_TABLE(memstick, msb_id_tbl);
2329
2330
2331static struct memstick_driver msb_driver = {
2332 .driver = {
2333 .name = DRIVER_NAME,
2334 .owner = THIS_MODULE
2335 },
2336 .id_table = msb_id_tbl,
2337 .probe = msb_probe,
2338 .remove = msb_remove,
2339 .suspend = msb_suspend,
2340 .resume = msb_resume
2341};
2342
2343static int major;
2344
2345static int __init msb_init(void)
2346{
2347 int rc = register_blkdev(0, DRIVER_NAME);
2348
2349 if (rc < 0) {
2350 pr_err("failed to register major (error %d)\n", rc);
2351 return rc;
2352 }
2353
2354 major = rc;
2355 rc = memstick_register_driver(&msb_driver);
2356 if (rc) {
2357 unregister_blkdev(major, DRIVER_NAME);
2358 pr_err("failed to register memstick driver (error %d)\n", rc);
2359 }
2360
2361 return rc;
2362}
2363
2364static void __exit msb_exit(void)
2365{
2366 memstick_unregister_driver(&msb_driver);
2367 unregister_blkdev(major, DRIVER_NAME);
2368 idr_destroy(&msb_disk_idr);
2369}
2370
2371module_init(msb_init);
2372module_exit(msb_exit);
2373
2374module_param(cache_flush_timeout, int, S_IRUGO);
2375MODULE_PARM_DESC(cache_flush_timeout,
2376 "Cache flush timeout in msec (1000 default)");
2377module_param(debug, int, S_IRUGO | S_IWUSR);
2378MODULE_PARM_DESC(debug, "Debug level (0-2)");
2379
2380module_param(verify_writes, bool, S_IRUGO);
2381MODULE_PARM_DESC(verify_writes, "Read back and check all data that is written");
2382
2383MODULE_LICENSE("GPL");
2384MODULE_AUTHOR("Maxim Levitsky");
2385MODULE_DESCRIPTION("Sony MemoryStick block device driver");
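
For reference, the sector-to-(LBA, page) translation done at the top of msb_io_work() can be shown as a minimal standalone sketch. This is not part of the driver: the 2048-byte page size and 16 pages per block are hypothetical, and plain division stands in for sector_div()/do_div().

/* Illustrative userspace sketch of the msb_io_work() address arithmetic. */
#include <stdio.h>

int main(void)
{
	unsigned long long sector = 1234;	/* what blk_rq_pos() would return */
	unsigned int page_size = 2048;		/* assumed msb->page_size */
	unsigned int pages_in_block = 16;	/* assumed msb->pages_in_block */

	/* sector_div(lba, msb->page_size / 512): 512-byte sectors -> device pages */
	unsigned long long dev_page = sector / (page_size / 512);

	/* do_div(lba, msb->pages_in_block): remainder is the page, quotient the LBA */
	unsigned int page = dev_page % pages_in_block;
	unsigned long long lba = dev_page / pages_in_block;

	printf("sector %llu -> lba %llu, page %u\n", sector, lba, page);
	return 0;
}

With these assumed numbers, a request starting at sector 1234 maps to LBA 19, page 4, which is then handed to msb_do_read_request()/msb_do_write_request() above.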
diff --git a/drivers/memstick/core/ms_block.h b/drivers/memstick/core/ms_block.h
new file mode 100644
index 000000000000..96e637550988
--- /dev/null
+++ b/drivers/memstick/core/ms_block.h
@@ -0,0 +1,290 @@
1/*
2 * ms_block.h - Sony MemoryStick (legacy) storage support
3 *
4 * Copyright (C) 2013 Maxim Levitsky <maximlevitsky@gmail.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * Minor portions of the driver are copied from mspro_block.c which is
11 * Copyright (C) 2007 Alex Dubov <oakad@yahoo.com>
12 *
13 * Also, the MS structures were copied from an old, broken driver by the same author.
14 * They probably come from the MS spec.
15 *
16 */
17
18#ifndef MS_BLOCK_NEW_H
19#define MS_BLOCK_NEW_H
20
21#define MS_BLOCK_MAX_SEGS 32
22#define MS_BLOCK_MAX_PAGES ((2 << 16) - 1)
23
24#define MS_BLOCK_MAX_BOOT_ADDR 0x000c
25#define MS_BLOCK_BOOT_ID 0x0001
26#define MS_BLOCK_INVALID 0xffff
27#define MS_MAX_ZONES 16
28#define MS_BLOCKS_IN_ZONE 512
29
30#define MS_BLOCK_MAP_LINE_SZ 16
31#define MS_BLOCK_PART_SHIFT 3
32
33
34#define MEMSTICK_UNCORR_ERROR (MEMSTICK_STATUS1_UCFG | \
35 MEMSTICK_STATUS1_UCEX | MEMSTICK_STATUS1_UCDT)
36
37#define MEMSTICK_CORR_ERROR (MEMSTICK_STATUS1_FGER | MEMSTICK_STATUS1_EXER | \
38 MEMSTICK_STATUS1_DTER)
39
40#define MEMSTICK_INT_ERROR (MEMSTICK_INT_CMDNAK | MEMSTICK_INT_ERR)
41
42#define MEMSTICK_OVERWRITE_FLAG_NORMAL \
43 (MEMSTICK_OVERWRITE_PGST1 | \
44 MEMSTICK_OVERWRITE_PGST0 | \
45 MEMSTICK_OVERWRITE_BKST)
46
47#define MEMSTICK_OV_PG_NORMAL \
48 (MEMSTICK_OVERWRITE_PGST1 | MEMSTICK_OVERWRITE_PGST0)
49
50#define MEMSTICK_MANAGMENT_FLAG_NORMAL \
51 (MEMSTICK_MANAGEMENT_SYSFLG | \
52 MEMSTICK_MANAGEMENT_SCMS1 | \
53 MEMSTICK_MANAGEMENT_SCMS0) \
54
55struct ms_boot_header {
56 unsigned short block_id;
57 unsigned short format_reserved;
58 unsigned char reserved0[184];
59 unsigned char data_entry;
60 unsigned char reserved1[179];
61} __packed;
62
63
64struct ms_system_item {
65 unsigned int start_addr;
66 unsigned int data_size;
67 unsigned char data_type_id;
68 unsigned char reserved[3];
69} __packed;
70
71struct ms_system_entry {
72 struct ms_system_item disabled_block;
73 struct ms_system_item cis_idi;
74 unsigned char reserved[24];
75} __packed;
76
77struct ms_boot_attr_info {
78 unsigned char memorystick_class;
79 unsigned char format_unique_value1;
80 unsigned short block_size;
81 unsigned short number_of_blocks;
82 unsigned short number_of_effective_blocks;
83 unsigned short page_size;
84 unsigned char extra_data_size;
85 unsigned char format_unique_value2;
86 unsigned char assembly_time[8];
87 unsigned char format_unique_value3;
88 unsigned char serial_number[3];
89 unsigned char assembly_manufacturer_code;
90 unsigned char assembly_model_code[3];
91 unsigned short memory_manufacturer_code;
92 unsigned short memory_device_code;
93 unsigned short implemented_capacity;
94 unsigned char format_unique_value4[2];
95 unsigned char vcc;
96 unsigned char vpp;
97 unsigned short controller_number;
98 unsigned short controller_function;
99 unsigned char reserved0[9];
100 unsigned char transfer_supporting;
101 unsigned short format_unique_value5;
102 unsigned char format_type;
103 unsigned char memorystick_application;
104 unsigned char device_type;
105 unsigned char reserved1[22];
106 unsigned char format_unique_value6[2];
107 unsigned char reserved2[15];
108} __packed;
109
110struct ms_cis_idi {
111 unsigned short general_config;
112 unsigned short logical_cylinders;
113 unsigned short reserved0;
114 unsigned short logical_heads;
115 unsigned short track_size;
116 unsigned short page_size;
117 unsigned short pages_per_track;
118 unsigned short msw;
119 unsigned short lsw;
120 unsigned short reserved1;
121 unsigned char serial_number[20];
122 unsigned short buffer_type;
123 unsigned short buffer_size_increments;
124 unsigned short long_command_ecc;
125 unsigned char firmware_version[28];
126 unsigned char model_name[18];
127 unsigned short reserved2[5];
128 unsigned short pio_mode_number;
129 unsigned short dma_mode_number;
130 unsigned short field_validity;
131 unsigned short current_logical_cylinders;
132 unsigned short current_logical_heads;
133 unsigned short current_pages_per_track;
134 unsigned int current_page_capacity;
135 unsigned short multiple_page_setting;
136 unsigned int addressable_pages;
137 unsigned short single_word_dma;
138 unsigned short multi_word_dma;
139 unsigned char reserved3[128];
140} __packed;
141
142
143struct ms_boot_page {
144 struct ms_boot_header header;
145 struct ms_system_entry entry;
146 struct ms_boot_attr_info attr;
147} __packed;
148
149struct msb_data {
150 unsigned int usage_count;
151 struct memstick_dev *card;
152 struct gendisk *disk;
153 struct request_queue *queue;
154 spinlock_t q_lock;
155 struct hd_geometry geometry;
156 struct attribute_group attr_group;
157 struct request *req;
158 int caps;
159 int disk_id;
160
161 /* IO */
162 struct workqueue_struct *io_queue;
163 bool io_queue_stopped;
164 struct work_struct io_work;
165 bool card_dead;
166
167 /* Media properties */
168 struct ms_boot_page *boot_page;
169 u16 boot_block_locations[2];
170 int boot_block_count;
171
172 bool read_only;
173 unsigned short page_size;
174 int block_size;
175 int pages_in_block;
176 int zone_count;
177 int block_count;
178 int logical_block_count;
179
180 /* FTL tables */
181 unsigned long *used_blocks_bitmap;
182 unsigned long *erased_blocks_bitmap;
183 u16 *lba_to_pba_table;
184 int free_block_count[MS_MAX_ZONES];
185 bool ftl_initialized;
186
187 /* Cache */
188 unsigned char *cache;
189 unsigned long valid_cache_bitmap;
190 int cache_block_lba;
191 bool need_flush_cache;
192 struct timer_list cache_flush_timer;
193
194 /* Preallocated buffers */
195 unsigned char *block_buffer;
196 struct scatterlist prealloc_sg[MS_BLOCK_MAX_SEGS+1];
197
198
199 /* handler's local data */
200 struct ms_register_addr reg_addr;
201 bool addr_valid;
202
203 u8 command_value;
204 bool command_need_oob;
205 struct scatterlist *current_sg;
206 int current_sg_offset;
207
208 struct ms_register regs;
209 int current_page;
210
211 int state;
212 int exit_error;
213 bool int_polling;
214 unsigned long int_timeout;
215
216};
217
218enum msb_readpage_states {
219 MSB_RP_SEND_BLOCK_ADDRESS = 0,
220 MSB_RP_SEND_READ_COMMAND,
221
222 MSB_RP_SEND_INT_REQ,
223 MSB_RP_RECEIVE_INT_REQ_RESULT,
224
225 MSB_RP_SEND_READ_STATUS_REG,
226 MSB_RP_RECIVE_STATUS_REG,
227
228 MSB_RP_SEND_OOB_READ,
229 MSB_RP_RECEIVE_OOB_READ,
230
231 MSB_RP_SEND_READ_DATA,
232 MSB_RP_RECEIVE_READ_DATA,
233};
234
235enum msb_write_block_states {
236 MSB_WB_SEND_WRITE_PARAMS = 0,
237 MSB_WB_SEND_WRITE_OOB,
238 MSB_WB_SEND_WRITE_COMMAND,
239
240 MSB_WB_SEND_INT_REQ,
241 MSB_WB_RECEIVE_INT_REQ,
242
243 MSB_WB_SEND_WRITE_DATA,
244 MSB_WB_RECEIVE_WRITE_CONFIRMATION,
245};
246
247enum msb_send_command_states {
248 MSB_SC_SEND_WRITE_PARAMS,
249 MSB_SC_SEND_WRITE_OOB,
250 MSB_SC_SEND_COMMAND,
251
252 MSB_SC_SEND_INT_REQ,
253 MSB_SC_RECEIVE_INT_REQ,
254
255};
256
257enum msb_reset_states {
258 MSB_RS_SEND,
259 MSB_RS_CONFIRM,
260};
261
262enum msb_par_switch_states {
263 MSB_PS_SEND_SWITCH_COMMAND,
264 MSB_PS_SWICH_HOST,
265 MSB_PS_CONFIRM,
266};
267
268struct chs_entry {
269 unsigned long size;
270 unsigned char sec;
271 unsigned short cyl;
272 unsigned char head;
273};
274
275static int msb_reset(struct msb_data *msb, bool full);
276
277static int h_msb_default_bad(struct memstick_dev *card,
278 struct memstick_request **mrq);
279
280#define __dbg(level, format, ...) \
281 do { \
282 if (debug >= level) \
283 pr_err(format "\n", ## __VA_ARGS__); \
284 } while (0)
285
286
287#define dbg(format, ...) __dbg(1, format, ## __VA_ARGS__)
288#define dbg_verbose(format, ...) __dbg(2, format, ## __VA_ARGS__)
289
290#endif
diff --git a/drivers/memstick/host/rtsx_pci_ms.c b/drivers/memstick/host/rtsx_pci_ms.c
index cf8bd727dfc7..25f8f93decb6 100644
--- a/drivers/memstick/host/rtsx_pci_ms.c
+++ b/drivers/memstick/host/rtsx_pci_ms.c
@@ -612,8 +612,6 @@ static int rtsx_pci_ms_drv_remove(struct platform_device *pdev)
612 memstick_remove_host(msh); 612 memstick_remove_host(msh);
613 memstick_free_host(msh); 613 memstick_free_host(msh);
614 614
615 platform_set_drvdata(pdev, NULL);
616
617 dev_dbg(&(pdev->dev), 615 dev_dbg(&(pdev->dev),
618 ": Realtek PCI-E Memstick controller has been removed\n"); 616 ": Realtek PCI-E Memstick controller has been removed\n");
619 617
diff --git a/drivers/platform/x86/apple-gmux.c b/drivers/platform/x86/apple-gmux.c
index f74bfcbb7bad..8eea2efbbb6d 100644
--- a/drivers/platform/x86/apple-gmux.c
+++ b/drivers/platform/x86/apple-gmux.c
@@ -393,17 +393,21 @@ static void gmux_notify_handler(acpi_handle device, u32 value, void *context)
393 complete(&gmux_data->powerchange_done); 393 complete(&gmux_data->powerchange_done);
394} 394}
395 395
396static int gmux_suspend(struct pnp_dev *pnp, pm_message_t state) 396static int gmux_suspend(struct device *dev)
397{ 397{
398 struct pnp_dev *pnp = to_pnp_dev(dev);
398 struct apple_gmux_data *gmux_data = pnp_get_drvdata(pnp); 399 struct apple_gmux_data *gmux_data = pnp_get_drvdata(pnp);
400
399 gmux_data->resume_client_id = gmux_active_client(gmux_data); 401 gmux_data->resume_client_id = gmux_active_client(gmux_data);
400 gmux_disable_interrupts(gmux_data); 402 gmux_disable_interrupts(gmux_data);
401 return 0; 403 return 0;
402} 404}
403 405
404static int gmux_resume(struct pnp_dev *pnp) 406static int gmux_resume(struct device *dev)
405{ 407{
408 struct pnp_dev *pnp = to_pnp_dev(dev);
406 struct apple_gmux_data *gmux_data = pnp_get_drvdata(pnp); 409 struct apple_gmux_data *gmux_data = pnp_get_drvdata(pnp);
410
407 gmux_enable_interrupts(gmux_data); 411 gmux_enable_interrupts(gmux_data);
408 gmux_switchto(gmux_data->resume_client_id); 412 gmux_switchto(gmux_data->resume_client_id);
409 if (gmux_data->power_state == VGA_SWITCHEROO_OFF) 413 if (gmux_data->power_state == VGA_SWITCHEROO_OFF)
@@ -605,13 +609,19 @@ static const struct pnp_device_id gmux_device_ids[] = {
605 {"", 0} 609 {"", 0}
606}; 610};
607 611
612static const struct dev_pm_ops gmux_dev_pm_ops = {
613 .suspend = gmux_suspend,
614 .resume = gmux_resume,
615};
616
608static struct pnp_driver gmux_pnp_driver = { 617static struct pnp_driver gmux_pnp_driver = {
609 .name = "apple-gmux", 618 .name = "apple-gmux",
610 .probe = gmux_probe, 619 .probe = gmux_probe,
611 .remove = gmux_remove, 620 .remove = gmux_remove,
612 .id_table = gmux_device_ids, 621 .id_table = gmux_device_ids,
613 .suspend = gmux_suspend, 622 .driver = {
614 .resume = gmux_resume 623 .pm = &gmux_dev_pm_ops,
624 },
615}; 625};
616 626
617static int __init apple_gmux_init(void) 627static int __init apple_gmux_init(void)
diff --git a/drivers/pnp/driver.c b/drivers/pnp/driver.c
index 12adb43a0693..a39ee38a9414 100644
--- a/drivers/pnp/driver.c
+++ b/drivers/pnp/driver.c
@@ -163,6 +163,13 @@ static int __pnp_bus_suspend(struct device *dev, pm_message_t state)
163 if (!pnp_drv) 163 if (!pnp_drv)
164 return 0; 164 return 0;
165 165
166 if (pnp_drv->driver.pm && pnp_drv->driver.pm->suspend) {
167 error = pnp_drv->driver.pm->suspend(dev);
168 suspend_report_result(pnp_drv->driver.pm->suspend, error);
169 if (error)
170 return error;
171 }
172
166 if (pnp_drv->suspend) { 173 if (pnp_drv->suspend) {
167 error = pnp_drv->suspend(pnp_dev, state); 174 error = pnp_drv->suspend(pnp_dev, state);
168 if (error) 175 if (error)
@@ -211,6 +218,12 @@ static int pnp_bus_resume(struct device *dev)
211 return error; 218 return error;
212 } 219 }
213 220
221 if (pnp_drv->driver.pm && pnp_drv->driver.pm->resume) {
222 error = pnp_drv->driver.pm->resume(dev);
223 if (error)
224 return error;
225 }
226
214 if (pnp_drv->resume) { 227 if (pnp_drv->resume) {
215 error = pnp_drv->resume(pnp_dev); 228 error = pnp_drv->resume(pnp_dev);
216 if (error) 229 if (error)
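
The two hunks above make the PNP bus call a driver's dev_pm_ops handlers (when present) alongside the legacy pnp_driver .suspend/.resume hooks; this is what the apple-gmux conversion above and the rtc-cmos conversion below rely on. A hedged sketch of what the PM code of a converted PNP driver looks like follows (not taken from this patch set; all "example_" names are hypothetical):

/* Hedged sketch: PNP driver PM handlers expressed as dev_pm_ops. */
#include <linux/pnp.h>
#include <linux/pm.h>

static int example_suspend(struct device *dev)
{
	struct pnp_dev *pnp = to_pnp_dev(dev);

	dev_dbg(&pnp->dev, "suspending\n");	/* quiesce the device here */
	return 0;
}

static int example_resume(struct device *dev)
{
	struct pnp_dev *pnp = to_pnp_dev(dev);

	dev_dbg(&pnp->dev, "resuming\n");	/* restore the device here */
	return 0;
}

static const struct dev_pm_ops example_pm_ops = {
	.suspend = example_suspend,
	.resume  = example_resume,
};

The pnp_driver then points .driver.pm at example_pm_ops instead of filling in .suspend/.resume, exactly as the apple-gmux diff above does.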
diff --git a/drivers/pps/clients/pps-gpio.c b/drivers/pps/clients/pps-gpio.c
index eae0eda9ff39..9966124ad988 100644
--- a/drivers/pps/clients/pps-gpio.c
+++ b/drivers/pps/clients/pps-gpio.c
@@ -184,7 +184,6 @@ static int pps_gpio_remove(struct platform_device *pdev)
184{ 184{
185 struct pps_gpio_device_data *data = platform_get_drvdata(pdev); 185 struct pps_gpio_device_data *data = platform_get_drvdata(pdev);
186 186
187 platform_set_drvdata(pdev, NULL);
188 pps_unregister_source(data->pps); 187 pps_unregister_source(data->pps);
189 dev_info(&pdev->dev, "removed IRQ %d as PPS source\n", data->irq); 188 dev_info(&pdev->dev, "removed IRQ %d as PPS source\n", data->irq);
190 return 0; 189 return 0;
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 9e3498bf302b..9654aa3c05cb 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -1249,6 +1249,15 @@ config RTC_DRV_SIRFSOC
1249 Say "yes" here to support the real time clock on SiRF SOC chips. 1249 Say "yes" here to support the real time clock on SiRF SOC chips.
1250 This driver can also be built as a module called rtc-sirfsoc. 1250 This driver can also be built as a module called rtc-sirfsoc.
1251 1251
1252config RTC_DRV_MOXART
1253 tristate "MOXA ART RTC"
1254 help
1255 If you say yes here you get support for the MOXA ART
1256 RTC module.
1257
1258 This driver can also be built as a module. If so, the module
1259 will be called rtc-moxart
1260
1252comment "HID Sensor RTC drivers" 1261comment "HID Sensor RTC drivers"
1253 1262
1254config RTC_DRV_HID_SENSOR_TIME 1263config RTC_DRV_HID_SENSOR_TIME
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index d3b4488f48f2..2dff3d2009b5 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -130,3 +130,4 @@ obj-$(CONFIG_RTC_DRV_WM831X) += rtc-wm831x.o
130obj-$(CONFIG_RTC_DRV_WM8350) += rtc-wm8350.o 130obj-$(CONFIG_RTC_DRV_WM8350) += rtc-wm8350.o
131obj-$(CONFIG_RTC_DRV_X1205) += rtc-x1205.o 131obj-$(CONFIG_RTC_DRV_X1205) += rtc-x1205.o
132obj-$(CONFIG_RTC_DRV_SIRFSOC) += rtc-sirfsoc.o 132obj-$(CONFIG_RTC_DRV_SIRFSOC) += rtc-sirfsoc.o
133obj-$(CONFIG_RTC_DRV_MOXART) += rtc-moxart.o
diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c
index be06d7150de5..24e733c98f8b 100644
--- a/drivers/rtc/rtc-cmos.c
+++ b/drivers/rtc/rtc-cmos.c
@@ -1018,23 +1018,6 @@ static void __exit cmos_pnp_remove(struct pnp_dev *pnp)
1018 cmos_do_remove(&pnp->dev); 1018 cmos_do_remove(&pnp->dev);
1019} 1019}
1020 1020
1021#ifdef CONFIG_PM
1022
1023static int cmos_pnp_suspend(struct pnp_dev *pnp, pm_message_t mesg)
1024{
1025 return cmos_suspend(&pnp->dev);
1026}
1027
1028static int cmos_pnp_resume(struct pnp_dev *pnp)
1029{
1030 return cmos_resume(&pnp->dev);
1031}
1032
1033#else
1034#define cmos_pnp_suspend NULL
1035#define cmos_pnp_resume NULL
1036#endif
1037
1038static void cmos_pnp_shutdown(struct pnp_dev *pnp) 1021static void cmos_pnp_shutdown(struct pnp_dev *pnp)
1039{ 1022{
1040 if (system_state == SYSTEM_POWER_OFF && !cmos_poweroff(&pnp->dev)) 1023 if (system_state == SYSTEM_POWER_OFF && !cmos_poweroff(&pnp->dev))
@@ -1060,8 +1043,11 @@ static struct pnp_driver cmos_pnp_driver = {
1060 1043
1061 /* flag ensures resume() gets called, and stops syslog spam */ 1044 /* flag ensures resume() gets called, and stops syslog spam */
1062 .flags = PNP_DRIVER_RES_DO_NOT_CHANGE, 1045 .flags = PNP_DRIVER_RES_DO_NOT_CHANGE,
1063 .suspend = cmos_pnp_suspend, 1046#ifdef CONFIG_PM_SLEEP
1064 .resume = cmos_pnp_resume, 1047 .driver = {
1048 .pm = &cmos_pm_ops,
1049 },
1050#endif
1065}; 1051};
1066 1052
1067#endif /* CONFIG_PNP */ 1053#endif /* CONFIG_PNP */
diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c
index 308a8fefe76f..bc7b4fcf603c 100644
--- a/drivers/rtc/rtc-ds1511.c
+++ b/drivers/rtc/rtc-ds1511.c
@@ -89,7 +89,6 @@ enum ds1511reg {
89struct rtc_plat_data { 89struct rtc_plat_data {
90 struct rtc_device *rtc; 90 struct rtc_device *rtc;
91 void __iomem *ioaddr; /* virtual base address */ 91 void __iomem *ioaddr; /* virtual base address */
92 int size; /* amount of memory mapped */
93 int irq; 92 int irq;
94 unsigned int irqen; 93 unsigned int irqen;
95 int alrm_sec; 94 int alrm_sec;
@@ -479,20 +478,14 @@ static int ds1511_rtc_probe(struct platform_device *pdev)
479 struct rtc_plat_data *pdata; 478 struct rtc_plat_data *pdata;
480 int ret = 0; 479 int ret = 0;
481 480
482 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
483 if (!res)
484 return -ENODEV;
485
486 pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); 481 pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
487 if (!pdata) 482 if (!pdata)
488 return -ENOMEM; 483 return -ENOMEM;
489 pdata->size = resource_size(res); 484
490 if (!devm_request_mem_region(&pdev->dev, res->start, pdata->size, 485 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
491 pdev->name)) 486 ds1511_base = devm_ioremap_resource(&pdev->dev, res);
492 return -EBUSY; 487 if (IS_ERR(ds1511_base))
493 ds1511_base = devm_ioremap(&pdev->dev, res->start, pdata->size); 488 return PTR_ERR(ds1511_base);
494 if (!ds1511_base)
495 return -ENOMEM;
496 pdata->ioaddr = ds1511_base; 489 pdata->ioaddr = ds1511_base;
497 pdata->irq = platform_get_irq(pdev, 0); 490 pdata->irq = platform_get_irq(pdev, 0);
498 491
diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c
index 8c6c952e90b1..fd31571941f5 100644
--- a/drivers/rtc/rtc-ds1553.c
+++ b/drivers/rtc/rtc-ds1553.c
@@ -285,19 +285,14 @@ static int ds1553_rtc_probe(struct platform_device *pdev)
285 void __iomem *ioaddr; 285 void __iomem *ioaddr;
286 int ret = 0; 286 int ret = 0;
287 287
288 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
289 if (!res)
290 return -ENODEV;
291 pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); 288 pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
292 if (!pdata) 289 if (!pdata)
293 return -ENOMEM; 290 return -ENOMEM;
294 if (!devm_request_mem_region(&pdev->dev, res->start, RTC_REG_SIZE,
295 pdev->name))
296 return -EBUSY;
297 291
298 ioaddr = devm_ioremap(&pdev->dev, res->start, RTC_REG_SIZE); 292 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
299 if (!ioaddr) 293 ioaddr = devm_ioremap_resource(&pdev->dev, res);
300 return -ENOMEM; 294 if (IS_ERR(ioaddr))
295 return PTR_ERR(ioaddr);
301 pdata->ioaddr = ioaddr; 296 pdata->ioaddr = ioaddr;
302 pdata->irq = platform_get_irq(pdev, 0); 297 pdata->irq = platform_get_irq(pdev, 0);
303 298
diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c
index eccdc62ae1c0..17b73fdc3b6e 100644
--- a/drivers/rtc/rtc-ds1742.c
+++ b/drivers/rtc/rtc-ds1742.c
@@ -52,11 +52,9 @@
52#define RTC_BATT_FLAG 0x80 52#define RTC_BATT_FLAG 0x80
53 53
54struct rtc_plat_data { 54struct rtc_plat_data {
55 struct rtc_device *rtc;
56 void __iomem *ioaddr_nvram; 55 void __iomem *ioaddr_nvram;
57 void __iomem *ioaddr_rtc; 56 void __iomem *ioaddr_rtc;
58 size_t size_nvram; 57 size_t size_nvram;
59 size_t size;
60 unsigned long last_jiffies; 58 unsigned long last_jiffies;
61 struct bin_attribute nvram_attr; 59 struct bin_attribute nvram_attr;
62}; 60};
@@ -117,11 +115,7 @@ static int ds1742_rtc_read_time(struct device *dev, struct rtc_time *tm)
117 /* year is 1900 + tm->tm_year */ 115 /* year is 1900 + tm->tm_year */
118 tm->tm_year = bcd2bin(year) + bcd2bin(century) * 100 - 1900; 116 tm->tm_year = bcd2bin(year) + bcd2bin(century) * 100 - 1900;
119 117
120 if (rtc_valid_tm(tm) < 0) { 118 return rtc_valid_tm(tm);
121 dev_err(dev, "retrieved date/time is not valid.\n");
122 rtc_time_to_tm(0, tm);
123 }
124 return 0;
125} 119}
126 120
127static const struct rtc_class_ops ds1742_rtc_ops = { 121static const struct rtc_class_ops ds1742_rtc_ops = {
@@ -168,22 +162,17 @@ static int ds1742_rtc_probe(struct platform_device *pdev)
168 void __iomem *ioaddr; 162 void __iomem *ioaddr;
169 int ret = 0; 163 int ret = 0;
170 164
171 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
172 if (!res)
173 return -ENODEV;
174 pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); 165 pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
175 if (!pdata) 166 if (!pdata)
176 return -ENOMEM; 167 return -ENOMEM;
177 pdata->size = resource_size(res); 168
178 if (!devm_request_mem_region(&pdev->dev, res->start, pdata->size, 169 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
179 pdev->name)) 170 ioaddr = devm_ioremap_resource(&pdev->dev, res);
180 return -EBUSY; 171 if (IS_ERR(ioaddr))
181 ioaddr = devm_ioremap(&pdev->dev, res->start, pdata->size); 172 return PTR_ERR(ioaddr);
182 if (!ioaddr)
183 return -ENOMEM;
184 173
185 pdata->ioaddr_nvram = ioaddr; 174 pdata->ioaddr_nvram = ioaddr;
186 pdata->size_nvram = pdata->size - RTC_SIZE; 175 pdata->size_nvram = resource_size(res) - RTC_SIZE;
187 pdata->ioaddr_rtc = ioaddr + pdata->size_nvram; 176 pdata->ioaddr_rtc = ioaddr + pdata->size_nvram;
188 177
189 sysfs_bin_attr_init(&pdata->nvram_attr); 178 sysfs_bin_attr_init(&pdata->nvram_attr);
@@ -212,7 +201,6 @@ static int ds1742_rtc_probe(struct platform_device *pdev)
212 &ds1742_rtc_ops, THIS_MODULE); 201 &ds1742_rtc_ops, THIS_MODULE);
213 if (IS_ERR(rtc)) 202 if (IS_ERR(rtc))
214 return PTR_ERR(rtc); 203 return PTR_ERR(rtc);
215 pdata->rtc = rtc;
216 204
217 ret = sysfs_create_bin_file(&pdev->dev.kobj, &pdata->nvram_attr); 205 ret = sysfs_create_bin_file(&pdev->dev.kobj, &pdata->nvram_attr);
218 206
diff --git a/drivers/rtc/rtc-ep93xx.c b/drivers/rtc/rtc-ep93xx.c
index 549b3c3792d2..580e7b56bde8 100644
--- a/drivers/rtc/rtc-ep93xx.c
+++ b/drivers/rtc/rtc-ep93xx.c
@@ -138,17 +138,9 @@ static int ep93xx_rtc_probe(struct platform_device *pdev)
138 return -ENOMEM; 138 return -ENOMEM;
139 139
140 res = platform_get_resource(pdev, IORESOURCE_MEM, 0); 140 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
141 if (!res) 141 ep93xx_rtc->mmio_base = devm_ioremap_resource(&pdev->dev, res);
142 return -ENXIO; 142 if (IS_ERR(ep93xx_rtc->mmio_base))
143 143 return PTR_ERR(ep93xx_rtc->mmio_base);
144 if (!devm_request_mem_region(&pdev->dev, res->start,
145 resource_size(res), pdev->name))
146 return -EBUSY;
147
148 ep93xx_rtc->mmio_base = devm_ioremap(&pdev->dev, res->start,
149 resource_size(res));
150 if (!ep93xx_rtc->mmio_base)
151 return -ENXIO;
152 144
153 pdev->dev.platform_data = ep93xx_rtc; 145 pdev->dev.platform_data = ep93xx_rtc;
154 platform_set_drvdata(pdev, ep93xx_rtc); 146 platform_set_drvdata(pdev, ep93xx_rtc);
diff --git a/drivers/rtc/rtc-hid-sensor-time.c b/drivers/rtc/rtc-hid-sensor-time.c
index 7273b0139e5c..4e2a81854f51 100644
--- a/drivers/rtc/rtc-hid-sensor-time.c
+++ b/drivers/rtc/rtc-hid-sensor-time.c
@@ -23,10 +23,6 @@
23#include <linux/iio/iio.h> 23#include <linux/iio/iio.h>
24#include <linux/rtc.h> 24#include <linux/rtc.h>
25 25
26/* Format: HID-SENSOR-usage_id_in_hex */
27/* Usage ID from spec for Time: 0x2000A0 */
28#define DRIVER_NAME "HID-SENSOR-2000a0" /* must be lowercase */
29
30enum hid_time_channel { 26enum hid_time_channel {
31 CHANNEL_SCAN_INDEX_YEAR, 27 CHANNEL_SCAN_INDEX_YEAR,
32 CHANNEL_SCAN_INDEX_MONTH, 28 CHANNEL_SCAN_INDEX_MONTH,
@@ -283,9 +279,11 @@ static int hid_time_probe(struct platform_device *pdev)
283 "hid-sensor-time", &hid_time_rtc_ops, 279 "hid-sensor-time", &hid_time_rtc_ops,
284 THIS_MODULE); 280 THIS_MODULE);
285 281
286 if (IS_ERR(time_state->rtc)) { 282 if (IS_ERR_OR_NULL(time_state->rtc)) {
283 ret = time_state->rtc ? PTR_ERR(time_state->rtc) : -ENODEV;
284 time_state->rtc = NULL;
285 sensor_hub_remove_callback(hsdev, HID_USAGE_SENSOR_TIME);
287 dev_err(&pdev->dev, "rtc device register failed!\n"); 286 dev_err(&pdev->dev, "rtc device register failed!\n");
288 return PTR_ERR(time_state->rtc);
289 } 287 }
290 288
291 return ret; 289 return ret;
@@ -300,9 +298,19 @@ static int hid_time_remove(struct platform_device *pdev)
300 return 0; 298 return 0;
301} 299}
302 300
301static struct platform_device_id hid_time_ids[] = {
302 {
303 /* Format: HID-SENSOR-usage_id_in_hex_lowercase */
304 .name = "HID-SENSOR-2000a0",
305 },
306 { /* sentinel */ }
307};
308MODULE_DEVICE_TABLE(platform, hid_time_ids);
309
303static struct platform_driver hid_time_platform_driver = { 310static struct platform_driver hid_time_platform_driver = {
311 .id_table = hid_time_ids,
304 .driver = { 312 .driver = {
305 .name = DRIVER_NAME, 313 .name = KBUILD_MODNAME,
306 .owner = THIS_MODULE, 314 .owner = THIS_MODULE,
307 }, 315 },
308 .probe = hid_time_probe, 316 .probe = hid_time_probe,
diff --git a/drivers/rtc/rtc-imxdi.c b/drivers/rtc/rtc-imxdi.c
index d3a8c8e255de..abd7f9091f34 100644
--- a/drivers/rtc/rtc-imxdi.c
+++ b/drivers/rtc/rtc-imxdi.c
@@ -375,24 +375,16 @@ static int __init dryice_rtc_probe(struct platform_device *pdev)
375 struct imxdi_dev *imxdi; 375 struct imxdi_dev *imxdi;
376 int rc; 376 int rc;
377 377
378 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
379 if (!res)
380 return -ENODEV;
381
382 imxdi = devm_kzalloc(&pdev->dev, sizeof(*imxdi), GFP_KERNEL); 378 imxdi = devm_kzalloc(&pdev->dev, sizeof(*imxdi), GFP_KERNEL);
383 if (!imxdi) 379 if (!imxdi)
384 return -ENOMEM; 380 return -ENOMEM;
385 381
386 imxdi->pdev = pdev; 382 imxdi->pdev = pdev;
387 383
388 if (!devm_request_mem_region(&pdev->dev, res->start, resource_size(res), 384 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
389 pdev->name)) 385 imxdi->ioaddr = devm_ioremap_resource(&pdev->dev, res);
390 return -EBUSY; 386 if (IS_ERR(imxdi->ioaddr))
391 387 return PTR_ERR(imxdi->ioaddr);
392 imxdi->ioaddr = devm_ioremap(&pdev->dev, res->start,
393 resource_size(res));
394 if (imxdi->ioaddr == NULL)
395 return -ENOMEM;
396 388
397 spin_lock_init(&imxdi->irq_lock); 389 spin_lock_init(&imxdi->irq_lock);
398 390
diff --git a/drivers/rtc/rtc-lpc32xx.c b/drivers/rtc/rtc-lpc32xx.c
index 8276ae94a2a9..bfdbcb82d069 100644
--- a/drivers/rtc/rtc-lpc32xx.c
+++ b/drivers/rtc/rtc-lpc32xx.c
@@ -201,16 +201,9 @@ static int lpc32xx_rtc_probe(struct platform_device *pdev)
201{ 201{
202 struct resource *res; 202 struct resource *res;
203 struct lpc32xx_rtc *rtc; 203 struct lpc32xx_rtc *rtc;
204 resource_size_t size;
205 int rtcirq; 204 int rtcirq;
206 u32 tmp; 205 u32 tmp;
207 206
208 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
209 if (!res) {
210 dev_err(&pdev->dev, "Can't get memory resource\n");
211 return -ENOENT;
212 }
213
214 rtcirq = platform_get_irq(pdev, 0); 207 rtcirq = platform_get_irq(pdev, 0);
215 if (rtcirq < 0 || rtcirq >= NR_IRQS) { 208 if (rtcirq < 0 || rtcirq >= NR_IRQS) {
216 dev_warn(&pdev->dev, "Can't get interrupt resource\n"); 209 dev_warn(&pdev->dev, "Can't get interrupt resource\n");
@@ -224,19 +217,10 @@ static int lpc32xx_rtc_probe(struct platform_device *pdev)
224 } 217 }
225 rtc->irq = rtcirq; 218 rtc->irq = rtcirq;
226 219
227 size = resource_size(res); 220 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
228 221 rtc->rtc_base = devm_ioremap_resource(&pdev->dev, res);
229 if (!devm_request_mem_region(&pdev->dev, res->start, size, 222 if (IS_ERR(rtc->rtc_base))
230 pdev->name)) { 223 return PTR_ERR(rtc->rtc_base);
231 dev_err(&pdev->dev, "RTC registers are not free\n");
232 return -EBUSY;
233 }
234
235 rtc->rtc_base = devm_ioremap(&pdev->dev, res->start, size);
236 if (!rtc->rtc_base) {
237 dev_err(&pdev->dev, "Can't map memory\n");
238 return -ENOMEM;
239 }
240 224
241 spin_lock_init(&rtc->lock); 225 spin_lock_init(&rtc->lock);
242 226
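
The RTC probe conversions above (rtc-ds1511, rtc-ds1553, rtc-ds1742, rtc-ep93xx, rtc-imxdi, rtc-lpc32xx) and rtc-mv further below all collapse the platform_get_resource() / devm_request_mem_region() / devm_ioremap() sequence into a single devm_ioremap_resource() call. A hedged sketch of the resulting probe pattern (the "example_" name is hypothetical):

/* Hedged sketch of the devm_ioremap_resource() pattern used by these probes. */
#include <linux/device.h>
#include <linux/err.h>
#include <linux/io.h>
#include <linux/platform_device.h>

static int example_probe(struct platform_device *pdev)
{
	struct resource *res;
	void __iomem *base;

	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	base = devm_ioremap_resource(&pdev->dev, res);
	if (IS_ERR(base))	/* covers a missing resource, a busy region and an ioremap failure */
		return PTR_ERR(base);

	/* register access goes through 'base'; the mapping is released automatically */
	return 0;
}

Besides being shorter, this moves the error reporting into devm_ioremap_resource(), which is why the dev_err() calls disappear from the lpc32xx probe above.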
diff --git a/drivers/rtc/rtc-max77686.c b/drivers/rtc/rtc-max77686.c
index 9915cb96014b..9efe118a28ba 100644
--- a/drivers/rtc/rtc-max77686.c
+++ b/drivers/rtc/rtc-max77686.c
@@ -240,9 +240,9 @@ static int max77686_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
240 } 240 }
241 241
242 alrm->pending = 0; 242 alrm->pending = 0;
243 ret = regmap_read(info->max77686->regmap, MAX77686_REG_STATUS1, &val); 243 ret = regmap_read(info->max77686->regmap, MAX77686_REG_STATUS2, &val);
244 if (ret < 0) { 244 if (ret < 0) {
245 dev_err(info->dev, "%s:%d fail to read status1 reg(%d)\n", 245 dev_err(info->dev, "%s:%d fail to read status2 reg(%d)\n",
246 __func__, __LINE__, ret); 246 __func__, __LINE__, ret);
247 goto out; 247 goto out;
248 } 248 }
diff --git a/drivers/rtc/rtc-moxart.c b/drivers/rtc/rtc-moxart.c
new file mode 100644
index 000000000000..c29dee0946e6
--- /dev/null
+++ b/drivers/rtc/rtc-moxart.c
@@ -0,0 +1,330 @@
1/*
2 * MOXA ART RTC driver.
3 *
4 * Copyright (C) 2013 Jonas Jensen
5 *
6 * Jonas Jensen <jonas.jensen@gmail.com>
7 *
8 * Based on code from
9 * Moxa Technology Co., Ltd. <www.moxa.com>
10 *
11 * This file is licensed under the terms of the GNU General Public
12 * License version 2. This program is licensed "as is" without any
13 * warranty of any kind, whether express or implied.
14 */
15
16#include <linux/init.h>
17#include <linux/kernel.h>
18#include <linux/delay.h>
19#include <linux/rtc.h>
20#include <linux/platform_device.h>
21#include <linux/module.h>
22#include <linux/gpio.h>
23#include <linux/of_gpio.h>
24
25#define GPIO_RTC_RESERVED 0x0C
26#define GPIO_RTC_DATA_SET 0x10
27#define GPIO_RTC_DATA_CLEAR 0x14
28#define GPIO_RTC_PIN_PULL_ENABLE 0x18
29#define GPIO_RTC_PIN_PULL_TYPE 0x1C
30#define GPIO_RTC_INT_ENABLE 0x20
31#define GPIO_RTC_INT_RAW_STATE 0x24
32#define GPIO_RTC_INT_MASKED_STATE 0x28
33#define GPIO_RTC_INT_MASK 0x2C
34#define GPIO_RTC_INT_CLEAR 0x30
35#define GPIO_RTC_INT_TRIGGER 0x34
36#define GPIO_RTC_INT_BOTH 0x38
37#define GPIO_RTC_INT_RISE_NEG 0x3C
38#define GPIO_RTC_BOUNCE_ENABLE 0x40
39#define GPIO_RTC_BOUNCE_PRE_SCALE 0x44
40#define GPIO_RTC_PROTECT_W 0x8E
41#define GPIO_RTC_PROTECT_R 0x8F
42#define GPIO_RTC_YEAR_W 0x8C
43#define GPIO_RTC_YEAR_R 0x8D
44#define GPIO_RTC_DAY_W 0x8A
45#define GPIO_RTC_DAY_R 0x8B
46#define GPIO_RTC_MONTH_W 0x88
47#define GPIO_RTC_MONTH_R 0x89
48#define GPIO_RTC_DATE_W 0x86
49#define GPIO_RTC_DATE_R 0x87
50#define GPIO_RTC_HOURS_W 0x84
51#define GPIO_RTC_HOURS_R 0x85
52#define GPIO_RTC_MINUTES_W 0x82
53#define GPIO_RTC_MINUTES_R 0x83
54#define GPIO_RTC_SECONDS_W 0x80
55#define GPIO_RTC_SECONDS_R 0x81
56#define GPIO_RTC_DELAY_TIME 8
57
58struct moxart_rtc {
59 struct rtc_device *rtc;
60 spinlock_t rtc_lock;
61 int gpio_data, gpio_sclk, gpio_reset;
62};
63
64static int day_of_year[12] = { 0, 31, 59, 90, 120, 151, 181,
65 212, 243, 273, 304, 334 };
66
67static void moxart_rtc_write_byte(struct device *dev, u8 data)
68{
69 struct moxart_rtc *moxart_rtc = dev_get_drvdata(dev);
70 int i;
71
72 for (i = 0; i < 8; i++, data >>= 1) {
73 gpio_set_value(moxart_rtc->gpio_sclk, 0);
74 gpio_set_value(moxart_rtc->gpio_data, ((data & 1) == 1));
75 udelay(GPIO_RTC_DELAY_TIME);
76 gpio_set_value(moxart_rtc->gpio_sclk, 1);
77 udelay(GPIO_RTC_DELAY_TIME);
78 }
79}
80
81static u8 moxart_rtc_read_byte(struct device *dev)
82{
83 struct moxart_rtc *moxart_rtc = dev_get_drvdata(dev);
84 int i;
85 u8 data = 0;
86
87 for (i = 0; i < 8; i++) {
88 gpio_set_value(moxart_rtc->gpio_sclk, 0);
89 udelay(GPIO_RTC_DELAY_TIME);
90 gpio_set_value(moxart_rtc->gpio_sclk, 1);
91 udelay(GPIO_RTC_DELAY_TIME);
92 if (gpio_get_value(moxart_rtc->gpio_data))
93 data |= (1 << i);
94 udelay(GPIO_RTC_DELAY_TIME);
95 }
96 return data;
97}
98
99static u8 moxart_rtc_read_register(struct device *dev, u8 cmd)
100{
101 struct moxart_rtc *moxart_rtc = dev_get_drvdata(dev);
102 u8 data;
103 unsigned long flags;
104
105 local_irq_save(flags);
106
107 gpio_direction_output(moxart_rtc->gpio_data, 0);
108 gpio_set_value(moxart_rtc->gpio_reset, 1);
109 udelay(GPIO_RTC_DELAY_TIME);
110 moxart_rtc_write_byte(dev, cmd);
111 gpio_direction_input(moxart_rtc->gpio_data);
112 udelay(GPIO_RTC_DELAY_TIME);
113 data = moxart_rtc_read_byte(dev);
114 gpio_set_value(moxart_rtc->gpio_sclk, 0);
115 gpio_set_value(moxart_rtc->gpio_reset, 0);
116 udelay(GPIO_RTC_DELAY_TIME);
117
118 local_irq_restore(flags);
119
120 return data;
121}
122
123static void moxart_rtc_write_register(struct device *dev, u8 cmd, u8 data)
124{
125 struct moxart_rtc *moxart_rtc = dev_get_drvdata(dev);
126 unsigned long flags;
127
128 local_irq_save(flags);
129
130 gpio_direction_output(moxart_rtc->gpio_data, 0);
131 gpio_set_value(moxart_rtc->gpio_reset, 1);
132 udelay(GPIO_RTC_DELAY_TIME);
133 moxart_rtc_write_byte(dev, cmd);
134 moxart_rtc_write_byte(dev, data);
135 gpio_set_value(moxart_rtc->gpio_sclk, 0);
136 gpio_set_value(moxart_rtc->gpio_reset, 0);
137 udelay(GPIO_RTC_DELAY_TIME);
138
139 local_irq_restore(flags);
140}
141
142static int moxart_rtc_set_time(struct device *dev, struct rtc_time *tm)
143{
144 struct moxart_rtc *moxart_rtc = dev_get_drvdata(dev);
145
146 spin_lock_irq(&moxart_rtc->rtc_lock);
147
148 moxart_rtc_write_register(dev, GPIO_RTC_PROTECT_W, 0);
149 moxart_rtc_write_register(dev, GPIO_RTC_YEAR_W,
150 (((tm->tm_year - 100) / 10) << 4) |
151 ((tm->tm_year - 100) % 10));
152
153 moxart_rtc_write_register(dev, GPIO_RTC_MONTH_W,
154 (((tm->tm_mon + 1) / 10) << 4) |
155 ((tm->tm_mon + 1) % 10));
156
157 moxart_rtc_write_register(dev, GPIO_RTC_DATE_W,
158 ((tm->tm_mday / 10) << 4) |
159 (tm->tm_mday % 10));
160
161 moxart_rtc_write_register(dev, GPIO_RTC_HOURS_W,
162 ((tm->tm_hour / 10) << 4) |
163 (tm->tm_hour % 10));
164
165 moxart_rtc_write_register(dev, GPIO_RTC_MINUTES_W,
166 ((tm->tm_min / 10) << 4) |
167 (tm->tm_min % 10));
168
169 moxart_rtc_write_register(dev, GPIO_RTC_SECONDS_W,
170 ((tm->tm_sec / 10) << 4) |
171 (tm->tm_sec % 10));
172
173 moxart_rtc_write_register(dev, GPIO_RTC_PROTECT_W, 0x80);
174
175 spin_unlock_irq(&moxart_rtc->rtc_lock);
176
177 dev_dbg(dev, "%s: success tm_year=%d tm_mon=%d\n"
178 "tm_mday=%d tm_hour=%d tm_min=%d tm_sec=%d\n",
179 __func__, tm->tm_year, tm->tm_mon, tm->tm_mday,
180 tm->tm_hour, tm->tm_min, tm->tm_sec);
181
182 return 0;
183}
184
185static int moxart_rtc_read_time(struct device *dev, struct rtc_time *tm)
186{
187 struct moxart_rtc *moxart_rtc = dev_get_drvdata(dev);
188 unsigned char v;
189
190 spin_lock_irq(&moxart_rtc->rtc_lock);
191
192 v = moxart_rtc_read_register(dev, GPIO_RTC_SECONDS_R);
193 tm->tm_sec = (((v & 0x70) >> 4) * 10) + (v & 0x0F);
194
195 v = moxart_rtc_read_register(dev, GPIO_RTC_MINUTES_R);
196 tm->tm_min = (((v & 0x70) >> 4) * 10) + (v & 0x0F);
197
198 v = moxart_rtc_read_register(dev, GPIO_RTC_HOURS_R);
199 if (v & 0x80) { /* 12-hour mode */
200 tm->tm_hour = (((v & 0x10) >> 4) * 10) + (v & 0x0F);
201 if (v & 0x20) { /* PM mode */
202 tm->tm_hour += 12;
203 if (tm->tm_hour >= 24)
204 tm->tm_hour = 0;
205 }
206 } else { /* 24-hour mode */
207 tm->tm_hour = (((v & 0x30) >> 4) * 10) + (v & 0x0F);
208 }
209
210 v = moxart_rtc_read_register(dev, GPIO_RTC_DATE_R);
211 tm->tm_mday = (((v & 0x30) >> 4) * 10) + (v & 0x0F);
212
213 v = moxart_rtc_read_register(dev, GPIO_RTC_MONTH_R);
214 tm->tm_mon = (((v & 0x10) >> 4) * 10) + (v & 0x0F);
215 tm->tm_mon--;
216
217 v = moxart_rtc_read_register(dev, GPIO_RTC_YEAR_R);
218 tm->tm_year = (((v & 0xF0) >> 4) * 10) + (v & 0x0F);
219 tm->tm_year += 100;
220 if (tm->tm_year <= 69)
221 tm->tm_year += 100;
222
223 v = moxart_rtc_read_register(dev, GPIO_RTC_DAY_R);
224 tm->tm_wday = (v & 0x0f) - 1;
225 tm->tm_yday = day_of_year[tm->tm_mon];
226 tm->tm_yday += (tm->tm_mday - 1);
227 if (tm->tm_mon >= 2) {
228 if (!(tm->tm_year % 4) && (tm->tm_year % 100))
229 tm->tm_yday++;
230 }
231
232 tm->tm_isdst = 0;
233
234 spin_unlock_irq(&moxart_rtc->rtc_lock);
235
236 return 0;
237}
238
239static const struct rtc_class_ops moxart_rtc_ops = {
240 .read_time = moxart_rtc_read_time,
241 .set_time = moxart_rtc_set_time,
242};
243
244static int moxart_rtc_probe(struct platform_device *pdev)
245{
246 struct moxart_rtc *moxart_rtc;
247 int ret = 0;
248
249 moxart_rtc = devm_kzalloc(&pdev->dev, sizeof(*moxart_rtc), GFP_KERNEL);
250 if (!moxart_rtc) {
251 dev_err(&pdev->dev, "devm_kzalloc failed\n");
252 return -ENOMEM;
253 }
254
255 moxart_rtc->gpio_data = of_get_named_gpio(pdev->dev.of_node,
256 "gpio-rtc-data", 0);
257 if (!gpio_is_valid(moxart_rtc->gpio_data)) {
258 dev_err(&pdev->dev, "invalid gpio (data): %d\n",
259 moxart_rtc->gpio_data);
260 return moxart_rtc->gpio_data;
261 }
262
263 moxart_rtc->gpio_sclk = of_get_named_gpio(pdev->dev.of_node,
264 "gpio-rtc-sclk", 0);
265 if (!gpio_is_valid(moxart_rtc->gpio_sclk)) {
266 dev_err(&pdev->dev, "invalid gpio (sclk): %d\n",
267 moxart_rtc->gpio_sclk);
268 return moxart_rtc->gpio_sclk;
269 }
270
271 moxart_rtc->gpio_reset = of_get_named_gpio(pdev->dev.of_node,
272 "gpio-rtc-reset", 0);
273 if (!gpio_is_valid(moxart_rtc->gpio_reset)) {
274 dev_err(&pdev->dev, "invalid gpio (reset): %d\n",
275 moxart_rtc->gpio_reset);
276 return moxart_rtc->gpio_reset;
277 }
278
279 spin_lock_init(&moxart_rtc->rtc_lock);
280 platform_set_drvdata(pdev, moxart_rtc);
281
282 ret = devm_gpio_request(&pdev->dev, moxart_rtc->gpio_data, "rtc_data");
283 if (ret) {
284 dev_err(&pdev->dev, "can't get rtc_data gpio\n");
285 return ret;
286 }
287
288 ret = devm_gpio_request_one(&pdev->dev, moxart_rtc->gpio_sclk,
289 GPIOF_DIR_OUT, "rtc_sclk");
290 if (ret) {
291 dev_err(&pdev->dev, "can't get rtc_sclk gpio\n");
292 return ret;
293 }
294
295 ret = devm_gpio_request_one(&pdev->dev, moxart_rtc->gpio_reset,
296 GPIOF_DIR_OUT, "rtc_reset");
297 if (ret) {
298 dev_err(&pdev->dev, "can't get rtc_reset gpio\n");
299 return ret;
300 }
301
302 moxart_rtc->rtc = devm_rtc_device_register(&pdev->dev, pdev->name,
303 &moxart_rtc_ops,
304 THIS_MODULE);
305 if (IS_ERR(moxart_rtc->rtc)) {
306 dev_err(&pdev->dev, "devm_rtc_device_register failed\n");
307 return PTR_ERR(moxart_rtc->rtc);
308 }
309
310 return 0;
311}
312
313static const struct of_device_id moxart_rtc_match[] = {
314 { .compatible = "moxa,moxart-rtc" },
315 { },
316};
317
318static struct platform_driver moxart_rtc_driver = {
319 .probe = moxart_rtc_probe,
320 .driver = {
321 .name = "moxart-rtc",
322 .owner = THIS_MODULE,
323 .of_match_table = moxart_rtc_match,
324 },
325};
326module_platform_driver(moxart_rtc_driver);
327
328MODULE_DESCRIPTION("MOXART RTC driver");
329MODULE_LICENSE("GPL");
330MODULE_AUTHOR("Jonas Jensen <jonas.jensen@gmail.com>");
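
The set_time/read_time pair above packs and unpacks every field as binary-coded decimal by hand, e.g. ((tm->tm_hour / 10) << 4) | (tm->tm_hour % 10). A minimal sketch, not part of this patch, of the same packing expressed with the bin2bcd() helper from <linux/bcd.h>; the register names are the ones defined earlier in this driver, and the write-protect sequencing and locking are omitted:

	#include <linux/bcd.h>

	/* bin2bcd(x) == ((x / 10) << 4) + (x % 10), so each field can be
	 * converted in one call instead of the open-coded arithmetic. */
	static void moxart_rtc_write_time_sketch(struct device *dev, struct rtc_time *tm)
	{
		moxart_rtc_write_register(dev, GPIO_RTC_YEAR_W, bin2bcd(tm->tm_year - 100));
		moxart_rtc_write_register(dev, GPIO_RTC_MONTH_W, bin2bcd(tm->tm_mon + 1));
		moxart_rtc_write_register(dev, GPIO_RTC_DATE_W, bin2bcd(tm->tm_mday));
		moxart_rtc_write_register(dev, GPIO_RTC_HOURS_W, bin2bcd(tm->tm_hour));
		moxart_rtc_write_register(dev, GPIO_RTC_MINUTES_W, bin2bcd(tm->tm_min));
		moxart_rtc_write_register(dev, GPIO_RTC_SECONDS_W, bin2bcd(tm->tm_sec));
	}
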
diff --git a/drivers/rtc/rtc-mv.c b/drivers/rtc/rtc-mv.c
index baab802f2153..d536c5962c99 100644
--- a/drivers/rtc/rtc-mv.c
+++ b/drivers/rtc/rtc-mv.c
@@ -221,26 +221,17 @@ static int __init mv_rtc_probe(struct platform_device *pdev)
221{ 221{
222 struct resource *res; 222 struct resource *res;
223 struct rtc_plat_data *pdata; 223 struct rtc_plat_data *pdata;
224 resource_size_t size;
225 u32 rtc_time; 224 u32 rtc_time;
226 int ret = 0; 225 int ret = 0;
227 226
228 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
229 if (!res)
230 return -ENODEV;
231
232 pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); 227 pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
233 if (!pdata) 228 if (!pdata)
234 return -ENOMEM; 229 return -ENOMEM;
235 230
236 size = resource_size(res); 231 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
237 if (!devm_request_mem_region(&pdev->dev, res->start, size, 232 pdata->ioaddr = devm_ioremap_resource(&pdev->dev, res);
238 pdev->name)) 233 if (IS_ERR(pdata->ioaddr))
239 return -EBUSY; 234 return PTR_ERR(pdata->ioaddr);
240
241 pdata->ioaddr = devm_ioremap(&pdev->dev, res->start, size);
242 if (!pdata->ioaddr)
243 return -ENOMEM;
244 235
245 pdata->clk = devm_clk_get(&pdev->dev, NULL); 236 pdata->clk = devm_clk_get(&pdev->dev, NULL);
246 /* Not all SoCs require a clock.*/ 237 /* Not all SoCs require a clock.*/
diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c
index ab87bacb8f88..50c572645546 100644
--- a/drivers/rtc/rtc-mxc.c
+++ b/drivers/rtc/rtc-mxc.c
@@ -377,22 +377,16 @@ static int mxc_rtc_probe(struct platform_device *pdev)
377 unsigned long rate; 377 unsigned long rate;
378 int ret; 378 int ret;
379 379
380 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
381 if (!res)
382 return -ENODEV;
383
384 pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); 380 pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
385 if (!pdata) 381 if (!pdata)
386 return -ENOMEM; 382 return -ENOMEM;
387 383
388 pdata->devtype = pdev->id_entry->driver_data; 384 pdata->devtype = pdev->id_entry->driver_data;
389 385
390 if (!devm_request_mem_region(&pdev->dev, res->start, 386 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
391 resource_size(res), pdev->name)) 387 pdata->ioaddr = devm_ioremap_resource(&pdev->dev, res);
392 return -EBUSY; 388 if (IS_ERR(pdata->ioaddr))
393 389 return PTR_ERR(pdata->ioaddr);
394 pdata->ioaddr = devm_ioremap(&pdev->dev, res->start,
395 resource_size(res));
396 390
397 pdata->clk = devm_clk_get(&pdev->dev, NULL); 391 pdata->clk = devm_clk_get(&pdev->dev, NULL);
398 if (IS_ERR(pdata->clk)) { 392 if (IS_ERR(pdata->clk)) {
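
The rtc-mv and rtc-mxc hunks above (and the rtc-stk17ta8 and rtc-tx4939 hunks further down) all apply the same conversion: the open-coded platform_get_resource() NULL check, devm_request_mem_region() and devm_ioremap() sequence is collapsed into devm_ioremap_resource(), which performs all three steps and reports failure as an ERR_PTR(). A minimal sketch of the resulting probe idiom, using an illustrative driver name rather than anything from this patch:

	static int foo_probe(struct platform_device *pdev)
	{
		struct resource *res;
		void __iomem *base;

		/* devm_ioremap_resource() validates res (including res == NULL),
		 * requests the memory region and maps it; on any failure it
		 * returns an ERR_PTR() that can be passed straight back. */
		res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
		base = devm_ioremap_resource(&pdev->dev, res);
		if (IS_ERR(base))
			return PTR_ERR(base);

		/* ... rest of probe ... */
		return 0;
	}
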
diff --git a/drivers/rtc/rtc-nuc900.c b/drivers/rtc/rtc-nuc900.c
index 22861c5e0c59..248653c74b80 100644
--- a/drivers/rtc/rtc-nuc900.c
+++ b/drivers/rtc/rtc-nuc900.c
@@ -99,7 +99,7 @@ static int *check_rtc_access_enable(struct nuc900_rtc *nuc900_rtc)
99 if (!timeout) 99 if (!timeout)
100 return ERR_PTR(-EPERM); 100 return ERR_PTR(-EPERM);
101 101
102 return 0; 102 return NULL;
103} 103}
104 104
105static int nuc900_rtc_bcd2bin(unsigned int timereg, 105static int nuc900_rtc_bcd2bin(unsigned int timereg,
diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c
index c6ffbaec32a4..c7d97ee59327 100644
--- a/drivers/rtc/rtc-omap.c
+++ b/drivers/rtc/rtc-omap.c
@@ -70,6 +70,8 @@
70#define OMAP_RTC_KICK0_REG 0x6c 70#define OMAP_RTC_KICK0_REG 0x6c
71#define OMAP_RTC_KICK1_REG 0x70 71#define OMAP_RTC_KICK1_REG 0x70
72 72
73#define OMAP_RTC_IRQWAKEEN 0x7c
74
73/* OMAP_RTC_CTRL_REG bit fields: */ 75/* OMAP_RTC_CTRL_REG bit fields: */
74#define OMAP_RTC_CTRL_SPLIT (1<<7) 76#define OMAP_RTC_CTRL_SPLIT (1<<7)
75#define OMAP_RTC_CTRL_DISABLE (1<<6) 77#define OMAP_RTC_CTRL_DISABLE (1<<6)
@@ -94,12 +96,21 @@
94#define OMAP_RTC_INTERRUPTS_IT_ALARM (1<<3) 96#define OMAP_RTC_INTERRUPTS_IT_ALARM (1<<3)
95#define OMAP_RTC_INTERRUPTS_IT_TIMER (1<<2) 97#define OMAP_RTC_INTERRUPTS_IT_TIMER (1<<2)
96 98
99/* OMAP_RTC_IRQWAKEEN bit fields: */
100#define OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN (1<<1)
101
97/* OMAP_RTC_KICKER values */ 102/* OMAP_RTC_KICKER values */
98#define KICK0_VALUE 0x83e70b13 103#define KICK0_VALUE 0x83e70b13
99#define KICK1_VALUE 0x95a4f1e0 104#define KICK1_VALUE 0x95a4f1e0
100 105
101#define OMAP_RTC_HAS_KICKER 0x1 106#define OMAP_RTC_HAS_KICKER 0x1
102 107
108/*
109 * A few RTC IP revisions have a special WAKE-EN register to enable wakeup
110 * generation for the alarm event.
111 */
112#define OMAP_RTC_HAS_IRQWAKEEN 0x2
113
103static void __iomem *rtc_base; 114static void __iomem *rtc_base;
104 115
105#define rtc_read(addr) readb(rtc_base + (addr)) 116#define rtc_read(addr) readb(rtc_base + (addr))
@@ -299,12 +310,18 @@ static struct rtc_class_ops omap_rtc_ops = {
299static int omap_rtc_alarm; 310static int omap_rtc_alarm;
300static int omap_rtc_timer; 311static int omap_rtc_timer;
301 312
302#define OMAP_RTC_DATA_DA830_IDX 1 313#define OMAP_RTC_DATA_AM3352_IDX 1
314#define OMAP_RTC_DATA_DA830_IDX 2
303 315
304static struct platform_device_id omap_rtc_devtype[] = { 316static struct platform_device_id omap_rtc_devtype[] = {
305 { 317 {
306 .name = DRIVER_NAME, 318 .name = DRIVER_NAME,
307 }, { 319 },
320 [OMAP_RTC_DATA_AM3352_IDX] = {
321 .name = "am3352-rtc",
322 .driver_data = OMAP_RTC_HAS_KICKER | OMAP_RTC_HAS_IRQWAKEEN,
323 },
324 [OMAP_RTC_DATA_DA830_IDX] = {
308 .name = "da830-rtc", 325 .name = "da830-rtc",
309 .driver_data = OMAP_RTC_HAS_KICKER, 326 .driver_data = OMAP_RTC_HAS_KICKER,
310 }, 327 },
@@ -316,6 +333,9 @@ static const struct of_device_id omap_rtc_of_match[] = {
316 { .compatible = "ti,da830-rtc", 333 { .compatible = "ti,da830-rtc",
317 .data = &omap_rtc_devtype[OMAP_RTC_DATA_DA830_IDX], 334 .data = &omap_rtc_devtype[OMAP_RTC_DATA_DA830_IDX],
318 }, 335 },
336 { .compatible = "ti,am3352-rtc",
337 .data = &omap_rtc_devtype[OMAP_RTC_DATA_AM3352_IDX],
338 },
319 {}, 339 {},
320}; 340};
321MODULE_DEVICE_TABLE(of, omap_rtc_of_match); 341MODULE_DEVICE_TABLE(of, omap_rtc_of_match);
@@ -464,16 +484,28 @@ static u8 irqstat;
464 484
465static int omap_rtc_suspend(struct device *dev) 485static int omap_rtc_suspend(struct device *dev)
466{ 486{
487 u8 irqwake_stat;
488 struct platform_device *pdev = to_platform_device(dev);
489 const struct platform_device_id *id_entry =
490 platform_get_device_id(pdev);
491
467 irqstat = rtc_read(OMAP_RTC_INTERRUPTS_REG); 492 irqstat = rtc_read(OMAP_RTC_INTERRUPTS_REG);
468 493
469 /* FIXME the RTC alarm is not currently acting as a wakeup event 494 /* FIXME the RTC alarm is not currently acting as a wakeup event
470 * source, and in fact this enable() call is just saving a flag 495 * source on some platforms, and in fact this enable() call is just
471 * that's never used... 496 * saving a flag that's never used...
472 */ 497 */
473 if (device_may_wakeup(dev)) 498 if (device_may_wakeup(dev)) {
474 enable_irq_wake(omap_rtc_alarm); 499 enable_irq_wake(omap_rtc_alarm);
475 else 500
501 if (id_entry->driver_data & OMAP_RTC_HAS_IRQWAKEEN) {
502 irqwake_stat = rtc_read(OMAP_RTC_IRQWAKEEN);
503 irqwake_stat |= OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN;
504 rtc_write(irqwake_stat, OMAP_RTC_IRQWAKEEN);
505 }
506 } else {
476 rtc_write(0, OMAP_RTC_INTERRUPTS_REG); 507 rtc_write(0, OMAP_RTC_INTERRUPTS_REG);
508 }
477 509
478 /* Disable the clock/module */ 510 /* Disable the clock/module */
479 pm_runtime_put_sync(dev); 511 pm_runtime_put_sync(dev);
@@ -483,13 +515,25 @@ static int omap_rtc_suspend(struct device *dev)
483 515
484static int omap_rtc_resume(struct device *dev) 516static int omap_rtc_resume(struct device *dev)
485{ 517{
518 u8 irqwake_stat;
519 struct platform_device *pdev = to_platform_device(dev);
520 const struct platform_device_id *id_entry =
521 platform_get_device_id(pdev);
522
486 /* Enable the clock/module so that we can access the registers */ 523 /* Enable the clock/module so that we can access the registers */
487 pm_runtime_get_sync(dev); 524 pm_runtime_get_sync(dev);
488 525
489 if (device_may_wakeup(dev)) 526 if (device_may_wakeup(dev)) {
490 disable_irq_wake(omap_rtc_alarm); 527 disable_irq_wake(omap_rtc_alarm);
491 else 528
529 if (id_entry->driver_data & OMAP_RTC_HAS_IRQWAKEEN) {
530 irqwake_stat = rtc_read(OMAP_RTC_IRQWAKEEN);
531 irqwake_stat &= ~OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN;
532 rtc_write(irqwake_stat, OMAP_RTC_IRQWAKEEN);
533 }
534 } else {
492 rtc_write(irqstat, OMAP_RTC_INTERRUPTS_REG); 535 rtc_write(irqstat, OMAP_RTC_INTERRUPTS_REG);
536 }
493 return 0; 537 return 0;
494} 538}
495#endif 539#endif
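
The suspend/resume hunks above touch the new IRQWAKEEN register only when the matched device entry carries OMAP_RTC_HAS_IRQWAKEEN in its driver_data. A minimal sketch of that lookup pattern in isolation; the foo_ names and FOO_HAS_IRQWAKEEN flag are illustrative, not from this patch:

	static int foo_suspend(struct device *dev)
	{
		struct platform_device *pdev = to_platform_device(dev);
		const struct platform_device_id *id = platform_get_device_id(pdev);

		/* driver_data comes from the platform_device_id table entry that
		 * matched (selected directly or via the of_device_id .data). */
		if (id->driver_data & FOO_HAS_IRQWAKEEN)
			foo_enable_irq_wakeen(dev);	/* hypothetical helper */

		return 0;
	}
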
diff --git a/drivers/rtc/rtc-palmas.c b/drivers/rtc/rtc-palmas.c
index a1fecc8d97fc..fffb7d3449d7 100644
--- a/drivers/rtc/rtc-palmas.c
+++ b/drivers/rtc/rtc-palmas.c
@@ -238,6 +238,15 @@ static int palmas_rtc_probe(struct platform_device *pdev)
238 struct palmas *palmas = dev_get_drvdata(pdev->dev.parent); 238 struct palmas *palmas = dev_get_drvdata(pdev->dev.parent);
239 struct palmas_rtc *palmas_rtc = NULL; 239 struct palmas_rtc *palmas_rtc = NULL;
240 int ret; 240 int ret;
241 bool enable_bb_charging = false;
242 bool high_bb_charging;
243
244 if (pdev->dev.of_node) {
245 enable_bb_charging = of_property_read_bool(pdev->dev.of_node,
246 "ti,backup-battery-chargeable");
247 high_bb_charging = of_property_read_bool(pdev->dev.of_node,
248 "ti,backup-battery-charge-high-current");
249 }
241 250
242 palmas_rtc = devm_kzalloc(&pdev->dev, sizeof(struct palmas_rtc), 251 palmas_rtc = devm_kzalloc(&pdev->dev, sizeof(struct palmas_rtc),
243 GFP_KERNEL); 252 GFP_KERNEL);
@@ -254,6 +263,32 @@ static int palmas_rtc_probe(struct platform_device *pdev)
254 palmas_rtc->dev = &pdev->dev; 263 palmas_rtc->dev = &pdev->dev;
255 platform_set_drvdata(pdev, palmas_rtc); 264 platform_set_drvdata(pdev, palmas_rtc);
256 265
266 if (enable_bb_charging) {
267 unsigned reg = PALMAS_BACKUP_BATTERY_CTRL_BBS_BBC_LOW_ICHRG;
268
269 if (high_bb_charging)
270 reg = 0;
271
272 ret = palmas_update_bits(palmas, PALMAS_PMU_CONTROL_BASE,
273 PALMAS_BACKUP_BATTERY_CTRL,
274 PALMAS_BACKUP_BATTERY_CTRL_BBS_BBC_LOW_ICHRG, reg);
275 if (ret < 0) {
276 dev_err(&pdev->dev,
277 "BACKUP_BATTERY_CTRL update failed, %d\n", ret);
278 return ret;
279 }
280
281 ret = palmas_update_bits(palmas, PALMAS_PMU_CONTROL_BASE,
282 PALMAS_BACKUP_BATTERY_CTRL,
283 PALMAS_BACKUP_BATTERY_CTRL_BB_CHG_EN,
284 PALMAS_BACKUP_BATTERY_CTRL_BB_CHG_EN);
285 if (ret < 0) {
286 dev_err(&pdev->dev,
287 "BACKUP_BATTERY_CTRL update failed, %d\n", ret);
288 return ret;
289 }
290 }
291
257 /* Start RTC */ 292 /* Start RTC */
258 ret = palmas_update_bits(palmas, PALMAS_RTC_BASE, PALMAS_RTC_CTRL_REG, 293 ret = palmas_update_bits(palmas, PALMAS_RTC_BASE, PALMAS_RTC_CTRL_REG,
259 PALMAS_RTC_CTRL_REG_STOP_RTC, 294 PALMAS_RTC_CTRL_REG_STOP_RTC,
diff --git a/drivers/rtc/rtc-pcf2127.c b/drivers/rtc/rtc-pcf2127.c
index 205b9f7da1b8..1ee514a3972c 100644
--- a/drivers/rtc/rtc-pcf2127.c
+++ b/drivers/rtc/rtc-pcf2127.c
@@ -203,11 +203,6 @@ static int pcf2127_probe(struct i2c_client *client,
203 return 0; 203 return 0;
204} 204}
205 205
206static int pcf2127_remove(struct i2c_client *client)
207{
208 return 0;
209}
210
211static const struct i2c_device_id pcf2127_id[] = { 206static const struct i2c_device_id pcf2127_id[] = {
212 { "pcf2127", 0 }, 207 { "pcf2127", 0 },
213 { } 208 { }
@@ -229,7 +224,6 @@ static struct i2c_driver pcf2127_driver = {
229 .of_match_table = of_match_ptr(pcf2127_of_match), 224 .of_match_table = of_match_ptr(pcf2127_of_match),
230 }, 225 },
231 .probe = pcf2127_probe, 226 .probe = pcf2127_probe,
232 .remove = pcf2127_remove,
233 .id_table = pcf2127_id, 227 .id_table = pcf2127_id,
234}; 228};
235 229
diff --git a/drivers/rtc/rtc-sirfsoc.c b/drivers/rtc/rtc-sirfsoc.c
index aa7ed4b5f7f0..63460cf80f1b 100644
--- a/drivers/rtc/rtc-sirfsoc.c
+++ b/drivers/rtc/rtc-sirfsoc.c
@@ -44,6 +44,7 @@ struct sirfsoc_rtc_drv {
44 struct rtc_device *rtc; 44 struct rtc_device *rtc;
45 u32 rtc_base; 45 u32 rtc_base;
46 u32 irq; 46 u32 irq;
47 unsigned irq_wake;
47 /* Overflow for every 8 years extra time */ 48 /* Overflow for every 8 years extra time */
48 u32 overflow_rtc; 49 u32 overflow_rtc;
49#ifdef CONFIG_PM 50#ifdef CONFIG_PM
@@ -355,8 +356,8 @@ static int sirfsoc_rtc_suspend(struct device *dev)
355 rtcdrv->saved_counter = 356 rtcdrv->saved_counter =
356 sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_CN); 357 sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_CN);
357 rtcdrv->saved_overflow_rtc = rtcdrv->overflow_rtc; 358 rtcdrv->saved_overflow_rtc = rtcdrv->overflow_rtc;
358 if (device_may_wakeup(&pdev->dev)) 359 if (device_may_wakeup(&pdev->dev) && !enable_irq_wake(rtcdrv->irq))
359 enable_irq_wake(rtcdrv->irq); 360 rtcdrv->irq_wake = 1;
360 361
361 return 0; 362 return 0;
362} 363}
@@ -423,8 +424,10 @@ static int sirfsoc_rtc_resume(struct device *dev)
423 struct platform_device *pdev = to_platform_device(dev); 424 struct platform_device *pdev = to_platform_device(dev);
424 struct sirfsoc_rtc_drv *rtcdrv = platform_get_drvdata(pdev); 425 struct sirfsoc_rtc_drv *rtcdrv = platform_get_drvdata(pdev);
425 sirfsoc_rtc_thaw(dev); 426 sirfsoc_rtc_thaw(dev);
426 if (device_may_wakeup(&pdev->dev)) 427 if (device_may_wakeup(&pdev->dev) && rtcdrv->irq_wake) {
427 disable_irq_wake(rtcdrv->irq); 428 disable_irq_wake(rtcdrv->irq);
429 rtcdrv->irq_wake = 0;
430 }
428 431
429 return 0; 432 return 0;
430} 433}
@@ -434,8 +437,10 @@ static int sirfsoc_rtc_restore(struct device *dev)
434 struct platform_device *pdev = to_platform_device(dev); 437 struct platform_device *pdev = to_platform_device(dev);
435 struct sirfsoc_rtc_drv *rtcdrv = platform_get_drvdata(pdev); 438 struct sirfsoc_rtc_drv *rtcdrv = platform_get_drvdata(pdev);
436 439
437 if (device_may_wakeup(&pdev->dev)) 440 if (device_may_wakeup(&pdev->dev) && rtcdrv->irq_wake) {
438 disable_irq_wake(rtcdrv->irq); 441 disable_irq_wake(rtcdrv->irq);
442 rtcdrv->irq_wake = 0;
443 }
439 return 0; 444 return 0;
440} 445}
441 446
diff --git a/drivers/rtc/rtc-stk17ta8.c b/drivers/rtc/rtc-stk17ta8.c
index af5e97e3f272..a176ba614683 100644
--- a/drivers/rtc/rtc-stk17ta8.c
+++ b/drivers/rtc/rtc-stk17ta8.c
@@ -294,19 +294,14 @@ static int stk17ta8_rtc_probe(struct platform_device *pdev)
294 void __iomem *ioaddr; 294 void __iomem *ioaddr;
295 int ret = 0; 295 int ret = 0;
296 296
297 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
298 if (!res)
299 return -ENODEV;
300
301 pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); 297 pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
302 if (!pdata) 298 if (!pdata)
303 return -ENOMEM; 299 return -ENOMEM;
304 if (!devm_request_mem_region(&pdev->dev, res->start, RTC_REG_SIZE, 300
305 pdev->name)) 301 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
306 return -EBUSY; 302 ioaddr = devm_ioremap_resource(&pdev->dev, res);
307 ioaddr = devm_ioremap(&pdev->dev, res->start, RTC_REG_SIZE); 303 if (IS_ERR(ioaddr))
308 if (!ioaddr) 304 return PTR_ERR(ioaddr);
309 return -ENOMEM;
310 pdata->ioaddr = ioaddr; 305 pdata->ioaddr = ioaddr;
311 pdata->irq = platform_get_irq(pdev, 0); 306 pdata->irq = platform_get_irq(pdev, 0);
312 307
diff --git a/drivers/rtc/rtc-tx4939.c b/drivers/rtc/rtc-tx4939.c
index f9a0677e4e3b..4f87234e0dee 100644
--- a/drivers/rtc/rtc-tx4939.c
+++ b/drivers/rtc/rtc-tx4939.c
@@ -244,9 +244,6 @@ static int __init tx4939_rtc_probe(struct platform_device *pdev)
244 struct resource *res; 244 struct resource *res;
245 int irq, ret; 245 int irq, ret;
246 246
247 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
248 if (!res)
249 return -ENODEV;
250 irq = platform_get_irq(pdev, 0); 247 irq = platform_get_irq(pdev, 0);
251 if (irq < 0) 248 if (irq < 0)
252 return -ENODEV; 249 return -ENODEV;
@@ -255,13 +252,10 @@ static int __init tx4939_rtc_probe(struct platform_device *pdev)
255 return -ENOMEM; 252 return -ENOMEM;
256 platform_set_drvdata(pdev, pdata); 253 platform_set_drvdata(pdev, pdata);
257 254
258 if (!devm_request_mem_region(&pdev->dev, res->start, 255 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
259 resource_size(res), pdev->name)) 256 pdata->rtcreg = devm_ioremap_resource(&pdev->dev, res);
260 return -EBUSY; 257 if (IS_ERR(pdata->rtcreg))
261 pdata->rtcreg = devm_ioremap(&pdev->dev, res->start, 258 return PTR_ERR(pdata->rtcreg);
262 resource_size(res));
263 if (!pdata->rtcreg)
264 return -EBUSY;
265 259
266 spin_lock_init(&pdata->lock); 260 spin_lock_init(&pdata->lock);
267 tx4939_rtc_cmd(pdata->rtcreg, TX4939_RTCCTL_COMMAND_NOP); 261 tx4939_rtc_cmd(pdata->rtcreg, TX4939_RTCCTL_COMMAND_NOP);
diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c
index 9e5e14686e75..794820a123d0 100644
--- a/drivers/s390/char/zcore.c
+++ b/drivers/s390/char/zcore.c
@@ -30,8 +30,8 @@
30 30
31#define TRACE(x...) debug_sprintf_event(zcore_dbf, 1, x) 31#define TRACE(x...) debug_sprintf_event(zcore_dbf, 1, x)
32 32
33#define TO_USER 0 33#define TO_USER 1
34#define TO_KERNEL 1 34#define TO_KERNEL 0
35#define CHUNK_INFO_SIZE 34 /* 2 16-byte char, each followed by blank */ 35#define CHUNK_INFO_SIZE 34 /* 2 16-byte char, each followed by blank */
36 36
37enum arch_id { 37enum arch_id {
@@ -73,7 +73,7 @@ static struct ipl_parameter_block *ipl_block;
73 * @count: Size of buffer, which should be copied 73 * @count: Size of buffer, which should be copied
74 * @mode: Either TO_KERNEL or TO_USER 74 * @mode: Either TO_KERNEL or TO_USER
75 */ 75 */
76static int memcpy_hsa(void *dest, unsigned long src, size_t count, int mode) 76int memcpy_hsa(void *dest, unsigned long src, size_t count, int mode)
77{ 77{
78 int offs, blk_num; 78 int offs, blk_num;
79 static char buf[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE))); 79 static char buf[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE)));
diff --git a/drivers/video/acornfb.c b/drivers/video/acornfb.c
index 6488a7351a60..7e8346ec9cdc 100644
--- a/drivers/video/acornfb.c
+++ b/drivers/video/acornfb.c
@@ -38,14 +38,6 @@
38#include "acornfb.h" 38#include "acornfb.h"
39 39
40/* 40/*
41 * VIDC machines can't do 16 or 32BPP modes.
42 */
43#ifdef HAS_VIDC
44#undef FBCON_HAS_CFB16
45#undef FBCON_HAS_CFB32
46#endif
47
48/*
49 * Default resolution. 41 * Default resolution.
50 * NOTE that it has to be supported in the table towards 42 * NOTE that it has to be supported in the table towards
51 * the end of this file. 43 * the end of this file.
@@ -106,238 +98,6 @@ static struct vidc_timing current_vidc;
106 98
107extern unsigned int vram_size; /* set by setup.c */ 99extern unsigned int vram_size; /* set by setup.c */
108 100
109#ifdef HAS_VIDC
110
111#define MAX_SIZE 480*1024
112
113/* CTL VIDC Actual
114 * 24.000 0 8.000
115 * 25.175 0 8.392
116 * 36.000 0 12.000
117 * 24.000 1 12.000
118 * 25.175 1 12.588
119 * 24.000 2 16.000
120 * 25.175 2 16.783
121 * 36.000 1 18.000
122 * 24.000 3 24.000
123 * 36.000 2 24.000
124 * 25.175 3 25.175
125 * 36.000 3 36.000
126 */
127struct pixclock {
128 u_long min_clock;
129 u_long max_clock;
130 u_int vidc_ctl;
131 u_int vid_ctl;
132};
133
134static struct pixclock arc_clocks[] = {
135 /* we allow +/-1% on these */
136 { 123750, 126250, VIDC_CTRL_DIV3, VID_CTL_24MHz }, /* 8.000MHz */
137 { 82500, 84167, VIDC_CTRL_DIV2, VID_CTL_24MHz }, /* 12.000MHz */
138 { 61875, 63125, VIDC_CTRL_DIV1_5, VID_CTL_24MHz }, /* 16.000MHz */
139 { 41250, 42083, VIDC_CTRL_DIV1, VID_CTL_24MHz }, /* 24.000MHz */
140};
141
142static struct pixclock *
143acornfb_valid_pixrate(struct fb_var_screeninfo *var)
144{
145 u_long pixclock = var->pixclock;
146 u_int i;
147
148 if (!var->pixclock)
149 return NULL;
150
151 for (i = 0; i < ARRAY_SIZE(arc_clocks); i++)
152 if (pixclock > arc_clocks[i].min_clock &&
153 pixclock < arc_clocks[i].max_clock)
154 return arc_clocks + i;
155
156 return NULL;
157}
158
159/* VIDC Rules:
160 * hcr : must be even (interlace, hcr/2 must be even)
161 * hswr : must be even
162 * hdsr : must be odd
163 * hder : must be odd
164 *
165 * vcr : must be odd
166 * vswr : >= 1
167 * vdsr : >= 1
168 * vder : >= vdsr
169 * if interlaced, then hcr/2 must be even
170 */
171static void
172acornfb_set_timing(struct fb_var_screeninfo *var)
173{
174 struct pixclock *pclk;
175 struct vidc_timing vidc;
176 u_int horiz_correction;
177 u_int sync_len, display_start, display_end, cycle;
178 u_int is_interlaced;
179 u_int vid_ctl, vidc_ctl;
180 u_int bandwidth;
181
182 memset(&vidc, 0, sizeof(vidc));
183
184 pclk = acornfb_valid_pixrate(var);
185 vidc_ctl = pclk->vidc_ctl;
186 vid_ctl = pclk->vid_ctl;
187
188 bandwidth = var->pixclock * 8 / var->bits_per_pixel;
189 /* 25.175, 4bpp = 79.444ns per byte, 317.776ns per word: fifo = 2,6 */
190 if (bandwidth > 143500)
191 vidc_ctl |= VIDC_CTRL_FIFO_3_7;
192 else if (bandwidth > 71750)
193 vidc_ctl |= VIDC_CTRL_FIFO_2_6;
194 else if (bandwidth > 35875)
195 vidc_ctl |= VIDC_CTRL_FIFO_1_5;
196 else
197 vidc_ctl |= VIDC_CTRL_FIFO_0_4;
198
199 switch (var->bits_per_pixel) {
200 case 1:
201 horiz_correction = 19;
202 vidc_ctl |= VIDC_CTRL_1BPP;
203 break;
204
205 case 2:
206 horiz_correction = 11;
207 vidc_ctl |= VIDC_CTRL_2BPP;
208 break;
209
210 case 4:
211 horiz_correction = 7;
212 vidc_ctl |= VIDC_CTRL_4BPP;
213 break;
214
215 default:
216 case 8:
217 horiz_correction = 5;
218 vidc_ctl |= VIDC_CTRL_8BPP;
219 break;
220 }
221
222 if (var->sync & FB_SYNC_COMP_HIGH_ACT) /* should be FB_SYNC_COMP */
223 vidc_ctl |= VIDC_CTRL_CSYNC;
224 else {
225 if (!(var->sync & FB_SYNC_HOR_HIGH_ACT))
226 vid_ctl |= VID_CTL_HS_NHSYNC;
227
228 if (!(var->sync & FB_SYNC_VERT_HIGH_ACT))
229 vid_ctl |= VID_CTL_VS_NVSYNC;
230 }
231
232 sync_len = var->hsync_len;
233 display_start = sync_len + var->left_margin;
234 display_end = display_start + var->xres;
235 cycle = display_end + var->right_margin;
236
237 /* if interlaced, then hcr/2 must be even */
238 is_interlaced = (var->vmode & FB_VMODE_MASK) == FB_VMODE_INTERLACED;
239
240 if (is_interlaced) {
241 vidc_ctl |= VIDC_CTRL_INTERLACE;
242 if (cycle & 2) {
243 cycle += 2;
244 var->right_margin += 2;
245 }
246 }
247
248 vidc.h_cycle = (cycle - 2) / 2;
249 vidc.h_sync_width = (sync_len - 2) / 2;
250 vidc.h_border_start = (display_start - 1) / 2;
251 vidc.h_display_start = (display_start - horiz_correction) / 2;
252 vidc.h_display_end = (display_end - horiz_correction) / 2;
253 vidc.h_border_end = (display_end - 1) / 2;
254 vidc.h_interlace = (vidc.h_cycle + 1) / 2;
255
256 sync_len = var->vsync_len;
257 display_start = sync_len + var->upper_margin;
258 display_end = display_start + var->yres;
259 cycle = display_end + var->lower_margin;
260
261 if (is_interlaced)
262 cycle = (cycle - 3) / 2;
263 else
264 cycle = cycle - 1;
265
266 vidc.v_cycle = cycle;
267 vidc.v_sync_width = sync_len - 1;
268 vidc.v_border_start = display_start - 1;
269 vidc.v_display_start = vidc.v_border_start;
270 vidc.v_display_end = display_end - 1;
271 vidc.v_border_end = vidc.v_display_end;
272
273 if (machine_is_a5k())
274 __raw_writeb(vid_ctl, IOEB_VID_CTL);
275
276 if (memcmp(&current_vidc, &vidc, sizeof(vidc))) {
277 current_vidc = vidc;
278
279 vidc_writel(0xe0000000 | vidc_ctl);
280 vidc_writel(0x80000000 | (vidc.h_cycle << 14));
281 vidc_writel(0x84000000 | (vidc.h_sync_width << 14));
282 vidc_writel(0x88000000 | (vidc.h_border_start << 14));
283 vidc_writel(0x8c000000 | (vidc.h_display_start << 14));
284 vidc_writel(0x90000000 | (vidc.h_display_end << 14));
285 vidc_writel(0x94000000 | (vidc.h_border_end << 14));
286 vidc_writel(0x98000000);
287 vidc_writel(0x9c000000 | (vidc.h_interlace << 14));
288 vidc_writel(0xa0000000 | (vidc.v_cycle << 14));
289 vidc_writel(0xa4000000 | (vidc.v_sync_width << 14));
290 vidc_writel(0xa8000000 | (vidc.v_border_start << 14));
291 vidc_writel(0xac000000 | (vidc.v_display_start << 14));
292 vidc_writel(0xb0000000 | (vidc.v_display_end << 14));
293 vidc_writel(0xb4000000 | (vidc.v_border_end << 14));
294 vidc_writel(0xb8000000);
295 vidc_writel(0xbc000000);
296 }
297#ifdef DEBUG_MODE_SELECTION
298 printk(KERN_DEBUG "VIDC registers for %dx%dx%d:\n", var->xres,
299 var->yres, var->bits_per_pixel);
300 printk(KERN_DEBUG " H-cycle : %d\n", vidc.h_cycle);
301 printk(KERN_DEBUG " H-sync-width : %d\n", vidc.h_sync_width);
302 printk(KERN_DEBUG " H-border-start : %d\n", vidc.h_border_start);
303 printk(KERN_DEBUG " H-display-start : %d\n", vidc.h_display_start);
304 printk(KERN_DEBUG " H-display-end : %d\n", vidc.h_display_end);
305 printk(KERN_DEBUG " H-border-end : %d\n", vidc.h_border_end);
306 printk(KERN_DEBUG " H-interlace : %d\n", vidc.h_interlace);
307 printk(KERN_DEBUG " V-cycle : %d\n", vidc.v_cycle);
308 printk(KERN_DEBUG " V-sync-width : %d\n", vidc.v_sync_width);
309 printk(KERN_DEBUG " V-border-start : %d\n", vidc.v_border_start);
310 printk(KERN_DEBUG " V-display-start : %d\n", vidc.v_display_start);
311 printk(KERN_DEBUG " V-display-end : %d\n", vidc.v_display_end);
312 printk(KERN_DEBUG " V-border-end : %d\n", vidc.v_border_end);
313 printk(KERN_DEBUG " VIDC Ctrl (E) : 0x%08X\n", vidc_ctl);
314 printk(KERN_DEBUG " IOEB Ctrl : 0x%08X\n", vid_ctl);
315#endif
316}
317
318static int
319acornfb_setcolreg(u_int regno, u_int red, u_int green, u_int blue,
320 u_int trans, struct fb_info *info)
321{
322 union palette pal;
323
324 if (regno >= current_par.palette_size)
325 return 1;
326
327 pal.p = 0;
328 pal.vidc.reg = regno;
329 pal.vidc.red = red >> 12;
330 pal.vidc.green = green >> 12;
331 pal.vidc.blue = blue >> 12;
332
333 current_par.palette[regno] = pal;
334
335 vidc_writel(pal.p);
336
337 return 0;
338}
339#endif
340
341#ifdef HAS_VIDC20 101#ifdef HAS_VIDC20
342#include <mach/acornfb.h> 102#include <mach/acornfb.h>
343 103
@@ -634,16 +394,7 @@ acornfb_adjust_timing(struct fb_info *info, struct fb_var_screeninfo *var, u_int
634 /* hsync_len must be even */ 394 /* hsync_len must be even */
635 var->hsync_len = (var->hsync_len + 1) & ~1; 395 var->hsync_len = (var->hsync_len + 1) & ~1;
636 396
637#ifdef HAS_VIDC 397#if defined(HAS_VIDC20)
638 /* left_margin must be odd */
639 if ((var->left_margin & 1) == 0) {
640 var->left_margin -= 1;
641 var->right_margin += 1;
642 }
643
644 /* right_margin must be odd */
645 var->right_margin |= 1;
646#elif defined(HAS_VIDC20)
647 /* left_margin must be even */ 398 /* left_margin must be even */
648 if (var->left_margin & 1) { 399 if (var->left_margin & 1) {
649 var->left_margin += 1; 400 var->left_margin += 1;
@@ -787,11 +538,7 @@ static int acornfb_set_par(struct fb_info *info)
787 break; 538 break;
788 case 8: 539 case 8:
789 current_par.palette_size = VIDC_PALETTE_SIZE; 540 current_par.palette_size = VIDC_PALETTE_SIZE;
790#ifdef HAS_VIDC
791 info->fix.visual = FB_VISUAL_STATIC_PSEUDOCOLOR;
792#else
793 info->fix.visual = FB_VISUAL_PSEUDOCOLOR; 541 info->fix.visual = FB_VISUAL_PSEUDOCOLOR;
794#endif
795 break; 542 break;
796#ifdef HAS_VIDC20 543#ifdef HAS_VIDC20
797 case 16: 544 case 16:
@@ -971,9 +718,6 @@ static void acornfb_init_fbinfo(void)
971#if defined(HAS_VIDC20) 718#if defined(HAS_VIDC20)
972 fb_info.var.red.length = 8; 719 fb_info.var.red.length = 8;
973 fb_info.var.transp.length = 4; 720 fb_info.var.transp.length = 4;
974#elif defined(HAS_VIDC)
975 fb_info.var.red.length = 4;
976 fb_info.var.transp.length = 1;
977#endif 721#endif
978 fb_info.var.green = fb_info.var.red; 722 fb_info.var.green = fb_info.var.red;
979 fb_info.var.blue = fb_info.var.red; 723 fb_info.var.blue = fb_info.var.red;
@@ -1310,14 +1054,6 @@ static int acornfb_probe(struct platform_device *dev)
1310 fb_info.fix.smem_start = handle; 1054 fb_info.fix.smem_start = handle;
1311 } 1055 }
1312#endif 1056#endif
1313#if defined(HAS_VIDC)
1314 /*
1315 * Archimedes/A5000 machines use a fixed address for their
1316 * framebuffers. Free unused pages
1317 */
1318 free_unused_pages(PAGE_OFFSET + size, PAGE_OFFSET + MAX_SIZE);
1319#endif
1320
1321 fb_info.fix.smem_len = size; 1057 fb_info.fix.smem_len = size;
1322 current_par.palette_size = VIDC_PALETTE_SIZE; 1058 current_par.palette_size = VIDC_PALETTE_SIZE;
1323 1059
diff --git a/drivers/video/acornfb.h b/drivers/video/acornfb.h
index fb2a7fffe506..175c8ff3367c 100644
--- a/drivers/video/acornfb.h
+++ b/drivers/video/acornfb.h
@@ -13,10 +13,6 @@
13#include <asm/hardware/iomd.h> 13#include <asm/hardware/iomd.h>
14#define VIDC_PALETTE_SIZE 256 14#define VIDC_PALETTE_SIZE 256
15#define VIDC_NAME "VIDC20" 15#define VIDC_NAME "VIDC20"
16#elif defined(HAS_VIDC)
17#include <asm/hardware/memc.h>
18#define VIDC_PALETTE_SIZE 16
19#define VIDC_NAME "VIDC"
20#endif 16#endif
21 17
22#define EXTEND8(x) ((x)|(x)<<8) 18#define EXTEND8(x) ((x)|(x)<<8)
@@ -101,31 +97,6 @@ struct modex_params {
101 const struct modey_params *modey; 97 const struct modey_params *modey;
102}; 98};
103 99
104#ifdef HAS_VIDC
105
106#define VID_CTL_VS_NVSYNC (1 << 3)
107#define VID_CTL_HS_NHSYNC (1 << 2)
108#define VID_CTL_24MHz (0)
109#define VID_CTL_25MHz (1)
110#define VID_CTL_36MHz (2)
111
112#define VIDC_CTRL_CSYNC (1 << 7)
113#define VIDC_CTRL_INTERLACE (1 << 6)
114#define VIDC_CTRL_FIFO_0_4 (0 << 4)
115#define VIDC_CTRL_FIFO_1_5 (1 << 4)
116#define VIDC_CTRL_FIFO_2_6 (2 << 4)
117#define VIDC_CTRL_FIFO_3_7 (3 << 4)
118#define VIDC_CTRL_1BPP (0 << 2)
119#define VIDC_CTRL_2BPP (1 << 2)
120#define VIDC_CTRL_4BPP (2 << 2)
121#define VIDC_CTRL_8BPP (3 << 2)
122#define VIDC_CTRL_DIV3 (0 << 0)
123#define VIDC_CTRL_DIV2 (1 << 0)
124#define VIDC_CTRL_DIV1_5 (2 << 0)
125#define VIDC_CTRL_DIV1 (3 << 0)
126
127#endif
128
129#ifdef HAS_VIDC20 100#ifdef HAS_VIDC20
130/* 101/*
131 * VIDC20 registers 102 * VIDC20 registers
diff --git a/drivers/w1/masters/mxc_w1.c b/drivers/w1/masters/mxc_w1.c
index 47e12cfc2a57..15c7251b0556 100644
--- a/drivers/w1/masters/mxc_w1.c
+++ b/drivers/w1/masters/mxc_w1.c
@@ -152,8 +152,6 @@ static int mxc_w1_remove(struct platform_device *pdev)
152 152
153 clk_disable_unprepare(mdev->clk); 153 clk_disable_unprepare(mdev->clk);
154 154
155 platform_set_drvdata(pdev, NULL);
156
157 return 0; 155 return 0;
158} 156}
159 157
diff --git a/drivers/w1/w1.c b/drivers/w1/w1.c
index 22013ca2119c..c7c64f18773d 100644
--- a/drivers/w1/w1.c
+++ b/drivers/w1/w1.c
@@ -234,9 +234,11 @@ static ssize_t w1_master_attribute_store_search(struct device * dev,
234{ 234{
235 long tmp; 235 long tmp;
236 struct w1_master *md = dev_to_w1_master(dev); 236 struct w1_master *md = dev_to_w1_master(dev);
237 int ret;
237 238
238 if (strict_strtol(buf, 0, &tmp) == -EINVAL) 239 ret = kstrtol(buf, 0, &tmp);
239 return -EINVAL; 240 if (ret)
241 return ret;
240 242
241 mutex_lock(&md->mutex); 243 mutex_lock(&md->mutex);
242 md->search_count = tmp; 244 md->search_count = tmp;
@@ -266,9 +268,11 @@ static ssize_t w1_master_attribute_store_pullup(struct device *dev,
266{ 268{
267 long tmp; 269 long tmp;
268 struct w1_master *md = dev_to_w1_master(dev); 270 struct w1_master *md = dev_to_w1_master(dev);
271 int ret;
269 272
270 if (strict_strtol(buf, 0, &tmp) == -EINVAL) 273 ret = kstrtol(buf, 0, &tmp);
271 return -EINVAL; 274 if (ret)
275 return ret;
272 276
273 mutex_lock(&md->mutex); 277 mutex_lock(&md->mutex);
274 md->enable_pullup = tmp; 278 md->enable_pullup = tmp;
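
Both w1 sysfs store hunks above replace the deprecated strict_strtol() with kstrtol() and, instead of collapsing every failure to -EINVAL, return whatever error kstrtol() reports. A minimal sketch of the resulting store-handler idiom, with a hypothetical attribute name:

	static ssize_t foo_store(struct device *dev, struct device_attribute *attr,
				 const char *buf, size_t count)
	{
		long val;
		int ret;

		ret = kstrtol(buf, 0, &val);	/* base 0: accept decimal, hex or octal */
		if (ret)
			return ret;		/* -EINVAL or -ERANGE from kstrtol() */

		/* ... store val under the appropriate lock ... */
		return count;
	}
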
diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c
index de7e4f497222..5be5e3d14f79 100644
--- a/drivers/watchdog/hpwdt.c
+++ b/drivers/watchdog/hpwdt.c
@@ -162,7 +162,8 @@ extern asmlinkage void asminline_call(struct cmn_registers *pi86Regs,
162#define HPWDT_ARCH 32 162#define HPWDT_ARCH 32
163 163
164asm(".text \n\t" 164asm(".text \n\t"
165 ".align 4 \n" 165 ".align 4 \n\t"
166 ".globl asminline_call \n"
166 "asminline_call: \n\t" 167 "asminline_call: \n\t"
167 "pushl %ebp \n\t" 168 "pushl %ebp \n\t"
168 "movl %esp, %ebp \n\t" 169 "movl %esp, %ebp \n\t"
@@ -352,7 +353,8 @@ static int detect_cru_service(void)
352#define HPWDT_ARCH 64 353#define HPWDT_ARCH 64
353 354
354asm(".text \n\t" 355asm(".text \n\t"
355 ".align 4 \n" 356 ".align 4 \n\t"
357 ".globl asminline_call \n"
356 "asminline_call: \n\t" 358 "asminline_call: \n\t"
357 "pushq %rbp \n\t" 359 "pushq %rbp \n\t"
358 "movq %rsp, %rbp \n\t" 360 "movq %rsp, %rbp \n\t"
diff --git a/fs/affs/file.c b/fs/affs/file.c
index af3261b78102..776e3935a758 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -836,7 +836,7 @@ affs_truncate(struct inode *inode)
836 struct address_space *mapping = inode->i_mapping; 836 struct address_space *mapping = inode->i_mapping;
837 struct page *page; 837 struct page *page;
838 void *fsdata; 838 void *fsdata;
839 u32 size = inode->i_size; 839 loff_t size = inode->i_size;
840 int res; 840 int res;
841 841
842 res = mapping->a_ops->write_begin(NULL, mapping, size, 0, 0, &page, &fsdata); 842 res = mapping->a_ops->write_begin(NULL, mapping, size, 0, 0, &page, &fsdata);
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 8fb42916d8a2..60250847929f 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -716,13 +716,14 @@ int bioset_integrity_create(struct bio_set *bs, int pool_size)
716 return 0; 716 return 0;
717 717
718 bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab); 718 bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab);
719 719 if (!bs->bio_integrity_pool)
720 bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size);
721 if (!bs->bvec_integrity_pool)
722 return -1; 720 return -1;
723 721
724 if (!bs->bio_integrity_pool) 722 bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size);
723 if (!bs->bvec_integrity_pool) {
724 mempool_destroy(bs->bio_integrity_pool);
725 return -1; 725 return -1;
726 }
726 727
727 return 0; 728 return 0;
728} 729}
diff --git a/fs/coredump.c b/fs/coredump.c
index 72f816d6cad9..9bdeca12ae0e 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -190,6 +190,11 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
190 err = cn_printf(cn, "%d", 190 err = cn_printf(cn, "%d",
191 task_tgid_vnr(current)); 191 task_tgid_vnr(current));
192 break; 192 break;
193 /* global pid */
194 case 'P':
195 err = cn_printf(cn, "%d",
196 task_tgid_nr(current));
197 break;
193 /* uid */ 198 /* uid */
194 case 'u': 199 case 'u':
195 err = cn_printf(cn, "%d", cred->uid); 200 err = cn_printf(cn, "%d", cred->uid);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 293f86741ddb..473e09da7d02 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -740,6 +740,7 @@ static void ep_free(struct eventpoll *ep)
740 epi = rb_entry(rbp, struct epitem, rbn); 740 epi = rb_entry(rbp, struct epitem, rbn);
741 741
742 ep_unregister_pollwait(ep, epi); 742 ep_unregister_pollwait(ep, epi);
743 cond_resched();
743 } 744 }
744 745
745 /* 746 /*
@@ -754,6 +755,7 @@ static void ep_free(struct eventpoll *ep)
754 while ((rbp = rb_first(&ep->rbr)) != NULL) { 755 while ((rbp = rb_first(&ep->rbr)) != NULL) {
755 epi = rb_entry(rbp, struct epitem, rbn); 756 epi = rb_entry(rbp, struct epitem, rbn);
756 ep_remove(ep, epi); 757 ep_remove(ep, epi);
758 cond_resched();
757 } 759 }
758 mutex_unlock(&ep->mtx); 760 mutex_unlock(&ep->mtx);
759 761
diff --git a/fs/exec.c b/fs/exec.c
index fd774c7cb483..8875dd10ae7a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -74,6 +74,8 @@ static DEFINE_RWLOCK(binfmt_lock);
74void __register_binfmt(struct linux_binfmt * fmt, int insert) 74void __register_binfmt(struct linux_binfmt * fmt, int insert)
75{ 75{
76 BUG_ON(!fmt); 76 BUG_ON(!fmt);
77 if (WARN_ON(!fmt->load_binary))
78 return;
77 write_lock(&binfmt_lock); 79 write_lock(&binfmt_lock);
78 insert ? list_add(&fmt->lh, &formats) : 80 insert ? list_add(&fmt->lh, &formats) :
79 list_add_tail(&fmt->lh, &formats); 81 list_add_tail(&fmt->lh, &formats);
@@ -266,7 +268,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
266 BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); 268 BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
267 vma->vm_end = STACK_TOP_MAX; 269 vma->vm_end = STACK_TOP_MAX;
268 vma->vm_start = vma->vm_end - PAGE_SIZE; 270 vma->vm_start = vma->vm_end - PAGE_SIZE;
269 vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; 271 vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
270 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 272 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
271 INIT_LIST_HEAD(&vma->anon_vma_chain); 273 INIT_LIST_HEAD(&vma->anon_vma_chain);
272 274
@@ -1365,18 +1367,18 @@ out:
1365} 1367}
1366EXPORT_SYMBOL(remove_arg_zero); 1368EXPORT_SYMBOL(remove_arg_zero);
1367 1369
1370#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
1368/* 1371/*
1369 * cycle the list of binary formats handler, until one recognizes the image 1372 * cycle the list of binary formats handler, until one recognizes the image
1370 */ 1373 */
1371int search_binary_handler(struct linux_binprm *bprm) 1374int search_binary_handler(struct linux_binprm *bprm)
1372{ 1375{
1373 unsigned int depth = bprm->recursion_depth; 1376 bool need_retry = IS_ENABLED(CONFIG_MODULES);
1374 int try,retval;
1375 struct linux_binfmt *fmt; 1377 struct linux_binfmt *fmt;
1376 pid_t old_pid, old_vpid; 1378 int retval;
1377 1379
1378 /* This allows 4 levels of binfmt rewrites before failing hard. */ 1380 /* This allows 4 levels of binfmt rewrites before failing hard. */
1379 if (depth > 5) 1381 if (bprm->recursion_depth > 5)
1380 return -ELOOP; 1382 return -ELOOP;
1381 1383
1382 retval = security_bprm_check(bprm); 1384 retval = security_bprm_check(bprm);
@@ -1387,71 +1389,67 @@ int search_binary_handler(struct linux_binprm *bprm)
1387 if (retval) 1389 if (retval)
1388 return retval; 1390 return retval;
1389 1391
1392 retval = -ENOENT;
1393 retry:
1394 read_lock(&binfmt_lock);
1395 list_for_each_entry(fmt, &formats, lh) {
1396 if (!try_module_get(fmt->module))
1397 continue;
1398 read_unlock(&binfmt_lock);
1399 bprm->recursion_depth++;
1400 retval = fmt->load_binary(bprm);
1401 bprm->recursion_depth--;
1402 if (retval >= 0 || retval != -ENOEXEC ||
1403 bprm->mm == NULL || bprm->file == NULL) {
1404 put_binfmt(fmt);
1405 return retval;
1406 }
1407 read_lock(&binfmt_lock);
1408 put_binfmt(fmt);
1409 }
1410 read_unlock(&binfmt_lock);
1411
1412 if (need_retry && retval == -ENOEXEC) {
1413 if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
1414 printable(bprm->buf[2]) && printable(bprm->buf[3]))
1415 return retval;
1416 if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0)
1417 return retval;
1418 need_retry = false;
1419 goto retry;
1420 }
1421
1422 return retval;
1423}
1424EXPORT_SYMBOL(search_binary_handler);
1425
1426static int exec_binprm(struct linux_binprm *bprm)
1427{
1428 pid_t old_pid, old_vpid;
1429 int ret;
1430
1390 /* Need to fetch pid before load_binary changes it */ 1431 /* Need to fetch pid before load_binary changes it */
1391 old_pid = current->pid; 1432 old_pid = current->pid;
1392 rcu_read_lock(); 1433 rcu_read_lock();
1393 old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); 1434 old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
1394 rcu_read_unlock(); 1435 rcu_read_unlock();
1395 1436
1396 retval = -ENOENT; 1437 ret = search_binary_handler(bprm);
1397 for (try=0; try<2; try++) { 1438 if (ret >= 0) {
1398 read_lock(&binfmt_lock); 1439 trace_sched_process_exec(current, old_pid, bprm);
1399 list_for_each_entry(fmt, &formats, lh) { 1440 ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
1400 int (*fn)(struct linux_binprm *) = fmt->load_binary; 1441 current->did_exec = 1;
1401 if (!fn) 1442 proc_exec_connector(current);
1402 continue; 1443
1403 if (!try_module_get(fmt->module)) 1444 if (bprm->file) {
1404 continue; 1445 allow_write_access(bprm->file);
1405 read_unlock(&binfmt_lock); 1446 fput(bprm->file);
1406 bprm->recursion_depth = depth + 1; 1447 bprm->file = NULL; /* to catch use-after-free */
1407 retval = fn(bprm);
1408 bprm->recursion_depth = depth;
1409 if (retval >= 0) {
1410 if (depth == 0) {
1411 trace_sched_process_exec(current, old_pid, bprm);
1412 ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
1413 }
1414 put_binfmt(fmt);
1415 allow_write_access(bprm->file);
1416 if (bprm->file)
1417 fput(bprm->file);
1418 bprm->file = NULL;
1419 current->did_exec = 1;
1420 proc_exec_connector(current);
1421 return retval;
1422 }
1423 read_lock(&binfmt_lock);
1424 put_binfmt(fmt);
1425 if (retval != -ENOEXEC || bprm->mm == NULL)
1426 break;
1427 if (!bprm->file) {
1428 read_unlock(&binfmt_lock);
1429 return retval;
1430 }
1431 } 1448 }
1432 read_unlock(&binfmt_lock);
1433#ifdef CONFIG_MODULES
1434 if (retval != -ENOEXEC || bprm->mm == NULL) {
1435 break;
1436 } else {
1437#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
1438 if (printable(bprm->buf[0]) &&
1439 printable(bprm->buf[1]) &&
1440 printable(bprm->buf[2]) &&
1441 printable(bprm->buf[3]))
1442 break; /* -ENOEXEC */
1443 if (try)
1444 break; /* -ENOEXEC */
1445 request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
1446 }
1447#else
1448 break;
1449#endif
1450 } 1449 }
1451 return retval;
1452}
1453 1450
1454EXPORT_SYMBOL(search_binary_handler); 1451 return ret;
1452}
1455 1453
1456/* 1454/*
1457 * sys_execve() executes a new program. 1455 * sys_execve() executes a new program.
@@ -1541,7 +1539,7 @@ static int do_execve_common(const char *filename,
1541 if (retval < 0) 1539 if (retval < 0)
1542 goto out; 1540 goto out;
1543 1541
1544 retval = search_binary_handler(bprm); 1542 retval = exec_binprm(bprm);
1545 if (retval < 0) 1543 if (retval < 0)
1546 goto out; 1544 goto out;
1547 1545
diff --git a/fs/file_table.c b/fs/file_table.c
index 322cd37626cb..abdd15ad13c9 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -311,8 +311,7 @@ void fput(struct file *file)
311 return; 311 return;
312 /* 312 /*
313 * After this task has run exit_task_work(), 313 * After this task has run exit_task_work(),
314 * task_work_add() will fail. free_ipc_ns()-> 314 * task_work_add() will fail. Fall through to delayed
315 * shm_destroy() can do this. Fall through to delayed
316 * fput to avoid leaking *file. 315 * fput to avoid leaking *file.
317 */ 316 */
318 } 317 }
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 68851ff2fd41..30f6f27d5a59 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -723,7 +723,7 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
723 return wrote; 723 return wrote;
724} 724}
725 725
726long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, 726static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
727 enum wb_reason reason) 727 enum wb_reason reason)
728{ 728{
729 struct wb_writeback_work work = { 729 struct wb_writeback_work work = {
@@ -1049,10 +1049,8 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
1049{ 1049{
1050 struct backing_dev_info *bdi; 1050 struct backing_dev_info *bdi;
1051 1051
1052 if (!nr_pages) { 1052 if (!nr_pages)
1053 nr_pages = global_page_state(NR_FILE_DIRTY) + 1053 nr_pages = get_nr_dirty_pages();
1054 global_page_state(NR_UNSTABLE_NFS);
1055 }
1056 1054
1057 rcu_read_lock(); 1055 rcu_read_lock();
1058 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 1056 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
@@ -1173,6 +1171,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1173 bool wakeup_bdi = false; 1171 bool wakeup_bdi = false;
1174 bdi = inode_to_bdi(inode); 1172 bdi = inode_to_bdi(inode);
1175 1173
1174 spin_unlock(&inode->i_lock);
1175 spin_lock(&bdi->wb.list_lock);
1176 if (bdi_cap_writeback_dirty(bdi)) { 1176 if (bdi_cap_writeback_dirty(bdi)) {
1177 WARN(!test_bit(BDI_registered, &bdi->state), 1177 WARN(!test_bit(BDI_registered, &bdi->state),
1178 "bdi-%s not registered\n", bdi->name); 1178 "bdi-%s not registered\n", bdi->name);
@@ -1187,8 +1187,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1187 wakeup_bdi = true; 1187 wakeup_bdi = true;
1188 } 1188 }
1189 1189
1190 spin_unlock(&inode->i_lock);
1191 spin_lock(&bdi->wb.list_lock);
1192 inode->dirtied_when = jiffies; 1190 inode->dirtied_when = jiffies;
1193 list_move(&inode->i_wb_list, &bdi->wb.b_dirty); 1191 list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
1194 spin_unlock(&bdi->wb.list_lock); 1192 spin_unlock(&bdi->wb.list_lock);
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 8702b732109a..73899c1c3449 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -913,7 +913,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
913 (1 << FSCACHE_OP_WAITING) | 913 (1 << FSCACHE_OP_WAITING) |
914 (1 << FSCACHE_OP_UNUSE_COOKIE); 914 (1 << FSCACHE_OP_UNUSE_COOKIE);
915 915
916 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); 916 ret = radix_tree_maybe_preload(gfp & ~__GFP_HIGHMEM);
917 if (ret < 0) 917 if (ret < 0)
918 goto nomem_free; 918 goto nomem_free;
919 919
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e0fe703ee3d6..84434594e80e 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -930,7 +930,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
930 fc->bdi.name = "fuse"; 930 fc->bdi.name = "fuse";
931 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 931 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
932 /* fuse does it's own writeback accounting */ 932 /* fuse does it's own writeback accounting */
933 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; 933 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
934 934
935 err = bdi_init(&fc->bdi); 935 err = bdi_init(&fc->bdi);
936 if (err) 936 if (err)
diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig
index a63371815aab..24bc20fd42f7 100644
--- a/fs/hfsplus/Kconfig
+++ b/fs/hfsplus/Kconfig
@@ -11,3 +11,21 @@ config HFSPLUS_FS
11 MacOS 8. It includes all Mac specific filesystem data such as 11 MacOS 8. It includes all Mac specific filesystem data such as
12 data forks and creator codes, but it also has several UNIX 12 data forks and creator codes, but it also has several UNIX
13 style features such as file ownership and permissions. 13 style features such as file ownership and permissions.
14
15config HFSPLUS_FS_POSIX_ACL
16 bool "HFS+ POSIX Access Control Lists"
17 depends on HFSPLUS_FS
18 select FS_POSIX_ACL
19 help
20 POSIX Access Control Lists (ACLs) support permissions for users and
21 groups beyond the owner/group/world scheme.
22
23 To learn more about Access Control Lists, visit the POSIX ACLs for
24 Linux website <http://acl.bestbits.at/>.
25
26 Note that POSIX ACLs are interpreted only by Linux; they carry no
27 meaning under Mac OS X. Mac OS X, beginning with version 10.4
28 ("Tiger"), supports NFSv4 ACLs, which are part of the NFSv4
29 standard.
30
31 If you don't know what Access Control Lists are, say N
diff --git a/fs/hfsplus/Makefile b/fs/hfsplus/Makefile
index 09d278bb7b91..683fca2e5e65 100644
--- a/fs/hfsplus/Makefile
+++ b/fs/hfsplus/Makefile
@@ -7,3 +7,5 @@ obj-$(CONFIG_HFSPLUS_FS) += hfsplus.o
7hfsplus-objs := super.o options.o inode.o ioctl.o extents.o catalog.o dir.o btree.o \ 7hfsplus-objs := super.o options.o inode.o ioctl.o extents.o catalog.o dir.o btree.o \
8 bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o \ 8 bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o \
9 attributes.o xattr.o xattr_user.o xattr_security.o xattr_trusted.o 9 attributes.o xattr.o xattr_user.o xattr_security.o xattr_trusted.o
10
11hfsplus-$(CONFIG_HFSPLUS_FS_POSIX_ACL) += posix_acl.o
diff --git a/fs/hfsplus/acl.h b/fs/hfsplus/acl.h
new file mode 100644
index 000000000000..07c0d4947527
--- /dev/null
+++ b/fs/hfsplus/acl.h
@@ -0,0 +1,30 @@
1/*
2 * linux/fs/hfsplus/acl.h
3 *
4 * Vyacheslav Dubeyko <slava@dubeyko.com>
5 *
6 * Handler for Posix Access Control Lists (ACLs) support.
7 */
8
9#include <linux/posix_acl_xattr.h>
10
11#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
12
13/* posix_acl.c */
14struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type);
15extern int hfsplus_posix_acl_chmod(struct inode *);
16extern int hfsplus_init_posix_acl(struct inode *, struct inode *);
17
18#else /* CONFIG_HFSPLUS_FS_POSIX_ACL */
19#define hfsplus_get_posix_acl NULL
20
21static inline int hfsplus_posix_acl_chmod(struct inode *inode)
22{
23 return 0;
24}
25
26static inline int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir)
27{
28 return 0;
29}
30#endif /* CONFIG_HFSPLUS_FS_POSIX_ACL */
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index d8ce4bd17fc5..4a4fea002673 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -16,6 +16,7 @@
16#include "hfsplus_fs.h" 16#include "hfsplus_fs.h"
17#include "hfsplus_raw.h" 17#include "hfsplus_raw.h"
18#include "xattr.h" 18#include "xattr.h"
19#include "acl.h"
19 20
20static inline void hfsplus_instantiate(struct dentry *dentry, 21static inline void hfsplus_instantiate(struct dentry *dentry,
21 struct inode *inode, u32 cnid) 22 struct inode *inode, u32 cnid)
@@ -529,6 +530,9 @@ const struct inode_operations hfsplus_dir_inode_operations = {
529 .getxattr = generic_getxattr, 530 .getxattr = generic_getxattr,
530 .listxattr = hfsplus_listxattr, 531 .listxattr = hfsplus_listxattr,
531 .removexattr = hfsplus_removexattr, 532 .removexattr = hfsplus_removexattr,
533#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
534 .get_acl = hfsplus_get_posix_acl,
535#endif
532}; 536};
533 537
534const struct file_operations hfsplus_dir_operations = { 538const struct file_operations hfsplus_dir_operations = {
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index ede79317cfb8..2b9cd01696e2 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -30,6 +30,7 @@
30#define DBG_EXTENT 0x00000020 30#define DBG_EXTENT 0x00000020
31#define DBG_BITMAP 0x00000040 31#define DBG_BITMAP 0x00000040
32#define DBG_ATTR_MOD 0x00000080 32#define DBG_ATTR_MOD 0x00000080
33#define DBG_ACL_MOD 0x00000100
33 34
34#if 0 35#if 0
35#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD) 36#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD)
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index f833d35630ab..4d2edaea891c 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -19,6 +19,7 @@
19#include "hfsplus_fs.h" 19#include "hfsplus_fs.h"
20#include "hfsplus_raw.h" 20#include "hfsplus_raw.h"
21#include "xattr.h" 21#include "xattr.h"
22#include "acl.h"
22 23
23static int hfsplus_readpage(struct file *file, struct page *page) 24static int hfsplus_readpage(struct file *file, struct page *page)
24{ 25{
@@ -316,6 +317,13 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
316 317
317 setattr_copy(inode, attr); 318 setattr_copy(inode, attr);
318 mark_inode_dirty(inode); 319 mark_inode_dirty(inode);
320
321 if (attr->ia_valid & ATTR_MODE) {
322 error = hfsplus_posix_acl_chmod(inode);
323 if (unlikely(error))
324 return error;
325 }
326
319 return 0; 327 return 0;
320} 328}
321 329
@@ -383,6 +391,9 @@ static const struct inode_operations hfsplus_file_inode_operations = {
383 .getxattr = generic_getxattr, 391 .getxattr = generic_getxattr,
384 .listxattr = hfsplus_listxattr, 392 .listxattr = hfsplus_listxattr,
385 .removexattr = hfsplus_removexattr, 393 .removexattr = hfsplus_removexattr,
394#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
395 .get_acl = hfsplus_get_posix_acl,
396#endif
386}; 397};
387 398
388static const struct file_operations hfsplus_file_operations = { 399static const struct file_operations hfsplus_file_operations = {
diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c
new file mode 100644
index 000000000000..b609cc14c72e
--- /dev/null
+++ b/fs/hfsplus/posix_acl.c
@@ -0,0 +1,274 @@
1/*
2 * linux/fs/hfsplus/posix_acl.c
3 *
4 * Vyacheslav Dubeyko <slava@dubeyko.com>
5 *
6 * Handler for Posix Access Control Lists (ACLs) support.
7 */
8
9#include "hfsplus_fs.h"
10#include "xattr.h"
11#include "acl.h"
12
13struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type)
14{
15 struct posix_acl *acl;
16 char *xattr_name;
17 char *value = NULL;
18 ssize_t size;
19
20 acl = get_cached_acl(inode, type);
21 if (acl != ACL_NOT_CACHED)
22 return acl;
23
24 switch (type) {
25 case ACL_TYPE_ACCESS:
26 xattr_name = POSIX_ACL_XATTR_ACCESS;
27 break;
28 case ACL_TYPE_DEFAULT:
29 xattr_name = POSIX_ACL_XATTR_DEFAULT;
30 break;
31 default:
32 return ERR_PTR(-EINVAL);
33 }
34
35 size = __hfsplus_getxattr(inode, xattr_name, NULL, 0);
36
37 if (size > 0) {
38 value = (char *)hfsplus_alloc_attr_entry();
39 if (unlikely(!value))
40 return ERR_PTR(-ENOMEM);
41 size = __hfsplus_getxattr(inode, xattr_name, value, size);
42 }
43
44 if (size > 0)
45 acl = posix_acl_from_xattr(&init_user_ns, value, size);
46 else if (size == -ENODATA)
47 acl = NULL;
48 else
49 acl = ERR_PTR(size);
50
51 hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value);
52
53 if (!IS_ERR(acl))
54 set_cached_acl(inode, type, acl);
55
56 return acl;
57}
58
59static int hfsplus_set_posix_acl(struct inode *inode,
60 int type,
61 struct posix_acl *acl)
62{
63 int err;
64 char *xattr_name;
65 size_t size = 0;
66 char *value = NULL;
67
68 if (S_ISLNK(inode->i_mode))
69 return -EOPNOTSUPP;
70
71 switch (type) {
72 case ACL_TYPE_ACCESS:
73 xattr_name = POSIX_ACL_XATTR_ACCESS;
74 if (acl) {
75 err = posix_acl_equiv_mode(acl, &inode->i_mode);
76 if (err < 0)
77 return err;
78 }
79 err = 0;
80 break;
81
82 case ACL_TYPE_DEFAULT:
83 xattr_name = POSIX_ACL_XATTR_DEFAULT;
84 if (!S_ISDIR(inode->i_mode))
85 return acl ? -EACCES : 0;
86 break;
87
88 default:
89 return -EINVAL;
90 }
91
92 if (acl) {
93 size = posix_acl_xattr_size(acl->a_count);
94 if (unlikely(size > HFSPLUS_MAX_INLINE_DATA_SIZE))
95 return -ENOMEM;
96 value = (char *)hfsplus_alloc_attr_entry();
97 if (unlikely(!value))
98 return -ENOMEM;
99 err = posix_acl_to_xattr(&init_user_ns, acl, value, size);
100 if (unlikely(err < 0))
101 goto end_set_acl;
102 }
103
104 err = __hfsplus_setxattr(inode, xattr_name, value, size, 0);
105
106end_set_acl:
107 hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value);
108
109 if (!err)
110 set_cached_acl(inode, type, acl);
111
112 return err;
113}
114
115int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir)
116{
117 int err = 0;
118 struct posix_acl *acl = NULL;
119
120 hfs_dbg(ACL_MOD,
121 "[%s]: ino %lu, dir->ino %lu\n",
122 __func__, inode->i_ino, dir->i_ino);
123
124 if (S_ISLNK(inode->i_mode))
125 return 0;
126
127 acl = hfsplus_get_posix_acl(dir, ACL_TYPE_DEFAULT);
128 if (IS_ERR(acl))
129 return PTR_ERR(acl);
130
131 if (acl) {
132 if (S_ISDIR(inode->i_mode)) {
133 err = hfsplus_set_posix_acl(inode,
134 ACL_TYPE_DEFAULT,
135 acl);
136 if (unlikely(err))
137 goto init_acl_cleanup;
138 }
139
140 err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
141 if (unlikely(err < 0))
142 return err;
143
144 if (err > 0)
145 err = hfsplus_set_posix_acl(inode,
146 ACL_TYPE_ACCESS,
147 acl);
148 } else
149 inode->i_mode &= ~current_umask();
150
151init_acl_cleanup:
152 posix_acl_release(acl);
153 return err;
154}
155
156int hfsplus_posix_acl_chmod(struct inode *inode)
157{
158 int err;
159 struct posix_acl *acl;
160
161 hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino);
162
163 if (S_ISLNK(inode->i_mode))
164 return -EOPNOTSUPP;
165
166 acl = hfsplus_get_posix_acl(inode, ACL_TYPE_ACCESS);
167 if (IS_ERR(acl) || !acl)
168 return PTR_ERR(acl);
169
170 err = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
171 if (unlikely(err))
172 return err;
173
174 err = hfsplus_set_posix_acl(inode, ACL_TYPE_ACCESS, acl);
175 posix_acl_release(acl);
176 return err;
177}
178
179static int hfsplus_xattr_get_posix_acl(struct dentry *dentry,
180 const char *name,
181 void *buffer,
182 size_t size,
183 int type)
184{
185 int err = 0;
186 struct posix_acl *acl;
187
188 hfs_dbg(ACL_MOD,
189 "[%s]: ino %lu, buffer %p, size %zu, type %#x\n",
190 __func__, dentry->d_inode->i_ino, buffer, size, type);
191
192 if (strcmp(name, "") != 0)
193 return -EINVAL;
194
195 acl = hfsplus_get_posix_acl(dentry->d_inode, type);
196 if (IS_ERR(acl))
197 return PTR_ERR(acl);
198 if (acl == NULL)
199 return -ENODATA;
200
201 err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
202 posix_acl_release(acl);
203
204 return err;
205}
206
207static int hfsplus_xattr_set_posix_acl(struct dentry *dentry,
208 const char *name,
209 const void *value,
210 size_t size,
211 int flags,
212 int type)
213{
214 int err = 0;
215 struct inode *inode = dentry->d_inode;
216 struct posix_acl *acl = NULL;
217
218 hfs_dbg(ACL_MOD,
219 "[%s]: ino %lu, value %p, size %zu, flags %#x, type %#x\n",
220 __func__, inode->i_ino, value, size, flags, type);
221
222 if (strcmp(name, "") != 0)
223 return -EINVAL;
224
225 if (!inode_owner_or_capable(inode))
226 return -EPERM;
227
228 if (value) {
229 acl = posix_acl_from_xattr(&init_user_ns, value, size);
230 if (IS_ERR(acl))
231 return PTR_ERR(acl);
232 else if (acl) {
233 err = posix_acl_valid(acl);
234 if (err)
235 goto end_xattr_set_acl;
236 }
237 }
238
239 err = hfsplus_set_posix_acl(inode, type, acl);
240
241end_xattr_set_acl:
242 posix_acl_release(acl);
243 return err;
244}
245
246static size_t hfsplus_xattr_list_posix_acl(struct dentry *dentry,
247 char *list,
248 size_t list_size,
249 const char *name,
250 size_t name_len,
251 int type)
252{
253 /*
254 * This method is not used.
255 * hfsplus_listxattr() is used instead of generic_listxattr().
256 */
257 return -EOPNOTSUPP;
258}
259
260const struct xattr_handler hfsplus_xattr_acl_access_handler = {
261 .prefix = POSIX_ACL_XATTR_ACCESS,
262 .flags = ACL_TYPE_ACCESS,
263 .list = hfsplus_xattr_list_posix_acl,
264 .get = hfsplus_xattr_get_posix_acl,
265 .set = hfsplus_xattr_set_posix_acl,
266};
267
268const struct xattr_handler hfsplus_xattr_acl_default_handler = {
269 .prefix = POSIX_ACL_XATTR_DEFAULT,
270 .flags = ACL_TYPE_DEFAULT,
271 .list = hfsplus_xattr_list_posix_acl,
272 .get = hfsplus_xattr_get_posix_acl,
273 .set = hfsplus_xattr_set_posix_acl,
274};
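
hfsplus_get_posix_acl() above reads the ACL xattr in two passes: a first __hfsplus_getxattr() call with a NULL buffer to learn the attribute size, then a second call into a freshly allocated buffer. The same idiom is visible from user space through getxattr(2); the sketch below is a stand-alone analogue of that size-query-then-read pattern, not the kernel code itself.

/*
 * User-space analogue of the pattern used by hfsplus_get_posix_acl():
 * ask for the size first, then read the attribute for real.
 * Illustrative only; error handling is minimal.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/xattr.h>

static void *read_xattr(const char *path, const char *name, ssize_t *out_len)
{
	ssize_t size = getxattr(path, name, NULL, 0);	/* size query */
	void *buf;

	if (size <= 0)
		return NULL;

	buf = malloc(size);
	if (!buf)
		return NULL;

	size = getxattr(path, name, buf, size);		/* actual read */
	if (size < 0) {
		free(buf);
		return NULL;
	}
	*out_len = size;
	return buf;
}

int main(int argc, char **argv)
{
	ssize_t len = 0;
	void *v;

	if (argc < 2)
		return 1;

	v = read_xattr(argv[1], "system.posix_acl_access", &len);
	if (v)
		printf("got %zd bytes of ACL xattr\n", len);
	free(v);
	return 0;
}
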
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index f66346155df5..bd8471fb9a6a 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -8,11 +8,16 @@
8 8
9#include "hfsplus_fs.h" 9#include "hfsplus_fs.h"
10#include "xattr.h" 10#include "xattr.h"
11#include "acl.h"
11 12
12const struct xattr_handler *hfsplus_xattr_handlers[] = { 13const struct xattr_handler *hfsplus_xattr_handlers[] = {
13 &hfsplus_xattr_osx_handler, 14 &hfsplus_xattr_osx_handler,
14 &hfsplus_xattr_user_handler, 15 &hfsplus_xattr_user_handler,
15 &hfsplus_xattr_trusted_handler, 16 &hfsplus_xattr_trusted_handler,
17#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
18 &hfsplus_xattr_acl_access_handler,
19 &hfsplus_xattr_acl_default_handler,
20#endif
16 &hfsplus_xattr_security_handler, 21 &hfsplus_xattr_security_handler,
17 NULL 22 NULL
18}; 23};
@@ -46,11 +51,58 @@ static inline int is_known_namespace(const char *name)
46 return true; 51 return true;
47} 52}
48 53
54static int can_set_system_xattr(struct inode *inode, const char *name,
55 const void *value, size_t size)
56{
57#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
58 struct posix_acl *acl;
59 int err;
60
61 if (!inode_owner_or_capable(inode))
62 return -EPERM;
63
64 /*
65 * POSIX_ACL_XATTR_ACCESS is tied to i_mode
66 */
67 if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) {
68 acl = posix_acl_from_xattr(&init_user_ns, value, size);
69 if (IS_ERR(acl))
70 return PTR_ERR(acl);
71 if (acl) {
72 err = posix_acl_equiv_mode(acl, &inode->i_mode);
73 posix_acl_release(acl);
74 if (err < 0)
75 return err;
76 mark_inode_dirty(inode);
77 }
78 /*
79 * We're changing the ACL. Get rid of the cached one
80 */
81 forget_cached_acl(inode, ACL_TYPE_ACCESS);
82
83 return 0;
84 } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) {
85 acl = posix_acl_from_xattr(&init_user_ns, value, size);
86 if (IS_ERR(acl))
87 return PTR_ERR(acl);
88 posix_acl_release(acl);
89
90 /*
91 * We're changing the default ACL. Get rid of the cached one
92 */
93 forget_cached_acl(inode, ACL_TYPE_DEFAULT);
94
95 return 0;
96 }
97#endif /* CONFIG_HFSPLUS_FS_POSIX_ACL */
98 return -EOPNOTSUPP;
99}
100
49static int can_set_xattr(struct inode *inode, const char *name, 101static int can_set_xattr(struct inode *inode, const char *name,
50 const void *value, size_t value_len) 102 const void *value, size_t value_len)
51{ 103{
52 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 104 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
53 return -EOPNOTSUPP; /* TODO: implement ACL support */ 105 return can_set_system_xattr(inode, name, value, value_len);
54 106
55 if (!strncmp(name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN)) { 107 if (!strncmp(name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN)) {
56 /* 108 /*
@@ -253,11 +305,10 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len)
253 return len; 305 return len;
254} 306}
255 307
256static ssize_t hfsplus_getxattr_finder_info(struct dentry *dentry, 308static ssize_t hfsplus_getxattr_finder_info(struct inode *inode,
257 void *value, size_t size) 309 void *value, size_t size)
258{ 310{
259 ssize_t res = 0; 311 ssize_t res = 0;
260 struct inode *inode = dentry->d_inode;
261 struct hfs_find_data fd; 312 struct hfs_find_data fd;
262 u16 entry_type; 313 u16 entry_type;
263 u16 folder_rec_len = sizeof(struct DInfo) + sizeof(struct DXInfo); 314 u16 folder_rec_len = sizeof(struct DInfo) + sizeof(struct DXInfo);
@@ -304,10 +355,9 @@ end_getxattr_finder_info:
304 return res; 355 return res;
305} 356}
306 357
307ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, 358ssize_t __hfsplus_getxattr(struct inode *inode, const char *name,
308 void *value, size_t size) 359 void *value, size_t size)
309{ 360{
310 struct inode *inode = dentry->d_inode;
311 struct hfs_find_data fd; 361 struct hfs_find_data fd;
312 hfsplus_attr_entry *entry; 362 hfsplus_attr_entry *entry;
313 __be32 xattr_record_type; 363 __be32 xattr_record_type;
@@ -333,7 +383,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
333 } 383 }
334 384
335 if (!strcmp_xattr_finder_info(name)) 385 if (!strcmp_xattr_finder_info(name))
336 return hfsplus_getxattr_finder_info(dentry, value, size); 386 return hfsplus_getxattr_finder_info(inode, value, size);
337 387
338 if (!HFSPLUS_SB(inode->i_sb)->attr_tree) 388 if (!HFSPLUS_SB(inode->i_sb)->attr_tree)
339 return -EOPNOTSUPP; 389 return -EOPNOTSUPP;
diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h
index 847b695b984d..841b5698c0fc 100644
--- a/fs/hfsplus/xattr.h
+++ b/fs/hfsplus/xattr.h
@@ -14,8 +14,8 @@
14extern const struct xattr_handler hfsplus_xattr_osx_handler; 14extern const struct xattr_handler hfsplus_xattr_osx_handler;
15extern const struct xattr_handler hfsplus_xattr_user_handler; 15extern const struct xattr_handler hfsplus_xattr_user_handler;
16extern const struct xattr_handler hfsplus_xattr_trusted_handler; 16extern const struct xattr_handler hfsplus_xattr_trusted_handler;
17/*extern const struct xattr_handler hfsplus_xattr_acl_access_handler;*/ 17extern const struct xattr_handler hfsplus_xattr_acl_access_handler;
18/*extern const struct xattr_handler hfsplus_xattr_acl_default_handler;*/ 18extern const struct xattr_handler hfsplus_xattr_acl_default_handler;
19extern const struct xattr_handler hfsplus_xattr_security_handler; 19extern const struct xattr_handler hfsplus_xattr_security_handler;
20 20
21extern const struct xattr_handler *hfsplus_xattr_handlers[]; 21extern const struct xattr_handler *hfsplus_xattr_handlers[];
@@ -29,9 +29,17 @@ static inline int hfsplus_setxattr(struct dentry *dentry, const char *name,
29 return __hfsplus_setxattr(dentry->d_inode, name, value, size, flags); 29 return __hfsplus_setxattr(dentry->d_inode, name, value, size, flags);
30} 30}
31 31
32ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, 32ssize_t __hfsplus_getxattr(struct inode *inode, const char *name,
33 void *value, size_t size); 33 void *value, size_t size);
34 34
35static inline ssize_t hfsplus_getxattr(struct dentry *dentry,
36 const char *name,
37 void *value,
38 size_t size)
39{
40 return __hfsplus_getxattr(dentry->d_inode, name, value, size);
41}
42
35ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size); 43ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);
36 44
37int hfsplus_removexattr(struct dentry *dentry, const char *name); 45int hfsplus_removexattr(struct dentry *dentry, const char *name);
@@ -39,22 +47,7 @@ int hfsplus_removexattr(struct dentry *dentry, const char *name);
39int hfsplus_init_security(struct inode *inode, struct inode *dir, 47int hfsplus_init_security(struct inode *inode, struct inode *dir,
40 const struct qstr *qstr); 48 const struct qstr *qstr);
41 49
42static inline int hfsplus_init_acl(struct inode *inode, struct inode *dir) 50int hfsplus_init_inode_security(struct inode *inode, struct inode *dir,
43{ 51 const struct qstr *qstr);
44 /*TODO: implement*/
45 return 0;
46}
47
48static inline int hfsplus_init_inode_security(struct inode *inode,
49 struct inode *dir,
50 const struct qstr *qstr)
51{
52 int err;
53
54 err = hfsplus_init_acl(inode, dir);
55 if (!err)
56 err = hfsplus_init_security(inode, dir, qstr);
57 return err;
58}
59 52
60#endif 53#endif
diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c
index 83b842f113c5..00722765ea79 100644
--- a/fs/hfsplus/xattr_security.c
+++ b/fs/hfsplus/xattr_security.c
@@ -9,6 +9,7 @@
9#include <linux/security.h> 9#include <linux/security.h>
10#include "hfsplus_fs.h" 10#include "hfsplus_fs.h"
11#include "xattr.h" 11#include "xattr.h"
12#include "acl.h"
12 13
13static int hfsplus_security_getxattr(struct dentry *dentry, const char *name, 14static int hfsplus_security_getxattr(struct dentry *dentry, const char *name,
14 void *buffer, size_t size, int type) 15 void *buffer, size_t size, int type)
@@ -96,6 +97,18 @@ int hfsplus_init_security(struct inode *inode, struct inode *dir,
96 &hfsplus_initxattrs, NULL); 97 &hfsplus_initxattrs, NULL);
97} 98}
98 99
100int hfsplus_init_inode_security(struct inode *inode,
101 struct inode *dir,
102 const struct qstr *qstr)
103{
104 int err;
105
106 err = hfsplus_init_posix_acl(inode, dir);
107 if (!err)
108 err = hfsplus_init_security(inode, dir, qstr);
109 return err;
110}
111
99const struct xattr_handler hfsplus_xattr_security_handler = { 112const struct xattr_handler hfsplus_xattr_security_handler = {
100 .prefix = XATTR_SECURITY_PREFIX, 113 .prefix = XATTR_SECURITY_PREFIX,
101 .list = hfsplus_security_listxattr, 114 .list = hfsplus_security_listxattr,
diff --git a/fs/namespace.c b/fs/namespace.c
index 25845d1b300b..da5c49483430 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -17,7 +17,7 @@
17#include <linux/security.h> 17#include <linux/security.h>
18#include <linux/idr.h> 18#include <linux/idr.h>
19#include <linux/acct.h> /* acct_auto_close_mnt */ 19#include <linux/acct.h> /* acct_auto_close_mnt */
20#include <linux/ramfs.h> /* init_rootfs */ 20#include <linux/init.h> /* init_rootfs */
21#include <linux/fs_struct.h> /* get_fs_root et.al. */ 21#include <linux/fs_struct.h> /* get_fs_root et.al. */
22#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ 22#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
23#include <linux/uaccess.h> 23#include <linux/uaccess.h>
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 8a404576fb26..b4f788e0ca31 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -51,10 +51,6 @@ static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)
51 return ERR_PTR(-EINVAL); 51 return ERR_PTR(-EINVAL);
52 52
53 count = size / sizeof(struct posix_acl_entry); 53 count = size / sizeof(struct posix_acl_entry);
54 if (count < 0)
55 return ERR_PTR(-EINVAL);
56 if (count == 0)
57 return NULL;
58 54
59 acl = posix_acl_alloc(count, GFP_NOFS); 55 acl = posix_acl_alloc(count, GFP_NOFS);
60 if (!acl) 56 if (!acl)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 94417a85ce6e..f37d3c0e2053 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2044,7 +2044,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
2044 2044
2045out_write_size: 2045out_write_size:
2046 pos += copied; 2046 pos += copied;
2047 if (pos > inode->i_size) { 2047 if (pos > i_size_read(inode)) {
2048 i_size_write(inode, pos); 2048 i_size_write(inode, pos);
2049 mark_inode_dirty(inode); 2049 mark_inode_dirty(inode);
2050 } 2050 }
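
This hunk, like several below (extent_map.c, ioctl.c, journal.c, move_extents.c and the quota files), replaces direct inode->i_size reads with i_size_read(). i_size is 64 bits wide, and 32-bit SMP kernels cannot load it atomically, so i_size_read() retries the read under a sequence counter. The snippet below is a small user-space imitation of that retry loop, written only to show the idea; the real helper lives in include/linux/fs.h.

/*
 * Simplified imitation of a seqcount-protected 64-bit size read.
 * Not the kernel implementation; stand-alone and compilable with gcc.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct fake_inode {
	atomic_uint i_size_seq;		/* even = stable, odd = write in progress */
	volatile uint32_t i_size_lo, i_size_hi;
};

static uint64_t fake_i_size_read(struct fake_inode *inode)
{
	unsigned int seq;
	uint32_t lo, hi;

	do {
		seq = atomic_load(&inode->i_size_seq);
		lo = inode->i_size_lo;
		hi = inode->i_size_hi;
	} while ((seq & 1) || atomic_load(&inode->i_size_seq) != seq);

	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	struct fake_inode ino = { 0 };

	ino.i_size_lo = 4096;
	printf("i_size = %llu\n",
	       (unsigned long long)fake_i_size_read(&ino));
	return 0;
}
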
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 5c1c864e81cc..363f0dcc924f 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -628,11 +628,9 @@ static void o2hb_fire_callbacks(struct o2hb_callback *hbcall,
628 struct o2nm_node *node, 628 struct o2nm_node *node,
629 int idx) 629 int idx)
630{ 630{
631 struct list_head *iter;
632 struct o2hb_callback_func *f; 631 struct o2hb_callback_func *f;
633 632
634 list_for_each(iter, &hbcall->list) { 633 list_for_each_entry(f, &hbcall->list, hc_item) {
635 f = list_entry(iter, struct o2hb_callback_func, hc_item);
636 mlog(ML_HEARTBEAT, "calling funcs %p\n", f); 634 mlog(ML_HEARTBEAT, "calling funcs %p\n", f);
637 (f->hc_func)(node, idx, f->hc_data); 635 (f->hc_func)(node, idx, f->hc_data);
638 } 636 }
@@ -641,16 +639,9 @@ static void o2hb_fire_callbacks(struct o2hb_callback *hbcall,
641/* Will run the list in order until we process the passed event */ 639/* Will run the list in order until we process the passed event */
642static void o2hb_run_event_list(struct o2hb_node_event *queued_event) 640static void o2hb_run_event_list(struct o2hb_node_event *queued_event)
643{ 641{
644 int empty;
645 struct o2hb_callback *hbcall; 642 struct o2hb_callback *hbcall;
646 struct o2hb_node_event *event; 643 struct o2hb_node_event *event;
647 644
648 spin_lock(&o2hb_live_lock);
649 empty = list_empty(&queued_event->hn_item);
650 spin_unlock(&o2hb_live_lock);
651 if (empty)
652 return;
653
654 /* Holding callback sem assures we don't alter the callback 645 /* Holding callback sem assures we don't alter the callback
655 * lists when doing this, and serializes ourselves with other 646 * lists when doing this, and serializes ourselves with other
656 * processes wanting callbacks. */ 647 * processes wanting callbacks. */
@@ -709,6 +700,7 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
709 struct o2hb_node_event event = 700 struct o2hb_node_event event =
710 { .hn_item = LIST_HEAD_INIT(event.hn_item), }; 701 { .hn_item = LIST_HEAD_INIT(event.hn_item), };
711 struct o2nm_node *node; 702 struct o2nm_node *node;
703 int queued = 0;
712 704
713 node = o2nm_get_node_by_num(slot->ds_node_num); 705 node = o2nm_get_node_by_num(slot->ds_node_num);
714 if (!node) 706 if (!node)
@@ -726,11 +718,13 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
726 718
727 o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, 719 o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
728 slot->ds_node_num); 720 slot->ds_node_num);
721 queued = 1;
729 } 722 }
730 } 723 }
731 spin_unlock(&o2hb_live_lock); 724 spin_unlock(&o2hb_live_lock);
732 725
733 o2hb_run_event_list(&event); 726 if (queued)
727 o2hb_run_event_list(&event);
734 728
735 o2nm_node_put(node); 729 o2nm_node_put(node);
736} 730}
@@ -790,6 +784,7 @@ static int o2hb_check_slot(struct o2hb_region *reg,
790 unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; 784 unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS;
791 unsigned int slot_dead_ms; 785 unsigned int slot_dead_ms;
792 int tmp; 786 int tmp;
787 int queued = 0;
793 788
794 memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); 789 memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
795 790
@@ -883,6 +878,7 @@ fire_callbacks:
883 slot->ds_node_num); 878 slot->ds_node_num);
884 879
885 changed = 1; 880 changed = 1;
881 queued = 1;
886 } 882 }
887 883
888 list_add_tail(&slot->ds_live_item, 884 list_add_tail(&slot->ds_live_item,
@@ -934,6 +930,7 @@ fire_callbacks:
934 node, slot->ds_node_num); 930 node, slot->ds_node_num);
935 931
936 changed = 1; 932 changed = 1;
933 queued = 1;
937 } 934 }
938 935
939 /* We don't clear this because the node is still 936 /* We don't clear this because the node is still
@@ -949,7 +946,8 @@ fire_callbacks:
949out: 946out:
950 spin_unlock(&o2hb_live_lock); 947 spin_unlock(&o2hb_live_lock);
951 948
952 o2hb_run_event_list(&event); 949 if (queued)
950 o2hb_run_event_list(&event);
953 951
954 if (node) 952 if (node)
955 o2nm_node_put(node); 953 o2nm_node_put(node);
@@ -2516,8 +2514,7 @@ unlock:
2516int o2hb_register_callback(const char *region_uuid, 2514int o2hb_register_callback(const char *region_uuid,
2517 struct o2hb_callback_func *hc) 2515 struct o2hb_callback_func *hc)
2518{ 2516{
2519 struct o2hb_callback_func *tmp; 2517 struct o2hb_callback_func *f;
2520 struct list_head *iter;
2521 struct o2hb_callback *hbcall; 2518 struct o2hb_callback *hbcall;
2522 int ret; 2519 int ret;
2523 2520
@@ -2540,10 +2537,9 @@ int o2hb_register_callback(const char *region_uuid,
2540 2537
2541 down_write(&o2hb_callback_sem); 2538 down_write(&o2hb_callback_sem);
2542 2539
2543 list_for_each(iter, &hbcall->list) { 2540 list_for_each_entry(f, &hbcall->list, hc_item) {
2544 tmp = list_entry(iter, struct o2hb_callback_func, hc_item); 2541 if (hc->hc_priority < f->hc_priority) {
2545 if (hc->hc_priority < tmp->hc_priority) { 2542 list_add_tail(&hc->hc_item, &f->hc_item);
2546 list_add_tail(&hc->hc_item, iter);
2547 break; 2543 break;
2548 } 2544 }
2549 } 2545 }
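
The heartbeat.c changes above, and a long run of dlm hunks below, are the same mechanical cleanup: an open-coded list_for_each() plus list_entry() pair becomes a single list_for_each_entry(), so the typed element is the loop cursor itself. A minimal user-space re-implementation of both forms, just to make the transformation concrete (the kernel's own macros in include/linux/list.h are the real thing):

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

#define list_for_each(pos, head) \
	for ((pos) = (head)->next; (pos) != (head); (pos) = (pos)->next)

#define list_for_each_entry(pos, head, member)				   \
	for ((pos) = container_of((head)->next, typeof(*(pos)), member);  \
	     &(pos)->member != (head);					   \
	     (pos) = container_of((pos)->member.next, typeof(*(pos)), member))

struct lock { int id; struct list_head list; };

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

int main(void)
{
	struct list_head granted = { &granted, &granted };
	struct lock a = { 1 }, b = { 2 };
	struct list_head *iter;
	struct lock *lock;

	list_add_tail(&a.list, &granted);
	list_add_tail(&b.list, &granted);

	/* old style: separate iterator plus list_entry() in the body */
	list_for_each(iter, &granted) {
		lock = container_of(iter, struct lock, list);
		printf("old style sees lock %d\n", lock->id);
	}

	/* new style: the typed cursor is the loop variable itself */
	list_for_each_entry(lock, &granted, list)
		printf("new style sees lock %d\n", lock->id);

	return 0;
}
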
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index d644dc611425..2cd2406b4140 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -543,8 +543,9 @@ static void o2net_set_nn_state(struct o2net_node *nn,
543 } 543 }
544 544
545 if (was_valid && !valid) { 545 if (was_valid && !valid) {
546 printk(KERN_NOTICE "o2net: No longer connected to " 546 if (old_sc)
547 SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); 547 printk(KERN_NOTICE "o2net: No longer connected to "
548 SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
548 o2net_complete_nodes_nsw(nn); 549 o2net_complete_nodes_nsw(nn);
549 } 550 }
550 551
@@ -765,32 +766,32 @@ static struct o2net_msg_handler *
765o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p, 766o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p,
766 struct rb_node **ret_parent) 767 struct rb_node **ret_parent)
767{ 768{
768 struct rb_node **p = &o2net_handler_tree.rb_node; 769 struct rb_node **p = &o2net_handler_tree.rb_node;
769 struct rb_node *parent = NULL; 770 struct rb_node *parent = NULL;
770 struct o2net_msg_handler *nmh, *ret = NULL; 771 struct o2net_msg_handler *nmh, *ret = NULL;
771 int cmp; 772 int cmp;
772 773
773 while (*p) { 774 while (*p) {
774 parent = *p; 775 parent = *p;
775 nmh = rb_entry(parent, struct o2net_msg_handler, nh_node); 776 nmh = rb_entry(parent, struct o2net_msg_handler, nh_node);
776 cmp = o2net_handler_cmp(nmh, msg_type, key); 777 cmp = o2net_handler_cmp(nmh, msg_type, key);
777 778
778 if (cmp < 0) 779 if (cmp < 0)
779 p = &(*p)->rb_left; 780 p = &(*p)->rb_left;
780 else if (cmp > 0) 781 else if (cmp > 0)
781 p = &(*p)->rb_right; 782 p = &(*p)->rb_right;
782 else { 783 else {
783 ret = nmh; 784 ret = nmh;
784 break; 785 break;
785 } 786 }
786 } 787 }
787 788
788 if (ret_p != NULL) 789 if (ret_p != NULL)
789 *ret_p = p; 790 *ret_p = p;
790 if (ret_parent != NULL) 791 if (ret_parent != NULL)
791 *ret_parent = parent; 792 *ret_parent = parent;
792 793
793 return ret; 794 return ret;
794} 795}
795 796
796static void o2net_handler_kref_release(struct kref *kref) 797static void o2net_handler_kref_release(struct kref *kref)
@@ -1695,13 +1696,12 @@ static void o2net_start_connect(struct work_struct *work)
1695 ret = 0; 1696 ret = 0;
1696 1697
1697out: 1698out:
1698 if (ret) { 1699 if (ret && sc) {
1699 printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT 1700 printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT
1700 " failed with errno %d\n", SC_NODEF_ARGS(sc), ret); 1701 " failed with errno %d\n", SC_NODEF_ARGS(sc), ret);
1701 /* 0 err so that another will be queued and attempted 1702 /* 0 err so that another will be queued and attempted
1702 * from set_nn_state */ 1703 * from set_nn_state */
1703 if (sc) 1704 o2net_ensure_shutdown(nn, sc, 0);
1704 o2net_ensure_shutdown(nn, sc, 0);
1705 } 1705 }
1706 if (sc) 1706 if (sc)
1707 sc_put(sc); 1707 sc_put(sc);
@@ -1873,12 +1873,16 @@ static int o2net_accept_one(struct socket *sock)
1873 1873
1874 if (o2nm_this_node() >= node->nd_num) { 1874 if (o2nm_this_node() >= node->nd_num) {
1875 local_node = o2nm_get_node_by_num(o2nm_this_node()); 1875 local_node = o2nm_get_node_by_num(o2nm_this_node());
1876 printk(KERN_NOTICE "o2net: Unexpected connect attempt seen " 1876 if (local_node)
1877 "at node '%s' (%u, %pI4:%d) from node '%s' (%u, " 1877 printk(KERN_NOTICE "o2net: Unexpected connect attempt "
1878 "%pI4:%d)\n", local_node->nd_name, local_node->nd_num, 1878 "seen at node '%s' (%u, %pI4:%d) from "
1879 &(local_node->nd_ipv4_address), 1879 "node '%s' (%u, %pI4:%d)\n",
1880 ntohs(local_node->nd_ipv4_port), node->nd_name, 1880 local_node->nd_name, local_node->nd_num,
1881 node->nd_num, &sin.sin_addr.s_addr, ntohs(sin.sin_port)); 1881 &(local_node->nd_ipv4_address),
1882 ntohs(local_node->nd_ipv4_port),
1883 node->nd_name,
1884 node->nd_num, &sin.sin_addr.s_addr,
1885 ntohs(sin.sin_port));
1882 ret = -EINVAL; 1886 ret = -EINVAL;
1883 goto out; 1887 goto out;
1884 } 1888 }
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index fbec0be62326..b46278f9ae44 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -292,7 +292,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
292 struct dlm_lock *lock = NULL; 292 struct dlm_lock *lock = NULL;
293 struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf; 293 struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf;
294 char *name; 294 char *name;
295 struct list_head *iter, *head=NULL; 295 struct list_head *head = NULL;
296 __be64 cookie; 296 __be64 cookie;
297 u32 flags; 297 u32 flags;
298 u8 node; 298 u8 node;
@@ -373,8 +373,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
373 /* try convert queue for both ast/bast */ 373 /* try convert queue for both ast/bast */
374 head = &res->converting; 374 head = &res->converting;
375 lock = NULL; 375 lock = NULL;
376 list_for_each(iter, head) { 376 list_for_each_entry(lock, head, list) {
377 lock = list_entry (iter, struct dlm_lock, list);
378 if (lock->ml.cookie == cookie) 377 if (lock->ml.cookie == cookie)
379 goto do_ast; 378 goto do_ast;
380 } 379 }
@@ -385,8 +384,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
385 else 384 else
386 head = &res->granted; 385 head = &res->granted;
387 386
388 list_for_each(iter, head) { 387 list_for_each_entry(lock, head, list) {
389 lock = list_entry (iter, struct dlm_lock, list);
390 if (lock->ml.cookie == cookie) 388 if (lock->ml.cookie == cookie)
391 goto do_ast; 389 goto do_ast;
392 } 390 }
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index de854cca12a2..e0517762fcc0 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -1079,11 +1079,9 @@ static inline int dlm_lock_compatible(int existing, int request)
1079static inline int dlm_lock_on_list(struct list_head *head, 1079static inline int dlm_lock_on_list(struct list_head *head,
1080 struct dlm_lock *lock) 1080 struct dlm_lock *lock)
1081{ 1081{
1082 struct list_head *iter;
1083 struct dlm_lock *tmplock; 1082 struct dlm_lock *tmplock;
1084 1083
1085 list_for_each(iter, head) { 1084 list_for_each_entry(tmplock, head, list) {
1086 tmplock = list_entry(iter, struct dlm_lock, list);
1087 if (tmplock == lock) 1085 if (tmplock == lock)
1088 return 1; 1086 return 1;
1089 } 1087 }
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 29a886d1e82c..e36d63ff1783 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -123,7 +123,6 @@ static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
123 int *kick_thread) 123 int *kick_thread)
124{ 124{
125 enum dlm_status status = DLM_NORMAL; 125 enum dlm_status status = DLM_NORMAL;
126 struct list_head *iter;
127 struct dlm_lock *tmplock=NULL; 126 struct dlm_lock *tmplock=NULL;
128 127
129 assert_spin_locked(&res->spinlock); 128 assert_spin_locked(&res->spinlock);
@@ -185,16 +184,14 @@ static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
185 184
186 /* upconvert from here on */ 185 /* upconvert from here on */
187 status = DLM_NORMAL; 186 status = DLM_NORMAL;
188 list_for_each(iter, &res->granted) { 187 list_for_each_entry(tmplock, &res->granted, list) {
189 tmplock = list_entry(iter, struct dlm_lock, list);
190 if (tmplock == lock) 188 if (tmplock == lock)
191 continue; 189 continue;
192 if (!dlm_lock_compatible(tmplock->ml.type, type)) 190 if (!dlm_lock_compatible(tmplock->ml.type, type))
193 goto switch_queues; 191 goto switch_queues;
194 } 192 }
195 193
196 list_for_each(iter, &res->converting) { 194 list_for_each_entry(tmplock, &res->converting, list) {
197 tmplock = list_entry(iter, struct dlm_lock, list);
198 if (!dlm_lock_compatible(tmplock->ml.type, type)) 195 if (!dlm_lock_compatible(tmplock->ml.type, type))
199 goto switch_queues; 196 goto switch_queues;
200 /* existing conversion requests take precedence */ 197 /* existing conversion requests take precedence */
@@ -424,8 +421,8 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data,
424 struct dlm_ctxt *dlm = data; 421 struct dlm_ctxt *dlm = data;
425 struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf; 422 struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf;
426 struct dlm_lock_resource *res = NULL; 423 struct dlm_lock_resource *res = NULL;
427 struct list_head *iter;
428 struct dlm_lock *lock = NULL; 424 struct dlm_lock *lock = NULL;
425 struct dlm_lock *tmp_lock;
429 struct dlm_lockstatus *lksb; 426 struct dlm_lockstatus *lksb;
430 enum dlm_status status = DLM_NORMAL; 427 enum dlm_status status = DLM_NORMAL;
431 u32 flags; 428 u32 flags;
@@ -471,14 +468,13 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data,
471 dlm_error(status); 468 dlm_error(status);
472 goto leave; 469 goto leave;
473 } 470 }
474 list_for_each(iter, &res->granted) { 471 list_for_each_entry(tmp_lock, &res->granted, list) {
475 lock = list_entry(iter, struct dlm_lock, list); 472 if (tmp_lock->ml.cookie == cnv->cookie &&
476 if (lock->ml.cookie == cnv->cookie && 473 tmp_lock->ml.node == cnv->node_idx) {
477 lock->ml.node == cnv->node_idx) { 474 lock = tmp_lock;
478 dlm_lock_get(lock); 475 dlm_lock_get(lock);
479 break; 476 break;
480 } 477 }
481 lock = NULL;
482 } 478 }
483 spin_unlock(&res->spinlock); 479 spin_unlock(&res->spinlock);
484 if (!lock) { 480 if (!lock) {
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 0e28e242226d..e33cd7a3c582 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -96,7 +96,6 @@ static void __dlm_print_lock(struct dlm_lock *lock)
96 96
97void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) 97void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
98{ 98{
99 struct list_head *iter2;
100 struct dlm_lock *lock; 99 struct dlm_lock *lock;
101 char buf[DLM_LOCKID_NAME_MAX]; 100 char buf[DLM_LOCKID_NAME_MAX];
102 101
@@ -118,18 +117,15 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
118 res->inflight_locks, atomic_read(&res->asts_reserved)); 117 res->inflight_locks, atomic_read(&res->asts_reserved));
119 dlm_print_lockres_refmap(res); 118 dlm_print_lockres_refmap(res);
120 printk(" granted queue:\n"); 119 printk(" granted queue:\n");
121 list_for_each(iter2, &res->granted) { 120 list_for_each_entry(lock, &res->granted, list) {
122 lock = list_entry(iter2, struct dlm_lock, list);
123 __dlm_print_lock(lock); 121 __dlm_print_lock(lock);
124 } 122 }
125 printk(" converting queue:\n"); 123 printk(" converting queue:\n");
126 list_for_each(iter2, &res->converting) { 124 list_for_each_entry(lock, &res->converting, list) {
127 lock = list_entry(iter2, struct dlm_lock, list);
128 __dlm_print_lock(lock); 125 __dlm_print_lock(lock);
129 } 126 }
130 printk(" blocked queue:\n"); 127 printk(" blocked queue:\n");
131 list_for_each(iter2, &res->blocked) { 128 list_for_each_entry(lock, &res->blocked, list) {
132 lock = list_entry(iter2, struct dlm_lock, list);
133 __dlm_print_lock(lock); 129 __dlm_print_lock(lock);
134 } 130 }
135} 131}
@@ -446,7 +442,6 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
446{ 442{
447 struct dlm_master_list_entry *mle; 443 struct dlm_master_list_entry *mle;
448 struct hlist_head *bucket; 444 struct hlist_head *bucket;
449 struct hlist_node *list;
450 int i, out = 0; 445 int i, out = 0;
451 unsigned long total = 0, longest = 0, bucket_count = 0; 446 unsigned long total = 0, longest = 0, bucket_count = 0;
452 447
@@ -456,9 +451,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
456 spin_lock(&dlm->master_lock); 451 spin_lock(&dlm->master_lock);
457 for (i = 0; i < DLM_HASH_BUCKETS; i++) { 452 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
458 bucket = dlm_master_hash(dlm, i); 453 bucket = dlm_master_hash(dlm, i);
459 hlist_for_each(list, bucket) { 454 hlist_for_each_entry(mle, bucket, master_hash_node) {
460 mle = hlist_entry(list, struct dlm_master_list_entry,
461 master_hash_node);
462 ++total; 455 ++total;
463 ++bucket_count; 456 ++bucket_count;
464 if (len - out < 200) 457 if (len - out < 200)
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index dbb17c07656a..8b3382abf840 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -193,7 +193,7 @@ struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
193 unsigned int hash) 193 unsigned int hash)
194{ 194{
195 struct hlist_head *bucket; 195 struct hlist_head *bucket;
196 struct hlist_node *list; 196 struct dlm_lock_resource *res;
197 197
198 mlog(0, "%.*s\n", len, name); 198 mlog(0, "%.*s\n", len, name);
199 199
@@ -201,9 +201,7 @@ struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
201 201
202 bucket = dlm_lockres_hash(dlm, hash); 202 bucket = dlm_lockres_hash(dlm, hash);
203 203
204 hlist_for_each(list, bucket) { 204 hlist_for_each_entry(res, bucket, hash_node) {
205 struct dlm_lock_resource *res = hlist_entry(list,
206 struct dlm_lock_resource, hash_node);
207 if (res->lockname.name[0] != name[0]) 205 if (res->lockname.name[0] != name[0])
208 continue; 206 continue;
209 if (unlikely(res->lockname.len != len)) 207 if (unlikely(res->lockname.len != len))
@@ -262,22 +260,19 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
262 260
263static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len) 261static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len)
264{ 262{
265 struct dlm_ctxt *tmp = NULL; 263 struct dlm_ctxt *tmp;
266 struct list_head *iter;
267 264
268 assert_spin_locked(&dlm_domain_lock); 265 assert_spin_locked(&dlm_domain_lock);
269 266
270 /* tmp->name here is always NULL terminated, 267 /* tmp->name here is always NULL terminated,
271 * but domain may not be! */ 268 * but domain may not be! */
272 list_for_each(iter, &dlm_domains) { 269 list_for_each_entry(tmp, &dlm_domains, list) {
273 tmp = list_entry (iter, struct dlm_ctxt, list);
274 if (strlen(tmp->name) == len && 270 if (strlen(tmp->name) == len &&
275 memcmp(tmp->name, domain, len)==0) 271 memcmp(tmp->name, domain, len)==0)
276 break; 272 return tmp;
277 tmp = NULL;
278 } 273 }
279 274
280 return tmp; 275 return NULL;
281} 276}
282 277
283/* For null terminated domain strings ONLY */ 278/* For null terminated domain strings ONLY */
@@ -366,25 +361,22 @@ static void __dlm_get(struct dlm_ctxt *dlm)
366 * you shouldn't trust your pointer. */ 361 * you shouldn't trust your pointer. */
367struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm) 362struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm)
368{ 363{
369 struct list_head *iter; 364 struct dlm_ctxt *target;
370 struct dlm_ctxt *target = NULL; 365 struct dlm_ctxt *ret = NULL;
371 366
372 spin_lock(&dlm_domain_lock); 367 spin_lock(&dlm_domain_lock);
373 368
374 list_for_each(iter, &dlm_domains) { 369 list_for_each_entry(target, &dlm_domains, list) {
375 target = list_entry (iter, struct dlm_ctxt, list);
376
377 if (target == dlm) { 370 if (target == dlm) {
378 __dlm_get(target); 371 __dlm_get(target);
372 ret = target;
379 break; 373 break;
380 } 374 }
381
382 target = NULL;
383 } 375 }
384 376
385 spin_unlock(&dlm_domain_lock); 377 spin_unlock(&dlm_domain_lock);
386 378
387 return target; 379 return ret;
388} 380}
389 381
390int dlm_domain_fully_joined(struct dlm_ctxt *dlm) 382int dlm_domain_fully_joined(struct dlm_ctxt *dlm)
@@ -2296,13 +2288,10 @@ static DECLARE_RWSEM(dlm_callback_sem);
2296void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, 2288void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
2297 int node_num) 2289 int node_num)
2298{ 2290{
2299 struct list_head *iter;
2300 struct dlm_eviction_cb *cb; 2291 struct dlm_eviction_cb *cb;
2301 2292
2302 down_read(&dlm_callback_sem); 2293 down_read(&dlm_callback_sem);
2303 list_for_each(iter, &dlm->dlm_eviction_callbacks) { 2294 list_for_each_entry(cb, &dlm->dlm_eviction_callbacks, ec_item) {
2304 cb = list_entry(iter, struct dlm_eviction_cb, ec_item);
2305
2306 cb->ec_func(node_num, cb->ec_data); 2295 cb->ec_func(node_num, cb->ec_data);
2307 } 2296 }
2308 up_read(&dlm_callback_sem); 2297 up_read(&dlm_callback_sem);
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 47e67c2d228f..5d32f7511f74 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -91,19 +91,14 @@ void dlm_destroy_lock_cache(void)
91static int dlm_can_grant_new_lock(struct dlm_lock_resource *res, 91static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
92 struct dlm_lock *lock) 92 struct dlm_lock *lock)
93{ 93{
94 struct list_head *iter;
95 struct dlm_lock *tmplock; 94 struct dlm_lock *tmplock;
96 95
97 list_for_each(iter, &res->granted) { 96 list_for_each_entry(tmplock, &res->granted, list) {
98 tmplock = list_entry(iter, struct dlm_lock, list);
99
100 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) 97 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
101 return 0; 98 return 0;
102 } 99 }
103 100
104 list_for_each(iter, &res->converting) { 101 list_for_each_entry(tmplock, &res->converting, list) {
105 tmplock = list_entry(iter, struct dlm_lock, list);
106
107 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) 102 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
108 return 0; 103 return 0;
109 if (!dlm_lock_compatible(tmplock->ml.convert_type, 104 if (!dlm_lock_compatible(tmplock->ml.convert_type,
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 33ecbe0e6734..cf0f103963b1 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -342,16 +342,13 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
342{ 342{
343 struct dlm_master_list_entry *tmpmle; 343 struct dlm_master_list_entry *tmpmle;
344 struct hlist_head *bucket; 344 struct hlist_head *bucket;
345 struct hlist_node *list;
346 unsigned int hash; 345 unsigned int hash;
347 346
348 assert_spin_locked(&dlm->master_lock); 347 assert_spin_locked(&dlm->master_lock);
349 348
350 hash = dlm_lockid_hash(name, namelen); 349 hash = dlm_lockid_hash(name, namelen);
351 bucket = dlm_master_hash(dlm, hash); 350 bucket = dlm_master_hash(dlm, hash);
352 hlist_for_each(list, bucket) { 351 hlist_for_each_entry(tmpmle, bucket, master_hash_node) {
353 tmpmle = hlist_entry(list, struct dlm_master_list_entry,
354 master_hash_node);
355 if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) 352 if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
356 continue; 353 continue;
357 dlm_get_mle(tmpmle); 354 dlm_get_mle(tmpmle);
@@ -3183,7 +3180,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
3183 struct dlm_master_list_entry *mle; 3180 struct dlm_master_list_entry *mle;
3184 struct dlm_lock_resource *res; 3181 struct dlm_lock_resource *res;
3185 struct hlist_head *bucket; 3182 struct hlist_head *bucket;
3186 struct hlist_node *list; 3183 struct hlist_node *tmp;
3187 unsigned int i; 3184 unsigned int i;
3188 3185
3189 mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node); 3186 mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node);
@@ -3194,10 +3191,7 @@ top:
3194 spin_lock(&dlm->master_lock); 3191 spin_lock(&dlm->master_lock);
3195 for (i = 0; i < DLM_HASH_BUCKETS; i++) { 3192 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
3196 bucket = dlm_master_hash(dlm, i); 3193 bucket = dlm_master_hash(dlm, i);
3197 hlist_for_each(list, bucket) { 3194 hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
3198 mle = hlist_entry(list, struct dlm_master_list_entry,
3199 master_hash_node);
3200
3201 BUG_ON(mle->type != DLM_MLE_BLOCK && 3195 BUG_ON(mle->type != DLM_MLE_BLOCK &&
3202 mle->type != DLM_MLE_MASTER && 3196 mle->type != DLM_MLE_MASTER &&
3203 mle->type != DLM_MLE_MIGRATION); 3197 mle->type != DLM_MLE_MIGRATION);
@@ -3378,7 +3372,7 @@ void dlm_force_free_mles(struct dlm_ctxt *dlm)
3378 int i; 3372 int i;
3379 struct hlist_head *bucket; 3373 struct hlist_head *bucket;
3380 struct dlm_master_list_entry *mle; 3374 struct dlm_master_list_entry *mle;
3381 struct hlist_node *tmp, *list; 3375 struct hlist_node *tmp;
3382 3376
3383 /* 3377 /*
3384 * We notified all other nodes that we are exiting the domain and 3378 * We notified all other nodes that we are exiting the domain and
@@ -3394,9 +3388,7 @@ void dlm_force_free_mles(struct dlm_ctxt *dlm)
3394 3388
3395 for (i = 0; i < DLM_HASH_BUCKETS; i++) { 3389 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
3396 bucket = dlm_master_hash(dlm, i); 3390 bucket = dlm_master_hash(dlm, i);
3397 hlist_for_each_safe(list, tmp, bucket) { 3391 hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
3398 mle = hlist_entry(list, struct dlm_master_list_entry,
3399 master_hash_node);
3400 if (mle->type != DLM_MLE_BLOCK) { 3392 if (mle->type != DLM_MLE_BLOCK) {
3401 mlog(ML_ERROR, "bad mle: %p\n", mle); 3393 mlog(ML_ERROR, "bad mle: %p\n", mle);
3402 dlm_print_one_mle(mle); 3394 dlm_print_one_mle(mle);
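
dlm_clean_master_list() and dlm_force_free_mles() switch to hlist_for_each_entry_safe() because entries can be unhashed and freed from inside the loop body; the _safe variant keeps a second cursor on the next node before the body runs. The same hazard exists with any linked structure, as the tiny stand-alone example below shows (it uses a plain singly linked list, not the kernel hlist code):

#include <stdio.h>
#include <stdlib.h>

struct node { int v; struct node *next; };

int main(void)
{
	struct node *head = NULL, *n, *tmp;

	for (int i = 0; i < 3; i++) {
		n = malloc(sizeof(*n));
		n->v = i;
		n->next = head;
		head = n;
	}

	/* unsafe: "n = n->next" after free(n) would read freed memory  */
	/* safe: remember the next node first, then the body may free n */
	for (n = head; n; n = tmp) {
		tmp = n->next;
		printf("freeing node %d\n", n->v);
		free(n);
	}
	return 0;
}
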
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 773bd32bfd8c..0b5adca1b178 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -787,6 +787,7 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
787{ 787{
788 struct dlm_lock_request lr; 788 struct dlm_lock_request lr;
789 int ret; 789 int ret;
790 int status;
790 791
791 mlog(0, "\n"); 792 mlog(0, "\n");
792 793
@@ -800,13 +801,15 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
800 801
801 // send message 802 // send message
802 ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key, 803 ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
803 &lr, sizeof(lr), request_from, NULL); 804 &lr, sizeof(lr), request_from, &status);
804 805
805 /* negative status is handled by caller */ 806 /* negative status is handled by caller */
806 if (ret < 0) 807 if (ret < 0)
807 mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u " 808 mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u "
808 "to recover dead node %u\n", dlm->name, ret, 809 "to recover dead node %u\n", dlm->name, ret,
809 request_from, dead_node); 810 request_from, dead_node);
811 else
812 ret = status;
810 // return from here, then 813 // return from here, then
811 // sleep until all received or error 814 // sleep until all received or error
812 return ret; 815 return ret;
@@ -2328,6 +2331,14 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2328 } else if (res->owner == dlm->node_num) { 2331 } else if (res->owner == dlm->node_num) {
2329 dlm_free_dead_locks(dlm, res, dead_node); 2332 dlm_free_dead_locks(dlm, res, dead_node);
2330 __dlm_lockres_calc_usage(dlm, res); 2333 __dlm_lockres_calc_usage(dlm, res);
2334 } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
2335 if (test_bit(dead_node, res->refmap)) {
2336 mlog(0, "%s:%.*s: dead node %u had a ref, but had "
2337 "no locks and had not purged before dying\n",
2338 dlm->name, res->lockname.len,
2339 res->lockname.name, dead_node);
2340 dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
2341 }
2331 } 2342 }
2332 spin_unlock(&res->spinlock); 2343 spin_unlock(&res->spinlock);
2333 } 2344 }
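
The dlm_request_all_locks() hunk stops passing NULL as the last argument of o2net_send_message() and returns the remote handler's status when the transport send itself succeeded; before this, a handler-side failure was silently treated as success. A stand-alone sketch of that two-level return-code pattern, with stand-in names rather than the o2net API:

#include <errno.h>
#include <stdio.h>

/* pretend transport: delivery worked, but the remote handler refused */
static int fake_send_message(const void *msg, int *handler_status)
{
	(void)msg;
	if (handler_status)
		*handler_status = -ENOMEM;	/* remote handler's verdict */
	return 0;				/* transport itself was fine */
}

static int request_locks(void)
{
	int status;
	int ret = fake_send_message("LOCK_REQUEST", &status);

	if (ret < 0)
		return ret;	/* network-level failure */
	return status;		/* otherwise propagate the handler's result */
}

int main(void)
{
	printf("request_locks() = %d\n", request_locks());
	return 0;
}
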
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index e73c833fc2a1..9db869de829d 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -286,8 +286,6 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
286 struct dlm_lock_resource *res) 286 struct dlm_lock_resource *res)
287{ 287{
288 struct dlm_lock *lock, *target; 288 struct dlm_lock *lock, *target;
289 struct list_head *iter;
290 struct list_head *head;
291 int can_grant = 1; 289 int can_grant = 1;
292 290
293 /* 291 /*
@@ -314,9 +312,7 @@ converting:
314 dlm->name, res->lockname.len, res->lockname.name); 312 dlm->name, res->lockname.len, res->lockname.name);
315 BUG(); 313 BUG();
316 } 314 }
317 head = &res->granted; 315 list_for_each_entry(lock, &res->granted, list) {
318 list_for_each(iter, head) {
319 lock = list_entry(iter, struct dlm_lock, list);
320 if (lock==target) 316 if (lock==target)
321 continue; 317 continue;
322 if (!dlm_lock_compatible(lock->ml.type, 318 if (!dlm_lock_compatible(lock->ml.type,
@@ -333,9 +329,8 @@ converting:
333 target->ml.convert_type; 329 target->ml.convert_type;
334 } 330 }
335 } 331 }
336 head = &res->converting; 332
337 list_for_each(iter, head) { 333 list_for_each_entry(lock, &res->converting, list) {
338 lock = list_entry(iter, struct dlm_lock, list);
339 if (lock==target) 334 if (lock==target)
340 continue; 335 continue;
341 if (!dlm_lock_compatible(lock->ml.type, 336 if (!dlm_lock_compatible(lock->ml.type,
@@ -384,9 +379,7 @@ blocked:
384 goto leave; 379 goto leave;
385 target = list_entry(res->blocked.next, struct dlm_lock, list); 380 target = list_entry(res->blocked.next, struct dlm_lock, list);
386 381
387 head = &res->granted; 382 list_for_each_entry(lock, &res->granted, list) {
388 list_for_each(iter, head) {
389 lock = list_entry(iter, struct dlm_lock, list);
390 if (lock==target) 383 if (lock==target)
391 continue; 384 continue;
392 if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) { 385 if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
@@ -400,9 +393,7 @@ blocked:
400 } 393 }
401 } 394 }
402 395
403 head = &res->converting; 396 list_for_each_entry(lock, &res->converting, list) {
404 list_for_each(iter, head) {
405 lock = list_entry(iter, struct dlm_lock, list);
406 if (lock==target) 397 if (lock==target)
407 continue; 398 continue;
408 if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) { 399 if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 850aa7e87537..5698b52cf5c9 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -388,7 +388,6 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
388 struct dlm_ctxt *dlm = data; 388 struct dlm_ctxt *dlm = data;
389 struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf; 389 struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf;
390 struct dlm_lock_resource *res = NULL; 390 struct dlm_lock_resource *res = NULL;
391 struct list_head *iter;
392 struct dlm_lock *lock = NULL; 391 struct dlm_lock *lock = NULL;
393 enum dlm_status status = DLM_NORMAL; 392 enum dlm_status status = DLM_NORMAL;
394 int found = 0, i; 393 int found = 0, i;
@@ -458,8 +457,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
458 } 457 }
459 458
460 for (i=0; i<3; i++) { 459 for (i=0; i<3; i++) {
461 list_for_each(iter, queue) { 460 list_for_each_entry(lock, queue, list) {
462 lock = list_entry(iter, struct dlm_lock, list);
463 if (lock->ml.cookie == unlock->cookie && 461 if (lock->ml.cookie == unlock->cookie &&
464 lock->ml.node == unlock->node_idx) { 462 lock->ml.node == unlock->node_idx) {
465 dlm_lock_get(lock); 463 dlm_lock_get(lock);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 12bafb7265ce..efa2b3d339e3 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -401,11 +401,8 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
401{ 401{
402 struct inode *inode = new_inode(sb); 402 struct inode *inode = new_inode(sb);
403 umode_t mode = S_IFDIR | 0755; 403 umode_t mode = S_IFDIR | 0755;
404 struct dlmfs_inode_private *ip;
405 404
406 if (inode) { 405 if (inode) {
407 ip = DLMFS_I(inode);
408
409 inode->i_ino = get_next_ino(); 406 inode->i_ino = get_next_ino();
410 inode_init_owner(inode, NULL, mode); 407 inode_init_owner(inode, NULL, mode);
411 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; 408 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 2487116d0d33..767370b656ca 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -781,7 +781,6 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
781 cpos = map_start >> osb->s_clustersize_bits; 781 cpos = map_start >> osb->s_clustersize_bits;
782 mapping_end = ocfs2_clusters_for_bytes(inode->i_sb, 782 mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
783 map_start + map_len); 783 map_start + map_len);
784 mapping_end -= cpos;
785 is_last = 0; 784 is_last = 0;
786 while (cpos < mapping_end && !is_last) { 785 while (cpos < mapping_end && !is_last) {
787 u32 fe_flags; 786 u32 fe_flags;
@@ -852,20 +851,20 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence)
852 851
853 down_read(&OCFS2_I(inode)->ip_alloc_sem); 852 down_read(&OCFS2_I(inode)->ip_alloc_sem);
854 853
855 if (*offset >= inode->i_size) { 854 if (*offset >= i_size_read(inode)) {
856 ret = -ENXIO; 855 ret = -ENXIO;
857 goto out_unlock; 856 goto out_unlock;
858 } 857 }
859 858
860 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 859 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
861 if (whence == SEEK_HOLE) 860 if (whence == SEEK_HOLE)
862 *offset = inode->i_size; 861 *offset = i_size_read(inode);
863 goto out_unlock; 862 goto out_unlock;
864 } 863 }
865 864
866 clen = 0; 865 clen = 0;
867 cpos = *offset >> cs_bits; 866 cpos = *offset >> cs_bits;
868 cend = ocfs2_clusters_for_bytes(inode->i_sb, inode->i_size); 867 cend = ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
869 868
870 while (cpos < cend && !is_last) { 869 while (cpos < cend && !is_last) {
871 ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size, 870 ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size,
@@ -904,8 +903,8 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence)
904 extlen = clen; 903 extlen = clen;
905 extlen <<= cs_bits; 904 extlen <<= cs_bits;
906 905
907 if ((extoff + extlen) > inode->i_size) 906 if ((extoff + extlen) > i_size_read(inode))
908 extlen = inode->i_size - extoff; 907 extlen = i_size_read(inode) - extoff;
909 extoff += extlen; 908 extoff += extlen;
910 if (extoff > *offset) 909 if (extoff > *offset)
911 *offset = extoff; 910 *offset = extoff;
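
In the first extent_map.c hunk, mapping_end is already an absolute end cluster computed by ocfs2_clusters_for_bytes(), and the loop condition is cpos < mapping_end, so the dropped "mapping_end -= cpos" line was shrinking the walk by the start offset. A stand-alone arithmetic sketch of the difference, assuming 1 MiB clusters purely for illustration:

#include <stdio.h>

int main(void)
{
	unsigned int cs_bits = 20;			/* assume 1 MiB clusters */
	unsigned long long map_start = 10ULL << 20;	/* fiemap start: 10 MiB  */
	unsigned long long map_len   =  4ULL << 20;	/* fiemap length: 4 MiB  */

	unsigned int cpos = (unsigned int)(map_start >> cs_bits);	    /* 10 */
	/* ocfs2_clusters_for_bytes() rounds the end offset up to clusters */
	unsigned int mapping_end = (unsigned int)
		((map_start + map_len + (1ULL << cs_bits) - 1) >> cs_bits); /* 14 */

	printf("fixed loop: while (cpos < %u) walks clusters %u..%u\n",
	       mapping_end, cpos, mapping_end - 1);
	printf("old loop:   while (cpos < %u) never runs for cpos=%u\n",
	       mapping_end - cpos, cpos);
	return 0;
}
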
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 3261d71319ee..4f8197caa487 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -671,11 +671,7 @@ restarted_transaction:
671 } else { 671 } else {
672 BUG_ON(why != RESTART_TRANS); 672 BUG_ON(why != RESTART_TRANS);
673 673
674 /* TODO: This can be more intelligent. */ 674 status = ocfs2_allocate_extend_trans(handle, 1);
675 credits = ocfs2_calc_extend_credits(osb->sb,
676 &fe->id2.i_list,
677 clusters_to_add);
678 status = ocfs2_extend_trans(handle, credits);
679 if (status < 0) { 675 if (status < 0) {
680 /* handle still has to be committed at 676 /* handle still has to be committed at
681 * this point. */ 677 * this point. */
@@ -1800,6 +1796,7 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1800 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); 1796 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
1801 1797
1802out: 1798out:
1799 ocfs2_free_path(path);
1803 ocfs2_schedule_truncate_log_flush(osb, 1); 1800 ocfs2_schedule_truncate_log_flush(osb, 1);
1804 ocfs2_run_deallocs(osb, &dealloc); 1801 ocfs2_run_deallocs(osb, &dealloc);
1805 1802
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 0c60ef2d8056..fa32ce9b455d 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -303,7 +303,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
303 if (o2info_from_user(oij, req)) 303 if (o2info_from_user(oij, req))
304 goto bail; 304 goto bail;
305 305
306 oij.ij_journal_size = osb->journal->j_inode->i_size; 306 oij.ij_journal_size = i_size_read(osb->journal->j_inode);
307 307
308 o2info_set_request_filled(&oij.ij_req); 308 o2info_set_request_filled(&oij.ij_req);
309 309
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 242170d83971..44fc3e530c3d 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -455,6 +455,41 @@ bail:
455 return status; 455 return status;
456} 456}
457 457
458/*
459 * If we have fewer than thresh credits, extend by OCFS2_MAX_TRANS_DATA.
460 * If that fails, restart the transaction & regain write access for the
461 * buffer head which is used for metadata modifications.
462 * Taken from Ext4: extend_or_restart_transaction()
463 */
464int ocfs2_allocate_extend_trans(handle_t *handle, int thresh)
465{
466 int status, old_nblks;
467
468 BUG_ON(!handle);
469
470 old_nblks = handle->h_buffer_credits;
471 trace_ocfs2_allocate_extend_trans(old_nblks, thresh);
472
473 if (old_nblks < thresh)
474 return 0;
475
476 status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA);
477 if (status < 0) {
478 mlog_errno(status);
479 goto bail;
480 }
481
482 if (status > 0) {
483 status = jbd2_journal_restart(handle, OCFS2_MAX_TRANS_DATA);
484 if (status < 0)
485 mlog_errno(status);
486 }
487
488bail:
489 return status;
490}
491
492
458struct ocfs2_triggers { 493struct ocfs2_triggers {
459 struct jbd2_buffer_trigger_type ot_triggers; 494 struct jbd2_buffer_trigger_type ot_triggers;
460 int ot_offset; 495 int ot_offset;
@@ -801,14 +836,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
801 inode_lock = 1; 836 inode_lock = 1;
802 di = (struct ocfs2_dinode *)bh->b_data; 837 di = (struct ocfs2_dinode *)bh->b_data;
803 838
804 if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) { 839 if (i_size_read(inode) < OCFS2_MIN_JOURNAL_SIZE) {
805 mlog(ML_ERROR, "Journal file size (%lld) is too small!\n", 840 mlog(ML_ERROR, "Journal file size (%lld) is too small!\n",
806 inode->i_size); 841 i_size_read(inode));
807 status = -EINVAL; 842 status = -EINVAL;
808 goto done; 843 goto done;
809 } 844 }
810 845
811 trace_ocfs2_journal_init(inode->i_size, 846 trace_ocfs2_journal_init(i_size_read(inode),
812 (unsigned long long)inode->i_blocks, 847 (unsigned long long)inode->i_blocks,
813 OCFS2_I(inode)->ip_clusters); 848 OCFS2_I(inode)->ip_clusters);
814 849
@@ -1096,7 +1131,7 @@ static int ocfs2_force_read_journal(struct inode *inode)
1096 1131
1097 memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); 1132 memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
1098 1133
1099 num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size); 1134 num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
1100 v_blkno = 0; 1135 v_blkno = 0;
1101 while (v_blkno < num_blocks) { 1136 while (v_blkno < num_blocks) {
1102 status = ocfs2_extent_map_get_blocks(inode, v_blkno, 1137 status = ocfs2_extent_map_get_blocks(inode, v_blkno,
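
ocfs2_allocate_extend_trans(), added above and used by the file.c hunk earlier, wraps the jbd2 "extend or restart" dance: jbd2_journal_extend() returns 0 when the handle gained the extra credits, a negative errno on error, and a positive value when the journal cannot extend the handle, in which case the handle is restarted with jbd2_journal_restart(). The stand-alone sketch below mirrors only that control flow; the two journal calls are replaced by stubs and every name here is a stand-in, not the jbd2 API.

#include <stdio.h>

#define FAKE_MAX_TRANS_DATA 64

/* stub: pretend the journal is too full, so extending fails with >0 */
static int fake_journal_extend(int nblocks)  { (void)nblocks; return 1; }
static int fake_journal_restart(int nblocks) { (void)nblocks; return 0; }

static int allocate_extend_trans(int current_credits, int thresh)
{
	int status;

	if (current_credits < thresh)	/* mirrors the check in the hunk */
		return 0;

	status = fake_journal_extend(FAKE_MAX_TRANS_DATA);
	if (status < 0)
		return status;		/* hard error */

	if (status > 0)			/* could not extend: restart instead */
		status = fake_journal_restart(FAKE_MAX_TRANS_DATA);

	return status;
}

int main(void)
{
	printf("allocate_extend_trans() = %d\n", allocate_extend_trans(8, 1));
	return 0;
}
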
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 0a992737dcaf..0b479bab3671 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -258,6 +258,17 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb,
258int ocfs2_commit_trans(struct ocfs2_super *osb, 258int ocfs2_commit_trans(struct ocfs2_super *osb,
259 handle_t *handle); 259 handle_t *handle);
260int ocfs2_extend_trans(handle_t *handle, int nblocks); 260int ocfs2_extend_trans(handle_t *handle, int nblocks);
261int ocfs2_allocate_extend_trans(handle_t *handle,
262 int thresh);
263
264/*
265 * Define an arbitrary limit for the amount of data we will anticipate
266 * writing to any given transaction. For unbounded transactions such as
267 * fallocate(2) we can write more than this, but we always
268 * start off at the maximum transaction size and grow the transaction
269 * optimistically as we go.
270 */
271#define OCFS2_MAX_TRANS_DATA 64U
261 272
262/* 273/*
263 * Create access is for when we get a newly created buffer and we're 274 * Create access is for when we get a newly created buffer and we're
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index aebeacd807c3..cd5496b7a0a3 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -1082,7 +1082,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
1082 } 1082 }
1083 1083
1084retry_enospc: 1084retry_enospc:
1085 (*ac)->ac_bits_wanted = osb->local_alloc_default_bits; 1085 (*ac)->ac_bits_wanted = osb->local_alloc_bits;
1086 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); 1086 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
1087 if (status == -ENOSPC) { 1087 if (status == -ENOSPC) {
1088 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) == 1088 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
@@ -1154,7 +1154,7 @@ retry_enospc:
1154 OCFS2_LA_DISABLED) 1154 OCFS2_LA_DISABLED)
1155 goto bail; 1155 goto bail;
1156 1156
1157 ac->ac_bits_wanted = osb->local_alloc_default_bits; 1157 ac->ac_bits_wanted = osb->local_alloc_bits;
1158 status = ocfs2_claim_clusters(handle, ac, 1158 status = ocfs2_claim_clusters(handle, ac,
1159 osb->local_alloc_bits, 1159 osb->local_alloc_bits,
1160 &cluster_off, 1160 &cluster_off,
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 452068b45749..3d3f3c83065c 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -152,6 +152,7 @@ static int __ocfs2_move_extent(handle_t *handle,
152 } 152 }
153 153
154out: 154out:
155 ocfs2_free_path(path);
155 return ret; 156 return ret;
156} 157}
157 158
@@ -845,7 +846,7 @@ static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
845 struct ocfs2_move_extents *range = context->range; 846 struct ocfs2_move_extents *range = context->range;
846 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 847 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
847 848
848 if ((inode->i_size == 0) || (range->me_len == 0)) 849 if ((i_size_read(inode) == 0) || (range->me_len == 0))
849 return 0; 850 return 0;
850 851
851 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 852 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 3b481f490633..1b60c62aa9d6 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -2579,6 +2579,8 @@ DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans);
2579 2579
2580DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart); 2580DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart);
2581 2581
2582DEFINE_OCFS2_INT_INT_EVENT(ocfs2_allocate_extend_trans);
2583
2582DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_journal_access); 2584DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_journal_access);
2583 2585
2584DEFINE_OCFS2_ULL_EVENT(ocfs2_journal_dirty); 2586DEFINE_OCFS2_ULL_EVENT(ocfs2_journal_dirty);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 332a281f217e..aaa50611ec66 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -234,7 +234,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
234 len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset; 234 len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
235 } 235 }
236 236
237 if (gqinode->i_size < off + len) { 237 if (i_size_read(gqinode) < off + len) {
238 loff_t rounded_end = 238 loff_t rounded_end =
239 ocfs2_align_bytes_to_blocks(sb, off + len); 239 ocfs2_align_bytes_to_blocks(sb, off + len);
240 240
@@ -778,8 +778,8 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
778 */ 778 */
779 WARN_ON(journal_current_handle()); 779 WARN_ON(journal_current_handle());
780 status = ocfs2_extend_no_holes(gqinode, NULL, 780 status = ocfs2_extend_no_holes(gqinode, NULL,
781 gqinode->i_size + (need_alloc << sb->s_blocksize_bits), 781 i_size_read(gqinode) + (need_alloc << sb->s_blocksize_bits),
782 gqinode->i_size); 782 i_size_read(gqinode));
783 if (status < 0) 783 if (status < 0)
784 goto out_dq; 784 goto out_dq;
785 } 785 }
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 27fe7ee4874c..2e4344be3b96 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -982,14 +982,14 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
982 982
983 /* We are protected by dqio_sem so no locking needed */ 983 /* We are protected by dqio_sem so no locking needed */
984 status = ocfs2_extend_no_holes(lqinode, NULL, 984 status = ocfs2_extend_no_holes(lqinode, NULL,
985 lqinode->i_size + 2 * sb->s_blocksize, 985 i_size_read(lqinode) + 2 * sb->s_blocksize,
986 lqinode->i_size); 986 i_size_read(lqinode));
987 if (status < 0) { 987 if (status < 0) {
988 mlog_errno(status); 988 mlog_errno(status);
989 goto out; 989 goto out;
990 } 990 }
991 status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, 991 status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
992 lqinode->i_size + 2 * sb->s_blocksize); 992 i_size_read(lqinode) + 2 * sb->s_blocksize);
993 if (status < 0) { 993 if (status < 0) {
994 mlog_errno(status); 994 mlog_errno(status);
995 goto out; 995 goto out;
@@ -1125,14 +1125,14 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1125 1125
1126 /* We are protected by dqio_sem so no locking needed */ 1126 /* We are protected by dqio_sem so no locking needed */
1127 status = ocfs2_extend_no_holes(lqinode, NULL, 1127 status = ocfs2_extend_no_holes(lqinode, NULL,
1128 lqinode->i_size + sb->s_blocksize, 1128 i_size_read(lqinode) + sb->s_blocksize,
1129 lqinode->i_size); 1129 i_size_read(lqinode));
1130 if (status < 0) { 1130 if (status < 0) {
1131 mlog_errno(status); 1131 mlog_errno(status);
1132 goto out; 1132 goto out;
1133 } 1133 }
1134 status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, 1134 status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
1135 lqinode->i_size + sb->s_blocksize); 1135 i_size_read(lqinode) + sb->s_blocksize);
1136 if (status < 0) { 1136 if (status < 0) {
1137 mlog_errno(status); 1137 mlog_errno(status);
1138 goto out; 1138 goto out;
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index a70d604593b6..bf4dfc14bb2c 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3854,7 +3854,10 @@ static int ocfs2_attach_refcount_tree(struct inode *inode,
3854 while (cpos < clusters) { 3854 while (cpos < clusters) {
3855 ret = ocfs2_get_clusters(inode, cpos, &p_cluster, 3855 ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
3856 &num_clusters, &ext_flags); 3856 &num_clusters, &ext_flags);
3857 3857 if (ret) {
3858 mlog_errno(ret);
3859 goto unlock;
3860 }
3858 if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) { 3861 if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) {
3859 ret = ocfs2_add_refcount_flag(inode, &di_et, 3862 ret = ocfs2_add_refcount_flag(inode, &di_et,
3860 &ref_tree->rf_ci, 3863 &ref_tree->rf_ci,
@@ -4025,7 +4028,10 @@ static int ocfs2_duplicate_extent_list(struct inode *s_inode,
4025 while (cpos < clusters) { 4028 while (cpos < clusters) {
4026 ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster, 4029 ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster,
4027 &num_clusters, &ext_flags); 4030 &num_clusters, &ext_flags);
4028 4031 if (ret) {
4032 mlog_errno(ret);
4033 goto out;
4034 }
4029 if (p_cluster) { 4035 if (p_cluster) {
4030 ret = ocfs2_add_refcounted_extent(t_inode, &et, 4036 ret = ocfs2_add_refcounted_extent(t_inode, &et,
4031 ref_ci, ref_root_bh, 4037 ref_ci, ref_root_bh,
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 317ef0abccbb..6ce0686eab72 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3505,7 +3505,7 @@ int ocfs2_xattr_set(struct inode *inode,
3505 int ret, credits, ref_meta = 0, ref_credits = 0; 3505 int ret, credits, ref_meta = 0, ref_credits = 0;
3506 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3506 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3507 struct inode *tl_inode = osb->osb_tl_inode; 3507 struct inode *tl_inode = osb->osb_tl_inode;
3508 struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; 3508 struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, NULL, };
3509 struct ocfs2_refcount_tree *ref_tree = NULL; 3509 struct ocfs2_refcount_tree *ref_tree = NULL;
3510 3510
3511 struct ocfs2_xattr_info xi = { 3511 struct ocfs2_xattr_info xi = {
@@ -3609,13 +3609,14 @@ int ocfs2_xattr_set(struct inode *inode,
3609 if (IS_ERR(ctxt.handle)) { 3609 if (IS_ERR(ctxt.handle)) {
3610 ret = PTR_ERR(ctxt.handle); 3610 ret = PTR_ERR(ctxt.handle);
3611 mlog_errno(ret); 3611 mlog_errno(ret);
3612 goto cleanup; 3612 goto out_free_ac;
3613 } 3613 }
3614 3614
3615 ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); 3615 ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
3616 3616
3617 ocfs2_commit_trans(osb, ctxt.handle); 3617 ocfs2_commit_trans(osb, ctxt.handle);
3618 3618
3619out_free_ac:
3619 if (ctxt.data_ac) 3620 if (ctxt.data_ac)
3620 ocfs2_free_alloc_context(ctxt.data_ac); 3621 ocfs2_free_alloc_context(ctxt.data_ac);
3621 if (ctxt.meta_ac) 3622 if (ctxt.meta_ac)
@@ -5881,6 +5882,10 @@ static int ocfs2_xattr_value_attach_refcount(struct inode *inode,
5881 while (cpos < clusters) { 5882 while (cpos < clusters) {
5882 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, 5883 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
5883 &num_clusters, el, &ext_flags); 5884 &num_clusters, el, &ext_flags);
5885 if (ret) {
5886 mlog_errno(ret);
5887 break;
5888 }
5884 5889
5885 cpos += num_clusters; 5890 cpos += num_clusters;
5886 if ((ext_flags & OCFS2_EXT_REFCOUNTED)) 5891 if ((ext_flags & OCFS2_EXT_REFCOUNTED))
@@ -6797,7 +6802,7 @@ out:
6797 if (ret) { 6802 if (ret) {
6798 if (*meta_ac) { 6803 if (*meta_ac) {
6799 ocfs2_free_alloc_context(*meta_ac); 6804 ocfs2_free_alloc_context(*meta_ac);
6800 meta_ac = NULL; 6805 *meta_ac = NULL;
6801 } 6806 }
6802 } 6807 }
6803 6808
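The "*meta_ac = NULL" fix above is the classic double-pointer error-path bug: clearing the local pointer instead of writing through it leaves the caller holding a freed allocation context. A minimal illustration in plain userspace C (not ocfs2 code):

#include <stdlib.h>

struct ctx { int dummy; };

static int prepare(struct ctx **out)
{
	*out = malloc(sizeof(**out));
	if (!*out)
		return -1;

	if (1 /* pretend a later setup step failed */) {
		free(*out);
		*out = NULL;	/* correct: the caller now sees NULL       */
		/* "out = NULL;" would only clear the local copy, leaving  */
		/* the caller's pointer dangling and ripe for double free. */
		return -1;
	}
	return 0;
}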
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 0ff80f9b930f..985ea881b5bc 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -286,7 +286,7 @@ int proc_fd_permission(struct inode *inode, int mask)
286 int rv = generic_permission(inode, mask); 286 int rv = generic_permission(inode, mask);
287 if (rv == 0) 287 if (rv == 0)
288 return 0; 288 return 0;
289 if (task_pid(current) == proc_pid(inode)) 289 if (task_tgid(current) == proc_pid(inode))
290 rv = 0; 290 rv = 0;
291 return rv; 291 return rv;
292} 292}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 107d026f5d6e..7366e9d63cee 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -740,6 +740,9 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
740 ptent = pte_file_clear_soft_dirty(ptent); 740 ptent = pte_file_clear_soft_dirty(ptent);
741 } 741 }
742 742
743 if (vma->vm_flags & VM_SOFTDIRTY)
744 vma->vm_flags &= ~VM_SOFTDIRTY;
745
743 set_pte_at(vma->vm_mm, addr, pte, ptent); 746 set_pte_at(vma->vm_mm, addr, pte, ptent);
744#endif 747#endif
745} 748}
@@ -949,13 +952,15 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
949 if (is_migration_entry(entry)) 952 if (is_migration_entry(entry))
950 page = migration_entry_to_page(entry); 953 page = migration_entry_to_page(entry);
951 } else { 954 } else {
952 *pme = make_pme(PM_NOT_PRESENT(pm->v2)); 955 if (vma->vm_flags & VM_SOFTDIRTY)
956 flags2 |= __PM_SOFT_DIRTY;
957 *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
953 return; 958 return;
954 } 959 }
955 960
956 if (page && !PageAnon(page)) 961 if (page && !PageAnon(page))
957 flags |= PM_FILE; 962 flags |= PM_FILE;
958 if (pte_soft_dirty(pte)) 963 if ((vma->vm_flags & VM_SOFTDIRTY) || pte_soft_dirty(pte))
959 flags2 |= __PM_SOFT_DIRTY; 964 flags2 |= __PM_SOFT_DIRTY;
960 965
961 *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); 966 *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
@@ -974,7 +979,7 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *p
974 *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) 979 *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
975 | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); 980 | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
976 else 981 else
977 *pme = make_pme(PM_NOT_PRESENT(pm->v2)); 982 *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2));
978} 983}
979#else 984#else
980static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, 985static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
@@ -997,7 +1002,11 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
997 if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { 1002 if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
998 int pmd_flags2; 1003 int pmd_flags2;
999 1004
1000 pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0); 1005 if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
1006 pmd_flags2 = __PM_SOFT_DIRTY;
1007 else
1008 pmd_flags2 = 0;
1009
1001 for (; addr != end; addr += PAGE_SIZE) { 1010 for (; addr != end; addr += PAGE_SIZE) {
1002 unsigned long offset; 1011 unsigned long offset;
1003 1012
@@ -1015,12 +1024,17 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1015 if (pmd_trans_unstable(pmd)) 1024 if (pmd_trans_unstable(pmd))
1016 return 0; 1025 return 0;
1017 for (; addr != end; addr += PAGE_SIZE) { 1026 for (; addr != end; addr += PAGE_SIZE) {
1027 int flags2;
1018 1028
1019 /* check to see if we've left 'vma' behind 1029 /* check to see if we've left 'vma' behind
1020 * and need a new, higher one */ 1030 * and need a new, higher one */
1021 if (vma && (addr >= vma->vm_end)) { 1031 if (vma && (addr >= vma->vm_end)) {
1022 vma = find_vma(walk->mm, addr); 1032 vma = find_vma(walk->mm, addr);
1023 pme = make_pme(PM_NOT_PRESENT(pm->v2)); 1033 if (vma && (vma->vm_flags & VM_SOFTDIRTY))
1034 flags2 = __PM_SOFT_DIRTY;
1035 else
1036 flags2 = 0;
1037 pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
1024 } 1038 }
1025 1039
1026 /* check that 'vma' actually covers this address, 1040 /* check that 'vma' actually covers this address,
@@ -1044,13 +1058,15 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1044 1058
1045#ifdef CONFIG_HUGETLB_PAGE 1059#ifdef CONFIG_HUGETLB_PAGE
1046static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, 1060static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
1047 pte_t pte, int offset) 1061 pte_t pte, int offset, int flags2)
1048{ 1062{
1049 if (pte_present(pte)) 1063 if (pte_present(pte))
1050 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) 1064 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) |
1051 | PM_STATUS2(pm->v2, 0) | PM_PRESENT); 1065 PM_STATUS2(pm->v2, flags2) |
1066 PM_PRESENT);
1052 else 1067 else
1053 *pme = make_pme(PM_NOT_PRESENT(pm->v2)); 1068 *pme = make_pme(PM_NOT_PRESENT(pm->v2) |
1069 PM_STATUS2(pm->v2, flags2));
1054} 1070}
1055 1071
1056/* This function walks within one hugetlb entry in the single call */ 1072/* This function walks within one hugetlb entry in the single call */
@@ -1059,12 +1075,22 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
1059 struct mm_walk *walk) 1075 struct mm_walk *walk)
1060{ 1076{
1061 struct pagemapread *pm = walk->private; 1077 struct pagemapread *pm = walk->private;
1078 struct vm_area_struct *vma;
1062 int err = 0; 1079 int err = 0;
1080 int flags2;
1063 pagemap_entry_t pme; 1081 pagemap_entry_t pme;
1064 1082
1083 vma = find_vma(walk->mm, addr);
1084 WARN_ON_ONCE(!vma);
1085
1086 if (vma && (vma->vm_flags & VM_SOFTDIRTY))
1087 flags2 = __PM_SOFT_DIRTY;
1088 else
1089 flags2 = 0;
1090
1065 for (; addr != end; addr += PAGE_SIZE) { 1091 for (; addr != end; addr += PAGE_SIZE) {
1066 int offset = (addr & ~hmask) >> PAGE_SHIFT; 1092 int offset = (addr & ~hmask) >> PAGE_SHIFT;
1067 huge_pte_to_pagemap_entry(&pme, pm, *pte, offset); 1093 huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2);
1068 err = add_to_pagemap(addr, &pme, pm); 1094 err = add_to_pagemap(addr, &pme, pm);
1069 if (err) 1095 if (err)
1070 return err; 1096 return err;
@@ -1376,8 +1402,10 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1376 walk.mm = mm; 1402 walk.mm = mm;
1377 1403
1378 pol = get_vma_policy(task, vma, vma->vm_start); 1404 pol = get_vma_policy(task, vma, vma->vm_start);
1379 mpol_to_str(buffer, sizeof(buffer), pol); 1405 n = mpol_to_str(buffer, sizeof(buffer), pol);
1380 mpol_cond_put(pol); 1406 mpol_cond_put(pol);
1407 if (n < 0)
1408 return n;
1381 1409
1382 seq_printf(m, "%08lx %s", vma->vm_start, buffer); 1410 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1383 1411
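These pagemap hunks propagate the per-VMA VM_SOFTDIRTY flag into the exported soft-dirty bit, so userspace sees newly created (not yet written) mappings as dirty too. A userspace sketch of the consumer side (assumes CONFIG_MEM_SOFT_DIRTY): bit 55 of a pagemap entry is the soft-dirty flag, and writing "4" to /proc/<pid>/clear_refs resets tracking for the whole mm.

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

static int page_soft_dirty(void *addr)
{
	uint64_t entry;
	long psize = sysconf(_SC_PAGESIZE);
	off_t off = ((uintptr_t)addr / psize) * sizeof(entry);
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd < 0)
		return -1;
	if (pread(fd, &entry, sizeof(entry), off) != sizeof(entry)) {
		close(fd);
		return -1;
	}
	close(fd);
	return (entry >> 55) & 1;	/* bit 55: soft-dirty */
}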
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index a1a16eb97c7b..9100d6959886 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -21,6 +21,7 @@
21#include <linux/crash_dump.h> 21#include <linux/crash_dump.h>
22#include <linux/list.h> 22#include <linux/list.h>
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24#include <linux/pagemap.h>
24#include <asm/uaccess.h> 25#include <asm/uaccess.h>
25#include <asm/io.h> 26#include <asm/io.h>
26#include "internal.h" 27#include "internal.h"
@@ -123,11 +124,65 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
123 return read; 124 return read;
124} 125}
125 126
127/*
128 * Architectures may override this function to allocate ELF header in 2nd kernel
129 */
130int __weak elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size)
131{
132 return 0;
133}
134
135/*
136 * Architectures may override this function to free header
137 */
138void __weak elfcorehdr_free(unsigned long long addr)
139{}
140
141/*
142 * Architectures may override this function to read from ELF header
143 */
144ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
145{
146 return read_from_oldmem(buf, count, ppos, 0);
147}
148
149/*
150 * Architectures may override this function to read from notes sections
151 */
152ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
153{
154 return read_from_oldmem(buf, count, ppos, 0);
155}
156
157/*
158 * Architectures may override this function to map oldmem
159 */
160int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma,
161 unsigned long from, unsigned long pfn,
162 unsigned long size, pgprot_t prot)
163{
164 return remap_pfn_range(vma, from, pfn, size, prot);
165}
166
167/*
168 * Copy to either kernel or user space
169 */
170static int copy_to(void *target, void *src, size_t size, int userbuf)
171{
172 if (userbuf) {
173 if (copy_to_user((char __user *) target, src, size))
174 return -EFAULT;
175 } else {
176 memcpy(target, src, size);
177 }
178 return 0;
179}
180
126/* Read from the ELF header and then the crash dump. On error, negative value is 181/* Read from the ELF header and then the crash dump. On error, negative value is
127 * returned otherwise number of bytes read are returned. 182 * returned otherwise number of bytes read are returned.
128 */ 183 */
129static ssize_t read_vmcore(struct file *file, char __user *buffer, 184static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
130 size_t buflen, loff_t *fpos) 185 int userbuf)
131{ 186{
132 ssize_t acc = 0, tmp; 187 ssize_t acc = 0, tmp;
133 size_t tsz; 188 size_t tsz;
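The __weak hooks added above let an architecture whose ELF core header is built inside the capture kernel (s390 in this series) swap in its own accessors. A hypothetical override sketch, assuming the header lives in ordinary kernel memory so *ppos is directly addressable; this is not the s390 implementation verbatim:

ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos)
{
	/* Header was created by this kernel, so no oldmem access needed. */
	memcpy(buf, (void *)(unsigned long)*ppos, count);
	*ppos += count;
	return count;
}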
@@ -144,7 +199,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
144 /* Read ELF core header */ 199 /* Read ELF core header */
145 if (*fpos < elfcorebuf_sz) { 200 if (*fpos < elfcorebuf_sz) {
146 tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen); 201 tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen);
147 if (copy_to_user(buffer, elfcorebuf + *fpos, tsz)) 202 if (copy_to(buffer, elfcorebuf + *fpos, tsz, userbuf))
148 return -EFAULT; 203 return -EFAULT;
149 buflen -= tsz; 204 buflen -= tsz;
150 *fpos += tsz; 205 *fpos += tsz;
@@ -162,7 +217,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
162 217
163 tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen); 218 tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen);
164 kaddr = elfnotes_buf + *fpos - elfcorebuf_sz; 219 kaddr = elfnotes_buf + *fpos - elfcorebuf_sz;
165 if (copy_to_user(buffer, kaddr, tsz)) 220 if (copy_to(buffer, kaddr, tsz, userbuf))
166 return -EFAULT; 221 return -EFAULT;
167 buflen -= tsz; 222 buflen -= tsz;
168 *fpos += tsz; 223 *fpos += tsz;
@@ -178,7 +233,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
178 if (*fpos < m->offset + m->size) { 233 if (*fpos < m->offset + m->size) {
179 tsz = min_t(size_t, m->offset + m->size - *fpos, buflen); 234 tsz = min_t(size_t, m->offset + m->size - *fpos, buflen);
180 start = m->paddr + *fpos - m->offset; 235 start = m->paddr + *fpos - m->offset;
181 tmp = read_from_oldmem(buffer, tsz, &start, 1); 236 tmp = read_from_oldmem(buffer, tsz, &start, userbuf);
182 if (tmp < 0) 237 if (tmp < 0)
183 return tmp; 238 return tmp;
184 buflen -= tsz; 239 buflen -= tsz;
@@ -195,6 +250,55 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
195 return acc; 250 return acc;
196} 251}
197 252
253static ssize_t read_vmcore(struct file *file, char __user *buffer,
254 size_t buflen, loff_t *fpos)
255{
256 return __read_vmcore((__force char *) buffer, buflen, fpos, 1);
257}
258
259/*
260 * The vmcore fault handler uses the page cache and fills data using the
261 * standard __vmcore_read() function.
262 *
263 * On s390 the fault handler is used for memory regions that can't be mapped
264 * directly with remap_pfn_range().
265 */
266static int mmap_vmcore_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
267{
268#ifdef CONFIG_S390
269 struct address_space *mapping = vma->vm_file->f_mapping;
270 pgoff_t index = vmf->pgoff;
271 struct page *page;
272 loff_t offset;
273 char *buf;
274 int rc;
275
276 page = find_or_create_page(mapping, index, GFP_KERNEL);
277 if (!page)
278 return VM_FAULT_OOM;
279 if (!PageUptodate(page)) {
280 offset = (loff_t) index << PAGE_CACHE_SHIFT;
281 buf = __va((page_to_pfn(page) << PAGE_SHIFT));
282 rc = __read_vmcore(buf, PAGE_SIZE, &offset, 0);
283 if (rc < 0) {
284 unlock_page(page);
285 page_cache_release(page);
286 return (rc == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
287 }
288 SetPageUptodate(page);
289 }
290 unlock_page(page);
291 vmf->page = page;
292 return 0;
293#else
294 return VM_FAULT_SIGBUS;
295#endif
296}
297
298static const struct vm_operations_struct vmcore_mmap_ops = {
299 .fault = mmap_vmcore_fault,
300};
301
198/** 302/**
199 * alloc_elfnotes_buf - allocate buffer for ELF note segment in 303 * alloc_elfnotes_buf - allocate buffer for ELF note segment in
200 * vmalloc memory 304 * vmalloc memory
@@ -223,7 +327,7 @@ static inline char *alloc_elfnotes_buf(size_t notes_sz)
223 * regions in the 1st kernel pointed to by PT_LOAD entries) into 327 * regions in the 1st kernel pointed to by PT_LOAD entries) into
224 * virtually contiguous user-space in ELF layout. 328 * virtually contiguous user-space in ELF layout.
225 */ 329 */
226#if defined(CONFIG_MMU) && !defined(CONFIG_S390) 330#ifdef CONFIG_MMU
227static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) 331static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
228{ 332{
229 size_t size = vma->vm_end - vma->vm_start; 333 size_t size = vma->vm_end - vma->vm_start;
@@ -241,6 +345,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
241 345
242 vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); 346 vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
243 vma->vm_flags |= VM_MIXEDMAP; 347 vma->vm_flags |= VM_MIXEDMAP;
348 vma->vm_ops = &vmcore_mmap_ops;
244 349
245 len = 0; 350 len = 0;
246 351
@@ -282,9 +387,9 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
282 387
283 tsz = min_t(size_t, m->offset + m->size - start, size); 388 tsz = min_t(size_t, m->offset + m->size - start, size);
284 paddr = m->paddr + start - m->offset; 389 paddr = m->paddr + start - m->offset;
285 if (remap_pfn_range(vma, vma->vm_start + len, 390 if (remap_oldmem_pfn_range(vma, vma->vm_start + len,
286 paddr >> PAGE_SHIFT, tsz, 391 paddr >> PAGE_SHIFT, tsz,
287 vma->vm_page_prot)) 392 vma->vm_page_prot))
288 goto fail; 393 goto fail;
289 size -= tsz; 394 size -= tsz;
290 start += tsz; 395 start += tsz;
@@ -357,7 +462,7 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
357 notes_section = kmalloc(max_sz, GFP_KERNEL); 462 notes_section = kmalloc(max_sz, GFP_KERNEL);
358 if (!notes_section) 463 if (!notes_section)
359 return -ENOMEM; 464 return -ENOMEM;
360 rc = read_from_oldmem(notes_section, max_sz, &offset, 0); 465 rc = elfcorehdr_read_notes(notes_section, max_sz, &offset);
361 if (rc < 0) { 466 if (rc < 0) {
362 kfree(notes_section); 467 kfree(notes_section);
363 return rc; 468 return rc;
@@ -444,7 +549,8 @@ static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf)
444 if (phdr_ptr->p_type != PT_NOTE) 549 if (phdr_ptr->p_type != PT_NOTE)
445 continue; 550 continue;
446 offset = phdr_ptr->p_offset; 551 offset = phdr_ptr->p_offset;
447 rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0); 552 rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz,
553 &offset);
448 if (rc < 0) 554 if (rc < 0)
449 return rc; 555 return rc;
450 notes_buf += phdr_ptr->p_memsz; 556 notes_buf += phdr_ptr->p_memsz;
@@ -536,7 +642,7 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
536 notes_section = kmalloc(max_sz, GFP_KERNEL); 642 notes_section = kmalloc(max_sz, GFP_KERNEL);
537 if (!notes_section) 643 if (!notes_section)
538 return -ENOMEM; 644 return -ENOMEM;
539 rc = read_from_oldmem(notes_section, max_sz, &offset, 0); 645 rc = elfcorehdr_read_notes(notes_section, max_sz, &offset);
540 if (rc < 0) { 646 if (rc < 0) {
541 kfree(notes_section); 647 kfree(notes_section);
542 return rc; 648 return rc;
@@ -623,7 +729,8 @@ static int __init copy_notes_elf32(const Elf32_Ehdr *ehdr_ptr, char *notes_buf)
623 if (phdr_ptr->p_type != PT_NOTE) 729 if (phdr_ptr->p_type != PT_NOTE)
624 continue; 730 continue;
625 offset = phdr_ptr->p_offset; 731 offset = phdr_ptr->p_offset;
626 rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0); 732 rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz,
733 &offset);
627 if (rc < 0) 734 if (rc < 0)
628 return rc; 735 return rc;
629 notes_buf += phdr_ptr->p_memsz; 736 notes_buf += phdr_ptr->p_memsz;
@@ -810,7 +917,7 @@ static int __init parse_crash_elf64_headers(void)
810 addr = elfcorehdr_addr; 917 addr = elfcorehdr_addr;
811 918
812 /* Read Elf header */ 919 /* Read Elf header */
813 rc = read_from_oldmem((char*)&ehdr, sizeof(Elf64_Ehdr), &addr, 0); 920 rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf64_Ehdr), &addr);
814 if (rc < 0) 921 if (rc < 0)
815 return rc; 922 return rc;
816 923
@@ -837,7 +944,7 @@ static int __init parse_crash_elf64_headers(void)
837 if (!elfcorebuf) 944 if (!elfcorebuf)
838 return -ENOMEM; 945 return -ENOMEM;
839 addr = elfcorehdr_addr; 946 addr = elfcorehdr_addr;
840 rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0); 947 rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, &addr);
841 if (rc < 0) 948 if (rc < 0)
842 goto fail; 949 goto fail;
843 950
@@ -866,7 +973,7 @@ static int __init parse_crash_elf32_headers(void)
866 addr = elfcorehdr_addr; 973 addr = elfcorehdr_addr;
867 974
868 /* Read Elf header */ 975 /* Read Elf header */
869 rc = read_from_oldmem((char*)&ehdr, sizeof(Elf32_Ehdr), &addr, 0); 976 rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf32_Ehdr), &addr);
870 if (rc < 0) 977 if (rc < 0)
871 return rc; 978 return rc;
872 979
@@ -892,7 +999,7 @@ static int __init parse_crash_elf32_headers(void)
892 if (!elfcorebuf) 999 if (!elfcorebuf)
893 return -ENOMEM; 1000 return -ENOMEM;
894 addr = elfcorehdr_addr; 1001 addr = elfcorehdr_addr;
895 rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0); 1002 rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, &addr);
896 if (rc < 0) 1003 if (rc < 0)
897 goto fail; 1004 goto fail;
898 1005
@@ -919,7 +1026,7 @@ static int __init parse_crash_elf_headers(void)
919 int rc=0; 1026 int rc=0;
920 1027
921 addr = elfcorehdr_addr; 1028 addr = elfcorehdr_addr;
922 rc = read_from_oldmem(e_ident, EI_NIDENT, &addr, 0); 1029 rc = elfcorehdr_read(e_ident, EI_NIDENT, &addr);
923 if (rc < 0) 1030 if (rc < 0)
924 return rc; 1031 return rc;
925 if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) { 1032 if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) {
@@ -952,7 +1059,14 @@ static int __init vmcore_init(void)
952{ 1059{
953 int rc = 0; 1060 int rc = 0;
954 1061
955 /* If elfcorehdr= has been passed in cmdline, then capture the dump.*/ 1062 /* Allow architectures to allocate ELF header in 2nd kernel */
1063 rc = elfcorehdr_alloc(&elfcorehdr_addr, &elfcorehdr_size);
1064 if (rc)
1065 return rc;
1066 /*
1067 * If elfcorehdr= has been passed in cmdline or created in 2nd kernel,
1068 * then capture the dump.
1069 */
956 if (!(is_vmcore_usable())) 1070 if (!(is_vmcore_usable()))
957 return rc; 1071 return rc;
958 rc = parse_crash_elf_headers(); 1072 rc = parse_crash_elf_headers();
@@ -960,6 +1074,8 @@ static int __init vmcore_init(void)
960 pr_warn("Kdump: vmcore not initialized\n"); 1074 pr_warn("Kdump: vmcore not initialized\n");
961 return rc; 1075 return rc;
962 } 1076 }
1077 elfcorehdr_free(elfcorehdr_addr);
1078 elfcorehdr_addr = ELFCORE_ADDR_ERR;
963 1079
964 proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations); 1080 proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations);
965 if (proc_vmcore) 1081 if (proc_vmcore)
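With remap_oldmem_pfn_range() and the page-cache fault handler wired up, mmap of /proc/vmcore now works even where old memory cannot be mapped directly. A userspace sketch of the consumer side, run inside the kdump capture kernel; checking the ELF ident is enough to exercise both the mmap and read paths:

#include <elf.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	unsigned char buf[EI_NIDENT];
	const unsigned char *id = NULL;
	void *map;
	int fd = open("/proc/vmcore", O_RDONLY);

	if (fd < 0) {
		perror("open /proc/vmcore");	/* present only after kdump */
		return 1;
	}

	/* Prefer the mmap path this series fixes up; fall back to read(). */
	map = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, fd, 0);
	if (map != MAP_FAILED)
		id = map;
	else if (read(fd, buf, sizeof(buf)) == sizeof(buf))
		id = buf;

	if (id)
		printf("ELF magic %s\n",
		       memcmp(id, ELFMAG, SELFMAG) ? "mismatch" : "ok");

	if (map != MAP_FAILED)
		munmap(map, 4096);
	close(fd);
	return 0;
}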
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index c24f1e10b946..39d14659a8d3 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -244,12 +244,6 @@ struct dentry *ramfs_mount(struct file_system_type *fs_type,
244 return mount_nodev(fs_type, flags, data, ramfs_fill_super); 244 return mount_nodev(fs_type, flags, data, ramfs_fill_super);
245} 245}
246 246
247static struct dentry *rootfs_mount(struct file_system_type *fs_type,
248 int flags, const char *dev_name, void *data)
249{
250 return mount_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super);
251}
252
253static void ramfs_kill_sb(struct super_block *sb) 247static void ramfs_kill_sb(struct super_block *sb)
254{ 248{
255 kfree(sb->s_fs_info); 249 kfree(sb->s_fs_info);
@@ -262,29 +256,23 @@ static struct file_system_type ramfs_fs_type = {
262 .kill_sb = ramfs_kill_sb, 256 .kill_sb = ramfs_kill_sb,
263 .fs_flags = FS_USERNS_MOUNT, 257 .fs_flags = FS_USERNS_MOUNT,
264}; 258};
265static struct file_system_type rootfs_fs_type = {
266 .name = "rootfs",
267 .mount = rootfs_mount,
268 .kill_sb = kill_litter_super,
269};
270 259
271static int __init init_ramfs_fs(void) 260int __init init_ramfs_fs(void)
272{
273 return register_filesystem(&ramfs_fs_type);
274}
275module_init(init_ramfs_fs)
276
277int __init init_rootfs(void)
278{ 261{
262 static unsigned long once;
279 int err; 263 int err;
280 264
265 if (test_and_set_bit(0, &once))
266 return 0;
267
281 err = bdi_init(&ramfs_backing_dev_info); 268 err = bdi_init(&ramfs_backing_dev_info);
282 if (err) 269 if (err)
283 return err; 270 return err;
284 271
285 err = register_filesystem(&rootfs_fs_type); 272 err = register_filesystem(&ramfs_fs_type);
286 if (err) 273 if (err)
287 bdi_destroy(&ramfs_backing_dev_info); 274 bdi_destroy(&ramfs_backing_dev_info);
288 275
289 return err; 276 return err;
290} 277}
278module_init(init_ramfs_fs)
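The test_and_set_bit() guard above is the usual "run once even if reached from two init paths" idiom: init_ramfs_fs() is now both a module_init and callable from the rootfs setup in init/. A minimal sketch of the pattern (function names hypothetical):

static int __init my_subsys_init(void)
{
	static unsigned long once;

	if (test_and_set_bit(0, &once))
		return 0;	/* another path already did the setup */

	return do_real_setup();	/* assumed helper */
}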
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index c3881553f7d1..5f66d519a726 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -243,6 +243,8 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
243 * BDI_CAP_EXEC_MAP: Can be mapped for execution 243 * BDI_CAP_EXEC_MAP: Can be mapped for execution
244 * 244 *
245 * BDI_CAP_SWAP_BACKED: Count shmem/tmpfs objects as swap-backed. 245 * BDI_CAP_SWAP_BACKED: Count shmem/tmpfs objects as swap-backed.
246 *
247 * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold.
246 */ 248 */
247#define BDI_CAP_NO_ACCT_DIRTY 0x00000001 249#define BDI_CAP_NO_ACCT_DIRTY 0x00000001
248#define BDI_CAP_NO_WRITEBACK 0x00000002 250#define BDI_CAP_NO_WRITEBACK 0x00000002
@@ -254,6 +256,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
254#define BDI_CAP_NO_ACCT_WB 0x00000080 256#define BDI_CAP_NO_ACCT_WB 0x00000080
255#define BDI_CAP_SWAP_BACKED 0x00000100 257#define BDI_CAP_SWAP_BACKED 0x00000100
256#define BDI_CAP_STABLE_WRITES 0x00000200 258#define BDI_CAP_STABLE_WRITES 0x00000200
259#define BDI_CAP_STRICTLIMIT 0x00000400
257 260
258#define BDI_CAP_VMFLAGS \ 261#define BDI_CAP_VMFLAGS \
259 (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) 262 (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
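A hypothetical opt-in sketch for the new capability bit: a driver or filesystem asking writeback to keep its dirty pages below the per-bdi threshold. The capabilities field is the existing backing_dev_info member; the device name is made up.

static int mydev_init_bdi(struct backing_dev_info *bdi)
{
	/* Enforce the per-bdi dirty threshold strictly for this device. */
	bdi->capabilities |= BDI_CAP_STRICTLIMIT;
	return 0;
}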
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 70cf138690e9..e8112ae50531 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -31,7 +31,7 @@ struct linux_binprm {
31#ifdef __alpha__ 31#ifdef __alpha__
32 unsigned int taso:1; 32 unsigned int taso:1;
33#endif 33#endif
34 unsigned int recursion_depth; 34 unsigned int recursion_depth; /* only for search_binary_handler() */
35 struct file * file; 35 struct file * file;
36 struct cred *cred; /* new credentials */ 36 struct cred *cred; /* new credentials */
37 int unsafe; /* how unsafe this exec is (mask of LSM_UNSAFE_*) */ 37 int unsafe; /* how unsafe this exec is (mask of LSM_UNSAFE_*) */
diff --git a/include/linux/cmdline-parser.h b/include/linux/cmdline-parser.h
new file mode 100644
index 000000000000..98e892ef6d5a
--- /dev/null
+++ b/include/linux/cmdline-parser.h
@@ -0,0 +1,43 @@
1/*
2 * Parsing command line, get the partitions information.
3 *
4 * Written by Cai Zhiyong <caizhiyong@huawei.com>
5 *
6 */
7#ifndef CMDLINEPARSEH
8#define CMDLINEPARSEH
9
10#include <linux/blkdev.h>
11
12/* partition flags */
13#define PF_RDONLY 0x01 /* Device is read only */
14#define PF_POWERUP_LOCK 0x02 /* Always locked after reset */
15
16struct cmdline_subpart {
17 char name[BDEVNAME_SIZE]; /* partition name, such as 'rootfs' */
18 sector_t from;
19 sector_t size;
20 int flags;
21 struct cmdline_subpart *next_subpart;
22};
23
24struct cmdline_parts {
25 char name[BDEVNAME_SIZE]; /* block device, such as 'mmcblk0' */
26 unsigned int nr_subparts;
27 struct cmdline_subpart *subpart;
28 struct cmdline_parts *next_parts;
29};
30
31void cmdline_parts_free(struct cmdline_parts **parts);
32
33int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline);
34
35struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts,
36 const char *bdev);
37
38void cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size,
39 int slot,
40 int (*add_part)(int, struct cmdline_subpart *, void *),
41 void *param);
42
43#endif /* CMDLINEPARSEH */
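A hypothetical consumer sketch for the new header (the in-tree user is block/partitions/cmdline.c; this is not that code). Only the prototypes declared above are used; what the callback records per slot is up to the caller.

static int add_one_subpart(int slot, struct cmdline_subpart *subpart,
			   void *param)
{
	/* Record subpart->from / subpart->size for partition `slot` here. */
	return 0;
}

static void apply_blkdevparts(const char *cmdline, const char *bdev,
			      sector_t disk_size)
{
	struct cmdline_parts *parts, *match;

	if (cmdline_parts_parse(&parts, cmdline))
		return;

	match = cmdline_parts_find(parts, bdev);
	if (match)
		cmdline_parts_set(match, disk_size, 1, add_one_subpart, NULL);

	cmdline_parts_free(&parts);
}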
diff --git a/include/linux/compat.h b/include/linux/compat.h
index ec1aee4aec9c..345da00a86e0 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -43,6 +43,7 @@
43#define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ 43#define COMPAT_SYSCALL_DEFINEx(x, name, ...) \
44 asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ 44 asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
45 static inline long C_SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ 45 static inline long C_SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
46 asmlinkage long compat_SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));\
46 asmlinkage long compat_SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))\ 47 asmlinkage long compat_SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))\
47 { \ 48 { \
48 return C_SYSC##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__)); \ 49 return C_SYSC##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__)); \
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index 37e4f8da7cdf..fe68a5a98583 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -12,6 +12,15 @@
12extern unsigned long long elfcorehdr_addr; 12extern unsigned long long elfcorehdr_addr;
13extern unsigned long long elfcorehdr_size; 13extern unsigned long long elfcorehdr_size;
14 14
15extern int __weak elfcorehdr_alloc(unsigned long long *addr,
16 unsigned long long *size);
17extern void __weak elfcorehdr_free(unsigned long long addr);
18extern ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos);
19extern ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos);
20extern int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma,
21 unsigned long from, unsigned long pfn,
22 unsigned long size, pgprot_t prot);
23
15extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, 24extern ssize_t copy_oldmem_page(unsigned long, char *, size_t,
16 unsigned long, int); 25 unsigned long, int);
17 26
diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index 661d374aeb2d..f8d41cb1cbe0 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -66,8 +66,8 @@ struct gen_pool_chunk {
66 struct list_head next_chunk; /* next chunk in pool */ 66 struct list_head next_chunk; /* next chunk in pool */
67 atomic_t avail; 67 atomic_t avail;
68 phys_addr_t phys_addr; /* physical starting address of memory chunk */ 68 phys_addr_t phys_addr; /* physical starting address of memory chunk */
69 unsigned long start_addr; /* starting address of memory chunk */ 69 unsigned long start_addr; /* start address of memory chunk */
70 unsigned long end_addr; /* ending address of memory chunk */ 70 unsigned long end_addr; /* end address of memory chunk (inclusive) */
71 unsigned long bits[0]; /* bitmap for allocating memory chunk */ 71 unsigned long bits[0]; /* bitmap for allocating memory chunk */
72}; 72};
73 73
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index c2b1801a160b..0393270466c3 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -66,6 +66,9 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to,
66 vm_flags_t vm_flags); 66 vm_flags_t vm_flags);
67void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); 67void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
68int dequeue_hwpoisoned_huge_page(struct page *page); 68int dequeue_hwpoisoned_huge_page(struct page *page);
69bool isolate_huge_page(struct page *page, struct list_head *list);
70void putback_active_hugepage(struct page *page);
71bool is_hugepage_active(struct page *page);
69void copy_huge_page(struct page *dst, struct page *src); 72void copy_huge_page(struct page *dst, struct page *src);
70 73
71#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE 74#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
@@ -134,6 +137,9 @@ static inline int dequeue_hwpoisoned_huge_page(struct page *page)
134 return 0; 137 return 0;
135} 138}
136 139
140#define isolate_huge_page(p, l) false
141#define putback_active_hugepage(p) do {} while (0)
142#define is_hugepage_active(x) false
137static inline void copy_huge_page(struct page *dst, struct page *src) 143static inline void copy_huge_page(struct page *dst, struct page *src)
138{ 144{
139} 145}
@@ -261,6 +267,8 @@ struct huge_bootmem_page {
261}; 267};
262 268
263struct page *alloc_huge_page_node(struct hstate *h, int nid); 269struct page *alloc_huge_page_node(struct hstate *h, int nid);
270struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
271 unsigned long addr, int avoid_reserve);
264 272
265/* arch callback */ 273/* arch callback */
266int __init alloc_bootmem_huge_page(struct hstate *h); 274int __init alloc_bootmem_huge_page(struct hstate *h);
@@ -371,9 +379,23 @@ static inline pgoff_t basepage_index(struct page *page)
371 return __basepage_index(page); 379 return __basepage_index(page);
372} 380}
373 381
382extern void dissolve_free_huge_pages(unsigned long start_pfn,
383 unsigned long end_pfn);
384int pmd_huge_support(void);
385/*
386 * Currently hugepage migration is enabled only for pmd-based hugepage.
387 * This function will be updated when hugepage migration is more widely
388 * supported.
389 */
390static inline int hugepage_migration_support(struct hstate *h)
391{
392 return pmd_huge_support() && (huge_page_shift(h) == PMD_SHIFT);
393}
394
374#else /* CONFIG_HUGETLB_PAGE */ 395#else /* CONFIG_HUGETLB_PAGE */
375struct hstate {}; 396struct hstate {};
376#define alloc_huge_page_node(h, nid) NULL 397#define alloc_huge_page_node(h, nid) NULL
398#define alloc_huge_page_noerr(v, a, r) NULL
377#define alloc_bootmem_huge_page(h) NULL 399#define alloc_bootmem_huge_page(h) NULL
378#define hstate_file(f) NULL 400#define hstate_file(f) NULL
379#define hstate_sizelog(s) NULL 401#define hstate_sizelog(s) NULL
@@ -396,6 +418,9 @@ static inline pgoff_t basepage_index(struct page *page)
396{ 418{
397 return page->index; 419 return page->index;
398} 420}
421#define dissolve_free_huge_pages(s, e) do {} while (0)
422#define pmd_huge_support() 0
423#define hugepage_migration_support(h) 0
399#endif /* CONFIG_HUGETLB_PAGE */ 424#endif /* CONFIG_HUGETLB_PAGE */
400 425
401#endif /* _LINUX_HUGETLB_H */ 426#endif /* _LINUX_HUGETLB_H */
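A hypothetical caller sketch tying the new hugetlb hooks together (not the actual memory-failure/migration code): only pmd-sized huge pages are migratable in this series, and pages are isolated from and put back onto the active list around the move. page_hstate() is the existing hugetlb.h helper.

static int hugepage_prepare_migrate(struct page *page,
				    struct list_head *pagelist)
{
	if (!hugepage_migration_support(page_hstate(page)))
		return -ENOSYS;		/* only pmd-sized huge pages for now */

	if (!isolate_huge_page(page, pagelist))
		return -EBUSY;		/* not active or already isolated */

	/*
	 * The caller would now hand `pagelist` to migrate_pages(); entries
	 * that fail to move go back via putback_active_hugepage().
	 */
	return 0;
}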
diff --git a/include/linux/init.h b/include/linux/init.h
index e73f2b708525..f1c27a71d03c 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -153,6 +153,7 @@ extern unsigned int reset_devices;
153void setup_arch(char **); 153void setup_arch(char **);
154void prepare_namespace(void); 154void prepare_namespace(void);
155void __init load_default_modules(void); 155void __init load_default_modules(void);
156int __init init_rootfs(void);
156 157
157extern void (*late_time_init)(void); 158extern void (*late_time_init)(void);
158 159
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index c4d870b0d5e6..19c19a5eee29 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -22,7 +22,7 @@ struct ipc_ids {
22 int in_use; 22 int in_use;
23 unsigned short seq; 23 unsigned short seq;
24 unsigned short seq_max; 24 unsigned short seq_max;
25 struct rw_semaphore rw_mutex; 25 struct rw_semaphore rwsem;
26 struct idr ipcs_idr; 26 struct idr ipcs_idr;
27 int next_id; 27 int next_id;
28}; 28};
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index ca1d27a0d6a6..925eaf28fca9 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -264,10 +264,36 @@ extern void arch_arm_kprobe(struct kprobe *p);
264extern void arch_disarm_kprobe(struct kprobe *p); 264extern void arch_disarm_kprobe(struct kprobe *p);
265extern int arch_init_kprobes(void); 265extern int arch_init_kprobes(void);
266extern void show_registers(struct pt_regs *regs); 266extern void show_registers(struct pt_regs *regs);
267extern kprobe_opcode_t *get_insn_slot(void);
268extern void free_insn_slot(kprobe_opcode_t *slot, int dirty);
269extern void kprobes_inc_nmissed_count(struct kprobe *p); 267extern void kprobes_inc_nmissed_count(struct kprobe *p);
270 268
269struct kprobe_insn_cache {
270 struct mutex mutex;
271 void *(*alloc)(void); /* allocate insn page */
272 void (*free)(void *); /* free insn page */
273 struct list_head pages; /* list of kprobe_insn_page */
274 size_t insn_size; /* size of instruction slot */
275 int nr_garbage;
276};
277
278extern kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c);
279extern void __free_insn_slot(struct kprobe_insn_cache *c,
280 kprobe_opcode_t *slot, int dirty);
281
282#define DEFINE_INSN_CACHE_OPS(__name) \
283extern struct kprobe_insn_cache kprobe_##__name##_slots; \
284 \
285static inline kprobe_opcode_t *get_##__name##_slot(void) \
286{ \
287 return __get_insn_slot(&kprobe_##__name##_slots); \
288} \
289 \
290static inline void free_##__name##_slot(kprobe_opcode_t *slot, int dirty)\
291{ \
292 __free_insn_slot(&kprobe_##__name##_slots, slot, dirty); \
293} \
294
295DEFINE_INSN_CACHE_OPS(insn);
296
271#ifdef CONFIG_OPTPROBES 297#ifdef CONFIG_OPTPROBES
272/* 298/*
273 * Internal structure for direct jump optimized probe 299 * Internal structure for direct jump optimized probe
@@ -287,13 +313,13 @@ extern void arch_optimize_kprobes(struct list_head *oplist);
287extern void arch_unoptimize_kprobes(struct list_head *oplist, 313extern void arch_unoptimize_kprobes(struct list_head *oplist,
288 struct list_head *done_list); 314 struct list_head *done_list);
289extern void arch_unoptimize_kprobe(struct optimized_kprobe *op); 315extern void arch_unoptimize_kprobe(struct optimized_kprobe *op);
290extern kprobe_opcode_t *get_optinsn_slot(void);
291extern void free_optinsn_slot(kprobe_opcode_t *slot, int dirty);
292extern int arch_within_optimized_kprobe(struct optimized_kprobe *op, 316extern int arch_within_optimized_kprobe(struct optimized_kprobe *op,
293 unsigned long addr); 317 unsigned long addr);
294 318
295extern void opt_pre_handler(struct kprobe *p, struct pt_regs *regs); 319extern void opt_pre_handler(struct kprobe *p, struct pt_regs *regs);
296 320
321DEFINE_INSN_CACHE_OPS(optinsn);
322
297#ifdef CONFIG_SYSCTL 323#ifdef CONFIG_SYSCTL
298extern int sysctl_kprobes_optimization; 324extern int sysctl_kprobes_optimization;
299extern int proc_kprobes_optimization_handler(struct ctl_table *table, 325extern int proc_kprobes_optimization_handler(struct ctl_table *table,
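For reference, DEFINE_INSN_CACHE_OPS(insn) above expands to exactly the wrappers the removed explicit prototypes used to declare, now routed through the shared cache object (and DEFINE_INSN_CACHE_OPS(optinsn) does the same for the optimized-probe cache):

extern struct kprobe_insn_cache kprobe_insn_slots;

static inline kprobe_opcode_t *get_insn_slot(void)
{
	return __get_insn_slot(&kprobe_insn_slots);
}

static inline void free_insn_slot(kprobe_opcode_t *slot, int dirty)
{
	__free_insn_slot(&kprobe_insn_slots, slot, dirty);
}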
diff --git a/include/linux/lz4.h b/include/linux/lz4.h
index d21c13f10a64..4356686b0a39 100644
--- a/include/linux/lz4.h
+++ b/include/linux/lz4.h
@@ -67,8 +67,8 @@ int lz4hc_compress(const unsigned char *src, size_t src_len,
67 * note : Destination buffer must be already allocated. 67 * note : Destination buffer must be already allocated.
68 * slightly faster than lz4_decompress_unknownoutputsize() 68 * slightly faster than lz4_decompress_unknownoutputsize()
69 */ 69 */
70int lz4_decompress(const char *src, size_t *src_len, char *dest, 70int lz4_decompress(const unsigned char *src, size_t *src_len,
71 size_t actual_dest_len); 71 unsigned char *dest, size_t actual_dest_len);
72 72
73/* 73/*
74 * lz4_decompress_unknownoutputsize() 74 * lz4_decompress_unknownoutputsize()
@@ -82,6 +82,6 @@ int lz4_decompress(const char *src, size_t *src_len, char *dest,
82 * Error if return (< 0) 82 * Error if return (< 0)
83 * note : Destination buffer must be already allocated. 83 * note : Destination buffer must be already allocated.
84 */ 84 */
85int lz4_decompress_unknownoutputsize(const char *src, size_t src_len, 85int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len,
86 char *dest, size_t *dest_len); 86 unsigned char *dest, size_t *dest_len);
87#endif 87#endif
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index f388203db7e8..31e95acddb4d 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -60,6 +60,8 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size);
60void memblock_trim_memory(phys_addr_t align); 60void memblock_trim_memory(phys_addr_t align);
61 61
62#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 62#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
63int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
64 unsigned long *end_pfn);
63void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, 65void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
64 unsigned long *out_end_pfn, int *out_nid); 66 unsigned long *out_end_pfn, int *out_nid);
65 67
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 0d7df39a5885..da6716b9e3fe 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -91,7 +91,6 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol)
91} 91}
92 92
93#define vma_policy(vma) ((vma)->vm_policy) 93#define vma_policy(vma) ((vma)->vm_policy)
94#define vma_set_policy(vma, pol) ((vma)->vm_policy = (pol))
95 94
96static inline void mpol_get(struct mempolicy *pol) 95static inline void mpol_get(struct mempolicy *pol)
97{ 96{
@@ -126,6 +125,7 @@ struct shared_policy {
126 spinlock_t lock; 125 spinlock_t lock;
127}; 126};
128 127
128int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst);
129void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); 129void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
130int mpol_set_shared_policy(struct shared_policy *info, 130int mpol_set_shared_policy(struct shared_policy *info,
131 struct vm_area_struct *vma, 131 struct vm_area_struct *vma,
@@ -173,7 +173,7 @@ extern int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);
173/* Check if a vma is migratable */ 173/* Check if a vma is migratable */
174static inline int vma_migratable(struct vm_area_struct *vma) 174static inline int vma_migratable(struct vm_area_struct *vma)
175{ 175{
176 if (vma->vm_flags & (VM_IO | VM_HUGETLB | VM_PFNMAP)) 176 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
177 return 0; 177 return 0;
178 /* 178 /*
179 * Migration allocates pages in the highest zone. If we cannot 179 * Migration allocates pages in the highest zone. If we cannot
@@ -240,7 +240,12 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
240} 240}
241 241
242#define vma_policy(vma) NULL 242#define vma_policy(vma) NULL
243#define vma_set_policy(vma, pol) do {} while(0) 243
244static inline int
245vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
246{
247 return 0;
248}
244 249
245static inline void numa_policy_init(void) 250static inline void numa_policy_init(void)
246{ 251{
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index a405d3dc0f61..6fe521420631 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -41,8 +41,6 @@ extern int migrate_page(struct address_space *,
41 struct page *, struct page *, enum migrate_mode); 41 struct page *, struct page *, enum migrate_mode);
42extern int migrate_pages(struct list_head *l, new_page_t x, 42extern int migrate_pages(struct list_head *l, new_page_t x,
43 unsigned long private, enum migrate_mode mode, int reason); 43 unsigned long private, enum migrate_mode mode, int reason);
44extern int migrate_huge_page(struct page *, new_page_t x,
45 unsigned long private, enum migrate_mode mode);
46 44
47extern int fail_migrate_page(struct address_space *, 45extern int fail_migrate_page(struct address_space *,
48 struct page *, struct page *); 46 struct page *, struct page *);
@@ -62,9 +60,6 @@ static inline void putback_movable_pages(struct list_head *l) {}
62static inline int migrate_pages(struct list_head *l, new_page_t x, 60static inline int migrate_pages(struct list_head *l, new_page_t x,
63 unsigned long private, enum migrate_mode mode, int reason) 61 unsigned long private, enum migrate_mode mode, int reason)
64 { return -ENOSYS; } 62 { return -ENOSYS; }
65static inline int migrate_huge_page(struct page *page, new_page_t x,
66 unsigned long private, enum migrate_mode mode)
67 { return -ENOSYS; }
68 63
69static inline int migrate_prep(void) { return -ENOSYS; } 64static inline int migrate_prep(void) { return -ENOSYS; }
70static inline int migrate_prep_local(void) { return -ENOSYS; } 65static inline int migrate_prep_local(void) { return -ENOSYS; }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d2d59b4149d0..caf543c7eaa7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -115,6 +115,12 @@ extern unsigned int kobjsize(const void *objp);
115#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ 115#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */
116#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ 116#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */
117 117
118#ifdef CONFIG_MEM_SOFT_DIRTY
119# define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */
120#else
121# define VM_SOFTDIRTY 0
122#endif
123
118#define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ 124#define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */
119#define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ 125#define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */
120#define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ 126#define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */
@@ -489,20 +495,6 @@ static inline int compound_order(struct page *page)
489 return (unsigned long)page[1].lru.prev; 495 return (unsigned long)page[1].lru.prev;
490} 496}
491 497
492static inline int compound_trans_order(struct page *page)
493{
494 int order;
495 unsigned long flags;
496
497 if (!PageHead(page))
498 return 0;
499
500 flags = compound_lock_irqsave(page);
501 order = compound_order(page);
502 compound_unlock_irqrestore(page, flags);
503 return order;
504}
505
506static inline void set_compound_order(struct page *page, unsigned long order) 498static inline void set_compound_order(struct page *page, unsigned long order)
507{ 499{
508 page[1].lru.prev = (void *)order; 500 page[1].lru.prev = (void *)order;
@@ -637,12 +629,12 @@ static inline enum zone_type page_zonenum(const struct page *page)
637#endif 629#endif
638 630
639/* 631/*
640 * The identification function is only used by the buddy allocator for 632 * The identification function is mainly used by the buddy allocator for
641 * determining if two pages could be buddies. We are not really 633 * determining if two pages could be buddies. We are not really identifying
642 * identifying a zone since we could be using a the section number 634 * the zone since we could be using the section number id if we do not have
643 * id if we have not node id available in page flags. 635 * node id available in page flags.
644 * We guarantee only that it will return the same value for two 636 * We only guarantee that it will return the same value for two combinable
645 * combinable pages in a zone. 637 * pages in a zone.
646 */ 638 */
647static inline int page_zone_id(struct page *page) 639static inline int page_zone_id(struct page *page)
648{ 640{
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 1397ccf81e91..cf55945c83fb 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -2,6 +2,7 @@
2#define LINUX_MM_INLINE_H 2#define LINUX_MM_INLINE_H
3 3
4#include <linux/huge_mm.h> 4#include <linux/huge_mm.h>
5#include <linux/swap.h>
5 6
6/** 7/**
7 * page_is_file_cache - should the page be on a file LRU or anon LRU? 8 * page_is_file_cache - should the page be on a file LRU or anon LRU?
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index af4a3b77a8de..bd791e452ad7 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -105,6 +105,7 @@ struct zone_padding {
105enum zone_stat_item { 105enum zone_stat_item {
106 /* First 128 byte cacheline (assuming 64 bit words) */ 106 /* First 128 byte cacheline (assuming 64 bit words) */
107 NR_FREE_PAGES, 107 NR_FREE_PAGES,
108 NR_ALLOC_BATCH,
108 NR_LRU_BASE, 109 NR_LRU_BASE,
109 NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ 110 NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
110 NR_ACTIVE_ANON, /* " " " " " */ 111 NR_ACTIVE_ANON, /* " " " " " */
@@ -352,7 +353,6 @@ struct zone {
352 * free areas of different sizes 353 * free areas of different sizes
353 */ 354 */
354 spinlock_t lock; 355 spinlock_t lock;
355 int all_unreclaimable; /* All pages pinned */
356#if defined CONFIG_COMPACTION || defined CONFIG_CMA 356#if defined CONFIG_COMPACTION || defined CONFIG_CMA
357 /* Set to true when the PG_migrate_skip bits should be cleared */ 357 /* Set to true when the PG_migrate_skip bits should be cleared */
358 bool compact_blockskip_flush; 358 bool compact_blockskip_flush;
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index ffc444c38b0a..403940787be1 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -231,6 +231,7 @@ unsigned long radix_tree_next_hole(struct radix_tree_root *root,
231unsigned long radix_tree_prev_hole(struct radix_tree_root *root, 231unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
232 unsigned long index, unsigned long max_scan); 232 unsigned long index, unsigned long max_scan);
233int radix_tree_preload(gfp_t gfp_mask); 233int radix_tree_preload(gfp_t gfp_mask);
234int radix_tree_maybe_preload(gfp_t gfp_mask);
234void radix_tree_init(void); 235void radix_tree_init(void);
235void *radix_tree_tag_set(struct radix_tree_root *root, 236void *radix_tree_tag_set(struct radix_tree_root *root,
236 unsigned long index, unsigned int tag); 237 unsigned long index, unsigned int tag);
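A hypothetical caller sketch for the new preload variant: the usual preload/insert pattern, where radix_tree_maybe_preload() is assumed to skip the preallocation for non-sleeping gfp masks so a single call site can serve both. The spinlock is a placeholder for whatever protects the tree.

static int cache_insert(struct radix_tree_root *root, unsigned long index,
			void *item, gfp_t gfp)
{
	int err = radix_tree_maybe_preload(gfp);

	if (err)
		return err;

	spin_lock(&my_tree_lock);		/* assumed tree lock */
	err = radix_tree_insert(root, index, item);
	spin_unlock(&my_tree_lock);

	radix_tree_preload_end();
	return err;
}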
diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h
index 69e37c2d1ea5..753207c8ce20 100644
--- a/include/linux/ramfs.h
+++ b/include/linux/ramfs.h
@@ -25,7 +25,7 @@ extern int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
25 25
26extern const struct file_operations ramfs_file_operations; 26extern const struct file_operations ramfs_file_operations;
27extern const struct vm_operations_struct generic_file_vm_ops; 27extern const struct vm_operations_struct generic_file_vm_ops;
28extern int __init init_rootfs(void); 28extern int __init init_ramfs_fs(void);
29 29
30int ramfs_fill_super(struct super_block *sb, void *data, int silent); 30int ramfs_fill_super(struct super_block *sb, void *data, int silent);
31 31
diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index 0022c1bb1e26..aa870a4ddf54 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -68,6 +68,10 @@ extern struct rb_node *rb_prev(const struct rb_node *);
68extern struct rb_node *rb_first(const struct rb_root *); 68extern struct rb_node *rb_first(const struct rb_root *);
69extern struct rb_node *rb_last(const struct rb_root *); 69extern struct rb_node *rb_last(const struct rb_root *);
70 70
71/* Postorder iteration - always visit the parent after its children */
72extern struct rb_node *rb_first_postorder(const struct rb_root *);
73extern struct rb_node *rb_next_postorder(const struct rb_node *);
74
71/* Fast replacement of a single node without remove/rebalance/add/rebalance */ 75/* Fast replacement of a single node without remove/rebalance/add/rebalance */
72extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, 76extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
73 struct rb_root *root); 77 struct rb_root *root);
@@ -81,4 +85,22 @@ static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
81 *rb_link = node; 85 *rb_link = node;
82} 86}
83 87
88/**
89 * rbtree_postorder_for_each_entry_safe - iterate over rb_root in post order of
90 * given type safe against removal of rb_node entry
91 *
92 * @pos: the 'type *' to use as a loop cursor.
93 * @n: another 'type *' to use as temporary storage
94 * @root: 'rb_root *' of the rbtree.
95 * @field: the name of the rb_node field within 'type'.
96 */
97#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
98 for (pos = rb_entry(rb_first_postorder(root), typeof(*pos), field),\
99 n = rb_entry(rb_next_postorder(&pos->field), \
100 typeof(*pos), field); \
101 &pos->field; \
102 pos = n, \
103 n = rb_entry(rb_next_postorder(&pos->field), \
104 typeof(*pos), field))
105
84#endif /* _LINUX_RBTREE_H */ 106#endif /* _LINUX_RBTREE_H */
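Typical use of the new iterator (sketch, type name hypothetical): tear down a whole tree in one pass. Post order visits children before their parent and the `n` cursor is fetched up front, so each entry can be freed without any erase/rebalance step.

struct thing {
	struct rb_node node;
	/* payload */
};

static void free_all_things(struct rb_root *root)
{
	struct thing *pos, *n;

	rbtree_postorder_for_each_entry_safe(pos, n, root, node)
		kfree(pos);

	*root = RB_ROOT;
}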
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ce1e1c0aaa33..45f254dddafc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2169,15 +2169,15 @@ static inline bool thread_group_leader(struct task_struct *p)
2169 * all we care about is that we have a task with the appropriate 2169 * all we care about is that we have a task with the appropriate
2170 * pid, we don't actually care if we have the right task. 2170 * pid, we don't actually care if we have the right task.
2171 */ 2171 */
2172static inline int has_group_leader_pid(struct task_struct *p) 2172static inline bool has_group_leader_pid(struct task_struct *p)
2173{ 2173{
2174 return p->pid == p->tgid; 2174 return task_pid(p) == p->signal->leader_pid;
2175} 2175}
2176 2176
2177static inline 2177static inline
2178int same_thread_group(struct task_struct *p1, struct task_struct *p2) 2178bool same_thread_group(struct task_struct *p1, struct task_struct *p2)
2179{ 2179{
2180 return p1->tgid == p2->tgid; 2180 return p1->signal == p2->signal;
2181} 2181}
2182 2182
2183static inline struct task_struct *next_thread(const struct task_struct *p) 2183static inline struct task_struct *next_thread(const struct task_struct *p)
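The rewritten helpers compare shared structures rather than numeric ids: all threads created with CLONE_THREAD share a single signal_struct, so pointer equality is an exact same-thread-group test and has_group_leader_pid() no longer depends on pid == tgid. A small, purely illustrative caller:

#include <linux/sched.h>

static bool task_is_sibling_of_current(struct task_struct *p)
{
        /* true iff p shares current's signal_struct, i.e. same thread group */
        return same_thread_group(current, p);
}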
diff --git a/include/linux/smp.h b/include/linux/smp.h
index c181399f2c20..cfb7ca094b38 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -28,6 +28,27 @@ extern unsigned int total_cpus;
28int smp_call_function_single(int cpuid, smp_call_func_t func, void *info, 28int smp_call_function_single(int cpuid, smp_call_func_t func, void *info,
29 int wait); 29 int wait);
30 30
31/*
32 * Call a function on all processors
33 */
34int on_each_cpu(smp_call_func_t func, void *info, int wait);
35
36/*
37 * Call a function on processors specified by mask, which might include
38 * the local one.
39 */
40void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
41 void *info, bool wait);
42
43/*
44 * Call a function on each processor for which the supplied function
45 * cond_func returns a positive value. This may include the local
46 * processor.
47 */
48void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
49 smp_call_func_t func, void *info, bool wait,
50 gfp_t gfp_flags);
51
31#ifdef CONFIG_SMP 52#ifdef CONFIG_SMP
32 53
33#include <linux/preempt.h> 54#include <linux/preempt.h>
@@ -95,27 +116,6 @@ static inline void call_function_init(void) { }
95#endif 116#endif
96 117
97/* 118/*
98 * Call a function on all processors
99 */
100int on_each_cpu(smp_call_func_t func, void *info, int wait);
101
102/*
103 * Call a function on processors specified by mask, which might include
104 * the local one.
105 */
106void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
107 void *info, bool wait);
108
109/*
110 * Call a function on each processor for which the supplied function
111 * cond_func returns a positive value. This may include the local
112 * processor.
113 */
114void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
115 smp_call_func_t func, void *info, bool wait,
116 gfp_t gfp_flags);
117
118/*
119 * Mark the boot cpu "online" so that it can call console drivers in 119 * Mark the boot cpu "online" so that it can call console drivers in
120 * printk() and can access its per-cpu storage. 120 * printk() and can access its per-cpu storage.
121 */ 121 */
@@ -139,43 +139,6 @@ static inline int up_smp_call_function(smp_call_func_t func, void *info)
139} 139}
140#define smp_call_function(func, info, wait) \ 140#define smp_call_function(func, info, wait) \
141 (up_smp_call_function(func, info)) 141 (up_smp_call_function(func, info))
142#define on_each_cpu(func, info, wait) \
143 ({ \
144 unsigned long __flags; \
145 local_irq_save(__flags); \
146 func(info); \
147 local_irq_restore(__flags); \
148 0; \
149 })
150/*
151 * Note we still need to test the mask even for UP
152 * because we actually can get an empty mask from
153 * code that on SMP might call us without the local
154 * CPU in the mask.
155 */
156#define on_each_cpu_mask(mask, func, info, wait) \
157 do { \
158 if (cpumask_test_cpu(0, (mask))) { \
159 local_irq_disable(); \
160 (func)(info); \
161 local_irq_enable(); \
162 } \
163 } while (0)
164/*
165 * Preemption is disabled here to make sure the cond_func is called under the
166 * same condtions in UP and SMP.
167 */
168#define on_each_cpu_cond(cond_func, func, info, wait, gfp_flags)\
169 do { \
170 void *__info = (info); \
171 preempt_disable(); \
172 if ((cond_func)(0, __info)) { \
173 local_irq_disable(); \
174 (func)(__info); \
175 local_irq_enable(); \
176 } \
177 preempt_enable(); \
178 } while (0)
179 142
180static inline void smp_send_reschedule(int cpu) { } 143static inline void smp_send_reschedule(int cpu) { }
181#define smp_prepare_boot_cpu() do {} while (0) 144#define smp_prepare_boot_cpu() do {} while (0)
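The prototypes for on_each_cpu(), on_each_cpu_mask() and on_each_cpu_cond() now sit above the CONFIG_SMP split and the UP-only macro versions are dropped, so both build flavours see one set of declarations (the UP bodies presumably move out of line). The calling convention is unchanged; a short sketch with an illustrative per-cpu counter:

#include <linux/smp.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, evt_count);

static void reset_local_count(void *unused)
{
        /* runs on every cpu, called with interrupts disabled */
        this_cpu_write(evt_count, 0);
}

static void reset_all_counts(void)
{
        /* wait == 1: return only after each cpu has run the callback */
        on_each_cpu(reset_local_count, NULL, 1);
}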
diff --git a/include/linux/swap.h b/include/linux/swap.h
index d95cde5e257d..c03c139219c9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -182,6 +182,33 @@ enum {
182#define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs, in first swap_map */ 182#define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs, in first swap_map */
183 183
184/* 184/*
185 * We use this to track usage of a cluster. A cluster is a block of swap disk
186 * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All
187 * free clusters are organized into a list. We fetch an entry from the list to
188 * get a free cluster.
189 *
190 * The data field stores next cluster if the cluster is free or cluster usage
191 * counter otherwise. The flags field determines if a cluster is free. This is
192 * protected by swap_info_struct.lock.
193 */
194struct swap_cluster_info {
195 unsigned int data:24;
196 unsigned int flags:8;
197};
198#define CLUSTER_FLAG_FREE 1 /* This cluster is free */
199#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
200
201/*
202 * We assign a cluster to each CPU, so each CPU can allocate swap entry from
203 * its own cluster and swapout sequentially. The purpose is to optimize swapout
204 * throughput.
205 */
206struct percpu_cluster {
207 struct swap_cluster_info index; /* Current cluster index */
208 unsigned int next; /* Likely next allocation offset */
209};
210
211/*
185 * The in-memory structure used to track swap areas. 212 * The in-memory structure used to track swap areas.
186 */ 213 */
187struct swap_info_struct { 214struct swap_info_struct {
@@ -191,14 +218,16 @@ struct swap_info_struct {
191 signed char next; /* next type on the swap list */ 218 signed char next; /* next type on the swap list */
192 unsigned int max; /* extent of the swap_map */ 219 unsigned int max; /* extent of the swap_map */
193 unsigned char *swap_map; /* vmalloc'ed array of usage counts */ 220 unsigned char *swap_map; /* vmalloc'ed array of usage counts */
221 struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
222 struct swap_cluster_info free_cluster_head; /* free cluster list head */
223 struct swap_cluster_info free_cluster_tail; /* free cluster list tail */
194 unsigned int lowest_bit; /* index of first free in swap_map */ 224 unsigned int lowest_bit; /* index of first free in swap_map */
195 unsigned int highest_bit; /* index of last free in swap_map */ 225 unsigned int highest_bit; /* index of last free in swap_map */
196 unsigned int pages; /* total of usable pages of swap */ 226 unsigned int pages; /* total of usable pages of swap */
197 unsigned int inuse_pages; /* number of those currently in use */ 227 unsigned int inuse_pages; /* number of those currently in use */
198 unsigned int cluster_next; /* likely index for next allocation */ 228 unsigned int cluster_next; /* likely index for next allocation */
199 unsigned int cluster_nr; /* countdown to next cluster search */ 229 unsigned int cluster_nr; /* countdown to next cluster search */
200 unsigned int lowest_alloc; /* while preparing discard cluster */ 230 struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
201 unsigned int highest_alloc; /* while preparing discard cluster */
202 struct swap_extent *curr_swap_extent; 231 struct swap_extent *curr_swap_extent;
203 struct swap_extent first_swap_extent; 232 struct swap_extent first_swap_extent;
204 struct block_device *bdev; /* swap device or bdev of swap file */ 233 struct block_device *bdev; /* swap device or bdev of swap file */
@@ -212,14 +241,18 @@ struct swap_info_struct {
212 * protect map scan related fields like 241 * protect map scan related fields like
213 * swap_map, lowest_bit, highest_bit, 242 * swap_map, lowest_bit, highest_bit,
214 * inuse_pages, cluster_next, 243 * inuse_pages, cluster_next,
215 * cluster_nr, lowest_alloc and 244 * cluster_nr, lowest_alloc,
216 * highest_alloc. other fields are only 245 * highest_alloc, free/discard cluster
217 * changed at swapon/swapoff, so are 246 * list. other fields are only changed
218 * protected by swap_lock. changing 247 * at swapon/swapoff, so are protected
219 * flags need hold this lock and 248 * by swap_lock. changing flags need
220 * swap_lock. If both locks need hold, 249 * hold this lock and swap_lock. If
221 * hold swap_lock first. 250 * both locks need hold, hold swap_lock
251 * first.
222 */ 252 */
253 struct work_struct discard_work; /* discard worker */
254 struct swap_cluster_info discard_cluster_head; /* list head of discard clusters */
255 struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */
223}; 256};
224 257
225struct swap_list_t { 258struct swap_list_t {
@@ -414,6 +447,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
414 447
415#else /* CONFIG_SWAP */ 448#else /* CONFIG_SWAP */
416 449
450#define swap_address_space(entry) (NULL)
417#define get_nr_swap_pages() 0L 451#define get_nr_swap_pages() 0L
418#define total_swap_pages 0L 452#define total_swap_pages 0L
419#define total_swapcache_pages() 0UL 453#define total_swapcache_pages() 0UL
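The 24/8 bitfield split lets one 32-bit swap_cluster_info carry either the index of the next free cluster (while CLUSTER_FLAG_FREE is set) or a usage counter (once allocated). A small sketch of how such an entry might be read and updated; these helpers are illustrative only, not the ones mm/swapfile.c actually defines.

#include <linux/swap.h>

static inline bool cluster_is_free(const struct swap_cluster_info *ci)
{
        return ci->flags & CLUSTER_FLAG_FREE;
}

static inline unsigned int cluster_next_free(const struct swap_cluster_info *ci)
{
        /* only meaningful while CLUSTER_FLAG_FREE is set */
        return ci->data;
}

static inline void cluster_mark_allocated(struct swap_cluster_info *ci,
                                          unsigned int count)
{
        ci->flags = 0;          /* no longer on the free list */
        ci->data = count;       /* data now holds the usage counter */
}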
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 84662ecc7b51..7fac04e7ff6e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -186,6 +186,7 @@ extern struct trace_event_functions exit_syscall_print_funcs;
186#define __SYSCALL_DEFINEx(x, name, ...) \ 186#define __SYSCALL_DEFINEx(x, name, ...) \
187 asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ 187 asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \
188 static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ 188 static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \
189 asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
189 asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ 190 asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
190 { \ 191 { \
191 long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ 192 long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \
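The added line only forward-declares SyS##name ahead of its definition, presumably to keep prototype-checking builds quiet without changing behaviour. Roughly, for SYSCALL_DEFINE1(close, unsigned int, fd), the visible part of the expansion becomes the sketch below (the __MAP plumbing is written out by hand and the wrapper body is abbreviated):

asmlinkage long sys_close(unsigned int fd);
static inline long SYSC_close(unsigned int fd);
asmlinkage long SyS_close(long fd);             /* the newly added declaration */
asmlinkage long SyS_close(long fd)
{
        long ret = SYSC_close((unsigned int)fd);
        /* ... rest of the generated wrapper ... */
        return ret;
}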
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index bd6cf61142be..1855f0a22add 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -70,6 +70,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
70 THP_ZERO_PAGE_ALLOC, 70 THP_ZERO_PAGE_ALLOC,
71 THP_ZERO_PAGE_ALLOC_FAILED, 71 THP_ZERO_PAGE_ALLOC_FAILED,
72#endif 72#endif
73#ifdef CONFIG_SMP
74 NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */
75 NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */
76#endif
77 NR_TLB_LOCAL_FLUSH_ALL,
78 NR_TLB_LOCAL_FLUSH_ONE,
73 NR_VM_EVENT_ITEMS 79 NR_VM_EVENT_ITEMS
74}; 80};
75 81
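The new entries are ordinary vm_event counters, so a flush path can account itself with the existing count_vm_event() helper from <linux/vmstat.h>. A hedged sketch; the x86 __flush_tlb_all() call is purely illustrative of where such accounting would sit.

#include <linux/vmstat.h>
#include <asm/tlbflush.h>

static void flush_everything_counted(void)
{
        count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); /* one of the new counters */
        __flush_tlb_all();
}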
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index c586679b6fef..e4b948080d20 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -143,7 +143,6 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
143} 143}
144 144
145extern unsigned long global_reclaimable_pages(void); 145extern unsigned long global_reclaimable_pages(void);
146extern unsigned long zone_reclaimable_pages(struct zone *zone);
147 146
148#ifdef CONFIG_NUMA 147#ifdef CONFIG_NUMA
149/* 148/*
@@ -198,7 +197,7 @@ extern void __inc_zone_state(struct zone *, enum zone_stat_item);
198extern void dec_zone_state(struct zone *, enum zone_stat_item); 197extern void dec_zone_state(struct zone *, enum zone_stat_item);
199extern void __dec_zone_state(struct zone *, enum zone_stat_item); 198extern void __dec_zone_state(struct zone *, enum zone_stat_item);
200 199
201void refresh_cpu_vm_stats(int); 200void cpu_vm_stats_fold(int cpu);
202void refresh_zone_stat_thresholds(void); 201void refresh_zone_stat_thresholds(void);
203 202
204void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); 203void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);
@@ -255,6 +254,7 @@ static inline void __dec_zone_page_state(struct page *page,
255 254
256static inline void refresh_cpu_vm_stats(int cpu) { } 255static inline void refresh_cpu_vm_stats(int cpu) { }
257static inline void refresh_zone_stat_thresholds(void) { } 256static inline void refresh_zone_stat_thresholds(void) { }
257static inline void cpu_vm_stats_fold(int cpu) { }
258 258
259static inline void drain_zonestat(struct zone *zone, 259static inline void drain_zonestat(struct zone *zone,
260 struct per_cpu_pageset *pset) { } 260 struct per_cpu_pageset *pset) { }
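The rename makes the intent explicit: cpu_vm_stats_fold() folds a given cpu's per-cpu counter deltas into the global counters, which is what the cpu-offline path needs, rather than a generic refresh. A sketch of that kind of caller; the notifier wiring here is illustrative, not the hookup this patch installs.

#include <linux/cpu.h>
#include <linux/vmstat.h>

static int vmstat_cpu_callback(struct notifier_block *nb,
                               unsigned long action, void *hcpu)
{
        int cpu = (unsigned long)hcpu;

        if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
                cpu_vm_stats_fold(cpu);         /* fold the dead cpu's counters */

        return NOTIFY_OK;
}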
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 4e198ca1f685..021b8a319b9e 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -98,8 +98,6 @@ int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason);
98int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, 98int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
99 enum wb_reason reason); 99 enum wb_reason reason);
100void sync_inodes_sb(struct super_block *); 100void sync_inodes_sb(struct super_block *);
101long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
102 enum wb_reason reason);
103void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); 101void wakeup_flusher_threads(long nr_pages, enum wb_reason reason);
104void inode_wait_for_writeback(struct inode *inode); 102void inode_wait_for_writeback(struct inode *inode);
105 103
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index 6bc943ecb841..d0c613476620 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -268,11 +268,13 @@ TRACE_EVENT(mm_page_alloc_extfrag,
268 268
269 TP_PROTO(struct page *page, 269 TP_PROTO(struct page *page,
270 int alloc_order, int fallback_order, 270 int alloc_order, int fallback_order,
271 int alloc_migratetype, int fallback_migratetype), 271 int alloc_migratetype, int fallback_migratetype,
272 int change_ownership),
272 273
273 TP_ARGS(page, 274 TP_ARGS(page,
274 alloc_order, fallback_order, 275 alloc_order, fallback_order,
275 alloc_migratetype, fallback_migratetype), 276 alloc_migratetype, fallback_migratetype,
277 change_ownership),
276 278
277 TP_STRUCT__entry( 279 TP_STRUCT__entry(
278 __field( struct page *, page ) 280 __field( struct page *, page )
@@ -280,6 +282,7 @@ TRACE_EVENT(mm_page_alloc_extfrag,
280 __field( int, fallback_order ) 282 __field( int, fallback_order )
281 __field( int, alloc_migratetype ) 283 __field( int, alloc_migratetype )
282 __field( int, fallback_migratetype ) 284 __field( int, fallback_migratetype )
285 __field( int, change_ownership )
283 ), 286 ),
284 287
285 TP_fast_assign( 288 TP_fast_assign(
@@ -288,6 +291,7 @@ TRACE_EVENT(mm_page_alloc_extfrag,
288 __entry->fallback_order = fallback_order; 291 __entry->fallback_order = fallback_order;
289 __entry->alloc_migratetype = alloc_migratetype; 292 __entry->alloc_migratetype = alloc_migratetype;
290 __entry->fallback_migratetype = fallback_migratetype; 293 __entry->fallback_migratetype = fallback_migratetype;
294 __entry->change_ownership = change_ownership;
291 ), 295 ),
292 296
293 TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d", 297 TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d",
@@ -299,7 +303,7 @@ TRACE_EVENT(mm_page_alloc_extfrag,
299 __entry->alloc_migratetype, 303 __entry->alloc_migratetype,
300 __entry->fallback_migratetype, 304 __entry->fallback_migratetype,
301 __entry->fallback_order < pageblock_order, 305 __entry->fallback_order < pageblock_order,
302 __entry->alloc_migratetype == __entry->fallback_migratetype) 306 __entry->change_ownership)
303); 307);
304 308
305#endif /* _TRACE_KMEM_H */ 309#endif /* _TRACE_KMEM_H */
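With change_ownership now an explicit tracepoint argument, the caller decides whether the pageblock really changed migratetype and passes the verdict in, instead of TP_printk inferring it from the two migratetypes. A hedged sketch of a caller; the wrapper and its last argument are illustrative, the real call site is in mm/page_alloc.c.

#include <trace/events/kmem.h>

static void note_extfrag_event(struct page *page, int alloc_order,
                               int fallback_order, int alloc_mt,
                               int fallback_mt, bool changed_ownership)
{
        trace_mm_page_alloc_extfrag(page, alloc_order, fallback_order,
                                    alloc_mt, fallback_mt, changed_ownership);
}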
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 816014c4627e..a51cddc2ff8c 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -26,6 +26,8 @@
26#include <linux/async.h> 26#include <linux/async.h>
27#include <linux/fs_struct.h> 27#include <linux/fs_struct.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/ramfs.h>
30#include <linux/shmem_fs.h>
29 31
30#include <linux/nfs_fs.h> 32#include <linux/nfs_fs.h>
31#include <linux/nfs_fs_sb.h> 33#include <linux/nfs_fs_sb.h>
@@ -588,3 +590,46 @@ out:
588 sys_mount(".", "/", NULL, MS_MOVE, NULL); 590 sys_mount(".", "/", NULL, MS_MOVE, NULL);
589 sys_chroot("."); 591 sys_chroot(".");
590} 592}
593
594static bool is_tmpfs;
595static struct dentry *rootfs_mount(struct file_system_type *fs_type,
596 int flags, const char *dev_name, void *data)
597{
598 static unsigned long once;
599 void *fill = ramfs_fill_super;
600
601 if (test_and_set_bit(0, &once))
602 return ERR_PTR(-ENODEV);
603
604 if (IS_ENABLED(CONFIG_TMPFS) && is_tmpfs)
605 fill = shmem_fill_super;
606
607 return mount_nodev(fs_type, flags, data, fill);
608}
609
610static struct file_system_type rootfs_fs_type = {
611 .name = "rootfs",
612 .mount = rootfs_mount,
613 .kill_sb = kill_litter_super,
614};
615
616int __init init_rootfs(void)
617{
618 int err = register_filesystem(&rootfs_fs_type);
619
620 if (err)
621 return err;
622
623 if (IS_ENABLED(CONFIG_TMPFS) && !saved_root_name[0] &&
624 (!root_fs_names || strstr(root_fs_names, "tmpfs"))) {
625 err = shmem_init();
626 is_tmpfs = true;
627 } else {
628 err = init_ramfs_fs();
629 }
630
631 if (err)
632 unregister_filesystem(&rootfs_fs_type);
633
634 return err;
635}
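The net effect: rootfs is backed by tmpfs when CONFIG_TMPFS is enabled, no root device was named, and rootfstype= (which fills root_fs_names) is either absent or mentions tmpfs; in every other case it stays ramfs. Illustrative command lines, assuming the usual root= and rootfstype= parameters:

# initramfs-only boot with CONFIG_TMPFS=y: rootfs is mounted as tmpfs
console=ttyS0

# a real root device is named, so rootfs remains ramfs
root=/dev/sda1 ro rootwait

# tmpfs is built in, but ramfs is explicitly requested for rootfs
rootfstype=ramfs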
diff --git a/ipc/msg.c b/ipc/msg.c
index b65fdf1a09dd..b0d541d42677 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -70,8 +70,6 @@ struct msg_sender {
70 70
71#define msg_ids(ns) ((ns)->ids[IPC_MSG_IDS]) 71#define msg_ids(ns) ((ns)->ids[IPC_MSG_IDS])
72 72
73#define msg_unlock(msq) ipc_unlock(&(msq)->q_perm)
74
75static void freeque(struct ipc_namespace *, struct kern_ipc_perm *); 73static void freeque(struct ipc_namespace *, struct kern_ipc_perm *);
76static int newque(struct ipc_namespace *, struct ipc_params *); 74static int newque(struct ipc_namespace *, struct ipc_params *);
77#ifdef CONFIG_PROC_FS 75#ifdef CONFIG_PROC_FS
@@ -172,7 +170,7 @@ static inline void msg_rmid(struct ipc_namespace *ns, struct msg_queue *s)
172 * @ns: namespace 170 * @ns: namespace
173 * @params: ptr to the structure that contains the key and msgflg 171 * @params: ptr to the structure that contains the key and msgflg
174 * 172 *
175 * Called with msg_ids.rw_mutex held (writer) 173 * Called with msg_ids.rwsem held (writer)
176 */ 174 */
177static int newque(struct ipc_namespace *ns, struct ipc_params *params) 175static int newque(struct ipc_namespace *ns, struct ipc_params *params)
178{ 176{
@@ -259,8 +257,8 @@ static void expunge_all(struct msg_queue *msq, int res)
259 * removes the message queue from message queue ID IDR, and cleans up all the 257 * removes the message queue from message queue ID IDR, and cleans up all the
260 * messages associated with this queue. 258 * messages associated with this queue.
261 * 259 *
262 * msg_ids.rw_mutex (writer) and the spinlock for this message queue are held 260 * msg_ids.rwsem (writer) and the spinlock for this message queue are held
263 * before freeque() is called. msg_ids.rw_mutex remains locked on exit. 261 * before freeque() is called. msg_ids.rwsem remains locked on exit.
264 */ 262 */
265static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) 263static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
266{ 264{
@@ -270,7 +268,8 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
270 expunge_all(msq, -EIDRM); 268 expunge_all(msq, -EIDRM);
271 ss_wakeup(&msq->q_senders, 1); 269 ss_wakeup(&msq->q_senders, 1);
272 msg_rmid(ns, msq); 270 msg_rmid(ns, msq);
273 msg_unlock(msq); 271 ipc_unlock_object(&msq->q_perm);
272 rcu_read_unlock();
274 273
275 list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) { 274 list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) {
276 atomic_dec(&ns->msg_hdrs); 275 atomic_dec(&ns->msg_hdrs);
@@ -282,7 +281,7 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
282} 281}
283 282
284/* 283/*
285 * Called with msg_ids.rw_mutex and ipcp locked. 284 * Called with msg_ids.rwsem and ipcp locked.
286 */ 285 */
287static inline int msg_security(struct kern_ipc_perm *ipcp, int msgflg) 286static inline int msg_security(struct kern_ipc_perm *ipcp, int msgflg)
288{ 287{
@@ -386,9 +385,9 @@ copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version)
386} 385}
387 386
388/* 387/*
389 * This function handles some msgctl commands which require the rw_mutex 388 * This function handles some msgctl commands which require the rwsem
390 * to be held in write mode. 389 * to be held in write mode.
391 * NOTE: no locks must be held, the rw_mutex is taken inside this function. 390 * NOTE: no locks must be held, the rwsem is taken inside this function.
392 */ 391 */
393static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, 392static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
394 struct msqid_ds __user *buf, int version) 393 struct msqid_ds __user *buf, int version)
@@ -403,7 +402,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
403 return -EFAULT; 402 return -EFAULT;
404 } 403 }
405 404
406 down_write(&msg_ids(ns).rw_mutex); 405 down_write(&msg_ids(ns).rwsem);
407 rcu_read_lock(); 406 rcu_read_lock();
408 407
409 ipcp = ipcctl_pre_down_nolock(ns, &msg_ids(ns), msqid, cmd, 408 ipcp = ipcctl_pre_down_nolock(ns, &msg_ids(ns), msqid, cmd,
@@ -459,7 +458,7 @@ out_unlock0:
459out_unlock1: 458out_unlock1:
460 rcu_read_unlock(); 459 rcu_read_unlock();
461out_up: 460out_up:
462 up_write(&msg_ids(ns).rw_mutex); 461 up_write(&msg_ids(ns).rwsem);
463 return err; 462 return err;
464} 463}
465 464
@@ -494,7 +493,7 @@ static int msgctl_nolock(struct ipc_namespace *ns, int msqid,
494 msginfo.msgmnb = ns->msg_ctlmnb; 493 msginfo.msgmnb = ns->msg_ctlmnb;
495 msginfo.msgssz = MSGSSZ; 494 msginfo.msgssz = MSGSSZ;
496 msginfo.msgseg = MSGSEG; 495 msginfo.msgseg = MSGSEG;
497 down_read(&msg_ids(ns).rw_mutex); 496 down_read(&msg_ids(ns).rwsem);
498 if (cmd == MSG_INFO) { 497 if (cmd == MSG_INFO) {
499 msginfo.msgpool = msg_ids(ns).in_use; 498 msginfo.msgpool = msg_ids(ns).in_use;
500 msginfo.msgmap = atomic_read(&ns->msg_hdrs); 499 msginfo.msgmap = atomic_read(&ns->msg_hdrs);
@@ -505,7 +504,7 @@ static int msgctl_nolock(struct ipc_namespace *ns, int msqid,
505 msginfo.msgtql = MSGTQL; 504 msginfo.msgtql = MSGTQL;
506 } 505 }
507 max_id = ipc_get_maxid(&msg_ids(ns)); 506 max_id = ipc_get_maxid(&msg_ids(ns));
508 up_read(&msg_ids(ns).rw_mutex); 507 up_read(&msg_ids(ns).rwsem);
509 if (copy_to_user(buf, &msginfo, sizeof(struct msginfo))) 508 if (copy_to_user(buf, &msginfo, sizeof(struct msginfo)))
510 return -EFAULT; 509 return -EFAULT;
511 return (max_id < 0) ? 0 : max_id; 510 return (max_id < 0) ? 0 : max_id;
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 4be6581d3b7f..59451c1e214d 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -81,7 +81,7 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
81 int next_id; 81 int next_id;
82 int total, in_use; 82 int total, in_use;
83 83
84 down_write(&ids->rw_mutex); 84 down_write(&ids->rwsem);
85 85
86 in_use = ids->in_use; 86 in_use = ids->in_use;
87 87
@@ -89,11 +89,12 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
89 perm = idr_find(&ids->ipcs_idr, next_id); 89 perm = idr_find(&ids->ipcs_idr, next_id);
90 if (perm == NULL) 90 if (perm == NULL)
91 continue; 91 continue;
92 ipc_lock_by_ptr(perm); 92 rcu_read_lock();
93 ipc_lock_object(perm);
93 free(ns, perm); 94 free(ns, perm);
94 total++; 95 total++;
95 } 96 }
96 up_write(&ids->rw_mutex); 97 up_write(&ids->rwsem);
97} 98}
98 99
99static void free_ipc_ns(struct ipc_namespace *ns) 100static void free_ipc_ns(struct ipc_namespace *ns)
diff --git a/ipc/sem.c b/ipc/sem.c
index 41088899783d..69b6a21f3844 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -322,7 +322,7 @@ static inline void sem_unlock(struct sem_array *sma, int locknum)
322} 322}
323 323
324/* 324/*
325 * sem_lock_(check_) routines are called in the paths where the rw_mutex 325 * sem_lock_(check_) routines are called in the paths where the rwsem
326 * is not held. 326 * is not held.
327 * 327 *
328 * The caller holds the RCU read lock. 328 * The caller holds the RCU read lock.
@@ -426,7 +426,7 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
426 * @ns: namespace 426 * @ns: namespace
427 * @params: ptr to the structure that contains key, semflg and nsems 427 * @params: ptr to the structure that contains key, semflg and nsems
428 * 428 *
429 * Called with sem_ids.rw_mutex held (as a writer) 429 * Called with sem_ids.rwsem held (as a writer)
430 */ 430 */
431 431
432static int newary(struct ipc_namespace *ns, struct ipc_params *params) 432static int newary(struct ipc_namespace *ns, struct ipc_params *params)
@@ -492,7 +492,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
492 492
493 493
494/* 494/*
495 * Called with sem_ids.rw_mutex and ipcp locked. 495 * Called with sem_ids.rwsem and ipcp locked.
496 */ 496 */
497static inline int sem_security(struct kern_ipc_perm *ipcp, int semflg) 497static inline int sem_security(struct kern_ipc_perm *ipcp, int semflg)
498{ 498{
@@ -503,7 +503,7 @@ static inline int sem_security(struct kern_ipc_perm *ipcp, int semflg)
503} 503}
504 504
505/* 505/*
506 * Called with sem_ids.rw_mutex and ipcp locked. 506 * Called with sem_ids.rwsem and ipcp locked.
507 */ 507 */
508static inline int sem_more_checks(struct kern_ipc_perm *ipcp, 508static inline int sem_more_checks(struct kern_ipc_perm *ipcp,
509 struct ipc_params *params) 509 struct ipc_params *params)
@@ -994,8 +994,8 @@ static int count_semzcnt (struct sem_array * sma, ushort semnum)
994 return semzcnt; 994 return semzcnt;
995} 995}
996 996
997/* Free a semaphore set. freeary() is called with sem_ids.rw_mutex locked 997/* Free a semaphore set. freeary() is called with sem_ids.rwsem locked
998 * as a writer and the spinlock for this semaphore set hold. sem_ids.rw_mutex 998 * as a writer and the spinlock for this semaphore set hold. sem_ids.rwsem
999 * remains locked on exit. 999 * remains locked on exit.
1000 */ 1000 */
1001static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) 1001static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
@@ -1116,7 +1116,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid,
1116 seminfo.semmnu = SEMMNU; 1116 seminfo.semmnu = SEMMNU;
1117 seminfo.semmap = SEMMAP; 1117 seminfo.semmap = SEMMAP;
1118 seminfo.semume = SEMUME; 1118 seminfo.semume = SEMUME;
1119 down_read(&sem_ids(ns).rw_mutex); 1119 down_read(&sem_ids(ns).rwsem);
1120 if (cmd == SEM_INFO) { 1120 if (cmd == SEM_INFO) {
1121 seminfo.semusz = sem_ids(ns).in_use; 1121 seminfo.semusz = sem_ids(ns).in_use;
1122 seminfo.semaem = ns->used_sems; 1122 seminfo.semaem = ns->used_sems;
@@ -1125,7 +1125,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid,
1125 seminfo.semaem = SEMAEM; 1125 seminfo.semaem = SEMAEM;
1126 } 1126 }
1127 max_id = ipc_get_maxid(&sem_ids(ns)); 1127 max_id = ipc_get_maxid(&sem_ids(ns));
1128 up_read(&sem_ids(ns).rw_mutex); 1128 up_read(&sem_ids(ns).rwsem);
1129 if (copy_to_user(p, &seminfo, sizeof(struct seminfo))) 1129 if (copy_to_user(p, &seminfo, sizeof(struct seminfo)))
1130 return -EFAULT; 1130 return -EFAULT;
1131 return (max_id < 0) ? 0: max_id; 1131 return (max_id < 0) ? 0: max_id;
@@ -1431,9 +1431,9 @@ copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version)
1431} 1431}
1432 1432
1433/* 1433/*
1434 * This function handles some semctl commands which require the rw_mutex 1434 * This function handles some semctl commands which require the rwsem
1435 * to be held in write mode. 1435 * to be held in write mode.
1436 * NOTE: no locks must be held, the rw_mutex is taken inside this function. 1436 * NOTE: no locks must be held, the rwsem is taken inside this function.
1437 */ 1437 */
1438static int semctl_down(struct ipc_namespace *ns, int semid, 1438static int semctl_down(struct ipc_namespace *ns, int semid,
1439 int cmd, int version, void __user *p) 1439 int cmd, int version, void __user *p)
@@ -1448,7 +1448,7 @@ static int semctl_down(struct ipc_namespace *ns, int semid,
1448 return -EFAULT; 1448 return -EFAULT;
1449 } 1449 }
1450 1450
1451 down_write(&sem_ids(ns).rw_mutex); 1451 down_write(&sem_ids(ns).rwsem);
1452 rcu_read_lock(); 1452 rcu_read_lock();
1453 1453
1454 ipcp = ipcctl_pre_down_nolock(ns, &sem_ids(ns), semid, cmd, 1454 ipcp = ipcctl_pre_down_nolock(ns, &sem_ids(ns), semid, cmd,
@@ -1487,7 +1487,7 @@ out_unlock0:
1487out_unlock1: 1487out_unlock1:
1488 rcu_read_unlock(); 1488 rcu_read_unlock();
1489out_up: 1489out_up:
1490 up_write(&sem_ids(ns).rw_mutex); 1490 up_write(&sem_ids(ns).rwsem);
1491 return err; 1491 return err;
1492} 1492}
1493 1493
diff --git a/ipc/shm.c b/ipc/shm.c
index c6b4ad5ce3b7..2821cdf93adb 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -19,6 +19,9 @@
19 * namespaces support 19 * namespaces support
20 * OpenVZ, SWsoft Inc. 20 * OpenVZ, SWsoft Inc.
21 * Pavel Emelianov <xemul@openvz.org> 21 * Pavel Emelianov <xemul@openvz.org>
22 *
23 * Better ipc lock (kern_ipc_perm.lock) handling
24 * Davidlohr Bueso <davidlohr.bueso@hp.com>, June 2013.
22 */ 25 */
23 26
24#include <linux/slab.h> 27#include <linux/slab.h>
@@ -80,8 +83,8 @@ void shm_init_ns(struct ipc_namespace *ns)
80} 83}
81 84
82/* 85/*
83 * Called with shm_ids.rw_mutex (writer) and the shp structure locked. 86 * Called with shm_ids.rwsem (writer) and the shp structure locked.
84 * Only shm_ids.rw_mutex remains locked on exit. 87 * Only shm_ids.rwsem remains locked on exit.
85 */ 88 */
86static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) 89static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
87{ 90{
@@ -124,8 +127,28 @@ void __init shm_init (void)
124 IPC_SHM_IDS, sysvipc_shm_proc_show); 127 IPC_SHM_IDS, sysvipc_shm_proc_show);
125} 128}
126 129
130static inline struct shmid_kernel *shm_obtain_object(struct ipc_namespace *ns, int id)
131{
132 struct kern_ipc_perm *ipcp = ipc_obtain_object(&shm_ids(ns), id);
133
134 if (IS_ERR(ipcp))
135 return ERR_CAST(ipcp);
136
137 return container_of(ipcp, struct shmid_kernel, shm_perm);
138}
139
140static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace *ns, int id)
141{
142 struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&shm_ids(ns), id);
143
144 if (IS_ERR(ipcp))
145 return ERR_CAST(ipcp);
146
147 return container_of(ipcp, struct shmid_kernel, shm_perm);
148}
149
127/* 150/*
128 * shm_lock_(check_) routines are called in the paths where the rw_mutex 151 * shm_lock_(check_) routines are called in the paths where the rwsem
129 * is not necessarily held. 152 * is not necessarily held.
130 */ 153 */
131static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) 154static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
@@ -144,17 +167,6 @@ static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp)
144 ipc_lock_object(&ipcp->shm_perm); 167 ipc_lock_object(&ipcp->shm_perm);
145} 168}
146 169
147static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns,
148 int id)
149{
150 struct kern_ipc_perm *ipcp = ipc_lock_check(&shm_ids(ns), id);
151
152 if (IS_ERR(ipcp))
153 return (struct shmid_kernel *)ipcp;
154
155 return container_of(ipcp, struct shmid_kernel, shm_perm);
156}
157
158static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) 170static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
159{ 171{
160 ipc_rmid(&shm_ids(ns), &s->shm_perm); 172 ipc_rmid(&shm_ids(ns), &s->shm_perm);
@@ -182,7 +194,7 @@ static void shm_open(struct vm_area_struct *vma)
182 * @ns: namespace 194 * @ns: namespace
183 * @shp: struct to free 195 * @shp: struct to free
184 * 196 *
185 * It has to be called with shp and shm_ids.rw_mutex (writer) locked, 197 * It has to be called with shp and shm_ids.rwsem (writer) locked,
186 * but returns with shp unlocked and freed. 198 * but returns with shp unlocked and freed.
187 */ 199 */
188static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) 200static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
@@ -230,7 +242,7 @@ static void shm_close(struct vm_area_struct *vma)
230 struct shmid_kernel *shp; 242 struct shmid_kernel *shp;
231 struct ipc_namespace *ns = sfd->ns; 243 struct ipc_namespace *ns = sfd->ns;
232 244
233 down_write(&shm_ids(ns).rw_mutex); 245 down_write(&shm_ids(ns).rwsem);
234 /* remove from the list of attaches of the shm segment */ 246 /* remove from the list of attaches of the shm segment */
235 shp = shm_lock(ns, sfd->id); 247 shp = shm_lock(ns, sfd->id);
236 BUG_ON(IS_ERR(shp)); 248 BUG_ON(IS_ERR(shp));
@@ -241,10 +253,10 @@ static void shm_close(struct vm_area_struct *vma)
241 shm_destroy(ns, shp); 253 shm_destroy(ns, shp);
242 else 254 else
243 shm_unlock(shp); 255 shm_unlock(shp);
244 up_write(&shm_ids(ns).rw_mutex); 256 up_write(&shm_ids(ns).rwsem);
245} 257}
246 258
247/* Called with ns->shm_ids(ns).rw_mutex locked */ 259/* Called with ns->shm_ids(ns).rwsem locked */
248static int shm_try_destroy_current(int id, void *p, void *data) 260static int shm_try_destroy_current(int id, void *p, void *data)
249{ 261{
250 struct ipc_namespace *ns = data; 262 struct ipc_namespace *ns = data;
@@ -275,7 +287,7 @@ static int shm_try_destroy_current(int id, void *p, void *data)
275 return 0; 287 return 0;
276} 288}
277 289
278/* Called with ns->shm_ids(ns).rw_mutex locked */ 290/* Called with ns->shm_ids(ns).rwsem locked */
279static int shm_try_destroy_orphaned(int id, void *p, void *data) 291static int shm_try_destroy_orphaned(int id, void *p, void *data)
280{ 292{
281 struct ipc_namespace *ns = data; 293 struct ipc_namespace *ns = data;
@@ -286,7 +298,7 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data)
286 * We want to destroy segments without users and with already 298 * We want to destroy segments without users and with already
287 * exit'ed originating process. 299 * exit'ed originating process.
288 * 300 *
289 * As shp->* are changed under rw_mutex, it's safe to skip shp locking. 301 * As shp->* are changed under rwsem, it's safe to skip shp locking.
290 */ 302 */
291 if (shp->shm_creator != NULL) 303 if (shp->shm_creator != NULL)
292 return 0; 304 return 0;
@@ -300,10 +312,10 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data)
300 312
301void shm_destroy_orphaned(struct ipc_namespace *ns) 313void shm_destroy_orphaned(struct ipc_namespace *ns)
302{ 314{
303 down_write(&shm_ids(ns).rw_mutex); 315 down_write(&shm_ids(ns).rwsem);
304 if (shm_ids(ns).in_use) 316 if (shm_ids(ns).in_use)
305 idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns); 317 idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns);
306 up_write(&shm_ids(ns).rw_mutex); 318 up_write(&shm_ids(ns).rwsem);
307} 319}
308 320
309 321
@@ -315,10 +327,10 @@ void exit_shm(struct task_struct *task)
315 return; 327 return;
316 328
317 /* Destroy all already created segments, but not mapped yet */ 329 /* Destroy all already created segments, but not mapped yet */
318 down_write(&shm_ids(ns).rw_mutex); 330 down_write(&shm_ids(ns).rwsem);
319 if (shm_ids(ns).in_use) 331 if (shm_ids(ns).in_use)
320 idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns); 332 idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns);
321 up_write(&shm_ids(ns).rw_mutex); 333 up_write(&shm_ids(ns).rwsem);
322} 334}
323 335
324static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 336static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -452,7 +464,7 @@ static const struct vm_operations_struct shm_vm_ops = {
452 * @ns: namespace 464 * @ns: namespace
453 * @params: ptr to the structure that contains key, size and shmflg 465 * @params: ptr to the structure that contains key, size and shmflg
454 * 466 *
455 * Called with shm_ids.rw_mutex held as a writer. 467 * Called with shm_ids.rwsem held as a writer.
456 */ 468 */
457 469
458static int newseg(struct ipc_namespace *ns, struct ipc_params *params) 470static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
@@ -560,7 +572,7 @@ no_file:
560} 572}
561 573
562/* 574/*
563 * Called with shm_ids.rw_mutex and ipcp locked. 575 * Called with shm_ids.rwsem and ipcp locked.
564 */ 576 */
565static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg) 577static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg)
566{ 578{
@@ -571,7 +583,7 @@ static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg)
571} 583}
572 584
573/* 585/*
574 * Called with shm_ids.rw_mutex and ipcp locked. 586 * Called with shm_ids.rwsem and ipcp locked.
575 */ 587 */
576static inline int shm_more_checks(struct kern_ipc_perm *ipcp, 588static inline int shm_more_checks(struct kern_ipc_perm *ipcp,
577 struct ipc_params *params) 589 struct ipc_params *params)
@@ -684,7 +696,7 @@ static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminf
684 696
685/* 697/*
686 * Calculate and add used RSS and swap pages of a shm. 698 * Calculate and add used RSS and swap pages of a shm.
687 * Called with shm_ids.rw_mutex held as a reader 699 * Called with shm_ids.rwsem held as a reader
688 */ 700 */
689static void shm_add_rss_swap(struct shmid_kernel *shp, 701static void shm_add_rss_swap(struct shmid_kernel *shp,
690 unsigned long *rss_add, unsigned long *swp_add) 702 unsigned long *rss_add, unsigned long *swp_add)
@@ -711,7 +723,7 @@ static void shm_add_rss_swap(struct shmid_kernel *shp,
711} 723}
712 724
713/* 725/*
714 * Called with shm_ids.rw_mutex held as a reader 726 * Called with shm_ids.rwsem held as a reader
715 */ 727 */
716static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, 728static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
717 unsigned long *swp) 729 unsigned long *swp)
@@ -740,9 +752,9 @@ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
740} 752}
741 753
742/* 754/*
743 * This function handles some shmctl commands which require the rw_mutex 755 * This function handles some shmctl commands which require the rwsem
744 * to be held in write mode. 756 * to be held in write mode.
745 * NOTE: no locks must be held, the rw_mutex is taken inside this function. 757 * NOTE: no locks must be held, the rwsem is taken inside this function.
746 */ 758 */
747static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, 759static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
748 struct shmid_ds __user *buf, int version) 760 struct shmid_ds __user *buf, int version)
@@ -757,14 +769,13 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
757 return -EFAULT; 769 return -EFAULT;
758 } 770 }
759 771
760 down_write(&shm_ids(ns).rw_mutex); 772 down_write(&shm_ids(ns).rwsem);
761 rcu_read_lock(); 773 rcu_read_lock();
762 774
763 ipcp = ipcctl_pre_down(ns, &shm_ids(ns), shmid, cmd, 775 ipcp = ipcctl_pre_down_nolock(ns, &shm_ids(ns), shmid, cmd,
764 &shmid64.shm_perm, 0); 776 &shmid64.shm_perm, 0);
765 if (IS_ERR(ipcp)) { 777 if (IS_ERR(ipcp)) {
766 err = PTR_ERR(ipcp); 778 err = PTR_ERR(ipcp);
767 /* the ipc lock is not held upon failure */
768 goto out_unlock1; 779 goto out_unlock1;
769 } 780 }
770 781
@@ -772,14 +783,16 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
772 783
773 err = security_shm_shmctl(shp, cmd); 784 err = security_shm_shmctl(shp, cmd);
774 if (err) 785 if (err)
775 goto out_unlock0; 786 goto out_unlock1;
776 787
777 switch (cmd) { 788 switch (cmd) {
778 case IPC_RMID: 789 case IPC_RMID:
790 ipc_lock_object(&shp->shm_perm);
779 /* do_shm_rmid unlocks the ipc object and rcu */ 791 /* do_shm_rmid unlocks the ipc object and rcu */
780 do_shm_rmid(ns, ipcp); 792 do_shm_rmid(ns, ipcp);
781 goto out_up; 793 goto out_up;
782 case IPC_SET: 794 case IPC_SET:
795 ipc_lock_object(&shp->shm_perm);
783 err = ipc_update_perm(&shmid64.shm_perm, ipcp); 796 err = ipc_update_perm(&shmid64.shm_perm, ipcp);
784 if (err) 797 if (err)
785 goto out_unlock0; 798 goto out_unlock0;
@@ -787,6 +800,7 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
787 break; 800 break;
788 default: 801 default:
789 err = -EINVAL; 802 err = -EINVAL;
803 goto out_unlock1;
790 } 804 }
791 805
792out_unlock0: 806out_unlock0:
@@ -794,33 +808,28 @@ out_unlock0:
794out_unlock1: 808out_unlock1:
795 rcu_read_unlock(); 809 rcu_read_unlock();
796out_up: 810out_up:
797 up_write(&shm_ids(ns).rw_mutex); 811 up_write(&shm_ids(ns).rwsem);
798 return err; 812 return err;
799} 813}
800 814
801SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) 815static int shmctl_nolock(struct ipc_namespace *ns, int shmid,
816 int cmd, int version, void __user *buf)
802{ 817{
818 int err;
803 struct shmid_kernel *shp; 819 struct shmid_kernel *shp;
804 int err, version;
805 struct ipc_namespace *ns;
806 820
807 if (cmd < 0 || shmid < 0) { 821 /* preliminary security checks for *_INFO */
808 err = -EINVAL; 822 if (cmd == IPC_INFO || cmd == SHM_INFO) {
809 goto out; 823 err = security_shm_shmctl(NULL, cmd);
824 if (err)
825 return err;
810 } 826 }
811 827
812 version = ipc_parse_version(&cmd); 828 switch (cmd) {
813 ns = current->nsproxy->ipc_ns;
814
815 switch (cmd) { /* replace with proc interface ? */
816 case IPC_INFO: 829 case IPC_INFO:
817 { 830 {
818 struct shminfo64 shminfo; 831 struct shminfo64 shminfo;
819 832
820 err = security_shm_shmctl(NULL, cmd);
821 if (err)
822 return err;
823
824 memset(&shminfo, 0, sizeof(shminfo)); 833 memset(&shminfo, 0, sizeof(shminfo));
825 shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni; 834 shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni;
826 shminfo.shmmax = ns->shm_ctlmax; 835 shminfo.shmmax = ns->shm_ctlmax;
@@ -830,9 +839,9 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
830 if(copy_shminfo_to_user (buf, &shminfo, version)) 839 if(copy_shminfo_to_user (buf, &shminfo, version))
831 return -EFAULT; 840 return -EFAULT;
832 841
833 down_read(&shm_ids(ns).rw_mutex); 842 down_read(&shm_ids(ns).rwsem);
834 err = ipc_get_maxid(&shm_ids(ns)); 843 err = ipc_get_maxid(&shm_ids(ns));
835 up_read(&shm_ids(ns).rw_mutex); 844 up_read(&shm_ids(ns).rwsem);
836 845
837 if(err<0) 846 if(err<0)
838 err = 0; 847 err = 0;
@@ -842,19 +851,15 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
842 { 851 {
843 struct shm_info shm_info; 852 struct shm_info shm_info;
844 853
845 err = security_shm_shmctl(NULL, cmd);
846 if (err)
847 return err;
848
849 memset(&shm_info, 0, sizeof(shm_info)); 854 memset(&shm_info, 0, sizeof(shm_info));
850 down_read(&shm_ids(ns).rw_mutex); 855 down_read(&shm_ids(ns).rwsem);
851 shm_info.used_ids = shm_ids(ns).in_use; 856 shm_info.used_ids = shm_ids(ns).in_use;
852 shm_get_stat (ns, &shm_info.shm_rss, &shm_info.shm_swp); 857 shm_get_stat (ns, &shm_info.shm_rss, &shm_info.shm_swp);
853 shm_info.shm_tot = ns->shm_tot; 858 shm_info.shm_tot = ns->shm_tot;
854 shm_info.swap_attempts = 0; 859 shm_info.swap_attempts = 0;
855 shm_info.swap_successes = 0; 860 shm_info.swap_successes = 0;
856 err = ipc_get_maxid(&shm_ids(ns)); 861 err = ipc_get_maxid(&shm_ids(ns));
857 up_read(&shm_ids(ns).rw_mutex); 862 up_read(&shm_ids(ns).rwsem);
858 if (copy_to_user(buf, &shm_info, sizeof(shm_info))) { 863 if (copy_to_user(buf, &shm_info, sizeof(shm_info))) {
859 err = -EFAULT; 864 err = -EFAULT;
860 goto out; 865 goto out;
@@ -869,27 +874,31 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
869 struct shmid64_ds tbuf; 874 struct shmid64_ds tbuf;
870 int result; 875 int result;
871 876
877 rcu_read_lock();
872 if (cmd == SHM_STAT) { 878 if (cmd == SHM_STAT) {
873 shp = shm_lock(ns, shmid); 879 shp = shm_obtain_object(ns, shmid);
874 if (IS_ERR(shp)) { 880 if (IS_ERR(shp)) {
875 err = PTR_ERR(shp); 881 err = PTR_ERR(shp);
876 goto out; 882 goto out_unlock;
877 } 883 }
878 result = shp->shm_perm.id; 884 result = shp->shm_perm.id;
879 } else { 885 } else {
880 shp = shm_lock_check(ns, shmid); 886 shp = shm_obtain_object_check(ns, shmid);
881 if (IS_ERR(shp)) { 887 if (IS_ERR(shp)) {
882 err = PTR_ERR(shp); 888 err = PTR_ERR(shp);
883 goto out; 889 goto out_unlock;
884 } 890 }
885 result = 0; 891 result = 0;
886 } 892 }
893
887 err = -EACCES; 894 err = -EACCES;
888 if (ipcperms(ns, &shp->shm_perm, S_IRUGO)) 895 if (ipcperms(ns, &shp->shm_perm, S_IRUGO))
889 goto out_unlock; 896 goto out_unlock;
897
890 err = security_shm_shmctl(shp, cmd); 898 err = security_shm_shmctl(shp, cmd);
891 if (err) 899 if (err)
892 goto out_unlock; 900 goto out_unlock;
901
893 memset(&tbuf, 0, sizeof(tbuf)); 902 memset(&tbuf, 0, sizeof(tbuf));
894 kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm); 903 kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm);
895 tbuf.shm_segsz = shp->shm_segsz; 904 tbuf.shm_segsz = shp->shm_segsz;
@@ -899,43 +908,76 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
899 tbuf.shm_cpid = shp->shm_cprid; 908 tbuf.shm_cpid = shp->shm_cprid;
900 tbuf.shm_lpid = shp->shm_lprid; 909 tbuf.shm_lpid = shp->shm_lprid;
901 tbuf.shm_nattch = shp->shm_nattch; 910 tbuf.shm_nattch = shp->shm_nattch;
902 shm_unlock(shp); 911 rcu_read_unlock();
903 if(copy_shmid_to_user (buf, &tbuf, version)) 912
913 if (copy_shmid_to_user(buf, &tbuf, version))
904 err = -EFAULT; 914 err = -EFAULT;
905 else 915 else
906 err = result; 916 err = result;
907 goto out; 917 goto out;
908 } 918 }
919 default:
920 return -EINVAL;
921 }
922
923out_unlock:
924 rcu_read_unlock();
925out:
926 return err;
927}
928
929SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
930{
931 struct shmid_kernel *shp;
932 int err, version;
933 struct ipc_namespace *ns;
934
935 if (cmd < 0 || shmid < 0)
936 return -EINVAL;
937
938 version = ipc_parse_version(&cmd);
939 ns = current->nsproxy->ipc_ns;
940
941 switch (cmd) {
942 case IPC_INFO:
943 case SHM_INFO:
944 case SHM_STAT:
945 case IPC_STAT:
946 return shmctl_nolock(ns, shmid, cmd, version, buf);
947 case IPC_RMID:
948 case IPC_SET:
949 return shmctl_down(ns, shmid, cmd, buf, version);
909 case SHM_LOCK: 950 case SHM_LOCK:
910 case SHM_UNLOCK: 951 case SHM_UNLOCK:
911 { 952 {
912 struct file *shm_file; 953 struct file *shm_file;
913 954
914 shp = shm_lock_check(ns, shmid); 955 rcu_read_lock();
956 shp = shm_obtain_object_check(ns, shmid);
915 if (IS_ERR(shp)) { 957 if (IS_ERR(shp)) {
916 err = PTR_ERR(shp); 958 err = PTR_ERR(shp);
917 goto out; 959 goto out_unlock1;
918 } 960 }
919 961
920 audit_ipc_obj(&(shp->shm_perm)); 962 audit_ipc_obj(&(shp->shm_perm));
963 err = security_shm_shmctl(shp, cmd);
964 if (err)
965 goto out_unlock1;
921 966
967 ipc_lock_object(&shp->shm_perm);
922 if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) { 968 if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) {
923 kuid_t euid = current_euid(); 969 kuid_t euid = current_euid();
924 err = -EPERM; 970 err = -EPERM;
925 if (!uid_eq(euid, shp->shm_perm.uid) && 971 if (!uid_eq(euid, shp->shm_perm.uid) &&
926 !uid_eq(euid, shp->shm_perm.cuid)) 972 !uid_eq(euid, shp->shm_perm.cuid))
927 goto out_unlock; 973 goto out_unlock0;
928 if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) 974 if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK))
929 goto out_unlock; 975 goto out_unlock0;
930 } 976 }
931 977
932 err = security_shm_shmctl(shp, cmd);
933 if (err)
934 goto out_unlock;
935
936 shm_file = shp->shm_file; 978 shm_file = shp->shm_file;
937 if (is_file_hugepages(shm_file)) 979 if (is_file_hugepages(shm_file))
938 goto out_unlock; 980 goto out_unlock0;
939 981
940 if (cmd == SHM_LOCK) { 982 if (cmd == SHM_LOCK) {
941 struct user_struct *user = current_user(); 983 struct user_struct *user = current_user();
@@ -944,32 +986,31 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
944 shp->shm_perm.mode |= SHM_LOCKED; 986 shp->shm_perm.mode |= SHM_LOCKED;
945 shp->mlock_user = user; 987 shp->mlock_user = user;
946 } 988 }
947 goto out_unlock; 989 goto out_unlock0;
948 } 990 }
949 991
950 /* SHM_UNLOCK */ 992 /* SHM_UNLOCK */
951 if (!(shp->shm_perm.mode & SHM_LOCKED)) 993 if (!(shp->shm_perm.mode & SHM_LOCKED))
952 goto out_unlock; 994 goto out_unlock0;
953 shmem_lock(shm_file, 0, shp->mlock_user); 995 shmem_lock(shm_file, 0, shp->mlock_user);
954 shp->shm_perm.mode &= ~SHM_LOCKED; 996 shp->shm_perm.mode &= ~SHM_LOCKED;
955 shp->mlock_user = NULL; 997 shp->mlock_user = NULL;
956 get_file(shm_file); 998 get_file(shm_file);
957 shm_unlock(shp); 999 ipc_unlock_object(&shp->shm_perm);
1000 rcu_read_unlock();
958 shmem_unlock_mapping(shm_file->f_mapping); 1001 shmem_unlock_mapping(shm_file->f_mapping);
1002
959 fput(shm_file); 1003 fput(shm_file);
960 goto out;
961 }
962 case IPC_RMID:
963 case IPC_SET:
964 err = shmctl_down(ns, shmid, cmd, buf, version);
965 return err; 1004 return err;
1005 }
966 default: 1006 default:
967 return -EINVAL; 1007 return -EINVAL;
968 } 1008 }
969 1009
970out_unlock: 1010out_unlock0:
971 shm_unlock(shp); 1011 ipc_unlock_object(&shp->shm_perm);
972out: 1012out_unlock1:
1013 rcu_read_unlock();
973 return err; 1014 return err;
974} 1015}
975 1016
@@ -1037,10 +1078,11 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
1037 * additional creator id... 1078 * additional creator id...
1038 */ 1079 */
1039 ns = current->nsproxy->ipc_ns; 1080 ns = current->nsproxy->ipc_ns;
1040 shp = shm_lock_check(ns, shmid); 1081 rcu_read_lock();
1082 shp = shm_obtain_object_check(ns, shmid);
1041 if (IS_ERR(shp)) { 1083 if (IS_ERR(shp)) {
1042 err = PTR_ERR(shp); 1084 err = PTR_ERR(shp);
1043 goto out; 1085 goto out_unlock;
1044 } 1086 }
1045 1087
1046 err = -EACCES; 1088 err = -EACCES;
@@ -1051,24 +1093,31 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
1051 if (err) 1093 if (err)
1052 goto out_unlock; 1094 goto out_unlock;
1053 1095
1096 ipc_lock_object(&shp->shm_perm);
1054 path = shp->shm_file->f_path; 1097 path = shp->shm_file->f_path;
1055 path_get(&path); 1098 path_get(&path);
1056 shp->shm_nattch++; 1099 shp->shm_nattch++;
1057 size = i_size_read(path.dentry->d_inode); 1100 size = i_size_read(path.dentry->d_inode);
1058 shm_unlock(shp); 1101 ipc_unlock_object(&shp->shm_perm);
1102 rcu_read_unlock();
1059 1103
1060 err = -ENOMEM; 1104 err = -ENOMEM;
1061 sfd = kzalloc(sizeof(*sfd), GFP_KERNEL); 1105 sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
1062 if (!sfd) 1106 if (!sfd) {
1063 goto out_put_dentry; 1107 path_put(&path);
1108 goto out_nattch;
1109 }
1064 1110
1065 file = alloc_file(&path, f_mode, 1111 file = alloc_file(&path, f_mode,
1066 is_file_hugepages(shp->shm_file) ? 1112 is_file_hugepages(shp->shm_file) ?
1067 &shm_file_operations_huge : 1113 &shm_file_operations_huge :
1068 &shm_file_operations); 1114 &shm_file_operations);
1069 err = PTR_ERR(file); 1115 err = PTR_ERR(file);
1070 if (IS_ERR(file)) 1116 if (IS_ERR(file)) {
1071 goto out_free; 1117 kfree(sfd);
1118 path_put(&path);
1119 goto out_nattch;
1120 }
1072 1121
1073 file->private_data = sfd; 1122 file->private_data = sfd;
1074 file->f_mapping = shp->shm_file->f_mapping; 1123 file->f_mapping = shp->shm_file->f_mapping;
@@ -1094,7 +1143,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
1094 addr > current->mm->start_stack - size - PAGE_SIZE * 5) 1143 addr > current->mm->start_stack - size - PAGE_SIZE * 5)
1095 goto invalid; 1144 goto invalid;
1096 } 1145 }
1097 1146
1098 addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate); 1147 addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate);
1099 *raddr = addr; 1148 *raddr = addr;
1100 err = 0; 1149 err = 0;
@@ -1109,7 +1158,7 @@ out_fput:
1109 fput(file); 1158 fput(file);
1110 1159
1111out_nattch: 1160out_nattch:
1112 down_write(&shm_ids(ns).rw_mutex); 1161 down_write(&shm_ids(ns).rwsem);
1113 shp = shm_lock(ns, shmid); 1162 shp = shm_lock(ns, shmid);
1114 BUG_ON(IS_ERR(shp)); 1163 BUG_ON(IS_ERR(shp));
1115 shp->shm_nattch--; 1164 shp->shm_nattch--;
@@ -1117,20 +1166,13 @@ out_nattch:
1117 shm_destroy(ns, shp); 1166 shm_destroy(ns, shp);
1118 else 1167 else
1119 shm_unlock(shp); 1168 shm_unlock(shp);
1120 up_write(&shm_ids(ns).rw_mutex); 1169 up_write(&shm_ids(ns).rwsem);
1121
1122out:
1123 return err; 1170 return err;
1124 1171
1125out_unlock: 1172out_unlock:
1126 shm_unlock(shp); 1173 rcu_read_unlock();
1127 goto out; 1174out:
1128 1175 return err;
1129out_free:
1130 kfree(sfd);
1131out_put_dentry:
1132 path_put(&path);
1133 goto out_nattch;
1134} 1176}
1135 1177
1136SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg) 1178SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg)
@@ -1235,8 +1277,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
1235#else /* CONFIG_MMU */ 1277#else /* CONFIG_MMU */
1236 /* under NOMMU conditions, the exact address to be destroyed must be 1278 /* under NOMMU conditions, the exact address to be destroyed must be
1237 * given */ 1279 * given */
1238 retval = -EINVAL; 1280 if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) {
1239 if (vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) {
1240 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); 1281 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
1241 retval = 0; 1282 retval = 0;
1242 } 1283 }
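Taken together, the shm changes move shmctl()/shmat() to the scheme spelled out in the ipc/util.c comment below: look the object up under RCU only, run the security/permission/audit checks without the spinlock, and take kern_ipc_perm.lock via ipc_lock_object() just around the actual update. A condensed sketch of that shape, as if written inside ipc/shm.c; the function name is illustrative and error handling is trimmed.

static int update_segment(struct ipc_namespace *ns, int shmid)
{
        struct shmid_kernel *shp;
        int err;

        rcu_read_lock();
        shp = shm_obtain_object_check(ns, shmid);       /* lookup, no spinlock */
        if (IS_ERR(shp)) {
                err = PTR_ERR(shp);
                goto out_rcu;
        }

        err = security_shm_shmctl(shp, IPC_SET);        /* checks run lock-free */
        if (err)
                goto out_rcu;

        ipc_lock_object(&shp->shm_perm);                /* lock only for the update */
        /* ... modify shp / shp->shm_perm here ... */
        ipc_unlock_object(&shp->shm_perm);
out_rcu:
        rcu_read_unlock();
        return err;
}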
diff --git a/ipc/util.c b/ipc/util.c
index 4704223bfad4..e829da9ed01f 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -15,6 +15,14 @@
15 * Jun 2006 - namespaces ssupport 15 * Jun 2006 - namespaces ssupport
16 * OpenVZ, SWsoft Inc. 16 * OpenVZ, SWsoft Inc.
17 * Pavel Emelianov <xemul@openvz.org> 17 * Pavel Emelianov <xemul@openvz.org>
18 *
19 * General sysv ipc locking scheme:
20 * when doing ipc id lookups, take the ids->rwsem
21 * rcu_read_lock()
22 * obtain the ipc object (kern_ipc_perm)
23 * perform security, capabilities, auditing and permission checks, etc.
 24 * acquire the ipc lock (kern_ipc_perm.lock) through ipc_lock_object()
25 * perform data updates (ie: SET, RMID, LOCK/UNLOCK commands)
18 */ 26 */
19 27
20#include <linux/mm.h> 28#include <linux/mm.h>
@@ -119,7 +127,7 @@ __initcall(ipc_init);
119 127
120void ipc_init_ids(struct ipc_ids *ids) 128void ipc_init_ids(struct ipc_ids *ids)
121{ 129{
122 init_rwsem(&ids->rw_mutex); 130 init_rwsem(&ids->rwsem);
123 131
124 ids->in_use = 0; 132 ids->in_use = 0;
125 ids->seq = 0; 133 ids->seq = 0;
@@ -174,7 +182,7 @@ void __init ipc_init_proc_interface(const char *path, const char *header,
174 * @ids: Identifier set 182 * @ids: Identifier set
175 * @key: The key to find 183 * @key: The key to find
176 * 184 *
177 * Requires ipc_ids.rw_mutex locked. 185 * Requires ipc_ids.rwsem locked.
178 * Returns the LOCKED pointer to the ipc structure if found or NULL 186 * Returns the LOCKED pointer to the ipc structure if found or NULL
179 * if not. 187 * if not.
180 * If key is found ipc points to the owning ipc structure 188 * If key is found ipc points to the owning ipc structure
@@ -197,7 +205,8 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key)
197 continue; 205 continue;
198 } 206 }
199 207
200 ipc_lock_by_ptr(ipc); 208 rcu_read_lock();
209 ipc_lock_object(ipc);
201 return ipc; 210 return ipc;
202 } 211 }
203 212
@@ -208,7 +217,7 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key)
208 * ipc_get_maxid - get the last assigned id 217 * ipc_get_maxid - get the last assigned id
209 * @ids: IPC identifier set 218 * @ids: IPC identifier set
210 * 219 *
211 * Called with ipc_ids.rw_mutex held. 220 * Called with ipc_ids.rwsem held.
212 */ 221 */
213 222
214int ipc_get_maxid(struct ipc_ids *ids) 223int ipc_get_maxid(struct ipc_ids *ids)
@@ -246,7 +255,7 @@ int ipc_get_maxid(struct ipc_ids *ids)
246 * is returned. The 'new' entry is returned in a locked state on success. 255 * is returned. The 'new' entry is returned in a locked state on success.
247 * On failure the entry is not locked and a negative err-code is returned. 256 * On failure the entry is not locked and a negative err-code is returned.
248 * 257 *
249 * Called with writer ipc_ids.rw_mutex held. 258 * Called with writer ipc_ids.rwsem held.
250 */ 259 */
251int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) 260int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size)
252{ 261{
@@ -312,9 +321,9 @@ static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids,
312{ 321{
313 int err; 322 int err;
314 323
315 down_write(&ids->rw_mutex); 324 down_write(&ids->rwsem);
316 err = ops->getnew(ns, params); 325 err = ops->getnew(ns, params);
317 up_write(&ids->rw_mutex); 326 up_write(&ids->rwsem);
318 return err; 327 return err;
319} 328}
320 329
@@ -331,7 +340,7 @@ static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids,
331 * 340 *
332 * On success, the IPC id is returned. 341 * On success, the IPC id is returned.
333 * 342 *
334 * It is called with ipc_ids.rw_mutex and ipcp->lock held. 343 * It is called with ipc_ids.rwsem and ipcp->lock held.
335 */ 344 */
336static int ipc_check_perms(struct ipc_namespace *ns, 345static int ipc_check_perms(struct ipc_namespace *ns,
337 struct kern_ipc_perm *ipcp, 346 struct kern_ipc_perm *ipcp,
@@ -376,7 +385,7 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids,
376 * Take the lock as a writer since we are potentially going to add 385 * Take the lock as a writer since we are potentially going to add
377 * a new entry + read locks are not "upgradable" 386 * a new entry + read locks are not "upgradable"
378 */ 387 */
379 down_write(&ids->rw_mutex); 388 down_write(&ids->rwsem);
380 ipcp = ipc_findkey(ids, params->key); 389 ipcp = ipc_findkey(ids, params->key);
381 if (ipcp == NULL) { 390 if (ipcp == NULL) {
382 /* key not used */ 391 /* key not used */
@@ -402,7 +411,7 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids,
402 } 411 }
403 ipc_unlock(ipcp); 412 ipc_unlock(ipcp);
404 } 413 }
405 up_write(&ids->rw_mutex); 414 up_write(&ids->rwsem);
406 415
407 return err; 416 return err;
408} 417}
@@ -413,7 +422,7 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids,
413 * @ids: IPC identifier set 422 * @ids: IPC identifier set
414 * @ipcp: ipc perm structure containing the identifier to remove 423 * @ipcp: ipc perm structure containing the identifier to remove
415 * 424 *
416 * ipc_ids.rw_mutex (as a writer) and the spinlock for this ID are held 425 * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held
417 * before this function is called, and remain locked on the exit. 426 * before this function is called, and remain locked on the exit.
418 */ 427 */
419 428
@@ -621,7 +630,7 @@ struct kern_ipc_perm *ipc_obtain_object(struct ipc_ids *ids, int id)
621} 630}
622 631
623/** 632/**
624 * ipc_lock - Lock an ipc structure without rw_mutex held 633 * ipc_lock - Lock an ipc structure without rwsem held
625 * @ids: IPC identifier set 634 * @ids: IPC identifier set
626 * @id: ipc id to look for 635 * @id: ipc id to look for
627 * 636 *
@@ -677,22 +686,6 @@ out:
677 return out; 686 return out;
678} 687}
679 688
680struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id)
681{
682 struct kern_ipc_perm *out;
683
684 out = ipc_lock(ids, id);
685 if (IS_ERR(out))
686 return out;
687
688 if (ipc_checkid(out, id)) {
689 ipc_unlock(out);
690 return ERR_PTR(-EIDRM);
691 }
692
693 return out;
694}
695
696/** 689/**
697 * ipcget - Common sys_*get() code 690 * ipcget - Common sys_*get() code
698 * @ns : namsepace 691 * @ns : namsepace
@@ -733,7 +726,7 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out)
733} 726}
734 727
735/** 728/**
736 * ipcctl_pre_down - retrieve an ipc and check permissions for some IPC_XXX cmd 729 * ipcctl_pre_down_nolock - retrieve an ipc and check permissions for some IPC_XXX cmd
737 * @ns: the ipc namespace 730 * @ns: the ipc namespace
738 * @ids: the table of ids where to look for the ipc 731 * @ids: the table of ids where to look for the ipc
739 * @id: the id of the ipc to retrieve 732 * @id: the id of the ipc to retrieve
@@ -746,29 +739,13 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out)
746 * It must be called without any lock held and 739 * It must be called without any lock held and
747 * - retrieves the ipc with the given id in the given table. 740 * - retrieves the ipc with the given id in the given table.
748 * - performs some audit and permission check, depending on the given cmd 741 * - performs some audit and permission check, depending on the given cmd
749 * - returns the ipc with the ipc lock held in case of success 742 * - returns a pointer to the ipc object or otherwise, the corresponding error.
750 * or an err-code without any lock held otherwise.
751 * 743 *
752 * Call holding the both the rw_mutex and the rcu read lock. 744 * Call holding the both the rwsem and the rcu read lock.
753 */ 745 */
754struct kern_ipc_perm *ipcctl_pre_down(struct ipc_namespace *ns,
755 struct ipc_ids *ids, int id, int cmd,
756 struct ipc64_perm *perm, int extra_perm)
757{
758 struct kern_ipc_perm *ipcp;
759
760 ipcp = ipcctl_pre_down_nolock(ns, ids, id, cmd, perm, extra_perm);
761 if (IS_ERR(ipcp))
762 goto out;
763
764 spin_lock(&ipcp->lock);
765out:
766 return ipcp;
767}
768
769struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, 746struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns,
770 struct ipc_ids *ids, int id, int cmd, 747 struct ipc_ids *ids, int id, int cmd,
771 struct ipc64_perm *perm, int extra_perm) 748 struct ipc64_perm *perm, int extra_perm)
772{ 749{
773 kuid_t euid; 750 kuid_t euid;
774 int err = -EPERM; 751 int err = -EPERM;
@@ -846,7 +823,8 @@ static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos,
846 ipc = idr_find(&ids->ipcs_idr, pos); 823 ipc = idr_find(&ids->ipcs_idr, pos);
847 if (ipc != NULL) { 824 if (ipc != NULL) {
848 *new_pos = pos + 1; 825 *new_pos = pos + 1;
849 ipc_lock_by_ptr(ipc); 826 rcu_read_lock();
827 ipc_lock_object(ipc);
850 return ipc; 828 return ipc;
851 } 829 }
852 } 830 }
@@ -884,7 +862,7 @@ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos)
884 * Take the lock - this will be released by the corresponding 862 * Take the lock - this will be released by the corresponding
885 * call to stop(). 863 * call to stop().
886 */ 864 */
887 down_read(&ids->rw_mutex); 865 down_read(&ids->rwsem);
888 866
889 /* pos < 0 is invalid */ 867 /* pos < 0 is invalid */
890 if (*pos < 0) 868 if (*pos < 0)
@@ -911,7 +889,7 @@ static void sysvipc_proc_stop(struct seq_file *s, void *it)
911 889
912 ids = &iter->ns->ids[iface->ids]; 890 ids = &iter->ns->ids[iface->ids];
913 /* Release the lock we took in start() */ 891 /* Release the lock we took in start() */
914 up_read(&ids->rw_mutex); 892 up_read(&ids->rwsem);
915} 893}
916 894
917static int sysvipc_proc_show(struct seq_file *s, void *it) 895static int sysvipc_proc_show(struct seq_file *s, void *it)
diff --git a/ipc/util.h b/ipc/util.h
index b6a6a88f3002..c5f3338ba1fa 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -94,10 +94,10 @@ void __init ipc_init_proc_interface(const char *path, const char *header,
94#define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER) 94#define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER)
95#define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER) 95#define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER)
96 96
97/* must be called with ids->rw_mutex acquired for writing */ 97/* must be called with ids->rwsem acquired for writing */
98int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int); 98int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int);
99 99
100/* must be called with ids->rw_mutex acquired for reading */ 100/* must be called with ids->rwsem acquired for reading */
101int ipc_get_maxid(struct ipc_ids *); 101int ipc_get_maxid(struct ipc_ids *);
102 102
103/* must be called with both locks acquired. */ 103/* must be called with both locks acquired. */
@@ -131,9 +131,6 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out);
131struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, 131struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns,
132 struct ipc_ids *ids, int id, int cmd, 132 struct ipc_ids *ids, int id, int cmd,
133 struct ipc64_perm *perm, int extra_perm); 133 struct ipc64_perm *perm, int extra_perm);
134struct kern_ipc_perm *ipcctl_pre_down(struct ipc_namespace *ns,
135 struct ipc_ids *ids, int id, int cmd,
136 struct ipc64_perm *perm, int extra_perm);
137 134
138#ifndef CONFIG_ARCH_WANT_IPC_PARSE_VERSION 135#ifndef CONFIG_ARCH_WANT_IPC_PARSE_VERSION
139 /* On IA-64, we always use the "64-bit version" of the IPC structures. */ 136 /* On IA-64, we always use the "64-bit version" of the IPC structures. */
@@ -174,19 +171,12 @@ static inline void ipc_assert_locked_object(struct kern_ipc_perm *perm)
174 assert_spin_locked(&perm->lock); 171 assert_spin_locked(&perm->lock);
175} 172}
176 173
177static inline void ipc_lock_by_ptr(struct kern_ipc_perm *perm)
178{
179 rcu_read_lock();
180 ipc_lock_object(perm);
181}
182
183static inline void ipc_unlock(struct kern_ipc_perm *perm) 174static inline void ipc_unlock(struct kern_ipc_perm *perm)
184{ 175{
185 ipc_unlock_object(perm); 176 ipc_unlock_object(perm);
186 rcu_read_unlock(); 177 rcu_read_unlock();
187} 178}
188 179
189struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id);
190struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id); 180struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id);
191int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids, 181int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
192 struct ipc_ops *ops, struct ipc_params *params); 182 struct ipc_ops *ops, struct ipc_params *params);
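
The ipc/util changes above drop the ipc_lock_by_ptr() wrapper and rename ids->rw_mutex to ids->rwsem; callers now pair rcu_read_lock() with ipc_lock_object() explicitly. A minimal sketch of the resulting pattern, assuming the declarations from ipc/util.h; the example function is illustrative and not part of the patch:

    /* Illustrative only: lock and release an already-located ipc object
     * the way the reworked ipc_findkey()/sysvipc_find_ipc() now do. */
    #include <linux/rcupdate.h>
    #include "util.h"                       /* kern_ipc_perm, ipc_lock_object() */

    static void example_touch_ipc(struct kern_ipc_perm *ipcp)
    {
            rcu_read_lock();                /* was hidden inside ipc_lock_by_ptr() */
            ipc_lock_object(ipcp);          /* takes kern_ipc_perm.lock */

            /* ... read or update the object under its spinlock ... */

            ipc_unlock(ipcp);               /* drops the lock and the RCU read lock */
    }
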
diff --git a/kernel/extable.c b/kernel/extable.c
index 67460b93b1a1..832cb28105bb 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -41,7 +41,7 @@ u32 __initdata main_extable_sort_needed = 1;
41/* Sort the kernel's built-in exception table */ 41/* Sort the kernel's built-in exception table */
42void __init sort_main_extable(void) 42void __init sort_main_extable(void)
43{ 43{
44 if (main_extable_sort_needed) { 44 if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) {
45 pr_notice("Sorting __ex_table...\n"); 45 pr_notice("Sorting __ex_table...\n");
46 sort_extable(__start___ex_table, __stop___ex_table); 46 sort_extable(__start___ex_table, __stop___ex_table);
47 } 47 }
diff --git a/kernel/fork.c b/kernel/fork.c
index c9eaf2013002..81ccb4f010c2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -351,7 +351,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
351 struct rb_node **rb_link, *rb_parent; 351 struct rb_node **rb_link, *rb_parent;
352 int retval; 352 int retval;
353 unsigned long charge; 353 unsigned long charge;
354 struct mempolicy *pol;
355 354
356 uprobe_start_dup_mmap(); 355 uprobe_start_dup_mmap();
357 down_write(&oldmm->mmap_sem); 356 down_write(&oldmm->mmap_sem);
@@ -400,11 +399,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
400 goto fail_nomem; 399 goto fail_nomem;
401 *tmp = *mpnt; 400 *tmp = *mpnt;
402 INIT_LIST_HEAD(&tmp->anon_vma_chain); 401 INIT_LIST_HEAD(&tmp->anon_vma_chain);
403 pol = mpol_dup(vma_policy(mpnt)); 402 retval = vma_dup_policy(mpnt, tmp);
404 retval = PTR_ERR(pol); 403 if (retval)
405 if (IS_ERR(pol))
406 goto fail_nomem_policy; 404 goto fail_nomem_policy;
407 vma_set_policy(tmp, pol);
408 tmp->vm_mm = mm; 405 tmp->vm_mm = mm;
409 if (anon_vma_fork(tmp, mpnt)) 406 if (anon_vma_fork(tmp, mpnt))
410 goto fail_nomem_anon_vma_fork; 407 goto fail_nomem_anon_vma_fork;
@@ -472,7 +469,7 @@ out:
472 uprobe_end_dup_mmap(); 469 uprobe_end_dup_mmap();
473 return retval; 470 return retval;
474fail_nomem_anon_vma_fork: 471fail_nomem_anon_vma_fork:
475 mpol_put(pol); 472 mpol_put(vma_policy(tmp));
476fail_nomem_policy: 473fail_nomem_policy:
477 kmem_cache_free(vm_area_cachep, tmp); 474 kmem_cache_free(vm_area_cachep, tmp);
478fail_nomem: 475fail_nomem:
@@ -1173,13 +1170,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1173 return ERR_PTR(-EINVAL); 1170 return ERR_PTR(-EINVAL);
1174 1171
1175 /* 1172 /*
1176 * If the new process will be in a different pid namespace 1173 * If the new process will be in a different pid or user namespace
1177 * don't allow the creation of threads. 1174 * do not allow it to share a thread group or signal handlers or
1175 * parent with the forking task.
1178 */ 1176 */
1179 if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) && 1177 if (clone_flags & (CLONE_SIGHAND | CLONE_PARENT)) {
1180 (task_active_pid_ns(current) != 1178 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
1181 current->nsproxy->pid_ns_for_children)) 1179 (task_active_pid_ns(current) !=
1182 return ERR_PTR(-EINVAL); 1180 current->nsproxy->pid_ns_for_children))
1181 return ERR_PTR(-EINVAL);
1182 }
1183 1183
1184 retval = security_task_create(clone_flags); 1184 retval = security_task_create(clone_flags);
1185 if (retval) 1185 if (retval)
@@ -1576,15 +1576,6 @@ long do_fork(unsigned long clone_flags,
1576 long nr; 1576 long nr;
1577 1577
1578 /* 1578 /*
1579 * Do some preliminary argument and permissions checking before we
1580 * actually start allocating stuff
1581 */
1582 if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) {
1583 if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
1584 return -EINVAL;
1585 }
1586
1587 /*
1588 * Determine whether and which event to report to ptracer. When 1579 * Determine whether and which event to report to ptracer. When
1589 * called from kernel_thread or CLONE_UNTRACED is explicitly 1580 * called from kernel_thread or CLONE_UNTRACED is explicitly
1590 * requested, no event is reported; otherwise, report if the event 1581 * requested, no event is reported; otherwise, report if the event
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 59f7b55ba745..2a74f307c5ec 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1474,11 +1474,8 @@ static int __init __parse_crashkernel(char *cmdline,
1474 if (first_colon && (!first_space || first_colon < first_space)) 1474 if (first_colon && (!first_space || first_colon < first_space))
1475 return parse_crashkernel_mem(ck_cmdline, system_ram, 1475 return parse_crashkernel_mem(ck_cmdline, system_ram,
1476 crash_size, crash_base); 1476 crash_size, crash_base);
1477 else
1478 return parse_crashkernel_simple(ck_cmdline, crash_size,
1479 crash_base);
1480 1477
1481 return 0; 1478 return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
1482} 1479}
1483 1480
1484/* 1481/*
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 6e33498d665c..a0d367a49122 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -112,6 +112,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
112struct kprobe_insn_page { 112struct kprobe_insn_page {
113 struct list_head list; 113 struct list_head list;
114 kprobe_opcode_t *insns; /* Page of instruction slots */ 114 kprobe_opcode_t *insns; /* Page of instruction slots */
115 struct kprobe_insn_cache *cache;
115 int nused; 116 int nused;
116 int ngarbage; 117 int ngarbage;
117 char slot_used[]; 118 char slot_used[];
@@ -121,12 +122,6 @@ struct kprobe_insn_page {
121 (offsetof(struct kprobe_insn_page, slot_used) + \ 122 (offsetof(struct kprobe_insn_page, slot_used) + \
122 (sizeof(char) * (slots))) 123 (sizeof(char) * (slots)))
123 124
124struct kprobe_insn_cache {
125 struct list_head pages; /* list of kprobe_insn_page */
126 size_t insn_size; /* size of instruction slot */
127 int nr_garbage;
128};
129
130static int slots_per_page(struct kprobe_insn_cache *c) 125static int slots_per_page(struct kprobe_insn_cache *c)
131{ 126{
132 return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); 127 return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
@@ -138,8 +133,20 @@ enum kprobe_slot_state {
138 SLOT_USED = 2, 133 SLOT_USED = 2,
139}; 134};
140 135
141static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */ 136static void *alloc_insn_page(void)
142static struct kprobe_insn_cache kprobe_insn_slots = { 137{
138 return module_alloc(PAGE_SIZE);
139}
140
141static void free_insn_page(void *page)
142{
143 module_free(NULL, page);
144}
145
146struct kprobe_insn_cache kprobe_insn_slots = {
147 .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex),
148 .alloc = alloc_insn_page,
149 .free = free_insn_page,
143 .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), 150 .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
144 .insn_size = MAX_INSN_SIZE, 151 .insn_size = MAX_INSN_SIZE,
145 .nr_garbage = 0, 152 .nr_garbage = 0,
@@ -150,10 +157,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c);
150 * __get_insn_slot() - Find a slot on an executable page for an instruction. 157 * __get_insn_slot() - Find a slot on an executable page for an instruction.
151 * We allocate an executable page if there's no room on existing ones. 158 * We allocate an executable page if there's no room on existing ones.
152 */ 159 */
153static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) 160kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
154{ 161{
155 struct kprobe_insn_page *kip; 162 struct kprobe_insn_page *kip;
163 kprobe_opcode_t *slot = NULL;
156 164
165 mutex_lock(&c->mutex);
157 retry: 166 retry:
158 list_for_each_entry(kip, &c->pages, list) { 167 list_for_each_entry(kip, &c->pages, list) {
159 if (kip->nused < slots_per_page(c)) { 168 if (kip->nused < slots_per_page(c)) {
@@ -162,7 +171,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
162 if (kip->slot_used[i] == SLOT_CLEAN) { 171 if (kip->slot_used[i] == SLOT_CLEAN) {
163 kip->slot_used[i] = SLOT_USED; 172 kip->slot_used[i] = SLOT_USED;
164 kip->nused++; 173 kip->nused++;
165 return kip->insns + (i * c->insn_size); 174 slot = kip->insns + (i * c->insn_size);
175 goto out;
166 } 176 }
167 } 177 }
168 /* kip->nused is broken. Fix it. */ 178 /* kip->nused is broken. Fix it. */
@@ -178,37 +188,29 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
178 /* All out of space. Need to allocate a new page. */ 188 /* All out of space. Need to allocate a new page. */
179 kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); 189 kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
180 if (!kip) 190 if (!kip)
181 return NULL; 191 goto out;
182 192
183 /* 193 /*
184 * Use module_alloc so this page is within +/- 2GB of where the 194 * Use module_alloc so this page is within +/- 2GB of where the
185 * kernel image and loaded module images reside. This is required 195 * kernel image and loaded module images reside. This is required
186 * so x86_64 can correctly handle the %rip-relative fixups. 196 * so x86_64 can correctly handle the %rip-relative fixups.
187 */ 197 */
188 kip->insns = module_alloc(PAGE_SIZE); 198 kip->insns = c->alloc();
189 if (!kip->insns) { 199 if (!kip->insns) {
190 kfree(kip); 200 kfree(kip);
191 return NULL; 201 goto out;
192 } 202 }
193 INIT_LIST_HEAD(&kip->list); 203 INIT_LIST_HEAD(&kip->list);
194 memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); 204 memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
195 kip->slot_used[0] = SLOT_USED; 205 kip->slot_used[0] = SLOT_USED;
196 kip->nused = 1; 206 kip->nused = 1;
197 kip->ngarbage = 0; 207 kip->ngarbage = 0;
208 kip->cache = c;
198 list_add(&kip->list, &c->pages); 209 list_add(&kip->list, &c->pages);
199 return kip->insns; 210 slot = kip->insns;
200} 211out:
201 212 mutex_unlock(&c->mutex);
202 213 return slot;
203kprobe_opcode_t __kprobes *get_insn_slot(void)
204{
205 kprobe_opcode_t *ret = NULL;
206
207 mutex_lock(&kprobe_insn_mutex);
208 ret = __get_insn_slot(&kprobe_insn_slots);
209 mutex_unlock(&kprobe_insn_mutex);
210
211 return ret;
212} 214}
213 215
214/* Return 1 if all garbages are collected, otherwise 0. */ 216/* Return 1 if all garbages are collected, otherwise 0. */
@@ -225,7 +227,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
225 */ 227 */
226 if (!list_is_singular(&kip->list)) { 228 if (!list_is_singular(&kip->list)) {
227 list_del(&kip->list); 229 list_del(&kip->list);
228 module_free(NULL, kip->insns); 230 kip->cache->free(kip->insns);
229 kfree(kip); 231 kfree(kip);
230 } 232 }
231 return 1; 233 return 1;
@@ -255,11 +257,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
255 return 0; 257 return 0;
256} 258}
257 259
258static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, 260void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
259 kprobe_opcode_t *slot, int dirty) 261 kprobe_opcode_t *slot, int dirty)
260{ 262{
261 struct kprobe_insn_page *kip; 263 struct kprobe_insn_page *kip;
262 264
265 mutex_lock(&c->mutex);
263 list_for_each_entry(kip, &c->pages, list) { 266 list_for_each_entry(kip, &c->pages, list) {
264 long idx = ((long)slot - (long)kip->insns) / 267 long idx = ((long)slot - (long)kip->insns) /
265 (c->insn_size * sizeof(kprobe_opcode_t)); 268 (c->insn_size * sizeof(kprobe_opcode_t));
@@ -272,45 +275,25 @@ static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
272 collect_garbage_slots(c); 275 collect_garbage_slots(c);
273 } else 276 } else
274 collect_one_slot(kip, idx); 277 collect_one_slot(kip, idx);
275 return; 278 goto out;
276 } 279 }
277 } 280 }
278 /* Could not free this slot. */ 281 /* Could not free this slot. */
279 WARN_ON(1); 282 WARN_ON(1);
283out:
284 mutex_unlock(&c->mutex);
280} 285}
281 286
282void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
283{
284 mutex_lock(&kprobe_insn_mutex);
285 __free_insn_slot(&kprobe_insn_slots, slot, dirty);
286 mutex_unlock(&kprobe_insn_mutex);
287}
288#ifdef CONFIG_OPTPROBES 287#ifdef CONFIG_OPTPROBES
289/* For optimized_kprobe buffer */ 288/* For optimized_kprobe buffer */
290static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */ 289struct kprobe_insn_cache kprobe_optinsn_slots = {
291static struct kprobe_insn_cache kprobe_optinsn_slots = { 290 .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex),
291 .alloc = alloc_insn_page,
292 .free = free_insn_page,
292 .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), 293 .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
293 /* .insn_size is initialized later */ 294 /* .insn_size is initialized later */
294 .nr_garbage = 0, 295 .nr_garbage = 0,
295}; 296};
296/* Get a slot for optimized_kprobe buffer */
297kprobe_opcode_t __kprobes *get_optinsn_slot(void)
298{
299 kprobe_opcode_t *ret = NULL;
300
301 mutex_lock(&kprobe_optinsn_mutex);
302 ret = __get_insn_slot(&kprobe_optinsn_slots);
303 mutex_unlock(&kprobe_optinsn_mutex);
304
305 return ret;
306}
307
308void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
309{
310 mutex_lock(&kprobe_optinsn_mutex);
311 __free_insn_slot(&kprobe_optinsn_slots, slot, dirty);
312 mutex_unlock(&kprobe_optinsn_mutex);
313}
314#endif 297#endif
315#endif 298#endif
316 299
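
With the per-cache mutex and alloc/free callbacks introduced above, an architecture can declare its own instruction-slot cache instead of relying on the removed get_insn_slot()/free_insn_slot() wrappers. A hedged sketch, assuming struct kprobe_insn_cache and __get_insn_slot()/__free_insn_slot() are now exposed through linux/kprobes.h as this series does for kprobe_insn_slots; the my_* names are made up:

    #include <linux/kprobes.h>
    #include <linux/moduleloader.h>         /* module_alloc()/module_free() */

    static void *my_alloc_insn_page(void)
    {
            return module_alloc(PAGE_SIZE); /* any executable page works here */
    }

    static void my_free_insn_page(void *page)
    {
            module_free(NULL, page);
    }

    static struct kprobe_insn_cache my_insn_slots = {
            .mutex          = __MUTEX_INITIALIZER(my_insn_slots.mutex),
            .alloc          = my_alloc_insn_page,
            .free           = my_free_insn_page,
            .pages          = LIST_HEAD_INIT(my_insn_slots.pages),
            .insn_size      = MAX_INSN_SIZE,
            .nr_garbage     = 0,
    };

    /* Usage, mirroring kprobe_insn_slots above:
     *      kprobe_opcode_t *slot = __get_insn_slot(&my_insn_slots);
     *      ...
     *      __free_insn_slot(&my_insn_slots, slot, 0);
     */
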
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c
index 2b6e69909c39..7cbd4507a7e6 100644
--- a/kernel/modsign_pubkey.c
+++ b/kernel/modsign_pubkey.c
@@ -18,14 +18,14 @@
18 18
19struct key *modsign_keyring; 19struct key *modsign_keyring;
20 20
21extern __initdata const u8 modsign_certificate_list[]; 21extern __initconst const u8 modsign_certificate_list[];
22extern __initdata const u8 modsign_certificate_list_end[]; 22extern __initconst const u8 modsign_certificate_list_end[];
23 23
24/* 24/*
25 * We need to make sure ccache doesn't cache the .o file as it doesn't notice 25 * We need to make sure ccache doesn't cache the .o file as it doesn't notice
26 * if modsign.pub changes. 26 * if modsign.pub changes.
27 */ 27 */
28static __initdata const char annoy_ccache[] = __TIME__ "foo"; 28static __initconst const char annoy_ccache[] = __TIME__ "foo";
29 29
30/* 30/*
31 * Load the compiled-in keys 31 * Load the compiled-in keys
diff --git a/kernel/panic.c b/kernel/panic.c
index 801864600514..b6c482ccc5db 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -123,10 +123,14 @@ void panic(const char *fmt, ...)
123 */ 123 */
124 smp_send_stop(); 124 smp_send_stop();
125 125
126 kmsg_dump(KMSG_DUMP_PANIC); 126 /*
127 127 * Run any panic handlers, including those that might need to
128 * add information to the kmsg dump output.
129 */
128 atomic_notifier_call_chain(&panic_notifier_list, 0, buf); 130 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
129 131
132 kmsg_dump(KMSG_DUMP_PANIC);
133
130 bust_spinlocks(0); 134 bust_spinlocks(0);
131 135
132 if (!panic_blink) 136 if (!panic_blink)
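
The reordering above runs the panic notifier chain before kmsg_dump(), so anything a handler prints is now captured in the dump. A minimal sketch of such a handler; the driver name and message are illustrative, and buf carries the formatted panic message:

    #include <linux/kernel.h>
    #include <linux/notifier.h>

    static int my_panic_event(struct notifier_block *nb,
                              unsigned long event, void *buf)
    {
            /* printed before kmsg_dump(KMSG_DUMP_PANIC), so it is captured */
            pr_emerg("mydrv: panic '%s', dumping controller state\n",
                     (char *)buf);
            return NOTIFY_DONE;
    }

    static struct notifier_block my_panic_nb = {
            .notifier_call = my_panic_event,
    };

    /* registered once at init time:
     *      atomic_notifier_chain_register(&panic_notifier_list, &my_panic_nb);
     */
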
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 349587bb03e1..358a146fd4da 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -352,7 +352,7 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
352 struct mem_extent *ext, *cur, *aux; 352 struct mem_extent *ext, *cur, *aux;
353 353
354 zone_start = zone->zone_start_pfn; 354 zone_start = zone->zone_start_pfn;
355 zone_end = zone->zone_start_pfn + zone->spanned_pages; 355 zone_end = zone_end_pfn(zone);
356 356
357 list_for_each_entry(ext, list, hook) 357 list_for_each_entry(ext, list, hook)
358 if (zone_start <= ext->end) 358 if (zone_start <= ext->end)
@@ -884,7 +884,7 @@ static unsigned int count_highmem_pages(void)
884 continue; 884 continue;
885 885
886 mark_free_pages(zone); 886 mark_free_pages(zone);
887 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 887 max_zone_pfn = zone_end_pfn(zone);
888 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 888 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
889 if (saveable_highmem_page(zone, pfn)) 889 if (saveable_highmem_page(zone, pfn))
890 n++; 890 n++;
@@ -948,7 +948,7 @@ static unsigned int count_data_pages(void)
948 continue; 948 continue;
949 949
950 mark_free_pages(zone); 950 mark_free_pages(zone);
951 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 951 max_zone_pfn = zone_end_pfn(zone);
952 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 952 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
953 if (saveable_page(zone, pfn)) 953 if (saveable_page(zone, pfn))
954 n++; 954 n++;
@@ -1041,7 +1041,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
1041 unsigned long max_zone_pfn; 1041 unsigned long max_zone_pfn;
1042 1042
1043 mark_free_pages(zone); 1043 mark_free_pages(zone);
1044 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1044 max_zone_pfn = zone_end_pfn(zone);
1045 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1045 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1046 if (page_is_saveable(zone, pfn)) 1046 if (page_is_saveable(zone, pfn))
1047 memory_bm_set_bit(orig_bm, pfn); 1047 memory_bm_set_bit(orig_bm, pfn);
@@ -1093,7 +1093,7 @@ void swsusp_free(void)
1093 unsigned long pfn, max_zone_pfn; 1093 unsigned long pfn, max_zone_pfn;
1094 1094
1095 for_each_populated_zone(zone) { 1095 for_each_populated_zone(zone) {
1096 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1096 max_zone_pfn = zone_end_pfn(zone);
1097 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1097 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1098 if (pfn_valid(pfn)) { 1098 if (pfn_valid(pfn)) {
1099 struct page *page = pfn_to_page(pfn); 1099 struct page *page = pfn_to_page(pfn);
@@ -1755,7 +1755,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
1755 1755
1756 /* Clear page flags */ 1756 /* Clear page flags */
1757 for_each_populated_zone(zone) { 1757 for_each_populated_zone(zone) {
1758 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1758 max_zone_pfn = zone_end_pfn(zone);
1759 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1759 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1760 if (pfn_valid(pfn)) 1760 if (pfn_valid(pfn))
1761 swsusp_unset_page_free(pfn_to_page(pfn)); 1761 swsusp_unset_page_free(pfn_to_page(pfn));
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index a146ee327f6a..dd562e9aa2c8 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -236,7 +236,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
236 */ 236 */
237 int dumpable = 0; 237 int dumpable = 0;
238 /* Don't let security modules deny introspection */ 238 /* Don't let security modules deny introspection */
239 if (task == current) 239 if (same_thread_group(task, current))
240 return 0; 240 return 0;
241 rcu_read_lock(); 241 rcu_read_lock();
242 tcred = __task_cred(task); 242 tcred = __task_cred(task);
diff --git a/kernel/signal.c b/kernel/signal.c
index 50e41075ac77..ded28b91fa53 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3394,7 +3394,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
3394 new_ka.sa.sa_restorer = compat_ptr(restorer); 3394 new_ka.sa.sa_restorer = compat_ptr(restorer);
3395#endif 3395#endif
3396 ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask)); 3396 ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask));
3397 ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); 3397 ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags);
3398 if (ret) 3398 if (ret)
3399 return -EFAULT; 3399 return -EFAULT;
3400 sigset_from_compat(&new_ka.sa.sa_mask, &mask); 3400 sigset_from_compat(&new_ka.sa.sa_mask, &mask);
@@ -3406,7 +3406,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
3406 ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), 3406 ret = put_user(ptr_to_compat(old_ka.sa.sa_handler),
3407 &oact->sa_handler); 3407 &oact->sa_handler);
3408 ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); 3408 ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask));
3409 ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); 3409 ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags);
3410#ifdef __ARCH_HAS_SA_RESTORER 3410#ifdef __ARCH_HAS_SA_RESTORER
3411 ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), 3411 ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
3412 &oact->sa_restorer); 3412 &oact->sa_restorer);
diff --git a/kernel/smp.c b/kernel/smp.c
index 449b707fc20d..0564571dcdf7 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -48,10 +48,13 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
48 cpu_to_node(cpu))) 48 cpu_to_node(cpu)))
49 return notifier_from_errno(-ENOMEM); 49 return notifier_from_errno(-ENOMEM);
50 if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, 50 if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
51 cpu_to_node(cpu))) 51 cpu_to_node(cpu))) {
52 free_cpumask_var(cfd->cpumask);
52 return notifier_from_errno(-ENOMEM); 53 return notifier_from_errno(-ENOMEM);
54 }
53 cfd->csd = alloc_percpu(struct call_single_data); 55 cfd->csd = alloc_percpu(struct call_single_data);
54 if (!cfd->csd) { 56 if (!cfd->csd) {
57 free_cpumask_var(cfd->cpumask_ipi);
55 free_cpumask_var(cfd->cpumask); 58 free_cpumask_var(cfd->cpumask);
56 return notifier_from_errno(-ENOMEM); 59 return notifier_from_errno(-ENOMEM);
57 } 60 }
@@ -572,8 +575,10 @@ EXPORT_SYMBOL(on_each_cpu);
572 * 575 *
573 * If @wait is true, then returns once @func has returned. 576 * If @wait is true, then returns once @func has returned.
574 * 577 *
575 * You must not call this function with disabled interrupts or 578 * You must not call this function with disabled interrupts or from a
576 * from a hardware interrupt handler or from a bottom half handler. 579 * hardware interrupt handler or from a bottom half handler. The
580 * exception is that it may be used during early boot while
581 * early_boot_irqs_disabled is set.
577 */ 582 */
578void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, 583void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
579 void *info, bool wait) 584 void *info, bool wait)
@@ -582,9 +587,10 @@ void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
582 587
583 smp_call_function_many(mask, func, info, wait); 588 smp_call_function_many(mask, func, info, wait);
584 if (cpumask_test_cpu(cpu, mask)) { 589 if (cpumask_test_cpu(cpu, mask)) {
585 local_irq_disable(); 590 unsigned long flags;
591 local_irq_save(flags);
586 func(info); 592 func(info);
587 local_irq_enable(); 593 local_irq_restore(flags);
588 } 594 }
589 put_cpu(); 595 put_cpu();
590} 596}
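
Switching to local_irq_save()/local_irq_restore() above means the local callback no longer re-enables interrupts behind the caller's back, which is what makes the early-boot exception in the new comment safe. A small usage sketch; the counter and mask choice are illustrative:

    #include <linux/smp.h>
    #include <linux/cpumask.h>
    #include <linux/atomic.h>

    static atomic_t hits = ATOMIC_INIT(0);

    static void bump_counter(void *info)
    {
            atomic_inc(info);               /* runs on every CPU in the mask */
    }

    /* waits (last argument true) until all CPUs have run bump_counter():
     *      on_each_cpu_mask(cpu_online_mask, bump_counter, &hits, true);
     */
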
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 5cdd8065a3ce..4b082b5cac9e 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -34,6 +34,20 @@
34#else 34#else
35#define raw_read_can_lock(l) read_can_lock(l) 35#define raw_read_can_lock(l) read_can_lock(l)
36#define raw_write_can_lock(l) write_can_lock(l) 36#define raw_write_can_lock(l) write_can_lock(l)
37
38/*
39 * Some architectures can relax in favour of the CPU owning the lock.
40 */
41#ifndef arch_read_relax
42# define arch_read_relax(l) cpu_relax()
43#endif
44#ifndef arch_write_relax
45# define arch_write_relax(l) cpu_relax()
46#endif
47#ifndef arch_spin_relax
48# define arch_spin_relax(l) cpu_relax()
49#endif
50
37/* 51/*
38 * We build the __lock_function inlines here. They are too large for 52 * We build the __lock_function inlines here. They are too large for
39 * inlining all over the place, but here is only one user per function 53 * inlining all over the place, but here is only one user per function
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 07f6fc468e17..dc69093a8ec4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1225,7 +1225,7 @@ static struct ctl_table vm_table[] = {
1225 .data = &hugepages_treat_as_movable, 1225 .data = &hugepages_treat_as_movable,
1226 .maxlen = sizeof(int), 1226 .maxlen = sizeof(int),
1227 .mode = 0644, 1227 .mode = 0644,
1228 .proc_handler = hugetlb_treat_movable_handler, 1228 .proc_handler = proc_dointvec,
1229 }, 1229 },
1230 { 1230 {
1231 .procname = "nr_overcommit_hugepages", 1231 .procname = "nr_overcommit_hugepages",
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 65bd3c92d6f3..8727032e3a6f 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -4,6 +4,23 @@
4 4
5static struct callback_head work_exited; /* all we need is ->next == NULL */ 5static struct callback_head work_exited; /* all we need is ->next == NULL */
6 6
7/**
8 * task_work_add - ask the @task to execute @work->func()
9 * @task: the task which should run the callback
10 * @work: the callback to run
11 * @notify: send the notification if true
12 *
13 * Queue @work for task_work_run() below and notify the @task if @notify.
14 * Fails if the @task is exiting/exited and thus it can't process this @work.
15 * Otherwise @work->func() will be called when the @task returns from kernel
16 * mode or exits.
17 *
18 * This is like the signal handler which runs in kernel mode, but it doesn't
19 * try to wake up the @task.
20 *
21 * RETURNS:
22 * 0 if succeeds or -ESRCH.
23 */
7int 24int
8task_work_add(struct task_struct *task, struct callback_head *work, bool notify) 25task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
9{ 26{
@@ -21,11 +38,22 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
21 return 0; 38 return 0;
22} 39}
23 40
41/**
42 * task_work_cancel - cancel a pending work added by task_work_add()
43 * @task: the task which should execute the work
44 * @func: identifies the work to remove
45 *
46 * Find the last queued pending work with ->func == @func and remove
47 * it from queue.
48 *
49 * RETURNS:
50 * The found work or NULL if not found.
51 */
24struct callback_head * 52struct callback_head *
25task_work_cancel(struct task_struct *task, task_work_func_t func) 53task_work_cancel(struct task_struct *task, task_work_func_t func)
26{ 54{
27 struct callback_head **pprev = &task->task_works; 55 struct callback_head **pprev = &task->task_works;
28 struct callback_head *work = NULL; 56 struct callback_head *work;
29 unsigned long flags; 57 unsigned long flags;
30 /* 58 /*
31 * If cmpxchg() fails we continue without updating pprev. 59 * If cmpxchg() fails we continue without updating pprev.
@@ -35,7 +63,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
35 */ 63 */
36 raw_spin_lock_irqsave(&task->pi_lock, flags); 64 raw_spin_lock_irqsave(&task->pi_lock, flags);
37 while ((work = ACCESS_ONCE(*pprev))) { 65 while ((work = ACCESS_ONCE(*pprev))) {
38 read_barrier_depends(); 66 smp_read_barrier_depends();
39 if (work->func != func) 67 if (work->func != func)
40 pprev = &work->next; 68 pprev = &work->next;
41 else if (cmpxchg(pprev, work, work->next) == work) 69 else if (cmpxchg(pprev, work, work->next) == work)
@@ -46,6 +74,14 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
46 return work; 74 return work;
47} 75}
48 76
77/**
78 * task_work_run - execute the works added by task_work_add()
79 *
80 * Flush the pending works. Should be used by the core kernel code.
81 * Called before the task returns to the user-mode or stops, or when
82 * it exits. In the latter case task_work_add() can no longer add the
83 * new work after task_work_run() returns.
84 */
49void task_work_run(void) 85void task_work_run(void)
50{ 86{
51 struct task_struct *task = current; 87 struct task_struct *task = current;
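
The kerneldoc added above documents the task_work API; a minimal usage sketch follows, with the callback and messages purely illustrative:

    #include <linux/kernel.h>
    #include <linux/sched.h>
    #include <linux/task_work.h>

    static void my_callback(struct callback_head *head)
    {
            /* runs in the target task's context, on return to user mode */
            pr_info("deferred work ran in %s\n", current->comm);
    }

    static struct callback_head my_work = {
            .func = my_callback,
    };

    /* queue it on the current task; "true" requests notification:
     *      if (task_work_add(current, &my_work, true))
     *              pr_warn("task is already exiting\n");
     */
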
diff --git a/kernel/up.c b/kernel/up.c
index c54c75e9faf7..630d72bf7e41 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -10,12 +10,64 @@
10int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 10int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
11 int wait) 11 int wait)
12{ 12{
13 unsigned long flags;
14
13 WARN_ON(cpu != 0); 15 WARN_ON(cpu != 0);
14 16
15 local_irq_disable(); 17 local_irq_save(flags);
16 (func)(info); 18 func(info);
17 local_irq_enable(); 19 local_irq_restore(flags);
18 20
19 return 0; 21 return 0;
20} 22}
21EXPORT_SYMBOL(smp_call_function_single); 23EXPORT_SYMBOL(smp_call_function_single);
24
25int on_each_cpu(smp_call_func_t func, void *info, int wait)
26{
27 unsigned long flags;
28
29 local_irq_save(flags);
30 func(info);
31 local_irq_restore(flags);
32 return 0;
33}
34EXPORT_SYMBOL(on_each_cpu);
35
36/*
37 * Note we still need to test the mask even for UP
38 * because we actually can get an empty mask from
39 * code that on SMP might call us without the local
40 * CPU in the mask.
41 */
42void on_each_cpu_mask(const struct cpumask *mask,
43 smp_call_func_t func, void *info, bool wait)
44{
45 unsigned long flags;
46
47 if (cpumask_test_cpu(0, mask)) {
48 local_irq_save(flags);
49 func(info);
50 local_irq_restore(flags);
51 }
52}
53EXPORT_SYMBOL(on_each_cpu_mask);
54
55/*
56 * Preemption is disabled here to make sure the cond_func is called under the
 57 * same conditions in UP and SMP.
58 */
59void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
60 smp_call_func_t func, void *info, bool wait,
61 gfp_t gfp_flags)
62{
63 unsigned long flags;
64
65 preempt_disable();
66 if (cond_func(0, info)) {
67 local_irq_save(flags);
68 func(info);
69 local_irq_restore(flags);
70 }
71 preempt_enable();
72}
73EXPORT_SYMBOL(on_each_cpu_cond);
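
The UP stubs above mirror the SMP semantics: on_each_cpu_cond() evaluates cond_func for CPU 0 and, if it returns true, runs func with interrupts saved and restored. A small sketch of the calling convention; the per-CPU counter is illustrative:

    #include <linux/smp.h>
    #include <linux/percpu.h>
    #include <linux/gfp.h>

    static DEFINE_PER_CPU(int, pending);

    static bool cpu_has_work(int cpu, void *info)
    {
            return per_cpu(pending, cpu) != 0;
    }

    static void drain_work(void *info)
    {
            this_cpu_write(pending, 0);     /* drain this CPU's backlog */
    }

    /* call func only on CPUs whose cond_func returned true, and wait:
     *      on_each_cpu_cond(cpu_has_work, drain_work, NULL, true, GFP_KERNEL);
     */
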
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 652bea9054f0..c9eef36739a9 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1461,7 +1461,7 @@ config BACKTRACE_SELF_TEST
1461 1461
1462config RBTREE_TEST 1462config RBTREE_TEST
1463 tristate "Red-Black tree test" 1463 tristate "Red-Black tree test"
1464 depends on m && DEBUG_KERNEL 1464 depends on DEBUG_KERNEL
1465 help 1465 help
1466 A benchmark measuring the performance of the rbtree library. 1466 A benchmark measuring the performance of the rbtree library.
1467 Also includes rbtree invariant checks. 1467 Also includes rbtree invariant checks.
diff --git a/lib/crc32.c b/lib/crc32.c
index 072fbd8234d5..410093dbe51c 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -131,11 +131,14 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256])
131#endif 131#endif
132 132
133/** 133/**
134 * crc32_le() - Calculate bitwise little-endian Ethernet AUTODIN II CRC32 134 * crc32_le_generic() - Calculate bitwise little-endian Ethernet AUTODIN II
135 * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for 135 * CRC32/CRC32C
136 * other uses, or the previous crc32 value if computing incrementally. 136 * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for other
137 * @p: pointer to buffer over which CRC is run 137 * uses, or the previous crc32/crc32c value if computing incrementally.
138 * @p: pointer to buffer over which CRC32/CRC32C is run
138 * @len: length of buffer @p 139 * @len: length of buffer @p
140 * @tab: little-endian Ethernet table
141 * @polynomial: CRC32/CRC32c LE polynomial
139 */ 142 */
140static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p, 143static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p,
141 size_t len, const u32 (*tab)[256], 144 size_t len, const u32 (*tab)[256],
@@ -201,11 +204,13 @@ EXPORT_SYMBOL(crc32_le);
201EXPORT_SYMBOL(__crc32c_le); 204EXPORT_SYMBOL(__crc32c_le);
202 205
203/** 206/**
204 * crc32_be() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32 207 * crc32_be_generic() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32
205 * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for 208 * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for
206 * other uses, or the previous crc32 value if computing incrementally. 209 * other uses, or the previous crc32 value if computing incrementally.
207 * @p: pointer to buffer over which CRC is run 210 * @p: pointer to buffer over which CRC32 is run
208 * @len: length of buffer @p 211 * @len: length of buffer @p
212 * @tab: big-endian Ethernet table
213 * @polynomial: CRC32 BE polynomial
209 */ 214 */
210static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p, 215static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p,
211 size_t len, const u32 (*tab)[256], 216 size_t len, const u32 (*tab)[256],
diff --git a/lib/decompress_inflate.c b/lib/decompress_inflate.c
index 19ff89e34eec..d619b28c456f 100644
--- a/lib/decompress_inflate.c
+++ b/lib/decompress_inflate.c
@@ -48,7 +48,7 @@ STATIC int INIT gunzip(unsigned char *buf, int len,
48 out_len = 0x8000; /* 32 K */ 48 out_len = 0x8000; /* 32 K */
49 out_buf = malloc(out_len); 49 out_buf = malloc(out_len);
50 } else { 50 } else {
51 out_len = 0x7fffffff; /* no limit */ 51 out_len = ((size_t)~0) - (size_t)out_buf; /* no limit */
52 } 52 }
53 if (!out_buf) { 53 if (!out_buf) {
54 error("Out of memory while allocating output buffer"); 54 error("Out of memory while allocating output buffer");
diff --git a/lib/genalloc.c b/lib/genalloc.c
index b35cfa9bc3d4..26cf20be72b7 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -37,6 +37,11 @@
37#include <linux/of_address.h> 37#include <linux/of_address.h>
38#include <linux/of_device.h> 38#include <linux/of_device.h>
39 39
40static inline size_t chunk_size(const struct gen_pool_chunk *chunk)
41{
42 return chunk->end_addr - chunk->start_addr + 1;
43}
44
40static int set_bits_ll(unsigned long *addr, unsigned long mask_to_set) 45static int set_bits_ll(unsigned long *addr, unsigned long mask_to_set)
41{ 46{
42 unsigned long val, nval; 47 unsigned long val, nval;
@@ -182,13 +187,13 @@ int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phy
182 int nbytes = sizeof(struct gen_pool_chunk) + 187 int nbytes = sizeof(struct gen_pool_chunk) +
183 BITS_TO_LONGS(nbits) * sizeof(long); 188 BITS_TO_LONGS(nbits) * sizeof(long);
184 189
185 chunk = kmalloc_node(nbytes, GFP_KERNEL | __GFP_ZERO, nid); 190 chunk = kzalloc_node(nbytes, GFP_KERNEL, nid);
186 if (unlikely(chunk == NULL)) 191 if (unlikely(chunk == NULL))
187 return -ENOMEM; 192 return -ENOMEM;
188 193
189 chunk->phys_addr = phys; 194 chunk->phys_addr = phys;
190 chunk->start_addr = virt; 195 chunk->start_addr = virt;
191 chunk->end_addr = virt + size; 196 chunk->end_addr = virt + size - 1;
192 atomic_set(&chunk->avail, size); 197 atomic_set(&chunk->avail, size);
193 198
194 spin_lock(&pool->lock); 199 spin_lock(&pool->lock);
@@ -213,7 +218,7 @@ phys_addr_t gen_pool_virt_to_phys(struct gen_pool *pool, unsigned long addr)
213 218
214 rcu_read_lock(); 219 rcu_read_lock();
215 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { 220 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) {
216 if (addr >= chunk->start_addr && addr < chunk->end_addr) { 221 if (addr >= chunk->start_addr && addr <= chunk->end_addr) {
217 paddr = chunk->phys_addr + (addr - chunk->start_addr); 222 paddr = chunk->phys_addr + (addr - chunk->start_addr);
218 break; 223 break;
219 } 224 }
@@ -242,7 +247,7 @@ void gen_pool_destroy(struct gen_pool *pool)
242 chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); 247 chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk);
243 list_del(&chunk->next_chunk); 248 list_del(&chunk->next_chunk);
244 249
245 end_bit = (chunk->end_addr - chunk->start_addr) >> order; 250 end_bit = chunk_size(chunk) >> order;
246 bit = find_next_bit(chunk->bits, end_bit, 0); 251 bit = find_next_bit(chunk->bits, end_bit, 0);
247 BUG_ON(bit < end_bit); 252 BUG_ON(bit < end_bit);
248 253
@@ -283,7 +288,7 @@ unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size)
283 if (size > atomic_read(&chunk->avail)) 288 if (size > atomic_read(&chunk->avail))
284 continue; 289 continue;
285 290
286 end_bit = (chunk->end_addr - chunk->start_addr) >> order; 291 end_bit = chunk_size(chunk) >> order;
287retry: 292retry:
288 start_bit = pool->algo(chunk->bits, end_bit, start_bit, nbits, 293 start_bit = pool->algo(chunk->bits, end_bit, start_bit, nbits,
289 pool->data); 294 pool->data);
@@ -330,8 +335,8 @@ void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size)
330 nbits = (size + (1UL << order) - 1) >> order; 335 nbits = (size + (1UL << order) - 1) >> order;
331 rcu_read_lock(); 336 rcu_read_lock();
332 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { 337 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) {
333 if (addr >= chunk->start_addr && addr < chunk->end_addr) { 338 if (addr >= chunk->start_addr && addr <= chunk->end_addr) {
334 BUG_ON(addr + size > chunk->end_addr); 339 BUG_ON(addr + size - 1 > chunk->end_addr);
335 start_bit = (addr - chunk->start_addr) >> order; 340 start_bit = (addr - chunk->start_addr) >> order;
336 remain = bitmap_clear_ll(chunk->bits, start_bit, nbits); 341 remain = bitmap_clear_ll(chunk->bits, start_bit, nbits);
337 BUG_ON(remain); 342 BUG_ON(remain);
@@ -400,7 +405,7 @@ size_t gen_pool_size(struct gen_pool *pool)
400 405
401 rcu_read_lock(); 406 rcu_read_lock();
402 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) 407 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk)
403 size += chunk->end_addr - chunk->start_addr; 408 size += chunk_size(chunk);
404 rcu_read_unlock(); 409 rcu_read_unlock();
405 return size; 410 return size;
406} 411}
@@ -519,7 +524,6 @@ struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order,
519/** 524/**
520 * dev_get_gen_pool - Obtain the gen_pool (if any) for a device 525 * dev_get_gen_pool - Obtain the gen_pool (if any) for a device
521 * @dev: device to retrieve the gen_pool from 526 * @dev: device to retrieve the gen_pool from
522 * @name: Optional name for the gen_pool, usually NULL
523 * 527 *
524 * Returns the gen_pool for the device if one is present, or NULL. 528 * Returns the gen_pool for the device if one is present, or NULL.
525 */ 529 */
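
The genalloc change above makes chunk->end_addr inclusive (start + size - 1) and adds the chunk_size() helper. The reason is that an exclusive end address overflows for a chunk reaching the top of the address space; a standalone user-space C illustration of the boundary test:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t start = 0xfffff000u, size = 0x1000u;   /* last 4 KiB of a 32-bit space */
            uint32_t addr  = 0xfffff800u;                   /* an address inside the chunk */

            uint32_t end_excl = start + size;       /* wraps to 0x00000000 */
            uint32_t end_incl = start + size - 1;   /* 0xffffffff, still representable */

            /* old containment test: fails for the topmost chunk */
            printf("exclusive end: %d\n", addr >= start && addr <  end_excl);  /* 0 */
            /* new containment test, as in gen_pool_virt_to_phys()/gen_pool_free() */
            printf("inclusive end: %d\n", addr >= start && addr <= end_incl);  /* 1 */
            return 0;
    }
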
diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
index 411be80ddb46..df6839e3ce08 100644
--- a/lib/lz4/lz4_decompress.c
+++ b/lib/lz4/lz4_decompress.c
@@ -283,8 +283,8 @@ _output_error:
283 return (int) (-(((char *) ip) - source)); 283 return (int) (-(((char *) ip) - source));
284} 284}
285 285
286int lz4_decompress(const char *src, size_t *src_len, char *dest, 286int lz4_decompress(const unsigned char *src, size_t *src_len,
287 size_t actual_dest_len) 287 unsigned char *dest, size_t actual_dest_len)
288{ 288{
289 int ret = -1; 289 int ret = -1;
290 int input_len = 0; 290 int input_len = 0;
@@ -302,8 +302,8 @@ exit_0:
302EXPORT_SYMBOL(lz4_decompress); 302EXPORT_SYMBOL(lz4_decompress);
303#endif 303#endif
304 304
305int lz4_decompress_unknownoutputsize(const char *src, size_t src_len, 305int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len,
306 char *dest, size_t *dest_len) 306 unsigned char *dest, size_t *dest_len)
307{ 307{
308 int ret = -1; 308 int ret = -1;
309 int out_len = 0; 309 int out_len = 0;
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index e7964296fd50..7811ed3b4e70 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -32,6 +32,7 @@
32#include <linux/string.h> 32#include <linux/string.h>
33#include <linux/bitops.h> 33#include <linux/bitops.h>
34#include <linux/rcupdate.h> 34#include <linux/rcupdate.h>
35#include <linux/hardirq.h> /* in_interrupt() */
35 36
36 37
37#ifdef __KERNEL__ 38#ifdef __KERNEL__
@@ -207,7 +208,12 @@ radix_tree_node_alloc(struct radix_tree_root *root)
207 struct radix_tree_node *ret = NULL; 208 struct radix_tree_node *ret = NULL;
208 gfp_t gfp_mask = root_gfp_mask(root); 209 gfp_t gfp_mask = root_gfp_mask(root);
209 210
210 if (!(gfp_mask & __GFP_WAIT)) { 211 /*
 212 * Preload code isn't irq safe and it doesn't make sense to use
213 * preloading in the interrupt anyway as all the allocations have to
214 * be atomic. So just do normal allocation when in interrupt.
215 */
216 if (!(gfp_mask & __GFP_WAIT) && !in_interrupt()) {
211 struct radix_tree_preload *rtp; 217 struct radix_tree_preload *rtp;
212 218
213 /* 219 /*
@@ -264,7 +270,7 @@ radix_tree_node_free(struct radix_tree_node *node)
264 * To make use of this facility, the radix tree must be initialised without 270 * To make use of this facility, the radix tree must be initialised without
265 * __GFP_WAIT being passed to INIT_RADIX_TREE(). 271 * __GFP_WAIT being passed to INIT_RADIX_TREE().
266 */ 272 */
267int radix_tree_preload(gfp_t gfp_mask) 273static int __radix_tree_preload(gfp_t gfp_mask)
268{ 274{
269 struct radix_tree_preload *rtp; 275 struct radix_tree_preload *rtp;
270 struct radix_tree_node *node; 276 struct radix_tree_node *node;
@@ -288,9 +294,40 @@ int radix_tree_preload(gfp_t gfp_mask)
288out: 294out:
289 return ret; 295 return ret;
290} 296}
297
298/*
299 * Load up this CPU's radix_tree_node buffer with sufficient objects to
300 * ensure that the addition of a single element in the tree cannot fail. On
301 * success, return zero, with preemption disabled. On error, return -ENOMEM
302 * with preemption not disabled.
303 *
304 * To make use of this facility, the radix tree must be initialised without
305 * __GFP_WAIT being passed to INIT_RADIX_TREE().
306 */
307int radix_tree_preload(gfp_t gfp_mask)
308{
309 /* Warn on non-sensical use... */
310 WARN_ON_ONCE(!(gfp_mask & __GFP_WAIT));
311 return __radix_tree_preload(gfp_mask);
312}
291EXPORT_SYMBOL(radix_tree_preload); 313EXPORT_SYMBOL(radix_tree_preload);
292 314
293/* 315/*
316 * The same as above function, except we don't guarantee preloading happens.
317 * We do it, if we decide it helps. On success, return zero with preemption
318 * disabled. On error, return -ENOMEM with preemption not disabled.
319 */
320int radix_tree_maybe_preload(gfp_t gfp_mask)
321{
322 if (gfp_mask & __GFP_WAIT)
323 return __radix_tree_preload(gfp_mask);
324 /* Preloading doesn't help anything with this gfp mask, skip it */
325 preempt_disable();
326 return 0;
327}
328EXPORT_SYMBOL(radix_tree_maybe_preload);
329
330/*
294 * Return the maximum key which can be store into a 331 * Return the maximum key which can be store into a
295 * radix tree with height HEIGHT. 332 * radix tree with height HEIGHT.
296 */ 333 */
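
radix_tree_maybe_preload() above lets callers keep a single insertion path for both sleeping and atomic gfp masks; with a non-sleeping mask it only disables preemption. A sketch of the intended pattern, matching the mm/filemap.c conversion further down; the spinlock is illustrative:

    #include <linux/radix-tree.h>
    #include <linux/spinlock.h>
    #include <linux/gfp.h>

    static DEFINE_SPINLOCK(tree_lock);

    static int example_insert(struct radix_tree_root *root,
                              unsigned long index, void *item, gfp_t gfp)
    {
            int err;

            err = radix_tree_maybe_preload(gfp & ~__GFP_HIGHMEM);
            if (err)
                    return err;             /* only possible for sleeping masks */

            spin_lock(&tree_lock);
            err = radix_tree_insert(root, index, item);
            spin_unlock(&tree_lock);

            radix_tree_preload_end();       /* re-enables preemption */
            return err;
    }
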
diff --git a/lib/rbtree.c b/lib/rbtree.c
index c0e31fe2fabf..65f4effd117f 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -518,3 +518,43 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new,
518 *new = *victim; 518 *new = *victim;
519} 519}
520EXPORT_SYMBOL(rb_replace_node); 520EXPORT_SYMBOL(rb_replace_node);
521
522static struct rb_node *rb_left_deepest_node(const struct rb_node *node)
523{
524 for (;;) {
525 if (node->rb_left)
526 node = node->rb_left;
527 else if (node->rb_right)
528 node = node->rb_right;
529 else
530 return (struct rb_node *)node;
531 }
532}
533
534struct rb_node *rb_next_postorder(const struct rb_node *node)
535{
536 const struct rb_node *parent;
537 if (!node)
538 return NULL;
539 parent = rb_parent(node);
540
541 /* If we're sitting on node, we've already seen our children */
542 if (parent && node == parent->rb_left && parent->rb_right) {
543 /* If we are the parent's left node, go to the parent's right
544 * node then all the way down to the left */
545 return rb_left_deepest_node(parent->rb_right);
546 } else
547 /* Otherwise we are the parent's right node, and the parent
548 * should be next */
549 return (struct rb_node *)parent;
550}
551EXPORT_SYMBOL(rb_next_postorder);
552
553struct rb_node *rb_first_postorder(const struct rb_root *root)
554{
555 if (!root->rb_node)
556 return NULL;
557
558 return rb_left_deepest_node(root->rb_node);
559}
560EXPORT_SYMBOL(rb_first_postorder);
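
Post-order iteration visits both children before their parent, so rb_first_postorder()/rb_next_postorder() above can tear down a whole tree without any rebalancing or re-lookup. A sketch of that use, assuming the declarations land in linux/rbtree.h; struct thing and its layout are illustrative:

    #include <linux/rbtree.h>
    #include <linux/slab.h>

    struct thing {
            struct rb_node node;
            int key;
    };

    static void free_all(struct rb_root *root)
    {
            struct rb_node *n = rb_first_postorder(root);

            while (n) {
                    struct thing *t = rb_entry(n, struct thing, node);

                    /* fetch the successor first: it only walks up to the
                     * parent and into the still-intact right sibling */
                    n = rb_next_postorder(n);
                    kfree(t);
            }
            *root = RB_ROOT;
    }
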
diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c
index 122f02f9941b..31dd4ccd3baa 100644
--- a/lib/rbtree_test.c
+++ b/lib/rbtree_test.c
@@ -114,6 +114,16 @@ static int black_path_count(struct rb_node *rb)
114 return count; 114 return count;
115} 115}
116 116
117static void check_postorder(int nr_nodes)
118{
119 struct rb_node *rb;
120 int count = 0;
121 for (rb = rb_first_postorder(&root); rb; rb = rb_next_postorder(rb))
122 count++;
123
124 WARN_ON_ONCE(count != nr_nodes);
125}
126
117static void check(int nr_nodes) 127static void check(int nr_nodes)
118{ 128{
119 struct rb_node *rb; 129 struct rb_node *rb;
@@ -136,6 +146,8 @@ static void check(int nr_nodes)
136 146
137 WARN_ON_ONCE(count != nr_nodes); 147 WARN_ON_ONCE(count != nr_nodes);
138 WARN_ON_ONCE(count < (1 << black_path_count(rb_last(&root))) - 1); 148 WARN_ON_ONCE(count < (1 << black_path_count(rb_last(&root))) - 1);
149
150 check_postorder(nr_nodes);
139} 151}
140 152
141static void check_augmented(int nr_nodes) 153static void check_augmented(int nr_nodes)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 37d9edcd14cf..ce682f7a4f29 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -652,7 +652,7 @@ int pdflush_proc_obsolete(struct ctl_table *table, int write,
652{ 652{
653 char kbuf[] = "0\n"; 653 char kbuf[] = "0\n";
654 654
655 if (*ppos) { 655 if (*ppos || *lenp < sizeof(kbuf)) {
656 *lenp = 0; 656 *lenp = 0;
657 return 0; 657 return 0;
658 } 658 }
diff --git a/mm/compaction.c b/mm/compaction.c
index 05ccb4cc0bdb..c43789388cd8 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1131,6 +1131,9 @@ void compact_pgdat(pg_data_t *pgdat, int order)
1131 .sync = false, 1131 .sync = false,
1132 }; 1132 };
1133 1133
1134 if (!order)
1135 return;
1136
1134 __compact_pgdat(pgdat, &cc); 1137 __compact_pgdat(pgdat, &cc);
1135} 1138}
1136 1139
diff --git a/mm/filemap.c b/mm/filemap.c
index 731a2c24532d..e607728db4a8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -469,7 +469,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
469 if (error) 469 if (error)
470 goto out; 470 goto out;
471 471
472 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 472 error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
473 if (error == 0) { 473 if (error == 0) {
474 page_cache_get(page); 474 page_cache_get(page);
475 page->mapping = mapping; 475 page->mapping = mapping;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a92012a71702..963e14c0486f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -417,7 +417,7 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
417 unsigned long msecs; 417 unsigned long msecs;
418 int err; 418 int err;
419 419
420 err = strict_strtoul(buf, 10, &msecs); 420 err = kstrtoul(buf, 10, &msecs);
421 if (err || msecs > UINT_MAX) 421 if (err || msecs > UINT_MAX)
422 return -EINVAL; 422 return -EINVAL;
423 423
@@ -444,7 +444,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
444 unsigned long msecs; 444 unsigned long msecs;
445 int err; 445 int err;
446 446
447 err = strict_strtoul(buf, 10, &msecs); 447 err = kstrtoul(buf, 10, &msecs);
448 if (err || msecs > UINT_MAX) 448 if (err || msecs > UINT_MAX)
449 return -EINVAL; 449 return -EINVAL;
450 450
@@ -470,7 +470,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj,
470 int err; 470 int err;
471 unsigned long pages; 471 unsigned long pages;
472 472
473 err = strict_strtoul(buf, 10, &pages); 473 err = kstrtoul(buf, 10, &pages);
474 if (err || !pages || pages > UINT_MAX) 474 if (err || !pages || pages > UINT_MAX)
475 return -EINVAL; 475 return -EINVAL;
476 476
@@ -538,7 +538,7 @@ static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
538 int err; 538 int err;
539 unsigned long max_ptes_none; 539 unsigned long max_ptes_none;
540 540
541 err = strict_strtoul(buf, 10, &max_ptes_none); 541 err = kstrtoul(buf, 10, &max_ptes_none);
542 if (err || max_ptes_none > HPAGE_PMD_NR-1) 542 if (err || max_ptes_none > HPAGE_PMD_NR-1)
543 return -EINVAL; 543 return -EINVAL;
544 544
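The strict_strtoul() -> kstrtoul() conversions in the hunks above all follow the same sysfs store-handler shape; a minimal sketch of that pattern (handler, attribute, and variable names are illustrative, not taken from khugepaged):

#include <linux/kernel.h>
#include <linux/kobject.h>

static unsigned long example_msecs;

static ssize_t example_msecs_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	/* kstrtoul() returns 0 on success and -errno on failure */
	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	example_msecs = msecs;
	return count;
}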
@@ -2296,6 +2296,8 @@ static void collapse_huge_page(struct mm_struct *mm,
2296 goto out; 2296 goto out;
2297 2297
2298 vma = find_vma(mm, address); 2298 vma = find_vma(mm, address);
2299 if (!vma)
2300 goto out;
2299 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2301 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2300 hend = vma->vm_end & HPAGE_PMD_MASK; 2302 hend = vma->vm_end & HPAGE_PMD_MASK;
2301 if (address < hstart || address + HPAGE_PMD_SIZE > hend) 2303 if (address < hstart || address + HPAGE_PMD_SIZE > hend)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b60f33080a28..b49579c7f2a5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -21,6 +21,7 @@
21#include <linux/rmap.h> 21#include <linux/rmap.h>
22#include <linux/swap.h> 22#include <linux/swap.h>
23#include <linux/swapops.h> 23#include <linux/swapops.h>
24#include <linux/page-isolation.h>
24 25
25#include <asm/page.h> 26#include <asm/page.h>
26#include <asm/pgtable.h> 27#include <asm/pgtable.h>
@@ -33,7 +34,6 @@
33#include "internal.h" 34#include "internal.h"
34 35
35const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 36const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
36static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
37unsigned long hugepages_treat_as_movable; 37unsigned long hugepages_treat_as_movable;
38 38
39int hugetlb_max_hstate __read_mostly; 39int hugetlb_max_hstate __read_mostly;
@@ -48,7 +48,8 @@ static unsigned long __initdata default_hstate_max_huge_pages;
48static unsigned long __initdata default_hstate_size; 48static unsigned long __initdata default_hstate_size;
49 49
50/* 50/*
51 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 51 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
52 * free_huge_pages, and surplus_huge_pages.
52 */ 53 */
53DEFINE_SPINLOCK(hugetlb_lock); 54DEFINE_SPINLOCK(hugetlb_lock);
54 55
@@ -135,9 +136,9 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
135 * across the pages in a mapping. 136 * across the pages in a mapping.
136 * 137 *
137 * The region data structures are protected by a combination of the mmap_sem 138 * The region data structures are protected by a combination of the mmap_sem
138 * and the hugetlb_instantion_mutex. To access or modify a region the caller 139 * and the hugetlb_instantiation_mutex. To access or modify a region the caller
139 * must either hold the mmap_sem for write, or the mmap_sem for read and 140 * must either hold the mmap_sem for write, or the mmap_sem for read and
140 * the hugetlb_instantiation mutex: 141 * the hugetlb_instantiation_mutex:
141 * 142 *
142 * down_write(&mm->mmap_sem); 143 * down_write(&mm->mmap_sem);
143 * or 144 * or
@@ -434,25 +435,6 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
434 return (get_vma_private_data(vma) & flag) != 0; 435 return (get_vma_private_data(vma) & flag) != 0;
435} 436}
436 437
437/* Decrement the reserved pages in the hugepage pool by one */
438static void decrement_hugepage_resv_vma(struct hstate *h,
439 struct vm_area_struct *vma)
440{
441 if (vma->vm_flags & VM_NORESERVE)
442 return;
443
444 if (vma->vm_flags & VM_MAYSHARE) {
445 /* Shared mappings always use reserves */
446 h->resv_huge_pages--;
447 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
448 /*
449 * Only the process that called mmap() has reserves for
450 * private mappings.
451 */
452 h->resv_huge_pages--;
453 }
454}
455
456/* Reset counters to 0 and clear all HPAGE_RESV_* flags */ 438/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
457void reset_vma_resv_huge_pages(struct vm_area_struct *vma) 439void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
458{ 440{
@@ -462,12 +444,35 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
462} 444}
463 445
464/* Returns true if the VMA has associated reserve pages */ 446/* Returns true if the VMA has associated reserve pages */
465static int vma_has_reserves(struct vm_area_struct *vma) 447static int vma_has_reserves(struct vm_area_struct *vma, long chg)
466{ 448{
449 if (vma->vm_flags & VM_NORESERVE) {
450 /*
451 * This address is already reserved by another process (chg == 0),
452 * so we should decrement the reserve count. Without the decrement,
453 * the reserve count would persist after the inode is released,
454 * because the allocated page goes into the page cache and is later
455 * regarded as coming from the reserve pool when it is freed. We
456 * currently have no better way to handle this situation, so this
457 * is a work-around.
458 */
459 if (vma->vm_flags & VM_MAYSHARE && chg == 0)
460 return 1;
461 else
462 return 0;
463 }
464
465 /* Shared mappings always use reserves */
467 if (vma->vm_flags & VM_MAYSHARE) 466 if (vma->vm_flags & VM_MAYSHARE)
468 return 1; 467 return 1;
468
469 /*
470 * Only the process that called mmap() has reserves for
471 * private mappings.
472 */
469 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 473 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
470 return 1; 474 return 1;
475
471 return 0; 476 return 0;
472} 477}
473 478
@@ -517,9 +522,15 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
517{ 522{
518 struct page *page; 523 struct page *page;
519 524
520 if (list_empty(&h->hugepage_freelists[nid])) 525 list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
526 if (!is_migrate_isolate_page(page))
527 break;
528 /*
529 * If no non-isolated free hugepage is found on the list,
530 * the allocation fails.
531 */
532 if (&h->hugepage_freelists[nid] == &page->lru)
521 return NULL; 533 return NULL;
522 page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
523 list_move(&page->lru, &h->hugepage_activelist); 534 list_move(&page->lru, &h->hugepage_activelist);
524 set_page_refcounted(page); 535 set_page_refcounted(page);
525 h->free_huge_pages--; 536 h->free_huge_pages--;
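The dequeue change above relies on a common list_for_each_entry() idiom: when the loop runs to completion without a break, the cursor ends up wrapping the list head itself, which is how "nothing found" is detected. A minimal sketch of the same test in isolation (assumes, as dequeue_huge_page_node() does, that the caller holds hugetlb_lock):

#include <linux/list.h>
#include <linux/mm.h>
#include <linux/page-isolation.h>

static struct page *example_first_non_isolated(struct list_head *head)
{
	struct page *page;

	list_for_each_entry(page, head, lru)
		if (!is_migrate_isolate_page(page))
			break;

	/* loop completed: 'page' wraps the list head, so nothing was found */
	if (&page->lru == head)
		return NULL;
	return page;
}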
@@ -527,9 +538,19 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
527 return page; 538 return page;
528} 539}
529 540
541/* Movability of hugepages depends on migration support. */
542static inline gfp_t htlb_alloc_mask(struct hstate *h)
543{
544 if (hugepages_treat_as_movable || hugepage_migration_support(h))
545 return GFP_HIGHUSER_MOVABLE;
546 else
547 return GFP_HIGHUSER;
548}
549
530static struct page *dequeue_huge_page_vma(struct hstate *h, 550static struct page *dequeue_huge_page_vma(struct hstate *h,
531 struct vm_area_struct *vma, 551 struct vm_area_struct *vma,
532 unsigned long address, int avoid_reserve) 552 unsigned long address, int avoid_reserve,
553 long chg)
533{ 554{
534 struct page *page = NULL; 555 struct page *page = NULL;
535 struct mempolicy *mpol; 556 struct mempolicy *mpol;
@@ -539,16 +560,12 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
539 struct zoneref *z; 560 struct zoneref *z;
540 unsigned int cpuset_mems_cookie; 561 unsigned int cpuset_mems_cookie;
541 562
542retry_cpuset:
543 cpuset_mems_cookie = get_mems_allowed();
544 zonelist = huge_zonelist(vma, address,
545 htlb_alloc_mask, &mpol, &nodemask);
546 /* 563 /*
547 * A child process with MAP_PRIVATE mappings created by their parent 564 * A child process with MAP_PRIVATE mappings created by their parent
548 * have no page reserves. This check ensures that reservations are 565 * have no page reserves. This check ensures that reservations are
549 * not "stolen". The child may still get SIGKILLed 566 * not "stolen". The child may still get SIGKILLed
550 */ 567 */
551 if (!vma_has_reserves(vma) && 568 if (!vma_has_reserves(vma, chg) &&
552 h->free_huge_pages - h->resv_huge_pages == 0) 569 h->free_huge_pages - h->resv_huge_pages == 0)
553 goto err; 570 goto err;
554 571
@@ -556,13 +573,23 @@ retry_cpuset:
556 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) 573 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
557 goto err; 574 goto err;
558 575
576retry_cpuset:
577 cpuset_mems_cookie = get_mems_allowed();
578 zonelist = huge_zonelist(vma, address,
579 htlb_alloc_mask(h), &mpol, &nodemask);
580
559 for_each_zone_zonelist_nodemask(zone, z, zonelist, 581 for_each_zone_zonelist_nodemask(zone, z, zonelist,
560 MAX_NR_ZONES - 1, nodemask) { 582 MAX_NR_ZONES - 1, nodemask) {
561 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { 583 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask(h))) {
562 page = dequeue_huge_page_node(h, zone_to_nid(zone)); 584 page = dequeue_huge_page_node(h, zone_to_nid(zone));
563 if (page) { 585 if (page) {
564 if (!avoid_reserve) 586 if (avoid_reserve)
565 decrement_hugepage_resv_vma(h, vma); 587 break;
588 if (!vma_has_reserves(vma, chg))
589 break;
590
591 SetPagePrivate(page);
592 h->resv_huge_pages--;
566 break; 593 break;
567 } 594 }
568 } 595 }
@@ -574,7 +601,6 @@ retry_cpuset:
574 return page; 601 return page;
575 602
576err: 603err:
577 mpol_cond_put(mpol);
578 return NULL; 604 return NULL;
579} 605}
580 606
@@ -620,15 +646,20 @@ static void free_huge_page(struct page *page)
620 int nid = page_to_nid(page); 646 int nid = page_to_nid(page);
621 struct hugepage_subpool *spool = 647 struct hugepage_subpool *spool =
622 (struct hugepage_subpool *)page_private(page); 648 (struct hugepage_subpool *)page_private(page);
649 bool restore_reserve;
623 650
624 set_page_private(page, 0); 651 set_page_private(page, 0);
625 page->mapping = NULL; 652 page->mapping = NULL;
626 BUG_ON(page_count(page)); 653 BUG_ON(page_count(page));
627 BUG_ON(page_mapcount(page)); 654 BUG_ON(page_mapcount(page));
655 restore_reserve = PagePrivate(page);
628 656
629 spin_lock(&hugetlb_lock); 657 spin_lock(&hugetlb_lock);
630 hugetlb_cgroup_uncharge_page(hstate_index(h), 658 hugetlb_cgroup_uncharge_page(hstate_index(h),
631 pages_per_huge_page(h), page); 659 pages_per_huge_page(h), page);
660 if (restore_reserve)
661 h->resv_huge_pages++;
662
632 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { 663 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
633 /* remove the page from active list */ 664 /* remove the page from active list */
634 list_del(&page->lru); 665 list_del(&page->lru);
@@ -715,7 +746,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
715 return NULL; 746 return NULL;
716 747
717 page = alloc_pages_exact_node(nid, 748 page = alloc_pages_exact_node(nid,
718 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| 749 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
719 __GFP_REPEAT|__GFP_NOWARN, 750 __GFP_REPEAT|__GFP_NOWARN,
720 huge_page_order(h)); 751 huge_page_order(h));
721 if (page) { 752 if (page) {
@@ -772,33 +803,6 @@ static int hstate_next_node_to_alloc(struct hstate *h,
772 return nid; 803 return nid;
773} 804}
774 805
775static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
776{
777 struct page *page;
778 int start_nid;
779 int next_nid;
780 int ret = 0;
781
782 start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
783 next_nid = start_nid;
784
785 do {
786 page = alloc_fresh_huge_page_node(h, next_nid);
787 if (page) {
788 ret = 1;
789 break;
790 }
791 next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
792 } while (next_nid != start_nid);
793
794 if (ret)
795 count_vm_event(HTLB_BUDDY_PGALLOC);
796 else
797 count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
798
799 return ret;
800}
801
802/* 806/*
803 * helper for free_pool_huge_page() - return the previously saved 807 * helper for free_pool_huge_page() - return the previously saved
804 * node ["this node"] from which to free a huge page. Advance the 808 * node ["this node"] from which to free a huge page. Advance the
@@ -817,6 +821,40 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
817 return nid; 821 return nid;
818} 822}
819 823
824#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
825 for (nr_nodes = nodes_weight(*mask); \
826 nr_nodes > 0 && \
827 ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
828 nr_nodes--)
829
830#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
831 for (nr_nodes = nodes_weight(*mask); \
832 nr_nodes > 0 && \
833 ((node = hstate_next_node_to_free(hs, mask)) || 1); \
834 nr_nodes--)
835
836static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
837{
838 struct page *page;
839 int nr_nodes, node;
840 int ret = 0;
841
842 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
843 page = alloc_fresh_huge_page_node(h, node);
844 if (page) {
845 ret = 1;
846 break;
847 }
848 }
849
850 if (ret)
851 count_vm_event(HTLB_BUDDY_PGALLOC);
852 else
853 count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
854
855 return ret;
856}
857
820/* 858/*
821 * Free huge page from pool from next node to free. 859 * Free huge page from pool from next node to free.
822 * Attempt to keep persistent huge pages more or less 860 * Attempt to keep persistent huge pages more or less
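A note on the "|| 1" in the two iterator macros added above: the assignment to node is wanted only for its side effect, and OR-ing with 1 keeps the controlling expression true even when the next node id is 0, so only the nr_nodes countdown (one pass over the mask) terminates the loop. Per iteration the condition reads, roughly:

	nr_nodes > 0 && ((node = hstate_next_node_to_alloc(hs, mask)) || 1)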
@@ -826,40 +864,73 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
826static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, 864static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
827 bool acct_surplus) 865 bool acct_surplus)
828{ 866{
829 int start_nid; 867 int nr_nodes, node;
830 int next_nid;
831 int ret = 0; 868 int ret = 0;
832 869
833 start_nid = hstate_next_node_to_free(h, nodes_allowed); 870 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
834 next_nid = start_nid;
835
836 do {
837 /* 871 /*
838 * If we're returning unused surplus pages, only examine 872 * If we're returning unused surplus pages, only examine
839 * nodes with surplus pages. 873 * nodes with surplus pages.
840 */ 874 */
841 if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) && 875 if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
842 !list_empty(&h->hugepage_freelists[next_nid])) { 876 !list_empty(&h->hugepage_freelists[node])) {
843 struct page *page = 877 struct page *page =
844 list_entry(h->hugepage_freelists[next_nid].next, 878 list_entry(h->hugepage_freelists[node].next,
845 struct page, lru); 879 struct page, lru);
846 list_del(&page->lru); 880 list_del(&page->lru);
847 h->free_huge_pages--; 881 h->free_huge_pages--;
848 h->free_huge_pages_node[next_nid]--; 882 h->free_huge_pages_node[node]--;
849 if (acct_surplus) { 883 if (acct_surplus) {
850 h->surplus_huge_pages--; 884 h->surplus_huge_pages--;
851 h->surplus_huge_pages_node[next_nid]--; 885 h->surplus_huge_pages_node[node]--;
852 } 886 }
853 update_and_free_page(h, page); 887 update_and_free_page(h, page);
854 ret = 1; 888 ret = 1;
855 break; 889 break;
856 } 890 }
857 next_nid = hstate_next_node_to_free(h, nodes_allowed); 891 }
858 } while (next_nid != start_nid);
859 892
860 return ret; 893 return ret;
861} 894}
862 895
896/*
897 * Dissolve a given free hugepage into free buddy pages. This function does
898 * nothing for in-use (including surplus) hugepages.
899 */
900static void dissolve_free_huge_page(struct page *page)
901{
902 spin_lock(&hugetlb_lock);
903 if (PageHuge(page) && !page_count(page)) {
904 struct hstate *h = page_hstate(page);
905 int nid = page_to_nid(page);
906 list_del(&page->lru);
907 h->free_huge_pages--;
908 h->free_huge_pages_node[nid]--;
909 update_and_free_page(h, page);
910 }
911 spin_unlock(&hugetlb_lock);
912}
913
914/*
915 * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
916 * make specified memory blocks removable from the system.
917 * Note that start_pfn should be aligned to the (minimum) hugepage size.
918 */
919void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
920{
921 unsigned int order = 8 * sizeof(void *);
922 unsigned long pfn;
923 struct hstate *h;
924
925 /* Set scan step to minimum hugepage size */
926 for_each_hstate(h)
927 if (order > huge_page_order(h))
928 order = huge_page_order(h);
929 VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order));
930 for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order)
931 dissolve_free_huge_page(pfn_to_page(pfn));
932}
933
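A hedged sketch of the intended use of dissolve_free_huge_pages(): a memory-offline path calls it over the range being removed so that free hugepages are handed back to the buddy allocator before the block is isolated. The caller below is illustrative, not copied from mm/memory_hotplug.c:

/* illustrative caller; real call site lives in the memory hotplug code */
static void example_prepare_range_for_offline(unsigned long start_pfn,
					      unsigned long end_pfn)
{
	/* return any free hugepages in the range to the buddy allocator */
	dissolve_free_huge_pages(start_pfn, end_pfn);

	/* ...in-use pages in the range are then migrated away... */
}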
863static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) 934static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
864{ 935{
865 struct page *page; 936 struct page *page;
@@ -902,12 +973,12 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
902 spin_unlock(&hugetlb_lock); 973 spin_unlock(&hugetlb_lock);
903 974
904 if (nid == NUMA_NO_NODE) 975 if (nid == NUMA_NO_NODE)
905 page = alloc_pages(htlb_alloc_mask|__GFP_COMP| 976 page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP|
906 __GFP_REPEAT|__GFP_NOWARN, 977 __GFP_REPEAT|__GFP_NOWARN,
907 huge_page_order(h)); 978 huge_page_order(h));
908 else 979 else
909 page = alloc_pages_exact_node(nid, 980 page = alloc_pages_exact_node(nid,
910 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| 981 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
911 __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); 982 __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
912 983
913 if (page && arch_prepare_hugepage(page)) { 984 if (page && arch_prepare_hugepage(page)) {
@@ -944,10 +1015,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
944 */ 1015 */
945struct page *alloc_huge_page_node(struct hstate *h, int nid) 1016struct page *alloc_huge_page_node(struct hstate *h, int nid)
946{ 1017{
947 struct page *page; 1018 struct page *page = NULL;
948 1019
949 spin_lock(&hugetlb_lock); 1020 spin_lock(&hugetlb_lock);
950 page = dequeue_huge_page_node(h, nid); 1021 if (h->free_huge_pages - h->resv_huge_pages > 0)
1022 page = dequeue_huge_page_node(h, nid);
951 spin_unlock(&hugetlb_lock); 1023 spin_unlock(&hugetlb_lock);
952 1024
953 if (!page) 1025 if (!page)
@@ -1035,11 +1107,8 @@ free:
1035 spin_unlock(&hugetlb_lock); 1107 spin_unlock(&hugetlb_lock);
1036 1108
1037 /* Free unnecessary surplus pages to the buddy allocator */ 1109 /* Free unnecessary surplus pages to the buddy allocator */
1038 if (!list_empty(&surplus_list)) { 1110 list_for_each_entry_safe(page, tmp, &surplus_list, lru)
1039 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 1111 put_page(page);
1040 put_page(page);
1041 }
1042 }
1043 spin_lock(&hugetlb_lock); 1112 spin_lock(&hugetlb_lock);
1044 1113
1045 return ret; 1114 return ret;
@@ -1106,9 +1175,9 @@ static long vma_needs_reservation(struct hstate *h,
1106 } else { 1175 } else {
1107 long err; 1176 long err;
1108 pgoff_t idx = vma_hugecache_offset(h, vma, addr); 1177 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
1109 struct resv_map *reservations = vma_resv_map(vma); 1178 struct resv_map *resv = vma_resv_map(vma);
1110 1179
1111 err = region_chg(&reservations->regions, idx, idx + 1); 1180 err = region_chg(&resv->regions, idx, idx + 1);
1112 if (err < 0) 1181 if (err < 0)
1113 return err; 1182 return err;
1114 return 0; 1183 return 0;
@@ -1126,10 +1195,10 @@ static void vma_commit_reservation(struct hstate *h,
1126 1195
1127 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 1196 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1128 pgoff_t idx = vma_hugecache_offset(h, vma, addr); 1197 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
1129 struct resv_map *reservations = vma_resv_map(vma); 1198 struct resv_map *resv = vma_resv_map(vma);
1130 1199
1131 /* Mark this page used in the map. */ 1200 /* Mark this page used in the map. */
1132 region_add(&reservations->regions, idx, idx + 1); 1201 region_add(&resv->regions, idx, idx + 1);
1133 } 1202 }
1134} 1203}
1135 1204
@@ -1155,38 +1224,35 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1155 chg = vma_needs_reservation(h, vma, addr); 1224 chg = vma_needs_reservation(h, vma, addr);
1156 if (chg < 0) 1225 if (chg < 0)
1157 return ERR_PTR(-ENOMEM); 1226 return ERR_PTR(-ENOMEM);
1158 if (chg) 1227 if (chg || avoid_reserve)
1159 if (hugepage_subpool_get_pages(spool, chg)) 1228 if (hugepage_subpool_get_pages(spool, 1))
1160 return ERR_PTR(-ENOSPC); 1229 return ERR_PTR(-ENOSPC);
1161 1230
1162 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); 1231 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
1163 if (ret) { 1232 if (ret) {
1164 hugepage_subpool_put_pages(spool, chg); 1233 if (chg || avoid_reserve)
1234 hugepage_subpool_put_pages(spool, 1);
1165 return ERR_PTR(-ENOSPC); 1235 return ERR_PTR(-ENOSPC);
1166 } 1236 }
1167 spin_lock(&hugetlb_lock); 1237 spin_lock(&hugetlb_lock);
1168 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); 1238 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
1169 if (page) { 1239 if (!page) {
1170 /* update page cgroup details */
1171 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
1172 h_cg, page);
1173 spin_unlock(&hugetlb_lock);
1174 } else {
1175 spin_unlock(&hugetlb_lock); 1240 spin_unlock(&hugetlb_lock);
1176 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1241 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1177 if (!page) { 1242 if (!page) {
1178 hugetlb_cgroup_uncharge_cgroup(idx, 1243 hugetlb_cgroup_uncharge_cgroup(idx,
1179 pages_per_huge_page(h), 1244 pages_per_huge_page(h),
1180 h_cg); 1245 h_cg);
1181 hugepage_subpool_put_pages(spool, chg); 1246 if (chg || avoid_reserve)
1247 hugepage_subpool_put_pages(spool, 1);
1182 return ERR_PTR(-ENOSPC); 1248 return ERR_PTR(-ENOSPC);
1183 } 1249 }
1184 spin_lock(&hugetlb_lock); 1250 spin_lock(&hugetlb_lock);
1185 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
1186 h_cg, page);
1187 list_move(&page->lru, &h->hugepage_activelist); 1251 list_move(&page->lru, &h->hugepage_activelist);
1188 spin_unlock(&hugetlb_lock); 1252 /* Fall through */
1189 } 1253 }
1254 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
1255 spin_unlock(&hugetlb_lock);
1190 1256
1191 set_page_private(page, (unsigned long)spool); 1257 set_page_private(page, (unsigned long)spool);
1192 1258
@@ -1194,17 +1260,29 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1194 return page; 1260 return page;
1195} 1261}
1196 1262
1263/*
1264 * alloc_huge_page()'s wrapper which simply returns the page if allocation
1265 * succeeds, otherwise NULL. This function is called from new_vma_page(),
1266 * where no ERR_VALUE is expected to be returned.
1267 */
1268struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
1269 unsigned long addr, int avoid_reserve)
1270{
1271 struct page *page = alloc_huge_page(vma, addr, avoid_reserve);
1272 if (IS_ERR(page))
1273 page = NULL;
1274 return page;
1275}
1276
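A hedged sketch of the caller pattern the comment above describes: a migration new-page callback that must return NULL, never an ERR_PTR() value, on failure. The function and flag choices below are illustrative, not copied from mm/mempolicy.c:

#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/hugetlb.h>

static struct page *example_new_page(struct page *old,
				     struct vm_area_struct *vma,
				     unsigned long address)
{
	if (PageHuge(old))
		/* NULL on failure, never an ERR_PTR() value */
		return alloc_huge_page_noerr(vma, address, 1);

	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
}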
1197int __weak alloc_bootmem_huge_page(struct hstate *h) 1277int __weak alloc_bootmem_huge_page(struct hstate *h)
1198{ 1278{
1199 struct huge_bootmem_page *m; 1279 struct huge_bootmem_page *m;
1200 int nr_nodes = nodes_weight(node_states[N_MEMORY]); 1280 int nr_nodes, node;
1201 1281
1202 while (nr_nodes) { 1282 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
1203 void *addr; 1283 void *addr;
1204 1284
1205 addr = __alloc_bootmem_node_nopanic( 1285 addr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
1206 NODE_DATA(hstate_next_node_to_alloc(h,
1207 &node_states[N_MEMORY])),
1208 huge_page_size(h), huge_page_size(h), 0); 1286 huge_page_size(h), huge_page_size(h), 0);
1209 1287
1210 if (addr) { 1288 if (addr) {
@@ -1216,7 +1294,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
1216 m = addr; 1294 m = addr;
1217 goto found; 1295 goto found;
1218 } 1296 }
1219 nr_nodes--;
1220 } 1297 }
1221 return 0; 1298 return 0;
1222 1299
@@ -1355,48 +1432,28 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count,
1355static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, 1432static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
1356 int delta) 1433 int delta)
1357{ 1434{
1358 int start_nid, next_nid; 1435 int nr_nodes, node;
1359 int ret = 0;
1360 1436
1361 VM_BUG_ON(delta != -1 && delta != 1); 1437 VM_BUG_ON(delta != -1 && delta != 1);
1362 1438
1363 if (delta < 0) 1439 if (delta < 0) {
1364 start_nid = hstate_next_node_to_alloc(h, nodes_allowed); 1440 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
1365 else 1441 if (h->surplus_huge_pages_node[node])
1366 start_nid = hstate_next_node_to_free(h, nodes_allowed); 1442 goto found;
1367 next_nid = start_nid;
1368
1369 do {
1370 int nid = next_nid;
1371 if (delta < 0) {
1372 /*
1373 * To shrink on this node, there must be a surplus page
1374 */
1375 if (!h->surplus_huge_pages_node[nid]) {
1376 next_nid = hstate_next_node_to_alloc(h,
1377 nodes_allowed);
1378 continue;
1379 }
1380 } 1443 }
1381 if (delta > 0) { 1444 } else {
1382 /* 1445 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
1383 * Surplus cannot exceed the total number of pages 1446 if (h->surplus_huge_pages_node[node] <
1384 */ 1447 h->nr_huge_pages_node[node])
1385 if (h->surplus_huge_pages_node[nid] >= 1448 goto found;
1386 h->nr_huge_pages_node[nid]) {
1387 next_nid = hstate_next_node_to_free(h,
1388 nodes_allowed);
1389 continue;
1390 }
1391 } 1449 }
1450 }
1451 return 0;
1392 1452
1393 h->surplus_huge_pages += delta; 1453found:
1394 h->surplus_huge_pages_node[nid] += delta; 1454 h->surplus_huge_pages += delta;
1395 ret = 1; 1455 h->surplus_huge_pages_node[node] += delta;
1396 break; 1456 return 1;
1397 } while (next_nid != start_nid);
1398
1399 return ret;
1400} 1457}
1401 1458
1402#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 1459#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
@@ -1526,7 +1583,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1526 struct hstate *h; 1583 struct hstate *h;
1527 NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); 1584 NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
1528 1585
1529 err = strict_strtoul(buf, 10, &count); 1586 err = kstrtoul(buf, 10, &count);
1530 if (err) 1587 if (err)
1531 goto out; 1588 goto out;
1532 1589
@@ -1617,7 +1674,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1617 if (h->order >= MAX_ORDER) 1674 if (h->order >= MAX_ORDER)
1618 return -EINVAL; 1675 return -EINVAL;
1619 1676
1620 err = strict_strtoul(buf, 10, &input); 1677 err = kstrtoul(buf, 10, &input);
1621 if (err) 1678 if (err)
1622 return err; 1679 return err;
1623 1680
@@ -2068,18 +2125,6 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
2068} 2125}
2069#endif /* CONFIG_NUMA */ 2126#endif /* CONFIG_NUMA */
2070 2127
2071int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
2072 void __user *buffer,
2073 size_t *length, loff_t *ppos)
2074{
2075 proc_dointvec(table, write, buffer, length, ppos);
2076 if (hugepages_treat_as_movable)
2077 htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
2078 else
2079 htlb_alloc_mask = GFP_HIGHUSER;
2080 return 0;
2081}
2082
2083int hugetlb_overcommit_handler(struct ctl_table *table, int write, 2128int hugetlb_overcommit_handler(struct ctl_table *table, int write,
2084 void __user *buffer, 2129 void __user *buffer,
2085 size_t *length, loff_t *ppos) 2130 size_t *length, loff_t *ppos)
@@ -2207,7 +2252,7 @@ out:
2207 2252
2208static void hugetlb_vm_op_open(struct vm_area_struct *vma) 2253static void hugetlb_vm_op_open(struct vm_area_struct *vma)
2209{ 2254{
2210 struct resv_map *reservations = vma_resv_map(vma); 2255 struct resv_map *resv = vma_resv_map(vma);
2211 2256
2212 /* 2257 /*
2213 * This new VMA should share its siblings reservation map if present. 2258 * This new VMA should share its siblings reservation map if present.
@@ -2217,34 +2262,34 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
2217 * after this open call completes. It is therefore safe to take a 2262 * after this open call completes. It is therefore safe to take a
2218 * new reference here without additional locking. 2263 * new reference here without additional locking.
2219 */ 2264 */
2220 if (reservations) 2265 if (resv)
2221 kref_get(&reservations->refs); 2266 kref_get(&resv->refs);
2222} 2267}
2223 2268
2224static void resv_map_put(struct vm_area_struct *vma) 2269static void resv_map_put(struct vm_area_struct *vma)
2225{ 2270{
2226 struct resv_map *reservations = vma_resv_map(vma); 2271 struct resv_map *resv = vma_resv_map(vma);
2227 2272
2228 if (!reservations) 2273 if (!resv)
2229 return; 2274 return;
2230 kref_put(&reservations->refs, resv_map_release); 2275 kref_put(&resv->refs, resv_map_release);
2231} 2276}
2232 2277
2233static void hugetlb_vm_op_close(struct vm_area_struct *vma) 2278static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2234{ 2279{
2235 struct hstate *h = hstate_vma(vma); 2280 struct hstate *h = hstate_vma(vma);
2236 struct resv_map *reservations = vma_resv_map(vma); 2281 struct resv_map *resv = vma_resv_map(vma);
2237 struct hugepage_subpool *spool = subpool_vma(vma); 2282 struct hugepage_subpool *spool = subpool_vma(vma);
2238 unsigned long reserve; 2283 unsigned long reserve;
2239 unsigned long start; 2284 unsigned long start;
2240 unsigned long end; 2285 unsigned long end;
2241 2286
2242 if (reservations) { 2287 if (resv) {
2243 start = vma_hugecache_offset(h, vma, vma->vm_start); 2288 start = vma_hugecache_offset(h, vma, vma->vm_start);
2244 end = vma_hugecache_offset(h, vma, vma->vm_end); 2289 end = vma_hugecache_offset(h, vma, vma->vm_end);
2245 2290
2246 reserve = (end - start) - 2291 reserve = (end - start) -
2247 region_count(&reservations->regions, start, end); 2292 region_count(&resv->regions, start, end);
2248 2293
2249 resv_map_put(vma); 2294 resv_map_put(vma);
2250 2295
@@ -2557,7 +2602,6 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2557{ 2602{
2558 struct hstate *h = hstate_vma(vma); 2603 struct hstate *h = hstate_vma(vma);
2559 struct page *old_page, *new_page; 2604 struct page *old_page, *new_page;
2560 int avoidcopy;
2561 int outside_reserve = 0; 2605 int outside_reserve = 0;
2562 unsigned long mmun_start; /* For mmu_notifiers */ 2606 unsigned long mmun_start; /* For mmu_notifiers */
2563 unsigned long mmun_end; /* For mmu_notifiers */ 2607 unsigned long mmun_end; /* For mmu_notifiers */
@@ -2567,10 +2611,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2567retry_avoidcopy: 2611retry_avoidcopy:
2568 /* If no-one else is actually using this page, avoid the copy 2612 /* If no-one else is actually using this page, avoid the copy
2569 * and just make the page writable */ 2613 * and just make the page writable */
2570 avoidcopy = (page_mapcount(old_page) == 1); 2614 if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
2571 if (avoidcopy) { 2615 page_move_anon_rmap(old_page, vma, address);
2572 if (PageAnon(old_page))
2573 page_move_anon_rmap(old_page, vma, address);
2574 set_huge_ptep_writable(vma, address, ptep); 2616 set_huge_ptep_writable(vma, address, ptep);
2575 return 0; 2617 return 0;
2576 } 2618 }
@@ -2584,8 +2626,7 @@ retry_avoidcopy:
2584 * at the time of fork() could consume its reserves on COW instead 2626 * at the time of fork() could consume its reserves on COW instead
2585 * of the full address range. 2627 * of the full address range.
2586 */ 2628 */
2587 if (!(vma->vm_flags & VM_MAYSHARE) && 2629 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
2588 is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
2589 old_page != pagecache_page) 2630 old_page != pagecache_page)
2590 outside_reserve = 1; 2631 outside_reserve = 1;
2591 2632
@@ -2657,6 +2698,8 @@ retry_avoidcopy:
2657 spin_lock(&mm->page_table_lock); 2698 spin_lock(&mm->page_table_lock);
2658 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2699 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2659 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 2700 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
2701 ClearPagePrivate(new_page);
2702
2660 /* Break COW */ 2703 /* Break COW */
2661 huge_ptep_clear_flush(vma, address, ptep); 2704 huge_ptep_clear_flush(vma, address, ptep);
2662 set_huge_pte_at(mm, address, ptep, 2705 set_huge_pte_at(mm, address, ptep,
@@ -2668,10 +2711,11 @@ retry_avoidcopy:
2668 } 2711 }
2669 spin_unlock(&mm->page_table_lock); 2712 spin_unlock(&mm->page_table_lock);
2670 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2713 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2671 /* Caller expects lock to be held */
2672 spin_lock(&mm->page_table_lock);
2673 page_cache_release(new_page); 2714 page_cache_release(new_page);
2674 page_cache_release(old_page); 2715 page_cache_release(old_page);
2716
2717 /* Caller expects lock to be held */
2718 spin_lock(&mm->page_table_lock);
2675 return 0; 2719 return 0;
2676} 2720}
2677 2721
@@ -2767,6 +2811,7 @@ retry:
2767 goto retry; 2811 goto retry;
2768 goto out; 2812 goto out;
2769 } 2813 }
2814 ClearPagePrivate(page);
2770 2815
2771 spin_lock(&inode->i_lock); 2816 spin_lock(&inode->i_lock);
2772 inode->i_blocks += blocks_per_huge_page(h); 2817 inode->i_blocks += blocks_per_huge_page(h);
@@ -2813,8 +2858,10 @@ retry:
2813 if (!huge_pte_none(huge_ptep_get(ptep))) 2858 if (!huge_pte_none(huge_ptep_get(ptep)))
2814 goto backout; 2859 goto backout;
2815 2860
2816 if (anon_rmap) 2861 if (anon_rmap) {
2862 ClearPagePrivate(page);
2817 hugepage_add_new_anon_rmap(page, vma, address); 2863 hugepage_add_new_anon_rmap(page, vma, address);
2864 }
2818 else 2865 else
2819 page_dup_rmap(page); 2866 page_dup_rmap(page);
2820 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) 2867 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
@@ -3431,3 +3478,45 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
3431 return ret; 3478 return ret;
3432} 3479}
3433#endif 3480#endif
3481
3482bool isolate_huge_page(struct page *page, struct list_head *list)
3483{
3484 VM_BUG_ON(!PageHead(page));
3485 if (!get_page_unless_zero(page))
3486 return false;
3487 spin_lock(&hugetlb_lock);
3488 list_move_tail(&page->lru, list);
3489 spin_unlock(&hugetlb_lock);
3490 return true;
3491}
3492
3493void putback_active_hugepage(struct page *page)
3494{
3495 VM_BUG_ON(!PageHead(page));
3496 spin_lock(&hugetlb_lock);
3497 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
3498 spin_unlock(&hugetlb_lock);
3499 put_page(page);
3500}
3501
3502bool is_hugepage_active(struct page *page)
3503{
3504 VM_BUG_ON(!PageHuge(page));
3505 /*
3506 * This function can be called for a tail page because the caller,
3507 * scan_movable_pages, scans through a given pfn-range which typically
3508 * covers one memory block. On systems using gigantic hugepages (1GB
3509 * on x86_64), a hugepage is larger than a memory block, and we don't
3510 * support migrating such large hugepages for now, so return false
3511 * when called for tail pages.
3512 */
3513 if (PageTail(page))
3514 return false;
3515 /*
3516 * The refcount of a hwpoisoned hugepage is 1, but such pages are not active,
3517 * so we should return false for them.
3518 */
3519 if (unlikely(PageHWPoison(page)))
3520 return false;
3521 return page_count(page) > 0;
3522}
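The three helpers added above let a hugepage be handled like an LRU page by the migration machinery: pulled onto a private list, passed to migrate_pages(), and put back if migration fails. A minimal sketch of that pattern (the soft_offline_huge_page() hunk later in this diff uses the same calls; the wrapper below is illustrative):

#include <linux/list.h>
#include <linux/hugetlb.h>
#include <linux/migrate.h>
#include <linux/mempolicy.h>

static int example_migrate_one_hugepage(struct page *hpage,
					new_page_t get_new_page)
{
	LIST_HEAD(pagelist);
	int ret;

	/* takes a reference and moves the page onto our private list */
	if (!isolate_huge_page(hpage, &pagelist))
		return -EBUSY;

	ret = migrate_pages(&pagelist, get_new_page, MPOL_MF_MOVE_ALL,
			    MIGRATE_SYNC, MR_MEMORY_FAILURE);
	if (ret)
		/* failure: drop our reference and requeue on the active list */
		putback_active_hugepage(hpage);

	return ret;
}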
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 3a61efc518d5..afc2daa91c60 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -88,12 +88,12 @@ static int pfn_inject_init(void)
88 * hardware status change, hence do not require hardware support. 88 * hardware status change, hence do not require hardware support.
89 * They are mainly for testing hwpoison in software level. 89 * They are mainly for testing hwpoison in software level.
90 */ 90 */
91 dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, 91 dentry = debugfs_create_file("corrupt-pfn", 0200, hwpoison_dir,
92 NULL, &hwpoison_fops); 92 NULL, &hwpoison_fops);
93 if (!dentry) 93 if (!dentry)
94 goto fail; 94 goto fail;
95 95
96 dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir, 96 dentry = debugfs_create_file("unpoison-pfn", 0200, hwpoison_dir,
97 NULL, &unpoison_fops); 97 NULL, &unpoison_fops);
98 if (!dentry) 98 if (!dentry)
99 goto fail; 99 goto fail;
diff --git a/mm/internal.h b/mm/internal.h
index 4390ac6c106e..684f7aa9692a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -85,6 +85,8 @@ extern unsigned long highest_memmap_pfn;
85 */ 85 */
86extern int isolate_lru_page(struct page *page); 86extern int isolate_lru_page(struct page *page);
87extern void putback_lru_page(struct page *page); 87extern void putback_lru_page(struct page *page);
88extern unsigned long zone_reclaimable_pages(struct zone *zone);
89extern bool zone_reclaimable(struct zone *zone);
88 90
89/* 91/*
90 * in mm/rmap.c: 92 * in mm/rmap.c:
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index c8d7f3110fd0..e126b0ef9ad2 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1639,7 +1639,7 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
1639 else if (strncmp(buf, "scan=", 5) == 0) { 1639 else if (strncmp(buf, "scan=", 5) == 0) {
1640 unsigned long secs; 1640 unsigned long secs;
1641 1641
1642 ret = strict_strtoul(buf + 5, 0, &secs); 1642 ret = kstrtoul(buf + 5, 0, &secs);
1643 if (ret < 0) 1643 if (ret < 0)
1644 goto out; 1644 goto out;
1645 stop_scan_thread(); 1645 stop_scan_thread();
diff --git a/mm/ksm.c b/mm/ksm.c
index b6afe0c440d8..0bea2b262a47 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2194,7 +2194,7 @@ static ssize_t sleep_millisecs_store(struct kobject *kobj,
2194 unsigned long msecs; 2194 unsigned long msecs;
2195 int err; 2195 int err;
2196 2196
2197 err = strict_strtoul(buf, 10, &msecs); 2197 err = kstrtoul(buf, 10, &msecs);
2198 if (err || msecs > UINT_MAX) 2198 if (err || msecs > UINT_MAX)
2199 return -EINVAL; 2199 return -EINVAL;
2200 2200
@@ -2217,7 +2217,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj,
2217 int err; 2217 int err;
2218 unsigned long nr_pages; 2218 unsigned long nr_pages;
2219 2219
2220 err = strict_strtoul(buf, 10, &nr_pages); 2220 err = kstrtoul(buf, 10, &nr_pages);
2221 if (err || nr_pages > UINT_MAX) 2221 if (err || nr_pages > UINT_MAX)
2222 return -EINVAL; 2222 return -EINVAL;
2223 2223
@@ -2239,7 +2239,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
2239 int err; 2239 int err;
2240 unsigned long flags; 2240 unsigned long flags;
2241 2241
2242 err = strict_strtoul(buf, 10, &flags); 2242 err = kstrtoul(buf, 10, &flags);
2243 if (err || flags > UINT_MAX) 2243 if (err || flags > UINT_MAX)
2244 return -EINVAL; 2244 return -EINVAL;
2245 if (flags > KSM_RUN_UNMERGE) 2245 if (flags > KSM_RUN_UNMERGE)
diff --git a/mm/madvise.c b/mm/madvise.c
index 7055883e6e25..6975bc812542 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -42,11 +42,11 @@ static int madvise_need_mmap_write(int behavior)
42 * We can potentially split a vm area into separate 42 * We can potentially split a vm area into separate
43 * areas, each area with its own behavior. 43 * areas, each area with its own behavior.
44 */ 44 */
45static long madvise_behavior(struct vm_area_struct * vma, 45static long madvise_behavior(struct vm_area_struct *vma,
46 struct vm_area_struct **prev, 46 struct vm_area_struct **prev,
47 unsigned long start, unsigned long end, int behavior) 47 unsigned long start, unsigned long end, int behavior)
48{ 48{
49 struct mm_struct * mm = vma->vm_mm; 49 struct mm_struct *mm = vma->vm_mm;
50 int error = 0; 50 int error = 0;
51 pgoff_t pgoff; 51 pgoff_t pgoff;
52 unsigned long new_flags = vma->vm_flags; 52 unsigned long new_flags = vma->vm_flags;
@@ -215,8 +215,8 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
215/* 215/*
216 * Schedule all required I/O operations. Do not wait for completion. 216 * Schedule all required I/O operations. Do not wait for completion.
217 */ 217 */
218static long madvise_willneed(struct vm_area_struct * vma, 218static long madvise_willneed(struct vm_area_struct *vma,
219 struct vm_area_struct ** prev, 219 struct vm_area_struct **prev,
220 unsigned long start, unsigned long end) 220 unsigned long start, unsigned long end)
221{ 221{
222 struct file *file = vma->vm_file; 222 struct file *file = vma->vm_file;
@@ -270,8 +270,8 @@ static long madvise_willneed(struct vm_area_struct * vma,
270 * An interface that causes the system to free clean pages and flush 270 * An interface that causes the system to free clean pages and flush
271 * dirty pages is already available as msync(MS_INVALIDATE). 271 * dirty pages is already available as msync(MS_INVALIDATE).
272 */ 272 */
273static long madvise_dontneed(struct vm_area_struct * vma, 273static long madvise_dontneed(struct vm_area_struct *vma,
274 struct vm_area_struct ** prev, 274 struct vm_area_struct **prev,
275 unsigned long start, unsigned long end) 275 unsigned long start, unsigned long end)
276{ 276{
277 *prev = vma; 277 *prev = vma;
@@ -343,29 +343,34 @@ static long madvise_remove(struct vm_area_struct *vma,
343 */ 343 */
344static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) 344static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
345{ 345{
346 int ret = 0;
347
348 if (!capable(CAP_SYS_ADMIN)) 346 if (!capable(CAP_SYS_ADMIN))
349 return -EPERM; 347 return -EPERM;
350 for (; start < end; start += PAGE_SIZE) { 348 for (; start < end; start += PAGE_SIZE) {
351 struct page *p; 349 struct page *p;
352 int ret = get_user_pages_fast(start, 1, 0, &p); 350 int ret;
351
352 ret = get_user_pages_fast(start, 1, 0, &p);
353 if (ret != 1) 353 if (ret != 1)
354 return ret; 354 return ret;
355
356 if (PageHWPoison(p)) {
357 put_page(p);
358 continue;
359 }
355 if (bhv == MADV_SOFT_OFFLINE) { 360 if (bhv == MADV_SOFT_OFFLINE) {
356 printk(KERN_INFO "Soft offlining page %lx at %lx\n", 361 pr_info("Soft offlining page %#lx at %#lx\n",
357 page_to_pfn(p), start); 362 page_to_pfn(p), start);
358 ret = soft_offline_page(p, MF_COUNT_INCREASED); 363 ret = soft_offline_page(p, MF_COUNT_INCREASED);
359 if (ret) 364 if (ret)
360 break; 365 return ret;
361 continue; 366 continue;
362 } 367 }
363 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", 368 pr_info("Injecting memory failure for page %#lx at %#lx\n",
364 page_to_pfn(p), start); 369 page_to_pfn(p), start);
365 /* Ignore return value for now */ 370 /* Ignore return value for now */
366 memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); 371 memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
367 } 372 }
368 return ret; 373 return 0;
369} 374}
370#endif 375#endif
371 376
@@ -459,7 +464,7 @@ madvise_behavior_valid(int behavior)
459SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) 464SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
460{ 465{
461 unsigned long end, tmp; 466 unsigned long end, tmp;
462 struct vm_area_struct * vma, *prev; 467 struct vm_area_struct *vma, *prev;
463 int unmapped_error = 0; 468 int unmapped_error = 0;
464 int error = -EINVAL; 469 int error = -EINVAL;
465 int write; 470 int write;
diff --git a/mm/memblock.c b/mm/memblock.c
index a847bfe6f3ba..0ac412a0a7ee 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -914,6 +914,24 @@ int __init_memblock memblock_is_memory(phys_addr_t addr)
914 return memblock_search(&memblock.memory, addr) != -1; 914 return memblock_search(&memblock.memory, addr) != -1;
915} 915}
916 916
917#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
918int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
919 unsigned long *start_pfn, unsigned long *end_pfn)
920{
921 struct memblock_type *type = &memblock.memory;
922 int mid = memblock_search(type, (phys_addr_t)pfn << PAGE_SHIFT);
923
924 if (mid == -1)
925 return -1;
926
927 *start_pfn = type->regions[mid].base >> PAGE_SHIFT;
928 *end_pfn = (type->regions[mid].base + type->regions[mid].size)
929 >> PAGE_SHIFT;
930
931 return type->regions[mid].nid;
932}
933#endif
934
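A hedged sketch of how a caller might use the new lookup: since the returned [start_pfn, end_pfn) range covers a whole memblock region, it can be cached so repeated pfn-to-nid queries in the same region skip the binary search. The caching wrapper below is illustrative only:

#include <linux/memblock.h>

static int example_pfn_to_nid(unsigned long pfn)
{
	static unsigned long cached_start, cached_end;
	static int cached_nid = -1;
	int nid;

	if (cached_start <= pfn && pfn < cached_end)
		return cached_nid;

	nid = memblock_search_pfn_nid(pfn, &cached_start, &cached_end);
	if (nid != -1)
		cached_nid = nid;

	return nid;
}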
917/** 935/**
918 * memblock_is_region_memory - check if a region is a subset of memory 936 * memblock_is_region_memory - check if a region is a subset of memory
919 * @base: base of region to check 937 * @base: base of region to check
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3b83957b6439..c6bd28edd533 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3121,7 +3121,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3121 ssize_t size = memcg_caches_array_size(num_groups); 3121 ssize_t size = memcg_caches_array_size(num_groups);
3122 3122
3123 size *= sizeof(void *); 3123 size *= sizeof(void *);
3124 size += sizeof(struct memcg_cache_params); 3124 size += offsetof(struct memcg_cache_params, memcg_caches);
3125 3125
3126 s->memcg_params = kzalloc(size, GFP_KERNEL); 3126 s->memcg_params = kzalloc(size, GFP_KERNEL);
3127 if (!s->memcg_params) { 3127 if (!s->memcg_params) {
@@ -3164,13 +3164,16 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3164int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, 3164int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
3165 struct kmem_cache *root_cache) 3165 struct kmem_cache *root_cache)
3166{ 3166{
3167 size_t size = sizeof(struct memcg_cache_params); 3167 size_t size;
3168 3168
3169 if (!memcg_kmem_enabled()) 3169 if (!memcg_kmem_enabled())
3170 return 0; 3170 return 0;
3171 3171
3172 if (!memcg) 3172 if (!memcg) {
3173 size = offsetof(struct memcg_cache_params, memcg_caches);
3173 size += memcg_limited_groups_array_size * sizeof(void *); 3174 size += memcg_limited_groups_array_size * sizeof(void *);
3175 } else
3176 size = sizeof(struct memcg_cache_params);
3174 3177
3175 s->memcg_params = kzalloc(size, GFP_KERNEL); 3178 s->memcg_params = kzalloc(size, GFP_KERNEL);
3176 if (!s->memcg_params) 3179 if (!s->memcg_params)
@@ -5588,7 +5591,13 @@ static int compare_thresholds(const void *a, const void *b)
5588 const struct mem_cgroup_threshold *_a = a; 5591 const struct mem_cgroup_threshold *_a = a;
5589 const struct mem_cgroup_threshold *_b = b; 5592 const struct mem_cgroup_threshold *_b = b;
5590 5593
5591 return _a->threshold - _b->threshold; 5594 if (_a->threshold > _b->threshold)
5595 return 1;
5596
5597 if (_a->threshold < _b->threshold)
5598 return -1;
5599
5600 return 0;
5592} 5601}
5593 5602
5594static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 5603static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
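Why the subtraction comparator above was unsafe, as a worked example: the thresholds are 64-bit values while the comparator returns an int. With _a->threshold = 2^32 + 1 and _b->threshold = 1, the difference is 2^32, which truncates to 0 in the 32-bit return value, so sort() would treat the two entries as equal even though _a is larger. The explicit three-way comparison added here cannot truncate.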
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d84c5e5331bb..d472e14c6808 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -206,7 +206,7 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
206#ifdef __ARCH_SI_TRAPNO 206#ifdef __ARCH_SI_TRAPNO
207 si.si_trapno = trapno; 207 si.si_trapno = trapno;
208#endif 208#endif
209 si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; 209 si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
210 210
211 if ((flags & MF_ACTION_REQUIRED) && t == current) { 211 if ((flags & MF_ACTION_REQUIRED) && t == current) {
212 si.si_code = BUS_MCEERR_AR; 212 si.si_code = BUS_MCEERR_AR;
@@ -983,7 +983,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
983static void set_page_hwpoison_huge_page(struct page *hpage) 983static void set_page_hwpoison_huge_page(struct page *hpage)
984{ 984{
985 int i; 985 int i;
986 int nr_pages = 1 << compound_trans_order(hpage); 986 int nr_pages = 1 << compound_order(hpage);
987 for (i = 0; i < nr_pages; i++) 987 for (i = 0; i < nr_pages; i++)
988 SetPageHWPoison(hpage + i); 988 SetPageHWPoison(hpage + i);
989} 989}
@@ -991,7 +991,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage)
991static void clear_page_hwpoison_huge_page(struct page *hpage) 991static void clear_page_hwpoison_huge_page(struct page *hpage)
992{ 992{
993 int i; 993 int i;
994 int nr_pages = 1 << compound_trans_order(hpage); 994 int nr_pages = 1 << compound_order(hpage);
995 for (i = 0; i < nr_pages; i++) 995 for (i = 0; i < nr_pages; i++)
996 ClearPageHWPoison(hpage + i); 996 ClearPageHWPoison(hpage + i);
997} 997}
@@ -1204,6 +1204,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1204 for (ps = error_states;; ps++) 1204 for (ps = error_states;; ps++)
1205 if ((p->flags & ps->mask) == ps->res) 1205 if ((p->flags & ps->mask) == ps->res)
1206 break; 1206 break;
1207
1208 page_flags |= (p->flags & (1UL << PG_dirty));
1209
1207 if (!ps->mask) 1210 if (!ps->mask)
1208 for (ps = error_states;; ps++) 1211 for (ps = error_states;; ps++)
1209 if ((page_flags & ps->mask) == ps->res) 1212 if ((page_flags & ps->mask) == ps->res)
@@ -1339,7 +1342,17 @@ int unpoison_memory(unsigned long pfn)
1339 return 0; 1342 return 0;
1340 } 1343 }
1341 1344
1342 nr_pages = 1 << compound_trans_order(page); 1345 /*
1346 * unpoison_memory() can encounter thp only while the thp is being
1347 * handled by memory_failure() and the page lock is not yet held.
1348 * In that case, we yield to memory_failure() and make unpoison fail.
1349 */
1350 if (PageTransHuge(page)) {
1351 pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
1352 return 0;
1353 }
1354
1355 nr_pages = 1 << compound_order(page);
1343 1356
1344 if (!get_page_unless_zero(page)) { 1357 if (!get_page_unless_zero(page)) {
1345 /* 1358 /*
@@ -1353,7 +1366,7 @@ int unpoison_memory(unsigned long pfn)
1353 return 0; 1366 return 0;
1354 } 1367 }
1355 if (TestClearPageHWPoison(p)) 1368 if (TestClearPageHWPoison(p))
1356 atomic_long_sub(nr_pages, &num_poisoned_pages); 1369 atomic_long_dec(&num_poisoned_pages);
1357 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); 1370 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1358 return 0; 1371 return 0;
1359 } 1372 }
@@ -1375,7 +1388,7 @@ int unpoison_memory(unsigned long pfn)
1375 unlock_page(page); 1388 unlock_page(page);
1376 1389
1377 put_page(page); 1390 put_page(page);
1378 if (freeit) 1391 if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
1379 put_page(page); 1392 put_page(page);
1380 1393
1381 return 0; 1394 return 0;
@@ -1416,7 +1429,8 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1416 * was free. This flag should be kept set until the source page 1429 * was free. This flag should be kept set until the source page
1417 * is freed and PG_hwpoison on it is set. 1430 * is freed and PG_hwpoison on it is set.
1418 */ 1431 */
1419 set_migratetype_isolate(p, true); 1432 if (get_pageblock_migratetype(p) != MIGRATE_ISOLATE)
1433 set_migratetype_isolate(p, true);
1420 /* 1434 /*
1421 * When the target page is a free hugepage, just remove it 1435 * When the target page is a free hugepage, just remove it
1422 * from free hugepage list. 1436 * from free hugepage list.
@@ -1470,6 +1484,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
1470 int ret; 1484 int ret;
1471 unsigned long pfn = page_to_pfn(page); 1485 unsigned long pfn = page_to_pfn(page);
1472 struct page *hpage = compound_head(page); 1486 struct page *hpage = compound_head(page);
1487 LIST_HEAD(pagelist);
1473 1488
1474 /* 1489 /*
1475 * This double-check of PageHWPoison is to avoid the race with 1490 * This double-check of PageHWPoison is to avoid the race with
@@ -1485,86 +1500,29 @@ static int soft_offline_huge_page(struct page *page, int flags)
1485 unlock_page(hpage); 1500 unlock_page(hpage);
1486 1501
1487 /* Keep page count to indicate a given hugepage is isolated. */ 1502 /* Keep page count to indicate a given hugepage is isolated. */
1488 ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, 1503 list_move(&hpage->lru, &pagelist);
1489 MIGRATE_SYNC); 1504 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1490 put_page(hpage); 1505 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1491 if (ret) { 1506 if (ret) {
1492 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1507 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1493 pfn, ret, page->flags); 1508 pfn, ret, page->flags);
1509 /*
1510 * We know that soft_offline_huge_page() tries to migrate
1511 * only one hugepage pointed to by hpage, so we need not
1512 * run through the pagelist here.
1513 */
1514 putback_active_hugepage(hpage);
1515 if (ret > 0)
1516 ret = -EIO;
1494 } else { 1517 } else {
1495 set_page_hwpoison_huge_page(hpage); 1518 set_page_hwpoison_huge_page(hpage);
1496 dequeue_hwpoisoned_huge_page(hpage); 1519 dequeue_hwpoisoned_huge_page(hpage);
1497 atomic_long_add(1 << compound_trans_order(hpage), 1520 atomic_long_add(1 << compound_order(hpage),
1498 &num_poisoned_pages); 1521 &num_poisoned_pages);
1499 } 1522 }
1500 return ret; 1523 return ret;
1501} 1524}
1502 1525
1503static int __soft_offline_page(struct page *page, int flags);
1504
1505/**
1506 * soft_offline_page - Soft offline a page.
1507 * @page: page to offline
1508 * @flags: flags. Same as memory_failure().
1509 *
1510 * Returns 0 on success, otherwise negated errno.
1511 *
1512 * Soft offline a page, by migration or invalidation,
1513 * without killing anything. This is for the case when
1514 * a page is not corrupted yet (so it's still valid to access),
1515 * but has had a number of corrected errors and is better taken
1516 * out.
1517 *
1518 * The actual policy on when to do that is maintained by
1519 * user space.
1520 *
1521 * This should never impact any application or cause data loss,
1522 * however it might take some time.
1523 *
1524 * This is not a 100% solution for all memory, but tries to be
1525 * ``good enough'' for the majority of memory.
1526 */
1527int soft_offline_page(struct page *page, int flags)
1528{
1529 int ret;
1530 unsigned long pfn = page_to_pfn(page);
1531 struct page *hpage = compound_trans_head(page);
1532
1533 if (PageHWPoison(page)) {
1534 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1535 return -EBUSY;
1536 }
1537 if (!PageHuge(page) && PageTransHuge(hpage)) {
1538 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
1539 pr_info("soft offline: %#lx: failed to split THP\n",
1540 pfn);
1541 return -EBUSY;
1542 }
1543 }
1544
1545 ret = get_any_page(page, pfn, flags);
1546 if (ret < 0)
1547 return ret;
1548 if (ret) { /* for in-use pages */
1549 if (PageHuge(page))
1550 ret = soft_offline_huge_page(page, flags);
1551 else
1552 ret = __soft_offline_page(page, flags);
1553 } else { /* for free pages */
1554 if (PageHuge(page)) {
1555 set_page_hwpoison_huge_page(hpage);
1556 dequeue_hwpoisoned_huge_page(hpage);
1557 atomic_long_add(1 << compound_trans_order(hpage),
1558 &num_poisoned_pages);
1559 } else {
1560 SetPageHWPoison(page);
1561 atomic_long_inc(&num_poisoned_pages);
1562 }
1563 }
1564 unset_migratetype_isolate(page, MIGRATE_MOVABLE);
1565 return ret;
1566}
1567
1568static int __soft_offline_page(struct page *page, int flags) 1526static int __soft_offline_page(struct page *page, int flags)
1569{ 1527{
1570 int ret; 1528 int ret;
@@ -1651,3 +1609,67 @@ static int __soft_offline_page(struct page *page, int flags)
1651 } 1609 }
1652 return ret; 1610 return ret;
1653} 1611}
1612
1613/**
1614 * soft_offline_page - Soft offline a page.
1615 * @page: page to offline
1616 * @flags: flags. Same as memory_failure().
1617 *
1618 * Returns 0 on success, otherwise negated errno.
1619 *
1620 * Soft offline a page, by migration or invalidation,
1621 * without killing anything. This is for the case when
1622 * a page is not corrupted yet (so it's still valid to access),
1623 * but has had a number of corrected errors and is better taken
1624 * out.
1625 *
1626 * The actual policy on when to do that is maintained by
1627 * user space.
1628 *
1629 * This should never impact any application or cause data loss,
1630 * however it might take some time.
1631 *
1632 * This is not a 100% solution for all memory, but tries to be
1633 * ``good enough'' for the majority of memory.
1634 */
1635int soft_offline_page(struct page *page, int flags)
1636{
1637 int ret;
1638 unsigned long pfn = page_to_pfn(page);
1639 struct page *hpage = compound_trans_head(page);
1640
1641 if (PageHWPoison(page)) {
1642 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1643 return -EBUSY;
1644 }
1645 if (!PageHuge(page) && PageTransHuge(hpage)) {
1646 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
1647 pr_info("soft offline: %#lx: failed to split THP\n",
1648 pfn);
1649 return -EBUSY;
1650 }
1651 }
1652
1653 ret = get_any_page(page, pfn, flags);
1654 if (ret < 0)
1655 goto unset;
1656 if (ret) { /* for in-use pages */
1657 if (PageHuge(page))
1658 ret = soft_offline_huge_page(page, flags);
1659 else
1660 ret = __soft_offline_page(page, flags);
1661 } else { /* for free pages */
1662 if (PageHuge(page)) {
1663 set_page_hwpoison_huge_page(hpage);
1664 dequeue_hwpoisoned_huge_page(hpage);
1665 atomic_long_add(1 << compound_order(hpage),
1666 &num_poisoned_pages);
1667 } else {
1668 SetPageHWPoison(page);
1669 atomic_long_inc(&num_poisoned_pages);
1670 }
1671 }
1672unset:
1673 unset_migratetype_isolate(page, MIGRATE_MOVABLE);
1674 return ret;
1675}
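The kernel-doc above leaves the decision of when to soft-offline a page to user space. For orientation only, the sketch below shows how a corrected-error policy daemon could reach soft_offline_page() through the sysfs trigger exposed by drivers/base/memory.c on CONFIG_MEMORY_FAILURE kernels; the trigger is assumed to accept a physical address, and the address used here is invented.

/*
 * Hypothetical user-space sketch, not part of this patch: ask the kernel to
 * soft-offline the page backing a physical address reported by a corrected
 * error log.  Requires CAP_SYS_ADMIN; error handling is deliberately minimal.
 */
#include <stdio.h>
#include <stdint.h>

static int soft_offline_paddr(uint64_t paddr)
{
	FILE *f = fopen("/sys/devices/system/memory/soft_offline_page", "w");

	if (!f)
		return -1;
	/* the kernel converts the physical address to a pfn internally */
	fprintf(f, "%#llx\n", (unsigned long long)paddr);
	return fclose(f);
}

int main(void)
{
	return soft_offline_paddr(0x200000ULL) ? 1 : 0;	/* invented address */
}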
diff --git a/mm/memory.c b/mm/memory.c
index b3c6bf9a398e..2b73dbde2274 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -373,30 +373,6 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
373#endif /* CONFIG_HAVE_RCU_TABLE_FREE */ 373#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
374 374
375/* 375/*
376 * If a p?d_bad entry is found while walking page tables, report
377 * the error, before resetting entry to p?d_none. Usually (but
378 * very seldom) called out from the p?d_none_or_clear_bad macros.
379 */
380
381void pgd_clear_bad(pgd_t *pgd)
382{
383 pgd_ERROR(*pgd);
384 pgd_clear(pgd);
385}
386
387void pud_clear_bad(pud_t *pud)
388{
389 pud_ERROR(*pud);
390 pud_clear(pud);
391}
392
393void pmd_clear_bad(pmd_t *pmd)
394{
395 pmd_ERROR(*pmd);
396 pmd_clear(pmd);
397}
398
399/*
400 * Note: this doesn't free the actual pages themselves. That 376 * Note: this doesn't free the actual pages themselves. That
401 * has been handled earlier when unmapping all the memory regions. 377 * has been handled earlier when unmapping all the memory regions.
402 */ 378 */
@@ -1505,7 +1481,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
1505 if (pud_none(*pud)) 1481 if (pud_none(*pud))
1506 goto no_page_table; 1482 goto no_page_table;
1507 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { 1483 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
1508 BUG_ON(flags & FOLL_GET); 1484 if (flags & FOLL_GET)
1485 goto out;
1509 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); 1486 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1510 goto out; 1487 goto out;
1511 } 1488 }
@@ -1516,8 +1493,20 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
1516 if (pmd_none(*pmd)) 1493 if (pmd_none(*pmd))
1517 goto no_page_table; 1494 goto no_page_table;
1518 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { 1495 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
1519 BUG_ON(flags & FOLL_GET);
1520 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1496 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1497 if (flags & FOLL_GET) {
1498 /*
1499 * Refcount on tail pages are not well-defined and
1500 * shouldn't be taken. The caller should handle a NULL
1501 * return when trying to follow tail pages.
1502 */
1503 if (PageHead(page))
1504 get_page(page);
1505 else {
1506 page = NULL;
1507 goto out;
1508 }
1509 }
1521 goto out; 1510 goto out;
1522 } 1511 }
1523 if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) 1512 if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
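The FOLL_GET branch added above now refuses to pin a hugetlb tail page and hands back NULL instead of hitting the old BUG_ON(), so callers that pass FOLL_GET must tolerate that outcome. One way a caller might cope, sketched here for illustration only (munlock_vma_pages_range() in the mm/mlock.c hunks below follows the same shape):

/*
 * Illustrative kernel-context sketch, not from this patch: pin one page of a
 * vma and accept that the hugetlb pmd path may legitimately return NULL.
 */
static void example_touch_one_page(struct vm_area_struct *vma, unsigned long addr)
{
	unsigned int page_mask;
	struct page *page;

	page = follow_page_mask(vma, addr, FOLL_GET | FOLL_DUMP, &page_mask);
	if (!page || IS_ERR(page))
		return;			/* tail page, hole or error: nothing pinned */

	/* ... operate on the pinned (head) page ... */

	put_page(page);			/* drop the reference taken by FOLL_GET */
}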
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ca1dd3aa5eee..0eb1a1df649d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -30,6 +30,7 @@
30#include <linux/mm_inline.h> 30#include <linux/mm_inline.h>
31#include <linux/firmware-map.h> 31#include <linux/firmware-map.h>
32#include <linux/stop_machine.h> 32#include <linux/stop_machine.h>
33#include <linux/hugetlb.h>
33 34
34#include <asm/tlbflush.h> 35#include <asm/tlbflush.h>
35 36
@@ -194,7 +195,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
194 195
195 zone = &pgdat->node_zones[0]; 196 zone = &pgdat->node_zones[0];
196 for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { 197 for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
197 if (zone->wait_table) { 198 if (zone_is_initialized(zone)) {
198 nr_pages = zone->wait_table_hash_nr_entries 199 nr_pages = zone->wait_table_hash_nr_entries
199 * sizeof(wait_queue_head_t); 200 * sizeof(wait_queue_head_t);
200 nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; 201 nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
@@ -229,8 +230,8 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
229 230
230 zone_span_writelock(zone); 231 zone_span_writelock(zone);
231 232
232 old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; 233 old_zone_end_pfn = zone_end_pfn(zone);
233 if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn) 234 if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
234 zone->zone_start_pfn = start_pfn; 235 zone->zone_start_pfn = start_pfn;
235 236
236 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - 237 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
@@ -305,7 +306,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
305 goto out_fail; 306 goto out_fail;
306 307
307 /* use start_pfn for z1's start_pfn if z1 is empty */ 308 /* use start_pfn for z1's start_pfn if z1 is empty */
308 if (z1->spanned_pages) 309 if (!zone_is_empty(z1))
309 z1_start_pfn = z1->zone_start_pfn; 310 z1_start_pfn = z1->zone_start_pfn;
310 else 311 else
311 z1_start_pfn = start_pfn; 312 z1_start_pfn = start_pfn;
@@ -347,7 +348,7 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
347 goto out_fail; 348 goto out_fail;
348 349
349 /* use end_pfn for z2's end_pfn if z2 is empty */ 350 /* use end_pfn for z2's end_pfn if z2 is empty */
350 if (z2->spanned_pages) 351 if (!zone_is_empty(z2))
351 z2_end_pfn = zone_end_pfn(z2); 352 z2_end_pfn = zone_end_pfn(z2);
352 else 353 else
353 z2_end_pfn = end_pfn; 354 z2_end_pfn = end_pfn;
@@ -514,8 +515,9 @@ static int find_biggest_section_pfn(int nid, struct zone *zone,
514static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, 515static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
515 unsigned long end_pfn) 516 unsigned long end_pfn)
516{ 517{
517 unsigned long zone_start_pfn = zone->zone_start_pfn; 518 unsigned long zone_start_pfn = zone->zone_start_pfn;
518 unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; 519 unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
520 unsigned long zone_end_pfn = z;
519 unsigned long pfn; 521 unsigned long pfn;
520 struct mem_section *ms; 522 struct mem_section *ms;
521 int nid = zone_to_nid(zone); 523 int nid = zone_to_nid(zone);
@@ -1069,6 +1071,23 @@ out:
1069 return ret; 1071 return ret;
1070} 1072}
1071 1073
1074static int check_hotplug_memory_range(u64 start, u64 size)
1075{
1076 u64 start_pfn = start >> PAGE_SHIFT;
1077 u64 nr_pages = size >> PAGE_SHIFT;
1078
1079 /* Memory range must be aligned with section */
1080 if ((start_pfn & ~PAGE_SECTION_MASK) ||
1081 (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) {
1082 pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n",
1083 (unsigned long long)start,
1084 (unsigned long long)size);
1085 return -EINVAL;
1086 }
1087
1088 return 0;
1089}
1090
1072/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1091/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
1073int __ref add_memory(int nid, u64 start, u64 size) 1092int __ref add_memory(int nid, u64 start, u64 size)
1074{ 1093{
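check_hotplug_memory_range() above rejects any range that does not start and end on a memory-section boundary. As a stand-alone illustration of the same arithmetic, the sketch below hard-codes the common x86_64 values (4 KiB pages, 128 MiB sections); the real PAGE_SHIFT and SECTION_SIZE_BITS are architecture and configuration dependent, so treat the constants as assumptions.

/* User-space illustration of the section-alignment rule enforced above. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT		12
#define SECTION_SIZE_BITS	27			/* 128 MiB sections, assumed */
#define PFN_SECTION_SHIFT	(SECTION_SIZE_BITS - PAGE_SHIFT)
#define PAGES_PER_SECTION	(1UL << PFN_SECTION_SHIFT)
#define PAGE_SECTION_MASK	(~(PAGES_PER_SECTION - 1))

static int range_is_section_aligned(uint64_t start, uint64_t size)
{
	uint64_t start_pfn = start >> PAGE_SHIFT;
	uint64_t nr_pages = size >> PAGE_SHIFT;

	return !(start_pfn & ~PAGE_SECTION_MASK) &&
	       !(nr_pages % PAGES_PER_SECTION) && nr_pages != 0;
}

int main(void)
{
	/* 128 MiB at a 128 MiB boundary passes; a 64 MiB chunk is rejected */
	printf("%d %d\n",
	       range_is_section_aligned(0x8000000ULL, 0x8000000ULL),
	       range_is_section_aligned(0x8000000ULL, 0x4000000ULL));
	return 0;
}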
@@ -1078,6 +1097,10 @@ int __ref add_memory(int nid, u64 start, u64 size)
1078 struct resource *res; 1097 struct resource *res;
1079 int ret; 1098 int ret;
1080 1099
1100 ret = check_hotplug_memory_range(start, size);
1101 if (ret)
1102 return ret;
1103
1081 lock_memory_hotplug(); 1104 lock_memory_hotplug();
1082 1105
1083 res = register_memory_resource(start, size); 1106 res = register_memory_resource(start, size);
@@ -1208,10 +1231,12 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
1208} 1231}
1209 1232
1210/* 1233/*
1211 * Scanning pfn is much easier than scanning lru list. 1234 * Scan pfn range [start,end) to find movable/migratable pages (LRU pages
1212 * Scan pfn from start to end and Find LRU page. 1235 * and hugepages). We scan pfn because it's much easier than scanning over
1236 * linked list. This function returns the pfn of the first found movable
1237 * page if it's found, otherwise 0.
1213 */ 1238 */
1214static unsigned long scan_lru_pages(unsigned long start, unsigned long end) 1239static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
1215{ 1240{
1216 unsigned long pfn; 1241 unsigned long pfn;
1217 struct page *page; 1242 struct page *page;
@@ -1220,6 +1245,13 @@ static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
1220 page = pfn_to_page(pfn); 1245 page = pfn_to_page(pfn);
1221 if (PageLRU(page)) 1246 if (PageLRU(page))
1222 return pfn; 1247 return pfn;
1248 if (PageHuge(page)) {
1249 if (is_hugepage_active(page))
1250 return pfn;
1251 else
1252 pfn = round_up(pfn + 1,
1253 1 << compound_order(page)) - 1;
1254 }
1223 } 1255 }
1224 } 1256 }
1225 return 0; 1257 return 0;
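When scan_movable_pages() above hits a hugepage that is not active, it skips straight past the compound page instead of testing every tail pfn. A tiny worked example of that rounding, assuming a 512-page (order-9, 2 MiB) hugepage purely for illustration:

/* Stand-alone illustration of the skip arithmetic used above. */
#include <stdio.h>

static unsigned long round_up_ul(unsigned long x, unsigned long align)
{
	return (x + align - 1) & ~(align - 1);
}

int main(void)
{
	unsigned long pfn = 0x20135;	/* somewhere inside the hugepage at 0x20000 */
	unsigned long last = round_up_ul(pfn + 1, 512) - 1;

	/* prints 0x201ff: the last tail pfn, so the loop's pfn++ resumes after it */
	printf("%#lx\n", last);
	return 0;
}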
@@ -1240,6 +1272,19 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1240 if (!pfn_valid(pfn)) 1272 if (!pfn_valid(pfn))
1241 continue; 1273 continue;
1242 page = pfn_to_page(pfn); 1274 page = pfn_to_page(pfn);
1275
1276 if (PageHuge(page)) {
1277 struct page *head = compound_head(page);
1278 pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
1279 if (compound_order(head) > PFN_SECTION_SHIFT) {
1280 ret = -EBUSY;
1281 break;
1282 }
1283 if (isolate_huge_page(page, &source))
1284 move_pages -= 1 << compound_order(head);
1285 continue;
1286 }
1287
1243 if (!get_page_unless_zero(page)) 1288 if (!get_page_unless_zero(page))
1244 continue; 1289 continue;
1245 /* 1290 /*
@@ -1272,7 +1317,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1272 } 1317 }
1273 if (!list_empty(&source)) { 1318 if (!list_empty(&source)) {
1274 if (not_managed) { 1319 if (not_managed) {
1275 putback_lru_pages(&source); 1320 putback_movable_pages(&source);
1276 goto out; 1321 goto out;
1277 } 1322 }
1278 1323
@@ -1283,7 +1328,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1283 ret = migrate_pages(&source, alloc_migrate_target, 0, 1328 ret = migrate_pages(&source, alloc_migrate_target, 0,
1284 MIGRATE_SYNC, MR_MEMORY_HOTPLUG); 1329 MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
1285 if (ret) 1330 if (ret)
1286 putback_lru_pages(&source); 1331 putback_movable_pages(&source);
1287 } 1332 }
1288out: 1333out:
1289 return ret; 1334 return ret;
@@ -1472,7 +1517,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
1472 struct zone *zone; 1517 struct zone *zone;
1473 struct memory_notify arg; 1518 struct memory_notify arg;
1474 1519
1475 BUG_ON(start_pfn >= end_pfn);
1476 /* at least, alignment against pageblock is necessary */ 1520 /* at least, alignment against pageblock is necessary */
1477 if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) 1521 if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
1478 return -EINVAL; 1522 return -EINVAL;
@@ -1527,8 +1571,8 @@ repeat:
1527 drain_all_pages(); 1571 drain_all_pages();
1528 } 1572 }
1529 1573
1530 pfn = scan_lru_pages(start_pfn, end_pfn); 1574 pfn = scan_movable_pages(start_pfn, end_pfn);
1531 if (pfn) { /* We have page on LRU */ 1575 if (pfn) { /* We have movable pages */
1532 ret = do_migrate_range(pfn, end_pfn); 1576 ret = do_migrate_range(pfn, end_pfn);
1533 if (!ret) { 1577 if (!ret) {
1534 drain = 1; 1578 drain = 1;
@@ -1547,6 +1591,11 @@ repeat:
1547 yield(); 1591 yield();
1548 /* drain pcp pages, this is synchronous. */ 1592 /* drain pcp pages, this is synchronous. */
1549 drain_all_pages(); 1593 drain_all_pages();
1594 /*
1595 * dissolve free hugepages in the memory block before doing offlining
1596 * actually in order to make hugetlbfs's object counting consistent.
1597 */
1598 dissolve_free_huge_pages(start_pfn, end_pfn);
1550 /* check again */ 1599 /* check again */
1551 offlined_pages = check_pages_isolated(start_pfn, end_pfn); 1600 offlined_pages = check_pages_isolated(start_pfn, end_pfn);
1552 if (offlined_pages < 0) { 1601 if (offlined_pages < 0) {
@@ -1674,9 +1723,8 @@ static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
1674 return ret; 1723 return ret;
1675} 1724}
1676 1725
1677static int check_cpu_on_node(void *data) 1726static int check_cpu_on_node(pg_data_t *pgdat)
1678{ 1727{
1679 struct pglist_data *pgdat = data;
1680 int cpu; 1728 int cpu;
1681 1729
1682 for_each_present_cpu(cpu) { 1730 for_each_present_cpu(cpu) {
@@ -1691,10 +1739,9 @@ static int check_cpu_on_node(void *data)
1691 return 0; 1739 return 0;
1692} 1740}
1693 1741
1694static void unmap_cpu_on_node(void *data) 1742static void unmap_cpu_on_node(pg_data_t *pgdat)
1695{ 1743{
1696#ifdef CONFIG_ACPI_NUMA 1744#ifdef CONFIG_ACPI_NUMA
1697 struct pglist_data *pgdat = data;
1698 int cpu; 1745 int cpu;
1699 1746
1700 for_each_possible_cpu(cpu) 1747 for_each_possible_cpu(cpu)
@@ -1703,10 +1750,11 @@ static void unmap_cpu_on_node(void *data)
1703#endif 1750#endif
1704} 1751}
1705 1752
1706static int check_and_unmap_cpu_on_node(void *data) 1753static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
1707{ 1754{
1708 int ret = check_cpu_on_node(data); 1755 int ret;
1709 1756
1757 ret = check_cpu_on_node(pgdat);
1710 if (ret) 1758 if (ret)
1711 return ret; 1759 return ret;
1712 1760
@@ -1715,11 +1763,18 @@ static int check_and_unmap_cpu_on_node(void *data)
1715 * the cpu_to_node() now. 1763 * the cpu_to_node() now.
1716 */ 1764 */
1717 1765
1718 unmap_cpu_on_node(data); 1766 unmap_cpu_on_node(pgdat);
1719 return 0; 1767 return 0;
1720} 1768}
1721 1769
1722/* offline the node if all memory sections of this node are removed */ 1770/**
1771 * try_offline_node
1772 *
1773 * Offline a node if all memory sections and cpus of the node are removed.
1774 *
1775 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
1776 * and online/offline operations before this call.
1777 */
1723void try_offline_node(int nid) 1778void try_offline_node(int nid)
1724{ 1779{
1725 pg_data_t *pgdat = NODE_DATA(nid); 1780 pg_data_t *pgdat = NODE_DATA(nid);
@@ -1745,7 +1800,7 @@ void try_offline_node(int nid)
1745 return; 1800 return;
1746 } 1801 }
1747 1802
1748 if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL)) 1803 if (check_and_unmap_cpu_on_node(pgdat))
1749 return; 1804 return;
1750 1805
1751 /* 1806 /*
@@ -1782,10 +1837,19 @@ void try_offline_node(int nid)
1782} 1837}
1783EXPORT_SYMBOL(try_offline_node); 1838EXPORT_SYMBOL(try_offline_node);
1784 1839
1840/**
1841 * remove_memory
1842 *
1843 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
1844 * and online/offline operations before this call, as required by
1845 * try_offline_node().
1846 */
1785void __ref remove_memory(int nid, u64 start, u64 size) 1847void __ref remove_memory(int nid, u64 start, u64 size)
1786{ 1848{
1787 int ret; 1849 int ret;
1788 1850
1851 BUG_ON(check_hotplug_memory_range(start, size));
1852
1789 lock_memory_hotplug(); 1853 lock_memory_hotplug();
1790 1854
1791 /* 1855 /*
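Both kernel-doc comments added above make the same demand: the caller has to hold the device-hotplug lock around remove_memory() and try_offline_node() so they cannot race with online/offline operations. A hedged sketch of that caller shape follows; the wrapper function is invented for illustration, the ACPI memory-hotplug path being one real user of this pattern.

/*
 * Illustrative kernel-context sketch, not from this patch: tear down a
 * node's memory and then try to offline the node, serialized as the new
 * kernel-doc above requires.
 */
static void example_remove_node_memory(int nid, u64 start, u64 size)
{
	lock_device_hotplug();

	/* remove the memory blocks backing [start, start + size) */
	remove_memory(nid, start, size);

	/* offline the node itself if no memory and no cpus are left on it */
	try_offline_node(nid);

	unlock_device_hotplug();
}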
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4baf12e534d1..04729647f359 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -123,16 +123,19 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES];
123static struct mempolicy *get_task_policy(struct task_struct *p) 123static struct mempolicy *get_task_policy(struct task_struct *p)
124{ 124{
125 struct mempolicy *pol = p->mempolicy; 125 struct mempolicy *pol = p->mempolicy;
126 int node;
127 126
128 if (!pol) { 127 if (!pol) {
129 node = numa_node_id(); 128 int node = numa_node_id();
130 if (node != NUMA_NO_NODE)
131 pol = &preferred_node_policy[node];
132 129
133 /* preferred_node_policy is not initialised early in boot */ 130 if (node != NUMA_NO_NODE) {
134 if (!pol->mode) 131 pol = &preferred_node_policy[node];
135 pol = NULL; 132 /*
133 * preferred_node_policy is not initialised early in
134 * boot
135 */
136 if (!pol->mode)
137 pol = NULL;
138 }
136 } 139 }
137 140
138 return pol; 141 return pol;
@@ -473,8 +476,11 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
473static void migrate_page_add(struct page *page, struct list_head *pagelist, 476static void migrate_page_add(struct page *page, struct list_head *pagelist,
474 unsigned long flags); 477 unsigned long flags);
475 478
476/* Scan through pages checking if pages follow certain conditions. */ 479/*
477static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 480 * Scan through pages checking if pages follow certain conditions,
481 * and move them to the pagelist if they do.
482 */
483static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
478 unsigned long addr, unsigned long end, 484 unsigned long addr, unsigned long end,
479 const nodemask_t *nodes, unsigned long flags, 485 const nodemask_t *nodes, unsigned long flags,
480 void *private) 486 void *private)
@@ -512,7 +518,31 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
512 return addr != end; 518 return addr != end;
513} 519}
514 520
515static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, 521static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
522 pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
523 void *private)
524{
525#ifdef CONFIG_HUGETLB_PAGE
526 int nid;
527 struct page *page;
528
529 spin_lock(&vma->vm_mm->page_table_lock);
530 page = pte_page(huge_ptep_get((pte_t *)pmd));
531 nid = page_to_nid(page);
532 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
533 goto unlock;
534 /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
535 if (flags & (MPOL_MF_MOVE_ALL) ||
536 (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
537 isolate_huge_page(page, private);
538unlock:
539 spin_unlock(&vma->vm_mm->page_table_lock);
540#else
541 BUG();
542#endif
543}
544
545static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
516 unsigned long addr, unsigned long end, 546 unsigned long addr, unsigned long end,
517 const nodemask_t *nodes, unsigned long flags, 547 const nodemask_t *nodes, unsigned long flags,
518 void *private) 548 void *private)
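The node test in queue_pages_hugetlb_pmd_range() above folds MPOL_MF_INVERT into a single comparison: the hugepage is skipped whenever its node's membership in @nodes equals the inversion flag. The small sketch below, with the four cases spelled out, is illustrative only; the helper name is invented.

#include <stdbool.h>

/*
 * Stand-alone illustration of the skip test used above:
 *   skip when node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)
 *
 *   page's node in @nodes | MPOL_MF_INVERT | result
 *   ----------------------+----------------+------------------------------------
 *           yes           |     clear      | queue  (migrate_pages(): the page
 *                         |                |         sits on a source node)
 *           no            |     clear      | skip
 *           yes           |      set       | skip   (mbind(): already on an
 *                         |                |         allowed node)
 *           no            |      set       | queue  (mbind(): must move onto
 *                         |                |         one of @nodes)
 */
static bool queue_this_page(bool node_in_mask, bool invert)
{
	return node_in_mask != invert;
}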
@@ -523,17 +553,24 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
523 pmd = pmd_offset(pud, addr); 553 pmd = pmd_offset(pud, addr);
524 do { 554 do {
525 next = pmd_addr_end(addr, end); 555 next = pmd_addr_end(addr, end);
556 if (!pmd_present(*pmd))
557 continue;
558 if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
559 queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
560 flags, private);
561 continue;
562 }
526 split_huge_page_pmd(vma, addr, pmd); 563 split_huge_page_pmd(vma, addr, pmd);
527 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 564 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
528 continue; 565 continue;
529 if (check_pte_range(vma, pmd, addr, next, nodes, 566 if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
530 flags, private)) 567 flags, private))
531 return -EIO; 568 return -EIO;
532 } while (pmd++, addr = next, addr != end); 569 } while (pmd++, addr = next, addr != end);
533 return 0; 570 return 0;
534} 571}
535 572
536static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 573static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
537 unsigned long addr, unsigned long end, 574 unsigned long addr, unsigned long end,
538 const nodemask_t *nodes, unsigned long flags, 575 const nodemask_t *nodes, unsigned long flags,
539 void *private) 576 void *private)
@@ -544,16 +581,18 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
544 pud = pud_offset(pgd, addr); 581 pud = pud_offset(pgd, addr);
545 do { 582 do {
546 next = pud_addr_end(addr, end); 583 next = pud_addr_end(addr, end);
584 if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
585 continue;
547 if (pud_none_or_clear_bad(pud)) 586 if (pud_none_or_clear_bad(pud))
548 continue; 587 continue;
549 if (check_pmd_range(vma, pud, addr, next, nodes, 588 if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
550 flags, private)) 589 flags, private))
551 return -EIO; 590 return -EIO;
552 } while (pud++, addr = next, addr != end); 591 } while (pud++, addr = next, addr != end);
553 return 0; 592 return 0;
554} 593}
555 594
556static inline int check_pgd_range(struct vm_area_struct *vma, 595static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
557 unsigned long addr, unsigned long end, 596 unsigned long addr, unsigned long end,
558 const nodemask_t *nodes, unsigned long flags, 597 const nodemask_t *nodes, unsigned long flags,
559 void *private) 598 void *private)
@@ -566,7 +605,7 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
566 next = pgd_addr_end(addr, end); 605 next = pgd_addr_end(addr, end);
567 if (pgd_none_or_clear_bad(pgd)) 606 if (pgd_none_or_clear_bad(pgd))
568 continue; 607 continue;
569 if (check_pud_range(vma, pgd, addr, next, nodes, 608 if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
570 flags, private)) 609 flags, private))
571 return -EIO; 610 return -EIO;
572 } while (pgd++, addr = next, addr != end); 611 } while (pgd++, addr = next, addr != end);
@@ -604,12 +643,14 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
604#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ 643#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
605 644
606/* 645/*
607 * Check if all pages in a range are on a set of nodes. 646 * Walk through page tables and collect pages to be migrated.
608 * If pagelist != NULL then isolate pages from the LRU and 647 *
609 * put them on the pagelist. 648 * If pages found in a given range are on a set of nodes (determined by
649 * @nodes and @flags), they are isolated and queued to the pagelist
650 * passed via @private.
610 */ 651 */
611static struct vm_area_struct * 652static struct vm_area_struct *
612check_range(struct mm_struct *mm, unsigned long start, unsigned long end, 653queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
613 const nodemask_t *nodes, unsigned long flags, void *private) 654 const nodemask_t *nodes, unsigned long flags, void *private)
614{ 655{
615 int err; 656 int err;
@@ -635,9 +676,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
635 return ERR_PTR(-EFAULT); 676 return ERR_PTR(-EFAULT);
636 } 677 }
637 678
638 if (is_vm_hugetlb_page(vma))
639 goto next;
640
641 if (flags & MPOL_MF_LAZY) { 679 if (flags & MPOL_MF_LAZY) {
642 change_prot_numa(vma, start, endvma); 680 change_prot_numa(vma, start, endvma);
643 goto next; 681 goto next;
@@ -647,7 +685,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
647 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && 685 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
648 vma_migratable(vma))) { 686 vma_migratable(vma))) {
649 687
650 err = check_pgd_range(vma, start, endvma, nodes, 688 err = queue_pages_pgd_range(vma, start, endvma, nodes,
651 flags, private); 689 flags, private);
652 if (err) { 690 if (err) {
653 first = ERR_PTR(err); 691 first = ERR_PTR(err);
@@ -990,7 +1028,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
990 1028
991static struct page *new_node_page(struct page *page, unsigned long node, int **x) 1029static struct page *new_node_page(struct page *page, unsigned long node, int **x)
992{ 1030{
993 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); 1031 if (PageHuge(page))
1032 return alloc_huge_page_node(page_hstate(compound_head(page)),
1033 node);
1034 else
1035 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
994} 1036}
995 1037
996/* 1038/*
@@ -1013,14 +1055,14 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1013 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. 1055 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1014 */ 1056 */
1015 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); 1057 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1016 check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, 1058 queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1017 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 1059 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1018 1060
1019 if (!list_empty(&pagelist)) { 1061 if (!list_empty(&pagelist)) {
1020 err = migrate_pages(&pagelist, new_node_page, dest, 1062 err = migrate_pages(&pagelist, new_node_page, dest,
1021 MIGRATE_SYNC, MR_SYSCALL); 1063 MIGRATE_SYNC, MR_SYSCALL);
1022 if (err) 1064 if (err)
1023 putback_lru_pages(&pagelist); 1065 putback_movable_pages(&pagelist);
1024 } 1066 }
1025 1067
1026 return err; 1068 return err;
@@ -1154,10 +1196,14 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
1154 break; 1196 break;
1155 vma = vma->vm_next; 1197 vma = vma->vm_next;
1156 } 1198 }
1157
1158 /* 1199 /*
1159 * if !vma, alloc_page_vma() will use task or system default policy 1200 * queue_pages_range() confirms that @page belongs to some vma,
1201 * so vma shouldn't be NULL.
1160 */ 1202 */
1203 BUG_ON(!vma);
1204
1205 if (PageHuge(page))
1206 return alloc_huge_page_noerr(vma, address, 1);
1161 return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1207 return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1162} 1208}
1163#else 1209#else
@@ -1249,7 +1295,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1249 if (err) 1295 if (err)
1250 goto mpol_out; 1296 goto mpol_out;
1251 1297
1252 vma = check_range(mm, start, end, nmask, 1298 vma = queue_pages_range(mm, start, end, nmask,
1253 flags | MPOL_MF_INVERT, &pagelist); 1299 flags | MPOL_MF_INVERT, &pagelist);
1254 1300
1255 err = PTR_ERR(vma); /* maybe ... */ 1301 err = PTR_ERR(vma); /* maybe ... */
@@ -1265,7 +1311,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1265 (unsigned long)vma, 1311 (unsigned long)vma,
1266 MIGRATE_SYNC, MR_MEMPOLICY_MBIND); 1312 MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1267 if (nr_failed) 1313 if (nr_failed)
1268 putback_lru_pages(&pagelist); 1314 putback_movable_pages(&pagelist);
1269 } 1315 }
1270 1316
1271 if (nr_failed && (flags & MPOL_MF_STRICT)) 1317 if (nr_failed && (flags & MPOL_MF_STRICT))
@@ -2065,6 +2111,16 @@ retry_cpuset:
2065} 2111}
2066EXPORT_SYMBOL(alloc_pages_current); 2112EXPORT_SYMBOL(alloc_pages_current);
2067 2113
2114int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2115{
2116 struct mempolicy *pol = mpol_dup(vma_policy(src));
2117
2118 if (IS_ERR(pol))
2119 return PTR_ERR(pol);
2120 dst->vm_policy = pol;
2121 return 0;
2122}
2123
2068/* 2124/*
2069 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it 2125 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2070 * rebinds the mempolicy its copying by calling mpol_rebind_policy() 2126 * rebinds the mempolicy its copying by calling mpol_rebind_policy()
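vma_dup_policy() introduced above hides the mpol_dup()/IS_ERR()/vma_set_policy() sequence behind one call; the mm/mmap.c hunks later in this patch convert __split_vma() and copy_vma() to it. A minimal sketch of the caller shape those conversions follow (the function name here is invented, the unwinding mirrors __split_vma()):

/* Illustrative kernel-context sketch of the new helper's usage. */
static int example_copy_policy(struct vm_area_struct *vma,
			       struct vm_area_struct *new)
{
	int err = vma_dup_policy(vma, new);

	if (err)
		return err;		/* nothing to undo: no policy was installed */

	if (anon_vma_clone(new, vma)) {
		/* later failure paths drop the duplicated policy again */
		mpol_put(vma_policy(new));
		return -ENOMEM;
	}
	return 0;
}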
diff --git a/mm/mempool.c b/mm/mempool.c
index 54990476c049..659aa42bad16 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -73,7 +73,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
73 gfp_t gfp_mask, int node_id) 73 gfp_t gfp_mask, int node_id)
74{ 74{
75 mempool_t *pool; 75 mempool_t *pool;
76 pool = kmalloc_node(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id); 76 pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id);
77 if (!pool) 77 if (!pool)
78 return NULL; 78 return NULL;
79 pool->elements = kmalloc_node(min_nr * sizeof(void *), 79 pool->elements = kmalloc_node(min_nr * sizeof(void *),
diff --git a/mm/migrate.c b/mm/migrate.c
index 6f0c24438bba..b7ded7eafe3a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -100,6 +100,10 @@ void putback_movable_pages(struct list_head *l)
100 struct page *page2; 100 struct page *page2;
101 101
102 list_for_each_entry_safe(page, page2, l, lru) { 102 list_for_each_entry_safe(page, page2, l, lru) {
103 if (unlikely(PageHuge(page))) {
104 putback_active_hugepage(page);
105 continue;
106 }
103 list_del(&page->lru); 107 list_del(&page->lru);
104 dec_zone_page_state(page, NR_ISOLATED_ANON + 108 dec_zone_page_state(page, NR_ISOLATED_ANON +
105 page_is_file_cache(page)); 109 page_is_file_cache(page));
@@ -945,6 +949,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
945 struct page *new_hpage = get_new_page(hpage, private, &result); 949 struct page *new_hpage = get_new_page(hpage, private, &result);
946 struct anon_vma *anon_vma = NULL; 950 struct anon_vma *anon_vma = NULL;
947 951
952 /*
953 * Movability of hugepages depends on architectures and hugepage size.
954 * This check is necessary because some callers of hugepage migration
955 * like soft offline and memory hotremove don't walk through page
956 * tables or check whether the hugepage is pmd-based or not before
957 * kicking migration.
958 */
959 if (!hugepage_migration_support(page_hstate(hpage)))
960 return -ENOSYS;
961
948 if (!new_hpage) 962 if (!new_hpage)
949 return -ENOMEM; 963 return -ENOMEM;
950 964
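The support check added above lets unmap_and_move_huge_page() bail out with -ENOSYS when the architecture or hugepage size cannot be migrated, so callers simply see such hugepages stay where they are. For orientation, a hedged sketch of the caller shape this series enables (it mirrors do_migrate_range() elsewhere in this patch; the function name is invented and new_page stands in for a real allocation callback such as alloc_migrate_target()):

/* Illustrative kernel-context sketch, not from this patch. */
static int example_migrate_one_hugepage(struct page *page, new_page_t new_page)
{
	LIST_HEAD(pagelist);
	int err = -EBUSY;

	if (PageHuge(page) && isolate_huge_page(page, &pagelist)) {
		err = migrate_pages(&pagelist, new_page, 0,
				    MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
		if (err)
			putback_movable_pages(&pagelist);
	}
	return err;
}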
@@ -975,6 +989,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
975 989
976 unlock_page(hpage); 990 unlock_page(hpage);
977out: 991out:
992 if (rc != -EAGAIN)
993 putback_active_hugepage(hpage);
978 put_page(new_hpage); 994 put_page(new_hpage);
979 if (result) { 995 if (result) {
980 if (rc) 996 if (rc)
@@ -1025,7 +1041,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
1025 list_for_each_entry_safe(page, page2, from, lru) { 1041 list_for_each_entry_safe(page, page2, from, lru) {
1026 cond_resched(); 1042 cond_resched();
1027 1043
1028 rc = unmap_and_move(get_new_page, private, 1044 if (PageHuge(page))
1045 rc = unmap_and_move_huge_page(get_new_page,
1046 private, page, pass > 2, mode);
1047 else
1048 rc = unmap_and_move(get_new_page, private,
1029 page, pass > 2, mode); 1049 page, pass > 2, mode);
1030 1050
1031 switch(rc) { 1051 switch(rc) {
@@ -1058,32 +1078,6 @@ out:
1058 return rc; 1078 return rc;
1059} 1079}
1060 1080
1061int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
1062 unsigned long private, enum migrate_mode mode)
1063{
1064 int pass, rc;
1065
1066 for (pass = 0; pass < 10; pass++) {
1067 rc = unmap_and_move_huge_page(get_new_page, private,
1068 hpage, pass > 2, mode);
1069 switch (rc) {
1070 case -ENOMEM:
1071 goto out;
1072 case -EAGAIN:
1073 /* try again */
1074 cond_resched();
1075 break;
1076 case MIGRATEPAGE_SUCCESS:
1077 goto out;
1078 default:
1079 rc = -EIO;
1080 goto out;
1081 }
1082 }
1083out:
1084 return rc;
1085}
1086
1087#ifdef CONFIG_NUMA 1081#ifdef CONFIG_NUMA
1088/* 1082/*
1089 * Move a list of individual pages 1083 * Move a list of individual pages
@@ -1108,7 +1102,11 @@ static struct page *new_page_node(struct page *p, unsigned long private,
1108 1102
1109 *result = &pm->status; 1103 *result = &pm->status;
1110 1104
1111 return alloc_pages_exact_node(pm->node, 1105 if (PageHuge(p))
1106 return alloc_huge_page_node(page_hstate(compound_head(p)),
1107 pm->node);
1108 else
1109 return alloc_pages_exact_node(pm->node,
1112 GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); 1110 GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
1113} 1111}
1114 1112
@@ -1168,6 +1166,11 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
1168 !migrate_all) 1166 !migrate_all)
1169 goto put_and_set; 1167 goto put_and_set;
1170 1168
1169 if (PageHuge(page)) {
1170 isolate_huge_page(page, &pagelist);
1171 goto put_and_set;
1172 }
1173
1171 err = isolate_lru_page(page); 1174 err = isolate_lru_page(page);
1172 if (!err) { 1175 if (!err) {
1173 list_add_tail(&page->lru, &pagelist); 1176 list_add_tail(&page->lru, &pagelist);
@@ -1190,7 +1193,7 @@ set_status:
1190 err = migrate_pages(&pagelist, new_page_node, 1193 err = migrate_pages(&pagelist, new_page_node,
1191 (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); 1194 (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
1192 if (err) 1195 if (err)
1193 putback_lru_pages(&pagelist); 1196 putback_movable_pages(&pagelist);
1194 } 1197 }
1195 1198
1196 up_read(&mm->mmap_sem); 1199 up_read(&mm->mmap_sem);
@@ -1468,7 +1471,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
1468 if (!populated_zone(zone)) 1471 if (!populated_zone(zone))
1469 continue; 1472 continue;
1470 1473
1471 if (zone->all_unreclaimable) 1474 if (!zone_reclaimable(zone))
1472 continue; 1475 continue;
1473 1476
1474 /* Avoid waking kswapd by allocating pages_to_migrate pages. */ 1477 /* Avoid waking kswapd by allocating pages_to_migrate pages. */
diff --git a/mm/mlock.c b/mm/mlock.c
index 79b7cf7d1bca..d63802663242 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -11,6 +11,7 @@
11#include <linux/swap.h> 11#include <linux/swap.h>
12#include <linux/swapops.h> 12#include <linux/swapops.h>
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/pagevec.h>
14#include <linux/mempolicy.h> 15#include <linux/mempolicy.h>
15#include <linux/syscalls.h> 16#include <linux/syscalls.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
@@ -18,6 +19,8 @@
18#include <linux/rmap.h> 19#include <linux/rmap.h>
19#include <linux/mmzone.h> 20#include <linux/mmzone.h>
20#include <linux/hugetlb.h> 21#include <linux/hugetlb.h>
22#include <linux/memcontrol.h>
23#include <linux/mm_inline.h>
21 24
22#include "internal.h" 25#include "internal.h"
23 26
@@ -87,6 +90,47 @@ void mlock_vma_page(struct page *page)
87 } 90 }
88} 91}
89 92
93/*
94 * Finish munlock after successful page isolation
95 *
96 * Page must be locked. This is a wrapper for try_to_munlock()
97 * and putback_lru_page() with munlock accounting.
98 */
99static void __munlock_isolated_page(struct page *page)
100{
101 int ret = SWAP_AGAIN;
102
103 /*
104 * Optimization: if the page was mapped just once, that's our mapping
105 * and we don't need to check all the other vmas.
106 */
107 if (page_mapcount(page) > 1)
108 ret = try_to_munlock(page);
109
110 /* Did try_to_munlock() succeed or punt? */
111 if (ret != SWAP_MLOCK)
112 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
113
114 putback_lru_page(page);
115}
116
117/*
118 * Accounting for page isolation fail during munlock
119 *
120 * Performs accounting when page isolation fails in munlock. There is nothing
121 * else to do because it means some other task has already removed the page
122 * from the LRU. putback_lru_page() will take care of removing the page from
123 * the unevictable list, if necessary. vmscan [page_referenced()] will move
124 * the page back to the unevictable list if some other vma has it mlocked.
125 */
126static void __munlock_isolation_failed(struct page *page)
127{
128 if (PageUnevictable(page))
129 count_vm_event(UNEVICTABLE_PGSTRANDED);
130 else
131 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
132}
133
90/** 134/**
91 * munlock_vma_page - munlock a vma page 135 * munlock_vma_page - munlock a vma page
92 * @page - page to be unlocked 136 * @page - page to be unlocked
@@ -112,37 +156,10 @@ unsigned int munlock_vma_page(struct page *page)
112 unsigned int nr_pages = hpage_nr_pages(page); 156 unsigned int nr_pages = hpage_nr_pages(page);
113 mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); 157 mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
114 page_mask = nr_pages - 1; 158 page_mask = nr_pages - 1;
115 if (!isolate_lru_page(page)) { 159 if (!isolate_lru_page(page))
116 int ret = SWAP_AGAIN; 160 __munlock_isolated_page(page);
117 161 else
118 /* 162 __munlock_isolation_failed(page);
119 * Optimization: if the page was mapped just once,
120 * that's our mapping and we don't need to check all the
121 * other vmas.
122 */
123 if (page_mapcount(page) > 1)
124 ret = try_to_munlock(page);
125 /*
126 * did try_to_unlock() succeed or punt?
127 */
128 if (ret != SWAP_MLOCK)
129 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
130
131 putback_lru_page(page);
132 } else {
133 /*
134 * Some other task has removed the page from the LRU.
135 * putback_lru_page() will take care of removing the
136 * page from the unevictable list, if necessary.
137 * vmscan [page_referenced()] will move the page back
138 * to the unevictable list if some other vma has it
139 * mlocked.
140 */
141 if (PageUnevictable(page))
142 count_vm_event(UNEVICTABLE_PGSTRANDED);
143 else
144 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
145 }
146 } 163 }
147 164
148 return page_mask; 165 return page_mask;
@@ -210,6 +227,191 @@ static int __mlock_posix_error_return(long retval)
210} 227}
211 228
212/* 229/*
230 * Prepare page for fast batched LRU putback via putback_lru_evictable_pagevec()
231 *
232 * The fast path is available only for evictable pages with single mapping.
233 * Then we can bypass the per-cpu pvec and get better performance.
234 * when mapcount > 1 we need try_to_munlock() which can fail.
235 * when !page_evictable(), we need the full redo logic of putback_lru_page to
236 * avoid leaving evictable page in unevictable list.
237 *
238 * In case of success, @page is added to @pvec and @pgrescued is incremented
239 * in case that the page was previously unevictable. @page is also unlocked.
240 */
241static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec,
242 int *pgrescued)
243{
244 VM_BUG_ON(PageLRU(page));
245 VM_BUG_ON(!PageLocked(page));
246
247 if (page_mapcount(page) <= 1 && page_evictable(page)) {
248 pagevec_add(pvec, page);
249 if (TestClearPageUnevictable(page))
250 (*pgrescued)++;
251 unlock_page(page);
252 return true;
253 }
254
255 return false;
256}
257
258/*
259 * Putback multiple evictable pages to the LRU
260 *
261 * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of
262 * the pages might have meanwhile become unevictable but that is OK.
263 */
264static void __putback_lru_fast(struct pagevec *pvec, int pgrescued)
265{
266 count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec));
267 /*
268 *__pagevec_lru_add() calls release_pages() so we don't call
269 * put_page() explicitly
270 */
271 __pagevec_lru_add(pvec);
272 count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
273}
274
275/*
276 * Munlock a batch of pages from the same zone
277 *
278 * The work is split to two main phases. First phase clears the Mlocked flag
279 * and attempts to isolate the pages, all under a single zone lru lock.
280 * The second phase finishes the munlock only for pages where isolation
281 * succeeded.
282 *
283 * Note that the pagevec may be modified during the process.
284 */
285static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
286{
287 int i;
288 int nr = pagevec_count(pvec);
289 int delta_munlocked = -nr;
290 struct pagevec pvec_putback;
291 int pgrescued = 0;
292
293 /* Phase 1: page isolation */
294 spin_lock_irq(&zone->lru_lock);
295 for (i = 0; i < nr; i++) {
296 struct page *page = pvec->pages[i];
297
298 if (TestClearPageMlocked(page)) {
299 struct lruvec *lruvec;
300 int lru;
301
302 if (PageLRU(page)) {
303 lruvec = mem_cgroup_page_lruvec(page, zone);
304 lru = page_lru(page);
305 /*
306 * We already have pin from follow_page_mask()
307 * so we can spare the get_page() here.
308 */
309 ClearPageLRU(page);
310 del_page_from_lru_list(page, lruvec, lru);
311 } else {
312 __munlock_isolation_failed(page);
313 goto skip_munlock;
314 }
315
316 } else {
317skip_munlock:
318 /*
319 * We won't be munlocking this page in the next phase
320 * but we still need to release the follow_page_mask()
321 * pin.
322 */
323 pvec->pages[i] = NULL;
324 put_page(page);
325 delta_munlocked++;
326 }
327 }
328 __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
329 spin_unlock_irq(&zone->lru_lock);
330
331 /* Phase 2: page munlock */
332 pagevec_init(&pvec_putback, 0);
333 for (i = 0; i < nr; i++) {
334 struct page *page = pvec->pages[i];
335
336 if (page) {
337 lock_page(page);
338 if (!__putback_lru_fast_prepare(page, &pvec_putback,
339 &pgrescued)) {
340 /*
341 * Slow path. We don't want to lose the last
342 * pin before unlock_page()
343 */
344 get_page(page); /* for putback_lru_page() */
345 __munlock_isolated_page(page);
346 unlock_page(page);
347 put_page(page); /* from follow_page_mask() */
348 }
349 }
350 }
351
352 /*
353 * Phase 3: page putback for pages that qualified for the fast path
354 * This will also call put_page() to return pin from follow_page_mask()
355 */
356 if (pagevec_count(&pvec_putback))
357 __putback_lru_fast(&pvec_putback, pgrescued);
358}
359
360/*
361 * Fill up pagevec for __munlock_pagevec using pte walk
362 *
363 * The function expects that the struct page corresponding to @start address is
363 * a non-THP page already pinned and in the @pvec, and that it belongs to @zone.
365 *
366 * The rest of @pvec is filled by subsequent pages within the same pmd and same
367 * zone, as long as the pte's are present and vm_normal_page() succeeds. These
368 * pages also get pinned.
369 *
370 * Returns the address of the next page that should be scanned. This equals
371 * @start + PAGE_SIZE when no page could be added by the pte walk.
372 */
373static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
374 struct vm_area_struct *vma, int zoneid, unsigned long start,
375 unsigned long end)
376{
377 pte_t *pte;
378 spinlock_t *ptl;
379
380 /*
381 * Initialize pte walk starting at the already pinned page where we
382 * are sure that there is a pte.
383 */
384 pte = get_locked_pte(vma->vm_mm, start, &ptl);
385 end = min(end, pmd_addr_end(start, end));
386
387 /* The page next to the pinned page is the first we will try to get */
388 start += PAGE_SIZE;
389 while (start < end) {
390 struct page *page = NULL;
391 pte++;
392 if (pte_present(*pte))
393 page = vm_normal_page(vma, start, *pte);
394 /*
395 * Break if page could not be obtained or the page's node+zone does not
396 * match
397 */
398 if (!page || page_zone_id(page) != zoneid)
399 break;
400
401 get_page(page);
402 /*
403 * Increase the address that will be returned *before* the
404 * eventual break due to pvec becoming full by adding the page
405 */
406 start += PAGE_SIZE;
407 if (pagevec_add(pvec, page) == 0)
408 break;
409 }
410 pte_unmap_unlock(pte, ptl);
411 return start;
412}
413
414/*
213 * munlock_vma_pages_range() - munlock all pages in the vma range.' 415 * munlock_vma_pages_range() - munlock all pages in the vma range.'
214 * @vma - vma containing range to be munlock()ed. 416 * @vma - vma containing range to be munlock()ed.
215 * @start - start address in @vma of the range 417 * @start - start address in @vma of the range
@@ -233,9 +435,13 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
233 vma->vm_flags &= ~VM_LOCKED; 435 vma->vm_flags &= ~VM_LOCKED;
234 436
235 while (start < end) { 437 while (start < end) {
236 struct page *page; 438 struct page *page = NULL;
237 unsigned int page_mask, page_increm; 439 unsigned int page_mask, page_increm;
440 struct pagevec pvec;
441 struct zone *zone;
442 int zoneid;
238 443
444 pagevec_init(&pvec, 0);
239 /* 445 /*
240 * Although FOLL_DUMP is intended for get_dump_page(), 446 * Although FOLL_DUMP is intended for get_dump_page(),
241 * it just so happens that its special treatment of the 447 * it just so happens that its special treatment of the
@@ -244,21 +450,45 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
244 * has sneaked into the range, we won't oops here: great). 450 * has sneaked into the range, we won't oops here: great).
245 */ 451 */
246 page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP, 452 page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
247 &page_mask); 453 &page_mask);
454
248 if (page && !IS_ERR(page)) { 455 if (page && !IS_ERR(page)) {
249 lock_page(page); 456 if (PageTransHuge(page)) {
250 lru_add_drain(); 457 lock_page(page);
251 /* 458 /*
252 * Any THP page found by follow_page_mask() may have 459 * Any THP page found by follow_page_mask() may
253 * gotten split before reaching munlock_vma_page(), 460 * have gotten split before reaching
254 * so we need to recompute the page_mask here. 461 * munlock_vma_page(), so we need to recompute
255 */ 462 * the page_mask here.
256 page_mask = munlock_vma_page(page); 463 */
257 unlock_page(page); 464 page_mask = munlock_vma_page(page);
258 put_page(page); 465 unlock_page(page);
466 put_page(page); /* follow_page_mask() */
467 } else {
468 /*
469 * Non-huge pages are handled in batches via
470 * pagevec. The pin from follow_page_mask()
471 * prevents them from collapsing by THP.
472 */
473 pagevec_add(&pvec, page);
474 zone = page_zone(page);
475 zoneid = page_zone_id(page);
476
477 /*
478 * Try to fill the rest of pagevec using fast
479 * pte walk. This will also update start to
480 * the next page to process. Then munlock the
481 * pagevec.
482 */
483 start = __munlock_pagevec_fill(&pvec, vma,
484 zoneid, start, end);
485 __munlock_pagevec(&pvec, zone);
486 goto next;
487 }
259 } 488 }
260 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); 489 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
261 start += page_increm * PAGE_SIZE; 490 start += page_increm * PAGE_SIZE;
491next:
262 cond_resched(); 492 cond_resched();
263 } 493 }
264} 494}
diff --git a/mm/mmap.c b/mm/mmap.c
index f9c97d10b873..9d548512ff8a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1202,7 +1202,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1202 unsigned long *populate) 1202 unsigned long *populate)
1203{ 1203{
1204 struct mm_struct * mm = current->mm; 1204 struct mm_struct * mm = current->mm;
1205 struct inode *inode;
1206 vm_flags_t vm_flags; 1205 vm_flags_t vm_flags;
1207 1206
1208 *populate = 0; 1207 *populate = 0;
@@ -1265,9 +1264,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1265 return -EAGAIN; 1264 return -EAGAIN;
1266 } 1265 }
1267 1266
1268 inode = file ? file_inode(file) : NULL;
1269
1270 if (file) { 1267 if (file) {
1268 struct inode *inode = file_inode(file);
1269
1271 switch (flags & MAP_TYPE) { 1270 switch (flags & MAP_TYPE) {
1272 case MAP_SHARED: 1271 case MAP_SHARED:
1273 if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) 1272 if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
@@ -1302,6 +1301,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1302 1301
1303 if (!file->f_op || !file->f_op->mmap) 1302 if (!file->f_op || !file->f_op->mmap)
1304 return -ENODEV; 1303 return -ENODEV;
1304 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1305 return -EINVAL;
1305 break; 1306 break;
1306 1307
1307 default: 1308 default:
@@ -1310,6 +1311,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1310 } else { 1311 } else {
1311 switch (flags & MAP_TYPE) { 1312 switch (flags & MAP_TYPE) {
1312 case MAP_SHARED: 1313 case MAP_SHARED:
1314 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1315 return -EINVAL;
1313 /* 1316 /*
1314 * Ignore pgoff. 1317 * Ignore pgoff.
1315 */ 1318 */
@@ -1476,11 +1479,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1476{ 1479{
1477 struct mm_struct *mm = current->mm; 1480 struct mm_struct *mm = current->mm;
1478 struct vm_area_struct *vma, *prev; 1481 struct vm_area_struct *vma, *prev;
1479 int correct_wcount = 0;
1480 int error; 1482 int error;
1481 struct rb_node **rb_link, *rb_parent; 1483 struct rb_node **rb_link, *rb_parent;
1482 unsigned long charged = 0; 1484 unsigned long charged = 0;
1483 struct inode *inode = file ? file_inode(file) : NULL;
1484 1485
1485 /* Check against address space limit. */ 1486 /* Check against address space limit. */
1486 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { 1487 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
@@ -1544,16 +1545,11 @@ munmap_back:
1544 vma->vm_pgoff = pgoff; 1545 vma->vm_pgoff = pgoff;
1545 INIT_LIST_HEAD(&vma->anon_vma_chain); 1546 INIT_LIST_HEAD(&vma->anon_vma_chain);
1546 1547
1547 error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */
1548
1549 if (file) { 1548 if (file) {
1550 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1551 goto free_vma;
1552 if (vm_flags & VM_DENYWRITE) { 1549 if (vm_flags & VM_DENYWRITE) {
1553 error = deny_write_access(file); 1550 error = deny_write_access(file);
1554 if (error) 1551 if (error)
1555 goto free_vma; 1552 goto free_vma;
1556 correct_wcount = 1;
1557 } 1553 }
1558 vma->vm_file = get_file(file); 1554 vma->vm_file = get_file(file);
1559 error = file->f_op->mmap(file, vma); 1555 error = file->f_op->mmap(file, vma);
@@ -1570,11 +1566,8 @@ munmap_back:
1570 WARN_ON_ONCE(addr != vma->vm_start); 1566 WARN_ON_ONCE(addr != vma->vm_start);
1571 1567
1572 addr = vma->vm_start; 1568 addr = vma->vm_start;
1573 pgoff = vma->vm_pgoff;
1574 vm_flags = vma->vm_flags; 1569 vm_flags = vma->vm_flags;
1575 } else if (vm_flags & VM_SHARED) { 1570 } else if (vm_flags & VM_SHARED) {
1576 if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP)))
1577 goto free_vma;
1578 error = shmem_zero_setup(vma); 1571 error = shmem_zero_setup(vma);
1579 if (error) 1572 if (error)
1580 goto free_vma; 1573 goto free_vma;
@@ -1596,11 +1589,10 @@ munmap_back:
1596 } 1589 }
1597 1590
1598 vma_link(mm, vma, prev, rb_link, rb_parent); 1591 vma_link(mm, vma, prev, rb_link, rb_parent);
1599 file = vma->vm_file;
1600
1601 /* Once vma denies write, undo our temporary denial count */ 1592 /* Once vma denies write, undo our temporary denial count */
1602 if (correct_wcount) 1593 if (vm_flags & VM_DENYWRITE)
1603 atomic_inc(&inode->i_writecount); 1594 allow_write_access(file);
1595 file = vma->vm_file;
1604out: 1596out:
1605 perf_event_mmap(vma); 1597 perf_event_mmap(vma);
1606 1598
@@ -1616,11 +1608,20 @@ out:
1616 if (file) 1608 if (file)
1617 uprobe_mmap(vma); 1609 uprobe_mmap(vma);
1618 1610
1611 /*
1612 * New (or expanded) vma always get soft dirty status.
1613 * Otherwise user-space soft-dirty page tracker won't
1614 * be able to distinguish situation when vma area unmapped,
1615 * then new mapped in-place (which must be aimed as
1616 * a completely new data area).
1617 */
1618 vma->vm_flags |= VM_SOFTDIRTY;
1619
1619 return addr; 1620 return addr;
1620 1621
1621unmap_and_free_vma: 1622unmap_and_free_vma:
1622 if (correct_wcount) 1623 if (vm_flags & VM_DENYWRITE)
1623 atomic_inc(&inode->i_writecount); 1624 allow_write_access(file);
1624 vma->vm_file = NULL; 1625 vma->vm_file = NULL;
1625 fput(file); 1626 fput(file);
1626 1627
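The VM_SOFTDIRTY comment above exists for the user-space soft-dirty tracker described in Documentation/vm/soft-dirty.txt (also updated by this series): a freshly created or re-created mapping must show up as dirty even before it is written to. A hedged user-space sketch of the reader side; bit 55 of a pagemap entry and the value 4 written to clear_refs are the documented interface, while the helper itself is illustrative.

/*
 * Illustration only: after the tracker has cleared soft-dirty bits by
 * writing "4" to /proc/self/clear_refs, check whether one virtual address
 * of the current process has been touched (or newly mapped) since then.
 */
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>

static int page_is_soft_dirty(void *addr)
{
	uint64_t entry = 0;
	long offset = ((uintptr_t)addr / sysconf(_SC_PAGESIZE)) * sizeof(entry);
	FILE *f = fopen("/proc/self/pagemap", "rb");

	if (!f)
		return -1;
	if (fseek(f, offset, SEEK_SET) == 0 &&
	    fread(&entry, sizeof(entry), 1, f) == 1)
		entry >>= 55;		/* soft-dirty lives in bit 55 */
	fclose(f);
	return (int)(entry & 1);
}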
@@ -2380,7 +2381,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2380static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, 2381static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
2381 unsigned long addr, int new_below) 2382 unsigned long addr, int new_below)
2382{ 2383{
2383 struct mempolicy *pol;
2384 struct vm_area_struct *new; 2384 struct vm_area_struct *new;
2385 int err = -ENOMEM; 2385 int err = -ENOMEM;
2386 2386
@@ -2404,12 +2404,9 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
2404 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); 2404 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
2405 } 2405 }
2406 2406
2407 pol = mpol_dup(vma_policy(vma)); 2407 err = vma_dup_policy(vma, new);
2408 if (IS_ERR(pol)) { 2408 if (err)
2409 err = PTR_ERR(pol);
2410 goto out_free_vma; 2409 goto out_free_vma;
2411 }
2412 vma_set_policy(new, pol);
2413 2410
2414 if (anon_vma_clone(new, vma)) 2411 if (anon_vma_clone(new, vma))
2415 goto out_free_mpol; 2412 goto out_free_mpol;
@@ -2437,7 +2434,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
2437 fput(new->vm_file); 2434 fput(new->vm_file);
2438 unlink_anon_vmas(new); 2435 unlink_anon_vmas(new);
2439 out_free_mpol: 2436 out_free_mpol:
2440 mpol_put(pol); 2437 mpol_put(vma_policy(new));
2441 out_free_vma: 2438 out_free_vma:
2442 kmem_cache_free(vm_area_cachep, new); 2439 kmem_cache_free(vm_area_cachep, new);
2443 out_err: 2440 out_err:
@@ -2663,6 +2660,7 @@ out:
2663 mm->total_vm += len >> PAGE_SHIFT; 2660 mm->total_vm += len >> PAGE_SHIFT;
2664 if (flags & VM_LOCKED) 2661 if (flags & VM_LOCKED)
2665 mm->locked_vm += (len >> PAGE_SHIFT); 2662 mm->locked_vm += (len >> PAGE_SHIFT);
2663 vma->vm_flags |= VM_SOFTDIRTY;
2666 return addr; 2664 return addr;
2667} 2665}
2668 2666
@@ -2780,7 +2778,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2780 struct mm_struct *mm = vma->vm_mm; 2778 struct mm_struct *mm = vma->vm_mm;
2781 struct vm_area_struct *new_vma, *prev; 2779 struct vm_area_struct *new_vma, *prev;
2782 struct rb_node **rb_link, *rb_parent; 2780 struct rb_node **rb_link, *rb_parent;
2783 struct mempolicy *pol;
2784 bool faulted_in_anon_vma = true; 2781 bool faulted_in_anon_vma = true;
2785 2782
2786 /* 2783 /*
@@ -2825,10 +2822,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2825 new_vma->vm_start = addr; 2822 new_vma->vm_start = addr;
2826 new_vma->vm_end = addr + len; 2823 new_vma->vm_end = addr + len;
2827 new_vma->vm_pgoff = pgoff; 2824 new_vma->vm_pgoff = pgoff;
2828 pol = mpol_dup(vma_policy(vma)); 2825 if (vma_dup_policy(vma, new_vma))
2829 if (IS_ERR(pol))
2830 goto out_free_vma; 2826 goto out_free_vma;
2831 vma_set_policy(new_vma, pol);
2832 INIT_LIST_HEAD(&new_vma->anon_vma_chain); 2827 INIT_LIST_HEAD(&new_vma->anon_vma_chain);
2833 if (anon_vma_clone(new_vma, vma)) 2828 if (anon_vma_clone(new_vma, vma))
2834 goto out_free_mempol; 2829 goto out_free_mempol;
@@ -2843,7 +2838,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2843 return new_vma; 2838 return new_vma;
2844 2839
2845 out_free_mempol: 2840 out_free_mempol:
2846 mpol_put(pol); 2841 mpol_put(vma_policy(new_vma));
2847 out_free_vma: 2842 out_free_vma:
2848 kmem_cache_free(vm_area_cachep, new_vma); 2843 kmem_cache_free(vm_area_cachep, new_vma);
2849 return NULL; 2844 return NULL;
@@ -2930,7 +2925,7 @@ int install_special_mapping(struct mm_struct *mm,
2930 vma->vm_start = addr; 2925 vma->vm_start = addr;
2931 vma->vm_end = addr + len; 2926 vma->vm_end = addr + len;
2932 2927
2933 vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND; 2928 vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
2934 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 2929 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
2935 2930
2936 vma->vm_ops = &special_mapping_vmops; 2931 vma->vm_ops = &special_mapping_vmops;
diff --git a/mm/mremap.c b/mm/mremap.c
index 0843feb66f3d..91b13d6a16d4 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -25,6 +25,7 @@
25#include <asm/uaccess.h> 25#include <asm/uaccess.h>
26#include <asm/cacheflush.h> 26#include <asm/cacheflush.h>
27#include <asm/tlbflush.h> 27#include <asm/tlbflush.h>
28#include <asm/pgalloc.h>
28 29
29#include "internal.h" 30#include "internal.h"
30 31
@@ -62,8 +63,10 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
62 return NULL; 63 return NULL;
63 64
64 pmd = pmd_alloc(mm, pud, addr); 65 pmd = pmd_alloc(mm, pud, addr);
65 if (!pmd) 66 if (!pmd) {
67 pud_free(mm, pud);
66 return NULL; 68 return NULL;
69 }
67 70
68 VM_BUG_ON(pmd_trans_huge(*pmd)); 71 VM_BUG_ON(pmd_trans_huge(*pmd));
69 72
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3f0c895c71fe..6c7b0187be8e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,8 +36,11 @@
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37#include <linux/timer.h> 37#include <linux/timer.h>
38#include <linux/sched/rt.h> 38#include <linux/sched/rt.h>
39#include <linux/mm_inline.h>
39#include <trace/events/writeback.h> 40#include <trace/events/writeback.h>
40 41
42#include "internal.h"
43
41/* 44/*
42 * Sleep at most 200ms at a time in balance_dirty_pages(). 45 * Sleep at most 200ms at a time in balance_dirty_pages().
43 */ 46 */
@@ -241,9 +244,6 @@ static unsigned long global_dirtyable_memory(void)
241 if (!vm_highmem_is_dirtyable) 244 if (!vm_highmem_is_dirtyable)
242 x -= highmem_dirtyable_memory(x); 245 x -= highmem_dirtyable_memory(x);
243 246
244 /* Subtract min_free_kbytes */
245 x -= min_t(unsigned long, x, min_free_kbytes >> (PAGE_SHIFT - 10));
246
247 return x + 1; /* Ensure that we never return 0 */ 247 return x + 1; /* Ensure that we never return 0 */
248} 248}
249 249
@@ -585,6 +585,37 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
585} 585}
586 586
587/* 587/*
588 *                           setpoint - dirty 3
589 *        f(dirty) := 1.0 + (----------------)
590 *                           limit - setpoint
591 *
592 * it's a 3rd order polynomial that subjects to
593 *
594 * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast
595 * (2) f(setpoint) = 1.0 => the balance point
596 * (3) f(limit) = 0 => the hard limit
597 * (4) df/dx <= 0 => negative feedback control
598 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
599 * => fast response on large errors; small oscillation near setpoint
600 */
601static inline long long pos_ratio_polynom(unsigned long setpoint,
602 unsigned long dirty,
603 unsigned long limit)
604{
605 long long pos_ratio;
606 long x;
607
608 x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
609 limit - setpoint + 1);
610 pos_ratio = x;
611 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
612 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
613 pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
614
615 return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
616}
617
618/*
588 * Dirty position control. 619 * Dirty position control.
589 * 620 *
590 * (o) global/bdi setpoints 621 * (o) global/bdi setpoints
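pos_ratio_polynom() above implements the cubic feedback curve spelled out in its comment. The user-space sketch below re-evaluates the same fixed-point arithmetic at the three anchor points to confirm properties (1)-(3); RATELIMIT_CALC_SHIFT is 10 in mm/page-writeback.c, while the freerun/limit page counts are invented for the example.

/* Illustration only: f(freerun) ~ 2.0, f(setpoint) = 1.0, f(limit) ~ 0.0 */
#include <stdio.h>

#define RATELIMIT_CALC_SHIFT	10

static long long pos_ratio_polynom(unsigned long setpoint, unsigned long dirty,
				   unsigned long limit)
{
	long long x, pos_ratio;

	x = (((long long)setpoint - (long long)dirty) << RATELIMIT_CALC_SHIFT) /
	    (long long)(limit - setpoint + 1);
	pos_ratio = x;
	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
	pos_ratio += 1 << RATELIMIT_CALC_SHIFT;

	if (pos_ratio < 0)
		pos_ratio = 0;
	if (pos_ratio > (2LL << RATELIMIT_CALC_SHIFT))
		pos_ratio = 2LL << RATELIMIT_CALC_SHIFT;
	return pos_ratio;
}

int main(void)
{
	unsigned long freerun = 1000, limit = 3000;
	unsigned long setpoint = (freerun + limit) / 2;

	/* values are in units of 1 << RATELIMIT_CALC_SHIFT, i.e. 1024 == 1.0 */
	printf("%lld %lld %lld\n",
	       pos_ratio_polynom(setpoint, freerun, limit),
	       pos_ratio_polynom(setpoint, setpoint, limit),
	       pos_ratio_polynom(setpoint, limit, limit));
	return 0;
}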
@@ -682,26 +713,80 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
682 /* 713 /*
683 * global setpoint 714 * global setpoint
684 * 715 *
685 * setpoint - dirty 3 716 * See comment for pos_ratio_polynom().
686 * f(dirty) := 1.0 + (----------------) 717 */
687 * limit - setpoint 718 setpoint = (freerun + limit) / 2;
719 pos_ratio = pos_ratio_polynom(setpoint, dirty, limit);
720
721 /*
722 * The strictlimit feature is a tool preventing mistrusted filesystems
723 * from growing a large number of dirty pages before throttling. For
724 * such filesystems balance_dirty_pages always checks bdi counters
725 * against bdi limits. Even if global "nr_dirty" is under "freerun".
726 * This is especially important for fuse which sets bdi->max_ratio to
727 * 1% by default. Without strictlimit feature, fuse writeback may
728 * consume arbitrary amount of RAM because it is accounted in
729 * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
688 * 730 *
689 * it's a 3rd order polynomial that subjects to 731 * Here, in bdi_position_ratio(), we calculate pos_ratio based on
732 * two values: bdi_dirty and bdi_thresh. Let's consider an example:
733 * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
734 * limits are set by default to 10% and 20% (background and throttle).
735 * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
736 * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is
737 * about ~6K pages (as the average of background and throttle bdi
738 * limits). The 3rd order polynomial will provide positive feedback if
739 * bdi_dirty is under bdi_setpoint and vice versa.
690 * 740 *
691 * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast 741 * Note, that we cannot use global counters in these calculations
692 * (2) f(setpoint) = 1.0 => the balance point 742 * because we want to throttle process writing to a strictlimit BDI
693 * (3) f(limit) = 0 => the hard limit 743 * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
694 * (4) df/dx <= 0 => negative feedback control 744 * in the example above).
695 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
696 * => fast response on large errors; small oscillation near setpoint
697 */ 745 */
698 setpoint = (freerun + limit) / 2; 746 if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
699 x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, 747 long long bdi_pos_ratio;
700 limit - setpoint + 1); 748 unsigned long bdi_bg_thresh;
701 pos_ratio = x; 749
702 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; 750 if (bdi_dirty < 8)
703 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; 751 return min_t(long long, pos_ratio * 2,
704 pos_ratio += 1 << RATELIMIT_CALC_SHIFT; 752 2 << RATELIMIT_CALC_SHIFT);
753
754 if (bdi_dirty >= bdi_thresh)
755 return 0;
756
757 bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh);
758 bdi_setpoint = dirty_freerun_ceiling(bdi_thresh,
759 bdi_bg_thresh);
760
761 if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh)
762 return 0;
763
764 bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty,
765 bdi_thresh);
766
767 /*
768 * Typically, for strictlimit case, bdi_setpoint << setpoint
 769 * and pos_ratio >> bdi_pos_ratio. In other words, the global
 770 * state ("dirty") is not the limiting factor and we have to
771 * make decision based on bdi counters. But there is an
772 * important case when global pos_ratio should get precedence:
773 * global limits are exceeded (e.g. due to activities on other
774 * BDIs) while given strictlimit BDI is below limit.
775 *
776 * "pos_ratio * bdi_pos_ratio" would work for the case above,
777 * but it would look too non-natural for the case of all
778 * activity in the system coming from a single strictlimit BDI
779 * with bdi->max_ratio == 100%.
780 *
781 * Note that min() below somewhat changes the dynamics of the
782 * control system. Normally, pos_ratio value can be well over 3
783 * (when globally we are at freerun and bdi is well below bdi
784 * setpoint). Now the maximum pos_ratio in the same situation
785 * is 2. We might want to tweak this if we observe the control
786 * system is too slow to adapt.
787 */
788 return min(pos_ratio, bdi_pos_ratio);
789 }
705 790
706 /* 791 /*
707 * We have computed basic pos_ratio above based on global situation. If 792 * We have computed basic pos_ratio above based on global situation. If
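The ~8K/~4K/~6K page figures quoted in the strictlimit comment above follow from straightforward arithmetic; a throwaway sketch of that calculation, assuming 4KiB pages:

#include <stdio.h>

int main(void)
{
        unsigned long long total      = (16ULL << 30) >> 12;  /* 16GB in 4KiB pages */
        unsigned long long thresh     = total * 20 / 100;     /* global throttle limit */
        unsigned long long bg_thresh  = total * 10 / 100;     /* global background limit */
        unsigned long long bdi_thresh = thresh * 1 / 100;     /* bdi->max_ratio = 1% */
        unsigned long long bdi_bg     = bg_thresh * 1 / 100;
        unsigned long long bdi_setpt  = (bdi_thresh + bdi_bg) / 2;

        /* prints 8388 4194 6291: the ~8K, ~4K and ~6K pages of the comment */
        printf("%llu %llu %llu\n", bdi_thresh, bdi_bg, bdi_setpt);
        return 0;
}

The bdi setpoint of ~6291 pages is on the order of 25MB, while the global freerun ceiling (thresh + bg_thresh) / 2 for the same limits works out to about 629145 pages, roughly 2.4GB; that is the gap the comment summarizes as "~23MB vs. ~2.3GB".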
@@ -994,6 +1079,27 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
994 * keep that period small to reduce time lags). 1079 * keep that period small to reduce time lags).
995 */ 1080 */
996 step = 0; 1081 step = 0;
1082
1083 /*
1084 * For strictlimit case, calculations above were based on bdi counters
1085 * and limits (starting from pos_ratio = bdi_position_ratio() and up to
1086 * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
1087 * Hence, to calculate "step" properly, we have to use bdi_dirty as
1088 * "dirty" and bdi_setpoint as "setpoint".
1089 *
1090 * We rampup dirty_ratelimit forcibly if bdi_dirty is low because
1091 * it's possible that bdi_thresh is close to zero due to inactivity
1092 * of backing device (see the implementation of bdi_dirty_limit()).
1093 */
1094 if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
1095 dirty = bdi_dirty;
1096 if (bdi_dirty < 8)
1097 setpoint = bdi_dirty + 1;
1098 else
1099 setpoint = (bdi_thresh +
1100 bdi_dirty_limit(bdi, bg_thresh)) / 2;
1101 }
1102
997 if (dirty < setpoint) { 1103 if (dirty < setpoint) {
998 x = min(bdi->balanced_dirty_ratelimit, 1104 x = min(bdi->balanced_dirty_ratelimit,
999 min(balanced_dirty_ratelimit, task_ratelimit)); 1105 min(balanced_dirty_ratelimit, task_ratelimit));
@@ -1198,6 +1304,56 @@ static long bdi_min_pause(struct backing_dev_info *bdi,
1198 return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; 1304 return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
1199} 1305}
1200 1306
1307static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
1308 unsigned long dirty_thresh,
1309 unsigned long background_thresh,
1310 unsigned long *bdi_dirty,
1311 unsigned long *bdi_thresh,
1312 unsigned long *bdi_bg_thresh)
1313{
1314 unsigned long bdi_reclaimable;
1315
1316 /*
1317 * bdi_thresh is not treated as some limiting factor as
1318 * dirty_thresh, due to reasons
1319 * - in JBOD setup, bdi_thresh can fluctuate a lot
1320 * - in a system with HDD and USB key, the USB key may somehow
1321 * go into state (bdi_dirty >> bdi_thresh) either because
1322 * bdi_dirty starts high, or because bdi_thresh drops low.
1323 * In this case we don't want to hard throttle the USB key
1324 * dirtiers for 100 seconds until bdi_dirty drops under
1325 * bdi_thresh. Instead the auxiliary bdi control line in
1326 * bdi_position_ratio() will let the dirtier task progress
1327 * at some rate <= (write_bw / 2) for bringing down bdi_dirty.
1328 */
1329 *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
1330
1331 if (bdi_bg_thresh)
1332 *bdi_bg_thresh = div_u64((u64)*bdi_thresh *
1333 background_thresh,
1334 dirty_thresh);
1335
1336 /*
1337 * In order to avoid the stacked BDI deadlock we need
1338 * to ensure we accurately count the 'dirty' pages when
1339 * the threshold is low.
1340 *
1341 * Otherwise it would be possible to get thresh+n pages
1342 * reported dirty, even though there are thresh-m pages
1343 * actually dirty; with m+n sitting in the percpu
1344 * deltas.
1345 */
1346 if (*bdi_thresh < 2 * bdi_stat_error(bdi)) {
1347 bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
1348 *bdi_dirty = bdi_reclaimable +
1349 bdi_stat_sum(bdi, BDI_WRITEBACK);
1350 } else {
1351 bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
1352 *bdi_dirty = bdi_reclaimable +
1353 bdi_stat(bdi, BDI_WRITEBACK);
1354 }
1355}
1356
1201/* 1357/*
1202 * balance_dirty_pages() must be called by processes which are generating dirty 1358 * balance_dirty_pages() must be called by processes which are generating dirty
1203 * data. It looks at the number of dirty pages in the machine and will force 1359 * data. It looks at the number of dirty pages in the machine and will force
@@ -1209,13 +1365,9 @@ static void balance_dirty_pages(struct address_space *mapping,
1209 unsigned long pages_dirtied) 1365 unsigned long pages_dirtied)
1210{ 1366{
1211 unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ 1367 unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */
1212 unsigned long bdi_reclaimable;
1213 unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ 1368 unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
1214 unsigned long bdi_dirty;
1215 unsigned long freerun;
1216 unsigned long background_thresh; 1369 unsigned long background_thresh;
1217 unsigned long dirty_thresh; 1370 unsigned long dirty_thresh;
1218 unsigned long bdi_thresh;
1219 long period; 1371 long period;
1220 long pause; 1372 long pause;
1221 long max_pause; 1373 long max_pause;
@@ -1226,10 +1378,16 @@ static void balance_dirty_pages(struct address_space *mapping,
1226 unsigned long dirty_ratelimit; 1378 unsigned long dirty_ratelimit;
1227 unsigned long pos_ratio; 1379 unsigned long pos_ratio;
1228 struct backing_dev_info *bdi = mapping->backing_dev_info; 1380 struct backing_dev_info *bdi = mapping->backing_dev_info;
1381 bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
1229 unsigned long start_time = jiffies; 1382 unsigned long start_time = jiffies;
1230 1383
1231 for (;;) { 1384 for (;;) {
1232 unsigned long now = jiffies; 1385 unsigned long now = jiffies;
1386 unsigned long uninitialized_var(bdi_thresh);
1387 unsigned long thresh;
1388 unsigned long uninitialized_var(bdi_dirty);
1389 unsigned long dirty;
1390 unsigned long bg_thresh;
1233 1391
1234 /* 1392 /*
1235 * Unstable writes are a feature of certain networked 1393 * Unstable writes are a feature of certain networked
@@ -1243,61 +1401,44 @@ static void balance_dirty_pages(struct address_space *mapping,
1243 1401
1244 global_dirty_limits(&background_thresh, &dirty_thresh); 1402 global_dirty_limits(&background_thresh, &dirty_thresh);
1245 1403
1404 if (unlikely(strictlimit)) {
1405 bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
1406 &bdi_dirty, &bdi_thresh, &bg_thresh);
1407
1408 dirty = bdi_dirty;
1409 thresh = bdi_thresh;
1410 } else {
1411 dirty = nr_dirty;
1412 thresh = dirty_thresh;
1413 bg_thresh = background_thresh;
1414 }
1415
1246 /* 1416 /*
1247 * Throttle it only when the background writeback cannot 1417 * Throttle it only when the background writeback cannot
1248 * catch-up. This avoids (excessively) small writeouts 1418 * catch-up. This avoids (excessively) small writeouts
1249 * when the bdi limits are ramping up. 1419 * when the bdi limits are ramping up in case of !strictlimit.
1420 *
1421 * In strictlimit case make decision based on the bdi counters
1422 * and limits. Small writeouts when the bdi limits are ramping
1423 * up are the price we consciously pay for strictlimit-ing.
1250 */ 1424 */
1251 freerun = dirty_freerun_ceiling(dirty_thresh, 1425 if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) {
1252 background_thresh);
1253 if (nr_dirty <= freerun) {
1254 current->dirty_paused_when = now; 1426 current->dirty_paused_when = now;
1255 current->nr_dirtied = 0; 1427 current->nr_dirtied = 0;
1256 current->nr_dirtied_pause = 1428 current->nr_dirtied_pause =
1257 dirty_poll_interval(nr_dirty, dirty_thresh); 1429 dirty_poll_interval(dirty, thresh);
1258 break; 1430 break;
1259 } 1431 }
1260 1432
1261 if (unlikely(!writeback_in_progress(bdi))) 1433 if (unlikely(!writeback_in_progress(bdi)))
1262 bdi_start_background_writeback(bdi); 1434 bdi_start_background_writeback(bdi);
1263 1435
1264 /* 1436 if (!strictlimit)
1265 * bdi_thresh is not treated as some limiting factor as 1437 bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
1266 * dirty_thresh, due to reasons 1438 &bdi_dirty, &bdi_thresh, NULL);
1267 * - in JBOD setup, bdi_thresh can fluctuate a lot
1268 * - in a system with HDD and USB key, the USB key may somehow
1269 * go into state (bdi_dirty >> bdi_thresh) either because
1270 * bdi_dirty starts high, or because bdi_thresh drops low.
1271 * In this case we don't want to hard throttle the USB key
1272 * dirtiers for 100 seconds until bdi_dirty drops under
1273 * bdi_thresh. Instead the auxiliary bdi control line in
1274 * bdi_position_ratio() will let the dirtier task progress
1275 * at some rate <= (write_bw / 2) for bringing down bdi_dirty.
1276 */
1277 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
1278
1279 /*
1280 * In order to avoid the stacked BDI deadlock we need
1281 * to ensure we accurately count the 'dirty' pages when
1282 * the threshold is low.
1283 *
1284 * Otherwise it would be possible to get thresh+n pages
1285 * reported dirty, even though there are thresh-m pages
1286 * actually dirty; with m+n sitting in the percpu
1287 * deltas.
1288 */
1289 if (bdi_thresh < 2 * bdi_stat_error(bdi)) {
1290 bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
1291 bdi_dirty = bdi_reclaimable +
1292 bdi_stat_sum(bdi, BDI_WRITEBACK);
1293 } else {
1294 bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
1295 bdi_dirty = bdi_reclaimable +
1296 bdi_stat(bdi, BDI_WRITEBACK);
1297 }
1298 1439
1299 dirty_exceeded = (bdi_dirty > bdi_thresh) && 1440 dirty_exceeded = (bdi_dirty > bdi_thresh) &&
1300 (nr_dirty > dirty_thresh); 1441 ((nr_dirty > dirty_thresh) || strictlimit);
1301 if (dirty_exceeded && !bdi->dirty_exceeded) 1442 if (dirty_exceeded && !bdi->dirty_exceeded)
1302 bdi->dirty_exceeded = 1; 1443 bdi->dirty_exceeded = 1;
1303 1444
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c2b59dbda196..0ee638f76ebe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -56,6 +56,7 @@
56#include <linux/ftrace_event.h> 56#include <linux/ftrace_event.h>
57#include <linux/memcontrol.h> 57#include <linux/memcontrol.h>
58#include <linux/prefetch.h> 58#include <linux/prefetch.h>
59#include <linux/mm_inline.h>
59#include <linux/migrate.h> 60#include <linux/migrate.h>
60#include <linux/page-debug-flags.h> 61#include <linux/page-debug-flags.h>
61#include <linux/hugetlb.h> 62#include <linux/hugetlb.h>
@@ -488,8 +489,10 @@ __find_buddy_index(unsigned long page_idx, unsigned int order)
488 * (c) a page and its buddy have the same order && 489 * (c) a page and its buddy have the same order &&
489 * (d) a page and its buddy are in the same zone. 490 * (d) a page and its buddy are in the same zone.
490 * 491 *
491 * For recording whether a page is in the buddy system, we set ->_mapcount -2. 492 * For recording whether a page is in the buddy system, we set ->_mapcount
492 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. 493 * PAGE_BUDDY_MAPCOUNT_VALUE.
494 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
495 * serialized by zone->lock.
493 * 496 *
494 * For recording page's order, we use page_private(page). 497 * For recording page's order, we use page_private(page).
495 */ 498 */
@@ -527,8 +530,9 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
527 * as necessary, plus some accounting needed to play nicely with other 530 * as necessary, plus some accounting needed to play nicely with other
528 * parts of the VM system. 531 * parts of the VM system.
529 * At each level, we keep a list of pages, which are heads of continuous 532 * At each level, we keep a list of pages, which are heads of continuous
530 * free pages of length of (1 << order) and marked with _mapcount -2. Page's 533 * free pages of length of (1 << order) and marked with _mapcount
531 * order is recorded in page_private(page) field. 534 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
535 * field.
532 * So when we are allocating or freeing one, we can derive the state of the 536 * So when we are allocating or freeing one, we can derive the state of the
533 * other. That is, if we allocate a small block, and both were 537 * other. That is, if we allocate a small block, and both were
534 * free, the remainder of the region must be split into blocks. 538 * free, the remainder of the region must be split into blocks.
@@ -647,7 +651,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
647 int to_free = count; 651 int to_free = count;
648 652
649 spin_lock(&zone->lock); 653 spin_lock(&zone->lock);
650 zone->all_unreclaimable = 0;
651 zone->pages_scanned = 0; 654 zone->pages_scanned = 0;
652 655
653 while (to_free) { 656 while (to_free) {
@@ -696,7 +699,6 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
696 int migratetype) 699 int migratetype)
697{ 700{
698 spin_lock(&zone->lock); 701 spin_lock(&zone->lock);
699 zone->all_unreclaimable = 0;
700 zone->pages_scanned = 0; 702 zone->pages_scanned = 0;
701 703
702 __free_one_page(page, zone, order, migratetype); 704 __free_one_page(page, zone, order, migratetype);
@@ -721,7 +723,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
721 return false; 723 return false;
722 724
723 if (!PageHighMem(page)) { 725 if (!PageHighMem(page)) {
724 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); 726 debug_check_no_locks_freed(page_address(page),
727 PAGE_SIZE << order);
725 debug_check_no_obj_freed(page_address(page), 728 debug_check_no_obj_freed(page_address(page),
726 PAGE_SIZE << order); 729 PAGE_SIZE << order);
727 } 730 }
@@ -750,19 +753,19 @@ static void __free_pages_ok(struct page *page, unsigned int order)
750void __init __free_pages_bootmem(struct page *page, unsigned int order) 753void __init __free_pages_bootmem(struct page *page, unsigned int order)
751{ 754{
752 unsigned int nr_pages = 1 << order; 755 unsigned int nr_pages = 1 << order;
756 struct page *p = page;
753 unsigned int loop; 757 unsigned int loop;
754 758
755 prefetchw(page); 759 prefetchw(p);
756 for (loop = 0; loop < nr_pages; loop++) { 760 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
757 struct page *p = &page[loop]; 761 prefetchw(p + 1);
758
759 if (loop + 1 < nr_pages)
760 prefetchw(p + 1);
761 __ClearPageReserved(p); 762 __ClearPageReserved(p);
762 set_page_count(p, 0); 763 set_page_count(p, 0);
763 } 764 }
765 __ClearPageReserved(p);
766 set_page_count(p, 0);
764 767
765 page_zone(page)->managed_pages += 1 << order; 768 page_zone(page)->managed_pages += nr_pages;
766 set_page_refcounted(page); 769 set_page_refcounted(page);
767 __free_pages(page, order); 770 __free_pages(page, order);
768} 771}
@@ -885,7 +888,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
885 int migratetype) 888 int migratetype)
886{ 889{
887 unsigned int current_order; 890 unsigned int current_order;
888 struct free_area * area; 891 struct free_area *area;
889 struct page *page; 892 struct page *page;
890 893
891 /* Find a page of the appropriate size in the preferred list */ 894 /* Find a page of the appropriate size in the preferred list */
@@ -1007,14 +1010,60 @@ static void change_pageblock_range(struct page *pageblock_page,
1007 } 1010 }
1008} 1011}
1009 1012
1013/*
1014 * If breaking a large block of pages, move all free pages to the preferred
1015 * allocation list. If falling back for a reclaimable kernel allocation, be
1016 * more aggressive about taking ownership of free pages.
1017 *
1018 * On the other hand, never change migration type of MIGRATE_CMA pageblocks
1019 * nor move CMA pages to different free lists. We don't want unmovable pages
1020 * to be allocated from MIGRATE_CMA areas.
1021 *
1022 * Returns the new migratetype of the pageblock (or the same old migratetype
1023 * if it was unchanged).
1024 */
1025static int try_to_steal_freepages(struct zone *zone, struct page *page,
1026 int start_type, int fallback_type)
1027{
1028 int current_order = page_order(page);
1029
1030 if (is_migrate_cma(fallback_type))
1031 return fallback_type;
1032
1033 /* Take ownership for orders >= pageblock_order */
1034 if (current_order >= pageblock_order) {
1035 change_pageblock_range(page, current_order, start_type);
1036 return start_type;
1037 }
1038
1039 if (current_order >= pageblock_order / 2 ||
1040 start_type == MIGRATE_RECLAIMABLE ||
1041 page_group_by_mobility_disabled) {
1042 int pages;
1043
1044 pages = move_freepages_block(zone, page, start_type);
1045
1046 /* Claim the whole block if over half of it is free */
1047 if (pages >= (1 << (pageblock_order-1)) ||
1048 page_group_by_mobility_disabled) {
1049
1050 set_pageblock_migratetype(page, start_type);
1051 return start_type;
1052 }
1053
1054 }
1055
1056 return fallback_type;
1057}
1058
1010/* Remove an element from the buddy allocator from the fallback list */ 1059/* Remove an element from the buddy allocator from the fallback list */
1011static inline struct page * 1060static inline struct page *
1012__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 1061__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1013{ 1062{
1014 struct free_area * area; 1063 struct free_area *area;
1015 int current_order; 1064 int current_order;
1016 struct page *page; 1065 struct page *page;
1017 int migratetype, i; 1066 int migratetype, new_type, i;
1018 1067
1019 /* Find the largest possible block of pages in the other list */ 1068 /* Find the largest possible block of pages in the other list */
1020 for (current_order = MAX_ORDER-1; current_order >= order; 1069 for (current_order = MAX_ORDER-1; current_order >= order;
@@ -1034,51 +1083,29 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1034 struct page, lru); 1083 struct page, lru);
1035 area->nr_free--; 1084 area->nr_free--;
1036 1085
1037 /* 1086 new_type = try_to_steal_freepages(zone, page,
1038 * If breaking a large block of pages, move all free 1087 start_migratetype,
1039 * pages to the preferred allocation list. If falling 1088 migratetype);
1040 * back for a reclaimable kernel allocation, be more
1041 * aggressive about taking ownership of free pages
1042 *
1043 * On the other hand, never change migration
1044 * type of MIGRATE_CMA pageblocks nor move CMA
1045 * pages on different free lists. We don't
1046 * want unmovable pages to be allocated from
1047 * MIGRATE_CMA areas.
1048 */
1049 if (!is_migrate_cma(migratetype) &&
1050 (current_order >= pageblock_order / 2 ||
1051 start_migratetype == MIGRATE_RECLAIMABLE ||
1052 page_group_by_mobility_disabled)) {
1053 int pages;
1054 pages = move_freepages_block(zone, page,
1055 start_migratetype);
1056
1057 /* Claim the whole block if over half of it is free */
1058 if (pages >= (1 << (pageblock_order-1)) ||
1059 page_group_by_mobility_disabled)
1060 set_pageblock_migratetype(page,
1061 start_migratetype);
1062
1063 migratetype = start_migratetype;
1064 }
1065 1089
1066 /* Remove the page from the freelists */ 1090 /* Remove the page from the freelists */
1067 list_del(&page->lru); 1091 list_del(&page->lru);
1068 rmv_page_order(page); 1092 rmv_page_order(page);
1069 1093
1070 /* Take ownership for orders >= pageblock_order */ 1094 /*
1071 if (current_order >= pageblock_order && 1095 * Borrow the excess buddy pages as well, irrespective
1072 !is_migrate_cma(migratetype)) 1096 * of whether we stole freepages, or took ownership of
1073 change_pageblock_range(page, current_order, 1097 * the pageblock or not.
1074 start_migratetype); 1098 *
1075 1099 * Exception: When borrowing from MIGRATE_CMA, release
1100 * the excess buddy pages to CMA itself.
1101 */
1076 expand(zone, page, order, current_order, area, 1102 expand(zone, page, order, current_order, area,
1077 is_migrate_cma(migratetype) 1103 is_migrate_cma(migratetype)
1078 ? migratetype : start_migratetype); 1104 ? migratetype : start_migratetype);
1079 1105
1080 trace_mm_page_alloc_extfrag(page, order, current_order, 1106 trace_mm_page_alloc_extfrag(page, order,
1081 start_migratetype, migratetype); 1107 current_order, start_migratetype, migratetype,
1108 new_type == start_migratetype);
1082 1109
1083 return page; 1110 return page;
1084 } 1111 }
@@ -1281,7 +1308,7 @@ void mark_free_pages(struct zone *zone)
1281 int order, t; 1308 int order, t;
1282 struct list_head *curr; 1309 struct list_head *curr;
1283 1310
1284 if (!zone->spanned_pages) 1311 if (zone_is_empty(zone))
1285 return; 1312 return;
1286 1313
1287 spin_lock_irqsave(&zone->lock, flags); 1314 spin_lock_irqsave(&zone->lock, flags);
@@ -1526,6 +1553,7 @@ again:
1526 get_pageblock_migratetype(page)); 1553 get_pageblock_migratetype(page));
1527 } 1554 }
1528 1555
1556 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1529 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1557 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1530 zone_statistics(preferred_zone, zone, gfp_flags); 1558 zone_statistics(preferred_zone, zone, gfp_flags);
1531 local_irq_restore(flags); 1559 local_irq_restore(flags);
@@ -1792,6 +1820,11 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
1792 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1820 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1793} 1821}
1794 1822
1823static bool zone_local(struct zone *local_zone, struct zone *zone)
1824{
1825 return node_distance(local_zone->node, zone->node) == LOCAL_DISTANCE;
1826}
1827
1795static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1828static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1796{ 1829{
1797 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); 1830 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
@@ -1829,6 +1862,11 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
1829{ 1862{
1830} 1863}
1831 1864
1865static bool zone_local(struct zone *local_zone, struct zone *zone)
1866{
1867 return true;
1868}
1869
1832static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1870static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1833{ 1871{
1834 return true; 1872 return true;
@@ -1860,16 +1898,41 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1860zonelist_scan: 1898zonelist_scan:
1861 /* 1899 /*
1862 * Scan zonelist, looking for a zone with enough free. 1900 * Scan zonelist, looking for a zone with enough free.
1863 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1901 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
1864 */ 1902 */
1865 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1903 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1866 high_zoneidx, nodemask) { 1904 high_zoneidx, nodemask) {
1905 unsigned long mark;
1906
1867 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1907 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1868 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1908 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1869 continue; 1909 continue;
1870 if ((alloc_flags & ALLOC_CPUSET) && 1910 if ((alloc_flags & ALLOC_CPUSET) &&
1871 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1911 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1872 continue; 1912 continue;
1913 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1914 if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS))
1915 goto try_this_zone;
1916 /*
1917 * Distribute pages in proportion to the individual
1918 * zone size to ensure fair page aging. The zone a
1919 * page was allocated in should have no effect on the
1920 * time the page has in memory before being reclaimed.
1921 *
1922 * When zone_reclaim_mode is enabled, try to stay in
1923 * local zones in the fastpath. If that fails, the
1924 * slowpath is entered, which will do another pass
1925 * starting with the local zones, but ultimately fall
1926 * back to remote zones that do not partake in the
1927 * fairness round-robin cycle of this zonelist.
1928 */
1929 if (alloc_flags & ALLOC_WMARK_LOW) {
1930 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
1931 continue;
1932 if (zone_reclaim_mode &&
1933 !zone_local(preferred_zone, zone))
1934 continue;
1935 }
1873 /* 1936 /*
1874 * When allocating a page cache page for writing, we 1937 * When allocating a page cache page for writing, we
1875 * want to get it from a zone that is within its dirty 1938 * want to get it from a zone that is within its dirty
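The new ALLOC_WMARK_LOW check above skips any zone whose NR_ALLOC_BATCH credit has been spent; the credit itself is charged 1 << order per allocation by the __mod_zone_page_state() hunk earlier in this file. A toy model of that behaviour, with made-up zone names and credit sizes:

/*
 * Toy model, not kernel code: walk the zonelist in order, skip any zone
 * whose batch credit is spent, and charge one credit per allocation.
 */
#include <stdio.h>

struct zone_model { const char *name; long batch; };

static int pick_zone(struct zone_model *z, int nr)
{
        int i;

        for (i = 0; i < nr; i++)
                if (z[i].batch > 0)     /* the ALLOC_WMARK_LOW fast-path check */
                        return i;
        return -1;                      /* every batch spent: fall to the slow path */
}

int main(void)
{
        struct zone_model zones[] = { { "Normal", 3 }, { "DMA32", 1 } };
        int i;

        while ((i = pick_zone(zones, 2)) >= 0) {
                printf("allocate from %s\n", zones[i].name);
                zones[i].batch--;       /* models NR_ALLOC_BATCH -= 1 << order */
        }
        printf("batches exhausted; prepare_slowpath() would refill them\n");
        return 0;
}

Because each zone's credit is replenished to high_wmark - low_wmark, which scales roughly with zone size, allocations are spread across zones in proportion to their size before the slow path refills the batches, which is the fair page aging the comment describes.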
@@ -1900,16 +1963,11 @@ zonelist_scan:
1900 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) 1963 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
1901 goto this_zone_full; 1964 goto this_zone_full;
1902 1965
1903 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1966 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1904 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1967 if (!zone_watermark_ok(zone, order, mark,
1905 unsigned long mark; 1968 classzone_idx, alloc_flags)) {
1906 int ret; 1969 int ret;
1907 1970
1908 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1909 if (zone_watermark_ok(zone, order, mark,
1910 classzone_idx, alloc_flags))
1911 goto try_this_zone;
1912
1913 if (IS_ENABLED(CONFIG_NUMA) && 1971 if (IS_ENABLED(CONFIG_NUMA) &&
1914 !did_zlc_setup && nr_online_nodes > 1) { 1972 !did_zlc_setup && nr_online_nodes > 1) {
1915 /* 1973 /*
@@ -2321,16 +2379,30 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2321 return page; 2379 return page;
2322} 2380}
2323 2381
2324static inline 2382static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
2325void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, 2383 struct zonelist *zonelist,
2326 enum zone_type high_zoneidx, 2384 enum zone_type high_zoneidx,
2327 enum zone_type classzone_idx) 2385 struct zone *preferred_zone)
2328{ 2386{
2329 struct zoneref *z; 2387 struct zoneref *z;
2330 struct zone *zone; 2388 struct zone *zone;
2331 2389
2332 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2390 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
2333 wakeup_kswapd(zone, order, classzone_idx); 2391 if (!(gfp_mask & __GFP_NO_KSWAPD))
2392 wakeup_kswapd(zone, order, zone_idx(preferred_zone));
2393 /*
2394 * Only reset the batches of zones that were actually
2395 * considered in the fast path, we don't want to
2396 * thrash fairness information for zones that are not
2397 * actually part of this zonelist's round-robin cycle.
2398 */
2399 if (zone_reclaim_mode && !zone_local(preferred_zone, zone))
2400 continue;
2401 mod_zone_page_state(zone, NR_ALLOC_BATCH,
2402 high_wmark_pages(zone) -
2403 low_wmark_pages(zone) -
2404 zone_page_state(zone, NR_ALLOC_BATCH));
2405 }
2334} 2406}
2335 2407
2336static inline int 2408static inline int
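prepare_slowpath() refills each considered zone's batch with a delta of high_wmark - low_wmark - current, so after the call NR_ALLOC_BATCH sits at exactly high_wmark - low_wmark no matter how far negative it had drifted; __setup_per_zone_wmarks() further down applies the same reset. A short illustration of the delta arithmetic, with invented numbers:

#include <stdio.h>

int main(void)
{
        long high_wmark = 1000, low_wmark = 750, batch = -37;   /* invented values */

        /* models mod_zone_page_state(zone, NR_ALLOC_BATCH, high - low - current) */
        batch += high_wmark - low_wmark - batch;

        printf("%ld\n", batch);         /* always high_wmark - low_wmark, here 250 */
        return 0;
}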
@@ -2426,9 +2498,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2426 goto nopage; 2498 goto nopage;
2427 2499
2428restart: 2500restart:
2429 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2501 prepare_slowpath(gfp_mask, order, zonelist,
2430 wake_all_kswapd(order, zonelist, high_zoneidx, 2502 high_zoneidx, preferred_zone);
2431 zone_idx(preferred_zone));
2432 2503
2433 /* 2504 /*
2434 * OK, we're below the kswapd watermark and have kicked background 2505 * OK, we're below the kswapd watermark and have kicked background
@@ -3095,7 +3166,7 @@ void show_free_areas(unsigned int filter)
3095 K(zone_page_state(zone, NR_FREE_CMA_PAGES)), 3166 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
3096 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 3167 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
3097 zone->pages_scanned, 3168 zone->pages_scanned,
3098 (zone->all_unreclaimable ? "yes" : "no") 3169 (!zone_reclaimable(zone) ? "yes" : "no")
3099 ); 3170 );
3100 printk("lowmem_reserve[]:"); 3171 printk("lowmem_reserve[]:");
3101 for (i = 0; i < MAX_NR_ZONES; i++) 3172 for (i = 0; i < MAX_NR_ZONES; i++)
@@ -3104,7 +3175,7 @@ void show_free_areas(unsigned int filter)
3104 } 3175 }
3105 3176
3106 for_each_populated_zone(zone) { 3177 for_each_populated_zone(zone) {
3107 unsigned long nr[MAX_ORDER], flags, order, total = 0; 3178 unsigned long nr[MAX_ORDER], flags, order, total = 0;
3108 unsigned char types[MAX_ORDER]; 3179 unsigned char types[MAX_ORDER];
3109 3180
3110 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3181 if (skip_free_areas_node(filter, zone_to_nid(zone)))
@@ -3416,11 +3487,11 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3416static int default_zonelist_order(void) 3487static int default_zonelist_order(void)
3417{ 3488{
3418 int nid, zone_type; 3489 int nid, zone_type;
3419 unsigned long low_kmem_size,total_size; 3490 unsigned long low_kmem_size, total_size;
3420 struct zone *z; 3491 struct zone *z;
3421 int average_size; 3492 int average_size;
3422 /* 3493 /*
3423 * ZONE_DMA and ZONE_DMA32 can be very small area in the system. 3494 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
3424 * If they are really small and used heavily, the system can fall 3495 * If they are really small and used heavily, the system can fall
3425 * into OOM very easily. 3496 * into OOM very easily.
3426 * This function detect ZONE_DMA/DMA32 size and configures zone order. 3497 * This function detect ZONE_DMA/DMA32 size and configures zone order.
@@ -3452,9 +3523,9 @@ static int default_zonelist_order(void)
3452 return ZONELIST_ORDER_NODE; 3523 return ZONELIST_ORDER_NODE;
3453 /* 3524 /*
3454 * look into each node's config. 3525 * look into each node's config.
3455 * If there is a node whose DMA/DMA32 memory is very big area on 3526 * If there is a node whose DMA/DMA32 memory is very big area on
3456 * local memory, NODE_ORDER may be suitable. 3527 * local memory, NODE_ORDER may be suitable.
3457 */ 3528 */
3458 average_size = total_size / 3529 average_size = total_size /
3459 (nodes_weight(node_states[N_MEMORY]) + 1); 3530 (nodes_weight(node_states[N_MEMORY]) + 1);
3460 for_each_online_node(nid) { 3531 for_each_online_node(nid) {
@@ -4180,7 +4251,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4180 if (!zone->wait_table) 4251 if (!zone->wait_table)
4181 return -ENOMEM; 4252 return -ENOMEM;
4182 4253
4183 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) 4254 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
4184 init_waitqueue_head(zone->wait_table + i); 4255 init_waitqueue_head(zone->wait_table + i);
4185 4256
4186 return 0; 4257 return 0;
@@ -4237,7 +4308,7 @@ int __meminit init_currently_empty_zone(struct zone *zone,
4237int __meminit __early_pfn_to_nid(unsigned long pfn) 4308int __meminit __early_pfn_to_nid(unsigned long pfn)
4238{ 4309{
4239 unsigned long start_pfn, end_pfn; 4310 unsigned long start_pfn, end_pfn;
4240 int i, nid; 4311 int nid;
4241 /* 4312 /*
4242 * NOTE: The following SMP-unsafe globals are only used early in boot 4313 * NOTE: The following SMP-unsafe globals are only used early in boot
4243 * when the kernel is running single-threaded. 4314 * when the kernel is running single-threaded.
@@ -4248,15 +4319,14 @@ int __meminit __early_pfn_to_nid(unsigned long pfn)
4248 if (last_start_pfn <= pfn && pfn < last_end_pfn) 4319 if (last_start_pfn <= pfn && pfn < last_end_pfn)
4249 return last_nid; 4320 return last_nid;
4250 4321
4251 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 4322 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
4252 if (start_pfn <= pfn && pfn < end_pfn) { 4323 if (nid != -1) {
4253 last_start_pfn = start_pfn; 4324 last_start_pfn = start_pfn;
4254 last_end_pfn = end_pfn; 4325 last_end_pfn = end_pfn;
4255 last_nid = nid; 4326 last_nid = nid;
4256 return nid; 4327 }
4257 } 4328
4258 /* This is a memory hole */ 4329 return nid;
4259 return -1;
4260} 4330}
4261#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 4331#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
4262 4332
@@ -4586,7 +4656,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
4586#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4656#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4587 4657
4588/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4658/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4589void __init set_pageblock_order(void) 4659void __paginginit set_pageblock_order(void)
4590{ 4660{
4591 unsigned int order; 4661 unsigned int order;
4592 4662
@@ -4614,7 +4684,7 @@ void __init set_pageblock_order(void)
4614 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4684 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4615 * the kernel config 4685 * the kernel config
4616 */ 4686 */
4617void __init set_pageblock_order(void) 4687void __paginginit set_pageblock_order(void)
4618{ 4688{
4619} 4689}
4620 4690
@@ -4728,8 +4798,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4728 spin_lock_init(&zone->lru_lock); 4798 spin_lock_init(&zone->lru_lock);
4729 zone_seqlock_init(zone); 4799 zone_seqlock_init(zone);
4730 zone->zone_pgdat = pgdat; 4800 zone->zone_pgdat = pgdat;
4731
4732 zone_pcp_init(zone); 4801 zone_pcp_init(zone);
4802
4803 /* For bootup, initialized properly in watermark setup */
4804 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
4805
4733 lruvec_init(&zone->lruvec); 4806 lruvec_init(&zone->lruvec);
4734 if (!size) 4807 if (!size)
4735 continue; 4808 continue;
@@ -4930,7 +5003,7 @@ static unsigned long __init early_calculate_totalpages(void)
4930 if (pages) 5003 if (pages)
4931 node_set_state(nid, N_MEMORY); 5004 node_set_state(nid, N_MEMORY);
4932 } 5005 }
4933 return totalpages; 5006 return totalpages;
4934} 5007}
4935 5008
4936/* 5009/*
@@ -5047,7 +5120,7 @@ restart:
5047 /* 5120 /*
5048 * Some kernelcore has been met, update counts and 5121 * Some kernelcore has been met, update counts and
5049 * break if the kernelcore for this node has been 5122 * break if the kernelcore for this node has been
5050 * satisified 5123 * satisfied
5051 */ 5124 */
5052 required_kernelcore -= min(required_kernelcore, 5125 required_kernelcore -= min(required_kernelcore,
5053 size_pages); 5126 size_pages);
@@ -5061,7 +5134,7 @@ restart:
5061 * If there is still required_kernelcore, we do another pass with one 5134 * If there is still required_kernelcore, we do another pass with one
5062 * less node in the count. This will push zone_movable_pfn[nid] further 5135 * less node in the count. This will push zone_movable_pfn[nid] further
5063 * along on the nodes that still have memory until kernelcore is 5136 * along on the nodes that still have memory until kernelcore is
5064 * satisified 5137 * satisfied
5065 */ 5138 */
5066 usable_nodes--; 5139 usable_nodes--;
5067 if (usable_nodes && required_kernelcore > usable_nodes) 5140 if (usable_nodes && required_kernelcore > usable_nodes)
@@ -5286,8 +5359,10 @@ void __init mem_init_print_info(const char *str)
5286 * 3) .rodata.* may be embedded into .text or .data sections. 5359 * 3) .rodata.* may be embedded into .text or .data sections.
5287 */ 5360 */
5288#define adj_init_size(start, end, size, pos, adj) \ 5361#define adj_init_size(start, end, size, pos, adj) \
5289 if (start <= pos && pos < end && size > adj) \ 5362 do { \
5290 size -= adj; 5363 if (start <= pos && pos < end && size > adj) \
5364 size -= adj; \
5365 } while (0)
5291 5366
5292 adj_init_size(__init_begin, __init_end, init_data_size, 5367 adj_init_size(__init_begin, __init_end, init_data_size,
5293 _sinittext, init_code_size); 5368 _sinittext, init_code_size);
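Wrapping adj_init_size() in do { ... } while (0) is the standard guard for macros whose body is an if statement: used bare, the hidden if can capture a following else at the call site. A contrived example of the hazard, with placeholder names:

/* contrived example; ADJ_BAD/ADJ_GOOD, cond, x and y are placeholders */
#define ADJ_BAD(v)  if ((v) > 0) (v)--
#define ADJ_GOOD(v) do { if ((v) > 0) (v)--; } while (0)

void demo(int cond, int *x, int *y)
{
        if (cond)
                ADJ_BAD(*x);
        else                    /* pairs with the macro's hidden "if", not "if (cond)" */
                (*y)++;

        if (cond)
                ADJ_GOOD(*x);
        else                    /* pairs with "if (cond)", as intended */
                (*y)++;
}

In the bad form the else silently binds to the macro's inner if, so the increment runs when cond is true and *x is non-positive; the do/while(0) form keeps the expansion a single statement and the else pairs with if (cond) as written.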
@@ -5361,7 +5436,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
5361 * This is only okay since the processor is dead and cannot 5436 * This is only okay since the processor is dead and cannot
5362 * race with what we are doing. 5437 * race with what we are doing.
5363 */ 5438 */
5364 refresh_cpu_vm_stats(cpu); 5439 cpu_vm_stats_fold(cpu);
5365 } 5440 }
5366 return NOTIFY_OK; 5441 return NOTIFY_OK;
5367} 5442}
@@ -5498,6 +5573,11 @@ static void __setup_per_zone_wmarks(void)
5498 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5573 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5499 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5574 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5500 5575
5576 __mod_zone_page_state(zone, NR_ALLOC_BATCH,
5577 high_wmark_pages(zone) -
5578 low_wmark_pages(zone) -
5579 zone_page_state(zone, NR_ALLOC_BATCH));
5580
5501 setup_zone_migrate_reserve(zone); 5581 setup_zone_migrate_reserve(zone);
5502 spin_unlock_irqrestore(&zone->lock, flags); 5582 spin_unlock_irqrestore(&zone->lock, flags);
5503 } 5583 }
@@ -5570,7 +5650,7 @@ static void __meminit setup_per_zone_inactive_ratio(void)
5570 * we want it large (64MB max). But it is not linear, because network 5650 * we want it large (64MB max). But it is not linear, because network
5571 * bandwidth does not increase linearly with machine size. We use 5651 * bandwidth does not increase linearly with machine size. We use
5572 * 5652 *
5573 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 5653 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
5574 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 5654 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
5575 * 5655 *
5576 * which yields 5656 * which yields
@@ -5614,11 +5694,11 @@ int __meminit init_per_zone_wmark_min(void)
5614module_init(init_per_zone_wmark_min) 5694module_init(init_per_zone_wmark_min)
5615 5695
5616/* 5696/*
5617 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 5697 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
5618 * that we can call two helper functions whenever min_free_kbytes 5698 * that we can call two helper functions whenever min_free_kbytes
5619 * changes. 5699 * changes.
5620 */ 5700 */
5621int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5701int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5622 void __user *buffer, size_t *length, loff_t *ppos) 5702 void __user *buffer, size_t *length, loff_t *ppos)
5623{ 5703{
5624 proc_dointvec(table, write, buffer, length, ppos); 5704 proc_dointvec(table, write, buffer, length, ppos);
@@ -5682,8 +5762,8 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5682 5762
5683/* 5763/*
5684 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 5764 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
5685 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist 5765 * cpu. It is the fraction of total pages in each zone that a hot per cpu
5686 * can have before it gets flushed back to buddy allocator. 5766 * pagelist can have before it gets flushed back to buddy allocator.
5687 */ 5767 */
5688int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5768int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5689 void __user *buffer, size_t *length, loff_t *ppos) 5769 void __user *buffer, size_t *length, loff_t *ppos)
@@ -5745,9 +5825,10 @@ void *__init alloc_large_system_hash(const char *tablename,
5745 if (!numentries) { 5825 if (!numentries) {
5746 /* round applicable memory size up to nearest megabyte */ 5826 /* round applicable memory size up to nearest megabyte */
5747 numentries = nr_kernel_pages; 5827 numentries = nr_kernel_pages;
5748 numentries += (1UL << (20 - PAGE_SHIFT)) - 1; 5828
5749 numentries >>= 20 - PAGE_SHIFT; 5829 /* It isn't necessary when PAGE_SIZE >= 1MB */
5750 numentries <<= 20 - PAGE_SHIFT; 5830 if (PAGE_SHIFT < 20)
5831 numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
5751 5832
5752 /* limit to 1 bucket per 2^scale bytes of low memory */ 5833 /* limit to 1 bucket per 2^scale bytes of low memory */
5753 if (scale > PAGE_SHIFT) 5834 if (scale > PAGE_SHIFT)
@@ -5900,7 +5981,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5900 * This function checks whether pageblock includes unmovable pages or not. 5981 * This function checks whether pageblock includes unmovable pages or not.
5901 * If @count is not zero, it is okay to include less @count unmovable pages 5982 * If @count is not zero, it is okay to include less @count unmovable pages
5902 * 5983 *
5903 * PageLRU check wihtout isolation or lru_lock could race so that 5984 * PageLRU check without isolation or lru_lock could race so that
5904 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't 5985 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
5905 * expect this function should be exact. 5986 * expect this function should be exact.
5906 */ 5987 */
@@ -5928,6 +6009,17 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
5928 continue; 6009 continue;
5929 6010
5930 page = pfn_to_page(check); 6011 page = pfn_to_page(check);
6012
6013 /*
6014 * Hugepages are not in LRU lists, but they're movable.
 6015 * We need not scan over tail pages because we don't
6016 * handle each tail page individually in migration.
6017 */
6018 if (PageHuge(page)) {
6019 iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
6020 continue;
6021 }
6022
5931 /* 6023 /*
5932 * We can't use page_count without pin a page 6024 * We can't use page_count without pin a page
5933 * because another CPU can free compound page. 6025 * because another CPU can free compound page.
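The round_up() in the new PageHuge case advances iter to the last base page of the hugepage, so the for-loop's own increment resumes the scan on the first page after it. A small check of that index arithmetic, assuming a 2MB hugepage (compound order 9) naturally aligned within the scanned block:

#include <stdio.h>

/* simplified power-of-two round_up(), mirroring the kernel macro's effect */
#define round_up(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
        unsigned long iter  = 700;      /* some page inside a hugepage spanning 512..1023 */
        unsigned long order = 9;        /* compound_order() of a 2MB hugepage, 4KiB base pages */

        iter = round_up(iter + 1, 1UL << order) - 1;
        printf("%lu\n", iter);          /* 1023: the loop's iter++ then resumes at 1024 */
        return 0;
}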
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 0cee10ffb98d..d1473b2e9481 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -6,6 +6,7 @@
6#include <linux/page-isolation.h> 6#include <linux/page-isolation.h>
7#include <linux/pageblock-flags.h> 7#include <linux/pageblock-flags.h>
8#include <linux/memory.h> 8#include <linux/memory.h>
9#include <linux/hugetlb.h>
9#include "internal.h" 10#include "internal.h"
10 11
11int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) 12int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
@@ -252,6 +253,19 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private,
252{ 253{
253 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; 254 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
254 255
256 /*
257 * TODO: allocate a destination hugepage from a nearest neighbor node,
 258 * in accordance with the memory policy of the user process if possible. For
259 * now as a simple work-around, we use the next node for destination.
260 */
261 if (PageHuge(page)) {
262 nodemask_t src = nodemask_of_node(page_to_nid(page));
263 nodemask_t dst;
264 nodes_complement(dst, src);
265 return alloc_huge_page_node(page_hstate(compound_head(page)),
266 next_node(page_to_nid(page), dst));
267 }
268
255 if (PageHighMem(page)) 269 if (PageHighMem(page))
256 gfp_mask |= __GFP_HIGHMEM; 270 gfp_mask |= __GFP_HIGHMEM;
257 271
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index e1a6e4fab016..3929a40bd6c0 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -10,6 +10,30 @@
10#include <asm/tlb.h> 10#include <asm/tlb.h>
11#include <asm-generic/pgtable.h> 11#include <asm-generic/pgtable.h>
12 12
13/*
14 * If a p?d_bad entry is found while walking page tables, report
15 * the error, before resetting entry to p?d_none. Usually (but
16 * very seldom) called out from the p?d_none_or_clear_bad macros.
17 */
18
19void pgd_clear_bad(pgd_t *pgd)
20{
21 pgd_ERROR(*pgd);
22 pgd_clear(pgd);
23}
24
25void pud_clear_bad(pud_t *pud)
26{
27 pud_ERROR(*pud);
28 pud_clear(pud);
29}
30
31void pmd_clear_bad(pmd_t *pmd)
32{
33 pmd_ERROR(*pmd);
34 pmd_clear(pmd);
35}
36
13#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS 37#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
14/* 38/*
15 * Only sets the access flags (dirty, accessed), as well as write 39 * Only sets the access flags (dirty, accessed), as well as write
diff --git a/mm/readahead.c b/mm/readahead.c
index 829a77c62834..e4ed04149785 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -371,10 +371,10 @@ static int try_context_readahead(struct address_space *mapping,
371 size = count_history_pages(mapping, ra, offset, max); 371 size = count_history_pages(mapping, ra, offset, max);
372 372
373 /* 373 /*
374 * no history pages: 374 * not enough history pages:
375 * it could be a random read 375 * it could be a random read
376 */ 376 */
377 if (!size) 377 if (size <= req_size)
378 return 0; 378 return 0;
379 379
380 /* 380 /*
@@ -385,8 +385,8 @@ static int try_context_readahead(struct address_space *mapping,
385 size *= 2; 385 size *= 2;
386 386
387 ra->start = offset; 387 ra->start = offset;
388 ra->size = get_init_ra_size(size + req_size, max); 388 ra->size = min(size + req_size, max);
389 ra->async_size = ra->size; 389 ra->async_size = 1;
390 390
391 return 1; 391 return 1;
392} 392}
diff --git a/mm/shmem.c b/mm/shmem.c
index 526149846d0a..8297623fcaed 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1205,7 +1205,7 @@ repeat:
1205 gfp & GFP_RECLAIM_MASK); 1205 gfp & GFP_RECLAIM_MASK);
1206 if (error) 1206 if (error)
1207 goto decused; 1207 goto decused;
1208 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); 1208 error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
1209 if (!error) { 1209 if (!error) {
1210 error = shmem_add_to_page_cache(page, mapping, index, 1210 error = shmem_add_to_page_cache(page, mapping, index,
1211 gfp, NULL); 1211 gfp, NULL);
@@ -2819,6 +2819,10 @@ int __init shmem_init(void)
2819{ 2819{
2820 int error; 2820 int error;
2821 2821
2822 /* If rootfs called this, don't re-init */
2823 if (shmem_inode_cachep)
2824 return 0;
2825
2822 error = bdi_init(&shmem_backing_dev_info); 2826 error = bdi_init(&shmem_backing_dev_info);
2823 if (error) 2827 if (error)
2824 goto out4; 2828 goto out4;
diff --git a/mm/slub.c b/mm/slub.c
index e3ba1f2cf60c..51df8272cfaf 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4420,7 +4420,7 @@ static ssize_t order_store(struct kmem_cache *s,
4420 unsigned long order; 4420 unsigned long order;
4421 int err; 4421 int err;
4422 4422
4423 err = strict_strtoul(buf, 10, &order); 4423 err = kstrtoul(buf, 10, &order);
4424 if (err) 4424 if (err)
4425 return err; 4425 return err;
4426 4426
@@ -4448,7 +4448,7 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
4448 unsigned long min; 4448 unsigned long min;
4449 int err; 4449 int err;
4450 4450
4451 err = strict_strtoul(buf, 10, &min); 4451 err = kstrtoul(buf, 10, &min);
4452 if (err) 4452 if (err)
4453 return err; 4453 return err;
4454 4454
@@ -4468,7 +4468,7 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
4468 unsigned long objects; 4468 unsigned long objects;
4469 int err; 4469 int err;
4470 4470
4471 err = strict_strtoul(buf, 10, &objects); 4471 err = kstrtoul(buf, 10, &objects);
4472 if (err) 4472 if (err)
4473 return err; 4473 return err;
4474 if (objects && !kmem_cache_has_cpu_partial(s)) 4474 if (objects && !kmem_cache_has_cpu_partial(s))
@@ -4784,7 +4784,7 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
4784 unsigned long ratio; 4784 unsigned long ratio;
4785 int err; 4785 int err;
4786 4786
4787 err = strict_strtoul(buf, 10, &ratio); 4787 err = kstrtoul(buf, 10, &ratio);
4788 if (err) 4788 if (err)
4789 return err; 4789 return err;
4790 4790
diff --git a/mm/sparse.c b/mm/sparse.c
index 308d50331bc3..4ac1d7ef548f 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -339,13 +339,14 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
339} 339}
340#endif /* CONFIG_MEMORY_HOTREMOVE */ 340#endif /* CONFIG_MEMORY_HOTREMOVE */
341 341
342static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, 342static void __init sparse_early_usemaps_alloc_node(void *data,
343 unsigned long pnum_begin, 343 unsigned long pnum_begin,
344 unsigned long pnum_end, 344 unsigned long pnum_end,
345 unsigned long usemap_count, int nodeid) 345 unsigned long usemap_count, int nodeid)
346{ 346{
347 void *usemap; 347 void *usemap;
348 unsigned long pnum; 348 unsigned long pnum;
349 unsigned long **usemap_map = (unsigned long **)data;
349 int size = usemap_size(); 350 int size = usemap_size();
350 351
351 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), 352 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
@@ -430,11 +431,12 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
430#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 431#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
431 432
432#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER 433#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
433static void __init sparse_early_mem_maps_alloc_node(struct page **map_map, 434static void __init sparse_early_mem_maps_alloc_node(void *data,
434 unsigned long pnum_begin, 435 unsigned long pnum_begin,
435 unsigned long pnum_end, 436 unsigned long pnum_end,
436 unsigned long map_count, int nodeid) 437 unsigned long map_count, int nodeid)
437{ 438{
439 struct page **map_map = (struct page **)data;
438 sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, 440 sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end,
439 map_count, nodeid); 441 map_count, nodeid);
440} 442}
@@ -460,6 +462,55 @@ void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
460{ 462{
461} 463}
462 464
465/**
 466 * alloc_usemap_and_memmap - memory allocation for pageblock flags and vmemmap
 467 * @map: usemap_map for pageblock flags or map_map for vmemmap
468 */
469static void __init alloc_usemap_and_memmap(void (*alloc_func)
470 (void *, unsigned long, unsigned long,
471 unsigned long, int), void *data)
472{
473 unsigned long pnum;
474 unsigned long map_count;
475 int nodeid_begin = 0;
476 unsigned long pnum_begin = 0;
477
478 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
479 struct mem_section *ms;
480
481 if (!present_section_nr(pnum))
482 continue;
483 ms = __nr_to_section(pnum);
484 nodeid_begin = sparse_early_nid(ms);
485 pnum_begin = pnum;
486 break;
487 }
488 map_count = 1;
489 for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
490 struct mem_section *ms;
491 int nodeid;
492
493 if (!present_section_nr(pnum))
494 continue;
495 ms = __nr_to_section(pnum);
496 nodeid = sparse_early_nid(ms);
497 if (nodeid == nodeid_begin) {
498 map_count++;
499 continue;
500 }
 501 /* ok, we need to take care of the range from pnum_begin to pnum - 1 */
502 alloc_func(data, pnum_begin, pnum,
503 map_count, nodeid_begin);
504 /* new start, update count etc*/
505 nodeid_begin = nodeid;
506 pnum_begin = pnum;
507 map_count = 1;
508 }
509 /* ok, last chunk */
510 alloc_func(data, pnum_begin, NR_MEM_SECTIONS,
511 map_count, nodeid_begin);
512}
513
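alloc_usemap_and_memmap() walks the present sections once and fires the callback once per run of consecutive sections on the same node, which is the loop that used to be duplicated for usemaps and mem_maps below. A standalone sketch of the grouping over a made-up layout in which every section is present:

#include <stdio.h>

/* invented node ids for five sections, all assumed present */
static const int section_nid[] = { 0, 0, 1, 1, 1 };
#define NR_SECTIONS 5

static void alloc_func(int pnum_begin, int pnum_end, int count, int nid)
{
        printf("sections [%d..%d): %d allocations on node %d\n",
               pnum_begin, pnum_end, count, nid);
}

int main(void)
{
        int nid_begin = section_nid[0], pnum_begin = 0, count = 1, pnum;

        for (pnum = 1; pnum < NR_SECTIONS; pnum++) {
                if (section_nid[pnum] == nid_begin) {
                        count++;
                        continue;
                }
                alloc_func(pnum_begin, pnum, count, nid_begin); /* flush finished run */
                nid_begin  = section_nid[pnum];
                pnum_begin = pnum;
                count      = 1;
        }
        alloc_func(pnum_begin, NR_SECTIONS, count, nid_begin);  /* last run */
        return 0;
}

With sections on nodes 0,0,1,1,1 this prints one call covering sections [0..2) on node 0 and one covering [2..5) on node 1, mirroring how the kernel helper batches the per-node bootmem allocations.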
463/* 514/*
464 * Allocate the accumulated non-linear sections, allocate a mem_map 515 * Allocate the accumulated non-linear sections, allocate a mem_map
465 * for each and record the physical to section mapping. 516 * for each and record the physical to section mapping.
@@ -471,11 +522,7 @@ void __init sparse_init(void)
471 unsigned long *usemap; 522 unsigned long *usemap;
472 unsigned long **usemap_map; 523 unsigned long **usemap_map;
473 int size; 524 int size;
474 int nodeid_begin = 0;
475 unsigned long pnum_begin = 0;
476 unsigned long usemap_count;
477#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER 525#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
478 unsigned long map_count;
479 int size2; 526 int size2;
480 struct page **map_map; 527 struct page **map_map;
481#endif 528#endif
@@ -501,82 +548,16 @@ void __init sparse_init(void)
501 usemap_map = alloc_bootmem(size); 548 usemap_map = alloc_bootmem(size);
502 if (!usemap_map) 549 if (!usemap_map)
503 panic("can not allocate usemap_map\n"); 550 panic("can not allocate usemap_map\n");
504 551 alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node,
505 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 552 (void *)usemap_map);
506 struct mem_section *ms;
507
508 if (!present_section_nr(pnum))
509 continue;
510 ms = __nr_to_section(pnum);
511 nodeid_begin = sparse_early_nid(ms);
512 pnum_begin = pnum;
513 break;
514 }
515 usemap_count = 1;
516 for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
517 struct mem_section *ms;
518 int nodeid;
519
520 if (!present_section_nr(pnum))
521 continue;
522 ms = __nr_to_section(pnum);
523 nodeid = sparse_early_nid(ms);
524 if (nodeid == nodeid_begin) {
525 usemap_count++;
526 continue;
527 }
528 /* ok, we need to take cake of from pnum_begin to pnum - 1*/
529 sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum,
530 usemap_count, nodeid_begin);
531 /* new start, update count etc*/
532 nodeid_begin = nodeid;
533 pnum_begin = pnum;
534 usemap_count = 1;
535 }
536 /* ok, last chunk */
537 sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS,
538 usemap_count, nodeid_begin);
539 553
540#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER 554#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
541 size2 = sizeof(struct page *) * NR_MEM_SECTIONS; 555 size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
542 map_map = alloc_bootmem(size2); 556 map_map = alloc_bootmem(size2);
543 if (!map_map) 557 if (!map_map)
544 panic("can not allocate map_map\n"); 558 panic("can not allocate map_map\n");
545 559 alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node,
546 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 560 (void *)map_map);
547 struct mem_section *ms;
548
549 if (!present_section_nr(pnum))
550 continue;
551 ms = __nr_to_section(pnum);
552 nodeid_begin = sparse_early_nid(ms);
553 pnum_begin = pnum;
554 break;
555 }
556 map_count = 1;
557 for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
558 struct mem_section *ms;
559 int nodeid;
560
561 if (!present_section_nr(pnum))
562 continue;
563 ms = __nr_to_section(pnum);
564 nodeid = sparse_early_nid(ms);
565 if (nodeid == nodeid_begin) {
566 map_count++;
567 continue;
568 }
569		/* ok, we need to take care of pnum_begin to pnum - 1 */
570 sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum,
571 map_count, nodeid_begin);
572 /* new start, update count etc*/
573 nodeid_begin = nodeid;
574 pnum_begin = pnum;
575 map_count = 1;
576 }
577 /* ok, last chunk */
578 sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS,
579 map_count, nodeid_begin);
580#endif 561#endif
581 562
582 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 563 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
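The two hand-rolled loops that grouped present memory sections by node are folded into alloc_usemap_and_memmap(), which walks the present sections once and invokes a callback for every node-contiguous run; the usemap and mem_map paths now differ only in the callback they pass. A standalone sketch of that grouping walk (plain userspace C; the section/node data are made up and the single-loop structure is a simplification of the kernel's two-loop version):

#include <stdio.h>

#define NR_SECTIONS 10

/* -1 models a section that is not present */
static const int section_nid[NR_SECTIONS] = { 0, 0, 0, 1, 1, -1, 1, 2, 2, 2 };

static int present(int s) { return section_nid[s] >= 0; }

typedef void (*alloc_func_t)(void *data, int pnum_begin, int pnum_end,
			     int map_count, int nid);

static void walk_present_sections(alloc_func_t alloc_func, void *data)
{
	int pnum, pnum_begin = 0, nid_begin = -1, map_count = 0;

	for (pnum = 0; pnum < NR_SECTIONS; pnum++) {
		if (!present(pnum))
			continue;
		if (!map_count) {			/* first present section */
			nid_begin = section_nid[pnum];
			pnum_begin = pnum;
			map_count = 1;
		} else if (section_nid[pnum] == nid_begin) {
			map_count++;
		} else {				/* node changed: flush the run */
			alloc_func(data, pnum_begin, pnum, map_count, nid_begin);
			nid_begin = section_nid[pnum];
			pnum_begin = pnum;
			map_count = 1;
		}
	}
	if (map_count)					/* last run */
		alloc_func(data, pnum_begin, NR_SECTIONS, map_count, nid_begin);
}

static void print_run(void *data, int begin, int end, int count, int nid)
{
	printf("node %d: sections [%d, %d), %d present\n", nid, begin, end, count);
}

int main(void)
{
	walk_present_sections(print_run, NULL);
	return 0;
}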
diff --git a/mm/swap.c b/mm/swap.c
index 62b78a6e224f..c899502d3e36 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,6 +31,7 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/uio.h> 33#include <linux/uio.h>
34#include <linux/hugetlb.h>
34 35
35#include "internal.h" 36#include "internal.h"
36 37
@@ -81,6 +82,19 @@ static void __put_compound_page(struct page *page)
81 82
82static void put_compound_page(struct page *page) 83static void put_compound_page(struct page *page)
83{ 84{
85 /*
86 * hugetlbfs pages cannot be split from under us. If this is a
87 * hugetlbfs page, check refcount on head page and release the page if
88 * the refcount becomes zero.
89 */
90 if (PageHuge(page)) {
91 page = compound_head(page);
92 if (put_page_testzero(page))
93 __put_compound_page(page);
94
95 return;
96 }
97
84 if (unlikely(PageTail(page))) { 98 if (unlikely(PageTail(page))) {
85 /* __split_huge_page_refcount can run under us */ 99 /* __split_huge_page_refcount can run under us */
86 struct page *page_head = compound_trans_head(page); 100 struct page *page_head = compound_trans_head(page);
@@ -184,38 +198,51 @@ bool __get_page_tail(struct page *page)
184 * proper PT lock that already serializes against 198 * proper PT lock that already serializes against
185 * split_huge_page(). 199 * split_huge_page().
186 */ 200 */
187 unsigned long flags;
188 bool got = false; 201 bool got = false;
189 struct page *page_head = compound_trans_head(page); 202 struct page *page_head;
190 203
191 if (likely(page != page_head && get_page_unless_zero(page_head))) { 204 /*
205 * If this is a hugetlbfs page it cannot be split under us. Simply
206 * increment refcount for the head page.
207 */
208 if (PageHuge(page)) {
209 page_head = compound_head(page);
210 atomic_inc(&page_head->_count);
211 got = true;
212 } else {
213 unsigned long flags;
214
215 page_head = compound_trans_head(page);
216 if (likely(page != page_head &&
217 get_page_unless_zero(page_head))) {
218
219 /* Ref to put_compound_page() comment. */
220 if (PageSlab(page_head)) {
221 if (likely(PageTail(page))) {
222 __get_page_tail_foll(page, false);
223 return true;
224 } else {
225 put_page(page_head);
226 return false;
227 }
228 }
192 229
193 /* Ref to put_compound_page() comment. */ 230 /*
194 if (PageSlab(page_head)) { 231 * page_head wasn't a dangling pointer but it
232 * may not be a head page anymore by the time
233 * we obtain the lock. That is ok as long as it
234 * can't be freed from under us.
235 */
236 flags = compound_lock_irqsave(page_head);
237 /* here __split_huge_page_refcount won't run anymore */
195 if (likely(PageTail(page))) { 238 if (likely(PageTail(page))) {
196 __get_page_tail_foll(page, false); 239 __get_page_tail_foll(page, false);
197 return true; 240 got = true;
198 } else {
199 put_page(page_head);
200 return false;
201 } 241 }
242 compound_unlock_irqrestore(page_head, flags);
243 if (unlikely(!got))
244 put_page(page_head);
202 } 245 }
203
204 /*
205 * page_head wasn't a dangling pointer but it
206 * may not be a head page anymore by the time
207 * we obtain the lock. That is ok as long as it
208 * can't be freed from under us.
209 */
210 flags = compound_lock_irqsave(page_head);
211 /* here __split_huge_page_refcount won't run anymore */
212 if (likely(PageTail(page))) {
213 __get_page_tail_foll(page, false);
214 got = true;
215 }
216 compound_unlock_irqrestore(page_head, flags);
217 if (unlikely(!got))
218 put_page(page_head);
219 } 246 }
220 return got; 247 return got;
221} 248}
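Both put_compound_page() and __get_page_tail() gain a hugetlbfs fast path: a hugetlb compound page can never be split by __split_huge_page_refcount(), so the tail-page reference can simply be taken (or dropped) on the head page, skipping the compound_lock dance that transparent-huge-page tails still need. A minimal single-threaded model of that branch (userspace C with invented struct fields; no real locking or atomics):

#include <stdbool.h>
#include <stdio.h>

struct page {
	bool huge;		/* models PageHuge() */
	bool tail;		/* models PageTail() */
	struct page *head;	/* models compound_head() */
	int count;		/* models page->_count */
};

static bool get_page_tail(struct page *page)
{
	if (page->huge) {		/* hugetlbfs: cannot be split under us */
		page->head->count++;
		return true;
	}
	/*
	 * THP tail: in the kernel this needs get_page_unless_zero() on the
	 * head plus compound_lock_irqsave() to serialize against
	 * __split_huge_page_refcount(); modelled here as a plain recheck.
	 */
	if (page->head->count > 0 && page->tail) {
		page->head->count++;
		return true;
	}
	return false;
}

int main(void)
{
	struct page head = { .huge = true, .count = 1 };
	struct page tail = { .huge = true, .tail = true, .head = &head };

	printf("got=%d head count=%d\n", get_page_tail(&tail), head.count);
	return 0;
}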
diff --git a/mm/swap_state.c b/mm/swap_state.c
index f24ab0dff554..e6f15f8ca2af 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -122,7 +122,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
122{ 122{
123 int error; 123 int error;
124 124
125 error = radix_tree_preload(gfp_mask); 125 error = radix_tree_maybe_preload(gfp_mask);
126 if (!error) { 126 if (!error) {
127 error = __add_to_swap_cache(page, entry); 127 error = __add_to_swap_cache(page, entry);
128 radix_tree_preload_end(); 128 radix_tree_preload_end();
@@ -328,7 +328,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
328 /* 328 /*
329 * call radix_tree_preload() while we can wait. 329 * call radix_tree_preload() while we can wait.
330 */ 330 */
331 err = radix_tree_preload(gfp_mask & GFP_KERNEL); 331 err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL);
332 if (err) 332 if (err)
333 break; 333 break;
334 334
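add_to_swap_cache() and read_swap_cache_async() switch to radix_tree_maybe_preload(), which preloads per-CPU radix-tree nodes only when the gfp mask allows sleeping and lets atomic callers rely on allocation at insert time instead. A self-contained userspace sketch of that "maybe preload" idea (all names below are illustrative, not the radix-tree API):

#include <stdbool.h>
#include <stdlib.h>

struct node_cache { void *spare; };

static int preload(struct node_cache *c)
{
	c->spare = malloc(64);		/* may "sleep" in a real allocator */
	return c->spare ? 0 : -1;
}

static int maybe_preload(struct node_cache *c, bool may_sleep)
{
	if (may_sleep)
		return preload(c);
	/* Preloading is only an optimisation; atomic callers skip it. */
	return 0;
}

int main(void)
{
	struct node_cache c = { 0 };
	int ret = maybe_preload(&c, true);

	free(c.spare);
	return ret;
}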
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6cf2e60983b7..3963fc24fcc1 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -175,14 +175,296 @@ static void discard_swap_cluster(struct swap_info_struct *si,
175 } 175 }
176} 176}
177 177
178static int wait_for_discard(void *word) 178#define SWAPFILE_CLUSTER 256
179#define LATENCY_LIMIT 256
180
181static inline void cluster_set_flag(struct swap_cluster_info *info,
182 unsigned int flag)
179{ 183{
180 schedule(); 184 info->flags = flag;
181 return 0;
182} 185}
183 186
184#define SWAPFILE_CLUSTER 256 187static inline unsigned int cluster_count(struct swap_cluster_info *info)
185#define LATENCY_LIMIT 256 188{
189 return info->data;
190}
191
192static inline void cluster_set_count(struct swap_cluster_info *info,
193 unsigned int c)
194{
195 info->data = c;
196}
197
198static inline void cluster_set_count_flag(struct swap_cluster_info *info,
199 unsigned int c, unsigned int f)
200{
201 info->flags = f;
202 info->data = c;
203}
204
205static inline unsigned int cluster_next(struct swap_cluster_info *info)
206{
207 return info->data;
208}
209
210static inline void cluster_set_next(struct swap_cluster_info *info,
211 unsigned int n)
212{
213 info->data = n;
214}
215
216static inline void cluster_set_next_flag(struct swap_cluster_info *info,
217 unsigned int n, unsigned int f)
218{
219 info->flags = f;
220 info->data = n;
221}
222
223static inline bool cluster_is_free(struct swap_cluster_info *info)
224{
225 return info->flags & CLUSTER_FLAG_FREE;
226}
227
228static inline bool cluster_is_null(struct swap_cluster_info *info)
229{
230 return info->flags & CLUSTER_FLAG_NEXT_NULL;
231}
232
233static inline void cluster_set_null(struct swap_cluster_info *info)
234{
235 info->flags = CLUSTER_FLAG_NEXT_NULL;
236 info->data = 0;
237}
238
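The accessors above encode two things in one swap_cluster_info: data is a usage count while the cluster is allocated, and the index of the next cluster while it sits on the free or discard list, with head/tail anchor entries embedded in swap_info_struct. A compilable model of that encoding plus the tail-append step the patch repeats for both lists (the field widths and the helper name are my assumptions, not copied from the patch):

#include <stdbool.h>
#include <stdio.h>

#define CLUSTER_FLAG_NEXT_NULL	2

struct cluster_info {
	unsigned int flags;
	unsigned int data;	/* usage count, or next index when on a list */
};

static bool cluster_is_null(const struct cluster_info *ci)
{
	return ci->flags & CLUSTER_FLAG_NEXT_NULL;
}

static unsigned int cluster_next(const struct cluster_info *ci)
{
	return ci->data;
}

static void cluster_set_next(struct cluster_info *ci, unsigned int n)
{
	ci->data = n;
}

static void cluster_set_next_flag(struct cluster_info *ci, unsigned int n,
				  unsigned int f)
{
	ci->flags = f;
	ci->data = n;
}

/* Append cluster idx to a list kept as head/tail anchors, the way the patch
 * does for both the free and the discard list (helper name is mine). */
static void cluster_list_add_tail(struct cluster_info *head,
				  struct cluster_info *tail,
				  struct cluster_info *info, unsigned int idx)
{
	if (cluster_is_null(head)) {
		cluster_set_next_flag(head, idx, 0);	/* list was empty */
		cluster_set_next_flag(tail, idx, 0);
	} else {
		unsigned int t = cluster_next(tail);

		cluster_set_next(&info[t], idx);	/* old tail -> idx */
		cluster_set_next_flag(tail, idx, 0);
	}
}

int main(void)
{
	struct cluster_info info[4] = { { 0, 0 } };
	struct cluster_info head = { CLUSTER_FLAG_NEXT_NULL, 0 };
	struct cluster_info tail = { CLUSTER_FLAG_NEXT_NULL, 0 };

	cluster_list_add_tail(&head, &tail, info, 2);
	cluster_list_add_tail(&head, &tail, info, 0);
	printf("head -> %u, then -> %u\n", cluster_next(&head),
	       cluster_next(&info[cluster_next(&head)]));
	return 0;
}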
239/* Add a cluster to discard list and schedule it to do discard */
240static void swap_cluster_schedule_discard(struct swap_info_struct *si,
241 unsigned int idx)
242{
243 /*
244 * If scan_swap_map() can't find a free cluster, it will check
245 * si->swap_map directly. To make sure the discarding cluster isn't
246 * taken by scan_swap_map(), mark the swap entries bad (occupied). They
247 * will be cleared after the discard finishes.
248 */
249 memset(si->swap_map + idx * SWAPFILE_CLUSTER,
250 SWAP_MAP_BAD, SWAPFILE_CLUSTER);
251
252 if (cluster_is_null(&si->discard_cluster_head)) {
253 cluster_set_next_flag(&si->discard_cluster_head,
254 idx, 0);
255 cluster_set_next_flag(&si->discard_cluster_tail,
256 idx, 0);
257 } else {
258 unsigned int tail = cluster_next(&si->discard_cluster_tail);
259 cluster_set_next(&si->cluster_info[tail], idx);
260 cluster_set_next_flag(&si->discard_cluster_tail,
261 idx, 0);
262 }
263
264 schedule_work(&si->discard_work);
265}
266
267/*
268 * Actually do the discard. After a cluster discard is finished, the cluster
269 * will be added to the free cluster list. The caller should hold si->lock.
270 */
271static void swap_do_scheduled_discard(struct swap_info_struct *si)
272{
273 struct swap_cluster_info *info;
274 unsigned int idx;
275
276 info = si->cluster_info;
277
278 while (!cluster_is_null(&si->discard_cluster_head)) {
279 idx = cluster_next(&si->discard_cluster_head);
280
281 cluster_set_next_flag(&si->discard_cluster_head,
282 cluster_next(&info[idx]), 0);
283 if (cluster_next(&si->discard_cluster_tail) == idx) {
284 cluster_set_null(&si->discard_cluster_head);
285 cluster_set_null(&si->discard_cluster_tail);
286 }
287 spin_unlock(&si->lock);
288
289 discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
290 SWAPFILE_CLUSTER);
291
292 spin_lock(&si->lock);
293 cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE);
294 if (cluster_is_null(&si->free_cluster_head)) {
295 cluster_set_next_flag(&si->free_cluster_head,
296 idx, 0);
297 cluster_set_next_flag(&si->free_cluster_tail,
298 idx, 0);
299 } else {
300 unsigned int tail;
301
302 tail = cluster_next(&si->free_cluster_tail);
303 cluster_set_next(&info[tail], idx);
304 cluster_set_next_flag(&si->free_cluster_tail,
305 idx, 0);
306 }
307 memset(si->swap_map + idx * SWAPFILE_CLUSTER,
308 0, SWAPFILE_CLUSTER);
309 }
310}
311
312static void swap_discard_work(struct work_struct *work)
313{
314 struct swap_info_struct *si;
315
316 si = container_of(work, struct swap_info_struct, discard_work);
317
318 spin_lock(&si->lock);
319 swap_do_scheduled_discard(si);
320 spin_unlock(&si->lock);
321}
322
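Taken together, the discard path works like this: when a discardable cluster drains, its swap_map slots are poisoned with SWAP_MAP_BAD so scan_swap_map() cannot hand them out, the cluster is queued on the discard list, and the workqueue later issues the device discard with si->lock dropped before moving it to the free list and clearing the slots. A tiny userspace model of that poison/queue/clear handshake (cluster size and values are invented):

#include <string.h>
#include <stdio.h>

#define CLUSTER_SIZE	4	/* SWAPFILE_CLUSTER is 256 in the patch */
#define SWAP_MAP_BAD	0x3f

static unsigned char swap_map[CLUSTER_SIZE];

static void schedule_discard(void)
{
	/* occupy the slots so concurrent scans cannot hand them out */
	memset(swap_map, SWAP_MAP_BAD, CLUSTER_SIZE);
}

static void do_scheduled_discard(void)
{
	/* the device TRIM would happen here, with the lock dropped */
	memset(swap_map, 0, CLUSTER_SIZE);	/* cluster is usable again */
}

int main(void)
{
	schedule_discard();
	printf("during discard: slot 0 = %#x\n", swap_map[0]);
	do_scheduled_discard();
	printf("after discard:  slot 0 = %#x\n", swap_map[0]);
	return 0;
}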
323/*
324 * The cluster corresponding to page_nr will be used. The cluster will be
325 * removed from free cluster list and its usage counter will be increased.
326 */
327static void inc_cluster_info_page(struct swap_info_struct *p,
328 struct swap_cluster_info *cluster_info, unsigned long page_nr)
329{
330 unsigned long idx = page_nr / SWAPFILE_CLUSTER;
331
332 if (!cluster_info)
333 return;
334 if (cluster_is_free(&cluster_info[idx])) {
335 VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx);
336 cluster_set_next_flag(&p->free_cluster_head,
337 cluster_next(&cluster_info[idx]), 0);
338 if (cluster_next(&p->free_cluster_tail) == idx) {
339 cluster_set_null(&p->free_cluster_tail);
340 cluster_set_null(&p->free_cluster_head);
341 }
342 cluster_set_count_flag(&cluster_info[idx], 0, 0);
343 }
344
345 VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
346 cluster_set_count(&cluster_info[idx],
347 cluster_count(&cluster_info[idx]) + 1);
348}
349
350/*
351 * The cluster corresponding to page_nr decreases one usage. If the usage
352 * counter becomes 0, which means no page in the cluster is in use, we can
353 * optionally discard the cluster and add it to free cluster list.
354 */
355static void dec_cluster_info_page(struct swap_info_struct *p,
356 struct swap_cluster_info *cluster_info, unsigned long page_nr)
357{
358 unsigned long idx = page_nr / SWAPFILE_CLUSTER;
359
360 if (!cluster_info)
361 return;
362
363 VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
364 cluster_set_count(&cluster_info[idx],
365 cluster_count(&cluster_info[idx]) - 1);
366
367 if (cluster_count(&cluster_info[idx]) == 0) {
368 /*
369 * If the swap is discardable, prepare to discard the cluster
370 * instead of freeing it immediately. The cluster will be freed
371 * after discard.
372 */
373 if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
374 (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
375 swap_cluster_schedule_discard(p, idx);
376 return;
377 }
378
379 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
380 if (cluster_is_null(&p->free_cluster_head)) {
381 cluster_set_next_flag(&p->free_cluster_head, idx, 0);
382 cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
383 } else {
384 unsigned int tail = cluster_next(&p->free_cluster_tail);
385 cluster_set_next(&cluster_info[tail], idx);
386 cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
387 }
388 }
389}
390
391/*
392 * It's possible for scan_swap_map() to use a free cluster in the middle of the
393 * free cluster list. Avoid such abuse to prevent list corruption.
394 */
395static bool
396scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
397 unsigned long offset)
398{
399 struct percpu_cluster *percpu_cluster;
400 bool conflict;
401
402 offset /= SWAPFILE_CLUSTER;
403 conflict = !cluster_is_null(&si->free_cluster_head) &&
404 offset != cluster_next(&si->free_cluster_head) &&
405 cluster_is_free(&si->cluster_info[offset]);
406
407 if (!conflict)
408 return false;
409
410 percpu_cluster = this_cpu_ptr(si->percpu_cluster);
411 cluster_set_null(&percpu_cluster->index);
412 return true;
413}
414
415/*
416 * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
417 * might involve allocating a new cluster for current CPU too.
418 */
419static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
420 unsigned long *offset, unsigned long *scan_base)
421{
422 struct percpu_cluster *cluster;
423 bool found_free;
424 unsigned long tmp;
425
426new_cluster:
427 cluster = this_cpu_ptr(si->percpu_cluster);
428 if (cluster_is_null(&cluster->index)) {
429 if (!cluster_is_null(&si->free_cluster_head)) {
430 cluster->index = si->free_cluster_head;
431 cluster->next = cluster_next(&cluster->index) *
432 SWAPFILE_CLUSTER;
433 } else if (!cluster_is_null(&si->discard_cluster_head)) {
434 /*
435 * we don't have a free cluster but have some clusters being
436 * discarded; do the discard now and reclaim them
437 */
438 swap_do_scheduled_discard(si);
439 *scan_base = *offset = si->cluster_next;
440 goto new_cluster;
441 } else
442 return;
443 }
444
445 found_free = false;
446
447 /*
448 * Other CPUs can use our cluster if they can't find a free cluster,
449 * check if there is still a free entry in the cluster
450 */
451 tmp = cluster->next;
452 while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) *
453 SWAPFILE_CLUSTER) {
454 if (!si->swap_map[tmp]) {
455 found_free = true;
456 break;
457 }
458 tmp++;
459 }
460 if (!found_free) {
461 cluster_set_null(&cluster->index);
462 goto new_cluster;
463 }
464 cluster->next = tmp + 1;
465 *offset = tmp;
466 *scan_base = tmp;
467}
186 468
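scan_swap_map_try_ssd_cluster() gives each CPU a private cluster to allocate from: offsets are handed out sequentially, the cluster is re-scanned because other CPUs may fall back to taking entries from it, and an exhausted cluster is replaced from the free list (or by forcing the scheduled discards to complete). A userspace sketch of that per-CPU allocation step (cluster size, counts and the single-CPU setting are illustrative):

#include <stdio.h>

#define CLUSTER_SIZE	4	/* SWAPFILE_CLUSTER is 256 in the patch */
#define NR_CLUSTERS	3
#define NO_CLUSTER	-1

static unsigned char swap_map[CLUSTER_SIZE * NR_CLUSTERS];	/* 0 == free */
static int free_list[NR_CLUSTERS] = { 0, 1, 2 };
static int nr_free = NR_CLUSTERS;

struct percpu_cluster { int index; int next; };

static long alloc_offset(struct percpu_cluster *pc)
{
	for (;;) {
		if (pc->index == NO_CLUSTER) {
			if (!nr_free)
				return -1;		/* no cluster left */
			pc->index = free_list[--nr_free];
			pc->next = pc->index * CLUSTER_SIZE;
		}
		/* other CPUs may also allocate from our cluster, so re-scan it */
		while (pc->next < (pc->index + 1) * CLUSTER_SIZE) {
			if (!swap_map[pc->next]) {
				swap_map[pc->next] = 1;	/* claim the slot */
				return pc->next++;
			}
			pc->next++;
		}
		pc->index = NO_CLUSTER;			/* exhausted, try again */
	}
}

int main(void)
{
	struct percpu_cluster pc = { NO_CLUSTER, 0 };

	for (int i = 0; i < 6; i++)
		printf("offset %ld\n", alloc_offset(&pc));
	return 0;
}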
187static unsigned long scan_swap_map(struct swap_info_struct *si, 469static unsigned long scan_swap_map(struct swap_info_struct *si,
188 unsigned char usage) 470 unsigned char usage)
@@ -191,7 +473,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
191 unsigned long scan_base; 473 unsigned long scan_base;
192 unsigned long last_in_cluster = 0; 474 unsigned long last_in_cluster = 0;
193 int latency_ration = LATENCY_LIMIT; 475 int latency_ration = LATENCY_LIMIT;
194 int found_free_cluster = 0;
195 476
196 /* 477 /*
197 * We try to cluster swap pages by allocating them sequentially 478 * We try to cluster swap pages by allocating them sequentially
@@ -207,24 +488,18 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
207 si->flags += SWP_SCANNING; 488 si->flags += SWP_SCANNING;
208 scan_base = offset = si->cluster_next; 489 scan_base = offset = si->cluster_next;
209 490
491 /* SSD algorithm */
492 if (si->cluster_info) {
493 scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
494 goto checks;
495 }
496
210 if (unlikely(!si->cluster_nr--)) { 497 if (unlikely(!si->cluster_nr--)) {
211 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { 498 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
212 si->cluster_nr = SWAPFILE_CLUSTER - 1; 499 si->cluster_nr = SWAPFILE_CLUSTER - 1;
213 goto checks; 500 goto checks;
214 } 501 }
215 if (si->flags & SWP_PAGE_DISCARD) { 502
216 /*
217 * Start range check on racing allocations, in case
218 * they overlap the cluster we eventually decide on
219 * (we scan without swap_lock to allow preemption).
220 * It's hardly conceivable that cluster_nr could be
221 * wrapped during our scan, but don't depend on it.
222 */
223 if (si->lowest_alloc)
224 goto checks;
225 si->lowest_alloc = si->max;
226 si->highest_alloc = 0;
227 }
228 spin_unlock(&si->lock); 503 spin_unlock(&si->lock);
229 504
230 /* 505 /*
@@ -248,7 +523,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
248 offset -= SWAPFILE_CLUSTER - 1; 523 offset -= SWAPFILE_CLUSTER - 1;
249 si->cluster_next = offset; 524 si->cluster_next = offset;
250 si->cluster_nr = SWAPFILE_CLUSTER - 1; 525 si->cluster_nr = SWAPFILE_CLUSTER - 1;
251 found_free_cluster = 1;
252 goto checks; 526 goto checks;
253 } 527 }
254 if (unlikely(--latency_ration < 0)) { 528 if (unlikely(--latency_ration < 0)) {
@@ -269,7 +543,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
269 offset -= SWAPFILE_CLUSTER - 1; 543 offset -= SWAPFILE_CLUSTER - 1;
270 si->cluster_next = offset; 544 si->cluster_next = offset;
271 si->cluster_nr = SWAPFILE_CLUSTER - 1; 545 si->cluster_nr = SWAPFILE_CLUSTER - 1;
272 found_free_cluster = 1;
273 goto checks; 546 goto checks;
274 } 547 }
275 if (unlikely(--latency_ration < 0)) { 548 if (unlikely(--latency_ration < 0)) {
@@ -281,10 +554,13 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
281 offset = scan_base; 554 offset = scan_base;
282 spin_lock(&si->lock); 555 spin_lock(&si->lock);
283 si->cluster_nr = SWAPFILE_CLUSTER - 1; 556 si->cluster_nr = SWAPFILE_CLUSTER - 1;
284 si->lowest_alloc = 0;
285 } 557 }
286 558
287checks: 559checks:
560 if (si->cluster_info) {
561 while (scan_swap_map_ssd_cluster_conflict(si, offset))
562 scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
563 }
288 if (!(si->flags & SWP_WRITEOK)) 564 if (!(si->flags & SWP_WRITEOK))
289 goto no_page; 565 goto no_page;
290 if (!si->highest_bit) 566 if (!si->highest_bit)
@@ -317,62 +593,10 @@ checks:
317 si->highest_bit = 0; 593 si->highest_bit = 0;
318 } 594 }
319 si->swap_map[offset] = usage; 595 si->swap_map[offset] = usage;
596 inc_cluster_info_page(si, si->cluster_info, offset);
320 si->cluster_next = offset + 1; 597 si->cluster_next = offset + 1;
321 si->flags -= SWP_SCANNING; 598 si->flags -= SWP_SCANNING;
322 599
323 if (si->lowest_alloc) {
324 /*
325 * Only set when SWP_PAGE_DISCARD, and there's a scan
326 * for a free cluster in progress or just completed.
327 */
328 if (found_free_cluster) {
329 /*
330 * To optimize wear-levelling, discard the
331 * old data of the cluster, taking care not to
332 * discard any of its pages that have already
333 * been allocated by racing tasks (offset has
334 * already stepped over any at the beginning).
335 */
336 if (offset < si->highest_alloc &&
337 si->lowest_alloc <= last_in_cluster)
338 last_in_cluster = si->lowest_alloc - 1;
339 si->flags |= SWP_DISCARDING;
340 spin_unlock(&si->lock);
341
342 if (offset < last_in_cluster)
343 discard_swap_cluster(si, offset,
344 last_in_cluster - offset + 1);
345
346 spin_lock(&si->lock);
347 si->lowest_alloc = 0;
348 si->flags &= ~SWP_DISCARDING;
349
350 smp_mb(); /* wake_up_bit advises this */
351 wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
352
353 } else if (si->flags & SWP_DISCARDING) {
354 /*
355 * Delay using pages allocated by racing tasks
356 * until the whole discard has been issued. We
357 * could defer that delay until swap_writepage,
358 * but it's easier to keep this self-contained.
359 */
360 spin_unlock(&si->lock);
361 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
362 wait_for_discard, TASK_UNINTERRUPTIBLE);
363 spin_lock(&si->lock);
364 } else {
365 /*
366 * Note pages allocated by racing tasks while
367 * scan for a free cluster is in progress, so
368 * that its final discard can exclude them.
369 */
370 if (offset < si->lowest_alloc)
371 si->lowest_alloc = offset;
372 if (offset > si->highest_alloc)
373 si->highest_alloc = offset;
374 }
375 }
376 return offset; 600 return offset;
377 601
378scan: 602scan:
@@ -527,16 +751,16 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry)
527 return p; 751 return p;
528 752
529bad_free: 753bad_free:
530 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); 754 pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val);
531 goto out; 755 goto out;
532bad_offset: 756bad_offset:
533 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); 757 pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val);
534 goto out; 758 goto out;
535bad_device: 759bad_device:
536 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); 760 pr_err("swap_free: %s%08lx\n", Unused_file, entry.val);
537 goto out; 761 goto out;
538bad_nofile: 762bad_nofile:
539 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); 763 pr_err("swap_free: %s%08lx\n", Bad_file, entry.val);
540out: 764out:
541 return NULL; 765 return NULL;
542} 766}
@@ -600,6 +824,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
600 824
601 /* free if no reference */ 825 /* free if no reference */
602 if (!usage) { 826 if (!usage) {
827 dec_cluster_info_page(p, p->cluster_info, offset);
603 if (offset < p->lowest_bit) 828 if (offset < p->lowest_bit)
604 p->lowest_bit = offset; 829 p->lowest_bit = offset;
605 if (offset > p->highest_bit) 830 if (offset > p->highest_bit)
@@ -1107,7 +1332,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1107 else 1332 else
1108 continue; 1333 continue;
1109 } 1334 }
1110 count = si->swap_map[i]; 1335 count = ACCESS_ONCE(si->swap_map[i]);
1111 if (count && swap_count(count) != SWAP_MAP_BAD) 1336 if (count && swap_count(count) != SWAP_MAP_BAD)
1112 break; 1337 break;
1113 } 1338 }
@@ -1127,7 +1352,11 @@ int try_to_unuse(unsigned int type, bool frontswap,
1127{ 1352{
1128 struct swap_info_struct *si = swap_info[type]; 1353 struct swap_info_struct *si = swap_info[type];
1129 struct mm_struct *start_mm; 1354 struct mm_struct *start_mm;
1130 unsigned char *swap_map; 1355 volatile unsigned char *swap_map; /* swap_map is accessed without
1356 * locking. Mark it as volatile
1357 * to prevent the compiler doing
1358 * something odd.
1359 */
1131 unsigned char swcount; 1360 unsigned char swcount;
1132 struct page *page; 1361 struct page *page;
1133 swp_entry_t entry; 1362 swp_entry_t entry;
@@ -1178,7 +1407,15 @@ int try_to_unuse(unsigned int type, bool frontswap,
1178 * reused since sys_swapoff() already disabled 1407 * reused since sys_swapoff() already disabled
1179 * allocation from here, or alloc_page() failed. 1408 * allocation from here, or alloc_page() failed.
1180 */ 1409 */
1181 if (!*swap_map) 1410 swcount = *swap_map;
1411 /*
1412 * We don't hold lock here, so the swap entry could be
1413 * SWAP_MAP_BAD (when the cluster is being discarded).
1414 * Instead of failing out, we can just skip the swap
1415 * entry because swapoff will wait for the discard to
1416 * finish anyway.
1417 */
1418 if (!swcount || swcount == SWAP_MAP_BAD)
1182 continue; 1419 continue;
1183 retval = -ENOMEM; 1420 retval = -ENOMEM;
1184 break; 1421 break;
@@ -1524,7 +1761,8 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1524} 1761}
1525 1762
1526static void _enable_swap_info(struct swap_info_struct *p, int prio, 1763static void _enable_swap_info(struct swap_info_struct *p, int prio,
1527 unsigned char *swap_map) 1764 unsigned char *swap_map,
1765 struct swap_cluster_info *cluster_info)
1528{ 1766{
1529 int i, prev; 1767 int i, prev;
1530 1768
@@ -1533,6 +1771,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
1533 else 1771 else
1534 p->prio = --least_priority; 1772 p->prio = --least_priority;
1535 p->swap_map = swap_map; 1773 p->swap_map = swap_map;
1774 p->cluster_info = cluster_info;
1536 p->flags |= SWP_WRITEOK; 1775 p->flags |= SWP_WRITEOK;
1537 atomic_long_add(p->pages, &nr_swap_pages); 1776 atomic_long_add(p->pages, &nr_swap_pages);
1538 total_swap_pages += p->pages; 1777 total_swap_pages += p->pages;
@@ -1553,12 +1792,13 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
1553 1792
1554static void enable_swap_info(struct swap_info_struct *p, int prio, 1793static void enable_swap_info(struct swap_info_struct *p, int prio,
1555 unsigned char *swap_map, 1794 unsigned char *swap_map,
1795 struct swap_cluster_info *cluster_info,
1556 unsigned long *frontswap_map) 1796 unsigned long *frontswap_map)
1557{ 1797{
1558 frontswap_init(p->type, frontswap_map); 1798 frontswap_init(p->type, frontswap_map);
1559 spin_lock(&swap_lock); 1799 spin_lock(&swap_lock);
1560 spin_lock(&p->lock); 1800 spin_lock(&p->lock);
1561 _enable_swap_info(p, prio, swap_map); 1801 _enable_swap_info(p, prio, swap_map, cluster_info);
1562 spin_unlock(&p->lock); 1802 spin_unlock(&p->lock);
1563 spin_unlock(&swap_lock); 1803 spin_unlock(&swap_lock);
1564} 1804}
@@ -1567,7 +1807,7 @@ static void reinsert_swap_info(struct swap_info_struct *p)
1567{ 1807{
1568 spin_lock(&swap_lock); 1808 spin_lock(&swap_lock);
1569 spin_lock(&p->lock); 1809 spin_lock(&p->lock);
1570 _enable_swap_info(p, p->prio, p->swap_map); 1810 _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
1571 spin_unlock(&p->lock); 1811 spin_unlock(&p->lock);
1572 spin_unlock(&swap_lock); 1812 spin_unlock(&swap_lock);
1573} 1813}
@@ -1576,6 +1816,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1576{ 1816{
1577 struct swap_info_struct *p = NULL; 1817 struct swap_info_struct *p = NULL;
1578 unsigned char *swap_map; 1818 unsigned char *swap_map;
1819 struct swap_cluster_info *cluster_info;
1579 unsigned long *frontswap_map; 1820 unsigned long *frontswap_map;
1580 struct file *swap_file, *victim; 1821 struct file *swap_file, *victim;
1581 struct address_space *mapping; 1822 struct address_space *mapping;
@@ -1651,6 +1892,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1651 goto out_dput; 1892 goto out_dput;
1652 } 1893 }
1653 1894
1895 flush_work(&p->discard_work);
1896
1654 destroy_swap_extents(p); 1897 destroy_swap_extents(p);
1655 if (p->flags & SWP_CONTINUED) 1898 if (p->flags & SWP_CONTINUED)
1656 free_swap_count_continuations(p); 1899 free_swap_count_continuations(p);
@@ -1675,6 +1918,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1675 p->max = 0; 1918 p->max = 0;
1676 swap_map = p->swap_map; 1919 swap_map = p->swap_map;
1677 p->swap_map = NULL; 1920 p->swap_map = NULL;
1921 cluster_info = p->cluster_info;
1922 p->cluster_info = NULL;
1678 p->flags = 0; 1923 p->flags = 0;
1679 frontswap_map = frontswap_map_get(p); 1924 frontswap_map = frontswap_map_get(p);
1680 frontswap_map_set(p, NULL); 1925 frontswap_map_set(p, NULL);
@@ -1682,7 +1927,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1682 spin_unlock(&swap_lock); 1927 spin_unlock(&swap_lock);
1683 frontswap_invalidate_area(type); 1928 frontswap_invalidate_area(type);
1684 mutex_unlock(&swapon_mutex); 1929 mutex_unlock(&swapon_mutex);
1930 free_percpu(p->percpu_cluster);
1931 p->percpu_cluster = NULL;
1685 vfree(swap_map); 1932 vfree(swap_map);
1933 vfree(cluster_info);
1686 vfree(frontswap_map); 1934 vfree(frontswap_map);
1687 /* Destroy swap account information */ 1935 /* Destroy swap account information */
1688 swap_cgroup_swapoff(type); 1936 swap_cgroup_swapoff(type);
@@ -1926,9 +2174,10 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1926 int i; 2174 int i;
1927 unsigned long maxpages; 2175 unsigned long maxpages;
1928 unsigned long swapfilepages; 2176 unsigned long swapfilepages;
2177 unsigned long last_page;
1929 2178
1930 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { 2179 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
1931 printk(KERN_ERR "Unable to find swap-space signature\n"); 2180 pr_err("Unable to find swap-space signature\n");
1932 return 0; 2181 return 0;
1933 } 2182 }
1934 2183
@@ -1942,9 +2191,8 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1942 } 2191 }
1943 /* Check the swap header's sub-version */ 2192 /* Check the swap header's sub-version */
1944 if (swap_header->info.version != 1) { 2193 if (swap_header->info.version != 1) {
1945 printk(KERN_WARNING 2194 pr_warn("Unable to handle swap header version %d\n",
1946 "Unable to handle swap header version %d\n", 2195 swap_header->info.version);
1947 swap_header->info.version);
1948 return 0; 2196 return 0;
1949 } 2197 }
1950 2198
@@ -1968,8 +2216,14 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1968 */ 2216 */
1969 maxpages = swp_offset(pte_to_swp_entry( 2217 maxpages = swp_offset(pte_to_swp_entry(
1970 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; 2218 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
1971 if (maxpages > swap_header->info.last_page) { 2219 last_page = swap_header->info.last_page;
1972 maxpages = swap_header->info.last_page + 1; 2220 if (last_page > maxpages) {
2221 pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
2222 maxpages << (PAGE_SHIFT - 10),
2223 last_page << (PAGE_SHIFT - 10));
2224 }
2225 if (maxpages > last_page) {
2226 maxpages = last_page + 1;
1973 /* p->max is an unsigned int: don't overflow it */ 2227 /* p->max is an unsigned int: don't overflow it */
1974 if ((unsigned int)maxpages == 0) 2228 if ((unsigned int)maxpages == 0)
1975 maxpages = UINT_MAX; 2229 maxpages = UINT_MAX;
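read_swap_header() now warns when the header's last_page exceeds what swp_offset() can encode, printing both sizes in KiB via pages << (PAGE_SHIFT - 10). A quick illustration of that arithmetic (the 4 KiB page size and the page counts below are assumptions):

#include <stdio.h>

#define PAGE_SHIFT 12	/* 4 KiB pages, so "<< (PAGE_SHIFT - 10)" is pages-to-KiB */

int main(void)
{
	unsigned long maxpages  = 1UL << 20;		/* arch limit: 1M pages */
	unsigned long last_page = (1UL << 20) + 4096;	/* header claims more */

	if (last_page > maxpages)
		printf("Truncating oversized swap area, only using %luk out of %luk\n",
		       maxpages << (PAGE_SHIFT - 10),
		       last_page << (PAGE_SHIFT - 10));
	return 0;
}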
@@ -1980,8 +2234,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1980 return 0; 2234 return 0;
1981 swapfilepages = i_size_read(inode) >> PAGE_SHIFT; 2235 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
1982 if (swapfilepages && maxpages > swapfilepages) { 2236 if (swapfilepages && maxpages > swapfilepages) {
1983 printk(KERN_WARNING 2237 pr_warn("Swap area shorter than signature indicates\n");
1984 "Swap area shorter than signature indicates\n");
1985 return 0; 2238 return 0;
1986 } 2239 }
1987 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 2240 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
@@ -1995,15 +2248,23 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1995static int setup_swap_map_and_extents(struct swap_info_struct *p, 2248static int setup_swap_map_and_extents(struct swap_info_struct *p,
1996 union swap_header *swap_header, 2249 union swap_header *swap_header,
1997 unsigned char *swap_map, 2250 unsigned char *swap_map,
2251 struct swap_cluster_info *cluster_info,
1998 unsigned long maxpages, 2252 unsigned long maxpages,
1999 sector_t *span) 2253 sector_t *span)
2000{ 2254{
2001 int i; 2255 int i;
2002 unsigned int nr_good_pages; 2256 unsigned int nr_good_pages;
2003 int nr_extents; 2257 int nr_extents;
2258 unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
2259 unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER;
2004 2260
2005 nr_good_pages = maxpages - 1; /* omit header page */ 2261 nr_good_pages = maxpages - 1; /* omit header page */
2006 2262
2263 cluster_set_null(&p->free_cluster_head);
2264 cluster_set_null(&p->free_cluster_tail);
2265 cluster_set_null(&p->discard_cluster_head);
2266 cluster_set_null(&p->discard_cluster_tail);
2267
2007 for (i = 0; i < swap_header->info.nr_badpages; i++) { 2268 for (i = 0; i < swap_header->info.nr_badpages; i++) {
2008 unsigned int page_nr = swap_header->info.badpages[i]; 2269 unsigned int page_nr = swap_header->info.badpages[i];
2009 if (page_nr == 0 || page_nr > swap_header->info.last_page) 2270 if (page_nr == 0 || page_nr > swap_header->info.last_page)
@@ -2011,11 +2272,25 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
2011 if (page_nr < maxpages) { 2272 if (page_nr < maxpages) {
2012 swap_map[page_nr] = SWAP_MAP_BAD; 2273 swap_map[page_nr] = SWAP_MAP_BAD;
2013 nr_good_pages--; 2274 nr_good_pages--;
2275 /*
2276 * Haven't marked the cluster free yet, no list
2277 * operation involved
2278 */
2279 inc_cluster_info_page(p, cluster_info, page_nr);
2014 } 2280 }
2015 } 2281 }
2016 2282
2283 /* Haven't marked the cluster free yet, no list operation involved */
2284 for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
2285 inc_cluster_info_page(p, cluster_info, i);
2286
2017 if (nr_good_pages) { 2287 if (nr_good_pages) {
2018 swap_map[0] = SWAP_MAP_BAD; 2288 swap_map[0] = SWAP_MAP_BAD;
2289 /*
2290 * The cluster isn't marked free yet, so no list
2291 * operation is involved
2292 */
2293 inc_cluster_info_page(p, cluster_info, 0);
2019 p->max = maxpages; 2294 p->max = maxpages;
2020 p->pages = nr_good_pages; 2295 p->pages = nr_good_pages;
2021 nr_extents = setup_swap_extents(p, span); 2296 nr_extents = setup_swap_extents(p, span);
@@ -2024,10 +2299,34 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
2024 nr_good_pages = p->pages; 2299 nr_good_pages = p->pages;
2025 } 2300 }
2026 if (!nr_good_pages) { 2301 if (!nr_good_pages) {
2027 printk(KERN_WARNING "Empty swap-file\n"); 2302 pr_warn("Empty swap-file\n");
2028 return -EINVAL; 2303 return -EINVAL;
2029 } 2304 }
2030 2305
2306 if (!cluster_info)
2307 return nr_extents;
2308
2309 for (i = 0; i < nr_clusters; i++) {
2310 if (!cluster_count(&cluster_info[idx])) {
2311 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
2312 if (cluster_is_null(&p->free_cluster_head)) {
2313 cluster_set_next_flag(&p->free_cluster_head,
2314 idx, 0);
2315 cluster_set_next_flag(&p->free_cluster_tail,
2316 idx, 0);
2317 } else {
2318 unsigned int tail;
2319
2320 tail = cluster_next(&p->free_cluster_tail);
2321 cluster_set_next(&cluster_info[tail], idx);
2322 cluster_set_next_flag(&p->free_cluster_tail,
2323 idx, 0);
2324 }
2325 }
2326 idx++;
2327 if (idx == nr_clusters)
2328 idx = 0;
2329 }
2031 return nr_extents; 2330 return nr_extents;
2032} 2331}
2033 2332
@@ -2059,6 +2358,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2059 sector_t span; 2358 sector_t span;
2060 unsigned long maxpages; 2359 unsigned long maxpages;
2061 unsigned char *swap_map = NULL; 2360 unsigned char *swap_map = NULL;
2361 struct swap_cluster_info *cluster_info = NULL;
2062 unsigned long *frontswap_map = NULL; 2362 unsigned long *frontswap_map = NULL;
2063 struct page *page = NULL; 2363 struct page *page = NULL;
2064 struct inode *inode = NULL; 2364 struct inode *inode = NULL;
@@ -2073,6 +2373,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2073 if (IS_ERR(p)) 2373 if (IS_ERR(p))
2074 return PTR_ERR(p); 2374 return PTR_ERR(p);
2075 2375
2376 INIT_WORK(&p->discard_work, swap_discard_work);
2377
2076 name = getname(specialfile); 2378 name = getname(specialfile);
2077 if (IS_ERR(name)) { 2379 if (IS_ERR(name)) {
2078 error = PTR_ERR(name); 2380 error = PTR_ERR(name);
@@ -2132,13 +2434,38 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2132 error = -ENOMEM; 2434 error = -ENOMEM;
2133 goto bad_swap; 2435 goto bad_swap;
2134 } 2436 }
2437 if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
2438 p->flags |= SWP_SOLIDSTATE;
2439 /*
2440 * select a random position to start with, to help SSD wear
2441 * leveling
2442 */
2443 p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
2444
2445 cluster_info = vzalloc(DIV_ROUND_UP(maxpages,
2446 SWAPFILE_CLUSTER) * sizeof(*cluster_info));
2447 if (!cluster_info) {
2448 error = -ENOMEM;
2449 goto bad_swap;
2450 }
2451 p->percpu_cluster = alloc_percpu(struct percpu_cluster);
2452 if (!p->percpu_cluster) {
2453 error = -ENOMEM;
2454 goto bad_swap;
2455 }
2456 for_each_possible_cpu(i) {
2457 struct percpu_cluster *cluster;
2458 cluster = per_cpu_ptr(p->percpu_cluster, i);
2459 cluster_set_null(&cluster->index);
2460 }
2461 }
2135 2462
2136 error = swap_cgroup_swapon(p->type, maxpages); 2463 error = swap_cgroup_swapon(p->type, maxpages);
2137 if (error) 2464 if (error)
2138 goto bad_swap; 2465 goto bad_swap;
2139 2466
2140 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, 2467 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
2141 maxpages, &span); 2468 cluster_info, maxpages, &span);
2142 if (unlikely(nr_extents < 0)) { 2469 if (unlikely(nr_extents < 0)) {
2143 error = nr_extents; 2470 error = nr_extents;
2144 goto bad_swap; 2471 goto bad_swap;
@@ -2147,41 +2474,33 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2147 if (frontswap_enabled) 2474 if (frontswap_enabled)
2148 frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); 2475 frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
2149 2476
2150 if (p->bdev) { 2477 if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
2151 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { 2478 /*
2152 p->flags |= SWP_SOLIDSTATE; 2479 * When discard is enabled for swap with no particular
2153 p->cluster_next = 1 + (prandom_u32() % p->highest_bit); 2480 * policy flagged, we set all swap discard flags here in
2154 } 2481 * order to sustain backward compatibility with older
2155 2482 * swapon(8) releases.
2156 if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { 2483 */
2157 /* 2484 p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
2158 * When discard is enabled for swap with no particular 2485 SWP_PAGE_DISCARD);
2159 * policy flagged, we set all swap discard flags here in
2160 * order to sustain backward compatibility with older
2161 * swapon(8) releases.
2162 */
2163 p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
2164 SWP_PAGE_DISCARD);
2165 2486
2166 /* 2487 /*
2167 * By flagging sys_swapon, a sysadmin can tell us to 2488 * By flagging sys_swapon, a sysadmin can tell us to
2168 * either do single-time area discards only, or to just 2489 * either do single-time area discards only, or to just
2169 * perform discards for released swap page-clusters. 2490 * perform discards for released swap page-clusters.
2170 * Now it's time to adjust the p->flags accordingly. 2491 * Now it's time to adjust the p->flags accordingly.
2171 */ 2492 */
2172 if (swap_flags & SWAP_FLAG_DISCARD_ONCE) 2493 if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
2173 p->flags &= ~SWP_PAGE_DISCARD; 2494 p->flags &= ~SWP_PAGE_DISCARD;
2174 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) 2495 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
2175 p->flags &= ~SWP_AREA_DISCARD; 2496 p->flags &= ~SWP_AREA_DISCARD;
2176 2497
2177 /* issue a swapon-time discard if it's still required */ 2498 /* issue a swapon-time discard if it's still required */
2178 if (p->flags & SWP_AREA_DISCARD) { 2499 if (p->flags & SWP_AREA_DISCARD) {
2179 int err = discard_swap(p); 2500 int err = discard_swap(p);
2180 if (unlikely(err)) 2501 if (unlikely(err))
2181 printk(KERN_ERR 2502 pr_err("swapon: discard_swap(%p): %d\n",
2182 "swapon: discard_swap(%p): %d\n", 2503 p, err);
2183 p, err);
2184 }
2185 } 2504 }
2186 } 2505 }
2187 2506
@@ -2190,9 +2509,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2190 if (swap_flags & SWAP_FLAG_PREFER) 2509 if (swap_flags & SWAP_FLAG_PREFER)
2191 prio = 2510 prio =
2192 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 2511 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2193 enable_swap_info(p, prio, swap_map, frontswap_map); 2512 enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
2194 2513
2195 printk(KERN_INFO "Adding %uk swap on %s. " 2514 pr_info("Adding %uk swap on %s. "
2196 "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", 2515 "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
2197 p->pages<<(PAGE_SHIFT-10), name->name, p->prio, 2516 p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
2198 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2517 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
@@ -2211,6 +2530,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2211 error = 0; 2530 error = 0;
2212 goto out; 2531 goto out;
2213bad_swap: 2532bad_swap:
2533 free_percpu(p->percpu_cluster);
2534 p->percpu_cluster = NULL;
2214 if (inode && S_ISBLK(inode->i_mode) && p->bdev) { 2535 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
2215 set_blocksize(p->bdev, p->old_block_size); 2536 set_blocksize(p->bdev, p->old_block_size);
2216 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); 2537 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
@@ -2222,6 +2543,7 @@ bad_swap:
2222 p->flags = 0; 2543 p->flags = 0;
2223 spin_unlock(&swap_lock); 2544 spin_unlock(&swap_lock);
2224 vfree(swap_map); 2545 vfree(swap_map);
2546 vfree(cluster_info);
2225 if (swap_file) { 2547 if (swap_file) {
2226 if (inode && S_ISREG(inode->i_mode)) { 2548 if (inode && S_ISREG(inode->i_mode)) {
2227 mutex_unlock(&inode->i_mutex); 2549 mutex_unlock(&inode->i_mutex);
@@ -2291,6 +2613,16 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2291 goto unlock_out; 2613 goto unlock_out;
2292 2614
2293 count = p->swap_map[offset]; 2615 count = p->swap_map[offset];
2616
2617 /*
2618 * swapin_readahead() doesn't check if a swap entry is valid, so the
2619 * swap entry could be SWAP_MAP_BAD. Check here with lock held.
2620 */
2621 if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
2622 err = -ENOENT;
2623 goto unlock_out;
2624 }
2625
2294 has_cache = count & SWAP_HAS_CACHE; 2626 has_cache = count & SWAP_HAS_CACHE;
2295 count &= ~SWAP_HAS_CACHE; 2627 count &= ~SWAP_HAS_CACHE;
2296 err = 0; 2628 err = 0;
@@ -2326,7 +2658,7 @@ out:
2326 return err; 2658 return err;
2327 2659
2328bad_file: 2660bad_file:
2329 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); 2661 pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
2330 goto out; 2662 goto out;
2331} 2663}
2332 2664
diff --git a/mm/util.c b/mm/util.c
index 7441c41d00f6..eaf63fc2c92f 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -388,15 +388,12 @@ struct address_space *page_mapping(struct page *page)
388 struct address_space *mapping = page->mapping; 388 struct address_space *mapping = page->mapping;
389 389
390 VM_BUG_ON(PageSlab(page)); 390 VM_BUG_ON(PageSlab(page));
391#ifdef CONFIG_SWAP
392 if (unlikely(PageSwapCache(page))) { 391 if (unlikely(PageSwapCache(page))) {
393 swp_entry_t entry; 392 swp_entry_t entry;
394 393
395 entry.val = page_private(page); 394 entry.val = page_private(page);
396 mapping = swap_address_space(entry); 395 mapping = swap_address_space(entry);
397 } else 396 } else if ((unsigned long)mapping & PAGE_MAPPING_ANON)
398#endif
399 if ((unsigned long)mapping & PAGE_MAPPING_ANON)
400 mapping = NULL; 397 mapping = NULL;
401 return mapping; 398 return mapping;
402} 399}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 13a54953a273..107454312d5e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -752,7 +752,6 @@ struct vmap_block_queue {
752struct vmap_block { 752struct vmap_block {
753 spinlock_t lock; 753 spinlock_t lock;
754 struct vmap_area *va; 754 struct vmap_area *va;
755 struct vmap_block_queue *vbq;
756 unsigned long free, dirty; 755 unsigned long free, dirty;
757 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); 756 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
758 struct list_head free_list; 757 struct list_head free_list;
@@ -830,7 +829,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
830 radix_tree_preload_end(); 829 radix_tree_preload_end();
831 830
832 vbq = &get_cpu_var(vmap_block_queue); 831 vbq = &get_cpu_var(vmap_block_queue);
833 vb->vbq = vbq;
834 spin_lock(&vbq->lock); 832 spin_lock(&vbq->lock);
835 list_add_rcu(&vb->free_list, &vbq->free); 833 list_add_rcu(&vb->free_list, &vbq->free);
836 spin_unlock(&vbq->lock); 834 spin_unlock(&vbq->lock);
@@ -1018,15 +1016,16 @@ void vm_unmap_aliases(void)
1018 1016
1019 rcu_read_lock(); 1017 rcu_read_lock();
1020 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 1018 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
1021 int i; 1019 int i, j;
1022 1020
1023 spin_lock(&vb->lock); 1021 spin_lock(&vb->lock);
1024 i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); 1022 i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
1025 while (i < VMAP_BBMAP_BITS) { 1023 if (i < VMAP_BBMAP_BITS) {
1026 unsigned long s, e; 1024 unsigned long s, e;
1027 int j; 1025
1028 j = find_next_zero_bit(vb->dirty_map, 1026 j = find_last_bit(vb->dirty_map,
1029 VMAP_BBMAP_BITS, i); 1027 VMAP_BBMAP_BITS);
1028 j = j + 1; /* need exclusive index */
1030 1029
1031 s = vb->va->va_start + (i << PAGE_SHIFT); 1030 s = vb->va->va_start + (i << PAGE_SHIFT);
1032 e = vb->va->va_start + (j << PAGE_SHIFT); 1031 e = vb->va->va_start + (j << PAGE_SHIFT);
@@ -1036,10 +1035,6 @@ void vm_unmap_aliases(void)
1036 start = s; 1035 start = s;
1037 if (e > end) 1036 if (e > end)
1038 end = e; 1037 end = e;
1039
1040 i = j;
1041 i = find_next_bit(vb->dirty_map,
1042 VMAP_BBMAP_BITS, i);
1043 } 1038 }
1044 spin_unlock(&vb->lock); 1039 spin_unlock(&vb->lock);
1045 } 1040 }
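vm_unmap_aliases() no longer walks each run of dirty bits in a vmap block; it takes the single span from the first to the last dirty bit and flushes that once, trading a little over-flushing for a simpler loop. A standalone sketch of the new span computation (plain arrays stand in for the kernel bitmap helpers):

#include <stdio.h>

#define NBITS 16

static int dirty[NBITS] = { 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 };

int main(void)
{
	int first = -1, last = -1;

	for (int i = 0; i < NBITS; i++)		/* find_first_bit / find_last_bit */
		if (dirty[i]) {
			if (first < 0)
				first = i;
			last = i;
		}
	if (first >= 0)
		printf("flush pages [%d, %d)\n", first, last + 1);	/* exclusive end */
	return 0;
}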
@@ -1263,7 +1258,7 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
1263int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) 1258int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
1264{ 1259{
1265 unsigned long addr = (unsigned long)area->addr; 1260 unsigned long addr = (unsigned long)area->addr;
1266 unsigned long end = addr + area->size - PAGE_SIZE; 1261 unsigned long end = addr + get_vm_area_size(area);
1267 int err; 1262 int err;
1268 1263
1269 err = vmap_page_range(addr, end, prot, *pages); 1264 err = vmap_page_range(addr, end, prot, *pages);
@@ -1558,7 +1553,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1558 unsigned int nr_pages, array_size, i; 1553 unsigned int nr_pages, array_size, i;
1559 gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; 1554 gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
1560 1555
1561 nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; 1556 nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
1562 array_size = (nr_pages * sizeof(struct page *)); 1557 array_size = (nr_pages * sizeof(struct page *));
1563 1558
1564 area->nr_pages = nr_pages; 1559 area->nr_pages = nr_pages;
@@ -1990,7 +1985,7 @@ long vread(char *buf, char *addr, unsigned long count)
1990 1985
1991 vm = va->vm; 1986 vm = va->vm;
1992 vaddr = (char *) vm->addr; 1987 vaddr = (char *) vm->addr;
1993 if (addr >= vaddr + vm->size - PAGE_SIZE) 1988 if (addr >= vaddr + get_vm_area_size(vm))
1994 continue; 1989 continue;
1995 while (addr < vaddr) { 1990 while (addr < vaddr) {
1996 if (count == 0) 1991 if (count == 0)
@@ -2000,7 +1995,7 @@ long vread(char *buf, char *addr, unsigned long count)
2000 addr++; 1995 addr++;
2001 count--; 1996 count--;
2002 } 1997 }
2003 n = vaddr + vm->size - PAGE_SIZE - addr; 1998 n = vaddr + get_vm_area_size(vm) - addr;
2004 if (n > count) 1999 if (n > count)
2005 n = count; 2000 n = count;
2006 if (!(vm->flags & VM_IOREMAP)) 2001 if (!(vm->flags & VM_IOREMAP))
@@ -2072,7 +2067,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
2072 2067
2073 vm = va->vm; 2068 vm = va->vm;
2074 vaddr = (char *) vm->addr; 2069 vaddr = (char *) vm->addr;
2075 if (addr >= vaddr + vm->size - PAGE_SIZE) 2070 if (addr >= vaddr + get_vm_area_size(vm))
2076 continue; 2071 continue;
2077 while (addr < vaddr) { 2072 while (addr < vaddr) {
2078 if (count == 0) 2073 if (count == 0)
@@ -2081,7 +2076,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
2081 addr++; 2076 addr++;
2082 count--; 2077 count--;
2083 } 2078 }
2084 n = vaddr + vm->size - PAGE_SIZE - addr; 2079 n = vaddr + get_vm_area_size(vm) - addr;
2085 if (n > count) 2080 if (n > count)
2086 n = count; 2081 n = count;
2087 if (!(vm->flags & VM_IOREMAP)) { 2082 if (!(vm->flags & VM_IOREMAP)) {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2cff0d491c6d..fe715daeb8bc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -146,6 +146,25 @@ static bool global_reclaim(struct scan_control *sc)
146} 146}
147#endif 147#endif
148 148
149unsigned long zone_reclaimable_pages(struct zone *zone)
150{
151 int nr;
152
153 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
154 zone_page_state(zone, NR_INACTIVE_FILE);
155
156 if (get_nr_swap_pages() > 0)
157 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
158 zone_page_state(zone, NR_INACTIVE_ANON);
159
160 return nr;
161}
162
163bool zone_reclaimable(struct zone *zone)
164{
165 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
166}
167
149static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) 168static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
150{ 169{
151 if (!mem_cgroup_disabled()) 170 if (!mem_cgroup_disabled())
@@ -545,7 +564,7 @@ int remove_mapping(struct address_space *mapping, struct page *page)
545 */ 564 */
546void putback_lru_page(struct page *page) 565void putback_lru_page(struct page *page)
547{ 566{
548 int lru; 567 bool is_unevictable;
549 int was_unevictable = PageUnevictable(page); 568 int was_unevictable = PageUnevictable(page);
550 569
551 VM_BUG_ON(PageLRU(page)); 570 VM_BUG_ON(PageLRU(page));
@@ -560,14 +579,14 @@ redo:
560 * unevictable page on [in]active list. 579 * unevictable page on [in]active list.
561 * We know how to handle that. 580 * We know how to handle that.
562 */ 581 */
563 lru = page_lru_base_type(page); 582 is_unevictable = false;
564 lru_cache_add(page); 583 lru_cache_add(page);
565 } else { 584 } else {
566 /* 585 /*
567 * Put unevictable pages directly on zone's unevictable 586 * Put unevictable pages directly on zone's unevictable
568 * list. 587 * list.
569 */ 588 */
570 lru = LRU_UNEVICTABLE; 589 is_unevictable = true;
571 add_page_to_unevictable_list(page); 590 add_page_to_unevictable_list(page);
572 /* 591 /*
573 * When racing with an mlock or AS_UNEVICTABLE clearing 592 * When racing with an mlock or AS_UNEVICTABLE clearing
@@ -587,7 +606,7 @@ redo:
587 * page is on unevictable list, it never be freed. To avoid that, 606 * page is on unevictable list, it never be freed. To avoid that,
588 * check after we added it to the list, again. 607 * check after we added it to the list, again.
589 */ 608 */
590 if (lru == LRU_UNEVICTABLE && page_evictable(page)) { 609 if (is_unevictable && page_evictable(page)) {
591 if (!isolate_lru_page(page)) { 610 if (!isolate_lru_page(page)) {
592 put_page(page); 611 put_page(page);
593 goto redo; 612 goto redo;
@@ -598,9 +617,9 @@ redo:
598 */ 617 */
599 } 618 }
600 619
601 if (was_unevictable && lru != LRU_UNEVICTABLE) 620 if (was_unevictable && !is_unevictable)
602 count_vm_event(UNEVICTABLE_PGRESCUED); 621 count_vm_event(UNEVICTABLE_PGRESCUED);
603 else if (!was_unevictable && lru == LRU_UNEVICTABLE) 622 else if (!was_unevictable && is_unevictable)
604 count_vm_event(UNEVICTABLE_PGCULLED); 623 count_vm_event(UNEVICTABLE_PGCULLED);
605 624
606 put_page(page); /* drop ref from isolate */ 625 put_page(page); /* drop ref from isolate */
@@ -1789,7 +1808,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1789 * latencies, so it's better to scan a minimum amount there as 1808 * latencies, so it's better to scan a minimum amount there as
1790 * well. 1809 * well.
1791 */ 1810 */
1792 if (current_is_kswapd() && zone->all_unreclaimable) 1811 if (current_is_kswapd() && !zone_reclaimable(zone))
1793 force_scan = true; 1812 force_scan = true;
1794 if (!global_reclaim(sc)) 1813 if (!global_reclaim(sc))
1795 force_scan = true; 1814 force_scan = true;
@@ -2244,8 +2263,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2244 if (global_reclaim(sc)) { 2263 if (global_reclaim(sc)) {
2245 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2264 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2246 continue; 2265 continue;
2247 if (zone->all_unreclaimable && 2266 if (sc->priority != DEF_PRIORITY &&
2248 sc->priority != DEF_PRIORITY) 2267 !zone_reclaimable(zone))
2249 continue; /* Let kswapd poll it */ 2268 continue; /* Let kswapd poll it */
2250 if (IS_ENABLED(CONFIG_COMPACTION)) { 2269 if (IS_ENABLED(CONFIG_COMPACTION)) {
2251 /* 2270 /*
@@ -2283,11 +2302,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2283 return aborted_reclaim; 2302 return aborted_reclaim;
2284} 2303}
2285 2304
2286static bool zone_reclaimable(struct zone *zone)
2287{
2288 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
2289}
2290
2291/* All zones in zonelist are unreclaimable? */ 2305/* All zones in zonelist are unreclaimable? */
2292static bool all_unreclaimable(struct zonelist *zonelist, 2306static bool all_unreclaimable(struct zonelist *zonelist,
2293 struct scan_control *sc) 2307 struct scan_control *sc)
@@ -2301,7 +2315,7 @@ static bool all_unreclaimable(struct zonelist *zonelist,
2301 continue; 2315 continue;
2302 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2316 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2303 continue; 2317 continue;
2304 if (!zone->all_unreclaimable) 2318 if (zone_reclaimable(zone))
2305 return false; 2319 return false;
2306 } 2320 }
2307 2321
@@ -2712,7 +2726,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
2712 * DEF_PRIORITY. Effectively, it considers them balanced so 2726 * DEF_PRIORITY. Effectively, it considers them balanced so
2713 * they must be considered balanced here as well! 2727 * they must be considered balanced here as well!
2714 */ 2728 */
2715 if (zone->all_unreclaimable) { 2729 if (!zone_reclaimable(zone)) {
2716 balanced_pages += zone->managed_pages; 2730 balanced_pages += zone->managed_pages;
2717 continue; 2731 continue;
2718 } 2732 }
@@ -2773,7 +2787,6 @@ static bool kswapd_shrink_zone(struct zone *zone,
2773 unsigned long lru_pages, 2787 unsigned long lru_pages,
2774 unsigned long *nr_attempted) 2788 unsigned long *nr_attempted)
2775{ 2789{
2776 unsigned long nr_slab;
2777 int testorder = sc->order; 2790 int testorder = sc->order;
2778 unsigned long balance_gap; 2791 unsigned long balance_gap;
2779 struct reclaim_state *reclaim_state = current->reclaim_state; 2792 struct reclaim_state *reclaim_state = current->reclaim_state;
@@ -2818,15 +2831,12 @@ static bool kswapd_shrink_zone(struct zone *zone,
2818 shrink_zone(zone, sc); 2831 shrink_zone(zone, sc);
2819 2832
2820 reclaim_state->reclaimed_slab = 0; 2833 reclaim_state->reclaimed_slab = 0;
2821 nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages); 2834 shrink_slab(&shrink, sc->nr_scanned, lru_pages);
2822 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 2835 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2823 2836
2824 /* Account for the number of pages attempted to reclaim */ 2837 /* Account for the number of pages attempted to reclaim */
2825 *nr_attempted += sc->nr_to_reclaim; 2838 *nr_attempted += sc->nr_to_reclaim;
2826 2839
2827 if (nr_slab == 0 && !zone_reclaimable(zone))
2828 zone->all_unreclaimable = 1;
2829
2830 zone_clear_flag(zone, ZONE_WRITEBACK); 2840 zone_clear_flag(zone, ZONE_WRITEBACK);
2831 2841
2832 /* 2842 /*
@@ -2835,7 +2845,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
2835 * BDIs but as pressure is relieved, speculatively avoid congestion 2845 * BDIs but as pressure is relieved, speculatively avoid congestion
2836 * waits. 2846 * waits.
2837 */ 2847 */
2838 if (!zone->all_unreclaimable && 2848 if (zone_reclaimable(zone) &&
2839 zone_balanced(zone, testorder, 0, classzone_idx)) { 2849 zone_balanced(zone, testorder, 0, classzone_idx)) {
2840 zone_clear_flag(zone, ZONE_CONGESTED); 2850 zone_clear_flag(zone, ZONE_CONGESTED);
2841 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); 2851 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
@@ -2901,8 +2911,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2901 if (!populated_zone(zone)) 2911 if (!populated_zone(zone))
2902 continue; 2912 continue;
2903 2913
2904 if (zone->all_unreclaimable && 2914 if (sc.priority != DEF_PRIORITY &&
2905 sc.priority != DEF_PRIORITY) 2915 !zone_reclaimable(zone))
2906 continue; 2916 continue;
2907 2917
2908 /* 2918 /*
@@ -2980,8 +2990,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2980 if (!populated_zone(zone)) 2990 if (!populated_zone(zone))
2981 continue; 2991 continue;
2982 2992
2983 if (zone->all_unreclaimable && 2993 if (sc.priority != DEF_PRIORITY &&
2984 sc.priority != DEF_PRIORITY) 2994 !zone_reclaimable(zone))
2985 continue; 2995 continue;
2986 2996
2987 sc.nr_scanned = 0; 2997 sc.nr_scanned = 0;
@@ -3237,7 +3247,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
3237 } 3247 }
3238 if (!waitqueue_active(&pgdat->kswapd_wait)) 3248 if (!waitqueue_active(&pgdat->kswapd_wait))
3239 return; 3249 return;
3240 if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) 3250 if (zone_balanced(zone, order, 0, 0))
3241 return; 3251 return;
3242 3252
3243 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); 3253 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
@@ -3265,20 +3275,6 @@ unsigned long global_reclaimable_pages(void)
3265 return nr; 3275 return nr;
3266} 3276}
3267 3277
3268unsigned long zone_reclaimable_pages(struct zone *zone)
3269{
3270 int nr;
3271
3272 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
3273 zone_page_state(zone, NR_INACTIVE_FILE);
3274
3275 if (get_nr_swap_pages() > 0)
3276 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
3277 zone_page_state(zone, NR_INACTIVE_ANON);
3278
3279 return nr;
3280}
3281
3282#ifdef CONFIG_HIBERNATION 3278#ifdef CONFIG_HIBERNATION
3283/* 3279/*
3284 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of 3280 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
@@ -3576,7 +3572,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3576 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) 3572 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
3577 return ZONE_RECLAIM_FULL; 3573 return ZONE_RECLAIM_FULL;
3578 3574
3579 if (zone->all_unreclaimable) 3575 if (!zone_reclaimable(zone))
3580 return ZONE_RECLAIM_FULL; 3576 return ZONE_RECLAIM_FULL;
3581 3577
3582 /* 3578 /*
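The mm/vmscan.c hunks above retire the sticky zone->all_unreclaimable flag in favour of a zone_reclaimable() predicate recomputed from scan counters (the zone_reclaimable_pages() helper is dropped from this file as part of the same rework). For illustration, a minimal free-standing sketch of that heuristic, using simplified stand-in names (zone_stats, pages_scanned, reclaimable_pages) rather than the kernel's own types:

#include <stdbool.h>

struct zone_stats {
	unsigned long pages_scanned;      /* scanned since the last successful reclaim */
	unsigned long reclaimable_pages;  /* file LRU pages, plus anon when swap is available */
};

/* A zone keeps counting as reclaimable until it has been scanned six
 * times over without progress; there is no sticky flag to clear later. */
static bool zone_reclaimable(const struct zone_stats *z)
{
	return z->pages_scanned < z->reclaimable_pages * 6;
}

int main(void)
{
	struct zone_stats z = { .pages_scanned = 100, .reclaimable_pages = 50 };
	return zone_reclaimable(&z) ? 0 : 1;
}

The factor of six mirrors the threshold this series relies on; the point is that the decision is recomputed from live counters instead of being latched once and never reset.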
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 20c2ef4458fa..9bb314577911 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -19,6 +19,9 @@
19#include <linux/math64.h> 19#include <linux/math64.h>
20#include <linux/writeback.h> 20#include <linux/writeback.h>
21#include <linux/compaction.h> 21#include <linux/compaction.h>
22#include <linux/mm_inline.h>
23
24#include "internal.h"
22 25
23#ifdef CONFIG_VM_EVENT_COUNTERS 26#ifdef CONFIG_VM_EVENT_COUNTERS
24DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; 27DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
@@ -414,12 +417,17 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
414EXPORT_SYMBOL(dec_zone_page_state); 417EXPORT_SYMBOL(dec_zone_page_state);
415#endif 418#endif
416 419
420static inline void fold_diff(int *diff)
421{
422 int i;
423
424 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
425 if (diff[i])
426 atomic_long_add(diff[i], &vm_stat[i]);
427}
428
417/* 429/*
418 * Update the zone counters for one cpu. 430 * Update the zone counters for the current cpu.
419 *
420 * The cpu specified must be either the current cpu or a processor that
421 * is not online. If it is the current cpu then the execution thread must
422 * be pinned to the current cpu.
423 * 431 *
424 * Note that refresh_cpu_vm_stats strives to only access 432 * Note that refresh_cpu_vm_stats strives to only access
425 * node local memory. The per cpu pagesets on remote zones are placed 433 * node local memory. The per cpu pagesets on remote zones are placed
@@ -432,33 +440,29 @@ EXPORT_SYMBOL(dec_zone_page_state);
432 * with the global counters. These could cause remote node cache line 440 * with the global counters. These could cause remote node cache line
433 * bouncing and will have to be only done when necessary. 441 * bouncing and will have to be only done when necessary.
434 */ 442 */
435void refresh_cpu_vm_stats(int cpu) 443static void refresh_cpu_vm_stats(void)
436{ 444{
437 struct zone *zone; 445 struct zone *zone;
438 int i; 446 int i;
439 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; 447 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
440 448
441 for_each_populated_zone(zone) { 449 for_each_populated_zone(zone) {
442 struct per_cpu_pageset *p; 450 struct per_cpu_pageset __percpu *p = zone->pageset;
443 451
444 p = per_cpu_ptr(zone->pageset, cpu); 452 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
453 int v;
445 454
446 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 455 v = this_cpu_xchg(p->vm_stat_diff[i], 0);
447 if (p->vm_stat_diff[i]) { 456 if (v) {
448 unsigned long flags;
449 int v;
450 457
451 local_irq_save(flags);
452 v = p->vm_stat_diff[i];
453 p->vm_stat_diff[i] = 0;
454 local_irq_restore(flags);
455 atomic_long_add(v, &zone->vm_stat[i]); 458 atomic_long_add(v, &zone->vm_stat[i]);
456 global_diff[i] += v; 459 global_diff[i] += v;
457#ifdef CONFIG_NUMA 460#ifdef CONFIG_NUMA
458 /* 3 seconds idle till flush */ 461 /* 3 seconds idle till flush */
459 p->expire = 3; 462 __this_cpu_write(p->expire, 3);
460#endif 463#endif
461 } 464 }
465 }
462 cond_resched(); 466 cond_resched();
463#ifdef CONFIG_NUMA 467#ifdef CONFIG_NUMA
464 /* 468 /*
@@ -468,29 +472,57 @@ void refresh_cpu_vm_stats(int cpu)
468 * Check if there are pages remaining in this pageset 472 * Check if there are pages remaining in this pageset
469 * if not then there is nothing to expire. 473 * if not then there is nothing to expire.
470 */ 474 */
471 if (!p->expire || !p->pcp.count) 475 if (!__this_cpu_read(p->expire) ||
476 !__this_cpu_read(p->pcp.count))
472 continue; 477 continue;
473 478
474 /* 479 /*
475 * We never drain zones local to this processor. 480 * We never drain zones local to this processor.
476 */ 481 */
477 if (zone_to_nid(zone) == numa_node_id()) { 482 if (zone_to_nid(zone) == numa_node_id()) {
478 p->expire = 0; 483 __this_cpu_write(p->expire, 0);
479 continue; 484 continue;
480 } 485 }
481 486
482 p->expire--; 487
483 if (p->expire) 488 if (__this_cpu_dec_return(p->expire))
484 continue; 489 continue;
485 490
486 if (p->pcp.count) 491 if (__this_cpu_read(p->pcp.count))
487 drain_zone_pages(zone, &p->pcp); 492 drain_zone_pages(zone, __this_cpu_ptr(&p->pcp));
488#endif 493#endif
489 } 494 }
495 fold_diff(global_diff);
496}
490 497
491 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 498/*
492 if (global_diff[i]) 499 * Fold the data for an offline cpu into the global array.
493 atomic_long_add(global_diff[i], &vm_stat[i]); 500 * There cannot be any access by the offline cpu and therefore
501 * synchronization is simplified.
502 */
503void cpu_vm_stats_fold(int cpu)
504{
505 struct zone *zone;
506 int i;
507 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
508
509 for_each_populated_zone(zone) {
510 struct per_cpu_pageset *p;
511
512 p = per_cpu_ptr(zone->pageset, cpu);
513
514 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
515 if (p->vm_stat_diff[i]) {
516 int v;
517
518 v = p->vm_stat_diff[i];
519 p->vm_stat_diff[i] = 0;
520 atomic_long_add(v, &zone->vm_stat[i]);
521 global_diff[i] += v;
522 }
523 }
524
525 fold_diff(global_diff);
494} 526}
495 527
496/* 528/*
@@ -703,6 +735,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
703const char * const vmstat_text[] = { 735const char * const vmstat_text[] = {
704 /* Zoned VM counters */ 736 /* Zoned VM counters */
705 "nr_free_pages", 737 "nr_free_pages",
738 "nr_alloc_batch",
706 "nr_inactive_anon", 739 "nr_inactive_anon",
707 "nr_active_anon", 740 "nr_active_anon",
708 "nr_inactive_file", 741 "nr_inactive_file",
@@ -817,6 +850,12 @@ const char * const vmstat_text[] = {
817 "thp_zero_page_alloc", 850 "thp_zero_page_alloc",
818 "thp_zero_page_alloc_failed", 851 "thp_zero_page_alloc_failed",
819#endif 852#endif
853#ifdef CONFIG_SMP
854 "nr_tlb_remote_flush",
855 "nr_tlb_remote_flush_received",
856#endif
857 "nr_tlb_local_flush_all",
858 "nr_tlb_local_flush_one",
820 859
821#endif /* CONFIG_VM_EVENTS_COUNTERS */ 860#endif /* CONFIG_VM_EVENTS_COUNTERS */
822}; 861};
@@ -1052,7 +1091,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1052 "\n all_unreclaimable: %u" 1091 "\n all_unreclaimable: %u"
1053 "\n start_pfn: %lu" 1092 "\n start_pfn: %lu"
1054 "\n inactive_ratio: %u", 1093 "\n inactive_ratio: %u",
1055 zone->all_unreclaimable, 1094 !zone_reclaimable(zone),
1056 zone->zone_start_pfn, 1095 zone->zone_start_pfn,
1057 zone->inactive_ratio); 1096 zone->inactive_ratio);
1058 seq_putc(m, '\n'); 1097 seq_putc(m, '\n');
@@ -1177,7 +1216,7 @@ int sysctl_stat_interval __read_mostly = HZ;
1177 1216
1178static void vmstat_update(struct work_struct *w) 1217static void vmstat_update(struct work_struct *w)
1179{ 1218{
1180 refresh_cpu_vm_stats(smp_processor_id()); 1219 refresh_cpu_vm_stats();
1181 schedule_delayed_work(&__get_cpu_var(vmstat_work), 1220 schedule_delayed_work(&__get_cpu_var(vmstat_work),
1182 round_jiffies_relative(sysctl_stat_interval)); 1221 round_jiffies_relative(sysctl_stat_interval));
1183} 1222}
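The mm/vmstat.c hunks above rework refresh_cpu_vm_stats() to use this_cpu_xchg() instead of per_cpu_ptr() plus local_irq_save(), factor the final accumulation into fold_diff(), and add cpu_vm_stats_fold() for folding a dead CPU's counters. A small userspace analogue of the grab-and-zero folding pattern, using C11 atomics in place of the kernel's per-cpu operations and made-up names (NR_ITEMS, percpu_diff, global_stat):

#include <stdatomic.h>
#include <stdio.h>

#define NR_ITEMS 4

static atomic_long global_stat[NR_ITEMS];   /* the "vm_stat" analogue     */
static atomic_long percpu_diff[NR_ITEMS];   /* one CPU's pending deltas   */

/* fold a captured array of deltas into the global counters */
static void fold_diff(long *diff)
{
	for (int i = 0; i < NR_ITEMS; i++)
		if (diff[i])
			atomic_fetch_add(&global_stat[i], diff[i]);
}

/* like the reworked refresh_cpu_vm_stats(): atomically grab-and-zero
 * each per-CPU delta, then fold the captured values in one pass */
static void refresh_stats(void)
{
	long global_diff[NR_ITEMS] = { 0 };

	for (int i = 0; i < NR_ITEMS; i++) {
		long v = atomic_exchange(&percpu_diff[i], 0);  /* this_cpu_xchg() analogue */
		if (v)
			global_diff[i] += v;
	}
	fold_diff(global_diff);
}

int main(void)
{
	atomic_fetch_add(&percpu_diff[1], 7);
	refresh_stats();
	printf("item 1 = %ld\n", (long)atomic_load(&global_stat[1]));
	return 0;
}

This only models a single CPU's delta array; the kernel version additionally credits each delta to the owning zone before accumulating the global difference.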
diff --git a/mm/zbud.c b/mm/zbud.c
index ad1e781284fd..9451361e6aa7 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -16,7 +16,7 @@
16 * 16 *
17 * zbud works by storing compressed pages, or "zpages", together in pairs in a 17 * zbud works by storing compressed pages, or "zpages", together in pairs in a
18 * single memory page called a "zbud page". The first buddy is "left 18 * single memory page called a "zbud page". The first buddy is "left
19 * justifed" at the beginning of the zbud page, and the last buddy is "right 19 * justified" at the beginning of the zbud page, and the last buddy is "right
20 * justified" at the end of the zbud page. The benefit is that if either 20 * justified" at the end of the zbud page. The benefit is that if either
21 * buddy is freed, the freed buddy space, coalesced with whatever slack space 21 * buddy is freed, the freed buddy space, coalesced with whatever slack space
22 * that existed between the buddies, results in the largest possible free region 22 * that existed between the buddies, results in the largest possible free region
@@ -243,7 +243,7 @@ void zbud_destroy_pool(struct zbud_pool *pool)
243 * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used 243 * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
244 * as zbud pool pages. 244 * as zbud pool pages.
245 * 245 *
246 * Return: 0 if success and handle is set, otherwise -EINVAL is the size or 246 * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
247 * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate 247 * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
248 * a new page. 248 * a new page.
249 */ 249 */
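The zbud comment corrected above describes the layout this allocator depends on: the first buddy is packed at the start of the page and the second at the end, so freeing either buddy coalesces with the slack in the middle into one contiguous free region. A toy illustration of that accounting, with made-up sizes and an invented helper name (real zbud tracks space in fixed-size chunks, not raw bytes):

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* bytes left between a left-justified first buddy and a
 * right-justified last buddy (0 means that buddy slot is free) */
static unsigned long middle_free_space(unsigned long first, unsigned long last)
{
	return PAGE_SIZE - first - last;
}

int main(void)
{
	printf("%lu\n", middle_free_space(1000, 1500)); /* slack between two live buddies   */
	printf("%lu\n", middle_free_space(1000, 0));    /* last buddy freed: slack coalesces */
	return 0;
}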
diff --git a/mm/zswap.c b/mm/zswap.c
index deda2b671e12..841e35f1db22 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -409,7 +409,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
409 struct page **retpage) 409 struct page **retpage)
410{ 410{
411 struct page *found_page, *new_page = NULL; 411 struct page *found_page, *new_page = NULL;
412 struct address_space *swapper_space = &swapper_spaces[swp_type(entry)]; 412 struct address_space *swapper_space = swap_address_space(entry);
413 int err; 413 int err;
414 414
415 *retpage = NULL; 415 *retpage = NULL;
@@ -790,26 +790,14 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
790static void zswap_frontswap_invalidate_area(unsigned type) 790static void zswap_frontswap_invalidate_area(unsigned type)
791{ 791{
792 struct zswap_tree *tree = zswap_trees[type]; 792 struct zswap_tree *tree = zswap_trees[type];
793 struct rb_node *node; 793 struct zswap_entry *entry, *n;
794 struct zswap_entry *entry;
795 794
796 if (!tree) 795 if (!tree)
797 return; 796 return;
798 797
799 /* walk the tree and free everything */ 798 /* walk the tree and free everything */
800 spin_lock(&tree->lock); 799 spin_lock(&tree->lock);
801 /* 800 rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) {
802 * TODO: Even though this code should not be executed because
803 * the try_to_unuse() in swapoff should have emptied the tree,
804 * it is very wasteful to rebalance the tree after every
805 * removal when we are freeing the whole tree.
806 *
807 * If post-order traversal code is ever added to the rbtree
808 * implementation, it should be used here.
809 */
810 while ((node = rb_first(&tree->rbroot))) {
811 entry = rb_entry(node, struct zswap_entry, rbnode);
812 rb_erase(&entry->rbnode, &tree->rbroot);
813 zbud_free(tree->pool, entry->handle); 801 zbud_free(tree->pool, entry->handle);
814 zswap_entry_cache_free(entry); 802 zswap_entry_cache_free(entry);
815 atomic_dec(&zswap_stored_pages); 803 atomic_dec(&zswap_stored_pages);
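The zswap hunk above replaces the rb_first()/rb_erase() teardown loop, which rebalances the tree after every removal, with rbtree_postorder_for_each_entry_safe(), the post-order walk that the removed TODO comment was asking for. A rough, self-contained analogue of why post-order is the natural shape for destroying a whole tree (plain recursive binary tree, not the kernel rbtree API):

#include <stdlib.h>

struct node {
	struct node *left, *right;
	/* payload would live here */
};

/* free children first, then the node itself; no per-node rebalancing */
static void destroy_postorder(struct node *n)
{
	if (!n)
		return;
	destroy_postorder(n->left);
	destroy_postorder(n->right);
	free(n);
}

int main(void)
{
	struct node *root = calloc(1, sizeof(*root));
	if (root)
		root->left = calloc(1, sizeof(*root->left));
	destroy_postorder(root);
	return 0;
}

Visiting both children before the node itself means every node is freed exactly once and nothing is rebalanced along the way.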
diff --git a/net/socket.c b/net/socket.c
index b2d7c629eeb9..0ceaa5cb9ead 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -3072,12 +3072,12 @@ static int compat_sioc_ifmap(struct net *net, unsigned int cmd,
3072 3072
3073 uifmap32 = &uifr32->ifr_ifru.ifru_map; 3073 uifmap32 = &uifr32->ifr_ifru.ifru_map;
3074 err = copy_from_user(&ifr, uifr32, sizeof(ifr.ifr_name)); 3074 err = copy_from_user(&ifr, uifr32, sizeof(ifr.ifr_name));
3075 err |= __get_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); 3075 err |= get_user(ifr.ifr_map.mem_start, &uifmap32->mem_start);
3076 err |= __get_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); 3076 err |= get_user(ifr.ifr_map.mem_end, &uifmap32->mem_end);
3077 err |= __get_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); 3077 err |= get_user(ifr.ifr_map.base_addr, &uifmap32->base_addr);
3078 err |= __get_user(ifr.ifr_map.irq, &uifmap32->irq); 3078 err |= get_user(ifr.ifr_map.irq, &uifmap32->irq);
3079 err |= __get_user(ifr.ifr_map.dma, &uifmap32->dma); 3079 err |= get_user(ifr.ifr_map.dma, &uifmap32->dma);
3080 err |= __get_user(ifr.ifr_map.port, &uifmap32->port); 3080 err |= get_user(ifr.ifr_map.port, &uifmap32->port);
3081 if (err) 3081 if (err)
3082 return -EFAULT; 3082 return -EFAULT;
3083 3083
@@ -3088,12 +3088,12 @@ static int compat_sioc_ifmap(struct net *net, unsigned int cmd,
3088 3088
3089 if (cmd == SIOCGIFMAP && !err) { 3089 if (cmd == SIOCGIFMAP && !err) {
3090 err = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name)); 3090 err = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name));
3091 err |= __put_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); 3091 err |= put_user(ifr.ifr_map.mem_start, &uifmap32->mem_start);
3092 err |= __put_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); 3092 err |= put_user(ifr.ifr_map.mem_end, &uifmap32->mem_end);
3093 err |= __put_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); 3093 err |= put_user(ifr.ifr_map.base_addr, &uifmap32->base_addr);
3094 err |= __put_user(ifr.ifr_map.irq, &uifmap32->irq); 3094 err |= put_user(ifr.ifr_map.irq, &uifmap32->irq);
3095 err |= __put_user(ifr.ifr_map.dma, &uifmap32->dma); 3095 err |= put_user(ifr.ifr_map.dma, &uifmap32->dma);
3096 err |= __put_user(ifr.ifr_map.port, &uifmap32->port); 3096 err |= put_user(ifr.ifr_map.port, &uifmap32->port);
3097 if (err) 3097 if (err)
3098 err = -EFAULT; 3098 err = -EFAULT;
3099 } 3099 }
@@ -3167,25 +3167,25 @@ static int routing_ioctl(struct net *net, struct socket *sock,
3167 struct in6_rtmsg32 __user *ur6 = argp; 3167 struct in6_rtmsg32 __user *ur6 = argp;
3168 ret = copy_from_user(&r6.rtmsg_dst, &(ur6->rtmsg_dst), 3168 ret = copy_from_user(&r6.rtmsg_dst, &(ur6->rtmsg_dst),
3169 3 * sizeof(struct in6_addr)); 3169 3 * sizeof(struct in6_addr));
3170 ret |= __get_user(r6.rtmsg_type, &(ur6->rtmsg_type)); 3170 ret |= get_user(r6.rtmsg_type, &(ur6->rtmsg_type));
3171 ret |= __get_user(r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len)); 3171 ret |= get_user(r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len));
3172 ret |= __get_user(r6.rtmsg_src_len, &(ur6->rtmsg_src_len)); 3172 ret |= get_user(r6.rtmsg_src_len, &(ur6->rtmsg_src_len));
3173 ret |= __get_user(r6.rtmsg_metric, &(ur6->rtmsg_metric)); 3173 ret |= get_user(r6.rtmsg_metric, &(ur6->rtmsg_metric));
3174 ret |= __get_user(r6.rtmsg_info, &(ur6->rtmsg_info)); 3174 ret |= get_user(r6.rtmsg_info, &(ur6->rtmsg_info));
3175 ret |= __get_user(r6.rtmsg_flags, &(ur6->rtmsg_flags)); 3175 ret |= get_user(r6.rtmsg_flags, &(ur6->rtmsg_flags));
3176 ret |= __get_user(r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex)); 3176 ret |= get_user(r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex));
3177 3177
3178 r = (void *) &r6; 3178 r = (void *) &r6;
3179 } else { /* ipv4 */ 3179 } else { /* ipv4 */
3180 struct rtentry32 __user *ur4 = argp; 3180 struct rtentry32 __user *ur4 = argp;
3181 ret = copy_from_user(&r4.rt_dst, &(ur4->rt_dst), 3181 ret = copy_from_user(&r4.rt_dst, &(ur4->rt_dst),
3182 3 * sizeof(struct sockaddr)); 3182 3 * sizeof(struct sockaddr));
3183 ret |= __get_user(r4.rt_flags, &(ur4->rt_flags)); 3183 ret |= get_user(r4.rt_flags, &(ur4->rt_flags));
3184 ret |= __get_user(r4.rt_metric, &(ur4->rt_metric)); 3184 ret |= get_user(r4.rt_metric, &(ur4->rt_metric));
3185 ret |= __get_user(r4.rt_mtu, &(ur4->rt_mtu)); 3185 ret |= get_user(r4.rt_mtu, &(ur4->rt_mtu));
3186 ret |= __get_user(r4.rt_window, &(ur4->rt_window)); 3186 ret |= get_user(r4.rt_window, &(ur4->rt_window));
3187 ret |= __get_user(r4.rt_irtt, &(ur4->rt_irtt)); 3187 ret |= get_user(r4.rt_irtt, &(ur4->rt_irtt));
3188 ret |= __get_user(rtdev, &(ur4->rt_dev)); 3188 ret |= get_user(rtdev, &(ur4->rt_dev));
3189 if (rtdev) { 3189 if (rtdev) {
3190 ret |= copy_from_user(devname, compat_ptr(rtdev), 15); 3190 ret |= copy_from_user(devname, compat_ptr(rtdev), 15);
3191 r4.rt_dev = (char __user __force *)devname; 3191 r4.rt_dev = (char __user __force *)devname;
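The net/socket.c hunks above convert the compat ioctl helpers from __get_user()/__put_user(), which assume the caller has already validated the user pointer with access_ok(), to get_user()/put_user(), which perform that validation themselves. A schematic userspace stand-in for the checked form (access_ok_stub() and get_user_stub() are invented names, not kernel API):

#include <errno.h>
#include <stdbool.h>
#include <string.h>

/* pretend check that a pointer really refers to user memory */
static bool access_ok_stub(const void *uaddr, size_t len)
{
	(void)uaddr;
	(void)len;
	return true;
}

/* shaped like get_user(): validate first, then fetch the value */
static int get_user_stub(long *dst, const long *uaddr)
{
	if (!access_ok_stub(uaddr, sizeof(*uaddr)))
		return -EFAULT;
	memcpy(dst, uaddr, sizeof(*dst));
	return 0;
}

int main(void)
{
	long src = 42, dst = 0;
	return get_user_stub(&dst, &src);   /* 0 on success, like the kernel helper */
}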
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 2ee9eb750560..47016c304c84 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -31,12 +31,16 @@ my $show_types = 0;
31my $fix = 0; 31my $fix = 0;
32my $root; 32my $root;
33my %debug; 33my %debug;
34my %ignore_type = ();
35my %camelcase = (); 34my %camelcase = ();
35my %use_type = ();
36my @use = ();
37my %ignore_type = ();
36my @ignore = (); 38my @ignore = ();
37my $help = 0; 39my $help = 0;
38my $configuration_file = ".checkpatch.conf"; 40my $configuration_file = ".checkpatch.conf";
39my $max_line_length = 80; 41my $max_line_length = 80;
42my $ignore_perl_version = 0;
43my $minimum_perl_version = 5.10.0;
40 44
41sub help { 45sub help {
42 my ($exitcode) = @_; 46 my ($exitcode) = @_;
@@ -54,6 +58,7 @@ Options:
54 --terse one line per report 58 --terse one line per report
55 -f, --file treat FILE as regular source file 59 -f, --file treat FILE as regular source file
56 --subjective, --strict enable more subjective tests 60 --subjective, --strict enable more subjective tests
61 --types TYPE(,TYPE2...) show only these comma separated message types
57 --ignore TYPE(,TYPE2...) ignore various comma separated message types 62 --ignore TYPE(,TYPE2...) ignore various comma separated message types
58 --max-line-length=n set the maximum line length, if exceeded, warn 63 --max-line-length=n set the maximum line length, if exceeded, warn
59 --show-types show the message "types" in the output 64 --show-types show the message "types" in the output
@@ -71,6 +76,8 @@ Options:
71 "<inputfile>.EXPERIMENTAL-checkpatch-fixes" 76 "<inputfile>.EXPERIMENTAL-checkpatch-fixes"
72 with potential errors corrected to the preferred 77 with potential errors corrected to the preferred
73 checkpatch style 78 checkpatch style
79 --ignore-perl-version override checking of perl version. expect
80 runtime errors.
74 -h, --help, --version display this help and exit 81 -h, --help, --version display this help and exit
75 82
76When FILE is - read standard input. 83When FILE is - read standard input.
@@ -116,6 +123,7 @@ GetOptions(
116 'subjective!' => \$check, 123 'subjective!' => \$check,
117 'strict!' => \$check, 124 'strict!' => \$check,
118 'ignore=s' => \@ignore, 125 'ignore=s' => \@ignore,
126 'types=s' => \@use,
119 'show-types!' => \$show_types, 127 'show-types!' => \$show_types,
120 'max-line-length=i' => \$max_line_length, 128 'max-line-length=i' => \$max_line_length,
121 'root=s' => \$root, 129 'root=s' => \$root,
@@ -123,6 +131,7 @@ GetOptions(
123 'mailback!' => \$mailback, 131 'mailback!' => \$mailback,
124 'summary-file!' => \$summary_file, 132 'summary-file!' => \$summary_file,
125 'fix!' => \$fix, 133 'fix!' => \$fix,
134 'ignore-perl-version!' => \$ignore_perl_version,
126 'debug=s' => \%debug, 135 'debug=s' => \%debug,
127 'test-only=s' => \$tst_only, 136 'test-only=s' => \$tst_only,
128 'h|help' => \$help, 137 'h|help' => \$help,
@@ -133,24 +142,50 @@ help(0) if ($help);
133 142
134my $exit = 0; 143my $exit = 0;
135 144
145if ($^V && $^V lt $minimum_perl_version) {
146 printf "$P: requires at least perl version %vd\n", $minimum_perl_version;
147 if (!$ignore_perl_version) {
148 exit(1);
149 }
150}
151
136if ($#ARGV < 0) { 152if ($#ARGV < 0) {
137 print "$P: no input files\n"; 153 print "$P: no input files\n";
138 exit(1); 154 exit(1);
139} 155}
140 156
141@ignore = split(/,/, join(',',@ignore)); 157sub hash_save_array_words {
142foreach my $word (@ignore) { 158 my ($hashRef, $arrayRef) = @_;
143 $word =~ s/\s*\n?$//g; 159
144 $word =~ s/^\s*//g; 160 my @array = split(/,/, join(',', @$arrayRef));
145 $word =~ s/\s+/ /g; 161 foreach my $word (@array) {
146 $word =~ tr/[a-z]/[A-Z]/; 162 $word =~ s/\s*\n?$//g;
163 $word =~ s/^\s*//g;
164 $word =~ s/\s+/ /g;
165 $word =~ tr/[a-z]/[A-Z]/;
166
167 next if ($word =~ m/^\s*#/);
168 next if ($word =~ m/^\s*$/);
147 169
148 next if ($word =~ m/^\s*#/); 170 $hashRef->{$word}++;
149 next if ($word =~ m/^\s*$/); 171 }
172}
150 173
151 $ignore_type{$word}++; 174sub hash_show_words {
175 my ($hashRef, $prefix) = @_;
176
177 if ($quiet == 0 && keys %$hashRef) {
178 print "NOTE: $prefix message types:";
179 foreach my $word (sort keys %$hashRef) {
180 print " $word";
181 }
182 print "\n\n";
183 }
152} 184}
153 185
186hash_save_array_words(\%ignore_type, \@ignore);
187hash_save_array_words(\%use_type, \@use);
188
154my $dbg_values = 0; 189my $dbg_values = 0;
155my $dbg_possible = 0; 190my $dbg_possible = 0;
156my $dbg_type = 0; 191my $dbg_type = 0;
@@ -207,6 +242,8 @@ our $Sparse = qr{
207 __rcu 242 __rcu
208 }x; 243 }x;
209 244
245our $InitAttribute = qr{__(?:mem|cpu|dev|net_|)(?:initdata|initconst|init\b)};
246
210# Notes to $Attribute: 247# Notes to $Attribute:
211# We need \b after 'init' otherwise 'initconst' will cause a false positive in a check 248# We need \b after 'init' otherwise 'initconst' will cause a false positive in a check
212our $Attribute = qr{ 249our $Attribute = qr{
@@ -227,7 +264,7 @@ our $Attribute = qr{
227 __deprecated| 264 __deprecated|
228 __read_mostly| 265 __read_mostly|
229 __kprobes| 266 __kprobes|
230 __(?:mem|cpu|dev|)(?:initdata|initconst|init\b)| 267 $InitAttribute|
231 ____cacheline_aligned| 268 ____cacheline_aligned|
232 ____cacheline_aligned_in_smp| 269 ____cacheline_aligned_in_smp|
233 ____cacheline_internodealigned_in_smp| 270 ____cacheline_internodealigned_in_smp|
@@ -257,6 +294,7 @@ our $Operators = qr{
257 }x; 294 }x;
258 295
259our $NonptrType; 296our $NonptrType;
297our $NonptrTypeWithAttr;
260our $Type; 298our $Type;
261our $Declare; 299our $Declare;
262 300
@@ -319,6 +357,12 @@ our @typeList = (
319 qr{${Ident}_handler}, 357 qr{${Ident}_handler},
320 qr{${Ident}_handler_fn}, 358 qr{${Ident}_handler_fn},
321); 359);
360our @typeListWithAttr = (
361 @typeList,
362 qr{struct\s+$InitAttribute\s+$Ident},
363 qr{union\s+$InitAttribute\s+$Ident},
364);
365
322our @modifierList = ( 366our @modifierList = (
323 qr{fastcall}, 367 qr{fastcall},
324); 368);
@@ -332,6 +376,7 @@ our $allowed_asm_includes = qr{(?x:
332sub build_types { 376sub build_types {
333 my $mods = "(?x: \n" . join("|\n ", @modifierList) . "\n)"; 377 my $mods = "(?x: \n" . join("|\n ", @modifierList) . "\n)";
334 my $all = "(?x: \n" . join("|\n ", @typeList) . "\n)"; 378 my $all = "(?x: \n" . join("|\n ", @typeList) . "\n)";
379 my $allWithAttr = "(?x: \n" . join("|\n ", @typeListWithAttr) . "\n)";
335 $Modifier = qr{(?:$Attribute|$Sparse|$mods)}; 380 $Modifier = qr{(?:$Attribute|$Sparse|$mods)};
336 $NonptrType = qr{ 381 $NonptrType = qr{
337 (?:$Modifier\s+|const\s+)* 382 (?:$Modifier\s+|const\s+)*
@@ -342,6 +387,15 @@ sub build_types {
342 ) 387 )
343 (?:\s+$Modifier|\s+const)* 388 (?:\s+$Modifier|\s+const)*
344 }x; 389 }x;
390 $NonptrTypeWithAttr = qr{
391 (?:$Modifier\s+|const\s+)*
392 (?:
393 (?:typeof|__typeof__)\s*\([^\)]*\)|
394 (?:$typeTypedefs\b)|
395 (?:${allWithAttr}\b)
396 )
397 (?:\s+$Modifier|\s+const)*
398 }x;
345 $Type = qr{ 399 $Type = qr{
346 $NonptrType 400 $NonptrType
347 (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*|\[\])+|(?:\s*\[\s*\])+)? 401 (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*|\[\])+|(?:\s*\[\s*\])+)?
@@ -1355,7 +1409,9 @@ sub possible {
1355my $prefix = ''; 1409my $prefix = '';
1356 1410
1357sub show_type { 1411sub show_type {
1358 return !defined $ignore_type{$_[0]}; 1412 return defined $use_type{$_[0]} if (scalar keys %use_type > 0);
1413
1414 return !defined $ignore_type{$_[0]};
1359} 1415}
1360 1416
1361sub report { 1417sub report {
@@ -1435,7 +1491,23 @@ sub check_absolute_file {
1435sub trim { 1491sub trim {
1436 my ($string) = @_; 1492 my ($string) = @_;
1437 1493
1438 $string =~ s/(^\s+|\s+$)//g; 1494 $string =~ s/^\s+|\s+$//g;
1495
1496 return $string;
1497}
1498
1499sub ltrim {
1500 my ($string) = @_;
1501
1502 $string =~ s/^\s+//;
1503
1504 return $string;
1505}
1506
1507sub rtrim {
1508 my ($string) = @_;
1509
1510 $string =~ s/\s+$//;
1439 1511
1440 return $string; 1512 return $string;
1441} 1513}
@@ -1532,6 +1604,7 @@ sub process {
1532 my %suppress_export; 1604 my %suppress_export;
1533 my $suppress_statement = 0; 1605 my $suppress_statement = 0;
1534 1606
1607 my %signatures = ();
1535 1608
1536 # Pre-scan the patch sanitizing the lines. 1609 # Pre-scan the patch sanitizing the lines.
1537 # Pre-scan the patch looking for any __setup documentation. 1610 # Pre-scan the patch looking for any __setup documentation.
@@ -1624,6 +1697,8 @@ sub process {
1624 $linenr = 0; 1697 $linenr = 0;
1625 foreach my $line (@lines) { 1698 foreach my $line (@lines) {
1626 $linenr++; 1699 $linenr++;
1700 my $sline = $line; #copy of $line
1701 $sline =~ s/$;/ /g; #with comments as spaces
1627 1702
1628 my $rawline = $rawlines[$linenr - 1]; 1703 my $rawline = $rawlines[$linenr - 1];
1629 1704
@@ -1781,6 +1856,17 @@ sub process {
1781 "email address '$email' might be better as '$suggested_email$comment'\n" . $herecurr); 1856 "email address '$email' might be better as '$suggested_email$comment'\n" . $herecurr);
1782 } 1857 }
1783 } 1858 }
1859
1860# Check for duplicate signatures
1861 my $sig_nospace = $line;
1862 $sig_nospace =~ s/\s//g;
1863 $sig_nospace = lc($sig_nospace);
1864 if (defined $signatures{$sig_nospace}) {
1865 WARN("BAD_SIGN_OFF",
1866 "Duplicate signature\n" . $herecurr);
1867 } else {
1868 $signatures{$sig_nospace} = 1;
1869 }
1784 } 1870 }
1785 1871
1786# Check for wrappage within a valid hunk of the file 1872# Check for wrappage within a valid hunk of the file
@@ -1845,15 +1931,17 @@ sub process {
1845#trailing whitespace 1931#trailing whitespace
1846 if ($line =~ /^\+.*\015/) { 1932 if ($line =~ /^\+.*\015/) {
1847 my $herevet = "$here\n" . cat_vet($rawline) . "\n"; 1933 my $herevet = "$here\n" . cat_vet($rawline) . "\n";
1848 ERROR("DOS_LINE_ENDINGS", 1934 if (ERROR("DOS_LINE_ENDINGS",
1849 "DOS line endings\n" . $herevet); 1935 "DOS line endings\n" . $herevet) &&
1850 1936 $fix) {
1937 $fixed[$linenr - 1] =~ s/[\s\015]+$//;
1938 }
1851 } elsif ($rawline =~ /^\+.*\S\s+$/ || $rawline =~ /^\+\s+$/) { 1939 } elsif ($rawline =~ /^\+.*\S\s+$/ || $rawline =~ /^\+\s+$/) {
1852 my $herevet = "$here\n" . cat_vet($rawline) . "\n"; 1940 my $herevet = "$here\n" . cat_vet($rawline) . "\n";
1853 if (ERROR("TRAILING_WHITESPACE", 1941 if (ERROR("TRAILING_WHITESPACE",
1854 "trailing whitespace\n" . $herevet) && 1942 "trailing whitespace\n" . $herevet) &&
1855 $fix) { 1943 $fix) {
1856 $fixed[$linenr - 1] =~ s/^(\+.*?)\s+$/$1/; 1944 $fixed[$linenr - 1] =~ s/\s+$//;
1857 } 1945 }
1858 1946
1859 $rpt_cleaners = 1; 1947 $rpt_cleaners = 1;
@@ -2060,6 +2148,7 @@ sub process {
2060 if ($realfile =~ m@^(drivers/net/|net/)@ && 2148 if ($realfile =~ m@^(drivers/net/|net/)@ &&
2061 $prevrawline =~ /^\+[ \t]*\/\*/ && #starting /* 2149 $prevrawline =~ /^\+[ \t]*\/\*/ && #starting /*
2062 $prevrawline !~ /\*\/[ \t]*$/ && #no trailing */ 2150 $prevrawline !~ /\*\/[ \t]*$/ && #no trailing */
2151 $rawline =~ /^\+/ && #line is new
2063 $rawline !~ /^\+[ \t]*\*/) { #no leading * 2152 $rawline !~ /^\+[ \t]*\*/) { #no leading *
2064 WARN("NETWORKING_BLOCK_COMMENT_STYLE", 2153 WARN("NETWORKING_BLOCK_COMMENT_STYLE",
2065 "networking block comments start with * on subsequent lines\n" . $hereprev); 2154 "networking block comments start with * on subsequent lines\n" . $hereprev);
@@ -2126,7 +2215,7 @@ sub process {
2126 $realline_next); 2215 $realline_next);
2127#print "LINE<$line>\n"; 2216#print "LINE<$line>\n";
2128 if ($linenr >= $suppress_statement && 2217 if ($linenr >= $suppress_statement &&
2129 $realcnt && $line =~ /.\s*\S/) { 2218 $realcnt && $sline =~ /.\s*\S/) {
2130 ($stat, $cond, $line_nr_next, $remain_next, $off_next) = 2219 ($stat, $cond, $line_nr_next, $remain_next, $off_next) =
2131 ctx_statement_block($linenr, $realcnt, 0); 2220 ctx_statement_block($linenr, $realcnt, 0);
2132 $stat =~ s/\n./\n /g; 2221 $stat =~ s/\n./\n /g;
@@ -2486,16 +2575,22 @@ sub process {
2486 } 2575 }
2487 2576
2488# check for global initialisers. 2577# check for global initialisers.
2489 if ($line =~ /^.$Type\s*$Ident\s*(?:\s+$Modifier)*\s*=\s*(0|NULL|false)\s*;/) { 2578 if ($line =~ /^\+(\s*$Type\s*$Ident\s*(?:\s+$Modifier))*\s*=\s*(0|NULL|false)\s*;/) {
2490 ERROR("GLOBAL_INITIALISERS", 2579 if (ERROR("GLOBAL_INITIALISERS",
2491 "do not initialise globals to 0 or NULL\n" . 2580 "do not initialise globals to 0 or NULL\n" .
2492 $herecurr); 2581 $herecurr) &&
2582 $fix) {
2583 $fixed[$linenr - 1] =~ s/($Type\s*$Ident\s*(?:\s+$Modifier))*\s*=\s*(0|NULL|false)\s*;/$1;/;
2584 }
2493 } 2585 }
2494# check for static initialisers. 2586# check for static initialisers.
2495 if ($line =~ /\bstatic\s.*=\s*(0|NULL|false)\s*;/) { 2587 if ($line =~ /^\+.*\bstatic\s.*=\s*(0|NULL|false)\s*;/) {
2496 ERROR("INITIALISED_STATIC", 2588 if (ERROR("INITIALISED_STATIC",
2497 "do not initialise statics to 0 or NULL\n" . 2589 "do not initialise statics to 0 or NULL\n" .
2498 $herecurr); 2590 $herecurr) &&
2591 $fix) {
2592 $fixed[$linenr - 1] =~ s/(\bstatic\s.*?)\s*=\s*(0|NULL|false)\s*;/$1;/;
2593 }
2499 } 2594 }
2500 2595
2501# check for static const char * arrays. 2596# check for static const char * arrays.
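The checkpatch hunk above anchors the GLOBAL_INITIALISERS and INITIALISED_STATIC patterns to added lines and teaches --fix to strip the redundant initialiser. The underlying C rule: objects with static storage duration are zero-initialised anyway, so "= 0" or "= NULL" only adds noise. For example:

#include <stddef.h>

int global_counter;                     /* preferred: static storage is zeroed anyway */
/* int global_counter = 0; */           /* would be flagged: GLOBAL_INITIALISERS      */

static const char *label;               /* preferred: implicitly NULL                 */
/* static const char *label = NULL; */  /* would be flagged: INITIALISED_STATIC       */

int main(void)
{
	return (global_counter == 0 && label == NULL) ? 0 : 1;
}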
@@ -2638,8 +2733,12 @@ sub process {
2638 } 2733 }
2639 2734
2640 if ($line =~ /\bpr_warning\s*\(/) { 2735 if ($line =~ /\bpr_warning\s*\(/) {
2641 WARN("PREFER_PR_LEVEL", 2736 if (WARN("PREFER_PR_LEVEL",
2642 "Prefer pr_warn(... to pr_warning(...\n" . $herecurr); 2737 "Prefer pr_warn(... to pr_warning(...\n" . $herecurr) &&
2738 $fix) {
2739 $fixed[$linenr - 1] =~
2740 s/\bpr_warning\b/pr_warn/;
2741 }
2643 } 2742 }
2644 2743
2645 if ($line =~ /\bdev_printk\s*\(\s*KERN_([A-Z]+)/) { 2744 if ($line =~ /\bdev_printk\s*\(\s*KERN_([A-Z]+)/) {
@@ -2759,6 +2858,7 @@ sub process {
2759 $off = 0; 2858 $off = 0;
2760 2859
2761 my $blank = copy_spacing($opline); 2860 my $blank = copy_spacing($opline);
2861 my $last_after = -1;
2762 2862
2763 for (my $n = 0; $n < $#elements; $n += 2) { 2863 for (my $n = 0; $n < $#elements; $n += 2) {
2764 2864
@@ -2824,7 +2924,7 @@ sub process {
2824 $cc !~ /^\\/ && $cc !~ /^;/) { 2924 $cc !~ /^\\/ && $cc !~ /^;/) {
2825 if (ERROR("SPACING", 2925 if (ERROR("SPACING",
2826 "space required after that '$op' $at\n" . $hereptr)) { 2926 "space required after that '$op' $at\n" . $hereptr)) {
2827 $good = trim($fix_elements[$n]) . " " . trim($fix_elements[$n + 1]) . " "; 2927 $good = $fix_elements[$n] . trim($fix_elements[$n + 1]) . " ";
2828 $line_fixed = 1; 2928 $line_fixed = 1;
2829 } 2929 }
2830 } 2930 }
@@ -2839,11 +2939,11 @@ sub process {
2839 if ($ctx =~ /Wx.|.xW/) { 2939 if ($ctx =~ /Wx.|.xW/) {
2840 if (ERROR("SPACING", 2940 if (ERROR("SPACING",
2841 "spaces prohibited around that '$op' $at\n" . $hereptr)) { 2941 "spaces prohibited around that '$op' $at\n" . $hereptr)) {
2842 $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]); 2942 $good = rtrim($fix_elements[$n]) . trim($fix_elements[$n + 1]);
2843 $line_fixed = 1;
2844 if (defined $fix_elements[$n + 2]) { 2943 if (defined $fix_elements[$n + 2]) {
2845 $fix_elements[$n + 2] =~ s/^\s+//; 2944 $fix_elements[$n + 2] =~ s/^\s+//;
2846 } 2945 }
2946 $line_fixed = 1;
2847 } 2947 }
2848 } 2948 }
2849 2949
@@ -2852,8 +2952,9 @@ sub process {
2852 if ($ctx !~ /.x[WEC]/ && $cc !~ /^}/) { 2952 if ($ctx !~ /.x[WEC]/ && $cc !~ /^}/) {
2853 if (ERROR("SPACING", 2953 if (ERROR("SPACING",
2854 "space required after that '$op' $at\n" . $hereptr)) { 2954 "space required after that '$op' $at\n" . $hereptr)) {
2855 $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]) . " "; 2955 $good = $fix_elements[$n] . trim($fix_elements[$n + 1]) . " ";
2856 $line_fixed = 1; 2956 $line_fixed = 1;
2957 $last_after = $n;
2857 } 2958 }
2858 } 2959 }
2859 2960
@@ -2870,8 +2971,10 @@ sub process {
2870 if ($ctx !~ /[WEBC]x./ && $ca !~ /(?:\)|!|~|\*|-|\&|\||\+\+|\-\-|\{)$/) { 2971 if ($ctx !~ /[WEBC]x./ && $ca !~ /(?:\)|!|~|\*|-|\&|\||\+\+|\-\-|\{)$/) {
2871 if (ERROR("SPACING", 2972 if (ERROR("SPACING",
2872 "space required before that '$op' $at\n" . $hereptr)) { 2973 "space required before that '$op' $at\n" . $hereptr)) {
2873 $good = trim($fix_elements[$n]) . " " . trim($fix_elements[$n + 1]); 2974 if ($n != $last_after + 2) {
2874 $line_fixed = 1; 2975 $good = $fix_elements[$n] . " " . ltrim($fix_elements[$n + 1]);
2976 $line_fixed = 1;
2977 }
2875 } 2978 }
2876 } 2979 }
2877 if ($op eq '*' && $cc =~/\s*$Modifier\b/) { 2980 if ($op eq '*' && $cc =~/\s*$Modifier\b/) {
@@ -2880,12 +2983,11 @@ sub process {
2880 } elsif ($ctx =~ /.xW/) { 2983 } elsif ($ctx =~ /.xW/) {
2881 if (ERROR("SPACING", 2984 if (ERROR("SPACING",
2882 "space prohibited after that '$op' $at\n" . $hereptr)) { 2985 "space prohibited after that '$op' $at\n" . $hereptr)) {
2883 $fixed_line =~ s/\s+$//; 2986 $good = $fix_elements[$n] . rtrim($fix_elements[$n + 1]);
2884 $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]);
2885 $line_fixed = 1;
2886 if (defined $fix_elements[$n + 2]) { 2987 if (defined $fix_elements[$n + 2]) {
2887 $fix_elements[$n + 2] =~ s/^\s+//; 2988 $fix_elements[$n + 2] =~ s/^\s+//;
2888 } 2989 }
2990 $line_fixed = 1;
2889 } 2991 }
2890 } 2992 }
2891 2993
@@ -2894,8 +2996,7 @@ sub process {
2894 if ($ctx !~ /[WEOBC]x[^W]/ && $ctx !~ /[^W]x[WOBEC]/) { 2996 if ($ctx !~ /[WEOBC]x[^W]/ && $ctx !~ /[^W]x[WOBEC]/) {
2895 if (ERROR("SPACING", 2997 if (ERROR("SPACING",
2896 "space required one side of that '$op' $at\n" . $hereptr)) { 2998 "space required one side of that '$op' $at\n" . $hereptr)) {
2897 $fixed_line =~ s/\s+$//; 2999 $good = $fix_elements[$n] . trim($fix_elements[$n + 1]) . " ";
2898 $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]) . " ";
2899 $line_fixed = 1; 3000 $line_fixed = 1;
2900 } 3001 }
2901 } 3002 }
@@ -2903,20 +3004,18 @@ sub process {
2903 ($ctx =~ /Wx./ && $cc =~ /^;/)) { 3004 ($ctx =~ /Wx./ && $cc =~ /^;/)) {
2904 if (ERROR("SPACING", 3005 if (ERROR("SPACING",
2905 "space prohibited before that '$op' $at\n" . $hereptr)) { 3006 "space prohibited before that '$op' $at\n" . $hereptr)) {
2906 $fixed_line =~ s/\s+$//; 3007 $good = rtrim($fix_elements[$n]) . trim($fix_elements[$n + 1]);
2907 $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]);
2908 $line_fixed = 1; 3008 $line_fixed = 1;
2909 } 3009 }
2910 } 3010 }
2911 if ($ctx =~ /ExW/) { 3011 if ($ctx =~ /ExW/) {
2912 if (ERROR("SPACING", 3012 if (ERROR("SPACING",
2913 "space prohibited after that '$op' $at\n" . $hereptr)) { 3013 "space prohibited after that '$op' $at\n" . $hereptr)) {
2914 $fixed_line =~ s/\s+$//; 3014 $good = $fix_elements[$n] . trim($fix_elements[$n + 1]);
2915 $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]);
2916 $line_fixed = 1;
2917 if (defined $fix_elements[$n + 2]) { 3015 if (defined $fix_elements[$n + 2]) {
2918 $fix_elements[$n + 2] =~ s/^\s+//; 3016 $fix_elements[$n + 2] =~ s/^\s+//;
2919 } 3017 }
3018 $line_fixed = 1;
2920 } 3019 }
2921 } 3020 }
2922 3021
@@ -2930,8 +3029,10 @@ sub process {
2930 if ($ctx =~ /Wx[^WCE]|[^WCE]xW/) { 3029 if ($ctx =~ /Wx[^WCE]|[^WCE]xW/) {
2931 if (ERROR("SPACING", 3030 if (ERROR("SPACING",
2932 "need consistent spacing around '$op' $at\n" . $hereptr)) { 3031 "need consistent spacing around '$op' $at\n" . $hereptr)) {
2933 $fixed_line =~ s/\s+$//; 3032 $good = rtrim($fix_elements[$n]) . " " . trim($fix_elements[$n + 1]) . " ";
2934 $good = trim($fix_elements[$n]) . " " . trim($fix_elements[$n + 1]) . " "; 3033 if (defined $fix_elements[$n + 2]) {
3034 $fix_elements[$n + 2] =~ s/^\s+//;
3035 }
2935 $line_fixed = 1; 3036 $line_fixed = 1;
2936 } 3037 }
2937 } 3038 }
@@ -2942,7 +3043,7 @@ sub process {
2942 if ($ctx =~ /Wx./) { 3043 if ($ctx =~ /Wx./) {
2943 if (ERROR("SPACING", 3044 if (ERROR("SPACING",
2944 "space prohibited before that '$op' $at\n" . $hereptr)) { 3045 "space prohibited before that '$op' $at\n" . $hereptr)) {
2945 $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]); 3046 $good = rtrim($fix_elements[$n]) . trim($fix_elements[$n + 1]);
2946 $line_fixed = 1; 3047 $line_fixed = 1;
2947 } 3048 }
2948 } 3049 }
@@ -2969,8 +3070,10 @@ sub process {
2969 if ($ok == 0) { 3070 if ($ok == 0) {
2970 if (ERROR("SPACING", 3071 if (ERROR("SPACING",
2971 "spaces required around that '$op' $at\n" . $hereptr)) { 3072 "spaces required around that '$op' $at\n" . $hereptr)) {
2972 $good = trim($fix_elements[$n]) . " " . trim($fix_elements[$n + 1]) . " "; 3073 $good = rtrim($fix_elements[$n]) . " " . trim($fix_elements[$n + 1]) . " ";
2973 $good = $fix_elements[$n] . " " . trim($fix_elements[$n + 1]) . " "; 3074 if (defined $fix_elements[$n + 2]) {
3075 $fix_elements[$n + 2] =~ s/^\s+//;
3076 }
2974 $line_fixed = 1; 3077 $line_fixed = 1;
2975 } 3078 }
2976 } 3079 }
@@ -3031,8 +3134,7 @@ sub process {
3031 if (ERROR("SPACING", 3134 if (ERROR("SPACING",
3032 "space required before the open brace '{'\n" . $herecurr) && 3135 "space required before the open brace '{'\n" . $herecurr) &&
3033 $fix) { 3136 $fix) {
3034 $fixed[$linenr - 1] =~ 3137 $fixed[$linenr - 1] =~ s/^(\+.*(?:do|\))){/$1 {/;
3035 s/^(\+.*(?:do|\))){/$1 {/;
3036 } 3138 }
3037 } 3139 }
3038 3140
@@ -3047,8 +3149,12 @@ sub process {
3047# closing brace should have a space following it when it has anything 3149# closing brace should have a space following it when it has anything
3048# on the line 3150# on the line
3049 if ($line =~ /}(?!(?:,|;|\)))\S/) { 3151 if ($line =~ /}(?!(?:,|;|\)))\S/) {
3050 ERROR("SPACING", 3152 if (ERROR("SPACING",
3051 "space required after that close brace '}'\n" . $herecurr); 3153 "space required after that close brace '}'\n" . $herecurr) &&
3154 $fix) {
3155 $fixed[$linenr - 1] =~
3156 s/}((?!(?:,|;|\)))\S)/} $1/;
3157 }
3052 } 3158 }
3053 3159
3054# check spacing on square brackets 3160# check spacing on square brackets
@@ -3271,8 +3377,13 @@ sub process {
3271 3377
3272#gcc binary extension 3378#gcc binary extension
3273 if ($var =~ /^$Binary$/) { 3379 if ($var =~ /^$Binary$/) {
3274 WARN("GCC_BINARY_CONSTANT", 3380 if (WARN("GCC_BINARY_CONSTANT",
3275 "Avoid gcc v4.3+ binary constant extension: <$var>\n" . $herecurr); 3381 "Avoid gcc v4.3+ binary constant extension: <$var>\n" . $herecurr) &&
3382 $fix) {
3383 my $hexval = sprintf("0x%x", oct($var));
3384 $fixed[$linenr - 1] =~
3385 s/\b$var\b/$hexval/;
3386 }
3276 } 3387 }
3277 3388
3278#CamelCase 3389#CamelCase
@@ -3282,19 +3393,26 @@ sub process {
3282 $var !~ /^(?:Clear|Set|TestClear|TestSet|)Page[A-Z]/ && 3393 $var !~ /^(?:Clear|Set|TestClear|TestSet|)Page[A-Z]/ &&
3283#Ignore SI style variants like nS, mV and dB (ie: max_uV, regulator_min_uA_show) 3394#Ignore SI style variants like nS, mV and dB (ie: max_uV, regulator_min_uA_show)
3284 $var !~ /^(?:[a-z_]*?)_?[a-z][A-Z](?:_[a-z_]+)?$/) { 3395 $var !~ /^(?:[a-z_]*?)_?[a-z][A-Z](?:_[a-z_]+)?$/) {
3285 seed_camelcase_includes() if ($check); 3396 while ($var =~ m{($Ident)}g) {
3286 if (!defined $camelcase{$var}) { 3397 my $word = $1;
3287 $camelcase{$var} = 1; 3398 next if ($word !~ /[A-Z][a-z]|[a-z][A-Z]/);
3288 CHK("CAMELCASE", 3399 seed_camelcase_includes() if ($check);
3289 "Avoid CamelCase: <$var>\n" . $herecurr); 3400 if (!defined $camelcase{$word}) {
3401 $camelcase{$word} = 1;
3402 CHK("CAMELCASE",
3403 "Avoid CamelCase: <$word>\n" . $herecurr);
3404 }
3290 } 3405 }
3291 } 3406 }
3292 } 3407 }
3293 3408
3294#no spaces allowed after \ in define 3409#no spaces allowed after \ in define
3295 if ($line=~/\#\s*define.*\\\s$/) { 3410 if ($line =~ /\#\s*define.*\\\s+$/) {
3296 WARN("WHITESPACE_AFTER_LINE_CONTINUATION", 3411 if (WARN("WHITESPACE_AFTER_LINE_CONTINUATION",
3297 "Whitepspace after \\ makes next lines useless\n" . $herecurr); 3412 "Whitespace after \\ makes next lines useless\n" . $herecurr) &&
3413 $fix) {
3414 $fixed[$linenr - 1] =~ s/\s+$//;
3415 }
3298 } 3416 }
3299 3417
3300#warn if <asm/foo.h> is #included and <linux/foo.h> is available (uses RAW line) 3418#warn if <asm/foo.h> is #included and <linux/foo.h> is available (uses RAW line)
@@ -3374,7 +3492,8 @@ sub process {
3374 $dstat !~ /^for\s*$Constant$/ && # for (...) 3492 $dstat !~ /^for\s*$Constant$/ && # for (...)
3375 $dstat !~ /^for\s*$Constant\s+(?:$Ident|-?$Constant)$/ && # for (...) bar() 3493 $dstat !~ /^for\s*$Constant\s+(?:$Ident|-?$Constant)$/ && # for (...) bar()
3376 $dstat !~ /^do\s*{/ && # do {... 3494 $dstat !~ /^do\s*{/ && # do {...
3377 $dstat !~ /^\({/) # ({... 3495 $dstat !~ /^\({/ && # ({...
3496 $ctx !~ /^.\s*#\s*define\s+TRACE_(?:SYSTEM|INCLUDE_FILE|INCLUDE_PATH)\b/)
3378 { 3497 {
3379 $ctx =~ s/\n*$//; 3498 $ctx =~ s/\n*$//;
3380 my $herectx = $here . "\n"; 3499 my $herectx = $here . "\n";
@@ -3606,6 +3725,32 @@ sub process {
3606 } 3725 }
3607 } 3726 }
3608 3727
3728sub string_find_replace {
3729 my ($string, $find, $replace) = @_;
3730
3731 $string =~ s/$find/$replace/g;
3732
3733 return $string;
3734}
3735
3736# check for bad placement of section $InitAttribute (e.g.: __initdata)
3737 if ($line =~ /(\b$InitAttribute\b)/) {
3738 my $attr = $1;
3739 if ($line =~ /^\+\s*static\s+(?:const\s+)?(?:$attr\s+)?($NonptrTypeWithAttr)\s+(?:$attr\s+)?($Ident(?:\[[^]]*\])?)\s*[=;]/) {
3740 my $ptr = $1;
3741 my $var = $2;
3742 if ((($ptr =~ /\b(union|struct)\s+$attr\b/ &&
3743 ERROR("MISPLACED_INIT",
3744 "$attr should be placed after $var\n" . $herecurr)) ||
3745 ($ptr !~ /\b(union|struct)\s+$attr\b/ &&
3746 WARN("MISPLACED_INIT",
3747 "$attr should be placed after $var\n" . $herecurr))) &&
3748 $fix) {
3749 $fixed[$linenr - 1] =~ s/(\bstatic\s+(?:const\s+)?)(?:$attr\s+)?($NonptrTypeWithAttr)\s+(?:$attr\s+)?($Ident(?:\[[^]]*\])?)\s*([=;])\s*/"$1" . trim(string_find_replace($2, "\\s*$attr\\s*", " ")) . " " . trim(string_find_replace($3, "\\s*$attr\\s*", "")) . " $attr" . ("$4" eq ";" ? ";" : " = ")/e;
3750 }
3751 }
3752 }
3753
3609# prefer usleep_range over udelay 3754# prefer usleep_range over udelay
3610 if ($line =~ /\budelay\s*\(\s*(\d+)\s*\)/) { 3755 if ($line =~ /\budelay\s*\(\s*(\d+)\s*\)/) {
3611 # ignore udelay's < 10, however 3756 # ignore udelay's < 10, however
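The MISPLACED_INIT check added above complains when a section annotation such as __initdata sits between the storage class and the type rather than after the identifier. A compilable toy of the preferred placement (the macro is stubbed out here; in the kernel it really moves the object into the init section):

#define __initdata   /* stand-in; the kernel macro places the object in the init section */

static int boot_mode __initdata = 1;        /* preferred: attribute after the name */
/* static __initdata int boot_mode = 1; */  /* would be flagged as MISPLACED_INIT  */

int main(void)
{
	return boot_mode ? 0 : 1;
}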
@@ -3691,8 +3836,12 @@ sub process {
3691 3836
3692# Check for __inline__ and __inline, prefer inline 3837# Check for __inline__ and __inline, prefer inline
3693 if ($line =~ /\b(__inline__|__inline)\b/) { 3838 if ($line =~ /\b(__inline__|__inline)\b/) {
3694 WARN("INLINE", 3839 if (WARN("INLINE",
3695 "plain inline is preferred over $1\n" . $herecurr); 3840 "plain inline is preferred over $1\n" . $herecurr) &&
3841 $fix) {
3842 $fixed[$linenr - 1] =~ s/\b(__inline__|__inline)\b/inline/;
3843
3844 }
3696 } 3845 }
3697 3846
3698# Check for __attribute__ packed, prefer __packed 3847# Check for __attribute__ packed, prefer __packed
@@ -3709,14 +3858,21 @@ sub process {
3709 3858
3710# Check for __attribute__ format(printf, prefer __printf 3859# Check for __attribute__ format(printf, prefer __printf
3711 if ($line =~ /\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf/) { 3860 if ($line =~ /\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf/) {
3712 WARN("PREFER_PRINTF", 3861 if (WARN("PREFER_PRINTF",
3713 "__printf(string-index, first-to-check) is preferred over __attribute__((format(printf, string-index, first-to-check)))\n" . $herecurr); 3862 "__printf(string-index, first-to-check) is preferred over __attribute__((format(printf, string-index, first-to-check)))\n" . $herecurr) &&
3863 $fix) {
3864 $fixed[$linenr - 1] =~ s/\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf\s*,\s*(.*)\)\s*\)\s*\)/"__printf(" . trim($1) . ")"/ex;
3865
3866 }
3714 } 3867 }
3715 3868
3716# Check for __attribute__ format(scanf, prefer __scanf 3869# Check for __attribute__ format(scanf, prefer __scanf
3717 if ($line =~ /\b__attribute__\s*\(\s*\(\s*format\s*\(\s*scanf\b/) { 3870 if ($line =~ /\b__attribute__\s*\(\s*\(\s*format\s*\(\s*scanf\b/) {
3718 WARN("PREFER_SCANF", 3871 if (WARN("PREFER_SCANF",
3719 "__scanf(string-index, first-to-check) is preferred over __attribute__((format(scanf, string-index, first-to-check)))\n" . $herecurr); 3872 "__scanf(string-index, first-to-check) is preferred over __attribute__((format(scanf, string-index, first-to-check)))\n" . $herecurr) &&
3873 $fix) {
3874 $fixed[$linenr - 1] =~ s/\b__attribute__\s*\(\s*\(\s*format\s*\(\s*scanf\s*,\s*(.*)\)\s*\)\s*\)/"__scanf(" . trim($1) . ")"/ex;
3875 }
3720 } 3876 }
3721 3877
3722# check for sizeof(&) 3878# check for sizeof(&)
@@ -3727,8 +3883,11 @@ sub process {
3727 3883
3728# check for sizeof without parenthesis 3884# check for sizeof without parenthesis
3729 if ($line =~ /\bsizeof\s+((?:\*\s*|)$Lval|$Type(?:\s+$Lval|))/) { 3885 if ($line =~ /\bsizeof\s+((?:\*\s*|)$Lval|$Type(?:\s+$Lval|))/) {
3730 WARN("SIZEOF_PARENTHESIS", 3886 if (WARN("SIZEOF_PARENTHESIS",
3731 "sizeof $1 should be sizeof($1)\n" . $herecurr); 3887 "sizeof $1 should be sizeof($1)\n" . $herecurr) &&
3888 $fix) {
3889 $fixed[$linenr - 1] =~ s/\bsizeof\s+((?:\*\s*|)$Lval|$Type(?:\s+$Lval|))/"sizeof(" . trim($1) . ")"/ex;
3890 }
3732 } 3891 }
3733 3892
3734# check for line continuations in quoted strings with odd counts of " 3893# check for line continuations in quoted strings with odd counts of "
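The SIZEOF_PARENTHESIS hunk above lets --fix rewrite "sizeof x" into "sizeof(x)". Both spellings are valid C, since sizeof is an operator rather than a function, but kernel style prefers the parenthesised form; for example:

#include <stdio.h>

int main(void)
{
	long x = 0;

	printf("%zu\n", sizeof x);     /* legal, but flagged by checkpatch     */
	printf("%zu\n", sizeof(x));    /* preferred form, what --fix now emits */
	return 0;
}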
@@ -3747,8 +3906,11 @@ sub process {
3747 if ($line =~ /\bseq_printf\s*\(/) { 3906 if ($line =~ /\bseq_printf\s*\(/) {
3748 my $fmt = get_quoted_string($line, $rawline); 3907 my $fmt = get_quoted_string($line, $rawline);
3749 if ($fmt !~ /[^\\]\%/) { 3908 if ($fmt !~ /[^\\]\%/) {
3750 WARN("PREFER_SEQ_PUTS", 3909 if (WARN("PREFER_SEQ_PUTS",
3751 "Prefer seq_puts to seq_printf\n" . $herecurr); 3910 "Prefer seq_puts to seq_printf\n" . $herecurr) &&
3911 $fix) {
3912 $fixed[$linenr - 1] =~ s/\bseq_printf\b/seq_puts/;
3913 }
3752 } 3914 }
3753 } 3915 }
3754 3916
@@ -3810,6 +3972,16 @@ sub process {
3810 } 3972 }
3811 } 3973 }
3812 3974
3975# check for new externs in .h files.
3976 if ($realfile =~ /\.h$/ &&
3977 $line =~ /^\+\s*(extern\s+)$Type\s*$Ident\s*\(/s) {
3978 if (WARN("AVOID_EXTERNS",
3979 "extern prototypes should be avoided in .h files\n" . $herecurr) &&
3980 $fix) {
3981 $fixed[$linenr - 1] =~ s/(.*)\bextern\b\s*(.*)/$1$2/;
3982 }
3983 }
3984
3813# check for new externs in .c files. 3985# check for new externs in .c files.
3814 if ($realfile =~ /\.c$/ && defined $stat && 3986 if ($realfile =~ /\.c$/ && defined $stat &&
3815 $stat =~ /^.\s*(?:extern\s+)?$Type\s+($Ident)(\s*)\(/s) 3987 $stat =~ /^.\s*(?:extern\s+)?$Type\s+($Ident)(\s*)\(/s)
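The new AVOID_EXTERNS warning for headers, added above, rests on the fact that function declarations at file scope have external linkage by default, so an explicit extern on a header prototype changes nothing. A hypothetical example (widget_probe is an invented name):

/* what would go in a header */
int widget_probe(int id);                  /* preferred: no extern needed     */
/* extern int widget_probe(int id); */     /* would now trigger AVOID_EXTERNS */

int widget_probe(int id)                   /* definition, so this sketch links */
{
	return id;
}

int main(void)
{
	return widget_probe(0);
}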
@@ -3879,8 +4051,11 @@ sub process {
3879 4051
3880# check for multiple semicolons 4052# check for multiple semicolons
3881 if ($line =~ /;\s*;\s*$/) { 4053 if ($line =~ /;\s*;\s*$/) {
3882 WARN("ONE_SEMICOLON", 4054 if (WARN("ONE_SEMICOLON",
3883 "Statements terminations use 1 semicolon\n" . $herecurr); 4055 "Statements terminations use 1 semicolon\n" . $herecurr) &&
4056 $fix) {
4057 $fixed[$linenr - 1] =~ s/(\s*;\s*){2,}$/;/g;
4058 }
3884 } 4059 }
3885 4060
3886# check for switch/default statements without a break; 4061# check for switch/default statements without a break;
@@ -3898,9 +4073,12 @@ sub process {
3898 } 4073 }
3899 4074
3900# check for gcc specific __FUNCTION__ 4075# check for gcc specific __FUNCTION__
3901 if ($line =~ /__FUNCTION__/) { 4076 if ($line =~ /\b__FUNCTION__\b/) {
3902 WARN("USE_FUNC", 4077 if (WARN("USE_FUNC",
3903 "__func__ should be used instead of gcc specific __FUNCTION__\n" . $herecurr); 4078 "__func__ should be used instead of gcc specific __FUNCTION__\n" . $herecurr) &&
4079 $fix) {
4080 $fixed[$linenr - 1] =~ s/\b__FUNCTION__\b/__func__/g;
4081 }
3904 } 4082 }
3905 4083
3906# check for use of yield() 4084# check for use of yield()
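The USE_FUNC hunk above now matches on word boundaries and lets --fix substitute __func__ for the gcc-specific __FUNCTION__. Both expand to the enclosing function's name, but only __func__ is standard C99; for example:

#include <stdio.h>

static void report(void)
{
	printf("in %s\n", __func__);            /* standard C99                   */
	/* printf("in %s\n", __FUNCTION__); */  /* gcc-specific, what --fix drops */
}

int main(void)
{
	report();
	return 0;
}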
@@ -4105,13 +4283,8 @@ sub process {
4105 } 4283 }
4106 } 4284 }
4107 4285
4108 if ($quiet == 0 && keys %ignore_type) { 4286 hash_show_words(\%use_type, "Used");
4109 print "NOTE: Ignored message types:"; 4287 hash_show_words(\%ignore_type, "Ignored");
4110 foreach my $ignore (sort keys %ignore_type) {
4111 print " $ignore";
4112 }
4113 print "\n\n";
4114 }
4115 4288
4116 if ($clean == 0 && $fix && "@rawlines" ne "@fixed") { 4289 if ($clean == 0 && $fix && "@rawlines" ne "@fixed") {
4117 my $newfile = $filename . ".EXPERIMENTAL-checkpatch-fixes"; 4290 my $newfile = $filename . ".EXPERIMENTAL-checkpatch-fixes";