author    Paul Mundt <lethal@linux-sh.org>  2011-01-14 02:06:31 -0500
committer Paul Mundt <lethal@linux-sh.org>  2011-01-14 02:06:31 -0500
commit    c488a4731abb53aa1bab9fccd8a7472083159bfd (patch)
tree      db6d4a664a1e4b7685c1d2d79da63263f40adf7b
parent    6d2ae89c36e2adab5cfa69fecb11290082817ac6 (diff)
parent    bba958783b1b4cb0a9420f4e11082467132a334c (diff)
Merge branch 'common/mmcif' into rmobile-latest
-rw-r--r--  Documentation/cgroups/memory.txt | 74
-rw-r--r--  Documentation/device-mapper/dm-crypt.txt | 7
-rw-r--r--  Documentation/device-mapper/dm-raid.txt | 70
-rw-r--r--  Documentation/filesystems/proc.txt | 7
-rw-r--r--  Documentation/gpio.txt | 2
-rw-r--r--  Documentation/vm/transhuge.txt | 298
-rw-r--r--  MAINTAINERS | 3
-rw-r--r--  arch/alpha/include/asm/mman.h | 3
-rw-r--r--  arch/arm/kernel/module.c | 14
-rw-r--r--  arch/arm/mm/pgd.c | 2
-rw-r--r--  arch/avr32/boards/atngw100/setup.c | 2
-rw-r--r--  arch/avr32/boards/atstk1000/atstk1002.c | 2
-rw-r--r--  arch/avr32/boards/favr-32/setup.c | 2
-rw-r--r--  arch/avr32/boards/hammerhead/setup.c | 2
-rw-r--r--  arch/avr32/boards/merisc/setup.c | 2
-rw-r--r--  arch/avr32/boards/mimc200/setup.c | 2
-rw-r--r--  arch/avr32/configs/atngw100_defconfig | 23
-rw-r--r--  arch/avr32/configs/atngw100_evklcd100_defconfig | 17
-rw-r--r--  arch/avr32/configs/atngw100_evklcd101_defconfig | 17
-rw-r--r--  arch/avr32/configs/atngw100mkii_defconfig | 22
-rw-r--r--  arch/avr32/configs/atngw100mkii_evklcd100_defconfig | 17
-rw-r--r--  arch/avr32/configs/atngw100mkii_evklcd101_defconfig | 17
-rw-r--r--  arch/avr32/configs/atstk1002_defconfig | 25
-rw-r--r--  arch/avr32/configs/atstk1003_defconfig | 41
-rw-r--r--  arch/avr32/configs/atstk1004_defconfig | 109
-rw-r--r--  arch/avr32/configs/atstk1006_defconfig | 23
-rw-r--r--  arch/avr32/configs/favr-32_defconfig | 2
-rw-r--r--  arch/avr32/configs/hammerhead_defconfig | 2
-rw-r--r--  arch/avr32/include/asm/syscalls.h | 14
-rw-r--r--  arch/avr32/kernel/process.c | 9
-rw-r--r--  arch/avr32/kernel/time.c | 5
-rw-r--r--  arch/ia64/kernel/perfmon.c | 2
-rw-r--r--  arch/ia64/mm/hugetlbpage.c | 2
-rw-r--r--  arch/mips/include/asm/mman.h | 3
-rw-r--r--  arch/mips/kernel/module.c | 14
-rw-r--r--  arch/parisc/include/asm/mman.h | 3
-rw-r--r--  arch/powerpc/mm/gup.c | 12
-rw-r--r--  arch/sh/mm/hugetlbpage.c | 2
-rw-r--r--  arch/sparc/kernel/module.c | 14
-rw-r--r--  arch/sparc/mm/generic_32.c | 2
-rw-r--r--  arch/sparc/mm/generic_64.c | 2
-rw-r--r--  arch/sparc/mm/hugetlbpage.c | 2
-rw-r--r--  arch/um/kernel/skas/mmu.c | 2
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 1
-rw-r--r--  arch/x86/include/asm/paravirt.h | 25
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 6
-rw-r--r--  arch/x86/include/asm/pgtable-2level.h | 9
-rw-r--r--  arch/x86/include/asm/pgtable-3level.h | 23
-rw-r--r--  arch/x86/include/asm/pgtable.h | 143
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 28
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 3
-rw-r--r--  arch/x86/include/asm/xen/page.h | 16
-rw-r--r--  arch/x86/kernel/module.c | 17
-rw-r--r--  arch/x86/kernel/paravirt.c | 3
-rw-r--r--  arch/x86/kernel/tboot.c | 2
-rw-r--r--  arch/x86/kernel/vm86_32.c | 1
-rw-r--r--  arch/x86/kvm/mmu.c | 125
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 9
-rw-r--r--  arch/x86/mm/gup.c | 28
-rw-r--r--  arch/x86/mm/pgtable.c | 66
-rw-r--r--  arch/x86/xen/Makefile | 3
-rw-r--r--  arch/x86/xen/mmu.c | 365
-rw-r--r--  arch/x86/xen/p2m.c | 510
-rw-r--r--  arch/xtensa/include/asm/mman.h | 3
-rw-r--r--  drivers/base/node.c | 21
-rw-r--r--  drivers/md/Kconfig | 24
-rw-r--r--  drivers/md/Makefile | 1
-rw-r--r--  drivers/md/bitmap.c | 12
-rw-r--r--  drivers/md/dm-crypt.c | 618
-rw-r--r--  drivers/md/dm-delay.c | 2
-rw-r--r--  drivers/md/dm-ioctl.c | 111
-rw-r--r--  drivers/md/dm-kcopyd.c | 57
-rw-r--r--  drivers/md/dm-log-userspace-base.c | 139
-rw-r--r--  drivers/md/dm-log-userspace-transfer.c | 1
-rw-r--r--  drivers/md/dm-log.c | 2
-rw-r--r--  drivers/md/dm-mpath.c | 67
-rw-r--r--  drivers/md/dm-raid.c | 697
-rw-r--r--  drivers/md/dm-raid1.c | 19
-rw-r--r--  drivers/md/dm-snap-persistent.c | 4
-rw-r--r--  drivers/md/dm-snap.c | 62
-rw-r--r--  drivers/md/dm-stripe.c | 27
-rw-r--r--  drivers/md/dm-table.c | 19
-rw-r--r--  drivers/md/dm.c | 23
-rw-r--r--  drivers/md/md.c | 197
-rw-r--r--  drivers/md/md.h | 13
-rw-r--r--  drivers/md/raid1.c | 33
-rw-r--r--  drivers/md/raid10.c | 17
-rw-r--r--  drivers/md/raid5.c | 16
-rw-r--r--  drivers/serial/atmel_serial.c | 5
-rw-r--r--  drivers/xen/Kconfig | 11
-rw-r--r--  drivers/xen/Makefile | 5
-rw-r--r--  drivers/xen/gntdev.c | 665
-rw-r--r--  drivers/xen/grant-table.c | 46
-rw-r--r--  drivers/xen/platform-pci.c | 21
-rw-r--r--  fs/ecryptfs/main.c | 5
-rw-r--r--  fs/fs-writeback.c | 105
-rw-r--r--  fs/mpage.c | 49
-rw-r--r--  fs/nfs/dir.c | 6
-rw-r--r--  fs/proc/base.c | 4
-rw-r--r--  fs/proc/meminfo.c | 14
-rw-r--r--  fs/proc/page.c | 14
-rw-r--r--  fs/proc/task_mmu.c | 7
-rw-r--r--  include/asm-generic/gpio.h | 10
-rw-r--r--  include/asm-generic/mman-common.h | 3
-rw-r--r--  include/asm-generic/pgtable.h | 225
-rw-r--r--  include/linux/compaction.h | 25
-rw-r--r--  include/linux/device-mapper.h | 12
-rw-r--r--  include/linux/dm-ioctl.h | 14
-rw-r--r--  include/linux/dm-log-userspace.h | 13
-rw-r--r--  include/linux/gfp.h | 15
-rw-r--r--  include/linux/gpio.h | 6
-rw-r--r--  include/linux/huge_mm.h | 179
-rw-r--r--  include/linux/irqdesc.h | 2
-rw-r--r--  include/linux/kernel.h | 7
-rw-r--r--  include/linux/kernel_stat.h | 19
-rw-r--r--  include/linux/khugepaged.h | 67
-rw-r--r--  include/linux/memcontrol.h | 36
-rw-r--r--  include/linux/memory_hotplug.h | 14
-rw-r--r--  include/linux/migrate.h | 12
-rw-r--r--  include/linux/mm.h | 140
-rw-r--r--  include/linux/mm_inline.h | 19
-rw-r--r--  include/linux/mm_types.h | 3
-rw-r--r--  include/linux/mmc/sh_mmcif.h | 4
-rw-r--r--  include/linux/mmu_notifier.h | 66
-rw-r--r--  include/linux/mmzone.h | 14
-rw-r--r--  include/linux/page-flags.h | 71
-rw-r--r--  include/linux/page_cgroup.h | 54
-rw-r--r--  include/linux/pagemap.h | 2
-rw-r--r--  include/linux/radix-tree.h | 16
-rw-r--r--  include/linux/rmap.h | 2
-rw-r--r--  include/linux/sched.h | 6
-rw-r--r--  include/linux/swap.h | 2
-rw-r--r--  include/linux/vmalloc.h | 10
-rw-r--r--  include/linux/vmstat.h | 7
-rw-r--r--  include/trace/events/compaction.h | 74
-rw-r--r--  include/trace/events/vmscan.h | 6
-rw-r--r--  include/trace/events/writeback.h | 1
-rw-r--r--  include/xen/gntdev.h | 119
-rw-r--r--  include/xen/grant_table.h | 44
-rw-r--r--  kernel/fork.c | 41
-rw-r--r--  kernel/futex.c | 55
-rw-r--r--  kernel/irq/irqdesc.c | 40
-rw-r--r--  mm/Kconfig | 38
-rw-r--r--  mm/Makefile | 3
-rw-r--r--  mm/compaction.c | 174
-rw-r--r--  mm/dmapool.c | 16
-rw-r--r--  mm/filemap.c | 17
-rw-r--r--  mm/huge_memory.c | 2346
-rw-r--r--  mm/hugetlb.c | 111
-rw-r--r--  mm/internal.h | 16
-rw-r--r--  mm/ksm.c | 81
-rw-r--r--  mm/madvise.c | 10
-rw-r--r--  mm/memcontrol.c | 258
-rw-r--r--  mm/memory-failure.c | 22
-rw-r--r--  mm/memory.c | 336
-rw-r--r--  mm/memory_hotplug.c | 17
-rw-r--r--  mm/mempolicy.c | 23
-rw-r--r--  mm/migrate.c | 123
-rw-r--r--  mm/mincore.c | 7
-rw-r--r--  mm/mlock.c | 163
-rw-r--r--  mm/mmap.c | 17
-rw-r--r--  mm/mmu_notifier.c | 20
-rw-r--r--  mm/mmzone.c | 21
-rw-r--r--  mm/mprotect.c | 20
-rw-r--r--  mm/mremap.c | 9
-rw-r--r--  mm/nommu.c | 6
-rw-r--r--  mm/page-writeback.c | 7
-rw-r--r--  mm/page_alloc.c | 163
-rw-r--r--  mm/pagewalk.c | 1
-rw-r--r--  mm/percpu-vm.c | 2
-rw-r--r--  mm/pgtable-generic.c | 123
-rw-r--r--  mm/rmap.c | 91
-rw-r--r--  mm/slub.c | 11
-rw-r--r--  mm/sparse.c | 4
-rw-r--r--  mm/swap.c | 322
-rw-r--r--  mm/swap_state.c | 6
-rw-r--r--  mm/swapfile.c | 2
-rw-r--r--  mm/vmalloc.c | 89
-rw-r--r--  mm/vmscan.c | 432
-rw-r--r--  mm/vmstat.c | 51
-rw-r--r--  virt/kvm/kvm_main.c | 39
181 files changed, 9940 insertions(+), 2132 deletions(-)
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 7781857dc940..bac328c232f5 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -385,6 +385,10 @@ mapped_file - # of bytes of mapped file (includes tmpfs/shmem)
 pgpgin - # of pages paged in (equivalent to # of charging events).
 pgpgout - # of pages paged out (equivalent to # of uncharging events).
 swap - # of bytes of swap usage
+dirty - # of bytes that are waiting to get written back to the disk.
+writeback - # of bytes that are actively being written back to the disk.
+nfs_unstable - # of bytes sent to the NFS server, but not yet committed to
+	the actual storage.
 inactive_anon - # of bytes of anonymous memory and swap cache memory on
 	LRU list.
 active_anon - # of bytes of anonymous and swap cache memory on active
@@ -406,6 +410,9 @@ total_mapped_file - sum of all children's "cache"
 total_pgpgin - sum of all children's "pgpgin"
 total_pgpgout - sum of all children's "pgpgout"
 total_swap - sum of all children's "swap"
+total_dirty - sum of all children's "dirty"
+total_writeback - sum of all children's "writeback"
+total_nfs_unstable - sum of all children's "nfs_unstable"
 total_inactive_anon - sum of all children's "inactive_anon"
 total_active_anon - sum of all children's "active_anon"
 total_inactive_file - sum of all children's "inactive_file"
@@ -453,6 +460,73 @@ memory under it will be reclaimed.
 You can reset failcnt by writing 0 to failcnt file.
 # echo 0 > .../memory.failcnt
 
+5.5 dirty memory
+
+Control the maximum amount of dirty pages a cgroup can have at any given time.
+
+Limiting dirty memory is like fixing the max amount of dirty (hard to reclaim)
+page cache used by a cgroup. So, in case of multiple cgroup writers, they will
+not be able to consume more than their designated share of dirty pages and will
+be forced to perform write-out if they cross that limit.
+
+The interface is equivalent to the procfs interface: /proc/sys/vm/dirty_*. It
+is possible to configure a limit to trigger either a direct writeback or a
+background writeback performed by per-bdi flusher threads. The root cgroup
+memory.dirty_* control files are read-only and match the contents of
+the /proc/sys/vm/dirty_* files.
+
+Per-cgroup dirty limits can be set using the following files in the cgroupfs:
+
+- memory.dirty_ratio: the amount of dirty memory (expressed as a percentage of
+  cgroup memory) at which a process generating dirty pages will itself start
+  writing out dirty data.
+
+- memory.dirty_limit_in_bytes: the amount of dirty memory (expressed in bytes)
+  in the cgroup at which a process generating dirty pages will itself start
+  writing out dirty data. A suffix (k, K, m, M, g, or G) can be used to
+  indicate that the value is in kilobytes, megabytes or gigabytes.
+
+  Note: memory.dirty_limit_in_bytes is the counterpart of memory.dirty_ratio.
+  Only one of them may be specified at a time. When one is written it is
+  immediately taken into account to evaluate the dirty memory limits and the
+  other appears as 0 when read.
+
+- memory.dirty_background_ratio: the amount of dirty memory of the cgroup
+  (expressed as a percentage of cgroup memory) at which background writeback
+  kernel threads will start writing out dirty data.
+
+- memory.dirty_background_limit_in_bytes: the amount of dirty memory (expressed
+  in bytes) in the cgroup at which background writeback kernel threads will
+  start writing out dirty data. A suffix (k, K, m, M, g, or G) can be used to
+  indicate that the value is in kilobytes, megabytes or gigabytes.
+
+  Note: memory.dirty_background_limit_in_bytes is the counterpart of
+  memory.dirty_background_ratio. Only one of them may be specified at a time.
+  When one is written it is immediately taken into account to evaluate the
+  dirty memory limits and the other appears as 0 when read.
+
+A cgroup may contain more dirty memory than its dirty limit. This is possible
+because of the principle that the first cgroup to touch a page is charged for
+it. Subsequent page counting events (dirty, writeback, nfs_unstable) are also
+counted to the originally charged cgroup.
+
+Example: if a page is allocated by a cgroup A task, then the page is charged
+to cgroup A. If the page is later dirtied by a task in cgroup B, then the
+cgroup A dirty count will be incremented. If cgroup A is over its dirty limit
+but cgroup B is not, then dirtying a cgroup A page from a cgroup B task may
+push cgroup A over its dirty limit without throttling the dirtying cgroup B
+task.
+
+When use_hierarchy=0, each cgroup has its own dirty memory usage and limits.
+System-wide dirty limits are also consulted. Dirty memory consumption is
+checked against both system-wide and per-cgroup dirty limits.
+
+The current implementation does not enforce per-cgroup dirty limits when
+use_hierarchy=1. System-wide dirty limits are used for processes in such
+cgroups. Attempts to read memory.dirty_* files return the system-wide
+values. Writes to the memory.dirty_* files return an error. An enhanced
+implementation is needed to check the chain of parents to ensure that no
+dirty limit is exceeded.
+
 6. Hierarchy support
 
 The memory controller supports a deep hierarchy and hierarchical accounting.
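[Editor's note: the memory.dirty_* knobs documented in the hunk above are ordinary cgroupfs files, so they can be driven from any language. Below is a minimal C sketch; the cgroup path /cgroup/web and the chosen limits are hypothetical placeholders, not part of the patch.]

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Write a string value into one memory.dirty_* control file. */
static int write_knob(const char *path, const char *value)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0) {
		perror(path);
		return -1;
	}
	n = write(fd, value, strlen(value));
	if (n < 0)
		perror(path);
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	const char *grp = "/cgroup/web";	/* hypothetical cgroup mount point */
	char path[256];

	/* Start background writeback once 5% of the group's memory is dirty. */
	snprintf(path, sizeof(path), "%s/memory.dirty_background_ratio", grp);
	write_knob(path, "5");

	/* Throttle dirtiers in this group at an absolute 64 MB of dirty memory;
	 * per the text above, the ratio counterpart will then read back as 0. */
	snprintf(path, sizeof(path), "%s/memory.dirty_limit_in_bytes", grp);
	write_knob(path, "64M");

	return 0;
}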
diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt
index 524de926290d..59293ac4a5d0 100644
--- a/Documentation/device-mapper/dm-crypt.txt
+++ b/Documentation/device-mapper/dm-crypt.txt
@@ -8,7 +8,7 @@ Parameters: <cipher> <key> <iv_offset> <device path> <offset>
 
 <cipher>
     Encryption cipher and an optional IV generation mode.
-    (In format cipher-chainmode-ivopts:ivmode).
+    (In format cipher[:keycount]-chainmode-ivopts:ivmode).
     Examples:
        des
        aes-cbc-essiv:sha256
@@ -20,6 +20,11 @@ Parameters: <cipher> <key> <iv_offset> <device path> <offset>
     Key used for encryption. It is encoded as a hexadecimal number.
     You can only use key sizes that are valid for the selected cipher.
 
+<keycount>
+    Multi-key compatibility mode. You can define <keycount> keys and
+    then sectors are encrypted according to their offsets (sector 0 uses key0;
+    sector 1 uses key1 etc.). <keycount> must be a power of two.
+
 <iv_offset>
     The IV offset is a sector count that is added to the sector number
     before creating the IV.
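[Editor's note: a short aside on how the multi-key mode described above maps sectors to keys. Because <keycount> must be a power of two, the key index is simply the low bits of the sector number; the helper below is an illustrative sketch of that arithmetic only (the function name and types are mine, not dm-crypt's).]

#include <stdint.h>

/*
 * Pick the key slot for a given sector in dm-crypt's multi-key
 * compatibility mode: sector 0 uses key0, sector 1 uses key1, and so on,
 * wrapping around after keycount keys. keycount must be a power of two.
 */
static inline unsigned int key_index(uint64_t sector, unsigned int keycount)
{
	return (unsigned int)(sector & (keycount - 1));
}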
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
new file mode 100644
index 000000000000..33b6b7071ac8
--- /dev/null
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -0,0 +1,70 @@
+Device-mapper RAID (dm-raid) is a bridge from DM to MD.  It
+provides a way to use device-mapper interfaces to access the MD RAID
+drivers.
+
+As with all device-mapper targets, the nominal public interfaces are the
+constructor (CTR) tables and the status outputs (both STATUSTYPE_INFO
+and STATUSTYPE_TABLE).  The CTR table looks like the following:
+
+1: <s> <l> raid \
+2:	<raid_type> <#raid_params> <raid_params> \
+3:	<#raid_devs> <meta_dev1> <dev1> .. <meta_devN> <devN>
+
+Line 1 contains the standard first three arguments to any device-mapper
+target - the start, length, and target type fields.  The target type in
+this case is "raid".
+
+Line 2 contains the arguments that define the particular raid
+type/personality/level, the required arguments for that raid type, and
+any optional arguments.  Possible raid types include: raid4, raid5_la,
+raid5_ls, raid5_rs, raid6_zr, raid6_nr, and raid6_nc.  (raid1 is
+planned for the future.)  The list of required and optional parameters
+is the same for all the current raid types.  The required parameters are
+positional, while the optional parameters are given as key/value pairs.
+The possible parameters are as follows:
+ <chunk_size>				Chunk size in sectors.
+ [[no]sync]				Force/Prevent RAID initialization
+ [rebuild <idx>]			Rebuild the drive indicated by the index
+ [daemon_sleep <ms>]			Time between bitmap daemon work to clear bits
+ [min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
+ [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
+ [max_write_behind <sectors>]		See '-write-behind=' (man mdadm)
+ [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
+
+Line 3 contains the list of devices that compose the array in
+metadata/data device pairs.  If the metadata is stored separately, a '-'
+is given for the metadata device position.  If a drive has failed or is
+missing at creation time, a '-' can be given for both the metadata and
+data drives for a given position.
+
+NB. Currently all metadata devices must be specified as '-'.
+
+Examples:
+# RAID4 - 4 data drives, 1 parity
+# No metadata devices specified to hold superblock/bitmap info
+# Chunk size of 1MiB
+# (Lines separated for easy reading)
+0 1960893648 raid \
+	raid4 1 2048 \
+	5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
+
+# RAID4 - 4 data drives, 1 parity (no metadata devices)
+# Chunk size of 1MiB, force RAID initialization,
+#	min recovery rate at 20 kiB/sec/disk
+0 1960893648 raid \
+	raid4 4 2048 min_recovery_rate 20 sync\
+	5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
+
+Performing a 'dmsetup table' should display the CTR table used to
+construct the mapping (with possible reordering of optional
+parameters).
+
+Performing a 'dmsetup status' will yield information on the state and
+health of the array.  The output is as follows:
+1: <s> <l> raid \
+2:	<raid_type> <#devices> <1 health char for each dev> <resync_ratio>
+
+Line 1 is standard DM output.  Line 2 is best shown by example:
+	0 1960893648 raid raid4 5 AAAAA 2/490221568
+Here we can see the RAID type is raid4, there are 5 devices - all of
+which are 'A'live, and the array is 2/490221568 complete with recovery.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 9471225212c4..23cae6548d3a 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -375,6 +375,7 @@ Anonymous: 0 kB
 Swap: 0 kB
 KernelPageSize: 4 kB
 MMUPageSize: 4 kB
+Locked: 374 kB
 
 The first of these lines shows the same information as is displayed for the
 mapping in /proc/PID/maps. The remaining lines show the size of the mapping
@@ -670,6 +671,8 @@ varies by architecture and compile options. The following is from a
 
 > cat /proc/meminfo
 
+The "Locked" field indicates whether the mapping is locked in memory or not.
+
 
 MemTotal: 16344972 kB
 MemFree: 13634064 kB
@@ -1320,6 +1323,10 @@ scaled linearly with /proc/<pid>/oom_score_adj.
 Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the
 other with its scaled value.
 
+The value of /proc/<pid>/oom_score_adj may be reduced no lower than the last
+value set by a CAP_SYS_RESOURCE process. To reduce the value any lower
+requires CAP_SYS_RESOURCE.
+
 NOTICE: /proc/<pid>/oom_adj is deprecated and will be removed, please see
 Documentation/feature-removal-schedule.txt.
 
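[Editor's note: /proc/<pid>/oom_score_adj is an ordinary procfs file; a task can raise its own value freely, but per the note above it cannot lower it below the last value set by a CAP_SYS_RESOURCE process unless it holds that capability itself. A minimal sketch follows; the value 500 is an arbitrary example.]

#include <stdio.h>

int main(void)
{
	/* Make the current task a more attractive OOM-kill victim. */
	FILE *f = fopen("/proc/self/oom_score_adj", "w");

	if (!f) {
		perror("oom_score_adj");
		return 1;
	}
	fprintf(f, "%d\n", 500);
	fclose(f);
	return 0;
}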
diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt
index a492d92bb098..792faa3c06cf 100644
--- a/Documentation/gpio.txt
+++ b/Documentation/gpio.txt
@@ -135,7 +135,7 @@ setting up a platform_device using the GPIO, is mark its direction:
 	int gpio_direction_input(unsigned gpio);
 	int gpio_direction_output(unsigned gpio, int value);
 
-The return value is zero for success, else a negative errno. It must
+The return value is zero for success, else a negative errno. It should
 be checked, since the get/set calls don't have error returns and since
 misconfiguration is possible. You should normally issue these calls from
 a task context. However, for spinlock-safe GPIOs it's OK to use them
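[Editor's note: to illustrate the "should be checked" guidance above, a typical consumer of the legacy integer-GPIO API checks the direction call even though the later get/set calls cannot report errors. This is a sketch only; the GPIO number and label are placeholders.]

#include <linux/gpio.h>

static int example_led_init(unsigned int gpio)
{
	int err;

	err = gpio_request(gpio, "example-led");	/* placeholder label */
	if (err)
		return err;

	/* Direction setup can fail; gpio_set_value() below cannot. */
	err = gpio_direction_output(gpio, 0);
	if (err) {
		gpio_free(gpio);
		return err;
	}

	gpio_set_value(gpio, 1);
	return 0;
}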
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
new file mode 100644
index 000000000000..0924aaca3302
--- /dev/null
+++ b/Documentation/vm/transhuge.txt
@@ -0,0 +1,298 @@
+= Transparent Hugepage Support =
+
+== Objective ==
+
+Performance critical computing applications dealing with large memory
+working sets are already running on top of libhugetlbfs and in turn
+hugetlbfs. Transparent Hugepage Support is an alternative way of
+backing virtual memory with huge pages, one that supports the
+automatic promotion and demotion of page sizes and avoids the
+shortcomings of hugetlbfs.
+
+Currently it only works for anonymous memory mappings, but in the
+future it can expand over the pagecache layer, starting with tmpfs.
+
+The reason applications run faster comes down to two factors. The
+first factor is almost completely irrelevant and not of significant
+interest, because it also has the downside of requiring larger
+clear-page and copy-page operations in page faults, which is a
+potentially negative effect. The first factor consists of taking a
+single page fault for each 2M virtual region touched by userland
+(thus reducing the enter/exit kernel frequency by a factor of 512).
+This only matters the first time the memory is accessed for the
+lifetime of a memory mapping. The second, long lasting and much more
+important, factor affects all subsequent accesses to the memory for
+the whole runtime of the application. The second factor consists of
+two components: 1) the TLB miss will run faster (especially with
+virtualization using nested pagetables, but almost always also on
+bare metal without virtualization) and 2) a single TLB entry will be
+mapping a much larger amount of virtual memory, in turn reducing the
+number of TLB misses. With virtualization and nested pagetables, the
+larger TLB entries can be used only if both KVM and the Linux guest
+are using hugepages, but a significant speedup already happens if
+only one of the two is using hugepages, just because the TLB miss is
+going to run faster.
+
+== Design ==
+
+- "graceful fallback": mm components which don't have transparent
+  hugepage knowledge fall back to breaking a transparent hugepage and
+  working on the regular pages and their respective regular pmd/pte
+  mappings
+
+- if a hugepage allocation fails because of memory fragmentation,
+  regular pages should be gracefully allocated instead and mixed in
+  the same vma without any failure or significant delay and without
+  userland noticing
+
+- if some task quits and more hugepages become available (either
+  immediately in the buddy or through the VM), guest physical memory
+  backed by regular pages should be relocated on hugepages
+  automatically (with khugepaged)
+
+- it doesn't require memory reservation and in turn it uses hugepages
+  whenever possible (the only possible reservation here is kernelcore=
+  to avoid unmovable pages fragmenting all the memory, but such a
+  tweak is not specific to transparent hugepage support and it's a
+  generic feature that applies to all dynamic high order allocations
+  in the kernel)
+
+- this initial support only offers the feature in the anonymous memory
+  regions but it'd be ideal to move it to tmpfs and the pagecache
+  later
+
+Compared to the reservation approach of hugetlbfs, Transparent
+Hugepage Support maximizes the usefulness of free memory by allowing
+all unused memory to be used as cache or other movable (or even
+unmovable) entities. It doesn't require reservation to prevent
+hugepage allocation failures from being noticeable from userland. It
+allows paging and all other advanced VM features to be available on
+the hugepages. It requires no modifications for applications to take
+advantage of it.
+
+Applications however can be further optimized to take advantage of
+this feature, as for example they've been optimized before to avoid
+a flood of mmap system calls for every malloc(4k). Optimizing
+userland is by far not mandatory and khugepaged already can take
+care of long lived page allocations even for hugepage unaware
+applications that deal with large amounts of memory.
+
+In certain cases when hugepages are enabled system wide, applications
+may end up allocating more memory resources. An application may mmap
+a large region but only touch 1 byte of it; in that case a 2M page
+might be allocated instead of a 4k page for no good reason. This is
+why it's possible to disable hugepages system-wide and to only have
+them inside MADV_HUGEPAGE madvise regions.
+
+Embedded systems should enable hugepages only inside madvise regions
+to eliminate any risk of wasting any precious byte of memory and to
+only run faster.
+
+Applications that get a lot of benefit from hugepages and that don't
+risk losing memory by using hugepages should use
+madvise(MADV_HUGEPAGE) on their critical mmapped regions.
+
+== sysfs ==
+
+Transparent Hugepage Support can be entirely disabled (mostly for
+debugging purposes), or only enabled inside MADV_HUGEPAGE regions (to
+avoid the risk of consuming more memory resources), or enabled system
+wide. This can be achieved with one of:
+
+echo always >/sys/kernel/mm/transparent_hugepage/enabled
+echo madvise >/sys/kernel/mm/transparent_hugepage/enabled
+echo never >/sys/kernel/mm/transparent_hugepage/enabled
+
+It's also possible to limit the VM's defrag efforts to generate
+hugepages to madvise regions only (in case hugepages are not
+immediately free), or to never try to defrag memory and simply fall
+back to regular pages unless hugepages are immediately available.
+Clearly if we spend CPU time to defrag memory, we would expect to
+gain even more by the fact we use hugepages later instead of regular
+pages. This isn't always guaranteed, but it may be more likely in
+case the allocation is for a MADV_HUGEPAGE region.
+
+echo always >/sys/kernel/mm/transparent_hugepage/defrag
+echo madvise >/sys/kernel/mm/transparent_hugepage/defrag
+echo never >/sys/kernel/mm/transparent_hugepage/defrag
+
+khugepaged will be automatically started when
+transparent_hugepage/enabled is set to "always" or "madvise", and it
+will be automatically shut down if it's set to "never".
+
+khugepaged usually runs at low frequency, so while one may not want
+to invoke defrag algorithms synchronously during the page faults, it
+should be worth invoking defrag at least in khugepaged. However it's
+also possible to disable defrag in khugepaged:
+
+echo yes >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
+echo no >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
+
+You can also control how many pages khugepaged should scan at each
+pass:
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan
+
+and how many milliseconds to wait in khugepaged between each pass (you
+can set this to 0 to run khugepaged at 100% utilization of one core):
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs
+
+and how many milliseconds to wait in khugepaged if there's a hugepage
+allocation failure, to throttle the next allocation attempt:
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs
+
+The khugepaged progress can be seen in the number of pages collapsed:
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
+
+and in the number of full scans performed:
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans
+
+== Boot parameter ==
+
+You can change the sysfs boot time defaults of Transparent Hugepage
+Support by passing the parameter "transparent_hugepage=always" or
+"transparent_hugepage=madvise" or "transparent_hugepage=never"
+(without "") to the kernel command line.
+
+== Need of application restart ==
+
+The transparent_hugepage/enabled values only affect future
+behavior. So to make them effective you need to restart any
+application that could have been using hugepages. This also applies
+to the regions registered in khugepaged.
+
+== get_user_pages and follow_page ==
+
+get_user_pages and follow_page, if run on a hugepage, will return the
+head or tail pages as usual (exactly as they would do on
+hugetlbfs). Most gup users will only care about the actual physical
+address of the page and its temporary pinning to release after the
+I/O is complete, so they won't ever notice the fact the page is
+huge. But if any driver is going to mangle over the page structure of
+the tail page (like for checking page->mapping or other bits that are
+relevant for the head page and not the tail page), it should be
+updated to check the head page instead (while serializing properly
+against split_huge_page() to avoid the head and tail pages
+disappearing from under it; see the futex code for an example of
+that, hugetlbfs also needed special handling in the futex code for
+similar reasons).
+
+NOTE: these aren't new constraints to the GUP API, and they match the
+same constraints that apply to hugetlbfs too, so any driver capable
+of handling GUP on hugetlbfs will also work fine on transparent
+hugepage backed mappings.
+
+In case you can't handle compound pages if they're returned by
+follow_page, the FOLL_SPLIT bit can be specified as a parameter to
+follow_page, so that it will split the hugepages before returning
+them. Migration for example passes FOLL_SPLIT as a parameter to
+follow_page because it's not hugepage aware and in fact it can't work
+at all on hugetlbfs (but it instead works fine on transparent
+hugepages thanks to FOLL_SPLIT). Migration simply can't deal with
+hugepages being returned (as it's not only checking the pfn of the
+page and pinning it during the copy, but it pretends to migrate the
+memory in regular page sizes and with regular pte/pmd mappings).
+
+== Optimizing the applications ==
+
+To be guaranteed that the kernel will map a 2M page immediately in
+any memory region, the mmap region has to be hugepage naturally
+aligned. posix_memalign() can provide that guarantee.
+
+== Hugetlbfs ==
+
+You can use hugetlbfs on a kernel that has transparent hugepage
+support enabled just fine as always. No difference can be noted in
+hugetlbfs other than there will be less overall fragmentation. All
+usual features belonging to hugetlbfs are preserved and
+unaffected. libhugetlbfs will also work fine as usual.
+
+== Graceful fallback ==
+
+Code walking pagetables but unaware about huge pmds can simply call
+split_huge_page_pmd(mm, pmd) where the pmd is the one returned by
+pmd_offset. It's trivial to make the code transparent hugepage aware
+by just grepping for "pmd_offset" and adding split_huge_page_pmd
+where missing after pmd_offset returns the pmd. Thanks to the
+graceful fallback design, with a one liner change, you can avoid
+writing hundreds if not thousands of lines of complex code to make
+your code hugepage aware.
+
+If you're not walking pagetables but you run into a physical hugepage
+that you can't handle natively in your code, you can split it by
+calling split_huge_page(page). This is what the Linux VM does before
+it tries to swap out the hugepage, for example.
+
+Example of making mremap.c transparent hugepage aware with a one
+liner change:
+
+diff --git a/mm/mremap.c b/mm/mremap.c
+--- a/mm/mremap.c
++++ b/mm/mremap.c
+@@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru
+		return NULL;
+
+	pmd = pmd_offset(pud, addr);
++	split_huge_page_pmd(mm, pmd);
+	if (pmd_none_or_clear_bad(pmd))
+		return NULL;
+
+== Locking in hugepage aware code ==
+
+We want as much code as possible hugepage aware, as calling
+split_huge_page() or split_huge_page_pmd() has a cost.
+
+To make pagetable walks huge pmd aware, all you need to do is to call
+pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
+mmap_sem in read (or write) mode to be sure a huge pmd cannot be
+created from under you by khugepaged (khugepaged collapse_huge_page
+takes the mmap_sem in write mode in addition to the anon_vma lock).
+If pmd_trans_huge returns false, you just fall back to the old code
+paths. If instead pmd_trans_huge returns true, you have to take the
+mm->page_table_lock and re-run pmd_trans_huge. Taking the
+page_table_lock will prevent the huge pmd from being converted into a
+regular pmd from under you (split_huge_page can run in parallel to
+the pagetable walk). If the second pmd_trans_huge returns false, you
+should just drop the page_table_lock and fall back to the old code as
+before. Otherwise you should run pmd_trans_splitting on the pmd. In
+case pmd_trans_splitting returns true, it means split_huge_page is
+already in the middle of splitting the page. So if pmd_trans_splitting
+returns true it's enough to drop the page_table_lock, call
+wait_split_huge_page and then fall back to the old code paths. You
+are guaranteed that, by the time wait_split_huge_page returns, the
+pmd isn't huge anymore. If pmd_trans_splitting returns false, you can
+proceed to process the huge pmd and the hugepage natively. Once
+finished you can drop the page_table_lock.
+
+== compound_lock, get_user_pages and put_page ==
+
+split_huge_page internally has to distribute the refcounts in the
+head page to the tail pages before clearing all PG_head/tail bits
+from the page structures. It can do that easily for refcounts taken
+by huge pmd mappings. But the gup API as created by hugetlbfs (which
+returns head and tail pages if running get_user_pages on an address
+backed by any hugepage) requires the refcount to be accounted on the
+tail pages and not only in the head pages, if we want to be able to
+run split_huge_page while there are gup pins established on any tail
+page. Not being able to run split_huge_page if there's any gup pin on
+any tail page would mean having to split all hugepages upfront in
+get_user_pages, which is unacceptable, as too many gup users are
+performance critical and they must work natively on hugepages like
+they work natively on hugetlbfs already (hugetlbfs is simpler because
+hugetlbfs pages cannot be split, so there's no requirement to account
+for the pins on the tail pages for hugetlbfs). If we didn't account
+the gup refcounts on the tail pages during gup, we wouldn't know
+anymore which tail page is pinned by gup and which is not while we
+run split_huge_page. But we still have to add the gup pin to the head
+page too, to know when we can free the compound page in case it's
+never split during its lifetime. That requires changing not just
+get_page, but put_page as well, so that when put_page runs on a tail
+page (and only on a tail page) it will find its respective head page,
+and then it will decrease the head page refcount in addition to the
+tail page refcount. To obtain a head page reliably and to decrease
+its refcount without race conditions, put_page has to serialize
+against __split_huge_page_refcount using a special per-page lock
+called compound_lock.
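[Editor's note: the "Optimizing the applications" section above boils down to aligning the mapping and then advising the kernel. Below is a minimal userspace sketch of that, assuming a 2M hugepage size; the MADV_HUGEPAGE value matches the one added by this series for the architectures touched here, and error handling is reduced to the essentials.]

#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14	/* value used by this series; may differ per arch */
#endif

#define HPAGE_SIZE (2UL * 1024 * 1024)	/* assumed 2M hugepage size */

int main(void)
{
	void *buf;
	size_t len = 64 * HPAGE_SIZE;

	/* Hugepage-aligned allocation, so the kernel can map 2M pages. */
	if (posix_memalign(&buf, HPAGE_SIZE, len))
		return 1;

	/* Ask for hugepages even when the system-wide policy is "madvise". */
	if (madvise(buf, len, MADV_HUGEPAGE))
		return 1;

	memset(buf, 0, len);	/* touch the region; faults may fill 2M pmds */
	free(buf);
	return 0;
}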
diff --git a/MAINTAINERS b/MAINTAINERS
index 3dd5c6fce989..af656ded404e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6592,13 +6592,12 @@ F:	Documentation/i2c/busses/i2c-viapro
 F:	drivers/i2c/busses/i2c-viapro.c
 
 VIA SD/MMC CARD CONTROLLER DRIVER
-M:	Joseph Chan <JosephChan@via.com.tw>
+M:	Bruce Chang <brucechang@via.com.tw>
 M:	Harald Welte <HaraldWelte@viatech.com>
 S:	Maintained
 F:	drivers/mmc/host/via-sdmmc.c
 
 VIA UNICHROME(PRO)/CHROME9 FRAMEBUFFER DRIVER
-M:	Joseph Chan <JosephChan@via.com.tw>
 M:	Florian Tobias Schandinat <FlorianSchandinat@gmx.de>
 L:	linux-fbdev@vger.kernel.org
 S:	Maintained
diff --git a/arch/alpha/include/asm/mman.h b/arch/alpha/include/asm/mman.h
index 99c56d47879d..72db984f8781 100644
--- a/arch/alpha/include/asm/mman.h
+++ b/arch/alpha/include/asm/mman.h
@@ -53,6 +53,9 @@
 #define MADV_MERGEABLE   12	/* KSM may merge identical pages */
 #define MADV_UNMERGEABLE 13	/* KSM may not merge identical pages */
 
+#define MADV_HUGEPAGE	14	/* Worth backing with hugepages */
+#define MADV_NOHUGEPAGE	15	/* Not worth backing with hugepages */
+
 /* compatibility flags */
 #define MAP_FILE	0
 
diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index 0c1bb68ff4a8..2cfe8161b478 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -38,17 +38,9 @@
 #ifdef CONFIG_MMU
 void *module_alloc(unsigned long size)
 {
-	struct vm_struct *area;
-
-	size = PAGE_ALIGN(size);
-	if (!size)
-		return NULL;
-
-	area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
-	if (!area)
-		return NULL;
-
-	return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC);
+	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+				GFP_KERNEL, PAGE_KERNEL_EXEC, -1,
+				__builtin_return_address(0));
 }
 #else /* CONFIG_MMU */
 void *module_alloc(unsigned long size)
diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c
index 93292a18cf77..709244c66fa3 100644
--- a/arch/arm/mm/pgd.c
+++ b/arch/arm/mm/pgd.c
@@ -50,7 +50,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	if (!new_pmd)
 		goto no_pmd;
 
-	new_pte = pte_alloc_map(mm, new_pmd, 0);
+	new_pte = pte_alloc_map(mm, NULL, new_pmd, 0);
 	if (!new_pte)
 		goto no_pte;
 
diff --git a/arch/avr32/boards/atngw100/setup.c b/arch/avr32/boards/atngw100/setup.c
index 8c6a2440e345..659d119ce712 100644
--- a/arch/avr32/boards/atngw100/setup.c
+++ b/arch/avr32/boards/atngw100/setup.c
@@ -188,7 +188,7 @@ static void __init set_hw_addr(struct platform_device *pdev)
 	 */
 	regs = (void __iomem __force *)res->start;
 	pclk = clk_get(&pdev->dev, "pclk");
-	if (!pclk)
+	if (IS_ERR(pclk))
 		return;
 
 	clk_enable(pclk);
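[Editor's note: this board fix, and the identical ones in the following files, exist because clk_get() reports failure with an ERR_PTR-encoded pointer rather than NULL, so a plain NULL check never triggers. The usual consumer pattern looks like the sketch below; the device and clock id are placeholders.]

#include <linux/clk.h>
#include <linux/err.h>

static int example_enable_pclk(struct device *dev)
{
	struct clk *pclk = clk_get(dev, "pclk");
	int ret;

	if (IS_ERR(pclk))		/* a NULL check would miss ERR_PTR values */
		return PTR_ERR(pclk);

	ret = clk_enable(pclk);
	if (ret) {
		clk_put(pclk);
		return ret;
	}

	/* ... use the peripheral ... */

	clk_disable(pclk);
	clk_put(pclk);
	return 0;
}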
diff --git a/arch/avr32/boards/atstk1000/atstk1002.c b/arch/avr32/boards/atstk1000/atstk1002.c
index 2adc261c9e3d..6ce30fb2ec94 100644
--- a/arch/avr32/boards/atstk1000/atstk1002.c
+++ b/arch/avr32/boards/atstk1000/atstk1002.c
@@ -203,7 +203,7 @@ static void __init set_hw_addr(struct platform_device *pdev)
 	 */
 	regs = (void __iomem __force *)res->start;
 	pclk = clk_get(&pdev->dev, "pclk");
-	if (!pclk)
+	if (IS_ERR(pclk))
 		return;
 
 	clk_enable(pclk);
diff --git a/arch/avr32/boards/favr-32/setup.c b/arch/avr32/boards/favr-32/setup.c
index 75f19f47fb2f..86fab77a5a00 100644
--- a/arch/avr32/boards/favr-32/setup.c
+++ b/arch/avr32/boards/favr-32/setup.c
@@ -206,7 +206,7 @@ static void __init set_hw_addr(struct platform_device *pdev)
 	 */
 	regs = (void __iomem __force *)res->start;
 	pclk = clk_get(&pdev->dev, "pclk");
-	if (!pclk)
+	if (IS_ERR(pclk))
 		return;
 
 	clk_enable(pclk);
diff --git a/arch/avr32/boards/hammerhead/setup.c b/arch/avr32/boards/hammerhead/setup.c
index dd009875a405..da14fbdd4e8e 100644
--- a/arch/avr32/boards/hammerhead/setup.c
+++ b/arch/avr32/boards/hammerhead/setup.c
@@ -150,7 +150,7 @@ static void __init set_hw_addr(struct platform_device *pdev)
 	regs = (void __iomem __force *)res->start;
 	pclk = clk_get(&pdev->dev, "pclk");
 
-	if (!pclk)
+	if (IS_ERR(pclk))
 		return;
 
 	clk_enable(pclk);
diff --git a/arch/avr32/boards/merisc/setup.c b/arch/avr32/boards/merisc/setup.c
index 623b077594fc..e61bc948f959 100644
--- a/arch/avr32/boards/merisc/setup.c
+++ b/arch/avr32/boards/merisc/setup.c
@@ -134,7 +134,7 @@ static void __init set_hw_addr(struct platform_device *pdev)
 
 	regs = (void __iomem __force *)res->start;
 	pclk = clk_get(&pdev->dev, "pclk");
-	if (!pclk)
+	if (IS_ERR(pclk))
 		return;
 
 	clk_enable(pclk);
diff --git a/arch/avr32/boards/mimc200/setup.c b/arch/avr32/boards/mimc200/setup.c
index 523d8e183bef..c4da5cba2dbf 100644
--- a/arch/avr32/boards/mimc200/setup.c
+++ b/arch/avr32/boards/mimc200/setup.c
@@ -162,7 +162,7 @@ static void __init set_hw_addr(struct platform_device *pdev)
 	 */
 	regs = (void __iomem __force *)res->start;
 	pclk = clk_get(&pdev->dev, "pclk");
-	if (!pclk)
+	if (IS_ERR(pclk))
 		return;
 
 	clk_enable(pclk);
diff --git a/arch/avr32/configs/atngw100_defconfig b/arch/avr32/configs/atngw100_defconfig
index 9854013d2728..6f9ca56de1f6 100644
--- a/arch/avr32/configs/atngw100_defconfig
+++ b/arch/avr32/configs/atngw100_defconfig
@@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y
2# CONFIG_LOCALVERSION_AUTO is not set 2# CONFIG_LOCALVERSION_AUTO is not set
3CONFIG_SYSVIPC=y 3CONFIG_SYSVIPC=y
4CONFIG_POSIX_MQUEUE=y 4CONFIG_POSIX_MQUEUE=y
5CONFIG_BSD_PROCESS_ACCT=y
6CONFIG_BSD_PROCESS_ACCT_V3=y
7CONFIG_LOG_BUF_SHIFT=14 5CONFIG_LOG_BUF_SHIFT=14
8CONFIG_SYSFS_DEPRECATED_V2=y 6CONFIG_RELAY=y
9CONFIG_BLK_DEV_INITRD=y 7CONFIG_BLK_DEV_INITRD=y
10# CONFIG_SYSCTL_SYSCALL is not set 8# CONFIG_SYSCTL_SYSCALL is not set
11# CONFIG_BASE_FULL is not set 9# CONFIG_BASE_FULL is not set
12# CONFIG_COMPAT_BRK is not set 10# CONFIG_COMPAT_BRK is not set
13CONFIG_PROFILING=y 11CONFIG_PROFILING=y
14CONFIG_OPROFILE=m 12CONFIG_OPROFILE=m
15CONFIG_KPROBES=y 13# CONFIG_KPROBES is not set
16CONFIG_MODULES=y 14CONFIG_MODULES=y
17CONFIG_MODULE_UNLOAD=y 15CONFIG_MODULE_UNLOAD=y
18CONFIG_MODULE_FORCE_UNLOAD=y
19# CONFIG_BLK_DEV_BSG is not set 16# CONFIG_BLK_DEV_BSG is not set
20# CONFIG_IOSCHED_DEADLINE is not set 17# CONFIG_IOSCHED_DEADLINE is not set
21CONFIG_NO_HZ=y 18CONFIG_NO_HZ=y
@@ -29,6 +26,7 @@ CONFIG_CPU_FREQ=y
29CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y 26CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
30CONFIG_CPU_FREQ_GOV_USERSPACE=y 27CONFIG_CPU_FREQ_GOV_USERSPACE=y
31CONFIG_CPU_FREQ_AT32AP=y 28CONFIG_CPU_FREQ_AT32AP=y
29CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
32CONFIG_NET=y 30CONFIG_NET=y
33CONFIG_PACKET=y 31CONFIG_PACKET=y
34CONFIG_UNIX=y 32CONFIG_UNIX=y
@@ -72,8 +70,8 @@ CONFIG_MTD_UBI=y
72CONFIG_BLK_DEV_LOOP=m 70CONFIG_BLK_DEV_LOOP=m
73CONFIG_BLK_DEV_NBD=m 71CONFIG_BLK_DEV_NBD=m
74CONFIG_BLK_DEV_RAM=m 72CONFIG_BLK_DEV_RAM=m
73CONFIG_MISC_DEVICES=y
75CONFIG_ATMEL_TCLIB=y 74CONFIG_ATMEL_TCLIB=y
76CONFIG_EEPROM_AT24=m
77CONFIG_NETDEVICES=y 75CONFIG_NETDEVICES=y
78CONFIG_TUN=m 76CONFIG_TUN=m
79CONFIG_NET_ETHERNET=y 77CONFIG_NET_ETHERNET=y
@@ -106,6 +104,7 @@ CONFIG_GPIO_SYSFS=y
106CONFIG_WATCHDOG=y 104CONFIG_WATCHDOG=y
107CONFIG_AT32AP700X_WDT=y 105CONFIG_AT32AP700X_WDT=y
108CONFIG_USB_GADGET=y 106CONFIG_USB_GADGET=y
107CONFIG_USB_GADGET_VBUS_DRAW=350
109CONFIG_USB_ZERO=m 108CONFIG_USB_ZERO=m
110CONFIG_USB_ETH=m 109CONFIG_USB_ETH=m
111CONFIG_USB_GADGETFS=m 110CONFIG_USB_GADGETFS=m
@@ -115,14 +114,12 @@ CONFIG_USB_CDC_COMPOSITE=m
115CONFIG_MMC=y 114CONFIG_MMC=y
116CONFIG_MMC_TEST=m 115CONFIG_MMC_TEST=m
117CONFIG_MMC_ATMELMCI=y 116CONFIG_MMC_ATMELMCI=y
118CONFIG_MMC_SPI=m
119CONFIG_NEW_LEDS=y 117CONFIG_NEW_LEDS=y
120CONFIG_LEDS_CLASS=y 118CONFIG_LEDS_CLASS=y
121CONFIG_LEDS_GPIO=y 119CONFIG_LEDS_GPIO=y
122CONFIG_LEDS_TRIGGERS=y 120CONFIG_LEDS_TRIGGERS=y
123CONFIG_LEDS_TRIGGER_TIMER=y 121CONFIG_LEDS_TRIGGER_TIMER=y
124CONFIG_LEDS_TRIGGER_HEARTBEAT=y 122CONFIG_LEDS_TRIGGER_HEARTBEAT=y
125CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
126CONFIG_RTC_CLASS=y 123CONFIG_RTC_CLASS=y
127CONFIG_RTC_DRV_AT32AP700X=y 124CONFIG_RTC_DRV_AT32AP700X=y
128CONFIG_DMADEVICES=y 125CONFIG_DMADEVICES=y
@@ -130,21 +127,23 @@ CONFIG_EXT2_FS=y
130CONFIG_EXT3_FS=y 127CONFIG_EXT3_FS=y
131# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set 128# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
132# CONFIG_EXT3_FS_XATTR is not set 129# CONFIG_EXT3_FS_XATTR is not set
130CONFIG_EXT4_FS=y
131# CONFIG_EXT4_FS_XATTR is not set
133# CONFIG_DNOTIFY is not set 132# CONFIG_DNOTIFY is not set
134CONFIG_FUSE_FS=m 133CONFIG_FUSE_FS=m
135CONFIG_MSDOS_FS=m 134CONFIG_MSDOS_FS=m
136CONFIG_VFAT_FS=m 135CONFIG_VFAT_FS=m
137CONFIG_FAT_DEFAULT_CODEPAGE=850 136CONFIG_FAT_DEFAULT_CODEPAGE=850
137CONFIG_PROC_KCORE=y
138CONFIG_TMPFS=y 138CONFIG_TMPFS=y
139CONFIG_CONFIGFS_FS=m 139CONFIG_CONFIGFS_FS=y
140CONFIG_JFFS2_FS=y 140CONFIG_JFFS2_FS=y
141CONFIG_UFS_FS=y 141CONFIG_UBIFS_FS=y
142CONFIG_NFS_FS=y 142CONFIG_NFS_FS=y
143CONFIG_NFS_V3=y 143CONFIG_NFS_V3=y
144CONFIG_ROOT_NFS=y 144CONFIG_ROOT_NFS=y
145CONFIG_NFSD=m 145CONFIG_NFSD=m
146CONFIG_NFSD_V3=y 146CONFIG_NFSD_V3=y
147CONFIG_SMB_FS=m
148CONFIG_CIFS=m 147CONFIG_CIFS=m
149CONFIG_NLS_CODEPAGE_437=m 148CONFIG_NLS_CODEPAGE_437=m
150CONFIG_NLS_CODEPAGE_850=m 149CONFIG_NLS_CODEPAGE_850=m
@@ -155,5 +154,3 @@ CONFIG_DEBUG_FS=y
155CONFIG_DEBUG_KERNEL=y 154CONFIG_DEBUG_KERNEL=y
156CONFIG_DETECT_HUNG_TASK=y 155CONFIG_DETECT_HUNG_TASK=y
157CONFIG_FRAME_POINTER=y 156CONFIG_FRAME_POINTER=y
158# CONFIG_RCU_CPU_STALL_DETECTOR is not set
159CONFIG_CRYPTO_PCBC=m
diff --git a/arch/avr32/configs/atngw100_evklcd100_defconfig b/arch/avr32/configs/atngw100_evklcd100_defconfig
index 7ceda354597b..7eece0af34c9 100644
--- a/arch/avr32/configs/atngw100_evklcd100_defconfig
+++ b/arch/avr32/configs/atngw100_evklcd100_defconfig
@@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y
2# CONFIG_LOCALVERSION_AUTO is not set 2# CONFIG_LOCALVERSION_AUTO is not set
3CONFIG_SYSVIPC=y 3CONFIG_SYSVIPC=y
4CONFIG_POSIX_MQUEUE=y 4CONFIG_POSIX_MQUEUE=y
5CONFIG_BSD_PROCESS_ACCT=y
6CONFIG_BSD_PROCESS_ACCT_V3=y
7CONFIG_LOG_BUF_SHIFT=14 5CONFIG_LOG_BUF_SHIFT=14
8CONFIG_SYSFS_DEPRECATED_V2=y 6CONFIG_RELAY=y
9CONFIG_BLK_DEV_INITRD=y 7CONFIG_BLK_DEV_INITRD=y
10# CONFIG_SYSCTL_SYSCALL is not set 8# CONFIG_SYSCTL_SYSCALL is not set
11# CONFIG_BASE_FULL is not set 9# CONFIG_BASE_FULL is not set
12# CONFIG_COMPAT_BRK is not set 10# CONFIG_COMPAT_BRK is not set
13CONFIG_PROFILING=y 11CONFIG_PROFILING=y
14CONFIG_OPROFILE=m 12CONFIG_OPROFILE=m
15CONFIG_KPROBES=y 13# CONFIG_KPROBES is not set
16CONFIG_MODULES=y 14CONFIG_MODULES=y
17CONFIG_MODULE_UNLOAD=y 15CONFIG_MODULE_UNLOAD=y
18CONFIG_MODULE_FORCE_UNLOAD=y
19# CONFIG_BLK_DEV_BSG is not set 16# CONFIG_BLK_DEV_BSG is not set
20# CONFIG_IOSCHED_DEADLINE is not set 17# CONFIG_IOSCHED_DEADLINE is not set
21CONFIG_NO_HZ=y 18CONFIG_NO_HZ=y
@@ -31,6 +28,7 @@ CONFIG_CPU_FREQ=y
31CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y 28CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
32CONFIG_CPU_FREQ_GOV_USERSPACE=y 29CONFIG_CPU_FREQ_GOV_USERSPACE=y
33CONFIG_CPU_FREQ_AT32AP=y 30CONFIG_CPU_FREQ_AT32AP=y
31CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
34CONFIG_NET=y 32CONFIG_NET=y
35CONFIG_PACKET=y 33CONFIG_PACKET=y
36CONFIG_UNIX=y 34CONFIG_UNIX=y
@@ -74,8 +72,10 @@ CONFIG_MTD_UBI=y
74CONFIG_BLK_DEV_LOOP=m 72CONFIG_BLK_DEV_LOOP=m
75CONFIG_BLK_DEV_NBD=m 73CONFIG_BLK_DEV_NBD=m
76CONFIG_BLK_DEV_RAM=m 74CONFIG_BLK_DEV_RAM=m
75CONFIG_MISC_DEVICES=y
77CONFIG_ATMEL_TCLIB=y 76CONFIG_ATMEL_TCLIB=y
78CONFIG_NETDEVICES=y 77CONFIG_NETDEVICES=y
78CONFIG_TUN=m
79CONFIG_NET_ETHERNET=y 79CONFIG_NET_ETHERNET=y
80CONFIG_MACB=y 80CONFIG_MACB=y
81# CONFIG_NETDEV_1000 is not set 81# CONFIG_NETDEV_1000 is not set
@@ -104,6 +104,7 @@ CONFIG_I2C_GPIO=m
104CONFIG_SPI=y 104CONFIG_SPI=y
105CONFIG_SPI_ATMEL=y 105CONFIG_SPI_ATMEL=y
106CONFIG_SPI_SPIDEV=m 106CONFIG_SPI_SPIDEV=m
107CONFIG_GPIO_SYSFS=y
107# CONFIG_HWMON is not set 108# CONFIG_HWMON is not set
108CONFIG_WATCHDOG=y 109CONFIG_WATCHDOG=y
109CONFIG_AT32AP700X_WDT=y 110CONFIG_AT32AP700X_WDT=y
@@ -127,6 +128,7 @@ CONFIG_USB_FILE_STORAGE=m
127CONFIG_USB_G_SERIAL=m 128CONFIG_USB_G_SERIAL=m
128CONFIG_USB_CDC_COMPOSITE=m 129CONFIG_USB_CDC_COMPOSITE=m
129CONFIG_MMC=y 130CONFIG_MMC=y
131CONFIG_MMC_TEST=m
130CONFIG_MMC_ATMELMCI=y 132CONFIG_MMC_ATMELMCI=y
131CONFIG_NEW_LEDS=y 133CONFIG_NEW_LEDS=y
132CONFIG_LEDS_CLASS=y 134CONFIG_LEDS_CLASS=y
@@ -141,11 +143,14 @@ CONFIG_EXT2_FS=y
141CONFIG_EXT3_FS=y 143CONFIG_EXT3_FS=y
142# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set 144# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
143# CONFIG_EXT3_FS_XATTR is not set 145# CONFIG_EXT3_FS_XATTR is not set
146CONFIG_EXT4_FS=y
147# CONFIG_EXT4_FS_XATTR is not set
144# CONFIG_DNOTIFY is not set 148# CONFIG_DNOTIFY is not set
145CONFIG_FUSE_FS=m 149CONFIG_FUSE_FS=m
146CONFIG_MSDOS_FS=m 150CONFIG_MSDOS_FS=m
147CONFIG_VFAT_FS=m 151CONFIG_VFAT_FS=m
148CONFIG_FAT_DEFAULT_CODEPAGE=850 152CONFIG_FAT_DEFAULT_CODEPAGE=850
153CONFIG_PROC_KCORE=y
149CONFIG_TMPFS=y 154CONFIG_TMPFS=y
150CONFIG_CONFIGFS_FS=y 155CONFIG_CONFIGFS_FS=y
151CONFIG_JFFS2_FS=y 156CONFIG_JFFS2_FS=y
@@ -155,7 +160,6 @@ CONFIG_NFS_V3=y
155CONFIG_ROOT_NFS=y 160CONFIG_ROOT_NFS=y
156CONFIG_NFSD=m 161CONFIG_NFSD=m
157CONFIG_NFSD_V3=y 162CONFIG_NFSD_V3=y
158CONFIG_SMB_FS=m
159CONFIG_CIFS=m 163CONFIG_CIFS=m
160CONFIG_NLS_CODEPAGE_437=m 164CONFIG_NLS_CODEPAGE_437=m
161CONFIG_NLS_CODEPAGE_850=m 165CONFIG_NLS_CODEPAGE_850=m
@@ -166,4 +170,3 @@ CONFIG_DEBUG_FS=y
166CONFIG_DEBUG_KERNEL=y 170CONFIG_DEBUG_KERNEL=y
167CONFIG_DETECT_HUNG_TASK=y 171CONFIG_DETECT_HUNG_TASK=y
168CONFIG_FRAME_POINTER=y 172CONFIG_FRAME_POINTER=y
169# CONFIG_RCU_CPU_STALL_DETECTOR is not set
diff --git a/arch/avr32/configs/atngw100_evklcd101_defconfig b/arch/avr32/configs/atngw100_evklcd101_defconfig
index 7bc5b2ce68d5..387eb9d6e423 100644
--- a/arch/avr32/configs/atngw100_evklcd101_defconfig
+++ b/arch/avr32/configs/atngw100_evklcd101_defconfig
@@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y
2# CONFIG_LOCALVERSION_AUTO is not set 2# CONFIG_LOCALVERSION_AUTO is not set
3CONFIG_SYSVIPC=y 3CONFIG_SYSVIPC=y
4CONFIG_POSIX_MQUEUE=y 4CONFIG_POSIX_MQUEUE=y
5CONFIG_BSD_PROCESS_ACCT=y
6CONFIG_BSD_PROCESS_ACCT_V3=y
7CONFIG_LOG_BUF_SHIFT=14 5CONFIG_LOG_BUF_SHIFT=14
8CONFIG_SYSFS_DEPRECATED_V2=y 6CONFIG_RELAY=y
9CONFIG_BLK_DEV_INITRD=y 7CONFIG_BLK_DEV_INITRD=y
10# CONFIG_SYSCTL_SYSCALL is not set 8# CONFIG_SYSCTL_SYSCALL is not set
11# CONFIG_BASE_FULL is not set 9# CONFIG_BASE_FULL is not set
12# CONFIG_COMPAT_BRK is not set 10# CONFIG_COMPAT_BRK is not set
13CONFIG_PROFILING=y 11CONFIG_PROFILING=y
14CONFIG_OPROFILE=m 12CONFIG_OPROFILE=m
15CONFIG_KPROBES=y 13# CONFIG_KPROBES is not set
16CONFIG_MODULES=y 14CONFIG_MODULES=y
17CONFIG_MODULE_UNLOAD=y 15CONFIG_MODULE_UNLOAD=y
18CONFIG_MODULE_FORCE_UNLOAD=y
19# CONFIG_BLK_DEV_BSG is not set 16# CONFIG_BLK_DEV_BSG is not set
20# CONFIG_IOSCHED_DEADLINE is not set 17# CONFIG_IOSCHED_DEADLINE is not set
21CONFIG_NO_HZ=y 18CONFIG_NO_HZ=y
@@ -30,6 +27,7 @@ CONFIG_CPU_FREQ=y
30CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y 27CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
31CONFIG_CPU_FREQ_GOV_USERSPACE=y 28CONFIG_CPU_FREQ_GOV_USERSPACE=y
32CONFIG_CPU_FREQ_AT32AP=y 29CONFIG_CPU_FREQ_AT32AP=y
30CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
33CONFIG_NET=y 31CONFIG_NET=y
34CONFIG_PACKET=y 32CONFIG_PACKET=y
35CONFIG_UNIX=y 33CONFIG_UNIX=y
@@ -73,8 +71,10 @@ CONFIG_MTD_UBI=y
73CONFIG_BLK_DEV_LOOP=m 71CONFIG_BLK_DEV_LOOP=m
74CONFIG_BLK_DEV_NBD=m 72CONFIG_BLK_DEV_NBD=m
75CONFIG_BLK_DEV_RAM=m 73CONFIG_BLK_DEV_RAM=m
74CONFIG_MISC_DEVICES=y
76CONFIG_ATMEL_TCLIB=y 75CONFIG_ATMEL_TCLIB=y
77CONFIG_NETDEVICES=y 76CONFIG_NETDEVICES=y
77CONFIG_TUN=m
78CONFIG_NET_ETHERNET=y 78CONFIG_NET_ETHERNET=y
79CONFIG_MACB=y 79CONFIG_MACB=y
80# CONFIG_NETDEV_1000 is not set 80# CONFIG_NETDEV_1000 is not set
@@ -103,6 +103,7 @@ CONFIG_I2C_GPIO=m
103CONFIG_SPI=y 103CONFIG_SPI=y
104CONFIG_SPI_ATMEL=y 104CONFIG_SPI_ATMEL=y
105CONFIG_SPI_SPIDEV=m 105CONFIG_SPI_SPIDEV=m
106CONFIG_GPIO_SYSFS=y
106# CONFIG_HWMON is not set 107# CONFIG_HWMON is not set
107CONFIG_WATCHDOG=y 108CONFIG_WATCHDOG=y
108CONFIG_AT32AP700X_WDT=y 109CONFIG_AT32AP700X_WDT=y
@@ -126,6 +127,7 @@ CONFIG_USB_FILE_STORAGE=m
126CONFIG_USB_G_SERIAL=m 127CONFIG_USB_G_SERIAL=m
127CONFIG_USB_CDC_COMPOSITE=m 128CONFIG_USB_CDC_COMPOSITE=m
128CONFIG_MMC=y 129CONFIG_MMC=y
130CONFIG_MMC_TEST=m
129CONFIG_MMC_ATMELMCI=y 131CONFIG_MMC_ATMELMCI=y
130CONFIG_NEW_LEDS=y 132CONFIG_NEW_LEDS=y
131CONFIG_LEDS_CLASS=y 133CONFIG_LEDS_CLASS=y
@@ -140,11 +142,14 @@ CONFIG_EXT2_FS=y
140CONFIG_EXT3_FS=y 142CONFIG_EXT3_FS=y
141# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set 143# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
142# CONFIG_EXT3_FS_XATTR is not set 144# CONFIG_EXT3_FS_XATTR is not set
145CONFIG_EXT4_FS=y
146# CONFIG_EXT4_FS_XATTR is not set
143# CONFIG_DNOTIFY is not set 147# CONFIG_DNOTIFY is not set
144CONFIG_FUSE_FS=m 148CONFIG_FUSE_FS=m
145CONFIG_MSDOS_FS=m 149CONFIG_MSDOS_FS=m
146CONFIG_VFAT_FS=m 150CONFIG_VFAT_FS=m
147CONFIG_FAT_DEFAULT_CODEPAGE=850 151CONFIG_FAT_DEFAULT_CODEPAGE=850
152CONFIG_PROC_KCORE=y
148CONFIG_TMPFS=y 153CONFIG_TMPFS=y
149CONFIG_CONFIGFS_FS=y 154CONFIG_CONFIGFS_FS=y
150CONFIG_JFFS2_FS=y 155CONFIG_JFFS2_FS=y
@@ -154,7 +159,6 @@ CONFIG_NFS_V3=y
154CONFIG_ROOT_NFS=y 159CONFIG_ROOT_NFS=y
155CONFIG_NFSD=m 160CONFIG_NFSD=m
156CONFIG_NFSD_V3=y 161CONFIG_NFSD_V3=y
157CONFIG_SMB_FS=m
158CONFIG_CIFS=m 162CONFIG_CIFS=m
159CONFIG_NLS_CODEPAGE_437=m 163CONFIG_NLS_CODEPAGE_437=m
160CONFIG_NLS_CODEPAGE_850=m 164CONFIG_NLS_CODEPAGE_850=m
@@ -165,4 +169,3 @@ CONFIG_DEBUG_FS=y
165CONFIG_DEBUG_KERNEL=y 169CONFIG_DEBUG_KERNEL=y
166CONFIG_DETECT_HUNG_TASK=y 170CONFIG_DETECT_HUNG_TASK=y
167CONFIG_FRAME_POINTER=y 171CONFIG_FRAME_POINTER=y
168# CONFIG_RCU_CPU_STALL_DETECTOR is not set
diff --git a/arch/avr32/configs/atngw100mkii_defconfig b/arch/avr32/configs/atngw100mkii_defconfig
index 4bd36821d4a2..f0fe237133a9 100644
--- a/arch/avr32/configs/atngw100mkii_defconfig
+++ b/arch/avr32/configs/atngw100mkii_defconfig
@@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y
2# CONFIG_LOCALVERSION_AUTO is not set 2# CONFIG_LOCALVERSION_AUTO is not set
3CONFIG_SYSVIPC=y 3CONFIG_SYSVIPC=y
4CONFIG_POSIX_MQUEUE=y 4CONFIG_POSIX_MQUEUE=y
5CONFIG_BSD_PROCESS_ACCT=y
6CONFIG_BSD_PROCESS_ACCT_V3=y
7CONFIG_LOG_BUF_SHIFT=14 5CONFIG_LOG_BUF_SHIFT=14
8CONFIG_SYSFS_DEPRECATED_V2=y 6CONFIG_RELAY=y
9CONFIG_BLK_DEV_INITRD=y 7CONFIG_BLK_DEV_INITRD=y
10# CONFIG_SYSCTL_SYSCALL is not set 8# CONFIG_SYSCTL_SYSCALL is not set
11# CONFIG_BASE_FULL is not set 9# CONFIG_BASE_FULL is not set
12# CONFIG_COMPAT_BRK is not set 10# CONFIG_COMPAT_BRK is not set
13CONFIG_PROFILING=y 11CONFIG_PROFILING=y
14CONFIG_OPROFILE=m 12CONFIG_OPROFILE=m
15CONFIG_KPROBES=y 13# CONFIG_KPROBES is not set
16CONFIG_MODULES=y 14CONFIG_MODULES=y
17CONFIG_MODULE_UNLOAD=y 15CONFIG_MODULE_UNLOAD=y
18CONFIG_MODULE_FORCE_UNLOAD=y
19# CONFIG_BLK_DEV_BSG is not set 16# CONFIG_BLK_DEV_BSG is not set
20# CONFIG_IOSCHED_DEADLINE is not set 17# CONFIG_IOSCHED_DEADLINE is not set
21CONFIG_NO_HZ=y 18CONFIG_NO_HZ=y
@@ -29,6 +26,7 @@ CONFIG_CPU_FREQ=y
29CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y 26CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
30CONFIG_CPU_FREQ_GOV_USERSPACE=y 27CONFIG_CPU_FREQ_GOV_USERSPACE=y
31CONFIG_CPU_FREQ_AT32AP=y 28CONFIG_CPU_FREQ_AT32AP=y
29CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
32CONFIG_NET=y 30CONFIG_NET=y
33CONFIG_PACKET=y 31CONFIG_PACKET=y
34CONFIG_UNIX=y 32CONFIG_UNIX=y
@@ -74,6 +72,7 @@ CONFIG_MTD_UBI=y
74CONFIG_BLK_DEV_LOOP=m 72CONFIG_BLK_DEV_LOOP=m
75CONFIG_BLK_DEV_NBD=m 73CONFIG_BLK_DEV_NBD=m
76CONFIG_BLK_DEV_RAM=m 74CONFIG_BLK_DEV_RAM=m
75CONFIG_MISC_DEVICES=y
77CONFIG_ATMEL_TCLIB=y 76CONFIG_ATMEL_TCLIB=y
78CONFIG_NETDEVICES=y 77CONFIG_NETDEVICES=y
79CONFIG_TUN=m 78CONFIG_TUN=m
@@ -107,6 +106,7 @@ CONFIG_GPIO_SYSFS=y
107CONFIG_WATCHDOG=y 106CONFIG_WATCHDOG=y
108CONFIG_AT32AP700X_WDT=y 107CONFIG_AT32AP700X_WDT=y
109CONFIG_USB_GADGET=y 108CONFIG_USB_GADGET=y
109CONFIG_USB_GADGET_VBUS_DRAW=350
110CONFIG_USB_ZERO=m 110CONFIG_USB_ZERO=m
111CONFIG_USB_ETH=m 111CONFIG_USB_ETH=m
112CONFIG_USB_GADGETFS=m 112CONFIG_USB_GADGETFS=m
@@ -116,14 +116,12 @@ CONFIG_USB_CDC_COMPOSITE=m
116CONFIG_MMC=y 116CONFIG_MMC=y
117CONFIG_MMC_TEST=m 117CONFIG_MMC_TEST=m
118CONFIG_MMC_ATMELMCI=y 118CONFIG_MMC_ATMELMCI=y
119CONFIG_MMC_SPI=m
120CONFIG_NEW_LEDS=y 119CONFIG_NEW_LEDS=y
121CONFIG_LEDS_CLASS=y 120CONFIG_LEDS_CLASS=y
122CONFIG_LEDS_GPIO=y 121CONFIG_LEDS_GPIO=y
123CONFIG_LEDS_TRIGGERS=y 122CONFIG_LEDS_TRIGGERS=y
124CONFIG_LEDS_TRIGGER_TIMER=y 123CONFIG_LEDS_TRIGGER_TIMER=y
125CONFIG_LEDS_TRIGGER_HEARTBEAT=y 124CONFIG_LEDS_TRIGGER_HEARTBEAT=y
126CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
127CONFIG_RTC_CLASS=y 125CONFIG_RTC_CLASS=y
128CONFIG_RTC_DRV_AT32AP700X=y 126CONFIG_RTC_DRV_AT32AP700X=y
129CONFIG_DMADEVICES=y 127CONFIG_DMADEVICES=y
@@ -131,21 +129,23 @@ CONFIG_EXT2_FS=y
131CONFIG_EXT3_FS=y 129CONFIG_EXT3_FS=y
132# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set 130# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
133# CONFIG_EXT3_FS_XATTR is not set 131# CONFIG_EXT3_FS_XATTR is not set
132CONFIG_EXT4_FS=y
133# CONFIG_EXT4_FS_XATTR is not set
134# CONFIG_DNOTIFY is not set 134# CONFIG_DNOTIFY is not set
135CONFIG_FUSE_FS=m 135CONFIG_FUSE_FS=m
136CONFIG_MSDOS_FS=m 136CONFIG_MSDOS_FS=m
137CONFIG_VFAT_FS=m 137CONFIG_VFAT_FS=m
138CONFIG_FAT_DEFAULT_CODEPAGE=850 138CONFIG_FAT_DEFAULT_CODEPAGE=850
139CONFIG_PROC_KCORE=y
139CONFIG_TMPFS=y 140CONFIG_TMPFS=y
140CONFIG_CONFIGFS_FS=m 141CONFIG_CONFIGFS_FS=y
141CONFIG_JFFS2_FS=y 142CONFIG_JFFS2_FS=y
142CONFIG_UFS_FS=y 143CONFIG_UBIFS_FS=y
143CONFIG_NFS_FS=y 144CONFIG_NFS_FS=y
144CONFIG_NFS_V3=y 145CONFIG_NFS_V3=y
145CONFIG_ROOT_NFS=y 146CONFIG_ROOT_NFS=y
146CONFIG_NFSD=m 147CONFIG_NFSD=m
147CONFIG_NFSD_V3=y 148CONFIG_NFSD_V3=y
148CONFIG_SMB_FS=m
149CONFIG_CIFS=m 149CONFIG_CIFS=m
150CONFIG_NLS_CODEPAGE_437=m 150CONFIG_NLS_CODEPAGE_437=m
151CONFIG_NLS_CODEPAGE_850=m 151CONFIG_NLS_CODEPAGE_850=m
@@ -156,5 +156,3 @@ CONFIG_DEBUG_FS=y
156CONFIG_DEBUG_KERNEL=y 156CONFIG_DEBUG_KERNEL=y
157CONFIG_DETECT_HUNG_TASK=y 157CONFIG_DETECT_HUNG_TASK=y
158CONFIG_FRAME_POINTER=y 158CONFIG_FRAME_POINTER=y
159# CONFIG_RCU_CPU_STALL_DETECTOR is not set
160CONFIG_CRYPTO_PCBC=m
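The defconfig hunks above and below mostly move options between built-in (=y), module (=m) and unset states. As a purely illustrative C fragment (not part of this patch): kernel code of this era tests a tristate option such as the CONFIG_MMC_TEST=m added above with the preprocessor, since =m defines CONFIG_FOO_MODULE rather than CONFIG_FOO.

/* Illustrative only: CONFIG_MMC_TEST=m defines CONFIG_MMC_TEST_MODULE,
 * while CONFIG_MMC_TEST=y defines CONFIG_MMC_TEST itself. */
#if defined(CONFIG_MMC_TEST) || defined(CONFIG_MMC_TEST_MODULE)
static const int mmc_test_configured = 1;
#else
static const int mmc_test_configured = 0;
#endif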
diff --git a/arch/avr32/configs/atngw100mkii_evklcd100_defconfig b/arch/avr32/configs/atngw100mkii_evklcd100_defconfig
index f8437ef3237f..e4a7c1dc8380 100644
--- a/arch/avr32/configs/atngw100mkii_evklcd100_defconfig
+++ b/arch/avr32/configs/atngw100mkii_evklcd100_defconfig
@@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y
2# CONFIG_LOCALVERSION_AUTO is not set 2# CONFIG_LOCALVERSION_AUTO is not set
3CONFIG_SYSVIPC=y 3CONFIG_SYSVIPC=y
4CONFIG_POSIX_MQUEUE=y 4CONFIG_POSIX_MQUEUE=y
5CONFIG_BSD_PROCESS_ACCT=y
6CONFIG_BSD_PROCESS_ACCT_V3=y
7CONFIG_LOG_BUF_SHIFT=14 5CONFIG_LOG_BUF_SHIFT=14
8CONFIG_SYSFS_DEPRECATED_V2=y 6CONFIG_RELAY=y
9CONFIG_BLK_DEV_INITRD=y 7CONFIG_BLK_DEV_INITRD=y
10# CONFIG_SYSCTL_SYSCALL is not set 8# CONFIG_SYSCTL_SYSCALL is not set
11# CONFIG_BASE_FULL is not set 9# CONFIG_BASE_FULL is not set
12# CONFIG_COMPAT_BRK is not set 10# CONFIG_COMPAT_BRK is not set
13CONFIG_PROFILING=y 11CONFIG_PROFILING=y
14CONFIG_OPROFILE=m 12CONFIG_OPROFILE=m
15CONFIG_KPROBES=y 13# CONFIG_KPROBES is not set
16CONFIG_MODULES=y 14CONFIG_MODULES=y
17CONFIG_MODULE_UNLOAD=y 15CONFIG_MODULE_UNLOAD=y
18CONFIG_MODULE_FORCE_UNLOAD=y
19# CONFIG_BLK_DEV_BSG is not set 16# CONFIG_BLK_DEV_BSG is not set
20# CONFIG_IOSCHED_DEADLINE is not set 17# CONFIG_IOSCHED_DEADLINE is not set
21CONFIG_NO_HZ=y 18CONFIG_NO_HZ=y
@@ -32,6 +29,7 @@ CONFIG_CPU_FREQ=y
32CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y 29CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
33CONFIG_CPU_FREQ_GOV_USERSPACE=y 30CONFIG_CPU_FREQ_GOV_USERSPACE=y
34CONFIG_CPU_FREQ_AT32AP=y 31CONFIG_CPU_FREQ_AT32AP=y
32CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
35CONFIG_NET=y 33CONFIG_NET=y
36CONFIG_PACKET=y 34CONFIG_PACKET=y
37CONFIG_UNIX=y 35CONFIG_UNIX=y
@@ -77,8 +75,10 @@ CONFIG_MTD_UBI=y
77CONFIG_BLK_DEV_LOOP=m 75CONFIG_BLK_DEV_LOOP=m
78CONFIG_BLK_DEV_NBD=m 76CONFIG_BLK_DEV_NBD=m
79CONFIG_BLK_DEV_RAM=m 77CONFIG_BLK_DEV_RAM=m
78CONFIG_MISC_DEVICES=y
80CONFIG_ATMEL_TCLIB=y 79CONFIG_ATMEL_TCLIB=y
81CONFIG_NETDEVICES=y 80CONFIG_NETDEVICES=y
81CONFIG_TUN=m
82CONFIG_NET_ETHERNET=y 82CONFIG_NET_ETHERNET=y
83CONFIG_MACB=y 83CONFIG_MACB=y
84# CONFIG_NETDEV_1000 is not set 84# CONFIG_NETDEV_1000 is not set
@@ -107,6 +107,7 @@ CONFIG_I2C_GPIO=m
107CONFIG_SPI=y 107CONFIG_SPI=y
108CONFIG_SPI_ATMEL=y 108CONFIG_SPI_ATMEL=y
109CONFIG_SPI_SPIDEV=m 109CONFIG_SPI_SPIDEV=m
110CONFIG_GPIO_SYSFS=y
110# CONFIG_HWMON is not set 111# CONFIG_HWMON is not set
111CONFIG_WATCHDOG=y 112CONFIG_WATCHDOG=y
112CONFIG_AT32AP700X_WDT=y 113CONFIG_AT32AP700X_WDT=y
@@ -130,6 +131,7 @@ CONFIG_USB_FILE_STORAGE=m
130CONFIG_USB_G_SERIAL=m 131CONFIG_USB_G_SERIAL=m
131CONFIG_USB_CDC_COMPOSITE=m 132CONFIG_USB_CDC_COMPOSITE=m
132CONFIG_MMC=y 133CONFIG_MMC=y
134CONFIG_MMC_TEST=m
133CONFIG_MMC_ATMELMCI=y 135CONFIG_MMC_ATMELMCI=y
134CONFIG_NEW_LEDS=y 136CONFIG_NEW_LEDS=y
135CONFIG_LEDS_CLASS=y 137CONFIG_LEDS_CLASS=y
@@ -144,11 +146,14 @@ CONFIG_EXT2_FS=y
144CONFIG_EXT3_FS=y 146CONFIG_EXT3_FS=y
145# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set 147# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
146# CONFIG_EXT3_FS_XATTR is not set 148# CONFIG_EXT3_FS_XATTR is not set
149CONFIG_EXT4_FS=y
150# CONFIG_EXT4_FS_XATTR is not set
147# CONFIG_DNOTIFY is not set 151# CONFIG_DNOTIFY is not set
148CONFIG_FUSE_FS=m 152CONFIG_FUSE_FS=m
149CONFIG_MSDOS_FS=m 153CONFIG_MSDOS_FS=m
150CONFIG_VFAT_FS=m 154CONFIG_VFAT_FS=m
151CONFIG_FAT_DEFAULT_CODEPAGE=850 155CONFIG_FAT_DEFAULT_CODEPAGE=850
156CONFIG_PROC_KCORE=y
152CONFIG_TMPFS=y 157CONFIG_TMPFS=y
153CONFIG_CONFIGFS_FS=y 158CONFIG_CONFIGFS_FS=y
154CONFIG_JFFS2_FS=y 159CONFIG_JFFS2_FS=y
@@ -158,7 +163,6 @@ CONFIG_NFS_V3=y
158CONFIG_ROOT_NFS=y 163CONFIG_ROOT_NFS=y
159CONFIG_NFSD=m 164CONFIG_NFSD=m
160CONFIG_NFSD_V3=y 165CONFIG_NFSD_V3=y
161CONFIG_SMB_FS=m
162CONFIG_CIFS=m 166CONFIG_CIFS=m
163CONFIG_NLS_CODEPAGE_437=m 167CONFIG_NLS_CODEPAGE_437=m
164CONFIG_NLS_CODEPAGE_850=m 168CONFIG_NLS_CODEPAGE_850=m
@@ -169,4 +173,3 @@ CONFIG_DEBUG_FS=y
169CONFIG_DEBUG_KERNEL=y 173CONFIG_DEBUG_KERNEL=y
170CONFIG_DETECT_HUNG_TASK=y 174CONFIG_DETECT_HUNG_TASK=y
171CONFIG_FRAME_POINTER=y 175CONFIG_FRAME_POINTER=y
172# CONFIG_RCU_CPU_STALL_DETECTOR is not set
diff --git a/arch/avr32/configs/atngw100mkii_evklcd101_defconfig b/arch/avr32/configs/atngw100mkii_evklcd101_defconfig
index 7f58f996d945..6f37f70c2c37 100644
--- a/arch/avr32/configs/atngw100mkii_evklcd101_defconfig
+++ b/arch/avr32/configs/atngw100mkii_evklcd101_defconfig
@@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y
2# CONFIG_LOCALVERSION_AUTO is not set 2# CONFIG_LOCALVERSION_AUTO is not set
3CONFIG_SYSVIPC=y 3CONFIG_SYSVIPC=y
4CONFIG_POSIX_MQUEUE=y 4CONFIG_POSIX_MQUEUE=y
5CONFIG_BSD_PROCESS_ACCT=y
6CONFIG_BSD_PROCESS_ACCT_V3=y
7CONFIG_LOG_BUF_SHIFT=14 5CONFIG_LOG_BUF_SHIFT=14
8CONFIG_SYSFS_DEPRECATED_V2=y 6CONFIG_RELAY=y
9CONFIG_BLK_DEV_INITRD=y 7CONFIG_BLK_DEV_INITRD=y
10# CONFIG_SYSCTL_SYSCALL is not set 8# CONFIG_SYSCTL_SYSCALL is not set
11# CONFIG_BASE_FULL is not set 9# CONFIG_BASE_FULL is not set
12# CONFIG_COMPAT_BRK is not set 10# CONFIG_COMPAT_BRK is not set
13CONFIG_PROFILING=y 11CONFIG_PROFILING=y
14CONFIG_OPROFILE=m 12CONFIG_OPROFILE=m
15CONFIG_KPROBES=y 13# CONFIG_KPROBES is not set
16CONFIG_MODULES=y 14CONFIG_MODULES=y
17CONFIG_MODULE_UNLOAD=y 15CONFIG_MODULE_UNLOAD=y
18CONFIG_MODULE_FORCE_UNLOAD=y
19# CONFIG_BLK_DEV_BSG is not set 16# CONFIG_BLK_DEV_BSG is not set
20# CONFIG_IOSCHED_DEADLINE is not set 17# CONFIG_IOSCHED_DEADLINE is not set
21CONFIG_NO_HZ=y 18CONFIG_NO_HZ=y
@@ -31,6 +28,7 @@ CONFIG_CPU_FREQ=y
31CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y 28CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
32CONFIG_CPU_FREQ_GOV_USERSPACE=y 29CONFIG_CPU_FREQ_GOV_USERSPACE=y
33CONFIG_CPU_FREQ_AT32AP=y 30CONFIG_CPU_FREQ_AT32AP=y
31CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
34CONFIG_NET=y 32CONFIG_NET=y
35CONFIG_PACKET=y 33CONFIG_PACKET=y
36CONFIG_UNIX=y 34CONFIG_UNIX=y
@@ -76,8 +74,10 @@ CONFIG_MTD_UBI=y
76CONFIG_BLK_DEV_LOOP=m 74CONFIG_BLK_DEV_LOOP=m
77CONFIG_BLK_DEV_NBD=m 75CONFIG_BLK_DEV_NBD=m
78CONFIG_BLK_DEV_RAM=m 76CONFIG_BLK_DEV_RAM=m
77CONFIG_MISC_DEVICES=y
79CONFIG_ATMEL_TCLIB=y 78CONFIG_ATMEL_TCLIB=y
80CONFIG_NETDEVICES=y 79CONFIG_NETDEVICES=y
80CONFIG_TUN=m
81CONFIG_NET_ETHERNET=y 81CONFIG_NET_ETHERNET=y
82CONFIG_MACB=y 82CONFIG_MACB=y
83# CONFIG_NETDEV_1000 is not set 83# CONFIG_NETDEV_1000 is not set
@@ -106,6 +106,7 @@ CONFIG_I2C_GPIO=m
106CONFIG_SPI=y 106CONFIG_SPI=y
107CONFIG_SPI_ATMEL=y 107CONFIG_SPI_ATMEL=y
108CONFIG_SPI_SPIDEV=m 108CONFIG_SPI_SPIDEV=m
109CONFIG_GPIO_SYSFS=y
109# CONFIG_HWMON is not set 110# CONFIG_HWMON is not set
110CONFIG_WATCHDOG=y 111CONFIG_WATCHDOG=y
111CONFIG_AT32AP700X_WDT=y 112CONFIG_AT32AP700X_WDT=y
@@ -129,6 +130,7 @@ CONFIG_USB_FILE_STORAGE=m
129CONFIG_USB_G_SERIAL=m 130CONFIG_USB_G_SERIAL=m
130CONFIG_USB_CDC_COMPOSITE=m 131CONFIG_USB_CDC_COMPOSITE=m
131CONFIG_MMC=y 132CONFIG_MMC=y
133CONFIG_MMC_TEST=m
132CONFIG_MMC_ATMELMCI=y 134CONFIG_MMC_ATMELMCI=y
133CONFIG_NEW_LEDS=y 135CONFIG_NEW_LEDS=y
134CONFIG_LEDS_CLASS=y 136CONFIG_LEDS_CLASS=y
@@ -143,11 +145,14 @@ CONFIG_EXT2_FS=y
143CONFIG_EXT3_FS=y 145CONFIG_EXT3_FS=y
144# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set 146# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
145# CONFIG_EXT3_FS_XATTR is not set 147# CONFIG_EXT3_FS_XATTR is not set
148CONFIG_EXT4_FS=y
149# CONFIG_EXT4_FS_XATTR is not set
146# CONFIG_DNOTIFY is not set 150# CONFIG_DNOTIFY is not set
147CONFIG_FUSE_FS=m 151CONFIG_FUSE_FS=m
148CONFIG_MSDOS_FS=m 152CONFIG_MSDOS_FS=m
149CONFIG_VFAT_FS=m 153CONFIG_VFAT_FS=m
150CONFIG_FAT_DEFAULT_CODEPAGE=850 154CONFIG_FAT_DEFAULT_CODEPAGE=850
155CONFIG_PROC_KCORE=y
151CONFIG_TMPFS=y 156CONFIG_TMPFS=y
152CONFIG_CONFIGFS_FS=y 157CONFIG_CONFIGFS_FS=y
153CONFIG_JFFS2_FS=y 158CONFIG_JFFS2_FS=y
@@ -157,7 +162,6 @@ CONFIG_NFS_V3=y
157CONFIG_ROOT_NFS=y 162CONFIG_ROOT_NFS=y
158CONFIG_NFSD=m 163CONFIG_NFSD=m
159CONFIG_NFSD_V3=y 164CONFIG_NFSD_V3=y
160CONFIG_SMB_FS=m
161CONFIG_CIFS=m 165CONFIG_CIFS=m
162CONFIG_NLS_CODEPAGE_437=m 166CONFIG_NLS_CODEPAGE_437=m
163CONFIG_NLS_CODEPAGE_850=m 167CONFIG_NLS_CODEPAGE_850=m
@@ -168,4 +172,3 @@ CONFIG_DEBUG_FS=y
168CONFIG_DEBUG_KERNEL=y 172CONFIG_DEBUG_KERNEL=y
169CONFIG_DETECT_HUNG_TASK=y 173CONFIG_DETECT_HUNG_TASK=y
170CONFIG_FRAME_POINTER=y 174CONFIG_FRAME_POINTER=y
171# CONFIG_RCU_CPU_STALL_DETECTOR is not set
diff --git a/arch/avr32/configs/atstk1002_defconfig b/arch/avr32/configs/atstk1002_defconfig
index aec4c43a75da..4fb01f5ab42f 100644
--- a/arch/avr32/configs/atstk1002_defconfig
+++ b/arch/avr32/configs/atstk1002_defconfig
@@ -3,7 +3,6 @@ CONFIG_EXPERIMENTAL=y
3CONFIG_SYSVIPC=y 3CONFIG_SYSVIPC=y
4CONFIG_POSIX_MQUEUE=y 4CONFIG_POSIX_MQUEUE=y
5CONFIG_LOG_BUF_SHIFT=14 5CONFIG_LOG_BUF_SHIFT=14
6CONFIG_SYSFS_DEPRECATED_V2=y
7CONFIG_RELAY=y 6CONFIG_RELAY=y
8CONFIG_BLK_DEV_INITRD=y 7CONFIG_BLK_DEV_INITRD=y
9# CONFIG_SYSCTL_SYSCALL is not set 8# CONFIG_SYSCTL_SYSCALL is not set
@@ -11,7 +10,7 @@ CONFIG_BLK_DEV_INITRD=y
11# CONFIG_COMPAT_BRK is not set 10# CONFIG_COMPAT_BRK is not set
12CONFIG_PROFILING=y 11CONFIG_PROFILING=y
13CONFIG_OPROFILE=m 12CONFIG_OPROFILE=m
14CONFIG_KPROBES=y 13# CONFIG_KPROBES is not set
15CONFIG_MODULES=y 14CONFIG_MODULES=y
16CONFIG_MODULE_UNLOAD=y 15CONFIG_MODULE_UNLOAD=y
17# CONFIG_BLK_DEV_BSG is not set 16# CONFIG_BLK_DEV_BSG is not set
@@ -26,6 +25,7 @@ CONFIG_CPU_FREQ=y
26CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y 25CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
27CONFIG_CPU_FREQ_GOV_USERSPACE=y 26CONFIG_CPU_FREQ_GOV_USERSPACE=y
28CONFIG_CPU_FREQ_AT32AP=y 27CONFIG_CPU_FREQ_AT32AP=y
28CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
29CONFIG_NET=y 29CONFIG_NET=y
30CONFIG_PACKET=y 30CONFIG_PACKET=y
31CONFIG_UNIX=y 31CONFIG_UNIX=y
@@ -35,6 +35,7 @@ CONFIG_INET=y
35CONFIG_IP_PNP=y 35CONFIG_IP_PNP=y
36CONFIG_IP_PNP_DHCP=y 36CONFIG_IP_PNP_DHCP=y
37CONFIG_NET_IPIP=m 37CONFIG_NET_IPIP=m
38CONFIG_NET_IPGRE_DEMUX=m
38CONFIG_NET_IPGRE=m 39CONFIG_NET_IPGRE=m
39CONFIG_INET_AH=m 40CONFIG_INET_AH=m
40CONFIG_INET_ESP=m 41CONFIG_INET_ESP=m
@@ -58,16 +59,14 @@ CONFIG_MTD_BLOCK=y
58CONFIG_MTD_CFI=y 59CONFIG_MTD_CFI=y
59CONFIG_MTD_CFI_AMDSTD=y 60CONFIG_MTD_CFI_AMDSTD=y
60CONFIG_MTD_PHYSMAP=y 61CONFIG_MTD_PHYSMAP=y
61CONFIG_MTD_DATAFLASH=m
62CONFIG_MTD_M25P80=m
63CONFIG_MTD_UBI=y 62CONFIG_MTD_UBI=y
64CONFIG_BLK_DEV_LOOP=m 63CONFIG_BLK_DEV_LOOP=m
65CONFIG_BLK_DEV_NBD=m 64CONFIG_BLK_DEV_NBD=m
66CONFIG_BLK_DEV_RAM=m 65CONFIG_BLK_DEV_RAM=m
66CONFIG_MISC_DEVICES=y
67CONFIG_ATMEL_PWM=m 67CONFIG_ATMEL_PWM=m
68CONFIG_ATMEL_TCLIB=y 68CONFIG_ATMEL_TCLIB=y
69CONFIG_ATMEL_SSC=m 69CONFIG_ATMEL_SSC=m
70CONFIG_EEPROM_AT24=m
71# CONFIG_SCSI_PROC_FS is not set 70# CONFIG_SCSI_PROC_FS is not set
72CONFIG_BLK_DEV_SD=m 71CONFIG_BLK_DEV_SD=m
73CONFIG_BLK_DEV_SR=m 72CONFIG_BLK_DEV_SR=m
@@ -120,7 +119,6 @@ CONFIG_SND_MIXER_OSS=m
120CONFIG_SND_PCM_OSS=m 119CONFIG_SND_PCM_OSS=m
121# CONFIG_SND_SUPPORT_OLD_API is not set 120# CONFIG_SND_SUPPORT_OLD_API is not set
122# CONFIG_SND_VERBOSE_PROCFS is not set 121# CONFIG_SND_VERBOSE_PROCFS is not set
123# CONFIG_SND_DRIVERS is not set
124CONFIG_SND_AT73C213=m 122CONFIG_SND_AT73C213=m
125# CONFIG_HID_SUPPORT is not set 123# CONFIG_HID_SUPPORT is not set
126CONFIG_USB_GADGET=y 124CONFIG_USB_GADGET=y
@@ -131,16 +129,15 @@ CONFIG_USB_FILE_STORAGE=m
131CONFIG_USB_G_SERIAL=m 129CONFIG_USB_G_SERIAL=m
132CONFIG_USB_CDC_COMPOSITE=m 130CONFIG_USB_CDC_COMPOSITE=m
133CONFIG_MMC=y 131CONFIG_MMC=y
132CONFIG_MMC_TEST=m
134CONFIG_MMC_ATMELMCI=y 133CONFIG_MMC_ATMELMCI=y
135CONFIG_MMC_SPI=m
136CONFIG_NEW_LEDS=y 134CONFIG_NEW_LEDS=y
137CONFIG_LEDS_CLASS=m 135CONFIG_LEDS_CLASS=y
138CONFIG_LEDS_ATMEL_PWM=m 136CONFIG_LEDS_ATMEL_PWM=m
139CONFIG_LEDS_GPIO=m 137CONFIG_LEDS_GPIO=m
140CONFIG_LEDS_TRIGGERS=y 138CONFIG_LEDS_TRIGGERS=y
141CONFIG_LEDS_TRIGGER_TIMER=m 139CONFIG_LEDS_TRIGGER_TIMER=m
142CONFIG_LEDS_TRIGGER_HEARTBEAT=m 140CONFIG_LEDS_TRIGGER_HEARTBEAT=m
143CONFIG_LEDS_TRIGGER_DEFAULT_ON=m
144CONFIG_RTC_CLASS=y 141CONFIG_RTC_CLASS=y
145CONFIG_RTC_DRV_AT32AP700X=y 142CONFIG_RTC_DRV_AT32AP700X=y
146CONFIG_DMADEVICES=y 143CONFIG_DMADEVICES=y
@@ -149,20 +146,23 @@ CONFIG_EXT3_FS=y
149# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set 146# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
150# CONFIG_EXT3_FS_XATTR is not set 147# CONFIG_EXT3_FS_XATTR is not set
151CONFIG_EXT4_FS=y 148CONFIG_EXT4_FS=y
149# CONFIG_EXT4_FS_XATTR is not set
152# CONFIG_DNOTIFY is not set 150# CONFIG_DNOTIFY is not set
153CONFIG_FUSE_FS=m 151CONFIG_FUSE_FS=m
154CONFIG_MSDOS_FS=m 152CONFIG_MSDOS_FS=m
155CONFIG_VFAT_FS=m 153CONFIG_VFAT_FS=m
154CONFIG_FAT_DEFAULT_CODEPAGE=850
156CONFIG_PROC_KCORE=y 155CONFIG_PROC_KCORE=y
157CONFIG_TMPFS=y 156CONFIG_TMPFS=y
157CONFIG_CONFIGFS_FS=y
158CONFIG_JFFS2_FS=y 158CONFIG_JFFS2_FS=y
159# CONFIG_JFFS2_FS_WRITEBUFFER is not set
160CONFIG_UBIFS_FS=y 159CONFIG_UBIFS_FS=y
161CONFIG_MINIX_FS=m
162CONFIG_NFS_FS=y 160CONFIG_NFS_FS=y
163CONFIG_NFS_V3=y 161CONFIG_NFS_V3=y
164CONFIG_ROOT_NFS=y 162CONFIG_ROOT_NFS=y
163CONFIG_CIFS=m
165CONFIG_NLS_CODEPAGE_437=m 164CONFIG_NLS_CODEPAGE_437=m
165CONFIG_NLS_CODEPAGE_850=m
166CONFIG_NLS_ISO8859_1=m 166CONFIG_NLS_ISO8859_1=m
167CONFIG_NLS_UTF8=m 167CONFIG_NLS_UTF8=m
168CONFIG_MAGIC_SYSRQ=y 168CONFIG_MAGIC_SYSRQ=y
@@ -170,6 +170,3 @@ CONFIG_DEBUG_FS=y
170CONFIG_DEBUG_KERNEL=y 170CONFIG_DEBUG_KERNEL=y
171CONFIG_DETECT_HUNG_TASK=y 171CONFIG_DETECT_HUNG_TASK=y
172CONFIG_FRAME_POINTER=y 172CONFIG_FRAME_POINTER=y
173# CONFIG_RCU_CPU_STALL_DETECTOR is not set
174# CONFIG_CRYPTO_HW is not set
175CONFIG_CRC_T10DIF=m
diff --git a/arch/avr32/configs/atstk1003_defconfig b/arch/avr32/configs/atstk1003_defconfig
index 50ba3db682ca..9faaf9b900f2 100644
--- a/arch/avr32/configs/atstk1003_defconfig
+++ b/arch/avr32/configs/atstk1003_defconfig
@@ -2,22 +2,15 @@ CONFIG_EXPERIMENTAL=y
2# CONFIG_LOCALVERSION_AUTO is not set 2# CONFIG_LOCALVERSION_AUTO is not set
3CONFIG_SYSVIPC=y 3CONFIG_SYSVIPC=y
4CONFIG_POSIX_MQUEUE=y 4CONFIG_POSIX_MQUEUE=y
5CONFIG_BSD_PROCESS_ACCT=y
6CONFIG_BSD_PROCESS_ACCT_V3=y
7CONFIG_TASKSTATS=y
8CONFIG_TASK_DELAY_ACCT=y
9CONFIG_AUDIT=y
10CONFIG_LOG_BUF_SHIFT=14 5CONFIG_LOG_BUF_SHIFT=14
11CONFIG_SYSFS_DEPRECATED_V2=y
12CONFIG_RELAY=y 6CONFIG_RELAY=y
13CONFIG_BLK_DEV_INITRD=y 7CONFIG_BLK_DEV_INITRD=y
14# CONFIG_SYSCTL_SYSCALL is not set 8# CONFIG_SYSCTL_SYSCALL is not set
15# CONFIG_BASE_FULL is not set 9# CONFIG_BASE_FULL is not set
16# CONFIG_SLUB_DEBUG is not set
17# CONFIG_COMPAT_BRK is not set 10# CONFIG_COMPAT_BRK is not set
18CONFIG_PROFILING=y 11CONFIG_PROFILING=y
19CONFIG_OPROFILE=m 12CONFIG_OPROFILE=m
20CONFIG_KPROBES=y 13# CONFIG_KPROBES is not set
21CONFIG_MODULES=y 14CONFIG_MODULES=y
22CONFIG_MODULE_UNLOAD=y 15CONFIG_MODULE_UNLOAD=y
23# CONFIG_BLK_DEV_BSG is not set 16# CONFIG_BLK_DEV_BSG is not set
@@ -33,6 +26,7 @@ CONFIG_CPU_FREQ=y
33CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y 26CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
34CONFIG_CPU_FREQ_GOV_USERSPACE=y 27CONFIG_CPU_FREQ_GOV_USERSPACE=y
35CONFIG_CPU_FREQ_AT32AP=y 28CONFIG_CPU_FREQ_AT32AP=y
29CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
36CONFIG_NET=y 30CONFIG_NET=y
37CONFIG_PACKET=y 31CONFIG_PACKET=y
38CONFIG_UNIX=y 32CONFIG_UNIX=y
@@ -54,18 +48,18 @@ CONFIG_MTD_BLOCK=y
54CONFIG_MTD_CFI=y 48CONFIG_MTD_CFI=y
55CONFIG_MTD_CFI_AMDSTD=y 49CONFIG_MTD_CFI_AMDSTD=y
56CONFIG_MTD_PHYSMAP=y 50CONFIG_MTD_PHYSMAP=y
57CONFIG_MTD_DATAFLASH=m 51CONFIG_MTD_UBI=y
58CONFIG_MTD_M25P80=m
59CONFIG_BLK_DEV_LOOP=m 52CONFIG_BLK_DEV_LOOP=m
60CONFIG_BLK_DEV_NBD=m 53CONFIG_BLK_DEV_NBD=m
61CONFIG_BLK_DEV_RAM=m 54CONFIG_BLK_DEV_RAM=m
55CONFIG_MISC_DEVICES=y
62CONFIG_ATMEL_PWM=m 56CONFIG_ATMEL_PWM=m
63CONFIG_ATMEL_TCLIB=y 57CONFIG_ATMEL_TCLIB=y
64CONFIG_ATMEL_SSC=m 58CONFIG_ATMEL_SSC=m
65CONFIG_EEPROM_AT24=m
66# CONFIG_SCSI_PROC_FS is not set 59# CONFIG_SCSI_PROC_FS is not set
67CONFIG_BLK_DEV_SD=m 60CONFIG_BLK_DEV_SD=m
68CONFIG_BLK_DEV_SR=m 61CONFIG_BLK_DEV_SR=m
62# CONFIG_SCSI_LOWLEVEL is not set
69CONFIG_ATA=m 63CONFIG_ATA=m
70# CONFIG_SATA_PMP is not set 64# CONFIG_SATA_PMP is not set
71CONFIG_PATA_AT32=m 65CONFIG_PATA_AT32=m
@@ -77,6 +71,7 @@ CONFIG_PPP_ASYNC=m
77CONFIG_PPP_DEFLATE=m 71CONFIG_PPP_DEFLATE=m
78CONFIG_PPP_BSDCOMP=m 72CONFIG_PPP_BSDCOMP=m
79CONFIG_INPUT=m 73CONFIG_INPUT=m
74CONFIG_INPUT_EVDEV=m
80# CONFIG_KEYBOARD_ATKBD is not set 75# CONFIG_KEYBOARD_ATKBD is not set
81CONFIG_KEYBOARD_GPIO=m 76CONFIG_KEYBOARD_GPIO=m
82# CONFIG_MOUSE_PS2 is not set 77# CONFIG_MOUSE_PS2 is not set
@@ -106,7 +101,6 @@ CONFIG_SND_PCM_OSS=m
106CONFIG_SND_AT73C213=m 101CONFIG_SND_AT73C213=m
107# CONFIG_HID_SUPPORT is not set 102# CONFIG_HID_SUPPORT is not set
108CONFIG_USB_GADGET=y 103CONFIG_USB_GADGET=y
109CONFIG_USB_GADGET_DEBUG_FS=y
110CONFIG_USB_ZERO=m 104CONFIG_USB_ZERO=m
111CONFIG_USB_ETH=m 105CONFIG_USB_ETH=m
112CONFIG_USB_GADGETFS=m 106CONFIG_USB_GADGETFS=m
@@ -116,36 +110,39 @@ CONFIG_USB_CDC_COMPOSITE=m
116CONFIG_MMC=y 110CONFIG_MMC=y
117CONFIG_MMC_TEST=m 111CONFIG_MMC_TEST=m
118CONFIG_MMC_ATMELMCI=y 112CONFIG_MMC_ATMELMCI=y
119CONFIG_MMC_SPI=m
120CONFIG_NEW_LEDS=y 113CONFIG_NEW_LEDS=y
121CONFIG_LEDS_CLASS=y 114CONFIG_LEDS_CLASS=y
122CONFIG_LEDS_ATMEL_PWM=m 115CONFIG_LEDS_ATMEL_PWM=m
123CONFIG_LEDS_GPIO=y 116CONFIG_LEDS_GPIO=m
124CONFIG_LEDS_TRIGGERS=y 117CONFIG_LEDS_TRIGGERS=y
125CONFIG_LEDS_TRIGGER_TIMER=y 118CONFIG_LEDS_TRIGGER_TIMER=m
126CONFIG_LEDS_TRIGGER_HEARTBEAT=y 119CONFIG_LEDS_TRIGGER_HEARTBEAT=m
127CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
128CONFIG_RTC_CLASS=y 120CONFIG_RTC_CLASS=y
129CONFIG_RTC_DRV_AT32AP700X=y 121CONFIG_RTC_DRV_AT32AP700X=y
130CONFIG_DMADEVICES=y 122CONFIG_DMADEVICES=y
131CONFIG_DW_DMAC=y 123CONFIG_EXT2_FS=y
132CONFIG_EXT2_FS=m 124CONFIG_EXT3_FS=y
133CONFIG_EXT3_FS=m 125# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
134# CONFIG_EXT3_FS_XATTR is not set 126# CONFIG_EXT3_FS_XATTR is not set
127CONFIG_EXT4_FS=y
128# CONFIG_EXT4_FS_XATTR is not set
135# CONFIG_DNOTIFY is not set 129# CONFIG_DNOTIFY is not set
136CONFIG_FUSE_FS=m 130CONFIG_FUSE_FS=m
137CONFIG_MSDOS_FS=m 131CONFIG_MSDOS_FS=m
138CONFIG_VFAT_FS=m 132CONFIG_VFAT_FS=m
133CONFIG_FAT_DEFAULT_CODEPAGE=850
139CONFIG_PROC_KCORE=y 134CONFIG_PROC_KCORE=y
140CONFIG_TMPFS=y 135CONFIG_TMPFS=y
141CONFIG_CONFIGFS_FS=m 136CONFIG_CONFIGFS_FS=y
142CONFIG_JFFS2_FS=y 137CONFIG_JFFS2_FS=y
138CONFIG_UBIFS_FS=y
143# CONFIG_NETWORK_FILESYSTEMS is not set 139# CONFIG_NETWORK_FILESYSTEMS is not set
144CONFIG_NLS_CODEPAGE_437=m 140CONFIG_NLS_CODEPAGE_437=m
141CONFIG_NLS_CODEPAGE_850=m
145CONFIG_NLS_ISO8859_1=m 142CONFIG_NLS_ISO8859_1=m
146CONFIG_NLS_UTF8=m 143CONFIG_NLS_UTF8=m
147CONFIG_MAGIC_SYSRQ=y 144CONFIG_MAGIC_SYSRQ=y
148CONFIG_DEBUG_FS=y 145CONFIG_DEBUG_FS=y
149CONFIG_DEBUG_KERNEL=y 146CONFIG_DEBUG_KERNEL=y
147CONFIG_DETECT_HUNG_TASK=y
150CONFIG_FRAME_POINTER=y 148CONFIG_FRAME_POINTER=y
151CONFIG_CRC_T10DIF=m
diff --git a/arch/avr32/configs/atstk1004_defconfig b/arch/avr32/configs/atstk1004_defconfig
index 329e10ba3b54..3d2a5d85f970 100644
--- a/arch/avr32/configs/atstk1004_defconfig
+++ b/arch/avr32/configs/atstk1004_defconfig
@@ -1,19 +1,32 @@
1CONFIG_EXPERIMENTAL=y 1CONFIG_EXPERIMENTAL=y
2# CONFIG_LOCALVERSION_AUTO is not set 2# CONFIG_LOCALVERSION_AUTO is not set
3CONFIG_SYSVIPC=y
4CONFIG_POSIX_MQUEUE=y
3CONFIG_LOG_BUF_SHIFT=14 5CONFIG_LOG_BUF_SHIFT=14
4CONFIG_SYSFS_DEPRECATED_V2=y 6CONFIG_RELAY=y
7CONFIG_BLK_DEV_INITRD=y
5# CONFIG_SYSCTL_SYSCALL is not set 8# CONFIG_SYSCTL_SYSCALL is not set
6# CONFIG_BASE_FULL is not set 9# CONFIG_BASE_FULL is not set
7# CONFIG_FUTEX is not set
8# CONFIG_EPOLL is not set
9# CONFIG_SIGNALFD is not set
10# CONFIG_TIMERFD is not set
11# CONFIG_EVENTFD is not set
12# CONFIG_COMPAT_BRK is not set 10# CONFIG_COMPAT_BRK is not set
13CONFIG_SLOB=y 11CONFIG_PROFILING=y
14# CONFIG_BLOCK is not set 12CONFIG_OPROFILE=m
13# CONFIG_KPROBES is not set
14CONFIG_MODULES=y
15CONFIG_MODULE_UNLOAD=y
16# CONFIG_BLK_DEV_BSG is not set
17# CONFIG_IOSCHED_DEADLINE is not set
18CONFIG_NO_HZ=y
19CONFIG_HIGH_RES_TIMERS=y
15CONFIG_BOARD_ATSTK1004=y 20CONFIG_BOARD_ATSTK1004=y
16# CONFIG_OWNERSHIP_TRACE is not set 21# CONFIG_OWNERSHIP_TRACE is not set
22CONFIG_NMI_DEBUGGING=y
23CONFIG_PM=y
24CONFIG_CPU_FREQ=y
25# CONFIG_CPU_FREQ_STAT is not set
26CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
27CONFIG_CPU_FREQ_GOV_USERSPACE=y
28CONFIG_CPU_FREQ_AT32AP=y
29CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
17CONFIG_NET=y 30CONFIG_NET=y
18CONFIG_PACKET=y 31CONFIG_PACKET=y
19CONFIG_UNIX=y 32CONFIG_UNIX=y
@@ -31,40 +44,104 @@ CONFIG_MTD=y
31CONFIG_MTD_PARTITIONS=y 44CONFIG_MTD_PARTITIONS=y
32CONFIG_MTD_CMDLINE_PARTS=y 45CONFIG_MTD_CMDLINE_PARTS=y
33CONFIG_MTD_CHAR=y 46CONFIG_MTD_CHAR=y
47CONFIG_MTD_BLOCK=y
34CONFIG_MTD_CFI=y 48CONFIG_MTD_CFI=y
35CONFIG_MTD_CFI_AMDSTD=y 49CONFIG_MTD_CFI_AMDSTD=y
36CONFIG_MTD_PHYSMAP=y 50CONFIG_MTD_PHYSMAP=y
37# CONFIG_MISC_DEVICES is not set 51CONFIG_MTD_UBI=y
38# CONFIG_INPUT is not set 52CONFIG_BLK_DEV_LOOP=m
53CONFIG_BLK_DEV_NBD=m
54CONFIG_BLK_DEV_RAM=m
55CONFIG_MISC_DEVICES=y
56CONFIG_ATMEL_PWM=m
57CONFIG_ATMEL_TCLIB=y
58CONFIG_ATMEL_SSC=m
59# CONFIG_SCSI_PROC_FS is not set
60CONFIG_BLK_DEV_SD=m
61CONFIG_BLK_DEV_SR=m
62# CONFIG_SCSI_LOWLEVEL is not set
63CONFIG_ATA=m
64# CONFIG_SATA_PMP is not set
65CONFIG_PATA_AT32=m
66CONFIG_NETDEVICES=y
67# CONFIG_NETDEV_1000 is not set
68# CONFIG_NETDEV_10000 is not set
69CONFIG_PPP=m
70CONFIG_PPP_ASYNC=m
71CONFIG_PPP_DEFLATE=m
72CONFIG_PPP_BSDCOMP=m
73CONFIG_INPUT=m
74CONFIG_INPUT_EVDEV=m
75# CONFIG_KEYBOARD_ATKBD is not set
76CONFIG_KEYBOARD_GPIO=m
77# CONFIG_MOUSE_PS2 is not set
78CONFIG_MOUSE_GPIO=m
39# CONFIG_SERIO is not set 79# CONFIG_SERIO is not set
40# CONFIG_VT is not set 80# CONFIG_VT is not set
41# CONFIG_DEVKMEM is not set 81# CONFIG_DEVKMEM is not set
42CONFIG_SERIAL_ATMEL=y 82CONFIG_SERIAL_ATMEL=y
43CONFIG_SERIAL_ATMEL_CONSOLE=y 83CONFIG_SERIAL_ATMEL_CONSOLE=y
44# CONFIG_SERIAL_ATMEL_PDC is not set
45# CONFIG_LEGACY_PTYS is not set 84# CONFIG_LEGACY_PTYS is not set
46# CONFIG_HW_RANDOM is not set 85# CONFIG_HW_RANDOM is not set
86CONFIG_I2C=m
87CONFIG_I2C_CHARDEV=m
88CONFIG_I2C_GPIO=m
47CONFIG_SPI=y 89CONFIG_SPI=y
48CONFIG_SPI_ATMEL=y 90CONFIG_SPI_ATMEL=y
91CONFIG_SPI_SPIDEV=m
92CONFIG_GPIO_SYSFS=y
49# CONFIG_HWMON is not set 93# CONFIG_HWMON is not set
50CONFIG_WATCHDOG=y 94CONFIG_WATCHDOG=y
51CONFIG_AT32AP700X_WDT=y 95CONFIG_AT32AP700X_WDT=y
52CONFIG_FB=y 96CONFIG_FB=y
53CONFIG_FB_ATMEL=y 97CONFIG_FB_ATMEL=y
54CONFIG_BACKLIGHT_LCD_SUPPORT=y 98CONFIG_BACKLIGHT_LCD_SUPPORT=y
99CONFIG_LCD_CLASS_DEVICE=y
55CONFIG_LCD_LTV350QV=y 100CONFIG_LCD_LTV350QV=y
56# CONFIG_BACKLIGHT_CLASS_DEVICE is not set 101# CONFIG_BACKLIGHT_CLASS_DEVICE is not set
57CONFIG_USB_GADGET=y 102CONFIG_USB_GADGET=y
58CONFIG_USB_ETH=y 103CONFIG_USB_ZERO=m
59# CONFIG_USB_ETH_RNDIS is not set 104CONFIG_USB_ETH=m
105CONFIG_USB_GADGETFS=m
106CONFIG_USB_FILE_STORAGE=m
107CONFIG_USB_G_SERIAL=m
108CONFIG_USB_CDC_COMPOSITE=m
109CONFIG_MMC=y
110CONFIG_MMC_TEST=m
111CONFIG_MMC_ATMELMCI=y
112CONFIG_NEW_LEDS=y
113CONFIG_LEDS_CLASS=y
114CONFIG_LEDS_ATMEL_PWM=m
115CONFIG_LEDS_GPIO=m
116CONFIG_LEDS_TRIGGERS=y
117CONFIG_LEDS_TRIGGER_TIMER=m
118CONFIG_LEDS_TRIGGER_HEARTBEAT=m
60CONFIG_RTC_CLASS=y 119CONFIG_RTC_CLASS=y
61# CONFIG_RTC_INTF_PROC is not set
62CONFIG_RTC_DRV_AT32AP700X=y 120CONFIG_RTC_DRV_AT32AP700X=y
121CONFIG_DMADEVICES=y
122CONFIG_EXT2_FS=y
123CONFIG_EXT3_FS=y
124# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
125# CONFIG_EXT3_FS_XATTR is not set
126CONFIG_EXT4_FS=y
127# CONFIG_EXT4_FS_XATTR is not set
63# CONFIG_DNOTIFY is not set 128# CONFIG_DNOTIFY is not set
129CONFIG_FUSE_FS=m
130CONFIG_MSDOS_FS=m
131CONFIG_VFAT_FS=m
132CONFIG_FAT_DEFAULT_CODEPAGE=850
64CONFIG_PROC_KCORE=y 133CONFIG_PROC_KCORE=y
65# CONFIG_PROC_PAGE_MONITOR is not set
66CONFIG_TMPFS=y 134CONFIG_TMPFS=y
135CONFIG_CONFIGFS_FS=y
67CONFIG_JFFS2_FS=y 136CONFIG_JFFS2_FS=y
68# CONFIG_JFFS2_FS_WRITEBUFFER is not set 137CONFIG_UBIFS_FS=y
69# CONFIG_NETWORK_FILESYSTEMS is not set 138# CONFIG_NETWORK_FILESYSTEMS is not set
139CONFIG_NLS_CODEPAGE_437=m
140CONFIG_NLS_CODEPAGE_850=m
141CONFIG_NLS_ISO8859_1=m
142CONFIG_NLS_UTF8=m
70CONFIG_MAGIC_SYSRQ=y 143CONFIG_MAGIC_SYSRQ=y
144CONFIG_DEBUG_FS=y
145CONFIG_DEBUG_KERNEL=y
146CONFIG_DETECT_HUNG_TASK=y
147CONFIG_FRAME_POINTER=y
diff --git a/arch/avr32/configs/atstk1006_defconfig b/arch/avr32/configs/atstk1006_defconfig
index dbcc1b51e506..1ed8f22d4fe2 100644
--- a/arch/avr32/configs/atstk1006_defconfig
+++ b/arch/avr32/configs/atstk1006_defconfig
@@ -3,7 +3,6 @@ CONFIG_EXPERIMENTAL=y
3CONFIG_SYSVIPC=y 3CONFIG_SYSVIPC=y
4CONFIG_POSIX_MQUEUE=y 4CONFIG_POSIX_MQUEUE=y
5CONFIG_LOG_BUF_SHIFT=14 5CONFIG_LOG_BUF_SHIFT=14
6CONFIG_SYSFS_DEPRECATED_V2=y
7CONFIG_RELAY=y 6CONFIG_RELAY=y
8CONFIG_BLK_DEV_INITRD=y 7CONFIG_BLK_DEV_INITRD=y
9# CONFIG_SYSCTL_SYSCALL is not set 8# CONFIG_SYSCTL_SYSCALL is not set
@@ -11,7 +10,7 @@ CONFIG_BLK_DEV_INITRD=y
11# CONFIG_COMPAT_BRK is not set 10# CONFIG_COMPAT_BRK is not set
12CONFIG_PROFILING=y 11CONFIG_PROFILING=y
13CONFIG_OPROFILE=m 12CONFIG_OPROFILE=m
14CONFIG_KPROBES=y 13# CONFIG_KPROBES is not set
15CONFIG_MODULES=y 14CONFIG_MODULES=y
16CONFIG_MODULE_UNLOAD=y 15CONFIG_MODULE_UNLOAD=y
17# CONFIG_BLK_DEV_BSG is not set 16# CONFIG_BLK_DEV_BSG is not set
@@ -37,6 +36,7 @@ CONFIG_INET=y
37CONFIG_IP_PNP=y 36CONFIG_IP_PNP=y
38CONFIG_IP_PNP_DHCP=y 37CONFIG_IP_PNP_DHCP=y
39CONFIG_NET_IPIP=m 38CONFIG_NET_IPIP=m
39CONFIG_NET_IPGRE_DEMUX=m
40CONFIG_NET_IPGRE=m 40CONFIG_NET_IPGRE=m
41CONFIG_INET_AH=m 41CONFIG_INET_AH=m
42CONFIG_INET_ESP=m 42CONFIG_INET_ESP=m
@@ -60,15 +60,13 @@ CONFIG_MTD_BLOCK=y
60CONFIG_MTD_CFI=y 60CONFIG_MTD_CFI=y
61CONFIG_MTD_CFI_AMDSTD=y 61CONFIG_MTD_CFI_AMDSTD=y
62CONFIG_MTD_PHYSMAP=y 62CONFIG_MTD_PHYSMAP=y
63CONFIG_MTD_DATAFLASH=m
64CONFIG_MTD_DATAFLASH_OTP=y
65CONFIG_MTD_M25P80=m
66CONFIG_MTD_NAND=y 63CONFIG_MTD_NAND=y
67CONFIG_MTD_NAND_ATMEL=y 64CONFIG_MTD_NAND_ATMEL=y
68CONFIG_MTD_UBI=y 65CONFIG_MTD_UBI=y
69CONFIG_BLK_DEV_LOOP=m 66CONFIG_BLK_DEV_LOOP=m
70CONFIG_BLK_DEV_NBD=m 67CONFIG_BLK_DEV_NBD=m
71CONFIG_BLK_DEV_RAM=m 68CONFIG_BLK_DEV_RAM=m
69CONFIG_MISC_DEVICES=y
72CONFIG_ATMEL_PWM=m 70CONFIG_ATMEL_PWM=m
73CONFIG_ATMEL_TCLIB=y 71CONFIG_ATMEL_TCLIB=y
74CONFIG_ATMEL_SSC=m 72CONFIG_ATMEL_SSC=m
@@ -132,17 +130,17 @@ CONFIG_USB_ETH=m
132CONFIG_USB_GADGETFS=m 130CONFIG_USB_GADGETFS=m
133CONFIG_USB_FILE_STORAGE=m 131CONFIG_USB_FILE_STORAGE=m
134CONFIG_USB_G_SERIAL=m 132CONFIG_USB_G_SERIAL=m
133CONFIG_USB_CDC_COMPOSITE=m
135CONFIG_MMC=y 134CONFIG_MMC=y
135CONFIG_MMC_TEST=m
136CONFIG_MMC_ATMELMCI=y 136CONFIG_MMC_ATMELMCI=y
137CONFIG_MMC_SPI=m
138CONFIG_NEW_LEDS=y 137CONFIG_NEW_LEDS=y
139CONFIG_LEDS_CLASS=m 138CONFIG_LEDS_CLASS=y
140CONFIG_LEDS_ATMEL_PWM=m 139CONFIG_LEDS_ATMEL_PWM=m
141CONFIG_LEDS_GPIO=m 140CONFIG_LEDS_GPIO=m
142CONFIG_LEDS_TRIGGERS=y 141CONFIG_LEDS_TRIGGERS=y
143CONFIG_LEDS_TRIGGER_TIMER=m 142CONFIG_LEDS_TRIGGER_TIMER=m
144CONFIG_LEDS_TRIGGER_HEARTBEAT=m 143CONFIG_LEDS_TRIGGER_HEARTBEAT=m
145CONFIG_LEDS_TRIGGER_DEFAULT_ON=m
146CONFIG_RTC_CLASS=y 144CONFIG_RTC_CLASS=y
147CONFIG_RTC_DRV_AT32AP700X=y 145CONFIG_RTC_DRV_AT32AP700X=y
148CONFIG_DMADEVICES=y 146CONFIG_DMADEVICES=y
@@ -156,15 +154,18 @@ CONFIG_EXT4_FS=y
156CONFIG_FUSE_FS=m 154CONFIG_FUSE_FS=m
157CONFIG_MSDOS_FS=m 155CONFIG_MSDOS_FS=m
158CONFIG_VFAT_FS=m 156CONFIG_VFAT_FS=m
157CONFIG_FAT_DEFAULT_CODEPAGE=850
159CONFIG_PROC_KCORE=y 158CONFIG_PROC_KCORE=y
160CONFIG_TMPFS=y 159CONFIG_TMPFS=y
160CONFIG_CONFIGFS_FS=y
161CONFIG_JFFS2_FS=y 161CONFIG_JFFS2_FS=y
162CONFIG_UBIFS_FS=y 162CONFIG_UBIFS_FS=y
163CONFIG_MINIX_FS=m
164CONFIG_NFS_FS=y 163CONFIG_NFS_FS=y
165CONFIG_NFS_V3=y 164CONFIG_NFS_V3=y
166CONFIG_ROOT_NFS=y 165CONFIG_ROOT_NFS=y
166CONFIG_CIFS=m
167CONFIG_NLS_CODEPAGE_437=m 167CONFIG_NLS_CODEPAGE_437=m
168CONFIG_NLS_CODEPAGE_850=m
168CONFIG_NLS_ISO8859_1=m 169CONFIG_NLS_ISO8859_1=m
169CONFIG_NLS_UTF8=m 170CONFIG_NLS_UTF8=m
170CONFIG_MAGIC_SYSRQ=y 171CONFIG_MAGIC_SYSRQ=y
@@ -172,7 +173,3 @@ CONFIG_DEBUG_FS=y
172CONFIG_DEBUG_KERNEL=y 173CONFIG_DEBUG_KERNEL=y
173CONFIG_DETECT_HUNG_TASK=y 174CONFIG_DETECT_HUNG_TASK=y
174CONFIG_FRAME_POINTER=y 175CONFIG_FRAME_POINTER=y
175# CONFIG_RCU_CPU_STALL_DETECTOR is not set
176CONFIG_CRYPTO_FIPS=y
177# CONFIG_CRYPTO_HW is not set
178CONFIG_CRC_T10DIF=m
diff --git a/arch/avr32/configs/favr-32_defconfig b/arch/avr32/configs/favr-32_defconfig
index 0c813b661a0a..aeadc955db32 100644
--- a/arch/avr32/configs/favr-32_defconfig
+++ b/arch/avr32/configs/favr-32_defconfig
@@ -11,7 +11,7 @@ CONFIG_BLK_DEV_INITRD=y
11# CONFIG_COMPAT_BRK is not set 11# CONFIG_COMPAT_BRK is not set
12CONFIG_PROFILING=y 12CONFIG_PROFILING=y
13CONFIG_OPROFILE=m 13CONFIG_OPROFILE=m
14CONFIG_KPROBES=y 14# CONFIG_KPROBES is not set
15CONFIG_MODULES=y 15CONFIG_MODULES=y
16CONFIG_MODULE_UNLOAD=y 16CONFIG_MODULE_UNLOAD=y
17# CONFIG_BLK_DEV_BSG is not set 17# CONFIG_BLK_DEV_BSG is not set
diff --git a/arch/avr32/configs/hammerhead_defconfig b/arch/avr32/configs/hammerhead_defconfig
index dcc01f0eb294..1692beeb7ed3 100644
--- a/arch/avr32/configs/hammerhead_defconfig
+++ b/arch/avr32/configs/hammerhead_defconfig
@@ -12,7 +12,7 @@ CONFIG_BLK_DEV_INITRD=y
12# CONFIG_COMPAT_BRK is not set 12# CONFIG_COMPAT_BRK is not set
13CONFIG_PROFILING=y 13CONFIG_PROFILING=y
14CONFIG_OPROFILE=m 14CONFIG_OPROFILE=m
15CONFIG_KPROBES=y 15# CONFIG_KPROBES is not set
16CONFIG_MODULES=y 16CONFIG_MODULES=y
17CONFIG_MODULE_UNLOAD=y 17CONFIG_MODULE_UNLOAD=y
18CONFIG_MODULE_FORCE_UNLOAD=y 18CONFIG_MODULE_FORCE_UNLOAD=y
diff --git a/arch/avr32/include/asm/syscalls.h b/arch/avr32/include/asm/syscalls.h
index ab608b70b24d..244f2acab546 100644
--- a/arch/avr32/include/asm/syscalls.h
+++ b/arch/avr32/include/asm/syscalls.h
@@ -15,20 +15,6 @@
15#include <linux/types.h> 15#include <linux/types.h>
16#include <linux/signal.h> 16#include <linux/signal.h>
17 17
18/* kernel/process.c */
19asmlinkage int sys_fork(struct pt_regs *);
20asmlinkage int sys_clone(unsigned long, unsigned long,
21 unsigned long, unsigned long,
22 struct pt_regs *);
23asmlinkage int sys_vfork(struct pt_regs *);
24asmlinkage int sys_execve(const char __user *, char __user *__user *,
25 char __user *__user *, struct pt_regs *);
26
27/* kernel/signal.c */
28asmlinkage int sys_sigaltstack(const stack_t __user *, stack_t __user *,
29 struct pt_regs *);
30asmlinkage int sys_rt_sigreturn(struct pt_regs *);
31
32/* mm/cache.c */ 18/* mm/cache.c */
33asmlinkage int sys_cacheflush(int, void __user *, size_t); 19asmlinkage int sys_cacheflush(int, void __user *, size_t);
34 20
diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c
index 9c46aaad11ce..ef5a2a08fcca 100644
--- a/arch/avr32/kernel/process.c
+++ b/arch/avr32/kernel/process.c
@@ -367,14 +367,13 @@ asmlinkage int sys_fork(struct pt_regs *regs)
367} 367}
368 368
369asmlinkage int sys_clone(unsigned long clone_flags, unsigned long newsp, 369asmlinkage int sys_clone(unsigned long clone_flags, unsigned long newsp,
370 unsigned long parent_tidptr, 370 void __user *parent_tidptr, void __user *child_tidptr,
371 unsigned long child_tidptr, struct pt_regs *regs) 371 struct pt_regs *regs)
372{ 372{
373 if (!newsp) 373 if (!newsp)
374 newsp = regs->sp; 374 newsp = regs->sp;
375 return do_fork(clone_flags, newsp, regs, 0, 375 return do_fork(clone_flags, newsp, regs, 0, parent_tidptr,
376 (int __user *)parent_tidptr, 376 child_tidptr);
377 (int __user *)child_tidptr);
378} 377}
379 378
380asmlinkage int sys_vfork(struct pt_regs *regs) 379asmlinkage int sys_vfork(struct pt_regs *regs)
diff --git a/arch/avr32/kernel/time.c b/arch/avr32/kernel/time.c
index 668ed2817e51..05ad29112ff4 100644
--- a/arch/avr32/kernel/time.c
+++ b/arch/avr32/kernel/time.c
@@ -35,7 +35,6 @@ static struct clocksource counter = {
35 .rating = 50, 35 .rating = 50,
36 .read = read_cycle_count, 36 .read = read_cycle_count,
37 .mask = CLOCKSOURCE_MASK(32), 37 .mask = CLOCKSOURCE_MASK(32),
38 .shift = 16,
39 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 38 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
40}; 39};
41 40
@@ -123,9 +122,7 @@ void __init time_init(void)
123 122
124 /* figure rate for counter */ 123 /* figure rate for counter */
125 counter_hz = clk_get_rate(boot_cpu_data.clk); 124 counter_hz = clk_get_rate(boot_cpu_data.clk);
126 counter.mult = clocksource_hz2mult(counter_hz, counter.shift); 125 ret = clocksource_register_hz(&counter, counter_hz);
127
128 ret = clocksource_register(&counter);
129 if (ret) 126 if (ret)
130 pr_debug("timer: could not register clocksource: %d\n", ret); 127 pr_debug("timer: could not register clocksource: %d\n", ret);
131 128
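The time.c hunk above drops the fixed .shift and the clocksource_hz2mult() step in favour of clocksource_register_hz(), which derives mult and shift from the counter rate. A minimal sketch of that registration pattern (illustrative names; it assumes a read_cycle_count() callback like the one already in this file):

static struct clocksource example_counter = {
        .name   = "example-counter",
        .rating = 50,
        .read   = read_cycle_count,     /* assumed callback, as above */
        .mask   = CLOCKSOURCE_MASK(32),
        .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
};

static int __init example_counter_register(unsigned long counter_hz)
{
        /* mult/shift are computed internally from counter_hz */
        return clocksource_register_hz(&example_counter, counter_hz);
}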
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index ac76da099a6d..89accc626b86 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -618,7 +618,7 @@ pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
618} 618}
619 619
620/* forward declaration */ 620/* forward declaration */
621static static const struct dentry_operations pfmfs_dentry_operations; 621static const struct dentry_operations pfmfs_dentry_operations;
622 622
623static struct dentry * 623static struct dentry *
624pfmfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) 624pfmfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data)
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index 1841ee7e65f9..5ca674b74737 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -38,7 +38,7 @@ huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
38 if (pud) { 38 if (pud) {
39 pmd = pmd_alloc(mm, pud, taddr); 39 pmd = pmd_alloc(mm, pud, taddr);
40 if (pmd) 40 if (pmd)
41 pte = pte_alloc_map(mm, pmd, taddr); 41 pte = pte_alloc_map(mm, NULL, pmd, taddr);
42 } 42 }
43 return pte; 43 return pte;
44} 44}
diff --git a/arch/mips/include/asm/mman.h b/arch/mips/include/asm/mman.h
index c892bfb3e2c1..785b4ea4ec3f 100644
--- a/arch/mips/include/asm/mman.h
+++ b/arch/mips/include/asm/mman.h
@@ -77,6 +77,9 @@
77#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ 77#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */
78#define MADV_HWPOISON 100 /* poison a page for testing */ 78#define MADV_HWPOISON 100 /* poison a page for testing */
79 79
80#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */
81#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */
82
80/* compatibility flags */ 83/* compatibility flags */
81#define MAP_FILE 0 84#define MAP_FILE 0
82 85
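MADV_HUGEPAGE and MADV_NOHUGEPAGE are new madvise() hints marking a range as worth (or not worth) backing with transparent hugepages. A hypothetical userspace sketch, not part of this patch; the macro may not yet be in libc headers, hence the guarded fallback to the generic value 14 (MIPS shares that value above, parisc uses 67 in a later hunk):

#include <stddef.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14        /* generic/MIPS value, see the hunks in this patch */
#endif

/* Ask for hugepage backing on an anonymous mapping; the hint may be ignored. */
static void *map_with_hugepage_hint(size_t len)
{
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                return NULL;
        madvise(p, len, MADV_HUGEPAGE);
        return p;
}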
diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c
index 6f51dda87fce..d87a72e9fac7 100644
--- a/arch/mips/kernel/module.c
+++ b/arch/mips/kernel/module.c
@@ -46,17 +46,9 @@ static DEFINE_SPINLOCK(dbe_lock);
46void *module_alloc(unsigned long size) 46void *module_alloc(unsigned long size)
47{ 47{
48#ifdef MODULE_START 48#ifdef MODULE_START
49 struct vm_struct *area; 49 return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END,
50 50 GFP_KERNEL, PAGE_KERNEL, -1,
51 size = PAGE_ALIGN(size); 51 __builtin_return_address(0));
52 if (!size)
53 return NULL;
54
55 area = __get_vm_area(size, VM_ALLOC, MODULE_START, MODULE_END);
56 if (!area)
57 return NULL;
58
59 return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL);
60#else 52#else
61 if (size == 0) 53 if (size == 0)
62 return NULL; 54 return NULL;
diff --git a/arch/parisc/include/asm/mman.h b/arch/parisc/include/asm/mman.h
index 9749c8afe83a..f5b7bf5fba68 100644
--- a/arch/parisc/include/asm/mman.h
+++ b/arch/parisc/include/asm/mman.h
@@ -59,6 +59,9 @@
59#define MADV_MERGEABLE 65 /* KSM may merge identical pages */ 59#define MADV_MERGEABLE 65 /* KSM may merge identical pages */
60#define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */ 60#define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */
61 61
62#define MADV_HUGEPAGE 67 /* Worth backing with hugepages */
63#define MADV_NOHUGEPAGE 68 /* Not worth backing with hugepages */
64
62/* compatibility flags */ 65/* compatibility flags */
63#define MAP_FILE 0 66#define MAP_FILE 0
64#define MAP_VARIABLE 0 67#define MAP_VARIABLE 0
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index d7efdbf640c7..fec13200868f 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -16,6 +16,16 @@
16 16
17#ifdef __HAVE_ARCH_PTE_SPECIAL 17#ifdef __HAVE_ARCH_PTE_SPECIAL
18 18
19static inline void get_huge_page_tail(struct page *page)
20{
21 /*
22 * __split_huge_page_refcount() cannot run
23 * from under us.
24 */
25 VM_BUG_ON(atomic_read(&page->_count) < 0);
26 atomic_inc(&page->_count);
27}
28
19/* 29/*
20 * The performance critical leaf functions are made noinline otherwise gcc 30 * The performance critical leaf functions are made noinline otherwise gcc
21 * inlines everything into a single function which results in too much 31 * inlines everything into a single function which results in too much
@@ -47,6 +57,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
47 put_page(page); 57 put_page(page);
48 return 0; 58 return 0;
49 } 59 }
60 if (PageTail(page))
61 get_huge_page_tail(page);
50 pages[*nr] = page; 62 pages[*nr] = page;
51 (*nr)++; 63 (*nr)++;
52 64
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index 9163db3e8d15..d7762349ea48 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -35,7 +35,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
35 if (pud) { 35 if (pud) {
36 pmd = pmd_alloc(mm, pud, addr); 36 pmd = pmd_alloc(mm, pud, addr);
37 if (pmd) 37 if (pmd)
38 pte = pte_alloc_map(mm, pmd, addr); 38 pte = pte_alloc_map(mm, NULL, pmd, addr);
39 } 39 }
40 } 40 }
41 41
diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c
index ee3c7dde8d9f..8d348c474a2f 100644
--- a/arch/sparc/kernel/module.c
+++ b/arch/sparc/kernel/module.c
@@ -23,17 +23,11 @@
23 23
24static void *module_map(unsigned long size) 24static void *module_map(unsigned long size)
25{ 25{
26 struct vm_struct *area; 26 if (PAGE_ALIGN(size) > MODULES_LEN)
27
28 size = PAGE_ALIGN(size);
29 if (!size || size > MODULES_LEN)
30 return NULL;
31
32 area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
33 if (!area)
34 return NULL; 27 return NULL;
35 28 return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
36 return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL); 29 GFP_KERNEL, PAGE_KERNEL, -1,
30 __builtin_return_address(0));
37} 31}
38 32
39static char *dot2underscore(char *name) 33static char *dot2underscore(char *name)
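Both module allocators above (MIPS with MODULE_START/MODULE_END, sparc with MODULES_VADDR/MODULES_END) collapse the old __get_vm_area() plus __vmalloc_area() pair into a single __vmalloc_node_range() call. A kernel-context sketch of the shared pattern, with the range passed in and the argument order exactly as in the hunks (size, align, start, end, gfp, prot, node, caller):

static void *module_region_alloc(unsigned long size,
                                 unsigned long start, unsigned long end)
{
        if (PAGE_ALIGN(size) > end - start)
                return NULL;    /* reject oversized or overflowing requests */
        return __vmalloc_node_range(size, 1, start, end,
                                    GFP_KERNEL, PAGE_KERNEL, -1,
                                    __builtin_return_address(0));
}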
diff --git a/arch/sparc/mm/generic_32.c b/arch/sparc/mm/generic_32.c
index 5edcac184eaf..e6067b75f11c 100644
--- a/arch/sparc/mm/generic_32.c
+++ b/arch/sparc/mm/generic_32.c
@@ -50,7 +50,7 @@ static inline int io_remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned
50 end = PGDIR_SIZE; 50 end = PGDIR_SIZE;
51 offset -= address; 51 offset -= address;
52 do { 52 do {
53 pte_t * pte = pte_alloc_map(mm, pmd, address); 53 pte_t *pte = pte_alloc_map(mm, NULL, pmd, address);
54 if (!pte) 54 if (!pte)
55 return -ENOMEM; 55 return -ENOMEM;
56 io_remap_pte_range(mm, pte, address, end - address, address + offset, prot, space); 56 io_remap_pte_range(mm, pte, address, end - address, address + offset, prot, space);
diff --git a/arch/sparc/mm/generic_64.c b/arch/sparc/mm/generic_64.c
index 04f2bf4cd571..3cb00dfd4bd6 100644
--- a/arch/sparc/mm/generic_64.c
+++ b/arch/sparc/mm/generic_64.c
@@ -92,7 +92,7 @@ static inline int io_remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned
92 end = PGDIR_SIZE; 92 end = PGDIR_SIZE;
93 offset -= address; 93 offset -= address;
94 do { 94 do {
95 pte_t * pte = pte_alloc_map(mm, pmd, address); 95 pte_t *pte = pte_alloc_map(mm, NULL, pmd, address);
96 if (!pte) 96 if (!pte)
97 return -ENOMEM; 97 return -ENOMEM;
98 io_remap_pte_range(mm, pte, address, end - address, address + offset, prot, space); 98 io_remap_pte_range(mm, pte, address, end - address, address + offset, prot, space);
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 5fdddf134caa..f4e97646ce23 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -214,7 +214,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
214 if (pud) { 214 if (pud) {
215 pmd = pmd_alloc(mm, pud, addr); 215 pmd = pmd_alloc(mm, pud, addr);
216 if (pmd) 216 if (pmd)
217 pte = pte_alloc_map(mm, pmd, addr); 217 pte = pte_alloc_map(mm, NULL, pmd, addr);
218 } 218 }
219 return pte; 219 return pte;
220} 220}
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index 3d099f974785..1aee587e9c5d 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -31,7 +31,7 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
31 if (!pmd) 31 if (!pmd)
32 goto out_pmd; 32 goto out_pmd;
33 33
34 pte = pte_alloc_map(mm, pmd, proc); 34 pte = pte_alloc_map(mm, NULL, pmd, proc);
35 if (!pte) 35 if (!pte)
36 goto out_pte; 36 goto out_pte;
37 37
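The recurring one-liner in the ia64, sh, sparc and um hunks above is the same change: pte_alloc_map() grew an extra argument (presumably the vma that the transparent-hugepage series threads through; these callers have none, so they pass NULL). The updated caller shape, as a kernel-context sketch:

/* The second argument is assumed to be the optional vma, NULL when absent. */
static pte_t *alloc_pte_example(struct mm_struct *mm, pmd_t *pmd,
                                unsigned long addr)
{
        return pte_alloc_map(mm, NULL, pmd, addr);
}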
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index aa75f21a9fba..ffd7f8d29187 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -822,6 +822,7 @@ extern bool kvm_rebooting;
822#define KVM_ARCH_WANT_MMU_NOTIFIER 822#define KVM_ARCH_WANT_MMU_NOTIFIER
823int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); 823int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
824int kvm_age_hva(struct kvm *kvm, unsigned long hva); 824int kvm_age_hva(struct kvm *kvm, unsigned long hva);
825int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
825void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); 826void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
826int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); 827int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
827int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); 828int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 7709c12431b8..2071a8b2b32f 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -435,6 +435,11 @@ static inline void pte_update(struct mm_struct *mm, unsigned long addr,
435{ 435{
436 PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep); 436 PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
437} 437}
438static inline void pmd_update(struct mm_struct *mm, unsigned long addr,
439 pmd_t *pmdp)
440{
441 PVOP_VCALL3(pv_mmu_ops.pmd_update, mm, addr, pmdp);
442}
438 443
439static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr, 444static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
440 pte_t *ptep) 445 pte_t *ptep)
@@ -442,6 +447,12 @@ static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
442 PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep); 447 PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
443} 448}
444 449
450static inline void pmd_update_defer(struct mm_struct *mm, unsigned long addr,
451 pmd_t *pmdp)
452{
453 PVOP_VCALL3(pv_mmu_ops.pmd_update_defer, mm, addr, pmdp);
454}
455
445static inline pte_t __pte(pteval_t val) 456static inline pte_t __pte(pteval_t val)
446{ 457{
447 pteval_t ret; 458 pteval_t ret;
@@ -543,6 +554,20 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
543 PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte); 554 PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte);
544} 555}
545 556
557#ifdef CONFIG_TRANSPARENT_HUGEPAGE
558static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
559 pmd_t *pmdp, pmd_t pmd)
560{
561#if PAGETABLE_LEVELS >= 3
562 if (sizeof(pmdval_t) > sizeof(long))
563 /* 5 arg words */
564 pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd);
565 else
566 PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp, pmd.pmd);
567#endif
568}
569#endif
570
546static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) 571static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
547{ 572{
548 pmdval_t val = native_pmd_val(pmd); 573 pmdval_t val = native_pmd_val(pmd);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b82bac975250..82885099c869 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -265,10 +265,16 @@ struct pv_mmu_ops {
265 void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, 265 void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
266 pte_t *ptep, pte_t pteval); 266 pte_t *ptep, pte_t pteval);
267 void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); 267 void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
268 void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr,
269 pmd_t *pmdp, pmd_t pmdval);
268 void (*pte_update)(struct mm_struct *mm, unsigned long addr, 270 void (*pte_update)(struct mm_struct *mm, unsigned long addr,
269 pte_t *ptep); 271 pte_t *ptep);
270 void (*pte_update_defer)(struct mm_struct *mm, 272 void (*pte_update_defer)(struct mm_struct *mm,
271 unsigned long addr, pte_t *ptep); 273 unsigned long addr, pte_t *ptep);
274 void (*pmd_update)(struct mm_struct *mm, unsigned long addr,
275 pmd_t *pmdp);
276 void (*pmd_update_defer)(struct mm_struct *mm,
277 unsigned long addr, pmd_t *pmdp);
272 278
273 pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, 279 pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
274 pte_t *ptep); 280 pte_t *ptep);
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 2334982b339e..98391db840c6 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -46,6 +46,15 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
46#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) 46#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
47#endif 47#endif
48 48
49#ifdef CONFIG_SMP
50static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
51{
52 return __pmd(xchg((pmdval_t *)xp, 0));
53}
54#else
55#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
56#endif
57
49/* 58/*
50 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, 59 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
51 * split up the 29 bits of offset into this range: 60 * split up the 29 bits of offset into this range:
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 177b0165ea01..94b979d1b58d 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -104,6 +104,29 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
104#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) 104#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
105#endif 105#endif
106 106
107#ifdef CONFIG_SMP
108union split_pmd {
109 struct {
110 u32 pmd_low;
111 u32 pmd_high;
112 };
113 pmd_t pmd;
114};
115static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
116{
117 union split_pmd res, *orig = (union split_pmd *)pmdp;
118
119 /* xchg acts as a barrier before setting of the high bits */
120 res.pmd_low = xchg(&orig->pmd_low, 0);
121 res.pmd_high = orig->pmd_high;
122 orig->pmd_high = 0;
123
124 return res.pmd;
125}
126#else
127#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
128#endif
129
107/* 130/*
108 * Bits 0, 6 and 7 are taken in the low part of the pte, 131 * Bits 0, 6 and 7 are taken in the low part of the pte,
109 * put the 32 bits of offset into the high part. 132 * put the 32 bits of offset into the high part.
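One reason the PAE variant above clears the low word first: bit 0 of the low 32 bits is the hardware present bit, so once xchg() has zeroed pmd_low the entry is non-present and the high word can be read and cleared without a walker seeing a present entry with a torn high half. A tiny illustration of that invariant (not from this patch):

/* Bit 0 of the low 32 bits of a PAE pmd is _PAGE_PRESENT. */
static inline int pae_pmd_low_present(u32 pmd_low)
{
        return pmd_low & 1;
}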
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index ada823a13c7c..18601c86fab1 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -35,6 +35,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
35#else /* !CONFIG_PARAVIRT */ 35#else /* !CONFIG_PARAVIRT */
36#define set_pte(ptep, pte) native_set_pte(ptep, pte) 36#define set_pte(ptep, pte) native_set_pte(ptep, pte)
37#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) 37#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
38#define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd)
38 39
39#define set_pte_atomic(ptep, pte) \ 40#define set_pte_atomic(ptep, pte) \
40 native_set_pte_atomic(ptep, pte) 41 native_set_pte_atomic(ptep, pte)
@@ -59,6 +60,8 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
59 60
60#define pte_update(mm, addr, ptep) do { } while (0) 61#define pte_update(mm, addr, ptep) do { } while (0)
61#define pte_update_defer(mm, addr, ptep) do { } while (0) 62#define pte_update_defer(mm, addr, ptep) do { } while (0)
63#define pmd_update(mm, addr, ptep) do { } while (0)
64#define pmd_update_defer(mm, addr, ptep) do { } while (0)
62 65
63#define pgd_val(x) native_pgd_val(x) 66#define pgd_val(x) native_pgd_val(x)
64#define __pgd(x) native_make_pgd(x) 67#define __pgd(x) native_make_pgd(x)
@@ -94,6 +97,11 @@ static inline int pte_young(pte_t pte)
94 return pte_flags(pte) & _PAGE_ACCESSED; 97 return pte_flags(pte) & _PAGE_ACCESSED;
95} 98}
96 99
100static inline int pmd_young(pmd_t pmd)
101{
102 return pmd_flags(pmd) & _PAGE_ACCESSED;
103}
104
97static inline int pte_write(pte_t pte) 105static inline int pte_write(pte_t pte)
98{ 106{
99 return pte_flags(pte) & _PAGE_RW; 107 return pte_flags(pte) & _PAGE_RW;
@@ -142,6 +150,23 @@ static inline int pmd_large(pmd_t pte)
142 (_PAGE_PSE | _PAGE_PRESENT); 150 (_PAGE_PSE | _PAGE_PRESENT);
143} 151}
144 152
153#ifdef CONFIG_TRANSPARENT_HUGEPAGE
154static inline int pmd_trans_splitting(pmd_t pmd)
155{
156 return pmd_val(pmd) & _PAGE_SPLITTING;
157}
158
159static inline int pmd_trans_huge(pmd_t pmd)
160{
161 return pmd_val(pmd) & _PAGE_PSE;
162}
163
164static inline int has_transparent_hugepage(void)
165{
166 return cpu_has_pse;
167}
168#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
169
145static inline pte_t pte_set_flags(pte_t pte, pteval_t set) 170static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
146{ 171{
147 pteval_t v = native_pte_val(pte); 172 pteval_t v = native_pte_val(pte);
@@ -216,6 +241,55 @@ static inline pte_t pte_mkspecial(pte_t pte)
216 return pte_set_flags(pte, _PAGE_SPECIAL); 241 return pte_set_flags(pte, _PAGE_SPECIAL);
217} 242}
218 243
244static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
245{
246 pmdval_t v = native_pmd_val(pmd);
247
248 return __pmd(v | set);
249}
250
251static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
252{
253 pmdval_t v = native_pmd_val(pmd);
254
255 return __pmd(v & ~clear);
256}
257
258static inline pmd_t pmd_mkold(pmd_t pmd)
259{
260 return pmd_clear_flags(pmd, _PAGE_ACCESSED);
261}
262
263static inline pmd_t pmd_wrprotect(pmd_t pmd)
264{
265 return pmd_clear_flags(pmd, _PAGE_RW);
266}
267
268static inline pmd_t pmd_mkdirty(pmd_t pmd)
269{
270 return pmd_set_flags(pmd, _PAGE_DIRTY);
271}
272
273static inline pmd_t pmd_mkhuge(pmd_t pmd)
274{
275 return pmd_set_flags(pmd, _PAGE_PSE);
276}
277
278static inline pmd_t pmd_mkyoung(pmd_t pmd)
279{
280 return pmd_set_flags(pmd, _PAGE_ACCESSED);
281}
282
283static inline pmd_t pmd_mkwrite(pmd_t pmd)
284{
285 return pmd_set_flags(pmd, _PAGE_RW);
286}
287
288static inline pmd_t pmd_mknotpresent(pmd_t pmd)
289{
290 return pmd_clear_flags(pmd, _PAGE_PRESENT);
291}
292
219/* 293/*
220 * Mask out unsupported bits in a present pgprot. Non-present pgprots 294 * Mask out unsupported bits in a present pgprot. Non-present pgprots
221 * can use those bits for other purposes, so leave them be. 295 * can use those bits for other purposes, so leave them be.
@@ -256,6 +330,16 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
256 return __pte(val); 330 return __pte(val);
257} 331}
258 332
333static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
334{
335 pmdval_t val = pmd_val(pmd);
336
337 val &= _HPAGE_CHG_MASK;
338 val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK;
339
340 return __pmd(val);
341}
342
259/* mprotect needs to preserve PAT bits when updating vm_page_prot */ 343/* mprotect needs to preserve PAT bits when updating vm_page_prot */
260#define pgprot_modify pgprot_modify 344#define pgprot_modify pgprot_modify
261static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) 345static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
@@ -350,7 +434,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
350 * Currently stuck as a macro due to indirect forward reference to 434 * Currently stuck as a macro due to indirect forward reference to
351 * linux/mmzone.h's __section_mem_map_addr() definition: 435 * linux/mmzone.h's __section_mem_map_addr() definition:
352 */ 436 */
353#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT) 437#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT)
354 438
355/* 439/*
356 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] 440 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
@@ -524,12 +608,26 @@ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
524 return res; 608 return res;
525} 609}
526 610
611static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp)
612{
613 pmd_t res = *pmdp;
614
615 native_pmd_clear(pmdp);
616 return res;
617}
618
527static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, 619static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
528 pte_t *ptep , pte_t pte) 620 pte_t *ptep , pte_t pte)
529{ 621{
530 native_set_pte(ptep, pte); 622 native_set_pte(ptep, pte);
531} 623}
532 624
625static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr,
626 pmd_t *pmdp , pmd_t pmd)
627{
628 native_set_pmd(pmdp, pmd);
629}
630
533#ifndef CONFIG_PARAVIRT 631#ifndef CONFIG_PARAVIRT
534/* 632/*
535 * Rules for using pte_update - it must be called after any PTE update which 633 * Rules for using pte_update - it must be called after any PTE update which
@@ -607,6 +705,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
607 705
608#define flush_tlb_fix_spurious_fault(vma, address) 706#define flush_tlb_fix_spurious_fault(vma, address)
609 707
708#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
709
710#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
711extern int pmdp_set_access_flags(struct vm_area_struct *vma,
712 unsigned long address, pmd_t *pmdp,
713 pmd_t entry, int dirty);
714
715#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
716extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
717 unsigned long addr, pmd_t *pmdp);
718
719#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
720extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
721 unsigned long address, pmd_t *pmdp);
722
723
724#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
725extern void pmdp_splitting_flush(struct vm_area_struct *vma,
726 unsigned long addr, pmd_t *pmdp);
727
728#define __HAVE_ARCH_PMD_WRITE
729static inline int pmd_write(pmd_t pmd)
730{
731 return pmd_flags(pmd) & _PAGE_RW;
732}
733
734#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
735static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
736 pmd_t *pmdp)
737{
738 pmd_t pmd = native_pmdp_get_and_clear(pmdp);
739 pmd_update(mm, addr, pmdp);
740 return pmd;
741}
742
743#define __HAVE_ARCH_PMDP_SET_WRPROTECT
744static inline void pmdp_set_wrprotect(struct mm_struct *mm,
745 unsigned long addr, pmd_t *pmdp)
746{
747 clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
748 pmd_update(mm, addr, pmdp);
749}
750
610/* 751/*
611 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); 752 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
612 * 753 *
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index f86da20347f2..975f709e09ae 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -59,6 +59,16 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
59 native_set_pte(ptep, pte); 59 native_set_pte(ptep, pte);
60} 60}
61 61
62static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
63{
64 *pmdp = pmd;
65}
66
67static inline void native_pmd_clear(pmd_t *pmd)
68{
69 native_set_pmd(pmd, native_make_pmd(0));
70}
71
62static inline pte_t native_ptep_get_and_clear(pte_t *xp) 72static inline pte_t native_ptep_get_and_clear(pte_t *xp)
63{ 73{
64#ifdef CONFIG_SMP 74#ifdef CONFIG_SMP
@@ -72,14 +82,17 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
72#endif 82#endif
73} 83}
74 84
75static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) 85static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
76{ 86{
77 *pmdp = pmd; 87#ifdef CONFIG_SMP
78} 88 return native_make_pmd(xchg(&xp->pmd, 0));
79 89#else
80static inline void native_pmd_clear(pmd_t *pmd) 90 /* native_local_pmdp_get_and_clear,
81{ 91 but duplicated because of cyclic dependency */
82 native_set_pmd(pmd, native_make_pmd(0)); 92 pmd_t ret = *xp;
93 native_pmd_clear(xp);
94 return ret;
95#endif
83} 96}
84 97
85static inline void native_set_pud(pud_t *pudp, pud_t pud) 98static inline void native_set_pud(pud_t *pudp, pud_t pud)
@@ -168,6 +181,7 @@ extern void cleanup_highmap(void);
168#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK) 181#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)
169 182
170#define __HAVE_ARCH_PTE_SAME 183#define __HAVE_ARCH_PTE_SAME
184
171#endif /* !__ASSEMBLY__ */ 185#endif /* !__ASSEMBLY__ */
172 186
173#endif /* _ASM_X86_PGTABLE_64_H */ 187#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index d1f4a760be23..7db7723d1f32 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -22,6 +22,7 @@
22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ 22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
25#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */
25#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ 26#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
26 27
27/* If _PAGE_BIT_PRESENT is clear, we use these: */ 28/* If _PAGE_BIT_PRESENT is clear, we use these: */
@@ -45,6 +46,7 @@
45#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) 46#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
46#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) 47#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
47#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) 48#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
49#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
48#define __HAVE_ARCH_PTE_SPECIAL 50#define __HAVE_ARCH_PTE_SPECIAL
49 51
50#ifdef CONFIG_KMEMCHECK 52#ifdef CONFIG_KMEMCHECK
@@ -70,6 +72,7 @@
70/* Set of bits not changed in pte_modify */ 72/* Set of bits not changed in pte_modify */
71#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ 73#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
72 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) 74 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
75#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
73 76
74#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) 77#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
75#define _PAGE_CACHE_WB (0) 78#define _PAGE_CACHE_WB (0)
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 8760cc60a21c..f25bdf238a33 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -42,6 +42,11 @@ extern unsigned int machine_to_phys_order;
42extern unsigned long get_phys_to_machine(unsigned long pfn); 42extern unsigned long get_phys_to_machine(unsigned long pfn);
43extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); 43extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
44 44
45extern int m2p_add_override(unsigned long mfn, struct page *page);
46extern int m2p_remove_override(struct page *page);
47extern struct page *m2p_find_override(unsigned long mfn);
48extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
49
45static inline unsigned long pfn_to_mfn(unsigned long pfn) 50static inline unsigned long pfn_to_mfn(unsigned long pfn)
46{ 51{
47 unsigned long mfn; 52 unsigned long mfn;
@@ -72,9 +77,6 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
72 if (xen_feature(XENFEAT_auto_translated_physmap)) 77 if (xen_feature(XENFEAT_auto_translated_physmap))
73 return mfn; 78 return mfn;
74 79
75 if (unlikely((mfn >> machine_to_phys_order) != 0))
76 return ~0;
77
78 pfn = 0; 80 pfn = 0;
79 /* 81 /*
80 * The array access can fail (e.g., device space beyond end of RAM). 82 * The array access can fail (e.g., device space beyond end of RAM).
@@ -83,6 +85,14 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
83 */ 85 */
84 __get_user(pfn, &machine_to_phys_mapping[mfn]); 86 __get_user(pfn, &machine_to_phys_mapping[mfn]);
85 87
88 /*
89 * If this appears to be a foreign mfn (because the pfn
90 * doesn't map back to the mfn), then check the local override
91 * table to see if there's a better pfn to use.
92 */
93 if (get_phys_to_machine(pfn) != mfn)
94 pfn = m2p_find_override_pfn(mfn, pfn);
95
86 return pfn; 96 return pfn;
87} 97}
88 98
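The m2p override hooks declared above exist so that a pfn whose backing frame is temporarily a foreign mfn (for example a grant-mapped frame from another domain) can still be resolved back to the local struct page by mfn_to_pfn(). A minimal, hedged sketch of how a caller might pair the two hooks follows; the function name and the local_page/foreign_mfn parameters are illustrative only and are not part of this patch.

	/*
	 * Hypothetical caller of the new override API: register a local page
	 * that is currently backed by a foreign mfn, so that mfn_to_pfn() on
	 * that mfn finds the local pfn, then drop the override when done.
	 */
	static int with_foreign_frame(struct page *local_page,
				      unsigned long foreign_mfn)
	{
		int err = m2p_add_override(foreign_mfn, local_page);
		if (err)
			return err;

		/* ... use the mapping; lookups on foreign_mfn now resolve here ... */

		return m2p_remove_override(local_page);
	}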
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 8f2956091735..ab23f1ad4bf1 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -37,20 +37,11 @@
37 37
38void *module_alloc(unsigned long size) 38void *module_alloc(unsigned long size)
39{ 39{
40 struct vm_struct *area; 40 if (PAGE_ALIGN(size) > MODULES_LEN)
41
42 if (!size)
43 return NULL;
44 size = PAGE_ALIGN(size);
45 if (size > MODULES_LEN)
46 return NULL; 41 return NULL;
47 42 return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
48 area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); 43 GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
49 if (!area) 44 -1, __builtin_return_address(0));
50 return NULL;
51
52 return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM,
53 PAGE_KERNEL_EXEC);
54} 45}
55 46
56/* Free memory returned from module_alloc */ 47/* Free memory returned from module_alloc */
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c5b250011fd4..869e1aeeb71b 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -421,8 +421,11 @@ struct pv_mmu_ops pv_mmu_ops = {
421 .set_pte = native_set_pte, 421 .set_pte = native_set_pte,
422 .set_pte_at = native_set_pte_at, 422 .set_pte_at = native_set_pte_at,
423 .set_pmd = native_set_pmd, 423 .set_pmd = native_set_pmd,
424 .set_pmd_at = native_set_pmd_at,
424 .pte_update = paravirt_nop, 425 .pte_update = paravirt_nop,
425 .pte_update_defer = paravirt_nop, 426 .pte_update_defer = paravirt_nop,
427 .pmd_update = paravirt_nop,
428 .pmd_update_defer = paravirt_nop,
426 429
427 .ptep_modify_prot_start = __ptep_modify_prot_start, 430 .ptep_modify_prot_start = __ptep_modify_prot_start,
428 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 431 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index c2f1b26141e2..998e972f3b1a 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -133,7 +133,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
133 pmd = pmd_alloc(&tboot_mm, pud, vaddr); 133 pmd = pmd_alloc(&tboot_mm, pud, vaddr);
134 if (!pmd) 134 if (!pmd)
135 return -1; 135 return -1;
136 pte = pte_alloc_map(&tboot_mm, pmd, vaddr); 136 pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr);
137 if (!pte) 137 if (!pte)
138 return -1; 138 return -1;
139 set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); 139 set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 61fb98519622..863f8753ab0a 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -179,6 +179,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
179 if (pud_none_or_clear_bad(pud)) 179 if (pud_none_or_clear_bad(pud))
180 goto out; 180 goto out;
181 pmd = pmd_offset(pud, 0xA0000); 181 pmd = pmd_offset(pud, 0xA0000);
182 split_huge_page_pmd(mm, pmd);
182 if (pmd_none_or_clear_bad(pmd)) 183 if (pmd_none_or_clear_bad(pmd))
183 goto out; 184 goto out;
184 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); 185 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9cafbb499813..f02b8edc3d44 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -554,14 +554,18 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
554 return ret; 554 return ret;
555} 555}
556 556
557static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) 557static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
558{ 558{
559 struct kvm_memory_slot *slot; 559 struct kvm_memory_slot *slot;
560 int host_level, level, max_level;
561
562 slot = gfn_to_memslot(vcpu->kvm, large_gfn); 560 slot = gfn_to_memslot(vcpu->kvm, large_gfn);
563 if (slot && slot->dirty_bitmap) 561 if (slot && slot->dirty_bitmap)
564 return PT_PAGE_TABLE_LEVEL; 562 return true;
563 return false;
564}
565
566static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
567{
568 int host_level, level, max_level;
565 569
566 host_level = host_mapping_level(vcpu->kvm, large_gfn); 570 host_level = host_mapping_level(vcpu->kvm, large_gfn);
567 571
@@ -941,6 +945,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
941 return young; 945 return young;
942} 946}
943 947
948static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
949 unsigned long data)
950{
951 u64 *spte;
952 int young = 0;
953
954 /*
955 * If there's no access bit in the secondary pte set by the
956 * hardware, it's up to gup-fast/gup to set the access bit in
957 * the primary pte or in the page structure.
958 */
959 if (!shadow_accessed_mask)
960 goto out;
961
962 spte = rmap_next(kvm, rmapp, NULL);
963 while (spte) {
964 u64 _spte = *spte;
965 BUG_ON(!(_spte & PT_PRESENT_MASK));
966 young = _spte & PT_ACCESSED_MASK;
967 if (young) {
968 young = 1;
969 break;
970 }
971 spte = rmap_next(kvm, rmapp, spte);
972 }
973out:
974 return young;
975}
976
944#define RMAP_RECYCLE_THRESHOLD 1000 977#define RMAP_RECYCLE_THRESHOLD 1000
945 978
946static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) 979static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -961,6 +994,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva)
961 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); 994 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
962} 995}
963 996
997int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
998{
999 return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
1000}
1001
964#ifdef MMU_DEBUG 1002#ifdef MMU_DEBUG
965static int is_empty_shadow_page(u64 *spt) 1003static int is_empty_shadow_page(u64 *spt)
966{ 1004{
@@ -2281,6 +2319,48 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2281 return 1; 2319 return 1;
2282} 2320}
2283 2321
2322static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2323 gfn_t *gfnp, pfn_t *pfnp, int *levelp)
2324{
2325 pfn_t pfn = *pfnp;
2326 gfn_t gfn = *gfnp;
2327 int level = *levelp;
2328
2329 /*
2330 * Check if it's a transparent hugepage. If this would be an
2331 * hugetlbfs page, level wouldn't be set to
2332 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
2333 * here.
2334 */
2335 if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
2336 level == PT_PAGE_TABLE_LEVEL &&
2337 PageTransCompound(pfn_to_page(pfn)) &&
2338 !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
2339 unsigned long mask;
2340 /*
2341 * mmu_notifier_retry was successful and we hold the
2342 * mmu_lock here, so the pmd can't become splitting
2343 * from under us, and in turn
2344 * __split_huge_page_refcount() can't run from under
2345 * us and we can safely transfer the refcount from
2346 * PG_tail to PG_head as we switch the pfn from tail to
2347 * head.
2348 */
2349 *levelp = level = PT_DIRECTORY_LEVEL;
2350 mask = KVM_PAGES_PER_HPAGE(level) - 1;
2351 VM_BUG_ON((gfn & mask) != (pfn & mask));
2352 if (pfn & mask) {
2353 gfn &= ~mask;
2354 *gfnp = gfn;
2355 kvm_release_pfn_clean(pfn);
2356 pfn &= ~mask;
2357 if (!get_page_unless_zero(pfn_to_page(pfn)))
2358 BUG();
2359 *pfnp = pfn;
2360 }
2361 }
2362}
2363
2284static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 2364static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2285 gva_t gva, pfn_t *pfn, bool write, bool *writable); 2365 gva_t gva, pfn_t *pfn, bool write, bool *writable);
2286 2366
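The gfn/pfn masking inside transparent_hugepage_adjust() above only realigns both frame numbers to the start of the huge frame before mapping it at directory level. A hedged, stand-alone illustration of that arithmetic in plain user-space C, assuming 4 KB base pages and 2 MB directory-level pages (so KVM_PAGES_PER_HPAGE() would be 512) and using arbitrary sample frame numbers:

	#include <stdio.h>

	#define PAGES_PER_HPAGE 512UL	/* assumed: 2 MB huge page / 4 KB base page */

	int main(void)
	{
		unsigned long gfn = 0x12345;	/* arbitrary guest frame number */
		unsigned long pfn = 0x9a745;	/* arbitrary host frame number  */
		unsigned long mask = PAGES_PER_HPAGE - 1;	/* 0x1ff */

		/* Both frames must share the same offset within their huge page. */
		if ((gfn & mask) != (pfn & mask)) {
			printf("misaligned pair, no adjustment possible\n");
			return 1;
		}

		/* Round both down to the huge-frame boundary, as the adjust code does. */
		printf("gfn 0x%lx -> 0x%lx, pfn 0x%lx -> 0x%lx\n",
		       gfn, gfn & ~mask, pfn, pfn & ~mask);
		return 0;
	}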
@@ -2289,20 +2369,25 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2289{ 2369{
2290 int r; 2370 int r;
2291 int level; 2371 int level;
2372 int force_pt_level;
2292 pfn_t pfn; 2373 pfn_t pfn;
2293 unsigned long mmu_seq; 2374 unsigned long mmu_seq;
2294 bool map_writable; 2375 bool map_writable;
2295 2376
2296 level = mapping_level(vcpu, gfn); 2377 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2297 2378 if (likely(!force_pt_level)) {
2298 /* 2379 level = mapping_level(vcpu, gfn);
2299 * This path builds a PAE pagetable - so we can map 2mb pages at 2380 /*
2300 * maximum. Therefore check if the level is larger than that. 2381 * This path builds a PAE pagetable - so we can map
2301 */ 2382 * 2mb pages at maximum. Therefore check if the level
2302 if (level > PT_DIRECTORY_LEVEL) 2383 * is larger than that.
2303 level = PT_DIRECTORY_LEVEL; 2384 */
2385 if (level > PT_DIRECTORY_LEVEL)
2386 level = PT_DIRECTORY_LEVEL;
2304 2387
2305 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2388 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2389 } else
2390 level = PT_PAGE_TABLE_LEVEL;
2306 2391
2307 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2392 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2308 smp_rmb(); 2393 smp_rmb();
@@ -2318,6 +2403,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2318 if (mmu_notifier_retry(vcpu, mmu_seq)) 2403 if (mmu_notifier_retry(vcpu, mmu_seq))
2319 goto out_unlock; 2404 goto out_unlock;
2320 kvm_mmu_free_some_pages(vcpu); 2405 kvm_mmu_free_some_pages(vcpu);
2406 if (likely(!force_pt_level))
2407 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2321 r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, 2408 r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
2322 prefault); 2409 prefault);
2323 spin_unlock(&vcpu->kvm->mmu_lock); 2410 spin_unlock(&vcpu->kvm->mmu_lock);
@@ -2655,6 +2742,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2655 pfn_t pfn; 2742 pfn_t pfn;
2656 int r; 2743 int r;
2657 int level; 2744 int level;
2745 int force_pt_level;
2658 gfn_t gfn = gpa >> PAGE_SHIFT; 2746 gfn_t gfn = gpa >> PAGE_SHIFT;
2659 unsigned long mmu_seq; 2747 unsigned long mmu_seq;
2660 int write = error_code & PFERR_WRITE_MASK; 2748 int write = error_code & PFERR_WRITE_MASK;
@@ -2667,9 +2755,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2667 if (r) 2755 if (r)
2668 return r; 2756 return r;
2669 2757
2670 level = mapping_level(vcpu, gfn); 2758 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2671 2759 if (likely(!force_pt_level)) {
2672 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2760 level = mapping_level(vcpu, gfn);
2761 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2762 } else
2763 level = PT_PAGE_TABLE_LEVEL;
2673 2764
2674 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2765 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2675 smp_rmb(); 2766 smp_rmb();
@@ -2684,6 +2775,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2684 if (mmu_notifier_retry(vcpu, mmu_seq)) 2775 if (mmu_notifier_retry(vcpu, mmu_seq))
2685 goto out_unlock; 2776 goto out_unlock;
2686 kvm_mmu_free_some_pages(vcpu); 2777 kvm_mmu_free_some_pages(vcpu);
2778 if (likely(!force_pt_level))
2779 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2687 r = __direct_map(vcpu, gpa, write, map_writable, 2780 r = __direct_map(vcpu, gpa, write, map_writable,
2688 level, gfn, pfn, prefault); 2781 level, gfn, pfn, prefault);
2689 spin_unlock(&vcpu->kvm->mmu_lock); 2782 spin_unlock(&vcpu->kvm->mmu_lock);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 53210f1e94c2..6bccc24c4181 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -550,6 +550,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
550 int r; 550 int r;
551 pfn_t pfn; 551 pfn_t pfn;
552 int level = PT_PAGE_TABLE_LEVEL; 552 int level = PT_PAGE_TABLE_LEVEL;
553 int force_pt_level;
553 unsigned long mmu_seq; 554 unsigned long mmu_seq;
554 bool map_writable; 555 bool map_writable;
555 556
@@ -577,7 +578,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
577 return 0; 578 return 0;
578 } 579 }
579 580
580 if (walker.level >= PT_DIRECTORY_LEVEL) { 581 if (walker.level >= PT_DIRECTORY_LEVEL)
582 force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
583 else
584 force_pt_level = 1;
585 if (!force_pt_level) {
581 level = min(walker.level, mapping_level(vcpu, walker.gfn)); 586 level = min(walker.level, mapping_level(vcpu, walker.gfn));
582 walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); 587 walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
583 } 588 }
@@ -599,6 +604,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
599 604
600 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); 605 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
601 kvm_mmu_free_some_pages(vcpu); 606 kvm_mmu_free_some_pages(vcpu);
607 if (!force_pt_level)
608 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
602 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 609 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
603 level, &write_pt, pfn, map_writable, prefault); 610 level, &write_pt, pfn, map_writable, prefault);
604 (void)sptep; 611 (void)sptep;
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 738e6593799d..dbe34b931374 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -8,6 +8,7 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/vmstat.h> 9#include <linux/vmstat.h>
10#include <linux/highmem.h> 10#include <linux/highmem.h>
11#include <linux/swap.h>
11 12
12#include <asm/pgtable.h> 13#include <asm/pgtable.h>
13 14
@@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
89 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 90 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
90 page = pte_page(pte); 91 page = pte_page(pte);
91 get_page(page); 92 get_page(page);
93 SetPageReferenced(page);
92 pages[*nr] = page; 94 pages[*nr] = page;
93 (*nr)++; 95 (*nr)++;
94 96
@@ -103,6 +105,17 @@ static inline void get_head_page_multiple(struct page *page, int nr)
103 VM_BUG_ON(page != compound_head(page)); 105 VM_BUG_ON(page != compound_head(page));
104 VM_BUG_ON(page_count(page) == 0); 106 VM_BUG_ON(page_count(page) == 0);
105 atomic_add(nr, &page->_count); 107 atomic_add(nr, &page->_count);
108 SetPageReferenced(page);
109}
110
111static inline void get_huge_page_tail(struct page *page)
112{
113 /*
114 * __split_huge_page_refcount() cannot run
115 * from under us.
116 */
117 VM_BUG_ON(atomic_read(&page->_count) < 0);
118 atomic_inc(&page->_count);
106} 119}
107 120
108static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, 121static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
@@ -128,6 +141,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
128 do { 141 do {
129 VM_BUG_ON(compound_head(page) != head); 142 VM_BUG_ON(compound_head(page) != head);
130 pages[*nr] = page; 143 pages[*nr] = page;
144 if (PageTail(page))
145 get_huge_page_tail(page);
131 (*nr)++; 146 (*nr)++;
132 page++; 147 page++;
133 refs++; 148 refs++;
@@ -148,7 +163,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
148 pmd_t pmd = *pmdp; 163 pmd_t pmd = *pmdp;
149 164
150 next = pmd_addr_end(addr, end); 165 next = pmd_addr_end(addr, end);
151 if (pmd_none(pmd)) 166 /*
167 * The pmd_trans_splitting() check below explains why
168 * pmdp_splitting_flush has to flush the tlb, to stop
169 * this gup-fast code from running while we set the
170 * splitting bit in the pmd. Returning zero will take
171 * the slow path that will call wait_split_huge_page()
172 * if the pmd is still in splitting state. gup-fast
173 * can't because it has irq disabled and
174 * wait_split_huge_page() would never return as the
175 * tlb flush IPI wouldn't run.
176 */
177 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
152 return 0; 178 return 0;
153 if (unlikely(pmd_large(pmd))) { 179 if (unlikely(pmd_large(pmd))) {
154 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) 180 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
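The comment added to gup_pmd_range() above leans on a pmd_trans_splitting() test that is not shown in this hunk. A plausible minimal sketch, assuming it merely tests the new _PAGE_SPLITTING software bit that pmdp_splitting_flush() sets under mmu_lock, would look like this (an assumption for illustration, not the authoritative definition):

	/* Hedged sketch: report whether the pmd is marked as being split. */
	static inline int pmd_trans_splitting(pmd_t pmd)
	{
		return pmd_flags(pmd) & _PAGE_SPLITTING;
	}

Because gup-fast runs with interrupts disabled, the TLB-flush IPI issued by pmdp_splitting_flush() cannot complete while this check runs, which is exactly the serialization the comment describes.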
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8be8c7d7bc89..500242d3c96d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -320,6 +320,25 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
320 return changed; 320 return changed;
321} 321}
322 322
323#ifdef CONFIG_TRANSPARENT_HUGEPAGE
324int pmdp_set_access_flags(struct vm_area_struct *vma,
325 unsigned long address, pmd_t *pmdp,
326 pmd_t entry, int dirty)
327{
328 int changed = !pmd_same(*pmdp, entry);
329
330 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
331
332 if (changed && dirty) {
333 *pmdp = entry;
334 pmd_update_defer(vma->vm_mm, address, pmdp);
335 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
336 }
337
338 return changed;
339}
340#endif
341
323int ptep_test_and_clear_young(struct vm_area_struct *vma, 342int ptep_test_and_clear_young(struct vm_area_struct *vma,
324 unsigned long addr, pte_t *ptep) 343 unsigned long addr, pte_t *ptep)
325{ 344{
@@ -335,6 +354,23 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
335 return ret; 354 return ret;
336} 355}
337 356
357#ifdef CONFIG_TRANSPARENT_HUGEPAGE
358int pmdp_test_and_clear_young(struct vm_area_struct *vma,
359 unsigned long addr, pmd_t *pmdp)
360{
361 int ret = 0;
362
363 if (pmd_young(*pmdp))
364 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
365 (unsigned long *)pmdp);
366
367 if (ret)
368 pmd_update(vma->vm_mm, addr, pmdp);
369
370 return ret;
371}
372#endif
373
338int ptep_clear_flush_young(struct vm_area_struct *vma, 374int ptep_clear_flush_young(struct vm_area_struct *vma,
339 unsigned long address, pte_t *ptep) 375 unsigned long address, pte_t *ptep)
340{ 376{
@@ -347,6 +383,36 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
347 return young; 383 return young;
348} 384}
349 385
386#ifdef CONFIG_TRANSPARENT_HUGEPAGE
387int pmdp_clear_flush_young(struct vm_area_struct *vma,
388 unsigned long address, pmd_t *pmdp)
389{
390 int young;
391
392 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
393
394 young = pmdp_test_and_clear_young(vma, address, pmdp);
395 if (young)
396 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
397
398 return young;
399}
400
401void pmdp_splitting_flush(struct vm_area_struct *vma,
402 unsigned long address, pmd_t *pmdp)
403{
404 int set;
405 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
406 set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
407 (unsigned long *)pmdp);
408 if (set) {
409 pmd_update(vma->vm_mm, address, pmdp);
410 /* need tlb flush only to serialize against gup-fast */
411 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
412 }
413}
414#endif
415
350/** 416/**
351 * reserve_top_address - reserves a hole in the top of kernel address space 417 * reserve_top_address - reserves a hole in the top of kernel address space
352 * @reserve - size of hole to reserve 418 * @reserve - size of hole to reserve
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 779385158915..17c565de3d64 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -12,7 +12,8 @@ CFLAGS_mmu.o := $(nostackp)
12 12
13obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ 13obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
14 time.o xen-asm.o xen-asm_$(BITS).o \ 14 time.o xen-asm.o xen-asm_$(BITS).o \
15 grant-table.o suspend.o platform-pci-unplug.o 15 grant-table.o suspend.o platform-pci-unplug.o \
16 p2m.o
16 17
17obj-$(CONFIG_SMP) += smp.o 18obj-$(CONFIG_SMP) += smp.o
18obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o 19obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 44924e551fde..7575e55cd52e 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -173,371 +173,6 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
173 */ 173 */
174#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) 174#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
175 175
176/*
177 * Xen leaves the responsibility for maintaining p2m mappings to the
178 * guests themselves, but it must also access and update the p2m array
179 * during suspend/resume when all the pages are reallocated.
180 *
181 * The p2m table is logically a flat array, but we implement it as a
182 * three-level tree to allow the address space to be sparse.
183 *
184 * Xen
185 * |
186 * p2m_top p2m_top_mfn
187 * / \ / \
188 * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
189 * / \ / \ / /
190 * p2m p2m p2m p2m p2m p2m p2m ...
191 *
192 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
193 *
194 * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
195 * maximum representable pseudo-physical address space is:
196 * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
197 *
198 * P2M_PER_PAGE depends on the architecture, as a mfn is always
199 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
200 * 512 and 1024 entries respectively.
201 */
202
203unsigned long xen_max_p2m_pfn __read_mostly;
204
205#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
206#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
207#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
208
209#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
210
211/* Placeholders for holes in the address space */
212static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
213static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
214static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
215
216static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
217static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
218static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
219
220RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
221RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
222
223static inline unsigned p2m_top_index(unsigned long pfn)
224{
225 BUG_ON(pfn >= MAX_P2M_PFN);
226 return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
227}
228
229static inline unsigned p2m_mid_index(unsigned long pfn)
230{
231 return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
232}
233
234static inline unsigned p2m_index(unsigned long pfn)
235{
236 return pfn % P2M_PER_PAGE;
237}
238
239static void p2m_top_init(unsigned long ***top)
240{
241 unsigned i;
242
243 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
244 top[i] = p2m_mid_missing;
245}
246
247static void p2m_top_mfn_init(unsigned long *top)
248{
249 unsigned i;
250
251 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
252 top[i] = virt_to_mfn(p2m_mid_missing_mfn);
253}
254
255static void p2m_top_mfn_p_init(unsigned long **top)
256{
257 unsigned i;
258
259 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
260 top[i] = p2m_mid_missing_mfn;
261}
262
263static void p2m_mid_init(unsigned long **mid)
264{
265 unsigned i;
266
267 for (i = 0; i < P2M_MID_PER_PAGE; i++)
268 mid[i] = p2m_missing;
269}
270
271static void p2m_mid_mfn_init(unsigned long *mid)
272{
273 unsigned i;
274
275 for (i = 0; i < P2M_MID_PER_PAGE; i++)
276 mid[i] = virt_to_mfn(p2m_missing);
277}
278
279static void p2m_init(unsigned long *p2m)
280{
281 unsigned i;
282
283 for (i = 0; i < P2M_MID_PER_PAGE; i++)
284 p2m[i] = INVALID_P2M_ENTRY;
285}
286
287/*
288 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
289 *
290 * This is called both at boot time, and after resuming from suspend:
291 * - At boot time we're called very early, and must use extend_brk()
292 * to allocate memory.
293 *
294 * - After resume we're called from within stop_machine, but the mfn
295 * tree should already be completely allocated.
296 */
297void xen_build_mfn_list_list(void)
298{
299 unsigned long pfn;
300
301 /* Pre-initialize p2m_top_mfn to be completely missing */
302 if (p2m_top_mfn == NULL) {
303 p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
304 p2m_mid_mfn_init(p2m_mid_missing_mfn);
305
306 p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
307 p2m_top_mfn_p_init(p2m_top_mfn_p);
308
309 p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
310 p2m_top_mfn_init(p2m_top_mfn);
311 } else {
312 /* Reinitialise, mfn's all change after migration */
313 p2m_mid_mfn_init(p2m_mid_missing_mfn);
314 }
315
316 for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
317 unsigned topidx = p2m_top_index(pfn);
318 unsigned mididx = p2m_mid_index(pfn);
319 unsigned long **mid;
320 unsigned long *mid_mfn_p;
321
322 mid = p2m_top[topidx];
323 mid_mfn_p = p2m_top_mfn_p[topidx];
324
325 /* Don't bother allocating any mfn mid levels if
326 * they're just missing, just update the stored mfn,
327 * since all could have changed over a migrate.
328 */
329 if (mid == p2m_mid_missing) {
330 BUG_ON(mididx);
331 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
332 p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
333 pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
334 continue;
335 }
336
337 if (mid_mfn_p == p2m_mid_missing_mfn) {
338 /*
339 * XXX boot-time only! We should never find
340 * missing parts of the mfn tree after
341 * runtime. extend_brk() will BUG if we call
342 * it too late.
343 */
344 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
345 p2m_mid_mfn_init(mid_mfn_p);
346
347 p2m_top_mfn_p[topidx] = mid_mfn_p;
348 }
349
350 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
351 mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
352 }
353}
354
355void xen_setup_mfn_list_list(void)
356{
357 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
358
359 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
360 virt_to_mfn(p2m_top_mfn);
361 HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
362}
363
364/* Set up p2m_top to point to the domain-builder provided p2m pages */
365void __init xen_build_dynamic_phys_to_machine(void)
366{
367 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
368 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
369 unsigned long pfn;
370
371 xen_max_p2m_pfn = max_pfn;
372
373 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
374 p2m_init(p2m_missing);
375
376 p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
377 p2m_mid_init(p2m_mid_missing);
378
379 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
380 p2m_top_init(p2m_top);
381
382 /*
383 * The domain builder gives us a pre-constructed p2m array in
384 * mfn_list for all the pages initially given to us, so we just
385 * need to graft that into our tree structure.
386 */
387 for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
388 unsigned topidx = p2m_top_index(pfn);
389 unsigned mididx = p2m_mid_index(pfn);
390
391 if (p2m_top[topidx] == p2m_mid_missing) {
392 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
393 p2m_mid_init(mid);
394
395 p2m_top[topidx] = mid;
396 }
397
398 p2m_top[topidx][mididx] = &mfn_list[pfn];
399 }
400}
401
402unsigned long get_phys_to_machine(unsigned long pfn)
403{
404 unsigned topidx, mididx, idx;
405
406 if (unlikely(pfn >= MAX_P2M_PFN))
407 return INVALID_P2M_ENTRY;
408
409 topidx = p2m_top_index(pfn);
410 mididx = p2m_mid_index(pfn);
411 idx = p2m_index(pfn);
412
413 return p2m_top[topidx][mididx][idx];
414}
415EXPORT_SYMBOL_GPL(get_phys_to_machine);
416
417static void *alloc_p2m_page(void)
418{
419 return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
420}
421
422static void free_p2m_page(void *p)
423{
424 free_page((unsigned long)p);
425}
426
427/*
428 * Fully allocate the p2m structure for a given pfn. We need to check
429 * that both the top and mid levels are allocated, and make sure the
430 * parallel mfn tree is kept in sync. We may race with other cpus, so
431 * the new pages are installed with cmpxchg; if we lose the race then
432 * simply free the page we allocated and use the one that's there.
433 */
434static bool alloc_p2m(unsigned long pfn)
435{
436 unsigned topidx, mididx;
437 unsigned long ***top_p, **mid;
438 unsigned long *top_mfn_p, *mid_mfn;
439
440 topidx = p2m_top_index(pfn);
441 mididx = p2m_mid_index(pfn);
442
443 top_p = &p2m_top[topidx];
444 mid = *top_p;
445
446 if (mid == p2m_mid_missing) {
447 /* Mid level is missing, allocate a new one */
448 mid = alloc_p2m_page();
449 if (!mid)
450 return false;
451
452 p2m_mid_init(mid);
453
454 if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
455 free_p2m_page(mid);
456 }
457
458 top_mfn_p = &p2m_top_mfn[topidx];
459 mid_mfn = p2m_top_mfn_p[topidx];
460
461 BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
462
463 if (mid_mfn == p2m_mid_missing_mfn) {
464 /* Separately check the mid mfn level */
465 unsigned long missing_mfn;
466 unsigned long mid_mfn_mfn;
467
468 mid_mfn = alloc_p2m_page();
469 if (!mid_mfn)
470 return false;
471
472 p2m_mid_mfn_init(mid_mfn);
473
474 missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
475 mid_mfn_mfn = virt_to_mfn(mid_mfn);
476 if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
477 free_p2m_page(mid_mfn);
478 else
479 p2m_top_mfn_p[topidx] = mid_mfn;
480 }
481
482 if (p2m_top[topidx][mididx] == p2m_missing) {
483 /* p2m leaf page is missing */
484 unsigned long *p2m;
485
486 p2m = alloc_p2m_page();
487 if (!p2m)
488 return false;
489
490 p2m_init(p2m);
491
492 if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
493 free_p2m_page(p2m);
494 else
495 mid_mfn[mididx] = virt_to_mfn(p2m);
496 }
497
498 return true;
499}
500
501/* Try to install p2m mapping; fail if intermediate bits missing */
502bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
503{
504 unsigned topidx, mididx, idx;
505
506 if (unlikely(pfn >= MAX_P2M_PFN)) {
507 BUG_ON(mfn != INVALID_P2M_ENTRY);
508 return true;
509 }
510
511 topidx = p2m_top_index(pfn);
512 mididx = p2m_mid_index(pfn);
513 idx = p2m_index(pfn);
514
515 if (p2m_top[topidx][mididx] == p2m_missing)
516 return mfn == INVALID_P2M_ENTRY;
517
518 p2m_top[topidx][mididx][idx] = mfn;
519
520 return true;
521}
522
523bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
524{
525 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
526 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
527 return true;
528 }
529
530 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
531 if (!alloc_p2m(pfn))
532 return false;
533
534 if (!__set_phys_to_machine(pfn, mfn))
535 return false;
536 }
537
538 return true;
539}
540
541unsigned long arbitrary_virt_to_mfn(void *vaddr) 176unsigned long arbitrary_virt_to_mfn(void *vaddr)
542{ 177{
543 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); 178 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
new file mode 100644
index 000000000000..8f2251d2a3f8
--- /dev/null
+++ b/arch/x86/xen/p2m.c
@@ -0,0 +1,510 @@
1/*
2 * Xen leaves the responsibility for maintaining p2m mappings to the
3 * guests themselves, but it must also access and update the p2m array
4 * during suspend/resume when all the pages are reallocated.
5 *
6 * The p2m table is logically a flat array, but we implement it as a
7 * three-level tree to allow the address space to be sparse.
8 *
9 * Xen
10 * |
11 * p2m_top p2m_top_mfn
12 * / \ / \
13 * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
14 * / \ / \ / /
15 * p2m p2m p2m p2m p2m p2m p2m ...
16 *
17 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
18 *
19 * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
20 * maximum representable pseudo-physical address space is:
21 * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
22 *
23 * P2M_PER_PAGE depends on the architecture, as a mfn is always
24 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
25 * 512 and 1024 entries respectively.
26 */
27
28#include <linux/init.h>
29#include <linux/module.h>
30#include <linux/list.h>
31#include <linux/hash.h>
32#include <linux/sched.h>
33
34#include <asm/cache.h>
35#include <asm/setup.h>
36
37#include <asm/xen/page.h>
38#include <asm/xen/hypercall.h>
39#include <asm/xen/hypervisor.h>
40
41#include "xen-ops.h"
42
43static void __init m2p_override_init(void);
44
45unsigned long xen_max_p2m_pfn __read_mostly;
46
47#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
48#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
49#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
50
51#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
52
53/* Placeholders for holes in the address space */
54static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
55static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
56static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
57
58static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
59static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
60static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
61
62RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
63RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
64
65static inline unsigned p2m_top_index(unsigned long pfn)
66{
67 BUG_ON(pfn >= MAX_P2M_PFN);
68 return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
69}
70
71static inline unsigned p2m_mid_index(unsigned long pfn)
72{
73 return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
74}
75
76static inline unsigned p2m_index(unsigned long pfn)
77{
78 return pfn % P2M_PER_PAGE;
79}
80
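The three index helpers above decompose a pfn exactly as the header comment describes. A hedged user-space illustration of the arithmetic for the 64-bit layout (4096-byte pages holding 8-byte entries, hence 512 entries per level) is shown below; the sample pfn is arbitrary:

	#include <stdio.h>

	#define PER_PAGE 512UL	/* assumed 64-bit case: 4096 / sizeof(unsigned long) */

	int main(void)
	{
		unsigned long pfn = 0x123456;	/* arbitrary pseudo-physical frame */
		unsigned long topidx = pfn / (PER_PAGE * PER_PAGE);
		unsigned long mididx = (pfn / PER_PAGE) % PER_PAGE;
		unsigned long idx    = pfn % PER_PAGE;

		printf("pfn 0x%lx -> top %lu, mid %lu, leaf %lu\n",
		       pfn, topidx, mididx, idx);

		/* Maximum representable space: 512^3 frames of 4 KB, i.e. 512 GB. */
		printf("max representable pfns: %lu\n",
		       PER_PAGE * PER_PAGE * PER_PAGE);
		return 0;
	}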
81static void p2m_top_init(unsigned long ***top)
82{
83 unsigned i;
84
85 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
86 top[i] = p2m_mid_missing;
87}
88
89static void p2m_top_mfn_init(unsigned long *top)
90{
91 unsigned i;
92
93 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
94 top[i] = virt_to_mfn(p2m_mid_missing_mfn);
95}
96
97static void p2m_top_mfn_p_init(unsigned long **top)
98{
99 unsigned i;
100
101 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
102 top[i] = p2m_mid_missing_mfn;
103}
104
105static void p2m_mid_init(unsigned long **mid)
106{
107 unsigned i;
108
109 for (i = 0; i < P2M_MID_PER_PAGE; i++)
110 mid[i] = p2m_missing;
111}
112
113static void p2m_mid_mfn_init(unsigned long *mid)
114{
115 unsigned i;
116
117 for (i = 0; i < P2M_MID_PER_PAGE; i++)
118 mid[i] = virt_to_mfn(p2m_missing);
119}
120
121static void p2m_init(unsigned long *p2m)
122{
123 unsigned i;
124
125 for (i = 0; i < P2M_MID_PER_PAGE; i++)
126 p2m[i] = INVALID_P2M_ENTRY;
127}
128
129/*
130 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
131 *
132 * This is called both at boot time, and after resuming from suspend:
133 * - At boot time we're called very early, and must use extend_brk()
134 * to allocate memory.
135 *
136 * - After resume we're called from within stop_machine, but the mfn
137 * tree should already be completely allocated.
138 */
139void xen_build_mfn_list_list(void)
140{
141 unsigned long pfn;
142
143 /* Pre-initialize p2m_top_mfn to be completely missing */
144 if (p2m_top_mfn == NULL) {
145 p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
146 p2m_mid_mfn_init(p2m_mid_missing_mfn);
147
148 p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
149 p2m_top_mfn_p_init(p2m_top_mfn_p);
150
151 p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
152 p2m_top_mfn_init(p2m_top_mfn);
153 } else {
154 /* Reinitialise, mfn's all change after migration */
155 p2m_mid_mfn_init(p2m_mid_missing_mfn);
156 }
157
158 for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
159 unsigned topidx = p2m_top_index(pfn);
160 unsigned mididx = p2m_mid_index(pfn);
161 unsigned long **mid;
162 unsigned long *mid_mfn_p;
163
164 mid = p2m_top[topidx];
165 mid_mfn_p = p2m_top_mfn_p[topidx];
166
167 /* Don't bother allocating any mfn mid levels if
168 * they're just missing, just update the stored mfn,
169 * since all could have changed over a migrate.
170 */
171 if (mid == p2m_mid_missing) {
172 BUG_ON(mididx);
173 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
174 p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
175 pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
176 continue;
177 }
178
179 if (mid_mfn_p == p2m_mid_missing_mfn) {
180 /*
181 * XXX boot-time only! We should never find
182 * missing parts of the mfn tree after
183 * runtime. extend_brk() will BUG if we call
184 * it too late.
185 */
186 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
187 p2m_mid_mfn_init(mid_mfn_p);
188
189 p2m_top_mfn_p[topidx] = mid_mfn_p;
190 }
191
192 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
193 mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
194 }
195}
196
197void xen_setup_mfn_list_list(void)
198{
199 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
200
201 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
202 virt_to_mfn(p2m_top_mfn);
203 HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
204}
205
206/* Set up p2m_top to point to the domain-builder provided p2m pages */
207void __init xen_build_dynamic_phys_to_machine(void)
208{
209 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
210 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
211 unsigned long pfn;
212
213 xen_max_p2m_pfn = max_pfn;
214
215 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
216 p2m_init(p2m_missing);
217
218 p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
219 p2m_mid_init(p2m_mid_missing);
220
221 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
222 p2m_top_init(p2m_top);
223
224 /*
225 * The domain builder gives us a pre-constructed p2m array in
226 * mfn_list for all the pages initially given to us, so we just
227 * need to graft that into our tree structure.
228 */
229 for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
230 unsigned topidx = p2m_top_index(pfn);
231 unsigned mididx = p2m_mid_index(pfn);
232
233 if (p2m_top[topidx] == p2m_mid_missing) {
234 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
235 p2m_mid_init(mid);
236
237 p2m_top[topidx] = mid;
238 }
239
240 p2m_top[topidx][mididx] = &mfn_list[pfn];
241 }
242
243 m2p_override_init();
244}
245
246unsigned long get_phys_to_machine(unsigned long pfn)
247{
248 unsigned topidx, mididx, idx;
249
250 if (unlikely(pfn >= MAX_P2M_PFN))
251 return INVALID_P2M_ENTRY;
252
253 topidx = p2m_top_index(pfn);
254 mididx = p2m_mid_index(pfn);
255 idx = p2m_index(pfn);
256
257 return p2m_top[topidx][mididx][idx];
258}
259EXPORT_SYMBOL_GPL(get_phys_to_machine);
260
261static void *alloc_p2m_page(void)
262{
263 return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
264}
265
266static void free_p2m_page(void *p)
267{
268 free_page((unsigned long)p);
269}
270
271/*
272 * Fully allocate the p2m structure for a given pfn. We need to check
273 * that both the top and mid levels are allocated, and make sure the
274 * parallel mfn tree is kept in sync. We may race with other cpus, so
275 * the new pages are installed with cmpxchg; if we lose the race then
276 * simply free the page we allocated and use the one that's there.
277 */
278static bool alloc_p2m(unsigned long pfn)
279{
280 unsigned topidx, mididx;
281 unsigned long ***top_p, **mid;
282 unsigned long *top_mfn_p, *mid_mfn;
283
284 topidx = p2m_top_index(pfn);
285 mididx = p2m_mid_index(pfn);
286
287 top_p = &p2m_top[topidx];
288 mid = *top_p;
289
290 if (mid == p2m_mid_missing) {
291 /* Mid level is missing, allocate a new one */
292 mid = alloc_p2m_page();
293 if (!mid)
294 return false;
295
296 p2m_mid_init(mid);
297
298 if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
299 free_p2m_page(mid);
300 }
301
302 top_mfn_p = &p2m_top_mfn[topidx];
303 mid_mfn = p2m_top_mfn_p[topidx];
304
305 BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
306
307 if (mid_mfn == p2m_mid_missing_mfn) {
308 /* Separately check the mid mfn level */
309 unsigned long missing_mfn;
310 unsigned long mid_mfn_mfn;
311
312 mid_mfn = alloc_p2m_page();
313 if (!mid_mfn)
314 return false;
315
316 p2m_mid_mfn_init(mid_mfn);
317
318 missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
319 mid_mfn_mfn = virt_to_mfn(mid_mfn);
320 if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
321 free_p2m_page(mid_mfn);
322 else
323 p2m_top_mfn_p[topidx] = mid_mfn;
324 }
325
326 if (p2m_top[topidx][mididx] == p2m_missing) {
327 /* p2m leaf page is missing */
328 unsigned long *p2m;
329
330 p2m = alloc_p2m_page();
331 if (!p2m)
332 return false;
333
334 p2m_init(p2m);
335
336 if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
337 free_p2m_page(p2m);
338 else
339 mid_mfn[mididx] = virt_to_mfn(p2m);
340 }
341
342 return true;
343}
344
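The comment before alloc_p2m() describes the lock-free pattern used at every level: allocate a candidate page, publish it with cmpxchg against the shared "missing" placeholder, and free the candidate if another CPU won the race. A compact, hedged user-space rendering of that pattern, using GCC's __sync builtin in place of the kernel's cmpxchg() and purely illustrative names, is:

	#include <stdlib.h>

	/* Stand-ins for p2m_mid_missing and p2m_top[topidx]; illustrative only. */
	static void *level_missing = (void *)0x1;
	static void *level_slot = (void *)0x1;

	static void *install_level(void)
	{
		void *candidate = malloc(4096);
		if (!candidate)
			return NULL;

		/* Publish only if the slot still holds the "missing" placeholder. */
		if (!__sync_bool_compare_and_swap(&level_slot, level_missing, candidate)) {
			free(candidate);	/* lost the race ...             */
			candidate = level_slot;	/* ... so use the winner's page  */
		}
		return candidate;
	}

	int main(void)
	{
		return install_level() ? 0 : 1;	/* exit 0 unless allocation failed */
	}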
345/* Try to install p2m mapping; fail if intermediate bits missing */
346bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
347{
348 unsigned topidx, mididx, idx;
349
350 if (unlikely(pfn >= MAX_P2M_PFN)) {
351 BUG_ON(mfn != INVALID_P2M_ENTRY);
352 return true;
353 }
354
355 topidx = p2m_top_index(pfn);
356 mididx = p2m_mid_index(pfn);
357 idx = p2m_index(pfn);
358
359 if (p2m_top[topidx][mididx] == p2m_missing)
360 return mfn == INVALID_P2M_ENTRY;
361
362 p2m_top[topidx][mididx][idx] = mfn;
363
364 return true;
365}
366
367bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
368{
369 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
370 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
371 return true;
372 }
373
374 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
375 if (!alloc_p2m(pfn))
376 return false;
377
378 if (!__set_phys_to_machine(pfn, mfn))
379 return false;
380 }
381
382 return true;
383}
384
385#define M2P_OVERRIDE_HASH_SHIFT 10
386#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT)
387
388static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH);
389static DEFINE_SPINLOCK(m2p_override_lock);
390
391static void __init m2p_override_init(void)
392{
393 unsigned i;
394
395 m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
396 sizeof(unsigned long));
397
398 for (i = 0; i < M2P_OVERRIDE_HASH; i++)
399 INIT_LIST_HEAD(&m2p_overrides[i]);
400}
401
402static unsigned long mfn_hash(unsigned long mfn)
403{
404 return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
405}
406
407/* Add an MFN override for a particular page */
408int m2p_add_override(unsigned long mfn, struct page *page)
409{
410 unsigned long flags;
411 unsigned long pfn;
412 unsigned long address;
413 unsigned level;
414 pte_t *ptep = NULL;
415
416 pfn = page_to_pfn(page);
417 if (!PageHighMem(page)) {
418 address = (unsigned long)__va(pfn << PAGE_SHIFT);
419 ptep = lookup_address(address, &level);
420
421 if (WARN(ptep == NULL || level != PG_LEVEL_4K,
422 "m2p_add_override: pfn %lx not mapped", pfn))
423 return -EINVAL;
424 }
425
426 page->private = mfn;
427 page->index = pfn_to_mfn(pfn);
428
429 __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
430 if (!PageHighMem(page))
431 /* Just zap old mapping for now */
432 pte_clear(&init_mm, address, ptep);
433
434 spin_lock_irqsave(&m2p_override_lock, flags);
435 list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
436 spin_unlock_irqrestore(&m2p_override_lock, flags);
437
438 return 0;
439}
440
441int m2p_remove_override(struct page *page)
442{
443 unsigned long flags;
444 unsigned long mfn;
445 unsigned long pfn;
446 unsigned long address;
447 unsigned level;
448 pte_t *ptep = NULL;
449
450 pfn = page_to_pfn(page);
451 mfn = get_phys_to_machine(pfn);
452 if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT))
453 return -EINVAL;
454
455 if (!PageHighMem(page)) {
456 address = (unsigned long)__va(pfn << PAGE_SHIFT);
457 ptep = lookup_address(address, &level);
458
459 if (WARN(ptep == NULL || level != PG_LEVEL_4K,
460 "m2p_remove_override: pfn %lx not mapped", pfn))
461 return -EINVAL;
462 }
463
464 spin_lock_irqsave(&m2p_override_lock, flags);
465 list_del(&page->lru);
466 spin_unlock_irqrestore(&m2p_override_lock, flags);
467 __set_phys_to_machine(pfn, page->index);
468
469 if (!PageHighMem(page))
470 set_pte_at(&init_mm, address, ptep,
471 pfn_pte(pfn, PAGE_KERNEL));
472 /* No tlb flush necessary because the caller already
473 * left the pte unmapped. */
474
475 return 0;
476}
477
478struct page *m2p_find_override(unsigned long mfn)
479{
480 unsigned long flags;
481 struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)];
482 struct page *p, *ret;
483
484 ret = NULL;
485
486 spin_lock_irqsave(&m2p_override_lock, flags);
487
488 list_for_each_entry(p, bucket, lru) {
489 if (p->private == mfn) {
490 ret = p;
491 break;
492 }
493 }
494
495 spin_unlock_irqrestore(&m2p_override_lock, flags);
496
497 return ret;
498}
499
500unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
501{
502 struct page *p = m2p_find_override(mfn);
503 unsigned long ret = pfn;
504
505 if (p)
506 ret = page_to_pfn(p);
507
508 return ret;
509}
510EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
diff --git a/arch/xtensa/include/asm/mman.h b/arch/xtensa/include/asm/mman.h
index fca4db425f6e..30789010733d 100644
--- a/arch/xtensa/include/asm/mman.h
+++ b/arch/xtensa/include/asm/mman.h
@@ -83,6 +83,9 @@
83#define MADV_MERGEABLE 12 /* KSM may merge identical pages */ 83#define MADV_MERGEABLE 12 /* KSM may merge identical pages */
84#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ 84#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */
85 85
86#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */
87#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */
88
86/* compatibility flags */ 89/* compatibility flags */
87#define MAP_FILE 0 90#define MAP_FILE 0
88 91
diff --git a/drivers/base/node.c b/drivers/base/node.c
index ce012a9c6201..36b43052001d 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -117,12 +117,21 @@ static ssize_t node_read_meminfo(struct sys_device * dev,
117 "Node %d WritebackTmp: %8lu kB\n" 117 "Node %d WritebackTmp: %8lu kB\n"
118 "Node %d Slab: %8lu kB\n" 118 "Node %d Slab: %8lu kB\n"
119 "Node %d SReclaimable: %8lu kB\n" 119 "Node %d SReclaimable: %8lu kB\n"
120 "Node %d SUnreclaim: %8lu kB\n", 120 "Node %d SUnreclaim: %8lu kB\n"
121#ifdef CONFIG_TRANSPARENT_HUGEPAGE
122 "Node %d AnonHugePages: %8lu kB\n"
123#endif
124 ,
121 nid, K(node_page_state(nid, NR_FILE_DIRTY)), 125 nid, K(node_page_state(nid, NR_FILE_DIRTY)),
122 nid, K(node_page_state(nid, NR_WRITEBACK)), 126 nid, K(node_page_state(nid, NR_WRITEBACK)),
123 nid, K(node_page_state(nid, NR_FILE_PAGES)), 127 nid, K(node_page_state(nid, NR_FILE_PAGES)),
124 nid, K(node_page_state(nid, NR_FILE_MAPPED)), 128 nid, K(node_page_state(nid, NR_FILE_MAPPED)),
125 nid, K(node_page_state(nid, NR_ANON_PAGES)), 129 nid, K(node_page_state(nid, NR_ANON_PAGES)
130#ifdef CONFIG_TRANSPARENT_HUGEPAGE
131 + node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) *
132 HPAGE_PMD_NR
133#endif
134 ),
126 nid, K(node_page_state(nid, NR_SHMEM)), 135 nid, K(node_page_state(nid, NR_SHMEM)),
127 nid, node_page_state(nid, NR_KERNEL_STACK) * 136 nid, node_page_state(nid, NR_KERNEL_STACK) *
128 THREAD_SIZE / 1024, 137 THREAD_SIZE / 1024,
@@ -133,7 +142,13 @@ static ssize_t node_read_meminfo(struct sys_device * dev,
133 nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE) + 142 nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE) +
134 node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), 143 node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
135 nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)), 144 nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)),
136 nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE))); 145 nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE))
146#ifdef CONFIG_TRANSPARENT_HUGEPAGE
147 , nid,
148 K(node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) *
149 HPAGE_PMD_NR)
150#endif
151 );
137 n += hugetlb_report_node_meminfo(nid, buf + n); 152 n += hugetlb_report_node_meminfo(nid, buf + n);
138 return n; 153 return n;
139} 154}
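The new AnonHugePages field above counts NR_ANON_TRANSPARENT_HUGEPAGES in units of huge pages and multiplies by HPAGE_PMD_NR so the existing K() pages-to-kB conversion still applies; the same product is folded into AnonPages so that figure keeps covering all anonymous memory. As a hedged worked example, assuming x86 with 4 KB base pages and 2 MB PMD-sized huge pages (HPAGE_PMD_NR = 512), a node holding 3 transparent huge pages would report:

	3 huge pages * 512 base pages * 4 kB = 6144 kB of AnonHugePages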
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index bf1a95e31559..98d9ec85e0eb 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -240,6 +240,30 @@ config DM_MIRROR
240 Allow volume managers to mirror logical volumes, also 240 Allow volume managers to mirror logical volumes, also
241 needed for live data migration tools such as 'pvmove'. 241 needed for live data migration tools such as 'pvmove'.
242 242
243config DM_RAID
244 tristate "RAID 4/5/6 target (EXPERIMENTAL)"
245 depends on BLK_DEV_DM && EXPERIMENTAL
246 select MD_RAID456
247 select BLK_DEV_MD
248 ---help---
249 A dm target that supports RAID4, RAID5 and RAID6 mappings
250
251 A RAID-5 set of N drives with a capacity of C MB per drive provides
252 the capacity of C * (N - 1) MB, and protects against a failure
253 of a single drive. For a given sector (row) number, (N - 1) drives
254 contain data sectors, and one drive contains the parity protection.
255 For a RAID-4 set, the parity blocks are present on a single drive,
256 while a RAID-5 set distributes the parity across the drives in one
257 of the available parity distribution methods.
258
259 A RAID-6 set of N drives with a capacity of C MB per drive
260 provides the capacity of C * (N - 2) MB, and protects
261 against a failure of any two drives. For a given sector
262 (row) number, (N - 2) drives contain data sectors, and two
263 drives contain two independent redundancy syndromes. Like
264 RAID-5, RAID-6 distributes the syndromes across the drives
265 in one of the available parity distribution methods.
266
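A hedged worked example of the capacity formulas in the help text above, with purely illustrative numbers: for N = 5 drives of C = 1000 MB each,

	RAID-5: C * (N - 1) = 1000 * 4 = 4000 MB usable, tolerating one failed drive
	RAID-6: C * (N - 2) = 1000 * 3 = 3000 MB usable, tolerating any two failed drives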
243config DM_LOG_USERSPACE 267config DM_LOG_USERSPACE
244 tristate "Mirror userspace logging (EXPERIMENTAL)" 268 tristate "Mirror userspace logging (EXPERIMENTAL)"
245 depends on DM_MIRROR && EXPERIMENTAL && NET 269 depends on DM_MIRROR && EXPERIMENTAL && NET
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 5e3aac41919d..d0138606c2e8 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
36obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o 36obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
37obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o 37obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
38obj-$(CONFIG_DM_ZERO) += dm-zero.o 38obj-$(CONFIG_DM_ZERO) += dm-zero.o
39obj-$(CONFIG_DM_RAID) += dm-raid.o
39 40
40ifeq ($(CONFIG_DM_UEVENT),y) 41ifeq ($(CONFIG_DM_UEVENT),y)
41dm-mod-objs += dm-uevent.o 42dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 5a1ffe3527aa..9a35320fb59f 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -210,11 +210,11 @@ static struct page *read_sb_page(mddev_t *mddev, loff_t offset,
210 || test_bit(Faulty, &rdev->flags)) 210 || test_bit(Faulty, &rdev->flags))
211 continue; 211 continue;
212 212
213 target = rdev->sb_start + offset + index * (PAGE_SIZE/512); 213 target = offset + index * (PAGE_SIZE/512);
214 214
215 if (sync_page_io(rdev, target, 215 if (sync_page_io(rdev, target,
216 roundup(size, bdev_logical_block_size(rdev->bdev)), 216 roundup(size, bdev_logical_block_size(rdev->bdev)),
217 page, READ)) { 217 page, READ, true)) {
218 page->index = index; 218 page->index = index;
219 attach_page_buffers(page, NULL); /* so that free_buffer will 219 attach_page_buffers(page, NULL); /* so that free_buffer will
220 * quietly no-op */ 220 * quietly no-op */
@@ -264,14 +264,18 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
264static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) 264static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
265{ 265{
266 mdk_rdev_t *rdev = NULL; 266 mdk_rdev_t *rdev = NULL;
267 struct block_device *bdev;
267 mddev_t *mddev = bitmap->mddev; 268 mddev_t *mddev = bitmap->mddev;
268 269
269 while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { 270 while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
270 int size = PAGE_SIZE; 271 int size = PAGE_SIZE;
271 loff_t offset = mddev->bitmap_info.offset; 272 loff_t offset = mddev->bitmap_info.offset;
273
274 bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
275
272 if (page->index == bitmap->file_pages-1) 276 if (page->index == bitmap->file_pages-1)
273 size = roundup(bitmap->last_page_size, 277 size = roundup(bitmap->last_page_size,
274 bdev_logical_block_size(rdev->bdev)); 278 bdev_logical_block_size(bdev));
275 /* Just make sure we aren't corrupting data or 279 /* Just make sure we aren't corrupting data or
276 * metadata 280 * metadata
277 */ 281 */
@@ -1542,7 +1546,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
1542 wait_event(bitmap->mddev->recovery_wait, 1546 wait_event(bitmap->mddev->recovery_wait,
1543 atomic_read(&bitmap->mddev->recovery_active) == 0); 1547 atomic_read(&bitmap->mddev->recovery_active) == 0);
1544 1548
1545 bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync; 1549 bitmap->mddev->curr_resync_completed = sector;
1546 set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); 1550 set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
1547 sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); 1551 sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1);
1548 s = 0; 1552 s = 0;
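
Two related changes here: read_sb_page() now passes an offset relative to the superblock (the extra boolean argument to sync_page_io() appears to mark the I/O as a metadata operation, so the rdev->sb_start adjustment moves into that helper), and write_sb_page() sizes its final page against the metadata device when rdev->meta_bdev is set. The rounding itself is plain integer arithmetic; a small sketch, assuming a hypothetical 3000-byte last page and 512- or 4096-byte logical blocks:

#include <stdio.h>

/* same rounding as roundup(size, bdev_logical_block_size(bdev)) in the hunk */
#define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	unsigned last_page_bytes = 3000;   /* hypothetical bitmap->last_page_size */

	printf("512B  blocks: write %u bytes\n", ROUNDUP(last_page_bytes, 512));
	printf("4096B blocks: write %u bytes\n", ROUNDUP(last_page_bytes, 4096));
	return 0;
}
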
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index d5b0e4c0e702..4e054bd91664 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -18,10 +18,14 @@
18#include <linux/crypto.h> 18#include <linux/crypto.h>
19#include <linux/workqueue.h> 19#include <linux/workqueue.h>
20#include <linux/backing-dev.h> 20#include <linux/backing-dev.h>
21#include <linux/percpu.h>
21#include <asm/atomic.h> 22#include <asm/atomic.h>
22#include <linux/scatterlist.h> 23#include <linux/scatterlist.h>
23#include <asm/page.h> 24#include <asm/page.h>
24#include <asm/unaligned.h> 25#include <asm/unaligned.h>
26#include <crypto/hash.h>
27#include <crypto/md5.h>
28#include <crypto/algapi.h>
25 29
26#include <linux/device-mapper.h> 30#include <linux/device-mapper.h>
27 31
@@ -63,6 +67,7 @@ struct dm_crypt_request {
63 struct convert_context *ctx; 67 struct convert_context *ctx;
64 struct scatterlist sg_in; 68 struct scatterlist sg_in;
65 struct scatterlist sg_out; 69 struct scatterlist sg_out;
70 sector_t iv_sector;
66}; 71};
67 72
68struct crypt_config; 73struct crypt_config;
@@ -73,11 +78,13 @@ struct crypt_iv_operations {
73 void (*dtr)(struct crypt_config *cc); 78 void (*dtr)(struct crypt_config *cc);
74 int (*init)(struct crypt_config *cc); 79 int (*init)(struct crypt_config *cc);
75 int (*wipe)(struct crypt_config *cc); 80 int (*wipe)(struct crypt_config *cc);
76 int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); 81 int (*generator)(struct crypt_config *cc, u8 *iv,
82 struct dm_crypt_request *dmreq);
83 int (*post)(struct crypt_config *cc, u8 *iv,
84 struct dm_crypt_request *dmreq);
77}; 85};
78 86
79struct iv_essiv_private { 87struct iv_essiv_private {
80 struct crypto_cipher *tfm;
81 struct crypto_hash *hash_tfm; 88 struct crypto_hash *hash_tfm;
82 u8 *salt; 89 u8 *salt;
83}; 90};
@@ -86,11 +93,32 @@ struct iv_benbi_private {
86 int shift; 93 int shift;
87}; 94};
88 95
96#define LMK_SEED_SIZE 64 /* hash + 0 */
97struct iv_lmk_private {
98 struct crypto_shash *hash_tfm;
99 u8 *seed;
100};
101
89/* 102/*
90 * Crypt: maps a linear range of a block device 103 * Crypt: maps a linear range of a block device
91 * and encrypts / decrypts at the same time. 104 * and encrypts / decrypts at the same time.
92 */ 105 */
93enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; 106enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
107
108/*
109 * Duplicated per-CPU state for cipher.
110 */
111struct crypt_cpu {
112 struct ablkcipher_request *req;
113 /* ESSIV: struct crypto_cipher *essiv_tfm */
114 void *iv_private;
115 struct crypto_ablkcipher *tfms[0];
116};
117
118/*
119 * The fields in here must be read only after initialization;
120 * any state that changes belongs in struct crypt_cpu.
121 */
94struct crypt_config { 122struct crypt_config {
95 struct dm_dev *dev; 123 struct dm_dev *dev;
96 sector_t start; 124 sector_t start;
@@ -108,17 +136,25 @@ struct crypt_config {
108 struct workqueue_struct *crypt_queue; 136 struct workqueue_struct *crypt_queue;
109 137
110 char *cipher; 138 char *cipher;
111 char *cipher_mode; 139 char *cipher_string;
112 140
113 struct crypt_iv_operations *iv_gen_ops; 141 struct crypt_iv_operations *iv_gen_ops;
114 union { 142 union {
115 struct iv_essiv_private essiv; 143 struct iv_essiv_private essiv;
116 struct iv_benbi_private benbi; 144 struct iv_benbi_private benbi;
145 struct iv_lmk_private lmk;
117 } iv_gen_private; 146 } iv_gen_private;
118 sector_t iv_offset; 147 sector_t iv_offset;
119 unsigned int iv_size; 148 unsigned int iv_size;
120 149
121 /* 150 /*
151 * Duplicated per cpu state. Access through
152 * per_cpu_ptr() only.
153 */
154 struct crypt_cpu __percpu *cpu;
155 unsigned tfms_count;
156
157 /*
122 * Layout of each crypto request: 158 * Layout of each crypto request:
123 * 159 *
124 * struct ablkcipher_request 160 * struct ablkcipher_request
@@ -132,11 +168,10 @@ struct crypt_config {
132 * correctly aligned. 168 * correctly aligned.
133 */ 169 */
134 unsigned int dmreq_start; 170 unsigned int dmreq_start;
135 struct ablkcipher_request *req;
136 171
137 struct crypto_ablkcipher *tfm;
138 unsigned long flags; 172 unsigned long flags;
139 unsigned int key_size; 173 unsigned int key_size;
174 unsigned int key_parts;
140 u8 key[0]; 175 u8 key[0];
141}; 176};
142 177
@@ -148,6 +183,20 @@ static struct kmem_cache *_crypt_io_pool;
148 183
149static void clone_init(struct dm_crypt_io *, struct bio *); 184static void clone_init(struct dm_crypt_io *, struct bio *);
150static void kcryptd_queue_crypt(struct dm_crypt_io *io); 185static void kcryptd_queue_crypt(struct dm_crypt_io *io);
186static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
187
188static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
189{
190 return this_cpu_ptr(cc->cpu);
191}
192
193/*
194 * Use this to access cipher attributes that are the same for each CPU.
195 */
196static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
197{
198 return __this_cpu_ptr(cc->cpu)->tfms[0];
199}
151 200
152/* 201/*
153 * Different IV generation algorithms: 202 * Different IV generation algorithms:
@@ -168,23 +217,38 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io);
168 * null: the initial vector is always zero. Provides compatibility with 217 * null: the initial vector is always zero. Provides compatibility with
169 * obsolete loop_fish2 devices. Do not use for new devices. 218 * obsolete loop_fish2 devices. Do not use for new devices.
170 * 219 *
220 * lmk: Compatible implementation of the block chaining mode used
221 * by the Loop-AES block device encryption system
222 * designed by Jari Ruusu. See http://loop-aes.sourceforge.net/
223 * It operates on full 512 byte sectors and uses CBC
224 * with an IV derived from the sector number, the data and
225 * optionally an extra IV seed.
226 * This means that after decryption the first block
227 * of the sector must be tweaked according to the decrypted data.
228 * Loop-AES can use three encryption schemes:
229 * version 1: plain aes-cbc mode
230 * version 2: uses a 64-key (multikey) scheme with the lmk IV generator
231 * version 3: the same as version 2 with an additional IV seed
232 * (it uses 65 keys; the last key is used as the IV seed)
233 *
171 * plumb: unimplemented, see: 234 * plumb: unimplemented, see:
172 * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 235 * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
173 */ 236 */
174 237
175static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 238static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
239 struct dm_crypt_request *dmreq)
176{ 240{
177 memset(iv, 0, cc->iv_size); 241 memset(iv, 0, cc->iv_size);
178 *(u32 *)iv = cpu_to_le32(sector & 0xffffffff); 242 *(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);
179 243
180 return 0; 244 return 0;
181} 245}
182 246
183static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, 247static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
184 sector_t sector) 248 struct dm_crypt_request *dmreq)
185{ 249{
186 memset(iv, 0, cc->iv_size); 250 memset(iv, 0, cc->iv_size);
187 *(u64 *)iv = cpu_to_le64(sector); 251 *(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
188 252
189 return 0; 253 return 0;
190} 254}
@@ -195,7 +259,8 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
195 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; 259 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
196 struct hash_desc desc; 260 struct hash_desc desc;
197 struct scatterlist sg; 261 struct scatterlist sg;
198 int err; 262 struct crypto_cipher *essiv_tfm;
263 int err, cpu;
199 264
200 sg_init_one(&sg, cc->key, cc->key_size); 265 sg_init_one(&sg, cc->key, cc->key_size);
201 desc.tfm = essiv->hash_tfm; 266 desc.tfm = essiv->hash_tfm;
@@ -205,8 +270,16 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
205 if (err) 270 if (err)
206 return err; 271 return err;
207 272
208 return crypto_cipher_setkey(essiv->tfm, essiv->salt, 273 for_each_possible_cpu(cpu) {
274 essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private,
275
276 err = crypto_cipher_setkey(essiv_tfm, essiv->salt,
209 crypto_hash_digestsize(essiv->hash_tfm)); 277 crypto_hash_digestsize(essiv->hash_tfm));
278 if (err)
279 return err;
280 }
281
282 return 0;
210} 283}
211 284
212/* Wipe salt and reset key derived from volume key */ 285/* Wipe salt and reset key derived from volume key */
@@ -214,24 +287,76 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc)
214{ 287{
215 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; 288 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
216 unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); 289 unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
290 struct crypto_cipher *essiv_tfm;
291 int cpu, r, err = 0;
217 292
218 memset(essiv->salt, 0, salt_size); 293 memset(essiv->salt, 0, salt_size);
219 294
220 return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size); 295 for_each_possible_cpu(cpu) {
296 essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private;
297 r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size);
298 if (r)
299 err = r;
300 }
301
302 return err;
303}
304
305/* Set up per cpu cipher state */
306static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc,
307 struct dm_target *ti,
308 u8 *salt, unsigned saltsize)
309{
310 struct crypto_cipher *essiv_tfm;
311 int err;
312
313 /* Setup the essiv_tfm with the given salt */
314 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
315 if (IS_ERR(essiv_tfm)) {
316 ti->error = "Error allocating crypto tfm for ESSIV";
317 return essiv_tfm;
318 }
319
320 if (crypto_cipher_blocksize(essiv_tfm) !=
321 crypto_ablkcipher_ivsize(any_tfm(cc))) {
322 ti->error = "Block size of ESSIV cipher does "
323 "not match IV size of block cipher";
324 crypto_free_cipher(essiv_tfm);
325 return ERR_PTR(-EINVAL);
326 }
327
328 err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
329 if (err) {
330 ti->error = "Failed to set key for ESSIV cipher";
331 crypto_free_cipher(essiv_tfm);
332 return ERR_PTR(err);
333 }
334
335 return essiv_tfm;
221} 336}
222 337
223static void crypt_iv_essiv_dtr(struct crypt_config *cc) 338static void crypt_iv_essiv_dtr(struct crypt_config *cc)
224{ 339{
340 int cpu;
341 struct crypt_cpu *cpu_cc;
342 struct crypto_cipher *essiv_tfm;
225 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; 343 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
226 344
227 crypto_free_cipher(essiv->tfm);
228 essiv->tfm = NULL;
229
230 crypto_free_hash(essiv->hash_tfm); 345 crypto_free_hash(essiv->hash_tfm);
231 essiv->hash_tfm = NULL; 346 essiv->hash_tfm = NULL;
232 347
233 kzfree(essiv->salt); 348 kzfree(essiv->salt);
234 essiv->salt = NULL; 349 essiv->salt = NULL;
350
351 for_each_possible_cpu(cpu) {
352 cpu_cc = per_cpu_ptr(cc->cpu, cpu);
353 essiv_tfm = cpu_cc->iv_private;
354
355 if (essiv_tfm)
356 crypto_free_cipher(essiv_tfm);
357
358 cpu_cc->iv_private = NULL;
359 }
235} 360}
236 361
237static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, 362static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
@@ -240,7 +365,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
240 struct crypto_cipher *essiv_tfm = NULL; 365 struct crypto_cipher *essiv_tfm = NULL;
241 struct crypto_hash *hash_tfm = NULL; 366 struct crypto_hash *hash_tfm = NULL;
242 u8 *salt = NULL; 367 u8 *salt = NULL;
243 int err; 368 int err, cpu;
244 369
245 if (!opts) { 370 if (!opts) {
246 ti->error = "Digest algorithm missing for ESSIV mode"; 371 ti->error = "Digest algorithm missing for ESSIV mode";
@@ -262,48 +387,44 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
262 goto bad; 387 goto bad;
263 } 388 }
264 389
265 /* Allocate essiv_tfm */
266 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
267 if (IS_ERR(essiv_tfm)) {
268 ti->error = "Error allocating crypto tfm for ESSIV";
269 err = PTR_ERR(essiv_tfm);
270 goto bad;
271 }
272 if (crypto_cipher_blocksize(essiv_tfm) !=
273 crypto_ablkcipher_ivsize(cc->tfm)) {
274 ti->error = "Block size of ESSIV cipher does "
275 "not match IV size of block cipher";
276 err = -EINVAL;
277 goto bad;
278 }
279
280 cc->iv_gen_private.essiv.salt = salt; 390 cc->iv_gen_private.essiv.salt = salt;
281 cc->iv_gen_private.essiv.tfm = essiv_tfm;
282 cc->iv_gen_private.essiv.hash_tfm = hash_tfm; 391 cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
283 392
393 for_each_possible_cpu(cpu) {
394 essiv_tfm = setup_essiv_cpu(cc, ti, salt,
395 crypto_hash_digestsize(hash_tfm));
396 if (IS_ERR(essiv_tfm)) {
397 crypt_iv_essiv_dtr(cc);
398 return PTR_ERR(essiv_tfm);
399 }
400 per_cpu_ptr(cc->cpu, cpu)->iv_private = essiv_tfm;
401 }
402
284 return 0; 403 return 0;
285 404
286bad: 405bad:
287 if (essiv_tfm && !IS_ERR(essiv_tfm))
288 crypto_free_cipher(essiv_tfm);
289 if (hash_tfm && !IS_ERR(hash_tfm)) 406 if (hash_tfm && !IS_ERR(hash_tfm))
290 crypto_free_hash(hash_tfm); 407 crypto_free_hash(hash_tfm);
291 kfree(salt); 408 kfree(salt);
292 return err; 409 return err;
293} 410}
294 411
295static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 412static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
413 struct dm_crypt_request *dmreq)
296{ 414{
415 struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private;
416
297 memset(iv, 0, cc->iv_size); 417 memset(iv, 0, cc->iv_size);
298 *(u64 *)iv = cpu_to_le64(sector); 418 *(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
299 crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv); 419 crypto_cipher_encrypt_one(essiv_tfm, iv, iv);
420
300 return 0; 421 return 0;
301} 422}
302 423
303static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, 424static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
304 const char *opts) 425 const char *opts)
305{ 426{
306 unsigned bs = crypto_ablkcipher_blocksize(cc->tfm); 427 unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc));
307 int log = ilog2(bs); 428 int log = ilog2(bs);
308 429
309 /* we need to calculate how far we must shift the sector count 430 /* we need to calculate how far we must shift the sector count
@@ -328,25 +449,177 @@ static void crypt_iv_benbi_dtr(struct crypt_config *cc)
328{ 449{
329} 450}
330 451
331static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 452static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv,
453 struct dm_crypt_request *dmreq)
332{ 454{
333 __be64 val; 455 __be64 val;
334 456
335 memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ 457 memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
336 458
337 val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1); 459 val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1);
338 put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); 460 put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
339 461
340 return 0; 462 return 0;
341} 463}
342 464
343static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 465static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv,
466 struct dm_crypt_request *dmreq)
344{ 467{
345 memset(iv, 0, cc->iv_size); 468 memset(iv, 0, cc->iv_size);
346 469
347 return 0; 470 return 0;
348} 471}
349 472
473static void crypt_iv_lmk_dtr(struct crypt_config *cc)
474{
475 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
476
477 if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm))
478 crypto_free_shash(lmk->hash_tfm);
479 lmk->hash_tfm = NULL;
480
481 kzfree(lmk->seed);
482 lmk->seed = NULL;
483}
484
485static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti,
486 const char *opts)
487{
488 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
489
490 lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0);
491 if (IS_ERR(lmk->hash_tfm)) {
492 ti->error = "Error initializing LMK hash";
493 return PTR_ERR(lmk->hash_tfm);
494 }
495
496 /* No seed in LMK version 2 */
497 if (cc->key_parts == cc->tfms_count) {
498 lmk->seed = NULL;
499 return 0;
500 }
501
502 lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL);
503 if (!lmk->seed) {
504 crypt_iv_lmk_dtr(cc);
505 ti->error = "Error kmallocing seed storage in LMK";
506 return -ENOMEM;
507 }
508
509 return 0;
510}
511
512static int crypt_iv_lmk_init(struct crypt_config *cc)
513{
514 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
515 int subkey_size = cc->key_size / cc->key_parts;
516
517 /* The LMK seed is stored in the position of the (LMK_KEYS + 1)-th key */
518 if (lmk->seed)
519 memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size),
520 crypto_shash_digestsize(lmk->hash_tfm));
521
522 return 0;
523}
524
525static int crypt_iv_lmk_wipe(struct crypt_config *cc)
526{
527 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
528
529 if (lmk->seed)
530 memset(lmk->seed, 0, LMK_SEED_SIZE);
531
532 return 0;
533}
534
535static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
536 struct dm_crypt_request *dmreq,
537 u8 *data)
538{
539 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
540 struct {
541 struct shash_desc desc;
542 char ctx[crypto_shash_descsize(lmk->hash_tfm)];
543 } sdesc;
544 struct md5_state md5state;
545 u32 buf[4];
546 int i, r;
547
548 sdesc.desc.tfm = lmk->hash_tfm;
549 sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
550
551 r = crypto_shash_init(&sdesc.desc);
552 if (r)
553 return r;
554
555 if (lmk->seed) {
556 r = crypto_shash_update(&sdesc.desc, lmk->seed, LMK_SEED_SIZE);
557 if (r)
558 return r;
559 }
560
561 /* Sector is always 512B, block size 16, add data of blocks 1-31 */
562 r = crypto_shash_update(&sdesc.desc, data + 16, 16 * 31);
563 if (r)
564 return r;
565
566 /* Sector is cropped to 56 bits here */
567 buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF);
568 buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000);
569 buf[2] = cpu_to_le32(4024);
570 buf[3] = 0;
571 r = crypto_shash_update(&sdesc.desc, (u8 *)buf, sizeof(buf));
572 if (r)
573 return r;
574
575 /* No MD5 padding here */
576 r = crypto_shash_export(&sdesc.desc, &md5state);
577 if (r)
578 return r;
579
580 for (i = 0; i < MD5_HASH_WORDS; i++)
581 __cpu_to_le32s(&md5state.hash[i]);
582 memcpy(iv, &md5state.hash, cc->iv_size);
583
584 return 0;
585}
586
587static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv,
588 struct dm_crypt_request *dmreq)
589{
590 u8 *src;
591 int r = 0;
592
593 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
594 src = kmap_atomic(sg_page(&dmreq->sg_in), KM_USER0);
595 r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset);
596 kunmap_atomic(src, KM_USER0);
597 } else
598 memset(iv, 0, cc->iv_size);
599
600 return r;
601}
602
603static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
604 struct dm_crypt_request *dmreq)
605{
606 u8 *dst;
607 int r;
608
609 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE)
610 return 0;
611
612 dst = kmap_atomic(sg_page(&dmreq->sg_out), KM_USER0);
613 r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset);
614
615 /* Tweak the first block of plaintext sector */
616 if (!r)
617 crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size);
618
619 kunmap_atomic(dst, KM_USER0);
620 return r;
621}
622
350static struct crypt_iv_operations crypt_iv_plain_ops = { 623static struct crypt_iv_operations crypt_iv_plain_ops = {
351 .generator = crypt_iv_plain_gen 624 .generator = crypt_iv_plain_gen
352}; 625};
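
The trailer that crypt_iv_lmk_one() hashes after the sector payload packs the sector number "cropped to 56 bits": the low 32 bits, then the next 24 bits OR-ed with 0x80000000, then the constants 4024 and 0. A standalone sketch of just that packing (the kernel additionally forces little-endian byte order with cpu_to_le32(); the MD5 state export is omitted):

#include <stdint.h>
#include <stdio.h>

/* build the 16-byte trailer hashed after the sector data (little-endian words) */
static void lmk_pack_sector(uint64_t sector, uint32_t buf[4])
{
	buf[0] = (uint32_t)(sector & 0xFFFFFFFF);
	buf[1] = (uint32_t)(((sector >> 32) & 0x00FFFFFF) | 0x80000000);
	buf[2] = 4024;
	buf[3] = 0;
}

int main(void)
{
	uint32_t buf[4];

	lmk_pack_sector(0x123456789AULL, buf);   /* example sector number */
	printf("%08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3]);
	return 0;
}
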
@@ -373,6 +646,15 @@ static struct crypt_iv_operations crypt_iv_null_ops = {
373 .generator = crypt_iv_null_gen 646 .generator = crypt_iv_null_gen
374}; 647};
375 648
649static struct crypt_iv_operations crypt_iv_lmk_ops = {
650 .ctr = crypt_iv_lmk_ctr,
651 .dtr = crypt_iv_lmk_dtr,
652 .init = crypt_iv_lmk_init,
653 .wipe = crypt_iv_lmk_wipe,
654 .generator = crypt_iv_lmk_gen,
655 .post = crypt_iv_lmk_post
656};
657
376static void crypt_convert_init(struct crypt_config *cc, 658static void crypt_convert_init(struct crypt_config *cc,
377 struct convert_context *ctx, 659 struct convert_context *ctx,
378 struct bio *bio_out, struct bio *bio_in, 660 struct bio *bio_out, struct bio *bio_in,
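
The new .post callback in crypt_iv_operations is optional: only the lmk generator provides one, and callers check it for NULL before invoking it (see crypt_convert_block() and kcryptd_async_done() further down in this diff). A reduced sketch of that optional-hook pattern, with made-up names, outside the kernel:

#include <stddef.h>

struct iv_ops {                         /* hypothetical, mirrors crypt_iv_operations */
	int (*generator)(unsigned char *iv, unsigned long sector);
	int (*post)(unsigned char *iv, unsigned char *data); /* may be NULL */
};

static int gen_plain(unsigned char *iv, unsigned long sector)
{
	iv[0] = sector & 0xff;          /* toy generator */
	return 0;
}

static int run_block(const struct iv_ops *ops, unsigned char *iv,
		     unsigned long sector, unsigned char *data)
{
	int r = ops->generator(iv, sector);

	if (!r && ops->post)            /* post-processing only if the mode needs it */
		r = ops->post(iv, data);
	return r;
}

int main(void)
{
	const struct iv_ops plain = { .generator = gen_plain, .post = NULL };
	unsigned char iv[16] = { 0 }, data[16] = { 0 };

	return run_block(&plain, iv, 0x1234, data);
}
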
@@ -400,6 +682,13 @@ static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc,
400 return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start); 682 return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start);
401} 683}
402 684
685static u8 *iv_of_dmreq(struct crypt_config *cc,
686 struct dm_crypt_request *dmreq)
687{
688 return (u8 *)ALIGN((unsigned long)(dmreq + 1),
689 crypto_ablkcipher_alignmask(any_tfm(cc)) + 1);
690}
691
403static int crypt_convert_block(struct crypt_config *cc, 692static int crypt_convert_block(struct crypt_config *cc,
404 struct convert_context *ctx, 693 struct convert_context *ctx,
405 struct ablkcipher_request *req) 694 struct ablkcipher_request *req)
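
iv_of_dmreq() recomputes where the IV lives inside each pre-allocated request: immediately after struct dm_crypt_request, rounded up to the cipher's alignment (alignmask + 1). A userspace sketch of the ALIGN() arithmetic with an assumed 16-byte alignment (the real value comes from crypto_ablkcipher_alignmask()):

#include <stdint.h>
#include <stdio.h>

#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((uintptr_t)(a) - 1))

struct dm_crypt_request_stub { void *ctx; uint64_t iv_sector; }; /* stand-in */

int main(void)
{
	unsigned char buf[256];
	struct dm_crypt_request_stub *dmreq = (void *)buf;
	unsigned align = 16;            /* assumed: cipher alignmask + 1 */
	unsigned char *iv = (unsigned char *)ALIGN_UP((uintptr_t)(dmreq + 1), align);

	printf("dmreq at %p, iv at %p (offset %zu into the buffer)\n",
	       (void *)dmreq, (void *)iv, (size_t)(iv - buf));
	return 0;
}
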
@@ -411,9 +700,9 @@ static int crypt_convert_block(struct crypt_config *cc,
411 int r = 0; 700 int r = 0;
412 701
413 dmreq = dmreq_of_req(cc, req); 702 dmreq = dmreq_of_req(cc, req);
414 iv = (u8 *)ALIGN((unsigned long)(dmreq + 1), 703 iv = iv_of_dmreq(cc, dmreq);
415 crypto_ablkcipher_alignmask(cc->tfm) + 1);
416 704
705 dmreq->iv_sector = ctx->sector;
417 dmreq->ctx = ctx; 706 dmreq->ctx = ctx;
418 sg_init_table(&dmreq->sg_in, 1); 707 sg_init_table(&dmreq->sg_in, 1);
419 sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, 708 sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT,
@@ -436,7 +725,7 @@ static int crypt_convert_block(struct crypt_config *cc,
436 } 725 }
437 726
438 if (cc->iv_gen_ops) { 727 if (cc->iv_gen_ops) {
439 r = cc->iv_gen_ops->generator(cc, iv, ctx->sector); 728 r = cc->iv_gen_ops->generator(cc, iv, dmreq);
440 if (r < 0) 729 if (r < 0)
441 return r; 730 return r;
442 } 731 }
@@ -449,21 +738,28 @@ static int crypt_convert_block(struct crypt_config *cc,
449 else 738 else
450 r = crypto_ablkcipher_decrypt(req); 739 r = crypto_ablkcipher_decrypt(req);
451 740
741 if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
742 r = cc->iv_gen_ops->post(cc, iv, dmreq);
743
452 return r; 744 return r;
453} 745}
454 746
455static void kcryptd_async_done(struct crypto_async_request *async_req, 747static void kcryptd_async_done(struct crypto_async_request *async_req,
456 int error); 748 int error);
749
457static void crypt_alloc_req(struct crypt_config *cc, 750static void crypt_alloc_req(struct crypt_config *cc,
458 struct convert_context *ctx) 751 struct convert_context *ctx)
459{ 752{
460 if (!cc->req) 753 struct crypt_cpu *this_cc = this_crypt_config(cc);
461 cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); 754 unsigned key_index = ctx->sector & (cc->tfms_count - 1);
462 ablkcipher_request_set_tfm(cc->req, cc->tfm); 755
463 ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG | 756 if (!this_cc->req)
464 CRYPTO_TFM_REQ_MAY_SLEEP, 757 this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
465 kcryptd_async_done, 758
466 dmreq_of_req(cc, cc->req)); 759 ablkcipher_request_set_tfm(this_cc->req, this_cc->tfms[key_index]);
760 ablkcipher_request_set_callback(this_cc->req,
761 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
762 kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
467} 763}
468 764
469/* 765/*
@@ -472,6 +768,7 @@ static void crypt_alloc_req(struct crypt_config *cc,
472static int crypt_convert(struct crypt_config *cc, 768static int crypt_convert(struct crypt_config *cc,
473 struct convert_context *ctx) 769 struct convert_context *ctx)
474{ 770{
771 struct crypt_cpu *this_cc = this_crypt_config(cc);
475 int r; 772 int r;
476 773
477 atomic_set(&ctx->pending, 1); 774 atomic_set(&ctx->pending, 1);
@@ -483,7 +780,7 @@ static int crypt_convert(struct crypt_config *cc,
483 780
484 atomic_inc(&ctx->pending); 781 atomic_inc(&ctx->pending);
485 782
486 r = crypt_convert_block(cc, ctx, cc->req); 783 r = crypt_convert_block(cc, ctx, this_cc->req);
487 784
488 switch (r) { 785 switch (r) {
489 /* async */ 786 /* async */
@@ -492,7 +789,7 @@ static int crypt_convert(struct crypt_config *cc,
492 INIT_COMPLETION(ctx->restart); 789 INIT_COMPLETION(ctx->restart);
493 /* fall through*/ 790 /* fall through*/
494 case -EINPROGRESS: 791 case -EINPROGRESS:
495 cc->req = NULL; 792 this_cc->req = NULL;
496 ctx->sector++; 793 ctx->sector++;
497 continue; 794 continue;
498 795
@@ -651,6 +948,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
651 * They must be separated as otherwise the final stages could be 948 * They must be separated as otherwise the final stages could be
652 * starved by new requests which can block in the first stages due 949 * starved by new requests which can block in the first stages due
653 * to memory allocation. 950 * to memory allocation.
951 *
952 * The work is done per CPU, globally for all dm-crypt instances.
953 * They should not depend on each other and do not block.
654 */ 954 */
655static void crypt_endio(struct bio *clone, int error) 955static void crypt_endio(struct bio *clone, int error)
656{ 956{
@@ -691,26 +991,30 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
691 clone->bi_destructor = dm_crypt_bio_destructor; 991 clone->bi_destructor = dm_crypt_bio_destructor;
692} 992}
693 993
694static void kcryptd_io_read(struct dm_crypt_io *io) 994static void kcryptd_unplug(struct crypt_config *cc)
995{
996 blk_unplug(bdev_get_queue(cc->dev->bdev));
997}
998
999static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
695{ 1000{
696 struct crypt_config *cc = io->target->private; 1001 struct crypt_config *cc = io->target->private;
697 struct bio *base_bio = io->base_bio; 1002 struct bio *base_bio = io->base_bio;
698 struct bio *clone; 1003 struct bio *clone;
699 1004
700 crypt_inc_pending(io);
701
702 /* 1005 /*
703 * The block layer might modify the bvec array, so always 1006 * The block layer might modify the bvec array, so always
704 * copy the required bvecs because we need the original 1007 * copy the required bvecs because we need the original
705 * one in order to decrypt the whole bio data *afterwards*. 1008 * one in order to decrypt the whole bio data *afterwards*.
706 */ 1009 */
707 clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs); 1010 clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs);
708 if (unlikely(!clone)) { 1011 if (!clone) {
709 io->error = -ENOMEM; 1012 kcryptd_unplug(cc);
710 crypt_dec_pending(io); 1013 return 1;
711 return;
712 } 1014 }
713 1015
1016 crypt_inc_pending(io);
1017
714 clone_init(io, clone); 1018 clone_init(io, clone);
715 clone->bi_idx = 0; 1019 clone->bi_idx = 0;
716 clone->bi_vcnt = bio_segments(base_bio); 1020 clone->bi_vcnt = bio_segments(base_bio);
@@ -720,6 +1024,7 @@ static void kcryptd_io_read(struct dm_crypt_io *io)
720 sizeof(struct bio_vec) * clone->bi_vcnt); 1024 sizeof(struct bio_vec) * clone->bi_vcnt);
721 1025
722 generic_make_request(clone); 1026 generic_make_request(clone);
1027 return 0;
723} 1028}
724 1029
725static void kcryptd_io_write(struct dm_crypt_io *io) 1030static void kcryptd_io_write(struct dm_crypt_io *io)
@@ -732,9 +1037,12 @@ static void kcryptd_io(struct work_struct *work)
732{ 1037{
733 struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); 1038 struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
734 1039
735 if (bio_data_dir(io->base_bio) == READ) 1040 if (bio_data_dir(io->base_bio) == READ) {
736 kcryptd_io_read(io); 1041 crypt_inc_pending(io);
737 else 1042 if (kcryptd_io_read(io, GFP_NOIO))
1043 io->error = -ENOMEM;
1044 crypt_dec_pending(io);
1045 } else
738 kcryptd_io_write(io); 1046 kcryptd_io_write(io);
739} 1047}
740 1048
@@ -901,6 +1209,9 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
901 return; 1209 return;
902 } 1210 }
903 1211
1212 if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
1213 error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
1214
904 mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); 1215 mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
905 1216
906 if (!atomic_dec_and_test(&ctx->pending)) 1217 if (!atomic_dec_and_test(&ctx->pending))
@@ -971,34 +1282,84 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
971 } 1282 }
972} 1283}
973 1284
974static int crypt_set_key(struct crypt_config *cc, char *key) 1285static void crypt_free_tfms(struct crypt_config *cc, int cpu)
975{ 1286{
976 unsigned key_size = strlen(key) >> 1; 1287 struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
1288 unsigned i;
977 1289
978 if (cc->key_size && cc->key_size != key_size) 1290 for (i = 0; i < cc->tfms_count; i++)
1291 if (cpu_cc->tfms[i] && !IS_ERR(cpu_cc->tfms[i])) {
1292 crypto_free_ablkcipher(cpu_cc->tfms[i]);
1293 cpu_cc->tfms[i] = NULL;
1294 }
1295}
1296
1297static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode)
1298{
1299 struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
1300 unsigned i;
1301 int err;
1302
1303 for (i = 0; i < cc->tfms_count; i++) {
1304 cpu_cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0);
1305 if (IS_ERR(cpu_cc->tfms[i])) {
1306 err = PTR_ERR(cpu_cc->tfms[i]);
1307 crypt_free_tfms(cc, cpu);
1308 return err;
1309 }
1310 }
1311
1312 return 0;
1313}
1314
1315static int crypt_setkey_allcpus(struct crypt_config *cc)
1316{
1317 unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count);
1318 int cpu, err = 0, i, r;
1319
1320 for_each_possible_cpu(cpu) {
1321 for (i = 0; i < cc->tfms_count; i++) {
1322 r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfms[i],
1323 cc->key + (i * subkey_size), subkey_size);
1324 if (r)
1325 err = r;
1326 }
1327 }
1328
1329 return err;
1330}
1331
1332static int crypt_set_key(struct crypt_config *cc, char *key)
1333{
1334 /* The key size may not be changed. */
1335 if (cc->key_size != (strlen(key) >> 1))
979 return -EINVAL; 1336 return -EINVAL;
980 1337
981 cc->key_size = key_size; /* initial settings */ 1338 /* Hyphen (which gives a key_size of zero) means there is no key. */
1339 if (!cc->key_size && strcmp(key, "-"))
1340 return -EINVAL;
982 1341
983 if ((!key_size && strcmp(key, "-")) || 1342 if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0)
984 (key_size && crypt_decode_key(cc->key, key, key_size) < 0))
985 return -EINVAL; 1343 return -EINVAL;
986 1344
987 set_bit(DM_CRYPT_KEY_VALID, &cc->flags); 1345 set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
988 1346
989 return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); 1347 return crypt_setkey_allcpus(cc);
990} 1348}
991 1349
992static int crypt_wipe_key(struct crypt_config *cc) 1350static int crypt_wipe_key(struct crypt_config *cc)
993{ 1351{
994 clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); 1352 clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
995 memset(&cc->key, 0, cc->key_size * sizeof(u8)); 1353 memset(&cc->key, 0, cc->key_size * sizeof(u8));
996 return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); 1354
1355 return crypt_setkey_allcpus(cc);
997} 1356}
998 1357
999static void crypt_dtr(struct dm_target *ti) 1358static void crypt_dtr(struct dm_target *ti)
1000{ 1359{
1001 struct crypt_config *cc = ti->private; 1360 struct crypt_config *cc = ti->private;
1361 struct crypt_cpu *cpu_cc;
1362 int cpu;
1002 1363
1003 ti->private = NULL; 1364 ti->private = NULL;
1004 1365
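
crypt_setkey_allcpus() above splits the single supplied key into tfms_count equal slices (tfms_count is a power of two, so the ilog2() shift is an exact division) and sets slice i on tfm i of every CPU. A sketch of the slicing, using an invented 32-byte, two-key example:

#include <stdio.h>

int main(void)
{
	unsigned key_size = 32;         /* hypothetical: hex key decoded to 32 bytes */
	unsigned tfms_count = 2;        /* e.g. "aes:2-..."; must be a power of two */
	unsigned subkey_size = key_size / tfms_count;   /* kernel: key_size >> ilog2() */
	unsigned i;

	for (i = 0; i < tfms_count; i++)
		printf("tfm %u gets key bytes [%u, %u)\n",
		       i, i * subkey_size, (i + 1) * subkey_size);
	return 0;
}
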
@@ -1010,6 +1371,14 @@ static void crypt_dtr(struct dm_target *ti)
1010 if (cc->crypt_queue) 1371 if (cc->crypt_queue)
1011 destroy_workqueue(cc->crypt_queue); 1372 destroy_workqueue(cc->crypt_queue);
1012 1373
1374 if (cc->cpu)
1375 for_each_possible_cpu(cpu) {
1376 cpu_cc = per_cpu_ptr(cc->cpu, cpu);
1377 if (cpu_cc->req)
1378 mempool_free(cpu_cc->req, cc->req_pool);
1379 crypt_free_tfms(cc, cpu);
1380 }
1381
1013 if (cc->bs) 1382 if (cc->bs)
1014 bioset_free(cc->bs); 1383 bioset_free(cc->bs);
1015 1384
@@ -1023,14 +1392,14 @@ static void crypt_dtr(struct dm_target *ti)
1023 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) 1392 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
1024 cc->iv_gen_ops->dtr(cc); 1393 cc->iv_gen_ops->dtr(cc);
1025 1394
1026 if (cc->tfm && !IS_ERR(cc->tfm))
1027 crypto_free_ablkcipher(cc->tfm);
1028
1029 if (cc->dev) 1395 if (cc->dev)
1030 dm_put_device(ti, cc->dev); 1396 dm_put_device(ti, cc->dev);
1031 1397
1398 if (cc->cpu)
1399 free_percpu(cc->cpu);
1400
1032 kzfree(cc->cipher); 1401 kzfree(cc->cipher);
1033 kzfree(cc->cipher_mode); 1402 kzfree(cc->cipher_string);
1034 1403
1035 /* Must zero key material before freeing */ 1404 /* Must zero key material before freeing */
1036 kzfree(cc); 1405 kzfree(cc);
@@ -1040,9 +1409,9 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1040 char *cipher_in, char *key) 1409 char *cipher_in, char *key)
1041{ 1410{
1042 struct crypt_config *cc = ti->private; 1411 struct crypt_config *cc = ti->private;
1043 char *tmp, *cipher, *chainmode, *ivmode, *ivopts; 1412 char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount;
1044 char *cipher_api = NULL; 1413 char *cipher_api = NULL;
1045 int ret = -EINVAL; 1414 int cpu, ret = -EINVAL;
1046 1415
1047 /* Convert to crypto api definition? */ 1416 /* Convert to crypto api definition? */
1048 if (strchr(cipher_in, '(')) { 1417 if (strchr(cipher_in, '(')) {
@@ -1050,23 +1419,31 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1050 return -EINVAL; 1419 return -EINVAL;
1051 } 1420 }
1052 1421
1422 cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL);
1423 if (!cc->cipher_string)
1424 goto bad_mem;
1425
1053 /* 1426 /*
1054 * Legacy dm-crypt cipher specification 1427 * Legacy dm-crypt cipher specification
1055 * cipher-mode-iv:ivopts 1428 * cipher[:keycount]-mode-iv:ivopts
1056 */ 1429 */
1057 tmp = cipher_in; 1430 tmp = cipher_in;
1058 cipher = strsep(&tmp, "-"); 1431 keycount = strsep(&tmp, "-");
1432 cipher = strsep(&keycount, ":");
1433
1434 if (!keycount)
1435 cc->tfms_count = 1;
1436 else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 ||
1437 !is_power_of_2(cc->tfms_count)) {
1438 ti->error = "Bad cipher key count specification";
1439 return -EINVAL;
1440 }
1441 cc->key_parts = cc->tfms_count;
1059 1442
1060 cc->cipher = kstrdup(cipher, GFP_KERNEL); 1443 cc->cipher = kstrdup(cipher, GFP_KERNEL);
1061 if (!cc->cipher) 1444 if (!cc->cipher)
1062 goto bad_mem; 1445 goto bad_mem;
1063 1446
1064 if (tmp) {
1065 cc->cipher_mode = kstrdup(tmp, GFP_KERNEL);
1066 if (!cc->cipher_mode)
1067 goto bad_mem;
1068 }
1069
1070 chainmode = strsep(&tmp, "-"); 1447 chainmode = strsep(&tmp, "-");
1071 ivopts = strsep(&tmp, "-"); 1448 ivopts = strsep(&tmp, "-");
1072 ivmode = strsep(&ivopts, ":"); 1449 ivmode = strsep(&ivopts, ":");
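
The constructor now understands an optional key count wedged between the cipher name and the chaining mode, i.e. cipher[:keycount]-mode-iv[:ivopts]. A userspace sketch of the same strsep()/sscanf() parsing, fed an illustrative Loop-AES-style string (the input is made up, not taken from the patch):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>

int main(void)
{
	char spec[] = "aes:64-cbc-lmk";  /* example mapping string */
	char *tmp = spec, *keycount, *cipher, *chainmode, *ivopts, *ivmode;
	unsigned tfms_count = 1;

	keycount = strsep(&tmp, "-");    /* "aes:64" */
	cipher = strsep(&keycount, ":"); /* "aes"; keycount -> "64" or NULL */
	if (keycount && sscanf(keycount, "%u", &tfms_count) != 1)
		return 1;

	chainmode = strsep(&tmp, "-");   /* "cbc" */
	ivopts = strsep(&tmp, "-");      /* "lmk" plus optional options */
	ivmode = strsep(&ivopts, ":");   /* "lmk"; ivopts -> NULL */

	printf("cipher=%s keys=%u mode=%s iv=%s\n",
	       cipher, tfms_count, chainmode, ivmode);
	return 0;
}
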
@@ -1074,10 +1451,19 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1074 if (tmp) 1451 if (tmp)
1075 DMWARN("Ignoring unexpected additional cipher options"); 1452 DMWARN("Ignoring unexpected additional cipher options");
1076 1453
1077 /* Compatibility mode for old dm-crypt mappings */ 1454 cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)) +
1455 cc->tfms_count * sizeof(*(cc->cpu->tfms)),
1456 __alignof__(struct crypt_cpu));
1457 if (!cc->cpu) {
1458 ti->error = "Cannot allocate per cpu state";
1459 goto bad_mem;
1460 }
1461
1462 /*
1463 * For compatibility with the original dm-crypt mapping format, if
1464 * only the cipher name is supplied, use cbc-plain.
1465 */
1078 if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) { 1466 if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) {
1079 kfree(cc->cipher_mode);
1080 cc->cipher_mode = kstrdup("cbc-plain", GFP_KERNEL);
1081 chainmode = "cbc"; 1467 chainmode = "cbc";
1082 ivmode = "plain"; 1468 ivmode = "plain";
1083 } 1469 }
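
__alloc_percpu() is handed sizeof(struct crypt_cpu) plus room for tfms_count pointers in the trailing tfms[0] array, aligned to the struct, so each CPU gets its request pointer, its ESSIV tfm and one ablkcipher tfm per key in a single allocation. A rough userspace analogue of the sizing (malloc stands in for the per-CPU allocator):

#include <stdio.h>
#include <stdlib.h>

struct crypt_cpu_stub {                 /* shape mirrors struct crypt_cpu */
	void *req;
	void *iv_private;
	void *tfms[];                   /* one entry per key/tfm */
};

int main(void)
{
	unsigned tfms_count = 4, nr_cpus = 2, cpu;
	size_t sz = sizeof(struct crypt_cpu_stub) + tfms_count * sizeof(void *);
	struct crypt_cpu_stub *percpu[2];

	for (cpu = 0; cpu < nr_cpus; cpu++)
		percpu[cpu] = calloc(1, sz);    /* kernel: __alloc_percpu() */

	printf("%zu bytes per CPU\n", sz);
	for (cpu = 0; cpu < nr_cpus; cpu++)
		free(percpu[cpu]);
	return 0;
}
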
@@ -1099,11 +1485,12 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1099 } 1485 }
1100 1486
1101 /* Allocate cipher */ 1487 /* Allocate cipher */
1102 cc->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0); 1488 for_each_possible_cpu(cpu) {
1103 if (IS_ERR(cc->tfm)) { 1489 ret = crypt_alloc_tfms(cc, cpu, cipher_api);
1104 ret = PTR_ERR(cc->tfm); 1490 if (ret < 0) {
1105 ti->error = "Error allocating crypto tfm"; 1491 ti->error = "Error allocating crypto tfm";
1106 goto bad; 1492 goto bad;
1493 }
1107 } 1494 }
1108 1495
1109 /* Initialize and set key */ 1496 /* Initialize and set key */
@@ -1114,7 +1501,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1114 } 1501 }
1115 1502
1116 /* Initialize IV */ 1503 /* Initialize IV */
1117 cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm); 1504 cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc));
1118 if (cc->iv_size) 1505 if (cc->iv_size)
1119 /* at least a 64 bit sector number should fit in our buffer */ 1506 /* at least a 64 bit sector number should fit in our buffer */
1120 cc->iv_size = max(cc->iv_size, 1507 cc->iv_size = max(cc->iv_size,
@@ -1137,7 +1524,15 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1137 cc->iv_gen_ops = &crypt_iv_benbi_ops; 1524 cc->iv_gen_ops = &crypt_iv_benbi_ops;
1138 else if (strcmp(ivmode, "null") == 0) 1525 else if (strcmp(ivmode, "null") == 0)
1139 cc->iv_gen_ops = &crypt_iv_null_ops; 1526 cc->iv_gen_ops = &crypt_iv_null_ops;
1140 else { 1527 else if (strcmp(ivmode, "lmk") == 0) {
1528 cc->iv_gen_ops = &crypt_iv_lmk_ops;
1529 /* Versions 2 and 3 are recognised according
1530 * to the length of the provided multi-key string.
1531 * If present (version 3), the last key is used as the IV seed.
1532 */
1533 if (cc->key_size % cc->key_parts)
1534 cc->key_parts++;
1535 } else {
1141 ret = -EINVAL; 1536 ret = -EINVAL;
1142 ti->error = "Invalid IV mode"; 1537 ti->error = "Invalid IV mode";
1143 goto bad; 1538 goto bad;
@@ -1194,6 +1589,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1194 ti->error = "Cannot allocate encryption context"; 1589 ti->error = "Cannot allocate encryption context";
1195 return -ENOMEM; 1590 return -ENOMEM;
1196 } 1591 }
1592 cc->key_size = key_size;
1197 1593
1198 ti->private = cc; 1594 ti->private = cc;
1199 ret = crypt_ctr_cipher(ti, argv[0], argv[1]); 1595 ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
@@ -1208,9 +1604,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1208 } 1604 }
1209 1605
1210 cc->dmreq_start = sizeof(struct ablkcipher_request); 1606 cc->dmreq_start = sizeof(struct ablkcipher_request);
1211 cc->dmreq_start += crypto_ablkcipher_reqsize(cc->tfm); 1607 cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc));
1212 cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); 1608 cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment());
1213 cc->dmreq_start += crypto_ablkcipher_alignmask(cc->tfm) & 1609 cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) &
1214 ~(crypto_tfm_ctx_alignment() - 1); 1610 ~(crypto_tfm_ctx_alignment() - 1);
1215 1611
1216 cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + 1612 cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
@@ -1219,7 +1615,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1219 ti->error = "Cannot allocate crypt request mempool"; 1615 ti->error = "Cannot allocate crypt request mempool";
1220 goto bad; 1616 goto bad;
1221 } 1617 }
1222 cc->req = NULL;
1223 1618
1224 cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); 1619 cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
1225 if (!cc->page_pool) { 1620 if (!cc->page_pool) {
@@ -1252,13 +1647,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1252 cc->start = tmpll; 1647 cc->start = tmpll;
1253 1648
1254 ret = -ENOMEM; 1649 ret = -ENOMEM;
1255 cc->io_queue = create_singlethread_workqueue("kcryptd_io"); 1650 cc->io_queue = alloc_workqueue("kcryptd_io",
1651 WQ_NON_REENTRANT|
1652 WQ_MEM_RECLAIM,
1653 1);
1256 if (!cc->io_queue) { 1654 if (!cc->io_queue) {
1257 ti->error = "Couldn't create kcryptd io queue"; 1655 ti->error = "Couldn't create kcryptd io queue";
1258 goto bad; 1656 goto bad;
1259 } 1657 }
1260 1658
1261 cc->crypt_queue = create_singlethread_workqueue("kcryptd"); 1659 cc->crypt_queue = alloc_workqueue("kcryptd",
1660 WQ_NON_REENTRANT|
1661 WQ_CPU_INTENSIVE|
1662 WQ_MEM_RECLAIM,
1663 1);
1262 if (!cc->crypt_queue) { 1664 if (!cc->crypt_queue) {
1263 ti->error = "Couldn't create kcryptd queue"; 1665 ti->error = "Couldn't create kcryptd queue";
1264 goto bad; 1666 goto bad;
@@ -1286,9 +1688,10 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
1286 1688
1287 io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector)); 1689 io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector));
1288 1690
1289 if (bio_data_dir(io->base_bio) == READ) 1691 if (bio_data_dir(io->base_bio) == READ) {
1290 kcryptd_queue_io(io); 1692 if (kcryptd_io_read(io, GFP_NOWAIT))
1291 else 1693 kcryptd_queue_io(io);
1694 } else
1292 kcryptd_queue_crypt(io); 1695 kcryptd_queue_crypt(io);
1293 1696
1294 return DM_MAPIO_SUBMITTED; 1697 return DM_MAPIO_SUBMITTED;
@@ -1306,10 +1709,7 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
1306 break; 1709 break;
1307 1710
1308 case STATUSTYPE_TABLE: 1711 case STATUSTYPE_TABLE:
1309 if (cc->cipher_mode) 1712 DMEMIT("%s ", cc->cipher_string);
1310 DMEMIT("%s-%s ", cc->cipher, cc->cipher_mode);
1311 else
1312 DMEMIT("%s ", cc->cipher);
1313 1713
1314 if (cc->key_size > 0) { 1714 if (cc->key_size > 0) {
1315 if ((maxlen - sz) < ((cc->key_size << 1) + 1)) 1715 if ((maxlen - sz) < ((cc->key_size << 1) + 1))
@@ -1421,7 +1821,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
1421 1821
1422static struct target_type crypt_target = { 1822static struct target_type crypt_target = {
1423 .name = "crypt", 1823 .name = "crypt",
1424 .version = {1, 7, 0}, 1824 .version = {1, 10, 0},
1425 .module = THIS_MODULE, 1825 .module = THIS_MODULE,
1426 .ctr = crypt_ctr, 1826 .ctr = crypt_ctr,
1427 .dtr = crypt_dtr, 1827 .dtr = crypt_dtr,
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index baa11912cc94..f18375dcedd9 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -352,7 +352,7 @@ static int __init dm_delay_init(void)
352{ 352{
353 int r = -ENOMEM; 353 int r = -ENOMEM;
354 354
355 kdelayd_wq = create_workqueue("kdelayd"); 355 kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
356 if (!kdelayd_wq) { 356 if (!kdelayd_wq) {
357 DMERR("Couldn't start kdelayd"); 357 DMERR("Couldn't start kdelayd");
358 goto bad_queue; 358 goto bad_queue;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 4b54618b4159..6d12775a1061 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -295,19 +295,55 @@ retry:
295 DMWARN("remove_all left %d open device(s)", dev_skipped); 295 DMWARN("remove_all left %d open device(s)", dev_skipped);
296} 296}
297 297
298/*
299 * Set the uuid of a hash_cell that isn't already set.
300 */
301static void __set_cell_uuid(struct hash_cell *hc, char *new_uuid)
302{
303 mutex_lock(&dm_hash_cells_mutex);
304 hc->uuid = new_uuid;
305 mutex_unlock(&dm_hash_cells_mutex);
306
307 list_add(&hc->uuid_list, _uuid_buckets + hash_str(new_uuid));
308}
309
310/*
311 * Changes the name of a hash_cell and returns the old name for
312 * the caller to free.
313 */
314static char *__change_cell_name(struct hash_cell *hc, char *new_name)
315{
316 char *old_name;
317
318 /*
319 * Rename and move the name cell.
320 */
321 list_del(&hc->name_list);
322 old_name = hc->name;
323
324 mutex_lock(&dm_hash_cells_mutex);
325 hc->name = new_name;
326 mutex_unlock(&dm_hash_cells_mutex);
327
328 list_add(&hc->name_list, _name_buckets + hash_str(new_name));
329
330 return old_name;
331}
332
298static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, 333static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
299 const char *new) 334 const char *new)
300{ 335{
301 char *new_name, *old_name; 336 char *new_data, *old_name = NULL;
302 struct hash_cell *hc; 337 struct hash_cell *hc;
303 struct dm_table *table; 338 struct dm_table *table;
304 struct mapped_device *md; 339 struct mapped_device *md;
340 unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
305 341
306 /* 342 /*
307 * duplicate new. 343 * duplicate new.
308 */ 344 */
309 new_name = kstrdup(new, GFP_KERNEL); 345 new_data = kstrdup(new, GFP_KERNEL);
310 if (!new_name) 346 if (!new_data)
311 return ERR_PTR(-ENOMEM); 347 return ERR_PTR(-ENOMEM);
312 348
313 down_write(&_hash_lock); 349 down_write(&_hash_lock);
@@ -315,13 +351,19 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
315 /* 351 /*
316 * Is new free ? 352 * Is new free ?
317 */ 353 */
318 hc = __get_name_cell(new); 354 if (change_uuid)
355 hc = __get_uuid_cell(new);
356 else
357 hc = __get_name_cell(new);
358
319 if (hc) { 359 if (hc) {
320 DMWARN("asked to rename to an already-existing name %s -> %s", 360 DMWARN("Unable to change %s on mapped device %s to one that "
361 "already exists: %s",
362 change_uuid ? "uuid" : "name",
321 param->name, new); 363 param->name, new);
322 dm_put(hc->md); 364 dm_put(hc->md);
323 up_write(&_hash_lock); 365 up_write(&_hash_lock);
324 kfree(new_name); 366 kfree(new_data);
325 return ERR_PTR(-EBUSY); 367 return ERR_PTR(-EBUSY);
326 } 368 }
327 369
@@ -330,22 +372,30 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
330 */ 372 */
331 hc = __get_name_cell(param->name); 373 hc = __get_name_cell(param->name);
332 if (!hc) { 374 if (!hc) {
333 DMWARN("asked to rename a non-existent device %s -> %s", 375 DMWARN("Unable to rename non-existent device, %s to %s%s",
334 param->name, new); 376 param->name, change_uuid ? "uuid " : "", new);
335 up_write(&_hash_lock); 377 up_write(&_hash_lock);
336 kfree(new_name); 378 kfree(new_data);
337 return ERR_PTR(-ENXIO); 379 return ERR_PTR(-ENXIO);
338 } 380 }
339 381
340 /* 382 /*
341 * rename and move the name cell. 383 * Does this device already have a uuid?
342 */ 384 */
343 list_del(&hc->name_list); 385 if (change_uuid && hc->uuid) {
344 old_name = hc->name; 386 DMWARN("Unable to change uuid of mapped device %s to %s "
345 mutex_lock(&dm_hash_cells_mutex); 387 "because uuid is already set to %s",
346 hc->name = new_name; 388 param->name, new, hc->uuid);
347 mutex_unlock(&dm_hash_cells_mutex); 389 dm_put(hc->md);
348 list_add(&hc->name_list, _name_buckets + hash_str(new_name)); 390 up_write(&_hash_lock);
391 kfree(new_data);
392 return ERR_PTR(-EINVAL);
393 }
394
395 if (change_uuid)
396 __set_cell_uuid(hc, new_data);
397 else
398 old_name = __change_cell_name(hc, new_data);
349 399
350 /* 400 /*
351 * Wake up any dm event waiters. 401 * Wake up any dm event waiters.
@@ -729,7 +779,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
729 hc = __find_device_hash_cell(param); 779 hc = __find_device_hash_cell(param);
730 780
731 if (!hc) { 781 if (!hc) {
732 DMWARN("device doesn't appear to be in the dev hash table."); 782 DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
733 up_write(&_hash_lock); 783 up_write(&_hash_lock);
734 return -ENXIO; 784 return -ENXIO;
735 } 785 }
@@ -741,7 +791,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
741 */ 791 */
742 r = dm_lock_for_deletion(md); 792 r = dm_lock_for_deletion(md);
743 if (r) { 793 if (r) {
744 DMWARN("unable to remove open device %s", hc->name); 794 DMDEBUG_LIMIT("unable to remove open device %s", hc->name);
745 up_write(&_hash_lock); 795 up_write(&_hash_lock);
746 dm_put(md); 796 dm_put(md);
747 return r; 797 return r;
@@ -774,21 +824,24 @@ static int invalid_str(char *str, void *end)
774static int dev_rename(struct dm_ioctl *param, size_t param_size) 824static int dev_rename(struct dm_ioctl *param, size_t param_size)
775{ 825{
776 int r; 826 int r;
777 char *new_name = (char *) param + param->data_start; 827 char *new_data = (char *) param + param->data_start;
778 struct mapped_device *md; 828 struct mapped_device *md;
829 unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
779 830
780 if (new_name < param->data || 831 if (new_data < param->data ||
781 invalid_str(new_name, (void *) param + param_size) || 832 invalid_str(new_data, (void *) param + param_size) ||
782 strlen(new_name) > DM_NAME_LEN - 1) { 833 strlen(new_data) > (change_uuid ? DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) {
783 DMWARN("Invalid new logical volume name supplied."); 834 DMWARN("Invalid new mapped device name or uuid string supplied.");
784 return -EINVAL; 835 return -EINVAL;
785 } 836 }
786 837
787 r = check_name(new_name); 838 if (!change_uuid) {
788 if (r) 839 r = check_name(new_data);
789 return r; 840 if (r)
841 return r;
842 }
790 843
791 md = dm_hash_rename(param, new_name); 844 md = dm_hash_rename(param, new_data);
792 if (IS_ERR(md)) 845 if (IS_ERR(md))
793 return PTR_ERR(md); 846 return PTR_ERR(md);
794 847
@@ -885,7 +938,7 @@ static int do_resume(struct dm_ioctl *param)
885 938
886 hc = __find_device_hash_cell(param); 939 hc = __find_device_hash_cell(param);
887 if (!hc) { 940 if (!hc) {
888 DMWARN("device doesn't appear to be in the dev hash table."); 941 DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
889 up_write(&_hash_lock); 942 up_write(&_hash_lock);
890 return -ENXIO; 943 return -ENXIO;
891 } 944 }
@@ -1212,7 +1265,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
1212 1265
1213 hc = __find_device_hash_cell(param); 1266 hc = __find_device_hash_cell(param);
1214 if (!hc) { 1267 if (!hc) {
1215 DMWARN("device doesn't appear to be in the dev hash table."); 1268 DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
1216 up_write(&_hash_lock); 1269 up_write(&_hash_lock);
1217 return -ENXIO; 1270 return -ENXIO;
1218 } 1271 }
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index d8587bac5682..924f5f0084c2 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -37,6 +37,13 @@ struct dm_kcopyd_client {
37 unsigned int nr_pages; 37 unsigned int nr_pages;
38 unsigned int nr_free_pages; 38 unsigned int nr_free_pages;
39 39
40 /*
41 * Block devices to unplug.
42 * Non-NULL pointer means that a block device has some pending requests
43 * and needs to be unplugged.
44 */
45 struct block_device *unplug[2];
46
40 struct dm_io_client *io_client; 47 struct dm_io_client *io_client;
41 48
42 wait_queue_head_t destroyq; 49 wait_queue_head_t destroyq;
@@ -308,6 +315,31 @@ static int run_complete_job(struct kcopyd_job *job)
308 return 0; 315 return 0;
309} 316}
310 317
318/*
319 * Unplug the block device at the specified index.
320 */
321static void unplug(struct dm_kcopyd_client *kc, int rw)
322{
323 if (kc->unplug[rw] != NULL) {
324 blk_unplug(bdev_get_queue(kc->unplug[rw]));
325 kc->unplug[rw] = NULL;
326 }
327}
328
329/*
330 * Prepare block device unplug. If there's another device
331 * to be unplugged at the same array index, we unplug that
332 * device first.
333 */
334static void prepare_unplug(struct dm_kcopyd_client *kc, int rw,
335 struct block_device *bdev)
336{
337 if (likely(kc->unplug[rw] == bdev))
338 return;
339 unplug(kc, rw);
340 kc->unplug[rw] = bdev;
341}
342
311static void complete_io(unsigned long error, void *context) 343static void complete_io(unsigned long error, void *context)
312{ 344{
313 struct kcopyd_job *job = (struct kcopyd_job *) context; 345 struct kcopyd_job *job = (struct kcopyd_job *) context;
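
prepare_unplug() above keeps a one-element cache per direction: remember the last block device an I/O went to and only unplug when a different device shows up, with a final unplug at the end of do_work(). The same pattern in plain C, with printf standing in for blk_unplug():

#include <stdio.h>
#include <string.h>

static const char *pending[2];          /* READ = 0, WRITE = 1 */

static void unplug(int rw)
{
	if (pending[rw]) {
		printf("unplug %s queue of %s\n", rw ? "write" : "read", pending[rw]);
		pending[rw] = NULL;
	}
}

static void prepare_unplug(int rw, const char *bdev)
{
	if (pending[rw] && !strcmp(pending[rw], bdev))
		return;                 /* same device: keep batching */
	unplug(rw);                     /* different device: flush the old one first */
	pending[rw] = bdev;
}

int main(void)
{
	prepare_unplug(0, "sda");
	prepare_unplug(0, "sda");       /* coalesced */
	prepare_unplug(0, "sdb");       /* forces an unplug of sda */
	unplug(0);
	unplug(1);
	return 0;
}
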
@@ -345,7 +377,7 @@ static int run_io_job(struct kcopyd_job *job)
345{ 377{
346 int r; 378 int r;
347 struct dm_io_request io_req = { 379 struct dm_io_request io_req = {
348 .bi_rw = job->rw | REQ_SYNC | REQ_UNPLUG, 380 .bi_rw = job->rw,
349 .mem.type = DM_IO_PAGE_LIST, 381 .mem.type = DM_IO_PAGE_LIST,
350 .mem.ptr.pl = job->pages, 382 .mem.ptr.pl = job->pages,
351 .mem.offset = job->offset, 383 .mem.offset = job->offset,
@@ -354,10 +386,16 @@ static int run_io_job(struct kcopyd_job *job)
354 .client = job->kc->io_client, 386 .client = job->kc->io_client,
355 }; 387 };
356 388
357 if (job->rw == READ) 389 if (job->rw == READ) {
358 r = dm_io(&io_req, 1, &job->source, NULL); 390 r = dm_io(&io_req, 1, &job->source, NULL);
359 else 391 prepare_unplug(job->kc, READ, job->source.bdev);
392 } else {
393 if (job->num_dests > 1)
394 io_req.bi_rw |= REQ_UNPLUG;
360 r = dm_io(&io_req, job->num_dests, job->dests, NULL); 395 r = dm_io(&io_req, job->num_dests, job->dests, NULL);
396 if (!(io_req.bi_rw & REQ_UNPLUG))
397 prepare_unplug(job->kc, WRITE, job->dests[0].bdev);
398 }
361 399
362 return r; 400 return r;
363} 401}
@@ -435,10 +473,18 @@ static void do_work(struct work_struct *work)
435 * Pages jobs when successful will jump onto the io jobs 473 * Pages jobs when successful will jump onto the io jobs
436 * list. io jobs call wake when they complete and it all 474 * list. io jobs call wake when they complete and it all
437 * starts again. 475 * starts again.
476 *
477 * Note that io_jobs add block devices to the unplug array;
478 * this array is cleared with "unplug" calls. It is thus
479 * forbidden to run complete_jobs after io_jobs and before
480 * unplug because the block device could be destroyed in
481 * the job completion callback.
438 */ 482 */
439 process_jobs(&kc->complete_jobs, kc, run_complete_job); 483 process_jobs(&kc->complete_jobs, kc, run_complete_job);
440 process_jobs(&kc->pages_jobs, kc, run_pages_job); 484 process_jobs(&kc->pages_jobs, kc, run_pages_job);
441 process_jobs(&kc->io_jobs, kc, run_io_job); 485 process_jobs(&kc->io_jobs, kc, run_io_job);
486 unplug(kc, READ);
487 unplug(kc, WRITE);
442} 488}
443 489
444/* 490/*
@@ -619,12 +665,15 @@ int dm_kcopyd_client_create(unsigned int nr_pages,
619 INIT_LIST_HEAD(&kc->io_jobs); 665 INIT_LIST_HEAD(&kc->io_jobs);
620 INIT_LIST_HEAD(&kc->pages_jobs); 666 INIT_LIST_HEAD(&kc->pages_jobs);
621 667
668 memset(kc->unplug, 0, sizeof(kc->unplug));
669
622 kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache); 670 kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
623 if (!kc->job_pool) 671 if (!kc->job_pool)
624 goto bad_slab; 672 goto bad_slab;
625 673
626 INIT_WORK(&kc->kcopyd_work, do_work); 674 INIT_WORK(&kc->kcopyd_work, do_work);
627 kc->kcopyd_wq = create_singlethread_workqueue("kcopyd"); 675 kc->kcopyd_wq = alloc_workqueue("kcopyd",
676 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
628 if (!kc->kcopyd_wq) 677 if (!kc->kcopyd_wq)
629 goto bad_workqueue; 678 goto bad_workqueue;
630 679
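
The dm-kcopyd hunks above drop the unconditional REQ_SYNC | REQ_UNPLUG from run_io_job() and replace it with a small batching scheme: the client remembers the most recently used block device per I/O direction and only calls blk_unplug() when a different device shows up (prepare_unplug) or when the work cycle in do_work() ends; multi-destination writes still set REQ_UNPLUG directly. The following userspace C sketch only illustrates that "one pending item per slot, flush on change or at the end of the batch" pattern; the batcher struct, device names and printf stand-ins are invented for the example and are not kernel APIs.

/* Userspace sketch of the "one pending unplug per direction" idea. */
#include <stdio.h>
#include <string.h>

enum { RD = 0, WR = 1 };

struct batcher {
	const char *pending[2];		/* last device seen per direction */
};

/* Flush ("unplug") whatever is pending for one direction. */
static void flush_slot(struct batcher *b, int rw)
{
	if (b->pending[rw]) {
		printf("unplug %s (%s)\n", b->pending[rw],
		       rw == RD ? "read" : "write");
		b->pending[rw] = NULL;
	}
}

/* Record a device for a later flush; if a different device is already
 * pending in this slot, flush that one first (mirrors prepare_unplug). */
static void prepare(struct batcher *b, int rw, const char *dev)
{
	if (b->pending[rw] && !strcmp(b->pending[rw], dev))
		return;
	flush_slot(b, rw);
	b->pending[rw] = dev;
}

int main(void)
{
	struct batcher b = { { NULL, NULL } };

	prepare(&b, RD, "sda");		/* several jobs on one device ... */
	prepare(&b, RD, "sda");		/* ... collapse into one flush */
	prepare(&b, RD, "sdb");		/* device change flushes sda first */
	prepare(&b, WR, "sdc");

	flush_slot(&b, RD);		/* end of the work cycle, as in do_work() */
	flush_slot(&b, WR);
	return 0;
}

Running it shows three read submissions producing only two unplug calls, which is the saving the kernel change is after.
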
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 1ed0094f064b..aa2e0c374ab3 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -12,12 +12,22 @@
12 12
13#include "dm-log-userspace-transfer.h" 13#include "dm-log-userspace-transfer.h"
14 14
15#define DM_LOG_USERSPACE_VSN "1.1.0"
16
15struct flush_entry { 17struct flush_entry {
16 int type; 18 int type;
17 region_t region; 19 region_t region;
18 struct list_head list; 20 struct list_head list;
19}; 21};
20 22
23/*
24 * This limit on the number of mark and clear requests is, to a degree,
25 * arbitrary. However, there is some basis for the choice in the limits
26 * imposed on the size of the data payload by dm-log-userspace-transfer.c:
27 * dm_consult_userspace().
28 */
29#define MAX_FLUSH_GROUP_COUNT 32
30
21struct log_c { 31struct log_c {
22 struct dm_target *ti; 32 struct dm_target *ti;
23 uint32_t region_size; 33 uint32_t region_size;
@@ -37,8 +47,15 @@ struct log_c {
37 */ 47 */
38 uint64_t in_sync_hint; 48 uint64_t in_sync_hint;
39 49
50 /*
51 * Mark and clear requests are held until a flush is issued
52 * so that we can group, and thereby limit, the amount of
53 * network traffic between kernel and userspace. The 'flush_lock'
54 * is used to protect these lists.
55 */
40 spinlock_t flush_lock; 56 spinlock_t flush_lock;
41 struct list_head flush_list; /* only for clear and mark requests */ 57 struct list_head mark_list;
58 struct list_head clear_list;
42}; 59};
43 60
44static mempool_t *flush_entry_pool; 61static mempool_t *flush_entry_pool;
@@ -169,7 +186,8 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
169 186
170 strncpy(lc->uuid, argv[0], DM_UUID_LEN); 187 strncpy(lc->uuid, argv[0], DM_UUID_LEN);
171 spin_lock_init(&lc->flush_lock); 188 spin_lock_init(&lc->flush_lock);
172 INIT_LIST_HEAD(&lc->flush_list); 189 INIT_LIST_HEAD(&lc->mark_list);
190 INIT_LIST_HEAD(&lc->clear_list);
173 191
174 str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); 192 str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);
175 if (str_size < 0) { 193 if (str_size < 0) {
@@ -181,8 +199,11 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
181 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, 199 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR,
182 ctr_str, str_size, NULL, NULL); 200 ctr_str, str_size, NULL, NULL);
183 201
184 if (r == -ESRCH) { 202 if (r < 0) {
185 DMERR("Userspace log server not found"); 203 if (r == -ESRCH)
204 DMERR("Userspace log server not found");
205 else
206 DMERR("Userspace log server failed to create log");
186 goto out; 207 goto out;
187 } 208 }
188 209
@@ -214,10 +235,9 @@ out:
214 235
215static void userspace_dtr(struct dm_dirty_log *log) 236static void userspace_dtr(struct dm_dirty_log *log)
216{ 237{
217 int r;
218 struct log_c *lc = log->context; 238 struct log_c *lc = log->context;
219 239
220 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, 240 (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
221 NULL, 0, 241 NULL, 0,
222 NULL, NULL); 242 NULL, NULL);
223 243
@@ -338,6 +358,71 @@ static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
338 return (r) ? 0 : (int)in_sync; 358 return (r) ? 0 : (int)in_sync;
339} 359}
340 360
361static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)
362{
363 int r = 0;
364 struct flush_entry *fe;
365
366 list_for_each_entry(fe, flush_list, list) {
367 r = userspace_do_request(lc, lc->uuid, fe->type,
368 (char *)&fe->region,
369 sizeof(fe->region),
370 NULL, NULL);
371 if (r)
372 break;
373 }
374
375 return r;
376}
377
378static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
379{
380 int r = 0;
381 int count;
382 uint32_t type = 0;
383 struct flush_entry *fe, *tmp_fe;
384 LIST_HEAD(tmp_list);
385 uint64_t group[MAX_FLUSH_GROUP_COUNT];
386
387 /*
388 * Group process the requests
389 */
390 while (!list_empty(flush_list)) {
391 count = 0;
392
393 list_for_each_entry_safe(fe, tmp_fe, flush_list, list) {
394 group[count] = fe->region;
395 count++;
396
397 list_del(&fe->list);
398 list_add(&fe->list, &tmp_list);
399
400 type = fe->type;
401 if (count >= MAX_FLUSH_GROUP_COUNT)
402 break;
403 }
404
405 r = userspace_do_request(lc, lc->uuid, type,
406 (char *)(group),
407 count * sizeof(uint64_t),
408 NULL, NULL);
409 if (r) {
410 /* Group send failed. Attempt one-by-one. */
411 list_splice_init(&tmp_list, flush_list);
412 r = flush_one_by_one(lc, flush_list);
413 break;
414 }
415 }
416
417 /*
418 * Must collect the flush_entry structures that were successfully processed
419 * as a group so that they will be freed by the caller.
420 */
421 list_splice_init(&tmp_list, flush_list);
422
423 return r;
424}
425
341/* 426/*
342 * userspace_flush 427 * userspace_flush
343 * 428 *
@@ -360,31 +445,25 @@ static int userspace_flush(struct dm_dirty_log *log)
360 int r = 0; 445 int r = 0;
361 unsigned long flags; 446 unsigned long flags;
362 struct log_c *lc = log->context; 447 struct log_c *lc = log->context;
363 LIST_HEAD(flush_list); 448 LIST_HEAD(mark_list);
449 LIST_HEAD(clear_list);
364 struct flush_entry *fe, *tmp_fe; 450 struct flush_entry *fe, *tmp_fe;
365 451
366 spin_lock_irqsave(&lc->flush_lock, flags); 452 spin_lock_irqsave(&lc->flush_lock, flags);
367 list_splice_init(&lc->flush_list, &flush_list); 453 list_splice_init(&lc->mark_list, &mark_list);
454 list_splice_init(&lc->clear_list, &clear_list);
368 spin_unlock_irqrestore(&lc->flush_lock, flags); 455 spin_unlock_irqrestore(&lc->flush_lock, flags);
369 456
370 if (list_empty(&flush_list)) 457 if (list_empty(&mark_list) && list_empty(&clear_list))
371 return 0; 458 return 0;
372 459
373 /* 460 r = flush_by_group(lc, &mark_list);
374 * FIXME: Count up requests, group request types, 461 if (r)
375 * allocate memory to stick all requests in and 462 goto fail;
376 * send to server in one go. Failing the allocation,
377 * do it one by one.
378 */
379 463
380 list_for_each_entry(fe, &flush_list, list) { 464 r = flush_by_group(lc, &clear_list);
381 r = userspace_do_request(lc, lc->uuid, fe->type, 465 if (r)
382 (char *)&fe->region, 466 goto fail;
383 sizeof(fe->region),
384 NULL, NULL);
385 if (r)
386 goto fail;
387 }
388 467
389 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, 468 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
390 NULL, 0, NULL, NULL); 469 NULL, 0, NULL, NULL);
@@ -395,7 +474,11 @@ fail:
395 * Calling code will receive an error and will know that 474 * Calling code will receive an error and will know that
396 * the log facility has failed. 475 * the log facility has failed.
397 */ 476 */
398 list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) { 477 list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) {
478 list_del(&fe->list);
479 mempool_free(fe, flush_entry_pool);
480 }
481 list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) {
399 list_del(&fe->list); 482 list_del(&fe->list);
400 mempool_free(fe, flush_entry_pool); 483 mempool_free(fe, flush_entry_pool);
401 } 484 }
@@ -425,7 +508,7 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
425 spin_lock_irqsave(&lc->flush_lock, flags); 508 spin_lock_irqsave(&lc->flush_lock, flags);
426 fe->type = DM_ULOG_MARK_REGION; 509 fe->type = DM_ULOG_MARK_REGION;
427 fe->region = region; 510 fe->region = region;
428 list_add(&fe->list, &lc->flush_list); 511 list_add(&fe->list, &lc->mark_list);
429 spin_unlock_irqrestore(&lc->flush_lock, flags); 512 spin_unlock_irqrestore(&lc->flush_lock, flags);
430 513
431 return; 514 return;
@@ -462,7 +545,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
462 spin_lock_irqsave(&lc->flush_lock, flags); 545 spin_lock_irqsave(&lc->flush_lock, flags);
463 fe->type = DM_ULOG_CLEAR_REGION; 546 fe->type = DM_ULOG_CLEAR_REGION;
464 fe->region = region; 547 fe->region = region;
465 list_add(&fe->list, &lc->flush_list); 548 list_add(&fe->list, &lc->clear_list);
466 spin_unlock_irqrestore(&lc->flush_lock, flags); 549 spin_unlock_irqrestore(&lc->flush_lock, flags);
467 550
468 return; 551 return;
@@ -684,7 +767,7 @@ static int __init userspace_dirty_log_init(void)
684 return r; 767 return r;
685 } 768 }
686 769
687 DMINFO("version 1.0.0 loaded"); 770 DMINFO("version " DM_LOG_USERSPACE_VSN " loaded");
688 return 0; 771 return 0;
689} 772}
690 773
@@ -694,7 +777,7 @@ static void __exit userspace_dirty_log_exit(void)
694 dm_ulog_tfr_exit(); 777 dm_ulog_tfr_exit();
695 mempool_destroy(flush_entry_pool); 778 mempool_destroy(flush_entry_pool);
696 779
697 DMINFO("version 1.0.0 unloaded"); 780 DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded");
698 return; 781 return;
699} 782}
700 783
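
The rework above splits the single flush_list into mark_list and clear_list and sends the queued region numbers to the userspace log server in groups of at most MAX_FLUSH_GROUP_COUNT (32) per round trip, falling back to one-by-one transmission if a grouped request fails. A minimal userspace sketch of the grouping loop follows; send_batch() is only a stand-in for dm_consult_userspace() and always succeeds here.

#include <stdint.h>
#include <stdio.h>

#define GROUP_MAX 32

/* Stand-in for dm_consult_userspace(); always succeeds in this sketch. */
static int send_batch(const uint64_t *regions, int count)
{
	printf("sending %d region(s), first is %llu\n",
	       count, (unsigned long long)regions[0]);
	return 0;
}

static int flush_regions(const uint64_t *regions, int nr)
{
	uint64_t group[GROUP_MAX];
	int i, count = 0, r;

	for (i = 0; i < nr; i++) {
		group[count++] = regions[i];
		if (count == GROUP_MAX || i == nr - 1) {
			r = send_batch(group, count);
			if (r)
				return r;	/* caller could retry one by one */
			count = 0;
		}
	}
	return 0;
}

int main(void)
{
	uint64_t regions[70];
	int i;

	for (i = 0; i < 70; i++)
		regions[i] = i;
	return flush_regions(regions, 70);	/* three requests: 32 + 32 + 6 */
}

With 70 queued regions the sketch issues three requests instead of seventy, which is the kernel-to-userspace traffic reduction the MAX_FLUSH_GROUP_COUNT comment is describing.
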
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 075cbcf8a9f5..049eaf12aaab 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -198,6 +198,7 @@ resend:
198 198
199 memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); 199 memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg));
200 memcpy(tfr->uuid, uuid, DM_UUID_LEN); 200 memcpy(tfr->uuid, uuid, DM_UUID_LEN);
201 tfr->version = DM_ULOG_REQUEST_VERSION;
201 tfr->luid = luid; 202 tfr->luid = luid;
202 tfr->seq = dm_ulog_seq++; 203 tfr->seq = dm_ulog_seq++;
203 204
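
The one-line change above stamps every outgoing request with DM_ULOG_REQUEST_VERSION so the userspace log server can detect a kernel/daemon protocol mismatch. The sketch below shows the idea with an invented ulog_request layout and an example version constant; neither matches the real dm-log-userspace structures.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ULOG_REQUEST_VERSION 2		/* example value, not the kernel's */

struct ulog_request {
	uint32_t version;
	uint64_t seq;
	char payload[64];
};

static void build_request(struct ulog_request *tfr, uint64_t seq, const char *msg)
{
	memset(tfr, 0, sizeof(*tfr));
	tfr->version = ULOG_REQUEST_VERSION;	/* stamped like dm_consult_userspace() does */
	tfr->seq = seq;
	snprintf(tfr->payload, sizeof(tfr->payload), "%s", msg);
}

static int handle_request(const struct ulog_request *tfr)
{
	if (tfr->version != ULOG_REQUEST_VERSION) {
		fprintf(stderr, "request %llu: unsupported version %u\n",
			(unsigned long long)tfr->seq, (unsigned)tfr->version);
		return -1;
	}
	printf("request %llu ok: %s\n",
	       (unsigned long long)tfr->seq, tfr->payload);
	return 0;
}

int main(void)
{
	struct ulog_request tfr;

	build_request(&tfr, 1, "flush");
	return handle_request(&tfr);
}
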
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 33420e68d153..6951536ea29c 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -455,7 +455,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
455 r = PTR_ERR(lc->io_req.client); 455 r = PTR_ERR(lc->io_req.client);
456 DMWARN("couldn't allocate disk io client"); 456 DMWARN("couldn't allocate disk io client");
457 kfree(lc); 457 kfree(lc);
458 return -ENOMEM; 458 return r;
459 } 459 }
460 460
461 lc->disk_header = vmalloc(buf_size); 461 lc->disk_header = vmalloc(buf_size);
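
The dm-log fix above stops replacing the encoded error from the io client constructor with a hard-coded -ENOMEM and returns PTR_ERR(lc->io_req.client) instead, so the caller sees the real cause of the failure. Below is a simplified userspace re-implementation of the ERR_PTR/PTR_ERR/IS_ERR convention, just to make that point; the 4095 threshold mirrors the kernel's MAX_ERRNO, everything else is illustrative.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

static inline void *ERR_PTR(long err)       { return (void *)(intptr_t)err; }
static inline long PTR_ERR(const void *p)   { return (long)(intptr_t)p; }
static inline int IS_ERR(const void *p)
{
	return (uintptr_t)p >= (uintptr_t)-4095;
}

/* Pretend constructor that can fail for different reasons. */
static void *client_create(int fail_perm)
{
	return fail_perm ? ERR_PTR(-EPERM) : ERR_PTR(-ENOMEM);
}

int main(void)
{
	void *c = client_create(1);

	if (IS_ERR(c))	/* propagate the original error, not a blanket -ENOMEM */
		printf("create failed: %ld\n", PTR_ERR(c));
	return 0;
}
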
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 487ecda90ad4..b82d28819e2a 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -23,6 +23,8 @@
23 23
24#define DM_MSG_PREFIX "multipath" 24#define DM_MSG_PREFIX "multipath"
25#define MESG_STR(x) x, sizeof(x) 25#define MESG_STR(x) x, sizeof(x)
26#define DM_PG_INIT_DELAY_MSECS 2000
27#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
26 28
27/* Path properties */ 29/* Path properties */
28struct pgpath { 30struct pgpath {
@@ -33,8 +35,7 @@ struct pgpath {
33 unsigned fail_count; /* Cumulative failure count */ 35 unsigned fail_count; /* Cumulative failure count */
34 36
35 struct dm_path path; 37 struct dm_path path;
36 struct work_struct deactivate_path; 38 struct delayed_work activate_path;
37 struct work_struct activate_path;
38}; 39};
39 40
40#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) 41#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
@@ -65,11 +66,15 @@ struct multipath {
65 66
66 const char *hw_handler_name; 67 const char *hw_handler_name;
67 char *hw_handler_params; 68 char *hw_handler_params;
69
68 unsigned nr_priority_groups; 70 unsigned nr_priority_groups;
69 struct list_head priority_groups; 71 struct list_head priority_groups;
72
73 wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
74
70 unsigned pg_init_required; /* pg_init needs calling? */ 75 unsigned pg_init_required; /* pg_init needs calling? */
71 unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ 76 unsigned pg_init_in_progress; /* Only one pg_init allowed at once */
72 wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ 77 unsigned pg_init_delay_retry; /* Delay pg_init retry? */
73 78
74 unsigned nr_valid_paths; /* Total number of usable paths */ 79 unsigned nr_valid_paths; /* Total number of usable paths */
75 struct pgpath *current_pgpath; 80 struct pgpath *current_pgpath;
@@ -82,6 +87,7 @@ struct multipath {
82 unsigned saved_queue_if_no_path;/* Saved state during suspension */ 87 unsigned saved_queue_if_no_path;/* Saved state during suspension */
83 unsigned pg_init_retries; /* Number of times to retry pg_init */ 88 unsigned pg_init_retries; /* Number of times to retry pg_init */
84 unsigned pg_init_count; /* Number of times pg_init called */ 89 unsigned pg_init_count; /* Number of times pg_init called */
90 unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */
85 91
86 struct work_struct process_queued_ios; 92 struct work_struct process_queued_ios;
87 struct list_head queued_ios; 93 struct list_head queued_ios;
@@ -116,7 +122,6 @@ static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
116static void process_queued_ios(struct work_struct *work); 122static void process_queued_ios(struct work_struct *work);
117static void trigger_event(struct work_struct *work); 123static void trigger_event(struct work_struct *work);
118static void activate_path(struct work_struct *work); 124static void activate_path(struct work_struct *work);
119static void deactivate_path(struct work_struct *work);
120 125
121 126
122/*----------------------------------------------- 127/*-----------------------------------------------
@@ -129,8 +134,7 @@ static struct pgpath *alloc_pgpath(void)
129 134
130 if (pgpath) { 135 if (pgpath) {
131 pgpath->is_active = 1; 136 pgpath->is_active = 1;
132 INIT_WORK(&pgpath->deactivate_path, deactivate_path); 137 INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
133 INIT_WORK(&pgpath->activate_path, activate_path);
134 } 138 }
135 139
136 return pgpath; 140 return pgpath;
@@ -141,14 +145,6 @@ static void free_pgpath(struct pgpath *pgpath)
141 kfree(pgpath); 145 kfree(pgpath);
142} 146}
143 147
144static void deactivate_path(struct work_struct *work)
145{
146 struct pgpath *pgpath =
147 container_of(work, struct pgpath, deactivate_path);
148
149 blk_abort_queue(pgpath->path.dev->bdev->bd_disk->queue);
150}
151
152static struct priority_group *alloc_priority_group(void) 148static struct priority_group *alloc_priority_group(void)
153{ 149{
154 struct priority_group *pg; 150 struct priority_group *pg;
@@ -199,6 +195,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
199 INIT_LIST_HEAD(&m->queued_ios); 195 INIT_LIST_HEAD(&m->queued_ios);
200 spin_lock_init(&m->lock); 196 spin_lock_init(&m->lock);
201 m->queue_io = 1; 197 m->queue_io = 1;
198 m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
202 INIT_WORK(&m->process_queued_ios, process_queued_ios); 199 INIT_WORK(&m->process_queued_ios, process_queued_ios);
203 INIT_WORK(&m->trigger_event, trigger_event); 200 INIT_WORK(&m->trigger_event, trigger_event);
204 init_waitqueue_head(&m->pg_init_wait); 201 init_waitqueue_head(&m->pg_init_wait);
@@ -238,14 +235,19 @@ static void free_multipath(struct multipath *m)
238static void __pg_init_all_paths(struct multipath *m) 235static void __pg_init_all_paths(struct multipath *m)
239{ 236{
240 struct pgpath *pgpath; 237 struct pgpath *pgpath;
238 unsigned long pg_init_delay = 0;
241 239
242 m->pg_init_count++; 240 m->pg_init_count++;
243 m->pg_init_required = 0; 241 m->pg_init_required = 0;
242 if (m->pg_init_delay_retry)
243 pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
244 m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
244 list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { 245 list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
245 /* Skip failed paths */ 246 /* Skip failed paths */
246 if (!pgpath->is_active) 247 if (!pgpath->is_active)
247 continue; 248 continue;
248 if (queue_work(kmpath_handlerd, &pgpath->activate_path)) 249 if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
250 pg_init_delay))
249 m->pg_init_in_progress++; 251 m->pg_init_in_progress++;
250 } 252 }
251} 253}
@@ -793,8 +795,9 @@ static int parse_features(struct arg_set *as, struct multipath *m)
793 const char *param_name; 795 const char *param_name;
794 796
795 static struct param _params[] = { 797 static struct param _params[] = {
796 {0, 3, "invalid number of feature args"}, 798 {0, 5, "invalid number of feature args"},
797 {1, 50, "pg_init_retries must be between 1 and 50"}, 799 {1, 50, "pg_init_retries must be between 1 and 50"},
800 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
798 }; 801 };
799 802
800 r = read_param(_params, shift(as), &argc, &ti->error); 803 r = read_param(_params, shift(as), &argc, &ti->error);
@@ -821,6 +824,14 @@ static int parse_features(struct arg_set *as, struct multipath *m)
821 continue; 824 continue;
822 } 825 }
823 826
827 if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) &&
828 (argc >= 1)) {
829 r = read_param(_params + 2, shift(as),
830 &m->pg_init_delay_msecs, &ti->error);
831 argc--;
832 continue;
833 }
834
824 ti->error = "Unrecognised multipath feature request"; 835 ti->error = "Unrecognised multipath feature request";
825 r = -EINVAL; 836 r = -EINVAL;
826 } while (argc && !r); 837 } while (argc && !r);
@@ -931,7 +942,7 @@ static void flush_multipath_work(struct multipath *m)
931 flush_workqueue(kmpath_handlerd); 942 flush_workqueue(kmpath_handlerd);
932 multipath_wait_for_pg_init_completion(m); 943 multipath_wait_for_pg_init_completion(m);
933 flush_workqueue(kmultipathd); 944 flush_workqueue(kmultipathd);
934 flush_scheduled_work(); 945 flush_work_sync(&m->trigger_event);
935} 946}
936 947
937static void multipath_dtr(struct dm_target *ti) 948static void multipath_dtr(struct dm_target *ti)
@@ -995,7 +1006,6 @@ static int fail_path(struct pgpath *pgpath)
995 pgpath->path.dev->name, m->nr_valid_paths); 1006 pgpath->path.dev->name, m->nr_valid_paths);
996 1007
997 schedule_work(&m->trigger_event); 1008 schedule_work(&m->trigger_event);
998 queue_work(kmultipathd, &pgpath->deactivate_path);
999 1009
1000out: 1010out:
1001 spin_unlock_irqrestore(&m->lock, flags); 1011 spin_unlock_irqrestore(&m->lock, flags);
@@ -1034,7 +1044,7 @@ static int reinstate_path(struct pgpath *pgpath)
1034 m->current_pgpath = NULL; 1044 m->current_pgpath = NULL;
1035 queue_work(kmultipathd, &m->process_queued_ios); 1045 queue_work(kmultipathd, &m->process_queued_ios);
1036 } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { 1046 } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
1037 if (queue_work(kmpath_handlerd, &pgpath->activate_path)) 1047 if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
1038 m->pg_init_in_progress++; 1048 m->pg_init_in_progress++;
1039 } 1049 }
1040 1050
@@ -1169,6 +1179,7 @@ static void pg_init_done(void *data, int errors)
1169 struct priority_group *pg = pgpath->pg; 1179 struct priority_group *pg = pgpath->pg;
1170 struct multipath *m = pg->m; 1180 struct multipath *m = pg->m;
1171 unsigned long flags; 1181 unsigned long flags;
1182 unsigned delay_retry = 0;
1172 1183
1173 /* device or driver problems */ 1184 /* device or driver problems */
1174 switch (errors) { 1185 switch (errors) {
@@ -1193,8 +1204,9 @@ static void pg_init_done(void *data, int errors)
1193 */ 1204 */
1194 bypass_pg(m, pg, 1); 1205 bypass_pg(m, pg, 1);
1195 break; 1206 break;
1196 /* TODO: For SCSI_DH_RETRY we should wait a couple seconds */
1197 case SCSI_DH_RETRY: 1207 case SCSI_DH_RETRY:
1208 /* Wait before retrying. */
1209 delay_retry = 1;
1198 case SCSI_DH_IMM_RETRY: 1210 case SCSI_DH_IMM_RETRY:
1199 case SCSI_DH_RES_TEMP_UNAVAIL: 1211 case SCSI_DH_RES_TEMP_UNAVAIL:
1200 if (pg_init_limit_reached(m, pgpath)) 1212 if (pg_init_limit_reached(m, pgpath))
@@ -1227,6 +1239,7 @@ static void pg_init_done(void *data, int errors)
1227 if (!m->pg_init_required) 1239 if (!m->pg_init_required)
1228 m->queue_io = 0; 1240 m->queue_io = 0;
1229 1241
1242 m->pg_init_delay_retry = delay_retry;
1230 queue_work(kmultipathd, &m->process_queued_ios); 1243 queue_work(kmultipathd, &m->process_queued_ios);
1231 1244
1232 /* 1245 /*
@@ -1241,7 +1254,7 @@ out:
1241static void activate_path(struct work_struct *work) 1254static void activate_path(struct work_struct *work)
1242{ 1255{
1243 struct pgpath *pgpath = 1256 struct pgpath *pgpath =
1244 container_of(work, struct pgpath, activate_path); 1257 container_of(work, struct pgpath, activate_path.work);
1245 1258
1246 scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), 1259 scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
1247 pg_init_done, pgpath); 1260 pg_init_done, pgpath);
@@ -1382,11 +1395,14 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
1382 DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count); 1395 DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count);
1383 else { 1396 else {
1384 DMEMIT("%u ", m->queue_if_no_path + 1397 DMEMIT("%u ", m->queue_if_no_path +
1385 (m->pg_init_retries > 0) * 2); 1398 (m->pg_init_retries > 0) * 2 +
1399 (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2);
1386 if (m->queue_if_no_path) 1400 if (m->queue_if_no_path)
1387 DMEMIT("queue_if_no_path "); 1401 DMEMIT("queue_if_no_path ");
1388 if (m->pg_init_retries) 1402 if (m->pg_init_retries)
1389 DMEMIT("pg_init_retries %u ", m->pg_init_retries); 1403 DMEMIT("pg_init_retries %u ", m->pg_init_retries);
1404 if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
1405 DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
1390 } 1406 }
1391 1407
1392 if (!m->hw_handler_name || type == STATUSTYPE_INFO) 1408 if (!m->hw_handler_name || type == STATUSTYPE_INFO)
@@ -1655,7 +1671,7 @@ out:
1655 *---------------------------------------------------------------*/ 1671 *---------------------------------------------------------------*/
1656static struct target_type multipath_target = { 1672static struct target_type multipath_target = {
1657 .name = "multipath", 1673 .name = "multipath",
1658 .version = {1, 1, 1}, 1674 .version = {1, 2, 0},
1659 .module = THIS_MODULE, 1675 .module = THIS_MODULE,
1660 .ctr = multipath_ctr, 1676 .ctr = multipath_ctr,
1661 .dtr = multipath_dtr, 1677 .dtr = multipath_dtr,
@@ -1687,7 +1703,7 @@ static int __init dm_multipath_init(void)
1687 return -EINVAL; 1703 return -EINVAL;
1688 } 1704 }
1689 1705
1690 kmultipathd = create_workqueue("kmpathd"); 1706 kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
1691 if (!kmultipathd) { 1707 if (!kmultipathd) {
1692 DMERR("failed to create workqueue kmpathd"); 1708 DMERR("failed to create workqueue kmpathd");
1693 dm_unregister_target(&multipath_target); 1709 dm_unregister_target(&multipath_target);
@@ -1701,7 +1717,8 @@ static int __init dm_multipath_init(void)
1701 * old workqueue would also create a bottleneck in the 1717 * old workqueue would also create a bottleneck in the
1702 * path of the storage hardware device activation. 1718 * path of the storage hardware device activation.
1703 */ 1719 */
1704 kmpath_handlerd = create_singlethread_workqueue("kmpath_handlerd"); 1720 kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
1721 WQ_MEM_RECLAIM);
1705 if (!kmpath_handlerd) { 1722 if (!kmpath_handlerd) {
1706 DMERR("failed to create workqueue kmpath_handlerd"); 1723 DMERR("failed to create workqueue kmpath_handlerd");
1707 destroy_workqueue(kmultipathd); 1724 destroy_workqueue(kmultipathd);
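
The multipath changes above turn activate_path into a delayed work item and add an optional pg_init_delay_msecs feature argument (0 to 60000, defaulting to 2000 ms) that throttles pg_init retries after SCSI_DH_RETRY. The userspace sketch below only mirrors the argument handling and the delay selection done in __pg_init_all_paths(); parse_feature() and mpath_cfg are invented for the example.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PG_INIT_DELAY_MSECS	2000		/* built-in default */
#define PG_INIT_DELAY_UNSET	((unsigned)-1)

struct mpath_cfg {
	unsigned pg_init_delay_msecs;
};

static int parse_feature(struct mpath_cfg *c, const char *key, const char *val)
{
	if (!strcmp(key, "pg_init_delay_msecs")) {
		unsigned long v = strtoul(val, NULL, 10);

		if (v > 60000) {
			fprintf(stderr, "pg_init_delay_msecs must be between 0 and 60000\n");
			return -1;
		}
		c->pg_init_delay_msecs = (unsigned)v;
		return 0;
	}
	fprintf(stderr, "Unrecognised feature: %s\n", key);
	return -1;
}

/* Delay used when a retry is requested, as in __pg_init_all_paths(). */
static unsigned effective_delay(const struct mpath_cfg *c, int delay_retry)
{
	if (!delay_retry)
		return 0;
	return c->pg_init_delay_msecs != PG_INIT_DELAY_UNSET ?
	       c->pg_init_delay_msecs : PG_INIT_DELAY_MSECS;
}

int main(void)
{
	struct mpath_cfg c = { PG_INIT_DELAY_UNSET };

	printf("default retry delay: %u ms\n", effective_delay(&c, 1));
	parse_feature(&c, "pg_init_delay_msecs", "500");
	printf("configured retry delay: %u ms\n", effective_delay(&c, 1));
	return 0;
}
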
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
new file mode 100644
index 000000000000..b9e1e15ef11c
--- /dev/null
+++ b/drivers/md/dm-raid.c
@@ -0,0 +1,697 @@
1/*
2 * Copyright (C) 2010-2011 Neil Brown
3 * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved.
4 *
5 * This file is released under the GPL.
6 */
7
8#include <linux/slab.h>
9
10#include "md.h"
11#include "raid5.h"
12#include "dm.h"
13#include "bitmap.h"
14
15#define DM_MSG_PREFIX "raid"
16
17/*
18 * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then
19 * make it so the flag doesn't set anything.
20 */
21#ifndef MD_SYNC_STATE_FORCED
22#define MD_SYNC_STATE_FORCED 0
23#endif
24
25struct raid_dev {
26 /*
27 * Two DM devices, one to hold metadata and one to hold the
28 * actual data/parity. The reason for this is to not confuse
29 * ti->len and give more flexibility in altering size and
30 * characteristics.
31 *
32 * While it is possible for this device to be associated
33 * with a different physical device than the data_dev, it
34 * is intended for it to be the same.
35 * |--------- Physical Device ---------|
36 * |- meta_dev -|------ data_dev ------|
37 */
38 struct dm_dev *meta_dev;
39 struct dm_dev *data_dev;
40 struct mdk_rdev_s rdev;
41};
42
43/*
44 * Flags for rs->print_flags field.
45 */
46#define DMPF_DAEMON_SLEEP 0x1
47#define DMPF_MAX_WRITE_BEHIND 0x2
48#define DMPF_SYNC 0x4
49#define DMPF_NOSYNC 0x8
50#define DMPF_STRIPE_CACHE 0x10
51#define DMPF_MIN_RECOVERY_RATE 0x20
52#define DMPF_MAX_RECOVERY_RATE 0x40
53
54struct raid_set {
55 struct dm_target *ti;
56
57 uint64_t print_flags;
58
59 struct mddev_s md;
60 struct raid_type *raid_type;
61 struct dm_target_callbacks callbacks;
62
63 struct raid_dev dev[0];
64};
65
66/* Supported raid types and properties. */
67static struct raid_type {
68 const char *name; /* RAID algorithm. */
69 const char *descr; /* Descriptor text for logging. */
70 const unsigned parity_devs; /* # of parity devices. */
71 const unsigned minimal_devs; /* minimal # of devices in set. */
72 const unsigned level; /* RAID level. */
73 const unsigned algorithm; /* RAID algorithm. */
74} raid_types[] = {
75 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
76 {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
77 {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
78 {"raid5_ls", "RAID5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC},
79 {"raid5_rs", "RAID5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC},
80 {"raid6_zr", "RAID6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART},
81 {"raid6_nr", "RAID6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART},
82 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
83};
84
85static struct raid_type *get_raid_type(char *name)
86{
87 int i;
88
89 for (i = 0; i < ARRAY_SIZE(raid_types); i++)
90 if (!strcmp(raid_types[i].name, name))
91 return &raid_types[i];
92
93 return NULL;
94}
95
96static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs)
97{
98 unsigned i;
99 struct raid_set *rs;
100 sector_t sectors_per_dev;
101
102 if (raid_devs <= raid_type->parity_devs) {
103 ti->error = "Insufficient number of devices";
104 return ERR_PTR(-EINVAL);
105 }
106
107 sectors_per_dev = ti->len;
108 if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
109 ti->error = "Target length not divisible by number of data devices";
110 return ERR_PTR(-EINVAL);
111 }
112
113 rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
114 if (!rs) {
115 ti->error = "Cannot allocate raid context";
116 return ERR_PTR(-ENOMEM);
117 }
118
119 mddev_init(&rs->md);
120
121 rs->ti = ti;
122 rs->raid_type = raid_type;
123 rs->md.raid_disks = raid_devs;
124 rs->md.level = raid_type->level;
125 rs->md.new_level = rs->md.level;
126 rs->md.dev_sectors = sectors_per_dev;
127 rs->md.layout = raid_type->algorithm;
128 rs->md.new_layout = rs->md.layout;
129 rs->md.delta_disks = 0;
130 rs->md.recovery_cp = 0;
131
132 for (i = 0; i < raid_devs; i++)
133 md_rdev_init(&rs->dev[i].rdev);
134
135 /*
136 * Remaining items to be initialized by further RAID params:
137 * rs->md.persistent
138 * rs->md.external
139 * rs->md.chunk_sectors
140 * rs->md.new_chunk_sectors
141 */
142
143 return rs;
144}
145
146static void context_free(struct raid_set *rs)
147{
148 int i;
149
150 for (i = 0; i < rs->md.raid_disks; i++)
151 if (rs->dev[i].data_dev)
152 dm_put_device(rs->ti, rs->dev[i].data_dev);
153
154 kfree(rs);
155}
156
157/*
158 * For every device we have two words
159 * <meta_dev>: meta device name or '-' if missing
160 * <data_dev>: data device name or '-' if missing
161 *
162 * This code parses those words.
163 */
164static int dev_parms(struct raid_set *rs, char **argv)
165{
166 int i;
167 int rebuild = 0;
168 int metadata_available = 0;
169 int ret = 0;
170
171 for (i = 0; i < rs->md.raid_disks; i++, argv += 2) {
172 rs->dev[i].rdev.raid_disk = i;
173
174 rs->dev[i].meta_dev = NULL;
175 rs->dev[i].data_dev = NULL;
176
177 /*
178 * There are no offsets, since there is a separate device
179 * for data and metadata.
180 */
181 rs->dev[i].rdev.data_offset = 0;
182 rs->dev[i].rdev.mddev = &rs->md;
183
184 if (strcmp(argv[0], "-")) {
185 rs->ti->error = "Metadata devices not supported";
186 return -EINVAL;
187 }
188
189 if (!strcmp(argv[1], "-")) {
190 if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
191 (!rs->dev[i].rdev.recovery_offset)) {
192 rs->ti->error = "Drive designated for rebuild not specified";
193 return -EINVAL;
194 }
195
196 continue;
197 }
198
199 ret = dm_get_device(rs->ti, argv[1],
200 dm_table_get_mode(rs->ti->table),
201 &rs->dev[i].data_dev);
202 if (ret) {
203 rs->ti->error = "RAID device lookup failure";
204 return ret;
205 }
206
207 rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
208 list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
209 if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
210 rebuild++;
211 }
212
213 if (metadata_available) {
214 rs->md.external = 0;
215 rs->md.persistent = 1;
216 rs->md.major_version = 2;
217 } else if (rebuild && !rs->md.recovery_cp) {
218 /*
219 * Without metadata, we will not be able to tell if the array
220 * is in-sync or not - we must assume it is not. Therefore,
221 * it is impossible to rebuild a drive.
222 *
223 * Even if there is metadata, the on-disk information may
224 * indicate that the array is not in-sync and it will then
225 * fail at that time.
226 *
227 * User could specify 'nosync' option if desperate.
228 */
229 DMERR("Unable to rebuild drive while array is not in-sync");
230 rs->ti->error = "RAID device lookup failure";
231 return -EINVAL;
232 }
233
234 return 0;
235}
236
237/*
238 * Possible arguments are...
239 * RAID456:
240 * <chunk_size> [optional_args]
241 *
242 * Optional args:
243 * [[no]sync] Force or prevent recovery of the entire array
244 * [rebuild <idx>] Rebuild the drive indicated by the index
245 * [daemon_sleep <ms>] Time between bitmap daemon work to clear bits
246 * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization
247 * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization
248 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
249 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs
250 */
251static int parse_raid_params(struct raid_set *rs, char **argv,
252 unsigned num_raid_params)
253{
254 unsigned i, rebuild_cnt = 0;
255 unsigned long value;
256 char *key;
257
258 /*
259 * First, parse the in-order required arguments
260 */
261 if ((strict_strtoul(argv[0], 10, &value) < 0) ||
262 !is_power_of_2(value) || (value < 8)) {
263 rs->ti->error = "Bad chunk size";
264 return -EINVAL;
265 }
266
267 rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
268 argv++;
269 num_raid_params--;
270
271 /*
272 * Second, parse the unordered optional arguments
273 */
274 for (i = 0; i < rs->md.raid_disks; i++)
275 set_bit(In_sync, &rs->dev[i].rdev.flags);
276
277 for (i = 0; i < num_raid_params; i++) {
278 if (!strcmp(argv[i], "nosync")) {
279 rs->md.recovery_cp = MaxSector;
280 rs->print_flags |= DMPF_NOSYNC;
281 rs->md.flags |= MD_SYNC_STATE_FORCED;
282 continue;
283 }
284 if (!strcmp(argv[i], "sync")) {
285 rs->md.recovery_cp = 0;
286 rs->print_flags |= DMPF_SYNC;
287 rs->md.flags |= MD_SYNC_STATE_FORCED;
288 continue;
289 }
290
291 /* The rest of the optional arguments come in key/value pairs */
292 if ((i + 1) >= num_raid_params) {
293 rs->ti->error = "Wrong number of raid parameters given";
294 return -EINVAL;
295 }
296
297 key = argv[i++];
298 if (strict_strtoul(argv[i], 10, &value) < 0) {
299 rs->ti->error = "Bad numerical argument given in raid params";
300 return -EINVAL;
301 }
302
303 if (!strcmp(key, "rebuild")) {
304 if (++rebuild_cnt > rs->raid_type->parity_devs) {
305 rs->ti->error = "Too many rebuild drives given";
306 return -EINVAL;
307 }
308 if (value > rs->md.raid_disks) {
309 rs->ti->error = "Invalid rebuild index given";
310 return -EINVAL;
311 }
312 clear_bit(In_sync, &rs->dev[value].rdev.flags);
313 rs->dev[value].rdev.recovery_offset = 0;
314 } else if (!strcmp(key, "max_write_behind")) {
315 rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
316
317 /*
318 * In device-mapper, we specify things in sectors, but
319 * MD records this value in kB
320 */
321 value /= 2;
322 if (value > COUNTER_MAX) {
323 rs->ti->error = "Max write-behind limit out of range";
324 return -EINVAL;
325 }
326 rs->md.bitmap_info.max_write_behind = value;
327 } else if (!strcmp(key, "daemon_sleep")) {
328 rs->print_flags |= DMPF_DAEMON_SLEEP;
329 if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
330 rs->ti->error = "daemon sleep period out of range";
331 return -EINVAL;
332 }
333 rs->md.bitmap_info.daemon_sleep = value;
334 } else if (!strcmp(key, "stripe_cache")) {
335 rs->print_flags |= DMPF_STRIPE_CACHE;
336
337 /*
338 * In device-mapper, we specify things in sectors, but
339 * MD records this value in kB
340 */
341 value /= 2;
342
343 if (rs->raid_type->level < 5) {
344 rs->ti->error = "Inappropriate argument: stripe_cache";
345 return -EINVAL;
346 }
347 if (raid5_set_cache_size(&rs->md, (int)value)) {
348 rs->ti->error = "Bad stripe_cache size";
349 return -EINVAL;
350 }
351 } else if (!strcmp(key, "min_recovery_rate")) {
352 rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
353 if (value > INT_MAX) {
354 rs->ti->error = "min_recovery_rate out of range";
355 return -EINVAL;
356 }
357 rs->md.sync_speed_min = (int)value;
358 } else if (!strcmp(key, "max_recovery_rate")) {
359 rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
360 if (value > INT_MAX) {
361 rs->ti->error = "max_recovery_rate out of range";
362 return -EINVAL;
363 }
364 rs->md.sync_speed_max = (int)value;
365 } else {
366 DMERR("Unable to parse RAID parameter: %s", key);
367 rs->ti->error = "Unable to parse RAID parameters";
368 return -EINVAL;
369 }
370 }
371
372 /* Assume there are no metadata devices until the drives are parsed */
373 rs->md.persistent = 0;
374 rs->md.external = 1;
375
376 return 0;
377}
378
379static void do_table_event(struct work_struct *ws)
380{
381 struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
382
383 dm_table_event(rs->ti->table);
384}
385
386static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
387{
388 struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
389
390 return md_raid5_congested(&rs->md, bits);
391}
392
393static void raid_unplug(struct dm_target_callbacks *cb)
394{
395 struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
396
397 md_raid5_unplug_device(rs->md.private);
398}
399
400/*
401 * Construct a RAID4/5/6 mapping:
402 * Args:
403 * <raid_type> <#raid_params> <raid_params> \
404 * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
405 *
406 * ** metadata devices are not supported yet, use '-' instead **
407 *
408 * <raid_params> varies by <raid_type>. See 'parse_raid_params' for
409 * details on possible <raid_params>.
410 */
411static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
412{
413 int ret;
414 struct raid_type *rt;
415 unsigned long num_raid_params, num_raid_devs;
416 struct raid_set *rs = NULL;
417
418 /* Must have at least <raid_type> <#raid_params> */
419 if (argc < 2) {
420 ti->error = "Too few arguments";
421 return -EINVAL;
422 }
423
424 /* raid type */
425 rt = get_raid_type(argv[0]);
426 if (!rt) {
427 ti->error = "Unrecognised raid_type";
428 return -EINVAL;
429 }
430 argc--;
431 argv++;
432
433 /* number of RAID parameters */
434 if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) {
435 ti->error = "Cannot understand number of RAID parameters";
436 return -EINVAL;
437 }
438 argc--;
439 argv++;
440
441 /* Skip over RAID params for now and find out # of devices */
442 if (num_raid_params + 1 > argc) {
443 ti->error = "Arguments do not agree with counts given";
444 return -EINVAL;
445 }
446
447 if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
448 (num_raid_devs >= INT_MAX)) {
449 ti->error = "Cannot understand number of raid devices";
450 return -EINVAL;
451 }
452
453 rs = context_alloc(ti, rt, (unsigned)num_raid_devs);
454 if (IS_ERR(rs))
455 return PTR_ERR(rs);
456
457 ret = parse_raid_params(rs, argv, (unsigned)num_raid_params);
458 if (ret)
459 goto bad;
460
461 ret = -EINVAL;
462
463 argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
464 argv += num_raid_params + 1;
465
466 if (argc != (num_raid_devs * 2)) {
467		ti->error = "Supplied RAID devices do not match the count given";
468 goto bad;
469 }
470
471 ret = dev_parms(rs, argv);
472 if (ret)
473 goto bad;
474
475 INIT_WORK(&rs->md.event_work, do_table_event);
476 ti->split_io = rs->md.chunk_sectors;
477 ti->private = rs;
478
479 mutex_lock(&rs->md.reconfig_mutex);
480 ret = md_run(&rs->md);
481 rs->md.in_sync = 0; /* Assume already marked dirty */
482 mutex_unlock(&rs->md.reconfig_mutex);
483
484 if (ret) {
485		ti->error = "Failed to run raid array";
486 goto bad;
487 }
488
489 rs->callbacks.congested_fn = raid_is_congested;
490 rs->callbacks.unplug_fn = raid_unplug;
491 dm_table_add_target_callbacks(ti->table, &rs->callbacks);
492
493 return 0;
494
495bad:
496 context_free(rs);
497
498 return ret;
499}
500
501static void raid_dtr(struct dm_target *ti)
502{
503 struct raid_set *rs = ti->private;
504
505 list_del_init(&rs->callbacks.list);
506 md_stop(&rs->md);
507 context_free(rs);
508}
509
510static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context)
511{
512 struct raid_set *rs = ti->private;
513 mddev_t *mddev = &rs->md;
514
515 mddev->pers->make_request(mddev, bio);
516
517 return DM_MAPIO_SUBMITTED;
518}
519
520static int raid_status(struct dm_target *ti, status_type_t type,
521 char *result, unsigned maxlen)
522{
523 struct raid_set *rs = ti->private;
524 unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
525 unsigned sz = 0;
526 int i;
527 sector_t sync;
528
529 switch (type) {
530 case STATUSTYPE_INFO:
531 DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
532
533 for (i = 0; i < rs->md.raid_disks; i++) {
534 if (test_bit(Faulty, &rs->dev[i].rdev.flags))
535 DMEMIT("D");
536 else if (test_bit(In_sync, &rs->dev[i].rdev.flags))
537 DMEMIT("A");
538 else
539 DMEMIT("a");
540 }
541
542 if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
543 sync = rs->md.curr_resync_completed;
544 else
545 sync = rs->md.recovery_cp;
546
547 if (sync > rs->md.resync_max_sectors)
548 sync = rs->md.resync_max_sectors;
549
550 DMEMIT(" %llu/%llu",
551 (unsigned long long) sync,
552 (unsigned long long) rs->md.resync_max_sectors);
553
554 break;
555 case STATUSTYPE_TABLE:
556 /* The string you would use to construct this array */
557 for (i = 0; i < rs->md.raid_disks; i++)
558 if (rs->dev[i].data_dev &&
559 !test_bit(In_sync, &rs->dev[i].rdev.flags))
560 raid_param_cnt++; /* for rebuilds */
561
562 raid_param_cnt += (hweight64(rs->print_flags) * 2);
563 if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
564 raid_param_cnt--;
565
566 DMEMIT("%s %u %u", rs->raid_type->name,
567 raid_param_cnt, rs->md.chunk_sectors);
568
569 if ((rs->print_flags & DMPF_SYNC) &&
570 (rs->md.recovery_cp == MaxSector))
571 DMEMIT(" sync");
572 if (rs->print_flags & DMPF_NOSYNC)
573 DMEMIT(" nosync");
574
575 for (i = 0; i < rs->md.raid_disks; i++)
576 if (rs->dev[i].data_dev &&
577 !test_bit(In_sync, &rs->dev[i].rdev.flags))
578 DMEMIT(" rebuild %u", i);
579
580 if (rs->print_flags & DMPF_DAEMON_SLEEP)
581 DMEMIT(" daemon_sleep %lu",
582 rs->md.bitmap_info.daemon_sleep);
583
584 if (rs->print_flags & DMPF_MIN_RECOVERY_RATE)
585 DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);
586
587 if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
588 DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
589
590 if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
591 DMEMIT(" max_write_behind %lu",
592 rs->md.bitmap_info.max_write_behind);
593
594 if (rs->print_flags & DMPF_STRIPE_CACHE) {
595 raid5_conf_t *conf = rs->md.private;
596
597 /* convert from kiB to sectors */
598 DMEMIT(" stripe_cache %d",
599 conf ? conf->max_nr_stripes * 2 : 0);
600 }
601
602 DMEMIT(" %d", rs->md.raid_disks);
603 for (i = 0; i < rs->md.raid_disks; i++) {
604 DMEMIT(" -"); /* metadata device */
605
606 if (rs->dev[i].data_dev)
607 DMEMIT(" %s", rs->dev[i].data_dev->name);
608 else
609 DMEMIT(" -");
610 }
611 }
612
613 return 0;
614}
615
616static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
617{
618 struct raid_set *rs = ti->private;
619 unsigned i;
620 int ret = 0;
621
622 for (i = 0; !ret && i < rs->md.raid_disks; i++)
623 if (rs->dev[i].data_dev)
624 ret = fn(ti,
625 rs->dev[i].data_dev,
626 0, /* No offset on data devs */
627 rs->md.dev_sectors,
628 data);
629
630 return ret;
631}
632
633static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
634{
635 struct raid_set *rs = ti->private;
636 unsigned chunk_size = rs->md.chunk_sectors << 9;
637 raid5_conf_t *conf = rs->md.private;
638
639 blk_limits_io_min(limits, chunk_size);
640 blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded));
641}
642
643static void raid_presuspend(struct dm_target *ti)
644{
645 struct raid_set *rs = ti->private;
646
647 md_stop_writes(&rs->md);
648}
649
650static void raid_postsuspend(struct dm_target *ti)
651{
652 struct raid_set *rs = ti->private;
653
654 mddev_suspend(&rs->md);
655}
656
657static void raid_resume(struct dm_target *ti)
658{
659 struct raid_set *rs = ti->private;
660
661 mddev_resume(&rs->md);
662}
663
664static struct target_type raid_target = {
665 .name = "raid",
666 .version = {1, 0, 0},
667 .module = THIS_MODULE,
668 .ctr = raid_ctr,
669 .dtr = raid_dtr,
670 .map = raid_map,
671 .status = raid_status,
672 .iterate_devices = raid_iterate_devices,
673 .io_hints = raid_io_hints,
674 .presuspend = raid_presuspend,
675 .postsuspend = raid_postsuspend,
676 .resume = raid_resume,
677};
678
679static int __init dm_raid_init(void)
680{
681 return dm_register_target(&raid_target);
682}
683
684static void __exit dm_raid_exit(void)
685{
686 dm_unregister_target(&raid_target);
687}
688
689module_init(dm_raid_init);
690module_exit(dm_raid_exit);
691
692MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
693MODULE_ALIAS("dm-raid4");
694MODULE_ALIAS("dm-raid5");
695MODULE_ALIAS("dm-raid6");
696MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>");
697MODULE_LICENSE("GPL");
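
The new dm-raid target above maps a table line of the form <raid_type> <#raid_params> <raid_params> <#raid_devs> <meta_dev1> <data_dev1> ... onto an MD RAID4/5/6 personality. The standalone sketch below mirrors only the raid_types[] lookup done by get_raid_type(); the table contents match the names and levels in the new file, but the program around it is illustrative.

#include <stdio.h>
#include <string.h>

struct raid_type {
	const char *name;
	unsigned parity_devs;
	unsigned minimal_devs;
	unsigned level;
};

static const struct raid_type raid_types[] = {
	{ "raid4",    1, 2, 5 },
	{ "raid5_la", 1, 2, 5 },
	{ "raid5_ra", 1, 2, 5 },
	{ "raid5_ls", 1, 2, 5 },
	{ "raid5_rs", 1, 2, 5 },
	{ "raid6_zr", 2, 4, 6 },
	{ "raid6_nr", 2, 4, 6 },
	{ "raid6_nc", 2, 4, 6 },
};

static const struct raid_type *get_raid_type(const char *name)
{
	size_t i;

	for (i = 0; i < sizeof(raid_types) / sizeof(raid_types[0]); i++)
		if (!strcmp(raid_types[i].name, name))
			return &raid_types[i];
	return NULL;
}

int main(int argc, char **argv)
{
	const struct raid_type *rt = get_raid_type(argc > 1 ? argv[1] : "raid5_ls");

	if (!rt) {
		fprintf(stderr, "Unrecognised raid_type\n");
		return 1;
	}
	printf("%s: level %u, %u parity device(s), minimal set of %u device(s)\n",
	       rt->name, rt->level, rt->parity_devs, rt->minimal_devs);
	return 0;
}

A plausible table line for a three-disk left-symmetric RAID5 with 64 KiB (128-sector) chunks would be "<start> <len> raid raid5_ls 1 128 3 - /dev/sda - /dev/sdb - /dev/sdc"; the metadata devices must still be given as '-' because the constructor rejects anything else.
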
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 19a59b041c27..dee326775c60 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -261,7 +261,7 @@ static int mirror_flush(struct dm_target *ti)
261 struct dm_io_request io_req = { 261 struct dm_io_request io_req = {
262 .bi_rw = WRITE_FLUSH, 262 .bi_rw = WRITE_FLUSH,
263 .mem.type = DM_IO_KMEM, 263 .mem.type = DM_IO_KMEM,
264 .mem.ptr.bvec = NULL, 264 .mem.ptr.addr = NULL,
265 .client = ms->io_client, 265 .client = ms->io_client,
266 }; 266 };
267 267
@@ -637,6 +637,12 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
637 .client = ms->io_client, 637 .client = ms->io_client,
638 }; 638 };
639 639
640 if (bio->bi_rw & REQ_DISCARD) {
641 io_req.bi_rw |= REQ_DISCARD;
642 io_req.mem.type = DM_IO_KMEM;
643 io_req.mem.ptr.addr = NULL;
644 }
645
640 for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) 646 for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
641 map_region(dest++, m, bio); 647 map_region(dest++, m, bio);
642 648
@@ -670,7 +676,8 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
670 bio_list_init(&requeue); 676 bio_list_init(&requeue);
671 677
672 while ((bio = bio_list_pop(writes))) { 678 while ((bio = bio_list_pop(writes))) {
673 if (bio->bi_rw & REQ_FLUSH) { 679 if ((bio->bi_rw & REQ_FLUSH) ||
680 (bio->bi_rw & REQ_DISCARD)) {
674 bio_list_add(&sync, bio); 681 bio_list_add(&sync, bio);
675 continue; 682 continue;
676 } 683 }
@@ -1076,8 +1083,10 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1076 ti->private = ms; 1083 ti->private = ms;
1077 ti->split_io = dm_rh_get_region_size(ms->rh); 1084 ti->split_io = dm_rh_get_region_size(ms->rh);
1078 ti->num_flush_requests = 1; 1085 ti->num_flush_requests = 1;
1086 ti->num_discard_requests = 1;
1079 1087
1080 ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); 1088 ms->kmirrord_wq = alloc_workqueue("kmirrord",
1089 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
1081 if (!ms->kmirrord_wq) { 1090 if (!ms->kmirrord_wq) {
1082 DMERR("couldn't start kmirrord"); 1091 DMERR("couldn't start kmirrord");
1083 r = -ENOMEM; 1092 r = -ENOMEM;
@@ -1130,7 +1139,7 @@ static void mirror_dtr(struct dm_target *ti)
1130 1139
1131 del_timer_sync(&ms->timer); 1140 del_timer_sync(&ms->timer);
1132 flush_workqueue(ms->kmirrord_wq); 1141 flush_workqueue(ms->kmirrord_wq);
1133 flush_scheduled_work(); 1142 flush_work_sync(&ms->trigger_event);
1134 dm_kcopyd_client_destroy(ms->kcopyd_client); 1143 dm_kcopyd_client_destroy(ms->kcopyd_client);
1135 destroy_workqueue(ms->kmirrord_wq); 1144 destroy_workqueue(ms->kmirrord_wq);
1136 free_context(ms, ti, ms->nr_mirrors); 1145 free_context(ms, ti, ms->nr_mirrors);
@@ -1406,7 +1415,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
1406 1415
1407static struct target_type mirror_target = { 1416static struct target_type mirror_target = {
1408 .name = "mirror", 1417 .name = "mirror",
1409 .version = {1, 12, 0}, 1418 .version = {1, 12, 1},
1410 .module = THIS_MODULE, 1419 .module = THIS_MODULE,
1411 .ctr = mirror_ctr, 1420 .ctr = mirror_ctr,
1412 .dtr = mirror_dtr, 1421 .dtr = mirror_dtr,
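
The mirror changes above start accepting discards: do_writes() now routes REQ_DISCARD bios down the same synchronous path as REQ_FLUSH, and do_write() issues them as DM_IO_KMEM requests with a NULL address. The toy classifier below only illustrates that routing decision; the flag values are invented and are not the block layer's bits.

#include <stdio.h>

#define F_FLUSH		(1u << 0)
#define F_DISCARD	(1u << 1)

static const char *classify(unsigned flags)
{
	if (flags & (F_FLUSH | F_DISCARD))
		return "sync path";	/* handled like a flush */
	return "normal write path";
}

int main(void)
{
	printf("plain write -> %s\n", classify(0));
	printf("flush       -> %s\n", classify(F_FLUSH));
	printf("discard     -> %s\n", classify(F_DISCARD));
	return 0;
}
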
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 2129cdb115dc..95891dfcbca0 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -256,7 +256,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
256 */ 256 */
257 INIT_WORK_ONSTACK(&req.work, do_metadata); 257 INIT_WORK_ONSTACK(&req.work, do_metadata);
258 queue_work(ps->metadata_wq, &req.work); 258 queue_work(ps->metadata_wq, &req.work);
259 flush_workqueue(ps->metadata_wq); 259 flush_work(&req.work);
260 260
261 return req.result; 261 return req.result;
262} 262}
@@ -818,7 +818,7 @@ static int persistent_ctr(struct dm_exception_store *store,
818 atomic_set(&ps->pending_count, 0); 818 atomic_set(&ps->pending_count, 0);
819 ps->callbacks = NULL; 819 ps->callbacks = NULL;
820 820
821 ps->metadata_wq = create_singlethread_workqueue("ksnaphd"); 821 ps->metadata_wq = alloc_workqueue("ksnaphd", WQ_MEM_RECLAIM, 0);
822 if (!ps->metadata_wq) { 822 if (!ps->metadata_wq) {
823 kfree(ps); 823 kfree(ps);
824 DMERR("couldn't start header metadata update thread"); 824 DMERR("couldn't start header metadata update thread");
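
chunk_io() above now waits with flush_work(&req.work) instead of flush_workqueue(), i.e. it blocks on its own request rather than draining everything queued on ksnaphd. The pthreads sketch below makes the same point in userspace with a hand-rolled work item; none of it is the kernel workqueue API, and it needs -pthread to build.

#include <pthread.h>
#include <stdio.h>

struct work {
	void (*fn)(struct work *w);
	int done;
	pthread_mutex_t lock;
	pthread_cond_t cond;
};

static void *worker(void *arg)
{
	struct work *w = arg;

	w->fn(w);			/* run the request */
	pthread_mutex_lock(&w->lock);
	w->done = 1;
	pthread_cond_signal(&w->cond);	/* wake only this item's waiter */
	pthread_mutex_unlock(&w->lock);
	return NULL;
}

/* Analogue of flush_work(): block until this particular item completes. */
static void wait_for_work(struct work *w)
{
	pthread_mutex_lock(&w->lock);
	while (!w->done)
		pthread_cond_wait(&w->cond, &w->lock);
	pthread_mutex_unlock(&w->lock);
}

static void do_metadata(struct work *w)
{
	printf("metadata I/O for work %p done\n", (void *)w);
}

int main(void)
{
	pthread_t t;
	struct work w;

	w.fn = do_metadata;
	w.done = 0;
	pthread_mutex_init(&w.lock, NULL);
	pthread_cond_init(&w.cond, NULL);

	pthread_create(&t, NULL, worker, &w);
	wait_for_work(&w);		/* like flush_work(&req.work) */
	pthread_join(&t, NULL);
	return 0;
}

The important property is that wait_for_work() returns as soon as this item finishes, independent of anything else the worker may still have queued.
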
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 53cf79d8bcbc..fdde53cd12b7 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -19,7 +19,6 @@
19#include <linux/vmalloc.h> 19#include <linux/vmalloc.h>
20#include <linux/log2.h> 20#include <linux/log2.h>
21#include <linux/dm-kcopyd.h> 21#include <linux/dm-kcopyd.h>
22#include <linux/workqueue.h>
23 22
24#include "dm-exception-store.h" 23#include "dm-exception-store.h"
25 24
@@ -80,9 +79,6 @@ struct dm_snapshot {
80 /* Origin writes don't trigger exceptions until this is set */ 79 /* Origin writes don't trigger exceptions until this is set */
81 int active; 80 int active;
82 81
83 /* Whether or not owning mapped_device is suspended */
84 int suspended;
85
86 atomic_t pending_exceptions_count; 82 atomic_t pending_exceptions_count;
87 83
88 mempool_t *pending_pool; 84 mempool_t *pending_pool;
@@ -106,10 +102,6 @@ struct dm_snapshot {
106 102
107 struct dm_kcopyd_client *kcopyd_client; 103 struct dm_kcopyd_client *kcopyd_client;
108 104
109 /* Queue of snapshot writes for ksnapd to flush */
110 struct bio_list queued_bios;
111 struct work_struct queued_bios_work;
112
113 /* Wait for events based on state_bits */ 105 /* Wait for events based on state_bits */
114 unsigned long state_bits; 106 unsigned long state_bits;
115 107
@@ -160,9 +152,6 @@ struct dm_dev *dm_snap_cow(struct dm_snapshot *s)
160} 152}
161EXPORT_SYMBOL(dm_snap_cow); 153EXPORT_SYMBOL(dm_snap_cow);
162 154
163static struct workqueue_struct *ksnapd;
164static void flush_queued_bios(struct work_struct *work);
165
166static sector_t chunk_to_sector(struct dm_exception_store *store, 155static sector_t chunk_to_sector(struct dm_exception_store *store,
167 chunk_t chunk) 156 chunk_t chunk)
168{ 157{
@@ -1110,7 +1099,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1110 s->ti = ti; 1099 s->ti = ti;
1111 s->valid = 1; 1100 s->valid = 1;
1112 s->active = 0; 1101 s->active = 0;
1113 s->suspended = 0;
1114 atomic_set(&s->pending_exceptions_count, 0); 1102 atomic_set(&s->pending_exceptions_count, 0);
1115 init_rwsem(&s->lock); 1103 init_rwsem(&s->lock);
1116 INIT_LIST_HEAD(&s->list); 1104 INIT_LIST_HEAD(&s->list);
@@ -1153,9 +1141,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1153 1141
1154 spin_lock_init(&s->tracked_chunk_lock); 1142 spin_lock_init(&s->tracked_chunk_lock);
1155 1143
1156 bio_list_init(&s->queued_bios);
1157 INIT_WORK(&s->queued_bios_work, flush_queued_bios);
1158
1159 ti->private = s; 1144 ti->private = s;
1160 ti->num_flush_requests = num_flush_requests; 1145 ti->num_flush_requests = num_flush_requests;
1161 1146
@@ -1279,8 +1264,6 @@ static void snapshot_dtr(struct dm_target *ti)
1279 struct dm_snapshot *s = ti->private; 1264 struct dm_snapshot *s = ti->private;
1280 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; 1265 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
1281 1266
1282 flush_workqueue(ksnapd);
1283
1284 down_read(&_origins_lock); 1267 down_read(&_origins_lock);
1285 /* Check whether exception handover must be cancelled */ 1268 /* Check whether exception handover must be cancelled */
1286 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); 1269 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
@@ -1342,20 +1325,6 @@ static void flush_bios(struct bio *bio)
1342 } 1325 }
1343} 1326}
1344 1327
1345static void flush_queued_bios(struct work_struct *work)
1346{
1347 struct dm_snapshot *s =
1348 container_of(work, struct dm_snapshot, queued_bios_work);
1349 struct bio *queued_bios;
1350 unsigned long flags;
1351
1352 spin_lock_irqsave(&s->pe_lock, flags);
1353 queued_bios = bio_list_get(&s->queued_bios);
1354 spin_unlock_irqrestore(&s->pe_lock, flags);
1355
1356 flush_bios(queued_bios);
1357}
1358
1359static int do_origin(struct dm_dev *origin, struct bio *bio); 1328static int do_origin(struct dm_dev *origin, struct bio *bio);
1360 1329
1361/* 1330/*
@@ -1760,15 +1729,6 @@ static void snapshot_merge_presuspend(struct dm_target *ti)
1760 stop_merge(s); 1729 stop_merge(s);
1761} 1730}
1762 1731
1763static void snapshot_postsuspend(struct dm_target *ti)
1764{
1765 struct dm_snapshot *s = ti->private;
1766
1767 down_write(&s->lock);
1768 s->suspended = 1;
1769 up_write(&s->lock);
1770}
1771
1772static int snapshot_preresume(struct dm_target *ti) 1732static int snapshot_preresume(struct dm_target *ti)
1773{ 1733{
1774 int r = 0; 1734 int r = 0;
@@ -1783,7 +1743,7 @@ static int snapshot_preresume(struct dm_target *ti)
1783 DMERR("Unable to resume snapshot source until " 1743 DMERR("Unable to resume snapshot source until "
1784 "handover completes."); 1744 "handover completes.");
1785 r = -EINVAL; 1745 r = -EINVAL;
1786 } else if (!snap_src->suspended) { 1746 } else if (!dm_suspended(snap_src->ti)) {
1787 DMERR("Unable to perform snapshot handover until " 1747 DMERR("Unable to perform snapshot handover until "
1788 "source is suspended."); 1748 "source is suspended.");
1789 r = -EINVAL; 1749 r = -EINVAL;
@@ -1816,7 +1776,6 @@ static void snapshot_resume(struct dm_target *ti)
1816 1776
1817 down_write(&s->lock); 1777 down_write(&s->lock);
1818 s->active = 1; 1778 s->active = 1;
1819 s->suspended = 0;
1820 up_write(&s->lock); 1779 up_write(&s->lock);
1821} 1780}
1822 1781
@@ -2194,7 +2153,7 @@ static int origin_iterate_devices(struct dm_target *ti,
2194 2153
2195static struct target_type origin_target = { 2154static struct target_type origin_target = {
2196 .name = "snapshot-origin", 2155 .name = "snapshot-origin",
2197 .version = {1, 7, 0}, 2156 .version = {1, 7, 1},
2198 .module = THIS_MODULE, 2157 .module = THIS_MODULE,
2199 .ctr = origin_ctr, 2158 .ctr = origin_ctr,
2200 .dtr = origin_dtr, 2159 .dtr = origin_dtr,
@@ -2207,13 +2166,12 @@ static struct target_type origin_target = {
2207 2166
2208static struct target_type snapshot_target = { 2167static struct target_type snapshot_target = {
2209 .name = "snapshot", 2168 .name = "snapshot",
2210 .version = {1, 9, 0}, 2169 .version = {1, 10, 0},
2211 .module = THIS_MODULE, 2170 .module = THIS_MODULE,
2212 .ctr = snapshot_ctr, 2171 .ctr = snapshot_ctr,
2213 .dtr = snapshot_dtr, 2172 .dtr = snapshot_dtr,
2214 .map = snapshot_map, 2173 .map = snapshot_map,
2215 .end_io = snapshot_end_io, 2174 .end_io = snapshot_end_io,
2216 .postsuspend = snapshot_postsuspend,
2217 .preresume = snapshot_preresume, 2175 .preresume = snapshot_preresume,
2218 .resume = snapshot_resume, 2176 .resume = snapshot_resume,
2219 .status = snapshot_status, 2177 .status = snapshot_status,
@@ -2222,14 +2180,13 @@ static struct target_type snapshot_target = {
2222 2180
2223static struct target_type merge_target = { 2181static struct target_type merge_target = {
2224 .name = dm_snapshot_merge_target_name, 2182 .name = dm_snapshot_merge_target_name,
2225 .version = {1, 0, 0}, 2183 .version = {1, 1, 0},
2226 .module = THIS_MODULE, 2184 .module = THIS_MODULE,
2227 .ctr = snapshot_ctr, 2185 .ctr = snapshot_ctr,
2228 .dtr = snapshot_dtr, 2186 .dtr = snapshot_dtr,
2229 .map = snapshot_merge_map, 2187 .map = snapshot_merge_map,
2230 .end_io = snapshot_end_io, 2188 .end_io = snapshot_end_io,
2231 .presuspend = snapshot_merge_presuspend, 2189 .presuspend = snapshot_merge_presuspend,
2232 .postsuspend = snapshot_postsuspend,
2233 .preresume = snapshot_preresume, 2190 .preresume = snapshot_preresume,
2234 .resume = snapshot_merge_resume, 2191 .resume = snapshot_merge_resume,
2235 .status = snapshot_status, 2192 .status = snapshot_status,
@@ -2291,17 +2248,8 @@ static int __init dm_snapshot_init(void)
2291 goto bad_tracked_chunk_cache; 2248 goto bad_tracked_chunk_cache;
2292 } 2249 }
2293 2250
2294 ksnapd = create_singlethread_workqueue("ksnapd");
2295 if (!ksnapd) {
2296 DMERR("Failed to create ksnapd workqueue.");
2297 r = -ENOMEM;
2298 goto bad_pending_pool;
2299 }
2300
2301 return 0; 2251 return 0;
2302 2252
2303bad_pending_pool:
2304 kmem_cache_destroy(tracked_chunk_cache);
2305bad_tracked_chunk_cache: 2253bad_tracked_chunk_cache:
2306 kmem_cache_destroy(pending_cache); 2254 kmem_cache_destroy(pending_cache);
2307bad_pending_cache: 2255bad_pending_cache:
@@ -2322,8 +2270,6 @@ bad_register_snapshot_target:
2322 2270
2323static void __exit dm_snapshot_exit(void) 2271static void __exit dm_snapshot_exit(void)
2324{ 2272{
2325 destroy_workqueue(ksnapd);
2326
2327 dm_unregister_target(&snapshot_target); 2273 dm_unregister_target(&snapshot_target);
2328 dm_unregister_target(&origin_target); 2274 dm_unregister_target(&origin_target);
2329 dm_unregister_target(&merge_target); 2275 dm_unregister_target(&merge_target);
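
The snapshot handover check above now asks the core with dm_suspended(snap_src->ti) instead of maintaining a private s->suspended flag that had to be toggled in postsuspend/resume hooks. The sketch below shows the shape of that simplification with invented types; it is not the device-mapper interface.

#include <stdio.h>

struct mapped_device {
	int suspended;
};

struct snapshot {
	struct mapped_device *md;
	/* no private "suspended" copy to keep in sync any more */
};

static int dev_suspended(const struct mapped_device *md)
{
	return md->suspended;
}

static int can_handover(const struct snapshot *src)
{
	return dev_suspended(src->md);	/* single source of truth */
}

int main(void)
{
	struct mapped_device md = { 0 };
	struct snapshot src = { &md };

	printf("handover allowed: %d\n", can_handover(&src));
	md.suspended = 1;		/* state changes in exactly one place */
	printf("handover allowed: %d\n", can_handover(&src));
	return 0;
}
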
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index f0371b4c4fbf..dddfa14f2982 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -39,23 +39,20 @@ struct stripe_c {
39 struct dm_target *ti; 39 struct dm_target *ti;
40 40
41 /* Work struct used for triggering events*/ 41 /* Work struct used for triggering events*/
42 struct work_struct kstriped_ws; 42 struct work_struct trigger_event;
43 43
44 struct stripe stripe[0]; 44 struct stripe stripe[0];
45}; 45};
46 46
47static struct workqueue_struct *kstriped;
48
49/* 47/*
50 * An event is triggered whenever a drive 48 * An event is triggered whenever a drive
51 * drops out of a stripe volume. 49 * drops out of a stripe volume.
52 */ 50 */
53static void trigger_event(struct work_struct *work) 51static void trigger_event(struct work_struct *work)
54{ 52{
55 struct stripe_c *sc = container_of(work, struct stripe_c, kstriped_ws); 53 struct stripe_c *sc = container_of(work, struct stripe_c,
56 54 trigger_event);
57 dm_table_event(sc->ti->table); 55 dm_table_event(sc->ti->table);
58
59} 56}
60 57
61static inline struct stripe_c *alloc_context(unsigned int stripes) 58static inline struct stripe_c *alloc_context(unsigned int stripes)
@@ -160,7 +157,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
160 return -ENOMEM; 157 return -ENOMEM;
161 } 158 }
162 159
163 INIT_WORK(&sc->kstriped_ws, trigger_event); 160 INIT_WORK(&sc->trigger_event, trigger_event);
164 161
165 /* Set pointer to dm target; used in trigger_event */ 162 /* Set pointer to dm target; used in trigger_event */
166 sc->ti = ti; 163 sc->ti = ti;
@@ -211,7 +208,7 @@ static void stripe_dtr(struct dm_target *ti)
211 for (i = 0; i < sc->stripes; i++) 208 for (i = 0; i < sc->stripes; i++)
212 dm_put_device(ti, sc->stripe[i].dev); 209 dm_put_device(ti, sc->stripe[i].dev);
213 210
214 flush_workqueue(kstriped); 211 flush_work_sync(&sc->trigger_event);
215 kfree(sc); 212 kfree(sc);
216} 213}
217 214
@@ -367,7 +364,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
367 atomic_inc(&(sc->stripe[i].error_count)); 364 atomic_inc(&(sc->stripe[i].error_count));
368 if (atomic_read(&(sc->stripe[i].error_count)) < 365 if (atomic_read(&(sc->stripe[i].error_count)) <
369 DM_IO_ERROR_THRESHOLD) 366 DM_IO_ERROR_THRESHOLD)
370 queue_work(kstriped, &sc->kstriped_ws); 367 schedule_work(&sc->trigger_event);
371 } 368 }
372 369
373 return error; 370 return error;
@@ -401,7 +398,7 @@ static void stripe_io_hints(struct dm_target *ti,
401 398
402static struct target_type stripe_target = { 399static struct target_type stripe_target = {
403 .name = "striped", 400 .name = "striped",
404 .version = {1, 3, 0}, 401 .version = {1, 3, 1},
405 .module = THIS_MODULE, 402 .module = THIS_MODULE,
406 .ctr = stripe_ctr, 403 .ctr = stripe_ctr,
407 .dtr = stripe_dtr, 404 .dtr = stripe_dtr,
@@ -422,20 +419,10 @@ int __init dm_stripe_init(void)
422 return r; 419 return r;
423 } 420 }
424 421
425 kstriped = create_singlethread_workqueue("kstriped");
426 if (!kstriped) {
427 DMERR("failed to create workqueue kstriped");
428 dm_unregister_target(&stripe_target);
429 return -ENOMEM;
430 }
431
432 return r; 422 return r;
433} 423}
434 424
435void dm_stripe_exit(void) 425void dm_stripe_exit(void)
436{ 426{
437 dm_unregister_target(&stripe_target); 427 dm_unregister_target(&stripe_target);
438 destroy_workqueue(kstriped);
439
440 return;
441} 428}
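
Note: the dm-snapshot and dm-stripe hunks above remove the driver-private ksnapd/kstriped singlethread workqueues. Each target now embeds a struct work_struct, queues it on the shared kernel workqueue with schedule_work(), and drains it with flush_work_sync() in its destructor. A minimal sketch of the same conversion, using a hypothetical foo_ctx context rather than the real dm structures:

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct foo_ctx {
	struct work_struct trigger_event;	/* replaces the private workqueue */
};

static void foo_trigger_event(struct work_struct *work)
{
	struct foo_ctx *ctx = container_of(work, struct foo_ctx, trigger_event);

	/* ... report the event on behalf of ctx ... */
	(void)ctx;
}

static struct foo_ctx *foo_create(void)
{
	struct foo_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

	if (ctx)
		INIT_WORK(&ctx->trigger_event, foo_trigger_event);
	return ctx;
}

static void foo_event(struct foo_ctx *ctx)
{
	schedule_work(&ctx->trigger_event);	/* was queue_work(kstriped, ...) */
}

static void foo_destroy(struct foo_ctx *ctx)
{
	flush_work_sync(&ctx->trigger_event);	/* was flush_workqueue(kstriped) */
	kfree(ctx);
}
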
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 985c20a4f30e..dffa0ac7c4f0 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -71,6 +71,8 @@ struct dm_table {
71 void *event_context; 71 void *event_context;
72 72
73 struct dm_md_mempools *mempools; 73 struct dm_md_mempools *mempools;
74
75 struct list_head target_callbacks;
74}; 76};
75 77
76/* 78/*
@@ -204,6 +206,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
204 return -ENOMEM; 206 return -ENOMEM;
205 207
206 INIT_LIST_HEAD(&t->devices); 208 INIT_LIST_HEAD(&t->devices);
209 INIT_LIST_HEAD(&t->target_callbacks);
207 atomic_set(&t->holders, 0); 210 atomic_set(&t->holders, 0);
208 t->discards_supported = 1; 211 t->discards_supported = 1;
209 212
@@ -1225,10 +1228,17 @@ int dm_table_resume_targets(struct dm_table *t)
1225 return 0; 1228 return 0;
1226} 1229}
1227 1230
1231void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb)
1232{
1233 list_add(&cb->list, &t->target_callbacks);
1234}
1235EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks);
1236
1228int dm_table_any_congested(struct dm_table *t, int bdi_bits) 1237int dm_table_any_congested(struct dm_table *t, int bdi_bits)
1229{ 1238{
1230 struct dm_dev_internal *dd; 1239 struct dm_dev_internal *dd;
1231 struct list_head *devices = dm_table_get_devices(t); 1240 struct list_head *devices = dm_table_get_devices(t);
1241 struct dm_target_callbacks *cb;
1232 int r = 0; 1242 int r = 0;
1233 1243
1234 list_for_each_entry(dd, devices, list) { 1244 list_for_each_entry(dd, devices, list) {
@@ -1243,6 +1253,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
1243 bdevname(dd->dm_dev.bdev, b)); 1253 bdevname(dd->dm_dev.bdev, b));
1244 } 1254 }
1245 1255
1256 list_for_each_entry(cb, &t->target_callbacks, list)
1257 if (cb->congested_fn)
1258 r |= cb->congested_fn(cb, bdi_bits);
1259
1246 return r; 1260 return r;
1247} 1261}
1248 1262
@@ -1264,6 +1278,7 @@ void dm_table_unplug_all(struct dm_table *t)
1264{ 1278{
1265 struct dm_dev_internal *dd; 1279 struct dm_dev_internal *dd;
1266 struct list_head *devices = dm_table_get_devices(t); 1280 struct list_head *devices = dm_table_get_devices(t);
1281 struct dm_target_callbacks *cb;
1267 1282
1268 list_for_each_entry(dd, devices, list) { 1283 list_for_each_entry(dd, devices, list) {
1269 struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); 1284 struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev);
@@ -1276,6 +1291,10 @@ void dm_table_unplug_all(struct dm_table *t)
1276 dm_device_name(t->md), 1291 dm_device_name(t->md),
1277 bdevname(dd->dm_dev.bdev, b)); 1292 bdevname(dd->dm_dev.bdev, b));
1278 } 1293 }
1294
1295 list_for_each_entry(cb, &t->target_callbacks, list)
1296 if (cb->unplug_fn)
1297 cb->unplug_fn(cb);
1279} 1298}
1280 1299
1281struct mapped_device *dm_table_get_md(struct dm_table *t) 1300struct mapped_device *dm_table_get_md(struct dm_table *t)
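
Note: the dm-table hunks above add a per-table target_callbacks list and export dm_table_add_target_callbacks(); dm_table_any_congested() and dm_table_unplug_all() now also walk that list and call each entry's congested_fn/unplug_fn. A sketch of how a target constructor might register such a callback; the my_target wrapper and its fields are illustrative, and only the list/congested_fn/unplug_fn members implied by the hunks are assumed:

#include <linux/kernel.h>
#include <linux/device-mapper.h>

struct my_target {
	struct dm_target_callbacks callbacks;
	/* ... target-private state ... */
};

static int my_congested(struct dm_target_callbacks *cb, int bdi_bits)
{
	struct my_target *mt = container_of(cb, struct my_target, callbacks);

	/* return whichever bdi_bits are congested for mt's internal queues */
	(void)mt;
	return 0;
}

static void my_register_callbacks(struct dm_table *table, struct my_target *mt)
{
	mt->callbacks.congested_fn = my_congested;
	mt->callbacks.unplug_fn = NULL;		/* optional hook */
	dm_table_add_target_callbacks(table, &mt->callbacks);
}
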
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f48a2f359ac4..eaa3af0e0632 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -32,7 +32,6 @@
32#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" 32#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
33#define DM_COOKIE_LENGTH 24 33#define DM_COOKIE_LENGTH 24
34 34
35static DEFINE_MUTEX(dm_mutex);
36static const char *_name = DM_NAME; 35static const char *_name = DM_NAME;
37 36
38static unsigned int major = 0; 37static unsigned int major = 0;
@@ -328,7 +327,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
328{ 327{
329 struct mapped_device *md; 328 struct mapped_device *md;
330 329
331 mutex_lock(&dm_mutex);
332 spin_lock(&_minor_lock); 330 spin_lock(&_minor_lock);
333 331
334 md = bdev->bd_disk->private_data; 332 md = bdev->bd_disk->private_data;
@@ -346,7 +344,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
346 344
347out: 345out:
348 spin_unlock(&_minor_lock); 346 spin_unlock(&_minor_lock);
349 mutex_unlock(&dm_mutex);
350 347
351 return md ? 0 : -ENXIO; 348 return md ? 0 : -ENXIO;
352} 349}
@@ -355,10 +352,12 @@ static int dm_blk_close(struct gendisk *disk, fmode_t mode)
355{ 352{
356 struct mapped_device *md = disk->private_data; 353 struct mapped_device *md = disk->private_data;
357 354
358 mutex_lock(&dm_mutex); 355 spin_lock(&_minor_lock);
356
359 atomic_dec(&md->open_count); 357 atomic_dec(&md->open_count);
360 dm_put(md); 358 dm_put(md);
361 mutex_unlock(&dm_mutex); 359
360 spin_unlock(&_minor_lock);
362 361
363 return 0; 362 return 0;
364} 363}
@@ -1638,13 +1637,15 @@ static void dm_request_fn(struct request_queue *q)
1638 if (map_request(ti, clone, md)) 1637 if (map_request(ti, clone, md))
1639 goto requeued; 1638 goto requeued;
1640 1639
1641 spin_lock_irq(q->queue_lock); 1640 BUG_ON(!irqs_disabled());
1641 spin_lock(q->queue_lock);
1642 } 1642 }
1643 1643
1644 goto out; 1644 goto out;
1645 1645
1646requeued: 1646requeued:
1647 spin_lock_irq(q->queue_lock); 1647 BUG_ON(!irqs_disabled());
1648 spin_lock(q->queue_lock);
1648 1649
1649plug_and_out: 1650plug_and_out:
1650 if (!elv_queue_empty(q)) 1651 if (!elv_queue_empty(q))
@@ -1884,7 +1885,8 @@ static struct mapped_device *alloc_dev(int minor)
1884 add_disk(md->disk); 1885 add_disk(md->disk);
1885 format_dev_t(md->name, MKDEV(_major, minor)); 1886 format_dev_t(md->name, MKDEV(_major, minor));
1886 1887
1887 md->wq = create_singlethread_workqueue("kdmflush"); 1888 md->wq = alloc_workqueue("kdmflush",
1889 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
1888 if (!md->wq) 1890 if (!md->wq)
1889 goto bad_thread; 1891 goto bad_thread;
1890 1892
@@ -1992,13 +1994,14 @@ static void event_callback(void *context)
1992 wake_up(&md->eventq); 1994 wake_up(&md->eventq);
1993} 1995}
1994 1996
1997/*
1998 * Protected by md->suspend_lock obtained by dm_swap_table().
1999 */
1995static void __set_size(struct mapped_device *md, sector_t size) 2000static void __set_size(struct mapped_device *md, sector_t size)
1996{ 2001{
1997 set_capacity(md->disk, size); 2002 set_capacity(md->disk, size);
1998 2003
1999 mutex_lock(&md->bdev->bd_inode->i_mutex);
2000 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 2004 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2001 mutex_unlock(&md->bdev->bd_inode->i_mutex);
2002} 2005}
2003 2006
2004/* 2007/*
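
Note: among the dm.c hunks, create_singlethread_workqueue("kdmflush") becomes alloc_workqueue() so the flags are explicit: WQ_MEM_RECLAIM keeps a rescuer thread so flush work can make forward progress under memory pressure, and WQ_NON_REENTRANT preserves the old single-thread non-reentrancy guarantee. A minimal before/after sketch with an illustrative setup/teardown pair:

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *flush_wq;

static int flush_wq_create(void)
{
	/* before: flush_wq = create_singlethread_workqueue("kdmflush"); */
	flush_wq = alloc_workqueue("kdmflush",
				   WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
	return flush_wq ? 0 : -ENOMEM;
}

static void flush_wq_destroy(void)
{
	destroy_workqueue(flush_wq);
}
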
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7fc090ac9e28..cf8594c5ea21 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -288,10 +288,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
288 int rv; 288 int rv;
289 int cpu; 289 int cpu;
290 290
291 if (mddev == NULL || mddev->pers == NULL) { 291 if (mddev == NULL || mddev->pers == NULL
292 || !mddev->ready) {
292 bio_io_error(bio); 293 bio_io_error(bio);
293 return 0; 294 return 0;
294 } 295 }
296 smp_rmb(); /* Ensure implications of 'active' are visible */
295 rcu_read_lock(); 297 rcu_read_lock();
296 if (mddev->suspended) { 298 if (mddev->suspended) {
297 DEFINE_WAIT(__wait); 299 DEFINE_WAIT(__wait);
@@ -703,9 +705,9 @@ static struct mdk_personality *find_pers(int level, char *clevel)
703} 705}
704 706
705/* return the offset of the super block in 512byte sectors */ 707/* return the offset of the super block in 512byte sectors */
706static inline sector_t calc_dev_sboffset(struct block_device *bdev) 708static inline sector_t calc_dev_sboffset(mdk_rdev_t *rdev)
707{ 709{
708 sector_t num_sectors = i_size_read(bdev->bd_inode) / 512; 710 sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
709 return MD_NEW_SIZE_SECTORS(num_sectors); 711 return MD_NEW_SIZE_SECTORS(num_sectors);
710} 712}
711 713
@@ -763,7 +765,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
763 */ 765 */
764 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); 766 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
765 767
766 bio->bi_bdev = rdev->bdev; 768 bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
767 bio->bi_sector = sector; 769 bio->bi_sector = sector;
768 bio_add_page(bio, page, size, 0); 770 bio_add_page(bio, page, size, 0);
769 bio->bi_private = rdev; 771 bio->bi_private = rdev;
@@ -793,7 +795,7 @@ static void bi_complete(struct bio *bio, int error)
793} 795}
794 796
795int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, 797int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
796 struct page *page, int rw) 798 struct page *page, int rw, bool metadata_op)
797{ 799{
798 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); 800 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
799 struct completion event; 801 struct completion event;
@@ -801,8 +803,12 @@ int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
801 803
802 rw |= REQ_SYNC | REQ_UNPLUG; 804 rw |= REQ_SYNC | REQ_UNPLUG;
803 805
804 bio->bi_bdev = rdev->bdev; 806 bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
805 bio->bi_sector = sector; 807 rdev->meta_bdev : rdev->bdev;
808 if (metadata_op)
809 bio->bi_sector = sector + rdev->sb_start;
810 else
811 bio->bi_sector = sector + rdev->data_offset;
806 bio_add_page(bio, page, size, 0); 812 bio_add_page(bio, page, size, 0);
807 init_completion(&event); 813 init_completion(&event);
808 bio->bi_private = &event; 814 bio->bi_private = &event;
@@ -827,7 +833,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size)
827 return 0; 833 return 0;
828 834
829 835
830 if (!sync_page_io(rdev, rdev->sb_start, size, rdev->sb_page, READ)) 836 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
831 goto fail; 837 goto fail;
832 rdev->sb_loaded = 1; 838 rdev->sb_loaded = 1;
833 return 0; 839 return 0;
@@ -989,7 +995,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
989 * 995 *
990 * It also happens to be a multiple of 4Kb. 996 * It also happens to be a multiple of 4Kb.
991 */ 997 */
992 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 998 rdev->sb_start = calc_dev_sboffset(rdev);
993 999
994 ret = read_disk_sb(rdev, MD_SB_BYTES); 1000 ret = read_disk_sb(rdev, MD_SB_BYTES);
995 if (ret) return ret; 1001 if (ret) return ret;
@@ -1330,7 +1336,7 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1330 return 0; /* component must fit device */ 1336 return 0; /* component must fit device */
1331 if (rdev->mddev->bitmap_info.offset) 1337 if (rdev->mddev->bitmap_info.offset)
1332 return 0; /* can't move bitmap */ 1338 return 0; /* can't move bitmap */
1333 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 1339 rdev->sb_start = calc_dev_sboffset(rdev);
1334 if (!num_sectors || num_sectors > rdev->sb_start) 1340 if (!num_sectors || num_sectors > rdev->sb_start)
1335 num_sectors = rdev->sb_start; 1341 num_sectors = rdev->sb_start;
1336 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1342 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
@@ -2465,6 +2471,10 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2465 if (rdev2->raid_disk == slot) 2471 if (rdev2->raid_disk == slot)
2466 return -EEXIST; 2472 return -EEXIST;
2467 2473
2474 if (slot >= rdev->mddev->raid_disks &&
2475 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2476 return -ENOSPC;
2477
2468 rdev->raid_disk = slot; 2478 rdev->raid_disk = slot;
2469 if (test_bit(In_sync, &rdev->flags)) 2479 if (test_bit(In_sync, &rdev->flags))
2470 rdev->saved_raid_disk = slot; 2480 rdev->saved_raid_disk = slot;
@@ -2482,7 +2492,8 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2482 /* failure here is OK */; 2492 /* failure here is OK */;
2483 /* don't wakeup anyone, leave that to userspace. */ 2493 /* don't wakeup anyone, leave that to userspace. */
2484 } else { 2494 } else {
2485 if (slot >= rdev->mddev->raid_disks) 2495 if (slot >= rdev->mddev->raid_disks &&
2496 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2486 return -ENOSPC; 2497 return -ENOSPC;
2487 rdev->raid_disk = slot; 2498 rdev->raid_disk = slot;
2488 /* assume it is working */ 2499 /* assume it is working */
@@ -3107,7 +3118,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
3107 char nm[20]; 3118 char nm[20];
3108 if (rdev->raid_disk < 0) 3119 if (rdev->raid_disk < 0)
3109 continue; 3120 continue;
3110 if (rdev->new_raid_disk > mddev->raid_disks) 3121 if (rdev->new_raid_disk >= mddev->raid_disks)
3111 rdev->new_raid_disk = -1; 3122 rdev->new_raid_disk = -1;
3112 if (rdev->new_raid_disk == rdev->raid_disk) 3123 if (rdev->new_raid_disk == rdev->raid_disk)
3113 continue; 3124 continue;
@@ -3736,6 +3747,8 @@ action_show(mddev_t *mddev, char *page)
3736 return sprintf(page, "%s\n", type); 3747 return sprintf(page, "%s\n", type);
3737} 3748}
3738 3749
3750static void reap_sync_thread(mddev_t *mddev);
3751
3739static ssize_t 3752static ssize_t
3740action_store(mddev_t *mddev, const char *page, size_t len) 3753action_store(mddev_t *mddev, const char *page, size_t len)
3741{ 3754{
@@ -3750,9 +3763,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
3750 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { 3763 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
3751 if (mddev->sync_thread) { 3764 if (mddev->sync_thread) {
3752 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3765 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3753 md_unregister_thread(mddev->sync_thread); 3766 reap_sync_thread(mddev);
3754 mddev->sync_thread = NULL;
3755 mddev->recovery = 0;
3756 } 3767 }
3757 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3768 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3758 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 3769 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
@@ -3904,7 +3915,7 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
3904static ssize_t 3915static ssize_t
3905sync_completed_show(mddev_t *mddev, char *page) 3916sync_completed_show(mddev_t *mddev, char *page)
3906{ 3917{
3907 unsigned long max_sectors, resync; 3918 unsigned long long max_sectors, resync;
3908 3919
3909 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3920 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3910 return sprintf(page, "none\n"); 3921 return sprintf(page, "none\n");
@@ -3915,7 +3926,7 @@ sync_completed_show(mddev_t *mddev, char *page)
3915 max_sectors = mddev->dev_sectors; 3926 max_sectors = mddev->dev_sectors;
3916 3927
3917 resync = mddev->curr_resync_completed; 3928 resync = mddev->curr_resync_completed;
3918 return sprintf(page, "%lu / %lu\n", resync, max_sectors); 3929 return sprintf(page, "%llu / %llu\n", resync, max_sectors);
3919} 3930}
3920 3931
3921static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 3932static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
@@ -4002,19 +4013,24 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
4002{ 4013{
4003 char *e; 4014 char *e;
4004 unsigned long long new = simple_strtoull(buf, &e, 10); 4015 unsigned long long new = simple_strtoull(buf, &e, 10);
4016 unsigned long long old = mddev->suspend_lo;
4005 4017
4006 if (mddev->pers == NULL || 4018 if (mddev->pers == NULL ||
4007 mddev->pers->quiesce == NULL) 4019 mddev->pers->quiesce == NULL)
4008 return -EINVAL; 4020 return -EINVAL;
4009 if (buf == e || (*e && *e != '\n')) 4021 if (buf == e || (*e && *e != '\n'))
4010 return -EINVAL; 4022 return -EINVAL;
4011 if (new >= mddev->suspend_hi || 4023
4012 (new > mddev->suspend_lo && new < mddev->suspend_hi)) { 4024 mddev->suspend_lo = new;
4013 mddev->suspend_lo = new; 4025 if (new >= old)
4026 /* Shrinking suspended region */
4014 mddev->pers->quiesce(mddev, 2); 4027 mddev->pers->quiesce(mddev, 2);
4015 return len; 4028 else {
4016 } else 4029 /* Expanding suspended region - need to wait */
4017 return -EINVAL; 4030 mddev->pers->quiesce(mddev, 1);
4031 mddev->pers->quiesce(mddev, 0);
4032 }
4033 return len;
4018} 4034}
4019static struct md_sysfs_entry md_suspend_lo = 4035static struct md_sysfs_entry md_suspend_lo =
4020__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 4036__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
@@ -4031,20 +4047,24 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
4031{ 4047{
4032 char *e; 4048 char *e;
4033 unsigned long long new = simple_strtoull(buf, &e, 10); 4049 unsigned long long new = simple_strtoull(buf, &e, 10);
4050 unsigned long long old = mddev->suspend_hi;
4034 4051
4035 if (mddev->pers == NULL || 4052 if (mddev->pers == NULL ||
4036 mddev->pers->quiesce == NULL) 4053 mddev->pers->quiesce == NULL)
4037 return -EINVAL; 4054 return -EINVAL;
4038 if (buf == e || (*e && *e != '\n')) 4055 if (buf == e || (*e && *e != '\n'))
4039 return -EINVAL; 4056 return -EINVAL;
4040 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || 4057
4041 (new > mddev->suspend_lo && new > mddev->suspend_hi)) { 4058 mddev->suspend_hi = new;
4042 mddev->suspend_hi = new; 4059 if (new <= old)
4060 /* Shrinking suspended region */
4061 mddev->pers->quiesce(mddev, 2);
4062 else {
4063 /* Expanding suspended region - need to wait */
4043 mddev->pers->quiesce(mddev, 1); 4064 mddev->pers->quiesce(mddev, 1);
4044 mddev->pers->quiesce(mddev, 0); 4065 mddev->pers->quiesce(mddev, 0);
4045 return len; 4066 }
4046 } else 4067 return len;
4047 return -EINVAL;
4048} 4068}
4049static struct md_sysfs_entry md_suspend_hi = 4069static struct md_sysfs_entry md_suspend_hi =
4050__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 4070__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
@@ -4422,7 +4442,9 @@ int md_run(mddev_t *mddev)
4422 * We don't want the data to overlap the metadata, 4442 * We don't want the data to overlap the metadata,
4423 * Internal Bitmap issues have been handled elsewhere. 4443 * Internal Bitmap issues have been handled elsewhere.
4424 */ 4444 */
4425 if (rdev->data_offset < rdev->sb_start) { 4445 if (rdev->meta_bdev) {
4446 /* Nothing to check */;
4447 } else if (rdev->data_offset < rdev->sb_start) {
4426 if (mddev->dev_sectors && 4448 if (mddev->dev_sectors &&
4427 rdev->data_offset + mddev->dev_sectors 4449 rdev->data_offset + mddev->dev_sectors
4428 > rdev->sb_start) { 4450 > rdev->sb_start) {
@@ -4556,7 +4578,8 @@ int md_run(mddev_t *mddev)
4556 mddev->safemode_timer.data = (unsigned long) mddev; 4578 mddev->safemode_timer.data = (unsigned long) mddev;
4557 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 4579 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
4558 mddev->in_sync = 1; 4580 mddev->in_sync = 1;
4559 4581 smp_wmb();
4582 mddev->ready = 1;
4560 list_for_each_entry(rdev, &mddev->disks, same_set) 4583 list_for_each_entry(rdev, &mddev->disks, same_set)
4561 if (rdev->raid_disk >= 0) { 4584 if (rdev->raid_disk >= 0) {
4562 char nm[20]; 4585 char nm[20];
@@ -4693,13 +4716,12 @@ static void md_clean(mddev_t *mddev)
4693 mddev->plug = NULL; 4716 mddev->plug = NULL;
4694} 4717}
4695 4718
4696void md_stop_writes(mddev_t *mddev) 4719static void __md_stop_writes(mddev_t *mddev)
4697{ 4720{
4698 if (mddev->sync_thread) { 4721 if (mddev->sync_thread) {
4699 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4722 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4700 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4723 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4701 md_unregister_thread(mddev->sync_thread); 4724 reap_sync_thread(mddev);
4702 mddev->sync_thread = NULL;
4703 } 4725 }
4704 4726
4705 del_timer_sync(&mddev->safemode_timer); 4727 del_timer_sync(&mddev->safemode_timer);
@@ -4713,10 +4735,18 @@ void md_stop_writes(mddev_t *mddev)
4713 md_update_sb(mddev, 1); 4735 md_update_sb(mddev, 1);
4714 } 4736 }
4715} 4737}
4738
4739void md_stop_writes(mddev_t *mddev)
4740{
4741 mddev_lock(mddev);
4742 __md_stop_writes(mddev);
4743 mddev_unlock(mddev);
4744}
4716EXPORT_SYMBOL_GPL(md_stop_writes); 4745EXPORT_SYMBOL_GPL(md_stop_writes);
4717 4746
4718void md_stop(mddev_t *mddev) 4747void md_stop(mddev_t *mddev)
4719{ 4748{
4749 mddev->ready = 0;
4720 mddev->pers->stop(mddev); 4750 mddev->pers->stop(mddev);
4721 if (mddev->pers->sync_request && mddev->to_remove == NULL) 4751 if (mddev->pers->sync_request && mddev->to_remove == NULL)
4722 mddev->to_remove = &md_redundancy_group; 4752 mddev->to_remove = &md_redundancy_group;
@@ -4736,7 +4766,7 @@ static int md_set_readonly(mddev_t *mddev, int is_open)
4736 goto out; 4766 goto out;
4737 } 4767 }
4738 if (mddev->pers) { 4768 if (mddev->pers) {
4739 md_stop_writes(mddev); 4769 __md_stop_writes(mddev);
4740 4770
4741 err = -ENXIO; 4771 err = -ENXIO;
4742 if (mddev->ro==1) 4772 if (mddev->ro==1)
@@ -4773,7 +4803,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4773 if (mddev->ro) 4803 if (mddev->ro)
4774 set_disk_ro(disk, 0); 4804 set_disk_ro(disk, 0);
4775 4805
4776 md_stop_writes(mddev); 4806 __md_stop_writes(mddev);
4777 md_stop(mddev); 4807 md_stop(mddev);
4778 mddev->queue->merge_bvec_fn = NULL; 4808 mddev->queue->merge_bvec_fn = NULL;
4779 mddev->queue->unplug_fn = NULL; 4809 mddev->queue->unplug_fn = NULL;
@@ -5151,9 +5181,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
5151 /* set saved_raid_disk if appropriate */ 5181 /* set saved_raid_disk if appropriate */
5152 if (!mddev->persistent) { 5182 if (!mddev->persistent) {
5153 if (info->state & (1<<MD_DISK_SYNC) && 5183 if (info->state & (1<<MD_DISK_SYNC) &&
5154 info->raid_disk < mddev->raid_disks) 5184 info->raid_disk < mddev->raid_disks) {
5155 rdev->raid_disk = info->raid_disk; 5185 rdev->raid_disk = info->raid_disk;
5156 else 5186 set_bit(In_sync, &rdev->flags);
5187 } else
5157 rdev->raid_disk = -1; 5188 rdev->raid_disk = -1;
5158 } else 5189 } else
5159 super_types[mddev->major_version]. 5190 super_types[mddev->major_version].
@@ -5230,7 +5261,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
5230 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 5261 printk(KERN_INFO "md: nonpersistent superblock ...\n");
5231 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; 5262 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
5232 } else 5263 } else
5233 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 5264 rdev->sb_start = calc_dev_sboffset(rdev);
5234 rdev->sectors = rdev->sb_start; 5265 rdev->sectors = rdev->sb_start;
5235 5266
5236 err = bind_rdev_to_array(rdev, mddev); 5267 err = bind_rdev_to_array(rdev, mddev);
@@ -5297,7 +5328,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
5297 } 5328 }
5298 5329
5299 if (mddev->persistent) 5330 if (mddev->persistent)
5300 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 5331 rdev->sb_start = calc_dev_sboffset(rdev);
5301 else 5332 else
5302 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; 5333 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
5303 5334
@@ -5510,7 +5541,6 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
5510 * sb_start or, if that is <data_offset, it must fit before the size 5541 * sb_start or, if that is <data_offset, it must fit before the size
5511 * of each device. If num_sectors is zero, we find the largest size 5542 * of each device. If num_sectors is zero, we find the largest size
5512 * that fits. 5543 * that fits.
5513
5514 */ 5544 */
5515 if (mddev->sync_thread) 5545 if (mddev->sync_thread)
5516 return -EBUSY; 5546 return -EBUSY;
@@ -6033,7 +6063,8 @@ static int md_thread(void * arg)
6033 || kthread_should_stop(), 6063 || kthread_should_stop(),
6034 thread->timeout); 6064 thread->timeout);
6035 6065
6036 if (test_and_clear_bit(THREAD_WAKEUP, &thread->flags)) 6066 clear_bit(THREAD_WAKEUP, &thread->flags);
6067 if (!kthread_should_stop())
6037 thread->run(thread->mddev); 6068 thread->run(thread->mddev);
6038 } 6069 }
6039 6070
@@ -6799,7 +6830,7 @@ void md_do_sync(mddev_t *mddev)
6799 desc, mdname(mddev)); 6830 desc, mdname(mddev));
6800 mddev->curr_resync = j; 6831 mddev->curr_resync = j;
6801 } 6832 }
6802 mddev->curr_resync_completed = mddev->curr_resync; 6833 mddev->curr_resync_completed = j;
6803 6834
6804 while (j < max_sectors) { 6835 while (j < max_sectors) {
6805 sector_t sectors; 6836 sector_t sectors;
@@ -6817,8 +6848,7 @@ void md_do_sync(mddev_t *mddev)
6817 md_unplug(mddev); 6848 md_unplug(mddev);
6818 wait_event(mddev->recovery_wait, 6849 wait_event(mddev->recovery_wait,
6819 atomic_read(&mddev->recovery_active) == 0); 6850 atomic_read(&mddev->recovery_active) == 0);
6820 mddev->curr_resync_completed = 6851 mddev->curr_resync_completed = j;
6821 mddev->curr_resync;
6822 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 6852 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6823 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 6853 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6824 } 6854 }
@@ -7023,6 +7053,45 @@ static int remove_and_add_spares(mddev_t *mddev)
7023 } 7053 }
7024 return spares; 7054 return spares;
7025} 7055}
7056
7057static void reap_sync_thread(mddev_t *mddev)
7058{
7059 mdk_rdev_t *rdev;
7060
7061 /* resync has finished, collect result */
7062 md_unregister_thread(mddev->sync_thread);
7063 mddev->sync_thread = NULL;
7064 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
7065 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7066 /* success...*/
7067 /* activate any spares */
7068 if (mddev->pers->spare_active(mddev))
7069 sysfs_notify(&mddev->kobj, NULL,
7070 "degraded");
7071 }
7072 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7073 mddev->pers->finish_reshape)
7074 mddev->pers->finish_reshape(mddev);
7075 md_update_sb(mddev, 1);
7076
7077 /* if array is no-longer degraded, then any saved_raid_disk
7078 * information must be scrapped
7079 */
7080 if (!mddev->degraded)
7081 list_for_each_entry(rdev, &mddev->disks, same_set)
7082 rdev->saved_raid_disk = -1;
7083
7084 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7085 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7086 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7087 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7088 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7089 /* flag recovery needed just to double check */
7090 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7091 sysfs_notify_dirent_safe(mddev->sysfs_action);
7092 md_new_event(mddev);
7093}
7094
7026/* 7095/*
7027 * This routine is regularly called by all per-raid-array threads to 7096 * This routine is regularly called by all per-raid-array threads to
7028 * deal with generic issues like resync and super-block update. 7097 * deal with generic issues like resync and super-block update.
@@ -7047,9 +7116,6 @@ static int remove_and_add_spares(mddev_t *mddev)
7047 */ 7116 */
7048void md_check_recovery(mddev_t *mddev) 7117void md_check_recovery(mddev_t *mddev)
7049{ 7118{
7050 mdk_rdev_t *rdev;
7051
7052
7053 if (mddev->bitmap) 7119 if (mddev->bitmap)
7054 bitmap_daemon_work(mddev); 7120 bitmap_daemon_work(mddev);
7055 7121
@@ -7117,34 +7183,7 @@ void md_check_recovery(mddev_t *mddev)
7117 goto unlock; 7183 goto unlock;
7118 } 7184 }
7119 if (mddev->sync_thread) { 7185 if (mddev->sync_thread) {
7120 /* resync has finished, collect result */ 7186 reap_sync_thread(mddev);
7121 md_unregister_thread(mddev->sync_thread);
7122 mddev->sync_thread = NULL;
7123 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
7124 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7125 /* success...*/
7126 /* activate any spares */
7127 if (mddev->pers->spare_active(mddev))
7128 sysfs_notify(&mddev->kobj, NULL,
7129 "degraded");
7130 }
7131 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7132 mddev->pers->finish_reshape)
7133 mddev->pers->finish_reshape(mddev);
7134 md_update_sb(mddev, 1);
7135
7136 /* if array is no-longer degraded, then any saved_raid_disk
7137 * information must be scrapped
7138 */
7139 if (!mddev->degraded)
7140 list_for_each_entry(rdev, &mddev->disks, same_set)
7141 rdev->saved_raid_disk = -1;
7142
7143 mddev->recovery = 0;
7144 /* flag recovery needed just to double check */
7145 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7146 sysfs_notify_dirent_safe(mddev->sysfs_action);
7147 md_new_event(mddev);
7148 goto unlock; 7187 goto unlock;
7149 } 7188 }
7150 /* Set RUNNING before clearing NEEDED to avoid 7189 /* Set RUNNING before clearing NEEDED to avoid
@@ -7202,7 +7241,11 @@ void md_check_recovery(mddev_t *mddev)
7202 " thread...\n", 7241 " thread...\n",
7203 mdname(mddev)); 7242 mdname(mddev));
7204 /* leave the spares where they are, it shouldn't hurt */ 7243 /* leave the spares where they are, it shouldn't hurt */
7205 mddev->recovery = 0; 7244 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7245 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7246 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7247 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7248 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7206 } else 7249 } else
7207 md_wakeup_thread(mddev->sync_thread); 7250 md_wakeup_thread(mddev->sync_thread);
7208 sysfs_notify_dirent_safe(mddev->sysfs_action); 7251 sysfs_notify_dirent_safe(mddev->sysfs_action);
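
Note: several md.c hunks cooperate around the new mddev->ready flag: md_run() issues smp_wmb() and sets ready only after the personality is fully initialised, md_stop() clears ready before tearing the personality down, and md_make_request() fails bios while ready is unset, with an smp_rmb() after the check. The underlying publish/consume pattern, sketched with hypothetical names and assuming the usual kernel barrier primitives:

#include <linux/errno.h>

struct dev_state {
	int ready;		/* published last, checked first */
	void *pers;		/* ops installed during setup */
};

static void publish(struct dev_state *s, void *pers)
{
	s->pers = pers;		/* complete all initialisation ... */
	smp_wmb();		/* ... and make it visible ... */
	s->ready = 1;		/* ... before advertising readiness */
}

static int submit(struct dev_state *s)
{
	if (!s->ready)
		return -ENXIO;	/* md_make_request errors the bio here */
	smp_rmb();		/* pairs with smp_wmb(): setup is visible now */
	/* safe to use s->pers from this point on */
	return 0;
}
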
diff --git a/drivers/md/md.h b/drivers/md/md.h
index d05bab55df4e..eec517ced31a 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -60,6 +60,12 @@ struct mdk_rdev_s
60 mddev_t *mddev; /* RAID array if running */ 60 mddev_t *mddev; /* RAID array if running */
61 int last_events; /* IO event timestamp */ 61 int last_events; /* IO event timestamp */
62 62
63 /*
64 * If meta_bdev is non-NULL, it means that a separate device is
65 * being used to store the metadata (superblock/bitmap) which
66 * would otherwise be contained on the same device as the data (bdev).
67 */
68 struct block_device *meta_bdev;
63 struct block_device *bdev; /* block device handle */ 69 struct block_device *bdev; /* block device handle */
64 70
65 struct page *sb_page; 71 struct page *sb_page;
@@ -148,7 +154,8 @@ struct mddev_s
148 * are happening, so run/ 154 * are happening, so run/
149 * takeover/stop are not safe 155 * takeover/stop are not safe
150 */ 156 */
151 157 int ready; /* See when safe to pass
158 * IO requests down */
152 struct gendisk *gendisk; 159 struct gendisk *gendisk;
153 160
154 struct kobject kobj; 161 struct kobject kobj;
@@ -497,8 +504,8 @@ extern void md_flush_request(mddev_t *mddev, struct bio *bio);
497extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 504extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
498 sector_t sector, int size, struct page *page); 505 sector_t sector, int size, struct page *page);
499extern void md_super_wait(mddev_t *mddev); 506extern void md_super_wait(mddev_t *mddev);
500extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, 507extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
501 struct page *page, int rw); 508 struct page *page, int rw, bool metadata_op);
502extern void md_do_sync(mddev_t *mddev); 509extern void md_do_sync(mddev_t *mddev);
503extern void md_new_event(mddev_t *mddev); 510extern void md_new_event(mddev_t *mddev);
504extern int md_allow_write(mddev_t *mddev); 511extern int md_allow_write(mddev_t *mddev);
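
Note: the md.h hunk above changes the exported sync_page_io() prototype to take a metadata_op flag, matching the md.c hunks where the function now adds rdev->data_offset (or rdev->sb_start for metadata) itself and routes metadata I/O to rdev->meta_bdev when one is configured. A sketch of the calling convention before and after, as the raid1/raid10 hunks below apply it (assuming the md.h definitions above are in scope):

/* Old interface: the caller added the on-device offset by hand and every
 * request went to rdev->bdev:
 *
 *	sync_page_io(rdev, sect + rdev->data_offset, size, page, READ);
 *
 * New interface: pass the logical sector and say whether this is metadata. */
static int read_data_block(mdk_rdev_t *rdev, sector_t sect, int size,
			   struct page *page)
{
	return sync_page_io(rdev, sect, size, page, READ, false);
}

static int read_superblock(mdk_rdev_t *rdev, int size)
{
	/* metadata_op = true: offset by sb_start, may use rdev->meta_bdev */
	return sync_page_io(rdev, 0, size, rdev->sb_page, READ, true);
}
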
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 845cf95b612c..a23ffa397ba9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1027,8 +1027,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1027 } else 1027 } else
1028 set_bit(Faulty, &rdev->flags); 1028 set_bit(Faulty, &rdev->flags);
1029 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1029 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1030 printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" 1030 printk(KERN_ALERT
1031 KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n", 1031 "md/raid1:%s: Disk failure on %s, disabling device.\n"
1032 "md/raid1:%s: Operation continuing on %d devices.\n",
1032 mdname(mddev), bdevname(rdev->bdev, b), 1033 mdname(mddev), bdevname(rdev->bdev, b),
1033 mdname(mddev), conf->raid_disks - mddev->degraded); 1034 mdname(mddev), conf->raid_disks - mddev->degraded);
1034} 1035}
@@ -1364,10 +1365,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1364 */ 1365 */
1365 rdev = conf->mirrors[d].rdev; 1366 rdev = conf->mirrors[d].rdev;
1366 if (sync_page_io(rdev, 1367 if (sync_page_io(rdev,
1367 sect + rdev->data_offset, 1368 sect,
1368 s<<9, 1369 s<<9,
1369 bio->bi_io_vec[idx].bv_page, 1370 bio->bi_io_vec[idx].bv_page,
1370 READ)) { 1371 READ, false)) {
1371 success = 1; 1372 success = 1;
1372 break; 1373 break;
1373 } 1374 }
@@ -1390,10 +1391,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1390 rdev = conf->mirrors[d].rdev; 1391 rdev = conf->mirrors[d].rdev;
1391 atomic_add(s, &rdev->corrected_errors); 1392 atomic_add(s, &rdev->corrected_errors);
1392 if (sync_page_io(rdev, 1393 if (sync_page_io(rdev,
1393 sect + rdev->data_offset, 1394 sect,
1394 s<<9, 1395 s<<9,
1395 bio->bi_io_vec[idx].bv_page, 1396 bio->bi_io_vec[idx].bv_page,
1396 WRITE) == 0) 1397 WRITE, false) == 0)
1397 md_error(mddev, rdev); 1398 md_error(mddev, rdev);
1398 } 1399 }
1399 d = start; 1400 d = start;
@@ -1405,10 +1406,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1405 continue; 1406 continue;
1406 rdev = conf->mirrors[d].rdev; 1407 rdev = conf->mirrors[d].rdev;
1407 if (sync_page_io(rdev, 1408 if (sync_page_io(rdev,
1408 sect + rdev->data_offset, 1409 sect,
1409 s<<9, 1410 s<<9,
1410 bio->bi_io_vec[idx].bv_page, 1411 bio->bi_io_vec[idx].bv_page,
1411 READ) == 0) 1412 READ, false) == 0)
1412 md_error(mddev, rdev); 1413 md_error(mddev, rdev);
1413 } 1414 }
1414 } else { 1415 } else {
@@ -1488,10 +1489,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
1488 rdev = conf->mirrors[d].rdev; 1489 rdev = conf->mirrors[d].rdev;
1489 if (rdev && 1490 if (rdev &&
1490 test_bit(In_sync, &rdev->flags) && 1491 test_bit(In_sync, &rdev->flags) &&
1491 sync_page_io(rdev, 1492 sync_page_io(rdev, sect, s<<9,
1492 sect + rdev->data_offset, 1493 conf->tmppage, READ, false))
1493 s<<9,
1494 conf->tmppage, READ))
1495 success = 1; 1494 success = 1;
1496 else { 1495 else {
1497 d++; 1496 d++;
@@ -1514,9 +1513,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
1514 rdev = conf->mirrors[d].rdev; 1513 rdev = conf->mirrors[d].rdev;
1515 if (rdev && 1514 if (rdev &&
1516 test_bit(In_sync, &rdev->flags)) { 1515 test_bit(In_sync, &rdev->flags)) {
1517 if (sync_page_io(rdev, 1516 if (sync_page_io(rdev, sect, s<<9,
1518 sect + rdev->data_offset, 1517 conf->tmppage, WRITE, false)
1519 s<<9, conf->tmppage, WRITE)
1520 == 0) 1518 == 0)
1521 /* Well, this device is dead */ 1519 /* Well, this device is dead */
1522 md_error(mddev, rdev); 1520 md_error(mddev, rdev);
@@ -1531,9 +1529,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
1531 rdev = conf->mirrors[d].rdev; 1529 rdev = conf->mirrors[d].rdev;
1532 if (rdev && 1530 if (rdev &&
1533 test_bit(In_sync, &rdev->flags)) { 1531 test_bit(In_sync, &rdev->flags)) {
1534 if (sync_page_io(rdev, 1532 if (sync_page_io(rdev, sect, s<<9,
1535 sect + rdev->data_offset, 1533 conf->tmppage, READ, false)
1536 s<<9, conf->tmppage, READ)
1537 == 0) 1534 == 0)
1538 /* Well, this device is dead */ 1535 /* Well, this device is dead */
1539 md_error(mddev, rdev); 1536 md_error(mddev, rdev);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 0641674827f0..69b659544390 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1051,8 +1051,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1051 } 1051 }
1052 set_bit(Faulty, &rdev->flags); 1052 set_bit(Faulty, &rdev->flags);
1053 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1053 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1054 printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" 1054 printk(KERN_ALERT
1055 KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n", 1055 "md/raid10:%s: Disk failure on %s, disabling device.\n"
1056 "md/raid10:%s: Operation continuing on %d devices.\n",
1056 mdname(mddev), bdevname(rdev->bdev, b), 1057 mdname(mddev), bdevname(rdev->bdev, b),
1057 mdname(mddev), conf->raid_disks - mddev->degraded); 1058 mdname(mddev), conf->raid_disks - mddev->degraded);
1058} 1059}
@@ -1559,9 +1560,9 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1559 rcu_read_unlock(); 1560 rcu_read_unlock();
1560 success = sync_page_io(rdev, 1561 success = sync_page_io(rdev,
1561 r10_bio->devs[sl].addr + 1562 r10_bio->devs[sl].addr +
1562 sect + rdev->data_offset, 1563 sect,
1563 s<<9, 1564 s<<9,
1564 conf->tmppage, READ); 1565 conf->tmppage, READ, false);
1565 rdev_dec_pending(rdev, mddev); 1566 rdev_dec_pending(rdev, mddev);
1566 rcu_read_lock(); 1567 rcu_read_lock();
1567 if (success) 1568 if (success)
@@ -1598,8 +1599,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1598 atomic_add(s, &rdev->corrected_errors); 1599 atomic_add(s, &rdev->corrected_errors);
1599 if (sync_page_io(rdev, 1600 if (sync_page_io(rdev,
1600 r10_bio->devs[sl].addr + 1601 r10_bio->devs[sl].addr +
1601 sect + rdev->data_offset, 1602 sect,
1602 s<<9, conf->tmppage, WRITE) 1603 s<<9, conf->tmppage, WRITE, false)
1603 == 0) { 1604 == 0) {
1604 /* Well, this device is dead */ 1605 /* Well, this device is dead */
1605 printk(KERN_NOTICE 1606 printk(KERN_NOTICE
@@ -1635,9 +1636,9 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1635 rcu_read_unlock(); 1636 rcu_read_unlock();
1636 if (sync_page_io(rdev, 1637 if (sync_page_io(rdev,
1637 r10_bio->devs[sl].addr + 1638 r10_bio->devs[sl].addr +
1638 sect + rdev->data_offset, 1639 sect,
1639 s<<9, conf->tmppage, 1640 s<<9, conf->tmppage,
1640 READ) == 0) { 1641 READ, false) == 0) {
1641 /* Well, this device is dead */ 1642 /* Well, this device is dead */
1642 printk(KERN_NOTICE 1643 printk(KERN_NOTICE
1643 "md/raid10:%s: unable to read back " 1644 "md/raid10:%s: unable to read back "
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index dc574f303f8b..5044babfcda0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1721,7 +1721,6 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1721 set_bit(Faulty, &rdev->flags); 1721 set_bit(Faulty, &rdev->flags);
1722 printk(KERN_ALERT 1722 printk(KERN_ALERT
1723 "md/raid:%s: Disk failure on %s, disabling device.\n" 1723 "md/raid:%s: Disk failure on %s, disabling device.\n"
1724 KERN_ALERT
1725 "md/raid:%s: Operation continuing on %d devices.\n", 1724 "md/raid:%s: Operation continuing on %d devices.\n",
1726 mdname(mddev), 1725 mdname(mddev),
1727 bdevname(rdev->bdev, b), 1726 bdevname(rdev->bdev, b),
@@ -4237,7 +4236,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
4237 wait_event(conf->wait_for_overlap, 4236 wait_event(conf->wait_for_overlap,
4238 atomic_read(&conf->reshape_stripes)==0); 4237 atomic_read(&conf->reshape_stripes)==0);
4239 mddev->reshape_position = conf->reshape_progress; 4238 mddev->reshape_position = conf->reshape_progress;
4240 mddev->curr_resync_completed = mddev->curr_resync; 4239 mddev->curr_resync_completed = sector_nr;
4241 conf->reshape_checkpoint = jiffies; 4240 conf->reshape_checkpoint = jiffies;
4242 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4241 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4243 md_wakeup_thread(mddev->thread); 4242 md_wakeup_thread(mddev->thread);
@@ -4338,7 +4337,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
4338 wait_event(conf->wait_for_overlap, 4337 wait_event(conf->wait_for_overlap,
4339 atomic_read(&conf->reshape_stripes) == 0); 4338 atomic_read(&conf->reshape_stripes) == 0);
4340 mddev->reshape_position = conf->reshape_progress; 4339 mddev->reshape_position = conf->reshape_progress;
4341 mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors; 4340 mddev->curr_resync_completed = sector_nr;
4342 conf->reshape_checkpoint = jiffies; 4341 conf->reshape_checkpoint = jiffies;
4343 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4342 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4344 md_wakeup_thread(mddev->thread); 4343 md_wakeup_thread(mddev->thread);
@@ -5339,7 +5338,7 @@ static int raid5_spare_active(mddev_t *mddev)
5339 && !test_bit(Faulty, &tmp->rdev->flags) 5338 && !test_bit(Faulty, &tmp->rdev->flags)
5340 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 5339 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
5341 count++; 5340 count++;
5342 sysfs_notify_dirent(tmp->rdev->sysfs_state); 5341 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
5343 } 5342 }
5344 } 5343 }
5345 spin_lock_irqsave(&conf->device_lock, flags); 5344 spin_lock_irqsave(&conf->device_lock, flags);
@@ -5528,8 +5527,8 @@ static int raid5_start_reshape(mddev_t *mddev)
5528 return -ENOSPC; 5527 return -ENOSPC;
5529 5528
5530 list_for_each_entry(rdev, &mddev->disks, same_set) 5529 list_for_each_entry(rdev, &mddev->disks, same_set)
5531 if (rdev->raid_disk < 0 && 5530 if ((rdev->raid_disk < 0 || rdev->raid_disk >= conf->raid_disks)
5532 !test_bit(Faulty, &rdev->flags)) 5531 && !test_bit(Faulty, &rdev->flags))
5533 spares++; 5532 spares++;
5534 5533
5535 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 5534 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
@@ -5589,6 +5588,11 @@ static int raid5_start_reshape(mddev_t *mddev)
5589 /* Failure here is OK */; 5588 /* Failure here is OK */;
5590 } else 5589 } else
5591 break; 5590 break;
5591 } else if (rdev->raid_disk >= conf->previous_raid_disks
5592 && !test_bit(Faulty, &rdev->flags)) {
5593 /* This is a spare that was manually added */
5594 set_bit(In_sync, &rdev->flags);
5595 added_devices++;
5592 } 5596 }
5593 5597
5594 /* When a reshape changes the number of devices, ->degraded 5598 /* When a reshape changes the number of devices, ->degraded
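
Note: the raid1, raid10 and raid5 hunks above all make the same fix: a single printk() takes one KERN_<level> prefix at the very start of its format string, so a second KERN_ALERT spliced between the concatenated lines is not reinterpreted as a level and only ends up as stray text in the log. A sketch of the corrected form with placeholder arguments:

#include <linux/kernel.h>

static void report_disk_failure(const char *array, const char *disk, int left)
{
	/* one level prefix, then plain continuation lines in the same call */
	printk(KERN_ALERT
	       "md/raid:%s: Disk failure on %s, disabling device.\n"
	       "md/raid:%s: Operation continuing on %d devices.\n",
	       array, disk, array, left);
}
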
diff --git a/drivers/serial/atmel_serial.c b/drivers/serial/atmel_serial.c
index 3892666b5fbd..2a1d52fb4936 100644
--- a/drivers/serial/atmel_serial.c
+++ b/drivers/serial/atmel_serial.c
@@ -1732,6 +1732,11 @@ static int __devinit atmel_serial_probe(struct platform_device *pdev)
1732 device_init_wakeup(&pdev->dev, 1); 1732 device_init_wakeup(&pdev->dev, 1);
1733 platform_set_drvdata(pdev, port); 1733 platform_set_drvdata(pdev, port);
1734 1734
1735 if (port->rs485.flags & SER_RS485_ENABLED) {
1736 UART_PUT_MR(&port->uart, ATMEL_US_USMODE_NORMAL);
1737 UART_PUT_CR(&port->uart, ATMEL_US_RTSEN);
1738 }
1739
1735 return 0; 1740 return 0;
1736 1741
1737err_add_port: 1742err_add_port:
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 5a48ce996dea..07bec09d1dad 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -71,11 +71,18 @@ config XEN_SYS_HYPERVISOR
71 but will have no xen contents. 71 but will have no xen contents.
72 72
73config XEN_XENBUS_FRONTEND 73config XEN_XENBUS_FRONTEND
74 tristate 74 tristate
75
76config XEN_GNTDEV
77 tristate "userspace grant access device driver"
78 depends on XEN
79 select MMU_NOTIFIER
80 help
81 Allows userspace processes to use grants.
75 82
76config XEN_PLATFORM_PCI 83config XEN_PLATFORM_PCI
77 tristate "xen platform pci device driver" 84 tristate "xen platform pci device driver"
78 depends on XEN_PVHVM 85 depends on XEN_PVHVM && PCI
79 default m 86 default m
80 help 87 help
81 Driver for the Xen PCI Platform device: it is responsible for 88 Driver for the Xen PCI Platform device: it is responsible for
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 533a199e7a3f..5088cc2e6fe2 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -9,11 +9,14 @@ obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
9obj-$(CONFIG_XEN_XENCOMM) += xencomm.o 9obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
10obj-$(CONFIG_XEN_BALLOON) += balloon.o 10obj-$(CONFIG_XEN_BALLOON) += balloon.o
11obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o 11obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o
12obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o
12obj-$(CONFIG_XENFS) += xenfs/ 13obj-$(CONFIG_XENFS) += xenfs/
13obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o 14obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
14obj-$(CONFIG_XEN_PLATFORM_PCI) += platform-pci.o 15obj-$(CONFIG_XEN_PLATFORM_PCI) += xen-platform-pci.o
15obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o 16obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o
16obj-$(CONFIG_XEN_DOM0) += pci.o 17obj-$(CONFIG_XEN_DOM0) += pci.o
17 18
18xen-evtchn-y := evtchn.o 19xen-evtchn-y := evtchn.o
20xen-gntdev-y := gntdev.o
19 21
22xen-platform-pci-y := platform-pci.o
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
new file mode 100644
index 000000000000..1e31cdcdae1e
--- /dev/null
+++ b/drivers/xen/gntdev.c
@@ -0,0 +1,665 @@
1/******************************************************************************
2 * gntdev.c
3 *
4 * Device for accessing (in user-space) pages that have been granted by other
5 * domains.
6 *
7 * Copyright (c) 2006-2007, D G Murray.
8 * (c) 2009 Gerd Hoffmann <kraxel@redhat.com>
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#undef DEBUG
21
22#include <linux/module.h>
23#include <linux/kernel.h>
24#include <linux/init.h>
25#include <linux/miscdevice.h>
26#include <linux/fs.h>
27#include <linux/mm.h>
28#include <linux/mman.h>
29#include <linux/mmu_notifier.h>
30#include <linux/types.h>
31#include <linux/uaccess.h>
32#include <linux/sched.h>
33#include <linux/spinlock.h>
34#include <linux/slab.h>
35
36#include <xen/xen.h>
37#include <xen/grant_table.h>
38#include <xen/gntdev.h>
39#include <asm/xen/hypervisor.h>
40#include <asm/xen/hypercall.h>
41#include <asm/xen/page.h>
42
43MODULE_LICENSE("GPL");
44MODULE_AUTHOR("Derek G. Murray <Derek.Murray@cl.cam.ac.uk>, "
45 "Gerd Hoffmann <kraxel@redhat.com>");
46MODULE_DESCRIPTION("User-space granted page access driver");
47
48static int limit = 1024;
49module_param(limit, int, 0644);
50MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped at "
51 "once by a gntdev instance");
52
53struct gntdev_priv {
54 struct list_head maps;
55 uint32_t used;
56 uint32_t limit;
57 /* lock protects maps from concurrent changes */
58 spinlock_t lock;
59 struct mm_struct *mm;
60 struct mmu_notifier mn;
61};
62
63struct grant_map {
64 struct list_head next;
65 struct gntdev_priv *priv;
66 struct vm_area_struct *vma;
67 int index;
68 int count;
69 int flags;
70 int is_mapped;
71 struct ioctl_gntdev_grant_ref *grants;
72 struct gnttab_map_grant_ref *map_ops;
73 struct gnttab_unmap_grant_ref *unmap_ops;
74 struct page **pages;
75};
76
77/* ------------------------------------------------------------------ */
78
79static void gntdev_print_maps(struct gntdev_priv *priv,
80 char *text, int text_index)
81{
82#ifdef DEBUG
83 struct grant_map *map;
84
85 pr_debug("maps list (priv %p, usage %d/%d)\n",
86 priv, priv->used, priv->limit);
87
88 list_for_each_entry(map, &priv->maps, next)
89 pr_debug(" index %2d, count %2d %s\n",
90 map->index, map->count,
91 map->index == text_index && text ? text : "");
92#endif
93}
94
95static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count)
96{
97 struct grant_map *add;
98 int i;
99
100 add = kzalloc(sizeof(struct grant_map), GFP_KERNEL);
101 if (NULL == add)
102 return NULL;
103
104 add->grants = kzalloc(sizeof(add->grants[0]) * count, GFP_KERNEL);
105 add->map_ops = kzalloc(sizeof(add->map_ops[0]) * count, GFP_KERNEL);
106 add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL);
107 add->pages = kzalloc(sizeof(add->pages[0]) * count, GFP_KERNEL);
108 if (NULL == add->grants ||
109 NULL == add->map_ops ||
110 NULL == add->unmap_ops ||
111 NULL == add->pages)
112 goto err;
113
114 for (i = 0; i < count; i++) {
115 add->pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
116 if (add->pages[i] == NULL)
117 goto err;
118 }
119
120 add->index = 0;
121 add->count = count;
122 add->priv = priv;
123
124 if (add->count + priv->used > priv->limit)
125 goto err;
126
127 return add;
128
129err:
130 if (add->pages)
131 for (i = 0; i < count; i++) {
132 if (add->pages[i])
133 __free_page(add->pages[i]);
134 }
135 kfree(add->pages);
136 kfree(add->grants);
137 kfree(add->map_ops);
138 kfree(add->unmap_ops);
139 kfree(add);
140 return NULL;
141}
142
143static void gntdev_add_map(struct gntdev_priv *priv, struct grant_map *add)
144{
145 struct grant_map *map;
146
147 list_for_each_entry(map, &priv->maps, next) {
148 if (add->index + add->count < map->index) {
149 list_add_tail(&add->next, &map->next);
150 goto done;
151 }
152 add->index = map->index + map->count;
153 }
154 list_add_tail(&add->next, &priv->maps);
155
156done:
157 priv->used += add->count;
158 gntdev_print_maps(priv, "[new]", add->index);
159}
160
161static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv,
162 int index, int count)
163{
164 struct grant_map *map;
165
166 list_for_each_entry(map, &priv->maps, next) {
167 if (map->index != index)
168 continue;
169 if (map->count != count)
170 continue;
171 return map;
172 }
173 return NULL;
174}
175
176static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv,
177 unsigned long vaddr)
178{
179 struct grant_map *map;
180
181 list_for_each_entry(map, &priv->maps, next) {
182 if (!map->vma)
183 continue;
184 if (vaddr < map->vma->vm_start)
185 continue;
186 if (vaddr >= map->vma->vm_end)
187 continue;
188 return map;
189 }
190 return NULL;
191}
192
193static int gntdev_del_map(struct grant_map *map)
194{
195 int i;
196
197 if (map->vma)
198 return -EBUSY;
199 for (i = 0; i < map->count; i++)
200 if (map->unmap_ops[i].handle)
201 return -EBUSY;
202
203 map->priv->used -= map->count;
204 list_del(&map->next);
205 return 0;
206}
207
208static void gntdev_free_map(struct grant_map *map)
209{
210 int i;
211
212 if (!map)
213 return;
214
215 if (map->pages)
216 for (i = 0; i < map->count; i++) {
217 if (map->pages[i])
218 __free_page(map->pages[i]);
219 }
220 kfree(map->pages);
221 kfree(map->grants);
222 kfree(map->map_ops);
223 kfree(map->unmap_ops);
224 kfree(map);
225}
226
227/* ------------------------------------------------------------------ */
228
229static int find_grant_ptes(pte_t *pte, pgtable_t token,
230 unsigned long addr, void *data)
231{
232 struct grant_map *map = data;
233 unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
234 u64 pte_maddr;
235
236 BUG_ON(pgnr >= map->count);
237 pte_maddr = arbitrary_virt_to_machine(pte).maddr;
238
239 gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr,
240 GNTMAP_contains_pte | map->flags,
241 map->grants[pgnr].ref,
242 map->grants[pgnr].domid);
243 gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr,
244 GNTMAP_contains_pte | map->flags,
245 0 /* handle */);
246 return 0;
247}
248
249static int map_grant_pages(struct grant_map *map)
250{
251 int i, err = 0;
252
253 pr_debug("map %d+%d\n", map->index, map->count);
254 err = gnttab_map_refs(map->map_ops, map->pages, map->count);
255 if (err)
256 return err;
257
258 for (i = 0; i < map->count; i++) {
259 if (map->map_ops[i].status)
260 err = -EINVAL;
261 map->unmap_ops[i].handle = map->map_ops[i].handle;
262 }
263 return err;
264}
265
266static int unmap_grant_pages(struct grant_map *map, int offset, int pages)
267{
268 int i, err = 0;
269
270 pr_debug("map %d+%d [%d+%d]\n", map->index, map->count, offset, pages);
271 err = gnttab_unmap_refs(map->unmap_ops + offset, map->pages, pages);
272 if (err)
273 return err;
274
275 for (i = 0; i < pages; i++) {
276 if (map->unmap_ops[offset+i].status)
277 err = -EINVAL;
278 map->unmap_ops[offset+i].handle = 0;
279 }
280 return err;
281}
282
283/* ------------------------------------------------------------------ */
284
285static void gntdev_vma_close(struct vm_area_struct *vma)
286{
287 struct grant_map *map = vma->vm_private_data;
288
289 pr_debug("close %p\n", vma);
290 map->is_mapped = 0;
291 map->vma = NULL;
292 vma->vm_private_data = NULL;
293}
294
295static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
296{
297 pr_debug("vaddr %p, pgoff %ld (shouldn't happen)\n",
298 vmf->virtual_address, vmf->pgoff);
299 vmf->flags = VM_FAULT_ERROR;
300 return 0;
301}
302
303static struct vm_operations_struct gntdev_vmops = {
304 .close = gntdev_vma_close,
305 .fault = gntdev_vma_fault,
306};
307
308/* ------------------------------------------------------------------ */
309
310static void mn_invl_range_start(struct mmu_notifier *mn,
311 struct mm_struct *mm,
312 unsigned long start, unsigned long end)
313{
314 struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
315 struct grant_map *map;
316 unsigned long mstart, mend;
317 int err;
318
319 spin_lock(&priv->lock);
320 list_for_each_entry(map, &priv->maps, next) {
321 if (!map->vma)
322 continue;
323 if (!map->is_mapped)
324 continue;
325 if (map->vma->vm_start >= end)
326 continue;
327 if (map->vma->vm_end <= start)
328 continue;
329 mstart = max(start, map->vma->vm_start);
330 mend = min(end, map->vma->vm_end);
331 pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
332 map->index, map->count,
333 map->vma->vm_start, map->vma->vm_end,
334 start, end, mstart, mend);
335 err = unmap_grant_pages(map,
336 (mstart - map->vma->vm_start) >> PAGE_SHIFT,
337 (mend - mstart) >> PAGE_SHIFT);
338 WARN_ON(err);
339 }
340 spin_unlock(&priv->lock);
341}
342
343static void mn_invl_page(struct mmu_notifier *mn,
344 struct mm_struct *mm,
345 unsigned long address)
346{
347 mn_invl_range_start(mn, mm, address, address + PAGE_SIZE);
348}
349
350static void mn_release(struct mmu_notifier *mn,
351 struct mm_struct *mm)
352{
353 struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
354 struct grant_map *map;
355 int err;
356
357 spin_lock(&priv->lock);
358 list_for_each_entry(map, &priv->maps, next) {
359 if (!map->vma)
360 continue;
361 pr_debug("map %d+%d (%lx %lx)\n",
362 map->index, map->count,
363 map->vma->vm_start, map->vma->vm_end);
364 err = unmap_grant_pages(map, /* offset */ 0, map->count);
365 WARN_ON(err);
366 }
367 spin_unlock(&priv->lock);
368}
369
370struct mmu_notifier_ops gntdev_mmu_ops = {
371 .release = mn_release,
372 .invalidate_page = mn_invl_page,
373 .invalidate_range_start = mn_invl_range_start,
374};
375
376/* ------------------------------------------------------------------ */
377
378static int gntdev_open(struct inode *inode, struct file *flip)
379{
380 struct gntdev_priv *priv;
381 int ret = 0;
382
383 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
384 if (!priv)
385 return -ENOMEM;
386
387 INIT_LIST_HEAD(&priv->maps);
388 spin_lock_init(&priv->lock);
389 priv->limit = limit;
390
391 priv->mm = get_task_mm(current);
392 if (!priv->mm) {
393 kfree(priv);
394 return -ENOMEM;
395 }
396 priv->mn.ops = &gntdev_mmu_ops;
397 ret = mmu_notifier_register(&priv->mn, priv->mm);
398 mmput(priv->mm);
399
400 if (ret) {
401 kfree(priv);
402 return ret;
403 }
404
405 flip->private_data = priv;
406 pr_debug("priv %p\n", priv);
407
408 return 0;
409}
410
411static int gntdev_release(struct inode *inode, struct file *flip)
412{
413 struct gntdev_priv *priv = flip->private_data;
414 struct grant_map *map;
415 int err;
416
417 pr_debug("priv %p\n", priv);
418
419 spin_lock(&priv->lock);
420 while (!list_empty(&priv->maps)) {
421 map = list_entry(priv->maps.next, struct grant_map, next);
422 err = gntdev_del_map(map);
423 if (WARN_ON(err))
424 gntdev_free_map(map);
425
426 }
427 spin_unlock(&priv->lock);
428
429 mmu_notifier_unregister(&priv->mn, priv->mm);
430 kfree(priv);
431 return 0;
432}
433
434static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
435 struct ioctl_gntdev_map_grant_ref __user *u)
436{
437 struct ioctl_gntdev_map_grant_ref op;
438 struct grant_map *map;
439 int err;
440
441 if (copy_from_user(&op, u, sizeof(op)) != 0)
442 return -EFAULT;
443 pr_debug("priv %p, add %d\n", priv, op.count);
444 if (unlikely(op.count <= 0))
445 return -EINVAL;
446 if (unlikely(op.count > priv->limit))
447 return -EINVAL;
448
449 err = -ENOMEM;
450 map = gntdev_alloc_map(priv, op.count);
451 if (!map)
452 return err;
453 if (copy_from_user(map->grants, &u->refs,
454 sizeof(map->grants[0]) * op.count) != 0) {
455 gntdev_free_map(map);
456 return err;
457 }
458
459 spin_lock(&priv->lock);
460 gntdev_add_map(priv, map);
461 op.index = map->index << PAGE_SHIFT;
462 spin_unlock(&priv->lock);
463
464 if (copy_to_user(u, &op, sizeof(op)) != 0) {
465 spin_lock(&priv->lock);
466 gntdev_del_map(map);
467 spin_unlock(&priv->lock);
468 gntdev_free_map(map);
469 return err;
470 }
471 return 0;
472}
473
474static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
475 struct ioctl_gntdev_unmap_grant_ref __user *u)
476{
477 struct ioctl_gntdev_unmap_grant_ref op;
478 struct grant_map *map;
479 int err = -ENOENT;
480
481 if (copy_from_user(&op, u, sizeof(op)) != 0)
482 return -EFAULT;
483 pr_debug("priv %p, del %d+%d\n", priv, (int)op.index, (int)op.count);
484
485 spin_lock(&priv->lock);
486 map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
487 if (map)
488 err = gntdev_del_map(map);
489 spin_unlock(&priv->lock);
490 if (!err)
491 gntdev_free_map(map);
492 return err;
493}
494
495static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv,
496 struct ioctl_gntdev_get_offset_for_vaddr __user *u)
497{
498 struct ioctl_gntdev_get_offset_for_vaddr op;
499 struct grant_map *map;
500
501 if (copy_from_user(&op, u, sizeof(op)) != 0)
502 return -EFAULT;
503 pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr);
504
505 spin_lock(&priv->lock);
506 map = gntdev_find_map_vaddr(priv, op.vaddr);
507 if (map == NULL ||
508 map->vma->vm_start != op.vaddr) {
509 spin_unlock(&priv->lock);
510 return -EINVAL;
511 }
512 op.offset = map->index << PAGE_SHIFT;
513 op.count = map->count;
514 spin_unlock(&priv->lock);
515
516 if (copy_to_user(u, &op, sizeof(op)) != 0)
517 return -EFAULT;
518 return 0;
519}
520
521static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv,
522 struct ioctl_gntdev_set_max_grants __user *u)
523{
524 struct ioctl_gntdev_set_max_grants op;
525
526 if (copy_from_user(&op, u, sizeof(op)) != 0)
527 return -EFAULT;
528 pr_debug("priv %p, limit %d\n", priv, op.count);
529 if (op.count > limit)
530 return -E2BIG;
531
532 spin_lock(&priv->lock);
533 priv->limit = op.count;
534 spin_unlock(&priv->lock);
535 return 0;
536}
537
538static long gntdev_ioctl(struct file *flip,
539 unsigned int cmd, unsigned long arg)
540{
541 struct gntdev_priv *priv = flip->private_data;
542 void __user *ptr = (void __user *)arg;
543
544 switch (cmd) {
545 case IOCTL_GNTDEV_MAP_GRANT_REF:
546 return gntdev_ioctl_map_grant_ref(priv, ptr);
547
548 case IOCTL_GNTDEV_UNMAP_GRANT_REF:
549 return gntdev_ioctl_unmap_grant_ref(priv, ptr);
550
551 case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
552 return gntdev_ioctl_get_offset_for_vaddr(priv, ptr);
553
554 case IOCTL_GNTDEV_SET_MAX_GRANTS:
555 return gntdev_ioctl_set_max_grants(priv, ptr);
556
557 default:
558 pr_debug("priv %p, unknown cmd %x\n", priv, cmd);
559 return -ENOIOCTLCMD;
560 }
561
562 return 0;
563}
564
565static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
566{
567 struct gntdev_priv *priv = flip->private_data;
568 int index = vma->vm_pgoff;
569 int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
570 struct grant_map *map;
571 int err = -EINVAL;
572
573 if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
574 return -EINVAL;
575
576 pr_debug("map %d+%d at %lx (pgoff %lx)\n",
577 index, count, vma->vm_start, vma->vm_pgoff);
578
579 spin_lock(&priv->lock);
580 map = gntdev_find_map_index(priv, index, count);
581 if (!map)
582 goto unlock_out;
583 if (map->vma)
584 goto unlock_out;
585 if (priv->mm != vma->vm_mm) {
586 printk(KERN_WARNING "Huh? Other mm?\n");
587 goto unlock_out;
588 }
589
590 vma->vm_ops = &gntdev_vmops;
591
592 vma->vm_flags |= VM_RESERVED|VM_DONTCOPY|VM_DONTEXPAND|VM_PFNMAP;
593
594 vma->vm_private_data = map;
595 map->vma = vma;
596
597 map->flags = GNTMAP_host_map | GNTMAP_application_map;
598 if (!(vma->vm_flags & VM_WRITE))
599 map->flags |= GNTMAP_readonly;
600
601 spin_unlock(&priv->lock);
602
603 err = apply_to_page_range(vma->vm_mm, vma->vm_start,
604 vma->vm_end - vma->vm_start,
605 find_grant_ptes, map);
606 if (err) {
607 printk(KERN_WARNING "find_grant_ptes() failure.\n");
608 return err;
609 }
610
611 err = map_grant_pages(map);
612 if (err) {
613 printk(KERN_WARNING "map_grant_pages() failure.\n");
614 return err;
615 }
616
617 map->is_mapped = 1;
618
619 return 0;
620
621unlock_out:
622 spin_unlock(&priv->lock);
623 return err;
624}
625
626static const struct file_operations gntdev_fops = {
627 .owner = THIS_MODULE,
628 .open = gntdev_open,
629 .release = gntdev_release,
630 .mmap = gntdev_mmap,
631 .unlocked_ioctl = gntdev_ioctl
632};
633
634static struct miscdevice gntdev_miscdev = {
635 .minor = MISC_DYNAMIC_MINOR,
636 .name = "xen/gntdev",
637 .fops = &gntdev_fops,
638};
639
640/* ------------------------------------------------------------------ */
641
642static int __init gntdev_init(void)
643{
644 int err;
645
646 if (!xen_domain())
647 return -ENODEV;
648
649 err = misc_register(&gntdev_miscdev);
650 if (err != 0) {
651 printk(KERN_ERR "Could not register gntdev device\n");
652 return err;
653 }
654 return 0;
655}
656
657static void __exit gntdev_exit(void)
658{
659 misc_deregister(&gntdev_miscdev);
660}
661
662module_init(gntdev_init);
663module_exit(gntdev_exit);
664
665/* ------------------------------------------------------------------ */
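
For orientation, a minimal, hypothetical userspace sketch of how the character device above is meant to be driven: register one grant reference from a remote domain, mmap it at the offset gntdev returns, then unmap it. The ioctl structure layouts are assumed to come from the public gntdev header (IOCTL_GNTDEV_MAP_GRANT_REF and friends); the remote domid and grant reference are taken from the command line, and error handling is kept minimal. This is not part of the patch, only an illustration of the interface.

/* Hypothetical usage sketch for /dev/xen/gntdev (illustration only). */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <xen/gntdev.h>		/* assumed to provide the ioctl structs */

int main(int argc, char **argv)
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s <domid> <gntref>\n", argv[0]);
		return 1;
	}

	int fd = open("/dev/xen/gntdev", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Ask gntdev to track one grant; it hands back an mmap offset. */
	struct ioctl_gntdev_map_grant_ref map;
	memset(&map, 0, sizeof(map));
	map.count = 1;
	map.refs[0].domid = (uint32_t)atoi(argv[1]);
	map.refs[0].ref   = (uint32_t)atoi(argv[2]);
	if (ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &map)) {
		perror("IOCTL_GNTDEV_MAP_GRANT_REF");
		return 1;
	}

	/* The actual grant mapping is performed by gntdev_mmap() above. */
	void *page = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			  MAP_SHARED, fd, map.index);
	if (page == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("first byte of shared page: %02x\n",
	       ((unsigned char *)page)[0]);
	munmap(page, 4096);

	struct ioctl_gntdev_unmap_grant_ref unmap;
	memset(&unmap, 0, sizeof(unmap));
	unmap.index = map.index;
	unmap.count = 1;
	ioctl(fd, IOCTL_GNTDEV_UNMAP_GRANT_REF, &unmap);

	close(fd);
	return 0;
}
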
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 6c4531816496..9ef54ebc1194 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -447,6 +447,52 @@ unsigned int gnttab_max_grant_frames(void)
447} 447}
448EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); 448EXPORT_SYMBOL_GPL(gnttab_max_grant_frames);
449 449
450int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
451 struct page **pages, unsigned int count)
452{
453 int i, ret;
454 pte_t *pte;
455 unsigned long mfn;
456
457 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count);
458 if (ret)
459 return ret;
460
461 for (i = 0; i < count; i++) {
462 /* m2p override only supported for GNTMAP_contains_pte mappings */
463 if (!(map_ops[i].flags & GNTMAP_contains_pte))
464 continue;
465 pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
466 (map_ops[i].host_addr & ~PAGE_MASK));
467 mfn = pte_mfn(*pte);
468 ret = m2p_add_override(mfn, pages[i]);
469 if (ret)
470 return ret;
471 }
472
473 return ret;
474}
475EXPORT_SYMBOL_GPL(gnttab_map_refs);
476
477int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
478 struct page **pages, unsigned int count)
479{
480 int i, ret;
481
482 ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count);
483 if (ret)
484 return ret;
485
486 for (i = 0; i < count; i++) {
487 ret = m2p_remove_override(pages[i]);
488 if (ret)
489 return ret;
490 }
491
492 return ret;
493}
494EXPORT_SYMBOL_GPL(gnttab_unmap_refs);
495
450static int gnttab_map(unsigned int start_idx, unsigned int end_idx) 496static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
451{ 497{
452 struct gnttab_setup_table setup; 498 struct gnttab_setup_table setup;
diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c
index c01b5ddce529..afbe041f42c5 100644
--- a/drivers/xen/platform-pci.c
+++ b/drivers/xen/platform-pci.c
@@ -105,7 +105,7 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
105 const struct pci_device_id *ent) 105 const struct pci_device_id *ent)
106{ 106{
107 int i, ret; 107 int i, ret;
108 long ioaddr, iolen; 108 long ioaddr;
109 long mmio_addr, mmio_len; 109 long mmio_addr, mmio_len;
110 unsigned int max_nr_gframes; 110 unsigned int max_nr_gframes;
111 111
@@ -114,7 +114,6 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
114 return i; 114 return i;
115 115
116 ioaddr = pci_resource_start(pdev, 0); 116 ioaddr = pci_resource_start(pdev, 0);
117 iolen = pci_resource_len(pdev, 0);
118 117
119 mmio_addr = pci_resource_start(pdev, 1); 118 mmio_addr = pci_resource_start(pdev, 1);
120 mmio_len = pci_resource_len(pdev, 1); 119 mmio_len = pci_resource_len(pdev, 1);
@@ -125,19 +124,13 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
125 goto pci_out; 124 goto pci_out;
126 } 125 }
127 126
128 if (request_mem_region(mmio_addr, mmio_len, DRV_NAME) == NULL) { 127 ret = pci_request_region(pdev, 1, DRV_NAME);
129 dev_err(&pdev->dev, "MEM I/O resource 0x%lx @ 0x%lx busy\n", 128 if (ret < 0)
130 mmio_addr, mmio_len);
131 ret = -EBUSY;
132 goto pci_out; 129 goto pci_out;
133 }
134 130
135 if (request_region(ioaddr, iolen, DRV_NAME) == NULL) { 131 ret = pci_request_region(pdev, 0, DRV_NAME);
136 dev_err(&pdev->dev, "I/O resource 0x%lx @ 0x%lx busy\n", 132 if (ret < 0)
137 iolen, ioaddr);
138 ret = -EBUSY;
139 goto mem_out; 133 goto mem_out;
140 }
141 134
142 platform_mmio = mmio_addr; 135 platform_mmio = mmio_addr;
143 platform_mmiolen = mmio_len; 136 platform_mmiolen = mmio_len;
@@ -169,9 +162,9 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
169 return 0; 162 return 0;
170 163
171out: 164out:
172 release_region(ioaddr, iolen); 165 pci_release_region(pdev, 0);
173mem_out: 166mem_out:
174 release_mem_region(mmio_addr, mmio_len); 167 pci_release_region(pdev, 1);
175pci_out: 168pci_out:
176 pci_disable_device(pdev); 169 pci_disable_device(pdev);
177 return ret; 170 return ret;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 9ed476906327..d3b28abdd6aa 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -141,13 +141,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
141 return rc; 141 return rc;
142} 142}
143 143
144static inode *ecryptfs_get_inode(struct inode *lower_inode, 144static struct inode *ecryptfs_get_inode(struct inode *lower_inode,
145 struct super_block *sb) 145 struct super_block *sb)
146{ 146{
147 struct inode *inode; 147 struct inode *inode;
148 int rc = 0; 148 int rc = 0;
149 149
150 lower_inode = lower_dentry->d_inode;
151 if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) { 150 if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
152 rc = -EXDEV; 151 rc = -EXDEV;
153 goto out; 152 goto out;
@@ -202,7 +201,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
202{ 201{
203 struct inode *lower_inode = lower_dentry->d_inode; 202 struct inode *lower_inode = lower_dentry->d_inode;
204 struct inode *inode = ecryptfs_get_inode(lower_inode, sb); 203 struct inode *inode = ecryptfs_get_inode(lower_inode, sb);
205 if (IS_ERR(inode) 204 if (IS_ERR(inode))
206 return PTR_ERR(inode); 205 return PTR_ERR(inode);
207 if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD) 206 if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
208 d_add(dentry, inode); 207 d_add(dentry, inode);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3d06ccc953aa..59c6e4956786 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -84,13 +84,9 @@ static inline struct inode *wb_inode(struct list_head *head)
84 return list_entry(head, struct inode, i_wb_list); 84 return list_entry(head, struct inode, i_wb_list);
85} 85}
86 86
87static void bdi_queue_work(struct backing_dev_info *bdi, 87/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
88 struct wb_writeback_work *work) 88static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
89{ 89{
90 trace_writeback_queue(bdi, work);
91
92 spin_lock_bh(&bdi->wb_lock);
93 list_add_tail(&work->list, &bdi->work_list);
94 if (bdi->wb.task) { 90 if (bdi->wb.task) {
95 wake_up_process(bdi->wb.task); 91 wake_up_process(bdi->wb.task);
96 } else { 92 } else {
@@ -98,15 +94,26 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
98 * The bdi thread isn't there, wake up the forker thread which 94 * The bdi thread isn't there, wake up the forker thread which
99 * will create and run it. 95 * will create and run it.
100 */ 96 */
101 trace_writeback_nothread(bdi, work);
102 wake_up_process(default_backing_dev_info.wb.task); 97 wake_up_process(default_backing_dev_info.wb.task);
103 } 98 }
99}
100
101static void bdi_queue_work(struct backing_dev_info *bdi,
102 struct wb_writeback_work *work)
103{
104 trace_writeback_queue(bdi, work);
105
106 spin_lock_bh(&bdi->wb_lock);
107 list_add_tail(&work->list, &bdi->work_list);
108 if (!bdi->wb.task)
109 trace_writeback_nothread(bdi, work);
110 bdi_wakeup_flusher(bdi);
104 spin_unlock_bh(&bdi->wb_lock); 111 spin_unlock_bh(&bdi->wb_lock);
105} 112}
106 113
107static void 114static void
108__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, 115__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
109 bool range_cyclic, bool for_background) 116 bool range_cyclic)
110{ 117{
111 struct wb_writeback_work *work; 118 struct wb_writeback_work *work;
112 119
@@ -126,7 +133,6 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
126 work->sync_mode = WB_SYNC_NONE; 133 work->sync_mode = WB_SYNC_NONE;
127 work->nr_pages = nr_pages; 134 work->nr_pages = nr_pages;
128 work->range_cyclic = range_cyclic; 135 work->range_cyclic = range_cyclic;
129 work->for_background = for_background;
130 136
131 bdi_queue_work(bdi, work); 137 bdi_queue_work(bdi, work);
132} 138}
@@ -144,7 +150,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
144 */ 150 */
145void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) 151void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
146{ 152{
147 __bdi_start_writeback(bdi, nr_pages, true, false); 153 __bdi_start_writeback(bdi, nr_pages, true);
148} 154}
149 155
150/** 156/**
@@ -152,13 +158,21 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
152 * @bdi: the backing device to write from 158 * @bdi: the backing device to write from
153 * 159 *
154 * Description: 160 * Description:
155 * This does WB_SYNC_NONE background writeback. The IO is only 161 * This makes sure WB_SYNC_NONE background writeback happens. When
156 * started when this function returns, we make no guarentees on 162 * this function returns, it is only guaranteed that for given BDI
157 * completion. Caller need not hold sb s_umount semaphore. 163 * some IO is happening if we are over background dirty threshold.
164 * Caller need not hold sb s_umount semaphore.
158 */ 165 */
159void bdi_start_background_writeback(struct backing_dev_info *bdi) 166void bdi_start_background_writeback(struct backing_dev_info *bdi)
160{ 167{
161 __bdi_start_writeback(bdi, LONG_MAX, true, true); 168 /*
169 * We just wake up the flusher thread. It will perform background
170 * writeback as soon as there is no other work to do.
171 */
172 trace_writeback_wake_background(bdi);
173 spin_lock_bh(&bdi->wb_lock);
174 bdi_wakeup_flusher(bdi);
175 spin_unlock_bh(&bdi->wb_lock);
162} 176}
163 177
164/* 178/*
@@ -616,6 +630,7 @@ static long wb_writeback(struct bdi_writeback *wb,
616 }; 630 };
617 unsigned long oldest_jif; 631 unsigned long oldest_jif;
618 long wrote = 0; 632 long wrote = 0;
633 long write_chunk;
619 struct inode *inode; 634 struct inode *inode;
620 635
621 if (wbc.for_kupdate) { 636 if (wbc.for_kupdate) {
@@ -628,6 +643,24 @@ static long wb_writeback(struct bdi_writeback *wb,
628 wbc.range_end = LLONG_MAX; 643 wbc.range_end = LLONG_MAX;
629 } 644 }
630 645
646 /*
647 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
648 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
649 * here avoids calling into writeback_inodes_wb() more than once.
650 *
651 * The intended call sequence for WB_SYNC_ALL writeback is:
652 *
653 * wb_writeback()
654 * __writeback_inodes_sb() <== called only once
655 * write_cache_pages() <== called once for each inode
656 * (quickly) tag currently dirty pages
657 * (maybe slowly) sync all tagged pages
658 */
659 if (wbc.sync_mode == WB_SYNC_NONE)
660 write_chunk = MAX_WRITEBACK_PAGES;
661 else
662 write_chunk = LONG_MAX;
663
631 wbc.wb_start = jiffies; /* livelock avoidance */ 664 wbc.wb_start = jiffies; /* livelock avoidance */
632 for (;;) { 665 for (;;) {
633 /* 666 /*
@@ -637,6 +670,16 @@ static long wb_writeback(struct bdi_writeback *wb,
637 break; 670 break;
638 671
639 /* 672 /*
673 * Background writeout and kupdate-style writeback may
674 * run forever. Stop them if there is other work to do
675 * so that e.g. sync can proceed. They'll be restarted
676 * after the other works are all done.
677 */
678 if ((work->for_background || work->for_kupdate) &&
679 !list_empty(&wb->bdi->work_list))
680 break;
681
682 /*
640 * For background writeout, stop when we are below the 683 * For background writeout, stop when we are below the
641 * background dirty threshold 684 * background dirty threshold
642 */ 685 */
@@ -644,7 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb,
644 break; 687 break;
645 688
646 wbc.more_io = 0; 689 wbc.more_io = 0;
647 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 690 wbc.nr_to_write = write_chunk;
648 wbc.pages_skipped = 0; 691 wbc.pages_skipped = 0;
649 692
650 trace_wbc_writeback_start(&wbc, wb->bdi); 693 trace_wbc_writeback_start(&wbc, wb->bdi);
@@ -654,8 +697,8 @@ static long wb_writeback(struct bdi_writeback *wb,
654 writeback_inodes_wb(wb, &wbc); 697 writeback_inodes_wb(wb, &wbc);
655 trace_wbc_writeback_written(&wbc, wb->bdi); 698 trace_wbc_writeback_written(&wbc, wb->bdi);
656 699
657 work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; 700 work->nr_pages -= write_chunk - wbc.nr_to_write;
658 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; 701 wrote += write_chunk - wbc.nr_to_write;
659 702
660 /* 703 /*
661 * If we consumed everything, see if we have more 704 * If we consumed everything, see if we have more
@@ -670,7 +713,7 @@ static long wb_writeback(struct bdi_writeback *wb,
670 /* 713 /*
671 * Did we write something? Try for more 714 * Did we write something? Try for more
672 */ 715 */
673 if (wbc.nr_to_write < MAX_WRITEBACK_PAGES) 716 if (wbc.nr_to_write < write_chunk)
674 continue; 717 continue;
675 /* 718 /*
676 * Nothing written. Wait for some inode to 719 * Nothing written. Wait for some inode to
@@ -718,6 +761,23 @@ static unsigned long get_nr_dirty_pages(void)
718 get_nr_dirty_inodes(); 761 get_nr_dirty_inodes();
719} 762}
720 763
764static long wb_check_background_flush(struct bdi_writeback *wb)
765{
766 if (over_bground_thresh()) {
767
768 struct wb_writeback_work work = {
769 .nr_pages = LONG_MAX,
770 .sync_mode = WB_SYNC_NONE,
771 .for_background = 1,
772 .range_cyclic = 1,
773 };
774
775 return wb_writeback(wb, &work);
776 }
777
778 return 0;
779}
780
721static long wb_check_old_data_flush(struct bdi_writeback *wb) 781static long wb_check_old_data_flush(struct bdi_writeback *wb)
722{ 782{
723 unsigned long expired; 783 unsigned long expired;
@@ -787,6 +847,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
787 * Check for periodic writeback, kupdated() style 847 * Check for periodic writeback, kupdated() style
788 */ 848 */
789 wrote += wb_check_old_data_flush(wb); 849 wrote += wb_check_old_data_flush(wb);
850 wrote += wb_check_background_flush(wb);
790 clear_bit(BDI_writeback_running, &wb->bdi->state); 851 clear_bit(BDI_writeback_running, &wb->bdi->state);
791 852
792 return wrote; 853 return wrote;
@@ -873,7 +934,7 @@ void wakeup_flusher_threads(long nr_pages)
873 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 934 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
874 if (!bdi_has_dirty_io(bdi)) 935 if (!bdi_has_dirty_io(bdi))
875 continue; 936 continue;
876 __bdi_start_writeback(bdi, nr_pages, false, false); 937 __bdi_start_writeback(bdi, nr_pages, false);
877 } 938 }
878 rcu_read_unlock(); 939 rcu_read_unlock();
879} 940}
@@ -1164,7 +1225,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
1164 * @sb: the superblock 1225 * @sb: the superblock
1165 * 1226 *
1166 * This function writes and waits on any dirty inode belonging to this 1227 * This function writes and waits on any dirty inode belonging to this
1167 * super_block. The number of pages synced is returned. 1228 * super_block.
1168 */ 1229 */
1169void sync_inodes_sb(struct super_block *sb) 1230void sync_inodes_sb(struct super_block *sb)
1170{ 1231{
@@ -1242,11 +1303,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
1242EXPORT_SYMBOL(sync_inode); 1303EXPORT_SYMBOL(sync_inode);
1243 1304
1244/** 1305/**
1245 * sync_inode - write an inode to disk 1306 * sync_inode_metadata - write an inode to disk
1246 * @inode: the inode to sync 1307 * @inode: the inode to sync
1247 * @wait: wait for I/O to complete. 1308 * @wait: wait for I/O to complete.
1248 * 1309 *
1249 * Write an inode to disk and adjust it's dirty state after completion. 1310 * Write an inode to disk and adjust its dirty state after completion.
1250 * 1311 *
1251 * Note: only writes the actual inode, no associated data or other metadata. 1312 * Note: only writes the actual inode, no associated data or other metadata.
1252 */ 1313 */
diff --git a/fs/mpage.c b/fs/mpage.c
index fd56ca2ea556..d78455a81ec9 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -40,7 +40,7 @@
40 * status of that page is hard. See end_buffer_async_read() for the details. 40 * status of that page is hard. See end_buffer_async_read() for the details.
41 * There is no point in duplicating all that complexity. 41 * There is no point in duplicating all that complexity.
42 */ 42 */
43static void mpage_end_io_read(struct bio *bio, int err) 43static void mpage_end_io(struct bio *bio, int err)
44{ 44{
45 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 45 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
46 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 46 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -50,44 +50,29 @@ static void mpage_end_io_read(struct bio *bio, int err)
50 50
51 if (--bvec >= bio->bi_io_vec) 51 if (--bvec >= bio->bi_io_vec)
52 prefetchw(&bvec->bv_page->flags); 52 prefetchw(&bvec->bv_page->flags);
53 53 if (bio_data_dir(bio) == READ) {
54 if (uptodate) { 54 if (uptodate) {
55 SetPageUptodate(page); 55 SetPageUptodate(page);
56 } else { 56 } else {
57 ClearPageUptodate(page); 57 ClearPageUptodate(page);
58 SetPageError(page); 58 SetPageError(page);
59 } 59 }
60 unlock_page(page); 60 unlock_page(page);
61 } while (bvec >= bio->bi_io_vec); 61 } else { /* bio_data_dir(bio) == WRITE */
62 bio_put(bio); 62 if (!uptodate) {
63} 63 SetPageError(page);
64 64 if (page->mapping)
65static void mpage_end_io_write(struct bio *bio, int err) 65 set_bit(AS_EIO, &page->mapping->flags);
66{ 66 }
67 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 67 end_page_writeback(page);
68 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
69
70 do {
71 struct page *page = bvec->bv_page;
72
73 if (--bvec >= bio->bi_io_vec)
74 prefetchw(&bvec->bv_page->flags);
75
76 if (!uptodate){
77 SetPageError(page);
78 if (page->mapping)
79 set_bit(AS_EIO, &page->mapping->flags);
80 } 68 }
81 end_page_writeback(page);
82 } while (bvec >= bio->bi_io_vec); 69 } while (bvec >= bio->bi_io_vec);
83 bio_put(bio); 70 bio_put(bio);
84} 71}
85 72
86static struct bio *mpage_bio_submit(int rw, struct bio *bio) 73static struct bio *mpage_bio_submit(int rw, struct bio *bio)
87{ 74{
88 bio->bi_end_io = mpage_end_io_read; 75 bio->bi_end_io = mpage_end_io;
89 if (rw == WRITE)
90 bio->bi_end_io = mpage_end_io_write;
91 submit_bio(rw, bio); 76 submit_bio(rw, bio);
92 return NULL; 77 return NULL;
93} 78}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 95b081bc9e25..64ee240f3c80 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1579,6 +1579,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
1579{ 1579{
1580 struct iattr attr; 1580 struct iattr attr;
1581 int error; 1581 int error;
1582 int open_flags = 0;
1582 1583
1583 dfprintk(VFS, "NFS: create(%s/%ld), %s\n", 1584 dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
1584 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1585 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1586,7 +1587,10 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
1586 attr.ia_mode = mode; 1587 attr.ia_mode = mode;
1587 attr.ia_valid = ATTR_MODE; 1588 attr.ia_valid = ATTR_MODE;
1588 1589
1589 error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL); 1590 if ((nd->flags & LOOKUP_CREATE) != 0)
1591 open_flags = nd->intent.open.flags;
1592
1593 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL);
1590 if (error != 0) 1594 if (error != 0)
1591 goto out_err; 1595 goto out_err;
1592 return 0; 1596 return 0;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 93f1cdd5d3d7..9d096e82b201 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1151,7 +1151,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1151 goto err_task_lock; 1151 goto err_task_lock;
1152 } 1152 }
1153 1153
1154 if (oom_score_adj < task->signal->oom_score_adj && 1154 if (oom_score_adj < task->signal->oom_score_adj_min &&
1155 !capable(CAP_SYS_RESOURCE)) { 1155 !capable(CAP_SYS_RESOURCE)) {
1156 err = -EACCES; 1156 err = -EACCES;
1157 goto err_sighand; 1157 goto err_sighand;
@@ -1164,6 +1164,8 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1164 atomic_dec(&task->mm->oom_disable_count); 1164 atomic_dec(&task->mm->oom_disable_count);
1165 } 1165 }
1166 task->signal->oom_score_adj = oom_score_adj; 1166 task->signal->oom_score_adj = oom_score_adj;
1167 if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
1168 task->signal->oom_score_adj_min = oom_score_adj;
1167 /* 1169 /*
1168 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is 1170 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
1169 * always attainable. 1171 * always attainable.
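
As a small illustration of the interface this hunk hardens: a task tunes its own OOM-killer weight by writing a value in [-1000, 1000] to /proc/self/oom_score_adj, and with the change above, lowering the value below the recorded oom_score_adj_min without CAP_SYS_RESOURCE is refused with EACCES. A hedged sketch, raising (not lowering) the score, which stays unprivileged:

/* Sketch: make this process a more likely OOM victim. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/self/oom_score_adj", "w");
	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Raising the value is always allowed; writing a value below the
	 * current oom_score_adj_min would need CAP_SYS_RESOURCE. */
	if (fprintf(f, "500\n") < 0)
		perror("fprintf");
	fclose(f);
	return 0;
}
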
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index a65239cfd97e..ed257d141568 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -101,6 +101,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
101#ifdef CONFIG_MEMORY_FAILURE 101#ifdef CONFIG_MEMORY_FAILURE
102 "HardwareCorrupted: %5lu kB\n" 102 "HardwareCorrupted: %5lu kB\n"
103#endif 103#endif
104#ifdef CONFIG_TRANSPARENT_HUGEPAGE
105 "AnonHugePages: %8lu kB\n"
106#endif
104 , 107 ,
105 K(i.totalram), 108 K(i.totalram),
106 K(i.freeram), 109 K(i.freeram),
@@ -128,7 +131,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
128 K(i.freeswap), 131 K(i.freeswap),
129 K(global_page_state(NR_FILE_DIRTY)), 132 K(global_page_state(NR_FILE_DIRTY)),
130 K(global_page_state(NR_WRITEBACK)), 133 K(global_page_state(NR_WRITEBACK)),
131 K(global_page_state(NR_ANON_PAGES)), 134 K(global_page_state(NR_ANON_PAGES)
135#ifdef CONFIG_TRANSPARENT_HUGEPAGE
136 + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
137 HPAGE_PMD_NR
138#endif
139 ),
132 K(global_page_state(NR_FILE_MAPPED)), 140 K(global_page_state(NR_FILE_MAPPED)),
133 K(global_page_state(NR_SHMEM)), 141 K(global_page_state(NR_SHMEM)),
134 K(global_page_state(NR_SLAB_RECLAIMABLE) + 142 K(global_page_state(NR_SLAB_RECLAIMABLE) +
@@ -151,6 +159,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
151#ifdef CONFIG_MEMORY_FAILURE 159#ifdef CONFIG_MEMORY_FAILURE
152 ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) 160 ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
153#endif 161#endif
162#ifdef CONFIG_TRANSPARENT_HUGEPAGE
163 ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
164 HPAGE_PMD_NR)
165#endif
154 ); 166 );
155 167
156 hugetlb_report_meminfo(m); 168 hugetlb_report_meminfo(m);
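
Once this hunk is in place, the new counter reads like any other /proc/meminfo field; a trivial sketch that prints it:

/* Sketch: print the AnonHugePages line added above. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");
	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "AnonHugePages:", 14)) {
			fputs(line, stdout);
			break;
		}
	}
	fclose(f);
	return 0;
}
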
diff --git a/fs/proc/page.c b/fs/proc/page.c
index b06c674624e6..6d8e6a9e93ab 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -116,15 +116,17 @@ u64 stable_page_flags(struct page *page)
116 if (PageHuge(page)) 116 if (PageHuge(page))
117 u |= 1 << KPF_HUGE; 117 u |= 1 << KPF_HUGE;
118 118
119 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
120
121 /* 119 /*
122 * Caveats on high order pages: 120 * Caveats on high order pages: page->_count will only be set
123 * PG_buddy will only be set on the head page; SLUB/SLQB do the same 121 * -1 on the head page; SLUB/SLQB do the same for PG_slab;
124 * for PG_slab; SLOB won't set PG_slab at all on compound pages. 122 * SLOB won't set PG_slab at all on compound pages.
125 */ 123 */
124 if (PageBuddy(page))
125 u |= 1 << KPF_BUDDY;
126
127 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
128
126 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); 129 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
127 u |= kpf_copy_bit(k, KPF_BUDDY, PG_buddy);
128 130
129 u |= kpf_copy_bit(k, KPF_ERROR, PG_error); 131 u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
130 u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); 132 u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c3755bd8dd3e..60b914860f81 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -418,7 +418,8 @@ static int show_smap(struct seq_file *m, void *v)
418 "Anonymous: %8lu kB\n" 418 "Anonymous: %8lu kB\n"
419 "Swap: %8lu kB\n" 419 "Swap: %8lu kB\n"
420 "KernelPageSize: %8lu kB\n" 420 "KernelPageSize: %8lu kB\n"
421 "MMUPageSize: %8lu kB\n", 421 "MMUPageSize: %8lu kB\n"
422 "Locked: %8lu kB\n",
422 (vma->vm_end - vma->vm_start) >> 10, 423 (vma->vm_end - vma->vm_start) >> 10,
423 mss.resident >> 10, 424 mss.resident >> 10,
424 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), 425 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -430,7 +431,9 @@ static int show_smap(struct seq_file *m, void *v)
430 mss.anonymous >> 10, 431 mss.anonymous >> 10,
431 mss.swap >> 10, 432 mss.swap >> 10,
432 vma_kernel_pagesize(vma) >> 10, 433 vma_kernel_pagesize(vma) >> 10,
433 vma_mmu_pagesize(vma) >> 10); 434 vma_mmu_pagesize(vma) >> 10,
435 (vma->vm_flags & VM_LOCKED) ?
436 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
434 437
435 if (m->count < m->size) /* vma is copied successfully */ 438 if (m->count < m->size) /* vma is copied successfully */
436 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; 439 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h
index 6098cae2af8e..ff5c66080c8c 100644
--- a/include/asm-generic/gpio.h
+++ b/include/asm-generic/gpio.h
@@ -147,11 +147,11 @@ extern struct gpio_chip *gpiochip_find(void *data,
147/* Always use the library code for GPIO management calls, 147/* Always use the library code for GPIO management calls,
148 * or when sleeping may be involved. 148 * or when sleeping may be involved.
149 */ 149 */
150extern int __must_check gpio_request(unsigned gpio, const char *label); 150extern int gpio_request(unsigned gpio, const char *label);
151extern void gpio_free(unsigned gpio); 151extern void gpio_free(unsigned gpio);
152 152
153extern int __must_check gpio_direction_input(unsigned gpio); 153extern int gpio_direction_input(unsigned gpio);
154extern int __must_check gpio_direction_output(unsigned gpio, int value); 154extern int gpio_direction_output(unsigned gpio, int value);
155 155
156extern int gpio_set_debounce(unsigned gpio, unsigned debounce); 156extern int gpio_set_debounce(unsigned gpio, unsigned debounce);
157 157
@@ -192,8 +192,8 @@ struct gpio {
192 const char *label; 192 const char *label;
193}; 193};
194 194
195extern int __must_check gpio_request_one(unsigned gpio, unsigned long flags, const char *label); 195extern int gpio_request_one(unsigned gpio, unsigned long flags, const char *label);
196extern int __must_check gpio_request_array(struct gpio *array, size_t num); 196extern int gpio_request_array(struct gpio *array, size_t num);
197extern void gpio_free_array(struct gpio *array, size_t num); 197extern void gpio_free_array(struct gpio *array, size_t num);
198 198
199#ifdef CONFIG_GPIO_SYSFS 199#ifdef CONFIG_GPIO_SYSFS
diff --git a/include/asm-generic/mman-common.h b/include/asm-generic/mman-common.h
index 3da9e2742fa0..787abbb6d867 100644
--- a/include/asm-generic/mman-common.h
+++ b/include/asm-generic/mman-common.h
@@ -45,6 +45,9 @@
45#define MADV_MERGEABLE 12 /* KSM may merge identical pages */ 45#define MADV_MERGEABLE 12 /* KSM may merge identical pages */
46#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ 46#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */
47 47
48#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */
49#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */
50
48/* compatibility flags */ 51/* compatibility flags */
49#define MAP_FILE 0 52#define MAP_FILE 0
50 53
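
These advice values are consumed by madvise(2). A minimal sketch of how an application would opt a region in to transparent hugepages; it assumes libc headers do not yet expose the new constant, so it falls back to the value 14 defined above:

/* Sketch: advise a 4 MiB anonymous region as worth backing with THP. */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14	/* matches the definition added above */
#endif

int main(void)
{
	size_t len = 4UL << 20;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	if (madvise(p, len, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");	/* e.g. THP not built in */
	/* Faults in this range may now be served by PMD-sized mappings. */
	return 0;
}
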
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 6f3c6ae4fe03..f1eddf71dd0c 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -5,67 +5,108 @@
5#ifdef CONFIG_MMU 5#ifdef CONFIG_MMU
6 6
7#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS 7#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
8/* 8extern int ptep_set_access_flags(struct vm_area_struct *vma,
9 * Largely same as above, but only sets the access flags (dirty, 9 unsigned long address, pte_t *ptep,
10 * accessed, and writable). Furthermore, we know it always gets set 10 pte_t entry, int dirty);
11 * to a "more permissive" setting, which allows most architectures 11#endif
12 * to optimize this. We return whether the PTE actually changed, which 12
13 * in turn instructs the caller to do things like update__mmu_cache. 13#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
14 * This used to be done in the caller, but sparc needs minor faults to 14extern int pmdp_set_access_flags(struct vm_area_struct *vma,
15 * force that call on sun4c so we changed this macro slightly 15 unsigned long address, pmd_t *pmdp,
16 */ 16 pmd_t entry, int dirty);
17#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
18({ \
19 int __changed = !pte_same(*(__ptep), __entry); \
20 if (__changed) { \
21 set_pte_at((__vma)->vm_mm, (__address), __ptep, __entry); \
22 flush_tlb_page(__vma, __address); \
23 } \
24 __changed; \
25})
26#endif 17#endif
27 18
28#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG 19#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
29#define ptep_test_and_clear_young(__vma, __address, __ptep) \ 20static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
30({ \ 21 unsigned long address,
31 pte_t __pte = *(__ptep); \ 22 pte_t *ptep)
32 int r = 1; \ 23{
33 if (!pte_young(__pte)) \ 24 pte_t pte = *ptep;
34 r = 0; \ 25 int r = 1;
35 else \ 26 if (!pte_young(pte))
36 set_pte_at((__vma)->vm_mm, (__address), \ 27 r = 0;
37 (__ptep), pte_mkold(__pte)); \ 28 else
38 r; \ 29 set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte));
39}) 30 return r;
31}
32#endif
33
34#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
35#ifdef CONFIG_TRANSPARENT_HUGEPAGE
36static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
37 unsigned long address,
38 pmd_t *pmdp)
39{
40 pmd_t pmd = *pmdp;
41 int r = 1;
42 if (!pmd_young(pmd))
43 r = 0;
44 else
45 set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd));
46 return r;
47}
48#else /* CONFIG_TRANSPARENT_HUGEPAGE */
49static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
50 unsigned long address,
51 pmd_t *pmdp)
52{
53 BUG();
54 return 0;
55}
56#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
40#endif 57#endif
41 58
42#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH 59#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
43#define ptep_clear_flush_young(__vma, __address, __ptep) \ 60int ptep_clear_flush_young(struct vm_area_struct *vma,
44({ \ 61 unsigned long address, pte_t *ptep);
45 int __young; \ 62#endif
46 __young = ptep_test_and_clear_young(__vma, __address, __ptep); \ 63
47 if (__young) \ 64#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
48 flush_tlb_page(__vma, __address); \ 65int pmdp_clear_flush_young(struct vm_area_struct *vma,
49 __young; \ 66 unsigned long address, pmd_t *pmdp);
50})
51#endif 67#endif
52 68
53#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR 69#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
54#define ptep_get_and_clear(__mm, __address, __ptep) \ 70static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
55({ \ 71 unsigned long address,
56 pte_t __pte = *(__ptep); \ 72 pte_t *ptep)
57 pte_clear((__mm), (__address), (__ptep)); \ 73{
58 __pte; \ 74 pte_t pte = *ptep;
75 pte_clear(mm, address, ptep);
76 return pte;
77}
78#endif
79
80#ifndef __HAVE_ARCH_PMDP_GET_AND_CLEAR
81#ifdef CONFIG_TRANSPARENT_HUGEPAGE
82static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
83 unsigned long address,
84 pmd_t *pmdp)
85{
86 pmd_t pmd = *pmdp;
87 pmd_clear(mm, address, pmdp);
88 return pmd;
 59}) 89}
90#else /* CONFIG_TRANSPARENT_HUGEPAGE */
91static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
92 unsigned long address,
93 pmd_t *pmdp)
94{
95 BUG();
96 return __pmd(0);
97}
98#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
60#endif 99#endif
61 100
62#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL 101#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
63#define ptep_get_and_clear_full(__mm, __address, __ptep, __full) \ 102static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
64({ \ 103 unsigned long address, pte_t *ptep,
65 pte_t __pte; \ 104 int full)
66 __pte = ptep_get_and_clear((__mm), (__address), (__ptep)); \ 105{
67 __pte; \ 106 pte_t pte;
68}) 107 pte = ptep_get_and_clear(mm, address, ptep);
108 return pte;
109}
69#endif 110#endif
70 111
71/* 112/*
@@ -74,20 +115,25 @@
74 * not present, or in the process of an address space destruction. 115 * not present, or in the process of an address space destruction.
75 */ 116 */
76#ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL 117#ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL
77#define pte_clear_not_present_full(__mm, __address, __ptep, __full) \ 118static inline void pte_clear_not_present_full(struct mm_struct *mm,
78do { \ 119 unsigned long address,
79 pte_clear((__mm), (__address), (__ptep)); \ 120 pte_t *ptep,
80} while (0) 121 int full)
122{
123 pte_clear(mm, address, ptep);
124}
81#endif 125#endif
82 126
83#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH 127#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
84#define ptep_clear_flush(__vma, __address, __ptep) \ 128extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
85({ \ 129 unsigned long address,
86 pte_t __pte; \ 130 pte_t *ptep);
87 __pte = ptep_get_and_clear((__vma)->vm_mm, __address, __ptep); \ 131#endif
88 flush_tlb_page(__vma, __address); \ 132
89 __pte; \ 133#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
90}) 134extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma,
135 unsigned long address,
136 pmd_t *pmdp);
91#endif 137#endif
92 138
93#ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT 139#ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT
@@ -99,8 +145,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
99} 145}
100#endif 146#endif
101 147
148#ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT
149#ifdef CONFIG_TRANSPARENT_HUGEPAGE
150static inline void pmdp_set_wrprotect(struct mm_struct *mm,
151 unsigned long address, pmd_t *pmdp)
152{
153 pmd_t old_pmd = *pmdp;
154 set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd));
155}
156#else /* CONFIG_TRANSPARENT_HUGEPAGE */
157static inline void pmdp_set_wrprotect(struct mm_struct *mm,
158 unsigned long address, pmd_t *pmdp)
159{
160 BUG();
161}
162#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
163#endif
164
165#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
166extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma,
167 unsigned long address,
168 pmd_t *pmdp);
169#endif
170
102#ifndef __HAVE_ARCH_PTE_SAME 171#ifndef __HAVE_ARCH_PTE_SAME
103#define pte_same(A,B) (pte_val(A) == pte_val(B)) 172static inline int pte_same(pte_t pte_a, pte_t pte_b)
173{
174 return pte_val(pte_a) == pte_val(pte_b);
175}
176#endif
177
178#ifndef __HAVE_ARCH_PMD_SAME
179#ifdef CONFIG_TRANSPARENT_HUGEPAGE
180static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
181{
182 return pmd_val(pmd_a) == pmd_val(pmd_b);
183}
184#else /* CONFIG_TRANSPARENT_HUGEPAGE */
185static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
186{
187 BUG();
188 return 0;
189}
190#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
104#endif 191#endif
105 192
106#ifndef __HAVE_ARCH_PAGE_TEST_DIRTY 193#ifndef __HAVE_ARCH_PAGE_TEST_DIRTY
@@ -348,6 +435,24 @@ extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
348 unsigned long size); 435 unsigned long size);
349#endif 436#endif
350 437
438#ifndef CONFIG_TRANSPARENT_HUGEPAGE
439static inline int pmd_trans_huge(pmd_t pmd)
440{
441 return 0;
442}
443static inline int pmd_trans_splitting(pmd_t pmd)
444{
445 return 0;
446}
447#ifndef __HAVE_ARCH_PMD_WRITE
448static inline int pmd_write(pmd_t pmd)
449{
450 BUG();
451 return 0;
452}
453#endif /* __HAVE_ARCH_PMD_WRITE */
454#endif
455
351#endif /* !__ASSEMBLY__ */ 456#endif /* !__ASSEMBLY__ */
352 457
353#endif /* _ASM_GENERIC_PGTABLE_H */ 458#endif /* _ASM_GENERIC_PGTABLE_H */
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 5ac51552d908..dfa2ed4c0d26 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -11,6 +11,9 @@
11/* The full zone was compacted */ 11/* The full zone was compacted */
12#define COMPACT_COMPLETE 3 12#define COMPACT_COMPLETE 3
13 13
14#define COMPACT_MODE_DIRECT_RECLAIM 0
15#define COMPACT_MODE_KSWAPD 1
16
14#ifdef CONFIG_COMPACTION 17#ifdef CONFIG_COMPACTION
15extern int sysctl_compact_memory; 18extern int sysctl_compact_memory;
16extern int sysctl_compaction_handler(struct ctl_table *table, int write, 19extern int sysctl_compaction_handler(struct ctl_table *table, int write,
@@ -21,7 +24,12 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
21 24
22extern int fragmentation_index(struct zone *zone, unsigned int order); 25extern int fragmentation_index(struct zone *zone, unsigned int order);
23extern unsigned long try_to_compact_pages(struct zonelist *zonelist, 26extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
24 int order, gfp_t gfp_mask, nodemask_t *mask); 27 int order, gfp_t gfp_mask, nodemask_t *mask,
28 bool sync);
29extern unsigned long compaction_suitable(struct zone *zone, int order);
30extern unsigned long compact_zone_order(struct zone *zone, int order,
31 gfp_t gfp_mask, bool sync,
32 int compact_mode);
25 33
26/* Do not skip compaction more than 64 times */ 34/* Do not skip compaction more than 64 times */
27#define COMPACT_MAX_DEFER_SHIFT 6 35#define COMPACT_MAX_DEFER_SHIFT 6
@@ -54,7 +62,20 @@ static inline bool compaction_deferred(struct zone *zone)
54 62
55#else 63#else
56static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, 64static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
57 int order, gfp_t gfp_mask, nodemask_t *nodemask) 65 int order, gfp_t gfp_mask, nodemask_t *nodemask,
66 bool sync)
67{
68 return COMPACT_CONTINUE;
69}
70
71static inline unsigned long compaction_suitable(struct zone *zone, int order)
72{
73 return COMPACT_SKIPPED;
74}
75
76static inline unsigned long compact_zone_order(struct zone *zone, int order,
77 gfp_t gfp_mask, bool sync,
78 int compact_mode)
58{ 79{
59 return COMPACT_CONTINUE; 80 return COMPACT_CONTINUE;
60} 81}
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 2970022faa63..272496d1fae4 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -193,6 +193,13 @@ struct dm_target {
193 char *error; 193 char *error;
194}; 194};
195 195
196/* Each target can link one of these into the table */
197struct dm_target_callbacks {
198 struct list_head list;
199 int (*congested_fn) (struct dm_target_callbacks *, int);
200 void (*unplug_fn)(struct dm_target_callbacks *);
201};
202
196int dm_register_target(struct target_type *t); 203int dm_register_target(struct target_type *t);
197void dm_unregister_target(struct target_type *t); 204void dm_unregister_target(struct target_type *t);
198 205
@@ -269,6 +276,11 @@ int dm_table_add_target(struct dm_table *t, const char *type,
269 sector_t start, sector_t len, char *params); 276 sector_t start, sector_t len, char *params);
270 277
271/* 278/*
279 * Target_ctr should call this if it needs to add any callbacks.
280 */
281void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb);
282
283/*
272 * Finally call this to make the table ready for use. 284 * Finally call this to make the table ready for use.
273 */ 285 */
274int dm_table_complete(struct dm_table *t); 286int dm_table_complete(struct dm_table *t);
diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h
index 49eab360d5d4..78bbf47bbb96 100644
--- a/include/linux/dm-ioctl.h
+++ b/include/linux/dm-ioctl.h
@@ -44,7 +44,7 @@
44 * Remove a device, destroy any tables. 44 * Remove a device, destroy any tables.
45 * 45 *
46 * DM_DEV_RENAME: 46 * DM_DEV_RENAME:
47 * Rename a device. 47 * Rename a device or set its uuid if none was previously supplied.
48 * 48 *
49 * DM_SUSPEND: 49 * DM_SUSPEND:
50 * This performs both suspend and resume, depending which flag is 50 * This performs both suspend and resume, depending which flag is
@@ -267,9 +267,9 @@ enum {
267#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) 267#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
268 268
269#define DM_VERSION_MAJOR 4 269#define DM_VERSION_MAJOR 4
270#define DM_VERSION_MINOR 18 270#define DM_VERSION_MINOR 19
271#define DM_VERSION_PATCHLEVEL 0 271#define DM_VERSION_PATCHLEVEL 1
272#define DM_VERSION_EXTRA "-ioctl (2010-06-29)" 272#define DM_VERSION_EXTRA "-ioctl (2011-01-07)"
273 273
274/* Status bits */ 274/* Status bits */
275#define DM_READONLY_FLAG (1 << 0) /* In/Out */ 275#define DM_READONLY_FLAG (1 << 0) /* In/Out */
@@ -322,4 +322,10 @@ enum {
322 */ 322 */
323#define DM_UEVENT_GENERATED_FLAG (1 << 13) /* Out */ 323#define DM_UEVENT_GENERATED_FLAG (1 << 13) /* Out */
324 324
325/*
326 * If set, rename changes the uuid not the name. Only permitted
327 * if no uuid was previously supplied: an existing uuid cannot be changed.
328 */
329#define DM_UUID_FLAG (1 << 14) /* In */
330
325#endif /* _LINUX_DM_IOCTL_H */ 331#endif /* _LINUX_DM_IOCTL_H */
diff --git a/include/linux/dm-log-userspace.h b/include/linux/dm-log-userspace.h
index 0c3c3a2110c4..eeace7d3ff15 100644
--- a/include/linux/dm-log-userspace.h
+++ b/include/linux/dm-log-userspace.h
@@ -370,6 +370,16 @@
370#define DM_ULOG_REQUEST_TYPE(request_type) \ 370#define DM_ULOG_REQUEST_TYPE(request_type) \
371 (DM_ULOG_REQUEST_MASK & (request_type)) 371 (DM_ULOG_REQUEST_MASK & (request_type))
372 372
373/*
374 * DM_ULOG_REQUEST_VERSION is incremented when there is a
375 * change to the way information is passed between kernel
376 * and userspace. This could be a structure change of
377 * dm_ulog_request or a change in the way requests are
378 * issued/handled. Changes are outlined here:
379 * version 1: Initial implementation
380 */
381#define DM_ULOG_REQUEST_VERSION 1
382
373struct dm_ulog_request { 383struct dm_ulog_request {
374 /* 384 /*
375 * The local unique identifier (luid) and the universally unique 385 * The local unique identifier (luid) and the universally unique
@@ -383,8 +393,9 @@ struct dm_ulog_request {
383 */ 393 */
384 uint64_t luid; 394 uint64_t luid;
385 char uuid[DM_UUID_LEN]; 395 char uuid[DM_UUID_LEN];
386 char padding[7]; /* Padding because DM_UUID_LEN = 129 */ 396 char padding[3]; /* Padding because DM_UUID_LEN = 129 */
387 397
398 uint32_t version; /* See DM_ULOG_REQUEST_VERSION */
388 int32_t error; /* Used to report back processing errors */ 399 int32_t error; /* Used to report back processing errors */
389 400
390 uint32_t seq; /* Sequence number for request */ 401 uint32_t seq; /* Sequence number for request */
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f54adfcbec9c..a3b148a91874 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -34,6 +34,7 @@ struct vm_area_struct;
34#else 34#else
35#define ___GFP_NOTRACK 0 35#define ___GFP_NOTRACK 0
36#endif 36#endif
37#define ___GFP_NO_KSWAPD 0x400000u
37 38
38/* 39/*
39 * GFP bitmasks.. 40 * GFP bitmasks..
@@ -81,13 +82,15 @@ struct vm_area_struct;
81#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ 82#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
82#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ 83#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */
83 84
85#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
86
84/* 87/*
85 * This may seem redundant, but it's a way of annotating false positives vs. 88 * This may seem redundant, but it's a way of annotating false positives vs.
86 * allocations that simply cannot be supported (e.g. page tables). 89 * allocations that simply cannot be supported (e.g. page tables).
87 */ 90 */
88#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) 91#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
89 92
90#define __GFP_BITS_SHIFT 22 /* Room for 22 __GFP_FOO bits */ 93#define __GFP_BITS_SHIFT 23 /* Room for 23 __GFP_FOO bits */
91#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) 94#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
92 95
93/* This equals 0, but use constants in case they ever change */ 96/* This equals 0, but use constants in case they ever change */
@@ -106,6 +109,9 @@ struct vm_area_struct;
106 __GFP_HARDWALL | __GFP_HIGHMEM | \ 109 __GFP_HARDWALL | __GFP_HIGHMEM | \
107 __GFP_MOVABLE) 110 __GFP_MOVABLE)
108#define GFP_IOFS (__GFP_IO | __GFP_FS) 111#define GFP_IOFS (__GFP_IO | __GFP_FS)
112#define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
113 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
114 __GFP_NO_KSWAPD)
109 115
110#ifdef CONFIG_NUMA 116#ifdef CONFIG_NUMA
111#define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) 117#define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
@@ -325,14 +331,17 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
325{ 331{
326 return alloc_pages_current(gfp_mask, order); 332 return alloc_pages_current(gfp_mask, order);
327} 333}
328extern struct page *alloc_page_vma(gfp_t gfp_mask, 334extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
329 struct vm_area_struct *vma, unsigned long addr); 335 struct vm_area_struct *vma, unsigned long addr);
330#else 336#else
331#define alloc_pages(gfp_mask, order) \ 337#define alloc_pages(gfp_mask, order) \
332 alloc_pages_node(numa_node_id(), gfp_mask, order) 338 alloc_pages_node(numa_node_id(), gfp_mask, order)
333#define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0) 339#define alloc_pages_vma(gfp_mask, order, vma, addr) \
340 alloc_pages(gfp_mask, order)
334#endif 341#endif
335#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) 342#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
343#define alloc_page_vma(gfp_mask, vma, addr) \
344 alloc_pages_vma(gfp_mask, 0, vma, addr)
336 345
337extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); 346extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
338extern unsigned long get_zeroed_page(gfp_t gfp_mask); 347extern unsigned long get_zeroed_page(gfp_t gfp_mask);
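
To show how the pieces in this hunk are meant to be combined, a hedged kernel-side sketch (not a call site from the patch itself): a huge page allocation passes the new GFP_TRANSHUGE mask and a PMD-sized order to the now order-aware alloc_pages_vma(), with HPAGE_PMD_ORDER coming from the huge_mm.h header added later in this series.

/* Hedged sketch only: GFP_TRANSHUGE + alloc_pages_vma(..., order, ...). */
#include <linux/gfp.h>
#include <linux/huge_mm.h>
#include <linux/mm.h>

static struct page *alloc_thp_sketch(struct vm_area_struct *vma,
				     unsigned long haddr)
{
	/* __GFP_NO_KSWAPD keeps kswapd out of the way; __GFP_NORETRY and
	 * __GFP_NOWARN make failure cheap so the fault path can fall back
	 * to ordinary 4k pages. */
	return alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma, haddr);
}
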
diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index f79d67f413e4..4b47ed96f131 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -30,7 +30,7 @@ static inline int gpio_is_valid(int number)
30 return 0; 30 return 0;
31} 31}
32 32
33static inline int __must_check gpio_request(unsigned gpio, const char *label) 33static inline int gpio_request(unsigned gpio, const char *label)
34{ 34{
35 return -ENOSYS; 35 return -ENOSYS;
36} 36}
@@ -62,12 +62,12 @@ static inline void gpio_free_array(struct gpio *array, size_t num)
62 WARN_ON(1); 62 WARN_ON(1);
63} 63}
64 64
65static inline int __must_check gpio_direction_input(unsigned gpio) 65static inline int gpio_direction_input(unsigned gpio)
66{ 66{
67 return -ENOSYS; 67 return -ENOSYS;
68} 68}
69 69
70static inline int __must_check gpio_direction_output(unsigned gpio, int value) 70static inline int gpio_direction_output(unsigned gpio, int value)
71{ 71{
72 return -ENOSYS; 72 return -ENOSYS;
73} 73}
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
new file mode 100644
index 000000000000..8e6c8c42bc3c
--- /dev/null
+++ b/include/linux/huge_mm.h
@@ -0,0 +1,179 @@
1#ifndef _LINUX_HUGE_MM_H
2#define _LINUX_HUGE_MM_H
3
4extern int do_huge_pmd_anonymous_page(struct mm_struct *mm,
5 struct vm_area_struct *vma,
6 unsigned long address, pmd_t *pmd,
7 unsigned int flags);
8extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
9 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
10 struct vm_area_struct *vma);
11extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
12 unsigned long address, pmd_t *pmd,
13 pmd_t orig_pmd);
14extern pgtable_t get_pmd_huge_pte(struct mm_struct *mm);
15extern struct page *follow_trans_huge_pmd(struct mm_struct *mm,
16 unsigned long addr,
17 pmd_t *pmd,
18 unsigned int flags);
19extern int zap_huge_pmd(struct mmu_gather *tlb,
20 struct vm_area_struct *vma,
21 pmd_t *pmd);
22extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
23 unsigned long addr, unsigned long end,
24 unsigned char *vec);
25extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
26 unsigned long addr, pgprot_t newprot);
27
28enum transparent_hugepage_flag {
29 TRANSPARENT_HUGEPAGE_FLAG,
30 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
31 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
32 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
33 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
34#ifdef CONFIG_DEBUG_VM
35 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG,
36#endif
37};
38
39enum page_check_address_pmd_flag {
40 PAGE_CHECK_ADDRESS_PMD_FLAG,
41 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG,
42 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG,
43};
44extern pmd_t *page_check_address_pmd(struct page *page,
45 struct mm_struct *mm,
46 unsigned long address,
47 enum page_check_address_pmd_flag flag);
48
49#ifdef CONFIG_TRANSPARENT_HUGEPAGE
50#define HPAGE_PMD_SHIFT HPAGE_SHIFT
51#define HPAGE_PMD_MASK HPAGE_MASK
52#define HPAGE_PMD_SIZE HPAGE_SIZE
53
54#define transparent_hugepage_enabled(__vma) \
55 ((transparent_hugepage_flags & \
56 (1<<TRANSPARENT_HUGEPAGE_FLAG) || \
57 (transparent_hugepage_flags & \
58 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG) && \
59 ((__vma)->vm_flags & VM_HUGEPAGE))) && \
60 !((__vma)->vm_flags & VM_NOHUGEPAGE))
61#define transparent_hugepage_defrag(__vma) \
62 ((transparent_hugepage_flags & \
63 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)) || \
64 (transparent_hugepage_flags & \
65 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG) && \
66 (__vma)->vm_flags & VM_HUGEPAGE))
67#ifdef CONFIG_DEBUG_VM
68#define transparent_hugepage_debug_cow() \
69 (transparent_hugepage_flags & \
70 (1<<TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG))
71#else /* CONFIG_DEBUG_VM */
72#define transparent_hugepage_debug_cow() 0
73#endif /* CONFIG_DEBUG_VM */
74
75extern unsigned long transparent_hugepage_flags;
76extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
77 pmd_t *dst_pmd, pmd_t *src_pmd,
78 struct vm_area_struct *vma,
79 unsigned long addr, unsigned long end);
80extern int handle_pte_fault(struct mm_struct *mm,
81 struct vm_area_struct *vma, unsigned long address,
82 pte_t *pte, pmd_t *pmd, unsigned int flags);
83extern int split_huge_page(struct page *page);
84extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
85#define split_huge_page_pmd(__mm, __pmd) \
86 do { \
87 pmd_t *____pmd = (__pmd); \
88 if (unlikely(pmd_trans_huge(*____pmd))) \
89 __split_huge_page_pmd(__mm, ____pmd); \
90 } while (0)
91#define wait_split_huge_page(__anon_vma, __pmd) \
92 do { \
93 pmd_t *____pmd = (__pmd); \
94 spin_unlock_wait(&(__anon_vma)->root->lock); \
95 /* \
96 * spin_unlock_wait() is just a loop in C and so the \
97 * CPU can reorder anything around it. \
98 */ \
99 smp_mb(); \
100 BUG_ON(pmd_trans_splitting(*____pmd) || \
101 pmd_trans_huge(*____pmd)); \
102 } while (0)
103#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
104#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
105#if HPAGE_PMD_ORDER > MAX_ORDER
106#error "hugepages can't be allocated by the buddy allocator"
107#endif
108extern int hugepage_madvise(struct vm_area_struct *vma,
109 unsigned long *vm_flags, int advice);
110extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
111 unsigned long start,
112 unsigned long end,
113 long adjust_next);
114static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
115 unsigned long start,
116 unsigned long end,
117 long adjust_next)
118{
119 if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
120 return;
121 __vma_adjust_trans_huge(vma, start, end, adjust_next);
122}
123static inline int hpage_nr_pages(struct page *page)
124{
125 if (unlikely(PageTransHuge(page)))
126 return HPAGE_PMD_NR;
127 return 1;
128}
129static inline struct page *compound_trans_head(struct page *page)
130{
131 if (PageTail(page)) {
132 struct page *head;
133 head = page->first_page;
134 smp_rmb();
135 /*
136 * head may be a dangling pointer.
137 * __split_huge_page_refcount clears PageTail before
138 * overwriting first_page, so if PageTail is still
139 * there it means the head pointer isn't dangling.
140 */
141 if (PageTail(page))
142 return head;
143 }
144 return page;
145}
146#else /* CONFIG_TRANSPARENT_HUGEPAGE */
147#define HPAGE_PMD_SHIFT ({ BUG(); 0; })
148#define HPAGE_PMD_MASK ({ BUG(); 0; })
149#define HPAGE_PMD_SIZE ({ BUG(); 0; })
150
151#define hpage_nr_pages(x) 1
152
153#define transparent_hugepage_enabled(__vma) 0
154
155#define transparent_hugepage_flags 0UL
156static inline int split_huge_page(struct page *page)
157{
158 return 0;
159}
160#define split_huge_page_pmd(__mm, __pmd) \
161 do { } while (0)
162#define wait_split_huge_page(__anon_vma, __pmd) \
163 do { } while (0)
164#define compound_trans_head(page) compound_head(page)
165static inline int hugepage_madvise(struct vm_area_struct *vma,
166 unsigned long *vm_flags, int advice)
167{
168 BUG();
169 return 0;
170}
171static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
172 unsigned long start,
173 unsigned long end,
174 long adjust_next)
175{
176}
177#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
178
179#endif /* _LINUX_HUGE_MM_H */
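For reference, a minimal sketch (not part of the patch) of how a caller is expected to use the split_huge_page_pmd() wrapper above before doing pte-level work; walk_one_pmd() is a hypothetical helper name:

#include <linux/mm.h>

/* Hypothetical helper: force a huge pmd back to regular ptes before
 * touching pte-level entries under it. */
static void walk_one_pmd(struct mm_struct *mm, pmd_t *pmd)
{
	/* No-op unless *pmd is a transparent huge pmd. */
	split_huge_page_pmd(mm, pmd);

	/* pte_offset_map()/pte_offset_map_lock() are safe on this pmd now. */
}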
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 979c68cc7458..6a64c6fa81af 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -57,7 +57,7 @@ struct irq_desc {
57#endif 57#endif
58 58
59 struct timer_rand_state *timer_rand_state; 59 struct timer_rand_state *timer_rand_state;
60 unsigned int *kstat_irqs; 60 unsigned int __percpu *kstat_irqs;
61 irq_flow_handler_t handle_irq; 61 irq_flow_handler_t handle_irq;
62 struct irqaction *action; /* IRQ action list */ 62 struct irqaction *action; /* IRQ action list */
63 unsigned int status; /* IRQ status */ 63 unsigned int status; /* IRQ status */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 57dac7022b63..5a9d9059520b 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -600,6 +600,13 @@ struct sysinfo {
600#define NUMA_BUILD 0 600#define NUMA_BUILD 0
601#endif 601#endif
602 602
603/* This helps us avoid #ifdef CONFIG_COMPACTION */
604#ifdef CONFIG_COMPACTION
605#define COMPACTION_BUILD 1
606#else
607#define COMPACTION_BUILD 0
608#endif
609
603/* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */ 610/* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */
604#ifdef CONFIG_FTRACE_MCOUNT_RECORD 611#ifdef CONFIG_FTRACE_MCOUNT_RECORD
605# define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD 612# define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 44e83ba12b5b..0cce2db580c3 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -46,16 +46,14 @@ DECLARE_PER_CPU(struct kernel_stat, kstat);
46extern unsigned long long nr_context_switches(void); 46extern unsigned long long nr_context_switches(void);
47 47
48#ifndef CONFIG_GENERIC_HARDIRQS 48#ifndef CONFIG_GENERIC_HARDIRQS
49#define kstat_irqs_this_cpu(irq) \
50 (this_cpu_read(kstat.irqs[irq])
51 49
52struct irq_desc; 50struct irq_desc;
53 51
54static inline void kstat_incr_irqs_this_cpu(unsigned int irq, 52static inline void kstat_incr_irqs_this_cpu(unsigned int irq,
55 struct irq_desc *desc) 53 struct irq_desc *desc)
56{ 54{
57 kstat_this_cpu.irqs[irq]++; 55 __this_cpu_inc(kstat.irqs[irq]);
58 kstat_this_cpu.irqs_sum++; 56 __this_cpu_inc(kstat.irqs_sum);
59} 57}
60 58
61static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) 59static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
@@ -65,17 +63,18 @@ static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
65#else 63#else
66#include <linux/irq.h> 64#include <linux/irq.h>
67extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); 65extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
68#define kstat_irqs_this_cpu(DESC) \ 66
69 ((DESC)->kstat_irqs[smp_processor_id()]) 67#define kstat_incr_irqs_this_cpu(irqno, DESC) \
70#define kstat_incr_irqs_this_cpu(irqno, DESC) do {\ 68do { \
71 ((DESC)->kstat_irqs[smp_processor_id()]++);\ 69 __this_cpu_inc(*(DESC)->kstat_irqs); \
72 kstat_this_cpu.irqs_sum++; } while (0) 70 __this_cpu_inc(kstat.irqs_sum); \
71} while (0)
73 72
74#endif 73#endif
75 74
76static inline void kstat_incr_softirqs_this_cpu(unsigned int irq) 75static inline void kstat_incr_softirqs_this_cpu(unsigned int irq)
77{ 76{
78 kstat_this_cpu.softirqs[irq]++; 77 __this_cpu_inc(kstat.softirqs[irq]);
79} 78}
80 79
81static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu) 80static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu)
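With kstat_irqs now a per-cpu pointer, flow handlers bump the counters through the macro rather than indexing an array by CPU id; a rough, hypothetical handler fragment (the real handlers live in kernel/irq/):

#include <linux/irq.h>
#include <linux/kernel_stat.h>

/* Hypothetical flow-handler fragment. */
static void example_handle_irq(unsigned int irq, struct irq_desc *desc)
{
	kstat_incr_irqs_this_cpu(irq, desc);	/* __this_cpu_inc() underneath */

	/* ... ack the chip, run desc->action handlers, unmask ... */
}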
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
new file mode 100644
index 000000000000..6b394f0b5148
--- /dev/null
+++ b/include/linux/khugepaged.h
@@ -0,0 +1,67 @@
1#ifndef _LINUX_KHUGEPAGED_H
2#define _LINUX_KHUGEPAGED_H
3
4#include <linux/sched.h> /* MMF_VM_HUGEPAGE */
5
6#ifdef CONFIG_TRANSPARENT_HUGEPAGE
7extern int __khugepaged_enter(struct mm_struct *mm);
8extern void __khugepaged_exit(struct mm_struct *mm);
9extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma);
10
11#define khugepaged_enabled() \
12 (transparent_hugepage_flags & \
13 ((1<<TRANSPARENT_HUGEPAGE_FLAG) | \
14 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)))
15#define khugepaged_always() \
16 (transparent_hugepage_flags & \
17 (1<<TRANSPARENT_HUGEPAGE_FLAG))
18#define khugepaged_req_madv() \
19 (transparent_hugepage_flags & \
20 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))
21#define khugepaged_defrag() \
22 (transparent_hugepage_flags & \
23 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))
24
25static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
26{
27 if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags))
28 return __khugepaged_enter(mm);
29 return 0;
30}
31
32static inline void khugepaged_exit(struct mm_struct *mm)
33{
34 if (test_bit(MMF_VM_HUGEPAGE, &mm->flags))
35 __khugepaged_exit(mm);
36}
37
38static inline int khugepaged_enter(struct vm_area_struct *vma)
39{
40 if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags))
41 if ((khugepaged_always() ||
42 (khugepaged_req_madv() &&
43 vma->vm_flags & VM_HUGEPAGE)) &&
44 !(vma->vm_flags & VM_NOHUGEPAGE))
45 if (__khugepaged_enter(vma->vm_mm))
46 return -ENOMEM;
47 return 0;
48}
49#else /* CONFIG_TRANSPARENT_HUGEPAGE */
50static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
51{
52 return 0;
53}
54static inline void khugepaged_exit(struct mm_struct *mm)
55{
56}
57static inline int khugepaged_enter(struct vm_area_struct *vma)
58{
59 return 0;
60}
61static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
62{
63 return 0;
64}
65#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
66
67#endif /* _LINUX_KHUGEPAGED_H */
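From user space, the corresponding knobs are MADV_HUGEPAGE and MADV_NOHUGEPAGE; a small stand-alone sketch follows (the fallback values match the common asm-generic numbering added by this series and may differ on some architectures):

#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE	14	/* asm-generic value; arch-specific on a few ports */
#endif
#ifndef MADV_NOHUGEPAGE
#define MADV_NOHUGEPAGE	15
#endif

int main(void)
{
	size_t len = 16ul << 20;		/* 16 MiB of anonymous memory */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	if (madvise(p, len, MADV_HUGEPAGE))	/* sets VM_HUGEPAGE on the vma */
		perror("madvise(MADV_HUGEPAGE)");
	/* ... touch the memory; khugepaged may later collapse it ... */
	munmap(p, len);
	return 0;
}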
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 159a0762aeaf..6a576f989437 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -25,6 +25,11 @@ struct page_cgroup;
25struct page; 25struct page;
26struct mm_struct; 26struct mm_struct;
27 27
28/* Stats that can be updated by the kernel. */
29enum mem_cgroup_page_stat_item {
30 MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */
31};
32
28extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 33extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
29 struct list_head *dst, 34 struct list_head *dst,
30 unsigned long *scanned, int order, 35 unsigned long *scanned, int order,
@@ -93,7 +98,7 @@ extern int
93mem_cgroup_prepare_migration(struct page *page, 98mem_cgroup_prepare_migration(struct page *page,
94 struct page *newpage, struct mem_cgroup **ptr); 99 struct page *newpage, struct mem_cgroup **ptr);
95extern void mem_cgroup_end_migration(struct mem_cgroup *mem, 100extern void mem_cgroup_end_migration(struct mem_cgroup *mem,
96 struct page *oldpage, struct page *newpage); 101 struct page *oldpage, struct page *newpage, bool migration_ok);
97 102
98/* 103/*
99 * For memory reclaim. 104 * For memory reclaim.
@@ -121,7 +126,22 @@ static inline bool mem_cgroup_disabled(void)
121 return false; 126 return false;
122} 127}
123 128
124void mem_cgroup_update_file_mapped(struct page *page, int val); 129void mem_cgroup_update_page_stat(struct page *page,
130 enum mem_cgroup_page_stat_item idx,
131 int val);
132
133static inline void mem_cgroup_inc_page_stat(struct page *page,
134 enum mem_cgroup_page_stat_item idx)
135{
136 mem_cgroup_update_page_stat(page, idx, 1);
137}
138
139static inline void mem_cgroup_dec_page_stat(struct page *page,
140 enum mem_cgroup_page_stat_item idx)
141{
142 mem_cgroup_update_page_stat(page, idx, -1);
143}
144
125unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 145unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
126 gfp_t gfp_mask); 146 gfp_t gfp_mask);
127u64 mem_cgroup_get_limit(struct mem_cgroup *mem); 147u64 mem_cgroup_get_limit(struct mem_cgroup *mem);
@@ -231,8 +251,7 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
231} 251}
232 252
233static inline void mem_cgroup_end_migration(struct mem_cgroup *mem, 253static inline void mem_cgroup_end_migration(struct mem_cgroup *mem,
234 struct page *oldpage, 254 struct page *oldpage, struct page *newpage, bool migration_ok)
235 struct page *newpage)
236{ 255{
237} 256}
238 257
@@ -293,8 +312,13 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
293{ 312{
294} 313}
295 314
296static inline void mem_cgroup_update_file_mapped(struct page *page, 315static inline void mem_cgroup_inc_page_stat(struct page *page,
297 int val) 316 enum mem_cgroup_page_stat_item idx)
317{
318}
319
320static inline void mem_cgroup_dec_page_stat(struct page *page,
321 enum mem_cgroup_page_stat_item idx)
298{ 322{
299} 323}
300 324
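Callers of the removed mem_cgroup_update_file_mapped() are expected to switch to the inc/dec wrappers; a hypothetical rmap-side fragment (example_account_file_mapped() is illustrative only):

#include <linux/mm.h>
#include <linux/memcontrol.h>

/* Hypothetical fragment: account a file page being mapped or unmapped. */
static void example_account_file_mapped(struct page *page, bool mapped)
{
	if (mapped)
		mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
	else
		mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
}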
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 31c237a00c48..24376fe7ee68 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -13,12 +13,16 @@ struct mem_section;
13#ifdef CONFIG_MEMORY_HOTPLUG 13#ifdef CONFIG_MEMORY_HOTPLUG
14 14
15/* 15/*
16 * Types for free bootmem. 16 * Types for free bootmem stored in page->lru.next. These have to be in
17 * The normal smallest mapcount is -1. Here is smaller value than it. 17 * some random range in unsigned long space for debugging purposes.
18 */ 18 */
19#define SECTION_INFO (-1 - 1) 19enum {
20#define MIX_SECTION_INFO (-1 - 2) 20 MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12,
21#define NODE_INFO (-1 - 3) 21 SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE,
22 MIX_SECTION_INFO,
23 NODE_INFO,
24 MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO,
25};
22 26
23/* 27/*
24 * pgdat resizing functions 28 * pgdat resizing functions
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 085527fb8261..e39aeecfe9a2 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -13,9 +13,11 @@ extern void putback_lru_pages(struct list_head *l);
13extern int migrate_page(struct address_space *, 13extern int migrate_page(struct address_space *,
14 struct page *, struct page *); 14 struct page *, struct page *);
15extern int migrate_pages(struct list_head *l, new_page_t x, 15extern int migrate_pages(struct list_head *l, new_page_t x,
16 unsigned long private, int offlining); 16 unsigned long private, bool offlining,
17 bool sync);
17extern int migrate_huge_pages(struct list_head *l, new_page_t x, 18extern int migrate_huge_pages(struct list_head *l, new_page_t x,
18 unsigned long private, int offlining); 19 unsigned long private, bool offlining,
20 bool sync);
19 21
20extern int fail_migrate_page(struct address_space *, 22extern int fail_migrate_page(struct address_space *,
21 struct page *, struct page *); 23 struct page *, struct page *);
@@ -33,9 +35,11 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
33 35
34static inline void putback_lru_pages(struct list_head *l) {} 36static inline void putback_lru_pages(struct list_head *l) {}
35static inline int migrate_pages(struct list_head *l, new_page_t x, 37static inline int migrate_pages(struct list_head *l, new_page_t x,
36 unsigned long private, int offlining) { return -ENOSYS; } 38 unsigned long private, bool offlining,
39 bool sync) { return -ENOSYS; }
37static inline int migrate_huge_pages(struct list_head *l, new_page_t x, 40static inline int migrate_huge_pages(struct list_head *l, new_page_t x,
38 unsigned long private, int offlining) { return -ENOSYS; } 41 unsigned long private, bool offlining,
42 bool sync) { return -ENOSYS; }
39 43
40static inline int migrate_prep(void) { return -ENOSYS; } 44static inline int migrate_prep(void) { return -ENOSYS; }
41static inline int migrate_prep_local(void) { return -ENOSYS; } 45static inline int migrate_prep_local(void) { return -ENOSYS; }
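Existing migrate_pages() callers gain two explicit booleans; a hedged sketch of an updated call site (get_new_page and the surrounding names are placeholders):

#include <linux/migrate.h>

/* Hypothetical call site: not offlining memory, asynchronous migration. */
static int example_migrate_list(struct list_head *pagelist,
				new_page_t get_new_page, unsigned long private)
{
	return migrate_pages(pagelist, get_new_page, private,
			     false,	/* offlining */
			     false);	/* sync */
}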
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 721f451c3029..956a35532f47 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -14,6 +14,7 @@
14#include <linux/mm_types.h> 14#include <linux/mm_types.h>
15#include <linux/range.h> 15#include <linux/range.h>
16#include <linux/pfn.h> 16#include <linux/pfn.h>
17#include <linux/bit_spinlock.h>
17 18
18struct mempolicy; 19struct mempolicy;
19struct anon_vma; 20struct anon_vma;
@@ -82,6 +83,7 @@ extern unsigned int kobjsize(const void *objp);
82#define VM_GROWSUP 0x00000200 83#define VM_GROWSUP 0x00000200
83#else 84#else
84#define VM_GROWSUP 0x00000000 85#define VM_GROWSUP 0x00000000
86#define VM_NOHUGEPAGE 0x00000200 /* MADV_NOHUGEPAGE marked this vma */
85#endif 87#endif
86#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ 88#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
87#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ 89#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */
@@ -101,7 +103,11 @@ extern unsigned int kobjsize(const void *objp);
101#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ 103#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
102#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ 104#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
103#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ 105#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */
106#ifndef CONFIG_TRANSPARENT_HUGEPAGE
104#define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ 107#define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */
108#else
109#define VM_HUGEPAGE 0x01000000 /* MADV_HUGEPAGE marked this vma */
110#endif
105#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ 111#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */
106#define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */ 112#define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */
107 113
@@ -242,6 +248,7 @@ struct inode;
242 * files which need it (119 of them) 248 * files which need it (119 of them)
243 */ 249 */
244#include <linux/page-flags.h> 250#include <linux/page-flags.h>
251#include <linux/huge_mm.h>
245 252
246/* 253/*
247 * Methods to modify the page usage count. 254 * Methods to modify the page usage count.
@@ -305,6 +312,39 @@ static inline int is_vmalloc_or_module_addr(const void *x)
305} 312}
306#endif 313#endif
307 314
315static inline void compound_lock(struct page *page)
316{
317#ifdef CONFIG_TRANSPARENT_HUGEPAGE
318 bit_spin_lock(PG_compound_lock, &page->flags);
319#endif
320}
321
322static inline void compound_unlock(struct page *page)
323{
324#ifdef CONFIG_TRANSPARENT_HUGEPAGE
325 bit_spin_unlock(PG_compound_lock, &page->flags);
326#endif
327}
328
329static inline unsigned long compound_lock_irqsave(struct page *page)
330{
331 unsigned long uninitialized_var(flags);
332#ifdef CONFIG_TRANSPARENT_HUGEPAGE
333 local_irq_save(flags);
334 compound_lock(page);
335#endif
336 return flags;
337}
338
339static inline void compound_unlock_irqrestore(struct page *page,
340 unsigned long flags)
341{
342#ifdef CONFIG_TRANSPARENT_HUGEPAGE
343 compound_unlock(page);
344 local_irq_restore(flags);
345#endif
346}
347
308static inline struct page *compound_head(struct page *page) 348static inline struct page *compound_head(struct page *page)
309{ 349{
310 if (unlikely(PageTail(page))) 350 if (unlikely(PageTail(page)))
@@ -319,9 +359,29 @@ static inline int page_count(struct page *page)
319 359
320static inline void get_page(struct page *page) 360static inline void get_page(struct page *page)
321{ 361{
322 page = compound_head(page); 362 /*
323 VM_BUG_ON(atomic_read(&page->_count) == 0); 363 * Getting a normal page or the head of a compound page
364 * requires already having an elevated page->_count. Only if
365 * we're getting a tail page, the elevated page->_count is
366 * required only in the head page, so for tail pages the
367 * bugcheck only verifies that the page->_count isn't
368 * negative.
369 */
370 VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page));
324 atomic_inc(&page->_count); 371 atomic_inc(&page->_count);
372 /*
373 * Getting a tail page will elevate both the head and tail
374 * page->_count(s).
375 */
376 if (unlikely(PageTail(page))) {
377 /*
378 * This is safe only because
379 * __split_huge_page_refcount can't run under
380 * get_page().
381 */
382 VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
383 atomic_inc(&page->first_page->_count);
384 }
325} 385}
326 386
327static inline struct page *virt_to_head_page(const void *x) 387static inline struct page *virt_to_head_page(const void *x)
@@ -339,6 +399,27 @@ static inline void init_page_count(struct page *page)
339 atomic_set(&page->_count, 1); 399 atomic_set(&page->_count, 1);
340} 400}
341 401
402/*
403 * PageBuddy() indicates that the page is free and in the buddy system
404 * (see mm/page_alloc.c).
405 */
406static inline int PageBuddy(struct page *page)
407{
408 return atomic_read(&page->_mapcount) == -2;
409}
410
411static inline void __SetPageBuddy(struct page *page)
412{
413 VM_BUG_ON(atomic_read(&page->_mapcount) != -1);
414 atomic_set(&page->_mapcount, -2);
415}
416
417static inline void __ClearPageBuddy(struct page *page)
418{
419 VM_BUG_ON(!PageBuddy(page));
420 atomic_set(&page->_mapcount, -1);
421}
422
342void put_page(struct page *page); 423void put_page(struct page *page);
343void put_pages_list(struct list_head *pages); 424void put_pages_list(struct list_head *pages);
344 425
@@ -370,12 +451,39 @@ static inline int compound_order(struct page *page)
370 return (unsigned long)page[1].lru.prev; 451 return (unsigned long)page[1].lru.prev;
371} 452}
372 453
454static inline int compound_trans_order(struct page *page)
455{
456 int order;
457 unsigned long flags;
458
459 if (!PageHead(page))
460 return 0;
461
462 flags = compound_lock_irqsave(page);
463 order = compound_order(page);
464 compound_unlock_irqrestore(page, flags);
465 return order;
466}
467
373static inline void set_compound_order(struct page *page, unsigned long order) 468static inline void set_compound_order(struct page *page, unsigned long order)
374{ 469{
375 page[1].lru.prev = (void *)order; 470 page[1].lru.prev = (void *)order;
376} 471}
377 472
378/* 473/*
474 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
475 * servicing faults for write access. In the normal case, do always want
476 * pte_mkwrite. But get_user_pages can cause write faults for mappings
477 * that do not have writing enabled, when used by access_process_vm.
478 */
479static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
480{
481 if (likely(vma->vm_flags & VM_WRITE))
482 pte = pte_mkwrite(pte);
483 return pte;
484}
485
486/*
379 * Multiple processes may "see" the same page. E.g. for untouched 487 * Multiple processes may "see" the same page. E.g. for untouched
380 * mappings of /dev/null, all processes see the same page full of 488 * mappings of /dev/null, all processes see the same page full of
381 * zeroes, and text pages of executables and shared libraries have 489 * zeroes, and text pages of executables and shared libraries have
@@ -657,7 +765,7 @@ static inline struct address_space *page_mapping(struct page *page)
657 VM_BUG_ON(PageSlab(page)); 765 VM_BUG_ON(PageSlab(page));
658 if (unlikely(PageSwapCache(page))) 766 if (unlikely(PageSwapCache(page)))
659 mapping = &swapper_space; 767 mapping = &swapper_space;
660 else if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON)) 768 else if ((unsigned long)mapping & PAGE_MAPPING_ANON)
661 mapping = NULL; 769 mapping = NULL;
662 return mapping; 770 return mapping;
663} 771}
@@ -1064,7 +1172,8 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
1064int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); 1172int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
1065#endif 1173#endif
1066 1174
1067int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); 1175int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
1176 pmd_t *pmd, unsigned long address);
1068int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); 1177int __pte_alloc_kernel(pmd_t *pmd, unsigned long address);
1069 1178
1070/* 1179/*
@@ -1133,16 +1242,18 @@ static inline void pgtable_page_dtor(struct page *page)
1133 pte_unmap(pte); \ 1242 pte_unmap(pte); \
1134} while (0) 1243} while (0)
1135 1244
1136#define pte_alloc_map(mm, pmd, address) \ 1245#define pte_alloc_map(mm, vma, pmd, address) \
1137 ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ 1246 ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, vma, \
1138 NULL: pte_offset_map(pmd, address)) 1247 pmd, address))? \
1248 NULL: pte_offset_map(pmd, address))
1139 1249
1140#define pte_alloc_map_lock(mm, pmd, address, ptlp) \ 1250#define pte_alloc_map_lock(mm, pmd, address, ptlp) \
1141 ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ 1251 ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, NULL, \
1252 pmd, address))? \
1142 NULL: pte_offset_map_lock(mm, pmd, address, ptlp)) 1253 NULL: pte_offset_map_lock(mm, pmd, address, ptlp))
1143 1254
1144#define pte_alloc_kernel(pmd, address) \ 1255#define pte_alloc_kernel(pmd, address) \
1145 ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ 1256 ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
1146 NULL: pte_offset_kernel(pmd, address)) 1257 NULL: pte_offset_kernel(pmd, address))
1147 1258
1148extern void free_area_init(unsigned long * zones_size); 1259extern void free_area_init(unsigned long * zones_size);
@@ -1415,6 +1526,8 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
1415#define FOLL_GET 0x04 /* do get_page on page */ 1526#define FOLL_GET 0x04 /* do get_page on page */
1416#define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ 1527#define FOLL_DUMP 0x08 /* give error on hole if it would be zero */
1417#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ 1528#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */
1529#define FOLL_MLOCK 0x40 /* mark page as mlocked */
1530#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */
1418 1531
1419typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, 1532typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
1420 void *data); 1533 void *data);
@@ -1518,5 +1631,14 @@ static inline int is_hwpoison_address(unsigned long addr)
1518 1631
1519extern void dump_page(struct page *page); 1632extern void dump_page(struct page *page);
1520 1633
1634#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
1635extern void clear_huge_page(struct page *page,
1636 unsigned long addr,
1637 unsigned int pages_per_huge_page);
1638extern void copy_user_huge_page(struct page *dst, struct page *src,
1639 unsigned long addr, struct vm_area_struct *vma,
1640 unsigned int pages_per_huge_page);
1641#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
1642
1521#endif /* __KERNEL__ */ 1643#endif /* __KERNEL__ */
1522#endif /* _LINUX_MM_H */ 1644#endif /* _LINUX_MM_H */
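maybe_mkwrite() is now available to any fault path that builds a pte; a minimal sketch under that assumption (example_mk_pte() is hypothetical):

#include <linux/mm.h>
#include <asm/pgtable.h>

/* Hypothetical fragment: build a pte for a freshly faulted-in page. */
static pte_t example_mk_pte(struct page *page, struct vm_area_struct *vma,
			    bool write_fault)
{
	pte_t entry = mk_pte(page, vma->vm_page_prot);

	if (write_fault)
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	return entry;
}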
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 8835b877b8db..8f7d24712dc1 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -1,6 +1,8 @@
1#ifndef LINUX_MM_INLINE_H 1#ifndef LINUX_MM_INLINE_H
2#define LINUX_MM_INLINE_H 2#define LINUX_MM_INLINE_H
3 3
4#include <linux/huge_mm.h>
5
4/** 6/**
5 * page_is_file_cache - should the page be on a file LRU or anon LRU? 7 * page_is_file_cache - should the page be on a file LRU or anon LRU?
6 * @page: the page to test 8 * @page: the page to test
@@ -20,18 +22,25 @@ static inline int page_is_file_cache(struct page *page)
20} 22}
21 23
22static inline void 24static inline void
23add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) 25__add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l,
26 struct list_head *head)
24{ 27{
25 list_add(&page->lru, &zone->lru[l].list); 28 list_add(&page->lru, head);
26 __inc_zone_state(zone, NR_LRU_BASE + l); 29 __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page));
27 mem_cgroup_add_lru_list(page, l); 30 mem_cgroup_add_lru_list(page, l);
28} 31}
29 32
30static inline void 33static inline void
34add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l)
35{
36 __add_page_to_lru_list(zone, page, l, &zone->lru[l].list);
37}
38
39static inline void
31del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) 40del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l)
32{ 41{
33 list_del(&page->lru); 42 list_del(&page->lru);
34 __dec_zone_state(zone, NR_LRU_BASE + l); 43 __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page));
35 mem_cgroup_del_lru_list(page, l); 44 mem_cgroup_del_lru_list(page, l);
36} 45}
37 46
@@ -66,7 +75,7 @@ del_page_from_lru(struct zone *zone, struct page *page)
66 l += LRU_ACTIVE; 75 l += LRU_ACTIVE;
67 } 76 }
68 } 77 }
69 __dec_zone_state(zone, NR_LRU_BASE + l); 78 __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page));
70 mem_cgroup_del_lru_list(page, l); 79 mem_cgroup_del_lru_list(page, l);
71} 80}
72 81
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index bb7288a782fd..26bc4e2cd275 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -310,6 +310,9 @@ struct mm_struct {
310#ifdef CONFIG_MMU_NOTIFIER 310#ifdef CONFIG_MMU_NOTIFIER
311 struct mmu_notifier_mm *mmu_notifier_mm; 311 struct mmu_notifier_mm *mmu_notifier_mm;
312#endif 312#endif
313#ifdef CONFIG_TRANSPARENT_HUGEPAGE
314 pgtable_t pmd_huge_pte; /* protected by page_table_lock */
315#endif
313 /* How many tasks sharing this mm are OOM_DISABLE */ 316 /* How many tasks sharing this mm are OOM_DISABLE */
314 atomic_t oom_disable_count; 317 atomic_t oom_disable_count;
315}; 318};
diff --git a/include/linux/mmc/sh_mmcif.h b/include/linux/mmc/sh_mmcif.h
index bf173502d744..38d393092812 100644
--- a/include/linux/mmc/sh_mmcif.h
+++ b/include/linux/mmc/sh_mmcif.h
@@ -94,12 +94,12 @@ struct sh_mmcif_plat_data {
94 94
95static inline u32 sh_mmcif_readl(void __iomem *addr, int reg) 95static inline u32 sh_mmcif_readl(void __iomem *addr, int reg)
96{ 96{
97 return readl(addr + reg); 97 return __raw_readl(addr + reg);
98} 98}
99 99
100static inline void sh_mmcif_writel(void __iomem *addr, int reg, u32 val) 100static inline void sh_mmcif_writel(void __iomem *addr, int reg, u32 val)
101{ 101{
102 writel(val, addr + reg); 102 __raw_writel(val, addr + reg);
103} 103}
104 104
105#define SH_MMCIF_BBS 512 /* boot block size */ 105#define SH_MMCIF_BBS 512 /* boot block size */
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 43dcfbdc39de..cc2e7dfea9d7 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -62,6 +62,16 @@ struct mmu_notifier_ops {
62 unsigned long address); 62 unsigned long address);
63 63
64 /* 64 /*
65 * test_young is called to check the young/accessed bitflag in
66 * the secondary pte. This is used to know if the page is
67 * frequently used without actually clearing the flag or tearing
68 * down the secondary mapping on the page.
69 */
70 int (*test_young)(struct mmu_notifier *mn,
71 struct mm_struct *mm,
72 unsigned long address);
73
74 /*
65 * change_pte is called in cases that pte mapping to page is changed: 75 * change_pte is called in cases that pte mapping to page is changed:
66 * for example, when ksm remaps pte to point to a new shared page. 76 * for example, when ksm remaps pte to point to a new shared page.
67 */ 77 */
@@ -163,6 +173,8 @@ extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
163extern void __mmu_notifier_release(struct mm_struct *mm); 173extern void __mmu_notifier_release(struct mm_struct *mm);
164extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, 174extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
165 unsigned long address); 175 unsigned long address);
176extern int __mmu_notifier_test_young(struct mm_struct *mm,
177 unsigned long address);
166extern void __mmu_notifier_change_pte(struct mm_struct *mm, 178extern void __mmu_notifier_change_pte(struct mm_struct *mm,
167 unsigned long address, pte_t pte); 179 unsigned long address, pte_t pte);
168extern void __mmu_notifier_invalidate_page(struct mm_struct *mm, 180extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -186,6 +198,14 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
186 return 0; 198 return 0;
187} 199}
188 200
201static inline int mmu_notifier_test_young(struct mm_struct *mm,
202 unsigned long address)
203{
204 if (mm_has_notifiers(mm))
205 return __mmu_notifier_test_young(mm, address);
206 return 0;
207}
208
189static inline void mmu_notifier_change_pte(struct mm_struct *mm, 209static inline void mmu_notifier_change_pte(struct mm_struct *mm,
190 unsigned long address, pte_t pte) 210 unsigned long address, pte_t pte)
191{ 211{
@@ -243,6 +263,32 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
243 __pte; \ 263 __pte; \
244}) 264})
245 265
266#define pmdp_clear_flush_notify(__vma, __address, __pmdp) \
267({ \
268 pmd_t __pmd; \
269 struct vm_area_struct *___vma = __vma; \
270 unsigned long ___address = __address; \
271 VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \
272 mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \
273 (__address)+HPAGE_PMD_SIZE);\
274 __pmd = pmdp_clear_flush(___vma, ___address, __pmdp); \
275 mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \
276 (__address)+HPAGE_PMD_SIZE); \
277 __pmd; \
278})
279
280#define pmdp_splitting_flush_notify(__vma, __address, __pmdp) \
281({ \
282 struct vm_area_struct *___vma = __vma; \
283 unsigned long ___address = __address; \
284 VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \
285 mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \
286 (__address)+HPAGE_PMD_SIZE);\
287 pmdp_splitting_flush(___vma, ___address, __pmdp); \
288 mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \
289 (__address)+HPAGE_PMD_SIZE); \
290})
291
246#define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ 292#define ptep_clear_flush_young_notify(__vma, __address, __ptep) \
247({ \ 293({ \
248 int __young; \ 294 int __young; \
@@ -254,6 +300,17 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
254 __young; \ 300 __young; \
255}) 301})
256 302
303#define pmdp_clear_flush_young_notify(__vma, __address, __pmdp) \
304({ \
305 int __young; \
306 struct vm_area_struct *___vma = __vma; \
307 unsigned long ___address = __address; \
308 __young = pmdp_clear_flush_young(___vma, ___address, __pmdp); \
309 __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \
310 ___address); \
311 __young; \
312})
313
257#define set_pte_at_notify(__mm, __address, __ptep, __pte) \ 314#define set_pte_at_notify(__mm, __address, __ptep, __pte) \
258({ \ 315({ \
259 struct mm_struct *___mm = __mm; \ 316 struct mm_struct *___mm = __mm; \
@@ -276,6 +333,12 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
276 return 0; 333 return 0;
277} 334}
278 335
336static inline int mmu_notifier_test_young(struct mm_struct *mm,
337 unsigned long address)
338{
339 return 0;
340}
341
279static inline void mmu_notifier_change_pte(struct mm_struct *mm, 342static inline void mmu_notifier_change_pte(struct mm_struct *mm,
280 unsigned long address, pte_t pte) 343 unsigned long address, pte_t pte)
281{ 344{
@@ -305,7 +368,10 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
305} 368}
306 369
307#define ptep_clear_flush_young_notify ptep_clear_flush_young 370#define ptep_clear_flush_young_notify ptep_clear_flush_young
371#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
308#define ptep_clear_flush_notify ptep_clear_flush 372#define ptep_clear_flush_notify ptep_clear_flush
373#define pmdp_clear_flush_notify pmdp_clear_flush
374#define pmdp_splitting_flush_notify pmdp_splitting_flush
309#define set_pte_at_notify set_pte_at 375#define set_pte_at_notify set_pte_at
310 376
311#endif /* CONFIG_MMU_NOTIFIER */ 377#endif /* CONFIG_MMU_NOTIFIER */
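The pmd notifier variants mirror the pte ones but cover the whole HPAGE_PMD_SIZE range with invalidate_range_start/end; a hedged fragment (example_zap_huge_pmd() is not from the patch):

#include <linux/mm.h>
#include <linux/mmu_notifier.h>

/* Hypothetical fragment: tear down one huge pmd with notifier coverage.
 * haddr must be HPAGE_PMD_SIZE aligned, as the VM_BUG_ON in the macro checks. */
static pmd_t example_zap_huge_pmd(struct vm_area_struct *vma,
				  unsigned long haddr, pmd_t *pmd)
{
	return pmdp_clear_flush_notify(vma, haddr, pmd);
}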
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 39c24ebe9cfd..02ecb0189b1d 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -114,6 +114,7 @@ enum zone_stat_item {
114 NUMA_LOCAL, /* allocation from local node */ 114 NUMA_LOCAL, /* allocation from local node */
115 NUMA_OTHER, /* allocation from other node */ 115 NUMA_OTHER, /* allocation from other node */
116#endif 116#endif
117 NR_ANON_TRANSPARENT_HUGEPAGES,
117 NR_VM_ZONE_STAT_ITEMS }; 118 NR_VM_ZONE_STAT_ITEMS };
118 119
119/* 120/*
@@ -458,12 +459,6 @@ static inline int zone_is_oom_locked(const struct zone *zone)
458 return test_bit(ZONE_OOM_LOCKED, &zone->flags); 459 return test_bit(ZONE_OOM_LOCKED, &zone->flags);
459} 460}
460 461
461#ifdef CONFIG_SMP
462unsigned long zone_nr_free_pages(struct zone *zone);
463#else
464#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES)
465#endif /* CONFIG_SMP */
466
467/* 462/*
468 * The "priority" of VM scanning is how much of the queues we will scan in one 463 * The "priority" of VM scanning is how much of the queues we will scan in one
469 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the 464 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
@@ -645,6 +640,7 @@ typedef struct pglist_data {
645 wait_queue_head_t kswapd_wait; 640 wait_queue_head_t kswapd_wait;
646 struct task_struct *kswapd; 641 struct task_struct *kswapd;
647 int kswapd_max_order; 642 int kswapd_max_order;
643 enum zone_type classzone_idx;
648} pg_data_t; 644} pg_data_t;
649 645
650#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) 646#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
@@ -660,8 +656,10 @@ typedef struct pglist_data {
660 656
661extern struct mutex zonelists_mutex; 657extern struct mutex zonelists_mutex;
662void build_all_zonelists(void *data); 658void build_all_zonelists(void *data);
663void wakeup_kswapd(struct zone *zone, int order); 659void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
664int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 660bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
661 int classzone_idx, int alloc_flags);
662bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
665 int classzone_idx, int alloc_flags); 663 int classzone_idx, int alloc_flags);
666enum memmap_context { 664enum memmap_context {
667 MEMMAP_EARLY, 665 MEMMAP_EARLY,
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 5f38c460367e..0db8037e2725 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -48,9 +48,6 @@
48 * struct page (these bits with information) are always mapped into kernel 48 * struct page (these bits with information) are always mapped into kernel
49 * address space... 49 * address space...
50 * 50 *
51 * PG_buddy is set to indicate that the page is free and in the buddy system
52 * (see mm/page_alloc.c).
53 *
54 * PG_hwpoison indicates that a page got corrupted in hardware and contains 51 * PG_hwpoison indicates that a page got corrupted in hardware and contains
55 * data with incorrect ECC bits that triggered a machine check. Accessing is 52 * data with incorrect ECC bits that triggered a machine check. Accessing is
56 * not safe since it may cause another machine check. Don't touch! 53 * not safe since it may cause another machine check. Don't touch!
@@ -96,7 +93,6 @@ enum pageflags {
96 PG_swapcache, /* Swap page: swp_entry_t in private */ 93 PG_swapcache, /* Swap page: swp_entry_t in private */
97 PG_mappedtodisk, /* Has blocks allocated on-disk */ 94 PG_mappedtodisk, /* Has blocks allocated on-disk */
98 PG_reclaim, /* To be reclaimed asap */ 95 PG_reclaim, /* To be reclaimed asap */
99 PG_buddy, /* Page is free, on buddy lists */
100 PG_swapbacked, /* Page is backed by RAM/swap */ 96 PG_swapbacked, /* Page is backed by RAM/swap */
101 PG_unevictable, /* Page is "unevictable" */ 97 PG_unevictable, /* Page is "unevictable" */
102#ifdef CONFIG_MMU 98#ifdef CONFIG_MMU
@@ -108,6 +104,9 @@ enum pageflags {
108#ifdef CONFIG_MEMORY_FAILURE 104#ifdef CONFIG_MEMORY_FAILURE
109 PG_hwpoison, /* hardware poisoned page. Don't touch */ 105 PG_hwpoison, /* hardware poisoned page. Don't touch */
110#endif 106#endif
107#ifdef CONFIG_TRANSPARENT_HUGEPAGE
108 PG_compound_lock,
109#endif
111 __NR_PAGEFLAGS, 110 __NR_PAGEFLAGS,
112 111
113 /* Filesystems */ 112 /* Filesystems */
@@ -198,7 +197,7 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; }
198struct page; /* forward declaration */ 197struct page; /* forward declaration */
199 198
200TESTPAGEFLAG(Locked, locked) TESTSETFLAG(Locked, locked) 199TESTPAGEFLAG(Locked, locked) TESTSETFLAG(Locked, locked)
201PAGEFLAG(Error, error) 200PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error)
202PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) 201PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
203PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) 202PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
204PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) 203PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru)
@@ -230,7 +229,6 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1)
230 * risky: they bypass page accounting. 229 * risky: they bypass page accounting.
231 */ 230 */
232TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) 231TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback)
233__PAGEFLAG(Buddy, buddy)
234PAGEFLAG(MappedToDisk, mappedtodisk) 232PAGEFLAG(MappedToDisk, mappedtodisk)
235 233
236/* PG_readahead is only used for file reads; PG_reclaim is only for writes */ 234/* PG_readahead is only used for file reads; PG_reclaim is only for writes */
@@ -344,7 +342,7 @@ static inline void set_page_writeback(struct page *page)
344 * tests can be used in performance sensitive paths. PageCompound is 342 * tests can be used in performance sensitive paths. PageCompound is
345 * generally not used in hot code paths. 343 * generally not used in hot code paths.
346 */ 344 */
347__PAGEFLAG(Head, head) 345__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head)
348__PAGEFLAG(Tail, tail) 346__PAGEFLAG(Tail, tail)
349 347
350static inline int PageCompound(struct page *page) 348static inline int PageCompound(struct page *page)
@@ -352,6 +350,13 @@ static inline int PageCompound(struct page *page)
352 return page->flags & ((1L << PG_head) | (1L << PG_tail)); 350 return page->flags & ((1L << PG_head) | (1L << PG_tail));
353 351
354} 352}
353#ifdef CONFIG_TRANSPARENT_HUGEPAGE
354static inline void ClearPageCompound(struct page *page)
355{
356 BUG_ON(!PageHead(page));
357 ClearPageHead(page);
358}
359#endif
355#else 360#else
356/* 361/*
357 * Reduce page flag use as much as possible by overlapping 362 * Reduce page flag use as much as possible by overlapping
@@ -389,14 +394,61 @@ static inline void __ClearPageTail(struct page *page)
389 page->flags &= ~PG_head_tail_mask; 394 page->flags &= ~PG_head_tail_mask;
390} 395}
391 396
397#ifdef CONFIG_TRANSPARENT_HUGEPAGE
398static inline void ClearPageCompound(struct page *page)
399{
400 BUG_ON((page->flags & PG_head_tail_mask) != (1 << PG_compound));
401 clear_bit(PG_compound, &page->flags);
402}
403#endif
404
392#endif /* !PAGEFLAGS_EXTENDED */ 405#endif /* !PAGEFLAGS_EXTENDED */
393 406
407#ifdef CONFIG_TRANSPARENT_HUGEPAGE
408/*
409 * PageHuge() only returns true for hugetlbfs pages, but not for
410 * normal or transparent huge pages.
411 *
412 * PageTransHuge() returns true for both transparent huge and
413 * hugetlbfs pages, but not normal pages. PageTransHuge() can only be
414 * called in the core VM paths where hugetlbfs pages can't exist.
415 */
416static inline int PageTransHuge(struct page *page)
417{
418 VM_BUG_ON(PageTail(page));
419 return PageHead(page);
420}
421
422static inline int PageTransCompound(struct page *page)
423{
424 return PageCompound(page);
425}
426
427#else
428
429static inline int PageTransHuge(struct page *page)
430{
431 return 0;
432}
433
434static inline int PageTransCompound(struct page *page)
435{
436 return 0;
437}
438#endif
439
394#ifdef CONFIG_MMU 440#ifdef CONFIG_MMU
395#define __PG_MLOCKED (1 << PG_mlocked) 441#define __PG_MLOCKED (1 << PG_mlocked)
396#else 442#else
397#define __PG_MLOCKED 0 443#define __PG_MLOCKED 0
398#endif 444#endif
399 445
446#ifdef CONFIG_TRANSPARENT_HUGEPAGE
447#define __PG_COMPOUND_LOCK (1 << PG_compound_lock)
448#else
449#define __PG_COMPOUND_LOCK 0
450#endif
451
400/* 452/*
401 * Flags checked when a page is freed. Pages being freed should not have 453 * Flags checked when a page is freed. Pages being freed should not have
402 * these flags set. If they are, there is a problem. 454 * these flags set. If they are, there is a problem.
@@ -404,9 +456,10 @@ static inline void __ClearPageTail(struct page *page)
404#define PAGE_FLAGS_CHECK_AT_FREE \ 456#define PAGE_FLAGS_CHECK_AT_FREE \
405 (1 << PG_lru | 1 << PG_locked | \ 457 (1 << PG_lru | 1 << PG_locked | \
406 1 << PG_private | 1 << PG_private_2 | \ 458 1 << PG_private | 1 << PG_private_2 | \
407 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ 459 1 << PG_writeback | 1 << PG_reserved | \
408 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ 460 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \
409 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON) 461 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | \
462 __PG_COMPOUND_LOCK)
410 463
411/* 464/*
412 * Flags checked when a page is prepped for return by the page allocator. 465 * Flags checked when a page is prepped for return by the page allocator.
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index b02195dfc1b0..5b0c971d7cae 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -35,12 +35,18 @@ struct page_cgroup *lookup_page_cgroup(struct page *page);
35 35
36enum { 36enum {
37 /* flags for mem_cgroup */ 37 /* flags for mem_cgroup */
38 PCG_LOCK, /* page cgroup is locked */ 38 PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */
39 PCG_CACHE, /* charged as cache */ 39 PCG_CACHE, /* charged as cache */
40 PCG_USED, /* this object is in use. */ 40 PCG_USED, /* this object is in use. */
41 PCG_ACCT_LRU, /* page has been accounted for */
42 PCG_FILE_MAPPED, /* page is accounted as "mapped" */
43 PCG_MIGRATION, /* under page migration */ 41 PCG_MIGRATION, /* under page migration */
42 /* flags for mem_cgroup and file and I/O status */
43 PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */
44 PCG_FILE_MAPPED, /* page is accounted as "mapped" */
45 PCG_FILE_DIRTY, /* page is dirty */
46 PCG_FILE_WRITEBACK, /* page is under writeback */
47 PCG_FILE_UNSTABLE_NFS, /* page is NFS unstable */
48 /* No lock in page_cgroup */
49 PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */
44}; 50};
45 51
46#define TESTPCGFLAG(uname, lname) \ 52#define TESTPCGFLAG(uname, lname) \
@@ -59,6 +65,10 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
59static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \ 65static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \
60 { return test_and_clear_bit(PCG_##lname, &pc->flags); } 66 { return test_and_clear_bit(PCG_##lname, &pc->flags); }
61 67
68#define TESTSETPCGFLAG(uname, lname) \
69static inline int TestSetPageCgroup##uname(struct page_cgroup *pc) \
70 { return test_and_set_bit(PCG_##lname, &pc->flags); }
71
62/* Cache flag is set only once (at allocation) */ 72/* Cache flag is set only once (at allocation) */
63TESTPCGFLAG(Cache, CACHE) 73TESTPCGFLAG(Cache, CACHE)
64CLEARPCGFLAG(Cache, CACHE) 74CLEARPCGFLAG(Cache, CACHE)
@@ -78,6 +88,22 @@ SETPCGFLAG(FileMapped, FILE_MAPPED)
78CLEARPCGFLAG(FileMapped, FILE_MAPPED) 88CLEARPCGFLAG(FileMapped, FILE_MAPPED)
79TESTPCGFLAG(FileMapped, FILE_MAPPED) 89TESTPCGFLAG(FileMapped, FILE_MAPPED)
80 90
91SETPCGFLAG(FileDirty, FILE_DIRTY)
92CLEARPCGFLAG(FileDirty, FILE_DIRTY)
93TESTPCGFLAG(FileDirty, FILE_DIRTY)
94TESTCLEARPCGFLAG(FileDirty, FILE_DIRTY)
95TESTSETPCGFLAG(FileDirty, FILE_DIRTY)
96
97SETPCGFLAG(FileWriteback, FILE_WRITEBACK)
98CLEARPCGFLAG(FileWriteback, FILE_WRITEBACK)
99TESTPCGFLAG(FileWriteback, FILE_WRITEBACK)
100
101SETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
102CLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
103TESTPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
104TESTCLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
105TESTSETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
106
81SETPCGFLAG(Migration, MIGRATION) 107SETPCGFLAG(Migration, MIGRATION)
82CLEARPCGFLAG(Migration, MIGRATION) 108CLEARPCGFLAG(Migration, MIGRATION)
83TESTPCGFLAG(Migration, MIGRATION) 109TESTPCGFLAG(Migration, MIGRATION)
@@ -94,6 +120,10 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
94 120
95static inline void lock_page_cgroup(struct page_cgroup *pc) 121static inline void lock_page_cgroup(struct page_cgroup *pc)
96{ 122{
123 /*
124 * Don't take this lock in IRQ context.
125 * This lock is for pc->mem_cgroup, USED, CACHE, MIGRATION
126 */
97 bit_spin_lock(PCG_LOCK, &pc->flags); 127 bit_spin_lock(PCG_LOCK, &pc->flags);
98} 128}
99 129
@@ -107,6 +137,24 @@ static inline int page_is_cgroup_locked(struct page_cgroup *pc)
107 return bit_spin_is_locked(PCG_LOCK, &pc->flags); 137 return bit_spin_is_locked(PCG_LOCK, &pc->flags);
108} 138}
109 139
140static inline void move_lock_page_cgroup(struct page_cgroup *pc,
141 unsigned long *flags)
142{
143 /*
144 * Updates to the page-cache stat bits in pc->flags can come from both
145 * normal context and IRQ context. Disable IRQs to avoid deadlock.
146 */
147 local_irq_save(*flags);
148 bit_spin_lock(PCG_MOVE_LOCK, &pc->flags);
149}
150
151static inline void move_unlock_page_cgroup(struct page_cgroup *pc,
152 unsigned long *flags)
153{
154 bit_spin_unlock(PCG_MOVE_LOCK, &pc->flags);
155 local_irq_restore(*flags);
156}
157
110#else /* CONFIG_CGROUP_MEM_RES_CTLR */ 158#else /* CONFIG_CGROUP_MEM_RES_CTLR */
111struct page_cgroup; 159struct page_cgroup;
112 160
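The move lock is meant for stat bits that can be flipped from IRQ context; a hedged sketch of a hypothetical update site (the function name is illustrative):

#include <linux/page_cgroup.h>

/* Hypothetical fragment: flip a file-stat bit that may be updated from IRQ
 * context, so the move lock (which disables IRQs) is used instead of
 * lock_page_cgroup(). */
static void example_set_file_writeback(struct page_cgroup *pc)
{
	unsigned long flags;

	move_lock_page_cgroup(pc, &flags);
	SetPageCgroupFileWriteback(pc);
	move_unlock_page_cgroup(pc, &flags);
}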
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 2d1ffe3cf1ee..9c66e994540f 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -48,7 +48,7 @@ static inline void mapping_clear_unevictable(struct address_space *mapping)
48 48
49static inline int mapping_unevictable(struct address_space *mapping) 49static inline int mapping_unevictable(struct address_space *mapping)
50{ 50{
51 if (likely(mapping)) 51 if (mapping)
52 return test_bit(AS_UNEVICTABLE, &mapping->flags); 52 return test_bit(AS_UNEVICTABLE, &mapping->flags);
53 return !!mapping; 53 return !!mapping;
54} 54}
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index ab2baa5c4884..23241c2fecce 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -146,6 +146,22 @@ static inline void *radix_tree_deref_slot(void **pslot)
146} 146}
147 147
148/** 148/**
149 * radix_tree_deref_slot_protected - dereference a slot without RCU lock but with tree lock held
150 * @pslot: pointer to slot, returned by radix_tree_lookup_slot
151 * Returns: item that was stored in that slot with any direct pointer flag
152 * removed.
153 *
154 * Similar to radix_tree_deref_slot but only used during migration when a page's
155 * mapping is being moved. The caller does not hold the RCU read lock but it
156 * must hold the tree lock to prevent parallel updates.
157 */
158static inline void *radix_tree_deref_slot_protected(void **pslot,
159 spinlock_t *treelock)
160{
161 return rcu_dereference_protected(*pslot, lockdep_is_held(treelock));
162}
163
164/**
149 * radix_tree_deref_retry - check radix_tree_deref_slot 165 * radix_tree_deref_retry - check radix_tree_deref_slot
150 * @arg: pointer returned by radix_tree_deref_slot 166 * @arg: pointer returned by radix_tree_deref_slot
151 * Returns: 0 if retry is not required, otherwise retry is required 167 * Returns: 0 if retry is not required, otherwise retry is required
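A rough sketch of the intended caller pattern for the new protected dereference, assuming the usual mapping->tree_lock locking (example_lookup_locked() is hypothetical):

#include <linux/fs.h>
#include <linux/radix-tree.h>

/* Hypothetical fragment: look up a slot while holding mapping->tree_lock,
 * as migration does when replacing a page in the radix tree. */
static struct page *example_lookup_locked(struct address_space *mapping,
					  pgoff_t index)
{
	void **pslot = radix_tree_lookup_slot(&mapping->page_tree, index);

	if (!pslot)
		return NULL;
	return radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
}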
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bb83c0da2071..e9fd04ca1e51 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -198,6 +198,8 @@ enum ttu_flags {
198}; 198};
199#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) 199#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
200 200
201bool is_vma_temporary_stack(struct vm_area_struct *vma);
202
201int try_to_unmap(struct page *, enum ttu_flags flags); 203int try_to_unmap(struct page *, enum ttu_flags flags);
202int try_to_unmap_one(struct page *, struct vm_area_struct *, 204int try_to_unmap_one(struct page *, struct vm_area_struct *,
203 unsigned long address, enum ttu_flags flags); 205 unsigned long address, enum ttu_flags flags);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 96e23215e276..d747f948b34e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -21,7 +21,8 @@
21#define CLONE_DETACHED 0x00400000 /* Unused, ignored */ 21#define CLONE_DETACHED 0x00400000 /* Unused, ignored */
22#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ 22#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
23#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ 23#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
24#define CLONE_STOPPED 0x02000000 /* Start in stopped state */ 24/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
25 and is now available for re-use. */
25#define CLONE_NEWUTS 0x04000000 /* New utsname group? */ 26#define CLONE_NEWUTS 0x04000000 /* New utsname group? */
26#define CLONE_NEWIPC 0x08000000 /* New ipcs */ 27#define CLONE_NEWIPC 0x08000000 /* New ipcs */
27#define CLONE_NEWUSER 0x10000000 /* New user namespace */ 28#define CLONE_NEWUSER 0x10000000 /* New user namespace */
@@ -433,6 +434,7 @@ extern int get_dumpable(struct mm_struct *mm);
433#endif 434#endif
434 /* leave room for more dump flags */ 435 /* leave room for more dump flags */
435#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ 436#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */
437#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */
436 438
437#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) 439#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
438 440
@@ -633,6 +635,8 @@ struct signal_struct {
633 635
634 int oom_adj; /* OOM kill score adjustment (bit shift) */ 636 int oom_adj; /* OOM kill score adjustment (bit shift) */
635 int oom_score_adj; /* OOM kill score adjustment */ 637 int oom_score_adj; /* OOM kill score adjustment */
638 int oom_score_adj_min; /* OOM kill score adjustment minimum value.
639 * Only settable by CAP_SYS_RESOURCE. */
636 640
637 struct mutex cred_guard_mutex; /* guard against foreign influences on 641 struct mutex cred_guard_mutex; /* guard against foreign influences on
638 * credential calculations 642 * credential calculations
diff --git a/include/linux/swap.h b/include/linux/swap.h
index eba53e71d2cc..4d559325d919 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -208,6 +208,8 @@ extern unsigned int nr_free_pagecache_pages(void);
208/* linux/mm/swap.c */ 208/* linux/mm/swap.c */
209extern void __lru_cache_add(struct page *, enum lru_list lru); 209extern void __lru_cache_add(struct page *, enum lru_list lru);
210extern void lru_cache_add_lru(struct page *, enum lru_list lru); 210extern void lru_cache_add_lru(struct page *, enum lru_list lru);
211extern void lru_add_page_tail(struct zone* zone,
212 struct page *page, struct page *page_tail);
211extern void activate_page(struct page *); 213extern void activate_page(struct page *);
212extern void mark_page_accessed(struct page *); 214extern void mark_page_accessed(struct page *);
213extern void lru_add_drain(void); 215extern void lru_add_drain(void);
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 44b54f619ac6..4ed6fcd6b726 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -59,8 +59,9 @@ extern void *vmalloc_exec(unsigned long size);
59extern void *vmalloc_32(unsigned long size); 59extern void *vmalloc_32(unsigned long size);
60extern void *vmalloc_32_user(unsigned long size); 60extern void *vmalloc_32_user(unsigned long size);
61extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); 61extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot);
62extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, 62extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
63 pgprot_t prot); 63 unsigned long start, unsigned long end, gfp_t gfp_mask,
64 pgprot_t prot, int node, void *caller);
64extern void vfree(const void *addr); 65extern void vfree(const void *addr);
65 66
66extern void *vmap(struct page **pages, unsigned int count, 67extern void *vmap(struct page **pages, unsigned int count,
@@ -90,9 +91,6 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size,
90 unsigned long flags, 91 unsigned long flags,
91 unsigned long start, unsigned long end, 92 unsigned long start, unsigned long end,
92 void *caller); 93 void *caller);
93extern struct vm_struct *get_vm_area_node(unsigned long size,
94 unsigned long flags, int node,
95 gfp_t gfp_mask);
96extern struct vm_struct *remove_vm_area(const void *addr); 94extern struct vm_struct *remove_vm_area(const void *addr);
97 95
98extern int map_vm_area(struct vm_struct *area, pgprot_t prot, 96extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
@@ -120,7 +118,7 @@ extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
120#ifdef CONFIG_SMP 118#ifdef CONFIG_SMP
121struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 119struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
122 const size_t *sizes, int nr_vms, 120 const size_t *sizes, int nr_vms,
123 size_t align, gfp_t gfp_mask); 121 size_t align);
124 122
125void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms); 123void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms);
126#endif 124#endif
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index eaaea37b3b75..833e676d6d92 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -254,6 +254,11 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item);
254extern void __dec_zone_state(struct zone *, enum zone_stat_item); 254extern void __dec_zone_state(struct zone *, enum zone_stat_item);
255 255
256void refresh_cpu_vm_stats(int); 256void refresh_cpu_vm_stats(int);
257
258int calculate_pressure_threshold(struct zone *zone);
259int calculate_normal_threshold(struct zone *zone);
260void set_pgdat_percpu_threshold(pg_data_t *pgdat,
261 int (*calculate_pressure)(struct zone *));
257#else /* CONFIG_SMP */ 262#else /* CONFIG_SMP */
258 263
259/* 264/*
@@ -298,6 +303,8 @@ static inline void __dec_zone_page_state(struct page *page,
298#define dec_zone_page_state __dec_zone_page_state 303#define dec_zone_page_state __dec_zone_page_state
299#define mod_zone_page_state __mod_zone_page_state 304#define mod_zone_page_state __mod_zone_page_state
300 305
306#define set_pgdat_percpu_threshold(pgdat, callback) { }
307
301static inline void refresh_cpu_vm_stats(int cpu) { } 308static inline void refresh_cpu_vm_stats(int cpu) { }
302#endif 309#endif
303 310
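The hunk above only declares the new per-cpu threshold interface; its call sites are outside this section. A hedged sketch of how the two callbacks might be paired with set_pgdat_percpu_threshold() follows; only the prototypes declared above are assumed, everything else is illustrative.

/*
 * Sketch (not part of this series): tighten a node's per-cpu vmstat
 * thresholds while it is under memory pressure, then restore the
 * normal values afterwards.
 */
static void example_enter_pressure(pg_data_t *pgdat)
{
	set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
}

static void example_leave_pressure(pg_data_t *pgdat)
{
	set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
}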
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
new file mode 100644
index 000000000000..388bcdd26d46
--- /dev/null
+++ b/include/trace/events/compaction.h
@@ -0,0 +1,74 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM compaction
3
4#if !defined(_TRACE_COMPACTION_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_COMPACTION_H
6
7#include <linux/types.h>
8#include <linux/tracepoint.h>
9#include "gfpflags.h"
10
11DECLARE_EVENT_CLASS(mm_compaction_isolate_template,
12
13 TP_PROTO(unsigned long nr_scanned,
14 unsigned long nr_taken),
15
16 TP_ARGS(nr_scanned, nr_taken),
17
18 TP_STRUCT__entry(
19 __field(unsigned long, nr_scanned)
20 __field(unsigned long, nr_taken)
21 ),
22
23 TP_fast_assign(
24 __entry->nr_scanned = nr_scanned;
25 __entry->nr_taken = nr_taken;
26 ),
27
28 TP_printk("nr_scanned=%lu nr_taken=%lu",
29 __entry->nr_scanned,
30 __entry->nr_taken)
31);
32
33DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_migratepages,
34
35 TP_PROTO(unsigned long nr_scanned,
36 unsigned long nr_taken),
37
38 TP_ARGS(nr_scanned, nr_taken)
39);
40
41DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
42 TP_PROTO(unsigned long nr_scanned,
43 unsigned long nr_taken),
44
45 TP_ARGS(nr_scanned, nr_taken)
46);
47
48TRACE_EVENT(mm_compaction_migratepages,
49
50 TP_PROTO(unsigned long nr_migrated,
51 unsigned long nr_failed),
52
53 TP_ARGS(nr_migrated, nr_failed),
54
55 TP_STRUCT__entry(
56 __field(unsigned long, nr_migrated)
57 __field(unsigned long, nr_failed)
58 ),
59
60 TP_fast_assign(
61 __entry->nr_migrated = nr_migrated;
62 __entry->nr_failed = nr_failed;
63 ),
64
65 TP_printk("nr_migrated=%lu nr_failed=%lu",
66 __entry->nr_migrated,
67 __entry->nr_failed)
68);
69
70
71#endif /* _TRACE_COMPACTION_H */
72
73/* This part must be outside protection */
74#include <trace/define_trace.h>
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index c255fcc587bf..ea422aaa23e1 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -25,13 +25,13 @@
25 25
26#define trace_reclaim_flags(page, sync) ( \ 26#define trace_reclaim_flags(page, sync) ( \
27 (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ 27 (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
28 (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ 28 (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \
29 ) 29 )
30 30
31#define trace_shrink_flags(file, sync) ( \ 31#define trace_shrink_flags(file, sync) ( \
32 (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_MIXED : \ 32 (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_MIXED : \
33 (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) | \ 33 (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) | \
34 (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ 34 (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \
35 ) 35 )
36 36
37TRACE_EVENT(mm_vmscan_kswapd_sleep, 37TRACE_EVENT(mm_vmscan_kswapd_sleep,
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 89a2b2db4375..4e249b927eaa 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -81,6 +81,7 @@ DEFINE_EVENT(writeback_class, name, \
81 TP_ARGS(bdi)) 81 TP_ARGS(bdi))
82 82
83DEFINE_WRITEBACK_EVENT(writeback_nowork); 83DEFINE_WRITEBACK_EVENT(writeback_nowork);
84DEFINE_WRITEBACK_EVENT(writeback_wake_background);
84DEFINE_WRITEBACK_EVENT(writeback_wake_thread); 85DEFINE_WRITEBACK_EVENT(writeback_wake_thread);
85DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread); 86DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread);
86DEFINE_WRITEBACK_EVENT(writeback_bdi_register); 87DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
diff --git a/include/xen/gntdev.h b/include/xen/gntdev.h
new file mode 100644
index 000000000000..eb23f4188f5a
--- /dev/null
+++ b/include/xen/gntdev.h
@@ -0,0 +1,119 @@
1/******************************************************************************
2 * gntdev.h
3 *
4 * Interface to /dev/xen/gntdev.
5 *
6 * Copyright (c) 2007, D G Murray
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License version 2
10 * as published by the Free Software Foundation; or, when distributed
11 * separately from the Linux kernel or incorporated into other
12 * software packages, subject to the following license:
13 *
14 * Permission is hereby granted, free of charge, to any person obtaining a copy
15 * of this source file (the "Software"), to deal in the Software without
16 * restriction, including without limitation the rights to use, copy, modify,
17 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
18 * and to permit persons to whom the Software is furnished to do so, subject to
19 * the following conditions:
20 *
21 * The above copyright notice and this permission notice shall be included in
22 * all copies or substantial portions of the Software.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
30 * IN THE SOFTWARE.
31 */
32
33#ifndef __LINUX_PUBLIC_GNTDEV_H__
34#define __LINUX_PUBLIC_GNTDEV_H__
35
36struct ioctl_gntdev_grant_ref {
37 /* The domain ID of the grant to be mapped. */
38 uint32_t domid;
39 /* The grant reference of the grant to be mapped. */
40 uint32_t ref;
41};
42
43/*
44 * Inserts the grant references into the mapping table of an instance
45 * of gntdev. N.B. This does not perform the mapping, which is deferred
46 * until mmap() is called with @index as the offset.
47 */
48#define IOCTL_GNTDEV_MAP_GRANT_REF \
49_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref))
50struct ioctl_gntdev_map_grant_ref {
51 /* IN parameters */
52 /* The number of grants to be mapped. */
53 uint32_t count;
54 uint32_t pad;
55 /* OUT parameters */
56 /* The offset to be used on a subsequent call to mmap(). */
57 uint64_t index;
58 /* Variable IN parameter. */
59 /* Array of grant references, of size @count. */
60 struct ioctl_gntdev_grant_ref refs[1];
61};
62
63/*
64 * Removes the grant references from the mapping table of an instance of
65 * of gntdev. N.B. munmap() must be called on the relevant virtual address(es)
66 * before this ioctl is called, or an error will result.
67 */
68#define IOCTL_GNTDEV_UNMAP_GRANT_REF \
69_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref))
70struct ioctl_gntdev_unmap_grant_ref {
71 /* IN parameters */
72 /* The offset that was returned by the corresponding map operation. */
73 uint64_t index;
74 /* The number of pages to be unmapped. */
75 uint32_t count;
76 uint32_t pad;
77};
78
79/*
80 * Returns the offset in the driver's address space that corresponds
81 * to @vaddr. This can be used to perform a munmap(), followed by an
82 * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by
83 * the caller. The number of pages that were allocated at the same time as
84 * @vaddr is returned in @count.
85 *
86 * N.B. Where more than one page has been mapped into a contiguous range, the
87 * supplied @vaddr must correspond to the start of the range; otherwise
88 * an error will result. It is only possible to munmap() the entire
89 * contiguously-allocated range at once, and not any subrange thereof.
90 */
91#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \
92_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr))
93struct ioctl_gntdev_get_offset_for_vaddr {
94 /* IN parameters */
95 /* The virtual address of the first mapped page in a range. */
96 uint64_t vaddr;
97 /* OUT parameters */
98 /* The offset that was used in the initial mmap() operation. */
99 uint64_t offset;
100 /* The number of pages mapped in the VM area that begins at @vaddr. */
101 uint32_t count;
102 uint32_t pad;
103};
104
105/*
106 * Sets the maximum number of grants that may be mapped at once by this gntdev
107 * instance.
108 *
109 * N.B. This must be called before any other ioctl is performed on the device.
110 */
111#define IOCTL_GNTDEV_SET_MAX_GRANTS \
112_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants))
113struct ioctl_gntdev_set_max_grants {
114 /* IN parameter */
115 /* The maximum number of grants that may be mapped at once. */
116 uint32_t count;
117};
118
119#endif /* __LINUX_PUBLIC_GNTDEV_H__ */
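The comments above describe the ioctl flow, but a compact userspace example may help. This is a hedged sketch only: the <xen/gntdev.h> include path, the PROT/MAP flags and the minimal error handling are assumptions for illustration, not part of the patch.

/* Map a single foreign grant through /dev/xen/gntdev. */
#include <stddef.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <xen/gntdev.h>		/* assumed install path of the header above */

static void *map_one_grant(int fd, uint32_t domid, uint32_t gref, size_t len)
{
	struct ioctl_gntdev_map_grant_ref op = {
		.count = 1,
		.refs[0] = { .domid = domid, .ref = gref },
	};
	void *p;

	/* Insert the grant into the instance's table; the mapping itself
	 * is deferred until mmap() is called with op.index as the offset. */
	if (ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &op) < 0)
		return NULL;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, op.index);
	return p == MAP_FAILED ? NULL : p;
}

/*
 * Typical call sequence (the gntdev file descriptor must stay open for
 * the lifetime of the mapping):
 *
 *	int fd = open("/dev/xen/gntdev", O_RDWR);
 *	void *p = map_one_grant(fd, remote_domid, gref, 4096);
 */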
diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h
index 9a731706a016..b1fab6b5b3ef 100644
--- a/include/xen/grant_table.h
+++ b/include/xen/grant_table.h
@@ -37,10 +37,16 @@
37#ifndef __ASM_GNTTAB_H__ 37#ifndef __ASM_GNTTAB_H__
38#define __ASM_GNTTAB_H__ 38#define __ASM_GNTTAB_H__
39 39
40#include <asm/xen/hypervisor.h> 40#include <asm/page.h>
41
42#include <xen/interface/xen.h>
41#include <xen/interface/grant_table.h> 43#include <xen/interface/grant_table.h>
44
45#include <asm/xen/hypervisor.h>
42#include <asm/xen/grant_table.h> 46#include <asm/xen/grant_table.h>
43 47
48#include <xen/features.h>
49
44/* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */ 50/* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */
45#define NR_GRANT_FRAMES 4 51#define NR_GRANT_FRAMES 4
46 52
@@ -107,6 +113,37 @@ void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
107void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid, 113void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
108 unsigned long pfn); 114 unsigned long pfn);
109 115
116static inline void
117gnttab_set_map_op(struct gnttab_map_grant_ref *map, phys_addr_t addr,
118 uint32_t flags, grant_ref_t ref, domid_t domid)
119{
120 if (flags & GNTMAP_contains_pte)
121 map->host_addr = addr;
122 else if (xen_feature(XENFEAT_auto_translated_physmap))
123 map->host_addr = __pa(addr);
124 else
125 map->host_addr = addr;
126
127 map->flags = flags;
128 map->ref = ref;
129 map->dom = domid;
130}
131
132static inline void
133gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr,
134 uint32_t flags, grant_handle_t handle)
135{
136 if (flags & GNTMAP_contains_pte)
137 unmap->host_addr = addr;
138 else if (xen_feature(XENFEAT_auto_translated_physmap))
139 unmap->host_addr = __pa(addr);
140 else
141 unmap->host_addr = addr;
142
143 unmap->handle = handle;
144 unmap->dev_bus_addr = 0;
145}
146
110int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, 147int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
111 unsigned long max_nr_gframes, 148 unsigned long max_nr_gframes,
112 struct grant_entry **__shared); 149 struct grant_entry **__shared);
@@ -118,4 +155,9 @@ unsigned int gnttab_max_grant_frames(void);
118 155
119#define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr)) 156#define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr))
120 157
158int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
159 struct page **pages, unsigned int count);
160int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
161 struct page **pages, unsigned int count);
162
121#endif /* __ASM_GNTTAB_H__ */ 163#endif /* __ASM_GNTTAB_H__ */
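Together with the new gnttab_map_refs()/gnttab_unmap_refs() declarations, the helpers above suggest the following kernel-side pairing. Hedged sketch only: GNTMAP_host_map, the op.status field and pfn_to_kaddr() come from the surrounding Xen/arch headers and are assumptions here, not something added by this hunk.

/* Map one foreign grant into an already-allocated page. */
static int example_map_grant(struct page *page, grant_ref_t ref, domid_t domid)
{
	struct gnttab_map_grant_ref op;
	unsigned long addr = (unsigned long)pfn_to_kaddr(page_to_pfn(page));

	/* Fill the hypercall argument with the new helper... */
	gnttab_set_map_op(&op, addr, GNTMAP_host_map, ref, domid);

	/* ...and let gnttab_map_refs() perform the map and track the page. */
	if (gnttab_map_refs(&op, &page, 1))
		return -EFAULT;

	return op.status ? -EINVAL : 0;
}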
diff --git a/kernel/fork.c b/kernel/fork.c
index d9b44f20b6b0..25e429152ddc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -66,6 +66,7 @@
66#include <linux/posix-timers.h> 66#include <linux/posix-timers.h>
67#include <linux/user-return-notifier.h> 67#include <linux/user-return-notifier.h>
68#include <linux/oom.h> 68#include <linux/oom.h>
69#include <linux/khugepaged.h>
69 70
70#include <asm/pgtable.h> 71#include <asm/pgtable.h>
71#include <asm/pgalloc.h> 72#include <asm/pgalloc.h>
@@ -330,6 +331,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
330 retval = ksm_fork(mm, oldmm); 331 retval = ksm_fork(mm, oldmm);
331 if (retval) 332 if (retval)
332 goto out; 333 goto out;
334 retval = khugepaged_fork(mm, oldmm);
335 if (retval)
336 goto out;
333 337
334 prev = NULL; 338 prev = NULL;
335 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { 339 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
@@ -529,6 +533,9 @@ void __mmdrop(struct mm_struct *mm)
529 mm_free_pgd(mm); 533 mm_free_pgd(mm);
530 destroy_context(mm); 534 destroy_context(mm);
531 mmu_notifier_mm_destroy(mm); 535 mmu_notifier_mm_destroy(mm);
536#ifdef CONFIG_TRANSPARENT_HUGEPAGE
537 VM_BUG_ON(mm->pmd_huge_pte);
538#endif
532 free_mm(mm); 539 free_mm(mm);
533} 540}
534EXPORT_SYMBOL_GPL(__mmdrop); 541EXPORT_SYMBOL_GPL(__mmdrop);
@@ -543,6 +550,7 @@ void mmput(struct mm_struct *mm)
543 if (atomic_dec_and_test(&mm->mm_users)) { 550 if (atomic_dec_and_test(&mm->mm_users)) {
544 exit_aio(mm); 551 exit_aio(mm);
545 ksm_exit(mm); 552 ksm_exit(mm);
553 khugepaged_exit(mm); /* must run before exit_mmap */
546 exit_mmap(mm); 554 exit_mmap(mm);
547 set_mm_exe_file(mm, NULL); 555 set_mm_exe_file(mm, NULL);
548 if (!list_empty(&mm->mmlist)) { 556 if (!list_empty(&mm->mmlist)) {
@@ -669,6 +677,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
669 mm->token_priority = 0; 677 mm->token_priority = 0;
670 mm->last_interval = 0; 678 mm->last_interval = 0;
671 679
680#ifdef CONFIG_TRANSPARENT_HUGEPAGE
681 mm->pmd_huge_pte = NULL;
682#endif
683
672 if (!mm_init(mm, tsk)) 684 if (!mm_init(mm, tsk))
673 goto fail_nomem; 685 goto fail_nomem;
674 686
@@ -910,6 +922,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
910 922
911 sig->oom_adj = current->signal->oom_adj; 923 sig->oom_adj = current->signal->oom_adj;
912 sig->oom_score_adj = current->signal->oom_score_adj; 924 sig->oom_score_adj = current->signal->oom_score_adj;
925 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
913 926
914 mutex_init(&sig->cred_guard_mutex); 927 mutex_init(&sig->cred_guard_mutex);
915 928
@@ -1410,23 +1423,6 @@ long do_fork(unsigned long clone_flags,
1410 } 1423 }
1411 1424
1412 /* 1425 /*
1413 * We hope to recycle these flags after 2.6.26
1414 */
1415 if (unlikely(clone_flags & CLONE_STOPPED)) {
1416 static int __read_mostly count = 100;
1417
1418 if (count > 0 && printk_ratelimit()) {
1419 char comm[TASK_COMM_LEN];
1420
1421 count--;
1422 printk(KERN_INFO "fork(): process `%s' used deprecated "
1423 "clone flags 0x%lx\n",
1424 get_task_comm(comm, current),
1425 clone_flags & CLONE_STOPPED);
1426 }
1427 }
1428
1429 /*
1430 * When called from kernel_thread, don't do user tracing stuff. 1426 * When called from kernel_thread, don't do user tracing stuff.
1431 */ 1427 */
1432 if (likely(user_mode(regs))) 1428 if (likely(user_mode(regs)))
@@ -1464,16 +1460,7 @@ long do_fork(unsigned long clone_flags,
1464 */ 1460 */
1465 p->flags &= ~PF_STARTING; 1461 p->flags &= ~PF_STARTING;
1466 1462
1467 if (unlikely(clone_flags & CLONE_STOPPED)) { 1463 wake_up_new_task(p, clone_flags);
1468 /*
1469 * We'll start up with an immediate SIGSTOP.
1470 */
1471 sigaddset(&p->pending.signal, SIGSTOP);
1472 set_tsk_thread_flag(p, TIF_SIGPENDING);
1473 __set_task_state(p, TASK_STOPPED);
1474 } else {
1475 wake_up_new_task(p, clone_flags);
1476 }
1477 1464
1478 tracehook_report_clone_complete(trace, regs, 1465 tracehook_report_clone_complete(trace, regs,
1479 clone_flags, nr, p); 1466 clone_flags, nr, p);
diff --git a/kernel/futex.c b/kernel/futex.c
index 3019b92e6917..52075633373f 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -233,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
233{ 233{
234 unsigned long address = (unsigned long)uaddr; 234 unsigned long address = (unsigned long)uaddr;
235 struct mm_struct *mm = current->mm; 235 struct mm_struct *mm = current->mm;
236 struct page *page; 236 struct page *page, *page_head;
237 int err; 237 int err;
238 238
239 /* 239 /*
@@ -265,11 +265,46 @@ again:
265 if (err < 0) 265 if (err < 0)
266 return err; 266 return err;
267 267
268 page = compound_head(page); 268#ifdef CONFIG_TRANSPARENT_HUGEPAGE
269 lock_page(page); 269 page_head = page;
270 if (!page->mapping) { 270 if (unlikely(PageTail(page))) {
271 unlock_page(page);
272 put_page(page); 271 put_page(page);
272 /* serialize against __split_huge_page_splitting() */
273 local_irq_disable();
274 if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
275 page_head = compound_head(page);
276 /*
277 * page_head is a valid pointer but we must pin
278 * it before taking the PG_lock and/or
279 * PG_compound_lock. The moment we re-enable
280 * irqs, __split_huge_page_splitting() can
281 * return and the head page can be freed from
282 * under us. We can't take the PG_lock and/or
283 * PG_compound_lock on a page that could be
284 * freed from under us.
285 */
286 if (page != page_head) {
287 get_page(page_head);
288 put_page(page);
289 }
290 local_irq_enable();
291 } else {
292 local_irq_enable();
293 goto again;
294 }
295 }
296#else
297 page_head = compound_head(page);
298 if (page != page_head) {
299 get_page(page_head);
300 put_page(page);
301 }
302#endif
303
304 lock_page(page_head);
305 if (!page_head->mapping) {
306 unlock_page(page_head);
307 put_page(page_head);
273 goto again; 308 goto again;
274 } 309 }
275 310
@@ -280,20 +315,20 @@ again:
280 * it's a read-only handle, it's expected that futexes attach to 315 * it's a read-only handle, it's expected that futexes attach to
281 * the object not the particular process. 316 * the object not the particular process.
282 */ 317 */
283 if (PageAnon(page)) { 318 if (PageAnon(page_head)) {
284 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ 319 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
285 key->private.mm = mm; 320 key->private.mm = mm;
286 key->private.address = address; 321 key->private.address = address;
287 } else { 322 } else {
288 key->both.offset |= FUT_OFF_INODE; /* inode-based key */ 323 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
289 key->shared.inode = page->mapping->host; 324 key->shared.inode = page_head->mapping->host;
290 key->shared.pgoff = page->index; 325 key->shared.pgoff = page_head->index;
291 } 326 }
292 327
293 get_futex_key_refs(key); 328 get_futex_key_refs(key);
294 329
295 unlock_page(page); 330 unlock_page(page_head);
296 put_page(page); 331 put_page(page_head);
297 return 0; 332 return 0;
298} 333}
299 334
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9988d03797f5..282f20230e67 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -72,6 +72,8 @@ static inline int desc_node(struct irq_desc *desc) { return 0; }
72 72
73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) 73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
74{ 74{
75 int cpu;
76
75 desc->irq_data.irq = irq; 77 desc->irq_data.irq = irq;
76 desc->irq_data.chip = &no_irq_chip; 78 desc->irq_data.chip = &no_irq_chip;
77 desc->irq_data.chip_data = NULL; 79 desc->irq_data.chip_data = NULL;
@@ -83,7 +85,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
83 desc->irq_count = 0; 85 desc->irq_count = 0;
84 desc->irqs_unhandled = 0; 86 desc->irqs_unhandled = 0;
85 desc->name = NULL; 87 desc->name = NULL;
86 memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); 88 for_each_possible_cpu(cpu)
89 *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
87 desc_smp_init(desc, node); 90 desc_smp_init(desc, node);
88} 91}
89 92
@@ -133,8 +136,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
133 if (!desc) 136 if (!desc)
134 return NULL; 137 return NULL;
135 /* allocate based on nr_cpu_ids */ 138 /* allocate based on nr_cpu_ids */
136 desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), 139 desc->kstat_irqs = alloc_percpu(unsigned int);
137 gfp, node);
138 if (!desc->kstat_irqs) 140 if (!desc->kstat_irqs)
139 goto err_desc; 141 goto err_desc;
140 142
@@ -149,7 +151,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
149 return desc; 151 return desc;
150 152
151err_kstat: 153err_kstat:
152 kfree(desc->kstat_irqs); 154 free_percpu(desc->kstat_irqs);
153err_desc: 155err_desc:
154 kfree(desc); 156 kfree(desc);
155 return NULL; 157 return NULL;
@@ -166,7 +168,7 @@ static void free_desc(unsigned int irq)
166 mutex_unlock(&sparse_irq_lock); 168 mutex_unlock(&sparse_irq_lock);
167 169
168 free_masks(desc); 170 free_masks(desc);
169 kfree(desc->kstat_irqs); 171 free_percpu(desc->kstat_irqs);
170 kfree(desc); 172 kfree(desc);
171} 173}
172 174
@@ -234,7 +236,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
234 } 236 }
235}; 237};
236 238
237static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
238int __init early_irq_init(void) 239int __init early_irq_init(void)
239{ 240{
240 int count, i, node = first_online_node; 241 int count, i, node = first_online_node;
@@ -250,7 +251,8 @@ int __init early_irq_init(void)
250 for (i = 0; i < count; i++) { 251 for (i = 0; i < count; i++) {
251 desc[i].irq_data.irq = i; 252 desc[i].irq_data.irq = i;
252 desc[i].irq_data.chip = &no_irq_chip; 253 desc[i].irq_data.chip = &no_irq_chip;
253 desc[i].kstat_irqs = kstat_irqs_all[i]; 254 /* TODO : do this allocation on-demand ... */
255 desc[i].kstat_irqs = alloc_percpu(unsigned int);
254 alloc_masks(desc + i, GFP_KERNEL, node); 256 alloc_masks(desc + i, GFP_KERNEL, node);
255 desc_smp_init(desc + i, node); 257 desc_smp_init(desc + i, node);
256 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 258 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
@@ -275,6 +277,22 @@ static void free_desc(unsigned int irq)
275 277
276static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) 278static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
277{ 279{
280#if defined(CONFIG_KSTAT_IRQS_ONDEMAND)
281 struct irq_desc *desc;
282 unsigned int i;
283
284 for (i = 0; i < cnt; i++) {
285 desc = irq_to_desc(start + i);
286 if (desc && !desc->kstat_irqs) {
287 unsigned int __percpu *stats = alloc_percpu(unsigned int);
288
289 if (!stats)
290 return -1;
291 if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL)
292 free_percpu(stats);
293 }
294 }
295#endif
278 return start; 296 return start;
279} 297}
280#endif /* !CONFIG_SPARSE_IRQ */ 298#endif /* !CONFIG_SPARSE_IRQ */
@@ -391,7 +409,9 @@ void dynamic_irq_cleanup(unsigned int irq)
391unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) 409unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
392{ 410{
393 struct irq_desc *desc = irq_to_desc(irq); 411 struct irq_desc *desc = irq_to_desc(irq);
394 return desc ? desc->kstat_irqs[cpu] : 0; 412
413 return desc && desc->kstat_irqs ?
414 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
395} 415}
396 416
397#ifdef CONFIG_GENERIC_HARDIRQS 417#ifdef CONFIG_GENERIC_HARDIRQS
@@ -401,10 +421,10 @@ unsigned int kstat_irqs(unsigned int irq)
401 int cpu; 421 int cpu;
402 int sum = 0; 422 int sum = 0;
403 423
404 if (!desc) 424 if (!desc || !desc->kstat_irqs)
405 return 0; 425 return 0;
406 for_each_possible_cpu(cpu) 426 for_each_possible_cpu(cpu)
407 sum += desc->kstat_irqs[cpu]; 427 sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
408 return sum; 428 return sum;
409} 429}
410#endif /* CONFIG_GENERIC_HARDIRQS */ 430#endif /* CONFIG_GENERIC_HARDIRQS */
diff --git a/mm/Kconfig b/mm/Kconfig
index c2c8a4a11898..3ad483bdf505 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -302,6 +302,44 @@ config NOMMU_INITIAL_TRIM_EXCESS
302 302
303 See Documentation/nommu-mmap.txt for more information. 303 See Documentation/nommu-mmap.txt for more information.
304 304
305config TRANSPARENT_HUGEPAGE
306 bool "Transparent Hugepage Support"
307 depends on X86 && MMU
308 select COMPACTION
309 help
310 Transparent Hugepage support allows the kernel to use huge pages
311 and huge TLB entries transparently for applications whenever
312 possible. This feature can improve performance for certain
313 applications by speeding up page faults during memory allocation,
314 by reducing the number of TLB misses and by speeding up pagetable
315 walks.
316
317 If the system is memory constrained, e.g. embedded, you may want to say N.
318
319choice
320 prompt "Transparent Hugepage Support sysfs defaults"
321 depends on TRANSPARENT_HUGEPAGE
322 default TRANSPARENT_HUGEPAGE_ALWAYS
323 help
324 Selects the sysfs defaults for Transparent Hugepage Support.
325
326 config TRANSPARENT_HUGEPAGE_ALWAYS
327 bool "always"
328 help
329 Enabling Transparent Hugepage always, can increase the
330 memory footprint of applications without a guaranteed
331 benefit but it will work automatically for all applications.
332
333 config TRANSPARENT_HUGEPAGE_MADVISE
334 bool "madvise"
335 help
336 Enabling Transparent Hugepage madvise will only provide a
337 performance benefit to applications that use
338 madvise(MADV_HUGEPAGE), and it won't risk increasing the
339 memory footprint of applications without a guaranteed
340 benefit.
341endchoice
342
305# 343#
306# UP and nommu archs use km based percpu allocator 344# UP and nommu archs use km based percpu allocator
307# 345#
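Because the "madvise" default above only covers regions that have explicitly opted in, here is a hedged userspace sketch of that opt-in. MADV_HUGEPAGE is assumed to be exposed by the installed <sys/mman.h>; the flags and the ignored madvise() return value are illustrative only.

/* Request THP backing for an anonymous mapping. */
#include <stddef.h>
#include <sys/mman.h>

static void *alloc_thp_hinted(size_t len)
{
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return NULL;

	/* Hint that this range should use transparent huge pages. */
	madvise(p, len, MADV_HUGEPAGE);
	return p;
}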
diff --git a/mm/Makefile b/mm/Makefile
index f73f75a29f82..2b1b575ae712 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,7 +5,7 @@
5mmu-y := nommu.o 5mmu-y := nommu.o
6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ 6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ 7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
8 vmalloc.o pagewalk.o 8 vmalloc.o pagewalk.o pgtable-generic.o
9 9
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o \ 11 maccess.o page_alloc.o page-writeback.o \
@@ -37,6 +37,7 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
37obj-$(CONFIG_FS_XIP) += filemap_xip.o 37obj-$(CONFIG_FS_XIP) += filemap_xip.o
38obj-$(CONFIG_MIGRATION) += migrate.o 38obj-$(CONFIG_MIGRATION) += migrate.o
39obj-$(CONFIG_QUICKLIST) += quicklist.o 39obj-$(CONFIG_QUICKLIST) += quicklist.o
40obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
40obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 41obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
41obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o 42obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
42obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o 43obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/compaction.c b/mm/compaction.c
index 1a8894eadf72..6d592a021072 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,6 +16,9 @@
16#include <linux/sysfs.h> 16#include <linux/sysfs.h>
17#include "internal.h" 17#include "internal.h"
18 18
19#define CREATE_TRACE_POINTS
20#include <trace/events/compaction.h>
21
19/* 22/*
20 * compact_control is used to track pages being migrated and the free pages 23 * compact_control is used to track pages being migrated and the free pages
21 * they are being migrated to during memory compaction. The free_pfn starts 24 * they are being migrated to during memory compaction. The free_pfn starts
@@ -30,6 +33,7 @@ struct compact_control {
30 unsigned long nr_migratepages; /* Number of pages to migrate */ 33 unsigned long nr_migratepages; /* Number of pages to migrate */
31 unsigned long free_pfn; /* isolate_freepages search base */ 34 unsigned long free_pfn; /* isolate_freepages search base */
32 unsigned long migrate_pfn; /* isolate_migratepages search base */ 35 unsigned long migrate_pfn; /* isolate_migratepages search base */
36 bool sync; /* Synchronous migration */
33 37
34 /* Account for isolated anon and file pages */ 38 /* Account for isolated anon and file pages */
35 unsigned long nr_anon; 39 unsigned long nr_anon;
@@ -38,6 +42,8 @@ struct compact_control {
38 unsigned int order; /* order a direct compactor needs */ 42 unsigned int order; /* order a direct compactor needs */
39 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 43 int migratetype; /* MOVABLE, RECLAIMABLE etc */
40 struct zone *zone; 44 struct zone *zone;
45
46 int compact_mode;
41}; 47};
42 48
43static unsigned long release_freepages(struct list_head *freelist) 49static unsigned long release_freepages(struct list_head *freelist)
@@ -60,7 +66,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
60 struct list_head *freelist) 66 struct list_head *freelist)
61{ 67{
62 unsigned long zone_end_pfn, end_pfn; 68 unsigned long zone_end_pfn, end_pfn;
63 int total_isolated = 0; 69 int nr_scanned = 0, total_isolated = 0;
64 struct page *cursor; 70 struct page *cursor;
65 71
66 /* Get the last PFN we should scan for free pages at */ 72 /* Get the last PFN we should scan for free pages at */
@@ -81,6 +87,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
81 87
82 if (!pfn_valid_within(blockpfn)) 88 if (!pfn_valid_within(blockpfn))
83 continue; 89 continue;
90 nr_scanned++;
84 91
85 if (!PageBuddy(page)) 92 if (!PageBuddy(page))
86 continue; 93 continue;
@@ -100,6 +107,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
100 } 107 }
101 } 108 }
102 109
110 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
103 return total_isolated; 111 return total_isolated;
104} 112}
105 113
@@ -234,6 +242,8 @@ static unsigned long isolate_migratepages(struct zone *zone,
234 struct compact_control *cc) 242 struct compact_control *cc)
235{ 243{
236 unsigned long low_pfn, end_pfn; 244 unsigned long low_pfn, end_pfn;
245 unsigned long last_pageblock_nr = 0, pageblock_nr;
246 unsigned long nr_scanned = 0, nr_isolated = 0;
237 struct list_head *migratelist = &cc->migratepages; 247 struct list_head *migratelist = &cc->migratepages;
238 248
239 /* Do not scan outside zone boundaries */ 249 /* Do not scan outside zone boundaries */
@@ -266,20 +276,51 @@ static unsigned long isolate_migratepages(struct zone *zone,
266 struct page *page; 276 struct page *page;
267 if (!pfn_valid_within(low_pfn)) 277 if (!pfn_valid_within(low_pfn))
268 continue; 278 continue;
279 nr_scanned++;
269 280
270 /* Get the page and skip if free */ 281 /* Get the page and skip if free */
271 page = pfn_to_page(low_pfn); 282 page = pfn_to_page(low_pfn);
272 if (PageBuddy(page)) 283 if (PageBuddy(page))
273 continue; 284 continue;
274 285
286 /*
287 * For async migration, also only scan in MOVABLE blocks. Async
288 * migration is optimistic and checks whether a minimal amount of
289 * work will satisfy the allocation.
290 */
291 pageblock_nr = low_pfn >> pageblock_order;
292 if (!cc->sync && last_pageblock_nr != pageblock_nr &&
293 get_pageblock_migratetype(page) != MIGRATE_MOVABLE) {
294 low_pfn += pageblock_nr_pages;
295 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
296 last_pageblock_nr = pageblock_nr;
297 continue;
298 }
299
300 if (!PageLRU(page))
301 continue;
302
303 /*
304 * PageLRU is set, and lru_lock excludes isolation,
305 * splitting and collapsing (collapsing has already
306 * happened if PageLRU is set).
307 */
308 if (PageTransHuge(page)) {
309 low_pfn += (1 << compound_order(page)) - 1;
310 continue;
311 }
312
275 /* Try isolate the page */ 313 /* Try isolate the page */
276 if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) 314 if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
277 continue; 315 continue;
278 316
317 VM_BUG_ON(PageTransCompound(page));
318
279 /* Successfully isolated */ 319 /* Successfully isolated */
280 del_page_from_lru_list(zone, page, page_lru(page)); 320 del_page_from_lru_list(zone, page, page_lru(page));
281 list_add(&page->lru, migratelist); 321 list_add(&page->lru, migratelist);
282 cc->nr_migratepages++; 322 cc->nr_migratepages++;
323 nr_isolated++;
283 324
284 /* Avoid isolating too much */ 325 /* Avoid isolating too much */
285 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) 326 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
@@ -291,6 +332,8 @@ static unsigned long isolate_migratepages(struct zone *zone,
291 spin_unlock_irq(&zone->lru_lock); 332 spin_unlock_irq(&zone->lru_lock);
292 cc->migrate_pfn = low_pfn; 333 cc->migrate_pfn = low_pfn;
293 334
335 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
336
294 return cc->nr_migratepages; 337 return cc->nr_migratepages;
295} 338}
296 339
@@ -341,10 +384,10 @@ static void update_nr_listpages(struct compact_control *cc)
341} 384}
342 385
343static int compact_finished(struct zone *zone, 386static int compact_finished(struct zone *zone,
344 struct compact_control *cc) 387 struct compact_control *cc)
345{ 388{
346 unsigned int order; 389 unsigned int order;
347 unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order); 390 unsigned long watermark;
348 391
349 if (fatal_signal_pending(current)) 392 if (fatal_signal_pending(current))
350 return COMPACT_PARTIAL; 393 return COMPACT_PARTIAL;
@@ -354,12 +397,27 @@ static int compact_finished(struct zone *zone,
354 return COMPACT_COMPLETE; 397 return COMPACT_COMPLETE;
355 398
356 /* Compaction run is not finished if the watermark is not met */ 399 /* Compaction run is not finished if the watermark is not met */
400 if (cc->compact_mode != COMPACT_MODE_KSWAPD)
401 watermark = low_wmark_pages(zone);
402 else
403 watermark = high_wmark_pages(zone);
404 watermark += (1 << cc->order);
405
357 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) 406 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
358 return COMPACT_CONTINUE; 407 return COMPACT_CONTINUE;
359 408
360 if (cc->order == -1) 409 if (cc->order == -1)
361 return COMPACT_CONTINUE; 410 return COMPACT_CONTINUE;
362 411
412 /*
413 * Generating only one page of the right order is not enough
414 * for kswapd; we must continue until we're above the high
415 * watermark as a pool for high order GFP_ATOMIC allocations
416 * too.
417 */
418 if (cc->compact_mode == COMPACT_MODE_KSWAPD)
419 return COMPACT_CONTINUE;
420
363 /* Direct compactor: Is a suitable page free? */ 421 /* Direct compactor: Is a suitable page free? */
364 for (order = cc->order; order < MAX_ORDER; order++) { 422 for (order = cc->order; order < MAX_ORDER; order++) {
365 /* Job done if page is free of the right migratetype */ 423 /* Job done if page is free of the right migratetype */
@@ -374,10 +432,62 @@ static int compact_finished(struct zone *zone,
374 return COMPACT_CONTINUE; 432 return COMPACT_CONTINUE;
375} 433}
376 434
435/*
436 * compaction_suitable: Is this suitable to run compaction on this zone now?
437 * Returns
438 * COMPACT_SKIPPED - If there are too few free pages for compaction
439 * COMPACT_PARTIAL - If the allocation would succeed without compaction
440 * COMPACT_CONTINUE - If compaction should run now
441 */
442unsigned long compaction_suitable(struct zone *zone, int order)
443{
444 int fragindex;
445 unsigned long watermark;
446
447 /*
448 * Watermarks for order-0 must be met for compaction. Note the 2UL.
449 * This is because during migration, copies of pages need to be
450 * allocated and for a short time, the footprint is higher
451 */
452 watermark = low_wmark_pages(zone) + (2UL << order);
453 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
454 return COMPACT_SKIPPED;
455
456 /*
457 * fragmentation index determines if allocation failures are due to
458 * low memory or external fragmentation
459 *
460 * index of -1 implies allocations might succeed depending on watermarks
461 * index towards 0 implies failure is due to lack of memory
462 * index towards 1000 implies failure is due to fragmentation
463 *
464 * Only compact if a failure would be due to fragmentation.
465 */
466 fragindex = fragmentation_index(zone, order);
467 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
468 return COMPACT_SKIPPED;
469
470 if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
471 return COMPACT_PARTIAL;
472
473 return COMPACT_CONTINUE;
474}
475
377static int compact_zone(struct zone *zone, struct compact_control *cc) 476static int compact_zone(struct zone *zone, struct compact_control *cc)
378{ 477{
379 int ret; 478 int ret;
380 479
480 ret = compaction_suitable(zone, cc->order);
481 switch (ret) {
482 case COMPACT_PARTIAL:
483 case COMPACT_SKIPPED:
484 /* Compaction is likely to fail */
485 return ret;
486 case COMPACT_CONTINUE:
487 /* Fall through to compaction */
488 ;
489 }
490
381 /* Setup to move all movable pages to the end of the zone */ 491 /* Setup to move all movable pages to the end of the zone */
382 cc->migrate_pfn = zone->zone_start_pfn; 492 cc->migrate_pfn = zone->zone_start_pfn;
383 cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; 493 cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
@@ -393,7 +503,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
393 503
394 nr_migrate = cc->nr_migratepages; 504 nr_migrate = cc->nr_migratepages;
395 migrate_pages(&cc->migratepages, compaction_alloc, 505 migrate_pages(&cc->migratepages, compaction_alloc,
396 (unsigned long)cc, 0); 506 (unsigned long)cc, false,
507 cc->sync);
397 update_nr_listpages(cc); 508 update_nr_listpages(cc);
398 nr_remaining = cc->nr_migratepages; 509 nr_remaining = cc->nr_migratepages;
399 510
@@ -401,6 +512,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
401 count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); 512 count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
402 if (nr_remaining) 513 if (nr_remaining)
403 count_vm_events(COMPACTPAGEFAILED, nr_remaining); 514 count_vm_events(COMPACTPAGEFAILED, nr_remaining);
515 trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
516 nr_remaining);
404 517
405 /* Release LRU pages not migrated */ 518 /* Release LRU pages not migrated */
406 if (!list_empty(&cc->migratepages)) { 519 if (!list_empty(&cc->migratepages)) {
@@ -417,8 +530,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
417 return ret; 530 return ret;
418} 531}
419 532
420static unsigned long compact_zone_order(struct zone *zone, 533unsigned long compact_zone_order(struct zone *zone,
421 int order, gfp_t gfp_mask) 534 int order, gfp_t gfp_mask,
535 bool sync,
536 int compact_mode)
422{ 537{
423 struct compact_control cc = { 538 struct compact_control cc = {
424 .nr_freepages = 0, 539 .nr_freepages = 0,
@@ -426,6 +541,8 @@ static unsigned long compact_zone_order(struct zone *zone,
426 .order = order, 541 .order = order,
427 .migratetype = allocflags_to_migratetype(gfp_mask), 542 .migratetype = allocflags_to_migratetype(gfp_mask),
428 .zone = zone, 543 .zone = zone,
544 .sync = sync,
545 .compact_mode = compact_mode,
429 }; 546 };
430 INIT_LIST_HEAD(&cc.freepages); 547 INIT_LIST_HEAD(&cc.freepages);
431 INIT_LIST_HEAD(&cc.migratepages); 548 INIT_LIST_HEAD(&cc.migratepages);
@@ -441,16 +558,17 @@ int sysctl_extfrag_threshold = 500;
441 * @order: The order of the current allocation 558 * @order: The order of the current allocation
442 * @gfp_mask: The GFP mask of the current allocation 559 * @gfp_mask: The GFP mask of the current allocation
443 * @nodemask: The allowed nodes to allocate from 560 * @nodemask: The allowed nodes to allocate from
561 * @sync: Whether migration is synchronous or not
444 * 562 *
445 * This is the main entry point for direct page compaction. 563 * This is the main entry point for direct page compaction.
446 */ 564 */
447unsigned long try_to_compact_pages(struct zonelist *zonelist, 565unsigned long try_to_compact_pages(struct zonelist *zonelist,
448 int order, gfp_t gfp_mask, nodemask_t *nodemask) 566 int order, gfp_t gfp_mask, nodemask_t *nodemask,
567 bool sync)
449{ 568{
450 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 569 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
451 int may_enter_fs = gfp_mask & __GFP_FS; 570 int may_enter_fs = gfp_mask & __GFP_FS;
452 int may_perform_io = gfp_mask & __GFP_IO; 571 int may_perform_io = gfp_mask & __GFP_IO;
453 unsigned long watermark;
454 struct zoneref *z; 572 struct zoneref *z;
455 struct zone *zone; 573 struct zone *zone;
456 int rc = COMPACT_SKIPPED; 574 int rc = COMPACT_SKIPPED;
@@ -460,7 +578,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
460 * made because an assumption is made that the page allocator can satisfy 578 * made because an assumption is made that the page allocator can satisfy
461 * the "cheaper" orders without taking special steps 579 * the "cheaper" orders without taking special steps
462 */ 580 */
463 if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io) 581 if (!order || !may_enter_fs || !may_perform_io)
464 return rc; 582 return rc;
465 583
466 count_vm_event(COMPACTSTALL); 584 count_vm_event(COMPACTSTALL);
@@ -468,43 +586,14 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
468 /* Compact each zone in the list */ 586 /* Compact each zone in the list */
469 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 587 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
470 nodemask) { 588 nodemask) {
471 int fragindex;
472 int status; 589 int status;
473 590
474 /* 591 status = compact_zone_order(zone, order, gfp_mask, sync,
475 * Watermarks for order-0 must be met for compaction. Note 592 COMPACT_MODE_DIRECT_RECLAIM);
476 * the 2UL. This is because during migration, copies of
477 * pages need to be allocated and for a short time, the
478 * footprint is higher
479 */
480 watermark = low_wmark_pages(zone) + (2UL << order);
481 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
482 continue;
483
484 /*
485 * fragmentation index determines if allocation failures are
486 * due to low memory or external fragmentation
487 *
488 * index of -1 implies allocations might succeed depending
489 * on watermarks
490 * index towards 0 implies failure is due to lack of memory
491 * index towards 1000 implies failure is due to fragmentation
492 *
493 * Only compact if a failure would be due to fragmentation.
494 */
495 fragindex = fragmentation_index(zone, order);
496 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
497 continue;
498
499 if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
500 rc = COMPACT_PARTIAL;
501 break;
502 }
503
504 status = compact_zone_order(zone, order, gfp_mask);
505 rc = max(status, rc); 593 rc = max(status, rc);
506 594
507 if (zone_watermark_ok(zone, order, watermark, 0, 0)) 595 /* If a normal allocation would succeed, stop compacting */
596 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
508 break; 597 break;
509 } 598 }
510 599
@@ -531,6 +620,7 @@ static int compact_node(int nid)
531 .nr_freepages = 0, 620 .nr_freepages = 0,
532 .nr_migratepages = 0, 621 .nr_migratepages = 0,
533 .order = -1, 622 .order = -1,
623 .compact_mode = COMPACT_MODE_DIRECT_RECLAIM,
534 }; 624 };
535 625
536 zone = &pgdat->node_zones[zoneid]; 626 zone = &pgdat->node_zones[zoneid];
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 4df2de77e069..03bf3bb4519a 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -324,7 +324,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
324 if (mem_flags & __GFP_WAIT) { 324 if (mem_flags & __GFP_WAIT) {
325 DECLARE_WAITQUEUE(wait, current); 325 DECLARE_WAITQUEUE(wait, current);
326 326
327 __set_current_state(TASK_INTERRUPTIBLE); 327 __set_current_state(TASK_UNINTERRUPTIBLE);
328 __add_wait_queue(&pool->waitq, &wait); 328 __add_wait_queue(&pool->waitq, &wait);
329 spin_unlock_irqrestore(&pool->lock, flags); 329 spin_unlock_irqrestore(&pool->lock, flags);
330 330
@@ -355,20 +355,15 @@ EXPORT_SYMBOL(dma_pool_alloc);
355 355
356static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) 356static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
357{ 357{
358 unsigned long flags;
359 struct dma_page *page; 358 struct dma_page *page;
360 359
361 spin_lock_irqsave(&pool->lock, flags);
362 list_for_each_entry(page, &pool->page_list, page_list) { 360 list_for_each_entry(page, &pool->page_list, page_list) {
363 if (dma < page->dma) 361 if (dma < page->dma)
364 continue; 362 continue;
365 if (dma < (page->dma + pool->allocation)) 363 if (dma < (page->dma + pool->allocation))
366 goto done; 364 return page;
367 } 365 }
368 page = NULL; 366 return NULL;
369 done:
370 spin_unlock_irqrestore(&pool->lock, flags);
371 return page;
372} 367}
373 368
374/** 369/**
@@ -386,8 +381,10 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
386 unsigned long flags; 381 unsigned long flags;
387 unsigned int offset; 382 unsigned int offset;
388 383
384 spin_lock_irqsave(&pool->lock, flags);
389 page = pool_find_page(pool, dma); 385 page = pool_find_page(pool, dma);
390 if (!page) { 386 if (!page) {
387 spin_unlock_irqrestore(&pool->lock, flags);
391 if (pool->dev) 388 if (pool->dev)
392 dev_err(pool->dev, 389 dev_err(pool->dev,
393 "dma_pool_free %s, %p/%lx (bad dma)\n", 390 "dma_pool_free %s, %p/%lx (bad dma)\n",
@@ -401,6 +398,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
401 offset = vaddr - page->vaddr; 398 offset = vaddr - page->vaddr;
402#ifdef DMAPOOL_DEBUG 399#ifdef DMAPOOL_DEBUG
403 if ((dma - page->dma) != offset) { 400 if ((dma - page->dma) != offset) {
401 spin_unlock_irqrestore(&pool->lock, flags);
404 if (pool->dev) 402 if (pool->dev)
405 dev_err(pool->dev, 403 dev_err(pool->dev,
406 "dma_pool_free %s, %p (bad vaddr)/%Lx\n", 404 "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
@@ -418,6 +416,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
418 chain = *(int *)(page->vaddr + chain); 416 chain = *(int *)(page->vaddr + chain);
419 continue; 417 continue;
420 } 418 }
419 spin_unlock_irqrestore(&pool->lock, flags);
421 if (pool->dev) 420 if (pool->dev)
422 dev_err(pool->dev, "dma_pool_free %s, dma %Lx " 421 dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
423 "already free\n", pool->name, 422 "already free\n", pool->name,
@@ -432,7 +431,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
432 memset(vaddr, POOL_POISON_FREED, pool->size); 431 memset(vaddr, POOL_POISON_FREED, pool->size);
433#endif 432#endif
434 433
435 spin_lock_irqsave(&pool->lock, flags);
436 page->in_use--; 434 page->in_use--;
437 *(int *)vaddr = page->offset; 435 *(int *)vaddr = page->offset;
438 page->offset = offset; 436 page->offset = offset;
diff --git a/mm/filemap.c b/mm/filemap.c
index ca389394fa2a..83a45d35468b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -298,7 +298,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
298 continue; 298 continue;
299 299
300 wait_on_page_writeback(page); 300 wait_on_page_writeback(page);
301 if (PageError(page)) 301 if (TestClearPageError(page))
302 ret = -EIO; 302 ret = -EIO;
303 } 303 }
304 pagevec_release(&pvec); 304 pagevec_release(&pvec);
@@ -837,9 +837,6 @@ repeat:
837 if (radix_tree_deref_retry(page)) 837 if (radix_tree_deref_retry(page))
838 goto restart; 838 goto restart;
839 839
840 if (page->mapping == NULL || page->index != index)
841 break;
842
843 if (!page_cache_get_speculative(page)) 840 if (!page_cache_get_speculative(page))
844 goto repeat; 841 goto repeat;
845 842
@@ -849,6 +846,16 @@ repeat:
849 goto repeat; 846 goto repeat;
850 } 847 }
851 848
849 /*
850 * Must check mapping and index after taking the ref;
851 * otherwise we can get both false positives and false
852 * negatives, which is just confusing to the caller.
853 */
854 if (page->mapping == NULL || page->index != index) {
855 page_cache_release(page);
856 break;
857 }
858
852 pages[ret] = page; 859 pages[ret] = page;
853 ret++; 860 ret++;
854 index++; 861 index++;
@@ -2220,7 +2227,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
2220 gfp_notmask = __GFP_FS; 2227 gfp_notmask = __GFP_FS;
2221repeat: 2228repeat:
2222 page = find_lock_page(mapping, index); 2229 page = find_lock_page(mapping, index);
2223 if (likely(page)) 2230 if (page)
2224 return page; 2231 return page;
2225 2232
2226 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); 2233 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
new file mode 100644
index 000000000000..004c9c2aac78
--- /dev/null
+++ b/mm/huge_memory.c
@@ -0,0 +1,2346 @@
1/*
2 * Copyright (C) 2009 Red Hat, Inc.
3 *
4 * This work is licensed under the terms of the GNU GPL, version 2. See
5 * the COPYING file in the top-level directory.
6 */
7
8#include <linux/mm.h>
9#include <linux/sched.h>
10#include <linux/highmem.h>
11#include <linux/hugetlb.h>
12#include <linux/mmu_notifier.h>
13#include <linux/rmap.h>
14#include <linux/swap.h>
15#include <linux/mm_inline.h>
16#include <linux/kthread.h>
17#include <linux/khugepaged.h>
18#include <linux/freezer.h>
19#include <linux/mman.h>
20#include <asm/tlb.h>
21#include <asm/pgalloc.h>
22#include "internal.h"
23
24/*
25 * By default transparent hugepage support is enabled for all mappings
26 * and khugepaged scans all mappings. Defrag is only invoked by
27 * khugepaged hugepage allocations and by page faults inside
28 * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
29 * allocations.
30 */
31unsigned long transparent_hugepage_flags __read_mostly =
32#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
33 (1<<TRANSPARENT_HUGEPAGE_FLAG)|
34#endif
35#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
36 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
37#endif
38 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
39 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
40
41/* by default, scan 8*512 ptes (or vmas) every 10 seconds */
42static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
43static unsigned int khugepaged_pages_collapsed;
44static unsigned int khugepaged_full_scans;
45static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
46/* during fragmentation poll the hugepage allocator once every minute */
47static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
48static struct task_struct *khugepaged_thread __read_mostly;
49static DEFINE_MUTEX(khugepaged_mutex);
50static DEFINE_SPINLOCK(khugepaged_mm_lock);
51static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
52/*
53 * by default, collapse a hugepage if at least one pte is mapped, as
54 * would have happened if the vma had been large enough at page-fault
55 * time.
56 */
57static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
58
59static int khugepaged(void *none);
60static int mm_slots_hash_init(void);
61static int khugepaged_slab_init(void);
62static void khugepaged_slab_free(void);
63
64#define MM_SLOTS_HASH_HEADS 1024
65static struct hlist_head *mm_slots_hash __read_mostly;
66static struct kmem_cache *mm_slot_cache __read_mostly;
67
68/**
69 * struct mm_slot - hash lookup from mm to mm_slot
70 * @hash: hash collision list
71 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
72 * @mm: the mm that this information is valid for
73 */
74struct mm_slot {
75 struct hlist_node hash;
76 struct list_head mm_node;
77 struct mm_struct *mm;
78};
79
80/**
81 * struct khugepaged_scan - cursor for scanning
82 * @mm_head: the head of the mm list to scan
83 * @mm_slot: the current mm_slot we are scanning
84 * @address: the next address inside that to be scanned
85 *
86 * There is only one khugepaged_scan instance of this cursor structure.
87 */
88struct khugepaged_scan {
89 struct list_head mm_head;
90 struct mm_slot *mm_slot;
91 unsigned long address;
92} khugepaged_scan = {
93 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
94};
95
96
97static int set_recommended_min_free_kbytes(void)
98{
99 struct zone *zone;
100 int nr_zones = 0;
101 unsigned long recommended_min;
102 extern int min_free_kbytes;
103
104 if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
105 &transparent_hugepage_flags) &&
106 !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
107 &transparent_hugepage_flags))
108 return 0;
109
110 for_each_populated_zone(zone)
111 nr_zones++;
112
113 /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
114 recommended_min = pageblock_nr_pages * nr_zones * 2;
115
116 /*
117 * Make sure that on average at least two pageblocks are almost free
118 * of another type, one for a migratetype to fall back to and a
119 * second to avoid subsequent fallbacks of other types. There are 3
120 * MIGRATE_TYPES we care about.
121 */
122 recommended_min += pageblock_nr_pages * nr_zones *
123 MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
124
125 /* never allow reserving more than 5% of the lowmem */
126 recommended_min = min(recommended_min,
127 (unsigned long) nr_free_buffer_pages() / 20);
128 recommended_min <<= (PAGE_SHIFT-10);
129
130 if (recommended_min > min_free_kbytes)
131 min_free_kbytes = recommended_min;
132 setup_per_zone_wmarks();
133 return 0;
134}
135late_initcall(set_recommended_min_free_kbytes);
136
137static int start_khugepaged(void)
138{
139 int err = 0;
140 if (khugepaged_enabled()) {
141 int wakeup;
142 if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
143 err = -ENOMEM;
144 goto out;
145 }
146 mutex_lock(&khugepaged_mutex);
147 if (!khugepaged_thread)
148 khugepaged_thread = kthread_run(khugepaged, NULL,
149 "khugepaged");
150 if (unlikely(IS_ERR(khugepaged_thread))) {
151 printk(KERN_ERR
152 "khugepaged: kthread_run(khugepaged) failed\n");
153 err = PTR_ERR(khugepaged_thread);
154 khugepaged_thread = NULL;
155 }
156 wakeup = !list_empty(&khugepaged_scan.mm_head);
157 mutex_unlock(&khugepaged_mutex);
158 if (wakeup)
159 wake_up_interruptible(&khugepaged_wait);
160
161 set_recommended_min_free_kbytes();
162 } else
163 /* wakeup to exit */
164 wake_up_interruptible(&khugepaged_wait);
165out:
166 return err;
167}
168
169#ifdef CONFIG_SYSFS
170
171static ssize_t double_flag_show(struct kobject *kobj,
172 struct kobj_attribute *attr, char *buf,
173 enum transparent_hugepage_flag enabled,
174 enum transparent_hugepage_flag req_madv)
175{
176 if (test_bit(enabled, &transparent_hugepage_flags)) {
177 VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
178 return sprintf(buf, "[always] madvise never\n");
179 } else if (test_bit(req_madv, &transparent_hugepage_flags))
180 return sprintf(buf, "always [madvise] never\n");
181 else
182 return sprintf(buf, "always madvise [never]\n");
183}
184static ssize_t double_flag_store(struct kobject *kobj,
185 struct kobj_attribute *attr,
186 const char *buf, size_t count,
187 enum transparent_hugepage_flag enabled,
188 enum transparent_hugepage_flag req_madv)
189{
190 if (!memcmp("always", buf,
191 min(sizeof("always")-1, count))) {
192 set_bit(enabled, &transparent_hugepage_flags);
193 clear_bit(req_madv, &transparent_hugepage_flags);
194 } else if (!memcmp("madvise", buf,
195 min(sizeof("madvise")-1, count))) {
196 clear_bit(enabled, &transparent_hugepage_flags);
197 set_bit(req_madv, &transparent_hugepage_flags);
198 } else if (!memcmp("never", buf,
199 min(sizeof("never")-1, count))) {
200 clear_bit(enabled, &transparent_hugepage_flags);
201 clear_bit(req_madv, &transparent_hugepage_flags);
202 } else
203 return -EINVAL;
204
205 return count;
206}
207
208static ssize_t enabled_show(struct kobject *kobj,
209 struct kobj_attribute *attr, char *buf)
210{
211 return double_flag_show(kobj, attr, buf,
212 TRANSPARENT_HUGEPAGE_FLAG,
213 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
214}
215static ssize_t enabled_store(struct kobject *kobj,
216 struct kobj_attribute *attr,
217 const char *buf, size_t count)
218{
219 ssize_t ret;
220
221 ret = double_flag_store(kobj, attr, buf, count,
222 TRANSPARENT_HUGEPAGE_FLAG,
223 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
224
225 if (ret > 0) {
226 int err = start_khugepaged();
227 if (err)
228 ret = err;
229 }
230
231 if (ret > 0 &&
232 (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
233 &transparent_hugepage_flags) ||
234 test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
235 &transparent_hugepage_flags)))
236 set_recommended_min_free_kbytes();
237
238 return ret;
239}
240static struct kobj_attribute enabled_attr =
241 __ATTR(enabled, 0644, enabled_show, enabled_store);
242
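/*
 * Helpers for the boolean yes/no sysfs attributes backed by a single
 * bit in transparent_hugepage_flags.
 */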
243static ssize_t single_flag_show(struct kobject *kobj,
244 struct kobj_attribute *attr, char *buf,
245 enum transparent_hugepage_flag flag)
246{
247 if (test_bit(flag, &transparent_hugepage_flags))
248 return sprintf(buf, "[yes] no\n");
249 else
250 return sprintf(buf, "yes [no]\n");
251}
252static ssize_t single_flag_store(struct kobject *kobj,
253 struct kobj_attribute *attr,
254 const char *buf, size_t count,
255 enum transparent_hugepage_flag flag)
256{
257 if (!memcmp("yes", buf,
258 min(sizeof("yes")-1, count))) {
259 set_bit(flag, &transparent_hugepage_flags);
260 } else if (!memcmp("no", buf,
261 min(sizeof("no")-1, count))) {
262 clear_bit(flag, &transparent_hugepage_flags);
263 } else
264 return -EINVAL;
265
266 return count;
267}
268
269/*
270 * Currently defrag only controls whether __GFP_WAIT is used for the
271 * allocation. A blind __GFP_REPEAT would be too aggressive: it's never
272 * worth swapping tons of memory just to allocate one more hugepage.
273 */
274static ssize_t defrag_show(struct kobject *kobj,
275 struct kobj_attribute *attr, char *buf)
276{
277 return double_flag_show(kobj, attr, buf,
278 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
279 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
280}
281static ssize_t defrag_store(struct kobject *kobj,
282 struct kobj_attribute *attr,
283 const char *buf, size_t count)
284{
285 return double_flag_store(kobj, attr, buf, count,
286 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
287 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
288}
289static struct kobj_attribute defrag_attr =
290 __ATTR(defrag, 0644, defrag_show, defrag_store);
291
292#ifdef CONFIG_DEBUG_VM
293static ssize_t debug_cow_show(struct kobject *kobj,
294 struct kobj_attribute *attr, char *buf)
295{
296 return single_flag_show(kobj, attr, buf,
297 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
298}
299static ssize_t debug_cow_store(struct kobject *kobj,
300 struct kobj_attribute *attr,
301 const char *buf, size_t count)
302{
303 return single_flag_store(kobj, attr, buf, count,
304 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
305}
306static struct kobj_attribute debug_cow_attr =
307 __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
308#endif /* CONFIG_DEBUG_VM */
309
310static struct attribute *hugepage_attr[] = {
311 &enabled_attr.attr,
312 &defrag_attr.attr,
313#ifdef CONFIG_DEBUG_VM
314 &debug_cow_attr.attr,
315#endif
316 NULL,
317};
318
319static struct attribute_group hugepage_attr_group = {
320 .attrs = hugepage_attr,
321};
322
323static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
324 struct kobj_attribute *attr,
325 char *buf)
326{
327 return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
328}
329
330static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
331 struct kobj_attribute *attr,
332 const char *buf, size_t count)
333{
334 unsigned long msecs;
335 int err;
336
337 err = strict_strtoul(buf, 10, &msecs);
338 if (err || msecs > UINT_MAX)
339 return -EINVAL;
340
341 khugepaged_scan_sleep_millisecs = msecs;
342 wake_up_interruptible(&khugepaged_wait);
343
344 return count;
345}
346static struct kobj_attribute scan_sleep_millisecs_attr =
347 __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
348 scan_sleep_millisecs_store);
349
350static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
351 struct kobj_attribute *attr,
352 char *buf)
353{
354 return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
355}
356
357static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
358 struct kobj_attribute *attr,
359 const char *buf, size_t count)
360{
361 unsigned long msecs;
362 int err;
363
364 err = strict_strtoul(buf, 10, &msecs);
365 if (err || msecs > UINT_MAX)
366 return -EINVAL;
367
368 khugepaged_alloc_sleep_millisecs = msecs;
369 wake_up_interruptible(&khugepaged_wait);
370
371 return count;
372}
373static struct kobj_attribute alloc_sleep_millisecs_attr =
374 __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
375 alloc_sleep_millisecs_store);
376
377static ssize_t pages_to_scan_show(struct kobject *kobj,
378 struct kobj_attribute *attr,
379 char *buf)
380{
381 return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
382}
383static ssize_t pages_to_scan_store(struct kobject *kobj,
384 struct kobj_attribute *attr,
385 const char *buf, size_t count)
386{
387 int err;
388 unsigned long pages;
389
390 err = strict_strtoul(buf, 10, &pages);
391 if (err || !pages || pages > UINT_MAX)
392 return -EINVAL;
393
394 khugepaged_pages_to_scan = pages;
395
396 return count;
397}
398static struct kobj_attribute pages_to_scan_attr =
399 __ATTR(pages_to_scan, 0644, pages_to_scan_show,
400 pages_to_scan_store);
401
402static ssize_t pages_collapsed_show(struct kobject *kobj,
403 struct kobj_attribute *attr,
404 char *buf)
405{
406 return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
407}
408static struct kobj_attribute pages_collapsed_attr =
409 __ATTR_RO(pages_collapsed);
410
411static ssize_t full_scans_show(struct kobject *kobj,
412 struct kobj_attribute *attr,
413 char *buf)
414{
415 return sprintf(buf, "%u\n", khugepaged_full_scans);
416}
417static struct kobj_attribute full_scans_attr =
418 __ATTR_RO(full_scans);
419
420static ssize_t khugepaged_defrag_show(struct kobject *kobj,
421 struct kobj_attribute *attr, char *buf)
422{
423 return single_flag_show(kobj, attr, buf,
424 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
425}
426static ssize_t khugepaged_defrag_store(struct kobject *kobj,
427 struct kobj_attribute *attr,
428 const char *buf, size_t count)
429{
430 return single_flag_store(kobj, attr, buf, count,
431 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
432}
433static struct kobj_attribute khugepaged_defrag_attr =
434 __ATTR(defrag, 0644, khugepaged_defrag_show,
435 khugepaged_defrag_store);
436
437/*
438 * max_ptes_none controls whether khugepaged may collapse hugepages
439 * over unmapped (none) ptes, which potentially increases the memory
440 * footprint of the vmas. When max_ptes_none is 0, khugepaged will not
441 * reduce the available free memory in the system as it
442 * runs. Increasing max_ptes_none will instead potentially reduce the
443 * free memory in the system during the khugepaged scan.
444 */
445static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
446 struct kobj_attribute *attr,
447 char *buf)
448{
449 return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
450}
451static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
452 struct kobj_attribute *attr,
453 const char *buf, size_t count)
454{
455 int err;
456 unsigned long max_ptes_none;
457
458 err = strict_strtoul(buf, 10, &max_ptes_none);
459 if (err || max_ptes_none > HPAGE_PMD_NR-1)
460 return -EINVAL;
461
462 khugepaged_max_ptes_none = max_ptes_none;
463
464 return count;
465}
466static struct kobj_attribute khugepaged_max_ptes_none_attr =
467 __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
468 khugepaged_max_ptes_none_store);
469
470static struct attribute *khugepaged_attr[] = {
471 &khugepaged_defrag_attr.attr,
472 &khugepaged_max_ptes_none_attr.attr,
473 &pages_to_scan_attr.attr,
474 &pages_collapsed_attr.attr,
475 &full_scans_attr.attr,
476 &scan_sleep_millisecs_attr.attr,
477 &alloc_sleep_millisecs_attr.attr,
478 NULL,
479};
480
481static struct attribute_group khugepaged_attr_group = {
482 .attrs = khugepaged_attr,
483 .name = "khugepaged",
484};
485#endif /* CONFIG_SYSFS */
486
487static int __init hugepage_init(void)
488{
489 int err;
490#ifdef CONFIG_SYSFS
491 static struct kobject *hugepage_kobj;
492#endif
493
494 err = -EINVAL;
495 if (!has_transparent_hugepage()) {
496 transparent_hugepage_flags = 0;
497 goto out;
498 }
499
500#ifdef CONFIG_SYSFS
501 err = -ENOMEM;
502 hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
503 if (unlikely(!hugepage_kobj)) {
504 printk(KERN_ERR "hugepage: failed kobject create\n");
505 goto out;
506 }
507
508 err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group);
509 if (err) {
510 printk(KERN_ERR "hugepage: failed to register hugepage group\n");
511 goto out;
512 }
513
514 err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group);
515 if (err) {
516 printk(KERN_ERR "hugepage: failed to register khugepaged group\n");
517 goto out;
518 }
519#endif
520
521 err = khugepaged_slab_init();
522 if (err)
523 goto out;
524
525 err = mm_slots_hash_init();
526 if (err) {
527 khugepaged_slab_free();
528 goto out;
529 }
530
531 /*
532 * By default disable transparent hugepages on smaller systems,
533 * where the extra memory used could hurt more than TLB overhead
534 * is likely to save. The admin can still enable it through /sys.
535 */
536 if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
537 transparent_hugepage_flags = 0;
538
539 start_khugepaged();
540
541 set_recommended_min_free_kbytes();
542
543out:
544 return err;
545}
546module_init(hugepage_init)
547
548static int __init setup_transparent_hugepage(char *str)
549{
550 int ret = 0;
551 if (!str)
552 goto out;
553 if (!strcmp(str, "always")) {
554 set_bit(TRANSPARENT_HUGEPAGE_FLAG,
555 &transparent_hugepage_flags);
556 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
557 &transparent_hugepage_flags);
558 ret = 1;
559 } else if (!strcmp(str, "madvise")) {
560 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
561 &transparent_hugepage_flags);
562 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
563 &transparent_hugepage_flags);
564 ret = 1;
565 } else if (!strcmp(str, "never")) {
566 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
567 &transparent_hugepage_flags);
568 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
569 &transparent_hugepage_flags);
570 ret = 1;
571 }
572out:
573 if (!ret)
574 printk(KERN_WARNING
575 "transparent_hugepage= cannot parse, ignored\n");
576 return ret;
577}
578__setup("transparent_hugepage=", setup_transparent_hugepage);
579
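/*
 * Deposit a preallocated pte page table on the per-mm FIFO list so it
 * can be withdrawn later by get_pmd_huge_pte() when the huge pmd is
 * split or zapped.  Caller must hold mm->page_table_lock.
 */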
580static void prepare_pmd_huge_pte(pgtable_t pgtable,
581 struct mm_struct *mm)
582{
583 assert_spin_locked(&mm->page_table_lock);
584
585 /* FIFO */
586 if (!mm->pmd_huge_pte)
587 INIT_LIST_HEAD(&pgtable->lru);
588 else
589 list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
590 mm->pmd_huge_pte = pgtable;
591}
592
593static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
594{
595 if (likely(vma->vm_flags & VM_WRITE))
596 pmd = pmd_mkwrite(pmd);
597 return pmd;
598}
599
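/*
 * Map a freshly allocated hugepage at @haddr: clear it, take the
 * page_table_lock and install the huge pmd.  Returns VM_FAULT_OOM if
 * the pte page table cannot be preallocated, 0 otherwise (including
 * the benign race where another thread populated the pmd first).
 */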
600static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
601 struct vm_area_struct *vma,
602 unsigned long haddr, pmd_t *pmd,
603 struct page *page)
604{
605 int ret = 0;
606 pgtable_t pgtable;
607
608 VM_BUG_ON(!PageCompound(page));
609 pgtable = pte_alloc_one(mm, haddr);
610 if (unlikely(!pgtable)) {
611 mem_cgroup_uncharge_page(page);
612 put_page(page);
613 return VM_FAULT_OOM;
614 }
615
616 clear_huge_page(page, haddr, HPAGE_PMD_NR);
617 __SetPageUptodate(page);
618
619 spin_lock(&mm->page_table_lock);
620 if (unlikely(!pmd_none(*pmd))) {
621 spin_unlock(&mm->page_table_lock);
622 mem_cgroup_uncharge_page(page);
623 put_page(page);
624 pte_free(mm, pgtable);
625 } else {
626 pmd_t entry;
627 entry = mk_pmd(page, vma->vm_page_prot);
628 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
629 entry = pmd_mkhuge(entry);
630 /*
631 * The spinlocking to take the lru_lock inside
632 * page_add_new_anon_rmap() acts as a full memory
633 * barrier to be sure the clear_huge_page writes become
634 * visible before the set_pmd_at() write.
635 */
636 page_add_new_anon_rmap(page, vma, haddr);
637 set_pmd_at(mm, haddr, pmd, entry);
638 prepare_pmd_huge_pte(pgtable, mm);
639 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
640 spin_unlock(&mm->page_table_lock);
641 }
642
643 return ret;
644}
645
646static inline gfp_t alloc_hugepage_gfpmask(int defrag)
647{
648 return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT);
649}
650
651static inline struct page *alloc_hugepage_vma(int defrag,
652 struct vm_area_struct *vma,
653 unsigned long haddr)
654{
655 return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
656 HPAGE_PMD_ORDER, vma, haddr);
657}
658
659#ifndef CONFIG_NUMA
660static inline struct page *alloc_hugepage(int defrag)
661{
662 return alloc_pages(alloc_hugepage_gfpmask(defrag),
663 HPAGE_PMD_ORDER);
664}
665#endif
666
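/*
 * Anonymous hugepage fault handler: if the pmd-aligned range around
 * @address fits entirely inside the vma, try to allocate and map a
 * hugepage; otherwise fall back to the regular pte fault path below.
 */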
667int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
668 unsigned long address, pmd_t *pmd,
669 unsigned int flags)
670{
671 struct page *page;
672 unsigned long haddr = address & HPAGE_PMD_MASK;
673 pte_t *pte;
674
675 if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
676 if (unlikely(anon_vma_prepare(vma)))
677 return VM_FAULT_OOM;
678 if (unlikely(khugepaged_enter(vma)))
679 return VM_FAULT_OOM;
680 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
681 vma, haddr);
682 if (unlikely(!page))
683 goto out;
684 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
685 put_page(page);
686 goto out;
687 }
688
689 return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page);
690 }
691out:
692 /*
693 * Use __pte_alloc instead of pte_alloc_map, because we can't
694 * run pte_offset_map on the pmd, as a huge pmd could
695 * materialize from under us from a different thread.
696 */
697 if (unlikely(__pte_alloc(mm, vma, pmd, address)))
698 return VM_FAULT_OOM;
699 /* if an huge pmd materialized from under us just retry later */
700 if (unlikely(pmd_trans_huge(*pmd)))
701 return 0;
702 /*
703 * A regular pmd is established and it can't morph into a huge pmd
704 * from under us anymore at this point because we hold the mmap_sem
705 * read mode and khugepaged takes it in write mode. So now it's
706 * safe to run pte_offset_map().
707 */
708 pte = pte_offset_map(pmd, address);
709 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
710}
711
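/*
 * Duplicate a transparent huge pmd at fork(): write-protect the
 * parent's pmd and map the same hugepage read-only in the child.
 * Returns -EAGAIN if the source pmd is not (or is no longer) a stable
 * huge pmd, so the caller can fall back to copying regular ptes.
 */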
712int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
713 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
714 struct vm_area_struct *vma)
715{
716 struct page *src_page;
717 pmd_t pmd;
718 pgtable_t pgtable;
719 int ret;
720
721 ret = -ENOMEM;
722 pgtable = pte_alloc_one(dst_mm, addr);
723 if (unlikely(!pgtable))
724 goto out;
725
726 spin_lock(&dst_mm->page_table_lock);
727 spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
728
729 ret = -EAGAIN;
730 pmd = *src_pmd;
731 if (unlikely(!pmd_trans_huge(pmd))) {
732 pte_free(dst_mm, pgtable);
733 goto out_unlock;
734 }
735 if (unlikely(pmd_trans_splitting(pmd))) {
736 /* split huge page running from under us */
737 spin_unlock(&src_mm->page_table_lock);
738 spin_unlock(&dst_mm->page_table_lock);
739 pte_free(dst_mm, pgtable);
740
741 wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
742 goto out;
743 }
744 src_page = pmd_page(pmd);
745 VM_BUG_ON(!PageHead(src_page));
746 get_page(src_page);
747 page_dup_rmap(src_page);
748 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
749
750 pmdp_set_wrprotect(src_mm, addr, src_pmd);
751 pmd = pmd_mkold(pmd_wrprotect(pmd));
752 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
753 prepare_pmd_huge_pte(pgtable, dst_mm);
754
755 ret = 0;
756out_unlock:
757 spin_unlock(&src_mm->page_table_lock);
758 spin_unlock(&dst_mm->page_table_lock);
759out:
760 return ret;
761}
762
763/* no "address" argument, so it destroys page coloring on some archs */
764pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
765{
766 pgtable_t pgtable;
767
768 assert_spin_locked(&mm->page_table_lock);
769
770 /* FIFO */
771 pgtable = mm->pmd_huge_pte;
772 if (list_empty(&pgtable->lru))
773 mm->pmd_huge_pte = NULL;
774 else {
775 mm->pmd_huge_pte = list_entry(pgtable->lru.next,
776 struct page, lru);
777 list_del(&pgtable->lru);
778 }
779 return pgtable;
780}
781
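/*
 * COW fault fallback used when no replacement hugepage can be
 * allocated: copy the data into HPAGE_PMD_NR small pages and remap
 * the range with regular ptes, effectively splitting the hugepage.
 */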
782static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
783 struct vm_area_struct *vma,
784 unsigned long address,
785 pmd_t *pmd, pmd_t orig_pmd,
786 struct page *page,
787 unsigned long haddr)
788{
789 pgtable_t pgtable;
790 pmd_t _pmd;
791 int ret = 0, i;
792 struct page **pages;
793
794 pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
795 GFP_KERNEL);
796 if (unlikely(!pages)) {
797 ret |= VM_FAULT_OOM;
798 goto out;
799 }
800
801 for (i = 0; i < HPAGE_PMD_NR; i++) {
802 pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
803 vma, address);
804 if (unlikely(!pages[i] ||
805 mem_cgroup_newpage_charge(pages[i], mm,
806 GFP_KERNEL))) {
807 if (pages[i])
808 put_page(pages[i]);
809 mem_cgroup_uncharge_start();
810 while (--i >= 0) {
811 mem_cgroup_uncharge_page(pages[i]);
812 put_page(pages[i]);
813 }
814 mem_cgroup_uncharge_end();
815 kfree(pages);
816 ret |= VM_FAULT_OOM;
817 goto out;
818 }
819 }
820
821 for (i = 0; i < HPAGE_PMD_NR; i++) {
822 copy_user_highpage(pages[i], page + i,
823 haddr + PAGE_SIZE*i, vma);
824 __SetPageUptodate(pages[i]);
825 cond_resched();
826 }
827
828 spin_lock(&mm->page_table_lock);
829 if (unlikely(!pmd_same(*pmd, orig_pmd)))
830 goto out_free_pages;
831 VM_BUG_ON(!PageHead(page));
832
833 pmdp_clear_flush_notify(vma, haddr, pmd);
834 /* leave pmd empty until pte is filled */
835
836 pgtable = get_pmd_huge_pte(mm);
837 pmd_populate(mm, &_pmd, pgtable);
838
839 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
840 pte_t *pte, entry;
841 entry = mk_pte(pages[i], vma->vm_page_prot);
842 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
843 page_add_new_anon_rmap(pages[i], vma, haddr);
844 pte = pte_offset_map(&_pmd, haddr);
845 VM_BUG_ON(!pte_none(*pte));
846 set_pte_at(mm, haddr, pte, entry);
847 pte_unmap(pte);
848 }
849 kfree(pages);
850
851 mm->nr_ptes++;
852 smp_wmb(); /* make pte visible before pmd */
853 pmd_populate(mm, pmd, pgtable);
854 page_remove_rmap(page);
855 spin_unlock(&mm->page_table_lock);
856
857 ret |= VM_FAULT_WRITE;
858 put_page(page);
859
860out:
861 return ret;
862
863out_free_pages:
864 spin_unlock(&mm->page_table_lock);
865 mem_cgroup_uncharge_start();
866 for (i = 0; i < HPAGE_PMD_NR; i++) {
867 mem_cgroup_uncharge_page(pages[i]);
868 put_page(pages[i]);
869 }
870 mem_cgroup_uncharge_end();
871 kfree(pages);
872 goto out;
873}
874
875int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
876 unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
877{
878 int ret = 0;
879 struct page *page, *new_page;
880 unsigned long haddr;
881
882 VM_BUG_ON(!vma->anon_vma);
883 spin_lock(&mm->page_table_lock);
884 if (unlikely(!pmd_same(*pmd, orig_pmd)))
885 goto out_unlock;
886
887 page = pmd_page(orig_pmd);
888 VM_BUG_ON(!PageCompound(page) || !PageHead(page));
889 haddr = address & HPAGE_PMD_MASK;
890 if (page_mapcount(page) == 1) {
891 pmd_t entry;
892 entry = pmd_mkyoung(orig_pmd);
893 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
894 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
895 update_mmu_cache(vma, address, entry);
896 ret |= VM_FAULT_WRITE;
897 goto out_unlock;
898 }
899 get_page(page);
900 spin_unlock(&mm->page_table_lock);
901
902 if (transparent_hugepage_enabled(vma) &&
903 !transparent_hugepage_debug_cow())
904 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
905 vma, haddr);
906 else
907 new_page = NULL;
908
909 if (unlikely(!new_page)) {
910 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
911 pmd, orig_pmd, page, haddr);
912 put_page(page);
913 goto out;
914 }
915
916 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
917 put_page(new_page);
918 put_page(page);
919 ret |= VM_FAULT_OOM;
920 goto out;
921 }
922
923 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
924 __SetPageUptodate(new_page);
925
926 spin_lock(&mm->page_table_lock);
927 put_page(page);
928 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
929 mem_cgroup_uncharge_page(new_page);
930 put_page(new_page);
931 } else {
932 pmd_t entry;
933 VM_BUG_ON(!PageHead(page));
934 entry = mk_pmd(new_page, vma->vm_page_prot);
935 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
936 entry = pmd_mkhuge(entry);
937 pmdp_clear_flush_notify(vma, haddr, pmd);
938 page_add_new_anon_rmap(new_page, vma, haddr);
939 set_pmd_at(mm, haddr, pmd, entry);
940 update_mmu_cache(vma, address, entry);
941 page_remove_rmap(page);
942 put_page(page);
943 ret |= VM_FAULT_WRITE;
944 }
945out_unlock:
946 spin_unlock(&mm->page_table_lock);
947out:
948 return ret;
949}
950
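/*
 * follow_page() helper for transparent huge pmds: return the subpage
 * corresponding to @addr, honouring FOLL_WRITE, FOLL_TOUCH and
 * FOLL_GET.  Caller must hold mm->page_table_lock.
 */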
951struct page *follow_trans_huge_pmd(struct mm_struct *mm,
952 unsigned long addr,
953 pmd_t *pmd,
954 unsigned int flags)
955{
956 struct page *page = NULL;
957
958 assert_spin_locked(&mm->page_table_lock);
959
960 if (flags & FOLL_WRITE && !pmd_write(*pmd))
961 goto out;
962
963 page = pmd_page(*pmd);
964 VM_BUG_ON(!PageHead(page));
965 if (flags & FOLL_TOUCH) {
966 pmd_t _pmd;
967 /*
968 * We should set the dirty bit only for FOLL_WRITE but
969 * for now the dirty bit in the pmd is meaningless.
970		 * If the dirty bit ever becomes meaningful and we
971		 * only set it for FOLL_WRITE, an atomic set_bit will
972		 * be required on the pmd to set the young bit,
973		 * instead of the current set_pmd_at.
974 */
975 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
976 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
977 }
978 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
979 VM_BUG_ON(!PageCompound(page));
980 if (flags & FOLL_GET)
981 get_page(page);
982
983out:
984 return page;
985}
986
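/*
 * Tear down a huge pmd during zap/munmap.  Returns 1 if a huge pmd
 * was cleared here, 0 if the pmd was not (or no longer) huge and the
 * caller must zap it as regular ptes.
 */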
987int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
988 pmd_t *pmd)
989{
990 int ret = 0;
991
992 spin_lock(&tlb->mm->page_table_lock);
993 if (likely(pmd_trans_huge(*pmd))) {
994 if (unlikely(pmd_trans_splitting(*pmd))) {
995 spin_unlock(&tlb->mm->page_table_lock);
996 wait_split_huge_page(vma->anon_vma,
997 pmd);
998 } else {
999 struct page *page;
1000 pgtable_t pgtable;
1001 pgtable = get_pmd_huge_pte(tlb->mm);
1002 page = pmd_page(*pmd);
1003 pmd_clear(pmd);
1004 page_remove_rmap(page);
1005 VM_BUG_ON(page_mapcount(page) < 0);
1006 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1007 VM_BUG_ON(!PageHead(page));
1008 spin_unlock(&tlb->mm->page_table_lock);
1009 tlb_remove_page(tlb, page);
1010 pte_free(tlb->mm, pgtable);
1011 ret = 1;
1012 }
1013 } else
1014 spin_unlock(&tlb->mm->page_table_lock);
1015
1016 return ret;
1017}
1018
1019int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1020 unsigned long addr, unsigned long end,
1021 unsigned char *vec)
1022{
1023 int ret = 0;
1024
1025 spin_lock(&vma->vm_mm->page_table_lock);
1026 if (likely(pmd_trans_huge(*pmd))) {
1027 ret = !pmd_trans_splitting(*pmd);
1028 spin_unlock(&vma->vm_mm->page_table_lock);
1029 if (unlikely(!ret))
1030 wait_split_huge_page(vma->anon_vma, pmd);
1031 else {
1032 /*
1033 * All logical pages in the range are present
1034 * if backed by a huge page.
1035 */
1036 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1037 }
1038 } else
1039 spin_unlock(&vma->vm_mm->page_table_lock);
1040
1041 return ret;
1042}
1043
1044int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1045 unsigned long addr, pgprot_t newprot)
1046{
1047 struct mm_struct *mm = vma->vm_mm;
1048 int ret = 0;
1049
1050 spin_lock(&mm->page_table_lock);
1051 if (likely(pmd_trans_huge(*pmd))) {
1052 if (unlikely(pmd_trans_splitting(*pmd))) {
1053 spin_unlock(&mm->page_table_lock);
1054 wait_split_huge_page(vma->anon_vma, pmd);
1055 } else {
1056 pmd_t entry;
1057
1058 entry = pmdp_get_and_clear(mm, addr, pmd);
1059 entry = pmd_modify(entry, newprot);
1060 set_pmd_at(mm, addr, pmd, entry);
1061 spin_unlock(&vma->vm_mm->page_table_lock);
1062 flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
1063 ret = 1;
1064 }
1065 } else
1066 spin_unlock(&vma->vm_mm->page_table_lock);
1067
1068 return ret;
1069}
1070
1071pmd_t *page_check_address_pmd(struct page *page,
1072 struct mm_struct *mm,
1073 unsigned long address,
1074 enum page_check_address_pmd_flag flag)
1075{
1076 pgd_t *pgd;
1077 pud_t *pud;
1078 pmd_t *pmd, *ret = NULL;
1079
1080 if (address & ~HPAGE_PMD_MASK)
1081 goto out;
1082
1083 pgd = pgd_offset(mm, address);
1084 if (!pgd_present(*pgd))
1085 goto out;
1086
1087 pud = pud_offset(pgd, address);
1088 if (!pud_present(*pud))
1089 goto out;
1090
1091 pmd = pmd_offset(pud, address);
1092 if (pmd_none(*pmd))
1093 goto out;
1094 if (pmd_page(*pmd) != page)
1095 goto out;
1096 /*
1097 * split_vma() may create temporary aliased mappings. There is
1098 * no risk as long as all huge pmds are found and have their
1099 * splitting bit set before __split_huge_page_refcount
1100 * runs. Finding the same huge pmd more than once during the
1101 * same rmap walk is not a problem.
1102 */
1103 if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
1104 pmd_trans_splitting(*pmd))
1105 goto out;
1106 if (pmd_trans_huge(*pmd)) {
1107 VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
1108 !pmd_trans_splitting(*pmd));
1109 ret = pmd;
1110 }
1111out:
1112 return ret;
1113}
1114
1115static int __split_huge_page_splitting(struct page *page,
1116 struct vm_area_struct *vma,
1117 unsigned long address)
1118{
1119 struct mm_struct *mm = vma->vm_mm;
1120 pmd_t *pmd;
1121 int ret = 0;
1122
1123 spin_lock(&mm->page_table_lock);
1124 pmd = page_check_address_pmd(page, mm, address,
1125 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
1126 if (pmd) {
1127 /*
1128 * We can't temporarily set the pmd to null in order
1129 * to split it, the pmd must remain marked huge at all
1130 * times or the VM won't take the pmd_trans_huge paths
1131 * and it won't wait on the anon_vma->root->lock to
1132 * serialize against split_huge_page*.
1133 */
1134 pmdp_splitting_flush_notify(vma, address, pmd);
1135 ret = 1;
1136 }
1137 spin_unlock(&mm->page_table_lock);
1138
1139 return ret;
1140}
1141
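/*
 * Second phase of split_huge_page(): distribute the head page's
 * refcount, mapcount and flags to the tail pages and clear the
 * compound state, turning the hugepage into HPAGE_PMD_NR independent
 * pages.  Runs under the zone lru_lock and the compound lock.
 */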
1142static void __split_huge_page_refcount(struct page *page)
1143{
1144 int i;
1145 unsigned long head_index = page->index;
1146 struct zone *zone = page_zone(page);
1147 int zonestat;
1148
1149 /* prevent PageLRU to go away from under us, and freeze lru stats */
1150 spin_lock_irq(&zone->lru_lock);
1151 compound_lock(page);
1152
1153 for (i = 1; i < HPAGE_PMD_NR; i++) {
1154 struct page *page_tail = page + i;
1155
1156 /* tail_page->_count cannot change */
1157 atomic_sub(atomic_read(&page_tail->_count), &page->_count);
1158 BUG_ON(page_count(page) <= 0);
1159 atomic_add(page_mapcount(page) + 1, &page_tail->_count);
1160 BUG_ON(atomic_read(&page_tail->_count) <= 0);
1161
1162 /* after clearing PageTail the gup refcount can be released */
1163 smp_mb();
1164
1165 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1166 page_tail->flags |= (page->flags &
1167 ((1L << PG_referenced) |
1168 (1L << PG_swapbacked) |
1169 (1L << PG_mlocked) |
1170 (1L << PG_uptodate)));
1171 page_tail->flags |= (1L << PG_dirty);
1172
1173 /*
1174 * 1) clear PageTail before overwriting first_page
1175 * 2) clear PageTail before clearing PageHead for VM_BUG_ON
1176 */
1177 smp_wmb();
1178
1179 /*
1180 * __split_huge_page_splitting() already set the
1181 * splitting bit in all pmd that could map this
1182 * hugepage, that will ensure no CPU can alter the
1183 * mapcount on the head page. The mapcount is only
1184 * accounted in the head page and it has to be
1185 * transferred to all tail pages in the below code. So
1186		 * for this code to be safe, during the split the mapcount
1187 * can't change. But that doesn't mean userland can't
1188 * keep changing and reading the page contents while
1189 * we transfer the mapcount, so the pmd splitting
1190 * status is achieved setting a reserved bit in the
1191 * pmd, not by clearing the present bit.
1192 */
1193 BUG_ON(page_mapcount(page_tail));
1194 page_tail->_mapcount = page->_mapcount;
1195
1196 BUG_ON(page_tail->mapping);
1197 page_tail->mapping = page->mapping;
1198
1199 page_tail->index = ++head_index;
1200
1201 BUG_ON(!PageAnon(page_tail));
1202 BUG_ON(!PageUptodate(page_tail));
1203 BUG_ON(!PageDirty(page_tail));
1204 BUG_ON(!PageSwapBacked(page_tail));
1205
1206 lru_add_page_tail(zone, page, page_tail);
1207 }
1208
1209 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1210 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
1211
1212 /*
1213 * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics,
1214 * so adjust those appropriately if this page is on the LRU.
1215 */
1216 if (PageLRU(page)) {
1217 zonestat = NR_LRU_BASE + page_lru(page);
1218 __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1));
1219 }
1220
1221 ClearPageCompound(page);
1222 compound_unlock(page);
1223 spin_unlock_irq(&zone->lru_lock);
1224
1225 for (i = 1; i < HPAGE_PMD_NR; i++) {
1226 struct page *page_tail = page + i;
1227 BUG_ON(page_count(page_tail) <= 0);
1228 /*
1229 * Tail pages may be freed if there wasn't any mapping
1230		 * e.g. if add_to_swap() is running on a lru page that
1231 * had its mapping zapped. And freeing these pages
1232 * requires taking the lru_lock so we do the put_page
1233 * of the tail pages after the split is complete.
1234 */
1235 put_page(page_tail);
1236 }
1237
1238 /*
1239 * Only the head page (now become a regular page) is required
1240 * to be pinned by the caller.
1241 */
1242 BUG_ON(page_count(page) <= 0);
1243}
1244
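/*
 * Final phase of the split for one vma: replace the huge pmd with a
 * page table mapping the individual subpages, preserving the write
 * and young bits of the original pmd.
 */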
1245static int __split_huge_page_map(struct page *page,
1246 struct vm_area_struct *vma,
1247 unsigned long address)
1248{
1249 struct mm_struct *mm = vma->vm_mm;
1250 pmd_t *pmd, _pmd;
1251 int ret = 0, i;
1252 pgtable_t pgtable;
1253 unsigned long haddr;
1254
1255 spin_lock(&mm->page_table_lock);
1256 pmd = page_check_address_pmd(page, mm, address,
1257 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
1258 if (pmd) {
1259 pgtable = get_pmd_huge_pte(mm);
1260 pmd_populate(mm, &_pmd, pgtable);
1261
1262 for (i = 0, haddr = address; i < HPAGE_PMD_NR;
1263 i++, haddr += PAGE_SIZE) {
1264 pte_t *pte, entry;
1265 BUG_ON(PageCompound(page+i));
1266 entry = mk_pte(page + i, vma->vm_page_prot);
1267 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1268 if (!pmd_write(*pmd))
1269 entry = pte_wrprotect(entry);
1270 else
1271 BUG_ON(page_mapcount(page) != 1);
1272 if (!pmd_young(*pmd))
1273 entry = pte_mkold(entry);
1274 pte = pte_offset_map(&_pmd, haddr);
1275 BUG_ON(!pte_none(*pte));
1276 set_pte_at(mm, haddr, pte, entry);
1277 pte_unmap(pte);
1278 }
1279
1280 mm->nr_ptes++;
1281 smp_wmb(); /* make pte visible before pmd */
1282 /*
1283 * Up to this point the pmd is present and huge and
1284		 * userland has full access to the hugepage
1285 * during the split (which happens in place). If we
1286 * overwrite the pmd with the not-huge version
1287 * pointing to the pte here (which of course we could
1288 * if all CPUs were bug free), userland could trigger
1289 * a small page size TLB miss on the small sized TLB
1290 * while the hugepage TLB entry is still established
1291 * in the huge TLB. Some CPU doesn't like that. See
1292 * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
1293		 * Erratum 383 on page 93. Intel should be safe but it
1294		 * also warns that it's only safe if the permission
1295		 * and cache attributes of the two entries loaded in
1296		 * the two TLBs are identical (which should be the case
1297 * here). But it is generally safer to never allow
1298 * small and huge TLB entries for the same virtual
1299 * address to be loaded simultaneously. So instead of
1300 * doing "pmd_populate(); flush_tlb_range();" we first
1301 * mark the current pmd notpresent (atomically because
1302 * here the pmd_trans_huge and pmd_trans_splitting
1303 * must remain set at all times on the pmd until the
1304 * split is complete for this pmd), then we flush the
1305 * SMP TLB and finally we write the non-huge version
1306 * of the pmd entry with pmd_populate.
1307 */
1308 set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
1309 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
1310 pmd_populate(mm, pmd, pgtable);
1311 ret = 1;
1312 }
1313 spin_unlock(&mm->page_table_lock);
1314
1315 return ret;
1316}
1317
1318/* must be called with anon_vma->root->lock held */
1319static void __split_huge_page(struct page *page,
1320 struct anon_vma *anon_vma)
1321{
1322 int mapcount, mapcount2;
1323 struct anon_vma_chain *avc;
1324
1325 BUG_ON(!PageHead(page));
1326 BUG_ON(PageTail(page));
1327
1328 mapcount = 0;
1329 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1330 struct vm_area_struct *vma = avc->vma;
1331 unsigned long addr = vma_address(page, vma);
1332 BUG_ON(is_vma_temporary_stack(vma));
1333 if (addr == -EFAULT)
1334 continue;
1335 mapcount += __split_huge_page_splitting(page, vma, addr);
1336 }
1337 /*
1338 * It is critical that new vmas are added to the tail of the
1339 * anon_vma list. This guarantees that if copy_huge_pmd() runs
1340 * and establishes a child pmd before
1341 * __split_huge_page_splitting() freezes the parent pmd (so if
1342 * we fail to prevent copy_huge_pmd() from running until the
1343 * whole __split_huge_page() is complete), we will still see
1344 * the newly established pmd of the child later during the
1345 * walk, to be able to set it as pmd_trans_splitting too.
1346 */
1347 if (mapcount != page_mapcount(page))
1348 printk(KERN_ERR "mapcount %d page_mapcount %d\n",
1349 mapcount, page_mapcount(page));
1350 BUG_ON(mapcount != page_mapcount(page));
1351
1352 __split_huge_page_refcount(page);
1353
1354 mapcount2 = 0;
1355 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1356 struct vm_area_struct *vma = avc->vma;
1357 unsigned long addr = vma_address(page, vma);
1358 BUG_ON(is_vma_temporary_stack(vma));
1359 if (addr == -EFAULT)
1360 continue;
1361 mapcount2 += __split_huge_page_map(page, vma, addr);
1362 }
1363 if (mapcount != mapcount2)
1364 printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
1365 mapcount, mapcount2, page_mapcount(page));
1366 BUG_ON(mapcount != mapcount2);
1367}
1368
1369int split_huge_page(struct page *page)
1370{
1371 struct anon_vma *anon_vma;
1372 int ret = 1;
1373
1374 BUG_ON(!PageAnon(page));
1375 anon_vma = page_lock_anon_vma(page);
1376 if (!anon_vma)
1377 goto out;
1378 ret = 0;
1379 if (!PageCompound(page))
1380 goto out_unlock;
1381
1382 BUG_ON(!PageSwapBacked(page));
1383 __split_huge_page(page, anon_vma);
1384
1385 BUG_ON(PageCompound(page));
1386out_unlock:
1387 page_unlock_anon_vma(anon_vma);
1388out:
1389 return ret;
1390}
1391
1392int hugepage_madvise(struct vm_area_struct *vma,
1393 unsigned long *vm_flags, int advice)
1394{
1395 switch (advice) {
1396 case MADV_HUGEPAGE:
1397 /*
1398 * Be somewhat over-protective like KSM for now!
1399 */
1400 if (*vm_flags & (VM_HUGEPAGE |
1401 VM_SHARED | VM_MAYSHARE |
1402 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1403 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1404 VM_MIXEDMAP | VM_SAO))
1405 return -EINVAL;
1406 *vm_flags &= ~VM_NOHUGEPAGE;
1407 *vm_flags |= VM_HUGEPAGE;
1408 /*
1409		 * If the vma becomes good for khugepaged to scan,
1410		 * register it here without waiting for a page fault that
1411 * may not happen any time soon.
1412 */
1413 if (unlikely(khugepaged_enter_vma_merge(vma)))
1414 return -ENOMEM;
1415 break;
1416 case MADV_NOHUGEPAGE:
1417 /*
1418 * Be somewhat over-protective like KSM for now!
1419 */
1420 if (*vm_flags & (VM_NOHUGEPAGE |
1421 VM_SHARED | VM_MAYSHARE |
1422 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1423 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1424 VM_MIXEDMAP | VM_SAO))
1425 return -EINVAL;
1426 *vm_flags &= ~VM_HUGEPAGE;
1427 *vm_flags |= VM_NOHUGEPAGE;
1428 /*
1429 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
1430 * this vma even if we leave the mm registered in khugepaged if
1431 * it got registered before VM_NOHUGEPAGE was set.
1432 */
1433 break;
1434 }
1435
1436 return 0;
1437}
1438
1439static int __init khugepaged_slab_init(void)
1440{
1441 mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
1442 sizeof(struct mm_slot),
1443 __alignof__(struct mm_slot), 0, NULL);
1444 if (!mm_slot_cache)
1445 return -ENOMEM;
1446
1447 return 0;
1448}
1449
1450static void __init khugepaged_slab_free(void)
1451{
1452 kmem_cache_destroy(mm_slot_cache);
1453 mm_slot_cache = NULL;
1454}
1455
1456static inline struct mm_slot *alloc_mm_slot(void)
1457{
1458 if (!mm_slot_cache) /* initialization failed */
1459 return NULL;
1460 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
1461}
1462
1463static inline void free_mm_slot(struct mm_slot *mm_slot)
1464{
1465 kmem_cache_free(mm_slot_cache, mm_slot);
1466}
1467
1468static int __init mm_slots_hash_init(void)
1469{
1470 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
1471 GFP_KERNEL);
1472 if (!mm_slots_hash)
1473 return -ENOMEM;
1474 return 0;
1475}
1476
1477#if 0
1478static void __init mm_slots_hash_free(void)
1479{
1480 kfree(mm_slots_hash);
1481 mm_slots_hash = NULL;
1482}
1483#endif
1484
1485static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1486{
1487 struct mm_slot *mm_slot;
1488 struct hlist_head *bucket;
1489 struct hlist_node *node;
1490
1491 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1492 % MM_SLOTS_HASH_HEADS];
1493 hlist_for_each_entry(mm_slot, node, bucket, hash) {
1494 if (mm == mm_slot->mm)
1495 return mm_slot;
1496 }
1497 return NULL;
1498}
1499
1500static void insert_to_mm_slots_hash(struct mm_struct *mm,
1501 struct mm_slot *mm_slot)
1502{
1503 struct hlist_head *bucket;
1504
1505 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1506 % MM_SLOTS_HASH_HEADS];
1507 mm_slot->mm = mm;
1508 hlist_add_head(&mm_slot->hash, bucket);
1509}
1510
1511static inline int khugepaged_test_exit(struct mm_struct *mm)
1512{
1513 return atomic_read(&mm->mm_users) == 0;
1514}
1515
1516int __khugepaged_enter(struct mm_struct *mm)
1517{
1518 struct mm_slot *mm_slot;
1519 int wakeup;
1520
1521 mm_slot = alloc_mm_slot();
1522 if (!mm_slot)
1523 return -ENOMEM;
1524
1525 /* __khugepaged_exit() must not run from under us */
1526 VM_BUG_ON(khugepaged_test_exit(mm));
1527 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
1528 free_mm_slot(mm_slot);
1529 return 0;
1530 }
1531
1532 spin_lock(&khugepaged_mm_lock);
1533 insert_to_mm_slots_hash(mm, mm_slot);
1534 /*
1535 * Insert just behind the scanning cursor, to let the area settle
1536 * down a little.
1537 */
1538 wakeup = list_empty(&khugepaged_scan.mm_head);
1539 list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
1540 spin_unlock(&khugepaged_mm_lock);
1541
1542 atomic_inc(&mm->mm_count);
1543 if (wakeup)
1544 wake_up_interruptible(&khugepaged_wait);
1545
1546 return 0;
1547}
1548
1549int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
1550{
1551 unsigned long hstart, hend;
1552 if (!vma->anon_vma)
1553 /*
1554 * Not yet faulted in so we will register later in the
1555 * page fault if needed.
1556 */
1557 return 0;
1558 if (vma->vm_file || vma->vm_ops)
1559 /* khugepaged not yet working on file or special mappings */
1560 return 0;
1561 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
1562 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1563 hend = vma->vm_end & HPAGE_PMD_MASK;
1564 if (hstart < hend)
1565 return khugepaged_enter(vma);
1566 return 0;
1567}
1568
1569void __khugepaged_exit(struct mm_struct *mm)
1570{
1571 struct mm_slot *mm_slot;
1572 int free = 0;
1573
1574 spin_lock(&khugepaged_mm_lock);
1575 mm_slot = get_mm_slot(mm);
1576 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
1577 hlist_del(&mm_slot->hash);
1578 list_del(&mm_slot->mm_node);
1579 free = 1;
1580 }
1581
1582 if (free) {
1583 spin_unlock(&khugepaged_mm_lock);
1584 clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1585 free_mm_slot(mm_slot);
1586 mmdrop(mm);
1587 } else if (mm_slot) {
1588 spin_unlock(&khugepaged_mm_lock);
1589 /*
1590 * This is required to serialize against
1591 * khugepaged_test_exit() (which is guaranteed to run
1592 * under mmap sem read mode). Stop here (after we
1593		 * return, all pagetables will be destroyed) until
1594 * khugepaged has finished working on the pagetables
1595 * under the mmap_sem.
1596 */
1597 down_write(&mm->mmap_sem);
1598 up_write(&mm->mmap_sem);
1599 } else
1600 spin_unlock(&khugepaged_mm_lock);
1601}
1602
1603static void release_pte_page(struct page *page)
1604{
1605 /* 0 stands for page_is_file_cache(page) == false */
1606 dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
1607 unlock_page(page);
1608 putback_lru_page(page);
1609}
1610
1611static void release_pte_pages(pte_t *pte, pte_t *_pte)
1612{
1613 while (--_pte >= pte) {
1614 pte_t pteval = *_pte;
1615 if (!pte_none(pteval))
1616 release_pte_page(pte_page(pteval));
1617 }
1618}
1619
1620static void release_all_pte_pages(pte_t *pte)
1621{
1622 release_pte_pages(pte, pte + HPAGE_PMD_NR);
1623}
1624
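/*
 * Validate and isolate the small pages that are about to be collapsed:
 * every present pte must map a writable, anonymous, unpinned page, at
 * most khugepaged_max_ptes_none ptes may be none, and at least one pte
 * or page must be young/referenced.  Returns 1 if all pages were
 * locked and isolated from the LRU.
 */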
1625static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
1626 unsigned long address,
1627 pte_t *pte)
1628{
1629 struct page *page;
1630 pte_t *_pte;
1631 int referenced = 0, isolated = 0, none = 0;
1632 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
1633 _pte++, address += PAGE_SIZE) {
1634 pte_t pteval = *_pte;
1635 if (pte_none(pteval)) {
1636 if (++none <= khugepaged_max_ptes_none)
1637 continue;
1638 else {
1639 release_pte_pages(pte, _pte);
1640 goto out;
1641 }
1642 }
1643 if (!pte_present(pteval) || !pte_write(pteval)) {
1644 release_pte_pages(pte, _pte);
1645 goto out;
1646 }
1647 page = vm_normal_page(vma, address, pteval);
1648 if (unlikely(!page)) {
1649 release_pte_pages(pte, _pte);
1650 goto out;
1651 }
1652 VM_BUG_ON(PageCompound(page));
1653 BUG_ON(!PageAnon(page));
1654 VM_BUG_ON(!PageSwapBacked(page));
1655
1656 /* cannot use mapcount: can't collapse if there's a gup pin */
1657 if (page_count(page) != 1) {
1658 release_pte_pages(pte, _pte);
1659 goto out;
1660 }
1661 /*
1662 * We can do it before isolate_lru_page because the
1663 * page can't be freed from under us. NOTE: PG_lock
1664 * is needed to serialize against split_huge_page
1665 * when invoked from the VM.
1666 */
1667 if (!trylock_page(page)) {
1668 release_pte_pages(pte, _pte);
1669 goto out;
1670 }
1671 /*
1672 * Isolate the page to avoid collapsing an hugepage
1673 * currently in use by the VM.
1674 */
1675 if (isolate_lru_page(page)) {
1676 unlock_page(page);
1677 release_pte_pages(pte, _pte);
1678 goto out;
1679 }
1680 /* 0 stands for page_is_file_cache(page) == false */
1681 inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
1682 VM_BUG_ON(!PageLocked(page));
1683 VM_BUG_ON(PageLRU(page));
1684
1685 /* If no mapped pte is young, don't collapse the page */
1686 if (pte_young(pteval) || PageReferenced(page) ||
1687 mmu_notifier_test_young(vma->vm_mm, address))
1688 referenced = 1;
1689 }
1690 if (unlikely(!referenced))
1691 release_all_pte_pages(pte);
1692 else
1693 isolated = 1;
1694out:
1695 return isolated;
1696}
1697
1698static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
1699 struct vm_area_struct *vma,
1700 unsigned long address,
1701 spinlock_t *ptl)
1702{
1703 pte_t *_pte;
1704 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
1705 pte_t pteval = *_pte;
1706 struct page *src_page;
1707
1708 if (pte_none(pteval)) {
1709 clear_user_highpage(page, address);
1710 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
1711 } else {
1712 src_page = pte_page(pteval);
1713 copy_user_highpage(page, src_page, address, vma);
1714 VM_BUG_ON(page_mapcount(src_page) != 1);
1715 VM_BUG_ON(page_count(src_page) != 2);
1716 release_pte_page(src_page);
1717 /*
1718 * ptl mostly unnecessary, but preempt has to
1719 * be disabled to update the per-cpu stats
1720 * inside page_remove_rmap().
1721 */
1722 spin_lock(ptl);
1723 /*
1724 * paravirt calls inside pte_clear here are
1725 * superfluous.
1726 */
1727 pte_clear(vma->vm_mm, address, _pte);
1728 page_remove_rmap(src_page);
1729 spin_unlock(ptl);
1730 free_page_and_swap_cache(src_page);
1731 }
1732
1733 address += PAGE_SIZE;
1734 page++;
1735 }
1736}
1737
1738static void collapse_huge_page(struct mm_struct *mm,
1739 unsigned long address,
1740 struct page **hpage,
1741 struct vm_area_struct *vma)
1742{
1743 pgd_t *pgd;
1744 pud_t *pud;
1745 pmd_t *pmd, _pmd;
1746 pte_t *pte;
1747 pgtable_t pgtable;
1748 struct page *new_page;
1749 spinlock_t *ptl;
1750 int isolated;
1751 unsigned long hstart, hend;
1752
1753 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1754#ifndef CONFIG_NUMA
1755 VM_BUG_ON(!*hpage);
1756 new_page = *hpage;
1757#else
1758 VM_BUG_ON(*hpage);
1759 /*
1760 * Allocate the page while the vma is still valid and under
1761 * the mmap_sem read mode so there is no memory allocation
1762	 * later when we take the mmap_sem in write mode. This is
1763	 * friendlier behavior (OTOH it may actually hide bugs) towards
1764	 * filesystems in userland whose daemons allocate memory in
1765	 * the userland I/O paths. Allocating memory with the
1766	 * mmap_sem held in read mode is also a good idea to allow
1767	 * greater scalability.
1768 */
1769 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
1770 if (unlikely(!new_page)) {
1771 up_read(&mm->mmap_sem);
1772 *hpage = ERR_PTR(-ENOMEM);
1773 return;
1774 }
1775#endif
1776 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
1777 up_read(&mm->mmap_sem);
1778 put_page(new_page);
1779 return;
1780 }
1781
1782 /* after allocating the hugepage upgrade to mmap_sem write mode */
1783 up_read(&mm->mmap_sem);
1784
1785 /*
1786 * Prevent all access to pagetables with the exception of
1787	 * gup_fast (later handled by the ptep_clear_flush) and the VM
1788 * handled by the anon_vma lock + PG_lock.
1789 */
1790 down_write(&mm->mmap_sem);
1791 if (unlikely(khugepaged_test_exit(mm)))
1792 goto out;
1793
1794 vma = find_vma(mm, address);
1795 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1796 hend = vma->vm_end & HPAGE_PMD_MASK;
1797 if (address < hstart || address + HPAGE_PMD_SIZE > hend)
1798 goto out;
1799
1800 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
1801 (vma->vm_flags & VM_NOHUGEPAGE))
1802 goto out;
1803
1804 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
1805 if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
1806 goto out;
1807 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
1808
1809 pgd = pgd_offset(mm, address);
1810 if (!pgd_present(*pgd))
1811 goto out;
1812
1813 pud = pud_offset(pgd, address);
1814 if (!pud_present(*pud))
1815 goto out;
1816
1817 pmd = pmd_offset(pud, address);
1818 /* pmd can't go away or become huge under us */
1819 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1820 goto out;
1821
1822 anon_vma_lock(vma->anon_vma);
1823
1824 pte = pte_offset_map(pmd, address);
1825 ptl = pte_lockptr(mm, pmd);
1826
1827 spin_lock(&mm->page_table_lock); /* probably unnecessary */
1828 /*
1829 * After this gup_fast can't run anymore. This also removes
1830 * any huge TLB entry from the CPU so we won't allow
1831 * huge and small TLB entries for the same virtual address
1832 * to avoid the risk of CPU bugs in that area.
1833 */
1834 _pmd = pmdp_clear_flush_notify(vma, address, pmd);
1835 spin_unlock(&mm->page_table_lock);
1836
1837 spin_lock(ptl);
1838 isolated = __collapse_huge_page_isolate(vma, address, pte);
1839 spin_unlock(ptl);
1840 pte_unmap(pte);
1841
1842 if (unlikely(!isolated)) {
1843 spin_lock(&mm->page_table_lock);
1844 BUG_ON(!pmd_none(*pmd));
1845 set_pmd_at(mm, address, pmd, _pmd);
1846 spin_unlock(&mm->page_table_lock);
1847 anon_vma_unlock(vma->anon_vma);
1848 mem_cgroup_uncharge_page(new_page);
1849 goto out;
1850 }
1851
1852 /*
1853 * All pages are isolated and locked so anon_vma rmap
1854 * can't run anymore.
1855 */
1856 anon_vma_unlock(vma->anon_vma);
1857
1858 __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
1859 __SetPageUptodate(new_page);
1860 pgtable = pmd_pgtable(_pmd);
1861 VM_BUG_ON(page_count(pgtable) != 1);
1862 VM_BUG_ON(page_mapcount(pgtable) != 0);
1863
1864 _pmd = mk_pmd(new_page, vma->vm_page_prot);
1865 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
1866 _pmd = pmd_mkhuge(_pmd);
1867
1868 /*
1869 * spin_lock() below is not the equivalent of smp_wmb(), so
1870	 * this is needed to prevent the __collapse_huge_page_copy()
1871	 * writes from becoming visible after the set_pmd_at() write.
1872 */
1873 smp_wmb();
1874
1875 spin_lock(&mm->page_table_lock);
1876 BUG_ON(!pmd_none(*pmd));
1877 page_add_new_anon_rmap(new_page, vma, address);
1878 set_pmd_at(mm, address, pmd, _pmd);
1879 update_mmu_cache(vma, address, _pmd);
1880 prepare_pmd_huge_pte(pgtable, mm);
1881 mm->nr_ptes--;
1882 spin_unlock(&mm->page_table_lock);
1883
1884#ifndef CONFIG_NUMA
1885 *hpage = NULL;
1886#endif
1887 khugepaged_pages_collapsed++;
1888out_up_write:
1889 up_write(&mm->mmap_sem);
1890 return;
1891
1892out:
1893#ifdef CONFIG_NUMA
1894 put_page(new_page);
1895#endif
1896 goto out_up_write;
1897}
1898
1899static int khugepaged_scan_pmd(struct mm_struct *mm,
1900 struct vm_area_struct *vma,
1901 unsigned long address,
1902 struct page **hpage)
1903{
1904 pgd_t *pgd;
1905 pud_t *pud;
1906 pmd_t *pmd;
1907 pte_t *pte, *_pte;
1908 int ret = 0, referenced = 0, none = 0;
1909 struct page *page;
1910 unsigned long _address;
1911 spinlock_t *ptl;
1912
1913 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1914
1915 pgd = pgd_offset(mm, address);
1916 if (!pgd_present(*pgd))
1917 goto out;
1918
1919 pud = pud_offset(pgd, address);
1920 if (!pud_present(*pud))
1921 goto out;
1922
1923 pmd = pmd_offset(pud, address);
1924 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1925 goto out;
1926
1927 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
1928 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
1929 _pte++, _address += PAGE_SIZE) {
1930 pte_t pteval = *_pte;
1931 if (pte_none(pteval)) {
1932 if (++none <= khugepaged_max_ptes_none)
1933 continue;
1934 else
1935 goto out_unmap;
1936 }
1937 if (!pte_present(pteval) || !pte_write(pteval))
1938 goto out_unmap;
1939 page = vm_normal_page(vma, _address, pteval);
1940 if (unlikely(!page))
1941 goto out_unmap;
1942 VM_BUG_ON(PageCompound(page));
1943 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
1944 goto out_unmap;
1945 /* cannot use mapcount: can't collapse if there's a gup pin */
1946 if (page_count(page) != 1)
1947 goto out_unmap;
1948 if (pte_young(pteval) || PageReferenced(page) ||
1949 mmu_notifier_test_young(vma->vm_mm, address))
1950 referenced = 1;
1951 }
1952 if (referenced)
1953 ret = 1;
1954out_unmap:
1955 pte_unmap_unlock(pte, ptl);
1956 if (ret)
1957 /* collapse_huge_page will return with the mmap_sem released */
1958 collapse_huge_page(mm, address, hpage, vma);
1959out:
1960 return ret;
1961}
1962
1963static void collect_mm_slot(struct mm_slot *mm_slot)
1964{
1965 struct mm_struct *mm = mm_slot->mm;
1966
1967 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
1968
1969 if (khugepaged_test_exit(mm)) {
1970 /* free mm_slot */
1971 hlist_del(&mm_slot->hash);
1972 list_del(&mm_slot->mm_node);
1973
1974 /*
1975 * Not strictly needed because the mm exited already.
1976 *
1977 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1978 */
1979
1980 /* khugepaged_mm_lock actually not necessary for the below */
1981 free_mm_slot(mm_slot);
1982 mmdrop(mm);
1983 }
1984}
1985
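/*
 * Scan up to @pages ptes of the mm at the current cursor position,
 * collapsing eligible pmd-aligned ranges as they are found.  Returns
 * the amount of progress made; the cursor is advanced (and the
 * mm_slot released) once the mm has been fully scanned or is exiting.
 * Called with khugepaged_mm_lock held, which is dropped and retaken.
 */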
1986static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
1987 struct page **hpage)
1988{
1989 struct mm_slot *mm_slot;
1990 struct mm_struct *mm;
1991 struct vm_area_struct *vma;
1992 int progress = 0;
1993
1994 VM_BUG_ON(!pages);
1995 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
1996
1997 if (khugepaged_scan.mm_slot)
1998 mm_slot = khugepaged_scan.mm_slot;
1999 else {
2000 mm_slot = list_entry(khugepaged_scan.mm_head.next,
2001 struct mm_slot, mm_node);
2002 khugepaged_scan.address = 0;
2003 khugepaged_scan.mm_slot = mm_slot;
2004 }
2005 spin_unlock(&khugepaged_mm_lock);
2006
2007 mm = mm_slot->mm;
2008 down_read(&mm->mmap_sem);
2009 if (unlikely(khugepaged_test_exit(mm)))
2010 vma = NULL;
2011 else
2012 vma = find_vma(mm, khugepaged_scan.address);
2013
2014 progress++;
2015 for (; vma; vma = vma->vm_next) {
2016 unsigned long hstart, hend;
2017
2018 cond_resched();
2019 if (unlikely(khugepaged_test_exit(mm))) {
2020 progress++;
2021 break;
2022 }
2023
2024 if ((!(vma->vm_flags & VM_HUGEPAGE) &&
2025 !khugepaged_always()) ||
2026 (vma->vm_flags & VM_NOHUGEPAGE)) {
2027 progress++;
2028 continue;
2029 }
2030
2031 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
2032 if (!vma->anon_vma || vma->vm_ops || vma->vm_file) {
2033 khugepaged_scan.address = vma->vm_end;
2034 progress++;
2035 continue;
2036 }
2037 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
2038
2039 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2040 hend = vma->vm_end & HPAGE_PMD_MASK;
2041 if (hstart >= hend) {
2042 progress++;
2043 continue;
2044 }
2045 if (khugepaged_scan.address < hstart)
2046 khugepaged_scan.address = hstart;
2047 if (khugepaged_scan.address > hend) {
2048 khugepaged_scan.address = hend + HPAGE_PMD_SIZE;
2049 progress++;
2050 continue;
2051 }
2052 BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2053
2054 while (khugepaged_scan.address < hend) {
2055 int ret;
2056 cond_resched();
2057 if (unlikely(khugepaged_test_exit(mm)))
2058 goto breakouterloop;
2059
2060 VM_BUG_ON(khugepaged_scan.address < hstart ||
2061 khugepaged_scan.address + HPAGE_PMD_SIZE >
2062 hend);
2063 ret = khugepaged_scan_pmd(mm, vma,
2064 khugepaged_scan.address,
2065 hpage);
2066 /* move to next address */
2067 khugepaged_scan.address += HPAGE_PMD_SIZE;
2068 progress += HPAGE_PMD_NR;
2069 if (ret)
2070 /* we released mmap_sem so break loop */
2071 goto breakouterloop_mmap_sem;
2072 if (progress >= pages)
2073 goto breakouterloop;
2074 }
2075 }
2076breakouterloop:
2077 up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
2078breakouterloop_mmap_sem:
2079
2080 spin_lock(&khugepaged_mm_lock);
2081 BUG_ON(khugepaged_scan.mm_slot != mm_slot);
2082 /*
2083 * Release the current mm_slot if this mm is about to die, or
2084 * if we scanned all vmas of this mm.
2085 */
2086 if (khugepaged_test_exit(mm) || !vma) {
2087 /*
2088 * Make sure that if mm_users is reaching zero while
2089 * khugepaged runs here, khugepaged_exit will find
2090 * mm_slot not pointing to the exiting mm.
2091 */
2092 if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
2093 khugepaged_scan.mm_slot = list_entry(
2094 mm_slot->mm_node.next,
2095 struct mm_slot, mm_node);
2096 khugepaged_scan.address = 0;
2097 } else {
2098 khugepaged_scan.mm_slot = NULL;
2099 khugepaged_full_scans++;
2100 }
2101
2102 collect_mm_slot(mm_slot);
2103 }
2104
2105 return progress;
2106}
2107
2108static int khugepaged_has_work(void)
2109{
2110 return !list_empty(&khugepaged_scan.mm_head) &&
2111 khugepaged_enabled();
2112}
2113
2114static int khugepaged_wait_event(void)
2115{
2116 return !list_empty(&khugepaged_scan.mm_head) ||
2117 !khugepaged_enabled();
2118}
2119
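/*
 * One scan pass of the khugepaged main loop: keep calling
 * khugepaged_scan_mm_slot() until khugepaged_pages_to_scan ptes have
 * been scanned, there is no more work, or the mm list head has been
 * passed twice.
 */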
2120static void khugepaged_do_scan(struct page **hpage)
2121{
2122 unsigned int progress = 0, pass_through_head = 0;
2123 unsigned int pages = khugepaged_pages_to_scan;
2124
2125 barrier(); /* write khugepaged_pages_to_scan to local stack */
2126
2127 while (progress < pages) {
2128 cond_resched();
2129
2130#ifndef CONFIG_NUMA
2131 if (!*hpage) {
2132 *hpage = alloc_hugepage(khugepaged_defrag());
2133 if (unlikely(!*hpage))
2134 break;
2135 }
2136#else
2137 if (IS_ERR(*hpage))
2138 break;
2139#endif
2140
2141 if (unlikely(kthread_should_stop() || freezing(current)))
2142 break;
2143
2144 spin_lock(&khugepaged_mm_lock);
2145 if (!khugepaged_scan.mm_slot)
2146 pass_through_head++;
2147 if (khugepaged_has_work() &&
2148 pass_through_head < 2)
2149 progress += khugepaged_scan_mm_slot(pages - progress,
2150 hpage);
2151 else
2152 progress = pages;
2153 spin_unlock(&khugepaged_mm_lock);
2154 }
2155}
2156
2157static void khugepaged_alloc_sleep(void)
2158{
2159 DEFINE_WAIT(wait);
2160 add_wait_queue(&khugepaged_wait, &wait);
2161 schedule_timeout_interruptible(
2162 msecs_to_jiffies(
2163 khugepaged_alloc_sleep_millisecs));
2164 remove_wait_queue(&khugepaged_wait, &wait);
2165}
2166
2167#ifndef CONFIG_NUMA
2168static struct page *khugepaged_alloc_hugepage(void)
2169{
2170 struct page *hpage;
2171
2172 do {
2173 hpage = alloc_hugepage(khugepaged_defrag());
2174 if (!hpage)
2175 khugepaged_alloc_sleep();
2176 } while (unlikely(!hpage) &&
2177 likely(khugepaged_enabled()));
2178 return hpage;
2179}
2180#endif
2181
2182static void khugepaged_loop(void)
2183{
2184 struct page *hpage;
2185
2186#ifdef CONFIG_NUMA
2187 hpage = NULL;
2188#endif
2189 while (likely(khugepaged_enabled())) {
2190#ifndef CONFIG_NUMA
2191 hpage = khugepaged_alloc_hugepage();
2192 if (unlikely(!hpage))
2193 break;
2194#else
2195 if (IS_ERR(hpage)) {
2196 khugepaged_alloc_sleep();
2197 hpage = NULL;
2198 }
2199#endif
2200
2201 khugepaged_do_scan(&hpage);
2202#ifndef CONFIG_NUMA
2203 if (hpage)
2204 put_page(hpage);
2205#endif
2206 try_to_freeze();
2207 if (unlikely(kthread_should_stop()))
2208 break;
2209 if (khugepaged_has_work()) {
2210 DEFINE_WAIT(wait);
2211 if (!khugepaged_scan_sleep_millisecs)
2212 continue;
2213 add_wait_queue(&khugepaged_wait, &wait);
2214 schedule_timeout_interruptible(
2215 msecs_to_jiffies(
2216 khugepaged_scan_sleep_millisecs));
2217 remove_wait_queue(&khugepaged_wait, &wait);
2218 } else if (khugepaged_enabled())
2219 wait_event_freezable(khugepaged_wait,
2220 khugepaged_wait_event());
2221 }
2222}
2223
2224static int khugepaged(void *none)
2225{
2226 struct mm_slot *mm_slot;
2227
2228 set_freezable();
2229 set_user_nice(current, 19);
2230
2231 /* serialize with start_khugepaged() */
2232 mutex_lock(&khugepaged_mutex);
2233
2234 for (;;) {
2235 mutex_unlock(&khugepaged_mutex);
2236 BUG_ON(khugepaged_thread != current);
2237 khugepaged_loop();
2238 BUG_ON(khugepaged_thread != current);
2239
2240 mutex_lock(&khugepaged_mutex);
2241 if (!khugepaged_enabled())
2242 break;
2243 if (unlikely(kthread_should_stop()))
2244 break;
2245 }
2246
2247 spin_lock(&khugepaged_mm_lock);
2248 mm_slot = khugepaged_scan.mm_slot;
2249 khugepaged_scan.mm_slot = NULL;
2250 if (mm_slot)
2251 collect_mm_slot(mm_slot);
2252 spin_unlock(&khugepaged_mm_lock);
2253
2254 khugepaged_thread = NULL;
2255 mutex_unlock(&khugepaged_mutex);
2256
2257 return 0;
2258}
2259
2260void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
2261{
2262 struct page *page;
2263
2264 spin_lock(&mm->page_table_lock);
2265 if (unlikely(!pmd_trans_huge(*pmd))) {
2266 spin_unlock(&mm->page_table_lock);
2267 return;
2268 }
2269 page = pmd_page(*pmd);
2270 VM_BUG_ON(!page_count(page));
2271 get_page(page);
2272 spin_unlock(&mm->page_table_lock);
2273
2274 split_huge_page(page);
2275
2276 put_page(page);
2277 BUG_ON(pmd_trans_huge(*pmd));
2278}
2279
2280static void split_huge_page_address(struct mm_struct *mm,
2281 unsigned long address)
2282{
2283 pgd_t *pgd;
2284 pud_t *pud;
2285 pmd_t *pmd;
2286
2287 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
2288
2289 pgd = pgd_offset(mm, address);
2290 if (!pgd_present(*pgd))
2291 return;
2292
2293 pud = pud_offset(pgd, address);
2294 if (!pud_present(*pud))
2295 return;
2296
2297 pmd = pmd_offset(pud, address);
2298 if (!pmd_present(*pmd))
2299 return;
2300 /*
2301	 * Caller holds mmap_sem in write mode, so a huge pmd cannot
2302 * materialize from under us.
2303 */
2304 split_huge_page_pmd(mm, pmd);
2305}
2306
2307void __vma_adjust_trans_huge(struct vm_area_struct *vma,
2308 unsigned long start,
2309 unsigned long end,
2310 long adjust_next)
2311{
2312 /*
2313	 * If the new start address isn't hpage aligned and it could
2314	 * previously have contained a hugepage, check whether we need
2315	 * to split a huge pmd.
2316 */
2317 if (start & ~HPAGE_PMD_MASK &&
2318 (start & HPAGE_PMD_MASK) >= vma->vm_start &&
2319 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2320 split_huge_page_address(vma->vm_mm, start);
2321
2322 /*
2323	 * If the new end address isn't hpage aligned and it could
2324	 * previously have contained a hugepage, check whether we need
2325	 * to split a huge pmd.
2326 */
2327 if (end & ~HPAGE_PMD_MASK &&
2328 (end & HPAGE_PMD_MASK) >= vma->vm_start &&
2329 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2330 split_huge_page_address(vma->vm_mm, end);
2331
2332 /*
2333	 * If we're also updating vma->vm_next->vm_start and the new
2334	 * vm_next->vm_start isn't page aligned and it could previously
2335	 * have contained a hugepage, check whether we need to split a huge pmd.
2336 */
2337 if (adjust_next > 0) {
2338 struct vm_area_struct *next = vma->vm_next;
2339 unsigned long nstart = next->vm_start;
2340 nstart += adjust_next << PAGE_SHIFT;
2341 if (nstart & ~HPAGE_PMD_MASK &&
2342 (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
2343 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
2344 split_huge_page_address(next->vm_mm, nstart);
2345 }
2346}
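
The hstart/hend computation at the top of the khugepaged scan loop above, and the mask checks in __vma_adjust_trans_huge, are plain round-up/round-down arithmetic on the PMD huge page size. A minimal stand-alone sketch of that arithmetic, assuming the common x86-64 value of 2 MiB for HPAGE_PMD_SIZE (the kernel derives it from HPAGE_PMD_SHIFT; the addresses below are made-up examples):

#include <stdio.h>
#include <stdint.h>

#define HPAGE_PMD_SIZE (2ULL * 1024 * 1024)    /* assumed 2 MiB (x86-64 default) */
#define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1))

int main(void)
{
	uint64_t vm_start = 0x7f0000001000ULL;  /* example range, not hpage aligned */
	uint64_t vm_end   = 0x7f0000a00000ULL;

	/* round the start up and the end down, as hstart/hend in the scan loop */
	uint64_t hstart = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	uint64_t hend   = vm_end & HPAGE_PMD_MASK;

	if (hstart >= hend)
		printf("range too small or misaligned for a huge pmd\n");
	else
		printf("huge-pmd candidate range: %#llx-%#llx (%llu huge pages)\n",
		       (unsigned long long)hstart, (unsigned long long)hend,
		       (unsigned long long)((hend - hstart) / HPAGE_PMD_SIZE));
	return 0;
}
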
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 85855240933d..bb0b7c128015 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -394,71 +394,6 @@ static int vma_has_reserves(struct vm_area_struct *vma)
394 return 0; 394 return 0;
395} 395}
396 396
397static void clear_gigantic_page(struct page *page,
398 unsigned long addr, unsigned long sz)
399{
400 int i;
401 struct page *p = page;
402
403 might_sleep();
404 for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
405 cond_resched();
406 clear_user_highpage(p, addr + i * PAGE_SIZE);
407 }
408}
409static void clear_huge_page(struct page *page,
410 unsigned long addr, unsigned long sz)
411{
412 int i;
413
414 if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
415 clear_gigantic_page(page, addr, sz);
416 return;
417 }
418
419 might_sleep();
420 for (i = 0; i < sz/PAGE_SIZE; i++) {
421 cond_resched();
422 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
423 }
424}
425
426static void copy_user_gigantic_page(struct page *dst, struct page *src,
427 unsigned long addr, struct vm_area_struct *vma)
428{
429 int i;
430 struct hstate *h = hstate_vma(vma);
431 struct page *dst_base = dst;
432 struct page *src_base = src;
433
434 for (i = 0; i < pages_per_huge_page(h); ) {
435 cond_resched();
436 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
437
438 i++;
439 dst = mem_map_next(dst, dst_base, i);
440 src = mem_map_next(src, src_base, i);
441 }
442}
443
444static void copy_user_huge_page(struct page *dst, struct page *src,
445 unsigned long addr, struct vm_area_struct *vma)
446{
447 int i;
448 struct hstate *h = hstate_vma(vma);
449
450 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
451 copy_user_gigantic_page(dst, src, addr, vma);
452 return;
453 }
454
455 might_sleep();
456 for (i = 0; i < pages_per_huge_page(h); i++) {
457 cond_resched();
458 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
459 }
460}
461
462static void copy_gigantic_page(struct page *dst, struct page *src) 397static void copy_gigantic_page(struct page *dst, struct page *src)
463{ 398{
464 int i; 399 int i;
@@ -1428,6 +1363,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj,
1428 1363
1429 return sprintf(buf, "%lu\n", nr_huge_pages); 1364 return sprintf(buf, "%lu\n", nr_huge_pages);
1430} 1365}
1366
1431static ssize_t nr_hugepages_store_common(bool obey_mempolicy, 1367static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1432 struct kobject *kobj, struct kobj_attribute *attr, 1368 struct kobject *kobj, struct kobj_attribute *attr,
1433 const char *buf, size_t len) 1369 const char *buf, size_t len)
@@ -1440,9 +1376,14 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1440 1376
1441 err = strict_strtoul(buf, 10, &count); 1377 err = strict_strtoul(buf, 10, &count);
1442 if (err) 1378 if (err)
1443 return 0; 1379 goto out;
1444 1380
1445 h = kobj_to_hstate(kobj, &nid); 1381 h = kobj_to_hstate(kobj, &nid);
1382 if (h->order >= MAX_ORDER) {
1383 err = -EINVAL;
1384 goto out;
1385 }
1386
1446 if (nid == NUMA_NO_NODE) { 1387 if (nid == NUMA_NO_NODE) {
1447 /* 1388 /*
1448 * global hstate attribute 1389 * global hstate attribute
@@ -1468,6 +1409,9 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1468 NODEMASK_FREE(nodes_allowed); 1409 NODEMASK_FREE(nodes_allowed);
1469 1410
1470 return len; 1411 return len;
1412out:
1413 NODEMASK_FREE(nodes_allowed);
1414 return err;
1471} 1415}
1472 1416
1473static ssize_t nr_hugepages_show(struct kobject *kobj, 1417static ssize_t nr_hugepages_show(struct kobject *kobj,
@@ -1510,6 +1454,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1510 struct hstate *h = kobj_to_hstate(kobj, NULL); 1454 struct hstate *h = kobj_to_hstate(kobj, NULL);
1511 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); 1455 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1512} 1456}
1457
1513static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, 1458static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1514 struct kobj_attribute *attr, const char *buf, size_t count) 1459 struct kobj_attribute *attr, const char *buf, size_t count)
1515{ 1460{
@@ -1517,9 +1462,12 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1517 unsigned long input; 1462 unsigned long input;
1518 struct hstate *h = kobj_to_hstate(kobj, NULL); 1463 struct hstate *h = kobj_to_hstate(kobj, NULL);
1519 1464
1465 if (h->order >= MAX_ORDER)
1466 return -EINVAL;
1467
1520 err = strict_strtoul(buf, 10, &input); 1468 err = strict_strtoul(buf, 10, &input);
1521 if (err) 1469 if (err)
1522 return 0; 1470 return err;
1523 1471
1524 spin_lock(&hugetlb_lock); 1472 spin_lock(&hugetlb_lock);
1525 h->nr_overcommit_huge_pages = input; 1473 h->nr_overcommit_huge_pages = input;
@@ -1922,13 +1870,19 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
1922{ 1870{
1923 struct hstate *h = &default_hstate; 1871 struct hstate *h = &default_hstate;
1924 unsigned long tmp; 1872 unsigned long tmp;
1873 int ret;
1925 1874
1926 if (!write) 1875 if (!write)
1927 tmp = h->max_huge_pages; 1876 tmp = h->max_huge_pages;
1928 1877
1878 if (write && h->order >= MAX_ORDER)
1879 return -EINVAL;
1880
1929 table->data = &tmp; 1881 table->data = &tmp;
1930 table->maxlen = sizeof(unsigned long); 1882 table->maxlen = sizeof(unsigned long);
1931 proc_doulongvec_minmax(table, write, buffer, length, ppos); 1883 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
1884 if (ret)
1885 goto out;
1932 1886
1933 if (write) { 1887 if (write) {
1934 NODEMASK_ALLOC(nodemask_t, nodes_allowed, 1888 NODEMASK_ALLOC(nodemask_t, nodes_allowed,
@@ -1943,8 +1897,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
1943 if (nodes_allowed != &node_states[N_HIGH_MEMORY]) 1897 if (nodes_allowed != &node_states[N_HIGH_MEMORY])
1944 NODEMASK_FREE(nodes_allowed); 1898 NODEMASK_FREE(nodes_allowed);
1945 } 1899 }
1946 1900out:
1947 return 0; 1901 return ret;
1948} 1902}
1949 1903
1950int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1904int hugetlb_sysctl_handler(struct ctl_table *table, int write,
@@ -1982,21 +1936,27 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1982{ 1936{
1983 struct hstate *h = &default_hstate; 1937 struct hstate *h = &default_hstate;
1984 unsigned long tmp; 1938 unsigned long tmp;
1939 int ret;
1985 1940
1986 if (!write) 1941 if (!write)
1987 tmp = h->nr_overcommit_huge_pages; 1942 tmp = h->nr_overcommit_huge_pages;
1988 1943
1944 if (write && h->order >= MAX_ORDER)
1945 return -EINVAL;
1946
1989 table->data = &tmp; 1947 table->data = &tmp;
1990 table->maxlen = sizeof(unsigned long); 1948 table->maxlen = sizeof(unsigned long);
1991 proc_doulongvec_minmax(table, write, buffer, length, ppos); 1949 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
1950 if (ret)
1951 goto out;
1992 1952
1993 if (write) { 1953 if (write) {
1994 spin_lock(&hugetlb_lock); 1954 spin_lock(&hugetlb_lock);
1995 h->nr_overcommit_huge_pages = tmp; 1955 h->nr_overcommit_huge_pages = tmp;
1996 spin_unlock(&hugetlb_lock); 1956 spin_unlock(&hugetlb_lock);
1997 } 1957 }
1998 1958out:
1999 return 0; 1959 return ret;
2000} 1960}
2001 1961
2002#endif /* CONFIG_SYSCTL */ 1962#endif /* CONFIG_SYSCTL */
@@ -2454,7 +2414,8 @@ retry_avoidcopy:
2454 return VM_FAULT_OOM; 2414 return VM_FAULT_OOM;
2455 } 2415 }
2456 2416
2457 copy_user_huge_page(new_page, old_page, address, vma); 2417 copy_user_huge_page(new_page, old_page, address, vma,
2418 pages_per_huge_page(h));
2458 __SetPageUptodate(new_page); 2419 __SetPageUptodate(new_page);
2459 2420
2460 /* 2421 /*
@@ -2558,7 +2519,7 @@ retry:
2558 ret = -PTR_ERR(page); 2519 ret = -PTR_ERR(page);
2559 goto out; 2520 goto out;
2560 } 2521 }
2561 clear_huge_page(page, address, huge_page_size(h)); 2522 clear_huge_page(page, address, pages_per_huge_page(h));
2562 __SetPageUptodate(page); 2523 __SetPageUptodate(page);
2563 2524
2564 if (vma->vm_flags & VM_MAYSHARE) { 2525 if (vma->vm_flags & VM_MAYSHARE) {
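
The hugetlb hunks above reject writes for gigantic hstates (h->order >= MAX_ORDER) and propagate parse and sysctl errors instead of silently returning success. From user space that means a rejected request to /proc/sys/vm/nr_hugepages is now reported through the write() return value. A rough sketch (requires root; whether the write succeeds depends on the configured default huge page size and on available contiguous memory):

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/proc/sys/vm/nr_hugepages";
	const char *val = "64\n";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, val, strlen(val)) < 0)
		/* e.g. EINVAL when the default hstate is a gigantic page */
		fprintf(stderr, "write %s: %s\n", path, strerror(errno));
	else
		printf("requested 64 persistent huge pages\n");
	close(fd);
	return 0;
}
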
diff --git a/mm/internal.h b/mm/internal.h
index dedb0aff673f..4c98630f0f77 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -39,6 +39,15 @@ static inline void __put_page(struct page *page)
39 39
40extern unsigned long highest_memmap_pfn; 40extern unsigned long highest_memmap_pfn;
41 41
42#ifdef CONFIG_SMP
43extern int putback_active_lru_page(struct zone *zone, struct page *page);
44#else
45static inline int putback_active_lru_page(struct zone *zone, struct page *page)
46{
47 return 0;
48}
49#endif
50
42/* 51/*
43 * in mm/vmscan.c: 52 * in mm/vmscan.c:
44 */ 53 */
@@ -134,6 +143,10 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
134 } 143 }
135} 144}
136 145
146#ifdef CONFIG_TRANSPARENT_HUGEPAGE
147extern unsigned long vma_address(struct page *page,
148 struct vm_area_struct *vma);
149#endif
137#else /* !CONFIG_MMU */ 150#else /* !CONFIG_MMU */
138static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) 151static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
139{ 152{
@@ -243,7 +256,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
243 256
244int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 257int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
245 unsigned long start, int len, unsigned int foll_flags, 258 unsigned long start, int len, unsigned int foll_flags,
246 struct page **pages, struct vm_area_struct **vmas); 259 struct page **pages, struct vm_area_struct **vmas,
260 int *nonblocking);
247 261
248#define ZONE_RECLAIM_NOSCAN -2 262#define ZONE_RECLAIM_NOSCAN -2
249#define ZONE_RECLAIM_FULL -1 263#define ZONE_RECLAIM_FULL -1
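
The prototype change to __get_user_pages() adds a nonblocking out-parameter through which the callee can report that it had to drop mmap_sem while faulting pages in. The snippet below is only a stand-alone illustration of that general pattern, an out-parameter telling the caller whether a lock survived the call; the names and the pthread lock are illustrative, not kernel API (build with -pthread):

#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Illustrative only: *still_locked reports whether the lock survived the
 * call; the callee may drop it to service a slow path, much as gup may
 * drop mmap_sem while blocking on a page fault.
 */
static int do_work(int slow_path, int *still_locked)
{
	*still_locked = 1;
	if (slow_path) {
		pthread_mutex_unlock(&lock);    /* released while "sleeping" */
		*still_locked = 0;
	}
	return 0;
}

int main(void)
{
	int still_locked;

	pthread_mutex_lock(&lock);
	do_work(1, &still_locked);
	if (still_locked)
		pthread_mutex_unlock(&lock);
	printf("lock %s across the call\n", still_locked ? "held" : "dropped");
	return 0;
}
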
diff --git a/mm/ksm.c b/mm/ksm.c
index 43bc893470b4..c2b2a94f9d67 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -34,6 +34,7 @@
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/ksm.h> 35#include <linux/ksm.h>
36#include <linux/hash.h> 36#include <linux/hash.h>
37#include <linux/freezer.h>
37 38
38#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
39#include "internal.h" 40#include "internal.h"
@@ -411,6 +412,20 @@ out:
411 up_read(&mm->mmap_sem); 412 up_read(&mm->mmap_sem);
412} 413}
413 414
415static struct page *page_trans_compound_anon(struct page *page)
416{
417 if (PageTransCompound(page)) {
418 struct page *head = compound_trans_head(page);
419 /*
420	 * head may actually be split and freed from under
421 * us but it's ok here.
422 */
423 if (PageAnon(head))
424 return head;
425 }
426 return NULL;
427}
428
414static struct page *get_mergeable_page(struct rmap_item *rmap_item) 429static struct page *get_mergeable_page(struct rmap_item *rmap_item)
415{ 430{
416 struct mm_struct *mm = rmap_item->mm; 431 struct mm_struct *mm = rmap_item->mm;
@@ -430,7 +445,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
430 page = follow_page(vma, addr, FOLL_GET); 445 page = follow_page(vma, addr, FOLL_GET);
431 if (IS_ERR_OR_NULL(page)) 446 if (IS_ERR_OR_NULL(page))
432 goto out; 447 goto out;
433 if (PageAnon(page)) { 448 if (PageAnon(page) || page_trans_compound_anon(page)) {
434 flush_anon_page(vma, page, addr); 449 flush_anon_page(vma, page, addr);
435 flush_dcache_page(page); 450 flush_dcache_page(page);
436 } else { 451 } else {
@@ -708,6 +723,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
708 if (addr == -EFAULT) 723 if (addr == -EFAULT)
709 goto out; 724 goto out;
710 725
726 BUG_ON(PageTransCompound(page));
711 ptep = page_check_address(page, mm, addr, &ptl, 0); 727 ptep = page_check_address(page, mm, addr, &ptl, 0);
712 if (!ptep) 728 if (!ptep)
713 goto out; 729 goto out;
@@ -783,6 +799,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
783 goto out; 799 goto out;
784 800
785 pmd = pmd_offset(pud, addr); 801 pmd = pmd_offset(pud, addr);
802 BUG_ON(pmd_trans_huge(*pmd));
786 if (!pmd_present(*pmd)) 803 if (!pmd_present(*pmd))
787 goto out; 804 goto out;
788 805
@@ -800,6 +817,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
800 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); 817 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
801 818
802 page_remove_rmap(page); 819 page_remove_rmap(page);
820 if (!page_mapped(page))
821 try_to_free_swap(page);
803 put_page(page); 822 put_page(page);
804 823
805 pte_unmap_unlock(ptep, ptl); 824 pte_unmap_unlock(ptep, ptl);
@@ -808,6 +827,33 @@ out:
808 return err; 827 return err;
809} 828}
810 829
830static int page_trans_compound_anon_split(struct page *page)
831{
832 int ret = 0;
833 struct page *transhuge_head = page_trans_compound_anon(page);
834 if (transhuge_head) {
835 /* Get the reference on the head to split it. */
836 if (get_page_unless_zero(transhuge_head)) {
837 /*
838 * Recheck we got the reference while the head
839 * was still anonymous.
840 */
841 if (PageAnon(transhuge_head))
842 ret = split_huge_page(transhuge_head);
843 else
844 /*
845	 * Retry later if split_huge_page ran
846 * from under us.
847 */
848 ret = 1;
849 put_page(transhuge_head);
850 } else
851	 /* Retry later if split_huge_page ran from under us. */
852 ret = 1;
853 }
854 return ret;
855}
856
811/* 857/*
812 * try_to_merge_one_page - take two pages and merge them into one 858 * try_to_merge_one_page - take two pages and merge them into one
813 * @vma: the vma that holds the pte pointing to page 859 * @vma: the vma that holds the pte pointing to page
@@ -828,6 +874,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
828 874
829 if (!(vma->vm_flags & VM_MERGEABLE)) 875 if (!(vma->vm_flags & VM_MERGEABLE))
830 goto out; 876 goto out;
877 if (PageTransCompound(page) && page_trans_compound_anon_split(page))
878 goto out;
879 BUG_ON(PageTransCompound(page));
831 if (!PageAnon(page)) 880 if (!PageAnon(page))
832 goto out; 881 goto out;
833 882
@@ -1247,6 +1296,18 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1247 1296
1248 slot = ksm_scan.mm_slot; 1297 slot = ksm_scan.mm_slot;
1249 if (slot == &ksm_mm_head) { 1298 if (slot == &ksm_mm_head) {
1299 /*
1300 * A number of pages can hang around indefinitely on per-cpu
1301 * pagevecs, raised page count preventing write_protect_page
1302 * from merging them. Though it doesn't really matter much,
1303 * it is puzzling to see some stuck in pages_volatile until
1304 * other activity jostles them out, and they also prevented
1305 * LTP's KSM test from succeeding deterministically; so drain
1306 * them here (here rather than on entry to ksm_do_scan(),
1307 * so we don't IPI too often when pages_to_scan is set low).
1308 */
1309 lru_add_drain_all();
1310
1250 root_unstable_tree = RB_ROOT; 1311 root_unstable_tree = RB_ROOT;
1251 1312
1252 spin_lock(&ksm_mmlist_lock); 1313 spin_lock(&ksm_mmlist_lock);
@@ -1277,7 +1338,13 @@ next_mm:
1277 if (ksm_test_exit(mm)) 1338 if (ksm_test_exit(mm))
1278 break; 1339 break;
1279 *page = follow_page(vma, ksm_scan.address, FOLL_GET); 1340 *page = follow_page(vma, ksm_scan.address, FOLL_GET);
1280 if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) { 1341 if (IS_ERR_OR_NULL(*page)) {
1342 ksm_scan.address += PAGE_SIZE;
1343 cond_resched();
1344 continue;
1345 }
1346 if (PageAnon(*page) ||
1347 page_trans_compound_anon(*page)) {
1281 flush_anon_page(vma, *page, ksm_scan.address); 1348 flush_anon_page(vma, *page, ksm_scan.address);
1282 flush_dcache_page(*page); 1349 flush_dcache_page(*page);
1283 rmap_item = get_next_rmap_item(slot, 1350 rmap_item = get_next_rmap_item(slot,
@@ -1291,8 +1358,7 @@ next_mm:
1291 up_read(&mm->mmap_sem); 1358 up_read(&mm->mmap_sem);
1292 return rmap_item; 1359 return rmap_item;
1293 } 1360 }
1294 if (!IS_ERR_OR_NULL(*page)) 1361 put_page(*page);
1295 put_page(*page);
1296 ksm_scan.address += PAGE_SIZE; 1362 ksm_scan.address += PAGE_SIZE;
1297 cond_resched(); 1363 cond_resched();
1298 } 1364 }
@@ -1352,7 +1418,7 @@ static void ksm_do_scan(unsigned int scan_npages)
1352 struct rmap_item *rmap_item; 1418 struct rmap_item *rmap_item;
1353 struct page *uninitialized_var(page); 1419 struct page *uninitialized_var(page);
1354 1420
1355 while (scan_npages--) { 1421 while (scan_npages-- && likely(!freezing(current))) {
1356 cond_resched(); 1422 cond_resched();
1357 rmap_item = scan_get_next_rmap_item(&page); 1423 rmap_item = scan_get_next_rmap_item(&page);
1358 if (!rmap_item) 1424 if (!rmap_item)
@@ -1370,6 +1436,7 @@ static int ksmd_should_run(void)
1370 1436
1371static int ksm_scan_thread(void *nothing) 1437static int ksm_scan_thread(void *nothing)
1372{ 1438{
1439 set_freezable();
1373 set_user_nice(current, 5); 1440 set_user_nice(current, 5);
1374 1441
1375 while (!kthread_should_stop()) { 1442 while (!kthread_should_stop()) {
@@ -1378,11 +1445,13 @@ static int ksm_scan_thread(void *nothing)
1378 ksm_do_scan(ksm_thread_pages_to_scan); 1445 ksm_do_scan(ksm_thread_pages_to_scan);
1379 mutex_unlock(&ksm_thread_mutex); 1446 mutex_unlock(&ksm_thread_mutex);
1380 1447
1448 try_to_freeze();
1449
1381 if (ksmd_should_run()) { 1450 if (ksmd_should_run()) {
1382 schedule_timeout_interruptible( 1451 schedule_timeout_interruptible(
1383 msecs_to_jiffies(ksm_thread_sleep_millisecs)); 1452 msecs_to_jiffies(ksm_thread_sleep_millisecs));
1384 } else { 1453 } else {
1385 wait_event_interruptible(ksm_thread_wait, 1454 wait_event_freezable(ksm_thread_wait,
1386 ksmd_should_run() || kthread_should_stop()); 1455 ksmd_should_run() || kthread_should_stop());
1387 } 1456 }
1388 } 1457 }
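
The KSM changes above let ksmd handle anonymous pages that are part of a transparent huge page: page_trans_compound_anon_split() splits the compound page before the normal merge path runs, so MADV_MERGEABLE regions no longer need to avoid THP. A minimal user-space sketch of registering a region with KSM, assuming CONFIG_KSM is enabled and the daemon has been started separately (echo 1 > /sys/kernel/mm/ksm/run):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16 * 1024 * 1024;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(buf, 0xaa, len);                 /* identical content -> mergeable */
	if (madvise(buf, len, MADV_MERGEABLE))
		perror("madvise(MADV_MERGEABLE)");
	else
		printf("registered %zu bytes with KSM\n", len);
	pause();                                /* give ksmd time to scan and merge */
	return 0;
}
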
diff --git a/mm/madvise.c b/mm/madvise.c
index 319528b8db74..2221491ed503 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -71,6 +71,12 @@ static long madvise_behavior(struct vm_area_struct * vma,
71 if (error) 71 if (error)
72 goto out; 72 goto out;
73 break; 73 break;
74 case MADV_HUGEPAGE:
75 case MADV_NOHUGEPAGE:
76 error = hugepage_madvise(vma, &new_flags, behavior);
77 if (error)
78 goto out;
79 break;
74 } 80 }
75 81
76 if (new_flags == vma->vm_flags) { 82 if (new_flags == vma->vm_flags) {
@@ -283,6 +289,10 @@ madvise_behavior_valid(int behavior)
283 case MADV_MERGEABLE: 289 case MADV_MERGEABLE:
284 case MADV_UNMERGEABLE: 290 case MADV_UNMERGEABLE:
285#endif 291#endif
292#ifdef CONFIG_TRANSPARENT_HUGEPAGE
293 case MADV_HUGEPAGE:
294 case MADV_NOHUGEPAGE:
295#endif
286 return 1; 296 return 1;
287 297
288 default: 298 default:
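
The madvise() hunk wires up the user-visible side of this series: MADV_HUGEPAGE and MADV_NOHUGEPAGE let a process opt a mapping into or out of transparent huge pages. A minimal sketch, assuming the libc headers expose the two constants and the kernel was built with CONFIG_TRANSPARENT_HUGEPAGE; growth of AnonHugePages in /proc/self/smaps is one way to observe the effect:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64 * 1024 * 1024;
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	if (madvise(buf, len, MADV_HUGEPAGE))   /* ask for huge pmd backing */
		perror("madvise(MADV_HUGEPAGE)");
	memset(buf, 0, len);                    /* touch the range to fault it in */
	if (madvise(buf, len, MADV_NOHUGEPAGE)) /* opt this range back out */
		perror("madvise(MADV_NOHUGEPAGE)");
	munmap(buf, len);
	return 0;
}
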
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 00bb8a64d028..8ab841031436 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -292,7 +292,6 @@ static struct move_charge_struct {
292 unsigned long moved_charge; 292 unsigned long moved_charge;
293 unsigned long moved_swap; 293 unsigned long moved_swap;
294 struct task_struct *moving_task; /* a task moving charges */ 294 struct task_struct *moving_task; /* a task moving charges */
295 struct mm_struct *mm;
296 wait_queue_head_t waitq; /* a waitq for other context */ 295 wait_queue_head_t waitq; /* a waitq for other context */
297} mc = { 296} mc = {
298 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 297 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
@@ -821,7 +820,6 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
821 return; 820 return;
822 VM_BUG_ON(list_empty(&pc->lru)); 821 VM_BUG_ON(list_empty(&pc->lru));
823 list_del_init(&pc->lru); 822 list_del_init(&pc->lru);
824 return;
825} 823}
826 824
827void mem_cgroup_del_lru(struct page *page) 825void mem_cgroup_del_lru(struct page *page)
@@ -1087,7 +1085,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1087 case 0: 1085 case 0:
1088 list_move(&page->lru, dst); 1086 list_move(&page->lru, dst);
1089 mem_cgroup_del_lru(page); 1087 mem_cgroup_del_lru(page);
1090 nr_taken++; 1088 nr_taken += hpage_nr_pages(page);
1091 break; 1089 break;
1092 case -EBUSY: 1090 case -EBUSY:
1093 /* we don't affect global LRU but rotate in our LRU */ 1091 /* we don't affect global LRU but rotate in our LRU */
@@ -1312,8 +1310,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1312 u64 limit; 1310 u64 limit;
1313 u64 memsw; 1311 u64 memsw;
1314 1312
1315 limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + 1313 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1316 total_swap_pages; 1314 limit += total_swap_pages << PAGE_SHIFT;
1315
1317 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1316 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1318 /* 1317 /*
1319 * If memsw is finite and limits the amount of swap space available 1318 * If memsw is finite and limits the amount of swap space available
@@ -1600,11 +1599,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1600 * possibility of race condition. If there is, we take a lock. 1599 * possibility of race condition. If there is, we take a lock.
1601 */ 1600 */
1602 1601
1603static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) 1602void mem_cgroup_update_page_stat(struct page *page,
1603 enum mem_cgroup_page_stat_item idx, int val)
1604{ 1604{
1605 struct mem_cgroup *mem; 1605 struct mem_cgroup *mem;
1606 struct page_cgroup *pc = lookup_page_cgroup(page); 1606 struct page_cgroup *pc = lookup_page_cgroup(page);
1607 bool need_unlock = false; 1607 bool need_unlock = false;
1608 unsigned long uninitialized_var(flags);
1608 1609
1609 if (unlikely(!pc)) 1610 if (unlikely(!pc))
1610 return; 1611 return;
@@ -1616,37 +1617,34 @@ static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
1616 /* pc->mem_cgroup is unstable ? */ 1617 /* pc->mem_cgroup is unstable ? */
1617 if (unlikely(mem_cgroup_stealed(mem))) { 1618 if (unlikely(mem_cgroup_stealed(mem))) {
1618 /* take a lock against to access pc->mem_cgroup */ 1619 /* take a lock against to access pc->mem_cgroup */
1619 lock_page_cgroup(pc); 1620 move_lock_page_cgroup(pc, &flags);
1620 need_unlock = true; 1621 need_unlock = true;
1621 mem = pc->mem_cgroup; 1622 mem = pc->mem_cgroup;
1622 if (!mem || !PageCgroupUsed(pc)) 1623 if (!mem || !PageCgroupUsed(pc))
1623 goto out; 1624 goto out;
1624 } 1625 }
1625 1626
1626 this_cpu_add(mem->stat->count[idx], val);
1627
1628 switch (idx) { 1627 switch (idx) {
1629 case MEM_CGROUP_STAT_FILE_MAPPED: 1628 case MEMCG_NR_FILE_MAPPED:
1630 if (val > 0) 1629 if (val > 0)
1631 SetPageCgroupFileMapped(pc); 1630 SetPageCgroupFileMapped(pc);
1632 else if (!page_mapped(page)) 1631 else if (!page_mapped(page))
1633 ClearPageCgroupFileMapped(pc); 1632 ClearPageCgroupFileMapped(pc);
1633 idx = MEM_CGROUP_STAT_FILE_MAPPED;
1634 break; 1634 break;
1635 default: 1635 default:
1636 BUG(); 1636 BUG();
1637 } 1637 }
1638 1638
1639 this_cpu_add(mem->stat->count[idx], val);
1640
1639out: 1641out:
1640 if (unlikely(need_unlock)) 1642 if (unlikely(need_unlock))
1641 unlock_page_cgroup(pc); 1643 move_unlock_page_cgroup(pc, &flags);
1642 rcu_read_unlock(); 1644 rcu_read_unlock();
1643 return; 1645 return;
1644} 1646}
1645 1647EXPORT_SYMBOL(mem_cgroup_update_page_stat);
1646void mem_cgroup_update_file_mapped(struct page *page, int val)
1647{
1648 mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val);
1649}
1650 1648
1651/* 1649/*
1652 * size of first charge trial. "32" comes from vmscan.c's magic value. 1650 * size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -1887,12 +1885,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1887 * oom-killer can be invoked. 1885 * oom-killer can be invoked.
1888 */ 1886 */
1889static int __mem_cgroup_try_charge(struct mm_struct *mm, 1887static int __mem_cgroup_try_charge(struct mm_struct *mm,
1890 gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) 1888 gfp_t gfp_mask,
1889 struct mem_cgroup **memcg, bool oom,
1890 int page_size)
1891{ 1891{
1892 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 1892 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1893 struct mem_cgroup *mem = NULL; 1893 struct mem_cgroup *mem = NULL;
1894 int ret; 1894 int ret;
1895 int csize = CHARGE_SIZE; 1895 int csize = max(CHARGE_SIZE, (unsigned long) page_size);
1896 1896
1897 /* 1897 /*
1898	 * Unlike global VM's OOM-kill, we're not in memory shortage 1898	 * Unlike global VM's OOM-kill, we're not in memory shortage
@@ -1917,7 +1917,7 @@ again:
1917 VM_BUG_ON(css_is_removed(&mem->css)); 1917 VM_BUG_ON(css_is_removed(&mem->css));
1918 if (mem_cgroup_is_root(mem)) 1918 if (mem_cgroup_is_root(mem))
1919 goto done; 1919 goto done;
1920 if (consume_stock(mem)) 1920 if (page_size == PAGE_SIZE && consume_stock(mem))
1921 goto done; 1921 goto done;
1922 css_get(&mem->css); 1922 css_get(&mem->css);
1923 } else { 1923 } else {
@@ -1940,7 +1940,7 @@ again:
1940 rcu_read_unlock(); 1940 rcu_read_unlock();
1941 goto done; 1941 goto done;
1942 } 1942 }
1943 if (consume_stock(mem)) { 1943 if (page_size == PAGE_SIZE && consume_stock(mem)) {
1944 /* 1944 /*
1945	 * It seems dangerous to access memcg without css_get(). 1945	 * It seems dangerous to access memcg without css_get().
1946	 * But considering how consume_stock works, it's not 1946	 * But considering how consume_stock works, it's not
@@ -1981,7 +1981,7 @@ again:
1981 case CHARGE_OK: 1981 case CHARGE_OK:
1982 break; 1982 break;
1983 case CHARGE_RETRY: /* not in OOM situation but retry */ 1983 case CHARGE_RETRY: /* not in OOM situation but retry */
1984 csize = PAGE_SIZE; 1984 csize = page_size;
1985 css_put(&mem->css); 1985 css_put(&mem->css);
1986 mem = NULL; 1986 mem = NULL;
1987 goto again; 1987 goto again;
@@ -2002,8 +2002,8 @@ again:
2002 } 2002 }
2003 } while (ret != CHARGE_OK); 2003 } while (ret != CHARGE_OK);
2004 2004
2005 if (csize > PAGE_SIZE) 2005 if (csize > page_size)
2006 refill_stock(mem, csize - PAGE_SIZE); 2006 refill_stock(mem, csize - page_size);
2007 css_put(&mem->css); 2007 css_put(&mem->css);
2008done: 2008done:
2009 *memcg = mem; 2009 *memcg = mem;
@@ -2031,9 +2031,10 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
2031 } 2031 }
2032} 2032}
2033 2033
2034static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) 2034static void mem_cgroup_cancel_charge(struct mem_cgroup *mem,
2035 int page_size)
2035{ 2036{
2036 __mem_cgroup_cancel_charge(mem, 1); 2037 __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT);
2037} 2038}
2038 2039
2039/* 2040/*
@@ -2087,22 +2088,10 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2087 * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be 2088 * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
2088 * USED state. If already USED, uncharge and return. 2089 * USED state. If already USED, uncharge and return.
2089 */ 2090 */
2090 2091static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem,
2091static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 2092 struct page_cgroup *pc,
2092 struct page_cgroup *pc, 2093 enum charge_type ctype)
2093 enum charge_type ctype)
2094{ 2094{
2095 /* try_charge() can return NULL to *memcg, taking care of it. */
2096 if (!mem)
2097 return;
2098
2099 lock_page_cgroup(pc);
2100 if (unlikely(PageCgroupUsed(pc))) {
2101 unlock_page_cgroup(pc);
2102 mem_cgroup_cancel_charge(mem);
2103 return;
2104 }
2105
2106 pc->mem_cgroup = mem; 2095 pc->mem_cgroup = mem;
2107 /* 2096 /*
2108 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2097 * We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -2127,6 +2116,33 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2127 } 2116 }
2128 2117
2129 mem_cgroup_charge_statistics(mem, pc, true); 2118 mem_cgroup_charge_statistics(mem, pc, true);
2119}
2120
2121static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2122 struct page_cgroup *pc,
2123 enum charge_type ctype,
2124 int page_size)
2125{
2126 int i;
2127 int count = page_size >> PAGE_SHIFT;
2128
2129 /* try_charge() can return NULL to *memcg, taking care of it. */
2130 if (!mem)
2131 return;
2132
2133 lock_page_cgroup(pc);
2134 if (unlikely(PageCgroupUsed(pc))) {
2135 unlock_page_cgroup(pc);
2136 mem_cgroup_cancel_charge(mem, page_size);
2137 return;
2138 }
2139
2140 /*
2141	 * we don't need page_cgroup_lock for tail pages, because they are not
2142 * accessed by any other context at this point.
2143 */
2144 for (i = 0; i < count; i++)
2145 ____mem_cgroup_commit_charge(mem, pc + i, ctype);
2130 2146
2131 unlock_page_cgroup(pc); 2147 unlock_page_cgroup(pc);
2132 /* 2148 /*
@@ -2173,7 +2189,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
2173 mem_cgroup_charge_statistics(from, pc, false); 2189 mem_cgroup_charge_statistics(from, pc, false);
2174 if (uncharge) 2190 if (uncharge)
2175 /* This is not "cancel", but cancel_charge does all we need. */ 2191 /* This is not "cancel", but cancel_charge does all we need. */
2176 mem_cgroup_cancel_charge(from); 2192 mem_cgroup_cancel_charge(from, PAGE_SIZE);
2177 2193
2178 /* caller should have done css_get */ 2194 /* caller should have done css_get */
2179 pc->mem_cgroup = to; 2195 pc->mem_cgroup = to;
@@ -2195,9 +2211,13 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
2195 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) 2211 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
2196{ 2212{
2197 int ret = -EINVAL; 2213 int ret = -EINVAL;
2214 unsigned long flags;
2215
2198 lock_page_cgroup(pc); 2216 lock_page_cgroup(pc);
2199 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { 2217 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
2218 move_lock_page_cgroup(pc, &flags);
2200 __mem_cgroup_move_account(pc, from, to, uncharge); 2219 __mem_cgroup_move_account(pc, from, to, uncharge);
2220 move_unlock_page_cgroup(pc, &flags);
2201 ret = 0; 2221 ret = 0;
2202 } 2222 }
2203 unlock_page_cgroup(pc); 2223 unlock_page_cgroup(pc);
@@ -2234,13 +2254,14 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
2234 goto put; 2254 goto put;
2235 2255
2236 parent = mem_cgroup_from_cont(pcg); 2256 parent = mem_cgroup_from_cont(pcg);
2237 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); 2257 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false,
2258 PAGE_SIZE);
2238 if (ret || !parent) 2259 if (ret || !parent)
2239 goto put_back; 2260 goto put_back;
2240 2261
2241 ret = mem_cgroup_move_account(pc, child, parent, true); 2262 ret = mem_cgroup_move_account(pc, child, parent, true);
2242 if (ret) 2263 if (ret)
2243 mem_cgroup_cancel_charge(parent); 2264 mem_cgroup_cancel_charge(parent, PAGE_SIZE);
2244put_back: 2265put_back:
2245 putback_lru_page(page); 2266 putback_lru_page(page);
2246put: 2267put:
@@ -2261,6 +2282,12 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2261 struct mem_cgroup *mem = NULL; 2282 struct mem_cgroup *mem = NULL;
2262 struct page_cgroup *pc; 2283 struct page_cgroup *pc;
2263 int ret; 2284 int ret;
2285 int page_size = PAGE_SIZE;
2286
2287 if (PageTransHuge(page)) {
2288 page_size <<= compound_order(page);
2289 VM_BUG_ON(!PageTransHuge(page));
2290 }
2264 2291
2265 pc = lookup_page_cgroup(page); 2292 pc = lookup_page_cgroup(page);
2266 /* can happen at boot */ 2293 /* can happen at boot */
@@ -2268,11 +2295,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2268 return 0; 2295 return 0;
2269 prefetchw(pc); 2296 prefetchw(pc);
2270 2297
2271 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); 2298 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size);
2272 if (ret || !mem) 2299 if (ret || !mem)
2273 return ret; 2300 return ret;
2274 2301
2275 __mem_cgroup_commit_charge(mem, pc, ctype); 2302 __mem_cgroup_commit_charge(mem, pc, ctype, page_size);
2276 return 0; 2303 return 0;
2277} 2304}
2278 2305
@@ -2281,8 +2308,6 @@ int mem_cgroup_newpage_charge(struct page *page,
2281{ 2308{
2282 if (mem_cgroup_disabled()) 2309 if (mem_cgroup_disabled())
2283 return 0; 2310 return 0;
2284 if (PageCompound(page))
2285 return 0;
2286 /* 2311 /*
2287 * If already mapped, we don't have to account. 2312 * If already mapped, we don't have to account.
2288 * If page cache, page->mapping has address_space. 2313 * If page cache, page->mapping has address_space.
@@ -2388,13 +2413,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2388 if (!mem) 2413 if (!mem)
2389 goto charge_cur_mm; 2414 goto charge_cur_mm;
2390 *ptr = mem; 2415 *ptr = mem;
2391 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); 2416 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE);
2392 css_put(&mem->css); 2417 css_put(&mem->css);
2393 return ret; 2418 return ret;
2394charge_cur_mm: 2419charge_cur_mm:
2395 if (unlikely(!mm)) 2420 if (unlikely(!mm))
2396 mm = &init_mm; 2421 mm = &init_mm;
2397 return __mem_cgroup_try_charge(mm, mask, ptr, true); 2422 return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE);
2398} 2423}
2399 2424
2400static void 2425static void
@@ -2410,7 +2435,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2410 cgroup_exclude_rmdir(&ptr->css); 2435 cgroup_exclude_rmdir(&ptr->css);
2411 pc = lookup_page_cgroup(page); 2436 pc = lookup_page_cgroup(page);
2412 mem_cgroup_lru_del_before_commit_swapcache(page); 2437 mem_cgroup_lru_del_before_commit_swapcache(page);
2413 __mem_cgroup_commit_charge(ptr, pc, ctype); 2438 __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE);
2414 mem_cgroup_lru_add_after_commit_swapcache(page); 2439 mem_cgroup_lru_add_after_commit_swapcache(page);
2415 /* 2440 /*
2416 * Now swap is on-memory. This means this page may be 2441 * Now swap is on-memory. This means this page may be
@@ -2459,11 +2484,12 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
2459 return; 2484 return;
2460 if (!mem) 2485 if (!mem)
2461 return; 2486 return;
2462 mem_cgroup_cancel_charge(mem); 2487 mem_cgroup_cancel_charge(mem, PAGE_SIZE);
2463} 2488}
2464 2489
2465static void 2490static void
2466__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) 2491__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
2492 int page_size)
2467{ 2493{
2468 struct memcg_batch_info *batch = NULL; 2494 struct memcg_batch_info *batch = NULL;
2469 bool uncharge_memsw = true; 2495 bool uncharge_memsw = true;
@@ -2490,6 +2516,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2490 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 2516 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2491 goto direct_uncharge; 2517 goto direct_uncharge;
2492 2518
2519 if (page_size != PAGE_SIZE)
2520 goto direct_uncharge;
2521
2493 /* 2522 /*
2494 * In typical case, batch->memcg == mem. This means we can 2523 * In typical case, batch->memcg == mem. This means we can
2495 * merge a series of uncharges to an uncharge of res_counter. 2524 * merge a series of uncharges to an uncharge of res_counter.
@@ -2503,9 +2532,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2503 batch->memsw_bytes += PAGE_SIZE; 2532 batch->memsw_bytes += PAGE_SIZE;
2504 return; 2533 return;
2505direct_uncharge: 2534direct_uncharge:
2506 res_counter_uncharge(&mem->res, PAGE_SIZE); 2535 res_counter_uncharge(&mem->res, page_size);
2507 if (uncharge_memsw) 2536 if (uncharge_memsw)
2508 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 2537 res_counter_uncharge(&mem->memsw, page_size);
2509 if (unlikely(batch->memcg != mem)) 2538 if (unlikely(batch->memcg != mem))
2510 memcg_oom_recover(mem); 2539 memcg_oom_recover(mem);
2511 return; 2540 return;
@@ -2517,8 +2546,11 @@ direct_uncharge:
2517static struct mem_cgroup * 2546static struct mem_cgroup *
2518__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2547__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2519{ 2548{
2549 int i;
2550 int count;
2520 struct page_cgroup *pc; 2551 struct page_cgroup *pc;
2521 struct mem_cgroup *mem = NULL; 2552 struct mem_cgroup *mem = NULL;
2553 int page_size = PAGE_SIZE;
2522 2554
2523 if (mem_cgroup_disabled()) 2555 if (mem_cgroup_disabled())
2524 return NULL; 2556 return NULL;
@@ -2526,6 +2558,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2526 if (PageSwapCache(page)) 2558 if (PageSwapCache(page))
2527 return NULL; 2559 return NULL;
2528 2560
2561 if (PageTransHuge(page)) {
2562 page_size <<= compound_order(page);
2563 VM_BUG_ON(!PageTransHuge(page));
2564 }
2565
2566 count = page_size >> PAGE_SHIFT;
2529 /* 2567 /*
2530 * Check if our page_cgroup is valid 2568 * Check if our page_cgroup is valid
2531 */ 2569 */
@@ -2558,7 +2596,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2558 break; 2596 break;
2559 } 2597 }
2560 2598
2561 mem_cgroup_charge_statistics(mem, pc, false); 2599 for (i = 0; i < count; i++)
2600 mem_cgroup_charge_statistics(mem, pc + i, false);
2562 2601
2563 ClearPageCgroupUsed(pc); 2602 ClearPageCgroupUsed(pc);
2564 /* 2603 /*
@@ -2579,7 +2618,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2579 mem_cgroup_get(mem); 2618 mem_cgroup_get(mem);
2580 } 2619 }
2581 if (!mem_cgroup_is_root(mem)) 2620 if (!mem_cgroup_is_root(mem))
2582 __do_uncharge(mem, ctype); 2621 __do_uncharge(mem, ctype, page_size);
2583 2622
2584 return mem; 2623 return mem;
2585 2624
@@ -2774,6 +2813,7 @@ int mem_cgroup_prepare_migration(struct page *page,
2774 enum charge_type ctype; 2813 enum charge_type ctype;
2775 int ret = 0; 2814 int ret = 0;
2776 2815
2816 VM_BUG_ON(PageTransHuge(page));
2777 if (mem_cgroup_disabled()) 2817 if (mem_cgroup_disabled())
2778 return 0; 2818 return 0;
2779 2819
@@ -2823,7 +2863,7 @@ int mem_cgroup_prepare_migration(struct page *page,
2823 return 0; 2863 return 0;
2824 2864
2825 *ptr = mem; 2865 *ptr = mem;
2826 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); 2866 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE);
2827 css_put(&mem->css);/* drop extra refcnt */ 2867 css_put(&mem->css);/* drop extra refcnt */
2828 if (ret || *ptr == NULL) { 2868 if (ret || *ptr == NULL) {
2829 if (PageAnon(page)) { 2869 if (PageAnon(page)) {
@@ -2850,13 +2890,13 @@ int mem_cgroup_prepare_migration(struct page *page,
2850 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 2890 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
2851 else 2891 else
2852 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 2892 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2853 __mem_cgroup_commit_charge(mem, pc, ctype); 2893 __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE);
2854 return ret; 2894 return ret;
2855} 2895}
2856 2896
2857/* remove redundant charge if migration failed*/ 2897/* remove redundant charge if migration failed*/
2858void mem_cgroup_end_migration(struct mem_cgroup *mem, 2898void mem_cgroup_end_migration(struct mem_cgroup *mem,
2859 struct page *oldpage, struct page *newpage) 2899 struct page *oldpage, struct page *newpage, bool migration_ok)
2860{ 2900{
2861 struct page *used, *unused; 2901 struct page *used, *unused;
2862 struct page_cgroup *pc; 2902 struct page_cgroup *pc;
@@ -2865,8 +2905,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
2865 return; 2905 return;
2866 /* blocks rmdir() */ 2906 /* blocks rmdir() */
2867 cgroup_exclude_rmdir(&mem->css); 2907 cgroup_exclude_rmdir(&mem->css);
2868 /* at migration success, oldpage->mapping is NULL. */ 2908 if (!migration_ok) {
2869 if (oldpage->mapping) {
2870 used = oldpage; 2909 used = oldpage;
2871 unused = newpage; 2910 unused = newpage;
2872 } else { 2911 } else {
@@ -4176,13 +4215,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4176 */ 4215 */
4177 if (!node_state(node, N_NORMAL_MEMORY)) 4216 if (!node_state(node, N_NORMAL_MEMORY))
4178 tmp = -1; 4217 tmp = -1;
4179 pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4218 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4180 if (!pn) 4219 if (!pn)
4181 return 1; 4220 return 1;
4182 4221
4183 mem->info.nodeinfo[node] = pn; 4222 mem->info.nodeinfo[node] = pn;
4184 memset(pn, 0, sizeof(*pn));
4185
4186 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4223 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4187 mz = &pn->zoneinfo[zone]; 4224 mz = &pn->zoneinfo[zone];
4188 for_each_lru(l) 4225 for_each_lru(l)
@@ -4206,14 +4243,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
4206 4243
4207 /* Can be very big if MAX_NUMNODES is very big */ 4244 /* Can be very big if MAX_NUMNODES is very big */
4208 if (size < PAGE_SIZE) 4245 if (size < PAGE_SIZE)
4209 mem = kmalloc(size, GFP_KERNEL); 4246 mem = kzalloc(size, GFP_KERNEL);
4210 else 4247 else
4211 mem = vmalloc(size); 4248 mem = vzalloc(size);
4212 4249
4213 if (!mem) 4250 if (!mem)
4214 return NULL; 4251 return NULL;
4215 4252
4216 memset(mem, 0, size);
4217 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4253 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4218 if (!mem->stat) 4254 if (!mem->stat)
4219 goto out_free; 4255 goto out_free;
@@ -4461,7 +4497,8 @@ one_by_one:
4461 batch_count = PRECHARGE_COUNT_AT_ONCE; 4497 batch_count = PRECHARGE_COUNT_AT_ONCE;
4462 cond_resched(); 4498 cond_resched();
4463 } 4499 }
4464 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); 4500 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
4501 PAGE_SIZE);
4465 if (ret || !mem) 4502 if (ret || !mem)
4466 /* mem_cgroup_clear_mc() will do uncharge later */ 4503 /* mem_cgroup_clear_mc() will do uncharge later */
4467 return -ENOMEM; 4504 return -ENOMEM;
@@ -4623,6 +4660,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4623 pte_t *pte; 4660 pte_t *pte;
4624 spinlock_t *ptl; 4661 spinlock_t *ptl;
4625 4662
4663 VM_BUG_ON(pmd_trans_huge(*pmd));
4626 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4664 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4627 for (; addr != end; pte++, addr += PAGE_SIZE) 4665 for (; addr != end; pte++, addr += PAGE_SIZE)
4628 if (is_target_pte_for_mc(vma, addr, *pte, NULL)) 4666 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
@@ -4638,7 +4676,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4638 unsigned long precharge; 4676 unsigned long precharge;
4639 struct vm_area_struct *vma; 4677 struct vm_area_struct *vma;
4640 4678
4641 /* We've already held the mmap_sem */ 4679 down_read(&mm->mmap_sem);
4642 for (vma = mm->mmap; vma; vma = vma->vm_next) { 4680 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4643 struct mm_walk mem_cgroup_count_precharge_walk = { 4681 struct mm_walk mem_cgroup_count_precharge_walk = {
4644 .pmd_entry = mem_cgroup_count_precharge_pte_range, 4682 .pmd_entry = mem_cgroup_count_precharge_pte_range,
@@ -4650,6 +4688,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4650 walk_page_range(vma->vm_start, vma->vm_end, 4688 walk_page_range(vma->vm_start, vma->vm_end,
4651 &mem_cgroup_count_precharge_walk); 4689 &mem_cgroup_count_precharge_walk);
4652 } 4690 }
4691 up_read(&mm->mmap_sem);
4653 4692
4654 precharge = mc.precharge; 4693 precharge = mc.precharge;
4655 mc.precharge = 0; 4694 mc.precharge = 0;
@@ -4659,10 +4698,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4659 4698
4660static int mem_cgroup_precharge_mc(struct mm_struct *mm) 4699static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4661{ 4700{
4662 return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); 4701 unsigned long precharge = mem_cgroup_count_precharge(mm);
4702
4703 VM_BUG_ON(mc.moving_task);
4704 mc.moving_task = current;
4705 return mem_cgroup_do_precharge(precharge);
4663} 4706}
4664 4707
4665static void mem_cgroup_clear_mc(void) 4708/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
4709static void __mem_cgroup_clear_mc(void)
4666{ 4710{
4667 struct mem_cgroup *from = mc.from; 4711 struct mem_cgroup *from = mc.from;
4668 struct mem_cgroup *to = mc.to; 4712 struct mem_cgroup *to = mc.to;
@@ -4697,23 +4741,28 @@ static void mem_cgroup_clear_mc(void)
4697 PAGE_SIZE * mc.moved_swap); 4741 PAGE_SIZE * mc.moved_swap);
4698 } 4742 }
4699 /* we've already done mem_cgroup_get(mc.to) */ 4743 /* we've already done mem_cgroup_get(mc.to) */
4700
4701 mc.moved_swap = 0; 4744 mc.moved_swap = 0;
4702 } 4745 }
4703 if (mc.mm) { 4746 memcg_oom_recover(from);
4704 up_read(&mc.mm->mmap_sem); 4747 memcg_oom_recover(to);
4705 mmput(mc.mm); 4748 wake_up_all(&mc.waitq);
4706 } 4749}
4750
4751static void mem_cgroup_clear_mc(void)
4752{
4753 struct mem_cgroup *from = mc.from;
4754
4755 /*
4756 * we must clear moving_task before waking up waiters at the end of
4757 * task migration.
4758 */
4759 mc.moving_task = NULL;
4760 __mem_cgroup_clear_mc();
4707 spin_lock(&mc.lock); 4761 spin_lock(&mc.lock);
4708 mc.from = NULL; 4762 mc.from = NULL;
4709 mc.to = NULL; 4763 mc.to = NULL;
4710 spin_unlock(&mc.lock); 4764 spin_unlock(&mc.lock);
4711 mc.moving_task = NULL;
4712 mc.mm = NULL;
4713 mem_cgroup_end_move(from); 4765 mem_cgroup_end_move(from);
4714 memcg_oom_recover(from);
4715 memcg_oom_recover(to);
4716 wake_up_all(&mc.waitq);
4717} 4766}
4718 4767
4719static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 4768static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
@@ -4735,38 +4784,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4735 return 0; 4784 return 0;
4736	 /* We move charges only when we move an owner of the mm */ 4785	 /* We move charges only when we move an owner of the mm */
4737 if (mm->owner == p) { 4786 if (mm->owner == p) {
4738 /*
4739 * We do all the move charge works under one mmap_sem to
4740 * avoid deadlock with down_write(&mmap_sem)
4741 * -> try_charge() -> if (mc.moving_task) -> sleep.
4742 */
4743 down_read(&mm->mmap_sem);
4744
4745 VM_BUG_ON(mc.from); 4787 VM_BUG_ON(mc.from);
4746 VM_BUG_ON(mc.to); 4788 VM_BUG_ON(mc.to);
4747 VM_BUG_ON(mc.precharge); 4789 VM_BUG_ON(mc.precharge);
4748 VM_BUG_ON(mc.moved_charge); 4790 VM_BUG_ON(mc.moved_charge);
4749 VM_BUG_ON(mc.moved_swap); 4791 VM_BUG_ON(mc.moved_swap);
4750 VM_BUG_ON(mc.moving_task);
4751 VM_BUG_ON(mc.mm);
4752
4753 mem_cgroup_start_move(from); 4792 mem_cgroup_start_move(from);
4754 spin_lock(&mc.lock); 4793 spin_lock(&mc.lock);
4755 mc.from = from; 4794 mc.from = from;
4756 mc.to = mem; 4795 mc.to = mem;
4757 mc.precharge = 0;
4758 mc.moved_charge = 0;
4759 mc.moved_swap = 0;
4760 spin_unlock(&mc.lock); 4796 spin_unlock(&mc.lock);
4761 mc.moving_task = current; 4797 /* We set mc.moving_task later */
4762 mc.mm = mm;
4763 4798
4764 ret = mem_cgroup_precharge_mc(mm); 4799 ret = mem_cgroup_precharge_mc(mm);
4765 if (ret) 4800 if (ret)
4766 mem_cgroup_clear_mc(); 4801 mem_cgroup_clear_mc();
4767 /* We call up_read() and mmput() in clear_mc(). */ 4802 }
4768 } else 4803 mmput(mm);
4769 mmput(mm);
4770 } 4804 }
4771 return ret; 4805 return ret;
4772} 4806}
@@ -4789,6 +4823,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4789 spinlock_t *ptl; 4823 spinlock_t *ptl;
4790 4824
4791retry: 4825retry:
4826 VM_BUG_ON(pmd_trans_huge(*pmd));
4792 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4827 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4793 for (; addr != end; addr += PAGE_SIZE) { 4828 for (; addr != end; addr += PAGE_SIZE) {
4794 pte_t ptent = *(pte++); 4829 pte_t ptent = *(pte++);
@@ -4854,7 +4889,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
4854 struct vm_area_struct *vma; 4889 struct vm_area_struct *vma;
4855 4890
4856 lru_add_drain_all(); 4891 lru_add_drain_all();
4857 /* We've already held the mmap_sem */ 4892retry:
4893 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
4894 /*
4895	 * Someone who is holding the mmap_sem might be waiting in
4896 * waitq. So we cancel all extra charges, wake up all waiters,
4897 * and retry. Because we cancel precharges, we might not be able
4898 * to move enough charges, but moving charge is a best-effort
4899 * feature anyway, so it wouldn't be a big problem.
4900 */
4901 __mem_cgroup_clear_mc();
4902 cond_resched();
4903 goto retry;
4904 }
4858 for (vma = mm->mmap; vma; vma = vma->vm_next) { 4905 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4859 int ret; 4906 int ret;
4860 struct mm_walk mem_cgroup_move_charge_walk = { 4907 struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4873,6 +4920,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
4873 */ 4920 */
4874 break; 4921 break;
4875 } 4922 }
4923 up_read(&mm->mmap_sem);
4876} 4924}
4877 4925
4878static void mem_cgroup_move_task(struct cgroup_subsys *ss, 4926static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -4881,11 +4929,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4881 struct task_struct *p, 4929 struct task_struct *p,
4882 bool threadgroup) 4930 bool threadgroup)
4883{ 4931{
4884 if (!mc.mm) 4932 struct mm_struct *mm;
4933
4934 if (!mc.to)
4885 /* no need to move charge */ 4935 /* no need to move charge */
4886 return; 4936 return;
4887 4937
4888 mem_cgroup_move_charge(mc.mm); 4938 mm = get_task_mm(p);
4939 if (mm) {
4940 mem_cgroup_move_charge(mm);
4941 mmput(mm);
4942 }
4889 mem_cgroup_clear_mc(); 4943 mem_cgroup_clear_mc();
4890} 4944}
4891#else /* !CONFIG_MMU */ 4945#else /* !CONFIG_MMU */
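
The memcg changes above thread a page_size argument through try_charge/commit/uncharge so that a transparent huge page is charged and uncharged as a single huge-page-sized unit. One observable consequence is that a THP fault should move the cgroup's memory.usage_in_bytes by the full huge page size at once. A sketch of reading that counter; the /sys/fs/cgroup/memory mount point is an assumption, substitute wherever the memory controller is mounted:

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/memory/memory.usage_in_bytes";
	unsigned long long usage;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%llu", &usage) == 1)
		printf("memcg usage: %llu bytes\n", usage);
	fclose(f);
	return 0;
}
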
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 46ab2c044b0e..548fbd70f026 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -203,7 +203,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
203#ifdef __ARCH_SI_TRAPNO 203#ifdef __ARCH_SI_TRAPNO
204 si.si_trapno = trapno; 204 si.si_trapno = trapno;
205#endif 205#endif
206 si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; 206 si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
207 /* 207 /*
208 * Don't use force here, it's convenient if the signal 208 * Don't use force here, it's convenient if the signal
209 * can be temporarily blocked. 209 * can be temporarily blocked.
@@ -386,6 +386,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
386 struct task_struct *tsk; 386 struct task_struct *tsk;
387 struct anon_vma *av; 387 struct anon_vma *av;
388 388
389 if (!PageHuge(page) && unlikely(split_huge_page(page)))
390 return;
389 read_lock(&tasklist_lock); 391 read_lock(&tasklist_lock);
390 av = page_lock_anon_vma(page); 392 av = page_lock_anon_vma(page);
391 if (av == NULL) /* Not actually mapped anymore */ 393 if (av == NULL) /* Not actually mapped anymore */
@@ -928,7 +930,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
928static void set_page_hwpoison_huge_page(struct page *hpage) 930static void set_page_hwpoison_huge_page(struct page *hpage)
929{ 931{
930 int i; 932 int i;
931 int nr_pages = 1 << compound_order(hpage); 933 int nr_pages = 1 << compound_trans_order(hpage);
932 for (i = 0; i < nr_pages; i++) 934 for (i = 0; i < nr_pages; i++)
933 SetPageHWPoison(hpage + i); 935 SetPageHWPoison(hpage + i);
934} 936}
@@ -936,7 +938,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage)
936static void clear_page_hwpoison_huge_page(struct page *hpage) 938static void clear_page_hwpoison_huge_page(struct page *hpage)
937{ 939{
938 int i; 940 int i;
939 int nr_pages = 1 << compound_order(hpage); 941 int nr_pages = 1 << compound_trans_order(hpage);
940 for (i = 0; i < nr_pages; i++) 942 for (i = 0; i < nr_pages; i++)
941 ClearPageHWPoison(hpage + i); 943 ClearPageHWPoison(hpage + i);
942} 944}
@@ -966,7 +968,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
966 return 0; 968 return 0;
967 } 969 }
968 970
969 nr_pages = 1 << compound_order(hpage); 971 nr_pages = 1 << compound_trans_order(hpage);
970 atomic_long_add(nr_pages, &mce_bad_pages); 972 atomic_long_add(nr_pages, &mce_bad_pages);
971 973
972 /* 974 /*
@@ -1164,7 +1166,7 @@ int unpoison_memory(unsigned long pfn)
1164 return 0; 1166 return 0;
1165 } 1167 }
1166 1168
1167 nr_pages = 1 << compound_order(page); 1169 nr_pages = 1 << compound_trans_order(page);
1168 1170
1169 if (!get_page_unless_zero(page)) { 1171 if (!get_page_unless_zero(page)) {
1170 /* 1172 /*
@@ -1290,9 +1292,10 @@ static int soft_offline_huge_page(struct page *page, int flags)
1290 /* Keep page count to indicate a given hugepage is isolated. */ 1292 /* Keep page count to indicate a given hugepage is isolated. */
1291 1293
1292 list_add(&hpage->lru, &pagelist); 1294 list_add(&hpage->lru, &pagelist);
1293 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); 1295 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
1296 true);
1294 if (ret) { 1297 if (ret) {
1295 putback_lru_pages(&pagelist); 1298 putback_lru_pages(&pagelist);
1296 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", 1299 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1297 pfn, ret, page->flags); 1300 pfn, ret, page->flags);
1298 if (ret > 0) 1301 if (ret > 0)
@@ -1301,7 +1304,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
1301 } 1304 }
1302done: 1305done:
1303 if (!PageHWPoison(hpage)) 1306 if (!PageHWPoison(hpage))
1304 atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); 1307 atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
1305 set_page_hwpoison_huge_page(hpage); 1308 set_page_hwpoison_huge_page(hpage);
1306 dequeue_hwpoisoned_huge_page(hpage); 1309 dequeue_hwpoisoned_huge_page(hpage);
1307 /* keep elevated page count for bad page */ 1310 /* keep elevated page count for bad page */
@@ -1413,7 +1416,8 @@ int soft_offline_page(struct page *page, int flags)
1413 LIST_HEAD(pagelist); 1416 LIST_HEAD(pagelist);
1414 1417
1415 list_add(&page->lru, &pagelist); 1418 list_add(&page->lru, &pagelist);
1416 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); 1419 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1420 0, true);
1417 if (ret) { 1421 if (ret) {
1418 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1422 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1419 pfn, ret, page->flags); 1423 pfn, ret, page->flags);
diff --git a/mm/memory.c b/mm/memory.c
index 02e48aa0ed13..31250faff390 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -394,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
394 } 394 }
395} 395}
396 396
397int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) 397int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
398 pmd_t *pmd, unsigned long address)
398{ 399{
399 pgtable_t new = pte_alloc_one(mm, address); 400 pgtable_t new = pte_alloc_one(mm, address);
401 int wait_split_huge_page;
400 if (!new) 402 if (!new)
401 return -ENOMEM; 403 return -ENOMEM;
402 404
@@ -416,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
416 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ 418 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
417 419
418 spin_lock(&mm->page_table_lock); 420 spin_lock(&mm->page_table_lock);
419 if (!pmd_present(*pmd)) { /* Has another populated it ? */ 421 wait_split_huge_page = 0;
422 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
420 mm->nr_ptes++; 423 mm->nr_ptes++;
421 pmd_populate(mm, pmd, new); 424 pmd_populate(mm, pmd, new);
422 new = NULL; 425 new = NULL;
423 } 426 } else if (unlikely(pmd_trans_splitting(*pmd)))
427 wait_split_huge_page = 1;
424 spin_unlock(&mm->page_table_lock); 428 spin_unlock(&mm->page_table_lock);
425 if (new) 429 if (new)
426 pte_free(mm, new); 430 pte_free(mm, new);
431 if (wait_split_huge_page)
432 wait_split_huge_page(vma->anon_vma, pmd);
427 return 0; 433 return 0;
428} 434}
429 435
@@ -436,10 +442,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
436 smp_wmb(); /* See comment in __pte_alloc */ 442 smp_wmb(); /* See comment in __pte_alloc */
437 443
438 spin_lock(&init_mm.page_table_lock); 444 spin_lock(&init_mm.page_table_lock);
439 if (!pmd_present(*pmd)) { /* Has another populated it ? */ 445 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
440 pmd_populate_kernel(&init_mm, pmd, new); 446 pmd_populate_kernel(&init_mm, pmd, new);
441 new = NULL; 447 new = NULL;
442 } 448 } else
449 VM_BUG_ON(pmd_trans_splitting(*pmd));
443 spin_unlock(&init_mm.page_table_lock); 450 spin_unlock(&init_mm.page_table_lock);
444 if (new) 451 if (new)
445 pte_free_kernel(&init_mm, new); 452 pte_free_kernel(&init_mm, new);
@@ -719,9 +726,9 @@ out_set_pte:
719 return 0; 726 return 0;
720} 727}
721 728
722static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 729int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
723 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, 730 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
724 unsigned long addr, unsigned long end) 731 unsigned long addr, unsigned long end)
725{ 732{
726 pte_t *orig_src_pte, *orig_dst_pte; 733 pte_t *orig_src_pte, *orig_dst_pte;
727 pte_t *src_pte, *dst_pte; 734 pte_t *src_pte, *dst_pte;
@@ -795,6 +802,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
795 src_pmd = pmd_offset(src_pud, addr); 802 src_pmd = pmd_offset(src_pud, addr);
796 do { 803 do {
797 next = pmd_addr_end(addr, end); 804 next = pmd_addr_end(addr, end);
805 if (pmd_trans_huge(*src_pmd)) {
806 int err;
807 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
808 err = copy_huge_pmd(dst_mm, src_mm,
809 dst_pmd, src_pmd, addr, vma);
810 if (err == -ENOMEM)
811 return -ENOMEM;
812 if (!err)
813 continue;
814 /* fall through */
815 }
798 if (pmd_none_or_clear_bad(src_pmd)) 816 if (pmd_none_or_clear_bad(src_pmd))
799 continue; 817 continue;
800 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, 818 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
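This copy_pmd_range() hunk is the first of several page-table walkers in this merge (zap_pmd_range() just below, and the mincore and mprotect walkers later in this section) that grow the same shape: deal with a transparent huge pmd at pmd granularity when possible, otherwise split it and fall through to the existing pte loop. A generic sketch of that shape, where handle_huge_pmd() and handle_pte_range() are placeholder names for whatever the particular walker does:

pmd = pmd_offset(pud, addr);
do {
	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd)) {
		if (handle_huge_pmd(vma, pmd, addr, next))
			continue;	/* whole huge pmd handled in one go */
		/* the handler split the pmd: fall through to pte level */
	}
	if (pmd_none_or_clear_bad(pmd))
		continue;
	handle_pte_range(vma, pmd, addr, next);
} while (pmd++, addr = next, addr != end);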
@@ -997,6 +1015,16 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
997 pmd = pmd_offset(pud, addr); 1015 pmd = pmd_offset(pud, addr);
998 do { 1016 do {
999 next = pmd_addr_end(addr, end); 1017 next = pmd_addr_end(addr, end);
1018 if (pmd_trans_huge(*pmd)) {
1019 if (next-addr != HPAGE_PMD_SIZE) {
1020 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1021 split_huge_page_pmd(vma->vm_mm, pmd);
1022 } else if (zap_huge_pmd(tlb, vma, pmd)) {
1023 (*zap_work)--;
1024 continue;
1025 }
1026 /* fall through */
1027 }
1000 if (pmd_none_or_clear_bad(pmd)) { 1028 if (pmd_none_or_clear_bad(pmd)) {
1001 (*zap_work)--; 1029 (*zap_work)--;
1002 continue; 1030 continue;
@@ -1262,7 +1290,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1262 pud = pud_offset(pgd, address); 1290 pud = pud_offset(pgd, address);
1263 if (pud_none(*pud)) 1291 if (pud_none(*pud))
1264 goto no_page_table; 1292 goto no_page_table;
1265 if (pud_huge(*pud)) { 1293 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
1266 BUG_ON(flags & FOLL_GET); 1294 BUG_ON(flags & FOLL_GET);
1267 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); 1295 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1268 goto out; 1296 goto out;
@@ -1273,11 +1301,32 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1273 pmd = pmd_offset(pud, address); 1301 pmd = pmd_offset(pud, address);
1274 if (pmd_none(*pmd)) 1302 if (pmd_none(*pmd))
1275 goto no_page_table; 1303 goto no_page_table;
1276 if (pmd_huge(*pmd)) { 1304 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
1277 BUG_ON(flags & FOLL_GET); 1305 BUG_ON(flags & FOLL_GET);
1278 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1306 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1279 goto out; 1307 goto out;
1280 } 1308 }
1309 if (pmd_trans_huge(*pmd)) {
1310 if (flags & FOLL_SPLIT) {
1311 split_huge_page_pmd(mm, pmd);
1312 goto split_fallthrough;
1313 }
1314 spin_lock(&mm->page_table_lock);
1315 if (likely(pmd_trans_huge(*pmd))) {
1316 if (unlikely(pmd_trans_splitting(*pmd))) {
1317 spin_unlock(&mm->page_table_lock);
1318 wait_split_huge_page(vma->anon_vma, pmd);
1319 } else {
1320 page = follow_trans_huge_pmd(mm, address,
1321 pmd, flags);
1322 spin_unlock(&mm->page_table_lock);
1323 goto out;
1324 }
1325 } else
1326 spin_unlock(&mm->page_table_lock);
1327 /* fall through */
1328 }
1329split_fallthrough:
1281 if (unlikely(pmd_bad(*pmd))) 1330 if (unlikely(pmd_bad(*pmd)))
1282 goto no_page_table; 1331 goto no_page_table;
1283 1332
@@ -1310,6 +1359,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1310 */ 1359 */
1311 mark_page_accessed(page); 1360 mark_page_accessed(page);
1312 } 1361 }
1362 if (flags & FOLL_MLOCK) {
1363 /*
1364 * The preliminary mapping check is mainly to avoid the
1365 * pointless overhead of lock_page on the ZERO_PAGE
1366 * which might bounce very badly if there is contention.
1367 *
1368 * If the page is already locked, we don't need to
1369 * handle it now - vmscan will handle it later if and
1370 * when it attempts to reclaim the page.
1371 */
1372 if (page->mapping && trylock_page(page)) {
1373 lru_add_drain(); /* push cached pages to LRU */
1374 /*
1375 * Because we lock page here and migration is
1376 * blocked by the pte's page reference, we need
1377 * only check for file-cache page truncation.
1378 */
1379 if (page->mapping)
1380 mlock_vma_page(page);
1381 unlock_page(page);
1382 }
1383 }
1313unlock: 1384unlock:
1314 pte_unmap_unlock(ptep, ptl); 1385 pte_unmap_unlock(ptep, ptl);
1315out: 1386out:
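follow_page() above learns two new flags: FOLL_SPLIT forces a transparent huge page covering the address to be split so a regular 4k page is returned, and FOLL_MLOCK mlocks an already-mapped page in place for the reworked mlock path later in this section. A hypothetical FOLL_SPLIT caller; the migrate.c hunk below uses the same flag combination in move_pages():

/* Hypothetical: guarantee a 4k page by splitting any THP first. */
page = follow_page(vma, addr, FOLL_GET | FOLL_SPLIT);
if (IS_ERR(page))
	return PTR_ERR(page);
if (!page)
	return -EFAULT;		/* placeholder: nothing mapped here */
/* ... use the base page, then ... */
put_page(page);			/* balance the FOLL_GET reference */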
@@ -1341,7 +1412,8 @@ no_page_table:
1341 1412
1342int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1413int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1343 unsigned long start, int nr_pages, unsigned int gup_flags, 1414 unsigned long start, int nr_pages, unsigned int gup_flags,
1344 struct page **pages, struct vm_area_struct **vmas) 1415 struct page **pages, struct vm_area_struct **vmas,
1416 int *nonblocking)
1345{ 1417{
1346 int i; 1418 int i;
1347 unsigned long vm_flags; 1419 unsigned long vm_flags;
@@ -1386,6 +1458,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1386 pmd = pmd_offset(pud, pg); 1458 pmd = pmd_offset(pud, pg);
1387 if (pmd_none(*pmd)) 1459 if (pmd_none(*pmd))
1388 return i ? : -EFAULT; 1460 return i ? : -EFAULT;
1461 VM_BUG_ON(pmd_trans_huge(*pmd));
1389 pte = pte_offset_map(pmd, pg); 1462 pte = pte_offset_map(pmd, pg);
1390 if (pte_none(*pte)) { 1463 if (pte_none(*pte)) {
1391 pte_unmap(pte); 1464 pte_unmap(pte);
@@ -1441,10 +1514,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1441 cond_resched(); 1514 cond_resched();
1442 while (!(page = follow_page(vma, start, foll_flags))) { 1515 while (!(page = follow_page(vma, start, foll_flags))) {
1443 int ret; 1516 int ret;
1517 unsigned int fault_flags = 0;
1518
1519 if (foll_flags & FOLL_WRITE)
1520 fault_flags |= FAULT_FLAG_WRITE;
1521 if (nonblocking)
1522 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
1444 1523
1445 ret = handle_mm_fault(mm, vma, start, 1524 ret = handle_mm_fault(mm, vma, start,
1446 (foll_flags & FOLL_WRITE) ? 1525 fault_flags);
1447 FAULT_FLAG_WRITE : 0);
1448 1526
1449 if (ret & VM_FAULT_ERROR) { 1527 if (ret & VM_FAULT_ERROR) {
1450 if (ret & VM_FAULT_OOM) 1528 if (ret & VM_FAULT_OOM)
@@ -1460,6 +1538,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1460 else 1538 else
1461 tsk->min_flt++; 1539 tsk->min_flt++;
1462 1540
1541 if (ret & VM_FAULT_RETRY) {
1542 *nonblocking = 0;
1543 return i;
1544 }
1545
1463 /* 1546 /*
1464 * The VM_FAULT_WRITE bit tells us that 1547 * The VM_FAULT_WRITE bit tells us that
1465 * do_wp_page has broken COW when necessary, 1548 * do_wp_page has broken COW when necessary,
@@ -1559,7 +1642,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1559 if (force) 1642 if (force)
1560 flags |= FOLL_FORCE; 1643 flags |= FOLL_FORCE;
1561 1644
1562 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); 1645 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
1646 NULL);
1563} 1647}
1564EXPORT_SYMBOL(get_user_pages); 1648EXPORT_SYMBOL(get_user_pages);
1565 1649
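__get_user_pages() above gains a nonblocking output parameter: when a fault is allowed to retry (FAULT_FLAG_ALLOW_RETRY) and ends up dropping mmap_sem, *nonblocking is cleared and the walk returns early with the pages pinned so far; get_user_pages() itself simply passes NULL. A sketch of the caller-side contract, mirroring what do_mlock_pages() in the mlock.c hunks further down does:

/* Sketch: 'locked' tracks whether mmap_sem is still held on return. */
int locked = 1;
int ret;

down_read(&mm->mmap_sem);
ret = __get_user_pages(current, mm, start, nr_pages, FOLL_TOUCH,
		       NULL, NULL, &locked);
if (locked)
	up_read(&mm->mmap_sem);	/* else a retried fault already dropped it */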
@@ -1584,7 +1668,8 @@ struct page *get_dump_page(unsigned long addr)
1584 struct page *page; 1668 struct page *page;
1585 1669
1586 if (__get_user_pages(current, current->mm, addr, 1, 1670 if (__get_user_pages(current, current->mm, addr, 1,
1587 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) 1671 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
1672 NULL) < 1)
1588 return NULL; 1673 return NULL;
1589 flush_cache_page(vma, addr, page_to_pfn(page)); 1674 flush_cache_page(vma, addr, page_to_pfn(page));
1590 return page; 1675 return page;
@@ -1598,8 +1683,10 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1598 pud_t * pud = pud_alloc(mm, pgd, addr); 1683 pud_t * pud = pud_alloc(mm, pgd, addr);
1599 if (pud) { 1684 if (pud) {
1600 pmd_t * pmd = pmd_alloc(mm, pud, addr); 1685 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1601 if (pmd) 1686 if (pmd) {
1687 VM_BUG_ON(pmd_trans_huge(*pmd));
1602 return pte_alloc_map_lock(mm, pmd, addr, ptl); 1688 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1689 }
1603 } 1690 }
1604 return NULL; 1691 return NULL;
1605} 1692}
@@ -1818,6 +1905,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1818 pmd = pmd_alloc(mm, pud, addr); 1905 pmd = pmd_alloc(mm, pud, addr);
1819 if (!pmd) 1906 if (!pmd)
1820 return -ENOMEM; 1907 return -ENOMEM;
1908 VM_BUG_ON(pmd_trans_huge(*pmd));
1821 do { 1909 do {
1822 next = pmd_addr_end(addr, end); 1910 next = pmd_addr_end(addr, end);
1823 if (remap_pte_range(mm, pmd, addr, next, 1911 if (remap_pte_range(mm, pmd, addr, next,
@@ -2048,19 +2136,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2048 return same; 2136 return same;
2049} 2137}
2050 2138
2051/*
2052 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
2053 * servicing faults for write access. In the normal case, do always want
2054 * pte_mkwrite. But get_user_pages can cause write faults for mappings
2055 * that do not have writing enabled, when used by access_process_vm.
2056 */
2057static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
2058{
2059 if (likely(vma->vm_flags & VM_WRITE))
2060 pte = pte_mkwrite(pte);
2061 return pte;
2062}
2063
2064static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) 2139static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2065{ 2140{
2066 /* 2141 /*
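The deletion above removes maybe_mkwrite() from memory.c, yet the reuse path of do_wp_page() still calls it a few lines further down (new line 2304), so the helper presumably moves to a shared header elsewhere in this merge where the huge-page fault code can also use it; that relocation is not visible in this section. The removed body, reproduced verbatim for reference:

static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pte = pte_mkwrite(pte);
	return pte;
}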
@@ -2112,7 +2187,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2112{ 2187{
2113 struct page *old_page, *new_page; 2188 struct page *old_page, *new_page;
2114 pte_t entry; 2189 pte_t entry;
2115 int reuse = 0, ret = 0; 2190 int ret = 0;
2116 int page_mkwrite = 0; 2191 int page_mkwrite = 0;
2117 struct page *dirty_page = NULL; 2192 struct page *dirty_page = NULL;
2118 2193
@@ -2149,14 +2224,16 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2149 } 2224 }
2150 page_cache_release(old_page); 2225 page_cache_release(old_page);
2151 } 2226 }
2152 reuse = reuse_swap_page(old_page); 2227 if (reuse_swap_page(old_page)) {
2153 if (reuse)
2154 /* 2228 /*
2155 * The page is all ours. Move it to our anon_vma so 2229 * The page is all ours. Move it to our anon_vma so
2156 * the rmap code will not search our parent or siblings. 2230 * the rmap code will not search our parent or siblings.
2157 * Protected against the rmap code by the page lock. 2231 * Protected against the rmap code by the page lock.
2158 */ 2232 */
2159 page_move_anon_rmap(old_page, vma, address); 2233 page_move_anon_rmap(old_page, vma, address);
2234 unlock_page(old_page);
2235 goto reuse;
2236 }
2160 unlock_page(old_page); 2237 unlock_page(old_page);
2161 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2238 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2162 (VM_WRITE|VM_SHARED))) { 2239 (VM_WRITE|VM_SHARED))) {
@@ -2220,18 +2297,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2220 } 2297 }
2221 dirty_page = old_page; 2298 dirty_page = old_page;
2222 get_page(dirty_page); 2299 get_page(dirty_page);
2223 reuse = 1;
2224 }
2225 2300
2226 if (reuse) {
2227reuse: 2301reuse:
2228 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2302 flush_cache_page(vma, address, pte_pfn(orig_pte));
2229 entry = pte_mkyoung(orig_pte); 2303 entry = pte_mkyoung(orig_pte);
2230 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2304 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2231 if (ptep_set_access_flags(vma, address, page_table, entry,1)) 2305 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2232 update_mmu_cache(vma, address, page_table); 2306 update_mmu_cache(vma, address, page_table);
2307 pte_unmap_unlock(page_table, ptl);
2233 ret |= VM_FAULT_WRITE; 2308 ret |= VM_FAULT_WRITE;
2234 goto unlock; 2309
2310 if (!dirty_page)
2311 return ret;
2312
2313 /*
2314 * Yes, Virginia, this is actually required to prevent a race
2315 * with clear_page_dirty_for_io() from clearing the page dirty
2316 * bit after it clear all dirty ptes, but before a racing
2317 * do_wp_page installs a dirty pte.
2318 *
2319 * do_no_page is protected similarly.
2320 */
2321 if (!page_mkwrite) {
2322 wait_on_page_locked(dirty_page);
2323 set_page_dirty_balance(dirty_page, page_mkwrite);
2324 }
2325 put_page(dirty_page);
2326 if (page_mkwrite) {
2327 struct address_space *mapping = dirty_page->mapping;
2328
2329 set_page_dirty(dirty_page);
2330 unlock_page(dirty_page);
2331 page_cache_release(dirty_page);
2332 if (mapping) {
2333 /*
2334 * Some device drivers do not set page.mapping
2335 * but still dirty their pages
2336 */
2337 balance_dirty_pages_ratelimited(mapping);
2338 }
2339 }
2340
2341 /* file_update_time outside page_lock */
2342 if (vma->vm_file)
2343 file_update_time(vma->vm_file);
2344
2345 return ret;
2235 } 2346 }
2236 2347
2237 /* 2348 /*
@@ -2337,39 +2448,6 @@ gotten:
2337 page_cache_release(old_page); 2448 page_cache_release(old_page);
2338unlock: 2449unlock:
2339 pte_unmap_unlock(page_table, ptl); 2450 pte_unmap_unlock(page_table, ptl);
2340 if (dirty_page) {
2341 /*
2342 * Yes, Virginia, this is actually required to prevent a race
2343 * with clear_page_dirty_for_io() from clearing the page dirty
2344 * bit after it clear all dirty ptes, but before a racing
2345 * do_wp_page installs a dirty pte.
2346 *
2347 * do_no_page is protected similarly.
2348 */
2349 if (!page_mkwrite) {
2350 wait_on_page_locked(dirty_page);
2351 set_page_dirty_balance(dirty_page, page_mkwrite);
2352 }
2353 put_page(dirty_page);
2354 if (page_mkwrite) {
2355 struct address_space *mapping = dirty_page->mapping;
2356
2357 set_page_dirty(dirty_page);
2358 unlock_page(dirty_page);
2359 page_cache_release(dirty_page);
2360 if (mapping) {
2361 /*
2362 * Some device drivers do not set page.mapping
2363 * but still dirty their pages
2364 */
2365 balance_dirty_pages_ratelimited(mapping);
2366 }
2367 }
2368
2369 /* file_update_time outside page_lock */
2370 if (vma->vm_file)
2371 file_update_time(vma->vm_file);
2372 }
2373 return ret; 2451 return ret;
2374oom_free_new: 2452oom_free_new:
2375 page_cache_release(new_page); 2453 page_cache_release(new_page);
@@ -3147,9 +3225,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3147 * but allow concurrent faults), and pte mapped but not yet locked. 3225 * but allow concurrent faults), and pte mapped but not yet locked.
3148 * We return with mmap_sem still held, but pte unmapped and unlocked. 3226 * We return with mmap_sem still held, but pte unmapped and unlocked.
3149 */ 3227 */
3150static inline int handle_pte_fault(struct mm_struct *mm, 3228int handle_pte_fault(struct mm_struct *mm,
3151 struct vm_area_struct *vma, unsigned long address, 3229 struct vm_area_struct *vma, unsigned long address,
3152 pte_t *pte, pmd_t *pmd, unsigned int flags) 3230 pte_t *pte, pmd_t *pmd, unsigned int flags)
3153{ 3231{
3154 pte_t entry; 3232 pte_t entry;
3155 spinlock_t *ptl; 3233 spinlock_t *ptl;
@@ -3228,9 +3306,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3228 pmd = pmd_alloc(mm, pud, address); 3306 pmd = pmd_alloc(mm, pud, address);
3229 if (!pmd) 3307 if (!pmd)
3230 return VM_FAULT_OOM; 3308 return VM_FAULT_OOM;
3231 pte = pte_alloc_map(mm, pmd, address); 3309 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
3232 if (!pte) 3310 if (!vma->vm_ops)
3311 return do_huge_pmd_anonymous_page(mm, vma, address,
3312 pmd, flags);
3313 } else {
3314 pmd_t orig_pmd = *pmd;
3315 barrier();
3316 if (pmd_trans_huge(orig_pmd)) {
3317 if (flags & FAULT_FLAG_WRITE &&
3318 !pmd_write(orig_pmd) &&
3319 !pmd_trans_splitting(orig_pmd))
3320 return do_huge_pmd_wp_page(mm, vma, address,
3321 pmd, orig_pmd);
3322 return 0;
3323 }
3324 }
3325
3326 /*
3327 * Use __pte_alloc instead of pte_alloc_map, because we can't
3328 * run pte_offset_map on the pmd, if an huge pmd could
3329 * materialize from under us from a different thread.
3330 */
3331 if (unlikely(__pte_alloc(mm, vma, pmd, address)))
3233 return VM_FAULT_OOM; 3332 return VM_FAULT_OOM;
3333 /* if an huge pmd materialized from under us just retry later */
3334 if (unlikely(pmd_trans_huge(*pmd)))
3335 return 0;
3336 /*
3337 * A regular pmd is established and it can't morph into a huge pmd
3338 * from under us anymore at this point because we hold the mmap_sem
3339 * read mode and khugepaged takes it in write mode. So now it's
3340 * safe to run pte_offset_map().
3341 */
3342 pte = pte_offset_map(pmd, address);
3234 3343
3235 return handle_pte_fault(mm, vma, address, pte, pmd, flags); 3344 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3236} 3345}
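handle_mm_fault() above now checks for a transparent huge pmd before it ever touches the pte level, and it replaces pte_alloc_map() with __pte_alloc() plus a bare pte_offset_map() so nothing maps a pmd that might concurrently turn huge. The key idiom is taking a local snapshot of the pmd with a compiler barrier, because with mmap_sem held only for read a huge pmd can materialize or start splitting underneath the fault. A stripped-down sketch of that idiom (not the fault handler itself):

/* Sketch: act on one consistent snapshot of *pmd. */
pmd_t orig_pmd = *pmd;

barrier();	/* keep the compiler from re-reading *pmd below */
if (pmd_trans_huge(orig_pmd)) {
	/* huge path: never pte_offset_map() this pmd */
} else {
	/* regular path: a pte table may be allocated and mapped */
}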
@@ -3296,7 +3405,12 @@ int make_pages_present(unsigned long addr, unsigned long end)
3296 vma = find_vma(current->mm, addr); 3405 vma = find_vma(current->mm, addr);
3297 if (!vma) 3406 if (!vma)
3298 return -ENOMEM; 3407 return -ENOMEM;
3299 write = (vma->vm_flags & VM_WRITE) != 0; 3408 /*
3409 * We want to touch writable mappings with a write fault in order
3410 * to break COW, except for shared mappings because these don't COW
3411 * and we would not want to dirty them for nothing.
3412 */
3413 write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
3300 BUG_ON(addr >= end); 3414 BUG_ON(addr >= end);
3301 BUG_ON(end > vma->vm_end); 3415 BUG_ON(end > vma->vm_end);
3302 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; 3416 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
@@ -3368,6 +3482,7 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address,
3368 goto out; 3482 goto out;
3369 3483
3370 pmd = pmd_offset(pud, address); 3484 pmd = pmd_offset(pud, address);
3485 VM_BUG_ON(pmd_trans_huge(*pmd));
3371 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) 3486 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3372 goto out; 3487 goto out;
3373 3488
@@ -3608,3 +3723,74 @@ void might_fault(void)
3608} 3723}
3609EXPORT_SYMBOL(might_fault); 3724EXPORT_SYMBOL(might_fault);
3610#endif 3725#endif
3726
3727#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
3728static void clear_gigantic_page(struct page *page,
3729 unsigned long addr,
3730 unsigned int pages_per_huge_page)
3731{
3732 int i;
3733 struct page *p = page;
3734
3735 might_sleep();
3736 for (i = 0; i < pages_per_huge_page;
3737 i++, p = mem_map_next(p, page, i)) {
3738 cond_resched();
3739 clear_user_highpage(p, addr + i * PAGE_SIZE);
3740 }
3741}
3742void clear_huge_page(struct page *page,
3743 unsigned long addr, unsigned int pages_per_huge_page)
3744{
3745 int i;
3746
3747 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3748 clear_gigantic_page(page, addr, pages_per_huge_page);
3749 return;
3750 }
3751
3752 might_sleep();
3753 for (i = 0; i < pages_per_huge_page; i++) {
3754 cond_resched();
3755 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
3756 }
3757}
3758
3759static void copy_user_gigantic_page(struct page *dst, struct page *src,
3760 unsigned long addr,
3761 struct vm_area_struct *vma,
3762 unsigned int pages_per_huge_page)
3763{
3764 int i;
3765 struct page *dst_base = dst;
3766 struct page *src_base = src;
3767
3768 for (i = 0; i < pages_per_huge_page; ) {
3769 cond_resched();
3770 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
3771
3772 i++;
3773 dst = mem_map_next(dst, dst_base, i);
3774 src = mem_map_next(src, src_base, i);
3775 }
3776}
3777
3778void copy_user_huge_page(struct page *dst, struct page *src,
3779 unsigned long addr, struct vm_area_struct *vma,
3780 unsigned int pages_per_huge_page)
3781{
3782 int i;
3783
3784 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3785 copy_user_gigantic_page(dst, src, addr, vma,
3786 pages_per_huge_page);
3787 return;
3788 }
3789
3790 might_sleep();
3791 for (i = 0; i < pages_per_huge_page; i++) {
3792 cond_resched();
3793 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
3794 }
3795}
3796#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
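The new helpers above finish the memory.c changes: clear_huge_page() and copy_user_huge_page() walk the subpages with cond_resched() between them, switching to the slower mem_map_next() iteration only for gigantic pages larger than MAX_ORDER_NR_PAGES. Hypothetical callers follow; HPAGE_PMD_NR is assumed from the THP headers added elsewhere in this merge, and a hugetlbfs caller would pass its hstate's subpage count instead (e.g. 1 << huge_page_order(h)):

/* Hypothetical: zero a freshly allocated transparent huge page before
 * mapping it at the pmd-aligned user address 'haddr'. */
clear_huge_page(new_page, haddr, HPAGE_PMD_NR);

/* Hypothetical: COW-copy an existing huge page for the same address. */
copy_user_huge_page(new_page, old_page, haddr, vma, HPAGE_PMD_NR);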
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2c6523af5473..e92f04749fcb 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -82,9 +82,10 @@ static void release_memory_resource(struct resource *res)
82 82
83#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 83#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
84#ifndef CONFIG_SPARSEMEM_VMEMMAP 84#ifndef CONFIG_SPARSEMEM_VMEMMAP
85static void get_page_bootmem(unsigned long info, struct page *page, int type) 85static void get_page_bootmem(unsigned long info, struct page *page,
86 unsigned long type)
86{ 87{
87 atomic_set(&page->_mapcount, type); 88 page->lru.next = (struct list_head *) type;
88 SetPagePrivate(page); 89 SetPagePrivate(page);
89 set_page_private(page, info); 90 set_page_private(page, info);
90 atomic_inc(&page->_count); 91 atomic_inc(&page->_count);
@@ -94,15 +95,16 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type)
94 * so use __ref to tell modpost not to generate a warning */ 95 * so use __ref to tell modpost not to generate a warning */
95void __ref put_page_bootmem(struct page *page) 96void __ref put_page_bootmem(struct page *page)
96{ 97{
97 int type; 98 unsigned long type;
98 99
99 type = atomic_read(&page->_mapcount); 100 type = (unsigned long) page->lru.next;
100 BUG_ON(type >= -1); 101 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
102 type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
101 103
102 if (atomic_dec_return(&page->_count) == 1) { 104 if (atomic_dec_return(&page->_count) == 1) {
103 ClearPagePrivate(page); 105 ClearPagePrivate(page);
104 set_page_private(page, 0); 106 set_page_private(page, 0);
105 reset_page_mapcount(page); 107 INIT_LIST_HEAD(&page->lru);
106 __free_pages_bootmem(page, 0); 108 __free_pages_bootmem(page, 0);
107 } 109 }
108 110
@@ -733,7 +735,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
733 goto out; 735 goto out;
734 } 736 }
735 /* this function returns # of failed pages */ 737 /* this function returns # of failed pages */
736 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); 738 ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
739 true, true);
737 if (ret) 740 if (ret)
738 putback_lru_pages(&source); 741 putback_lru_pages(&source);
739 } 742 }
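The memory_hotplug.c hunk stops encoding the bootmem section-info type in page->_mapcount and stashes it in page->lru.next instead, now range-checked against MEMORY_HOTPLUG_MIN/MAX_BOOTMEM_TYPE on release; the likely motivation is that the THP work elsewhere in this merge wants _mapcount on compound tail pages, though that is inferred rather than shown here. The store/load pair boils down to a cast in each direction:

/* Sketch of the new encoding: the type rides in the otherwise unused
 * lru.next pointer of a bootmem-reserved page. */
page->lru.next = (struct list_head *)type;	/* get_page_bootmem() */
type = (unsigned long)page->lru.next;		/* put_page_bootmem() */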
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 11ff260fb282..368fc9d23610 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -514,6 +514,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
514 pmd = pmd_offset(pud, addr); 514 pmd = pmd_offset(pud, addr);
515 do { 515 do {
516 next = pmd_addr_end(addr, end); 516 next = pmd_addr_end(addr, end);
517 split_huge_page_pmd(vma->vm_mm, pmd);
517 if (pmd_none_or_clear_bad(pmd)) 518 if (pmd_none_or_clear_bad(pmd))
518 continue; 519 continue;
519 if (check_pte_range(vma, pmd, addr, next, nodes, 520 if (check_pte_range(vma, pmd, addr, next, nodes,
@@ -935,7 +936,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
935 return PTR_ERR(vma); 936 return PTR_ERR(vma);
936 937
937 if (!list_empty(&pagelist)) { 938 if (!list_empty(&pagelist)) {
938 err = migrate_pages(&pagelist, new_node_page, dest, 0); 939 err = migrate_pages(&pagelist, new_node_page, dest,
940 false, true);
939 if (err) 941 if (err)
940 putback_lru_pages(&pagelist); 942 putback_lru_pages(&pagelist);
941 } 943 }
@@ -1155,7 +1157,8 @@ static long do_mbind(unsigned long start, unsigned long len,
1155 1157
1156 if (!list_empty(&pagelist)) { 1158 if (!list_empty(&pagelist)) {
1157 nr_failed = migrate_pages(&pagelist, new_vma_page, 1159 nr_failed = migrate_pages(&pagelist, new_vma_page,
1158 (unsigned long)vma, 0); 1160 (unsigned long)vma,
1161 false, true);
1159 if (nr_failed) 1162 if (nr_failed)
1160 putback_lru_pages(&pagelist); 1163 putback_lru_pages(&pagelist);
1161 } 1164 }
@@ -1308,16 +1311,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1308 1311
1309 /* Find the mm_struct */ 1312 /* Find the mm_struct */
1310 rcu_read_lock(); 1313 rcu_read_lock();
1311 read_lock(&tasklist_lock);
1312 task = pid ? find_task_by_vpid(pid) : current; 1314 task = pid ? find_task_by_vpid(pid) : current;
1313 if (!task) { 1315 if (!task) {
1314 read_unlock(&tasklist_lock);
1315 rcu_read_unlock(); 1316 rcu_read_unlock();
1316 err = -ESRCH; 1317 err = -ESRCH;
1317 goto out; 1318 goto out;
1318 } 1319 }
1319 mm = get_task_mm(task); 1320 mm = get_task_mm(task);
1320 read_unlock(&tasklist_lock);
1321 rcu_read_unlock(); 1321 rcu_read_unlock();
1322 1322
1323 err = -EINVAL; 1323 err = -EINVAL;
@@ -1796,7 +1796,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1796} 1796}
1797 1797
1798/** 1798/**
1799 * alloc_page_vma - Allocate a page for a VMA. 1799 * alloc_pages_vma - Allocate a page for a VMA.
1800 * 1800 *
1801 * @gfp: 1801 * @gfp:
1802 * %GFP_USER user allocation. 1802 * %GFP_USER user allocation.
@@ -1805,6 +1805,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1805 * %GFP_FS allocation should not call back into a file system. 1805 * %GFP_FS allocation should not call back into a file system.
1806 * %GFP_ATOMIC don't sleep. 1806 * %GFP_ATOMIC don't sleep.
1807 * 1807 *
1808 * @order:Order of the GFP allocation.
1808 * @vma: Pointer to VMA or NULL if not available. 1809 * @vma: Pointer to VMA or NULL if not available.
1809 * @addr: Virtual Address of the allocation. Must be inside the VMA. 1810 * @addr: Virtual Address of the allocation. Must be inside the VMA.
1810 * 1811 *
@@ -1818,7 +1819,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1818 * Should be called with the mm_sem of the vma hold. 1819 * Should be called with the mm_sem of the vma hold.
1819 */ 1820 */
1820struct page * 1821struct page *
1821alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) 1822alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1823 unsigned long addr)
1822{ 1824{
1823 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1825 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1824 struct zonelist *zl; 1826 struct zonelist *zl;
@@ -1830,7 +1832,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1830 1832
1831 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); 1833 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1832 mpol_cond_put(pol); 1834 mpol_cond_put(pol);
1833 page = alloc_page_interleave(gfp, 0, nid); 1835 page = alloc_page_interleave(gfp, order, nid);
1834 put_mems_allowed(); 1836 put_mems_allowed();
1835 return page; 1837 return page;
1836 } 1838 }
@@ -1839,7 +1841,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1839 /* 1841 /*
1840 * slow path: ref counted shared policy 1842 * slow path: ref counted shared policy
1841 */ 1843 */
1842 struct page *page = __alloc_pages_nodemask(gfp, 0, 1844 struct page *page = __alloc_pages_nodemask(gfp, order,
1843 zl, policy_nodemask(gfp, pol)); 1845 zl, policy_nodemask(gfp, pol));
1844 __mpol_put(pol); 1846 __mpol_put(pol);
1845 put_mems_allowed(); 1847 put_mems_allowed();
@@ -1848,7 +1850,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1848 /* 1850 /*
1849 * fast path: default or task policy 1851 * fast path: default or task policy
1850 */ 1852 */
1851 page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); 1853 page = __alloc_pages_nodemask(gfp, order, zl,
1854 policy_nodemask(gfp, pol));
1852 put_mems_allowed(); 1855 put_mems_allowed();
1853 return page; 1856 return page;
1854} 1857}
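alloc_page_vma() grows into alloc_pages_vma() with an explicit order argument so the policy-aware allocator can hand back huge-page-sized blocks as well as single pages. A hypothetical call for a pmd-sized allocation; the gfp mask is illustrative only, and HPAGE_PMD_ORDER is assumed from the THP headers added elsewhere in this merge:

/* Hypothetical: policy-aware allocation of one pmd-sized compound page. */
struct page *page = alloc_pages_vma(GFP_HIGHUSER_MOVABLE | __GFP_COMP,
				    HPAGE_PMD_ORDER, vma, addr);
if (!page)
	return VM_FAULT_OOM;	/* placeholder error handling */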
diff --git a/mm/migrate.c b/mm/migrate.c
index 6ae8a66a7045..46fe8cc13d67 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -113,6 +113,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
113 goto out; 113 goto out;
114 114
115 pmd = pmd_offset(pud, addr); 115 pmd = pmd_offset(pud, addr);
116 if (pmd_trans_huge(*pmd))
117 goto out;
116 if (!pmd_present(*pmd)) 118 if (!pmd_present(*pmd))
117 goto out; 119 goto out;
118 120
@@ -246,7 +248,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
246 248
247 expected_count = 2 + page_has_private(page); 249 expected_count = 2 + page_has_private(page);
248 if (page_count(page) != expected_count || 250 if (page_count(page) != expected_count ||
249 (struct page *)radix_tree_deref_slot(pslot) != page) { 251 radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
250 spin_unlock_irq(&mapping->tree_lock); 252 spin_unlock_irq(&mapping->tree_lock);
251 return -EAGAIN; 253 return -EAGAIN;
252 } 254 }
@@ -318,7 +320,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
318 320
319 expected_count = 2 + page_has_private(page); 321 expected_count = 2 + page_has_private(page);
320 if (page_count(page) != expected_count || 322 if (page_count(page) != expected_count ||
321 (struct page *)radix_tree_deref_slot(pslot) != page) { 323 radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
322 spin_unlock_irq(&mapping->tree_lock); 324 spin_unlock_irq(&mapping->tree_lock);
323 return -EAGAIN; 325 return -EAGAIN;
324 } 326 }
@@ -614,13 +616,12 @@ static int move_to_new_page(struct page *newpage, struct page *page,
614 * to the newly allocated page in newpage. 616 * to the newly allocated page in newpage.
615 */ 617 */
616static int unmap_and_move(new_page_t get_new_page, unsigned long private, 618static int unmap_and_move(new_page_t get_new_page, unsigned long private,
617 struct page *page, int force, int offlining) 619 struct page *page, int force, bool offlining, bool sync)
618{ 620{
619 int rc = 0; 621 int rc = 0;
620 int *result = NULL; 622 int *result = NULL;
621 struct page *newpage = get_new_page(page, private, &result); 623 struct page *newpage = get_new_page(page, private, &result);
622 int remap_swapcache = 1; 624 int remap_swapcache = 1;
623 int rcu_locked = 0;
624 int charge = 0; 625 int charge = 0;
625 struct mem_cgroup *mem = NULL; 626 struct mem_cgroup *mem = NULL;
626 struct anon_vma *anon_vma = NULL; 627 struct anon_vma *anon_vma = NULL;
@@ -632,6 +633,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
632 /* page was freed from under us. So we are done. */ 633 /* page was freed from under us. So we are done. */
633 goto move_newpage; 634 goto move_newpage;
634 } 635 }
636 if (unlikely(PageTransHuge(page)))
637 if (unlikely(split_huge_page(page)))
638 goto move_newpage;
635 639
636 /* prepare cgroup just returns 0 or -ENOMEM */ 640 /* prepare cgroup just returns 0 or -ENOMEM */
637 rc = -EAGAIN; 641 rc = -EAGAIN;
@@ -639,6 +643,23 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
639 if (!trylock_page(page)) { 643 if (!trylock_page(page)) {
640 if (!force) 644 if (!force)
641 goto move_newpage; 645 goto move_newpage;
646
647 /*
648 * It's not safe for direct compaction to call lock_page.
649 * For example, during page readahead pages are added locked
650 * to the LRU. Later, when the IO completes the pages are
651 * marked uptodate and unlocked. However, the queueing
652 * could be merging multiple pages for one bio (e.g.
653 * mpage_readpages). If an allocation happens for the
654 * second or third page, the process can end up locking
655 * the same page twice and deadlocking. Rather than
656 * trying to be clever about what pages can be locked,
657 * avoid the use of lock_page for direct compaction
658 * altogether.
659 */
660 if (current->flags & PF_MEMALLOC)
661 goto move_newpage;
662
642 lock_page(page); 663 lock_page(page);
643 } 664 }
644 665
@@ -665,27 +686,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
665 BUG_ON(charge); 686 BUG_ON(charge);
666 687
667 if (PageWriteback(page)) { 688 if (PageWriteback(page)) {
668 if (!force) 689 if (!force || !sync)
669 goto uncharge; 690 goto uncharge;
670 wait_on_page_writeback(page); 691 wait_on_page_writeback(page);
671 } 692 }
672 /* 693 /*
673 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, 694 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
674 * we cannot notice that anon_vma is freed while we migrates a page. 695 * we cannot notice that anon_vma is freed while we migrates a page.
675 * This rcu_read_lock() delays freeing anon_vma pointer until the end 696 * This get_anon_vma() delays freeing anon_vma pointer until the end
676 * of migration. File cache pages are no problem because of page_lock() 697 * of migration. File cache pages are no problem because of page_lock()
677 * File Caches may use write_page() or lock_page() in migration, then, 698 * File Caches may use write_page() or lock_page() in migration, then,
678 * just care Anon page here. 699 * just care Anon page here.
679 */ 700 */
680 if (PageAnon(page)) { 701 if (PageAnon(page)) {
681 rcu_read_lock(); 702 /*
682 rcu_locked = 1; 703 * Only page_lock_anon_vma() understands the subtleties of
683 704 * getting a hold on an anon_vma from outside one of its mms.
684 /* Determine how to safely use anon_vma */ 705 */
685 if (!page_mapped(page)) { 706 anon_vma = page_lock_anon_vma(page);
686 if (!PageSwapCache(page)) 707 if (anon_vma) {
687 goto rcu_unlock; 708 /*
688 709 * Take a reference count on the anon_vma if the
710 * page is mapped so that it is guaranteed to
711 * exist when the page is remapped later
712 */
713 get_anon_vma(anon_vma);
714 page_unlock_anon_vma(anon_vma);
715 } else if (PageSwapCache(page)) {
689 /* 716 /*
690 * We cannot be sure that the anon_vma of an unmapped 717 * We cannot be sure that the anon_vma of an unmapped
691 * swapcache page is safe to use because we don't 718 * swapcache page is safe to use because we don't
@@ -700,13 +727,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
700 */ 727 */
701 remap_swapcache = 0; 728 remap_swapcache = 0;
702 } else { 729 } else {
703 /* 730 goto uncharge;
704 * Take a reference count on the anon_vma if the
705 * page is mapped so that it is guaranteed to
706 * exist when the page is remapped later
707 */
708 anon_vma = page_anon_vma(page);
709 get_anon_vma(anon_vma);
710 } 731 }
711 } 732 }
712 733
@@ -723,16 +744,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
723 * free the metadata, so the page can be freed. 744 * free the metadata, so the page can be freed.
724 */ 745 */
725 if (!page->mapping) { 746 if (!page->mapping) {
726 if (!PageAnon(page) && page_has_private(page)) { 747 VM_BUG_ON(PageAnon(page));
727 /* 748 if (page_has_private(page)) {
728 * Go direct to try_to_free_buffers() here because
729 * a) that's what try_to_release_page() would do anyway
730 * b) we may be under rcu_read_lock() here, so we can't
731 * use GFP_KERNEL which is what try_to_release_page()
732 * needs to be effective.
733 */
734 try_to_free_buffers(page); 749 try_to_free_buffers(page);
735 goto rcu_unlock; 750 goto uncharge;
736 } 751 }
737 goto skip_unmap; 752 goto skip_unmap;
738 } 753 }
@@ -746,17 +761,14 @@ skip_unmap:
746 761
747 if (rc && remap_swapcache) 762 if (rc && remap_swapcache)
748 remove_migration_ptes(page, page); 763 remove_migration_ptes(page, page);
749rcu_unlock:
750 764
751 /* Drop an anon_vma reference if we took one */ 765 /* Drop an anon_vma reference if we took one */
752 if (anon_vma) 766 if (anon_vma)
753 drop_anon_vma(anon_vma); 767 drop_anon_vma(anon_vma);
754 768
755 if (rcu_locked)
756 rcu_read_unlock();
757uncharge: 769uncharge:
758 if (!charge) 770 if (!charge)
759 mem_cgroup_end_migration(mem, page, newpage); 771 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
760unlock: 772unlock:
761 unlock_page(page); 773 unlock_page(page);
762 774
@@ -810,12 +822,11 @@ move_newpage:
810 */ 822 */
811static int unmap_and_move_huge_page(new_page_t get_new_page, 823static int unmap_and_move_huge_page(new_page_t get_new_page,
812 unsigned long private, struct page *hpage, 824 unsigned long private, struct page *hpage,
813 int force, int offlining) 825 int force, bool offlining, bool sync)
814{ 826{
815 int rc = 0; 827 int rc = 0;
816 int *result = NULL; 828 int *result = NULL;
817 struct page *new_hpage = get_new_page(hpage, private, &result); 829 struct page *new_hpage = get_new_page(hpage, private, &result);
818 int rcu_locked = 0;
819 struct anon_vma *anon_vma = NULL; 830 struct anon_vma *anon_vma = NULL;
820 831
821 if (!new_hpage) 832 if (!new_hpage)
@@ -824,18 +835,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
824 rc = -EAGAIN; 835 rc = -EAGAIN;
825 836
826 if (!trylock_page(hpage)) { 837 if (!trylock_page(hpage)) {
827 if (!force) 838 if (!force || !sync)
828 goto out; 839 goto out;
829 lock_page(hpage); 840 lock_page(hpage);
830 } 841 }
831 842
832 if (PageAnon(hpage)) { 843 if (PageAnon(hpage)) {
833 rcu_read_lock(); 844 anon_vma = page_lock_anon_vma(hpage);
834 rcu_locked = 1; 845 if (anon_vma) {
835 846 get_anon_vma(anon_vma);
836 if (page_mapped(hpage)) { 847 page_unlock_anon_vma(anon_vma);
837 anon_vma = page_anon_vma(hpage);
838 atomic_inc(&anon_vma->external_refcount);
839 } 848 }
840 } 849 }
841 850
@@ -847,16 +856,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
847 if (rc) 856 if (rc)
848 remove_migration_ptes(hpage, hpage); 857 remove_migration_ptes(hpage, hpage);
849 858
850 if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, 859 if (anon_vma)
851 &anon_vma->lock)) { 860 drop_anon_vma(anon_vma);
852 int empty = list_empty(&anon_vma->head);
853 spin_unlock(&anon_vma->lock);
854 if (empty)
855 anon_vma_free(anon_vma);
856 }
857
858 if (rcu_locked)
859 rcu_read_unlock();
860out: 861out:
861 unlock_page(hpage); 862 unlock_page(hpage);
862 863
@@ -892,7 +893,8 @@ out:
892 * Return: Number of pages not migrated or error code. 893 * Return: Number of pages not migrated or error code.
893 */ 894 */
894int migrate_pages(struct list_head *from, 895int migrate_pages(struct list_head *from,
895 new_page_t get_new_page, unsigned long private, int offlining) 896 new_page_t get_new_page, unsigned long private, bool offlining,
897 bool sync)
896{ 898{
897 int retry = 1; 899 int retry = 1;
898 int nr_failed = 0; 900 int nr_failed = 0;
@@ -912,7 +914,8 @@ int migrate_pages(struct list_head *from,
912 cond_resched(); 914 cond_resched();
913 915
914 rc = unmap_and_move(get_new_page, private, 916 rc = unmap_and_move(get_new_page, private,
915 page, pass > 2, offlining); 917 page, pass > 2, offlining,
918 sync);
916 919
917 switch(rc) { 920 switch(rc) {
918 case -ENOMEM: 921 case -ENOMEM:
@@ -941,7 +944,8 @@ out:
941} 944}
942 945
943int migrate_huge_pages(struct list_head *from, 946int migrate_huge_pages(struct list_head *from,
944 new_page_t get_new_page, unsigned long private, int offlining) 947 new_page_t get_new_page, unsigned long private, bool offlining,
948 bool sync)
945{ 949{
946 int retry = 1; 950 int retry = 1;
947 int nr_failed = 0; 951 int nr_failed = 0;
@@ -957,7 +961,8 @@ int migrate_huge_pages(struct list_head *from,
957 cond_resched(); 961 cond_resched();
958 962
959 rc = unmap_and_move_huge_page(get_new_page, 963 rc = unmap_and_move_huge_page(get_new_page,
960 private, page, pass > 2, offlining); 964 private, page, pass > 2, offlining,
965 sync);
961 966
962 switch(rc) { 967 switch(rc) {
963 case -ENOMEM: 968 case -ENOMEM:
@@ -1042,7 +1047,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
1042 if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) 1047 if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
1043 goto set_status; 1048 goto set_status;
1044 1049
1045 page = follow_page(vma, pp->addr, FOLL_GET); 1050 page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
1046 1051
1047 err = PTR_ERR(page); 1052 err = PTR_ERR(page);
1048 if (IS_ERR(page)) 1053 if (IS_ERR(page))
@@ -1090,7 +1095,7 @@ set_status:
1090 err = 0; 1095 err = 0;
1091 if (!list_empty(&pagelist)) { 1096 if (!list_empty(&pagelist)) {
1092 err = migrate_pages(&pagelist, new_page_node, 1097 err = migrate_pages(&pagelist, new_page_node,
1093 (unsigned long)pm, 0); 1098 (unsigned long)pm, 0, true);
1094 if (err) 1099 if (err)
1095 putback_lru_pages(&pagelist); 1100 putback_lru_pages(&pagelist);
1096 } 1101 }
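The migrate.c changes widen migrate_pages() and migrate_huge_pages() to take two booleans: offlining keeps its previous meaning, while sync, as the unmap_and_move() hunks above show, decides whether migration may block on page writeback or on lock_page(). The callers in this section are updated mechanically; a hedged sketch of the resulting call shape, with new_page_cb and cb_arg standing in for the caller's allocator callback and its unsigned long argument:

/* Sketch: asynchronous (non-blocking) migration of an isolated page list. */
err = migrate_pages(&pagelist, new_page_cb, cb_arg,
		    false /* offlining */, false /* sync */);
if (err)
	putback_lru_pages(&pagelist);	/* return unmigrated pages to the LRU */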
diff --git a/mm/mincore.c b/mm/mincore.c
index 9ac42dc6d7b6..a4e6b9d75c76 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -154,6 +154,13 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
154 pmd = pmd_offset(pud, addr); 154 pmd = pmd_offset(pud, addr);
155 do { 155 do {
156 next = pmd_addr_end(addr, end); 156 next = pmd_addr_end(addr, end);
157 if (pmd_trans_huge(*pmd)) {
158 if (mincore_huge_pmd(vma, pmd, addr, next, vec)) {
159 vec += (next - addr) >> PAGE_SHIFT;
160 continue;
161 }
162 /* fall through */
163 }
157 if (pmd_none_or_clear_bad(pmd)) 164 if (pmd_none_or_clear_bad(pmd))
158 mincore_unmapped_range(vma, addr, next, vec); 165 mincore_unmapped_range(vma, addr, next, vec);
159 else 166 else
diff --git a/mm/mlock.c b/mm/mlock.c
index b70919ce4f72..13e81ee8be9d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -155,13 +155,12 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add
155 * vma->vm_mm->mmap_sem must be held for at least read. 155 * vma->vm_mm->mmap_sem must be held for at least read.
156 */ 156 */
157static long __mlock_vma_pages_range(struct vm_area_struct *vma, 157static long __mlock_vma_pages_range(struct vm_area_struct *vma,
158 unsigned long start, unsigned long end) 158 unsigned long start, unsigned long end,
159 int *nonblocking)
159{ 160{
160 struct mm_struct *mm = vma->vm_mm; 161 struct mm_struct *mm = vma->vm_mm;
161 unsigned long addr = start; 162 unsigned long addr = start;
162 struct page *pages[16]; /* 16 gives a reasonable batch */
163 int nr_pages = (end - start) / PAGE_SIZE; 163 int nr_pages = (end - start) / PAGE_SIZE;
164 int ret = 0;
165 int gup_flags; 164 int gup_flags;
166 165
167 VM_BUG_ON(start & ~PAGE_MASK); 166 VM_BUG_ON(start & ~PAGE_MASK);
@@ -170,73 +169,26 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
170 VM_BUG_ON(end > vma->vm_end); 169 VM_BUG_ON(end > vma->vm_end);
171 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); 170 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
172 171
173 gup_flags = FOLL_TOUCH | FOLL_GET; 172 gup_flags = FOLL_TOUCH;
174 if (vma->vm_flags & VM_WRITE) 173 /*
174 * We want to touch writable mappings with a write fault in order
175 * to break COW, except for shared mappings because these don't COW
176 * and we would not want to dirty them for nothing.
177 */
178 if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
175 gup_flags |= FOLL_WRITE; 179 gup_flags |= FOLL_WRITE;
176 180
181 if (vma->vm_flags & VM_LOCKED)
182 gup_flags |= FOLL_MLOCK;
183
177 /* We don't try to access the guard page of a stack vma */ 184 /* We don't try to access the guard page of a stack vma */
178 if (stack_guard_page(vma, start)) { 185 if (stack_guard_page(vma, start)) {
179 addr += PAGE_SIZE; 186 addr += PAGE_SIZE;
180 nr_pages--; 187 nr_pages--;
181 } 188 }
182 189
183 while (nr_pages > 0) { 190 return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
184 int i; 191 NULL, NULL, nonblocking);
185
186 cond_resched();
187
188 /*
189 * get_user_pages makes pages present if we are
190 * setting mlock. and this extra reference count will
191 * disable migration of this page. However, page may
192 * still be truncated out from under us.
193 */
194 ret = __get_user_pages(current, mm, addr,
195 min_t(int, nr_pages, ARRAY_SIZE(pages)),
196 gup_flags, pages, NULL);
197 /*
198 * This can happen for, e.g., VM_NONLINEAR regions before
199 * a page has been allocated and mapped at a given offset,
200 * or for addresses that map beyond end of a file.
201 * We'll mlock the pages if/when they get faulted in.
202 */
203 if (ret < 0)
204 break;
205
206 lru_add_drain(); /* push cached pages to LRU */
207
208 for (i = 0; i < ret; i++) {
209 struct page *page = pages[i];
210
211 if (page->mapping) {
212 /*
213 * That preliminary check is mainly to avoid
214 * the pointless overhead of lock_page on the
215 * ZERO_PAGE: which might bounce very badly if
216 * there is contention. However, we're still
217 * dirtying its cacheline with get/put_page:
218 * we'll add another __get_user_pages flag to
219 * avoid it if that case turns out to matter.
220 */
221 lock_page(page);
222 /*
223 * Because we lock page here and migration is
224 * blocked by the elevated reference, we need
225 * only check for file-cache page truncation.
226 */
227 if (page->mapping)
228 mlock_vma_page(page);
229 unlock_page(page);
230 }
231 put_page(page); /* ref from get_user_pages() */
232 }
233
234 addr += ret * PAGE_SIZE;
235 nr_pages -= ret;
236 ret = 0;
237 }
238
239 return ret; /* 0 or negative error code */
240} 192}
241 193
242/* 194/*
@@ -280,7 +232,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
280 is_vm_hugetlb_page(vma) || 232 is_vm_hugetlb_page(vma) ||
281 vma == get_gate_vma(current))) { 233 vma == get_gate_vma(current))) {
282 234
283 __mlock_vma_pages_range(vma, start, end); 235 __mlock_vma_pages_range(vma, start, end, NULL);
284 236
285 /* Hide errors from mmap() and other callers */ 237 /* Hide errors from mmap() and other callers */
286 return 0; 238 return 0;
@@ -372,18 +324,10 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
372 int ret = 0; 324 int ret = 0;
373 int lock = newflags & VM_LOCKED; 325 int lock = newflags & VM_LOCKED;
374 326
375 if (newflags == vma->vm_flags || 327 if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
376 (vma->vm_flags & (VM_IO | VM_PFNMAP))) 328 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))
377 goto out; /* don't set VM_LOCKED, don't count */ 329 goto out; /* don't set VM_LOCKED, don't count */
378 330
379 if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
380 is_vm_hugetlb_page(vma) ||
381 vma == get_gate_vma(current)) {
382 if (lock)
383 make_pages_present(start, end);
384 goto out; /* don't set VM_LOCKED, don't count */
385 }
386
387 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 331 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
388 *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, 332 *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
389 vma->vm_file, pgoff, vma_policy(vma)); 333 vma->vm_file, pgoff, vma_policy(vma));
@@ -419,14 +363,10 @@ success:
419 * set VM_LOCKED, __mlock_vma_pages_range will bring it back. 363 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
420 */ 364 */
421 365
422 if (lock) { 366 if (lock)
423 vma->vm_flags = newflags; 367 vma->vm_flags = newflags;
424 ret = __mlock_vma_pages_range(vma, start, end); 368 else
425 if (ret < 0)
426 ret = __mlock_posix_error_return(ret);
427 } else {
428 munlock_vma_pages_range(vma, start, end); 369 munlock_vma_pages_range(vma, start, end);
429 }
430 370
431out: 371out:
432 *prev = vma; 372 *prev = vma;
@@ -439,7 +379,8 @@ static int do_mlock(unsigned long start, size_t len, int on)
439 struct vm_area_struct * vma, * prev; 379 struct vm_area_struct * vma, * prev;
440 int error; 380 int error;
441 381
442 len = PAGE_ALIGN(len); 382 VM_BUG_ON(start & ~PAGE_MASK);
383 VM_BUG_ON(len != PAGE_ALIGN(len));
443 end = start + len; 384 end = start + len;
444 if (end < start) 385 if (end < start)
445 return -EINVAL; 386 return -EINVAL;
@@ -482,6 +423,62 @@ static int do_mlock(unsigned long start, size_t len, int on)
482 return error; 423 return error;
483} 424}
484 425
426static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
427{
428 struct mm_struct *mm = current->mm;
429 unsigned long end, nstart, nend;
430 struct vm_area_struct *vma = NULL;
431 int locked = 0;
432 int ret = 0;
433
434 VM_BUG_ON(start & ~PAGE_MASK);
435 VM_BUG_ON(len != PAGE_ALIGN(len));
436 end = start + len;
437
438 for (nstart = start; nstart < end; nstart = nend) {
439 /*
440 * We want to fault in pages for [nstart; end) address range.
441 * Find first corresponding VMA.
442 */
443 if (!locked) {
444 locked = 1;
445 down_read(&mm->mmap_sem);
446 vma = find_vma(mm, nstart);
447 } else if (nstart >= vma->vm_end)
448 vma = vma->vm_next;
449 if (!vma || vma->vm_start >= end)
450 break;
451 /*
452 * Set [nstart; nend) to intersection of desired address
453 * range with the first VMA. Also, skip undesirable VMA types.
454 */
455 nend = min(end, vma->vm_end);
456 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
457 continue;
458 if (nstart < vma->vm_start)
459 nstart = vma->vm_start;
460 /*
461 * Now fault in a range of pages. __mlock_vma_pages_range()
462 * double checks the vma flags, so that it won't mlock pages
463 * if the vma was already munlocked.
464 */
465 ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
466 if (ret < 0) {
467 if (ignore_errors) {
468 ret = 0;
469 continue; /* continue at next VMA */
470 }
471 ret = __mlock_posix_error_return(ret);
472 break;
473 }
474 nend = nstart + ret * PAGE_SIZE;
475 ret = 0;
476 }
477 if (locked)
478 up_read(&mm->mmap_sem);
479 return ret; /* 0 or negative error code */
480}
481
485SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) 482SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
486{ 483{
487 unsigned long locked; 484 unsigned long locked;
@@ -507,6 +504,8 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
507 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) 504 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
508 error = do_mlock(start, len, 1); 505 error = do_mlock(start, len, 1);
509 up_write(&current->mm->mmap_sem); 506 up_write(&current->mm->mmap_sem);
507 if (!error)
508 error = do_mlock_pages(start, len, 0);
510 return error; 509 return error;
511} 510}
512 511
@@ -571,6 +570,10 @@ SYSCALL_DEFINE1(mlockall, int, flags)
571 capable(CAP_IPC_LOCK)) 570 capable(CAP_IPC_LOCK))
572 ret = do_mlockall(flags); 571 ret = do_mlockall(flags);
573 up_write(&current->mm->mmap_sem); 572 up_write(&current->mm->mmap_sem);
573 if (!ret && (flags & MCL_CURRENT)) {
574 /* Ignore errors */
575 do_mlock_pages(0, TASK_SIZE, 1);
576 }
574out: 577out:
575 return ret; 578 return ret;
576} 579}
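Taken together, the mlock.c hunks turn mlock() from making pages present inside mlock_fixup(), under the mmap_sem write lock, into a two-phase scheme: mark the VMAs first, then fault the pages in afterwards through __get_user_pages() under the read lock, with faults allowed to retry. Condensed to its control flow (error handling trimmed, names as in the diff):

/* Sketch of the new mlock() flow. */
down_write(&current->mm->mmap_sem);
error = do_mlock(start, len, 1);		/* mark the VMAs VM_LOCKED */
up_write(&current->mm->mmap_sem);
if (!error)
	error = do_mlock_pages(start, len, 0);	/* fault in under mmap_sem read */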
diff --git a/mm/mmap.c b/mm/mmap.c
index 50a4aa0255a0..2ec8eb5a9cdd 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -29,6 +29,7 @@
29#include <linux/mmu_notifier.h> 29#include <linux/mmu_notifier.h>
30#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31#include <linux/audit.h> 31#include <linux/audit.h>
32#include <linux/khugepaged.h>
32 33
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
34#include <asm/cacheflush.h> 35#include <asm/cacheflush.h>
@@ -253,7 +254,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
253 down_write(&mm->mmap_sem); 254 down_write(&mm->mmap_sem);
254 255
255#ifdef CONFIG_COMPAT_BRK 256#ifdef CONFIG_COMPAT_BRK
256 min_brk = mm->end_code; 257 /*
258 * CONFIG_COMPAT_BRK can still be overridden by setting
259 * randomize_va_space to 2, which will still cause mm->start_brk
260 * to be arbitrarily shifted
261 */
262 if (mm->start_brk > PAGE_ALIGN(mm->end_data))
263 min_brk = mm->start_brk;
264 else
265 min_brk = mm->end_data;
257#else 266#else
258 min_brk = mm->start_brk; 267 min_brk = mm->start_brk;
259#endif 268#endif
@@ -588,6 +597,8 @@ again: remove_next = 1 + (end > next->vm_end);
588 } 597 }
589 } 598 }
590 599
600 vma_adjust_trans_huge(vma, start, end, adjust_next);
601
591 /* 602 /*
592 * When changing only vma->vm_end, we don't really need anon_vma 603 * When changing only vma->vm_end, we don't really need anon_vma
593 * lock. This is a fairly rare case by itself, but the anon_vma 604 * lock. This is a fairly rare case by itself, but the anon_vma
@@ -815,6 +826,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
815 end, prev->vm_pgoff, NULL); 826 end, prev->vm_pgoff, NULL);
816 if (err) 827 if (err)
817 return NULL; 828 return NULL;
829 khugepaged_enter_vma_merge(prev);
818 return prev; 830 return prev;
819 } 831 }
820 832
@@ -833,6 +845,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
833 next->vm_pgoff - pglen, NULL); 845 next->vm_pgoff - pglen, NULL);
834 if (err) 846 if (err)
835 return NULL; 847 return NULL;
848 khugepaged_enter_vma_merge(area);
836 return area; 849 return area;
837 } 850 }
838 851
@@ -1761,6 +1774,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1761 } 1774 }
1762 } 1775 }
1763 vma_unlock_anon_vma(vma); 1776 vma_unlock_anon_vma(vma);
1777 khugepaged_enter_vma_merge(vma);
1764 return error; 1778 return error;
1765} 1779}
1766#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ 1780#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1808,6 +1822,7 @@ static int expand_downwards(struct vm_area_struct *vma,
1808 } 1822 }
1809 } 1823 }
1810 vma_unlock_anon_vma(vma); 1824 vma_unlock_anon_vma(vma);
1825 khugepaged_enter_vma_merge(vma);
1811 return error; 1826 return error;
1812} 1827}
1813 1828
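
The khugepaged_enter_vma_merge() calls added above hand a VMA to khugepaged whenever a merge or stack expansion may have produced a range large enough to back with a huge page. A rough sketch of the underlying eligibility test follows; the 2 MiB unit size, spans_huge_unit() and enqueue_for_collapse() are assumptions for illustration, not the kernel interface.

/*
 * Rough sketch of the "worth handing to khugepaged?" test implied by the
 * hooks above: a grown region is a collapse candidate only if it contains
 * at least one fully aligned huge-page-sized unit.
 */
#include <stdbool.h>
#include <stdio.h>

#define HPAGE_SIZE	(2UL << 20)		/* assume 2 MiB huge pages */
#define HPAGE_MASK	(~(HPAGE_SIZE - 1))

static bool spans_huge_unit(unsigned long start, unsigned long end)
{
	unsigned long first = (start + HPAGE_SIZE - 1) & HPAGE_MASK;

	return first + HPAGE_SIZE <= end;
}

static void enqueue_for_collapse(unsigned long start, unsigned long end)
{
	printf("candidate: [%#lx, %#lx)\n", start, end);	/* placeholder action */
}

static void vma_grew(unsigned long start, unsigned long end)
{
	if (spans_huge_unit(start, end))
		enqueue_for_collapse(start, end);
}

int main(void)
{
	vma_grew(0x100000, 0x180000);	/* too small: ignored */
	vma_grew(0x100000, 0x500000);	/* contains [0x200000, 0x400000): queued */
	return 0;
}
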
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 438951d366f2..8d032de4088e 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -100,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
100 return young; 100 return young;
101} 101}
102 102
103int __mmu_notifier_test_young(struct mm_struct *mm,
104 unsigned long address)
105{
106 struct mmu_notifier *mn;
107 struct hlist_node *n;
108 int young = 0;
109
110 rcu_read_lock();
111 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
112 if (mn->ops->test_young) {
113 young = mn->ops->test_young(mn, mm, address);
114 if (young)
115 break;
116 }
117 }
118 rcu_read_unlock();
119
120 return young;
121}
122
103void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, 123void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
104 pte_t pte) 124 pte_t pte)
105{ 125{
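
__mmu_notifier_test_young() walks the registered notifiers under RCU and returns as soon as one of them reports the address as recently referenced, without clearing the young state the way the clear_flush variant does. A much simplified sketch of that first-positive-wins walk is below; a plain array of callbacks stands in for the RCU-protected hlist and no locking is modelled.

/*
 * Simplified sketch of the "first positive answer wins" notifier walk.
 */
#include <stdio.h>

typedef int (*test_young_fn)(unsigned long address);

static int kvm_like_test_young(unsigned long address)
{
	return (address & 0x1000) != 0;	/* pretend every other page is young */
}

static int never_young(unsigned long address)
{
	(void)address;
	return 0;
}

static int test_young(test_young_fn *ops, int nr, unsigned long address)
{
	int young = 0;

	for (int i = 0; i < nr; i++) {
		if (!ops[i])
			continue;
		young = ops[i](address);
		if (young)
			break;		/* one "yes" is enough, stop early */
	}
	return young;
}

int main(void)
{
	test_young_fn ops[] = { never_young, kvm_like_test_young };

	printf("0x2000 young? %d\n", test_young(ops, 2, 0x2000));
	printf("0x3000 young? %d\n", test_young(ops, 2, 0x3000));
	return 0;
}
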
diff --git a/mm/mmzone.c b/mm/mmzone.c
index e35bfb82c855..f5b7d1760213 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn,
87 return 1; 87 return 1;
88} 88}
89#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ 89#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
90
91#ifdef CONFIG_SMP
92/* Called when a more accurate view of NR_FREE_PAGES is needed */
93unsigned long zone_nr_free_pages(struct zone *zone)
94{
95 unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
96
97 /*
98 * While kswapd is awake, it is considered the zone is under some
99 * memory pressure. Under pressure, there is a risk that
100 * per-cpu-counter-drift will allow the min watermark to be breached
101 * potentially causing a live-lock. While kswapd is awake and
102 * free pages are low, get a better estimate for free pages
103 */
104 if (nr_free_pages < zone->percpu_drift_mark &&
105 !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
106 return zone_page_state_snapshot(zone, NR_FREE_PAGES);
107
108 return nr_free_pages;
109}
110#endif /* CONFIG_SMP */
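
zone_nr_free_pages() is removed here because its logic (take an exact per-cpu snapshot only when the cheap estimate may have drifted near the watermark) moves into zone_watermark_ok_safe() further down in this series. The general pattern can be sketched in isolation; the counter structure and drift values below are toy stand-ins, not kernel state.

/*
 * Sketch of the approximate-vs-exact counter pattern: use the fast global
 * value normally, and only pay for folding in per-CPU deltas when the
 * value is close enough to a critical threshold for the error to matter.
 */
#include <stdio.h>

#define NR_CPUS	4

struct counter {
	long global;			/* cheap, possibly stale total */
	long percpu_delta[NR_CPUS];	/* not yet folded into global */
	long drift_mark;		/* below this, the error matters */
};

static long read_fast(const struct counter *c)
{
	return c->global;
}

/* Slow path: fold in every per-CPU delta for an exact snapshot. */
static long read_snapshot(const struct counter *c)
{
	long sum = c->global;

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		sum += c->percpu_delta[cpu];
	return sum < 0 ? 0 : sum;
}

static long read_safe(const struct counter *c)
{
	long fast = read_fast(c);

	return fast < c->drift_mark ? read_snapshot(c) : fast;
}

int main(void)
{
	struct counter c = {
		.global = 90, .percpu_delta = { 5, -3, 8, 2 }, .drift_mark = 100,
	};

	printf("fast=%ld safe=%ld\n", read_fast(&c), read_safe(&c));
	return 0;
}
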
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 4c5133873097..5a688a2756be 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -78,7 +78,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
78 pte_unmap_unlock(pte - 1, ptl); 78 pte_unmap_unlock(pte - 1, ptl);
79} 79}
80 80
81static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, 81static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
82 unsigned long addr, unsigned long end, pgprot_t newprot, 82 unsigned long addr, unsigned long end, pgprot_t newprot,
83 int dirty_accountable) 83 int dirty_accountable)
84{ 84{
@@ -88,13 +88,21 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
88 pmd = pmd_offset(pud, addr); 88 pmd = pmd_offset(pud, addr);
89 do { 89 do {
90 next = pmd_addr_end(addr, end); 90 next = pmd_addr_end(addr, end);
91 if (pmd_trans_huge(*pmd)) {
92 if (next - addr != HPAGE_PMD_SIZE)
93 split_huge_page_pmd(vma->vm_mm, pmd);
94 else if (change_huge_pmd(vma, pmd, addr, newprot))
95 continue;
96 /* fall through */
97 }
91 if (pmd_none_or_clear_bad(pmd)) 98 if (pmd_none_or_clear_bad(pmd))
92 continue; 99 continue;
93 change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); 100 change_pte_range(vma->vm_mm, pmd, addr, next, newprot,
101 dirty_accountable);
94 } while (pmd++, addr = next, addr != end); 102 } while (pmd++, addr = next, addr != end);
95} 103}
96 104
97static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, 105static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
98 unsigned long addr, unsigned long end, pgprot_t newprot, 106 unsigned long addr, unsigned long end, pgprot_t newprot,
99 int dirty_accountable) 107 int dirty_accountable)
100{ 108{
@@ -106,7 +114,8 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
106 next = pud_addr_end(addr, end); 114 next = pud_addr_end(addr, end);
107 if (pud_none_or_clear_bad(pud)) 115 if (pud_none_or_clear_bad(pud))
108 continue; 116 continue;
109 change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable); 117 change_pmd_range(vma, pud, addr, next, newprot,
118 dirty_accountable);
110 } while (pud++, addr = next, addr != end); 119 } while (pud++, addr = next, addr != end);
111} 120}
112 121
@@ -126,7 +135,8 @@ static void change_protection(struct vm_area_struct *vma,
126 next = pgd_addr_end(addr, end); 135 next = pgd_addr_end(addr, end);
127 if (pgd_none_or_clear_bad(pgd)) 136 if (pgd_none_or_clear_bad(pgd))
128 continue; 137 continue;
129 change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable); 138 change_pud_range(vma, pgd, addr, next, newprot,
139 dirty_accountable);
130 } while (pgd++, addr = next, addr != end); 140 } while (pgd++, addr = next, addr != end);
131 flush_tlb_range(vma, start, end); 141 flush_tlb_range(vma, start, end);
132} 142}
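
change_pmd_range() now has to deal with a transparent huge pmd: if the mprotect range covers the whole huge mapping, the protection is changed in one step via change_huge_pmd(); otherwise the huge page is split first and the code falls through to the per-pte loop. A standalone sketch of that control flow follows; the unit size and the split/change helpers are local stand-ins.

/*
 * Sketch of the "change the whole huge unit, or split and fall through"
 * decision added to change_pmd_range() above.
 */
#include <stdbool.h>
#include <stdio.h>

#define UNIT_SIZE	(2UL << 20)	/* stands in for HPAGE_PMD_SIZE */

static void change_whole_unit(unsigned long addr)
{
	printf("changed huge unit at %#lx in one go\n", addr);
}

static void split_unit(unsigned long addr)
{
	printf("split huge unit at %#lx into base pages\n", addr);
}

static void change_base_pages(unsigned long addr, unsigned long next)
{
	printf("changed base pages [%#lx, %#lx)\n", addr, next);
}

static void change_range(unsigned long addr, unsigned long next, bool huge)
{
	if (huge) {
		if (next - addr != UNIT_SIZE) {
			split_unit(addr);	/* partial cover: demote first */
			/* fall through to the per-page path */
		} else {
			change_whole_unit(addr);
			return;			/* done, skip the pte loop */
		}
	}
	change_base_pages(addr, next);
}

int main(void)
{
	change_range(0x200000, 0x400000, true);		/* full unit */
	change_range(0x200000, 0x300000, true);		/* partial: split */
	change_range(0x200000, 0x300000, false);	/* regular mapping */
	return 0;
}
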
diff --git a/mm/mremap.c b/mm/mremap.c
index 563fbdd6293a..9925b6391b80 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -41,13 +41,15 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
41 return NULL; 41 return NULL;
42 42
43 pmd = pmd_offset(pud, addr); 43 pmd = pmd_offset(pud, addr);
44 split_huge_page_pmd(mm, pmd);
44 if (pmd_none_or_clear_bad(pmd)) 45 if (pmd_none_or_clear_bad(pmd))
45 return NULL; 46 return NULL;
46 47
47 return pmd; 48 return pmd;
48} 49}
49 50
50static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) 51static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
52 unsigned long addr)
51{ 53{
52 pgd_t *pgd; 54 pgd_t *pgd;
53 pud_t *pud; 55 pud_t *pud;
@@ -62,7 +64,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
62 if (!pmd) 64 if (!pmd)
63 return NULL; 65 return NULL;
64 66
65 if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) 67 VM_BUG_ON(pmd_trans_huge(*pmd));
68 if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr))
66 return NULL; 69 return NULL;
67 70
68 return pmd; 71 return pmd;
@@ -147,7 +150,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
147 old_pmd = get_old_pmd(vma->vm_mm, old_addr); 150 old_pmd = get_old_pmd(vma->vm_mm, old_addr);
148 if (!old_pmd) 151 if (!old_pmd)
149 continue; 152 continue;
150 new_pmd = alloc_new_pmd(vma->vm_mm, new_addr); 153 new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
151 if (!new_pmd) 154 if (!new_pmd)
152 break; 155 break;
153 next = (new_addr + PMD_SIZE) & PMD_MASK; 156 next = (new_addr + PMD_SIZE) & PMD_MASK;
diff --git a/mm/nommu.c b/mm/nommu.c
index ef4045d010d5..f59e1424d3db 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -127,7 +127,8 @@ unsigned int kobjsize(const void *objp)
127 127
128int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 128int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
129 unsigned long start, int nr_pages, unsigned int foll_flags, 129 unsigned long start, int nr_pages, unsigned int foll_flags,
130 struct page **pages, struct vm_area_struct **vmas) 130 struct page **pages, struct vm_area_struct **vmas,
131 int *retry)
131{ 132{
132 struct vm_area_struct *vma; 133 struct vm_area_struct *vma;
133 unsigned long vm_flags; 134 unsigned long vm_flags;
@@ -185,7 +186,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
185 if (force) 186 if (force)
186 flags |= FOLL_FORCE; 187 flags |= FOLL_FORCE;
187 188
188 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); 189 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
190 NULL);
189} 191}
190EXPORT_SYMBOL(get_user_pages); 192EXPORT_SYMBOL(get_user_pages);
191 193
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b5d8a1f820a0..2cb01f6ec5d0 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -410,9 +410,12 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
410{ 410{
411 unsigned long background; 411 unsigned long background;
412 unsigned long dirty; 412 unsigned long dirty;
413 unsigned long available_memory = determine_dirtyable_memory(); 413 unsigned long uninitialized_var(available_memory);
414 struct task_struct *tsk; 414 struct task_struct *tsk;
415 415
416 if (!vm_dirty_bytes || !dirty_background_bytes)
417 available_memory = determine_dirtyable_memory();
418
416 if (vm_dirty_bytes) 419 if (vm_dirty_bytes)
417 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); 420 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
418 else 421 else
@@ -1103,7 +1106,7 @@ EXPORT_SYMBOL(write_one_page);
1103int __set_page_dirty_no_writeback(struct page *page) 1106int __set_page_dirty_no_writeback(struct page *page)
1104{ 1107{
1105 if (!PageDirty(page)) 1108 if (!PageDirty(page))
1106 SetPageDirty(page); 1109 return !TestSetPageDirty(page);
1107 return 0; 1110 return 0;
1108} 1111}
1109 1112
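
The __set_page_dirty_no_writeback() change makes the return value report whether this caller was the one that actually dirtied the page, by relying on the atomic test-and-set instead of a separate check followed by a set. The same return-whether-we-made-the-transition pattern, sketched with a C11 atomic as a stand-in for the page flag:

/*
 * Report whether the caller performed the clean->dirty transition, using
 * one atomic operation instead of a racy check followed by a set.
 */
#include <stdatomic.h>
#include <stdio.h>

struct toy_page {
	atomic_int dirty;	/* 0 = clean, 1 = dirty */
};

static int set_dirty_no_writeback(struct toy_page *page)
{
	if (atomic_load(&page->dirty))
		return 0;			/* already dirty, nothing to do */
	/* Exchange returns the previous value: 0 means we flipped it. */
	return atomic_exchange(&page->dirty, 1) == 0;
}

int main(void)
{
	struct toy_page page = { .dirty = 0 };

	printf("first caller:  %d\n", set_dirty_no_writeback(&page));	/* 1 */
	printf("second caller: %d\n", set_dirty_no_writeback(&page));	/* 0 */
	return 0;
}
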
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 826ba6922e84..90c1439549fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -357,6 +357,7 @@ void prep_compound_page(struct page *page, unsigned long order)
357 } 357 }
358} 358}
359 359
360/* update __split_huge_page_refcount if you change this function */
360static int destroy_compound_page(struct page *page, unsigned long order) 361static int destroy_compound_page(struct page *page, unsigned long order)
361{ 362{
362 int i; 363 int i;
@@ -426,18 +427,10 @@ static inline void rmv_page_order(struct page *page)
426 * 427 *
427 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 428 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
428 */ 429 */
429static inline struct page *
430__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
431{
432 unsigned long buddy_idx = page_idx ^ (1 << order);
433
434 return page + (buddy_idx - page_idx);
435}
436
437static inline unsigned long 430static inline unsigned long
438__find_combined_index(unsigned long page_idx, unsigned int order) 431__find_buddy_index(unsigned long page_idx, unsigned int order)
439{ 432{
440 return (page_idx & ~(1 << order)); 433 return page_idx ^ (1 << order);
441} 434}
442 435
443/* 436/*
@@ -448,8 +441,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
448 * (c) a page and its buddy have the same order && 441 * (c) a page and its buddy have the same order &&
449 * (d) a page and its buddy are in the same zone. 442 * (d) a page and its buddy are in the same zone.
450 * 443 *
451 * For recording whether a page is in the buddy system, we use PG_buddy. 444 * For recording whether a page is in the buddy system, we set ->_mapcount -2.
452 * Setting, clearing, and testing PG_buddy is serialized by zone->lock. 445 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
453 * 446 *
454 * For recording page's order, we use page_private(page). 447 * For recording page's order, we use page_private(page).
455 */ 448 */
@@ -482,7 +475,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
482 * as necessary, plus some accounting needed to play nicely with other 475 * as necessary, plus some accounting needed to play nicely with other
483 * parts of the VM system. 476 * parts of the VM system.
484 * At each level, we keep a list of pages, which are heads of continuous 477 * At each level, we keep a list of pages, which are heads of continuous
485 * free pages of length of (1 << order) and marked with PG_buddy. Page's 478 * free pages of length of (1 << order) and marked with _mapcount -2. Page's
486 * order is recorded in page_private(page) field. 479 * order is recorded in page_private(page) field.
487 * So when we are allocating or freeing one, we can derive the state of the 480 * So when we are allocating or freeing one, we can derive the state of the
488 * other. That is, if we allocate a small block, and both were 481 * other. That is, if we allocate a small block, and both were
@@ -499,6 +492,7 @@ static inline void __free_one_page(struct page *page,
499{ 492{
500 unsigned long page_idx; 493 unsigned long page_idx;
501 unsigned long combined_idx; 494 unsigned long combined_idx;
495 unsigned long uninitialized_var(buddy_idx);
502 struct page *buddy; 496 struct page *buddy;
503 497
504 if (unlikely(PageCompound(page))) 498 if (unlikely(PageCompound(page)))
@@ -513,7 +507,8 @@ static inline void __free_one_page(struct page *page,
513 VM_BUG_ON(bad_range(zone, page)); 507 VM_BUG_ON(bad_range(zone, page));
514 508
515 while (order < MAX_ORDER-1) { 509 while (order < MAX_ORDER-1) {
516 buddy = __page_find_buddy(page, page_idx, order); 510 buddy_idx = __find_buddy_index(page_idx, order);
511 buddy = page + (buddy_idx - page_idx);
517 if (!page_is_buddy(page, buddy, order)) 512 if (!page_is_buddy(page, buddy, order))
518 break; 513 break;
519 514
@@ -521,7 +516,7 @@ static inline void __free_one_page(struct page *page,
521 list_del(&buddy->lru); 516 list_del(&buddy->lru);
522 zone->free_area[order].nr_free--; 517 zone->free_area[order].nr_free--;
523 rmv_page_order(buddy); 518 rmv_page_order(buddy);
524 combined_idx = __find_combined_index(page_idx, order); 519 combined_idx = buddy_idx & page_idx;
525 page = page + (combined_idx - page_idx); 520 page = page + (combined_idx - page_idx);
526 page_idx = combined_idx; 521 page_idx = combined_idx;
527 order++; 522 order++;
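
The buddy bookkeeping above reduces to two bit operations: the buddy of the block at page_idx for a given order lives at page_idx ^ (1 << order), and once the pair is merged the combined block starts at buddy_idx & page_idx, which is exactly what the removed __find_combined_index() computed. A small demonstration:

/*
 * Demonstration of the buddy index arithmetic: XOR finds the buddy of a
 * block, AND of the pair gives the index of the merged block.
 */
#include <stdio.h>

static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);
}

int main(void)
{
	unsigned long page_idx = 12;	/* block of 2^2 pages starting at index 12 */
	unsigned int order = 2;

	unsigned long buddy_idx = find_buddy_index(page_idx, order);
	unsigned long combined_idx = buddy_idx & page_idx;

	/* buddy of 12 at order 2 is 8; merging them yields an order-3 block at 8 */
	printf("page=%lu buddy=%lu combined=%lu (order %u -> %u)\n",
	       page_idx, buddy_idx, combined_idx, order, order + 1);
	return 0;
}
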
@@ -538,9 +533,10 @@ static inline void __free_one_page(struct page *page,
538 */ 533 */
539 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 534 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
540 struct page *higher_page, *higher_buddy; 535 struct page *higher_page, *higher_buddy;
541 combined_idx = __find_combined_index(page_idx, order); 536 combined_idx = buddy_idx & page_idx;
542 higher_page = page + combined_idx - page_idx; 537 higher_page = page + (combined_idx - page_idx);
543 higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); 538 buddy_idx = __find_buddy_index(combined_idx, order + 1);
539 higher_buddy = page + (buddy_idx - combined_idx);
544 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 540 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
545 list_add_tail(&page->lru, 541 list_add_tail(&page->lru,
546 &zone->free_area[order].free_list[migratetype]); 542 &zone->free_area[order].free_list[migratetype]);
@@ -651,13 +647,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
651 trace_mm_page_free_direct(page, order); 647 trace_mm_page_free_direct(page, order);
652 kmemcheck_free_shadow(page, order); 648 kmemcheck_free_shadow(page, order);
653 649
654 for (i = 0; i < (1 << order); i++) { 650 if (PageAnon(page))
655 struct page *pg = page + i; 651 page->mapping = NULL;
656 652 for (i = 0; i < (1 << order); i++)
657 if (PageAnon(pg)) 653 bad += free_pages_check(page + i);
658 pg->mapping = NULL;
659 bad += free_pages_check(pg);
660 }
661 if (bad) 654 if (bad)
662 return false; 655 return false;
663 656
@@ -1460,24 +1453,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1460#endif /* CONFIG_FAIL_PAGE_ALLOC */ 1453#endif /* CONFIG_FAIL_PAGE_ALLOC */
1461 1454
1462/* 1455/*
1463 * Return 1 if free pages are above 'mark'. This takes into account the order 1456 * Return true if free pages are above 'mark'. This takes into account the order
1464 * of the allocation. 1457 * of the allocation.
1465 */ 1458 */
1466int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1459static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1467 int classzone_idx, int alloc_flags) 1460 int classzone_idx, int alloc_flags, long free_pages)
1468{ 1461{
1469 /* free_pages my go negative - that's OK */ 1462 /* free_pages my go negative - that's OK */
1470 long min = mark; 1463 long min = mark;
1471 long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
1472 int o; 1464 int o;
1473 1465
1466 free_pages -= (1 << order) - 1;
1474 if (alloc_flags & ALLOC_HIGH) 1467 if (alloc_flags & ALLOC_HIGH)
1475 min -= min / 2; 1468 min -= min / 2;
1476 if (alloc_flags & ALLOC_HARDER) 1469 if (alloc_flags & ALLOC_HARDER)
1477 min -= min / 4; 1470 min -= min / 4;
1478 1471
1479 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 1472 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
1480 return 0; 1473 return false;
1481 for (o = 0; o < order; o++) { 1474 for (o = 0; o < order; o++) {
1482 /* At the next order, this order's pages become unavailable */ 1475 /* At the next order, this order's pages become unavailable */
1483 free_pages -= z->free_area[o].nr_free << o; 1476 free_pages -= z->free_area[o].nr_free << o;
@@ -1486,9 +1479,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1486 min >>= 1; 1479 min >>= 1;
1487 1480
1488 if (free_pages <= min) 1481 if (free_pages <= min)
1489 return 0; 1482 return false;
1490 } 1483 }
1491 return 1; 1484 return true;
1485}
1486
1487bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1488 int classzone_idx, int alloc_flags)
1489{
1490 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1491 zone_page_state(z, NR_FREE_PAGES));
1492}
1493
1494bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1495 int classzone_idx, int alloc_flags)
1496{
1497 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1498
1499 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1500 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1501
1502 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1503 free_pages);
1492} 1504}
1493 1505
1494#ifdef CONFIG_NUMA 1506#ifdef CONFIG_NUMA
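
zone_watermark_ok() and the new zone_watermark_ok_safe() now share __zone_watermark_ok(), which takes the free-page count as a parameter so the safe variant can substitute the exact snapshot when the per-cpu estimate has drifted near the mark. A standalone sketch of the shared check itself is below; the zone layout and numbers are toy inputs, not kernel state.

/*
 * Standalone sketch of the shared watermark check: subtract the pages of
 * the requested order, then walk the lower orders, discounting blocks that
 * cannot satisfy the request and requiring less slack as the order rises.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER	11

struct toy_zone {
	long nr_free[MAX_ORDER];	/* free blocks per order */
	long lowmem_reserve;
};

static bool watermark_ok(const struct toy_zone *z, unsigned int order,
			 long mark, long free_pages)
{
	long min = mark;

	/* The pages we are about to take no longer count as free. */
	free_pages -= (1L << order) - 1;

	if (free_pages <= min + z->lowmem_reserve)
		return false;

	for (unsigned int o = 0; o < order; o++) {
		/* Blocks of order o cannot satisfy this request... */
		free_pages -= z->nr_free[o] << o;
		/* ...and we require less headroom at each higher order. */
		min >>= 1;
		if (free_pages <= min)
			return false;
	}
	return true;
}

int main(void)
{
	struct toy_zone z = {
		.nr_free = { 64, 32, 16, 8, 4, 2, 1 },
		.lowmem_reserve = 32,
	};
	long free_pages = 448;	/* total pages represented by nr_free above */

	printf("order 0 ok? %d\n", watermark_ok(&z, 0, 128, free_pages));
	printf("order 6 ok? %d\n", watermark_ok(&z, 6, 128, free_pages));
	return 0;
}
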
@@ -1793,15 +1805,18 @@ static struct page *
1793__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1805__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1794 struct zonelist *zonelist, enum zone_type high_zoneidx, 1806 struct zonelist *zonelist, enum zone_type high_zoneidx,
1795 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1807 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1796 int migratetype, unsigned long *did_some_progress) 1808 int migratetype, unsigned long *did_some_progress,
1809 bool sync_migration)
1797{ 1810{
1798 struct page *page; 1811 struct page *page;
1799 1812
1800 if (!order || compaction_deferred(preferred_zone)) 1813 if (!order || compaction_deferred(preferred_zone))
1801 return NULL; 1814 return NULL;
1802 1815
1816 current->flags |= PF_MEMALLOC;
1803 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 1817 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
1804 nodemask); 1818 nodemask, sync_migration);
1819 current->flags &= ~PF_MEMALLOC;
1805 if (*did_some_progress != COMPACT_SKIPPED) { 1820 if (*did_some_progress != COMPACT_SKIPPED) {
1806 1821
1807 /* Page migration frees to the PCP lists but we want merging */ 1822 /* Page migration frees to the PCP lists but we want merging */
@@ -1837,7 +1852,8 @@ static inline struct page *
1837__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1852__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1838 struct zonelist *zonelist, enum zone_type high_zoneidx, 1853 struct zonelist *zonelist, enum zone_type high_zoneidx,
1839 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1854 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1840 int migratetype, unsigned long *did_some_progress) 1855 int migratetype, unsigned long *did_some_progress,
1856 bool sync_migration)
1841{ 1857{
1842 return NULL; 1858 return NULL;
1843} 1859}
@@ -1852,23 +1868,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1852{ 1868{
1853 struct page *page = NULL; 1869 struct page *page = NULL;
1854 struct reclaim_state reclaim_state; 1870 struct reclaim_state reclaim_state;
1855 struct task_struct *p = current;
1856 bool drained = false; 1871 bool drained = false;
1857 1872
1858 cond_resched(); 1873 cond_resched();
1859 1874
1860 /* We now go into synchronous reclaim */ 1875 /* We now go into synchronous reclaim */
1861 cpuset_memory_pressure_bump(); 1876 cpuset_memory_pressure_bump();
1862 p->flags |= PF_MEMALLOC; 1877 current->flags |= PF_MEMALLOC;
1863 lockdep_set_current_reclaim_state(gfp_mask); 1878 lockdep_set_current_reclaim_state(gfp_mask);
1864 reclaim_state.reclaimed_slab = 0; 1879 reclaim_state.reclaimed_slab = 0;
1865 p->reclaim_state = &reclaim_state; 1880 current->reclaim_state = &reclaim_state;
1866 1881
1867 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 1882 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
1868 1883
1869 p->reclaim_state = NULL; 1884 current->reclaim_state = NULL;
1870 lockdep_clear_current_reclaim_state(); 1885 lockdep_clear_current_reclaim_state();
1871 p->flags &= ~PF_MEMALLOC; 1886 current->flags &= ~PF_MEMALLOC;
1872 1887
1873 cond_resched(); 1888 cond_resched();
1874 1889
@@ -1920,19 +1935,19 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1920 1935
1921static inline 1936static inline
1922void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, 1937void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
1923 enum zone_type high_zoneidx) 1938 enum zone_type high_zoneidx,
1939 enum zone_type classzone_idx)
1924{ 1940{
1925 struct zoneref *z; 1941 struct zoneref *z;
1926 struct zone *zone; 1942 struct zone *zone;
1927 1943
1928 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 1944 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1929 wakeup_kswapd(zone, order); 1945 wakeup_kswapd(zone, order, classzone_idx);
1930} 1946}
1931 1947
1932static inline int 1948static inline int
1933gfp_to_alloc_flags(gfp_t gfp_mask) 1949gfp_to_alloc_flags(gfp_t gfp_mask)
1934{ 1950{
1935 struct task_struct *p = current;
1936 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 1951 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
1937 const gfp_t wait = gfp_mask & __GFP_WAIT; 1952 const gfp_t wait = gfp_mask & __GFP_WAIT;
1938 1953
@@ -1948,18 +1963,23 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
1948 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 1963 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
1949 1964
1950 if (!wait) { 1965 if (!wait) {
1951 alloc_flags |= ALLOC_HARDER; 1966 /*
1967 * Not worth trying to allocate harder for
1968 * __GFP_NOMEMALLOC even if it can't schedule.
1969 */
1970 if (!(gfp_mask & __GFP_NOMEMALLOC))
1971 alloc_flags |= ALLOC_HARDER;
1952 /* 1972 /*
1953 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 1973 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1954 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1974 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1955 */ 1975 */
1956 alloc_flags &= ~ALLOC_CPUSET; 1976 alloc_flags &= ~ALLOC_CPUSET;
1957 } else if (unlikely(rt_task(p)) && !in_interrupt()) 1977 } else if (unlikely(rt_task(current)) && !in_interrupt())
1958 alloc_flags |= ALLOC_HARDER; 1978 alloc_flags |= ALLOC_HARDER;
1959 1979
1960 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 1980 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
1961 if (!in_interrupt() && 1981 if (!in_interrupt() &&
1962 ((p->flags & PF_MEMALLOC) || 1982 ((current->flags & PF_MEMALLOC) ||
1963 unlikely(test_thread_flag(TIF_MEMDIE)))) 1983 unlikely(test_thread_flag(TIF_MEMDIE))))
1964 alloc_flags |= ALLOC_NO_WATERMARKS; 1984 alloc_flags |= ALLOC_NO_WATERMARKS;
1965 } 1985 }
@@ -1978,7 +1998,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1978 int alloc_flags; 1998 int alloc_flags;
1979 unsigned long pages_reclaimed = 0; 1999 unsigned long pages_reclaimed = 0;
1980 unsigned long did_some_progress; 2000 unsigned long did_some_progress;
1981 struct task_struct *p = current; 2001 bool sync_migration = false;
1982 2002
1983 /* 2003 /*
1984 * In the slowpath, we sanity check order to avoid ever trying to 2004 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2003,7 +2023,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2003 goto nopage; 2023 goto nopage;
2004 2024
2005restart: 2025restart:
2006 wake_all_kswapd(order, zonelist, high_zoneidx); 2026 if (!(gfp_mask & __GFP_NO_KSWAPD))
2027 wake_all_kswapd(order, zonelist, high_zoneidx,
2028 zone_idx(preferred_zone));
2007 2029
2008 /* 2030 /*
2009 * OK, we're below the kswapd watermark and have kicked background 2031 * OK, we're below the kswapd watermark and have kicked background
@@ -2034,21 +2056,26 @@ rebalance:
2034 goto nopage; 2056 goto nopage;
2035 2057
2036 /* Avoid recursion of direct reclaim */ 2058 /* Avoid recursion of direct reclaim */
2037 if (p->flags & PF_MEMALLOC) 2059 if (current->flags & PF_MEMALLOC)
2038 goto nopage; 2060 goto nopage;
2039 2061
2040 /* Avoid allocations with no watermarks from looping endlessly */ 2062 /* Avoid allocations with no watermarks from looping endlessly */
2041 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2063 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2042 goto nopage; 2064 goto nopage;
2043 2065
2044 /* Try direct compaction */ 2066 /*
2067 * Try direct compaction. The first pass is asynchronous. Subsequent
2068 * attempts after direct reclaim are synchronous
2069 */
2045 page = __alloc_pages_direct_compact(gfp_mask, order, 2070 page = __alloc_pages_direct_compact(gfp_mask, order,
2046 zonelist, high_zoneidx, 2071 zonelist, high_zoneidx,
2047 nodemask, 2072 nodemask,
2048 alloc_flags, preferred_zone, 2073 alloc_flags, preferred_zone,
2049 migratetype, &did_some_progress); 2074 migratetype, &did_some_progress,
2075 sync_migration);
2050 if (page) 2076 if (page)
2051 goto got_pg; 2077 goto got_pg;
2078 sync_migration = true;
2052 2079
2053 /* Try direct reclaim and then allocating */ 2080 /* Try direct reclaim and then allocating */
2054 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2081 page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2102,13 +2129,27 @@ rebalance:
2102 /* Wait for some write requests to complete then retry */ 2129 /* Wait for some write requests to complete then retry */
2103 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2130 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2104 goto rebalance; 2131 goto rebalance;
2132 } else {
2133 /*
2134 * High-order allocations do not necessarily loop after
2135 * direct reclaim and reclaim/compaction depends on compaction
2136 * being called after reclaim so call directly if necessary
2137 */
2138 page = __alloc_pages_direct_compact(gfp_mask, order,
2139 zonelist, high_zoneidx,
2140 nodemask,
2141 alloc_flags, preferred_zone,
2142 migratetype, &did_some_progress,
2143 sync_migration);
2144 if (page)
2145 goto got_pg;
2105 } 2146 }
2106 2147
2107nopage: 2148nopage:
2108 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 2149 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
2109 printk(KERN_WARNING "%s: page allocation failure." 2150 printk(KERN_WARNING "%s: page allocation failure."
2110 " order:%d, mode:0x%x\n", 2151 " order:%d, mode:0x%x\n",
2111 p->comm, order, gfp_mask); 2152 current->comm, order, gfp_mask);
2112 dump_stack(); 2153 dump_stack();
2113 show_mem(); 2154 show_mem();
2114 } 2155 }
@@ -2442,7 +2483,7 @@ void show_free_areas(void)
2442 " all_unreclaimable? %s" 2483 " all_unreclaimable? %s"
2443 "\n", 2484 "\n",
2444 zone->name, 2485 zone->name,
2445 K(zone_nr_free_pages(zone)), 2486 K(zone_page_state(zone, NR_FREE_PAGES)),
2446 K(min_wmark_pages(zone)), 2487 K(min_wmark_pages(zone)),
2447 K(low_wmark_pages(zone)), 2488 K(low_wmark_pages(zone)),
2448 K(high_wmark_pages(zone)), 2489 K(high_wmark_pages(zone)),
@@ -2585,9 +2626,16 @@ static int __parse_numa_zonelist_order(char *s)
2585 2626
2586static __init int setup_numa_zonelist_order(char *s) 2627static __init int setup_numa_zonelist_order(char *s)
2587{ 2628{
2588 if (s) 2629 int ret;
2589 return __parse_numa_zonelist_order(s); 2630
2590 return 0; 2631 if (!s)
2632 return 0;
2633
2634 ret = __parse_numa_zonelist_order(s);
2635 if (ret == 0)
2636 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
2637
2638 return ret;
2591} 2639}
2592early_param("numa_zonelist_order", setup_numa_zonelist_order); 2640early_param("numa_zonelist_order", setup_numa_zonelist_order);
2593 2641
@@ -5517,7 +5565,6 @@ static struct trace_print_flags pageflag_names[] = {
5517 {1UL << PG_swapcache, "swapcache" }, 5565 {1UL << PG_swapcache, "swapcache" },
5518 {1UL << PG_mappedtodisk, "mappedtodisk" }, 5566 {1UL << PG_mappedtodisk, "mappedtodisk" },
5519 {1UL << PG_reclaim, "reclaim" }, 5567 {1UL << PG_reclaim, "reclaim" },
5520 {1UL << PG_buddy, "buddy" },
5521 {1UL << PG_swapbacked, "swapbacked" }, 5568 {1UL << PG_swapbacked, "swapbacked" },
5522 {1UL << PG_unevictable, "unevictable" }, 5569 {1UL << PG_unevictable, "unevictable" },
5523#ifdef CONFIG_MMU 5570#ifdef CONFIG_MMU
@@ -5565,7 +5612,7 @@ void dump_page(struct page *page)
5565{ 5612{
5566 printk(KERN_ALERT 5613 printk(KERN_ALERT
5567 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 5614 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
5568 page, page_count(page), page_mapcount(page), 5615 page, atomic_read(&page->_count), page_mapcount(page),
5569 page->mapping, page->index); 5616 page->mapping, page->index);
5570 dump_page_flags(page->flags); 5617 dump_page_flags(page->flags);
5571} 5618}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 38cc58b8b2b0..7cfa6ae02303 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -34,6 +34,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
34 pmd = pmd_offset(pud, addr); 34 pmd = pmd_offset(pud, addr);
35 do { 35 do {
36 next = pmd_addr_end(addr, end); 36 next = pmd_addr_end(addr, end);
37 split_huge_page_pmd(walk->mm, pmd);
37 if (pmd_none_or_clear_bad(pmd)) { 38 if (pmd_none_or_clear_bad(pmd)) {
38 if (walk->pte_hole) 39 if (walk->pte_hole)
39 err = walk->pte_hole(addr, next, walk); 40 err = walk->pte_hole(addr, next, walk);
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 7d9c1d0ebd3f..ea534960a04b 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -421,7 +421,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
421 return NULL; 421 return NULL;
422 422
423 vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, 423 vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
424 pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL); 424 pcpu_nr_groups, pcpu_atom_size);
425 if (!vms) { 425 if (!vms) {
426 pcpu_free_chunk(chunk); 426 pcpu_free_chunk(chunk);
427 return NULL; 427 return NULL;
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
new file mode 100644
index 000000000000..d030548047e2
--- /dev/null
+++ b/mm/pgtable-generic.c
@@ -0,0 +1,123 @@
1/*
2 * mm/pgtable-generic.c
3 *
4 * Generic pgtable methods declared in asm-generic/pgtable.h
5 *
6 * Copyright (C) 2010 Linus Torvalds
7 */
8
9#include <asm/tlb.h>
10#include <asm-generic/pgtable.h>
11
12#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
13/*
14 * Only sets the access flags (dirty, accessed, and
15 * writable). Furthermore, we know it always gets set to a "more
16 * permissive" setting, which allows most architectures to optimize
17 * this. We return whether the PTE actually changed, which in turn
18 * instructs the caller to do things like update__mmu_cache. This
19 * used to be done in the caller, but sparc needs minor faults to
20 * force that call on sun4c so we changed this macro slightly
21 */
22int ptep_set_access_flags(struct vm_area_struct *vma,
23 unsigned long address, pte_t *ptep,
24 pte_t entry, int dirty)
25{
26 int changed = !pte_same(*ptep, entry);
27 if (changed) {
28 set_pte_at(vma->vm_mm, address, ptep, entry);
29 flush_tlb_page(vma, address);
30 }
31 return changed;
32}
33#endif
34
35#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
36int pmdp_set_access_flags(struct vm_area_struct *vma,
37 unsigned long address, pmd_t *pmdp,
38 pmd_t entry, int dirty)
39{
40#ifdef CONFIG_TRANSPARENT_HUGEPAGE
41 int changed = !pmd_same(*pmdp, entry);
42 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
43 if (changed) {
44 set_pmd_at(vma->vm_mm, address, pmdp, entry);
45 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
46 }
47 return changed;
48#else /* CONFIG_TRANSPARENT_HUGEPAGE */
49 BUG();
50 return 0;
51#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
52}
53#endif
54
55#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
56int ptep_clear_flush_young(struct vm_area_struct *vma,
57 unsigned long address, pte_t *ptep)
58{
59 int young;
60 young = ptep_test_and_clear_young(vma, address, ptep);
61 if (young)
62 flush_tlb_page(vma, address);
63 return young;
64}
65#endif
66
67#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
68int pmdp_clear_flush_young(struct vm_area_struct *vma,
69 unsigned long address, pmd_t *pmdp)
70{
71 int young;
72#ifndef CONFIG_TRANSPARENT_HUGEPAGE
73 BUG();
74#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
75 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
76 young = pmdp_test_and_clear_young(vma, address, pmdp);
77 if (young)
78 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
79 return young;
80}
81#endif
82
83#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
84pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
85 pte_t *ptep)
86{
87 pte_t pte;
88 pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
89 flush_tlb_page(vma, address);
90 return pte;
91}
92#endif
93
94#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
95pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
96 pmd_t *pmdp)
97{
98 pmd_t pmd;
99#ifndef CONFIG_TRANSPARENT_HUGEPAGE
100 BUG();
101#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
102 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
103 pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
104 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
105 return pmd;
106}
107#endif
108
109#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
110pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
111 pmd_t *pmdp)
112{
113#ifdef CONFIG_TRANSPARENT_HUGEPAGE
114 pmd_t pmd = pmd_mksplitting(*pmdp);
115 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
116 set_pmd_at(vma->vm_mm, address, pmdp, pmd);
117 /* tlb flush only to serialize against gup-fast */
118 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
119#else /* CONFIG_TRANSPARENT_HUGEPAGE */
120 BUG();
121#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
122}
123#endif
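
The new mm/pgtable-generic.c collects the default implementations that architectures may override, and they all follow the same shape: compare against the current entry, update only if it changed, and flush the TLB for exactly the affected range. A toy illustration of that shape, with the entry type and flush reduced to stand-ins rather than the kernel's types:

/*
 * Toy illustration of the common shape of the generic helpers above:
 * update the entry only when it actually changes, and flush only then.
 */
#include <stdbool.h>
#include <stdio.h>

typedef unsigned long toy_pte_t;

static void flush_tlb_toy(unsigned long address)
{
	printf("flush TLB for %#lx\n", address);
}

/* Returns whether the entry changed, so the caller can update caches. */
static bool set_access_flags(toy_pte_t *ptep, toy_pte_t entry,
			     unsigned long address)
{
	bool changed = (*ptep != entry);

	if (changed) {
		*ptep = entry;		/* set_pte_at() in the real code */
		flush_tlb_toy(address);
	}
	return changed;
}

int main(void)
{
	toy_pte_t pte = 0x1000 | 0x1;			/* present */

	printf("changed=%d\n", set_access_flags(&pte, 0x1000 | 0x3, 0x400000));
	printf("changed=%d\n", set_access_flags(&pte, 0x1000 | 0x3, 0x400000));
	return 0;
}
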
diff --git a/mm/rmap.c b/mm/rmap.c
index c95d2ba27a0b..f21f4a1d6a1c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -177,6 +177,10 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
177 list_add(&avc->same_vma, &vma->anon_vma_chain); 177 list_add(&avc->same_vma, &vma->anon_vma_chain);
178 178
179 anon_vma_lock(anon_vma); 179 anon_vma_lock(anon_vma);
180 /*
181 * It's critical to add new vmas to the tail of the anon_vma,
182 * see comment in huge_memory.c:__split_huge_page().
183 */
180 list_add_tail(&avc->same_anon_vma, &anon_vma->head); 184 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
181 anon_vma_unlock(anon_vma); 185 anon_vma_unlock(anon_vma);
182} 186}
@@ -360,7 +364,7 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma)
360 * Returns virtual address or -EFAULT if page's index/offset is not 364 * Returns virtual address or -EFAULT if page's index/offset is not
361 * within the range mapped the @vma. 365 * within the range mapped the @vma.
362 */ 366 */
363static inline unsigned long 367inline unsigned long
364vma_address(struct page *page, struct vm_area_struct *vma) 368vma_address(struct page *page, struct vm_area_struct *vma)
365{ 369{
366 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 370 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -435,6 +439,8 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
435 pmd = pmd_offset(pud, address); 439 pmd = pmd_offset(pud, address);
436 if (!pmd_present(*pmd)) 440 if (!pmd_present(*pmd))
437 return NULL; 441 return NULL;
442 if (pmd_trans_huge(*pmd))
443 return NULL;
438 444
439 pte = pte_offset_map(pmd, address); 445 pte = pte_offset_map(pmd, address);
440 /* Make a quick check before getting the lock */ 446 /* Make a quick check before getting the lock */
@@ -489,35 +495,17 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
489 unsigned long *vm_flags) 495 unsigned long *vm_flags)
490{ 496{
491 struct mm_struct *mm = vma->vm_mm; 497 struct mm_struct *mm = vma->vm_mm;
492 pte_t *pte;
493 spinlock_t *ptl;
494 int referenced = 0; 498 int referenced = 0;
495 499
496 pte = page_check_address(page, mm, address, &ptl, 0);
497 if (!pte)
498 goto out;
499
500 /* 500 /*
501 * Don't want to elevate referenced for mlocked page that gets this far, 501 * Don't want to elevate referenced for mlocked page that gets this far,
502 * in order that it progresses to try_to_unmap and is moved to the 502 * in order that it progresses to try_to_unmap and is moved to the
503 * unevictable list. 503 * unevictable list.
504 */ 504 */
505 if (vma->vm_flags & VM_LOCKED) { 505 if (vma->vm_flags & VM_LOCKED) {
506 *mapcount = 1; /* break early from loop */ 506 *mapcount = 0; /* break early from loop */
507 *vm_flags |= VM_LOCKED; 507 *vm_flags |= VM_LOCKED;
508 goto out_unmap; 508 goto out;
509 }
510
511 if (ptep_clear_flush_young_notify(vma, address, pte)) {
512 /*
513 * Don't treat a reference through a sequentially read
514 * mapping as such. If the page has been used in
515 * another mapping, we will catch it; if this other
516 * mapping is already gone, the unmap path will have
517 * set PG_referenced or activated the page.
518 */
519 if (likely(!VM_SequentialReadHint(vma)))
520 referenced++;
521 } 509 }
522 510
523 /* Pretend the page is referenced if the task has the 511 /* Pretend the page is referenced if the task has the
@@ -526,9 +514,39 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
526 rwsem_is_locked(&mm->mmap_sem)) 514 rwsem_is_locked(&mm->mmap_sem))
527 referenced++; 515 referenced++;
528 516
529out_unmap: 517 if (unlikely(PageTransHuge(page))) {
518 pmd_t *pmd;
519
520 spin_lock(&mm->page_table_lock);
521 pmd = page_check_address_pmd(page, mm, address,
522 PAGE_CHECK_ADDRESS_PMD_FLAG);
523 if (pmd && !pmd_trans_splitting(*pmd) &&
524 pmdp_clear_flush_young_notify(vma, address, pmd))
525 referenced++;
526 spin_unlock(&mm->page_table_lock);
527 } else {
528 pte_t *pte;
529 spinlock_t *ptl;
530
531 pte = page_check_address(page, mm, address, &ptl, 0);
532 if (!pte)
533 goto out;
534
535 if (ptep_clear_flush_young_notify(vma, address, pte)) {
536 /*
537 * Don't treat a reference through a sequentially read
538 * mapping as such. If the page has been used in
539 * another mapping, we will catch it; if this other
540 * mapping is already gone, the unmap path will have
541 * set PG_referenced or activated the page.
542 */
543 if (likely(!VM_SequentialReadHint(vma)))
544 referenced++;
545 }
546 pte_unmap_unlock(pte, ptl);
547 }
548
530 (*mapcount)--; 549 (*mapcount)--;
531 pte_unmap_unlock(pte, ptl);
532 550
533 if (referenced) 551 if (referenced)
534 *vm_flags |= vma->vm_flags; 552 *vm_flags |= vma->vm_flags;
@@ -864,8 +882,13 @@ void do_page_add_anon_rmap(struct page *page,
864 struct vm_area_struct *vma, unsigned long address, int exclusive) 882 struct vm_area_struct *vma, unsigned long address, int exclusive)
865{ 883{
866 int first = atomic_inc_and_test(&page->_mapcount); 884 int first = atomic_inc_and_test(&page->_mapcount);
867 if (first) 885 if (first) {
868 __inc_zone_page_state(page, NR_ANON_PAGES); 886 if (!PageTransHuge(page))
887 __inc_zone_page_state(page, NR_ANON_PAGES);
888 else
889 __inc_zone_page_state(page,
890 NR_ANON_TRANSPARENT_HUGEPAGES);
891 }
869 if (unlikely(PageKsm(page))) 892 if (unlikely(PageKsm(page)))
870 return; 893 return;
871 894
@@ -893,7 +916,10 @@ void page_add_new_anon_rmap(struct page *page,
893 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 916 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
894 SetPageSwapBacked(page); 917 SetPageSwapBacked(page);
895 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ 918 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
896 __inc_zone_page_state(page, NR_ANON_PAGES); 919 if (!PageTransHuge(page))
920 __inc_zone_page_state(page, NR_ANON_PAGES);
921 else
922 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
897 __page_set_anon_rmap(page, vma, address, 1); 923 __page_set_anon_rmap(page, vma, address, 1);
898 if (page_evictable(page, vma)) 924 if (page_evictable(page, vma))
899 lru_cache_add_lru(page, LRU_ACTIVE_ANON); 925 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
@@ -911,7 +937,7 @@ void page_add_file_rmap(struct page *page)
911{ 937{
912 if (atomic_inc_and_test(&page->_mapcount)) { 938 if (atomic_inc_and_test(&page->_mapcount)) {
913 __inc_zone_page_state(page, NR_FILE_MAPPED); 939 __inc_zone_page_state(page, NR_FILE_MAPPED);
914 mem_cgroup_update_file_mapped(page, 1); 940 mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
915 } 941 }
916} 942}
917 943
@@ -946,10 +972,14 @@ void page_remove_rmap(struct page *page)
946 return; 972 return;
947 if (PageAnon(page)) { 973 if (PageAnon(page)) {
948 mem_cgroup_uncharge_page(page); 974 mem_cgroup_uncharge_page(page);
949 __dec_zone_page_state(page, NR_ANON_PAGES); 975 if (!PageTransHuge(page))
976 __dec_zone_page_state(page, NR_ANON_PAGES);
977 else
978 __dec_zone_page_state(page,
979 NR_ANON_TRANSPARENT_HUGEPAGES);
950 } else { 980 } else {
951 __dec_zone_page_state(page, NR_FILE_MAPPED); 981 __dec_zone_page_state(page, NR_FILE_MAPPED);
952 mem_cgroup_update_file_mapped(page, -1); 982 mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
953 } 983 }
954 /* 984 /*
955 * It would be tidy to reset the PageAnon mapping here, 985 * It would be tidy to reset the PageAnon mapping here,
@@ -1202,7 +1232,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1202 return ret; 1232 return ret;
1203} 1233}
1204 1234
1205static bool is_vma_temporary_stack(struct vm_area_struct *vma) 1235bool is_vma_temporary_stack(struct vm_area_struct *vma)
1206{ 1236{
1207 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); 1237 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
1208 1238
@@ -1400,6 +1430,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1400 int ret; 1430 int ret;
1401 1431
1402 BUG_ON(!PageLocked(page)); 1432 BUG_ON(!PageLocked(page));
1433 VM_BUG_ON(!PageHuge(page) && PageTransHuge(page));
1403 1434
1404 if (unlikely(PageKsm(page))) 1435 if (unlikely(PageKsm(page)))
1405 ret = try_to_unmap_ksm(page, flags); 1436 ret = try_to_unmap_ksm(page, flags);
diff --git a/mm/slub.c b/mm/slub.c
index 008cd743a36a..c7ef0070dd86 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3636,7 +3636,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
3636 len += sprintf(buf + len, "%7ld ", l->count); 3636 len += sprintf(buf + len, "%7ld ", l->count);
3637 3637
3638 if (l->addr) 3638 if (l->addr)
3639 len += sprint_symbol(buf + len, (unsigned long)l->addr); 3639 len += sprintf(buf + len, "%pS", (void *)l->addr);
3640 else 3640 else
3641 len += sprintf(buf + len, "<not-available>"); 3641 len += sprintf(buf + len, "<not-available>");
3642 3642
@@ -3946,12 +3946,9 @@ SLAB_ATTR(min_partial);
3946 3946
3947static ssize_t ctor_show(struct kmem_cache *s, char *buf) 3947static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3948{ 3948{
3949 if (s->ctor) { 3949 if (!s->ctor)
3950 int n = sprint_symbol(buf, (unsigned long)s->ctor); 3950 return 0;
3951 3951 return sprintf(buf, "%pS\n", s->ctor);
3952 return n + sprintf(buf + n, "\n");
3953 }
3954 return 0;
3955} 3952}
3956SLAB_ATTR_RO(ctor); 3953SLAB_ATTR_RO(ctor);
3957 3954
diff --git a/mm/sparse.c b/mm/sparse.c
index 95ac219af379..93250207c5cf 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -671,10 +671,10 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
671static void free_map_bootmem(struct page *page, unsigned long nr_pages) 671static void free_map_bootmem(struct page *page, unsigned long nr_pages)
672{ 672{
673 unsigned long maps_section_nr, removing_section_nr, i; 673 unsigned long maps_section_nr, removing_section_nr, i;
674 int magic; 674 unsigned long magic;
675 675
676 for (i = 0; i < nr_pages; i++, page++) { 676 for (i = 0; i < nr_pages; i++, page++) {
677 magic = atomic_read(&page->_mapcount); 677 magic = (unsigned long) page->lru.next;
678 678
679 BUG_ON(magic == NODE_INFO); 679 BUG_ON(magic == NODE_INFO);
680 680
diff --git a/mm/swap.c b/mm/swap.c
index 3f4854205b16..bbc1ce9f9460 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -56,17 +56,97 @@ static void __page_cache_release(struct page *page)
56 del_page_from_lru(zone, page); 56 del_page_from_lru(zone, page);
57 spin_unlock_irqrestore(&zone->lru_lock, flags); 57 spin_unlock_irqrestore(&zone->lru_lock, flags);
58 } 58 }
59}
60
61static void __put_single_page(struct page *page)
62{
63 __page_cache_release(page);
59 free_hot_cold_page(page, 0); 64 free_hot_cold_page(page, 0);
60} 65}
61 66
62static void put_compound_page(struct page *page) 67static void __put_compound_page(struct page *page)
63{ 68{
64 page = compound_head(page); 69 compound_page_dtor *dtor;
65 if (put_page_testzero(page)) {
66 compound_page_dtor *dtor;
67 70
68 dtor = get_compound_page_dtor(page); 71 __page_cache_release(page);
69 (*dtor)(page); 72 dtor = get_compound_page_dtor(page);
73 (*dtor)(page);
74}
75
76static void put_compound_page(struct page *page)
77{
78 if (unlikely(PageTail(page))) {
79 /* __split_huge_page_refcount can run under us */
80 struct page *page_head = page->first_page;
81 smp_rmb();
82 /*
83 * If PageTail is still set after smp_rmb() we can be sure
84 * that the page->first_page we read wasn't a dangling pointer.
85 * See __split_huge_page_refcount() smp_wmb().
86 */
87 if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
88 unsigned long flags;
89 /*
90 * Verify that our page_head wasn't converted
91 * to a regular page before we got a
92 * reference on it.
93 */
94 if (unlikely(!PageHead(page_head))) {
95 /* PageHead is cleared after PageTail */
96 smp_rmb();
97 VM_BUG_ON(PageTail(page));
98 goto out_put_head;
99 }
100 /*
101 * Only run compound_lock on a valid PageHead,
102 * after having it pinned with
103 * get_page_unless_zero() above.
104 */
105 smp_mb();
106 /* page_head wasn't a dangling pointer */
107 flags = compound_lock_irqsave(page_head);
108 if (unlikely(!PageTail(page))) {
109 /* __split_huge_page_refcount run before us */
110 compound_unlock_irqrestore(page_head, flags);
111 VM_BUG_ON(PageHead(page_head));
112 out_put_head:
113 if (put_page_testzero(page_head))
114 __put_single_page(page_head);
115 out_put_single:
116 if (put_page_testzero(page))
117 __put_single_page(page);
118 return;
119 }
120 VM_BUG_ON(page_head != page->first_page);
121 /*
122 * We can release the refcount taken by
123 * get_page_unless_zero now that
124 * split_huge_page_refcount is blocked on the
125 * compound_lock.
126 */
127 if (put_page_testzero(page_head))
128 VM_BUG_ON(1);
129 /* __split_huge_page_refcount will wait now */
130 VM_BUG_ON(atomic_read(&page->_count) <= 0);
131 atomic_dec(&page->_count);
132 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
133 compound_unlock_irqrestore(page_head, flags);
134 if (put_page_testzero(page_head)) {
135 if (PageHead(page_head))
136 __put_compound_page(page_head);
137 else
138 __put_single_page(page_head);
139 }
140 } else {
141 /* page_head is a dangling pointer */
142 VM_BUG_ON(PageTail(page));
143 goto out_put_single;
144 }
145 } else if (put_page_testzero(page)) {
146 if (PageHead(page))
147 __put_compound_page(page);
148 else
149 __put_single_page(page);
70 } 150 }
71} 151}
72 152
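
The reworked put_compound_page() has to tolerate __split_huge_page_refcount() running concurrently: it pins the head with get_page_unless_zero(), re-checks PageTail under the compound lock, and only then transfers the tail's reference. The take-a-reference-only-if-still-nonzero primitive at the heart of that dance can be sketched with a compare-and-swap loop; the toy_page type below is an assumption, not the kernel structure.

/*
 * Sketch of the get_page_unless_zero() primitive: take a reference only
 * if the object is not already on its way to being freed.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_page {
	atomic_int count;
};

static bool get_unless_zero(struct toy_page *page)
{
	int old = atomic_load(&page->count);

	while (old != 0) {
		/* Try to move old -> old + 1; on failure, old is reloaded. */
		if (atomic_compare_exchange_weak(&page->count, &old, old + 1))
			return true;
	}
	return false;		/* already zero: the page may be being freed */
}

static bool put_testzero(struct toy_page *page)
{
	return atomic_fetch_sub(&page->count, 1) == 1;
}

int main(void)
{
	struct toy_page live = { .count = 2 };
	struct toy_page dying = { .count = 0 };

	printf("pin live:  %d\n", get_unless_zero(&live));	/* 1 */
	printf("pin dying: %d\n", get_unless_zero(&dying));	/* 0 */
	if (!put_testzero(&live))
		printf("live page still has references\n");
	return 0;
}
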
@@ -75,7 +155,7 @@ void put_page(struct page *page)
75 if (unlikely(PageCompound(page))) 155 if (unlikely(PageCompound(page)))
76 put_compound_page(page); 156 put_compound_page(page);
77 else if (put_page_testzero(page)) 157 else if (put_page_testzero(page))
78 __page_cache_release(page); 158 __put_single_page(page);
79} 159}
80EXPORT_SYMBOL(put_page); 160EXPORT_SYMBOL(put_page);
81 161
@@ -98,15 +178,13 @@ void put_pages_list(struct list_head *pages)
98} 178}
99EXPORT_SYMBOL(put_pages_list); 179EXPORT_SYMBOL(put_pages_list);
100 180
101/* 181static void pagevec_lru_move_fn(struct pagevec *pvec,
102 * pagevec_move_tail() must be called with IRQ disabled. 182 void (*move_fn)(struct page *page, void *arg),
103 * Otherwise this may cause nasty races. 183 void *arg)
104 */
105static void pagevec_move_tail(struct pagevec *pvec)
106{ 184{
107 int i; 185 int i;
108 int pgmoved = 0;
109 struct zone *zone = NULL; 186 struct zone *zone = NULL;
187 unsigned long flags = 0;
110 188
111 for (i = 0; i < pagevec_count(pvec); i++) { 189 for (i = 0; i < pagevec_count(pvec); i++) {
112 struct page *page = pvec->pages[i]; 190 struct page *page = pvec->pages[i];
@@ -114,29 +192,49 @@ static void pagevec_move_tail(struct pagevec *pvec)
114 192
115 if (pagezone != zone) { 193 if (pagezone != zone) {
116 if (zone) 194 if (zone)
117 spin_unlock(&zone->lru_lock); 195 spin_unlock_irqrestore(&zone->lru_lock, flags);
118 zone = pagezone; 196 zone = pagezone;
119 spin_lock(&zone->lru_lock); 197 spin_lock_irqsave(&zone->lru_lock, flags);
120 }
121 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
122 int lru = page_lru_base_type(page);
123 list_move_tail(&page->lru, &zone->lru[lru].list);
124 pgmoved++;
125 } 198 }
199
200 (*move_fn)(page, arg);
126 } 201 }
127 if (zone) 202 if (zone)
128 spin_unlock(&zone->lru_lock); 203 spin_unlock_irqrestore(&zone->lru_lock, flags);
129 __count_vm_events(PGROTATED, pgmoved); 204 release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
130 release_pages(pvec->pages, pvec->nr, pvec->cold);
131 pagevec_reinit(pvec); 205 pagevec_reinit(pvec);
132} 206}
133 207
208static void pagevec_move_tail_fn(struct page *page, void *arg)
209{
210 int *pgmoved = arg;
211 struct zone *zone = page_zone(page);
212
213 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
214 int lru = page_lru_base_type(page);
215 list_move_tail(&page->lru, &zone->lru[lru].list);
216 (*pgmoved)++;
217 }
218}
219
220/*
221 * pagevec_move_tail() must be called with IRQ disabled.
222 * Otherwise this may cause nasty races.
223 */
224static void pagevec_move_tail(struct pagevec *pvec)
225{
226 int pgmoved = 0;
227
228 pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
229 __count_vm_events(PGROTATED, pgmoved);
230}
231
134/* 232/*
135 * Writeback is about to end against a page which has been marked for immediate 233 * Writeback is about to end against a page which has been marked for immediate
136 * reclaim. If it still appears to be reclaimable, move it to the tail of the 234 * reclaim. If it still appears to be reclaimable, move it to the tail of the
137 * inactive list. 235 * inactive list.
138 */ 236 */
139void rotate_reclaimable_page(struct page *page) 237void rotate_reclaimable_page(struct page *page)
140{ 238{
141 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && 239 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
142 !PageUnevictable(page) && PageLRU(page)) { 240 !PageUnevictable(page) && PageLRU(page)) {
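
pagevec_lru_move_fn() factors the batching pattern out of pagevec_move_tail() and, further down, ____pagevec_lru_add(): walk the batch, retake the per-zone lock only when the zone changes, and let a caller-supplied callback do the per-page work. A generic standalone version of that loop is sketched below; the group/item types and pthread mutexes are stand-ins for zones, pages and zone->lru_lock.

/*
 * Generic version of the batching loop: hold a group's lock only while
 * consecutive items belong to it, and let a callback do the per-item work.
 */
#include <pthread.h>
#include <stdio.h>

struct group {
	pthread_mutex_t lock;
	const char *name;
};

struct item {
	struct group *owner;
	int value;
};

static void batch_apply(struct item *items, int nr,
			void (*fn)(struct item *item, void *arg), void *arg)
{
	struct group *locked = NULL;

	for (int i = 0; i < nr; i++) {
		struct group *owner = items[i].owner;

		if (owner != locked) {	/* the lock changes only at group boundaries */
			if (locked)
				pthread_mutex_unlock(&locked->lock);
			locked = owner;
			pthread_mutex_lock(&locked->lock);
		}
		fn(&items[i], arg);
	}
	if (locked)
		pthread_mutex_unlock(&locked->lock);
}

static void count_item(struct item *item, void *arg)
{
	int *moved = arg;

	printf("processing %s:%d\n", item->owner->name, item->value);
	(*moved)++;
}

int main(void)
{
	struct group a = { PTHREAD_MUTEX_INITIALIZER, "zone-a" };
	struct group b = { PTHREAD_MUTEX_INITIALIZER, "zone-b" };
	struct item batch[] = { { &a, 1 }, { &a, 2 }, { &b, 3 }, { &a, 4 } };
	int moved = 0;

	batch_apply(batch, 4, count_item, &moved);
	printf("moved %d items\n", moved);
	return 0;
}
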
@@ -173,27 +271,94 @@ static void update_page_reclaim_stat(struct zone *zone, struct page *page,
173} 271}
174 272
175/* 273/*
176 * FIXME: speed this up? 274 * A page will go to active list either by activate_page or putback_lru_page.
275 * In the activate_page case, the page hasn't active bit set. The page might
276 * not in LRU list because it's isolated before it gets a chance to be moved to
277 * active list. The window is small because pagevec just stores several pages.
278 * For such case, we do nothing for such page.
279 * In the putback_lru_page case, the page isn't in lru list but has active
280 * bit set
177 */ 281 */
178void activate_page(struct page *page) 282static void __activate_page(struct page *page, void *arg)
179{ 283{
180 struct zone *zone = page_zone(page); 284 struct zone *zone = page_zone(page);
285 int file = page_is_file_cache(page);
286 int lru = page_lru_base_type(page);
287 bool putback = !PageLRU(page);
181 288
182 spin_lock_irq(&zone->lru_lock); 289 /* The page is isolated before it's moved to active list */
183 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 290 if (!PageLRU(page) && !PageActive(page))
184 int file = page_is_file_cache(page); 291 return;
185 int lru = page_lru_base_type(page); 292 if ((PageLRU(page) && PageActive(page)) || PageUnevictable(page))
293 return;
294
295 if (!putback)
186 del_page_from_lru_list(zone, page, lru); 296 del_page_from_lru_list(zone, page, lru);
297 else
298 SetPageLRU(page);
187 299
188 SetPageActive(page); 300 SetPageActive(page);
189 lru += LRU_ACTIVE; 301 lru += LRU_ACTIVE;
190 add_page_to_lru_list(zone, page, lru); 302 add_page_to_lru_list(zone, page, lru);
191 __count_vm_event(PGACTIVATE);
192 303
193 update_page_reclaim_stat(zone, page, file, 1); 304 if (putback)
305 return;
306 __count_vm_event(PGACTIVATE);
307 update_page_reclaim_stat(zone, page, file, 1);
308}
309
310#ifdef CONFIG_SMP
311static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
312
313static void activate_page_drain(int cpu)
314{
315 struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
316
317 if (pagevec_count(pvec))
318 pagevec_lru_move_fn(pvec, __activate_page, NULL);
319}
320
321void activate_page(struct page *page)
322{
323 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
324 struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
325
326 page_cache_get(page);
327 if (!pagevec_add(pvec, page))
328 pagevec_lru_move_fn(pvec, __activate_page, NULL);
329 put_cpu_var(activate_page_pvecs);
330 }
331}
332
333/* Caller should hold zone->lru_lock */
334int putback_active_lru_page(struct zone *zone, struct page *page)
335{
336 struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
337
338 if (!pagevec_add(pvec, page)) {
339 spin_unlock_irq(&zone->lru_lock);
340 pagevec_lru_move_fn(pvec, __activate_page, NULL);
341 spin_lock_irq(&zone->lru_lock);
194 } 342 }
343 put_cpu_var(activate_page_pvecs);
344 return 1;
345}
346
347#else
348static inline void activate_page_drain(int cpu)
349{
350}
351
352void activate_page(struct page *page)
353{
354 struct zone *zone = page_zone(page);
355
356 spin_lock_irq(&zone->lru_lock);
357 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page))
358 __activate_page(page, NULL);
195 spin_unlock_irq(&zone->lru_lock); 359 spin_unlock_irq(&zone->lru_lock);
196} 360}
361#endif
197 362
198/* 363/*
199 * Mark a page as having seen activity. 364 * Mark a page as having seen activity.
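
On SMP, activate_page() no longer takes zone->lru_lock for every single page: it stashes the page in a per-CPU pagevec and drains the whole batch through pagevec_lru_move_fn() only when the vector fills up or the CPU is drained. The same amortization, sketched with a per-thread buffer that flushes when full; the batch size and drain action below are assumptions for illustration:

/*
 * Sketch of per-CPU batching: accumulate work in a small thread-local
 * vector and pay the expensive flush only once per batch.
 */
#include <stdio.h>

#define BATCH_SIZE 14	/* PAGEVEC_SIZE is 14 in this kernel; reused here */

struct batch {
	int nr;
	unsigned long items[BATCH_SIZE];
};

static _Thread_local struct batch activate_batch;

/* Stands in for taking the LRU lock once and moving every batched page. */
static void drain(struct batch *b)
{
	printf("draining %d items under one lock acquisition\n", b->nr);
	b->nr = 0;
}

static void activate(unsigned long item)
{
	struct batch *b = &activate_batch;

	b->items[b->nr++] = item;
	if (b->nr == BATCH_SIZE)
		drain(b);		/* vector full: flush the whole batch */
}

int main(void)
{
	for (unsigned long i = 0; i < 30; i++)
		activate(i);
	if (activate_batch.nr)
		drain(&activate_batch);	/* final partial batch */
	return 0;
}
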
@@ -292,6 +457,7 @@ static void drain_cpu_pagevecs(int cpu)
292 pagevec_move_tail(pvec); 457 pagevec_move_tail(pvec);
293 local_irq_restore(flags); 458 local_irq_restore(flags);
294 } 459 }
460 activate_page_drain(cpu);
295} 461}
296 462
297void lru_add_drain(void) 463void lru_add_drain(void)
@@ -399,44 +565,70 @@ void __pagevec_release(struct pagevec *pvec)
399 565
400EXPORT_SYMBOL(__pagevec_release); 566EXPORT_SYMBOL(__pagevec_release);
401 567
568/* used by __split_huge_page_refcount() */
569void lru_add_page_tail(struct zone* zone,
570 struct page *page, struct page *page_tail)
571{
572 int active;
573 enum lru_list lru;
574 const int file = 0;
575 struct list_head *head;
576
577 VM_BUG_ON(!PageHead(page));
578 VM_BUG_ON(PageCompound(page_tail));
579 VM_BUG_ON(PageLRU(page_tail));
580 VM_BUG_ON(!spin_is_locked(&zone->lru_lock));
581
582 SetPageLRU(page_tail);
583
584 if (page_evictable(page_tail, NULL)) {
585 if (PageActive(page)) {
586 SetPageActive(page_tail);
587 active = 1;
588 lru = LRU_ACTIVE_ANON;
589 } else {
590 active = 0;
591 lru = LRU_INACTIVE_ANON;
592 }
593 update_page_reclaim_stat(zone, page_tail, file, active);
594 if (likely(PageLRU(page)))
595 head = page->lru.prev;
596 else
597 head = &zone->lru[lru].list;
598 __add_page_to_lru_list(zone, page_tail, lru, head);
599 } else {
600 SetPageUnevictable(page_tail);
601 add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE);
602 }
603}
604
605static void ____pagevec_lru_add_fn(struct page *page, void *arg)
606{
607 enum lru_list lru = (enum lru_list)arg;
608 struct zone *zone = page_zone(page);
609 int file = is_file_lru(lru);
610 int active = is_active_lru(lru);
611
612 VM_BUG_ON(PageActive(page));
613 VM_BUG_ON(PageUnevictable(page));
614 VM_BUG_ON(PageLRU(page));
615
616 SetPageLRU(page);
617 if (active)
618 SetPageActive(page);
619 update_page_reclaim_stat(zone, page, file, active);
620 add_page_to_lru_list(zone, page, lru);
621}
622
402/* 623/*
403 * Add the passed pages to the LRU, then drop the caller's refcount 624 * Add the passed pages to the LRU, then drop the caller's refcount
404 * on them. Reinitialises the caller's pagevec. 625 * on them. Reinitialises the caller's pagevec.
405 */ 626 */
406void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) 627void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
407{ 628{
408 int i;
409 struct zone *zone = NULL;
410
411 VM_BUG_ON(is_unevictable_lru(lru)); 629 VM_BUG_ON(is_unevictable_lru(lru));
412 630
413 for (i = 0; i < pagevec_count(pvec); i++) { 631 pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru);
414 struct page *page = pvec->pages[i];
415 struct zone *pagezone = page_zone(page);
416 int file;
417 int active;
418
419 if (pagezone != zone) {
420 if (zone)
421 spin_unlock_irq(&zone->lru_lock);
422 zone = pagezone;
423 spin_lock_irq(&zone->lru_lock);
424 }
425 VM_BUG_ON(PageActive(page));
426 VM_BUG_ON(PageUnevictable(page));
427 VM_BUG_ON(PageLRU(page));
428 SetPageLRU(page);
429 active = is_active_lru(lru);
430 file = is_file_lru(lru);
431 if (active)
432 SetPageActive(page);
433 update_page_reclaim_stat(zone, page, file, active);
434 add_page_to_lru_list(zone, page, lru);
435 }
436 if (zone)
437 spin_unlock_irq(&zone->lru_lock);
438 release_pages(pvec->pages, pvec->nr, pvec->cold);
439 pagevec_reinit(pvec);
440} 632}
441 633
442EXPORT_SYMBOL(____pagevec_lru_add); 634EXPORT_SYMBOL(____pagevec_lru_add);
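
____pagevec_lru_add() now delegates its per-zone lock juggling to pagevec_lru_move_fn() with ____pagevec_lru_add_fn() as the per-page callback, and lru_add_page_tail() reuses the same LRU helpers for THP tail pages. A rough sketch of that callback pattern, with toy types standing in for struct page and the zone lock (not the kernel API):

    #include <stdio.h>

    struct page { int zone; int id; };

    /* Stand-in for pagevec_lru_move_fn(): walk the batch and take each
     * zone's lock only when the zone changes between consecutive pages. */
    static void lru_move_fn(struct page *pages, int nr,
                            void (*move)(struct page *))
    {
        int locked_zone = -1;

        for (int i = 0; i < nr; i++) {
            if (pages[i].zone != locked_zone) {
                if (locked_zone != -1)
                    printf("unlock zone %d\n", locked_zone);
                locked_zone = pages[i].zone;
                printf("lock zone %d\n", locked_zone);
            }
            move(&pages[i]);        /* e.g. ____pagevec_lru_add_fn() */
        }
        if (locked_zone != -1)
            printf("unlock zone %d\n", locked_zone);
    }

    static void add_to_lru(struct page *p)
    {
        printf("  add page %d to zone %d LRU\n", p->id, p->zone);
    }

    int main(void)
    {
        struct page pages[] = { {0, 1}, {0, 2}, {1, 3}, {1, 4}, {0, 5} };

        lru_move_fn(pages, 5, add_to_lru);
        return 0;
    }
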
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e10f5833167f..5c8cfabbc9bc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -157,6 +157,12 @@ int add_to_swap(struct page *page)
157 if (!entry.val) 157 if (!entry.val)
158 return 0; 158 return 0;
159 159
160 if (unlikely(PageTransHuge(page)))
161 if (unlikely(split_huge_page(page))) {
162 swapcache_free(entry, NULL);
163 return 0;
164 }
165
160 /* 166 /*
161 * Radix-tree node allocations from PF_MEMALLOC contexts could 167 * Radix-tree node allocations from PF_MEMALLOC contexts could
162 * completely exhaust the page allocator. __GFP_NOMEMALLOC 168 * completely exhaust the page allocator. __GFP_NOMEMALLOC
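
The add_to_swap() change splits a transparent huge page before it can enter the swap cache and hands the freshly allocated swap entry back if the split fails. A toy sketch of that guard shape only; is_huge/split_huge/free_swap_entry are made-up stand-ins, and the kernel's split_huge_page() returns nonzero on failure rather than a boolean:

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative stand-ins: these are not the kernel helpers. */
    static bool is_huge(int page)      { return page % 4 == 0; }
    static bool split_huge(int page)   { return page != 8; }  /* pretend page 8 fails */
    static void free_swap_entry(int e) { printf("freed swap entry %d\n", e); }

    /* Shape of the new guard: a huge page must be split before swap-out;
     * on failure the already-allocated swap entry is released. */
    static int add_to_swap(int page, int entry)
    {
        if (is_huge(page) && !split_huge(page)) {
            free_swap_entry(entry);
            return 0;
        }
        printf("page %d added to swap with entry %d\n", page, entry);
        return 1;
    }

    int main(void)
    {
        add_to_swap(4, 100);
        add_to_swap(8, 101);
        return 0;
    }
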
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b6adcfbf6f48..07a458d72fa8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -964,6 +964,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
964 pmd = pmd_offset(pud, addr); 964 pmd = pmd_offset(pud, addr);
965 do { 965 do {
966 next = pmd_addr_end(addr, end); 966 next = pmd_addr_end(addr, end);
967 if (unlikely(pmd_trans_huge(*pmd)))
968 continue;
967 if (pmd_none_or_clear_bad(pmd)) 969 if (pmd_none_or_clear_bad(pmd))
968 continue; 970 continue;
969 ret = unuse_pte_range(vma, pmd, addr, next, entry, page); 971 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index eb5cc7d00c5a..cac13b415635 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -748,7 +748,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
748 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, 748 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
749 VMALLOC_START, VMALLOC_END, 749 VMALLOC_START, VMALLOC_END,
750 node, gfp_mask); 750 node, gfp_mask);
751 if (unlikely(IS_ERR(va))) { 751 if (IS_ERR(va)) {
752 kfree(vb); 752 kfree(vb);
753 return ERR_CAST(va); 753 return ERR_CAST(va);
754 } 754 }
@@ -1315,13 +1315,6 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1315 -1, GFP_KERNEL, caller); 1315 -1, GFP_KERNEL, caller);
1316} 1316}
1317 1317
1318struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
1319 int node, gfp_t gfp_mask)
1320{
1321 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1322 node, gfp_mask, __builtin_return_address(0));
1323}
1324
1325static struct vm_struct *find_vm_area(const void *addr) 1318static struct vm_struct *find_vm_area(const void *addr)
1326{ 1319{
1327 struct vmap_area *va; 1320 struct vmap_area *va;
@@ -1537,25 +1530,12 @@ fail:
1537 return NULL; 1530 return NULL;
1538} 1531}
1539 1532
1540void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1541{
1542 void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1,
1543 __builtin_return_address(0));
1544
1545 /*
1546 * A ref_count = 3 is needed because the vm_struct and vmap_area
1547 * structures allocated in the __get_vm_area_node() function contain
1548 * references to the virtual address of the vmalloc'ed block.
1549 */
1550 kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask);
1551
1552 return addr;
1553}
1554
1555/** 1533/**
1556 * __vmalloc_node - allocate virtually contiguous memory 1534 * __vmalloc_node_range - allocate virtually contiguous memory
1557 * @size: allocation size 1535 * @size: allocation size
1558 * @align: desired alignment 1536 * @align: desired alignment
1537 * @start: vm area range start
1538 * @end: vm area range end
1559 * @gfp_mask: flags for the page level allocator 1539 * @gfp_mask: flags for the page level allocator
1560 * @prot: protection mask for the allocated pages 1540 * @prot: protection mask for the allocated pages
1561 * @node: node to use for allocation or -1 1541 * @node: node to use for allocation or -1
@@ -1565,9 +1545,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1565 * allocator with @gfp_mask flags. Map them into contiguous 1545 * allocator with @gfp_mask flags. Map them into contiguous
1566 * kernel virtual space, using a pagetable protection of @prot. 1546 * kernel virtual space, using a pagetable protection of @prot.
1567 */ 1547 */
1568static void *__vmalloc_node(unsigned long size, unsigned long align, 1548void *__vmalloc_node_range(unsigned long size, unsigned long align,
1569 gfp_t gfp_mask, pgprot_t prot, 1549 unsigned long start, unsigned long end, gfp_t gfp_mask,
1570 int node, void *caller) 1550 pgprot_t prot, int node, void *caller)
1571{ 1551{
1572 struct vm_struct *area; 1552 struct vm_struct *area;
1573 void *addr; 1553 void *addr;
@@ -1577,8 +1557,8 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1577 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1557 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1578 return NULL; 1558 return NULL;
1579 1559
1580 area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, 1560 area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node,
1581 VMALLOC_END, node, gfp_mask, caller); 1561 gfp_mask, caller);
1582 1562
1583 if (!area) 1563 if (!area)
1584 return NULL; 1564 return NULL;
@@ -1595,6 +1575,27 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1595 return addr; 1575 return addr;
1596} 1576}
1597 1577
1578/**
1579 * __vmalloc_node - allocate virtually contiguous memory
1580 * @size: allocation size
1581 * @align: desired alignment
1582 * @gfp_mask: flags for the page level allocator
1583 * @prot: protection mask for the allocated pages
1584 * @node: node to use for allocation or -1
1585 * @caller: caller's return address
1586 *
1587 * Allocate enough pages to cover @size from the page level
1588 * allocator with @gfp_mask flags. Map them into contiguous
1589 * kernel virtual space, using a pagetable protection of @prot.
1590 */
1591static void *__vmalloc_node(unsigned long size, unsigned long align,
1592 gfp_t gfp_mask, pgprot_t prot,
1593 int node, void *caller)
1594{
1595 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
1596 gfp_mask, prot, node, caller);
1597}
1598
1598void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 1599void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1599{ 1600{
1600 return __vmalloc_node(size, 1, gfp_mask, prot, -1, 1601 return __vmalloc_node(size, 1, gfp_mask, prot, -1,
@@ -2203,17 +2204,16 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
2203 * @sizes: array containing size of each area 2204 * @sizes: array containing size of each area
2204 * @nr_vms: the number of areas to allocate 2205 * @nr_vms: the number of areas to allocate
2205 * @align: alignment, all entries in @offsets and @sizes must be aligned to this 2206 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
2206 * @gfp_mask: allocation mask
2207 * 2207 *
2208 * Returns: kmalloc'd vm_struct pointer array pointing to allocated 2208 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
2209 * vm_structs on success, %NULL on failure 2209 * vm_structs on success, %NULL on failure
2210 * 2210 *
2211 * Percpu allocator wants to use congruent vm areas so that it can 2211 * Percpu allocator wants to use congruent vm areas so that it can
2212 * maintain the offsets among percpu areas. This function allocates 2212 * maintain the offsets among percpu areas. This function allocates
2213 * congruent vmalloc areas for it. These areas tend to be scattered 2213 * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
2214 * pretty far, distance between two areas easily going up to 2214 * be scattered pretty far, distance between two areas easily going up
2215 * gigabytes. To avoid interacting with regular vmallocs, these areas 2215 * to gigabytes. To avoid interacting with regular vmallocs, these
2216 * are allocated from top. 2216 * areas are allocated from top.
2217 * 2217 *
2218 * Despite its complicated look, this allocator is rather simple. It 2218 * Despite its complicated look, this allocator is rather simple. It
2219 * does everything top-down and scans areas from the end looking for 2219 * does everything top-down and scans areas from the end looking for
@@ -2224,7 +2224,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
2224 */ 2224 */
2225struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 2225struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2226 const size_t *sizes, int nr_vms, 2226 const size_t *sizes, int nr_vms,
2227 size_t align, gfp_t gfp_mask) 2227 size_t align)
2228{ 2228{
2229 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); 2229 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
2230 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 2230 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
@@ -2234,8 +2234,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2234 unsigned long base, start, end, last_end; 2234 unsigned long base, start, end, last_end;
2235 bool purged = false; 2235 bool purged = false;
2236 2236
2237 gfp_mask &= GFP_RECLAIM_MASK;
2238
2239 /* verify parameters and allocate data structures */ 2237 /* verify parameters and allocate data structures */
2240 BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); 2238 BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
2241 for (last_area = 0, area = 0; area < nr_vms; area++) { 2239 for (last_area = 0, area = 0; area < nr_vms; area++) {
@@ -2268,14 +2266,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2268 return NULL; 2266 return NULL;
2269 } 2267 }
2270 2268
2271 vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); 2269 vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL);
2272 vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); 2270 vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL);
2273 if (!vas || !vms) 2271 if (!vas || !vms)
2274 goto err_free; 2272 goto err_free;
2275 2273
2276 for (area = 0; area < nr_vms; area++) { 2274 for (area = 0; area < nr_vms; area++) {
2277 vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); 2275 vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
2278 vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); 2276 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
2279 if (!vas[area] || !vms[area]) 2277 if (!vas[area] || !vms[area])
2280 goto err_free; 2278 goto err_free;
2281 } 2279 }
@@ -2456,13 +2454,8 @@ static int s_show(struct seq_file *m, void *p)
2456 seq_printf(m, "0x%p-0x%p %7ld", 2454 seq_printf(m, "0x%p-0x%p %7ld",
2457 v->addr, v->addr + v->size, v->size); 2455 v->addr, v->addr + v->size, v->size);
2458 2456
2459 if (v->caller) { 2457 if (v->caller)
2460 char buff[KSYM_SYMBOL_LEN]; 2458 seq_printf(m, " %pS", v->caller);
2461
2462 seq_putc(m, ' ');
2463 sprint_symbol(buff, (unsigned long)v->caller);
2464 seq_puts(m, buff);
2465 }
2466 2459
2467 if (v->nr_pages) 2460 if (v->nr_pages)
2468 seq_printf(m, " pages=%d", v->nr_pages); 2461 seq_printf(m, " pages=%d", v->nr_pages);
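
The vmalloc.c changes fold the removed helpers into __vmalloc_node_range(), which takes the virtual range as parameters, while __vmalloc_node() survives as a thin wrapper supplying VMALLOC_START/VMALLOC_END. A sketch of that wrapper-around-a-range-worker shape, using made-up names and no real mappings:

    #include <stdio.h>

    #define VSTART 0x1000UL         /* stands in for VMALLOC_START */
    #define VEND   0x9000UL         /* stands in for VMALLOC_END   */

    /* Generalised worker: the caller chooses the virtual range, the way
     * __vmalloc_node_range() now lets module space differ from vmalloc space. */
    static void *alloc_node_range(unsigned long size, unsigned long start,
                                  unsigned long end)
    {
        printf("map %lu bytes somewhere in [0x%lx, 0x%lx)\n", size, start, end);
        return (void *)start;       /* placeholder, no real mapping here */
    }

    /* The old entry point becomes a thin wrapper with the default range. */
    static void *alloc_node(unsigned long size)
    {
        return alloc_node_range(size, VSTART, VEND);
    }

    int main(void)
    {
        alloc_node(4096);                         /* ordinary vmalloc-style call */
        alloc_node_range(4096, 0x100UL, 0x800UL); /* caller-supplied range       */
        return 0;
    }
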
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9ca587c69274..99999a9b2b0b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -32,6 +32,7 @@
32#include <linux/topology.h> 32#include <linux/topology.h>
33#include <linux/cpu.h> 33#include <linux/cpu.h>
34#include <linux/cpuset.h> 34#include <linux/cpuset.h>
35#include <linux/compaction.h>
35#include <linux/notifier.h> 36#include <linux/notifier.h>
36#include <linux/rwsem.h> 37#include <linux/rwsem.h>
37#include <linux/delay.h> 38#include <linux/delay.h>
@@ -40,6 +41,7 @@
40#include <linux/memcontrol.h> 41#include <linux/memcontrol.h>
41#include <linux/delayacct.h> 42#include <linux/delayacct.h>
42#include <linux/sysctl.h> 43#include <linux/sysctl.h>
44#include <linux/compaction.h>
43 45
44#include <asm/tlbflush.h> 46#include <asm/tlbflush.h>
45#include <asm/div64.h> 47#include <asm/div64.h>
@@ -51,11 +53,23 @@
51#define CREATE_TRACE_POINTS 53#define CREATE_TRACE_POINTS
52#include <trace/events/vmscan.h> 54#include <trace/events/vmscan.h>
53 55
54enum lumpy_mode { 56/*
55 LUMPY_MODE_NONE, 57 * reclaim_mode determines how the inactive list is shrunk
56 LUMPY_MODE_ASYNC, 58 * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
57 LUMPY_MODE_SYNC, 59 * RECLAIM_MODE_ASYNC: Do not block
58}; 60 * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback
61 * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
62 * page from the LRU and reclaim all pages within a
63 * naturally aligned range
64 * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
65 * order-0 pages and then compact the zone
66 */
67typedef unsigned __bitwise__ reclaim_mode_t;
68#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u)
69#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u)
70#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u)
71#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u)
72#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u)
59 73
60struct scan_control { 74struct scan_control {
61 /* Incremented by the number of inactive pages that were scanned */ 75 /* Incremented by the number of inactive pages that were scanned */
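
The reclaim_mode_t above replaces the three-valued lumpy_mode enum with or-able flag bits, so one field can say both how the inactive list is shrunk and whether blocking is allowed. A plain-unsigned sketch of how such bits combine and are tested (the __bitwise__/__force annotations are sparse checking only):

    #include <stdio.h>

    /* Plain unsigned stand-ins for the __bitwise__ reclaim_mode_t flags. */
    #define MODE_SINGLE        0x01u
    #define MODE_ASYNC         0x02u
    #define MODE_SYNC          0x04u
    #define MODE_LUMPYRECLAIM  0x08u
    #define MODE_COMPACTION    0x10u

    int main(void)
    {
        /* A mode is now a combination of bits rather than a single enum
         * value, so one test can ask "is sync reclaim in effect?" no matter
         * whether lumpy reclaim or reclaim/compaction set it. */
        unsigned mode = MODE_COMPACTION | MODE_SYNC;

        if (mode & MODE_SYNC)
            printf("may block on writeback\n");
        if (mode & MODE_LUMPYRECLAIM)
            printf("lumpy reclaim: ignore references\n");
        else if (mode & MODE_COMPACTION)
            printf("reclaim order-0 pages, then compact\n");
        return 0;
    }
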
@@ -88,7 +102,7 @@ struct scan_control {
88 * Intend to reclaim enough continuous memory rather than reclaim 102 * Intend to reclaim enough continuous memory rather than reclaim
89 * enough amount of memory. i.e, mode for high order allocation. 103 * enough amount of memory. i.e, mode for high order allocation.
90 */ 104 */
91 enum lumpy_mode lumpy_reclaim_mode; 105 reclaim_mode_t reclaim_mode;
92 106
93 /* Which cgroup do we reclaim from */ 107 /* Which cgroup do we reclaim from */
94 struct mem_cgroup *mem_cgroup; 108 struct mem_cgroup *mem_cgroup;
@@ -271,34 +285,37 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
271 return ret; 285 return ret;
272} 286}
273 287
274static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, 288static void set_reclaim_mode(int priority, struct scan_control *sc,
275 bool sync) 289 bool sync)
276{ 290{
277 enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; 291 reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
278 292
279 /* 293 /*
280 * Some reclaim have alredy been failed. No worth to try synchronous 294 * Initially assume we are entering either lumpy reclaim or
281 * lumpy reclaim. 295 * reclaim/compaction. Depending on the order, we will either set the
296 * sync mode or just reclaim order-0 pages later.
282 */ 297 */
283 if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) 298 if (COMPACTION_BUILD)
284 return; 299 sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
300 else
301 sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
285 302
286 /* 303 /*
287 * If we need a large contiguous chunk of memory, or have 304 * Avoid using lumpy reclaim or reclaim/compaction if possible by
288 * trouble getting a small set of contiguous pages, we 305 * restricting when it's set to either costly allocations or when
289 * will reclaim both active and inactive pages. 306 * under memory pressure
290 */ 307 */
291 if (sc->order > PAGE_ALLOC_COSTLY_ORDER) 308 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
292 sc->lumpy_reclaim_mode = mode; 309 sc->reclaim_mode |= syncmode;
293 else if (sc->order && priority < DEF_PRIORITY - 2) 310 else if (sc->order && priority < DEF_PRIORITY - 2)
294 sc->lumpy_reclaim_mode = mode; 311 sc->reclaim_mode |= syncmode;
295 else 312 else
296 sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; 313 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
297} 314}
298 315
299static void disable_lumpy_reclaim_mode(struct scan_control *sc) 316static void reset_reclaim_mode(struct scan_control *sc)
300{ 317{
301 sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; 318 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
302} 319}
303 320
304static inline int is_page_cache_freeable(struct page *page) 321static inline int is_page_cache_freeable(struct page *page)
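
set_reclaim_mode() above picks reclaim/compaction when compaction is built in, lumpy reclaim otherwise, and keeps either only for costly orders or once several priority levels have already failed. A compact model of that decision, assuming the usual values PAGE_ALLOC_COSTLY_ORDER=3 and DEF_PRIORITY=12; the flag values mirror the patch, everything else is a stand-in:

    #include <stdio.h>

    #define MODE_SINGLE        0x01u
    #define MODE_ASYNC         0x02u
    #define MODE_SYNC          0x04u
    #define MODE_LUMPYRECLAIM  0x08u
    #define MODE_COMPACTION    0x10u

    #define COSTLY_ORDER 3          /* stands in for PAGE_ALLOC_COSTLY_ORDER */
    #define DEF_PRIORITY 12

    /* Same decision shape as set_reclaim_mode(): choose the base mode, then
     * keep it only for costly orders or under sustained pressure. */
    static unsigned pick_mode(int order, int priority, int sync, int compaction_built)
    {
        unsigned syncmode = sync ? MODE_SYNC : MODE_ASYNC;
        unsigned mode = compaction_built ? MODE_COMPACTION : MODE_LUMPYRECLAIM;

        if (order > COSTLY_ORDER)
            return mode | syncmode;
        if (order && priority < DEF_PRIORITY - 2)
            return mode | syncmode;
        return MODE_SINGLE | MODE_ASYNC;
    }

    int main(void)
    {
        printf("order-0, fresh scan:   0x%02x\n", pick_mode(0, DEF_PRIORITY, 0, 1));
        printf("order-9, fresh scan:   0x%02x\n", pick_mode(9, DEF_PRIORITY, 1, 1));
        printf("order-2, under stress: 0x%02x\n", pick_mode(2, DEF_PRIORITY - 3, 0, 1));
        return 0;
    }
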
@@ -429,7 +446,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
429 * first attempt to free a range of pages fails. 446 * first attempt to free a range of pages fails.
430 */ 447 */
431 if (PageWriteback(page) && 448 if (PageWriteback(page) &&
432 sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC) 449 (sc->reclaim_mode & RECLAIM_MODE_SYNC))
433 wait_on_page_writeback(page); 450 wait_on_page_writeback(page);
434 451
435 if (!PageWriteback(page)) { 452 if (!PageWriteback(page)) {
@@ -437,7 +454,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
437 ClearPageReclaim(page); 454 ClearPageReclaim(page);
438 } 455 }
439 trace_mm_vmscan_writepage(page, 456 trace_mm_vmscan_writepage(page,
440 trace_reclaim_flags(page, sc->lumpy_reclaim_mode)); 457 trace_reclaim_flags(page, sc->reclaim_mode));
441 inc_zone_page_state(page, NR_VMSCAN_WRITE); 458 inc_zone_page_state(page, NR_VMSCAN_WRITE);
442 return PAGE_SUCCESS; 459 return PAGE_SUCCESS;
443 } 460 }
@@ -622,7 +639,7 @@ static enum page_references page_check_references(struct page *page,
622 referenced_page = TestClearPageReferenced(page); 639 referenced_page = TestClearPageReferenced(page);
623 640
624 /* Lumpy reclaim - ignore references */ 641 /* Lumpy reclaim - ignore references */
625 if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE) 642 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
626 return PAGEREF_RECLAIM; 643 return PAGEREF_RECLAIM;
627 644
628 /* 645 /*
@@ -739,7 +756,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
739 * for any page for which writeback has already 756 * for any page for which writeback has already
740 * started. 757 * started.
741 */ 758 */
742 if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC && 759 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
743 may_enter_fs) 760 may_enter_fs)
744 wait_on_page_writeback(page); 761 wait_on_page_writeback(page);
745 else { 762 else {
@@ -895,7 +912,7 @@ cull_mlocked:
895 try_to_free_swap(page); 912 try_to_free_swap(page);
896 unlock_page(page); 913 unlock_page(page);
897 putback_lru_page(page); 914 putback_lru_page(page);
898 disable_lumpy_reclaim_mode(sc); 915 reset_reclaim_mode(sc);
899 continue; 916 continue;
900 917
901activate_locked: 918activate_locked:
@@ -908,7 +925,7 @@ activate_locked:
908keep_locked: 925keep_locked:
909 unlock_page(page); 926 unlock_page(page);
910keep: 927keep:
911 disable_lumpy_reclaim_mode(sc); 928 reset_reclaim_mode(sc);
912keep_lumpy: 929keep_lumpy:
913 list_add(&page->lru, &ret_pages); 930 list_add(&page->lru, &ret_pages);
914 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 931 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
@@ -1028,7 +1045,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1028 case 0: 1045 case 0:
1029 list_move(&page->lru, dst); 1046 list_move(&page->lru, dst);
1030 mem_cgroup_del_lru(page); 1047 mem_cgroup_del_lru(page);
1031 nr_taken++; 1048 nr_taken += hpage_nr_pages(page);
1032 break; 1049 break;
1033 1050
1034 case -EBUSY: 1051 case -EBUSY:
@@ -1086,7 +1103,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1086 if (__isolate_lru_page(cursor_page, mode, file) == 0) { 1103 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1087 list_move(&cursor_page->lru, dst); 1104 list_move(&cursor_page->lru, dst);
1088 mem_cgroup_del_lru(cursor_page); 1105 mem_cgroup_del_lru(cursor_page);
1089 nr_taken++; 1106 nr_taken += hpage_nr_pages(page);
1090 nr_lumpy_taken++; 1107 nr_lumpy_taken++;
1091 if (PageDirty(cursor_page)) 1108 if (PageDirty(cursor_page))
1092 nr_lumpy_dirty++; 1109 nr_lumpy_dirty++;
@@ -1141,14 +1158,15 @@ static unsigned long clear_active_flags(struct list_head *page_list,
1141 struct page *page; 1158 struct page *page;
1142 1159
1143 list_for_each_entry(page, page_list, lru) { 1160 list_for_each_entry(page, page_list, lru) {
1161 int numpages = hpage_nr_pages(page);
1144 lru = page_lru_base_type(page); 1162 lru = page_lru_base_type(page);
1145 if (PageActive(page)) { 1163 if (PageActive(page)) {
1146 lru += LRU_ACTIVE; 1164 lru += LRU_ACTIVE;
1147 ClearPageActive(page); 1165 ClearPageActive(page);
1148 nr_active++; 1166 nr_active += numpages;
1149 } 1167 }
1150 if (count) 1168 if (count)
1151 count[lru]++; 1169 count[lru] += numpages;
1152 } 1170 }
1153 1171
1154 return nr_active; 1172 return nr_active;
@@ -1253,13 +1271,16 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
1253 spin_lock_irq(&zone->lru_lock); 1271 spin_lock_irq(&zone->lru_lock);
1254 continue; 1272 continue;
1255 } 1273 }
1256 SetPageLRU(page);
1257 lru = page_lru(page); 1274 lru = page_lru(page);
1258 add_page_to_lru_list(zone, page, lru);
1259 if (is_active_lru(lru)) { 1275 if (is_active_lru(lru)) {
1260 int file = is_file_lru(lru); 1276 int file = is_file_lru(lru);
1261 reclaim_stat->recent_rotated[file]++; 1277 int numpages = hpage_nr_pages(page);
1278 reclaim_stat->recent_rotated[file] += numpages;
1279 if (putback_active_lru_page(zone, page))
1280 continue;
1262 } 1281 }
1282 SetPageLRU(page);
1283 add_page_to_lru_list(zone, page, lru);
1263 if (!pagevec_add(&pvec, page)) { 1284 if (!pagevec_add(&pvec, page)) {
1264 spin_unlock_irq(&zone->lru_lock); 1285 spin_unlock_irq(&zone->lru_lock);
1265 __pagevec_release(&pvec); 1286 __pagevec_release(&pvec);
@@ -1324,7 +1345,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
1324 return false; 1345 return false;
1325 1346
1326 /* Only stall on lumpy reclaim */ 1347 /* Only stall on lumpy reclaim */
1327 if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) 1348 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1328 return false; 1349 return false;
1329 1350
1330 /* If we have relaimed everything on the isolated list, no stall */ 1351 /* If we have relaimed everything on the isolated list, no stall */
@@ -1368,15 +1389,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1368 return SWAP_CLUSTER_MAX; 1389 return SWAP_CLUSTER_MAX;
1369 } 1390 }
1370 1391
1371 set_lumpy_reclaim_mode(priority, sc, false); 1392 set_reclaim_mode(priority, sc, false);
1372 lru_add_drain(); 1393 lru_add_drain();
1373 spin_lock_irq(&zone->lru_lock); 1394 spin_lock_irq(&zone->lru_lock);
1374 1395
1375 if (scanning_global_lru(sc)) { 1396 if (scanning_global_lru(sc)) {
1376 nr_taken = isolate_pages_global(nr_to_scan, 1397 nr_taken = isolate_pages_global(nr_to_scan,
1377 &page_list, &nr_scanned, sc->order, 1398 &page_list, &nr_scanned, sc->order,
1378 sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? 1399 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1379 ISOLATE_INACTIVE : ISOLATE_BOTH, 1400 ISOLATE_BOTH : ISOLATE_INACTIVE,
1380 zone, 0, file); 1401 zone, 0, file);
1381 zone->pages_scanned += nr_scanned; 1402 zone->pages_scanned += nr_scanned;
1382 if (current_is_kswapd()) 1403 if (current_is_kswapd())
@@ -1388,8 +1409,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1388 } else { 1409 } else {
1389 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, 1410 nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
1390 &page_list, &nr_scanned, sc->order, 1411 &page_list, &nr_scanned, sc->order,
1391 sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? 1412 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1392 ISOLATE_INACTIVE : ISOLATE_BOTH, 1413 ISOLATE_BOTH : ISOLATE_INACTIVE,
1393 zone, sc->mem_cgroup, 1414 zone, sc->mem_cgroup,
1394 0, file); 1415 0, file);
1395 /* 1416 /*
@@ -1411,7 +1432,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1411 1432
1412 /* Check if we should syncronously wait for writeback */ 1433 /* Check if we should syncronously wait for writeback */
1413 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { 1434 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1414 set_lumpy_reclaim_mode(priority, sc, true); 1435 set_reclaim_mode(priority, sc, true);
1415 nr_reclaimed += shrink_page_list(&page_list, zone, sc); 1436 nr_reclaimed += shrink_page_list(&page_list, zone, sc);
1416 } 1437 }
1417 1438
@@ -1426,7 +1447,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1426 zone_idx(zone), 1447 zone_idx(zone),
1427 nr_scanned, nr_reclaimed, 1448 nr_scanned, nr_reclaimed,
1428 priority, 1449 priority,
1429 trace_shrink_flags(file, sc->lumpy_reclaim_mode)); 1450 trace_shrink_flags(file, sc->reclaim_mode));
1430 return nr_reclaimed; 1451 return nr_reclaimed;
1431} 1452}
1432 1453
@@ -1466,7 +1487,7 @@ static void move_active_pages_to_lru(struct zone *zone,
1466 1487
1467 list_move(&page->lru, &zone->lru[lru].list); 1488 list_move(&page->lru, &zone->lru[lru].list);
1468 mem_cgroup_add_lru_list(page, lru); 1489 mem_cgroup_add_lru_list(page, lru);
1469 pgmoved++; 1490 pgmoved += hpage_nr_pages(page);
1470 1491
1471 if (!pagevec_add(&pvec, page) || list_empty(list)) { 1492 if (!pagevec_add(&pvec, page) || list_empty(list)) {
1472 spin_unlock_irq(&zone->lru_lock); 1493 spin_unlock_irq(&zone->lru_lock);
@@ -1534,7 +1555,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1534 } 1555 }
1535 1556
1536 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { 1557 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1537 nr_rotated++; 1558 nr_rotated += hpage_nr_pages(page);
1538 /* 1559 /*
1539 * Identify referenced, file-backed active pages and 1560 * Identify referenced, file-backed active pages and
1540 * give them one more trip around the active list. So 1561 * give them one more trip around the active list. So
@@ -1805,6 +1826,57 @@ out:
1805} 1826}
1806 1827
1807/* 1828/*
1829 * Reclaim/compaction depends on a number of pages being freed. To avoid
1830 * disruption to the system, a small number of order-0 pages continue to be
1831 * rotated and reclaimed in the normal fashion. However, by the time we get
1832 * back to the allocator and call try_to_compact_zone(), we ensure that
1833 * there are enough free pages for it to be likely successful
1834 */
1835static inline bool should_continue_reclaim(struct zone *zone,
1836 unsigned long nr_reclaimed,
1837 unsigned long nr_scanned,
1838 struct scan_control *sc)
1839{
1840 unsigned long pages_for_compaction;
1841 unsigned long inactive_lru_pages;
1842
1843 /* If not in reclaim/compaction mode, stop */
1844 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
1845 return false;
1846
1847 /*
1848 * If we failed to reclaim and have scanned the full list, stop.
1849 * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far
1850 * faster but obviously would be less likely to succeed
1851 * allocation. If this is desirable, use GFP_REPEAT to decide
1852 * if both reclaimed and scanned should be checked or just
1853 * reclaimed
1854 */
1855 if (!nr_reclaimed && !nr_scanned)
1856 return false;
1857
1858 /*
1859 * If we have not reclaimed enough pages for compaction and the
1860 * inactive lists are large enough, continue reclaiming
1861 */
1862 pages_for_compaction = (2UL << sc->order);
1863 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
1864 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1865 if (sc->nr_reclaimed < pages_for_compaction &&
1866 inactive_lru_pages > pages_for_compaction)
1867 return true;
1868
1869 /* If compaction would go ahead or the allocation would succeed, stop */
1870 switch (compaction_suitable(zone, sc->order)) {
1871 case COMPACT_PARTIAL:
1872 case COMPACT_CONTINUE:
1873 return false;
1874 default:
1875 return true;
1876 }
1877}
1878
1879/*
1808 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1880 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1809 */ 1881 */
1810static void shrink_zone(int priority, struct zone *zone, 1882static void shrink_zone(int priority, struct zone *zone,
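
should_continue_reclaim() keeps shrink_zone() looping until roughly 2<<order pages have been reclaimed for the compactor, as long as the scan is still making progress. A simplified model of that test, with the compaction_suitable() escape hatch at the end of the real function left out:

    #include <stdbool.h>
    #include <stdio.h>

    #define MODE_COMPACTION 0x10u

    /* Simplified shape of should_continue_reclaim(): stay in the loop while
     * in reclaim/compaction mode, progress is being made, and the zone does
     * not yet hold about 2<<order reclaimed pages for the compactor. */
    static bool continue_reclaim(unsigned mode, unsigned long reclaimed_total,
                                 unsigned long reclaimed_this_pass,
                                 unsigned long scanned_this_pass,
                                 unsigned long inactive_pages, int order)
    {
        unsigned long pages_for_compaction = 2UL << order;

        if (!(mode & MODE_COMPACTION))
            return false;
        if (!reclaimed_this_pass && !scanned_this_pass)
            return false;           /* no progress at all: give up */
        return reclaimed_total < pages_for_compaction &&
               inactive_pages > pages_for_compaction;
    }

    int main(void)
    {
        /* order-9 (2MB huge page): wants about 1024 pages before compacting */
        printf("keep going:   %d\n",
               continue_reclaim(MODE_COMPACTION, 200, 50, 300, 5000, 9));
        printf("enough freed: %d\n",
               continue_reclaim(MODE_COMPACTION, 1500, 50, 300, 5000, 9));
        return 0;
    }
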
@@ -1813,9 +1885,12 @@ static void shrink_zone(int priority, struct zone *zone,
1813 unsigned long nr[NR_LRU_LISTS]; 1885 unsigned long nr[NR_LRU_LISTS];
1814 unsigned long nr_to_scan; 1886 unsigned long nr_to_scan;
1815 enum lru_list l; 1887 enum lru_list l;
1816 unsigned long nr_reclaimed = sc->nr_reclaimed; 1888 unsigned long nr_reclaimed;
1817 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1889 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1890 unsigned long nr_scanned = sc->nr_scanned;
1818 1891
1892restart:
1893 nr_reclaimed = 0;
1819 get_scan_count(zone, sc, nr, priority); 1894 get_scan_count(zone, sc, nr, priority);
1820 1895
1821 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1896 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -1841,8 +1916,7 @@ static void shrink_zone(int priority, struct zone *zone,
1841 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) 1916 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
1842 break; 1917 break;
1843 } 1918 }
1844 1919 sc->nr_reclaimed += nr_reclaimed;
1845 sc->nr_reclaimed = nr_reclaimed;
1846 1920
1847 /* 1921 /*
1848 * Even if we did not try to evict anon pages at all, we want to 1922 * Even if we did not try to evict anon pages at all, we want to
@@ -1851,6 +1925,11 @@ static void shrink_zone(int priority, struct zone *zone,
1851 if (inactive_anon_is_low(zone, sc)) 1925 if (inactive_anon_is_low(zone, sc))
1852 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 1926 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1853 1927
1928 /* reclaim/compaction might need reclaim to continue */
1929 if (should_continue_reclaim(zone, nr_reclaimed,
1930 sc->nr_scanned - nr_scanned, sc))
1931 goto restart;
1932
1854 throttle_vm_writeout(sc->gfp_mask); 1933 throttle_vm_writeout(sc->gfp_mask);
1855} 1934}
1856 1935
@@ -2124,38 +2203,87 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2124} 2203}
2125#endif 2204#endif
2126 2205
2206/*
2207 * pgdat_balanced is used when checking if a node is balanced for high-order
2208 * allocations. Only zones that meet watermarks and are in a zone allowed
2209 * by the caller's classzone_idx are added to balanced_pages. The total of
2210 * balanced pages must be at least 25% of the zones allowed by classzone_idx
2211 * for the node to be considered balanced. Forcing all zones to be balanced
2212 * for high orders can cause excessive reclaim when there are imbalanced zones.
2213 * The choice of 25% is due to
2214 * o a 16M DMA zone that is balanced will not balance a zone on any
2215 * reasonable sized machine
2216 * o On all other machines, the top zone must be at least a reasonable
2217 * percentage of the middle zones. For example, on 32-bit x86, highmem
2218 * would need to be at least 256M for it to balance a whole node.
2219 * Similarly, on x86-64 the Normal zone would need to be at least 1G
2220 * to balance a node on its own. These seemed like reasonable ratios.
2221 */
2222static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2223 int classzone_idx)
2224{
2225 unsigned long present_pages = 0;
2226 int i;
2227
2228 for (i = 0; i <= classzone_idx; i++)
2229 present_pages += pgdat->node_zones[i].present_pages;
2230
2231 return balanced_pages > (present_pages >> 2);
2232}
2233
2127/* is kswapd sleeping prematurely? */ 2234/* is kswapd sleeping prematurely? */
2128static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) 2235static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2236 int classzone_idx)
2129{ 2237{
2130 int i; 2238 int i;
2239 unsigned long balanced = 0;
2240 bool all_zones_ok = true;
2131 2241
2132 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2242 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2133 if (remaining) 2243 if (remaining)
2134 return 1; 2244 return true;
2135 2245
2136 /* If after HZ/10, a zone is below the high mark, it's premature */ 2246 /* Check the watermark levels */
2137 for (i = 0; i < pgdat->nr_zones; i++) { 2247 for (i = 0; i < pgdat->nr_zones; i++) {
2138 struct zone *zone = pgdat->node_zones + i; 2248 struct zone *zone = pgdat->node_zones + i;
2139 2249
2140 if (!populated_zone(zone)) 2250 if (!populated_zone(zone))
2141 continue; 2251 continue;
2142 2252
2143 if (zone->all_unreclaimable) 2253 /*
2254 * balance_pgdat() skips over all_unreclaimable after
2255 * DEF_PRIORITY. Effectively, it considers them balanced so
2256 * they must be considered balanced here as well if kswapd
2257 * is to sleep
2258 */
2259 if (zone->all_unreclaimable) {
2260 balanced += zone->present_pages;
2144 continue; 2261 continue;
2262 }
2145 2263
2146 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), 2264 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
2147 0, 0)) 2265 classzone_idx, 0))
2148 return 1; 2266 all_zones_ok = false;
2267 else
2268 balanced += zone->present_pages;
2149 } 2269 }
2150 2270
2151 return 0; 2271 /*
2272 * For high-order requests, the balanced zones must contain at least
2273 * 25% of the nodes pages for kswapd to sleep. For order-0, all zones
2274 * must be balanced
2275 */
2276 if (order)
2277 return pgdat_balanced(pgdat, balanced, classzone_idx);
2278 else
2279 return !all_zones_ok;
2152} 2280}
2153 2281
2154/* 2282/*
2155 * For kswapd, balance_pgdat() will work across all this node's zones until 2283 * For kswapd, balance_pgdat() will work across all this node's zones until
2156 * they are all at high_wmark_pages(zone). 2284 * they are all at high_wmark_pages(zone).
2157 * 2285 *
2158 * Returns the number of pages which were actually freed. 2286 * Returns the final order kswapd was reclaiming at
2159 * 2287 *
2160 * There is special handling here for zones which are full of pinned pages. 2288 * There is special handling here for zones which are full of pinned pages.
2161 * This can happen if the pages are all mlocked, or if they are all used by 2289 * This can happen if the pages are all mlocked, or if they are all used by
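
pgdat_balanced() implements the 25% rule described in the comment: for a high-order wakeup, kswapd may sleep once the balanced zones cover more than a quarter of the pages allowed by classzone_idx. A small worked example with made-up zone sizes:

    #include <stdbool.h>
    #include <stdio.h>

    /* Same rule as pgdat_balanced(): for a high-order wakeup the zones that
     * meet their watermarks must cover more than a quarter of the pages in
     * the zones the caller's classzone_idx allows. */
    static bool pgdat_balanced(unsigned long balanced_pages,
                               const unsigned long *zone_pages, int classzone_idx)
    {
        unsigned long present = 0;

        for (int i = 0; i <= classzone_idx; i++)
            present += zone_pages[i];
        return balanced_pages > (present >> 2);
    }

    int main(void)
    {
        /* Hypothetical node: 4K-page counts for DMA, DMA32 and Normal. */
        unsigned long zones[] = { 4096, 262144, 786432 };

        /* Only the small DMA zone is balanced: nowhere near 25% of the node. */
        printf("DMA only:  %d\n", pgdat_balanced(4096, zones, 2));

        /* Normal is balanced: well over 25%, so kswapd may sleep. */
        printf("Normal ok: %d\n", pgdat_balanced(786432, zones, 2));
        return 0;
    }
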
@@ -2172,11 +2300,14 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
2172 * interoperates with the page allocator fallback scheme to ensure that aging 2300 * interoperates with the page allocator fallback scheme to ensure that aging
2173 * of pages is balanced across the zones. 2301 * of pages is balanced across the zones.
2174 */ 2302 */
2175static unsigned long balance_pgdat(pg_data_t *pgdat, int order) 2303static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2304 int *classzone_idx)
2176{ 2305{
2177 int all_zones_ok; 2306 int all_zones_ok;
2307 unsigned long balanced;
2178 int priority; 2308 int priority;
2179 int i; 2309 int i;
2310 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2180 unsigned long total_scanned; 2311 unsigned long total_scanned;
2181 struct reclaim_state *reclaim_state = current->reclaim_state; 2312 struct reclaim_state *reclaim_state = current->reclaim_state;
2182 struct scan_control sc = { 2313 struct scan_control sc = {
@@ -2199,7 +2330,6 @@ loop_again:
2199 count_vm_event(PAGEOUTRUN); 2330 count_vm_event(PAGEOUTRUN);
2200 2331
2201 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2332 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2202 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2203 unsigned long lru_pages = 0; 2333 unsigned long lru_pages = 0;
2204 int has_under_min_watermark_zone = 0; 2334 int has_under_min_watermark_zone = 0;
2205 2335
@@ -2208,6 +2338,7 @@ loop_again:
2208 disable_swap_token(); 2338 disable_swap_token();
2209 2339
2210 all_zones_ok = 1; 2340 all_zones_ok = 1;
2341 balanced = 0;
2211 2342
2212 /* 2343 /*
2213 * Scan in the highmem->dma direction for the highest 2344 * Scan in the highmem->dma direction for the highest
@@ -2230,9 +2361,10 @@ loop_again:
2230 shrink_active_list(SWAP_CLUSTER_MAX, zone, 2361 shrink_active_list(SWAP_CLUSTER_MAX, zone,
2231 &sc, priority, 0); 2362 &sc, priority, 0);
2232 2363
2233 if (!zone_watermark_ok(zone, order, 2364 if (!zone_watermark_ok_safe(zone, order,
2234 high_wmark_pages(zone), 0, 0)) { 2365 high_wmark_pages(zone), 0, 0)) {
2235 end_zone = i; 2366 end_zone = i;
2367 *classzone_idx = i;
2236 break; 2368 break;
2237 } 2369 }
2238 } 2370 }
@@ -2255,6 +2387,7 @@ loop_again:
2255 * cause too much scanning of the lower zones. 2387 * cause too much scanning of the lower zones.
2256 */ 2388 */
2257 for (i = 0; i <= end_zone; i++) { 2389 for (i = 0; i <= end_zone; i++) {
2390 int compaction;
2258 struct zone *zone = pgdat->node_zones + i; 2391 struct zone *zone = pgdat->node_zones + i;
2259 int nr_slab; 2392 int nr_slab;
2260 2393
@@ -2276,7 +2409,7 @@ loop_again:
2276 * We put equal pressure on every zone, unless one 2409 * We put equal pressure on every zone, unless one
2277 * zone has way too many pages free already. 2410 * zone has way too many pages free already.
2278 */ 2411 */
2279 if (!zone_watermark_ok(zone, order, 2412 if (!zone_watermark_ok_safe(zone, order,
2280 8*high_wmark_pages(zone), end_zone, 0)) 2413 8*high_wmark_pages(zone), end_zone, 0))
2281 shrink_zone(priority, zone, &sc); 2414 shrink_zone(priority, zone, &sc);
2282 reclaim_state->reclaimed_slab = 0; 2415 reclaim_state->reclaimed_slab = 0;
@@ -2284,9 +2417,26 @@ loop_again:
2284 lru_pages); 2417 lru_pages);
2285 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 2418 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2286 total_scanned += sc.nr_scanned; 2419 total_scanned += sc.nr_scanned;
2420
2421 compaction = 0;
2422 if (order &&
2423 zone_watermark_ok(zone, 0,
2424 high_wmark_pages(zone),
2425 end_zone, 0) &&
2426 !zone_watermark_ok(zone, order,
2427 high_wmark_pages(zone),
2428 end_zone, 0)) {
2429 compact_zone_order(zone,
2430 order,
2431 sc.gfp_mask, false,
2432 COMPACT_MODE_KSWAPD);
2433 compaction = 1;
2434 }
2435
2287 if (zone->all_unreclaimable) 2436 if (zone->all_unreclaimable)
2288 continue; 2437 continue;
2289 if (nr_slab == 0 && !zone_reclaimable(zone)) 2438 if (!compaction && nr_slab == 0 &&
2439 !zone_reclaimable(zone))
2290 zone->all_unreclaimable = 1; 2440 zone->all_unreclaimable = 1;
2291 /* 2441 /*
2292 * If we've done a decent amount of scanning and 2442 * If we've done a decent amount of scanning and
@@ -2297,7 +2447,7 @@ loop_again:
2297 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) 2447 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2298 sc.may_writepage = 1; 2448 sc.may_writepage = 1;
2299 2449
2300 if (!zone_watermark_ok(zone, order, 2450 if (!zone_watermark_ok_safe(zone, order,
2301 high_wmark_pages(zone), end_zone, 0)) { 2451 high_wmark_pages(zone), end_zone, 0)) {
2302 all_zones_ok = 0; 2452 all_zones_ok = 0;
2303 /* 2453 /*
@@ -2305,7 +2455,7 @@ loop_again:
2305 * means that we have a GFP_ATOMIC allocation 2455 * means that we have a GFP_ATOMIC allocation
2306 * failure risk. Hurry up! 2456 * failure risk. Hurry up!
2307 */ 2457 */
2308 if (!zone_watermark_ok(zone, order, 2458 if (!zone_watermark_ok_safe(zone, order,
2309 min_wmark_pages(zone), end_zone, 0)) 2459 min_wmark_pages(zone), end_zone, 0))
2310 has_under_min_watermark_zone = 1; 2460 has_under_min_watermark_zone = 1;
2311 } else { 2461 } else {
@@ -2317,10 +2467,12 @@ loop_again:
2317 * spectulatively avoid congestion waits 2467 * spectulatively avoid congestion waits
2318 */ 2468 */
2319 zone_clear_flag(zone, ZONE_CONGESTED); 2469 zone_clear_flag(zone, ZONE_CONGESTED);
2470 if (i <= *classzone_idx)
2471 balanced += zone->present_pages;
2320 } 2472 }
2321 2473
2322 } 2474 }
2323 if (all_zones_ok) 2475 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
2324 break; /* kswapd: all done */ 2476 break; /* kswapd: all done */
2325 /* 2477 /*
2326 * OK, kswapd is getting into trouble. Take a nap, then take 2478 * OK, kswapd is getting into trouble. Take a nap, then take
@@ -2343,7 +2495,13 @@ loop_again:
2343 break; 2495 break;
2344 } 2496 }
2345out: 2497out:
2346 if (!all_zones_ok) { 2498
2499 /*
2500 * order-0: All zones must meet high watermark for a balanced node
2501 * high-order: Balanced zones must make up at least 25% of the node
2502 * for the node to be balanced
2503 */
2504 if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
2347 cond_resched(); 2505 cond_resched();
2348 2506
2349 try_to_freeze(); 2507 try_to_freeze();
@@ -2368,7 +2526,88 @@ out:
2368 goto loop_again; 2526 goto loop_again;
2369 } 2527 }
2370 2528
2371 return sc.nr_reclaimed; 2529 /*
2530 * If kswapd was reclaiming at a higher order, it has the option of
2531 * sleeping without all zones being balanced. Before it does, it must
2532 * ensure that the watermarks for order-0 on *all* zones are met and
2533 * that the congestion flags are cleared. The congestion flag must
2534 * be cleared as kswapd is the only mechanism that clears the flag
2535 * and it is potentially going to sleep here.
2536 */
2537 if (order) {
2538 for (i = 0; i <= end_zone; i++) {
2539 struct zone *zone = pgdat->node_zones + i;
2540
2541 if (!populated_zone(zone))
2542 continue;
2543
2544 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2545 continue;
2546
2547 /* Confirm the zone is balanced for order-0 */
2548 if (!zone_watermark_ok(zone, 0,
2549 high_wmark_pages(zone), 0, 0)) {
2550 order = sc.order = 0;
2551 goto loop_again;
2552 }
2553
2554 /* If balanced, clear the congested flag */
2555 zone_clear_flag(zone, ZONE_CONGESTED);
2556 }
2557 }
2558
2559 /*
2560 * Return the order we were reclaiming at so sleeping_prematurely()
2561 * makes a decision on the order we were last reclaiming at. However,
2562 * if another caller entered the allocator slow path while kswapd
2563 * was awake, order will remain at the higher level
2564 */
2565 *classzone_idx = end_zone;
2566 return order;
2567}
2568
2569static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2570{
2571 long remaining = 0;
2572 DEFINE_WAIT(wait);
2573
2574 if (freezing(current) || kthread_should_stop())
2575 return;
2576
2577 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2578
2579 /* Try to sleep for a short interval */
2580 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2581 remaining = schedule_timeout(HZ/10);
2582 finish_wait(&pgdat->kswapd_wait, &wait);
2583 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2584 }
2585
2586 /*
2587 * After a short sleep, check if it was a premature sleep. If not, then
2588 * go fully to sleep until explicitly woken up.
2589 */
2590 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2591 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2592
2593 /*
2594 * vmstat counters are not perfectly accurate and the estimated
2595 * value for counters such as NR_FREE_PAGES can deviate from the
2596 * true value by nr_online_cpus * threshold. To avoid the zone
2597 * watermarks being breached while under pressure, we reduce the
2598 * per-cpu vmstat threshold while kswapd is awake and restore
2599 * them before going back to sleep.
2600 */
2601 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2602 schedule();
2603 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
2604 } else {
2605 if (remaining)
2606 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2607 else
2608 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2609 }
2610 finish_wait(&pgdat->kswapd_wait, &wait);
2372} 2611}
2373 2612
2374/* 2613/*
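
kswapd_try_to_sleep() naps for HZ/10 first and only commits to a full sleep if sleeping_prematurely() still says the node is balanced, switching the per-cpu vmstat thresholds around the sleep. A stubbed-out model of that control flow; sleeping_prematurely() here is a fake predicate keyed off order, not the real watermark check:

    #include <stdbool.h>
    #include <stdio.h>

    /* Stubs standing in for the kernel pieces used by kswapd_try_to_sleep(). */
    static bool sleeping_prematurely(int order) { return order > 5; }
    static long short_nap(void)                 { return 0; }  /* full HZ/10 elapsed */
    static void set_threshold(const char *kind) { printf("vmstat threshold: %s\n", kind); }
    static void deep_sleep(void)                { printf("kswapd sleeping\n"); }

    /* Two-stage sleep: nap briefly, and only if the node still looks balanced
     * afterwards switch the vmstat thresholds and sleep until woken. The
     * premature case just records which watermark fired. */
    static void kswapd_try_to_sleep(int order)
    {
        long remaining = 0;

        if (!sleeping_prematurely(order))
            remaining = short_nap();

        if (!sleeping_prematurely(order)) {
            set_threshold("normal (cheaper, fine while asleep)");
            deep_sleep();
            set_threshold("pressure (reduced, kswapd awake)");
        } else {
            printf("premature: %s watermark hit quickly\n",
                   remaining ? "low" : "high");
        }
    }

    int main(void)
    {
        kswapd_try_to_sleep(0);     /* balanced: naps, then sleeps properly */
        kswapd_try_to_sleep(9);     /* still under pressure: stays awake    */
        return 0;
    }
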
@@ -2387,9 +2626,10 @@ out:
2387static int kswapd(void *p) 2626static int kswapd(void *p)
2388{ 2627{
2389 unsigned long order; 2628 unsigned long order;
2629 int classzone_idx;
2390 pg_data_t *pgdat = (pg_data_t*)p; 2630 pg_data_t *pgdat = (pg_data_t*)p;
2391 struct task_struct *tsk = current; 2631 struct task_struct *tsk = current;
2392 DEFINE_WAIT(wait); 2632
2393 struct reclaim_state reclaim_state = { 2633 struct reclaim_state reclaim_state = {
2394 .reclaimed_slab = 0, 2634 .reclaimed_slab = 0,
2395 }; 2635 };
@@ -2417,49 +2657,30 @@ static int kswapd(void *p)
2417 set_freezable(); 2657 set_freezable();
2418 2658
2419 order = 0; 2659 order = 0;
2660 classzone_idx = MAX_NR_ZONES - 1;
2420 for ( ; ; ) { 2661 for ( ; ; ) {
2421 unsigned long new_order; 2662 unsigned long new_order;
2663 int new_classzone_idx;
2422 int ret; 2664 int ret;
2423 2665
2424 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2425 new_order = pgdat->kswapd_max_order; 2666 new_order = pgdat->kswapd_max_order;
2667 new_classzone_idx = pgdat->classzone_idx;
2426 pgdat->kswapd_max_order = 0; 2668 pgdat->kswapd_max_order = 0;
2427 if (order < new_order) { 2669 pgdat->classzone_idx = MAX_NR_ZONES - 1;
2670 if (order < new_order || classzone_idx > new_classzone_idx) {
2428 /* 2671 /*
2429 * Don't sleep if someone wants a larger 'order' 2672 * Don't sleep if someone wants a larger 'order'
2430 * allocation 2673 * allocation or has tigher zone constraints
2431 */ 2674 */
2432 order = new_order; 2675 order = new_order;
2676 classzone_idx = new_classzone_idx;
2433 } else { 2677 } else {
2434 if (!freezing(current) && !kthread_should_stop()) { 2678 kswapd_try_to_sleep(pgdat, order, classzone_idx);
2435 long remaining = 0;
2436
2437 /* Try to sleep for a short interval */
2438 if (!sleeping_prematurely(pgdat, order, remaining)) {
2439 remaining = schedule_timeout(HZ/10);
2440 finish_wait(&pgdat->kswapd_wait, &wait);
2441 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2442 }
2443
2444 /*
2445 * After a short sleep, check if it was a
2446 * premature sleep. If not, then go fully
2447 * to sleep until explicitly woken up
2448 */
2449 if (!sleeping_prematurely(pgdat, order, remaining)) {
2450 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2451 schedule();
2452 } else {
2453 if (remaining)
2454 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2455 else
2456 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2457 }
2458 }
2459
2460 order = pgdat->kswapd_max_order; 2679 order = pgdat->kswapd_max_order;
2680 classzone_idx = pgdat->classzone_idx;
2681 pgdat->kswapd_max_order = 0;
2682 pgdat->classzone_idx = MAX_NR_ZONES - 1;
2461 } 2683 }
2462 finish_wait(&pgdat->kswapd_wait, &wait);
2463 2684
2464 ret = try_to_freeze(); 2685 ret = try_to_freeze();
2465 if (kthread_should_stop()) 2686 if (kthread_should_stop())
@@ -2471,7 +2692,7 @@ static int kswapd(void *p)
2471 */ 2692 */
2472 if (!ret) { 2693 if (!ret) {
2473 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); 2694 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2474 balance_pgdat(pgdat, order); 2695 order = balance_pgdat(pgdat, order, &classzone_idx);
2475 } 2696 }
2476 } 2697 }
2477 return 0; 2698 return 0;
@@ -2480,23 +2701,26 @@ static int kswapd(void *p)
2480/* 2701/*
2481 * A zone is low on free memory, so wake its kswapd task to service it. 2702 * A zone is low on free memory, so wake its kswapd task to service it.
2482 */ 2703 */
2483void wakeup_kswapd(struct zone *zone, int order) 2704void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
2484{ 2705{
2485 pg_data_t *pgdat; 2706 pg_data_t *pgdat;
2486 2707
2487 if (!populated_zone(zone)) 2708 if (!populated_zone(zone))
2488 return; 2709 return;
2489 2710
2490 pgdat = zone->zone_pgdat;
2491 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
2492 return;
2493 if (pgdat->kswapd_max_order < order)
2494 pgdat->kswapd_max_order = order;
2495 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2496 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2711 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2497 return; 2712 return;
2713 pgdat = zone->zone_pgdat;
2714 if (pgdat->kswapd_max_order < order) {
2715 pgdat->kswapd_max_order = order;
2716 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
2717 }
2498 if (!waitqueue_active(&pgdat->kswapd_wait)) 2718 if (!waitqueue_active(&pgdat->kswapd_wait))
2499 return; 2719 return;
2720 if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
2721 return;
2722
2723 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2500 wake_up_interruptible(&pgdat->kswapd_wait); 2724 wake_up_interruptible(&pgdat->kswapd_wait);
2501} 2725}
2502 2726
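
wakeup_kswapd() now records the most constrained request pending for the node: the remembered order only ever grows and, when it does, classzone_idx is clamped downwards, so a single balance_pgdat() pass can serve every waiter. A toy version of that bookkeeping with one global node standing in for the pgdat fields:

    #include <stdio.h>

    #define MAX_NR_ZONES 4

    /* Toy per-node state mirroring pgdat->kswapd_max_order / classzone_idx. */
    static int kswapd_max_order;
    static int kswapd_classzone_idx = MAX_NR_ZONES - 1;

    static int min_int(int a, int b) { return a < b ? a : b; }

    /* Shape of the new wakeup_kswapd() bookkeeping: a bigger order raises the
     * target and, at the same time, clamps classzone_idx to the most
     * restrictive zone requested so far. */
    static void wakeup_kswapd(int order, int classzone_idx)
    {
        if (kswapd_max_order < order) {
            kswapd_max_order = order;
            kswapd_classzone_idx = min_int(kswapd_classzone_idx, classzone_idx);
        }
        printf("pending wakeup: order=%d classzone_idx=%d\n",
               kswapd_max_order, kswapd_classzone_idx);
    }

    int main(void)
    {
        wakeup_kswapd(2, 2);    /* modest high-order request up to Normal */
        wakeup_kswapd(9, 1);    /* huge-page request restricted to DMA32  */
        wakeup_kswapd(0, 3);    /* order-0 request changes nothing        */
        return 0;
    }
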
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 312d728976f1..0c3b5048773e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -83,7 +83,31 @@ EXPORT_SYMBOL(vm_stat);
83 83
84#ifdef CONFIG_SMP 84#ifdef CONFIG_SMP
85 85
86static int calculate_threshold(struct zone *zone) 86int calculate_pressure_threshold(struct zone *zone)
87{
88 int threshold;
89 int watermark_distance;
90
91 /*
92 * As vmstats are not up to date, there is drift between the estimated
93 * and real values. For high thresholds and a high number of CPUs, it
94 * is possible for the min watermark to be breached while the estimated
95 * value looks fine. The pressure threshold is a reduced value such
96 * that even the maximum amount of drift will not accidentally breach
97 * the min watermark
98 */
99 watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
100 threshold = max(1, (int)(watermark_distance / num_online_cpus()));
101
102 /*
103 * Maximum threshold is 125
104 */
105 threshold = min(125, threshold);
106
107 return threshold;
108}
109
110int calculate_normal_threshold(struct zone *zone)
87{ 111{
88 int threshold; 112 int threshold;
89 int mem; /* memory in 128 MB units */ 113 int mem; /* memory in 128 MB units */
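
calculate_pressure_threshold() divides the low-to-min watermark gap across the online CPUs and caps the result at 125, so cumulative per-cpu drift can no longer mask a min-watermark breach while kswapd is awake. The same arithmetic run with example numbers:

    #include <stdio.h>

    static int max_int(int a, int b) { return a > b ? a : b; }
    static int min_int(int a, int b) { return a < b ? a : b; }

    /* Same arithmetic as calculate_pressure_threshold(): spread the low-to-min
     * watermark distance across the online CPUs so that worst-case drift
     * (cpus * threshold) stays within the gap. */
    static int pressure_threshold(long low_wmark, long min_wmark, int online_cpus)
    {
        int distance = (int)(low_wmark - min_wmark);
        int threshold = max_int(1, distance / online_cpus);

        return min_int(125, threshold);     /* same cap as the normal threshold */
    }

    int main(void)
    {
        /* Hypothetical zone: min watermark 8000 pages, low watermark 10000. */
        printf("4 CPUs:  threshold %d\n", pressure_threshold(10000, 8000, 4));
        printf("64 CPUs: threshold %d\n", pressure_threshold(10000, 8000, 64));
        return 0;
    }
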
@@ -142,7 +166,7 @@ static void refresh_zone_stat_thresholds(void)
142 for_each_populated_zone(zone) { 166 for_each_populated_zone(zone) {
143 unsigned long max_drift, tolerate_drift; 167 unsigned long max_drift, tolerate_drift;
144 168
145 threshold = calculate_threshold(zone); 169 threshold = calculate_normal_threshold(zone);
146 170
147 for_each_online_cpu(cpu) 171 for_each_online_cpu(cpu)
148 per_cpu_ptr(zone->pageset, cpu)->stat_threshold 172 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
@@ -161,6 +185,26 @@ static void refresh_zone_stat_thresholds(void)
161 } 185 }
162} 186}
163 187
188void set_pgdat_percpu_threshold(pg_data_t *pgdat,
189 int (*calculate_pressure)(struct zone *))
190{
191 struct zone *zone;
192 int cpu;
193 int threshold;
194 int i;
195
196 for (i = 0; i < pgdat->nr_zones; i++) {
197 zone = &pgdat->node_zones[i];
198 if (!zone->percpu_drift_mark)
199 continue;
200
201 threshold = (*calculate_pressure)(zone);
202 for_each_possible_cpu(cpu)
203 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
204 = threshold;
205 }
206}
207
164/* 208/*
165 * For use when we know that interrupts are disabled. 209 * For use when we know that interrupts are disabled.
166 */ 210 */
@@ -836,6 +880,7 @@ static const char * const vmstat_text[] = {
836 "numa_local", 880 "numa_local",
837 "numa_other", 881 "numa_other",
838#endif 882#endif
883 "nr_anon_transparent_hugepages",
839 "nr_dirty_threshold", 884 "nr_dirty_threshold",
840 "nr_dirty_background_threshold", 885 "nr_dirty_background_threshold",
841 886
@@ -911,7 +956,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
911 "\n scanned %lu" 956 "\n scanned %lu"
912 "\n spanned %lu" 957 "\n spanned %lu"
913 "\n present %lu", 958 "\n present %lu",
914 zone_nr_free_pages(zone), 959 zone_page_state(zone, NR_FREE_PAGES),
915 min_wmark_pages(zone), 960 min_wmark_pages(zone),
916 low_wmark_pages(zone), 961 low_wmark_pages(zone),
917 high_wmark_pages(zone), 962 high_wmark_pages(zone),
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7f686251f711..f29abeb6a912 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -104,8 +104,26 @@ static pfn_t fault_pfn;
104inline int kvm_is_mmio_pfn(pfn_t pfn) 104inline int kvm_is_mmio_pfn(pfn_t pfn)
105{ 105{
106 if (pfn_valid(pfn)) { 106 if (pfn_valid(pfn)) {
107 struct page *page = compound_head(pfn_to_page(pfn)); 107 int reserved;
108 return PageReserved(page); 108 struct page *tail = pfn_to_page(pfn);
109 struct page *head = compound_trans_head(tail);
110 reserved = PageReserved(head);
111 if (head != tail) {
112 /*
113 * "head" is not a dangling pointer
114 * (compound_trans_head takes care of that)
115 * but the hugepage may have been split
116 * from under us (and we may not hold a
117 * reference count on the head page so it can
118 * be reused before we run PageReferenced), so
119 * we've to check PageTail before returning
120 * what we just read.
121 */
122 smp_rmb();
123 if (PageTail(tail))
124 return reserved;
125 }
126 return PageReserved(tail);
109 } 127 }
110 128
111 return true; 129 return true;
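
The kvm_is_mmio_pfn() change reads the flag through the compound head, issues a read barrier, and only trusts that value if the page still looks like a tail page, falling back to the page itself after a split. A loose userspace analogue of that read-barrier-recheck pattern, using C11 atomics in place of the kernel's page flags and smp_rmb(); it models the idea, not struct page:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Minimal model: a tail page's head pointer and its "PageTail" bit can
     * change underneath a lockless reader when a huge page is split, so the
     * value read through the head is only trusted if the page still looks
     * like a tail page afterwards. */
    struct page {
        atomic_bool tail;           /* "PageTail" */
        atomic_bool reserved;       /* the per-page flag we actually want */
        struct page *head;          /* meaningful only while tail is set */
    };

    static bool page_reserved(struct page *p)
    {
        if (atomic_load(&p->tail)) {
            bool r = atomic_load(&p->head->reserved);
            atomic_thread_fence(memory_order_acquire);   /* like smp_rmb() */
            if (atomic_load(&p->tail))
                return r;           /* still a tail page: head value is valid */
        }
        return atomic_load(&p->reserved);   /* split happened: use the page itself */
    }

    int main(void)
    {
        struct page head = { .reserved = true };
        struct page tail = { .tail = true, .reserved = false, .head = &head };

        printf("as tail:     %d\n", page_reserved(&tail));
        atomic_store(&tail.tail, false);    /* pretend the huge page split */
        printf("after split: %d\n", page_reserved(&tail));
        return 0;
    }
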
@@ -352,6 +370,22 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
352 return young; 370 return young;
353} 371}
354 372
373static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
374 struct mm_struct *mm,
375 unsigned long address)
376{
377 struct kvm *kvm = mmu_notifier_to_kvm(mn);
378 int young, idx;
379
380 idx = srcu_read_lock(&kvm->srcu);
381 spin_lock(&kvm->mmu_lock);
382 young = kvm_test_age_hva(kvm, address);
383 spin_unlock(&kvm->mmu_lock);
384 srcu_read_unlock(&kvm->srcu, idx);
385
386 return young;
387}
388
355static void kvm_mmu_notifier_release(struct mmu_notifier *mn, 389static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
356 struct mm_struct *mm) 390 struct mm_struct *mm)
357{ 391{
@@ -368,6 +402,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
368 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 402 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
369 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 403 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
370 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 404 .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
405 .test_young = kvm_mmu_notifier_test_young,
371 .change_pte = kvm_mmu_notifier_change_pte, 406 .change_pte = kvm_mmu_notifier_change_pte,
372 .release = kvm_mmu_notifier_release, 407 .release = kvm_mmu_notifier_release,
373}; 408};